
    qiK                     @   d dl mZ d dlZd dlmZ ddlmZ ddlmZ ddlm	Z	 ddl
mZ d	d
lmZmZmZmZ ddlmZ  ej&                  e      Z G d ded      Z G d dej.                        Z G d de      Z G d de      Z G d de      Z G d de      Zg dZy)    )	TypedDictN)nn   )ACT2FN)Cache)Unpack)logging   )GraniteMoeDecoderLayerGraniteMoeForCausalLMGraniteMoeModelGraniteMoePreTrainedModel   )GraniteMoeSharedConfigc                       e Zd ZU dZej
                  ed<   ej
                  ed<   eed<   eed<   ej                  ed<   y)GraniteFlashAttentionKwargsaT  
    Keyword arguments for advanced Flash Attention, causal-conv1d, and mamba_ssm kernel usage.
    Use cases include padding-free training and fewer `torch.compile` graph breaks.

    cu_seq_lens_q (`torch.LongTensor`):
        Gets cumulative sequence length for query state.
    cu_seq_lens_k (`torch.LongTensor`):
        Gets cumulative sequence length for key state.
    max_length_q (`int`):
        Maximum sequence length for query state.
    max_length_k (`int`):
        Maximum sequence length for key state.
    seq_idx (`torch.IntTensor):
        Index of each packed sequence.
    cu_seq_lens_qcu_seq_lens_kmax_length_qmax_length_kseq_idxN)	__name__
__module____qualname____doc__torch
LongTensor__annotations__int	IntTensor     o/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/granitemoeshared/modular_granitemoeshared.pyr   r   $   s7      ######__r"   r   F)totalc                   `     e Zd ZdZdef fdZdej                  dej                  fdZ xZ	S )GraniteMoeSharedMLPz~
    MLP layer for shared experts

    Args:
        config:
            Configuration object with model hyperparameters.
    configc                 `   t         |           |j                  | _        |j                  | _        t
        |j                     | _        t        j                  | j                  | j                  dz  d      | _
        t        j                  | j                  | j                  d      | _        y )Nr
   F)bias)super__init__hidden_size
input_sizeshared_intermediate_sizer   
hidden_act
activationr   Linearinput_linearoutput_linearselfr'   	__class__s     r#   r+   zGraniteMoeSharedMLP.__init__E   s     ,,!:: !2!23IIdoot7G7G!7KRWXYYt'7'7uUr"   hidden_statesreturnc                     | j                  |      }|j                  dd      }| j                  |d         |d   z  }| j                  |      }|S )Nr
   )dimr   r   )r2   chunkr0   r3   )r5   r7   chunked_hidden_statess      r#   forwardzGraniteMoeSharedMLP.forwardN   s^    ))-8 - 3 3A2 3 >(=a(@ADYZ[D\\**=9r"   )
r   r   r   r   r   r+   r   Tensorr>   __classcell__r6   s   @r#   r&   r&   <   s2    V5 VU\\ ell r"   r&   c                   p    e Zd Zdedef fdZ	 	 	 	 	 	 	 ddej                  dej                  dz  dej                  dz  de	dz  d	e
dz  d
e
dz  dej                  dz  deej                  ej                  f   dz  dee   deej                  eej                  ej                  f   dz  f   fdZ xZS )GraniteMoeSharedDecoderLayerr'   	layer_idxc                 t    t         |   ||       |j                  dk(  rd | _        y t        |      | _        y )Nr   )r*   r+   r.   r&   
shared_mlpr5   r'   rD   r6   s      r#   r+   z%GraniteMoeSharedDecoderLayer.__init__W   s3    +"("A"AQ"F$L_`fLgr"   Nr7   attention_maskposition_idspast_key_valuesoutput_attentions	use_cachecache_positionposition_embeddingskwargsr8   c	                 >   |}
| j                  |      } | j                  d||||||||d|	\  }}|
|| j                  z  z   }|}
| j                  |      }| j	                  |      }| j
                  |}n|| j                  |      z   }|
|| j                  z  z   }|S )N)r7   rH   rI   rJ   rK   rL   rM   rN   r!   )input_layernorm	self_attnresidual_multiplierpost_attention_layernormblock_sparse_moerF   )r5   r7   rH   rI   rJ   rK   rL   rM   rN   rO   residual_moe_hidden_statess                r#   r>   z$GraniteMoeSharedDecoderLayer.forward[   s     !,,]; *4>> 

')%+/) 3

 

q !=43K3K#KK 55mD 11-@??"-M-0NNM =43K3K#KKr"   )NNNFFNN)r   r   r   r   r   r+   r   r?   r   r   booltupler   r   FloatTensorr>   r@   rA   s   @r#   rC   rC   V   s   h5 h# h /304(,).!&26HL'||' t+' &&-	'
 '  $;' $;' ((4/' #5<<#=>E' 45' 
u  %(9(95;L;L(L"MPT"TT	U'r"   rC   c                       e Zd ZU eed<   dgZy)GraniteMoeSharedPreTrainedModelr'   rC   N)r   r   r   r   r   _no_split_modulesr!   r"   r#   r]   r]      s    ""78r"   r]   c                   $     e Zd Zdef fdZ xZS )GraniteMoeSharedModelr'   c           	          t         |   |       t        j                  t	        |j
                        D cg c]  }t        ||       c}      | _        y c c}w N)r*   r+   r   
ModuleListrangenum_hidden_layersrC   layersrG   s      r#   r+   zGraniteMoeSharedModel.__init__   sE     mmNSTZTlTlNmn)&)<n
ns   A)r   r   r   r   r+   r@   rA   s   @r#   r`   r`      s    
5 
 
r"   r`   c                   ,     e Zd ZddiZdef fdZ xZS )GraniteMoeSharedForCausalLMzlm_head.weightzmodel.embed_tokens.weightr'   c                 d    t         |   |       t        |      | _        | j	                          y rb   )r*   r+   r`   model	post_initr4   s     r#   r+   z$GraniteMoeSharedForCausalLM.__init__   s&     *62
r"   )r   r   r   _tied_weights_keysr   r+   r@   rA   s   @r#   rh   rh      s!    *,GH5  r"   rh   )rh   r`   r]   )typingr   r   r   activationsr   cache_utilsr   processing_utilsr   utilsr	   granitemoe.modeling_granitemoer   r   r   r   configuration_granitemoesharedr   
get_loggerr   loggerr   Moduler&   rC   r]   r`   rh   __all__r!   r"   r#   <module>rx      s       !   &   C 
		H	%)5 0")) 4,#9 ,^9&? 9

O 
"7  fr"   