
    qi,                     D   d Z ddlmZ ddlZddlmZ ddlmZ ddlm	Z	m
Z
 ddlmZ dd	lmZ dd
lmZ ddlmZmZ ddlmZ ddlmZmZmZ ddlmZ ddlmZ ddlmZm Z m!Z!m"Z"m#Z#m$Z$ ddl%m&Z&m'Z'm(Z( ddl)m*Z* ddl+m,Z,  ejZ                  e.      Z/ G d de!      Z0 G d de"      Z1 G d de      Z2 G d de      Z3 G d de&      Z4 G d  d!e*      Z5 G d" d#ejl                        Z7 G d$ d%e       Z8e G d& d'e             Z9e G d( d)e(             Z: G d* d+e'e      Z;g d,Z<y)-zPyTorch OLMoE model.    )CallableN)nn   )initialization)CacheDynamicCache)GenerationMixin)create_causal_mask)MoeModelOutputWithPast)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringlogging)OutputRecorder   )GemmaMLP)LlamaAttentionLlamaDecoderLayerLlamaRMSNormLlamaRotaryEmbeddingapply_rotary_pos_embeager_attention_forward)MixtralExpertsMixtralForCausalLMMixtralModel)Qwen2MoeTopKRouter   )OlmoeConfigc                         e Zd Zd fd	Z xZS )OlmoeRMSNormc                 &    t         |   ||       y N)super__init__)selfhidden_sizeeps	__class__s      Y/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/olmoe/modular_olmoe.pyr&   zOlmoeRMSNorm.__init__.   s    c*    )gh㈵>)__name__
__module____qualname__r&   __classcell__r*   s   @r+   r"   r"   -   s    + +r,   r"   c                       e Zd Zy)OlmoeRotaryEmbeddingNr-   r.   r/    r,   r+   r3   r3   2       r,   r3   c                       e Zd Zy)OlmoeMLPNr4   r5   r,   r+   r8   r8   6   r6   r,   r8   c                   :    e Zd Zddededz  f fdZ	 	 ddej                  deej                  ej                  f   dej                  dz  de	dz  d	ej                  dz  d
ee   deej                  ej                  dz  eej                     dz  f   fdZ xZS )OlmoeAttentionNconfig	layer_idxc                     t         |   ||       t        |j                  |j                        | _        t        |j                  |j                  z  |j                  z  |j                        | _        y )Nr)   )	r%   r&   r"   r(   rms_norm_epsq_normnum_attention_headsnum_key_value_headsk_normr'   r;   r<   r*   s      r+   r&   zOlmoeAttention.__init__;   s`    +"6#5#56;N;NO"6#=#==A[A[[agatat
r,   hidden_statesposition_embeddingsattention_maskpast_key_valuescache_positionkwargsreturnc           
         |j                   d d }g |d| j                  }| j                  | j                  |            }	| j	                  | j                  |            }
| j                  |      }| j                  j                  |	j                  | j                  j                   | j                  j                         |
j                  | j                  j                   | j                  j                         |j                  | j                  j                   | j                  j                          |	j                  | j                  dd      }	 |
j                  | j                  dd      }
 |j                  | j                  dd      }|\  }}t        |	|
||      \  }	}
|'|||d}|j                  |
|| j                  |      \  }
}t        j                   | j                  j"                  t$              } || |	|
||f| j&                  sdn| j(                  | j*                  t-        | j                  dd       d|\  }} |j.                  g |d j1                         }| j3                  |      }||fS )	N)minmaxr   r   )sincosrI           sliding_window)dropoutscalingrS   )shapehead_dimr@   q_projrC   k_projv_projr;   clip_qkvclamp_view	transposer   updater<   r   get_interface_attn_implementationr   trainingattention_dropoutrU   getattrreshape
contiguouso_proj)r'   rE   rF   rG   rH   rI   rJ   input_shapehidden_shapequery_states
key_statesvalue_statesrQ   rP   cache_kwargsattention_interfaceattn_outputattn_weightss                     r+   forwardzOlmoeAttention.forwardB   sa    $))#2.88b8$--8{{4;;}#=>[[]!;<
{{=1;;+T[[%9%9$9t{{?S?ST4;;#7#7"7T[[=Q=QRT[[%9%9$9t{{?S?ST(|((,7AA!QG$Z__l3==aC
(|((,7AA!QG&S#7jRUWZ#[ j&#&snUL'6'='=j,X\XfXfht'u$J(?(M(MKK,,.E)
 %8
%
  $}}C$2H2HLL"4;;0@$G
%
 
%
!\ *k));;;;FFHkk+.L((r,   r$   )NN)r-   r.   r/   r    intr&   torchTensortupler   
LongTensorr   r   rq   r0   r1   s   @r+   r:   r:   :   s    
{ 
sTz 
 )-262)||2) #5<<#=>2) t+	2)
 2) ((4/2) +,2) 
u||U\\D0%2E2LL	M2)r,   r:   c                       e Zd Zy)OlmoeExpertsNr4   r5   r,   r+   rx   rx   w   r6   r,   rx   c                       e Zd Zy)OlmoeTopKRouterNr4   r5   r,   r+   rz   rz   {   r6   r,   rz   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )OlmoeSparseMoeBlockc                 b    t         |           t        |      | _        t	        |      | _        y r$   )r%   r&   rz   gaterx   expertsr'   r;   r*   s     r+   r&   zOlmoeSparseMoeBlock.__init__   s&    #F+	#F+r,   rE   rK   c                     |j                   \  }}}|j                  d|      }| j                  |      \  }}}| j                  |||      j	                  |||      }|S )NrM   )rV   r]   r~   r   re   )	r'   rE   
batch_sizesequence_length
hidden_dim_top_k_weightstop_k_indexfinal_hidden_statess	            r+   rq   zOlmoeSparseMoeBlock.forward   sh    2?2E2E/
OZ%**2z:(,		-(@%=+"ll=+}U]]
 #"r,   )r-   r.   r/   r&   rs   rt   rq   r0   r1   s   @r+   r|   r|      s#    ,
#U\\ #ell #r,   r|   c                   (     e Zd Zdedef fdZ xZS )OlmoeDecoderLayerr;   r<   c                 $   t         |   ||       |j                  | _        t        ||      | _        t        |      | _        t        |j                  |j                        | _	        t        |j                  |j                        | _
        y )N)r;   r<   r>   )r%   r&   r(   r:   	self_attnr|   mlpr"   r?   input_layernormpost_attention_layernormrD   s      r+   r&   zOlmoeDecoderLayer.__init__   sp    +!--'vK&v.+F,>,>FDWDWX(4V5G5GVM`M`(a%r,   )r-   r.   r/   r    rr   r&   r0   r1   s   @r+   r   r      s    b{ bs b br,   r   c                       e Zd ZU eed<   dZdZdgZdgZdZ	dZ
 eed      eedZdZ ej$                         d	        Zy
)OlmoePreTrainedModelr;   modelTr   rH   r   )index)router_logitsrE   
attentionsc                    t        j                  | |       t        |t              rmt	        j
                  |j                  d| j                  j                         t	        j
                  |j                  d| j                  j                         y t        |t              r7t	        j
                  |j                  d| j                  j                         y y )NrR   )meanstd)r   _init_weights
isinstancerx   initnormal_gate_up_projr;   initializer_range	down_projrz   weight)r'   modules     r+   r   z"OlmoePreTrainedModel._init_weights   s    %%dF3fl+LL,,3DKK<Y<YZLL))9V9VW0LLSdkk6S6ST 1r,   N)r-   r.   r/   r    __annotations__base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpar   rz   r   r:   _can_record_outputs_supports_attention_backendrs   no_gradr   r5   r,   r+   r   r      sm    &*#,-#4"5N'qA*$ #'U]]_U Ur,   r   c                        e Zd Zdef fdZ	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dedz  dej                  dz  d	e
dz  d
ej                  dz  dee   defdZ xZS )
OlmoeModelr;   c           	         t         |   |       t        j                  |j                  |j
                  | j                        | _        t        j                  t        |j                        D cg c]  }t        ||       c}      | _        t        |j
                  |j                        | _        t!        |      | _        y c c}w )Nr>   r;   )r%   r&   r   	Embedding
vocab_sizer(   padding_idxembed_tokens
ModuleListrangenum_hidden_layersr   layersr"   r?   normr3   
rotary_embrD   s      r+   r&   zOlmoeModel.__init__   s     LL):):F<N<NPTP`P`ammCHIaIaCbcivy1c
 !!3!39L9LM	.f= ds   1CN	input_idsrG   position_idsrH   inputs_embeds	use_cacherI   rJ   rK   c                 D   |d u |d uz  rt        d      |r|t        | j                        }|| j                  |      }|F||j	                         nd}	t        j                  |	|	|j                  d   z   |j                        }||j                  d      }t        | j                  |||||      }
|}| j                  ||      }| j                  d | j                  j                   D ]  } ||f||
||||d|} | j                  |      }t        ||      S )	Nz:You must specify exactly one of input_ids or inputs_embedsr   r   r   )device)r;   r   rG   rI   rH   r   )rF   rG   r   rH   r   rI   )last_hidden_staterH   )
ValueErrorr   r;   r   get_seq_lengthrs   arangerV   r   	unsqueezer
   r   r   r   r   r   )r'   r   rG   r   rH   r   r   rI   rJ   past_seen_tokenscausal_maskrE   rF   decoder_layers                 r+   rq   zOlmoeModel.forward   s`    -t";<YZZ0*$++>O  --i8M!CRC^==?de"\\ "2]5H5H5K"KTaThThN )33A6L(;;'))+%
 & #oom\J![[)H4;;+H+HI 
	M)	$7*) /#-	 	M
	 		-0%++
 	
r,   )NNNNNNN)r-   r.   r/   r    r&   rs   rv   rt   r   FloatTensorboolr   r   r   rq   r0   r1   s   @r+   r   r      s    >{ > .2.204(,26!%26;
##d*;
 t+;
 &&-	;

 ;
 ((4/;
 $;;
 ((4/;
 +,;
 
 ;
r,   r   c                   0     e Zd ZddiZ fdZ fdZ xZS )OlmoeForCausalLMzlm_head.weightzmodel.embed_tokens.weightc                 f    t         |   |       t        |      | _        |j                  | _        y r$   )r%   r&   r   r   num_expertsr   s     r+   r&   zOlmoeForCausalLM.__init__   s*     '
!--r,   c                 "    t        |   di |S )u  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, OlmoeForCausalLM

        >>> model = OlmoeForCausalLM.from_pretrained("allenai/OLMoE-1B-7B-0924")
        >>> tokenizer = AutoTokenizer.from_pretrained("allenai/OLMoE-1B-7B-0924")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        'Hey, are you conscious? Can you talk to me?\nI’m not sure if you’re conscious of this, but I’m'
        ```
        r5   )r%   rq   )r'   super_kwargsr*   s     r+   rq   zOlmoeForCausalLM.forward  s    0 w...r,   )r-   r.   r/   _tied_weights_keysr&   rq   r0   r1   s   @r+   r   r      s    *,GH.
/ /r,   r   )r   r   r   )=__doc__collections.abcr   rs   r    r   r   cache_utilsr   r   
generationr	   masking_utilsr
   modeling_outputsr   modeling_utilsr   r   processing_utilsr   utilsr   r   r   utils.output_capturingr   gemma.modeling_gemmar   llama.modeling_llamar   r   r   r   r   r   mixtral.modeling_mixtralr   r   r   qwen2_moe.modeling_qwen2_moer   configuration_olmoer    
get_loggerr-   loggerr"   r3   r8   r:   rx   rz   Moduler|   r   r   r   r   __all__r5   r,   r+   <module>r      s     $   & . ) / 6 F & @ @ 4 +  X W = , 
		H	%+< +
	/ 		x 	:)^ :)z	> 		( 	#")) # b) b U? U U4 E
 E
 E
P /)?  /F Er,   