
from collections.abc import Callable

import torch
from torch import nn

from transformers.utils.generic import TransformersKwargs

from ...cache_utils import Cache
from ...modeling_rope_utils import RopeParameters
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS
from ...processing_utils import Unpack
from ...utils import logging
from ..llama.modeling_llama import LlamaPreTrainedModel, LlamaRMSNorm, eager_attention_forward
from ..olmo.configuration_olmo import OlmoConfig
from ..olmo.modeling_olmo import (
    OlmoAttention,
    OlmoDecoderLayer,
    OlmoForCausalLM,
    OlmoModel,
    OlmoRotaryEmbedding,
    apply_rotary_pos_emb,
)


logger = logging.get_logger(__name__)


class Olmo2Config(OlmoConfig):
    r"""
    This is the configuration class to store the configuration of a [`Olmo2Model`]. It is used to instantiate an OLMo2
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of the [allenai/Olmo2-7B-1124-hf](https://huggingface.co/allenai/Olmo2-7B-1124-hf).

    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PreTrainedConfig`] for more information.


    Args:
        vocab_size (`int`, *optional*, defaults to 50304):
            Vocabulary size of the Olmo2 model. Defines the number of different tokens that can be represented by the
            `input_ids` passed when calling [`Olmo2Model`].
        hidden_size (`int`, *optional*, defaults to 4096):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 11008):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer decoder.
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer in the Transformer decoder.
        num_key_value_heads (`int`, *optional*):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each grouped key/value head should be constructed
            by mean-pooling all the original heads within that group; a sketch of this pooling follows the argument
            list below. For more details, check out [this
            paper](https://huggingface.co/papers/2305.13245). If it is not specified, it defaults to
            `num_attention_heads`.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 2048):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        pad_token_id (`int`, *optional*, defaults to 1):
            Padding token id.
        bos_token_id (`int`, *optional*):
            Beginning of stream token id.
        eos_token_id (`int`, *optional*, defaults to 50279):
            End of stream token id.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether to tie weight embeddings
        rope_parameters (`RopeParameters`, *optional*):
            Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain
            a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE
            with longer `max_position_embeddings`; see the final example below.
        attention_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the rms normalization layers.
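
    When converting a multi-head checkpoint to a GQA checkpoint (see `num_key_value_heads` above), each group of
    key/value heads is mean-pooled into a single head. A minimal sketch of that pooling, with purely illustrative
    shapes and tensor names (this is not a transformers API):

    ```python
    >>> import torch

    >>> # 32 original KV heads of dim 128, grouped 4-to-1 into 8 GQA heads
    >>> kv_heads = torch.randn(32, 128, 4096)
    >>> kv_heads_gqa = kv_heads.view(8, 4, 128, 4096).mean(dim=1)
    >>> kv_heads_gqa.shape
    torch.Size([8, 128, 4096])
    ```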

    ```python
    >>> from transformers import Olmo2Model, Olmo2Config

    >>> # Initializing an Olmo2 7B style configuration
    >>> configuration = Olmo2Config()

    >>> # Initializing a model from the Olmo2 7B style configuration
    >>> model = Olmo2Model(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
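
    The `rope_parameters` argument can be passed at construction time in the same way. A minimal sketch, assuming
    the `rope_theta` key described above and a `"rope_type"` of `"default"` (an assumption; scaling keys are
    optional):

    ```python
    >>> # Initializing a configuration with a custom RoPE base frequency
    >>> configuration = Olmo2Config(rope_parameters={"rope_type": "default", "rope_theta": 500000.0})
    ```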
    olmo2colwise_gather_outputrowwise_split_inputcolwiserowwise)zlayers.*.self_attn.q_projzlayers.*.self_attn.k_projzlayers.*.self_attn.v_projzlayers.*.self_attn.o_projzlayers.*.mlp.gate_projzlayers.*.mlp.up_projzlayers.*.mlp.down_proj	input_idsinputs_embedshidden_statesattention_mask)embed_tokenslayersnormN
vocab_sizehidden_sizeintermediate_sizenum_hidden_layersnum_attention_headsnum_key_value_heads
hidden_actmax_position_embeddingsinitializer_range	use_cachepad_token_idbos_token_ideos_token_idtie_word_embeddingsrope_parametersattention_biasattention_dropoutrms_norm_epsc                     t        |   di d|d|d|d|d|d|d|d|d	|	d
|
d|d|d|d|d|d|d|| || _        | `y )Nr$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4    )super__init__r5   clip_qkv)selfr$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   kwargs	__class__s                       Y/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/olmo2/modular_olmo2.pyr9   zOlmo2Config.__init__   s    , 	 	
!	
#	
 0	
 0		

 !4	
 !4	
 "	
 %<	
 0	
  	
 &	
 &	
 &	
 !4	
 ,	
  *!	
" 0%	
* )M    )i  i   i +      r@   Nsilui   g{Gz?T   Nig  FNF        gh㈵>)__name__
__module____qualname____doc__
model_typebase_model_tp_planbase_model_pp_planintstrfloatboolr   dictr9   __classcell__r=   s   @r>   r   r   /   s   EN J%<%<%<%:"+ )"+ &(9:#%568IJ!"_$56 "'"&(-(**,*.!'.2*.!%#$#'#(+0MQ&+*-#'',$J, 4Z, :	,
 :, !4Z, !4Z, $J, "%t, !4<, $;, Dj, Dj, Dj, "D[,  ($sN/B*CCdJ!," t#,$ !4<%,& Dj', ,r?   r   c                       e Zd Zd Zy)Olmo2RMSNormc                 "   |j                   }|j                  t        j                        }|j	                  d      j                  dd      }|t        j                  || j                  z         z  }| j                  |z  j                  |      S )Nr   T)keepdim)	dtypetotorchfloat32powmeanrsqrtvariance_epsilonweight)r;   r   input_dtypevariances       r>   forwardzOlmo2RMSNorm.forward   sy    #))%((7 $$Q',,R,>%Ht?T?T4T(UUm+//<<r?   N)rD   rE   rF   rb   r7   r?   r>   rS   rS      s    =r?   rS   c                       e Zd Zy)Olmo2RotaryEmbeddingNrD   rE   rF   r7   r?   r>   rd   rd          r?   rd   c                     | dd| j                   d   dz  f   }| d| j                   d   dz  df   }t        j                  | |fd      S )z*Rotates half the hidden dims of the input..NrU   r   )dim)shaperY   cat)xx1x2s      r>   rotate_halfrn      sZ    	
3"!''"+"""	#B	
3q ""	#B99rc2YB''r?   c                       e Zd Zddededz  f fdZ	 	 ddej                  deej                  ej                  f   dej                  dz  de	dz  d	ej                  dz  d
ee   deej                  ej                  dz  f   fdZ xZS )Olmo2AttentionNconfig	layer_idxc                     t         |   ||       t        |j                  | j                  z  |j
                        | _        t        |j                  | j                  z  |j
                        | _        y )Nrr   )	r8   r9   rS   r(   head_dimr5   q_normr)   k_normr;   rq   rr   r=   s      r>   r9   zOlmo2Attention.__init__   s[    95"6#=#=#MvObObc"6#=#=#MvObObcr?   r   position_embeddingsr    past_key_valuescache_positionr<   returnc                 `   |j                   d d }g |d| j                  }| j                  | j                  |            }	| j	                  | j                  |            }
| j                  |      }|	j                  |      j                  dd      }	|
j                  |      j                  dd      }
|j                  |      j                  dd      }|\  }}t        |	|
||      \  }	}
|'|||d}|j                  |
|| j                  |      \  }
}t        j                  | j                  j                  t               } || |	|
||f| j"                  sdn| j$                  | j&                  d|\  }} |j(                  g |d j+                         }| j-                  |      }||fS )NrU   rB   r   )sincosr{   rC   )dropoutscaling)ri   ru   rv   q_projrw   k_projv_projview	transposer   updaterr   r   get_interfacerq   _attn_implementationr   trainingr4   r   reshape
contiguouso_proj)r;   r   ry   r    rz   r{   r<   input_shapehidden_shapequery_states
key_statesvalue_statesr   r~   cache_kwargsattention_interfaceattn_outputattn_weightss                     r>   rb   zOlmo2Attention.forward   s    $))#2.88b8$--8{{4;;}#=>[[]!;<
{{=1#((6@@AF__\2<<QB
#((6@@AF&S#7jRUWZ#[ j&#&snUL'6'='=j,X\XfXfht'u$J(?(M(MKK,,.E)
 %8	%
  $}}C$2H2HLL	%
 	%
!\ *k));;;;FFHkk+.L((r?   )N)NN)rD   rE   rF   r   rK   r9   rY   Tensortupler   
LongTensorr	   r   rb   rP   rQ   s   @r>   rp   rp      s    d{ dsTz d )-26-)||-) #5<<#=>-) t+	-)
 -) ((4/-) +,-) 
u||U\\D00	1-)r?   rp   c                   "    e Zd Zdedef fdZ	 	 	 	 	 	 ddej                  dej                  dz  dej                  dz  de	dz  d	e
dz  d
ej                  dz  deej                  ej                  f   dz  dee   dej                  fdZ xZS )Olmo2DecoderLayerrq   rr   c                     t         |   ||       t        |j                  |j                        | _        t        |j                  |j                        | _        t        ||      | _        | `	y )Nrt   eps)rq   rr   )
r8   r9   rS   r%   r5   post_attention_layernormpost_feedforward_layernormrp   	self_attninput_layernormrx   s      r>   r9   zOlmo2DecoderLayer.__init__	  s_    95(4V5G5GVM`M`(a%*6v7I7IvObOb*c''vK r?   Nr   r    position_idsrz   r-   r{   ry   r<   r|   c                     |}	 | j                   d|||||||d|\  }}
| j                  |      }|	|z   }|}	| j                  |      }| j                  |      }|	|z   }|S )N)r   r    r   rz   r-   r{   ry   r7   )r   r   mlpr   )r;   r   r    r   rz   r-   r{   ry   r<   residual_s              r>   rb   zOlmo2DecoderLayer.forward  s     !)4>> 	
')%+) 3	
 	
q 55mD =0 !/77F =0r?   )NNNFNN)rD   rE   rF   r   rK   r9   rY   r   r   r   rN   r   r	   r   rb   rP   rQ   s   @r>   r   r     s    !{ !s ! /304(,!&26HL|| t+ &&-	
  $; ((4/ #5<<#=>E +, 
r?   r   c                       e Zd Zy)Olmo2PreTrainedModelNre   r7   r?   r>   r   r   1  rf   r?   r   c                   $     e Zd Zdef fdZ xZS )
Olmo2Modelrq   c           	         t         |   |       t        |j                  |j                        | _        t        j                  t        |j                        D cg c]  }t        ||       c}      | _        y c c}w )Nr   )r8   r9   rS   r%   r5   r#   nn
ModuleListranger'   r   r"   rx   s      r>   r9   zOlmo2Model.__init__8  s^      !3!39L9LM	mmCHIaIaCbcivy1c
cs   A=)rD   rE   rF   r   r9   rP   rQ   s   @r>   r   r   7  s    
{ 
 
r?   r   c                       e Zd Zy)Olmo2ForCausalLMNre   r7   r?   r>   r   r   A  rf   r?   r   )r   r   r   r   )+collections.abcr   rY   torch.nnr   transformers.utils.genericr   cache_utilsr   modeling_rope_utilsr   modeling_utilsr   processing_utilsr	   utilsr
   llama.modeling_llamar   r   r   olmo.configuration_olmor   olmo.modeling_olmor   r   r   r   r   r   
get_loggerrD   loggerr   rS   rd   rn   rp   r   r   r   r   __all__r7   r?   r>   <module>r      s   ( %   9   1 5 &  ^ ^ 0  
		H	%D* DR=< =	. 	(3)] 3)r&( &R	/ 	
 
	 	r?   