Ë
    «q±iï<  ã                   ó|  — d dl mZ d dlZd dlmZ d dlmZ ddlmZm	Z	 ddl
mZmZ ddlmZmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZmZmZmZmZm Z m!Z!m"Z"  G d„ de«      Z# G d„ de «      Z$ G d„ de«      Z% G d„ de«      Z& G d„ de«      Z' G d„ de«      Z( G d„ de«      Z) G d„ de«      Z*g d¢Z+y) é    )ÚCallableN)ÚTransformersKwargsé   )ÚCacheÚDynamicCache)ÚPreTrainedConfigÚlayer_type_validation)Úcreate_causal_maskÚ!create_sliding_window_causal_mask)ÚBaseModelOutputWithPast)ÚRopeParameters)ÚALL_ATTENTION_FUNCTIONS)ÚUnpacké   )ÚGemma2RotaryEmbedding)ÚOlmo2AttentionÚOlmo2DecoderLayerÚOlmo2ForCausalLMÚ
Olmo2ModelÚOlmo2PreTrainedModelÚOlmo2RMSNormÚapply_rotary_pos_embÚeager_attention_forwardc            *       óp  ‡ — e Zd ZdZdZdgZddddddddœZd	gd
gfddgdgfdgdgfdœZ	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d$dedz  dedz  dedz  dedz  dedz  dedz  de	dz  dedz  de
dz  dedz  dedz  dedz  dedz  dedz  deee	ef   z  dz  dedz  de
dz  d e
dz  d!edz  d"ee	   dz  f(ˆ fd#„Zˆ xZS )%ÚOlmo3Configa  
    This is the configuration class to store the configuration of a [`Olmo3Model`]. It is used to instantiate an OLMo3
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of the [allenai/OLMo-3-0725-1B](https://huggingface.co/allenai/OLMo-3-0725-1B).

    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PreTrainedConfig`] for more information.


    Args:
        vocab_size (`int`, *optional*, defaults to 50304):
            Vocabulary size of the Olmo3 model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`Olmo3Model`]
        hidden_size (`int`, *optional*, defaults to 4096):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 11008):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer decoder.
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer in the Transformer decoder.
        num_key_value_heads (`int`, *optional*):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details, check out [this
            paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to
            `num_attention_heads`.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 2048):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        pad_token_id (`int`, *optional*, defaults to 1):
            Padding token id.
        bos_token_id (`int`, *optional*):
            Beginning of stream token id.
        eos_token_id (`int`, *optional*, defaults to 50279):
            End of stream token id.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether to tie weight embeddings
        rope_parameters (`RopeParameters`, *optional*):
            Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain
            a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE
            with longer `max_position_embeddings`.
        attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the rms normalization layers.
        sliding_window (`int`, *optional*, defaults to 4096):
            Size of the sliding window for sliding window attention.
        layer_types (`list`, *optional*):
            Attention pattern for each layer. Defaults to sliding window attention
            for 3 out of 4 layers, and full attention for every 4th layer.

    ```python
    >>> from transformers import Olmo3Model, Olmo3Config

    >>> # Initializing a Olmo3 7B style configuration
    >>> configuration = Olmo3Config()

    >>> # Initializing a model from the Olmo3 7B style configuration
    >>> model = Olmo3Model(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
    Úolmo3Úpast_key_valuesÚcolwise_gather_outputÚrowwise_split_inputÚcolwiseÚrowwise)zlayers.*.self_attn.q_projzlayers.*.self_attn.k_projzlayers.*.self_attn.v_projzlayers.*.self_attn.o_projzlayers.*.mlp.gate_projzlayers.*.mlp.up_projzlayers.*.mlp.down_projÚ	input_idsÚinputs_embedsÚhidden_statesÚattention_mask)Úembed_tokensÚlayersÚnormNÚ
vocab_sizeÚhidden_sizeÚintermediate_sizeÚnum_hidden_layersÚnum_attention_headsÚnum_key_value_headsÚ
hidden_actÚmax_position_embeddingsÚinitializer_rangeÚ	use_cacheÚpad_token_idÚbos_token_idÚeos_token_idÚtie_word_embeddingsÚrope_parametersÚattention_biasÚattention_dropoutÚrms_norm_epsÚsliding_windowÚlayer_typesc                 ó  •— || _         || _        || _        || _        || _        || _        |€|}|| _        || _        |	| _        |
| _	        || _
        || _        || _        || _        || _        || _        || _        || _        || _        | j$                  €5t'        | j                  «      D cg c]  }|dz   dz  dk7  rdnd‘Œ c}| _        t)        | j$                  | j                  «       || _        t-        ‰| \  di |¤Ž y c c}w )Né   é   r   Úsliding_attentionÚfull_attention© )r)   r0   r*   r+   r,   r-   r.   r/   r1   r2   r8   r9   r6   r3   r4   r5   r:   r;   r<   Úranger	   r7   ÚsuperÚ__init__)Úselfr)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   r;   r<   ÚkwargsÚiÚ	__class__s                          €úY/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/olmo3/modular_olmo3.pyrE   zOlmo3Config.__init__ˆ   s#  ø€ ð0 %ˆŒØ'>ˆÔ$Ø&ˆÔØ!2ˆÔØ!2ˆÔØ#6ˆÔ ð Ð&Ø"5Ðà#6ˆÔ Ø$ˆŒØ!2ˆÔØ"ˆŒØ,ˆÔØ!2ˆÔØ#6ˆÔ Ø(ˆÔØ(ˆÔØ(ˆÔà(ˆÔØ,ˆÔØ&ˆÔØ×ÑÐ#äW\Ð]a×]sÑ]sÓWtö ØRS¨¨A©°¡{°aÒ'7Ñ#Ð=MÑMò ˆDÔô 	˜d×.Ñ.°×0FÑ0FÔGà.ˆÔä‰ÑÑ"˜6Ó"ùò s   Â.D)i€Ä  é   i +  é    rL   NÚsilui   g{®Gáz”?Tr>   NigÄ  FNFç        gñhãˆµøä>rK   N)Ú__name__Ú
__module__Ú__qualname__Ú__doc__Ú
model_typeÚkeys_to_ignore_at_inferenceÚbase_model_tp_planÚbase_model_pp_planÚintÚstrÚfloatÚboolr   ÚdictÚlistrE   Ú__classcell__©rI   s   @rJ   r   r   *   sà  ø„ ñJðX €JØ#4Ð"5Ðà%<Ø%<Ø%<Ø%:Ø"+Ø )Ø"+ñÐð &˜¨Ð(9Ð:Ø#Ð%5Ð6¸Ð8IÐJØ!Ð" _Ð$5Ð6ñÐð "'Ø"&Ø(-Ø(*Ø*,Ø*.Ø!'Ø.2Ø*.Ø!%Ø#$Ø#'Ø#(Ø+0ØMQØ&+Ø*-Ø%)Ø%)Ø(,ñ+9#à˜$‘Jð9#ð ˜4‘Zð9#ð  ™:ð	9#ð
  ™:ð9#ð ! 4™Zð9#ð ! 4™Zð9#ð ˜$‘Jð9#ð "% t¡ð9#ð ! 4™<ð9#ð ˜$‘;ð9#ð ˜D‘jð9#ð ˜D‘jð9#ð ˜D‘jð9#ð " D™[ð9#ð  (¨$¨s°NÐ/BÑ*CÑCÀdÑJð!9#ð" ˜t™ð#9#ð$ ! 4™<ð%9#ð& ˜d‘lð'9#ð( ˜d™
ð)9#ð* ˜#‘Y Ñ%÷+9#ñ 9#ó    r   c                   ó   — e Zd Zy)ÚOlmo3RMSNormN©rO   rP   rQ   rB   r_   rJ   ra   ra   Ä   ó   „ Ør_   ra   c                   ó  ‡ — e Zd Zdedefˆ fd„Z	 	 ddej                  deej                  ej                  f   dej                  dz  de	dz  d	ej                  dz  d
ee   deej                  ej                  dz  f   fd„Zˆ xZS )ÚOlmo3AttentionÚconfigÚ	layer_idxc                 ó    •— t         ‰|   ||¬«       |j                  |   | _        | j                  dk(  r|j                  | _        y d | _        y )N)rg   r@   )rD   rE   r<   Úattention_typer;   ©rF   rf   rg   rI   s      €rJ   rE   zOlmo3Attention.__init__Ë   sL   ø€ Ü‰Ñ˜¨9ÐÔ5Ø$×0Ñ0°Ñ;ˆÔØ7;×7JÑ7JÐNaÒ7a˜f×3Ñ3ˆÕÐgkˆÕr_   Nr$   Úposition_embeddingsr%   r   Úcache_positionrG   Úreturnc                 óv  — |j                   d d }g |¢d‘| j                  ‘­}| j                  | j                  |«      «      }	| j	                  | j                  |«      «      }
| j                  |«      }|	j                  |«      j                  dd«      }	|
j                  |«      j                  dd«      }
|j                  |«      j                  dd«      }|\  }}t        |	|
||«      \  }	}
|'|||dœ}|j                  |
|| j                  |«      \  }
}t        j                  | j                  j                  t         «      } || |	|
||f| j"                  sdn| j$                  | j&                  | j(                  dœ|¤Ž\  }} |j*                  g |¢d‘­Ž j-                  «       }| j/                  |«      }||fS )Néÿÿÿÿr>   r   )ÚsinÚcosrl   rN   )ÚdropoutÚscalingr;   )ÚshapeÚhead_dimÚq_normÚq_projÚk_normÚk_projÚv_projÚviewÚ	transposer   Úupdaterg   r   Úget_interfacerf   Ú_attn_implementationr   Útrainingr9   rs   r;   ÚreshapeÚ
contiguousÚo_proj)rF   r$   rk   r%   r   rl   rG   Úinput_shapeÚhidden_shapeÚquery_statesÚ
key_statesÚvalue_statesrq   rp   Úcache_kwargsÚattention_interfaceÚattn_outputÚattn_weightss                     rJ   ÚforwardzOlmo3Attention.forwardÐ   sÅ  € ð $×)Ñ)¨#¨2Ð.ˆØ8˜Ð8 bÐ8¨$¯-©-Ñ8ˆà—{‘{ 4§;¡;¨}Ó#=Ó>ˆØ—[‘[ §¡¨]Ó!;Ó<ˆ
Ø—{‘{ =Ó1ˆà#×(Ñ(¨Ó6×@Ñ@ÀÀAÓFˆØ—_‘_ \Ó2×<Ñ<¸QÀÓBˆ
Ø#×(Ñ(¨Ó6×@Ñ@ÀÀAÓFˆà&‰ˆˆSÜ#7¸ÀjÐRUÐWZÓ#[Ñ ˆjàÐ&à#&¨sÀnÑUˆLØ'6×'=Ñ'=¸jÈ,ÐX\×XfÑXfÐhtÓ'uÑ$ˆJ˜ä(?×(MÑ(MØK‰K×,Ñ,Ô.Eó)
Ðñ %8ØØØØØð
%
ð  $Ÿ}š}‘C°$×2HÑ2HØ—L‘LØ×.Ñ.ñ
%
ð ñ
%
Ñ!ˆ\ð *k×)Ñ)Ð;¨;Ð;¸Ò;×FÑFÓHˆØ—k‘k +Ó.ˆØ˜LÐ(Ð(r_   )NN)rO   rP   rQ   r   rW   rE   ÚtorchÚTensorÚtupler   Ú
LongTensorr   r   r   r]   r^   s   @rJ   re   re   Ê   sµ   ø„ ðl˜{ð l°sõ lð )-Ø26ñ.)à—|‘|ð.)ð # 5§<¡<°·±Ð#=Ñ>ð.)ð Ÿ™ tÑ+ð	.)ð
  ™ð.)ð ×(Ñ(¨4Ñ/ð.)ð Ð+Ñ,ð.)ð 
ˆu|‰|˜UŸ\™\¨DÑ0Ð0Ñ	1÷.)r_   re   c                   ó   — e Zd Zy)ÚOlmo3DecoderLayerNrb   rB   r_   rJ   r“   r“     rc   r_   r“   c                   ó   — e Zd Zy)ÚOlmo3RotaryEmbeddingNrb   rB   r_   rJ   r•   r•     rc   r_   r•   c                   ó   — e Zd Zy)ÚOlmo3PreTrainedModelNrb   rB   r_   rJ   r—   r—   	  rc   r_   r—   c                   óô   ‡ — e Zd Zdefˆ fd„Z	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dedz  dej                  dz  d	ej                  dz  d
e
dz  dee   defd„Zˆ xZS )Ú
Olmo3Modelrf   c           	      ó&  •— t         ‰|   |«       t        |j                  |j                  ¬«      | _        t        j                  t        |j                  «      D cg c]  }t        ||«      ‘Œ c}«      | _        t        |¬«      | _        y c c}w )N)Úeps©rf   )rD   rE   ra   r*   r:   r(   ÚnnÚ
ModuleListrC   r,   r“   r'   r•   Ú
rotary_embrj   s      €rJ   rE   zOlmo3Model.__init__  so   ø€ Ü‰Ñ˜Ô Ü  ×!3Ñ!3¸×9LÑ9LÔMˆŒ	Ü—m‘mÜCHÈ×IaÑIaÓCbÖc°iÔ˜v yÕ1Òcó
ˆŒô /°fÔ=ˆùò ds   ÁBNr"   r%   Úposition_idsr   r#   rl   r2   rG   rm   c           
      ó°  — |d u |d uz  rt        d«      ‚|€| j                  |«      }|r|€t        | j                  ¬«      }|€E||j	                  «       nd}	t        j                  |j                  d   |j                  ¬«      |	z   }|€|j                  d«      }t        |x}
t        «      s*| j                  |||||dœ}t        d
i |¤Žt        d
i |¤Ždœ}
|}| j                  ||«      }| j                  d | j                  j                    D ](  } ||f|
|j"                  j$                     ||||dœ|¤Ž}Œ* | j'                  |«      }t)        ||¬	«      S )Nz:You must specify exactly one of input_ids or inputs_embedsrœ   r   r>   )Údevice)rf   r#   r%   rl   r   r    )rA   r@   )r%   r    r   rl   rk   )Úlast_hidden_stater   rB   )Ú
ValueErrorr&   r   rf   Úget_seq_lengthrŽ   Úarangert   r¢   Ú	unsqueezeÚ
isinstancer[   r
   r   rŸ   r'   r,   Ú	self_attnri   r(   r   )rF   r"   r%   r    r   r#   rl   r2   rG   Úpast_seen_tokensÚcausal_mask_mappingÚmask_kwargsr$   rk   Údecoder_layers                  rJ   r   zOlmo3Model.forward  sš  € ð ˜Ð -°tÐ";Ò<ÜÐYÓZÐZàÐ Ø*.×*;Ñ*;¸IÓ*FˆMá˜Ð0Ü*°$·+±+Ô>ˆOàÐ!ØCRÐC^˜×=Ñ=Ô?ÐdeÐä—‘˜]×0Ñ0°Ñ3¸M×<PÑ<PÔQÐTdÑdð ð ÐØ)×3Ñ3°AÓ6ˆLô °Ð?Ð-ÄÔFð Ÿ+™+Ø!.Ø"0Ø"0Ø#2Ø ,ñˆKô #5Ñ"C°{Ñ"CÜ%FÑ%UÈÑ%Uñ#Ðð
 &ˆØ"Ÿo™o¨m¸\ÓJÐà!Ÿ[™[Ð)H¨4¯;©;×+HÑ+HÐIò 		ˆMÙ)Øðà2°=×3JÑ3J×3YÑ3YÑZØ)Ø /Ø-Ø$7ñð ñ‰Mð		ð Ÿ	™	 -Ó0ˆÜ&Ø+Ø+ô
ð 	
r_   )NNNNNNN)rO   rP   rQ   r   rE   rŽ   r‘   r   r   ÚFloatTensorrZ   r   r   r   r   r]   r^   s   @rJ   r™   r™     sÐ   ø„ ð>˜{õ >ð .2Ø.2Ø04Ø(,Ø26Ø26Ø!%ñ@
à×#Ñ# dÑ*ð@
ð Ÿ™ tÑ+ð@
ð ×&Ñ&¨Ñ-ð	@
ð
  ™ð@
ð ×(Ñ(¨4Ñ/ð@
ð ×(Ñ(¨4Ñ/ð@
ð ˜$‘;ð@
ð Ð+Ñ,ð@
ð 
!÷@
r_   r™   c                   ó   — e Zd Zy)ÚOlmo3ForCausalLMNrb   rB   r_   rJ   r°   r°   \  rc   r_   r°   )r   r°   r™   r—   ),Úcollections.abcr   rŽ   Útorch.nnr   Útransformers.utils.genericr   Úcache_utilsr   r   Úconfiguration_utilsr   r	   Úmasking_utilsr
   r   Úmodeling_outputsr   Úmodeling_rope_utilsr   Úmodeling_utilsr   Úprocessing_utilsr   Úgemma2.modeling_gemma2r   Úolmo2.modeling_olmo2r   r   r   r   r   r   r   r   r   ra   re   r“   r•   r—   r™   r°   Ú__all__rB   r_   rJ   ú<module>r¾      s­   ðõ %ã Ý å 9ç .ß Jß RÝ 7Ý 1Ý 5Ý &Ý :÷	÷ 	ó 	ôW#Ð"ô W#ôt	<ô 	ô4)^ô 4)ôn	Ð)ô 	ô	Ð0ô 	ô	Ð/ô 	ôI
ô I
ôX	Ð'ô 	òr_   