
from collections.abc import Callable

import torch
from torch import nn

from transformers.utils.generic import TransformersKwargs

from ...cache_utils import Cache
from ...modeling_rope_utils import RopeParameters
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS
from ...processing_utils import Unpack
from ...utils import logging
from ..llama.modeling_llama import LlamaPreTrainedModel, LlamaRMSNorm, eager_attention_forward
from ..olmo.configuration_olmo import OlmoConfig
from ..olmo.modeling_olmo import (
    OlmoAttention,
    OlmoDecoderLayer,
    OlmoForCausalLM,
    OlmoModel,
    OlmoRotaryEmbedding,
    apply_rotary_pos_emb,
)


logger = logging.get_logger(__name__)


class Olmo2Config(OlmoConfig):
    r"""
    This is the configuration class to store the configuration of a [`Olmo2Model`]. It is used to instantiate an OLMo2
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of the [allenai/Olmo2-7B-1124-hf](https://huggingface.co/allenai/Olmo2-7B-1124-hf).

    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PreTrainedConfig`] for more information.


    Args:
        vocab_size (`int`, *optional*, defaults to 50304):
            Vocabulary size of the Olmo2 model. Defines the number of different tokens that can be represented by the
            `input_ids` passed when calling [`Olmo2Model`].
        hidden_size (`int`, *optional*, defaults to 4096):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 11008):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer decoder.
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer in the Transformer decoder.
        num_key_value_heads (`int`, *optional*):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each grouped key/value head should be constructed
            by mean-pooling all the original heads within that group; a sketch of this pooling follows the argument
            list below. For more details, check out [this
            paper](https://huggingface.co/papers/2305.13245). If it is not specified, it defaults to
            `num_attention_heads`.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 2048):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        pad_token_id (`int`, *optional*, defaults to 1):
            Padding token id.
        bos_token_id (`int`, *optional*):
            Beginning of stream token id.
        eos_token_id (`int`, *optional*, defaults to 50279):
            End of stream token id.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether to tie weight embeddings
        rope_parameters (`RopeParameters`, *optional*):
            Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain
            a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE
            with longer `max_position_embeddings`; see the final example below.
        attention_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the rms normalization layers.
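
    When converting a multi-head checkpoint to a GQA checkpoint (see `num_key_value_heads` above), each group of
    key/value heads is mean-pooled into a single head. A minimal sketch of that pooling, with purely illustrative
    shapes and tensor names (this is not a transformers API):

    ```python
    >>> import torch

    >>> # 32 original KV heads of dim 128, grouped 4-to-1 into 8 GQA heads
    >>> kv_heads = torch.randn(32, 128, 4096)
    >>> kv_heads_gqa = kv_heads.view(8, 4, 128, 4096).mean(dim=1)
    >>> kv_heads_gqa.shape
    torch.Size([8, 128, 4096])
    ```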

    ```python
    >>> from transformers import Olmo2Model, Olmo2Config

    >>> # Initializing an Olmo2 7B style configuration
    >>> configuration = Olmo2Config()

    >>> # Initializing a model from the Olmo2 7B style configuration
    >>> model = Olmo2Model(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
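
    The `rope_parameters` argument can be passed at construction time in the same way. A minimal sketch, assuming
    the `rope_theta` key described above and a `"rope_type"` of `"default"` (an assumption; scaling keys are
    optional):

    ```python
    >>> # Initializing a configuration with a custom RoPE base frequency
    >>> configuration = Olmo2Config(rope_parameters={"rope_type": "default", "rope_theta": 500000.0})
    ```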
    olmo2colwise_gather_outputrowwise_split_inputcolwiserowwise)zlayers.*.self_attn.q_projzlayers.*.self_attn.k_projzlayers.*.self_attn.v_projzlayers.*.self_attn.o_projzlayers.*.mlp.gate_projzlayers.*.mlp.up_projzlayers.*.mlp.down_proj	input_idsinputs_embedshidden_statesattention_mask)embed_tokenslayersnormN
vocab_sizehidden_sizeintermediate_sizenum_hidden_layersnum_attention_headsnum_key_value_heads
hidden_actmax_position_embeddingsinitializer_range	use_cachepad_token_idbos_token_ideos_token_idtie_word_embeddingsrope_parametersattention_biasattention_dropoutrms_norm_epsc                     t        |   di d|d|d|d|d|d|d|d|d	|	d
|
d|d|d|d|d|d|d|| || _        | `y )Nr$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4    )super__init__r5   clip_qkv)selfr$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   kwargs	__class__s                       Y/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/olmo2/modular_olmo2.pyr9   zOlmo2Config.__init__   s    , 	 	
!	
#	
 0	
 0		

 !4	
 !4	
 "	
 %<	
 0	
  	
 &	
 &	
 &	
 !4	
 ,	
  *!	
" 0%	
* )M    )i  i   i +      r@   Nsilui   g{Gz?T   Nig  FNF        gh㈵>)__name__
__module____qualname____doc__
model_typebase_model_tp_planbase_model_pp_planintstrfloatboolr   dictr9   __classcell__r=   s   @r>   r   r   /   s   EN J%<%<%<%:"+ )"+ &(9:#%568IJ!"_$56 "'"&(-(**,*.!'.2*.!%#$#'#(+0MQ&+*-#'',$J, 4Z, :	,
 :, !4Z, !4Z, $J, "%t, !4<, $;, Dj, Dj, Dj, "D[,  ($sN/B*CCdJ!," t#,$ !4<%,& Dj', ,r?   r   c                       e Zd Zd Zy)Olmo2RMSNormc                 "   |j                   }|j                  t        j                        }|j	                  d      j                  dd      }|t        j                  || j                  z         z  }| j                  |z  j                  |      S )Nr   T)keepdim)	dtypetotorchfloat32powmeanrsqrtvariance_epsilonweight)r;   r   input_dtypevariances       r>   forwardzOlmo2RMSNorm.forward   sy    #))%((7 $$Q',,R,>%Ht?T?T4T(UUm+//<<r?   N)rD   rE   rF   rb   r7   r?   r>   rS   rS      s    =r?   rS   c                       e Zd Zy)Olmo2RotaryEmbeddingNrD   rE   rF   r7   r?   r>   rd   rd          r?   rd   c                     | dd| j                   d   dz  f   }| d| j                   d   dz  df   }t        j                  | |fd      S )z*Rotates half the hidden dims of the input..NrU   r   )dim)shaperY   cat)xx1x2s      r>   rotate_halfrn      sZ    	
3"!''"+"""	#B	
3q ""	#B99rc2YB''r?   c                       e Zd Zddededz  f fdZ	 	 ddej                  deej                  ej                  f   dej                  dz  de	dz  d	ej                  dz  d
ee   deej                  ej                  dz  f   fdZ xZS )Olmo2AttentionNconfig	layer_idxc                     t         |   ||       t        |j                  | j                  z  |j
                        | _        t        |j                  | j                  z  |j
                        | _        y )Nrr   )	r8   r9   rS   r(   head_dimr5   q_normr)   k_normr;   rq   rr   r=   s      r>   r9   zOlmo2Attention.__init__   s[    95"6#=#=#MvObObc"6#=#=#MvObObcr?   r   position_embeddingsr    past_key_valuescache_positionr<   returnc                 `   |j                   d d }g |d| j                  }| j                  | j                  |            }	| j	                  | j                  |            }
| j                  |      }|	j                  |      j                  dd      }	|
j                  |      j                  dd      }
|j                  |      j                  dd      }|\  }}t        |	|
||      \  }	}
|'|||d}|j                  |
|| j                  |      \  }
}t        j                  | j                  j                  t               } || |	|
||f| j"                  sdn| j$                  | j&                  d|\  }} |j(                  g |d j+                         }| j-                  |      }||fS )NrU   rB   r   )sincosr{   rC   )dropoutscaling)ri   ru   rv   q_projrw   k_projv_projview	transposer   updaterr   r   get_interfacerq   _attn_implementationr   trainingr4   r   reshape
contiguouso_proj)r;   r   ry   r    rz   r{   r<   input_shapehidden_shapequery_states
key_statesvalue_statesr   r~   cache_kwargsattention_interfaceattn_outputattn_weightss                     r>   rb   zOlmo2Attention.forward   s    $))#2.88b8$--8{{4;;}#=>[[]!;<
{{=1#((6@@AF__\2<<QB
#((6@@AF&S#7jRUWZ#[ j&#&snUL'6'='=j,X\XfXfht'u$J(?(M(MKK,,.E)
 %8	%
  $}}C$2H2HLL	%
 	%
!\ *k));;;;FFHkk+.L((r?   )N)NN)rD   rE   rF   r   rK   r9   rY   Tensortupler   
LongTensorr	   r   rb   rP   rQ   s   @r>   rp   rp      s    d{ dsTz d )-26-)||-) #5<<#=>-) t+	-)
 -) ((4/-) +,-) 
u||U\\D00	1-)r?   rp   c                   "    e Zd Zdedef fdZ	 	 	 	 	 	 ddej                  dej                  dz  dej                  dz  de	dz  d	e
dz  d
ej                  dz  deej                  ej                  f   dz  dee   dej                  fdZ xZS )Olmo2DecoderLayerrq   rr   c                     t         |   ||       t        |j                  |j                        | _        t        |j                  |j                        | _        t        ||      | _        | `	y )Nrt   eps)rq   rr   )
r8   r9   rS   r%   r5   post_attention_layernormpost_feedforward_layernormrp   	self_attninput_layernormrx   s      r>   r9   zOlmo2DecoderLayer.__init__	  s_    95(4V5G5GVM`M`(a%*6v7I7IvObOb*c''vK r?   Nr   r    position_idsrz   r-   r{   ry   r<   r|   c                     |}	 | j                   d|||||||d|\  }}
| j                  |      }|	|z   }|}	| j                  |      }| j                  |      }|	|z   }|S )N)r   r    r   rz   r-   r{   ry   r7   )r   r   mlpr   )r;   r   r    r   rz   r-   r{   ry   r<   residual_s              r>   rb   zOlmo2DecoderLayer.forward  s     !)4>> 	
')%+) 3	
 	
q 55mD =0 !/77F =0r?   )NNNFNN)rD   rE   rF   r   rK   r9   rY   r   r   r   rN   r   r	   r   rb   rP   rQ   s   @r>   r   r     s    !{ !s ! /304(,!&26HL|| t+ &&-	
  $; ((4/ #5<<#=>E +, 
r?   r   c                       e Zd Zy)Olmo2PreTrainedModelNre   r7   r?   r>   r   r   1  rf   r?   r   c                   $     e Zd Zdef fdZ xZS )
Olmo2Modelrq   c           	         t         |   |       t        |j                  |j                        | _        t        j                  t        |j                        D cg c]  }t        ||       c}      | _        y c c}w )Nr   )r8   r9   rS   r%   r5   r#   nn
ModuleListranger'   r   r"   rx   s      r>   r9   zOlmo2Model.__init__8  s^      !3!39L9LM	mmCHIaIaCbcivy1c
cs   A=)rD   rE   rF   r   r9   rP   rQ   s   @r>   r   r   7  s    
{ 
 
r?   r   c                       e Zd Zy)Olmo2ForCausalLMNre   r7   r?   r>   r   r   A  rf   r?   r   )r   r   r   r   )+collections.abcr   rY   torch.nnr   transformers.utils.genericr   cache_utilsr   modeling_rope_utilsr   modeling_utilsr   processing_utilsr	   utilsr
   llama.modeling_llamar   r   r   olmo.configuration_olmor   olmo.modeling_olmor   r   r   r   r   r   
get_loggerrD   loggerr   rS   rd   rn   rp   r   r   r   r   __all__r7   r?   r>   <module>r      s   ( %   9   1 5 &  ^ ^ 0  
		H	%D* DR=< =	. 	(3)] 3)r&( &R	/ 	
 
	 	r?   