
    qi                        d dl mZ d dlZd dlmZ d dlmc mZ ddlm	Z	 ddl
mZ ddlmZ ddlmZ ddlmZ d	d
lmZmZmZmZmZmZmZmZ ddlmZ  ej:                  e      Z G d dej@                        Z! G d de      Z" G d de      Z#ddZ$ G d de      Z% G d de      Z& G d de      Z' G d de      Z(g dZ)y)    )CallableN   )Cache)dynamic_rope_update)ALL_ATTENTION_FUNCTIONS)logging)maybe_autocast   )LlamaAttentionLlamaDecoderLayerLlamaForCausalLMLlamaMLP
LlamaModelLlamaRotaryEmbeddingeager_attention_forwardrotate_half   )
OlmoConfigc                   d     e Zd ZdZdeddf fdZdej                  dej                  fdZ xZ	S )OlmoLayerNormz/LayerNorm but with no learnable weight or bias.hidden_sizereturnNc                 2    t         |           |f| _        y N)super__init__normalized_shape)selfr   	__class__s     W/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/olmo/modular_olmo.pyr   zOlmoLayerNorm.__init__2   s    !,    hidden_statesc                     |j                   }t        j                  |j                  t        j
                        | j                  d d d      j                  |      S )N)dtypegh㈵>)eps)r$   F
layer_normtotorchfloat32r   )r   r"   
orig_dtypes      r    forwardzOlmoLayerNorm.forward6   sO    "((
||M,,5==,A4CXCXZ^`djnorr
 	
r!   )
__name__
__module____qualname____doc__intr   r)   Tensorr,   __classcell__r   s   @r    r   r   /   s4    9/C /D /
U\\ 
ell 
r!   r   c                        e Zd Z fdZ xZS )OlmoMLPc                 J   t         |   |       t        j                  | j                  | j
                  d      | _        t        j                  | j                  | j
                  d      | _        t        j                  | j
                  | j                  d      | _        y )NF)bias)	r   r   nnLinearr   intermediate_size	gate_projup_proj	down_proj)r   configr   s     r    r   zOlmoMLP.__init__>   ss     4#3#3T5K5KRWXyy!1!143I3IPUV4#9#94;K;KRWXr!   )r-   r.   r/   r   r3   r4   s   @r    r6   r6   =   s    Y Yr!   r6   c                   D    e Zd Z ej                         ed               Zy)OlmoRotaryEmbeddingc                    | j                   d d d d f   j                         j                  |j                  d   dd      j	                  |j
                        }|d d d d d f   j                         }t        |j
                  j                  t              r/|j
                  j                  dk7  r|j
                  j                  nd}t        |d      5  |j                         |j                         z  j                  dd      }t        j                  ||fd	      }|j                         | j                  z  }|j                         | j                  z  }	d d d        ||	fS # 1 sw Y   	fS xY w)
Nr   r   mpscpuF)device_typeenabledr
   )dim)inv_freqfloatexpandshaper(   device
isinstancetypestrr	   	transposer)   catcosattention_scalingsin)
r   xposition_idsinv_freq_expandedposition_ids_expandedrF   freqsembrS   rU   s
             r    r,   zOlmoRotaryEmbedding.forwardH   s8    !MM$4-8>>@GGHZHZ[\H]_acdehhijiqiqr ,QaZ 8 > > @'1!((--'E!((--[`J`ahhmmfkUC 	5&,,.1F1L1L1NNYYZ[]^_E))UEN3C'')d444C'')d444C		5
 Cx	5
 Cxs   BE''E3N)r-   r.   r/   r)   no_gradr   r,    r!   r    rA   rA   G   s$    U]]_
  
r!   rA   c                 
   | j                   |j                   }}|j                  |      }|j                  |      }| |z  t        |       |z  z   }||z  t        |      |z  z   }|j                  |      |j                  |      fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    )r$   	unsqueezer   r(   )	qkrS   rU   unsqueeze_dimq_typek_typeq_embedk_embeds	            r    apply_rotary_pos_embrg   W   s|    $ WWaggFF
--
&C
--
&C3w;q>C/0G3w;q>C/0G::fwzz&111r!   c                       e Zd Z	 	 d	dej                  deej                  ej                  f   dej                  dz  dedz  dej                  dz  deej                  ej                  dz  f   fdZy)
OlmoAttentionNr"   position_embeddingsattention_maskpast_key_valuescache_positionr   c                    |j                   d d }g |d| j                  }| j                  |      }	| j                  |      }
| j	                  |      }| j
                  j                  |	j                  | j
                  j                   | j
                  j                         |
j                  | j
                  j                   | j
                  j                         |j                  | j
                  j                   | j
                  j                         |	j                  |      j                  dd      }	|
j                  |      j                  dd      }
|j                  |      j                  dd      }|\  }}t        |	|
||      \  }	}
|'|||d}|j                  |
|| j                  |      \  }
}t        j                  | j
                  j                  t               } || |	|
||f| j"                  sdn| j$                  | j&                  d|\  }} |j(                  g |d j+                         }| j-                  |      }||fS )NrC   )minmaxr   r
   )rU   rS   rm   g        )dropoutscaling)rL   head_dimq_projk_projv_projr?   clip_qkvclamp_viewrQ   rg   update	layer_idxr   get_interface_attn_implementationr   trainingattention_dropoutrr   reshape
contiguouso_proj)r   r"   rj   rk   rl   rm   kwargsinput_shapehidden_shapequery_states
key_statesvalue_statesrS   rU   cache_kwargsattention_interfaceattn_outputattn_weightss                     r    r,   zOlmoAttention.forwardr   s8    $))#2.88b8$--8{{=1[[/
{{=1;;+T[[%9%9$9t{{?S?ST4;;#7#7"7T[[=Q=QRT[[%9%9$9t{{?S?ST#((6@@AF__\2<<QB
#((6@@AF&S#7jRUWZ#[ j&#&snUL'6'='=j,X\XfXfht'u$J(?(M(MKK,,.E)
 %8	%
  $}}C$2H2HLL	%
 	%
!\ *k));;;;FFHkk+.L((r!   )NN)	r-   r.   r/   r)   r2   tupler   
LongTensorr,   r]   r!   r    ri   ri   q   s     )-262)||2) #5<<#=>2) t+	2)
 2) ((4/2) 
u||U\\D00	12)r!   ri   c                   (     e Zd Zdedef fdZ xZS )OlmoDecoderLayerr?   r{   c                     t         |   ||       t        |j                        | _        t        |j                        | _        t        ||      | _        y )N)r?   r{   )r   r   r   r   input_layernormpost_attention_layernormri   	self_attnr   r?   r{   r   s      r    r   zOlmoDecoderLayer.__init__   sF    +,V-?-?@(5f6H6H(I%&f	Jr!   )r-   r.   r/   r   r1   r   r3   r4   s   @r    r   r      s    Kz Kc K Kr!   r   c                   $     e Zd Zdef fdZ xZS )	OlmoModelr?   c           	          t         |   |       t        j                  t	        |j
                        D cg c]  }t        ||       c}      | _        t        |j                        | _
        y c c}w r   )r   r   r9   
ModuleListrangenum_hidden_layersr   layersr   r   normr   s      r    r   zOlmoModel.__init__   s[     mmBGH`H`BabYfi0b
 "&"4"45	 cs   A1)r-   r.   r/   r   r   r3   r4   s   @r    r   r      s    6z 6 6r!   r   c                       e Zd Zy)OlmoForCausalLMN)r-   r.   r/   r]   r!   r    r   r      s    r!   r   )r   r   OlmoPreTrainedModel)r   )*collections.abcr   r)   torch.nnr9   torch.nn.functional
functionalr&   cache_utilsr   modeling_rope_utilsr   modeling_utilsr   utilsr   utils.genericr	   llama.modeling_llamar   r   r   r   r   r   r   r   configuration_olmor   
get_loggerr-   loggerModuler   r6   rA   rg   ri   r   r   r   __all__r]   r!   r    <module>r      s   ( %       6 5  +	 	 	 + 
		H	%
BII 
Yh Y.  243)N 3)lK( K6
 6	& 	r!   