
    qi                        d dl Z d dlZd dlmZ ddlmZ ddlmZmZm	Z	 ddl
mZ ddlmZmZmZmZmZ dd	lmZ  ej(                  e      Z G d
 dej.                        Z G d de      Z G d de      Zd Zd!dZ G d de      Z G d de      Z G d de      Z G d dee      Z  G d de      Z! G d de      Z" G d de	      Z#g d Z$y)"    N   )logging   )GemmaForCausalLMGemmaForSequenceClassificationGemmaForTokenClassification)GraniteAttention)LlamaDecoderLayerLlamaMLP
LlamaModelLlamaPreTrainedModelLlamaRotaryEmbedding   )HeliumConfigc                   ,     e Zd Zd fd	Zd Zd Z xZS )HeliumRMSNormc                     t         |           t        j                  t	        j
                  |            | _        || _        y N)super__init__nn	Parametertorchonesweightvariance_epsilon)selfhidden_sizeeps	__class__s      [/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/helium/modular_helium.pyr   zHeliumRMSNorm.__init__   s/    ll5::k#:; #    c                 \   |j                   }|j                  t        j                        }|j	                  d      j                  dd      }|t        j                  || j                  z         z  }| j                  j                  t        j                        |z  j                  |      S )Nr   T)keepdim)	dtypetor   float32powmeanrsqrtr   r   )r   hidden_statesinput_dtypevariances       r!   forwardzHeliumRMSNorm.forward$   s    #))%((7 $$Q',,R,>%Ht?T?T4T(UUu}}-=AA+NNr"   c                 ^    t        | j                  j                         d| j                   S )Nz, eps=)tupler   shaper   )r   s    r!   
extra_reprzHeliumRMSNorm.extra_repr+   s*    ))*+6$2G2G1HIIr"   )gư>)__name__
__module____qualname__r   r/   r3   __classcell__r    s   @r!   r   r      s    $
OJr"   r   c                       e Zd Zy)HeliumRotaryEmbeddingNr4   r5   r6    r"   r!   r:   r:   /       r"   r:   c                       e Zd Zy)	HeliumMLPNr;   r<   r"   r!   r?   r?   3   r=   r"   r?   c                 |    | ddddf   }| ddddf   }t        j                  | |fd      j                  d      S )	z*Rotates half the hidden dims of the input..r   Nr   r   r$   dim)r   stackflatten)xx1x2s      r!   rotate_halfrI   7   sJ    	
319B	
319B;;Ryb)11"55r"   c                 F   |j                  |      }|j                  |      }|dd|j                  d   dz  f   j                  dd      }|dd|j                  d   dz  f   j                  dd      }| |z  t        |       |z  z   }||z  t        |      |z  z   }||fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    .Nr$   r   rA   )	unsqueezer2   repeat_interleaverI   )qkcossinunsqueeze_dimq_embedk_embeds          r!   apply_rotary_pos_embrT   >   s    $ --
&C
--
&C c'SYYr]a'''
(
:
:1"
:
EC
c'SYYr]a'''
(
:
:1"
:
EC3w;q>C/0G3w;q>C/0GGr"   c                   0     e Zd Zddededz  f fdZ xZS )HeliumAttentionNconfig	layer_idxc                     t         |   ||       t        j                  |j                  |j                  d      | _        dt        j                  | j                        z  | _	        y )NF)biasr   )
r   r   r   Linearr   o_projmathsqrthead_dimscalingr   rW   rX   r    s      r!   r   zHeliumAttention.__init__^   sK    +ii 2 2F4F4FUS499T]]33r"   r   r4   r5   r6   r   intr   r7   r8   s   @r!   rV   rV   ]   s    4| 4d
 4 4r"   rV   c                   0     e Zd Zddededz  f fdZ xZS )HeliumDecoderLayerNrW   rX   c                     t         |   ||       t        |      | _        t	        |j
                  |j                        | _        t	        |j
                  |j                        | _        y )Nr   )	r   r   r?   mlpr   r   rms_norm_epsinput_layernormpost_attention_layernormra   s      r!   r   zHeliumDecoderLayer.__init__e   sT    +V$,V-?-?VEXEXY(5f6H6HfNaNa(b%r"   r   rb   r8   s   @r!   re   re   d   s#    c| cd
 c cr"   re   c                       e Zd Zy)HeliumPreTrainedModelNr;   r<   r"   r!   rm   rm   m   r=   r"   rm   c                   $     e Zd Zdef fdZ xZS )HeliumModelrW   c           	      2   t         |   |       t        j                  t	        |j
                        D cg c]  }t        ||       c}      | _        t        |j                  |j                        | _        d| _        | j                          y c c}w )Nrg   F)r   r   r   
ModuleListrangenum_hidden_layersre   layersr   r   ri   normgradient_checkpointing	post_initra   s      r!   r   zHeliumModel.__init__r   sw     mmDI&JbJbDcdy	2d
 "&"4"4&:M:MN	&+# 	 es   B)r4   r5   r6   r   r   r7   r8   s   @r!   ro   ro   q   s    	| 	 	r"   ro   c                       e Zd Zy)HeliumForCausalLMNr;   r<   r"   r!   ry   ry   ~   r=   r"   ry   c                       e Zd Zy)HeliumForSequenceClassificationNr;   r<   r"   r!   r{   r{      r=   r"   r{   c                       e Zd Zy)HeliumForTokenClassificationNr;   r<   r"   r!   r}   r}      r=   r"   r}   )rm   ro   ry   r{   r}   )r   )%r]   r   torch.nnr   utilsr   gemma.modeling_gemmar   r   r   granite.modeling_graniter	   llama.modeling_llamar
   r   r   r   r   configuration_heliumr   
get_loggerr4   loggerModuler   r:   r?   rI   rT   rV   re   rm   ro   ry   r{   r}   __all__r<   r"   r!   <module>r      s        p p 7 v v . 
		H	%JBII J"	0 		 	6>4& 4c* c	0 	
' 
	( 		&D 		#> 	r"   