
    qis                        d Z ddlZddlZddlmZ ddlmZ ddlmZ ddl	m
Z
mZmZ ddlmZ dd	lmZmZ dd
lmZ ddlmZmZ ddlmZ ddlmZmZ ddlmZ  ej<                  e      Z  G d dejB                        Z" G d dejF                        Z$ G d dejF                        Z% G d de      Z&e G d de             Z'e G d de'             Z( ed       G d de'e             Z)g d Z*y)!zPyTorch XGLM model.    N)nn   )initialization)ACT2FN)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)create_bidirectional_maskcreate_causal_mask)GradientCheckpointingLayer))BaseModelOutputWithPastAndCrossAttentions!CausalLMOutputWithCrossAttentions)PreTrainedModel)auto_docstringlogging   )
XGLMConfigc            
       `     e Zd ZdZd
dededededz  f fdZdej                  f fd	Z	 xZ
S )XGLMScaledWordEmbeddingz\
    This module overrides nn.Embeddings' forward by multiplying with embeddings scale.
    num_embeddingsembedding_dimpadding_idxembed_scaleNc                 6    t         |   |||       || _        y N)super__init__r   )selfr   r   r   r   	__class__s        X/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/xglm/modeling_xglm.pyr   z XGLMScaledWordEmbedding.__init__*   s    D&    	input_idsc                 <    t         |   |      | j                  z  S r   )r   forwardr   )r   r#   r    s     r!   r%   zXGLMScaledWordEmbedding.forward.   s    wy)D,<,<<<r"   )      ?)__name__
__module____qualname____doc__intfloatr   torchTensorr%   __classcell__r    s   @r!   r   r   %   sE    's '3 'S '_dgk_k '= = =r"   r   c            	            e Zd ZdZddedededz  f fdZddedededz  fdZeddedededz  fd	       Z e	j                         dd
e	j                  dz  defd       Z xZS )!XGLMSinusoidalPositionalEmbeddingzDThis module produces sinusoidal positional embeddings of any length.Nnum_positionsr   r   c                     t         |           d| _        || _        || _        || _        | j                  || j                  z   ||       y )N   )r   r   offsetr3   r   r   make_weights)r   r3   r   r   r    s       r!   r   z*XGLMSinusoidalPositionalEmbedding.__init__5   sH    **&-$++5}kRr"   r   c                     | j                  |||      }t        | d      r;|j                  | j                  j                  | j                  j
                        }| j                  d|d       y )NweightsdtypedeviceF)
persistent)get_embeddinghasattrtor9   r;   r<   register_buffer)r   r   r   r   emb_weightss        r!   r7   z.XGLMSinusoidalPositionalEmbedding.make_weights=   s[    ((T4#%..t||/A/A$,,J]J].^KYFr"   c                    |dz  }t        j                  d      |dz
  z  }t        j                  t        j                  |t        j
                        j                         | z        }t        j                  | t        j
                        j                         j                  d      |j                  d      z  }t        j                  t        j                  |      t        j                  |      gd      j                  | d      }|dz  dk(  r-t        j                  |t        j                  | d      gd      }|	d||ddf<   |j                  t        j                               S )	z
        Build sinusoidal embeddings.

        This matches the implementation in tensor2tensor, but differs slightly from the description in Section 3.5 of
        "Attention Is All You Need".
        r5   i'  r   )r;   r   dimN)mathlogr-   exparangeint64r,   	unsqueezecatsincosviewzerosr@   get_default_dtype)r   r   r   half_dimembs        r!   r>   z/XGLMSinusoidalPositionalEmbedding.get_embeddingE   s    !A%hhuoA.iiXU[[AGGISDPQll>=CCEOOPQRUXUbUbcdUeeii338a@EEnVXY1!))S%++na"@AqIC""#CQvve--/00r"   position_idspast_key_values_lengthc                    |j                         \  }}|| j                  z   }d|z   |z   }|| j                  j                  d      kD  r'| j                  || j                  | j
                         | j                  j                  d|j                  d            j                  ||| j                  j                  d         j                         S )Nr5   r   rF   )
sizer6   r9   r7   r   r   index_selectrP   shapedetach)r   rU   rV   bszseq_lenmax_poss         r!   r%   z)XGLMSinusoidalPositionalEmbedding.forwardZ   s    #((*W#dkk1g+ 66T\\&&q))gt'9'94;K;KL||((L,=,=b,ABGGWVZVbVbVhVhikVlmttvvr"   r   )Nr   )r'   r(   r)   r*   r+   r   r7   staticmethodr>   r-   no_gradr.   r%   r/   r0   s   @r!   r2   r2   2   s    NSc S# SCRVJ SG3 Gs GQTW[Q[ G 1c 1# 1CRVJ 1 1( U]]_wELL4$7 wX[ w wr"   r2   c                   P    e Zd ZdZ	 	 	 	 ddedededz  dedz  dedz  dedz  f fd	Z	 	 	 	 	 dd
ej                  dej                  dz  de
dz  dej                  dz  dedej                  dz  deej                  ej                  dz  eej                     dz  f   fdZ xZS )XGLMAttentionz=Multi-headed attention from 'Attention Is All You Need' paperN	embed_dim	num_headsdropout
is_decoderbias	layer_idxc                    t         |           || _        || _        || _        ||z  | _        | j
                  |z  | j                  k7  rt        d| j                   d| d      | j
                  dz  | _        || _        || _	        t        j                  |||      | _        t        j                  |||      | _        t        j                  |||      | _        t        j                  |||      | _        y )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      ࿩rg   )r   r   rc   rd   re   head_dim
ValueErrorscalingrf   rh   r   Lineark_projv_projq_projout_proj)r   rc   rd   re   rf   rg   rh   r    s          r!   r   zXGLMAttention.__init__i   s     	""!Y.MMI%$..8MdnnM]$YKr3  }}d*$"ii	94@ii	94@ii	94@		)YTBr"   hidden_stateskey_value_statespast_key_valuesattention_maskoutput_attentionscache_positionreturnc                 ,   |du}|j                         \  }}	}
|r|j                  d   n|	}| j                  |      | j                  z  }d}|St	        |t
              rA|j                  j                  | j                        }|r|j                  }n|j                  }n|}|r|n|}|rK|I|rGj                  | j                     j                  }|j                  | j                     j                  }n| j                  |      }| j                  |      }|j!                  ||d| j"                        j%                  dd      }|j!                  ||d| j"                        j%                  dd      }|T|s|nd}j'                  ||| j                  d|i      \  }}|r)t	        |t
              rd|j                  | j                  <   || j(                  z  d| j"                  f}|j!                  ||	| j(                  | j"                        j%                  dd      } |j*                  | } |j*                  | } |j*                  | }|j                  d      }t-        j.                  ||j%                  dd            }|j                         || j(                  z  |	|fk7  r/t1        d|| j(                  z  |	|f d	|j                                ||j                         |d|	|fk7  r#t1        d
|d|	|f d	|j                                |j!                  || j(                  |	|      |z   }t-        j2                  |t-        j4                  t-        j6                  |j8                        j:                  |j<                              }|j!                  || j(                  z  |	|      }|j8                  t,        j>                  k(  rNt@        jB                  jE                  |dt,        jF                        jI                  t,        j>                        }n!t@        jB                  jE                  |d      }|r?|j!                  || j(                  |	|      }|j!                  || j(                  z  |	|      }nd}t@        jB                  jK                  || jJ                  | jL                        }t-        j.                  ||      }|j                         || j(                  z  |	| j"                  fk7  r7t1        d|| j(                  |	| j"                  f d	|j                                |j!                  || j(                  |	| j"                        }|j%                  dd      }|j+                  ||	| jN                        }| jQ                  |      }||fS )z#Input shape: Batch x Time x ChannelNr   FrF   r5   rx   Tz$Attention weights should be of size z	, but is z!Attention mask should be of size r<   )rE   r;   rD   ptrainingz `attn_output` should be of size ))rX   rZ   rq   rm   
isinstancer	   
is_updatedgetrh   cross_attention_cacheself_attention_cachelayerskeysvaluesro   rp   rP   rk   	transposeupdaterd   reshaper-   bmmrl   maxtensorfinfor;   minr<   float16r   
functionalsoftmaxfloat32r@   re   r~   rc   rr   )r   rs   rt   ru   rv   rw   rx   is_cross_attentionr\   tgt_len_src_lenquery_statesr   curr_past_key_valuescurrent_states
key_statesvalue_states
proj_shapeattn_weightsattn_weights_reshaped
attn_probsattn_outputs                          r!   r%   zXGLMAttention.forward   s0    .T9',,.Wa/A"((+w {{=1DLL@
&/+>?,77;;DNNK
%+:+P+P(+:+O+O('6$-?)]/"=*-44T^^DIIJ/66t~~FMML^4J;;~6L#gr4==ISSTUWXYJ',,S'2t}}MWWXY[\]L*7It+?+F+Fdnn?OQ_>`,(
L &*_FY*ZAEO..t~~>DNN*B>
#((gt~~t}}U__`acde+|++Z8'Z''4
+|++Z8//!$yyz/C/CAq/IJ3#7'"JJ6dnn8LgW^7_6` a %%'(* 
 %""$a'(BB 7a'8R7SS\]k]p]p]r\st  (,,S$..'7SVddL 99ell5;;|7I7I+J+N+NWcWjWjkL (,,S4>>-A7GTL .==002U]]0[^^_d_l_lmL==0020FL
 %1$5$5c4>>7T[$\!055cDNN6JGU\]L$(!]]**<4<<RVR_R_*`
ii
L9#"6!OO2CRVR_R_3`2a b$$&') 
 "&&sDNNGT]]S!++Aq1 "))#wGmmK0111r"   )g        FTN)NNNFN)r'   r(   r)   r*   r+   r,   boolr   r-   r.   r   tupler%   r/   r0   s   @r!   rb   rb   f   s   G !$"' !%CC C 	C
 4KC TkC $;C@ 15(,.2"'.2s2||s2  ,,-s2 	s2
 t+s2  s2 t+s2 
u||U\\D0%2E2LL	Ms2r"   rb   c                       e Zd Zddef fdZ	 	 	 	 	 	 	 ddej                  dej                  dz  dej                  dz  dej                  dz  dedz  d	edz  d
edz  dej                  dz  dej                  fdZ	 xZ
S )XGLMDecoderLayerNconfigc                 0   t         |           |j                  | _        t	        | j                  |j
                  |j                  d|      | _        |j                  | _        t        |j                     | _        |j                  | _        |j                  rWt	        | j                  |j
                  |j                  d|      | _        t        j                   | j                        | _        t        j                   | j                        | _        t        j&                  | j                  |j(                        | _        t        j&                  |j(                  | j                        | _        t        j                   | j                        | _        y )NT)rc   rd   re   rf   rh   )r   r   d_modelrc   rb   attention_headsattention_dropout	self_attnre   r   activation_functionactivation_fnactivation_dropoutadd_cross_attentionencoder_attnr   	LayerNormencoder_attn_layer_normself_attn_layer_normrn   ffn_dimfc1fc2final_layer_norm)r   r   rh   r    s      r!   r   zXGLMDecoderLayer.__init__   s   &nn,,,,
 ~~#F$>$>?"(";";%% -.. 0000#!D ,.<<+GD($&LL$@!99T^^V^^<99V^^T^^< "T^^ <r"   rs   rv   encoder_hidden_statesencoder_attention_maskru   rw   	use_cacherx   ry   c	                 .   |}	| j                  |      }| j                  |||||      \  }}
t        j                  j	                  || j                  | j
                        }|	|z   }d}|h|}	| j                  |      }| j                  ||||||      \  }}t        j                  j	                  || j                  | j
                        }|	|z   }|}	| j                  |      }| j                  | j                  |            }t        j                  j	                  || j                  | j
                        }| j                  |      }t        j                  j	                  || j                  | j
                        }|	|z   }|f}|r||
|fz  }|S )a  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            encoder_hidden_states (`torch.FloatTensor`):
                cross attention input to the layer of shape `(batch, seq_len, embed_dim)`
            encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            past_key_values (`Cache`): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        )rs   ru   rv   rw   rx   r|   N)rs   rt   rv   ru   rw   rx   )r   r   r   r   re   r~   r   r   r   r   r   r   r   )r   rs   rv   r   r   ru   rw   r   rx   residualself_attn_weightscross_attn_weightsoutputss                r!   r%   zXGLMDecoderLayer.forward  s   2 !11-@ ,0>>'+)/) ,: ,
(( --mt||VZVcVc-d =0 " ,$H 88GM040A0A+!65 /"3- 1B 1-M- MM11-4<<Z^ZgZg1hM$}4M !--m<**488M+BC--mt?V?Vaeanan-o/--mt||VZVcVc-d =0 ")+=>>Gr"   r   )NNNNFTN)r'   r(   r)   r   r   r-   r.   r   r   r%   r/   r0   s   @r!   r   r      s    =z =D /3596:(,).!%.2F||F t+F  %||d2	F
 !&t 3F F  $;F $;F t+F 
Fr"   r   c                   8     e Zd ZU eed<   dZdZdgZ fdZ xZ	S )XGLMPreTrainedModelr   modelTr   c                    t         |   |       t        |t              r_|j	                  |j
                  |j                  z   |j                  |j                        }t        j                  |j                  |       y y r   )r   _init_weightsr   r2   r>   r3   r6   r   r   initcopy_r9   )r   modulerB   r    s      r!   r   z!XGLMPreTrainedModel._init_weightsl  sg    f%f?@ ..$$v}}4f6J6JFL^L^K JJv~~{3	 Ar"   )
r'   r(   r)   r   __annotations__base_model_prefixsupports_gradient_checkpointing_no_split_modulesr   r/   r0   s   @r!   r   r   e  s(    &*#+,4 4r"   r   c                   x    e Zd Zdef fdZe	 	 	 	 	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  d	edz  d
ej                  dz  de	dz  de	dz  de	dz  de	dz  dej                  dz  de
ej                     ez  fd       Z xZS )	XGLMModelr   c           	         t         |   |       |j                  | _        |j                  | _        |j                  | _        |j                  | _        |j                  rt        j                  |j                        nd}t        |j                  |j                  | j
                  |      | _        t        |j                  |j                  |j                        | _        t#        j$                  t'        |j(                        D cg c]  }t+        ||       c}      | _        t#        j.                  |j                        | _        d| _        | j5                          y c c}w )Nr&   )r   )rh   F)r   r   re   	layerdroppad_token_idr   max_position_embeddingsmax_target_positionsscale_embeddingrG   sqrtr   r   
vocab_sizeembed_tokensr2   embed_positionsr   
ModuleListrange
num_layersr   r   r   
layer_normgradient_checkpointing	post_init)r   r   r   ir    s       r!   r   zXGLMModel.__init__w  s    ~~))!..$*$B$B!393I3Idii/s3v~~t/?/?[
  A**NN 

 mmTYZ`ZkZkTl$mq%5f%J$mn,,v~~6&+# %ns   
E&Nr#   rv   rU   r   r   ru   inputs_embedsr   rw   output_hidden_statesreturn_dictrx   ry   c                 4   |	|	n| j                   j                  }	|
|
n| j                   j                  }
||n| j                   j                  }||n| j                   j                  }||t        d      |8| j                  ||       |j                         }|j                  d|d         }n!||j                         dd }nt        d      || j                  |      }| j                  r%| j                  r|rt        j                  d       d}|rd|b|| j                   j                  r4t        t!        | j                         t!        | j                               nt!        | j                         }||j#                         nd}|2t%        j&                  |||j(                  d	   z   |j*                  
      }t-        | j                   ||||      }|Vt%        j&                  ||d   |z   t$        j.                  ||j*                  n|j*                        }|j1                  d      }||t3        | j                   |||      }|| j5                  ||      j7                  |j*                        z   }t8        j:                  j=                  |t?        | j<                        | j                        }|
rdnd}|	rdnd}|	r|dnd}tA        | jB                        D ]k  \  }}|
r||fz  }| j                  r%t%        jD                  g       }|| jF                  k  r? |||||||	||      }|d   }|	sW||d	   fz  }|c||d   fz  }m | jI                  |      }|
r||fz  }|stK        d |||||fD              S tM        |||||      S )a  
        encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention of
            the decoder.
        encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
            Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
            selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        NzDYou cannot specify both input_ids and inputs_embeds at the same timerF   z5You have to specify either input_ids or inputs_embedsz_`use_cache = True` is incompatible with gradient checkpointing`. Setting `use_cache = False`...F)r   r   r   r{   )r   r   rv   rx   ru   r:   )r   r   rv   r   r|    )r   ru   rw   r   rx   r5   c              3   $   K   | ]  }|| 
 y wr   r   ).0vs     r!   	<genexpr>z$XGLMModel.forward.<locals>.<genexpr>  s      = s   )last_hidden_stateru   rs   
attentionscross_attentions)'r   rw   r   r   use_return_dictrl   %warn_if_padding_and_no_attention_maskrX   rP   r   r   r~   loggerwarning_onceis_encoder_decoderr	   r   get_seq_lengthr-   rJ   rZ   r<   r   longrL   r   r   r@   r   r   re   r,   	enumerater   randr   r   r   r   )r   r#   rv   rU   r   r   ru   r   r   rw   r   r   rx   kwargsinput_shaperV   rs   all_hidden_statesall_self_attnsall_cross_attentionsidxdecoder_layerdropout_probabilitylayer_outputss                           r!   r%   zXGLMModel.forward  s   < 2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	%0%<k$++B]B]  ]%>cdd"66y.Q#..*K!r;r?;I&',,.s3KTUU  --i8M&&4==##u "	 0 )48V8V $L$DlZ^ZeZeFfg!5  FUE`!?!?!Afg!"\\&(>ATATUVAW(W`m`t`tN ,;;'))+
  <<&B"88jj+4+@y''mFZFZ	L (11!4L !,1G1S%>{{+5&;	&" &(<(<\Ka(b(e(e  )
 
 --muT\\?R]a]j]j-k #7BD0d&7<Q<]rdh"+DKK"8 	@C#!m%55!}}&+jjn#&7)%'= /"3#-	M *!,M =#3"55(4(]1-=,??(3	@6 6  -!11 ':K^]qr  
 9+++%1
 	
r"   )NNNNNNNNNNNN)r'   r(   r)   r   r   r   r-   r.   r   r   r   r   r%   r/   r0   s   @r!   r   r   u  s:   z 0  *..2,0596:(,-1!%)-,0#'.2Y
<<$&Y
 t+Y
 llT)	Y

  %||d2Y
 !&t 3Y
 Y
 ||d*Y
 $;Y
  $;Y
 #TkY
 D[Y
 t+Y
 
u||	H	HY
 Y
r"   r   z
    The XGLM Model transformer with a language modeling head on top (linear layer with weights tied to the input
    embeddings).
    )custom_introc            !           e Zd ZdZddiZ fdZe	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  d	ej                  dz  d
ej                  dz  de	dz  dej                  dz  dej                  dz  de
dz  de
dz  de
dz  de
dz  dej                  dz  deej                  z  deej                     ez  fd       Z xZS )XGLMForCausalLMr   zlm_head.weightzmodel.embed_tokens.weightc                     t         |   |       t        |      | _        t	        j
                  |j                  |j                  d      | _        | j                          y )NFrj   )
r   r   r   r   r   rn   hidden_sizer   lm_headr   )r   r   r    s     r!   r   zXGLMForCausalLM.__init__6  sH     v&
yy!3!3V5F5FUS 	r"   Nr#   rv   rU   r   r   ru   r   labelsr   rw   r   r   rx   logits_to_keepry   c                 x   |
|
n| j                   j                  }
||n| j                   j                  }||n| j                   j                  }| j	                  ||||||||	|
|||      }|d   }t        |t              rt        | d      n|}| j                  |dd|ddf         }d}|? | j                  ||f| j                   j                  | j                   j                  d|}|s|f|dd z   }||f|z   S |S t        |||j                  |j                  |j                  |j                         S )ai  
        encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention of
            the decoder.
        encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
            Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
            selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        N)r#   rv   rU   r   r   ru   r   r   rw   r   r   rx   r   )r   r   r   )losslogitsru   rs   r   r   )r   rw   r   r   r   r   r+   slicer  loss_functionr   r   r   ru   rs   r   r   )r   r#   rv   rU   r   r   ru   r   r  r   rw   r   r   rx   r  r   r   rs   slice_indicesr  r  outputs                         r!   r%   zXGLMForCausalLM.forward>  s   J 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B] **)%"7#9+'/!5#)  
  
8B>SV8W~ot4]kmA}a,?@A%4%%  ;;11![[55	
 D Y,F'+'7D7V#CVC0#33!//))$55
 	
r"   )NNNNNNNNNNNNNr   )r'   r(   r)   r   _tied_weights_keysr   r   r-   r.   r   r   r+   r   r   r%   r/   r0   s   @r!   r   r   ,  su     *,GH  *..2,0596:(,-1&*!%)-,0#'.2-.T
<<$&T
 t+T
 llT)	T

  %||d2T
 !&t 3T
 T
 ||d*T
 t#T
 $;T
  $;T
 #TkT
 D[T
 t+T
 ell*T
" 
u||	@	@#T
 T
r"   r   )r   r   r   )+r*   rG   r-   r    r   r   activationsr   cache_utilsr   r   r	   
generationr
   masking_utilsr   r   modeling_layersr   modeling_outputsr   r   modeling_utilsr   utilsr   r   configuration_xglmr   
get_loggerr'   r   	Embeddingr   Moduler2   rb   r   r   r   r   __all__r   r"   r!   <module>r     s        & ! C C ) J 9 l - , * 
		H	%
=bll 
=1w		 1whS2BII S2lf1 fR 4/ 4 4 s
# s
 s
l a
)? a
a
H Br"   