
    qiiR                     v   d Z ddlZddlZddlmZ ddlmZ ddlmZ ddl	m
Z
mZ ddlmZ dd	lmZ dd
lmZ ddlmZmZ ddlmZ ddlmZmZ ddlmZ  ej8                  e      Zdededej@                  fdZ!dej@                  dej@                  fdZ"dej@                  dej@                  dej@                  dej@                  fdZ# G d dejH                        Z% G d dejH                        Z& G d de      Z'e G d  d!e             Z(e G d" d#e(             Z) ed$%       G d& d'e(e             Z*g d(Z+y))zPyTorch CodeGen model.    N)nn   )initialization)ACT2FN)CacheDynamicCache)GenerationMixin)create_causal_mask)GradientCheckpointingLayer)BaseModelOutputWithPastCausalLMOutputWithPast)PreTrainedModel)auto_docstringlogging   )CodeGenConfignum_posdimreturnc                    ddt        j                  d|dt         j                        |z  z  z  }t        j                  dt        j                  | t         j                        j	                         |      j	                         }t        j
                  t        j                  |      t        j                  |      fd      S )	Ng      ?i'  r      )dtypezi , j -> i jr   r   )torcharangeint64einsumfloatcatsincos)r   r   inv_freqsinusoid_inps       ^/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/codegen/modeling_codegen.pycreate_sinusoidal_positionsr%   (   s    eQQekk JS PQRH<<WEKK0X0^0^0`bjkqqsL99eii-uyy/FGQOO    xc                     | d d d d d d d d df   }| d d d d d d dd df   }t        j                  | |fd      } | j                  d      S )Nr   r   r   )r   stackflatten)r'   x1x2s      r$   rotate_every_twor/   /   sS    	
1aCaC<B	
1aADqD=	BbS"I2&A99R=r&   tensorr    r!   c                     t        j                  |d d d d d d d f   dd      }t        j                  |d d d d d d d f   dd      }| |z  t        |       |z  z   S )Nr   r   )r   repeat_interleaver/   )r0   r    r!   s      r$   apply_rotary_pos_embr3   7   s^    

!
!#aD!m"4a
;C

!
!#aD!m"4a
;CSL-f5;<<r&   c                       e Zd Zd fd	Zd Zd Z	 ddZ	 	 	 	 	 	 ddej                  dz  de	dz  dej                  dz  d	ej                  dz  d
edz  dedz  dej                  dz  deej                  eej                     f   eej                  eej                     eej                  df   f   z  dz  fdZ xZS )CodeGenAttentionNc                 .   t         |           |j                  | _        t	        j
                  |j                        | _        t	        j
                  |j                        | _	        || _
        |-t        j                  d| j                  j                   d       |j                  | _        |j"                  | _        | j                   | j"                  z  | _        | j$                  | j"                  z  | j                   k7  r&t'        d| j                    d| j"                   d      t)        j*                  | j$                        | _        t	        j.                  | j                   | j                   dz  d      | _        t	        j.                  | j                   | j                   d      | _        |j4                  | _        | j4                  xs | j                   | _        | j9                  d	t;        | j                  | j6                        d
       y )NzInstantiating z without passing a `layer_idx` is not recommended and will lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` when creating this class.zEembed_dim must be divisible by num_attention_heads (got `embed_dim`: z and `num_attention_heads`: z).r   F)biasembed_positions)
persistent)super__init__max_position_embeddingsmax_positionsr   Dropout
attn_pdropattn_dropoutresid_pdropresid_dropout	layer_idxloggerwarning_once	__class____name__hidden_size	embed_dimnum_attention_headshead_dim
ValueErrormathsqrt
scale_attnLinearqkv_projout_proj
rotary_dimpos_embd_dimregister_bufferr%   )selfconfigrC   rF   s      r$   r;   zCodeGenAttention.__init__>   s   #;;JJv'8'89ZZ(:(:;" !8!8 9 :, ,  ++#)#=#= $*B*BB==4333t~~EWX\XfXfWg h++/+C+C*DBH  ))DMM2		$..$..12D5Q		$..$..uM ++ OO=t~~:4;M;MtO`O`ans 	 	
r&   c                     |j                  |j                  d d ||z  |fz         }|j                  |j                  d d dz   |j                  dd  z         }|S )Nr)   r*   )r)   )reshapeshape)rV   r'   n_headdim_headmp_numreshapeds         r$   _split_headszCodeGenAttention._split_heads^   s]    99QWWSb\Vv-=x,HHI##AGGCRL5$88>>"#;N$NOr&   c                    t        |j                        dk(  r$|j                  ddddd      j                         }n\t        |j                        dk(  r#|j                  dddd      j                         }n!t	        dt        |j                               |j                         dd	 ||z  fz   }|j                  |      S )
zM
        Merges attn_head_size dim and num_attn_heads dim into n_ctx
           r   r   r   r      z3Input tensor rank should be one of [4, 5], but is: Nr*   )lenrZ   permute
contiguousrL   sizeview)rV   r0   rJ   attn_head_size	new_shapes        r$   _merge_headszCodeGenAttention._merge_headsc   s     v||!^^Aq!Q2==?F!#^^Aq!Q/::<FRSVW]WcWcSdRefggKKM#2&*=*N)PP	{{9%%r&   c                    |j                  t        j                        }|j                  t        j                        }t        j                  ||j	                  dd            }|||z   }|| j
                  z  } t        j                  d      |      }|j                  |j                        }| j                  |      }t        j                  ||      }||fS )Nr)   r*   r   )
tor   float32matmul	transposerO   r   Softmaxr   r@   )rV   querykeyvalueattention_maskattn_weightsattn_outputs          r$   _attnzCodeGenAttention._attnp   s     'ffU]]#||E3==R+@A%'.8L#doo5)rzzb),7#u{{3((6ll<7L((r&   hidden_states
layer_pastrt   position_ids	use_cacheoutput_attentionscache_positionr   .c                 Z   | j                  |      }d}	|j                  |j                  d d |	dfz         }
| j                  | j                  z  |	z  }t        j                  |
|d      \  }}}| j                  || j                  | j                  |	      }| j                  || j                  | j                  |	      }| j                  || j                  | j                  |	      }|j                  dddd      }| j                  }|j                  |j                  k7  r"|j                  |j                        }|| _	        ||   }t        j                  ||j                  d   dz  d      \  }}| j                  |d d d d d d d | j                  f   }|d d d d d d | j                  d f   }|d d d d d d d | j                  f   }|d d d d d d | j                  d f   }t        |||      }t        |||      }t        j                  ||gd      }t        j                  ||gd      }nt        |||      }t        |||      }|j                  dddd      }|j                  dddd      }|K||| j                  |d	}|j                  |j                  |j                         || j"                  |      \  }}| j%                  ||||      \  }}| j'                  || j                  | j                        }| j)                  |      }| j+                  |      }||fS )
Nrb   r)   r   )r]   r   r   r   r   )r    r!   partial_rotation_sizer}   )rQ   rY   rZ   rK   rJ   r   splitr_   rd   r8   devicerl   rS   r3   r   updater   rC   rw   rj   rR   rB   )rV   rx   ry   rt   rz   r{   r|   r}   qkvr]   	qkv_split	local_dimrq   rs   rr   r8   sincosr    r!   k_rotk_passq_rotq_passcache_kwargsrv   ru   s                             r$   forwardzCodeGenAttention.forward   s    mmM*KK		#2&" =>	MMD$<$<<F	!KK	9"Euc!!%)A)A4==Y_!`T%=%=t}}U[\!!%)A)A4==Y_!`aAq)..!!\%8%88-001D1DEO#2D  .;;vv||B'71'<"ES??&1a!24??!223EAq$//"334F!Q#4T__#445E1aDOO$556F(S9E(S9E))UFO4CIIufo26E&sC5C(S9Ekk!Q1%aAq) !)-"0	L $**366-2E2E+Ft~~_klJC %)JJuc5.$Q!\''T5M5Mt}}]mmK0((5L((r&   NNNNFFN)rG   
__module____qualname__r;   r_   rj   rw   r   FloatTensorr   
LongTensorbooltupleTensorr   __classcell__rF   s   @r$   r5   r5   =   s   
@
&$ )8 $(3704!&).26G)((4/G) DLG) ))D0	G)
 &&-G) $;G)  $;G) ((4/G) 	ellE%,,//0
eELL15s9J3KK
L	M
	G)r&   r5   c                   \     e Zd Z fdZdej
                  dz  dej
                  fdZ xZS )
CodeGenMLPc                    t         |           |j                  }t        j                  ||      | _        t        j                  ||      | _        t        |j                     | _	        t        j                  |j                        | _        y r   )r:   r;   n_embdr   rP   fc_infc_outr   activation_functionactr>   rA   dropout)rV   intermediate_sizerW   rI   rF   s       r$   r;   zCodeGenMLP.__init__   se    MM	YYy*;<
ii 19=&445zz&"4"45r&   rx   Nr   c                     | j                  |      }| j                  |      }| j                  |      }| j                  |      }|S r   )r   r   r   r   )rV   rx   s     r$   r   zCodeGenMLP.forward   s@    

=1/M2]3r&   )rG   r   r   r;   r   r   r   r   r   s   @r$   r   r      s,    6U%6%6%= %BSBS r&   r   c                   2    e Zd Zd fd	Z	 	 	 	 	 	 ddej
                  dz  dedz  dej
                  dz  dej                  dz  dedz  dedz  d	ej                  dz  d
e	ej                     e	ej                  e	ej
                  df   f   z  dz  fdZ xZS )CodeGenBlockNc                    t         |           |j                  |j                  nd|j                  z  }t	        j
                  |j                  |j                        | _        t        ||      | _	        t        ||      | _        y )Nrb   eps)r:   r;   n_innerr   r   	LayerNormlayer_norm_epsilonln_1r5   attnr   mlp)rV   rW   rC   	inner_dimrF   s       r$   r;   zCodeGenBlock.__init__   sc    &,nn&@FNNa&--FW	LLF4M4MN	$VY7	i0r&   rx   ry   rt   rz   r{   r|   r}   r   .c           	          |}| j                  |      }| j                  |||||||      \  }	}
| j                  |      }|	|z   |z   }||
fS )N)rx   ry   rt   rz   r{   r|   r}   )r   r   r   )rV   rx   ry   rt   rz   r{   r|   r}   residualattn_outputsru   feed_forward_hidden_statess               r$   r   zCodeGenBlock.forward   sp     !		-0%)YY'!)%/) &/ &
"l &*XXm%<"$'AAHLl**r&   r   r   )rG   r   r   r;   r   r   r   r   r   r   r   r   r   r   s   @r$   r   r      s    1 $(3704!&).26+((4/+ DL+ ))D0	+
 &&-+ $;+  $;+ ((4/+ 
u||	uU\\59J9JC9O3P%PQ	QTX	X+r&   r   c                   @     e Zd ZU eed<   dZdZdgZdZdZ	 fdZ
 xZS )CodeGenPreTrainedModelrW   transformerTr   past_key_valuesc                     t         |   |       t        |t              r?t	        j
                  |j                  t        |j                  |j                               y y r   )
r:   _init_weights
isinstancer5   initcopy_r8   r%   r=   rT   )rV   modulerF   s     r$   r   z$CodeGenPreTrainedModel._init_weights  sI    f%f./JJv--/J6K_K_agatat/uv 0r&   )rG   r   r   r   __annotations__base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_can_compile_fullgraphr   r   r   s   @r$   r   r     s6    %&*#'("3!w wr&   r   c                   D    e Zd Z fdZd Zd Ze	 	 	 	 	 	 	 	 	 	 	 ddej                  dz  de	dz  dej                  dz  dej                  dz  d	ej                  dz  d
ej                  dz  dedz  dedz  dedz  dedz  dej                  dz  deez  fd       Z xZS )CodeGenModelc           	         t         |   |       |j                  | _        |j                  | _        t        j                  |j                  | j                        | _        t        j                  |j                        | _
        t        j                  t        |j                        D cg c]  }t        ||       c}      | _        t        j                   | j                  |j"                        | _        t'        |j(                  |j*                  |j,                  z        | _        d| _        | j1                          y c c}w )N)rC   r   F)r:   r;   r   rI   
vocab_sizer   	Embeddingwter>   
embd_pdropdrop
ModuleListrangen_layerr   hr   r   ln_fminrS   n_ctxrJ   gradient_checkpointing	post_init)rV   rW   irF   s      r$   r;   zCodeGenModel.__init__  s      ++<< 1 14>>BJJv001	5QWQ_Q_K`aaVq AabLLV5N5NO	f//A[A[1[\&+# 	  bs   ,Ec                     | j                   S r   r   )rV   s    r$   get_input_embeddingsz!CodeGenModel.get_input_embeddings-  s    xxr&   c                     || _         y r   r   )rV   new_embeddingss     r$   set_input_embeddingsz!CodeGenModel.set_input_embeddings0  s	    !r&   N	input_idsr   rt   token_type_idsrz   inputs_embedsr{   r|   output_hidden_statesreturn_dictr}   r   c           
         ||n| j                   j                  }|	|	n| j                   j                  }	||n| j                   j                  }|
|
n| j                   j                  }
|du |duz  rt        d      | j                  r%| j                  r|rt        j                  d       d}|| j                  |      }|r|t        | j                         }|j                  d   }|9||j                         nd}t        j                  |||z   |j                         }||j#                  d      }t%        | j                   |||||	      }|}|(|j'                  d
|      }| j                  |      }||z   }| j)                  |      }d
||j+                  d
      f}|rdnd}|	rdnd}t-        | j.                        D ]-  \  }}|	r||fz   } ||||||||      }|d   }|s%||d   fz   }/ | j1                  |      }|j'                  |      }|	r||fz   }|
st3        d ||||fD              S t5        ||||      S )a  
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_dim)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
            model's internal embedding lookup matrix.
        Nz:You must specify exactly one of input_ids or inputs_embedszZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...F)rW   r   r   )r   )rW   r   rt   r}   r   rz   r)    )ry   rt   rz   r{   r|   r}   c              3   &   K   | ]	  }||  y wr   r   ).0vs     r$   	<genexpr>z'CodeGenModel.forward.<locals>.<genexpr>  s      ghgts   )last_hidden_stater   rx   
attentions)rW   r|   r   r{   use_return_dictrL   r   trainingrD   rE   r   r   rZ   get_seq_lengthr   r   r   	unsqueezer
   rg   r   rf   	enumerater   r   r   r   )rV   r   r   rt   r   rz   r   r{   r|   r   r   r}   kwargs
seq_lengthpast_seen_tokenscausal_maskrx   token_type_embedsoutput_shapeall_self_attentionsall_hidden_statesr   blockoutputss                           r$   r   zCodeGenModel.forward3  s   , 2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	%0%<k$++B]B]-t";<YZZ&&4==##p "	  HHY/M0*$++>O"((+
!CRC^==?de"\\*:<Lz<YbobvbvwN)33A6L(;;'))+%
 &%+00Z@N $ 8),==M		-0J(:(:2(>?$5b4"6BD!$&&) 	JHAu#$58H$H!**)#"3-G $AJM &9WQZM&I#!	J$ 		-0%**<8 1]4D D )?<MObc   '+++*	
 	
r&   )NNNNNNNNNNN)rG   r   r   r;   r   r   r   r   r   r   r   r   r   r   r   r   r   s   @r$   r   r     s)    "  .2(,37260426!%)-,0#'26l
##d*l
 l
 ))D0	l

 ((4/l
 &&-l
 ((4/l
 $;l
  $;l
 #Tkl
 D[l
 ((4/l
 
(	(l
 l
r&   r   zM
    The CodeGen Model transformer with a language modeling head on top.
    )custom_introc                       e Zd ZddiZ fdZe	 	 	 	 	 	 	 	 	 	 	 	 	 ddej                  dz  dedz  dej                  dz  dej                  dz  d	ej                  dz  d
ej                  dz  dej                  dz  de
dz  de
dz  de
dz  de
dz  dej                  dz  deej                  z  deez  fd       Z xZS )CodeGenForCausalLMzlm_head.weightztransformer.wte.weightc                     t         |   |       t        |      | _        t	        j
                  |j                  |j                        | _        | j                          y r   )
r:   r;   r   r   r   rP   r   r   lm_headr   )rV   rW   rF   s     r$   r;   zCodeGenForCausalLM.__init__  sE     '/yy0A0AB 	r&   Nr   r   rt   r   rz   r   labelsr{   r|   r   r   r}   logits_to_keepr   c                    ||n| j                   j                  }| j                  ||||||||	|
||      }|d   }t        |t              rt        | d      n|}| j                  |dd|ddf         }d}|* | j                  d||| j                   j                  d|}|s|f|dd z   }||f|z   S |S t        |||j                  |j                  |j                        S )aG  
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_dim)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
            model's internal embedding lookup matrix.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
            are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
        N)
r   rt   r   rz   r   r{   r|   r   r   r}   r   )logitsr   r   r   )lossr   r   rx   r   r   )rW   r   r   r   intslicer   loss_functionr   r   r   rx   r   )rV   r   r   rt   r   rz   r   r   r{   r|   r   r   r}   r   r   transformer_outputsrx   slice_indicesr   r   outputs                        r$   r   zCodeGenForCausalLM.forward  s"   8 &1%<k$++B]B]"..+))%'/!5#) / 
 ,A.8B>SV8W~ot4]kmA}a,?@A%4%%pVFt{{OeOepiopDY!4QR!88F)-)9TGf$EvE%/??-;;*55
 	
r&   )NNNNNNNNNNNNr   )rG   r   r   _tied_weights_keysr;   r   r   r   r   r   r   r  r   r   r   r   r   r   s   @r$   r   r     sM    +,DE  .2(,37260426*.!%)-,0#'26-.>
##d*>
 >
 ))D0	>

 ((4/>
 &&->
 ((4/>
   4'>
 $;>
  $;>
 #Tk>
 D[>
 ((4/>
 ell*>
  
'	'!>
 >
r&   r   )r   r   r   ),__doc__rM   r   r    r   r   activationsr   cache_utilsr   r   
generationr	   masking_utilsr
   modeling_layersr   modeling_outputsr   r   modeling_utilsr   utilsr   r   configuration_codegenr   
get_loggerrG   rD   r  r   r%   r/   r3   Moduler5   r   r   r   r   r   __all__r   r&   r$   <module>r     s]       & ! . ) / 9 O - 1 
		H	%P P3 P5<< P  = =ELL =u|| =X]XdXd =S)ryy S)n (!+- !+H w_ w w D
) D
 D
N 
J
/ J

J
Z Kr&   