
    qi                     0   d Z ddlZddlZddlmZ ddlmZmZmZ ddlm	Z
 ddlmZ ddlmZmZ dd	lmZ dd
lmZ ddlmZmZ ddlmZ ddlmZmZmZmZ ddlmZ ddl m!Z!m"Z" ddl#m$Z$  e       rddlm%Z%  e"jL                  e'      Z(de)de)dejT                  fdZ+d Z,dejT                  dejT                  fdZ-dejT                  dejT                  dejT                  dejT                  fdZ. G d dej^                        Z0 G d  d!e0      Z1e0e1d"Z2 G d# d$ej^                        Z3 G d% d&e      Z4e! G d' d(e             Z5e! G d) d*e5             Z6 e!d+,       G d- d.e5e             Z7 e!d/,       G d0 d1e5             Z8e! G d2 d3e5             Z9g d4Z:y)5zPyTorch GPT-J model.    N)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )initialization)ACT2FN)CacheDynamicCache)GenerationMixin)create_causal_mask)!flash_attn_supports_top_left_maskis_flash_attn_available)GradientCheckpointingLayer)BaseModelOutputWithPastCausalLMOutputWithPastQuestionAnsweringModelOutput SequenceClassifierOutputWithPast)PreTrainedModel)auto_docstringlogging   )
GPTJConfig)_flash_attention_forwardnum_posdimreturnc                    ddt        j                  d|dt         j                        |z  z  z  }t        j                  dt        j                  | t         j                        j	                         |      j	                         }t        j
                  t        j                  |      t        j                  |      fd      S )	Ng      ?i'  r      )dtypezi , j -> i jr   r   )torcharangeint64einsumfloatcatsincos)r   r   inv_freqsinusoid_inps       X/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/gptj/modeling_gptj.pycreate_sinusoidal_positionsr-   /   s    eQQekk JS PQRH<<WEKK0X0^0^0`bjkqqsL99eii-uyy/FGQOO    c                 t    | j                  |j                        j                  |j                  d   dd      S Nr   r   )todevicerepeatshape)embed_positionsposition_idss     r,   get_embed_positionsr7   5   s5    l11299,:L:LQ:OQRTUVVr.   xc                     | d d d d d d d d df   }| d d d d d d dd df   }t        j                  | |fd      } | j                  d      S )Nr   r   r!   )r"   stackflatten)r8   x1x2s      r,   rotate_every_twor@   9   sS    	
1aCaC<B	
1aADqD=	BbS"I2&A99R=r.   tensorr(   r)   c                     t        j                  |d d d d d d d f   dd      }t        j                  |d d d d d d d f   dd      }| |z  t        |       |z  z   S )Nr   r   )r"   repeat_interleaver@   )rA   r(   r)   s      r,   apply_rotary_pos_embrD   @   s^    

!
!#aD!m"4a
;C

!
!#aD!m"4a
;CSL-f5;<<r.   c                       e Zd Zd fd	Zd Zd Z	 ddZd Z	 	 	 	 	 	 ddej                  de
dz  d	ej                  dz  d
ej                  dz  dedz  dedz  dej                  dz  deej                  eej                     f   eej                  eej                     eej                  df   f   z  dz  fdZ xZS )GPTJAttentionNc                    t         |           || _        |j                  | _        t        j                  |j                        | _        t        j                  |j                        | _
        d| _        || _        |-t        j                  d| j                  j                    d       |j"                  | _        |j&                  | _        | j$                  | j&                  z  | _        | j(                  | j&                  z  | j$                  k7  r&t+        d| j$                   d| j&                   d      t-        j.                  | j(                        | _        t        j2                  | j$                  | j$                  d      | _        t        j2                  | j$                  | j$                  d      | _        t        j2                  | j$                  | j$                  d      | _        t        j2                  | j$                  | j$                  d      | _        |j<                  | _        | j<                  xs | j$                  | _        | jA                  d	tC        | j                  | j>                        d
       y )NTzInstantiating z without passing a `layer_idx` is not recommended and will lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` when creating this class.zEembed_dim must be divisible by num_attention_heads (got `embed_dim`: z and `num_attention_heads`: z).Fbiasr5   )
persistent)"super__init__configmax_position_embeddingsmax_positionsr   Dropout
attn_pdropattn_dropoutresid_pdropresid_dropout	is_causal	layer_idxloggerwarning_once	__class____name__hidden_size	embed_dimnum_attention_headshead_dim
ValueErrormathsqrt
scale_attnLineark_projv_projq_projout_proj
rotary_dimpos_embd_dimregister_bufferr-   )selfrM   rV   rY   s      r,   rL   zGPTJAttention.__init__G   s   #;;JJv'8'89ZZ(:(:;" !8!8 9 :, ,  ++#)#=#= $*B*BB==4333t~~EWX\XfXfWg h++/+C+C*DBH  ))DMM2iiUKiiUKiiUK		$..$..uM ++ OO=t~~:4;M;MtO`O`ans 	 	
r.   c                 P   |j                         dd ||fz   }|j                  |      }|r|S t        |j                        dk(  r|j	                  ddddd      S t        |j                        dk(  r|j	                  dddd      S t        d	t        |j                               )
zO
        Splits hidden dim into attn_head_size and num_attention_heads
        Nr:      r   r   r   r      3Input tensor rank should be one of [4, 5], but is: )sizeviewlenr4   permuter_   )rk   rA   r]   attn_head_sizerotary	new_shapes         r,   _split_headszGPTJAttention._split_headsl   s     KKM#2&*=~)NN	Y'Mv||!>>!Q1a00!#>>!Q1--RSVW]WcWcSdRefggr.   c                    t        |j                        dk(  r$|j                  ddddd      j                         }n\t        |j                        dk(  r#|j                  dddd      j                         }n!t	        dt        |j                               |j                         dd	 ||z  fz   }|j                  |      S )
zR
        Merges attn_head_size dim and num_attn_heads dim into hidden dim
        rm   r   r   r   r   rn   ro   Nr;   )rr   r4   rs   
contiguousr_   rp   rq   )rk   rA   r]   rt   rv   s        r,   _merge_headszGPTJAttention._merge_heads{   s     v||!^^Aq!Q2==?F!#^^Aq!Q/::<FRSVW]WcWcSdRefggKKM#2&*=*N)PP	{{9%%r.   c                    |j                  t        j                        }|j                  t        j                        }t        j                  ||j	                  dd            }|| j
                  z  }|||z   }t        j                  j                  |d      }|j                  |j                        }| j                  |      }t        j                  ||      }||fS )Nr:   r;   r!   )r1   r"   float32matmul	transposerb   r   
functionalsoftmaxr    rR   )rk   querykeyvalueattention_maskattn_weightsattn_outputs          r,   _attnzGPTJAttention._attn   s     'ffU]]#||E3==R+@A#doo5%'.8L}},,\r,B#u{{3((6ll<7L((r.   c                     | j                   }|j                  |j                  k7  r"|j                  |j                        }|| _         |j                  |j                  d   dd      S r0   )r5   r2   r1   r3   r4   )rk   r6   r5   s      r,   _get_embed_positionsz"GPTJAttention._get_embed_positions   s_    ..!!\%8%88-001D1DEO#2D %%l&8&8&;QBBr.   hidden_states
layer_pastr   r6   	use_cacheoutput_attentionscache_positionr   .c                    | j                  |      }| j                  |      }	| j                  |      }
| j                  || j                  | j
                  d      }| j                  |	| j                  | j
                  d      }	| j                  |
| j                  | j
                  d      }
| j                  |      }|j                  d      j                  dd|j                  d         }t        j                  |d|      j                  |	j                        }t        j                  ||j                  d   dz  d      \  }}| j                  |	d d d d d d d | j                  f   }|	d d d d d d | j                  d f   }|d d d d d d d | j                  f   }|d d d d d d | j                  d f   }t!        |||      }t!        |||      }t        j"                  ||gd      }	t        j"                  ||gd      }nt!        |	||      }	t!        |||      }|	j%                  dddd      }	|j%                  dddd      }|2||| j                  |d	}|j'                  |	|
| j(                  |      \  }	}
| j+                  ||	|
|      \  }}| j-                  || j                  | j
                        }| j/                  |      }| j1                  |      }||fS )
NTFr:   r   r   r!   r   r   r(   r)   partial_rotation_sizer   )rf   rd   re   rw   r]   r^   r   	unsqueezer3   r4   r"   gatherr1   r    splitrh   rD   r'   rs   updaterV   r   rz   rg   rT   )rk   r   r   r   r6   r   r   r   r   r   r   r5   repeated_position_idssincosr(   r)   k_rotk_passq_rotq_passcache_kwargsr   r   s                          r,   forwardzGPTJAttention.forward   s    M*kk-(M*!!%)A)A4==RVWT%=%=t}}dS!!%)A)A4==RWX33LA , 6 6r : A A!QH]H]^`Ha boq2GHKKCIIV;;vv||B'71'<"ES??&1a!24??!223EAq$//"334F!Q#4T__#445E1aDOO$556F(S9E(S9E))UFO4CIIufo26E&sC5C(S9Ekk!Q1%aAq)!)-"0	L $**3t~~|TJC %)JJuc5.$Q!\''T5M5Mt}}]mmK0((5L((r.   NNNNFFN)rZ   
__module____qualname__rL   rw   rz   r   r   r"   FloatTensorr
   
LongTensorbooltupleTensorr   __classcell__rY   s   @r,   rF   rF   F   s   #
Jh&$ )2C $(3704!&).26?)((?) DL?) ))D0	?)
 &&-?) $;?)  $;?) ((4/?) 	ellE%,,//0
eELL15s9J3KK
L	M
	?)r.   rF   c                   h    e Zd ZdZ fdZ	 	 	 	 	 	 ddej                  dedz  dej                  dz  dej                  dz  de	dz  d	e	dz  d
ej                  dz  de
ej                  e
ej                     f   e
ej                  e
ej                     e
ej                  df   f   z  dz  fdZ xZS )GPTJFlashAttention2aD  
    GPTJ flash attention module. This module inherits from `GPTJAttention` as the weights of the module stays
    untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
    flash attention and deal with padding tokens in case the input contains any of them.
    c                 B    t        |   |i | t               | _        y r   )rK   rL   r   _flash_attn_uses_top_left_mask)rk   argskwargsrY   s      r,   rL   zGPTJFlashAttention2.__init__   s#    $)&)
 /P.Q+r.   Nr   r   r   r6   r   r   r   r   .c           
      	   | j                  |      }| j                  |      }	| j                  |      }
| j                  || j                  | j
                  d      }| j                  |	| j                  | j
                  d      }	| j                  |
| j                  | j
                  d      }
| j                  |      }|j                  d      j                  dd|j                  d         }t        j                  |d|      j                  |	j                        }t        j                  ||j                  d   dz  d      \  }}| j                  |	d d d d d d d | j                  f   }|	d d d d d d | j                  d f   }|d d d d d d d | j                  f   }|d d d d d d | j                  d f   }t!        |||      }t!        |||      }t        j"                  ||gd      }	t        j"                  ||gd      }nt!        |	||      }	t!        |||      }|	j%                  dddd      }	|j%                  dddd      }|2||| j                  |d	}|j'                  |	|
| j(                  |      \  }	}
|	j%                  dddd      j+                         }	|j%                  dddd      j+                         }|
j%                  dddd      j+                         }
|j                  }|j,                  j.                  d
k7  r|j,                  j.                  nd}|t        j0                  k(  rt        j2                  |      rt        j4                  |      }nMt7        | j8                  d      r| j8                  j                  }n | j                   j:                  j                  }t<        j?                  d| d       |j                  |      }|	j                  |      }	|
j                  |      }
| j@                  r| j8                  jB                  nd}|j                  d   }tE        ||	|
|||| jF                  | jH                        }|jK                  |j                  d   |j                  d   |j                  d   |j                  d   z        }| jM                  |      }| jO                  |      }||fS )NTFr:   r   r   r!   r   r   r   mpscpu_is_quantizedzThe input hidden states seems to be silently casted in float32, this might be related to the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in .g        )dropoutrU   use_top_left_mask)(rf   rd   re   rw   r]   r^   r   r   r3   r4   r"   r   r1   r    r   rh   rD   r'   rs   r   rV   ry   r2   typer|   is_autocast_enabledget_autocast_dtypehasattrrM   weightrW   rX   trainingrQ   r   rU   r   reshaperg   rT   )rk   r   r   r   r6   r   r   r   r   r   r   r5   r   r   r(   r)   r   r   r   r   r   input_dtypedevice_typetarget_dtypeattention_dropoutquery_lengthr   r   s                               r,   r   zGPTJFlashAttention2.forward   sJ    M*kk-(M*!!%)A)A4==RVWT%=%=t}}dS!!%)A)A4==RWX33LA , 6 6r : A A!QH]H]^`Ha boq2GHKKCIIV;;vv||B'71'<"ES??&1a!24??!223EAq$//"334F!Q#4T__#445E1aDOO$556F(S9E(S9E))UFO4CIIufo26E&sC5C(S9E
 kk!Q1%aAq) !)-"0	L $**3t~~|TJC kk!Q1%002aAq)446aAq)446 kk+0<<+<+<+Eell''5%--'((5$77Do6#{{00#{{1177 >$ HH\*E&&&CHH\*E6:mmDKK22{{1~ 0%nn"AA	
 #**q!<#5#5a#8,:L:LQ:OR^RdRdefRg:g
 mmK0((5L((r.   r   )rZ   r   r   __doc__rL   r"   r   r
   r   r   r   r   r   r   r   s   @r,   r   r      s    R $(3704!&).26u)((u) DLu) ))D0	u)
 &&-u) $;u)  $;u) ((4/u) 	ellE%,,//0
eELL15s9J3KK
L	M
	u)r.   r   )eagerflash_attention_2c                   \     e Zd Z fdZdej
                  dz  dej
                  fdZ xZS )GPTJMLPc                    t         |           |j                  }t        j                  ||      | _        t        j                  ||      | _        t        |j                     | _	        t        j                  |j                        | _        y r   )rK   rL   n_embdr   rc   fc_infc_outr	   activation_functionactrP   rS   r   )rk   intermediate_sizerM   r\   rY   s       r,   rL   zGPTJMLP.__init__x  se    MM	YYy*;<
ii 19=&445zz&"4"45r.   r   Nr   c                     | j                  |      }| j                  |      }| j                  |      }| j                  |      }|S r   )r   r   r   r   )rk   r   s     r,   r   zGPTJMLP.forward  s@    

=1/M2]3r.   )rZ   r   r   rL   r"   r   r   r   r   s   @r,   r   r   w  s,    6U%6%6%= %BSBS r.   r   c                   2    e Zd Zd fd	Z	 	 	 	 	 	 ddej
                  dz  dedz  dej
                  dz  dej                  dz  dedz  dedz  d	ej                  dz  d
e	ej                     e	ej                  e	ej
                  df   f   z  dz  fdZ xZS )	GPTJBlockNc                 .   t         |           |j                  |j                  nd|j                  z  }t	        j
                  |j                  |j                        | _        t        |j                     ||      | _
        t        ||      | _        y )Nrn   eps)rK   rL   n_innerr   r   	LayerNormlayer_norm_epsilonln_1GPTJ_ATTENTION_CLASSES_attn_implementationattnr   mlp)rk   rM   rV   	inner_dimrY   s       r,   rL   zGPTJBlock.__init__  so    &,nn&@FNNa&--FW	LLF4M4MN	*6+F+FGPYZ	9f-r.   r   r   r   r6   r   r   r   r   .c           	          |}| j                  |      }| j                  |||||||      \  }	}
| j                  |      }|	|z   |z   }||
fS )N)r   r   r   r6   r   r   r   )r   r   r   )rk   r   r   r   r6   r   r   r   residualattn_outputsr   feed_forward_hidden_statess               r,   r   zGPTJBlock.forward  sp     !		-0%)YY'!)%/) &/ &
"l &*XXm%<"$'AAHLl**r.   r   r   )rZ   r   r   rL   r"   r   r
   r   r   r   r   r   r   r   s   @r,   r   r     s    . $(3704!&).26+((4/+ DL+ ))D0	+
 &&-+ $;+  $;+ ((4/+ 
u||	uU\\59J9JC9O3P%PQ	QTX	X+r.   r   c                   D     e Zd ZU eed<   dZdZdgZdZdZ	dZ
 fdZ xZS )GPTJPreTrainedModelrM   transformerTr   past_key_valuesc                     t         |   |       t        |t              r?t	        j
                  |j                  t        |j                  |j                               y y r   )
rK   _init_weights
isinstancerF   initcopy_r5   r-   rO   ri   )rk   modulerY   s     r,   r   z!GPTJPreTrainedModel._init_weights  sH    f%fm,JJv--/J6K_K_agatat/uv -r.   )rZ   r   r   r   __annotations__base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_can_compile_fullgraphr   r   r   s   @r,   r   r     s;    %&*#$"3!w wr.   r   c                   D    e Zd Z fdZd Zd Ze	 	 	 	 	 	 	 	 	 	 	 ddej                  dz  de	dz  dej                  dz  dej                  dz  d	ej                  dz  d
ej                  dz  dedz  dedz  dedz  dedz  dej                  dz  deez  fd       Z xZS )	GPTJModelc           	      2   t         |   |       |j                  | _        |j                  | _        t        j                  |j                  | j                        | _        t        j                  |j                        | _
        t        j                  t        |j                        D cg c]  }t        ||       c}      | _        t        j                   | j                  |j"                        | _        d| _        | j)                          y c c}w )N)rV   r   F)rK   rL   r   r\   
vocab_sizer   	EmbeddingwterP   
embd_pdropdrop
ModuleListrangen_layerr   hr   r   ln_fgradient_checkpointing	post_init)rk   rM   irY   s      r,   rL   zGPTJModel.__init__  s      ++<< 1 14>>BJJv001	fnnH]^1	&A >^_LLV5N5NO	&+# 	  _s   ,Dc                     | j                   S r   r   )rk   s    r,   get_input_embeddingszGPTJModel.get_input_embeddings  s    xxr.   c                     || _         y r   r  )rk   new_embeddingss     r,   set_input_embeddingszGPTJModel.set_input_embeddings  s	    !r.   N	input_idsr   r   token_type_idsr6   inputs_embedsr   r   output_hidden_statesreturn_dictr   r   c           
         ||n| j                   j                  }|	|	n| j                   j                  }	||n| j                   j                  }|
|
n| j                   j                  }
|du |duz  rt        d      | j                  r%| j                  r|rt        j                  d       d}|| j                  |      }|r|t        | j                         }|j                  d   }|9||j                         nd}t        j                  |||z   |j                         }||j#                  d      }t%        | j                   |||||	      }|}|(|j'                  d
|      }| j                  |      }||z   }| j)                  |      }d
||j+                  d
      f}|rdnd}|	rdnd}t-        | j.                        D ]-  \  }}|	r||fz   } ||||||||      }|d   }|s%||d   fz   }/ | j1                  |      }|j'                  |      }|	r||fz   }|
st3        d ||||fD              S t5        ||||      S )  
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_dim)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
            model's internal embedding lookup matrix.
        Nz:You must specify exactly one of input_ids or inputs_embedszZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...F)rM   r   r   r2   )rM   r
  r   r   r   r6   r:    )r   r   r6   r   r   r   c              3   &   K   | ]	  }||  y wr   r  ).0vs     r,   	<genexpr>z$GPTJModel.forward.<locals>.<genexpr>:  s      ghgts   )last_hidden_stater   r   
attentions)rM   r   r  r   use_return_dictr_   r   r   rW   rX   r   r   r4   get_seq_lengthr"   r#   r2   r   r   rq   r   rp   	enumerater   r   r   r   )rk   r  r   r   r	  r6   r
  r   r   r  r  r   r   
seq_lengthpast_key_values_lengthcausal_maskr   token_type_embedsoutput_shapeall_self_attentionsall_hidden_statesr  blockoutputss                           r,   r   zGPTJModel.forward  s   , 2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	%0%<k$++B]B]-t";<YZZ&&4==##p "	  HHY/M0*$++>O"((+
!IXId_%C%C%Ejk""\\&(>(KTaThThN )33A6L(;;'))+%
 &%+00Z@N $ 8),==M		-0J(:(:2(>?$5b4"6BD!$&&) 	JHAu#$58H$H!**)#"3-G $AJM &9WQZM&I#!	J$ 		-0%**<8 1]4D D )?<MObc   '+++*	
 	
r.   NNNNNNNNNNN)rZ   r   r   rL   r  r  r   r"   r   r
   r   r   r   r   r   r   r   s   @r,   r   r     s)   "  .2(,37260426!%)-,0#'26n
##d*n
 n
 ))D0	n

 ((4/n
 &&-n
 ((4/n
 $;n
  $;n
 #Tkn
 D[n
 ((4/n
 
(	(n
 n
r.   r   zK
    The GPT-J Model transformer with a language modeling head on top.
    )custom_introc                       e Zd ZddiZ fdZe	 	 	 	 	 	 	 	 	 	 	 	 	 ddej                  dz  dedz  dej                  dz  dej                  dz  d	ej                  dz  d
ej                  dz  dej                  dz  de
dz  de
dz  de
dz  de
dz  dej                  dz  deej                  z  deez  fd       Z xZS )GPTJForCausalLMzlm_head.weightztransformer.wte.weightc                     t         |   |       t        |      | _        t	        j
                  |j                  |j                        | _        | j                          y r   )
rK   rL   r   r   r   rc   r   r   lm_headr   rk   rM   rY   s     r,   rL   zGPTJForCausalLM.__init__N  sE     $V,yy0A0AB 	r.   Nr  r   r   r	  r6   r
  labelsr   r   r  r  r   logits_to_keepr   c                    ||n| j                   j                  }| j                  ||||||||	|
||      }|d   }t        |t              rt        | d      n|}| j                  |dd|ddf         }d}|* | j                  d||| j                   j                  d|}|s|f|dd z   }||f|z   S |S t        |||j                  |j                  |j                        S )aG  
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_dim)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
            model's internal embedding lookup matrix.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
            are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
        N)
r   r   r	  r6   r
  r   r   r  r  r   r   )logitsr*  r   r   lossr-  r   r   r  r  )rM   r  r   r   intslicer(  loss_functionr   r   r   r   r  )rk   r  r   r   r	  r6   r
  r*  r   r   r  r  r   r+  r   transformer_outputsr   slice_indicesr-  r/  outputs                        r,   r   zGPTJForCausalLM.forwardV  s"   8 &1%<k$++B]B]"..+))%'/!5#) / 
 ,A.8B>SV8W~ot4]kmA}a,?@A%4%%pVFt{{OeOepiopDY!4QR!88F)-)9TGf$EvE%/??-;;*55
 	
r.   )NNNNNNNNNNNNr   )rZ   r   r   _tied_weights_keysrL   r   r"   r   r
   r   r   r0  r   r   r   r   r   r   s   @r,   r&  r&  F  sM    +,DE  .2(,37260426*.!%)-,0#'26-.>
##d*>
 >
 ))D0	>

 ((4/>
 &&->
 ((4/>
   4'>
 $;>
  $;>
 #Tk>
 D[>
 ((4/>
 ell*>
  
'	'!>
 >
r.   r&  a  
    The GPT-J Model transformer with a sequence classification head on top (linear layer).

    [`GPTJForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT, GPT-2, GPT-Neo) do.

    Since it does classification on the last token, it requires to know the position of the last token. If a
    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
    each row of the batch).
    c                   8    e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 	 ddej                  dz  dedz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  d	ej                  dz  d
e	dz  de	dz  de	dz  de	dz  de
ez  fd       Z xZS )GPTJForSequenceClassificationc                     t         |   |       |j                  | _        t        |      | _        t        j                  |j                  | j                  d      | _        | j                          y )NFrH   )
rK   rL   
num_labelsr   r   r   rc   r   scorer   r)  s     r,   rL   z&GPTJForSequenceClassification.__init__  sR      ++$V,YYv}}dooEJ
 	r.   Nr  r   r   r	  r6   r
  r*  r   r   r  r  r   c                    ||n| j                   j                  }| j                  ||||||||	|
|
      }|d   }| j                  |      }||j                  d   }n|j                  d   }| j                   j
                  |dk7  rt        d      | j                   j
                  d}n||| j                   j
                  k7  j                  |j                  t        j                        }t        j                  |j                  d   |j                  t        j                        }||z  j                  d      }n.d}t        j                  | j                  j                    d       |t        j                  ||j                  	      |f   }d}||j                  |j                        }| j                   j"                  | j$                  dk(  rd
| j                   _        nl| j$                  dkD  rL|j&                  t        j(                  k(  s|j&                  t        j*                  k(  rd| j                   _        nd| j                   _        | j                   j"                  d
k(  rIt-               }| j$                  dk(  r& ||j/                         |j/                               }n |||      }n| j                   j"                  dk(  r=t1               } ||j3                  d| j$                        |j3                  d            }n,| j                   j"                  dk(  rt5               } |||      }|s|f|dd z   }||f|z   S |S t7        |||j8                  |j:                  |j<                        S )a!  
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_dim)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
            model's internal embedding lookup matrix.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        N)	r   r   r	  r6   r
  r   r   r  r  r   r   z=Cannot handle batch sizes > 1 if no padding token is defined.r:   )r2   r    z will not detect padding tokens in `inputs_embeds`. Results may be unexpected if using padding tokens in conjunction with `inputs_embeds.`r  
regressionsingle_label_classificationmulti_label_classificationr.  )rM   r  r   r;  r4   pad_token_idr_   r1   r2   r"   int32r#   argmaxrW   rX   rY   rZ   problem_typer:  r    longr0  r   squeezer   rq   r   r   r   r   r  )rk   r  r   r   r	  r6   r
  r*  r   r   r  r  r   r3  r   r-  
batch_sizelast_non_pad_tokennon_pad_masktoken_indicespooled_logitsr/  loss_fctr5  s                           r,   r   z%GPTJForSequenceClassification.forward  s!   4 &1%<k$++B]B]"..+))%'/!5# / 
 ,A.M* "+J&,,Q/J;;##+
a\]];;##+!#"%)A)AAEEfmmUZU`U`aL!LL)<V]]Z_ZeZefM"/,">!F!Fr!J!#>>**+ ,Z Z
 u||Jv}}MOaabYY}334F{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#M$9$9$;V^^=MND#M6:D))-JJ+- 2 22t GUWY))-II,.v6#%(;AB(??F)-)9TGf$EvE/ /??-;;*55
 	
r.   r#  )rZ   r   r   rL   r   r"   r   r
   r   r   r   r   r   r   r   s   @r,   r8  r8    s     .2(,37260426*.!%)-,0#'b
##d*b
 b
 ))D0	b

 ((4/b
 &&-b
 ((4/b
   4'b
 $;b
  $;b
 #Tkb
 D[b
 
1	1b
 b
r.   r8  c                   @    e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  d	ej                  dz  d
edz  dedz  dedz  de	e
z  fd       Z xZS )GPTJForQuestionAnsweringc                     t         |   |       |j                  | _        t        |      | _        t        j                  |j                  |j                        | _        | j                          y r   )
rK   rL   r:  r   r   r   rc   r[   
qa_outputsr   r)  s     r,   rL   z!GPTJForQuestionAnswering.__init__  sT      ++$V,))F$6$68I8IJ 	r.   Nr  r   r	  r6   r
  start_positionsend_positionsr   r  r  r   c           
         |
|
n| j                   j                  }
| j                  |||||||	|
      }|d   }| j                  |      }|j	                  dd      \  }}|j                  d      j                         }|j                  d      j                         }d}||t        |j                               dkD  r*|j                  d      j                  |j                        }t        |j                               dkD  r*|j                  d      j                  |j                        }|j                  d      }|j                  d|      }|j                  d|      }t        |      } |||      } |||      }||z   dz  }|
s||f|dd z   }||f|z   S |S t        ||||j                  |j                  	      S )
r  N)r   r	  r6   r
  r   r  r  r   r   r:   r!   )ignore_indexr   )r/  start_logits
end_logitsr   r  )rM   r  r   rO  r   rE  ry   rr   rp   r1   r2   clampr   r   r   r  )rk   r  r   r	  r6   r
  rP  rQ  r   r  r  r   r"  sequence_outputr-  rT  rU  
total_lossignored_indexrK  
start_lossend_lossr5  s                          r,   r   z GPTJForQuestionAnswering.forward!  s   * &1%<k$++B]B]""))%'/!5# # 	
 "!*1#)<<r<#: j#++B/::<''+668

&=+D?'')*Q."1"9"9""="@"@ATAT"U=%%'(1, - 5 5b 9 < <Z=N=N O(--a0M-33A}EO)//=AM']CH!,@J
M:H$x/14J"J/'!"+=F/9/EZMF*Q6Q+%!!//))
 	
r.   )
NNNNNNNNNN)rZ   r   r   rL   r   r"   r   r   r   r   r   r   r   r   s   @r,   rM  rM    s     .2372604263715)-,0#'C
##d*C
 ))D0C
 ((4/	C

 &&-C
 ((4/C
 ))D0C
 ''$.C
  $;C
 #TkC
 D[C
 
-	-C
 C
r.   rM  )r&  rM  r8  r   r   );r   r`   r"   r   torch.nnr   r   r    r   r   activationsr	   cache_utilsr
   r   
generationr   masking_utilsr   modeling_flash_attention_utilsr   r   modeling_layersr   modeling_outputsr   r   r   r   modeling_utilsr   utilsr   r   configuration_gptjr   r   
get_loggerrZ   rW   r0  r   r-   r7   r@   rD   ModulerF   r   r   r   r   r   r   r&  r8  rM  __all__r  r.   r,   <module>rk     s       A A & ! . ) / h 9  . , * J 
		H	%P P3 P5<< PW  = =ELL =u|| =X]XdXd =a)BII a)HD)- D)P , bii & +*  +F w/ w w E
# E
 E
P 
J
)? J

J
Z m
$7 m
m
` N
2 N
 N
br.   