
    qi                        d Z ddlmZ ddlZddlmZ ddlmZmZmZ ddl	m
Z
 ddlmZmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZmZmZmZ ddlmZmZ ddlmZ ddl m!Z!m"Z"m#Z#m$Z$ ddl%m&Z&  e$jN                  e(      Z) G d dejT                        Z+	 d0dejX                  dejZ                  dejZ                  dejZ                  dejZ                  dz  de.de.fdZ/ G d dejX                        Z0 G d d e      Z1e" G d! d"e             Z2 G d# d$e2      Z3e" G d% d&e2             Z4 G d' d(e2e      Z5 e"d)*       G d+ d,e2             Z6e" G d- d.e2             Z7g d/Z8y)1zPyTorch OPT model.    )CallableN)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FN)CacheDynamicCache)GenerationMixin)create_causal_mask)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutputWithPastCausalLMOutputWithPastQuestionAnsweringModelOutput SequenceClassifierOutputWithPast)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuplelogging   )	OPTConfigc                   x     e Zd ZdZdedef fdZ	 	 d
dej                  dedej                  dz  f fd	Z xZ	S )OPTLearnedPositionalEmbeddingzN
    This module learns positional embeddings up to a fixed maximum size.
    num_embeddingsembedding_dimc                 N    d| _         t        | 	  || j                   z   |       y N   )offsetsuper__init__)selfr   r    	__class__s      V/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/opt/modeling_opt.pyr&   z&OPTLearnedPositionalEmbedding.__init__0   s$     $++5}E    Nattention_maskpast_key_values_lengthposition_idsc                     |8t        j                  |d      }||z  dz
  j                         }|dd|df   }t        |   || j
                  z         S )z3`input_ids_shape` is expected to be [bsz x seqlen].Nr   dim)torchcumsumlongr%   forwardr$   )r'   r+   r,   r-   r(   s       r)   r4   z%OPTLearnedPositionalEmbedding.forward6   s^      <<A>L(>9A=CCEL'+A+B(BCLw|dkk9::r*   )r   N)
__name__
__module____qualname____doc__intr&   r1   
LongTensorr4   __classcell__r(   s   @r)   r   r   +   s]    Fs F3 F '(04	;((; !$; &&-	; ;r*   r   modulequerykeyvaluer+   scalingdropoutc                    t        j                  ||j                  dd            |z  }|||z   }t        j                  j                  |dt         j                        j                  |j                        }t        j                  j                  ||| j                        }t        j                  ||      }	|	j                  dd      j                         }	|	|fS )N)r0   dtypeptrainingr   r#   )r1   matmul	transposer   
functionalsoftmaxfloat32torF   rB   rI   
contiguous)
r=   r>   r?   r@   r+   rA   rB   kwargsattn_weightsattn_outputs
             r)   eager_attention_forwardrT   H   s     <<s}}R'<=GL!#n4==((2U]](SVVW\WbWbcL==((6??([L,,|U3K''1-88:K$$r*   c                        e Zd ZdZ	 ddededz  f fdZ	 	 	 	 ddej                  de	dz  dej                  dz  d	e
d
ej                  dz  deej                  ej                  dz  e	dz  f   fdZ xZS )OPTAttentionz=Multi-headed attention from 'Attention Is All You Need' paperNconfig	layer_idxc                    t         |           || _        |j                  | _        |j
                  | _        |j                  | _        |j                  | _	        || _
        |-t        j                  d| j                  j                   d       | j                  | j                  z  | _        d| _        | j                  | j                  z  | j                  k7  r&t#        d| j                   d| j                   d      | j                  dz  | _        t'        j(                  | j                  | j                  | j                        | _        t'        j(                  | j                  | j                  | j                        | _        t'        j(                  | j                  | j                  | j                        | _        t'        j(                  | j                  | j                  | j                        | _        y )	NzInstantiating z without passing a `layer_idx` is not recommended and will lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` when creating this class.Tz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      ࿩bias)r%   r&   rW   hidden_size	embed_dimnum_attention_heads	num_headsattention_dropoutrB   enable_biasrX   loggerwarning_oncer(   r5   head_dim	is_causal
ValueErrorrA   r   Lineark_projv_projq_projout_proj)r'   rW   rX   rQ   r(   s       r)   r&   zOPTAttention.__init__b   s    	++33//!--" !8!8 9 :, , $..8MMDNN*t~~=MdnnM]$T^^$4B8  }}d*iiTEUEUViiTEUEUViiTEUEUV		$..$..tGWGWXr*   hidden_statespast_key_valuesr+   output_attentionscache_positionreturnc                 j   |j                         \  }}}	| j                  |      | j                  z  }
|
j                  |d| j                  | j
                        j                  dd      }
| j                  |      }| j                  |      }|j                  |d| j                  | j
                        j                  dd      }|j                  |d| j                  | j
                        j                  dd      }|#|j                  ||| j                  d|i      \  }}t        j                  | j                  j                  t              } || |
|||f| j                   sdn| j"                  dd|\  }}|j%                  ||d      j'                         }| j)                  |      }|sd}||fS )	z#Input shape: Batch x Time x ChannelrD   r   r#   Nro           g      ?)rB   rA   )sizerj   rA   viewr_   rd   rK   rh   ri   updaterX   r   get_interfacerW   _attn_implementationrT   rI   rB   reshaperP   rk   )r'   rl   rm   r+   rn   ro   rQ   bsztgt_len_query_states
key_statesvalue_statesattention_interfacerS   rR   s                   r)   r4   zOPTAttention.forward   s    (,,.Wa {{=1DLL@#((b$..$--PZZ[\^_`[[/
{{=1__S"dnndmmLVVWXZ[\
#((b$..$--PZZ[\^_`&'6'='=L$..;K^:\($J )@(M(MKK,,.E)
 %8	%
  $}}C$,,	%
 	%
!\ "))#w;FFHmmK0 LL((r*   N)NNFN)r5   r6   r7   r8   r   r9   r&   r1   Tensorr
   booltupler4   r;   r<   s   @r)   rV   rV   _   s    G
 !%!Y!Y :!YL )-.2"'.24)||4) 4) t+	4)
  4) t+4) 
u||U\\D0%$,>	?4)r*   rV   c                   :    e Zd Zddededz  f fdZ	 	 	 	 	 	 ddej                  dej                  dz  dedz  de	dz  d	e	dz  d
ej                  dz  dej                  dz  dee   deej                  eej                  ej                  f   dz  f   fdZ xZS )OPTDecoderLayerNrW   rX   c                    t         |           |j                  | _        t	        ||      | _        |j                  | _        |j                  | _        t        |j                     | _
        t        j                  | j                  |j                        | _        t        j                  | j                  |j                   |j"                        | _        t        j                  |j                   | j                  |j"                        | _        t        j                  | j                  |j                        | _        y )N)rW   rX   elementwise_affinerZ   )r%   r&   r\   r]   rV   	self_attndo_layer_norm_beforerB   r	   activation_functionactivation_fnr   	LayerNormlayer_norm_elementwise_affineself_attn_layer_normrg   ffn_dimra   fc1fc2final_layer_norm)r'   rW   rX   r(   s      r)   r&   zOPTDecoderLayer.__init__   s    ++%VyI$*$?$?!~~#F$>$>?$&LLNNv/S/S%
! 99T^^V^^&BTBTU99V^^T^^&BTBTU "T^^PVPtPt ur*   rl   r+   rm   rn   	use_cacher-   ro   rQ   rp   c           
         |}	| j                   r| j                  |      } | j                  d||||||d|\  }}
t        j                  j                  || j
                  | j                        }|	|z   }| j                   s| j                  |      }|j                  }|j                  d|j                  d            }|}	| j                   r| j                  |      }| j                  |      }| j                  |      }| j                  |      }t        j                  j                  || j
                  | j                        }|	|z   j                  |      }| j                   s| j                  |      }|f}|r||
fz  }|S )a  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            past_key_values (`Cache`, *optional*): cached past key and value projection states
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence..
        )rl   rm   r-   r+   rn   ro   rG   rD    )r   r   r   r   rL   rB   rI   shaperx   rs   r   r   r   r   rt   )r'   rl   r+   rm   rn   r   r-   ro   rQ   residualself_attn_weightshidden_states_shapeoutputss                r)   r4   zOPTDecoderLayer.forward   s   6 ! $$ 55mDM ,:4>> ,
'+%)/),
 ,
(( --mt||VZVcVc-d =0 (( 55mDM ,11%--b-2D2DR2HI  $$ 11-@M/**=9/--mt||VZVcVc-d!M1778KL (( 11-@M ")++Gr*   r   )NNFFNN)r5   r6   r7   r   r9   r&   r1   r   r
   r   r:   r   r   r   FloatTensorr4   r;   r<   s   @r)   r   r      s    vy vS4Z v( /3(,).!&04.2L||L t+L 	L
  $;L $;L &&-L t+L -.L 
u  %(9(95;L;L(L"MPT"TT	ULr*   r   c                   :    e Zd ZU eed<   dZdZdgZdZdZ	dZ
dZdZy)OPTPreTrainedModelrW   modelTr   N)r5   r6   r7   r   __annotations__base_model_prefixsupports_gradient_checkpointing_no_split_modules_supports_attention_backend_supports_flash_attn_supports_sdpa_supports_flex_attn_can_compile_fullgraphr   r*   r)   r   r     s:    &*#*+"&N!r*   r   c                   ,    e Zd ZdZdef fdZe	 	 	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  de
dz  dej                  dz  d	edz  d
edz  dedz  dedz  dej                  dz  dej                  dz  dee   deez  fd       Z xZS )
OPTDecoderz
    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`OPTDecoderLayer`]

    Args:
        config: OPTConfig
    rW   c           	      8   t         |   |       |j                  | _        |j                  | _        |j                  | _        |j                  | _        |j                  | _        t        j                  |j                  |j                  | j
                        | _        t        |j                  |j                        | _        |j                  |j                  k7  r2t        j                   |j                  |j                  d      | _        nd | _        |j                  |j                  k7  r2t        j                   |j                  |j                  d      | _        nd | _        |j&                  r=|j(                  s1t        j*                  |j                  |j,                        | _        nd | _        t        j0                  t3        |j4                        D cg c]  }t7        ||       c}      | _        d| _        | j=                          y c c}w )NFrZ   r   )rX   )r%   r&   rB   	layerdroppad_token_idpadding_idxmax_position_embeddingsmax_target_positions
vocab_sizer   	Embeddingword_embed_proj_dimembed_tokensr   r\   embed_positionsrg   project_out
project_inr   _remove_final_layer_normr   r   r   
ModuleListrangenum_hidden_layersr   layersgradient_checkpointing	post_init)r'   rW   ir(   s      r)   r&   zOPTDecoder.__init__2  s    ~~))!..$*$B$B! ++LL):):F<V<VX\XhXhi<V=[=[]c]o]op%%););;!yy););V=W=W^cdD#D%%););; ii(B(BFDVDV]bcDO"DO
 &&v/N/N$&LL""v7[7[%D! %)D!mmSXY_YqYqSr$sa_Vq%I$st&+#	 %ts   HN	input_idsr+   rm   inputs_embedsr   rn   output_hidden_statesreturn_dictr-   ro   rQ   rp   c                    ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }|du |duz  rt        d      | j                  r%| j                  r|rt        j                  d       d}||j                  d|j                  d         }|| j                  |      }|r|t        | j                         }||j                         nd}|
2t        j                   |||j                  d   z   |j"                  	      }
|A||j                  d   z   }t        j$                  |j                  d   ||j"                  	      }|	8t        j&                  |d
      }	|	|z  dz
  j)                         }	|	dd|df   }	t+        | j                   |||
|      }| j-                  |||	      }| j.                  | j/                  |      }||j1                  |j"                        z   }|rdnd}|rdnd}t3        | j4                        D ]_  \  }}|r||fz  }| j                  r%t        j6                  g       }|| j8                  k  r? ||f||	||||
d|}|d   }|sW||d   fz  }a | j:                  | j;                  |      }| j<                  | j=                  |      }|r||fz  }t?        ||||      S )aU  
        Args:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
                provide it.

                Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
                [`PreTrainedTokenizer.__call__`] for details.

                [What are input IDs?](../glossary#input-ids)
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
                It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

                Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
                cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.

                If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those
                that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of
                all `decoder_input_ids` of shape `(batch_size, sequence_length)`.

            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
            position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
                config.n_positions - 1]`. for padding use -1.

                [What are position IDs?](../glossary#position-ids)
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`,
                this tensor is not affected by padding. It is used to update the cache in the correct position and to infer
                the complete sequence length.
        Nz:You must specify exactly one of input_ids or inputs_embedszX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.FrD   )rW   r   r   devicer/   )rW   r   r+   ro   rm   )r-   r   )r+   r-   rm   rn   r   ro   last_hidden_staterm   rl   
attentions) rW   rn   r   r   use_return_dictrf   r   rI   rb   rc   rt   r   r   r   get_seq_lengthr1   aranger   onesr2   r3   r   r   r   rO   	enumerater   randr   r   r   r   )r'   r   r+   rm   r   r   rn   r   r   r-   ro   rQ   past_seen_tokens
seq_lengthcausal_mask
pos_embedsrl   all_hidden_statesall_self_attnsidxdecoder_layerdropout_probabilitylayer_outputss                          r)   r4   zOPTDecoder.forwardW  sL   @ 2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	%0%<k$++B]B]-t";<YZZ&&4==Yj I !r9??2+>?I  --i8M0*$++>O?N?Z?99;`a!"\\ "2]5H5H5K"KTaThThN !)M,?,?,BBJ"ZZ(;(;A(>
S`SgSghN <<A>L(>9A=CCEL'+;+<(<=L(;;'))+
 )).:JYe)f
??& OOM:M%
m6J6J(KK #7BD0d"+DKK"8 	6C#!m%55!}}&+jjn#&7)	*) /"3#-	 	M *!,M =#3"551	64   , 11-@M' ,,];M  -!11&+++%	
 	
r*   
NNNNNNNNNN)r5   r6   r7   r8   r   r&   r   r1   r:   r   r
   r   r   r   r   r   r   r4   r;   r<   s   @r)   r   r   *  s    #y #J  .2.2(,26!%)-,0#'04.2f
##d*f
 t+f
 	f

 ((4/f
 $;f
  $;f
 #Tkf
 D[f
 &&-f
 t+f
 -.f
 
(	(f
 f
r*   r   c                   >    e Zd Zdef fdZd Zd Zee	 	 	 	 	 	 	 	 	 	 dde	j                  dz  de	j                  dz  dedz  d	e	j                  dz  d
edz  dedz  dedz  dedz  de	j                  dz  de	j                  dz  dee   deez  fd              Z xZS )OPTModelrW   c                 d    t         |   |       t        |      | _        | j	                          y r   )r%   r&   r   decoderr   r'   rW   r(   s     r)   r&   zOPTModel.__init__  s&     !&)r*   c                 .    | j                   j                  S r   r   r   r'   s    r)   get_input_embeddingszOPTModel.get_input_embeddings	  s    ||(((r*   c                 &    || j                   _        y r   r   r'   r@   s     r)   set_input_embeddingszOPTModel.set_input_embeddings  s    $)!r*   Nr   r+   rm   r   r   rn   r   r   r-   ro   rQ   rp   c                 z   ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }||n| j                   j                  } | j
                  d|||	|||||d|
d
|}t        |j                  |j                  |j                  |j                        S )NT
r   r+   r-   rm   r   r   rn   r   r   ro   r   r   )rW   rn   r   r   r   r   r   r   rm   rl   r   )r'   r   r+   rm   r   r   rn   r   r   r-   ro   rQ   decoder_outputss                r)   r4   zOPTModel.forward  s      2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	%0%<k$++B]B] '$,, 
)%+'/!5)
 
 '-??+;;)77&11	
 	
r*   r   )r5   r6   r7   r   r&   r   r   r   r   r1   r:   r   r
   r   r   r   r   r   r   r4   r;   r<   s   @r)   r   r     s   y )*  .2.2(,26!%)-,0#'04.2)
##d*)
 t+)
 	)

 ((4/)
 $;)
  $;)
 #Tk)
 D[)
 &&-)
 t+)
 -.)
 
(	()
  )
r*   r   c                        e Zd ZddiZ fdZd Zd Zee	 	 	 	 	 	 	 	 	 	 	 	 dde	j                  dz  de	j                  dz  d	edz  d
e	j                  dz  de	j                  dz  dedz  dedz  dedz  dedz  de	j                  dz  de	j                  dz  dee	j                  z  dee   deez  fd              Z xZS )OPTForCausalLMzlm_head.weightz!model.decoder.embed_tokens.weightc                     t         |   |       t        |      | _        t	        j
                  |j                  |j                  d      | _        | j                          y NFrZ   )
r%   r&   r   r   r   rg   r   r   lm_headr   r   s     r)   r&   zOPTForCausalLM.__init__@  sK     f%
 yy!;!;V=N=NUZ[ 	r*   c                 B    | j                   j                  j                  S r   r   r   r   r   s    r)   r   z#OPTForCausalLM.get_input_embeddingsJ      zz!!...r*   c                 :    || j                   j                  _        y r   r   r   s     r)   r   z#OPTForCausalLM.set_input_embeddingsM      */

'r*   Nr   r+   rm   r   labelsr   rn   r   r   r-   ro   logits_to_keeprQ   rp   c                 L   ||n| j                   j                  }||n| j                   j                  }|	|	n| j                   j                  }	 | j                  j
                  d|||
|||||d|d
|}|j                  }t        |t              rt        | d      n|}| j                  |dd|ddf         j                         }d}|* | j                  d||| j                   j                  d|}t        |||j                  |j                   |j"                        S )an  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, OPTForCausalLM

        >>> model = OPTForCausalLM.from_pretrained("facebook/opt-350m")
        >>> tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious. I'm just a little bit of a weirdo."
        ```NTr   )logitsr   r   lossr   rm   rl   r   r   )rW   rn   r   r   r   r   r   
isinstancer9   slicer   rP   loss_functionr   r   rm   rl   r   )r'   r   r+   rm   r   r   r   rn   r   r   r-   ro   r   rQ   r   rl   slice_indicesr   r   s                      r)   r4   zOPTForCausalLM.forwardP  sC   R 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]+=4::+=+= ,
)%+'/!5),
 ,
  118B>SV8W~ot4]kmA}a,?@ALLN%4%%pVFt{{OeOepiopD%#33!//))
 	
r*   )NNNNNNNNNNNr   )r5   r6   r7   _tied_weights_keysr&   r   r   r   r   r1   r:   r   r
   r   r   r9   r   r   r   r   r4   r;   r<   s   @r)   r   r   =  sa   *,OP/0  .2.2(,26*.!%)-,0#'04.2-.J
##d*J
 t+J
 	J

 ((4/J
   4'J
 $;J
  $;J
 #TkJ
 D[J
 &&-J
 t+J
 ell*J
 +,J
 
'	'J
  J
r*   r   a  
    The OPT Model transformer with a sequence classification head on top (linear layer).

    [`OPTForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-2) do.

    Since it does classification on the last token, it requires to know the position of the last token. If a
    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
    each row of the batch).
    )custom_introc                   *    e Zd Zdef fdZe	 	 	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  de	dz  dej                  dz  dej                  dz  d	e
dz  d
e
dz  de
dz  de
dz  dej                  dz  deez  fd       Zd Zd Z xZS )OPTForSequenceClassificationrW   c                     t         |   |       |j                  | _        t        |      | _        t        j                  |j                  | j                  d      | _        | j                          y r   )
r%   r&   
num_labelsr   r   r   rg   r   scorer   r   s     r)   r&   z%OPTForSequenceClassification.__init__  sT      ++f%
YYv994??QVW
 	r*   Nr   r+   rm   r   r   r   rn   r   r   r-   rp   c                    |	|	n| j                   j                  }	| j                  ||||
|||||		      }|d   }| j                  |      }||j                  dd \  }}n|j                  dd \  }}| j                   j
                  |dk7  rt        d      | j                   j
                  d}n||| j                   j
                  k7  j                  |j                  t        j                        }t        j                  |j                  d   |j                  t        j                        }||z  j                  d      }n.d}t        j                  | j                  j                    d	       |t        j                  ||j                  
      |f   }d}|| j                   j"                  | j$                  dk(  rd| j                   _        nl| j$                  dkD  rL|j&                  t        j(                  k(  s|j&                  t        j*                  k(  rd| j                   _        nd| j                   _        | j                   j"                  dk(  rIt-               }| j$                  dk(  r& ||j/                         |j/                               }n |||      }n| j                   j"                  dk(  r=t1               } ||j3                  d| j$                        |j3                  d            }n,| j                   j"                  dk(  rt5               } |||      }|	s|f|dd z   }||f|z   S |S t7        |||j8                  |j:                  |j<                        S )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Nrm   r+   r-   r   r   rn   r   r   r   r#   r   z=Cannot handle batch sizes > 1 if no padding token is defined.rD   )r   rF   z will not detect padding tokens in `inputs_embeds`. Results may be unexpected if using padding tokens in conjunction with `inputs_embeds.`r   
regressionsingle_label_classificationmulti_label_classificationr   )rW   r   r   r  r   r   rf   rO   r   r1   int32r   argmaxrb   rc   r(   r5   problem_typer  rF   r3   r9   r   squeezer   rt   r   r   rm   rl   r   )r'   r   r+   rm   r   r   r   rn   r   r   r-   rQ   transformer_outputsrl   r   
batch_sizesequence_lengthlast_non_pad_tokennon_pad_masktoken_indicespooled_logitsr   loss_fctoutputs                           r)   r4   z$OPTForSequenceClassification.forward  s   * &1%<k$++B]B]"jj+)%'/!5# ) 

 ,A.M* *3//"1*='J*7*=*=bq*A'J;;##+
a\]];;##+!#"%)A)AAEEfmmUZU`U`aL!LL)<V]]Z_ZeZefM"/,">!F!Fr!J!#>>**+ ,Z Z
 u||Jv}}MOaab{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#M$9$9$;V^^=MND#M6:D))-JJ+- 2 22t GUWY))-II,.v6#%(;AB(??F)-)9TGf$EvE/ /??-;;*55
 	
r*   c                 B    | j                   j                  j                  S r   r   r   s    r)   r   z1OPTForSequenceClassification.get_input_embeddings  r   r*   c                 :    || j                   j                  _        y r   r   r   s     r)   r   z1OPTForSequenceClassification.set_input_embeddings  r   r*   r   )r5   r6   r7   r   r&   r   r1   r:   r   r
   r   r   r   r4   r   r   r;   r<   s   @r)   r  r    s   y   .237(,26*.!%)-,0#'04[
##d*[
 ))D0[
 	[

 ((4/[
   4'[
 $;[
  $;[
 #Tk[
 D[[
 &&-[
 
1	1[
 [
z/0r*   r  c                   J    e Zd Zdef fdZe	 	 	 	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  de	dz  dej                  dz  dej                  dz  d	ej                  dz  d
e
dz  de
dz  de
dz  de
dz  dej                  dz  deez  fd       Zd Zd Z xZS )OPTForQuestionAnsweringrW   c                     t         |   |       t        |      | _        t	        j
                  |j                  d      | _        | j                          y r"   )	r%   r&   r   r   r   rg   r   
qa_outputsr   r   s     r)   r&   z OPTForQuestionAnswering.__init__  s@     f%
))F$>$>B 	r*   Nr   r+   rm   r   start_positionsend_positionsr   rn   r   r   r-   rp   c                    |
|
n| j                   j                  }
| j                  ||||||||	|
	      }|d   }| j                  |      }|j	                  dd      \  }}|j                  d      j                         }|j                  d      j                         }d}||t        |j                               dkD  r|j                  d      }t        |j                               dkD  r|j                  d      }|j                  d      }|j                  d|      j                  |j                        }|j                  d|      j                  |j                        }t        |      } |||      } |||      }||z   dz  }|
s||f|dd z   }||f|z   S |S t        ||||j                  |j                  	      S )
a  
        Example:

        ```python
        >>> from transformers import AutoTokenizer, OPTForQuestionAnswering
        >>> import torch

        >>> torch.manual_seed(4)  # doctest: +IGNORE_RESULT
        >>> tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")

        >>> # note: we are loading a OPTForQuestionAnswering from the hub here,
        >>> # so the head will be randomly initialized, hence the predictions will be random
        >>> model = OPTForQuestionAnswering.from_pretrained("facebook/opt-350m")

        >>> question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"

        >>> inputs = tokenizer(question, text, return_tensors="pt")
        >>> with torch.no_grad():
        ...     outputs = model(**inputs)

        >>> answer_start_index = outputs.start_logits.argmax()
        >>> answer_end_index = outputs.end_logits.argmax()

        >>> answer_offset = len(tokenizer(question)[0])

        >>> predict_answer_tokens = inputs.input_ids[
        ...     0, answer_offset + answer_start_index : answer_offset + answer_end_index + 1
        ... ]
        >>> predicted = tokenizer.decode(predict_answer_tokens)
        >>> predicted
        ' a nice puppet'
        ```Nr  r   r   rD   r/   )ignore_indexr#   )r   start_logits
end_logitsrl   r   )rW   r   r   r  splitr  rP   lenrs   clamprO   r   r   r   rl   r   )r'   r   r+   rm   r   r  r  r   rn   r   r   r-   rQ   r  rl   r   r!  r"  
total_lossignored_indexr  
start_lossend_lossr  s                           r)   r4   zOPTForQuestionAnswering.forward&  s   ` &1%<k$++B]B]"jj+)%'/!5# ) 

 ,A./#)<<r<#: j#++B/::<''+668

&=+D?'')*Q."1"9"9""==%%'(1, - 5 5b 9(--a0M-33A}EHHWO)//=ADDV]]SM']CH!,@J
M:H$x/14J"J/2Eab2IIF/9/EZMF*Q6Q+%!-;;*55
 	
r*   c                 B    | j                   j                  j                  S r   r   r   s    r)   r   z,OPTForQuestionAnswering.get_input_embeddings  r   r*   c                 :    || j                   j                  _        y r   r   r   s     r)   r   z,OPTForQuestionAnswering.set_input_embeddings  r   r*   )NNNNNNNNNNN)r5   r6   r7   r   r&   r   r1   r:   r   r
   r   r   r   r4   r   r   r;   r<   s   @r)   r  r    s1   y   .237(,263715!%)-,0#'04^
##d*^
 ))D0^
 	^

 ((4/^
 ))D0^
 ''$.^
 $;^
  $;^
 #Tk^
 D[^
 &&-^
 
-	-^
 ^
@/0r*   r  )r   r   r   r  r  )rr   )9r8   collections.abcr   r1   r   torch.nnr   r   r   activationsr	   cache_utilsr
   r   
generationr   masking_utilsr   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   configuration_optr   
get_loggerr5   rb   r   r   Moduler   floatrT   rV   r   r   r   r   r   r  r  __all__r   r*   r)   <module>r=     s    $   A A ! . ) / B 9  G & R R ( 
		H	%;BLL ;H %II%<<% 
% <<	%
 LL4'% % %.Z)299 Z)z^0 ^B 	" 	" 	"T
# T
n 8
! 8
 8
v_
' _
D l0#5 l0l0^ n00 n0 n0br*   