
    qij                        d Z ddlZddlZddlmZ ddlmZmZmZ ddl	m
Z ddlmZmZ ddlmZ dd	lmZmZmZ dd
lmZ ddlmZmZ ddlmZ  ej8                  e      Zd Zd Z d"dZ! G d dejD                        Z#d Z$ G d dejD                        Z%e G d de             Z&e G d de&             Z' ed       G d de&e             Z( ed       G d d e&             Z)g d!Z*y)#zPyTorch CTRL model.    N)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )initialization)CacheDynamicCache)GenerationMixin)BaseModelOutputWithPastCausalLMOutputWithPastSequenceClassifierOutput)PreTrainedModel)auto_docstringlogging   )
CTRLConfigc                 P    dt        j                  dd|dz  z  |z        z  }| |z  S )Nr   i'     )torchpow)posid_model_sizeangle_ratess       X/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/ctrl/modeling_ctrl.py
angle_defnr   %   s/    eiiQ!V'DEEK    c                    t        t        j                  | t        j                        j	                  |      j                  d      t        j                  |t        j                        j	                  |      j                  d      |      }t        j                  |d d dd df         }t        j                  |d d dd df         }t        j                  ||gd      }|S )Ndtyper   r   r   dim)	r   r   arangeint64to	unsqueezesincoscat)positionr   r!   
angle_radssinescosinespos_encodings          r   positional_encodingr1   *   s    XU[[144U;EEaH\588?II!LJ IIjADqD)*Eii
1add7+,G99eW-26Lr   c           	         t        j                  | |j                  dddd            }|j                  d   }|t	        j
                  |      z  }|6|j                  d      |j                  d      }	}|||	|z
  |	d |	f   dz  z  }|||z   }t        j                  |d      }
t        j                  |
|      }||
fS )	Nr   r   r   r   r"   g     r#   )r   matmulpermuteshapenpsqrtsizesoftmax)qkvmaskattention_mask	matmul_qkdkscaled_attention_logitsndnsattention_weightsoutputs               r   scaled_dot_product_attentionrG   9   s    Q		!Q1 56I	
B'"''"+5(--b13J3O3OPR3SB4R"crc(9#:T#AA!"9N"J&=2F\\+Q/F$$$r   c                   8     e Zd Zd fd	Zd Z	 	 	 	 	 ddZ xZS )MultiHeadAttentionc                 ^   t         |           || _        || _        || _        t        || j                  z        | _        t        j                  ||      | _	        t        j                  ||      | _
        t        j                  ||      | _        t        j                  ||      | _        y N)super__init__	num_headsr   	layer_idxintdepthr   LinearWqWkWvdense)selfr   rN   rO   	__class__s       r   rM   zMultiHeadAttention.__init__P   s    "("67
))L,7))L,7))L,7YY|\:
r   c                 x    |j                  |d| j                  | j                        }|j                  g d      S )Nr"   r   r   r   r   )reshaperN   rQ   r5   )rW   x
batch_sizes      r   split_into_headsz#MultiHeadAttention.split_into_heads^   s-    IIj"dnndjjAyy&&r   c
                    |j                   d   }
| j                  |      }| j                  |      }| j                  |      }| j	                  ||
      }| j	                  ||
      }| j	                  ||
      }|#|j                  ||| j                  d|	i      \  }}t        |||||      }|d   j                  g d      }|d   }|j                  |
d| j                        }| j                  |      }||fS )Nr   cache_positionrZ   r   r"   )r6   rS   rT   rU   r^   updaterO   rG   r5   r[   r   rV   )rW   r=   r<   r;   r>   
layer_pastr?   	use_cacheoutput_attentionsr`   r]   rF   scaled_attentionattnoriginal_size_attentions                  r   forwardzMultiHeadAttention.forwardb   s     WWQZ
GGAJGGAJGGAJ!!!Z0!!!Z0!!!Z0!$$Q4>><Ln;]^DAq-aAt^L!!9,,\:ay"2":"::r4K\K\"]34t|r   rK   NNFFN)__name__
__module____qualname__rM   r^   rh   __classcell__rX   s   @r   rI   rI   O   s#    ;' r   rI   c                     t        j                  t        j                  | |      t        j                         t        j                  ||             S rK   )r   
SequentialrR   ReLU)r   dffs     r   point_wise_feed_forward_networkrs      s2    ==<5rwwy"))CQ]B^__r   c                   2     e Zd Zd fd	Z	 	 	 	 	 ddZ xZS )EncoderLayerc                 B   t         |           t        |||      | _        t	        ||      | _        t        j                  |d      | _        t        j                  |d      | _	        t        j                  |      | _        t        j                  |      | _        y )NrO   gư>eps)rL   rM   rI   multi_head_attentionrs   ffnr   	LayerNorm
layernorm1
layernorm2Dropoutdropout1dropout2)rW   r   rN   rr   raterO   rX   s         r   rM   zEncoderLayer.__init__   sr    $6|YZc$d!2<E,,|>,,|>

4(

4(r   c                    | j                  |      }| j                  |||||||||	      }	|	d   }
| j                  |
      }
||
z   }| j                  |      }| j	                  |      }| j                  |      }||z   }|f|	dd  z   }|S )Nrb   r?   rc   rd   r`   r   r   )r}   rz   r   r~   r{   r   )rW   r\   r>   rb   r?   rc   rd   r`   normedattn_outputsattn_outputout1out2
ffn_outputoutputss                  r   rh   zEncoderLayer.forward   s     #00!)/) 1 

 #1ommK0;t$XXd^
]]:.
j 'L,,r   )g?Nri   )rj   rk   rl   rM   rh   rm   rn   s   @r   ru   ru      s    
)   r   ru   c                   .     e Zd ZU eed<   dZ fdZ xZS )CTRLPreTrainedModelconfigtransformerc                     t         |   |       t        |t              rXt	        j
                  |j                  t        |j                  j                  |j                  t        j                               y y rK   )rL   _init_weights
isinstance	CTRLModelinitcopy_r0   r1   r   n_positionsr   r   float)rW   modulerX   s     r   r   z!CTRLPreTrainedModel._init_weights   sX    f%fi(JJ##%89R9RTZTgTginitit%u )r   )rj   rk   rl   r   __annotations__base_model_prefixr   rm   rn   s   @r   r   r      s    % r   r   c                   ^    e Zd Z fdZd Zd Ze	 	 	 	 	 	 	 	 	 	 	 ddej                  dz  de	dz  dej                  dz  dej                  dz  d	ej                  dz  d
ej                  dz  dedz  dedz  dedz  dedz  dej                  dz  deej                     ez  fd       Z xZS )r   c                    t         |   |       |j                  | _        |j                  | _        t        j                  |j                  |j                        | _	        t        j                  |j                        | _        t        j                  t        |j                        D cg c]:  }t        |j                  |j                   |j"                  |j$                  |      < c}      | _        t        j(                  |j                  |j*                        | _        | j/                  dt1        |j2                  | j                  t4        j6                        d       | j9                          y c c}w )Nrw   rx   r0   F)
persistent)rL   rM   n_embdr   n_layer
num_layersr   	Embedding
vocab_sizewr   
embd_pdropdropout
ModuleListrangeru   n_headrr   resid_pdrophr|   layer_norm_epsilon	layernormregister_bufferr1   r   r   r   	post_init)rW   r   r   rX   s      r   rM   zCTRLModel.__init__   s    "MM ..f//?zz&"3"34 v~~. V]]FMM6::vGYGYefg
 fmm9R9RS/0B0BDDUDUW\WbWbcpu 	 	

 	s   ,?E9c                     | j                   S rK   r   )rW   s    r   get_input_embeddingszCTRLModel.get_input_embeddings   s    vvr   c                     || _         y rK   r   )rW   new_embeddingss     r   set_input_embeddingszCTRLModel.set_input_embeddings   s	    r   N	input_idspast_key_valuesr?   token_type_idsposition_idsinputs_embedsrc   rd   output_hidden_statesreturn_dictr`   returnc           
      V   ||n| j                   j                  }||n| j                   j                  }|	|	n| j                   j                  }	|
|
n| j                   j                  }
||t        d      |G| j                  ||       |j                         }|j                  d|d         }|j                  d   }n0|#|j                         dd }|j                  d   }nt        d      ||j                  n|j                  }|r|t        | j                         }||j                         nd}|>t        j                  ||d   |z   t        j                  |      }|j!                  d      }||dk  rt        d      |j                  |d      }|j!                  d	      j!                  d
      }|j#                  | j$                        }d|z
  t        j&                  | j$                        j(                  z  }|I|j                  d|d         }| j+                  |      }|t-        j.                  | j0                        z  }nd}|| j+                  |      }|d   }t        j2                  t        j4                  ||z   ||z         d	      j#                  |      }|t-        j.                  | j0                        z  }| j6                  j#                  |      | _        | j6                  |ddf   }||z   |z   }| j9                  |      }|	rdnd}|rdnd}t;        | j<                        D ]-  \  }}|	r||fz   } ||||||||      }|d   }|s%||d	   fz  }/ | j?                  |      }|	r||fz   }|
stA        d ||||fD              S tC        ||||      S )a  
        Example:

        ```python
        >>> from transformers import AutoTokenizer, CTRLModel
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("Salesforce/ctrl")
        >>> model = CTRLModel.from_pretrained("Salesforce/ctrl")

        >>> # CTRL was trained with control codes as the first token
        >>> inputs = tokenizer("Opinion My dog is cute", return_tensors="pt")
        >>> assert inputs["input_ids"][0, 0].item() in tokenizer.control_codes.values()

        >>> outputs = model(**inputs)

        >>> last_hidden_states = outputs.last_hidden_state
        >>> list(last_hidden_states.shape)
        [1, 5, 1280]
        ```NzDYou cannot specify both input_ids and inputs_embeds at the same timer"   r   z5You have to specify either input_ids or inputs_embeds)r   )r!   devicez$batch_size has to be defined and > 0r   r   r    g      ? r   c              3   &   K   | ]	  }||  y wrK   r   ).0r=   s     r   	<genexpr>z$CTRLModel.forward.<locals>.<genexpr>h  s      bcbos   )last_hidden_stater   hidden_states
attentions)"r   rd   rc   r   use_return_dict
ValueError%warn_if_padding_and_no_attention_maskr9   viewr6   r   r
   get_seq_lengthr   r%   longr(   r'   r!   finfominr   r7   r8   r   triuonesr0   r   	enumerater   r   tupler   )rW   r   r   r?   r   r   r   rc   rd   r   r   r`   kwargsinput_shaper]   r   past_lengthtoken_type_embedsseq_lenr>   
pos_embedsr   all_hidden_statesall_attentionsr   r   r   s                              r   rh   zCTRLModel.forward   s   H 2C1N-TXT_T_TqTq!*!6IDKK<Q<Q	$8$D $++JjJj 	 &1%<k$++B]B] ]%>cdd"66y.Q#..*K!r;r?;I"+J&',,.s3K&,,Q/JTUU%.%:!!@T@T0*$++>O:I:Uo446[\ <<[_{5RZ_ZdZdmstL'11!4L %Q !GHH+00R@N ,55a8BB1EN ,..TZZ.@N!N2ekk$**6M6Q6QQN%+00[_EN $~ 6):):!;; !  FF9-Mb/zz%**W{%:Gk<QRTUVYYZ`a!2!233 !--008&&|Q7
%
25FF]3"6BD0ddff% 	0DAq#$58H$H!*-#"3-G $AJM 71:-/	0  }5 1]4D D )?<M~^   '+++%	
 	
r   NNNNNNNNNNN)rj   rk   rl   rM   r   r   r   r   
LongTensorr	   FloatTensorboolTensorr   r   rh   rm   rn   s   @r   r   r      s0   0   .2(,37260426!%)-,0#'.2L
##d*L
 L
 ))D0	L

 ((4/L
 &&-L
 ((4/L
 $;L
  $;L
 #TkL
 D[L
 t+L
 
u||	6	6L
 L
r   r   z
    The CTRL Model transformer with a language modeling head on top (linear layer with weights tied to the input
    embeddings).
    )custom_introc                       e Zd ZddiZ fdZe	 	 	 	 	 	 	 	 	 	 	 	 	 ddej                  dz  dedz  dej                  dz  dej                  dz  d	ej                  dz  d
ej                  dz  dej                  dz  de
dz  de
dz  de
dz  de
dz  dej                  dz  deej                  z  deej                     ez  fd       Z	 d fd	Z xZS )CTRLLMHeadModelzlm_head.weightztransformer.w.weightc                     t         |   |       t        |      | _        t	        j
                  |j                  |j                  d      | _        | j                          y )NTbias)
rL   rM   r   r   r   rR   r   r   lm_headr   rW   r   rX   s     r   rM   zCTRLLMHeadModel.__init__}  sG     $V,yy0A0AM 	r   Nr   r   r?   r   r   r   labelsrc   rd   r   r   r`   logits_to_keepr   c                    ||n| j                   j                  }| j                  ||||||||	|
||      }|d   }t        |t              rt        | d      n|}| j                  |dd|ddf         }d}|* | j                  ||fd| j                   j                  i|}|s|f|dd z   }||f|z   S |S t        |||j                  |j                  |j                        S )ag  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
            are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`

        Example:

        ```python
        >>> import torch
        >>> from transformers import AutoTokenizer, CTRLLMHeadModel

        >>> tokenizer = AutoTokenizer.from_pretrained("Salesforce/ctrl")
        >>> model = CTRLLMHeadModel.from_pretrained("Salesforce/ctrl")

        >>> # CTRL was trained with control codes as the first token
        >>> inputs = tokenizer("Wikipedia The llama is", return_tensors="pt")
        >>> assert inputs["input_ids"][0, 0].item() in tokenizer.control_codes.values()

        >>> sequence_ids = model.generate(inputs["input_ids"])
        >>> sequences = tokenizer.batch_decode(sequence_ids)
        >>> sequences
        ['Wikipedia The llama is a member of the family Bovidae. It is native to the Andes of Peru,']

        >>> outputs = model(**inputs, labels=inputs["input_ids"])
        >>> round(outputs.loss.item(), 2)
        9.21

        >>> list(outputs.logits.shape)
        [1, 5, 246534]
        ```N)
r   r?   r   r   r   rc   rd   r   r   r`   r   r   r   )losslogitsr   r   r   )r   r   r   r   rP   slicer   loss_functionr   r   r   r   r   )rW   r   r   r?   r   r   r   r   rc   rd   r   r   r`   r   r   transformer_outputsr   slice_indicesr   r   rF   s                        r   rh   zCTRLLMHeadModel.forward  s2   b &1%<k$++B]B]"..+))%'/!5#) / 
 ,A.8B>SV8W~ot4]kmA}a,?@A%4%%  ;;11 	D Y!4QR!88F)-)9TGf$EvE%/??-;;*55
 	
r   c                 T    t        |   |f|||d|}|j                  dd        |S )N)r   rc   is_first_iterationr   )rL   prepare_inputs_for_generationpop)rW   r   r   rc   r   r   model_inputsrX   s          r   r   z-CTRLLMHeadModel.prepare_inputs_for_generation  sH    
 w<
+1	

 
 	)40r   )NNNNNNNNNNNNr   )NNF)rj   rk   rl   _tied_weights_keysrM   r   r   r   r	   r   r   r   rP   r   r   rh   r   rm   rn   s   @r   r   r   t  sv    +,BC  .2(,37260426*.!%)-,0#'.2-.X
##d*X
 X
 ))D0	X

 ((4/X
 &&-X
 ((4/X
   4'X
 $;X
  $;X
 #TkX
 D[X
 t+X
 ell*X
  
u||	5	5!X
 X
v SX r   r   a  
    The CTRL Model transformer with a sequence classification head on top (linear layer).
    [`CTRLForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-2) do. Since it does classification on the last token, it requires to know the position of the last
    token. If a `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in
    each row. If no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot
    guess the padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last
    value in each row of the batch).
    c                   R    e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 	 ddej                  dz  dedz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  d	ej                  dz  d
e	dz  de	dz  de	dz  de	dz  de
ej                     ez  fd       Z xZS )CTRLForSequenceClassificationc                     t         |   |       |j                  | _        t        |      | _        t        j                  |j                  | j                  d      | _        | j                          y )NFr   )
rL   rM   
num_labelsr   r   r   rR   r   
classifierr   r   s     r   rM   z&CTRLForSequenceClassification.__init__  sR      ++$V,))FMM4??O 	r   Nr   r   r?   r   r   r   r   rc   rd   r   r   r   c                    ||n| j                   j                  }| j                  ||||||||	|
|
      }|d   }| j                  |      }||j                  dd \  }}n|j                  dd \  }}| j                   j
                  |dk7  rt        d      | j                   j
                  d}n||| j                   j
                  k7  j                  |j                  t        j                        }t        j                  |j                  d   |j                  t        j                        }||z  j                  d      }n.d}t        j                  | j                  j                    d	       |t        j                  ||j                  
      |f   }d}|| j                   j"                  | j$                  dk(  rd| j                   _        nl| j$                  dkD  rL|j&                  t        j(                  k(  s|j&                  t        j*                  k(  rd| j                   _        nd| j                   _        | j                   j"                  dk(  rIt-               }| j$                  dk(  r& ||j/                         |j/                               }n |||      }n| j                   j"                  dk(  r=t1               } ||j3                  d| j$                        |j3                  d            }n,| j                   j"                  dk(  rt5               } |||      }|s|f|dd z   }||f|z   S |S t7        |||j8                  |j:                        S )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

        Example of single-label classification:

        ```python
        >>> import torch
        >>> from transformers import AutoTokenizer, CTRLForSequenceClassification

        >>> tokenizer = AutoTokenizer.from_pretrained("Salesforce/ctrl")
        >>> model = CTRLForSequenceClassification.from_pretrained("Salesforce/ctrl")

        >>> # CTRL was trained with control codes as the first token
        >>> inputs = tokenizer("Opinion My dog is cute", return_tensors="pt")
        >>> assert inputs["input_ids"][0, 0].item() in tokenizer.control_codes.values()

        >>> with torch.no_grad():
        ...     logits = model(**inputs).logits

        >>> predicted_class_id = logits.argmax().item()
        >>> model.config.id2label[predicted_class_id]
        'LABEL_0'
        ```

        ```python
        >>> import torch

        >>> torch.manual_seed(42)  # doctest: +IGNORE_RESULT
        >>> # To train a model on `num_labels` classes, you can pass `num_labels=num_labels` to `.from_pretrained(...)`
        >>> num_labels = len(model.config.id2label)
        >>> model = CTRLForSequenceClassification.from_pretrained("Salesforce/ctrl", num_labels=num_labels)

        >>> labels = torch.tensor(1)
        >>> loss = model(**inputs, labels=labels).loss
        >>> round(loss.item(), 2)
        0.93
        ```

        Example of multi-label classification:

        ```python
        >>> import torch
        >>> from transformers import AutoTokenizer, CTRLForSequenceClassification

        >>> tokenizer = AutoTokenizer.from_pretrained("Salesforce/ctrl")
        >>> model = CTRLForSequenceClassification.from_pretrained(
        ...     "Salesforce/ctrl", problem_type="multi_label_classification"
        ... )

        >>> # CTRL was trained with control codes as the first token
        >>> inputs = tokenizer("Opinion My dog is cute", return_tensors="pt")
        >>> assert inputs["input_ids"][0, 0].item() in tokenizer.control_codes.values()

        >>> with torch.no_grad():
        ...     logits = model(**inputs).logits

        >>> predicted_class_id = logits.argmax().item()
        >>> model.config.id2label[predicted_class_id]
        'LABEL_0'
        ```

        ```python
        >>> # To train a model on `num_labels` classes, you can pass `num_labels=num_labels` to `.from_pretrained(...)`
        >>> num_labels = len(model.config.id2label)
        >>> model = CTRLForSequenceClassification.from_pretrained("Salesforce/ctrl", num_labels=num_labels)

        >>> num_labels = len(model.config.id2label)
        >>> labels = torch.nn.functional.one_hot(torch.tensor([predicted_class_id]), num_classes=num_labels).to(
        ...     torch.float
        ... )
        >>> loss = model(**inputs, labels=labels).loss
        >>> loss.backward()  # doctest: +IGNORE_RESULT
        ```N)	r   r?   r   r   r   rc   rd   r   r   r   r   r   z=Cannot handle batch sizes > 1 if no padding token is defined.r"   )r   r!   z will not detect padding tokens in `inputs_embeds`. Results may be unexpected if using padding tokens in conjunction with `inputs_embeds.`)r   
regressionsingle_label_classificationmulti_label_classification)r   r   r   r   )r   r   r   r   r6   pad_token_idr   r'   r   r   int32r%   argmaxloggerwarning_oncerX   rj   problem_typer   r!   r   rP   r   squeezer   r   r   r   r   r   )rW   r   r   r?   r   r   r   r   rc   rd   r   r   r   r   r   r   r]   sequence_lengthlast_non_pad_tokennon_pad_masktoken_indicespooled_logitsr   loss_fctrF   s                            r   rh   z%CTRLForSequenceClassification.forward  s   z &1%<k$++B]B]"..+))%'/!5# / 
 ,A./ *3//"1*='J*7*=*=bq*A'J;;##+
a\]];;##+!#"%)A)AAEEfmmUZU`U`aL!LL)<V]]Z_ZeZefM"/,">!F!Fr!J!#>>**+ ,Z Z
 u||Jv}}MOaab{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#M$9$9$;V^^=MND#M6:D))-JJ+- 2 22t GUWY))-II,.v6#%(;AB(??F)-)9TGf$EvE' -;;*55	
 	
r   r   )rj   rk   rl   rM   r   r   r   r	   r   r   r   r   r   rh   rm   rn   s   @r   r   r     s(     .2(,37260426*.!%)-,0#'d
##d*d
 d
 ))D0	d

 ((4/d
 &&-d
 ((4/d
   4'd
 $;d
  $;d
 #Tkd
 D[d
 
u||	7	7d
 d
r   r   )r   r   r   r   rK   )+__doc__numpyr7   r   r   torch.nnr   r   r    r   r   cache_utilsr	   r
   
generationr   modeling_outputsr   r   r   modeling_utilsr   utilsr   r   configuration_ctrlr   
get_loggerrj   r  r   r1   rG   ModulerI   rs   ru   r   r   r   r   __all__r   r   r   <module>r     s       A A & . ) i i - + 
		H	%
%,1 1h`-299 -` 	/ 	 	 l
# l
 l
^ v)? vvr 
o
$7 o

o
d cr   