
    qi`                     6   d Z ddlZddlZddlmZ ddlZddlmZ ddlmZm	Z	m
Z
 ddlmZ ddlmZ dd	lmZ dd
lmZmZmZmZmZmZ ddlmZ ddlmZ ddlmZmZ ddl m!Z!  ejD                  e#      Z$g dZ%e ed       G d de                    Z& G d dejN                        Z( G d dejN                        Z) G d dejN                        Z* G d dejN                        Z+ G d dejN                        Z, G d d ejN                        Z- G d! d"ejN                        Z. G d# d$ejN                        Z/ G d% d&e      Z0 G d' d(ejN                        Z1 G d) d*ejN                        Z2 G d+ d,ejN                        Z3 G d- d.ejN                        Z4 G d/ d0ejN                        Z5e G d1 d2e             Z6e G d3 d4e6             Z7 ed5       G d6 d7e6             Z8e G d8 d9e6             Z9e G d: d;e6             Z:e G d< d=e6             Z;g d>Z<y)?zPyTorch CANINE model.    N)	dataclass)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )initialization)ACT2FN)GradientCheckpointingLayer)BaseModelOutputModelOutputMultipleChoiceModelOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)PreTrainedModel)apply_chunking_to_forward)auto_docstringlogging   )CanineConfig)   +   ;   =   I   a   g   q                           a  
    Output type of [`CanineModel`]. Based on [`~modeling_outputs.BaseModelOutputWithPooling`], but with slightly
    different `hidden_states` and `attentions`, as these also include the hidden states and attentions of the shallow
    Transformer encoders.
    )custom_introc                       e Zd ZU dZdZej                  dz  ed<   dZej                  dz  ed<   dZ	e
ej                     dz  ed<   dZe
ej                     dz  ed<   y)CanineModelOutputWithPoolinga  
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        Sequence of hidden-states at the output of the last layer of the model (i.e. the output of the final
        shallow Transformer encoder).
    pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
        Hidden-state of the first token of the sequence (classification token) at the last layer of the deep
        Transformer encoder, further processed by a Linear layer and a Tanh activation function. The Linear layer
        weights are trained from the next sentence prediction (classification) objective during pretraining.
    hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the input to each encoder + one for the output of each layer of each
        encoder) of shape `(batch_size, sequence_length, hidden_size)` and `(batch_size, sequence_length //
        config.downsampling_rate, hidden_size)`. Hidden-states of the model at the output of each layer plus the
        initial input to each Transformer encoder. The hidden states of the shallow encoders have length
        `sequence_length`, but the hidden states of the deep encoder have length `sequence_length` //
        `config.downsampling_rate`.
    attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of the 3 Transformer encoders of shape `(batch_size,
        num_heads, sequence_length, sequence_length)` and `(batch_size, num_heads, sequence_length //
        config.downsampling_rate, sequence_length // config.downsampling_rate)`. Attentions weights after the
        attention softmax, used to compute the weighted average in the self-attention heads.
    Nlast_hidden_statepooler_outputhidden_states
attentions)__name__
__module____qualname____doc__r+   torchFloatTensor__annotations__r,   r-   tupler.        \/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/canine/modeling_canine.pyr*   r*   0   sh    , 37u((4/6.2M5$$t+259M5**+d2926Je''(4/6r8   r*   c                        e Zd ZdZ fdZdedefdZdededefdZ	 	 	 	 dd	ej                  dz  d
ej                  dz  dej                  dz  dej                  dz  dej                  f
dZ xZS )CanineEmbeddingsz<Construct the character, position and token_type embeddings.c           	         t         |           || _        |j                  |j                  z  }t        |j                        D ]2  }d| }t        | |t        j                  |j                  |             4 t        j                  |j                  |j                        | _
        t        j                  |j                  |j                        | _        t        j                  |j                  |j                        | _        t        j                  |j                         | _        | j%                  dt'        j(                  |j*                        j-                  d      d       y )NHashBucketCodepointEmbedder_epsposition_idsr   F)
persistent)super__init__confighidden_sizenum_hash_functionsrangesetattrr   	Embeddingnum_hash_bucketschar_position_embeddingstype_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsDropouthidden_dropout_probdropoutregister_bufferr3   arangemax_position_embeddingsexpand)selfrF   shard_embedding_sizeiname	__class__s        r9   rE   zCanineEmbeddings.__init__X   s      &11V5N5NNv001 	]A1!5DD$V-D-DFZ [\	] )+V5L5LfN`N`(a%%'\\&2H2H&J\J\%]"f&8&8f>S>STzz&"<"<= 	ELL)G)GHOOPWXej 	 	
r8   
num_hashesnum_bucketsc                     |t        t              kD  rt        dt        t                     t        d| }g }|D ]  }|dz   |z  |z  }|j                  |         |S )a  
        Converts ids to hash bucket ids via multiple hashing.

        Args:
            input_ids: The codepoints or other IDs to be hashed.
            num_hashes: The number of hash functions to use.
            num_buckets: The number of hash buckets (i.e. embeddings in each table).

        Returns:
            A list of tensors, each of which is the hash bucket IDs from one hash function.
        z`num_hashes` must be <= Nr   )len_PRIMES
ValueErrorappend)rY   	input_idsr^   r_   primesresult_tensorsprimehasheds           r9   _hash_bucket_tensorsz%CanineEmbeddings._hash_bucket_tensorsm   sp     G$7G~FGG*% 	*E 1}-<F!!&)	* r8   embedding_sizec                    ||z  dk7  rt        d| d| d      | j                  |||      }g }t        |      D ]-  \  }}d| }	 t        | |	      |      }
|j	                  |
       / t        j                  |d      S )	zDConverts IDs (e.g. codepoints) into embeddings via multiple hashing.r   zExpected `embedding_size` (z) % `num_hashes` (z) == 0)r^   r_   r=   rB   dim)rc   rj   	enumerategetattrrd   r3   cat)rY   re   rk   r^   r_   hash_bucket_tensorsembedding_shardsr[   hash_bucket_idsr\   shard_embeddingss              r9   _embed_hash_bucketsz$CanineEmbeddings._embed_hash_buckets   s    J&!+:>:JJ\]g\hhnopp"77	jfq7r"+,?"@ 	6A1!5D2wtT2?C##$45	6
 yy)r22r8   Nre   token_type_idsr@   inputs_embedsreturnc                 B   ||j                         }n|j                         d d }|d   }|| j                  d d d |f   }|:t        j                  |t        j                  | j                  j
                        }|P| j                  || j                  j                  | j                  j                  | j                  j                        }| j                  |      }||z   }| j                  |      }	||	z  }| j                  |      }| j                  |      }|S )NrB   r   dtypedevice)sizer@   r3   zeroslongr}   rv   rF   rG   rH   rL   rO   rM   rP   rT   )
rY   re   rw   r@   rx   input_shape
seq_lengthrO   
embeddingsposition_embeddingss
             r9   forwardzCanineEmbeddings.forward   s     #..*K',,.s3K ^
,,Q^<L!"[[EJJtO`O`OgOghN  444;;22DKK4R4RTXT_T_TpTpM !% : :> J"%::
";;LI))
^^J/
\\*-
r8   )NNNN)r/   r0   r1   r2   rE   intrj   rv   r3   
LongTensorr4   r   __classcell__r]   s   @r9   r;   r;   U   s    F
*# C .3S 3c 3`c 3  .2260426!##d*! ((4/! &&-	!
 ((4/! 
		!r8   r;   c                   Z     e Zd ZdZ fdZdej                  dej                  fdZ xZS )CharactersToMoleculeszeConvert character sequence to initial molecule sequence (i.e. downsample) using strided convolutions.c                 >   t         |           t        j                  |j                  |j                  |j
                  |j
                        | _        t        |j                     | _	        t        j                  |j                  |j                        | _
        y )Nin_channelsout_channelskernel_sizestrider>   )rD   rE   r   Conv1drG   downsampling_rateconvr
   
hidden_act
activationrP   rQ   rY   rF   r]   s     r9   rE   zCharactersToMolecules.__init__   st    II**++00++	
	 !!2!23f&8&8f>S>STr8   char_encodingry   c                 2   |d d ddd d f   }t        j                  |dd      }| j                  |      }t        j                  |dd      }| j                  |      }|d d ddd d f   }t        j                  ||gd      }| j                  |      }|S )Nr   r      rB   rm   )r3   	transposer   r   rq   rP   )rY   r   cls_encodingdownsampleddownsampled_truncatedresults         r9   r   zCharactersToMolecules.forward   s    $Q!QY/ q!<ii.ook1a8ook2 !,AqtQJ 7 L*?@aH'r8   	r/   r0   r1   r2   rE   r3   Tensorr   r   r   s   @r9   r   r      s'    oUU\\ ell r8   r   c                   |     e Zd ZdZ fdZ	 ddej                  dej                  dz  dej                  fdZ xZS )	ConvProjectionz
    Project representations from hidden_size*2 back to hidden_size across a window of w = config.upsampling_kernel_size
    characters.
    c                    t         |           || _        t        j                  |j
                  dz  |j
                  |j                  d      | _        t        |j                     | _
        t        j                  |j
                  |j                        | _        t        j                  |j                        | _        y )Nr   r   r   r>   )rD   rE   rF   r   r   rG   upsampling_kernel_sizer   r
   r   r   rP   rQ   rR   rS   rT   r   s     r9   rE   zConvProjection.__init__   s    II**Q.++55	
	 !!2!23f&8&8f>S>STzz&"<"<=r8   Ninputsfinal_seq_char_positionsry   c                    t        j                  |dd      }| j                  j                  dz
  }|dz  }||z
  }t	        j
                  ||fd      }| j                   ||            }t        j                  |dd      }| j                  |      }| j                  |      }| j                  |      }|}|t        d      |}	|	S )Nr   r   r   z,CanineForMaskedLM is currently not supported)r3   r   rF   r   r   ConstantPad1dr   r   rP   rT   NotImplementedError)
rY   r   r   	pad_totalpad_begpad_endpadr   final_char_seq	query_seqs
             r9   r   zConvProjection.forward   s     A.
 KK66:	q.g%1153v;'A.('f%#/
 &&TUU&Ir8   Nr   r   s   @r9   r   r      sD    
>  9="" #(,,"5" 
	"r8   r   c                        e Zd Z fdZ	 	 d	dej
                  dej
                  dej                  dz  dedz  deej
                  ej
                  dz  f   f
dZ	 xZ
S )
CanineSelfAttentionc                    t         |           |j                  |j                  z  dk7  r2t	        |d      s&t        d|j                   d|j                   d      |j                  | _        t        |j                  |j                  z        | _        | j                  | j                  z  | _        t        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        t        j                  |j                        | _        y )Nr   rk   zThe hidden size (z6) is not a multiple of the number of attention heads ())rD   rE   rG   num_attention_headshasattrrc   r   attention_head_sizeall_head_sizer   LinearquerykeyvaluerR   attention_probs_dropout_probrT   r   s     r9   rE   zCanineSelfAttention.__init__  s    : ::a?PVXhHi#F$6$6#7 8 445Q8 
 $*#=#= #&v'9'9F<V<V'V#W !558P8PPYYv1143E3EF
99V//1C1CDYYv1143E3EF
zz&"E"EFr8   Nfrom_tensor	to_tensorattention_maskoutput_attentionsry   c                 ~   |j                   \  }}}| j                  |      j                  |d| j                  | j                        j                  dd      }| j                  |      j                  |d| j                  | j                        j                  dd      }	| j                  |      j                  |d| j                  | j                        j                  dd      }
t        j                  |
|j                  dd            }|t        j                  | j                        z  }|h|j                  dk(  rTt        j                  |d      }d|j                         z
  t        j                  |j                         j"                  z  }||z   }t$        j&                  j)                  |d      }| j+                  |      }t        j                  ||	      }|j-                  dddd      j/                         }|j1                         d d | j2                  fz   } |j                  | }|r||f}|S |f}|S )	NrB   r   r   r   rm   g      ?r   )shaper   viewr   r   r   r   r   r3   matmulmathsqrtndim	unsqueezefloatfinfor|   minr   
functionalsoftmaxrT   permute
contiguousr~   r   )rY   r   r   r   r   
batch_sizer   _	key_layervalue_layerquery_layerattention_scoresattention_probscontext_layernew_context_layer_shapeoutputss                   r9   r   zCanineSelfAttention.forward-  s    %0$5$5!
J HHYT*b$":":D<T<TUYq!_ 	 JJy!T*b$":":D<T<TUYq!_ 	 JJ{#T*b$":":D<T<TUYq!_ 	 !<<Y5H5HR5PQ+dii8P8P.QQ%""a'!&Q!G #&(<(<(>">%++N^NdNdBeBiBi!i/.@ --//0@b/I ,,7_kB%--aAq9DDF"/"4"4"6s";t?Q?Q>S"S***,CD6G=/2 O\M]r8   NF)r/   r0   r1   rE   r3   r   r4   boolr6   r   r   r   s   @r9   r   r     st    G, 48).;\\; <<; ))D0	;
  $;; 
u||U\\D00	1;r8   r   c                        e Zd Z fdZdeej                     dej                  deej                  ej                  f   fdZ xZS )CanineSelfOutputc                 (   t         |           t        j                  |j                  |j                        | _        t        j                  |j                  |j                        | _        t        j                  |j                        | _
        y Nr>   )rD   rE   r   r   rG   denserP   rQ   rR   rS   rT   r   s     r9   rE   zCanineSelfOutput.__init__l  s`    YYv1163E3EF
f&8&8f>S>STzz&"<"<=r8   r-   input_tensorry   c                 r    | j                  |      }| j                  |      }| j                  ||z         }|S r   r   rT   rP   rY   r-   r   s      r9   r   zCanineSelfOutput.forwardr  s9     

=1]3}|'CDr8   	r/   r0   r1   rE   r6   r3   r4   r   r   r   s   @r9   r   r   k  sL    >"5#4#45EJEVEV	u  %"3"33	4r8   r   c                        e Zd ZdZ	 	 	 	 	 	 	 ddedededededef fdZ	 	 dd
eej                     dej                  d	z  ded	z  deej                  ej                  d	z  f   fdZ
 xZS )CanineAttentionav  
    Additional arguments related to local attention:

        - **local** (`bool`, *optional*, defaults to `False`) -- Whether to apply local attention.
        - **always_attend_to_first_position** (`bool`, *optional*, defaults to `False`) -- Should all blocks be able to
          attend
        to the `to_tensor`'s first position (e.g. a [CLS] position)? - **first_position_attends_to_all** (`bool`,
        *optional*, defaults to `False`) -- Should the *from_tensor*'s first position be able to attend to all
        positions within the *from_tensor*? - **attend_from_chunk_width** (`int`, *optional*, defaults to 128) -- The
        width of each block-wise chunk in `from_tensor`. - **attend_from_chunk_stride** (`int`, *optional*, defaults to
        128) -- The number of elements to skip when moving to the next block in `from_tensor`. -
        **attend_to_chunk_width** (`int`, *optional*, defaults to 128) -- The width of each block-wise chunk in
        *to_tensor*. - **attend_to_chunk_stride** (`int`, *optional*, defaults to 128) -- The number of elements to
        skip when moving to the next block in `to_tensor`.
    always_attend_to_first_positionfirst_position_attends_to_allattend_from_chunk_widthattend_from_chunk_strideattend_to_chunk_widthattend_to_chunk_stridec	                    t         	|           t        |      | _        t	        |      | _        || _        ||k  rt        d      ||k  rt        d      || _        || _	        || _
        || _        || _        || _        y )Nze`attend_from_chunk_width` < `attend_from_chunk_stride` would cause sequence positions to get skipped.z``attend_to_chunk_width` < `attend_to_chunk_stride`would cause sequence positions to get skipped.)rD   rE   r   rY   r   outputlocalrc   r   r   r   r   r   r   
rY   rF   r   r   r   r   r   r   r   r]   s
            r9   rE   zCanineAttention.__init__  s     	'/	&v. 
"%==w  !#99r  0O,-J*'>$(@%%:"&<#r8   Nr-   r   r   ry   c                    | j                   s| j                  ||||      }|d   }n|j                  d   x}}|x}}	g }
| j                  r|
j	                  d       d}nd}t        ||| j                        D ].  }t        ||| j                  z         }|
j	                  ||f       0 g }| j                  r|j	                  d|f       t        d|| j                        D ].  }t        ||| j                  z         }|j	                  ||f       0 t        |
      t        |      k7  rt        d|
 d|
 d      g }g }t        |
|      D ]  \  \  }}\  }}|d d ||d d f   }|	d d ||d d f   }|d d ||||f   }| j                  rN|d d ||ddf   }t        j                   ||gd      }|	d d ddd d f   }t        j                   ||gd      }| j                  ||||      }|j	                  |d          |s|j	                  |d           t        j                   |d      }| j#                  ||      }|f}| j                   s
|dd  z   }|S |t%              z   }|S )	Nr   r   )r   r   z/Expected to have same number of `from_chunks` (z) and `to_chunks` (z). Check strides.r   rm   )r   rY   r   r   rd   rI   r   r   r   r   r   ra   rc   zipr   r3   rq   r   r6   )rY   r-   r   r   self_outputsattention_outputfrom_seq_lengthto_seq_lengthr   r   from_chunks
from_startchunk_start	chunk_end	to_chunksattention_output_chunksattention_probs_chunksfrom_endto_startto_endfrom_tensor_chunkto_tensor_chunkattention_mask_chunkcls_attention_maskcls_positionattention_outputs_chunkr   s                              r9   r   zCanineAttention.forward  s    zz99]M>SdeL+A.;.A.A!.DDOm&33K) K11""6* 

$Z$B_B_` =t?[?[1[\	""K#;<=
 I11  !]!34$Qt7R7RS ;{T=W=W/WX	  +y!9:; ;3y>1 Ek] S$$/=0AC  ')#%'">A+y>Y N:&X(:6$/:h3F0I$J!"+Ax,A"B (6aH9LhW]o6]'^$77)7:h;NPQRSPS8S)T&+0996HJ^5_ef+g(#,Q!QY#7L&+ii0OUV&WO*.))%8LN_+' (../Fq/IJ$*112I!2LM%N(  %yy)@aH;;'7G#%zzQR 00G  &< ==Gr8   FFF   r  r  r  r   )r/   r0   r1   r2   r   r   rE   r6   r3   r4   r   r   r   s   @r9   r   r   {  s    & 05.3'*(+%(&)= *.	=
 (,= "%= #&=  #= !$=F 48).	GU../G ))D0G  $;	G
 
u  %"3"3d"::	;Gr8   r   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )CanineIntermediatec                    t         |           t        j                  |j                  |j
                        | _        t        |j                  t              rt        |j                     | _        y |j                  | _        y r   )rD   rE   r   r   rG   intermediate_sizer   
isinstancer   strr
   intermediate_act_fnr   s     r9   rE   zCanineIntermediate.__init__  s]    YYv1163K3KL
f''-'-f.?.?'@D$'-'8'8D$r8   r-   ry   c                 J    | j                  |      }| j                  |      }|S r   )r   r  rY   r-   s     r9   r   zCanineIntermediate.forward  s&    

=100?r8   )r/   r0   r1   rE   r3   r4   r   r   r   s   @r9   r  r    s'    9U%6%6 5;L;L r8   r  c                   t     e Zd Z fdZdeej                     dej                  dej                  fdZ xZS )CanineOutputc                 (   t         |           t        j                  |j                  |j
                        | _        t        j                  |j
                  |j                        | _        t        j                  |j                        | _        y r   )rD   rE   r   r   r  rG   r   rP   rQ   rR   rS   rT   r   s     r9   rE   zCanineOutput.__init__  s`    YYv779K9KL
f&8&8f>S>STzz&"<"<=r8   r-   r   ry   c                 r    | j                  |      }| j                  |      }| j                  ||z         }|S r   r   r   s      r9   r   zCanineOutput.forward  s7    

=1]3}|'CDr8   r   r   s   @r9   r  r    s:    >U5+<+<%= UM^M^ chctct r8   r  c                        e Zd Z fdZ	 	 d	deej                     dej                  dz  dedz  deej                  ej                  dz  f   fdZd Z	 xZ
S )
CanineLayerc	           
          t         	|           |j                  | _        d| _        t	        ||||||||      | _        t        |      | _        t        |      | _	        y Nr   )
rD   rE   chunk_size_feed_forwardseq_len_dimr   	attentionr  intermediater  r   r   s
            r9   rE   zCanineLayer.__init__  se     	'-'E'E$(+)#$!"	
 /v6"6*r8   Nr-   r   r   ry   c                     | j                  |||      }|d   }|dd  }t        | j                  | j                  | j                  |      }|f|z   }|S )N)r   r   r   )r   r   feed_forward_chunkr  r  )rY   r-   r   r   self_attention_outputsr   r   layer_outputs           r9   r   zCanineLayer.forward/  ss     "&/ "0 "

 2!4(,0##T%A%A4CSCSUe
  /G+r8   c                 L    | j                  |      }| j                  ||      }|S r   )r!  r   )rY   r   intermediate_outputr%  s       r9   r#  zCanineLayer.feed_forward_chunkE  s,    "//0@A{{#68HIr8   r   )r/   r0   r1   rE   r6   r3   r4   r   r   r#  r   r   s   @r9   r  r    su    +< 48).	U../ ))D0  $;	
 
u  %"3"3d"::	;,r8   r  c                        e Zd Z	 	 	 	 	 	 	 d
 fd	Z	 	 	 	 ddeej                     dej                  dz  dedz  dedz  dedz  deez  fd	Z	 xZ
S )CanineEncoderc	                     t         
|           || _        t        j                  t        |j                        D 	cg c]  }	t        ||||||||       c}	      | _        d| _	        y c c}	w r   )
rD   rE   rF   r   
ModuleListrI   num_hidden_layersr  layergradient_checkpointing)rY   rF   r   r   r   r   r   r   r   r   r]   s             r9   rE   zCanineEncoder.__init__L  sx     	]] v778  31+,)*	

 ',#s   A*Nr-   r   r   output_hidden_statesreturn_dictry   c                     |rdnd }|rdnd }t        | j                        D ](  \  }}	|r||fz   } |	|||      }
|
d   }|s ||
d   fz   }* |r||fz   }|st        d |||fD              S t        |||      S )Nr7   r   r   c              3   &   K   | ]	  }||  y wr   r7   .0vs     r9   	<genexpr>z(CanineEncoder.forward.<locals>.<genexpr>  s     mq_`_lm   )r+   r-   r.   )ro   r-  r6   r   )rY   r-   r   r   r/  r0  all_hidden_statesall_self_attentionsr[   layer_modulelayer_outputss              r9   r   zCanineEncoder.forwardj  s     #7BD$5b4(4 	POA|#$58H$H!(HYZM)!,M &9]1=M<O&O#	P   1]4D Dm]4EGZ$[mmm++*
 	
r8   r  )NFFT)r/   r0   r1   rE   r6   r3   r4   r   r   r   r   r   s   @r9   r)  r)  K  s     (-&+ #!$!",B 48).,1#'
U../
 ))D0
  $;	

 #Tk
 D[
 
	 
r8   r)  c                   \     e Zd Z fdZdeej                     dej                  fdZ xZS )CaninePoolerc                     t         |           t        j                  |j                  |j                        | _        t        j                         | _        y r   )rD   rE   r   r   rG   r   Tanhr   r   s     r9   rE   zCaninePooler.__init__  s9    YYv1163E3EF
'')r8   r-   ry   c                 \    |d d df   }| j                  |      }| j                  |      }|S )Nr   )r   r   )rY   r-   first_token_tensorpooled_outputs       r9   r   zCaninePooler.forward  s6     +1a40

#566r8   r   r   s   @r9   r=  r=    s,    $
U5+<+<%= %BSBS r8   r=  c                   \     e Zd Z fdZdeej                     dej                  fdZ xZS )CaninePredictionHeadTransformc                 h   t         |           t        j                  |j                  |j                        | _        t        |j                  t              rt        |j                     | _
        n|j                  | _
        t        j                  |j                  |j                        | _        y r   )rD   rE   r   r   rG   r   r  r   r  r
   transform_act_fnrP   rQ   r   s     r9   rE   z&CaninePredictionHeadTransform.__init__  s{    YYv1163E3EF
f''-$*6+<+<$=D!$*$5$5D!f&8&8f>S>STr8   r-   ry   c                 l    | j                  |      }| j                  |      }| j                  |      }|S r   )r   rF  rP   r  s     r9   r   z%CaninePredictionHeadTransform.forward  s4    

=1--m<}5r8   r   r   s   @r9   rD  rD    s-    UU5+<+<%= %BSBS r8   rD  c                   \     e Zd Z fdZdeej                     dej                  fdZ xZS )CanineLMPredictionHeadc                    t         |           t        |      | _        t	        j
                  |j                  |j                  d      | _        t	        j                  t        j                  |j                              | _        y )NT)bias)rD   rE   rD  	transformr   r   rG   
vocab_sizedecoder	Parameterr3   r   rK  r   s     r9   rE   zCanineLMPredictionHead.__init__  s[    6v> yy!3!3V5F5FTRLLV->->!?@	r8   r-   ry   c                 J    | j                  |      }| j                  |      }|S r   )rL  rN  r  s     r9   r   zCanineLMPredictionHead.forward  s$    }5]3r8   r   r   s   @r9   rI  rI    s-    AU5+<+<%= %BSBS r8   rI  c                   b     e Zd Z fdZdeej                     deej                     fdZ xZS )CanineOnlyMLMHeadc                 B    t         |           t        |      | _        y r   )rD   rE   rI  predictionsr   s     r9   rE   zCanineOnlyMLMHead.__init__  s    1&9r8   sequence_outputry   c                 (    | j                  |      }|S r   )rT  )rY   rU  prediction_scoress      r9   r   zCanineOnlyMLMHead.forward  s     !,,_=  r8   )	r/   r0   r1   rE   r6   r3   r   r   r   r   s   @r9   rR  rR    s1    :!u||,! 
u||	!r8   rR  c                   2     e Zd ZU eed<   dZdZ fdZ xZS )CaninePreTrainedModelrF   canineTc                     t         |   |       t        |t              rZt	        j
                  |j                  t        j                  |j                  j                  d         j                  d             y y )NrB   rA   )rD   _init_weightsr  r;   initcopy_r@   r3   rV   r   rX   )rY   moduler]   s     r9   r\  z#CaninePreTrainedModel._init_weights  s[    f%f./JJv**ELL9L9L9R9RSU9V,W,^,^_f,gh 0r8   )	r/   r0   r1   r   r5   base_model_prefixsupports_gradient_checkpointingr\  r   r   s   @r9   rY  rY    s!     &*#i ir8   rY  c                   h    e Zd Zd fd	Zd Zdej                  defdZdej                  dedej                  fd	Z	e
	 	 	 	 	 	 	 	 ddej                  d
z  dej                  d
z  dej                  d
z  dej                  d
z  dej                  d
z  ded
z  ded
z  ded
z  deez  fd       Z xZS )CanineModelc           
         t         |   |       || _        t        j                  |      }d|_        t        |      | _        t        |ddd|j                  |j                  |j                  |j                        | _
        t        |      | _        t        |      | _        t        |      | _        t        |      | _        |rt#        |      nd| _        | j'                          y)zv
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        r   TF)r   r   r   r   r   r   r   N)rD   rE   rF   copydeepcopyr,  r;   char_embeddingsr)  local_transformer_strideinitial_char_encoderr   chars_to_moleculesencoderr   
projectionfinal_char_encoderr=  pooler	post_init)rY   rF   add_pooling_layershallow_configr]   s       r9   rE   zCanineModel.__init__  s    
 	 v.+,(/7$1,1*/$*$C$C%+%D%D"("A"A#)#B#B	%
! #8"?$V,(0"/"?.?l6*T 	r8   c                    |j                   d   |j                   d   }}|j                   d   }t        j                  ||d|f      j                         }t        j                  ||dft        j
                  |j                        }||z  }|S )aP  
        Create 3D attention mask from a 2D tensor mask.

        Args:
            from_tensor: 2D or 3D Tensor of shape [batch_size, from_seq_length, ...].
            to_mask: int32 Tensor of shape [batch_size, to_seq_length].

        Returns:
            float Tensor of shape [batch_size, from_seq_length, to_seq_length].
        r   r   )r~   r|   r}   )r   r3   reshaper   onesfloat32r}   )rY   r   to_maskr   r   r   broadcast_onesmasks           r9   )_create_3d_attention_mask_from_input_maskz5CanineModel._create_3d_attention_mask_from_input_mask  s     '2&7&7&:K<M<Ma<PO
a(--*a)GHNNP
 *oq)IQVQ^Q^gnguguv 'r8   char_attention_maskr   c                     |j                   \  }}t        j                  ||d|f      }t        j                  j	                  ||      |j                               }t        j                  |d      }|S )z[Downsample 2D character attention mask to 2D molecule attention mask using MaxPool1d layer.r   )r   r   rB   rm   )r   r3   rs  r   	MaxPool1dr   squeeze)rY   rz  r   r   char_seq_lenpoolable_char_maskpooled_molecule_maskmolecule_attention_masks           r9   _downsample_attention_maskz&CanineModel._downsample_attention_mask  sw     $7#<#< 
L"]]+>QP\@]^  %xx11>OXi1j$$& 

 #(--0D""M&&r8   	moleculeschar_seq_lengthry   c                    | j                   j                  }|ddddddf   }t        j                  ||d      }|ddddddf   }||z  }t        j                  |||z   d      }t        j                  ||gd      S )zDRepeats molecules to make them the same length as the char sequence.Nr   r   )repeatsrn   rB   rm   )rF   r   r3   repeat_interleaverq   )	rY   r  r  ratemolecules_without_extra_clsrepeatedlast_moleculeremainder_lengthremainder_repeateds	            r9   _repeat_moleculeszCanineModel._repeat_molecules'  s     {{,,&/12q&9#**+FPTZ\] "!RS!),*T1"44$t+	
 yy($67R@@r8   Nre   r   rw   r@   rx   r   r/  r0  c	                    ||n| j                   j                  }||n| j                   j                  }|rdnd }
|rdnd }||n| j                   j                  }||t	        d      |#| j                  ||       |j                         }n!||j                         d d }nt	        d      |\  }}||j                  n|j                  }|t        j                  ||f|      }|&t        j                  |t        j                  |      }| j                  ||      }| j                  || j                   j                        }| j                  |||j                  d   f      }| j!                  ||||      }| j#                  ||n||      }| j%                  ||||	      }|j&                  }| j)                  |      }| j+                  |||||
      }|d   }| j,                  | j-                  |      nd }| j/                  ||d         }t        j0                  ||gd      }| j3                  |      }| j5                  ||||	      }|j&                  }|r2|r|j6                  n|d   }|
|j6                  z   |z   |j6                  z   }
|r2|r|j8                  n|d   } ||j8                  z   | z   |j8                  z   }|s||f}!|!t;        d |
|fD              z  }!|!S t=        |||
|      S )Nr7   zDYou cannot specify both input_ids and inputs_embeds at the same timerB   z5You have to specify either input_ids or inputs_embeds)r}   r{   )r   )re   r@   rw   rx   )r   r   r/  )r   r   r/  r0  r   )r  rm   r   c              3   &   K   | ]	  }||  y wr   r7   r3  s     r9   r6  z&CanineModel.forward.<locals>.<genexpr>  s     a!STS`Aar7  )r+   r,   r-   r.   )rF   r   r/  use_return_dictrc   %warn_if_padding_and_no_attention_maskr~   r}   r3   rt  r   r   get_extended_attention_maskr  r   r   rg  ry  ri  r+   rj  rk  rn  r  rq   rl  rm  r-   r.   r6   r*   )"rY   re   r   rw   r@   rx   r   r/  r0  kwargsr8  r9  r   r   r   r}   extended_attention_maskr   extended_molecule_attention_maskinput_char_embeddingsrz  init_chars_encoder_outputsinput_char_encodinginit_molecule_encodingencoder_outputsmolecule_sequence_outputrB  repeated_moleculesconcatrU  final_chars_encoder_outputsdeep_encoder_hidden_statesdeep_encoder_self_attentionsr   s"                                     r9   r   zCanineModel.forward@  s    2C1N-TXT_T_TqTq$8$D $++JjJj 	 #7BD$5b4%0%<k$++B]B] ]%>cdd"66y.Q#..*K&',,.s3KTUU!,
J%.%:!!@T@T!"ZZ*j)A6RN!"[[EJJvVN 150P0PQ_al0m"&"A"Adkk.K.K #B #
 :>9Y9Y#j2I2O2OPR2S%T:
(
 !% 4 4%)'	 !5 !
 #LL".IM>
 &*%>%>!./!5	 &? &
" 9JJ  "&!8!89L!M ,,";/!5# ' 
 $31#5 AEAX$<=^b "334L^ijl^m3n /1CD"M //&1 '+&=&=2/!5	 '> '
# 6GGJU)F)F[jkl[m&!,::;,- .;;<  IT?+E+EZijlZm(#,778./ .889   %}5Fea(9;N'OaaaFM+-'+*	
 	
r8   )T)NNNNNNNN)r/   r0   r1   rE   ry  r3   r   r   r  r  r   r   r4   r   r6   r*   r   r   r   s   @r9   rc  rc    s)    D6'ell '_b '"A5<< A# ARWR^R^ A2  .237260426)-,0#'T
##d*T
 ))D0T
 ((4/	T

 &&-T
 ((4/T
  $;T
 #TkT
 D[T
 
-	-T
 T
r8   rc  z
    CANINE Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
    output) e.g. for GLUE tasks.
    c                        e Zd Z fdZe	 	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  d	edz  d
edz  dedz  de	e
z  fd       Z xZS )CanineForSequenceClassificationc                 ,   t         |   |       |j                  | _        t        |      | _        t        j                  |j                        | _        t        j                  |j                  |j                        | _        | j                          y r   rD   rE   
num_labelsrc  rZ  r   rR   rS   rT   r   rG   
classifierro  r   s     r9   rE   z(CanineForSequenceClassification.__init__  i      ++!&)zz&"<"<=))F$6$68I8IJ 	r8   Nre   r   rw   r@   rx   labelsr   r/  r0  ry   c
           
      >   |	|	n| j                   j                  }	| j                  ||||||||	      }|d   }| j                  |      }| j	                  |      }d}|| j                   j
                  | j                  dk(  rd| j                   _        nl| j                  dkD  rL|j                  t        j                  k(  s|j                  t        j                  k(  rd| j                   _        nd| j                   _        | j                   j
                  dk(  rIt               }| j                  dk(  r& ||j                         |j                               }n |||      }n| j                   j
                  dk(  r=t               } ||j                  d| j                        |j                  d            }n,| j                   j
                  dk(  rt               } |||      }|	s|f|dd z   }||f|z   S |S t!        |||j"                  |j$                  	      S )
a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Nr   rw   r@   rx   r   r/  r0  r   
regressionsingle_label_classificationmulti_label_classificationrB   r   losslogitsr-   r.   )rF   r  rZ  rT   r  problem_typer  r|   r3   r   r   r   r}  r   r   r   r   r-   r.   )rY   re   r   rw   r@   rx   r  r   r/  r0  r  r   rB  r  r  loss_fctr   s                    r9   r   z'CanineForSequenceClassification.forward  s   ( &1%<k$++B]B]++))%'/!5#  	
  
]3/{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#FNN$4fnn6FGD#FF3D))-JJ+-B @&++b/R))-II,./Y,F)-)9TGf$EvE'!//))	
 	
r8   	NNNNNNNNN)r/   r0   r1   rE   r   r3   r   r4   r   r6   r   r   r   r   s   @r9   r  r    s    	  .237260426*.)-,0#'D
##d*D
 ))D0D
 ((4/	D

 &&-D
 ((4/D
   4'D
  $;D
 #TkD
 D[D
 
)	)D
 D
r8   r  c                        e Zd Z fdZe	 	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  d	edz  d
edz  dedz  de	e
z  fd       Z xZS )CanineForMultipleChoicec                     t         |   |       t        |      | _        t	        j
                  |j                        | _        t	        j                  |j                  d      | _
        | j                          y r  )rD   rE   rc  rZ  r   rR   rS   rT   r   rG   r  ro  r   s     r9   rE   z CanineForMultipleChoice.__init__4  sV     !&)zz&"<"<=))F$6$6: 	r8   Nre   r   rw   r@   rx   r  r   r/  r0  ry   c
           
      J   |	|	n| j                   j                  }	||j                  d   n|j                  d   }|!|j                  d|j	                  d            nd}|!|j                  d|j	                  d            nd}|!|j                  d|j	                  d            nd}|!|j                  d|j	                  d            nd}|1|j                  d|j	                  d      |j	                  d            nd}| j                  ||||||||	      }|d   }| j                  |      }| j                  |      }|j                  d|      }d}|t               } |||      }|	s|f|dd z   }||f|z   S |S t        |||j                  |j                        S )a[  
        input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
            model's internal embedding lookup matrix.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
            `input_ids` above)
        Nr   rB   r   r  r   r  )rF   r  r   r   r~   rZ  rT   r  r   r   r-   r.   )rY   re   r   rw   r@   rx   r  r   r/  r0  r  num_choicesr   rB  r  reshaped_logitsr  r  r   s                      r9   r   zCanineForMultipleChoice.forward>  s   X &1%<k$++B]B],5,Aiooa(}GZGZ[\G]>G>SINN2y~~b'9:Y]	M[Mg,,R1D1DR1HImqM[Mg,,R1D1DR1HImqGSG_|((\->->r-BCei ( r=#5#5b#9=;M;Mb;QR 	 ++))%'/!5#  	
  
]3/ ++b+6')HOV4D%''!"+5F)-)9TGf$EvE("!//))	
 	
r8   r  )r/   r0   r1   rE   r   r3   r   r4   r   r6   r   r   r   r   s   @r9   r  r  2  s      .237260426*.)-,0#'W
##d*W
 ))D0W
 ((4/	W

 &&-W
 ((4/W
   4'W
  $;W
 #TkW
 D[W
 
*	*W
 W
r8   r  c                        e Zd Z fdZe	 	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  d	edz  d
edz  dedz  de	e
z  fd       Z xZS )CanineForTokenClassificationc                 ,   t         |   |       |j                  | _        t        |      | _        t        j                  |j                        | _        t        j                  |j                  |j                        | _        | j                          y r   r  r   s     r9   rE   z%CanineForTokenClassification.__init__  r  r8   Nre   r   rw   r@   rx   r  r   r/  r0  ry   c
           
         |	|	n| j                   j                  }	| j                  ||||||||	      }|d   }| j                  |      }| j	                  |      }d}|<t               } ||j                  d| j                        |j                  d            }|	s|f|dd z   }||f|z   S |S t        |||j                  |j                        S )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, CanineForTokenClassification
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("google/canine-s")
        >>> model = CanineForTokenClassification.from_pretrained("google/canine-s")

        >>> inputs = tokenizer(
        ...     "HuggingFace is a company based in Paris and New York", add_special_tokens=False, return_tensors="pt"
        ... )

        >>> with torch.no_grad():
        ...     logits = model(**inputs).logits

        >>> predicted_token_class_ids = logits.argmax(-1)

        >>> # Note that tokens are classified rather then input words which means that
        >>> # there might be more predicted token classes than words.
        >>> # Multiple token classes might account for the same word
        >>> predicted_tokens_classes = [model.config.id2label[t.item()] for t in predicted_token_class_ids[0]]
        >>> predicted_tokens_classes  # doctest: +SKIP
        ```

        ```python
        >>> labels = predicted_token_class_ids
        >>> loss = model(**inputs, labels=labels).loss
        >>> round(loss.item(), 2)  # doctest: +SKIP
        ```Nr  r   rB   r   r  )rF   r  rZ  rT   r  r   r   r  r   r-   r.   )rY   re   r   rw   r@   rx   r  r   r/  r0  r  r   rU  r  r  r  r   s                    r9   r   z$CanineForTokenClassification.forward  s    ` &1%<k$++B]B]++))%'/!5#  	
 "!*,,71')HFKKDOO<fkk"oNDY,F)-)9TGf$EvE$!//))	
 	
r8   r  )r/   r0   r1   rE   r   r3   r   r4   r   r6   r   r   r   r   s   @r9   r  r    s    	  .237260426*.)-,0#'O
##d*O
 ))D0O
 ((4/	O

 &&-O
 ((4/O
   4'O
  $;O
 #TkO
 D[O
 
&	&O
 O
r8   r  c                   @    e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  d	ej                  dz  d
edz  dedz  dedz  de	e
z  fd       Z xZS )CanineForQuestionAnsweringc                     t         |   |       |j                  | _        t        |      | _        t        j                  |j                  |j                        | _        | j                          y r   )
rD   rE   r  rc  rZ  r   r   rG   
qa_outputsro  r   s     r9   rE   z#CanineForQuestionAnswering.__init__  sS      ++!&)))F$6$68I8IJ 	r8   Nre   r   rw   r@   rx   start_positionsend_positionsr   r/  r0  ry   c           
         |
|
n| j                   j                  }
| j                  |||||||	|
      }|d   }| j                  |      }|j	                  dd      \  }}|j                  d      }|j                  d      }d }||t        |j                               dkD  r|j                  d      }t        |j                               dkD  r|j                  d      }|j                  d      }|j                  d|       |j                  d|       t        |      } |||      } |||      }||z   dz  }|
s||f|dd  z   }||f|z   S |S t        ||||j                  |j                        S )	Nr  r   r   rB   rm   )ignore_indexr   )r  start_logits
end_logitsr-   r.   )rF   r  rZ  r  splitr}  ra   r~   clamp_r   r   r-   r.   )rY   re   r   rw   r@   rx   r  r  r   r/  r0  r  r   rU  r  r  r  
total_lossignored_indexr  
start_lossend_lossr   s                          r9   r   z"CanineForQuestionAnswering.forward  s    &1%<k$++B]B]++))%'/!5#  	
 "!*1#)<<r<#: j#++B/''+

&=+D?'')*Q."1"9"9""==%%'(1, - 5 5b 9(--a0M""1m4  M2']CH!,@J
M:H$x/14J"J/'!"+=F/9/EZMF*Q6Q+%!!//))
 	
r8   )
NNNNNNNNNN)r/   r0   r1   rE   r   r3   r   r4   r   r6   r   r   r   r   s   @r9   r  r    s     .2372604263715)-,0#'=
##d*=
 ))D0=
 ((4/	=

 &&-=
 ((4/=
 ))D0=
 ''$.=
  $;=
 #Tk=
 D[=
 
-	-=
 =
r8   r  )r  r  r  r  r  rc  rY  )=r2   re  r   dataclassesr   r3   r   torch.nnr   r   r    r	   r]  activationsr
   modeling_layersr   modeling_outputsr   r   r   r   r   r   modeling_utilsr   pytorch_utilsr   utilsr   r   configuration_caniner   
get_loggerr/   loggerrb   r*   Moduler;   r   r   r   r   r   r  r  r  r)  r=  rD  rI  rR  rY  rc  r  r  r  r  __all__r7   r8   r9   <module>r     sB      !   A A & ! 9  . 6 , . 
		H	% U 7; 7 7:^ryy ^B)BII )X5RYY 5pN")) Nbryy  xbii xv 299 5, 5p=
BII =
@299 BII "RYY &
!		 
! iO i i }
' }
 }
@ Q
&; Q
Q
h c
3 c
 c
L \
#8 \
 \
~ I
!6 I
 I
Xr8   