
    qi                        d Z ddlZddlZddlmZ ddlmZmZmZ ddlm	Z
 ddlmZ ddlmZmZmZmZmZmZmZ dd	lmZ dd
lmZmZ ddlmZ ddlmZmZmZm Z m!Z!m"Z"  ejF                  e$      Z% G d dejL                        Z' G d dejL                        Z( G d dejL                        Z) G d dejL                        Z* G d dejL                        Z+ G d dejL                        Z, G d dejL                        Z- G d dejL                        Z. G d dejL                        Z/e G d  d!e             Z0e G d" d#e0             Z1e G d$ d%e0             Z2 G d& d'ejL                        Z3 ed()       G d* d+e0             Z4e G d, d-e0             Z5e G d. d/e0             Z6 G d0 d1ejL                        Z7e G d2 d3e0             Z8d6d4Z9g d5Z:y)7zPyTorch I-BERT model.    N)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )initialization)gelu))BaseModelOutputWithPastAndCrossAttentions,BaseModelOutputWithPoolingAndCrossAttentionsMaskedLMOutputMultipleChoiceModelOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)PreTrainedModel)auto_docstringlogging   )IBertConfig)IntGELUIntLayerNorm
IntSoftmaxQuantActQuantEmbeddingQuantLinearc                   2     e Zd ZdZ fdZ	 ddZd Z xZS )IBertEmbeddingszV
    Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
    c                 z   t         |           |j                  | _        d| _        d| _        d| _        d| _        d| _        t        |j                  |j                  |j                  | j                  | j                        | _        t        |j                  |j                  | j                  | j                        | _        | j                  dt!        j"                  |j$                        j'                  d      d	
       |j                  | _        t        |j$                  |j                  | j(                  | j                  | j                        | _        t-        | j                  | j                        | _        t-        | j                  | j                        | _        t3        |j                  |j4                  | j                  | j                  |j6                        | _        t-        | j
                  | j                        | _        t=        j>                  |j@                        | _!        y )N             )padding_idx
weight_bit
quant_mode)r$   r%   position_idsr   F)
persistentr%   eps
output_bitr%   force_dequant)"super__init__r%   embedding_bitembedding_act_bitact_bitln_input_bitln_output_bitr   
vocab_sizehidden_sizepad_token_idword_embeddingstype_vocab_sizetoken_type_embeddingsregister_buffertorcharangemax_position_embeddingsexpandr#   position_embeddingsr   embeddings_act1embeddings_act2r   layer_norm_epsr.   	LayerNormoutput_activationr   Dropouthidden_dropout_probdropoutselfconfig	__class__s     Z/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/ibert/modeling_ibert.pyr0   zIBertEmbeddings.__init__2   s    ++!#-++)) 
 &4""F$6$64CUCUbfbqbq&
"
 	ELL)G)GHOOPWXej 	 	

 "..#1**(())$
   ((>(>4??['(>(>4??[%%%)) ..
 "*$,,4??!Szz&"<"<=    c                    |D|1t        || j                  |      j                  |j                        }n| j	                  |      }||j                         }n|j                         d d }|:t        j                  |t        j                  | j                  j                        }|| j                  |      \  }}nd }| j                  |      \  }}	| j                  ||||	      \  }
}| j                  |      \  }}| j                  |
|||      \  }
}| j                  |
|      \  }
}| j                  |
      }
| j!                  |
|      \  }
}|
|fS )Nr(   dtypedeviceidentityidentity_scaling_factor)"create_position_ids_from_input_idsr#   torS   &create_position_ids_from_inputs_embedssizer=   zeroslongr&   r9   r;   rB   rA   rE   rI   rF   )rK   	input_idstoken_type_idsr&   inputs_embedspast_key_values_lengthinput_shapeinputs_embeds_scaling_factorr;   $token_type_embeddings_scaling_factor
embeddingsembeddings_scaling_factorrA   "position_embeddings_scaling_factors                 rN   forwardzIBertEmbeddings.forwardc   s    $At//1G "Y%%&   $JJ=Y #..*K',,.s3K!"[[EJJtO`O`OgOghN :>:N:Ny:Y7M7+/(FJF`F`aoFpCC040D0D(*$H	 1E 1
-
- CGBZBZ[gBh??040D0D%($F	 1E 1
-
- 15zKd0e-
-\\*-
040F0FzSl0m-
-444rO   c                    |j                         dd }|d   }t        j                  | j                  dz   || j                  z   dz   t        j                  |j
                        }|j                  d      j                  |      S )z
        We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.

        Args:
            inputs_embeds: torch.Tensor

        Returns: torch.Tensor
        Nr(   r   rQ   r   )rZ   r=   r>   r#   r\   rS   	unsqueezer@   )rK   r_   ra   sequence_lengthr&   s        rN   rY   z6IBertEmbeddings.create_position_ids_from_inputs_embeds   s     $((*3B/%a.||q /D4D4D"Dq"HPUPZPZcpcwcw
 %%a(//<<rO   )NNNNr   )__name__
__module____qualname____doc__r0   rg   rY   __classcell__rM   s   @rN   r   r   -   s     />d rs,5\=rO   r   c                   *     e Zd Z fdZ	 	 ddZ xZS )IBertSelfAttentionc           	      F   t         |           |j                  |j                  z  dk7  r2t	        |d      s&t        d|j                   d|j                   d      |j                  | _        d| _        d| _        d| _	        |j                  | _        t        |j                  |j                  z        | _        | j                  | j                  z  | _        t        |j                  | j                  d| j                  | j                  | j                  d	      | _        t        |j                  | j                  d| j                  | j                  | j                  d	      | _        t        |j                  | j                  d| j                  | j                  | j                  d	      | _        t#        | j                  | j                  
      | _        t#        | j                  | j                  
      | _        t#        | j                  | j                  
      | _        t#        | j                  | j                  
      | _        t-        j.                  |j0                        | _        t5        | j                  | j                  |j6                        | _        y )Nr   embedding_sizezThe hidden size (z6) is not a multiple of the number of attention heads ()r   r"   Tbiasr$   bias_bitr%   per_channelr*   r%   r.   )r/   r0   r7   num_attention_headshasattr
ValueErrorr%   r$   rx   r3   intattention_head_sizeall_head_sizer   querykeyvaluer   query_activationkey_activationvalue_activationrF   r   rG   attention_probs_dropout_probrI   r   r.   softmaxrJ   s     rN   r0   zIBertSelfAttention.__init__   s    : ::a?PVXhHi#F$6$6#7 8 445Q8  !++#)#=#= #&v'9'9F<V<V'V#W !558P8PP !]]

 ]]
 !]]

 !)$// R&t||P ($// R!)$,,4??!Szz&"E"EF!$,,4??Z`ZnZnorO   c                    | j                  ||      \  }}| j                  ||      \  }}| j                  ||      \  }	}
| j                  ||      \  }}| j	                  ||      \  }}| j                  |	|
      \  }}|j                  \  }}}|j                  |d| j                  | j                        j                  dd      }|j                  |d| j                  | j                        j                  dd      }|j                  |d| j                  | j                        j                  dd      }t        j                  ||j                  dd            }t        j                  | j                        }||z  }| j                  r	||z  |z  }nd }|||z   }| j!                  ||      \  }}| j#                  |      }t        j                  ||      }|||z  }nd }|j%                  dddd      j'                         }|j)                         d d | j*                  fz   } |j                  | }| j-                  ||      \  }}|r||fn|f}|r||fn|f}||fS )Nr(   r      r   r   )r   r   r   r   r   r   shapeviewr{   r   	transposer=   matmulmathsqrtr%   r   rI   permute
contiguousrZ   r   rF   )rK   hidden_stateshidden_states_scaling_factorattention_maskoutput_attentionsmixed_query_layer mixed_query_layer_scaling_factormixed_key_layermixed_key_layer_scaling_factormixed_value_layer mixed_value_layer_scaling_factorquery_layerquery_layer_scaling_factor	key_layerkey_layer_scaling_factorvalue_layervalue_layer_scaling_factor
batch_size
seq_length_attention_scoresscaleattention_scores_scaling_factorattention_probsattention_probs_scaling_factorcontext_layercontext_layer_scaling_factornew_context_layer_shapeoutputsoutput_scaling_factors                                 rN   rg   zIBertSelfAttention.forward   s    ?CjjXt>u;;:>((=Rn:o77>BjjXt>u;; 372G2G?3
// /3.A.A/Sq.r+	+262G2G?3
//
 %2$7$7!
J!&&z2t7O7OQUQiQijttq
 NN:r43K3KTMeMefppqrtuv	!&&z2t7O7OQUQiQijttq

 !<<Y5H5HR5PQ		$223+e3??.HKc.cfk.k+.2+%/.@ ;?,,=;
77 ,,7_kB)5+ILf+f(+/(%--aAq9DDF"/"4"4"6s";t?Q?Q>S"S***,CD 7;6L6L77
33 7H=/2mM] ! *+IJ.0 	 ---rO   NFrk   rl   rm   r0   rg   ro   rp   s   @rN   rr   rr      s    5pv K.rO   rr   c                   $     e Zd Z fdZd Z xZS )IBertSelfOutputc           	         t         |           |j                  | _        d| _        d| _        d| _        d| _        d| _        t        |j                  |j                  d| j                  | j
                  | j                  d      | _
        t        | j                  | j                        | _        t        |j                  |j                  | j                  | j                  |j                        | _        t        | j                  | j                        | _        t%        j&                  |j(                        | _        y Nr   r"   r!   Trv   r*   r+   )r/   r0   r%   r3   r$   rx   r4   r5   r   r7   denser   ln_input_actr   rD   r.   rE   rF   r   rG   rH   rI   rJ   s     rN   r0   zIBertSelfOutput.__init__*  s     ++ ]]

 %T%6%64??S%%%)) ..
 "*$,,4??!Szz&"<"<=rO   c                     | j                  ||      \  }}| j                  |      }| j                  ||||      \  }}| j                  ||      \  }}| j	                  ||      \  }}||fS NrT   r   rI   r   rE   rF   rK   r   r   input_tensorinput_tensor_scaling_factors        rN   rg   zIBertSelfOutput.forwardG      6:jjPl6m33]36:6G6G(!$?	 7H 7
33 7;nn]Tp6q336:6L6L77
33 :::rO   r   rp   s   @rN   r   r   )      >:;rO   r   c                   *     e Zd Z fdZ	 	 ddZ xZS )IBertAttentionc                     t         |           |j                  | _        t        |      | _        t        |      | _        y N)r/   r0   r%   rr   rK   r   outputrJ   s     rN   r0   zIBertAttention.__init__Y  s3     ++&v.	%f-rO   c                     | j                  ||||      \  }}| j                  |d   |d   ||      \  }}|f|dd  z   }	|f|dd  z   }
|	|
fS )Nr   r   )rK   r   )rK   r   r   r   r   self_outputsself_outputs_scaling_factorattention_outputattention_output_scaling_factorr   outputs_scaling_factors              rN   rg   zIBertAttention.forward_  s     59II(	5
11 =AKKO8;]Lh=
99 $%QR(88"A!CFabcbdFe!e...rO   r   r   rp   s   @rN   r   r   X  s    . /rO   r   c                   $     e Zd Z fdZd Z xZS )IBertIntermediatec           	         t         |           |j                  | _        d| _        d| _        d| _        t        |j                  |j                  d| j                  | j
                  | j                  d      | _	        |j                  dk7  rt        d      t        | j                  |j                        | _        t        | j                  | j                        | _        y )	Nr   r"   Trv   r	   z3I-BERT only supports 'gelu' for `config.hidden_act`rz   r*   )r/   r0   r%   r3   r$   rx   r   r7   intermediate_sizer   
hidden_actr}   r   r.   intermediate_act_fnr   rF   rJ   s     rN   r0   zIBertIntermediate.__init__u  s     ++ $$]]

 &RSS#*dooU[UiUi#j !)$,,4??!SrO   c                     | j                  ||      \  }}| j                  ||      \  }}| j                  ||      \  }}||fS r   )r   r   rF   )rK   r   r   s      rN   rg   zIBertIntermediate.forward  sa    6:jjPl6m336:6N6N77
33
 7;6L6L77
33 :::rO   r   rp   s   @rN   r   r   t  s    T(
;rO   r   c                   $     e Zd Z fdZd Z xZS )IBertOutputc           	         t         |           |j                  | _        d| _        d| _        d| _        d| _        d| _        t        |j                  |j                  d| j                  | j
                  | j                  d      | _        t        | j                  | j                        | _        t        |j                  |j                  | j                  | j                  |j                         | _        t        | j                  | j                        | _        t'        j(                  |j*                        | _        y r   )r/   r0   r%   r3   r$   rx   r4   r5   r   r   r7   r   r   r   r   rD   r.   rE   rF   r   rG   rH   rI   rJ   s     rN   r0   zIBertOutput.__init__  s     ++ $$]]

 %T%6%64??S%%%)) ..
 "*$,,4??!Szz&"<"<=rO   c                     | j                  ||      \  }}| j                  |      }| j                  ||||      \  }}| j                  ||      \  }}| j	                  ||      \  }}||fS r   r   r   s        rN   rg   zIBertOutput.forward  r   rO   r   rp   s   @rN   r   r     r   rO   r   c                   0     e Zd Z fdZ	 	 ddZd Z xZS )
IBertLayerc                 X   t         |           |j                  | _        d| _        d| _        t        |      | _        t        |      | _        t        |      | _
        t        | j                  | j                        | _        t        | j                  | j                        | _        y )Nr   r   r*   )r/   r0   r%   r3   seq_len_dimr   	attentionr   intermediater   r   r   pre_intermediate_actpre_output_actrJ   s     rN   r0   zIBertLayer.__init__  s}     ++'/-f5!&)$,T\\doo$V!&t||PrO   c                     | j                  ||||      \  }}|d   }|d   }|dd  }	| j                  ||      \  }
}|
f|	z   }	|	S )N)r   r   r   )r   feed_forward_chunk)rK   r   r   r   r   self_attention_outputs%self_attention_outputs_scaling_factorr   r   r   layer_outputlayer_output_scaling_factors               rN   rg   zIBertLayer.forward  s     IM(/	 IW I
E E 2!4*OPQ*R'(,484K4K=5
11  /G+rO   c                     | j                  ||      \  }}| j                  ||      \  }}| j                  ||      \  }}| j                  ||||      \  }}||fS r   )r   r   r   r   )rK   r   r   intermediate_output"intermediate_output_scaling_factorr   r   s          rN   r   zIBertLayer.feed_forward_chunk  s    <@<U<U==
99 CGBSBS=C
?? CGBUBU!CC
?? 59KK!CEUWv5
11 888rO   r   )rk   rl   rm   r0   rg   r   ro   rp   s   @rN   r   r     s    Q" 29rO   r   c                   .     e Zd Z fdZ	 	 	 	 ddZ xZS )IBertEncoderc                     t         |           || _        |j                  | _        t	        j
                  t        |j                        D cg c]  }t        |       c}      | _	        y c c}w r   )
r/   r0   rL   r%   r   
ModuleListrangenum_hidden_layersr   layer)rK   rL   r   rM   s      rN   r0   zIBertEncoder.__init__  sP     ++]]fF^F^@_#`1Jv$6#`a
#`s   A-c                     |rdnd }|rdnd }d }	t        | j                        D ])  \  }
}|r||fz   } |||||      }|d   }|s!||d   fz   }+ |r||fz   }|st        d ||||	fD              S t        ||||	      S )N r   r   c              3   $   K   | ]  }|| 
 y wr   r   ).0vs     rN   	<genexpr>z'IBertEncoder.forward.<locals>.<genexpr>$  s      	 = 	s   )last_hidden_stater   
attentionscross_attentions)	enumerater   tupler
   )rK   r   r   r   r   output_hidden_statesreturn_dictall_hidden_statesall_self_attentionsall_cross_attentionsilayer_modulelayer_outputss                rN   rg   zIBertEncoder.forward  s     #7BD$5b4#(4 	POA|#$58H$H!(,!	M *!,M &9]1=M<O&O#	P   1]4D D 	 "%'(		 	 	 9++*1	
 	
rO   )NFFTr   rp   s   @rN   r   r     s    b "/
rO   r   c                   $     e Zd Z fdZd Z xZS )IBertPoolerc                     t         |           |j                  | _        t        j                  |j
                  |j
                        | _        t        j                         | _        y r   )	r/   r0   r%   r   Linearr7   r   Tanh
activationrJ   s     rN   r0   zIBertPooler.__init__7  sF     ++YYv1163E3EF
'')rO   c                 \    |d d df   }| j                  |      }| j                  |      }|S Nr   )r   r  )rK   r   first_token_tensorpooled_outputs       rN   rg   zIBertPooler.forward=  s6     +1a40

#566rO   r   rp   s   @rN   r  r  6  s    $rO   r  c                   R    e Zd ZU eed<   dZ ej                         d        ZddZ	y)IBertPreTrainedModelrL   ibertc                    t        |t        t        j                  f      rt	        j
                  |j                  d| j                  j                         |j                  t	        j                  |j                         t        |dd      >t	        j                  |j                         t	        j                  |j                         t        |dd       t	        j                  |j                         yyt        |t        t        j                   f      rt	        j
                  |j                  d| j                  j                         |j"                  Ct        |j                  dd      s,t	        j                  |j                  |j"                            t        |dd      ?t	        j                  |j$                         t	        j                  |j                         yyt        |t&        t        j(                  f      rlt	        j                  |j                         t	        j*                  |j                         t        |d	d       t	        j                  |j,                         yyt        |t.              r t	        j                  |j                         yt        |t0              rZt	        j2                  |j4                  t7        j8                  |j4                  j:                  d
         j=                  d             yt        |t>              r`t	        j@                  |jB                  d       t	        j@                  |jD                  d       t	        j                  |jF                         yy)zInitialize the weightsg        )meanstdNweight_integerbias_integer_is_hf_initializedFweight_scaling_factorshiftr(   r'   gh㈵gh㈵>)$
isinstancer   r   r  initnormal_weightrL   initializer_rangerw   zeros_getattrr  fc_scaling_factorr  r   	Embeddingr#   r  r   rE   ones_r  IBertLMHeadr   copy_r&   r=   r>   r   r@   r   	constant_x_minx_maxact_scaling_factor)rK   modules     rN   _init_weightsz"IBertPreTrainedModel._init_weightsK  sJ    f{BII67LLSdkk6S6ST{{&FKK(v/6BF112F445v~t4@F//0 A >?LLSdkk6S6ST!!-gfmmMach6iFMM&*<*<=>v6=IF889F112 J r|| <=KK$JJv}}%vw-9FLL) :,KK$0JJv**ELL9L9L9R9RSU9V,W,^,^_f,gh)NN6<</NN6<<.KK112 *rO   Nc                     t        d      )Nz6`resize_token_embeddings` is not supported for I-BERT.)NotImplementedError)rK   new_num_tokenss     rN   resize_token_embeddingsz,IBertPreTrainedModel.resize_token_embeddingsm  s    !"Z[[rO   r   )
rk   rl   rm   r   __annotations__base_model_prefixr=   no_gradr(  r,  r   rO   rN   r  r  F  s/    U]]_3 3B\rO   r  c                   ,    e Zd ZdZd fd	Zd Zd Ze	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  d	ej                  dz  d
ej                  dz  dedz  dedz  dedz  deeej                     z  fd       Z xZS )
IBertModela  

    The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
    cross-attention is added between the self-attention layers, following the architecture described in [Attention is
    all you need](https://huggingface.co/papers/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
    Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.

    c                     t         |   |       || _        |j                  | _        t	        |      | _        t        |      | _        |rt        |      nd| _	        | j                          y)zv
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        N)r/   r0   rL   r%   r   rd   r   encoderr  pooler	post_init)rK   rL   add_pooling_layerrM   s      rN   r0   zIBertModel.__init__|  sZ    
 	  ++)&1#F+->k&)D 	rO   c                 .    | j                   j                  S r   rd   r9   rK   s    rN   get_input_embeddingszIBertModel.get_input_embeddings  s    ...rO   c                 &    || j                   _        y r   r8  )rK   r   s     rN   set_input_embeddingszIBertModel.set_input_embeddings  s    */'rO   Nr]   r   r^   r&   r_   r   r   r   returnc	                 J   ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }||t	        d      |#| j                  ||       |j                         }
n!||j                         d d }
nt	        d      |
\  }}||j                  n|j                  }|t        j                  ||f|      }|&t        j                  |
t        j                  |      }| j                  ||
      }| j                  ||||      \  }}| j                  ||||||      }|d   }| j                  | j                  |      nd }|s
||f|d	d  z   S t!        |||j"                  |j$                  |j&                  
      S )NzDYou cannot specify both input_ids and inputs_embeds at the same timer(   z5You have to specify either input_ids or inputs_embeds)rS   rQ   )r]   r&   r^   r_   )r   r   r   r   r   r   )r   pooler_outputr   r   r   )rL   r   r   use_return_dictr}   %warn_if_padding_and_no_attention_maskrZ   rS   r=   onesr[   r\   get_extended_attention_maskrd   r3  r4  r   r   r   r   )rK   r]   r   r^   r&   r_   r   r   r   kwargsra   r   r   rS   extended_attention_maskembedding_outputembedding_output_scaling_factorencoder_outputssequence_outputr  s                       rN   rg   zIBertModel.forward  s    2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B] ]%>cdd"66y.Q#..*K&',,.s3KTUU!,
J%.%:!!@T@T!"ZZ*j)A6RN!"[[EJJvVN 150P0PQ_al0m<@OO%)'	 =L =
99 ,,+2/!5# ' 
 *!,8<8OO4UY#]3oab6III;-')77&11,==
 	
rO   )T)NNNNNNNN)rk   rl   rm   rn   r0   r:  r<  r   r=   
LongTensorFloatTensorboolr   r   rg   ro   rp   s   @rN   r1  r1  q  s    "/0  .237260426)-,0#'B
##d*B
 ))D0B
 ((4/	B

 &&-B
 ((4/B
  $;B
 #TkB
 D[B
 
6e>O>O8P	PB
 B
rO   r1  c                   P    e Zd ZdddZ fdZd Zd Ze	 	 	 	 	 	 	 	 	 ddej                  dz  d	ej                  dz  d
ej                  dz  dej                  dz  dej                  dz  dej                  dz  dedz  dedz  dedz  deeej                     z  fd       Z xZS )IBertForMaskedLMz(ibert.embeddings.word_embeddings.weight$zlm_head.bias)zlm_head.decoder.weightzlm_head.decoder.biasc                     t         |   |       t        |d      | _        t	        |      | _        | j                          y NF)r6  )r/   r0   r1  r  r!  lm_headr5  rJ   s     rN   r0   zIBertForMaskedLM.__init__  s6     %@
"6* 	rO   c                 .    | j                   j                  S r   )rQ  decoderr9  s    rN   get_output_embeddingsz&IBertForMaskedLM.get_output_embeddings  s    ||###rO   c                 \    || j                   _        |j                  | j                   _        y r   )rQ  rS  rw   )rK   new_embeddingss     rN   set_output_embeddingsz&IBertForMaskedLM.set_output_embeddings  s     -*//rO   Nr]   r   r^   r&   r_   labelsr   r   r   r=  c
           
         |	|	n| j                   j                  }	| j                  ||||||||	      }|d   }| j                  |      }d}|Ft	               } ||j                  d| j                   j                        |j                  d            }|	s|f|dd z   }||f|z   S |S t        |||j                  |j                        S )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        Nr   r^   r&   r_   r   r   r   r   r(   r   losslogitsr   r   )
rL   r@  r  rQ  r   r   r6   r   r   r   )rK   r]   r   r^   r&   r_   rX  r   r   r   rD  r   rI  prediction_scoresmasked_lm_lossloss_fctr   s                    rN   rg   zIBertForMaskedLM.forward  s    ( &1%<k$++B]B]**))%'/!5#  	
 "!* LL9')H%&7&<&<RAWAW&XZ`ZeZefhZijN')GABK7F3A3M^%.YSYY$!//))	
 	
rO   	NNNNNNNNN)rk   rl   rm   _tied_weights_keysr0   rT  rW  r   r=   rJ  rK  rL  r   r   rg   ro   rp   s   @rN   rN  rN    s    #M .
$0  .237260426*.)-,0#'0
##d*0
 ))D00
 ((4/	0

 &&-0
 ((4/0
   4'0
  $;0
 #Tk0
 D[0
 
% 1 12	20
 0
rO   rN  c                   (     e Zd ZdZ fdZd Z xZS )r!  z)I-BERT Head for masked language modeling.c                    t         |           t        j                  |j                  |j                        | _        t        j                  |j                  |j                        | _        t        j                  |j                  |j                        | _
        t        j                  t        j                  |j                              | _        y )N)r,   )r/   r0   r   r  r7   r   rE   rD   
layer_normr6   rS  	Parameterr=   r[   rw   rJ   s     rN   r0   zIBertLMHead.__init__'  s    YYv1163E3EF
,,v'9'9v?T?TUyy!3!3V5F5FGLLV->->!?@	rO   c                     | j                  |      }t        |      }| j                  |      }| j                  |      }|S r   )r   r	   re  rS  )rK   featuresrD  xs       rN   rg   zIBertLMHead.forward/  s;    JJx GOOA LLOrO   rk   rl   rm   rn   r0   rg   ro   rp   s   @rN   r!  r!  $  s    3ArO   r!  z
    I-BERT Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
    output) e.g. for GLUE tasks.
    )custom_introc                   :    e Zd Z fdZe	 	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  d	edz  d
edz  dedz  de	e
ej                     z  fd       Z xZS )IBertForSequenceClassificationc                     t         |   |       |j                  | _        t        |d      | _        t        |      | _        | j                          y rP  )r/   r0   
num_labelsr1  r  IBertClassificationHead
classifierr5  rJ   s     rN   r0   z'IBertForSequenceClassification.__init__A  sC      ++%@
1&9 	rO   Nr]   r   r^   r&   r_   rX  r   r   r   r=  c
           
         |	|	n| j                   j                  }	| j                  ||||||||	      }|d   }| j                  |      }d}|| j                   j                  | j
                  dk(  rd| j                   _        nl| j
                  dkD  rL|j                  t        j                  k(  s|j                  t        j                  k(  rd| j                   _        nd| j                   _        | j                   j                  dk(  rIt               }| j
                  dk(  r& ||j                         |j                               }n |||      }n| j                   j                  dk(  r=t               } ||j                  d| j
                        |j                  d            }n,| j                   j                  dk(  rt               } |||      }|	s|f|d	d z   }||f|z   S |S t        |||j                   |j"                  
      S )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        NrZ  r   r   
regressionsingle_label_classificationmulti_label_classificationr(   r   r[  )rL   r@  r  rq  problem_typero  rR   r=   r\   r~   r   squeezer   r   r   r   r   r   rK   r]   r   r^   r&   r_   rX  r   r   r   rD  r   rI  r]  r\  r`  r   s                    rN   rg   z&IBertForSequenceClassification.forwardK  s   ( &1%<k$++B]B]**))%'/!5#  	
 "!*1{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#FNN$4fnn6FGD#FF3D))-JJ+-B @&++b/R))-II,./Y,F)-)9TGf$EvE'!//))	
 	
rO   ra  )rk   rl   rm   r0   r   r=   rJ  rK  rL  r   r   rg   ro   rp   s   @rN   rm  rm  :  s     .237260426*.)-,0#'A
##d*A
 ))D0A
 ((4/	A

 &&-A
 ((4/A
   4'A
  $;A
 #TkA
 D[A
 
"E%*;*;$<	<A
 A
rO   rm  c                   :    e Zd Z fdZe	 	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  d	edz  d
edz  dedz  de	e
ej                     z  fd       Z xZS )IBertForMultipleChoicec                     t         |   |       t        |      | _        t	        j
                  |j                        | _        t	        j                  |j                  d      | _
        | j                          y )Nr   )r/   r0   r1  r  r   rG   rH   rI   r  r7   rq  r5  rJ   s     rN   r0   zIBertForMultipleChoice.__init__  sV     '
zz&"<"<=))F$6$6: 	rO   Nr]   r^   r   rX  r&   r_   r   r   r   r=  c
           
      J   |	|	n| j                   j                  }	||j                  d   n|j                  d   }|!|j                  d|j	                  d            nd}|!|j                  d|j	                  d            nd}|!|j                  d|j	                  d            nd}|!|j                  d|j	                  d            nd}|1|j                  d|j	                  d      |j	                  d            nd}| j                  ||||||||	      }|d   }| j                  |      }| j                  |      }|j                  d|      }d}|t               } |||      }|	s|f|dd z   }||f|z   S |S t        |||j                  |j                        S )a[  
        input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
            `input_ids` above)
        position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        Nr   r(   r   )r&   r^   r   r_   r   r   r   r   r[  )rL   r@  r   r   rZ   r  rI   rq  r   r   r   r   )rK   r]   r^   r   rX  r&   r_   r   r   r   rD  num_choicesflat_input_idsflat_position_idsflat_token_type_idsflat_attention_maskflat_inputs_embedsr   r  r]  reshaped_logitsr\  r`  r   s                           rN   rg   zIBertForMultipleChoice.forward  s   X &1%<k$++B]B],5,Aiooa(}GZGZ[\G]CLCXINN2,>?^bLXLdL--b,2C2CB2GHjnR`Rln11"n6I6I"6MNrvR`Rln11"n6I6I"6MNrv ( r=#5#5b#9=;M;Mb;QR 	 ***..,/!5#  	
  
]3/ ++b+6')HOV4D%''!"+5F)-)9TGf$EvE("!//))	
 	
rO   ra  )rk   rl   rm   r0   r   r=   rJ  rK  rL  r   r   rg   ro   rp   s   @rN   rz  rz    s     .22637*.0426)-,0#'V
##d*V
 ((4/V
 ))D0	V

   4'V
 &&-V
 ((4/V
  $;V
 #TkV
 D[V
 
#U5+<+<%=	=V
 V
rO   rz  c                   :    e Zd Z fdZe	 	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  d	edz  d
edz  dedz  de	e
ej                     z  fd       Z xZS )IBertForTokenClassificationc                 0   t         |   |       |j                  | _        t        |d      | _        t        j                  |j                        | _        t        j                  |j                  |j                        | _        | j                          y rP  )r/   r0   ro  r1  r  r   rG   rH   rI   r  r7   rq  r5  rJ   s     rN   r0   z$IBertForTokenClassification.__init__  sk      ++%@
zz&"<"<=))F$6$68I8IJ 	rO   Nr]   r   r^   r&   r_   rX  r   r   r   r=  c
           
         |	|	n| j                   j                  }	| j                  ||||||||	      }|d   }| j                  |      }| j	                  |      }d}|<t               } ||j                  d| j                        |j                  d            }|	s|f|dd z   }||f|z   S |S t        |||j                  |j                        S )z
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        NrZ  r   r(   r   r[  )rL   r@  r  rI   rq  r   r   ro  r   r   r   rx  s                    rN   rg   z#IBertForTokenClassification.forward  s    $ &1%<k$++B]B]**))%'/!5#  	
 "!*,,71')HFKKDOO<fkk"oNDY,F)-)9TGf$EvE$!//))	
 	
rO   ra  )rk   rl   rm   r0   r   r=   rJ  rK  rL  r   r   rg   ro   rp   s   @rN   r  r    s    	  .237260426*.)-,0#'1
##d*1
 ))D01
 ((4/	1

 &&-1
 ((4/1
   4'1
  $;1
 #Tk1
 D[1
 
u'8'8!9	91
 1
rO   r  c                   (     e Zd ZdZ fdZd Z xZS )rp  z-Head for sentence-level classification tasks.c                 &   t         |           t        j                  |j                  |j                        | _        t        j                  |j                        | _        t        j                  |j                  |j                        | _
        y r   )r/   r0   r   r  r7   r   rG   rH   rI   ro  out_projrJ   s     rN   r0   z IBertClassificationHead.__init__;  s`    YYv1163E3EF
zz&"<"<=		&"4"4f6G6GHrO   c                     |d d dd d f   }| j                  |      }| j                  |      }t        j                  |      }| j                  |      }| j	                  |      }|S r	  )rI   r   r=   tanhr  )rK   rh  rD  r   s       rN   rg   zIBertClassificationHead.forwardA  s^     Aq)]3

=1

=1]3m4rO   rj  rp   s   @rN   rp  rp  8  s    7IrO   rp  c                   Z    e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  d	ej                  dz  d
edz  dedz  dedz  de	e
ej                     z  fd       Z xZS )IBertForQuestionAnsweringc                     t         |   |       |j                  | _        t        |d      | _        t        j                  |j                  |j                        | _        | j                          y rP  )
r/   r0   ro  r1  r  r   r  r7   
qa_outputsr5  rJ   s     rN   r0   z"IBertForQuestionAnswering.__init__M  sU      ++%@
))F$6$68I8IJ 	rO   Nr]   r   r^   r&   r_   start_positionsend_positionsr   r   r   r=  c           
      &   |
|
n| j                   j                  }
| j                  |||||||	|
      }|d   }| j                  |      }|j	                  dd      \  }}|j                  d      j                         }|j                  d      j                         }d }||t        |j                               dkD  r|j                  d      }t        |j                               dkD  r|j                  d      }|j                  d      }|j                  d|      }|j                  d|      }t        |      } |||      } |||      }||z   dz  }|
s||f|dd  z   }||f|z   S |S t        ||||j                  |j                        S )	NrZ  r   r   r(   dim)ignore_indexr   )r\  start_logits
end_logitsr   r   )rL   r@  r  r  splitrw  r   lenrZ   clampr   r   r   r   )rK   r]   r   r^   r&   r_   r  r  r   r   r   rD  r   rI  r]  r  r  
total_lossignored_indexr`  
start_lossend_lossr   s                          rN   rg   z!IBertForQuestionAnswering.forwardW  s    &1%<k$++B]B]**))%'/!5#  	
 "!*1#)<<r<#: j#++B/::<''+668

&=+D?'')*Q."1"9"9""==%%'(1, - 5 5b 9(--a0M-33A}EO)//=AM']CH!,@J
M:H$x/14J"J/'!"+=F/9/EZMF*Q6Q+%!!//))
 	
rO   )
NNNNNNNNNN)rk   rl   rm   r0   r   r=   rJ  rK  rL  r   r   rg   ro   rp   s   @rN   r  r  K  s     .2372604263715)-,0#'=
##d*=
 ))D0=
 ((4/	=

 &&-=
 ((4/=
 ))D0=
 ''$.=
  $;=
 #Tk=
 D[=
 
&e.?.?(@	@=
 =
rO   r  c                     | j                  |      j                         }t        j                  |d      j	                  |      |z   |z  }|j                         |z   S )aM  
    Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
    are ignored. This is modified from fairseq's *utils.make_positions*.

    Args:
    input_ids (`torch.LongTensor`):
           Indices of input sequence tokens in the vocabulary.

    Returns: torch.Tensor
    r   r  )ner~   r=   cumsumtype_asr\   )r]   r#   r`   maskincremental_indicess        rN   rW   rW     sW     <<$((*D <<!4<<TBE[[_cc##%33rO   )rN  rz  r  rm  r  r1  r  )r   );rn   r   r=   r   torch.nnr   r   r    r   r  activationsr	   modeling_outputsr
   r   r   r   r   r   r   modeling_utilsr   utilsr   r   configuration_ibertr   quant_modulesr   r   r   r   r   r   
get_loggerrk   loggerModuler   rr   r   r   r   r   r   r   r  r  r1  rN  r!  rm  rz  r  rp  r  rW   __all__r   rO   rN   <module>r     s   "     A A &    . , , c c 
		H	%s=bii s=lC. C.L,;bii ,;^/RYY /8;		 ;D,;")) ,;^59 59p6
299 6
r"))   '\? '\ '\T d
% d
 d
N G
+ G
 G
T")) , M
%9 M
M
` b
1 b
 b
J >
"6 >
 >
Bbii & I
 4 I
 I
X4"rO   