
    qi                     <   d Z ddlZddlZddlmZ ddlmZmZmZmZ ddlm	Z
 ddlmZmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZmZmZmZ ddlmZ ddlmZmZ ddlm Z   ejB                  e"      Z#d'dZ$ G d dejJ                        Z& G d dejJ                        Z' G d de      Z(e G d de             Z)e G d de)             Z* ed       G d de)e             Z+ ed       G d  d!e)             Z,e G d" d#e)             Z-e G d$ d%e)             Z.g d&Z/y)(zPyTorch MPT model.    N)nn)BCEWithLogitsLossCrossEntropyLoss	LayerNormMSELoss)
functional   )CacheDynamicCache)GenerationMixin)create_causal_mask)GradientCheckpointingLayer))BaseModelOutputWithPastAndCrossAttentions!CausalLMOutputWithCrossAttentionsQuestionAnsweringModelOutput SequenceClassifierOutputWithPastTokenClassifierOutput)PreTrainedModel)auto_docstringlogging   )	MptConfigc                 R   t        j                  d|z
  dt         j                  |      j                  ddd|      }dt	        j
                  t	        j                  |             z  }t        j                  d|dz   t         j                  |      j                         }|||z  z  }dt        j                  d|      z  }|j                  d|dd      }|| k7  r9t        j                  |ddddddf   |ddddddf   gd      ddd| df   }||z  }|j                  d      S )	a  
    Link to paper: https://huggingface.co/papers/2108.12409 - Alibi tensor is not causal as the original paper mentions, it
    relies on a translation invariance of softmax for quick implementation. This implementation has been copied from
    the alibi implementation of MPT source code that led to slightly different results than the Bloom alibi:
    https://huggingface.co/mosaicml/mpt-7b/blob/main/attention.py#L292
    r   )dtypedevice   g      ?N.dimr   )torcharangeint32viewmathceillog2int64floatpowconcatsqueeze)	num_headssequence_lengthalibi_bias_maxr   alibinum_heads_power_of_2baseslopess           V/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/mpt/modeling_mpt.pybuild_mpt_alibi_tensorr3   *   s    LL_,au{{6RWWXY[\^_apqE		$))I*> ??<</!35;;vV\\^D>$889D599Q%%F[[0!Q7Fy(vaAsl3VAssCK5HIqQRSU_V_U_adRdeFNE==    c                        e Zd ZdZddededz  f fdZ	 	 	 ddej                  dej                  de	dz  d	ej                  dz  d
ej                  dz  f
dZ
 xZS )MptAttentionzzMulti-head self attention.
    Using torch or triton attention implementation enables user to also use additive bias.
    Nconfig	layer_idxc                    t         |           |j                  | _        |j                  | _        |j                  | _        | j                  | j                  z  | _        |j                  j                  | _        | j                  4dt        j                  | j                  | j                  z        z  | _        |j                  j                  | _        |j                  j                  | _        t        j                  | j                  d| j                  z  d      | _        t        j                  | j                  | j                  d      | _        || _        y )Nr   r	   Fbias)super__init__hidden_sizen_headsmax_seq_lenmax_seq_lengthhead_dimattn_configsoftmax_scaler#   sqrt
attn_pdropattn_dropout_pclip_qkvr   LinearWqkvout_projr8   )selfr7   r8   	__class__s      r2   r=   zMptAttention.__init__F   s   !--~~$00((DLL8#//==%!"TYYt/?/?$,,/N%O!OD$00;;**33IId..D4D4D0D5Q			$"2"2D4D4D5Q"r4   hidden_statesposition_biaspast_key_valuesattention_maskcache_positionc                 p   |j                   d d \  }}| j                  |      }| j                  r(|j                  | j                   | j                        }|j	                  dd      \  }	}
}|	j                  ||| j                  | j                        j                  dd      }	|
j                  ||| j                  | j                        j                  dd      }
|j                  ||| j                  | j                        j                  dd      }|%d|i}|j                  |
|| j                  |      \  }
}t        j                  |	|
j                  dd            | j                  z  }||n||j                         z   }|t        |j                         dk7  r!t!        d	t        |j                                |
j                   d   }t#        d
|j%                  d      |z
        }t#        d
|j%                  d      |z
        }|d d |d |d f   }||z   }|9|j'                  |t        j(                  |	j*                        j,                        }t.        j0                  j3                  |j5                         d      j7                  |j*                        }t.        j0                  j9                  || j:                  | j<                        }t        j                  ||      }|j?                  d
ddd      jA                         jC                  ||d      }| jE                  |      }||fS )Nr   )minmaxr	   r   r   rR   z6Expecting position_bias shape to be 3 dimensions, got r   ptraining)#shaperJ   rH   clampchunkreshaper?   rB   	transposeupdater8   r   matmulrD   get_seq_lengthlen
ValueErrorrU   sizemasked_fillfinfor   rT   r   r   softmaxr'   todropoutrG   rZ   permute
contiguousr"   rK   )rL   rN   rO   rP   rQ   rR   
batch_size
seq_length	mixed_qkvquery_states
key_statesvalue_statescache_kwargsattention_scoresquery_length
key_lengthposition_bias_query_indexposition_bias_key_indexattn_weightscontext_statesattn_outputs                        r2   forwardzMptAttention.forwardV   s    "/!4!4Ra!8
JIIm,	==!T]]NNI1:1J.j,#++J
DLLRVR_R_`jjklnop''
Jdmm\ffghjkl
#++J
DLLRVR_R_`jjklnop&,n=L'6'='=j,X\XfXfht'u$J <<j6J6J2r6RSVZVhVhh%4%<z*OmOmOoBo$=&&'1, #YZ]^k^q^qZrYs!tuu#))"-J(+A}/A/A!/D|/S(T%&)!]-?-?-BZ-O&P#)!-F-GI`Ia*abM/-?%/;;NEKKXdXjXjLkLoLop }},,-=-C-C-E2,NQQR^RdRde}},,\T=P=P[_[h[h,ilLA'//1a;FFHMMjZdfhimmN3L((r4   N)NNN)__name__
__module____qualname____doc__r   intr=   r   Tensorr
   r|   __classcell__rM   s   @r2   r6   r6   A   s}    #y #S4Z #( )-.2.21)||1) ||1) 	1)
 t+1) t+1)r4   r6   c                   t     e Zd Zdef fdZdej                  dej                  dej                  fdZ xZS )MptMLPr7   c                 &   t         |           |j                  }t        j                  |d|z  d      | _        t        j                  d      | _        t        j                  d|z  |d      | _        |j                  j                  | _        y )N   Fr:   none)approximate)r<   r=   r>   r   rI   up_projGELUact	down_projrC   rF   hidden_dropout)rL   r7   r>   rM   s      r2   r=   zMptMLP.__init__   sm    ((yya+oEJ77v.1{?KeL$00;;r4   rN   residualreturnc                     | j                  | j                  |            }| j                  |      }t        j                  || j
                  | j                        }||z   }|S )NrX   )r   r   r   Frj   r   rZ   )rL   rN   r   intermediate_outputoutputs        r2   r|   zMptMLP.forward   sW    m!<="nn];.$2E2EPTP]P]^("r4   )	r~   r   r   r   r=   r   r   r|   r   r   s   @r2   r   r      s5    <y <U\\ U\\ ell r4   r   c                        e Zd Zddededz  f fdZ	 	 	 	 ddej                  dej                  dej                  dedz  d	e	d
e	dej                  dz  fdZ
 xZS )MptBlockNr7   r8   c                    t         |           |j                  }t        ||j                        | _        d | j
                  _        |j                  | _        t        ||      | _
        t        ||j                        | _        d | j                  _        t        |      | _        |j                  j                  | _        t#        j$                  | j                         | _        y )Neps)r<   r=   r>   r   layer_norm_epsilonnorm_1r;   r?   r+   r6   attnnorm_2r   ffnrC   rF   dropout_rater   Dropoutresid_attn_dropout)rL   r7   r8   r>   rM   s       r2   r=   zMptBlock.__init__   s    ((1J1JK 3	1J1JK&>"..99"$**T->->"?r4   rN   rO   rQ   
layer_past	use_cacheoutput_attentionsrR   c                     | j                  |      }|}	| j                  |||||      \  }
}| j                  |
      |	z   }| j                  |      }|}	| j	                  ||	      }||fS )N)rO   rQ   rP   rR   )r   r   r   r   r   )rL   rN   rO   rQ   r   r   r   rR   layernorm_outputr   attn_outputsry   r   s                r2   r|   zMptBlock.forward   s      ;;}5  &*YY')&) &/ &
"l //=H;;}5 ! *H5|##r4   r}   )NFFN)r~   r   r   r   r   r=   r   r   r
   boolr|   r   r   s   @r2   r   r      s    @y @S4Z @2 $("'.2"$||"$ ||"$ 	"$
 DL"$ "$  "$ t+"$r4   r   c                   &    e Zd ZU eed<   dZdZdgZy)MptPreTrainedModelr7   transformerTr   N)r~   r   r   r   __annotations__base_model_prefixsupports_gradient_checkpointing_no_split_modules r4   r2   r   r      s    %&*##r4   r   c                   J    e Zd Zdef fdZd ZddZdej                  fdZ	e
	 	 	 	 	 	 	 	 	 ddej                  dz  d	edz  d
ej                  dz  dej                  dz  dedz  dedz  dedz  dedz  dej                  dz  deej                  df   ez  fd       Z xZS )MptModelr7   c           	         t         |   |       |j                  | _        |j                  | _        t        j                  |j                  | j                        | _        t        j                  t        |j                        D cg c]  }t        ||       c}      | _        t        | j                  |j                        | _        d | j                   _        d| _        | j'                          y c c}w )N)r8   r   F)r<   r=   r>   r?   r+   r   	Embedding
vocab_sizewte
ModuleListrangen_layersr   blocksr   r   norm_fr;   gradient_checkpointing	post_init)rL   r7   irM   s      r2   r=   zMptModel.__init__   s     !-- << 1 143C3CD mmERXRaRaLb$cqXf%B$cd   0 0f6O6OP&+# 	 %ds   C7c                     | j                   S r}   r   )rL   s    r2   get_input_embeddingszMptModel.get_input_embeddings   s    xxr4   Nc                     t        ||||      S r}   )r3   )rL   r+   r,   r-   r   s        r2   r3   zMptModel.build_mpt_alibi_tensor   s    %i.RXYYr4   new_embeddingsc                     || _         y r}   r   rL   r   s     r2   set_input_embeddingszMptModel.set_input_embeddings   s	    !r4   	input_idsrP   rQ   inputs_embedsr   r   output_hidden_statesreturn_dictrR   r   .c
           
      v   ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }||t        d      ||j                  \  }}n||j                  \  }}}nt        d      | j                  r%| j                  r|rt        j                  d       d}|| j                  |      }|r|t        | j                         }|}|rdnd}|rdnd}| j                  | j                  | j                   j                  |j                         }||j#                         nd	}|	%t%        j&                  |||z   |j                         }	t)        | j                   |||	|
      j+                  t$        j,                        }| j.                  D ]*  }|r||fz   } ||||||||	      }|d	   }|s"||d   fz   }, | j1                  |      }|r||fz   }|st3        d ||||fD              S t5        ||||      S )  
        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else `past_key_values.get_seq_length()`
            (`sequence_length` of input past key value states). Indices of input sequence tokens in the vocabulary.

            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
            `input_ids`.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        NzDYou cannot specify both input_ids and inputs_embeds at the same timez5You have to specify either input_ids or inputs_embedszZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...F)r7   r   r   r   )r7   r   rQ   rR   rP   )r   rQ   r   r   rO   rR   r   c              3   &   K   | ]	  }||  y wr}   r   ).0vs     r2   	<genexpr>z#MptModel.forward.<locals>.<genexpr>i  s      ghgts   )last_hidden_staterP   rN   
attentions)r7   r   r   r   use_return_dictrd   r[   r   rZ   loggerwarning_oncer   r   r3   r+   r@   r   rb   r   r    r   ri   r   r   r   tupler   )rL   r   rP   rQ   r   r   r   r   r   rR   kwargsrm   rn   _rN   all_self_attentionsall_hidden_statesr.   past_key_values_lengthcausal_maskblockoutputss                         r2   r|   zMptModel.forward  s   6 2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	%0%<k$++B]B] ]%>cdd"%.__"J
&(5(;(;%J
ATUU&&4==##p "	  HHY/M0*$++>O%$5b4"6BD ++DNNDKK<S<S\i\p\p+qETE`!?!?!Afg!"\\&(>(KTaThThN );;'))+
 "UZZ. 	 [[ 	JE#$58H$H!**#"3#-G $AJM &9WQZM&I#!	J& M2 1]4D D )?<MObc   9+++*	
 	
r4      N	NNNNNNNNN)r~   r   r   r   r=   r   r3   r   r   r   r   
LongTensorr
   r   r   r   r|   r   r   s   @r2   r   r      s   y ,Z"5<< "  .2(,.215!%)-,0#'.2o
##d*o
 o
 t+	o

 ''$.o
 $;o
  $;o
 #Tko
 D[o
 t+o
 
u||S 	!$M	Mo
 o
r4   r   z
    The MPT Model transformer with a language modeling head on top (linear layer with weights tied to the input
    embeddings).
    )custom_introc                       e Zd ZddiZdef fdZdej                  fdZe		 	 	 	 	 	 	 	 	 	 	 ddej                  dz  d	edz  d
ej                  dz  dej                  dz  dej                  dz  dedz  dedz  dedz  dedz  dej                  dz  deej                  z  deej                     ez  fd       Z xZS )MptForCausalLMzlm_head.weightztransformer.wte.weightr7   c                     t         |   |       t        |      | _        t	        j
                  |j                  |j                  d      | _        | j                          y NFr:   )
r<   r=   r   r   r   rI   r>   r   lm_headr   rL   r7   rM   s     r2   r=   zMptForCausalLM.__init__~  sI     #F+yy!3!3V5F5FUS 	r4   r   c                     || _         y r}   )r   r   s     r2   set_output_embeddingsz$MptForCausalLM.set_output_embeddings  s	    %r4   Nr   rP   rQ   r   labelsr   r   r   r   rR   logits_to_keepr   c                    |	|	n| j                   j                  }	| j                  ||||||||	|
	      }|d   }t        |t              rt        | d      n|}| j                  |dd|ddf         }d}|* | j                  d||| j                   j                  d|}|	s|f|dd z   }||f|z   S |S t        |||j                  |j                  |j                        S )a\  
        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else `past_key_values.get_seq_length()`
            (`sequence_length` of input past key value states). Indices of input sequence tokens in the vocabulary.

            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
            `input_ids`.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
            are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
        N)rP   rQ   r   r   r   r   r   rR   r   )logitsr   r   r   lossr   rP   rN   r   r   )r7   r   r   
isinstancer   slicer   loss_functionr   r   rP   rN   r   )rL   r   rP   rQ   r   r   r   r   r   r   rR   r   r   transformer_outputsrN   slice_indicesr   r   r   s                      r2   r|   zMptForCausalLM.forward  s   B &1%<k$++B]B]"..+)'/!5#) / 

 ,A.8B>SV8W~ot4]kmA}a,?@A%4%%pVFt{{OeOepiopDY!4QR!88F)-)9TGf$EvE0/??-;;*55
 	
r4   )NNNNNNNNNNr   )r~   r   r   _tied_weights_keysr   r=   r   r   r   r   r   r
   r   r   r   r   r|   r   r   s   @r2   r   r   u  sC    +,DEy &ELL &  .2(,.2-1&*!%)-,0#'.2-.A
##d*A
 A
 t+	A

 ||d*A
 t#A
 $;A
  $;A
 #TkA
 D[A
 t+A
 ell*A
 
u||	@	@A
 A
r4   r   a  
    The MPT Model transformer with a sequence classification head on top (linear layer).

    [`MptForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-1) do.

    Since it does classification on the last token, it requires to know the position of the last token. If a
    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
    each row of the batch).
    c                   8    e Zd Zdef fdZdej                  fdZe	 	 	 	 	 	 	 	 	 ddej                  dz  de
dz  dej                  dz  d	ej                  dz  d
ej                  dz  dedz  dedz  dedz  dedz  deej                     ez  fd       Z xZS )MptForSequenceClassificationr7   c                     t         |   |       |j                  | _        t        |      | _        t        j                  |j                  |j                  d      | _        | j                          y r   )
r<   r=   
num_labelsr   r   r   rI   r>   scorer   r   s     r2   r=   z%MptForSequenceClassification.__init__  sV      ++#F+YYv1163D3D5Q
 	r4   r   c                     || _         y r}   )r   r   s     r2   r   z2MptForSequenceClassification.set_output_embeddings  s	    #
r4   Nr   rP   rQ   r   r   r   r   r   r   r   c
           
      r   |	|	n| j                   j                  }	| j                  ||||||||	      }|d   }| j                  |      }||j                  d   }n|j                  d   }| j                   j
                  |dk7  rt        d      | j                   j
                  d}n||| j                   j
                  k7  j                  |j                  t        j                        }t        j                  |j                  d   |j                  t        j                        }||z  j                  d      }n.d}t        j                  | j                  j                    d       |t        j                  ||j                  	      |f   }d}|^| j                   j"                  | j$                  dk(  rd
| j                   _        nl| j$                  dkD  rL|j&                  t        j(                  k(  s|j&                  t        j*                  k(  rd| j                   _        nd| j                   _        | j                   j"                  d
k(  rIt-               }| j$                  dk(  r& ||j/                         |j/                               }nc |||      }nY| j                   j"                  dk(  rt1               } |||      }n,| j                   j"                  dk(  rt3               } |||      }|	s|f|dd z   }||f|z   S |S t5        |||j6                  |j8                  |j:                        S )6  
        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else `past_key_values.get_seq_length()`
            (`sequence_length` of input past key value states). Indices of input sequence tokens in the vocabulary.

            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
            `input_ids`.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        NrP   rQ   r   r   r   r   r   r   r   z=Cannot handle batch sizes > 1 if no padding token is defined.rV   )r   r   z will not detect padding tokens in `inputs_embeds`. Results may be unexpected if using padding tokens in conjunction with `inputs_embeds.`r   
regressionsingle_label_classificationmulti_label_classificationr   )r7   r   r   r   r[   pad_token_idrd   ri   r   r   r!   r    argmaxr   r   rM   r~   problem_typer   r   longr   r   r*   r   r   r   rP   rN   r   )rL   r   rP   rQ   r   r   r   r   r   r   r   r   rN   r   rm   last_non_pad_tokennon_pad_masktoken_indicespooled_logitsr   loss_fctr   s                         r2   r|   z$MptForSequenceClassification.forward  s   > &1%<k$++B]B]"..+)'/!5# / 	
 ,A.M* "+J&,,Q/J;;##+
a\]];;##+!#"%)A)AAEEfmmUZU`U`aL!LL)<V]]Z_ZeZefM"/,">!F!Fr!J!#>>**+ ,Z Z
 u||Jv}}MOaab{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#M$9$9$;V^^=MND#M6:D))-JJ+-v6))-II,.v6#%(;AB(??F)-)9TGf$EvE/ /??-;;*55
 	
r4   r   )r~   r   r   r   r=   r   r   r   r   r   r
   r   r   r   r|   r   r   s   @r2   r   r     s   y $ELL $  .2(,.2-1&*!%)-,0#'e
##d*e
 e
 t+	e

 ||d*e
 t#e
 $;e
  $;e
 #Tke
 D[e
 
u||	?	?e
 e
r4   r   c                       e Zd Zdef fdZe	 	 	 	 	 	 	 	 	 ddej                  dz  dedz  dej                  dz  dej                  dz  dej                  dz  d	e
dz  d
e
dz  de
dz  de
dz  deej                     ez  fd       Z xZS )MptForTokenClassificationr7   c                    t         |   |       |j                  | _        t        |      | _        t        |d      r|j                  |j                  }n't        |d      r|j                  |j                  }nd}t        j                  |      | _
        t        j                  |j                  |j                        | _        | j                          y )Nclassifier_dropoutr   g?)r<   r=   r   r   r   hasattrr  r   r   r   rj   rI   r>   
classifierr   )rL   r7   r  rM   s      r2   r=   z"MptForTokenClassification.__init__T  s      ++#F+6/0V5N5N5Z!'!:!:V-.63H3H3T!'!6!6!$zz"45))F$6$68I8IJ 	r4   Nr   rP   rQ   r   r   r   r   r   r   r   c
           
         |	|	n| j                   j                  }	| j                  ||||||||	      }|d   }| j                  |      }| j	                  |      }d}|l|j                  |j                        }|j                  \  }}t               } ||j                  ||z  | j                        |j                  ||z              }|	s|f|dd z   }||f|z   S |S t        |||j                  |j                        S )r  Nr  r   r   )r   r   rN   r   )r7   r   r   rj   r  ri   r   r[   r   r"   r   r   rN   r   )rL   r   rP   rQ   r   r   r   r   r   r   deprecated_argumentsr   rN   r   r   rm   rn   r  r   s                      r2   r|   z!MptForTokenClassification.forwarde  s+   > &1%<k$++B]B]"..+)'/!5# / 	
 ,A.]3/YYv}}-F%+\\"J
')HJ3T__Ev{{S]`jSjGkD Y!4QR!88F)-)9TGf$EvE$-;;*55	
 	
r4   r   )r~   r   r   r   r=   r   r   r   r
   r   r   r   r   r|   r   r   s   @r2   r  r  R  s    y "  .2(,.2-1&*!%)-,0#'B
##d*B
 B
 t+	B

 ||d*B
 t#B
 $;B
  $;B
 #TkB
 D[B
 
u||	4	4B
 B
r4   r  c                        e Zd Z fdZe	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dedz  d	edz  d
edz  de	e
z  fd       Z xZS )MptForQuestionAnsweringc                     t         |   |       t        |      | _        t	        j
                  |j                  d      | _        | j                          y )Nr   )	r<   r=   r   r   r   rI   r>   
qa_outputsr   r   s     r2   r=   z MptForQuestionAnswering.__init__  sA     #F+))F$6$6: 	r4   Nr   rQ   r   start_positionsend_positionsr   r   r   r   c	                 "   ||n| j                   j                  }| j                  ||||||      }
|
d   }| j                  |      }|j	                  dd      \  }}|j                  d      j                         }|j                  d      j                         }d}||t        |j                               dkD  r|j                  d      }t        |j                               dkD  r|j                  d      }|j                  d      }|j                  d|      }|j                  d|      }t        |      } |||      } |||      }||z   dz  }|s||f|
dd z   }||f|z   S |S t        ||||
j                  |
j                  	      S )
r   N)rQ   r   r   r   r   r   r   rV   r   )ignore_indexr   )r   start_logits
end_logitsrN   r   )r7   r   r   r  splitr*   rl   rc   re   r\   r   r   rN   r   )rL   r   rQ   r   r  r  r   r   r   r   r   sequence_outputr   r   r!  
total_lossignored_indexr  
start_lossend_lossr   s                        r2   r|   zMptForQuestionAnswering.forward  s   4 &1%<k$++B]B]"")'/!5# # 
 "!*1#)<<r<#: j#++B/::<''+668

&=+D?'')*Q."1"9"9""==%%'(1, - 5 5b 9(--a0M-33A}EO)//=AM']CH!,@J
M:H$x/14J"J/'!"+=F/9/EZMF*Q6Q+%!!//))
 	
r4   )NNNNNNNN)r~   r   r   r=   r   r   r   FloatTensorr   r   r   r|   r   r   s   @r2   r  r    s      .237263715)-,0#'F
##d*F
 ))D0F
 ((4/	F

 ))D0F
 ''$.F
  $;F
 #TkF
 D[F
 
-	-F
 F
r4   r  )r   r   r   r   r  r  r   )0r   r#   r   r   torch.nnr   r   r   r   r   r   cache_utilsr
   r   
generationr   masking_utilsr   modeling_layersr   modeling_outputsr   r   r   r   r   modeling_utilsr   utilsr   r   configuration_mptr   
get_loggerr~   r   r3   Moduler6   r   r   r   r   r   r   r  r  __all__r   r4   r2   <module>r5     s_       L L $ . ) / 9  . , ( 
		H	%.F)299 F)RRYY *7$) 7$t % % % P
! P
 P
f P
' P
P
f s
#5 s
s
l U
 2 U
 U
p P
0 P
 P
fr4   