
    qi                     `   d Z ddlmZ ddlZddlZddlmZ ddlmZm	Z	m
Z
 ddlmZ ddlmZ dd	lmZmZmZmZmZmZ dd
lmZ ddlmZmZmZ ddlmZ  ej>                  e       Z!dZ" G d dejF                        Z$ G d dejF                        Z%dejL                  de'de'dejL                  fdZ( G d dejF                        Z) G d dejF                        Z* G d dejF                        Z+ G d dejF                        Z,	 dFd ejL                  d!e'd"e'd#e-d$e-dejL                  fd%Z. G d& d'ejF                        Z/ G d( d)ejF                        Z0e G d* d+e             Z1 G d, d-ejF                        Z2e ed./       G d0 d1e                    Z3 ed2/       G d3 d4e1             Z4e G d5 d6e1             Z5 ed7/       G d8 d9e1             Z6e G d: d;e1             Z7 ed</       G d= d>e1             Z8e G d? d@e1             Z9e G dA dBe1             Z:e G dC dDe1             Z;g dEZ<y)Gz!PyTorch Funnel Transformer model.    )	dataclassN)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )initialization)ACT2FN)BaseModelOutputMaskedLMOutputMultipleChoiceModelOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)PreTrainedModel)ModelOutputauto_docstringlogging   )FunnelConfigg    .Ac                        e Zd Zdeddf fdZ	 ddej                  dz  dej                  dz  dej                  fdZ xZS )	FunnelEmbeddingsconfigreturnNc                 @   t         |           t        j                  |j                  |j
                  |j                        | _        t        j                  |j                  |j                        | _        t        j                  |j                        | _        y )N)padding_idxeps)super__init__r   	Embedding
vocab_sizehidden_sizepad_token_idword_embeddings	LayerNormd_modellayer_norm_eps
layer_normDropouthidden_dropoutdropoutselfr   	__class__s     \/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/funnel/modeling_funnel.pyr    zFunnelEmbeddings.__init__-   sh    !||F,=,=v?Q?Q_e_r_rs,,v~~6;P;PQzz&"7"78    	input_idsinputs_embedsc                 p    || j                  |      }| j                  |      }| j                  |      }|S N)r%   r)   r,   )r.   r2   r3   
embeddingss       r0   forwardzFunnelEmbeddings.forward3   s<       00;M__]3
\\*-
r1   NN	__name__
__module____qualname__r   r    torchTensorr7   __classcell__r/   s   @r0   r   r   ,   sQ    9| 9 9 [_,DILLSWDW	r1   r   c                       e Zd ZU dZdZeed<   deddf fdZ	 	 d de	j                  d	e	j                  dz  d
e	j                  dz  dee	j                     fdZd
e	j                  de	j                  fdZdede	j                  de	j                  dee	j                     eee	j                        z  fdZde	j                  defdZd!de	j                  dedede	j                  fdZde	j                  ee	j                     z  ee	j                     z  deee   z  ee   z  de	j                  fdZ	 d"de	j                  ee	j                     z  ee	j                     z  dedede	j                  fdZdee	j                     dee	j                  ee	j                     f   fdZdee	j                     dee	j                     fdZ xZS )#FunnelAttentionStructurez>
    Contains helpers for `FunnelRelMultiheadAttention `.
       cls_token_type_idr   r   Nc                     t         |           || _        t        j                  |j
                        | _        t        j                  |j
                        | _        d | _        y r5   )	r   r    r   r   r*   r+   sin_dropoutcos_dropoutpooling_multr-   s     r0   r    z!FunnelAttentionStructure.__init__D   sM    ::f&;&;<::f&;&;< !r1   r3   attention_masktoken_type_idsc                 b   d| _         |j                  d      x| _        }| j                  ||j                  |j
                        }|| j                  |      nd}| j                  j                  r7t        j                  j                  |j                  |dz
  |dz
  g      d      nd}||||fS )zCReturns the attention inputs associated to the inputs of the model.r   N)r   r   r   r   )rH   sizeseq_lenget_position_embedsdtypedevicetoken_type_ids_to_matr   separate_clsr   
functionalpadnew_ones)r.   r3   rI   rJ   rM   position_embedstoken_type_matcls_masks           r0   init_attention_inputsz.FunnelAttentionStructure.init_attention_inputsM   s     !.!3!3A!66w227M<O<OQ^QeQefGUGa33NCgk {{'' MMm44gk7Q;5OPR^_ 	
  JJr1   c                     |dddddf   |dddf   k(  }|| j                   k(  }|dddddf   |dddf   z  }||z  S )z-Convert `token_type_ids` to `token_type_mat`.N)rD   )r.   rJ   rW   cls_idscls_mats        r0   rQ   z.FunnelAttentionStructure.token_type_ids_to_mata   sY    '1d
3~ag7NN D$:$::!Q*%4(88''r1   rM   rO   rP   c                 z   | j                   j                  }| j                   j                  dk(  rEt        j                  d|dt        j
                  |      j                  |      }t        j                  d|dz  dt        j
                  |      j                  |      }dd||dz  z  z  z  }|dddf   |d   z  }t        j                  |      }	| j                  |	      }
t        j                  |      }| j                  |      }t        j                  |
|
gd	
      }t        j                  ||	gd	
      }t        j                  ||gd	
      }t        j                  |	 |gd	
      }||||fS t        j                  d|dz  dt        j
                  |      j                  |      }dd||dz  z  z  z  }t        j                  | dz  |dz  dt        j
                  |      j                  |      }|dz  }|dddf   |d   z  }| j                  t        j                  |            }	| j                  t        j                  |            }t        j                  |	|gd	
      }t        j                  d|t        j
                  |      j                  |      }|}g }t        d| j                   j                        D ]  }|dk(  rd}ns| j                  ||      }d|dz
  z  }| j                  |||d      }|dddf   |z   }|j!                  |j#                  d      |      }t        j$                  |d|      }|}d|z  }| j                  ||      }|dddf   |z   }|j!                  |j#                  d      |      }t        j$                  |d|      }|j'                  ||g        |S )a  
        Create and cache inputs related to relative position encoding. Those are very different depending on whether we
        are using the factorized or the relative shift attention:

        For the factorized attention, it returns the matrices (phi, pi, psi, omega) used in the paper, appendix A.2.2,
        final formula.

        For the relative shift attention, it returns all possible vectors R used in the paper, appendix A.2.1, final
        formula.

        Paper link: https://huggingface.co/papers/2006.03236
        
factorizedr         ?rO   rP   rC   r   i'  Ndim)shift)r   r'   attention_typer=   arangeint64tosinrF   cosrG   catrange
num_blocksstride_pool_posrelative_posexpandrL   gatherappend)r.   rM   rO   rP   r'   pos_seqfreq_seqinv_freqsinusoid	sin_embedsin_embed_d	cos_embedcos_embed_dphipsipiomega
rel_pos_idzero_offset	pos_embedpos
pooled_posposition_embeds_listblock_indexposition_embeds_poolingstriderel_posposition_embeds_no_poolings                               r0   rN   z,FunnelAttentionStructure.get_position_embedsi   s|    ++%%;;%%5 ll1gs%++fUXXY^_G||Aw!|STZ[^^_deHEh'Q,&?@AHq$w'(4.8H		(+I**95K		(+I**95K))[+6B?C))Y	2;CK52>BII	z952>ES%(( ||Aw!|STZ[^^_deHEh'Q,&?@AHwhlGaKEKK`fgjjkpqJ!A+K!!T'*Xd^;H((8)<=I((8)<=I		9i"8bAI,,q'VLOOPUVCJ#% $Q(>(>? c !#.2+!%!5!5c;!GJ ;?3F"//VZq/QG%ag.<G%nnW\\!_gFG.3ll9a.Q+ !K++C8!!T'*[8!..a'B-2\\)Q-P*$++-GI`,ab9c: ('r1   pos_idr   c                     | j                   j                  rW|j                  d|z   dz   g      }| j                   j                  r|dd n|dd }t	        j
                  ||ddd   gd      S |ddd   S )ze
        Pool `pos_id` while keeping the cls token separate (if `config.separate_cls=True`).
        rC   r   ra   Nr   )r   rR   
new_tensortruncate_seqr=   rk   )r.   r   r   cls_pospooled_pos_ids        r0   rn   z(FunnelAttentionStructure.stride_pool_pos   s     ;;##
 ''1k>):Q)>(?@G,0KK,D,DF1RL&QRQS*M99g}SqS'9:A>>#A#;r1   r   r   rd   c                     ||}|d   |d   z
  }|t        |      z  }|||z  z   }|d   |d   z
  }t        j                  ||dz
  | t        j                  |j                        S )zV
        Build the relative positional vector between `pos` and `pooled_pos`.
        r   ra   r   r`   )lenr=   rf   longrP   )	r.   r   r   r   rd   	ref_point
num_removemax_distmin_dists	            r0   ro   z%FunnelAttentionStructure.relative_pos   sx     JqMCF*	S_,
zF22a=3r7*||HhlVG5::VYV`V`aar1   tensoraxisc                 l    |yt        t        t        f      rD ]  } j                  ||      } |S t        |t        t        f      r t	        |       fd|D              S |j
                  z   j                  j                  r# j                  j                  rt        ddd      nt        ddd      }t        t        d      gz  |gz         } j                  j                  rBt        t        d      gz  t        dd      gz         }t        j                  ||   |g      }||   S )zT
        Perform pooling by stride slicing the tensor along the given axis.
        Nc              3   B   K   | ]  }j                  |        y wr5   )stride_pool).0xr   r.   s     r0   	<genexpr>z7FunnelAttentionStructure.stride_pool.<locals>.<genexpr>   s     Ja 0 0D 9Js   ra   rC   r   )r   )
isinstancelisttupler   typendimr   rR   r   slicer=   rk   )r.   r   r   ax
axis_slice	enc_slice	cls_slices   ` `    r0   r   z$FunnelAttentionStructure.stride_pool   s'    > dT5M* 6))&"56M fudm,4<J6JJJ 	 #'++":":t{{?W?WE$A]bcgimop]q 	 5;-$.*=>	;;##uT{md2eD!n5EEFIYYy 16:FFi  r1   modec                     yt        t        t        f      r t               fdD              S  j                  j
                  rE j                  j                  rddddf   n}t        j                  ddddf   |gd      j                  }|dk(  rddddddf   n|dk(  rdddddddf   dfdk(  r$t        j                  j                  d	
      n_dk(  r$t        j                  j                  d	
      n6dk(  r&t        j                  j                   d	
       nt        d      |dk(  rddddddf   S |dk(  r	dddf   S S )z3Apply 1D pooling to a tensor of size [B x T (x H)].Nc              3   F   K   | ]  }j                           yw))r   r   N)pool_tensor)r   r   r   r.   r   r   s     r0   r   z7FunnelAttentionStructure.pool_tensor.<locals>.<genexpr>   s$     cWX 0 0d6 0 Rcs   !ra   r   rb   rC   r   meanT)r   	ceil_modemaxminz0The supported modes are 'mean', 'max' and 'min'.r   )r   r   r   r   r   rR   r   r=   rk   r   r   rS   
avg_pool2d
max_pool2dNotImplementedError)r.   r   r   r   suffixr   s   ````  r0   r   z$FunnelAttentionStructure.pool_tensor   s{    > fudm,4<c\bccc;;##'+{{'?'?VAssF^VFYYq"1"uv6A>F{{19AtQ,-FQYAtQM*F!6>]]--ffVW[-\FU]]]--ffVW[-\FU]mm..wvY].^^F%&XYY19!Q1*%%QY!Q$<r1   attention_inputsc                    |\  }}}}| j                   j                  r| j                   j                  dk(  r| j                  |dd d      |dd z   }| j                  |d      }| j                  |d      }| j	                  || j                   j
                        }n| xj                  dz  c_        | j                   j                  dk(  r| j                  |d      }| j                  |ddg      }| j                  |ddg      }| j	                  |d      }| j	                  || j                   j
                        }||||f}||fS )zTPool `output` and the proper parts of `attention_inputs` before the attention layer.r^   NrC   r   r   r   r   )r   pool_q_onlyre   r   r   pooling_typerH   )r.   outputr   rV   rW   rI   rX   s          r0   pre_attention_poolingz.FunnelAttentionStructure.pre_attention_pooling  sM    EUA;;""{{))\9"&"2"2?2A3F"J_]^]_M`"`!--na@N''!4H%%f4;;3K3K%LF"{{))\9"&"2"2?A"F!--nq!fEN''1a&9H!--n5-IN%%f4;;3K3K%LF+^^XV'''r1   c                 L   |\  }}}}| j                   j                  r| xj                  dz  c_        | j                   j                  dk(  r|dd | j	                  |dd d      z   }| j	                  |d      }| j	                  |d      }| j                  |d      }||||f}|S )zFPool the proper parts of `attention_inputs` after the attention layer.rC   r^   Nr   r   r   r   )r   r   rH   re   r   r   )r.   r   rV   rW   rI   rX   s         r0   post_attention_poolingz/FunnelAttentionStructure.post_attention_pooling3  s    DTA;;"""{{))\9"1"1"58H8HYZY[I\^_8`"`!--na@N''!4H!--n5-IN+^^XVr1   r8   Nr   )r   rC   )r:   r;   r<   __doc__rD   int__annotations__r   r    r=   r>   r   rY   rQ   rO   rP   r   rN   rn   ro   r   strr   r   r   r?   r@   s   @r0   rB   rB   =   s8    s!| ! ! /3.2	K||K t+K t+	K
 
u||	K((ELL (U\\ (N(N(#(;;N(8=N(	u||	tD$67	7N(`ell  b bc bSV b_d_k_k b!uU\\22T%,,5GG! E#Jc*! 
	!D rs$llU5<<%884;MM$UX$kn$	$L((-ell(;(	u||U5<<00	1(, uU\\7J  uUZUaUaOb  r1   rB   positional_attncontext_lenrd   r   c                     | j                   \  }}}}t        j                  | ||||g      } | d d d d |d d d f   } t        j                  | |||||z
  g      } | dd |f   } | S )N.)shaper=   reshape)r   r   rd   
batch_sizen_headrM   max_rel_lens          r0   _relative_shift_gatherr   A  s    />/D/D,J mmOj&+W^5_`O%aEFAo6OmmOj&'S^afSf5ghO%c<K<&78Or1   c                        e Zd Zdededdf fdZddZddZ	 ddej                  d	ej                  d
ej                  de
ej                     dede
ej                  df   fdZ xZS )FunnelRelMultiheadAttentionr   r   r   Nc                 J   t         |           || _        || _        |j                  |j
                  |j                  }}}t        j                  |j                        | _	        t        j                  |j                        | _
        t        j                  |||z  d      | _        t        j                  |||z        | _        t        j                  |||z        | _        t        j                  t!        j"                  ||g            | _        t        j                  t!        j"                  ||g            | _        t        j                  t!        j"                  |||g            | _        t        j                  t!        j"                  ||g            | _        t        j                  t!        j"                  d||g            | _        t        j                  ||z  |      | _        t        j0                  ||j2                        | _        d|dz  z  | _        y )NF)biasrC   r   r_   g      ?)r   r    r   r   r'   r   d_headr   r*   r+   attention_dropoutLinearq_headk_headv_head	Parameterr=   zerosr_w_biasr_r_biasr_kernelr_s_bias	seg_embed	post_projr&   r(   r)   scale)r.   r   r   r'   r   r   r/   s         r0   r    z$FunnelRelMultiheadAttention.__init__R  s   &"(..&-- jj)>)>?!#F,D,D!Eii&uEii&9ii&9U[[&&1A%BCU[[&&1A%BCU[['661J%KLU[[&&1A%BCekk1ff2E&FG6F?G<,,wF4I4IJFCK(
r1   c                 ~   | j                   j                  dk(  r|\  }}}}| j                  | j                  z  }	| j                  }
t        j                  d||	z   |
      }||dddf   z  }||dddf   z  }t        j                  d||      t        j                  d||      z   }n|j                  d   |k7  rdnd}|| j                     |dz
     }| j                  | j                  z  }| j                  }
t        j                  d||
      }t        j                  d||z   |      }t        |||      }|||z  }|S )	z5Relative attention score for the positional encodingsr^   zbinh,dnh->bindNzbind,jd->bnijr   rC   ztd,dnh->tnhzbinh,tnh->bnit)
r   re   r   r   r   r=   einsumr   r   r   )r.   rV   r   r   rX   r{   r}   r|   r~   uw_rq_r_attentionq_r_attention_1q_r_attention_2r   rd   rvr_heads                      r0   relative_positional_attentionz9FunnelRelMultiheadAttention.relative_positional_attentioni  sK    ;;%%5 #2CS%

*A--C "LL)96A:sKM+c!T'l:O+bDk9O $ll?OSQTYT`T`%U O  aK7AQE   0 01%!)<A

*A--C \\-C8F#ll+;VaZPO4_kSXYOx'Or1   c                    |y|j                   \  }}}| j                  | j                  z  }t        j                  d||z   | j
                        }|dddf   j                  ||j                   d   ||g      }t        j                  |dd      \  }	}
t        j                  ||
j                  |j                         |	j                  |j                               }|||z  }|S )z/Relative attention score for the token_type_idsNr   zbind,snd->bnisrC   r   ra   rb   )	r   r   r   r=   r   r   rp   splitwhere)r.   rW   r   rX   r   rM   r   r   token_type_biasdiff_token_typesame_token_typetoken_type_attns               r0   relative_token_type_attentionz9FunnelRelMultiheadAttention.relative_token_type_attention  s    !+9+?+?(
G[ ==4::-  ,,'7(9JDNN['4077V\\RS_V]_j8kl+0;;r+R(++O22>3G3GH/J`J`aoauauJv
 x'Or1   querykeyvaluer   output_attentions.c                    |\  }}}}	|j                   \  }
}}|j                   d   }| j                  j                  | j                  j                  }}| j	                  |      j                  |
|||      }| j                  |      j                  |
|||      }| j                  |      j                  |
|||      }|| j                  z  }| j                  | j                  z  }t        j                  d||z   |      }| j                  ||||	      }| j                  |||	      }||z   |z   }|j                  }|j                         }|%|t         d|d d d d f   j                         z
  z  z
  }t        j"                  |d|      }| j%                  |      }t        j                  d||      }| j'                  |j)                  |
|||z              }| j+                  |      }| j-                  ||z         }|r||fS |fS )Nr   zbind,bjnd->bnijra   )rc   rO   zbnij,bjnd->bind)r   r   r   r   r   viewr   r   r   r   r=   r   r   r   rO   floatINFsoftmaxr   r   r   r+   r)   )r.   r   r   r   r   r   rV   rW   rI   rX   r   rM   _r   r   r   r   r   r   r   content_scorer   r   
attn_scorerO   	attn_probattn_vecattn_outr   s                                r0   r7   z#FunnelRelMultiheadAttention.forward  s    EUA!&
GQiil++T[[-?-? U#((WffMS!&&z;OU#(([&&Q$**$==4::-%68I6R<<_fVackl<<^VU]^ #_4F
   %%'
%#cQ41N1T1T1V-V&WWJMM*"EB	**95	 << 19fE >>("2"2:wQW"XY&&x0!12&7	"FfYFr1   r5   F)r:   r;   r<   r   r   r    r   r   r=   r>   r   boolr7   r?   r@   s   @r0   r   r   Q  s    )| )# )$ ).(T< #(3G||3G \\3G ||	3G
  -3G  3G 
u||S 	!3Gr1   r   c                   `     e Zd Zdeddf fdZdej                  dej                  fdZ xZS )FunnelPositionwiseFFNr   r   Nc                    t         |           t        j                  |j                  |j
                        | _        t        |j                     | _	        t        j                  |j                        | _        t        j                  |j
                  |j                        | _        t        j                  |j                        | _        t        j                  |j                  |j                         | _        y r5   )r   r    r   r   r'   d_innerlinear_1r
   
hidden_actactivation_functionr*   activation_dropoutlinear_2r+   r,   r&   r(   r)   r-   s     r0   r    zFunnelPositionwiseFFN.__init__  s    		&..&..A#)&*;*;#< "$**V-F-F"G		&..&..Azz&"7"78,,v~~v7L7LMr1   hiddenc                     | j                  |      }| j                  |      }| j                  |      }| j                  |      }| j	                  |      }| j                  ||z         S r5   )r  r	  r
  r  r,   r)   )r.   r  hs      r0   r7   zFunnelPositionwiseFFN.forward  s^    MM&!$$Q'##A&MM!LLOvz**r1   r9   r@   s   @r0   r  r    s4    N| N N+ell +u|| +r1   r  c                        e Zd Zdededdf fdZ	 ddej                  dej                  dej                  d	ede	f
d
Z
 xZS )FunnelLayerr   r   r   Nc                 d    t         |           t        ||      | _        t	        |      | _        y r5   )r   r    r   	attentionr  ffn)r.   r   r   r/   s      r0   r    zFunnelLayer.__init__  s(    4V[I(0r1   r   r   r   r   c                 n    | j                  |||||      }| j                  |d         }|r||d   fS |fS )Nr   r   r   )r  r  )r.   r   r   r   r   r   attnr   s           r0   r7   zFunnelLayer.forward  sH     ~~eS%1AUf~g$q'"$5Q DF9Dr1   r  )r:   r;   r<   r   r   r    r=   r>   r  r   r7   r?   r@   s   @r0   r  r    si    1| 1# 1$ 1 #(
E||
E \\
E ||	
E  
E 

Er1   r  c                        e Zd Zdeddf fdZ	 	 	 	 	 ddej                  dej                  dz  dej                  dz  ded	ed
edee	z  fdZ
 xZS )FunnelEncoderr   r   Nc                 T   t         |           || _        t        |      | _        t        j                  t        |j                        D cg c];  \  }}t        j                  t        |      D cg c]  }t        ||       c}      = c}}}      | _        y c c}w c c}}}w r5   )r   r    r   rB   attention_structurer   
ModuleList	enumerateblock_sizesrl   r  blocks)r.   r   r   
block_sizer   r/   s        r0   r    zFunnelEncoder.__init__  s    #;F#C mm 099K9K/L +K zIZ[A{6;?[\
[s   $B#3B	B#B#r3   rI   rJ   r   output_hidden_statesreturn_dictc           
         |j                  |      }| j                  j                  |||      }|}|r|fnd }	|rdnd }
t        | j                        D ]  \  }}|j                  d      | j                  j                  rdndkD  }|xr |dkD  }|r| j                  j                  ||      \  }}t        |      D ]  \  }}t        | j                  j                  |         D ]{  }|dk(  xr	 |dk(  xr |}|r}| j                  j                  r|n|x}}n|x}x}} ||||||      }|d   }|r| j                  j                  |      }|r|
|dd  z   }
|sv|	|fz   }	}   |st        d ||	|
fD              S t        ||	|
      S )	NrI   rJ    r   rC   r   r  c              3   &   K   | ]	  }||  y wr5   r$  r   r   s     r0   r   z(FunnelEncoder.forward.<locals>.<genexpr>B       aqSTS`a   last_hidden_statehidden_states
attentions)type_asr  rY   r  r  rL   r   rR   r   rl   block_repeatsr   r   r   r   )r.   r3   rI   rJ   r   r   r!  r   r  all_hidden_statesall_attentionsr   blockpooling_flagpooled_hiddenlayer_indexlayerrepeat_index
do_poolingr   r   r   layer_outputs                          r0   r7   zFunnelEncoder.forward  s    (//>33II)) J 

 0D],$0d"+DKK"8 	JK!;;q>$++2J2JQPQRL';K!OL262J2J2`2`,3// '0&6 J"U$)$++*C*CK*P$Q JL".!"3!\+:J!\P\J! -040G0Gf]Ze.444e#(U<L`q#rL)!_F!+/+C+C+Z+Z[k+l(()7,qr:J)J+,=	,I)JJ	J2 aV->$OaaaGXesttr1   NNFFTr:   r;   r<   r   r    r=   r>   r  r   r   r7   r?   r@   s   @r0   r  r    s    	
| 	
 	
 /3.2"'%* 0u||0u t+0u t+	0u
  0u #0u 0u 
	 0ur1   r  r   r   
target_lenrR   r   c           	      6   |dk(  r| S |r| ddddf   }| ddddf   } t        j                  | |d      }|rT|r)t        j                  j	                  |ddd|dz
  ddf      }|ddd|dz
  f   }t        j
                  |gd      }|S |ddd|f   }|S )z{
    Upsample tensor `x` to match `target_len` by repeating the tokens `stride` time on the sequence length dimension.
    r   N)repeatsrc   r   rb   )r=   repeat_interleaver   rS   rT   rk   )r   r   r;  rR   r   clsr   s          r0   upsampler@  F  s     {2A2haeH$$QA>F]]&&v1a!Q/JKF+Z!^++,C=a0 M ;J;'Mr1   c                        e Zd Zdeddf fdZ	 	 	 	 	 ddej                  dej                  dej                  dz  dej                  dz  d	ed
ededee	z  fdZ
 xZS )FunnelDecoderr   r   Nc           	          t         |           || _        t        |      | _        t        j                  t        |j                        D cg c]  }t        |d       c}      | _
        y c c}w )Nr   )r   r    r   rB   r  r   r  rl   num_decoder_layersr  layers)r.   r   r   r/   s      r0   r    zFunnelDecoder.__init__]  sR    #;F#C mmU6KdKdEe$f[%;$fg$fs   A-final_hiddenfirst_block_hiddenrI   rJ   r   r   r!  c                    t        |dt        | j                  j                        dz
  z  |j                  d   | j                  j
                  | j                  j                        }||z   }	|r|	fnd }
|rdnd }| j                  j                  |	||      }| j                  D ]'  } ||	|	|	||      }|d   }	|r||dd  z   }|s"|
|	fz   }
) |st        d |	|
|fD              S t        |	|
|	      S )
NrC   r   )r   r;  rR   r   r$  r#  r  r   c              3   &   K   | ]	  }||  y wr5   r$  r&  s     r0   r   z(FunnelDecoder.forward.<locals>.<genexpr>  r'  r(  r)  )r@  r   r   r  r   rR   r   r  rY   rE  r   r   )r.   rF  rG  rI   rJ   r   r   r!  upsampled_hiddenr  r/  r0  r   r5  r8  s                  r0   r7   zFunnelDecoder.forwardc  s%    $T[[4459:)//21111
 "$66)=VI40d33II)) J 
 [[ 	BE 9I]noL!!_F !/,qr2B!B#$5	$A!	B aV->$OaaaGXesttr1   r9  r:  r@   s   @r0   rB  rB  \  s    h| h h /3.2"'%* 'ull'u "LL'u t+	'u
 t+'u  'u #'u 'u 
	 'ur1   rB  c                   d     e Zd ZdZdeddf fdZdej                  dej                  fdZ xZ	S )FunnelDiscriminatorPredictionszEPrediction module for the discriminator, made up of two dense layers.r   r   Nc                     t         |           || _        t        j                  |j
                  |j
                        | _        t        j                  |j
                  d      | _        y r   )r   r    r   r   r   r'   densedense_predictionr-   s     r0   r    z'FunnelDiscriminatorPredictions.__init__  sF    YYv~~v~~>
 "		&..! <r1   discriminator_hidden_statesc                     | j                  |      }t        | j                  j                     |      }| j	                  |      j                  d      }|S )Nra   )rN  r
   r   r  rO  squeeze)r.   rP  r+  logitss       r0   r7   z&FunnelDiscriminatorPredictions.forward  sJ    

#>?t{{556}E&&}5==bAr1   )
r:   r;   r<   r   r   r    r=   r>   r7   r?   r@   s   @r0   rL  rL    s4    O=| = =5<< ELL r1   rL  c                   J    e Zd ZU eed<   dZ ej                         d        Zy)FunnelPreTrainedModelr   funnelc                 r   |j                   j                  }|j                  d      dk7  rt        |dd       | j                  j
                  >|j                  j                  \  }}t        j                  dt        ||z         z        }n| j                  j
                  }t        j                  |j                  |       t        |dd       !t        j                  |j                  d       y y |dk(  r
t        j                  |j                   | j                  j"                  	       t        j                  |j$                  | j                  j"                  	       t        j                  |j&                  | j                  j"                  	       t        j                  |j(                  | j                  j"                  	       t        j                  |j*                  | j                  j"                  	       y |d
k(  r| j                  j
                  dn| j                  j
                  }t        j                  |j,                  j                  |       |j,                  j.                  At        j0                  |j,                  j                  |j,                  j.                            y y y )Nr   ra   weightr_   )stdr   g        r   )br   )r/   r:   findgetattrr   initializer_stdrX  r   npsqrtr   initnormal_	constant_r   uniform_r   initializer_ranger   r   r   r   r%   r   zeros_)r.   module	classnamefan_outfan_inrY  s         r0   _init_weightsz#FunnelPreTrainedModel._init_weights  s   $$--	>>(#r)vx.:;;..6&,mm&9&9OGV''#fw.>(?"?@C++55CV]]4vvt,8v{{C0 977MM&//T[[-J-JKMM&//T[[-J-JKMM&//T[[-J-JKMM&//T[[-J-JKMM&**dkk.K.KL,,44<#$++B]B]CLL//66C@%%11=F2299&:P:P:\:\]^ > -r1   N)	r:   r;   r<   r   r   base_model_prefixr=   no_gradrj  r$  r1   r0   rU  rU    s*     U]]__ _r1   rU  c                   d     e Zd Zdededdf fdZdej                  dej                  fdZ xZ	S )FunnelClassificationHeadr   n_labelsr   Nc                    t         |           t        j                  |j                  |j                        | _        t        j                  |j                        | _        t        j                  |j                  |      | _	        y r5   )
r   r    r   r   r'   linear_hiddenr*   r+   r,   
linear_out)r.   r   ro  r/   s      r0   r    z!FunnelClassificationHead.__init__  sU    YYv~~v~~Fzz&"7"78))FNNH=r1   r  c                     | j                  |      }t        j                  |      }| j                  |      }| j	                  |      S r5   )rq  r=   tanhr,   rr  )r.   r  s     r0   r7   z FunnelClassificationHead.forward  s=    ##F+F#f%v&&r1   )
r:   r;   r<   r   r   r    r=   r>   r7   r?   r@   s   @r0   rn  rn    s8    >| >s >t >'ell 'u|| 'r1   rn  z2
    Output type of [`FunnelForPreTraining`].
    )custom_introc                       e Zd ZU dZdZej                  dz  ed<   dZej                  dz  ed<   dZ	e
ej                     dz  ed<   dZe
ej                     dz  ed<   y)FunnelForPreTrainingOutputa1  
    loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
        Total loss of the ELECTRA-style objective.
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
        Prediction scores of the head (scores for each token before SoftMax).
    NlossrS  r+  r,  )r:   r;   r<   r   rx  r=   FloatTensorr   rS  r+  r   r,  r$  r1   r0   rw  rw    sg     &*D%

d
")'+FE$+59M5**+d2926Je''(4/6r1   rw  z
    The base Funnel Transformer Model transformer outputting raw hidden-states without upsampling head (also called
    decoder) or any task-specific head on top.
    c                   N    e Zd Zdeddf fdZdej                  fdZdej                  ddfdZe		 	 	 	 	 	 	 	 dde
j                  dz  d	e
j                  dz  d
e
j                  dz  de
j                  dz  de
j                  dz  dedz  dedz  dedz  deez  fd       Z xZS )FunnelBaseModelr   r   Nc                     t         |   |       t        |      | _        t	        |      | _        | j                          y r5   )r   r    r   r6   r  encoder	post_initr-   s     r0   r    zFunnelBaseModel.__init__  s4     *62$V, 	r1   c                 .    | j                   j                  S r5   r6   r%   r.   s    r0   get_input_embeddingsz$FunnelBaseModel.get_input_embeddings      ...r1   new_embeddingsc                 &    || j                   _        y r5   r  r.   r  s     r0   set_input_embeddingsz$FunnelBaseModel.set_input_embeddings      *8'r1   r2   rI   rJ   position_idsr3   r   r   r!  c	                 V   ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }||t	        d      |#| j                  ||       |j                         }
n!||j                         d d }
nt	        d      ||j                  n|j                  }|t        j                  |
|      }|&t        j                  |
t        j                  |      }| j                  ||      }| j                  ||||||      }|S )NDYou cannot specify both input_ids and inputs_embeds at the same timera   5You have to specify either input_ids or inputs_embedsrP   r`   r3   rI   rJ   r   r   r!  )r   r   r   use_return_dict
ValueError%warn_if_padding_and_no_attention_maskrL   rP   r=   onesr   r   r6   r}  )r.   r2   rI   rJ   r  r3   r   r   r!  kwargsinput_shaperP   encoder_outputss                r0   r7   zFunnelBaseModel.forward  s;    2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B] ]%>cdd"66y.Q#..*K&',,.s3KTUU%.%:!!@T@T!"ZZFCN!"[[EJJvVN	O,,))/!5# ' 
 r1   NNNNNNNNr:   r;   r<   r   r    r   r!   r  r  r   r=   r>   r  r   r   r7   r?   r@   s   @r0   r{  r{    s   |  /bll /92<< 9D 9  *..2.2,0-1)-,0#'.<<$&. t+. t+	.
 llT). ||d*.  $;. #Tk. D[. 
	 . .r1   r{  c                   .    e Zd Zdeddf fdZdej                  fdZdej                  ddfdZe		 	 	 	 	 	 	 dde
j                  dz  d	e
j                  dz  d
e
j                  dz  de
j                  dz  dedz  dedz  dedz  deez  fd       Z xZS )FunnelModelr   r   Nc                     t         |   |       || _        t        |      | _        t        |      | _        t        |      | _        | j                          y r5   )
r   r    r   r   r6   r  r}  rB  decoderr~  r-   s     r0   r    zFunnelModel.__init__(  sG     *62$V,$V, 	r1   c                 .    | j                   j                  S r5   r  r  s    r0   r  z FunnelModel.get_input_embeddings2  r  r1   r  c                 &    || j                   _        y r5   r  r  s     r0   r  z FunnelModel.set_input_embeddings5  r  r1   r2   rI   rJ   r3   r   r   r!  c           	         ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }||t	        d      |#| j                  ||       |j                         }	n!||j                         d d }	nt	        d      ||j                  n|j                  }
|t        j                  |	|
      }|&t        j                  |	t        j                  |
      }| j                  ||      }| j                  ||||d|      }| j                  |d	   |d
   | j                   j                  d	      |||||      }|s6d	}|d	   f}|r|d
z  }||d
   ||   z   fz   }|r|d
z  }||d   ||   z   fz   }|S t!        |d	   |r|j"                  |j"                  z   nd |r|j$                  |j$                  z         S d       S )Nr  ra   r  r  r`   r  Tr  r   r   )rF  rG  rI   rJ   r   r   r!  rC   r)  )r   r   r   r  r  r  rL   rP   r=   r  r   r   r6   r}  r  r  r   r+  r,  )r.   r2   rI   rJ   r3   r   r   r!  r  r  rP   r  decoder_outputsidxoutputss                  r0   r7   zFunnelModel.forward8  s?    2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B] ]%>cdd"66y.Q#..*K&',,.s3KTUU%.%:!!@T@T!"ZZFCN!"[[EJJvVN	O,,))/!%# ' 
 ,,(+.q1$++2I2I!2LM))/!5# ' 
 C&q)+G#q!_Q%7/#:N%N$PP q!_Q%7/#:N%N$PPN-a0# +88?;X;XXTe22_5O5OO
 	

 lp
 	
r1   )NNNNNNNr  r@   s   @r0   r  r  &  s    |  /bll /92<< 9D 9  *..2.2-1)-,0#'H
<<$&H
 t+H
 t+	H

 ||d*H
  $;H
 #TkH
 D[H
 
	 H
 H
r1   r  z
    Funnel Transformer model with a binary classification head on top as used during pretraining for identifying
    generated tokens.
    c                   
    e Zd Zdeddf fdZe	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  d	ej                  dz  d
edz  dedz  dedz  de	e
z  fd       Z xZS )FunnelForPreTrainingr   r   Nc                     t         |   |       t        |      | _        t	        |      | _        | j                          y r5   )r   r    r  rV  rL  discriminator_predictionsr~  r-   s     r0   r    zFunnelForPreTraining.__init__  s3     !&))G)O&r1   r2   rI   rJ   r3   labelsr   r   r!  c	           	      `   ||n| j                   j                  }| j                  |||||||      }
|
d   }| j                  |      }d}|t	        j
                         }|a|j                  d|j                  d         dk(  }|j                  d|j                  d         |   }||   } |||j                               }n4 ||j                  d|j                  d         |j                               }|s|f|
dd z   }||f|z   S |S t        |||
j                  |
j                        S )a"  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the ELECTRA-style loss. Input should be a sequence of tokens (see `input_ids`
            docstring) Indices should be in `[0, 1]`:

            - 0 indicates the token is an original token,
            - 1 indicates the token was replaced.

        Examples:

        ```python
        >>> from transformers import AutoTokenizer, FunnelForPreTraining
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("funnel-transformer/small")
        >>> model = FunnelForPreTraining.from_pretrained("funnel-transformer/small")

        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
        >>> logits = model(**inputs).logits
        ```NrI   rJ   r3   r   r   r!  r   ra   r   rx  rS  r+  r,  )r   r  rV  r  r   r   r   r   r   rw  r+  r,  )r.   r2   rI   rJ   r3   r  r   r   r!  r  rP  discriminator_sequence_outputrS  rx  loss_fctactive_lossactive_logitsactive_labelsr   s                      r0   r7   zFunnelForPreTraining.forward  sj   B &1%<k$++B]B]&*kk))'/!5# '2 '
# )DA(F%//0MN++-H),11"6S6Y6YZ[6\]abb &B0M0S0STU0V WXc d &{ 3}/B/B/DEB0M0S0STU0V WY_YeYeYghY!<QR!@@F)-)9TGf$EvE)5CC2==	
 	
r1   r  )r:   r;   r<   r   r    r   r=   r>   r  r   rw  r7   r?   r@   s   @r0   r  r    s    |    *..2.2-1&*)-,0#'C
<<$&C
 t+C
 t+	C

 ||d*C
 t#C
  $;C
 #TkC
 D[C
 
+	+C
 C
r1   r  c                   V    e Zd ZddiZdeddf fdZdej                  fdZdej                  ddfd	Z
e	 	 	 	 	 	 	 	 dd
ej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dedz  dedz  dedz  deez  fd       Z xZS )FunnelForMaskedLMzlm_head.weightz(funnel.embeddings.word_embeddings.weightr   r   Nc                     t         |   |       t        |      | _        t	        j
                  |j                  |j                        | _        | j                          y r5   )
r   r    r  rV  r   r   r'   r"   lm_headr~  r-   s     r0   r    zFunnelForMaskedLM.__init__  sD     !&)yy1B1BC 	r1   c                     | j                   S r5   r  r  s    r0   get_output_embeddingsz'FunnelForMaskedLM.get_output_embeddings  s    ||r1   r  c                     || _         y r5   r  r  s     r0   set_output_embeddingsz'FunnelForMaskedLM.set_output_embeddings  s	    %r1   r2   rI   rJ   r3   r  r   r   r!  c	           	         ||n| j                   j                  }| j                  |||||||      }
|
d   }| j                  |      }d}|Ft	               } ||j                  d| j                   j                        |j                  d            }|s|f|
dd z   }||f|z   S |S t        |||
j                  |
j                        S )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        Nr  r   ra   r   r  )
r   r  rV  r  r   r   r"   r   r+  r,  )r.   r2   rI   rJ   r3   r  r   r   r!  r  r  r*  prediction_logitsmasked_lm_lossr  r   s                   r0   r7   zFunnelForMaskedLM.forward  s    & &1%<k$++B]B]++))'/!5#  
 $AJ LL):;')H%&7&<&<RAWAW&XZ`ZeZefhZijN')GABK7F3A3M^%.YSYY$!//))	
 	
r1   r  )r:   r;   r<   _tied_weights_keysr   r    r   r   r  r!   r  r   r=   r>   r  r   r   r7   r?   r@   s   @r0   r  r    s   *,VW|  ryy &BLL &T &  *..2.2-1&*)-,0#'/
<<$&/
 t+/
 t+	/

 ||d*/
 t#/
  $;/
 #Tk/
 D[/
 
	/
 /
r1   r  z
    Funnel Transformer Model with a sequence classification/regression head on top (two linear layer on top of the
    first timestep of the last hidden state) e.g. for GLUE tasks.
    c                   
    e Zd Zdeddf fdZe	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  d	ej                  dz  d
edz  dedz  dedz  de	e
z  fd       Z xZS )FunnelForSequenceClassificationr   r   Nc                     t         |   |       |j                  | _        || _        t	        |      | _        t        ||j                        | _        | j                          y r5   )	r   r    
num_labelsr   r{  rV  rn  
classifierr~  r-   s     r0   r    z(FunnelForSequenceClassification.__init__'  sN      ++%f-266;L;LMr1   r2   rI   rJ   r3   r  r   r   r!  c	           	      ,   ||n| j                   j                  }| j                  |||||||      }
|
d   }|dddf   }| j                  |      }d}|| j                   j                  | j
                  dk(  rd| j                   _        nl| j
                  dkD  rL|j                  t        j                  k(  s|j                  t        j                  k(  rd| j                   _        nd| j                   _        | j                   j                  dk(  rIt               }| j
                  dk(  r& ||j                         |j                               }n |||      }n| j                   j                  dk(  r=t               } ||j                  d| j
                        |j                  d            }n,| j                   j                  dk(  rt               } |||      }|s|f|
dd z   }||f|z   S |S t        |||
j                   |
j"                  	      S )
a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Nr  r   r   
regressionsingle_label_classificationmulti_label_classificationra   r  )r   r  rV  r  problem_typer  rO   r=   r   r   r   rR  r   r   r   r   r+  r,  )r.   r2   rI   rJ   r3   r  r   r   r!  r  r  r*  pooled_outputrS  rx  r  r   s                    r0   r7   z'FunnelForSequenceClassification.forward1  s   & &1%<k$++B]B]++))'/!5#  
 $AJ)!Q$//{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#FNN$4fnn6FGD#FF3D))-JJ+-B @&++b/R))-II,./Y,F)-)9TGf$EvE'!//))	
 	
r1   r  )r:   r;   r<   r   r    r   r=   r>   r  r   r   r7   r?   r@   s   @r0   r  r     s    |    *..2.2-1&*)-,0#'B
<<$&B
 t+B
 t+	B

 ||d*B
 t#B
  $;B
 #TkB
 D[B
 
)	)B
 B
r1   r  c                   
    e Zd Zdeddf fdZe	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  d	ej                  dz  d
edz  dedz  dedz  de	e
z  fd       Z xZS )FunnelForMultipleChoicer   r   Nc                     t         |   |       t        |      | _        t	        |d      | _        | j                          y r   )r   r    r{  rV  rn  r  r~  r-   s     r0   r    z FunnelForMultipleChoice.__init__y  s4     %f-261=r1   r2   rI   rJ   r3   r  r   r   r!  c	           	         ||n| j                   j                  }||j                  d   n|j                  d   }
|!|j                  d|j	                  d            nd}|!|j                  d|j	                  d            nd}|!|j                  d|j	                  d            nd}|1|j                  d|j	                  d      |j	                  d            nd}| j                  |||||||      }|d   }|dddf   }| j                  |      }|j                  d|
      }d}|t               } |||      }|s|f|dd z   }||f|z   S |S t        |||j                  |j                        S )aJ  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
            `input_ids` above)
        Nr   ra   r  r   r  )r   r  r   r   rL   rV  r  r   r   r+  r,  )r.   r2   rI   rJ   r3   r  r   r   r!  r  num_choicesr  r*  r  rS  reshaped_logitsrx  r  r   s                      r0   r7   zFunnelForMultipleChoice.forward  s   & &1%<k$++B]B],5,Aiooa(}GZGZ[\G]>G>SINN2y~~b'9:Y]	M[Mg,,R1D1DR1HImqM[Mg,,R1D1DR1HImq ( r=#5#5b#9=;M;Mb;QR 	 ++))'/!5#  
 $AJ)!Q$// ++b+6')HOV4D%''!"+5F)-)9TGf$EvE("!//))	
 	
r1   r  )r:   r;   r<   r   r    r   r=   r>   r  r   r   r7   r?   r@   s   @r0   r  r  w  s    |    *..2.2-1&*)-,0#';
<<$&;
 t+;
 t+	;

 ||d*;
 t#;
  $;;
 #Tk;
 D[;
 
*	*;
 ;
r1   r  c                   
    e Zd Zdeddf fdZe	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  d	ej                  dz  d
edz  dedz  dedz  de	e
z  fd       Z xZS )FunnelForTokenClassificationr   r   Nc                 ,   t         |   |       |j                  | _        t        |      | _        t        j                  |j                        | _        t        j                  |j                  |j                        | _        | j                          y r5   )r   r    r  r  rV  r   r*   r+   r,   r   r#   r  r~  r-   s     r0   r    z%FunnelForTokenClassification.__init__  si      ++!&)zz&"7"78))F$6$68I8IJ 	r1   r2   rI   rJ   r3   r  r   r   r!  c	           	         ||n| j                   j                  }| j                  |||||||      }
|
d   }| j                  |      }| j	                  |      }d}|<t               } ||j                  d| j                        |j                  d            }|s|f|
dd z   }||f|z   S |S t        |||
j                  |
j                        S )z
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        Nr  r   ra   r   r  )r   r  rV  r,   r  r   r   r  r   r+  r,  )r.   r2   rI   rJ   r3   r  r   r   r!  r  r  r*  rS  rx  r  r   s                   r0   r7   z$FunnelForTokenClassification.forward  s    " &1%<k$++B]B]++))'/!5#  
 $AJ LL):;!23')HFKKDOO<fkk"oNDY,F)-)9TGf$EvE$!//))	
 	
r1   r  )r:   r;   r<   r   r    r   r=   r>   r  r   r   r7   r?   r@   s   @r0   r  r    s    	| 	 	  *..2.2-1&*)-,0#'.
<<$&.
 t+.
 t+	.

 ||d*.
 t#.
  $;.
 #Tk.
 D[.
 
&	&.
 .
r1   r  c                   *    e Zd Zdeddf fdZe	 	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  d	ej                  dz  d
ej                  dz  dedz  dedz  dedz  de	e
z  fd       Z xZS )FunnelForQuestionAnsweringr   r   Nc                     t         |   |       |j                  | _        t        |      | _        t        j                  |j                  |j                        | _        | j                          y r5   )
r   r    r  r  rV  r   r   r#   
qa_outputsr~  r-   s     r0   r    z#FunnelForQuestionAnswering.__init__  sS      ++!&)))F$6$68I8IJ 	r1   r2   rI   rJ   r3   start_positionsend_positionsr   r   r!  c
           	      $   |	|	n| j                   j                  }	| j                  |||||||	      }|d   }| j                  |      }|j	                  dd      \  }}|j                  d      j                         }|j                  d      j                         }d }||t        |j                               dkD  r|j                  d      }t        |j                               dkD  r|j                  d      }|j                  d      }|j                  d|      }|j                  d|      }t        |      } |||      } |||      }||z   dz  }|	s||f|dd  z   }||f|z   S |S t        ||||j                  |j                        S )	Nr  r   r   ra   rb   )ignore_indexrC   )rx  start_logits
end_logitsr+  r,  )r   r  rV  r  r   rR  
contiguousr   rL   squezeclampr   r   r+  r,  )r.   r2   rI   rJ   r3   r  r  r   r   r!  r  r  r*  rS  r  r  
total_lossignored_indexr  
start_lossend_lossr   s                         r0   r7   z"FunnelForQuestionAnswering.forward  s    &1%<k$++B]B]++))'/!5#  
 $AJ!23#)<<r<#: j#++B/::<''+668

&=+D?'')*Q."1"8"8"<=%%'(1, - 5 5b 9(--a0M-33A}EO)//=AM']CH!,@J
M:H$x/14J"J/'!"+=F/9/EZMF*Q6Q+%!!//))
 	
r1   )	NNNNNNNNN)r:   r;   r<   r   r    r   r=   r>   r  r   r   r7   r?   r@   s   @r0   r  r    s    |    *..2.2-1/3-1)-,0#';
<<$&;
 t+;
 t+	;

 ||d*;
 ,;
 ||d*;
  $;;
 #Tk;
 D[;
 
-	-;
 ;
r1   r  )	r{  r  r  r  r  r  r  r  rU  )TF)=r   dataclassesr   numpyr^  r=   r   torch.nnr   r   r    r	   r`  activationsr
   modeling_outputsr   r   r   r   r   r   modeling_utilsr   utilsr   r   r   configuration_funnelr   
get_loggerr:   loggerr   Moduler   rB   r>   r   r   r   r  r  r  r  r@  rB  rL  rU  rn  rw  r{  r  r  r  r  r  r  r  __all__r$  r1   r0   <module>r     s   ( !    A A & !  . 9 9 . 
		H	% 
ryy "A ryy A HELL s SV [`[g[g  MG")) MG`+BII +&E")) E&<uBII <u@ di|| .1AE\`
\\,.uBII .ubRYY   _O _ _<'ryy ' 
7 7 7 ?+ ??D Z
' Z
 Z
z M
0 M
M
` B
- B
 B
J N
&; N
N
b E
3 E
 E
P ;
#8 ;
 ;
| G
!6 G
 G
T
r1   