
    qi              
          d Z ddlZddlmZ ddlmZ ddlZddlmZ ddlm	Z	m
Z
mZmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZ ddlmZ ddlmZ ddlmZmZ ddlmZ ddl m!Z!m"Z"m#Z#m$Z$m%Z% ddl&m'Z'm(Z( ddl)m*Z* ddl+m,Z,m-Z- ddl.m/Z/ ddl0m1Z1  e       rddlm2Z2  e-jf                  e4      Z5 G d dejl                        Z7d Z8dAdZ9 G d dejt                        Z;dejx                  de=d ej|                  d!ejx                  fd"Z?d#ejx                  d$ejx                  d%e@d&eAd!ejx                  f
d'ZB G d( d)ejt                        ZC G d* d+eC      ZD G d, d-ejt                        ZEeCeCeDd.ZF G d/ d0e      ZGe, G d1 d2e*             ZHe, G d3 d4eH             ZI e,d56       G d7 d8eHe             ZJ e,d96       G d: d;eH             ZKe, G d< d=eH             ZLe, G d> d?eH             ZMg d@ZNy)BzPyTorch Falcon model.    N)Callable)Optional)nn)BCEWithLogitsLossCrossEntropyLoss	LayerNormMSELoss)
functional   )initialization)get_activation)CacheDynamicCache)GenerationMixin)create_causal_mask)!flash_attn_supports_top_left_maskis_flash_attn_available)GradientCheckpointingLayer))BaseModelOutputWithPastAndCrossAttentions!CausalLMOutputWithCrossAttentionsQuestionAnsweringModelOutput SequenceClassifierOutputWithPastTokenClassifierOutput)ROPE_INIT_FUNCTIONSdynamic_rope_update)PreTrainedModel)auto_docstringlogging)maybe_autocast   )FalconConfig)_flash_attention_forwardc                   D    e Zd Zdej                  dej                  fdZy)FalconLinearinputreturnc                 n    || j                   j                  z  }| j                  |S || j                  z   S N)weightTbias)selfr%   hidden_statess      \/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/falcon/modeling_falcon.pyforwardzFalconLinear.forward=   s3    -99  tyy((    N)__name__
__module____qualname__torchTensorr/    r0   r.   r$   r$   <   s    )U\\ )ell )r0   r$   c                     | dd| j                   d   dz  f   }| d| j                   d   dz  df   }t        j                  | |fd      S )z*Rotates half the hidden dims of the input..N   dim)shaper4   cat)xx1x2s      r.   rotate_halfrA   E   sZ    	
3"!''"+"""	#B	
3q ""	#B99rc2YB''r0   c                     |j                  |      }|j                  |      }| |z  t        |       |z  z   }||z  t        |      |z  z   }||fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    )	unsqueezerA   )qkcossinunsqueeze_dimq_embedk_embeds          r.   apply_rotary_pos_embrK   M   sY    $ --
&C
--
&C3w;q>C/0G3w;q>C/0GGr0   c                        e Zd ZU ej                  ed<   ddef fdZe	 	 	 ddedz  de	d   de
dz  ded	ef   fd
       Z ej                         ed               Z xZS )FalconRotaryEmbeddinginv_freqNconfigc                    t         |           |j                  | _        |j                  | _        || _        | j
                  j                  d   | _        | j                  }| j                  dk7  rt        | j                     } || j
                  |      \  }| _
        | j                  d|d       | j                  d|j                         d       y )N	rope_typedefaultrN   F)
persistentoriginal_inv_freq)super__init__max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenrO   rope_parametersrQ   compute_default_rope_parametersr   attention_scalingregister_bufferclone)r,   rO   devicerope_init_fnrN   	__class__s        r.   rV   zFalconRotaryEmbedding.__init__j   s    "("@"@$*$B$B!44[A!%!E!E>>Y&.t~~>L+7V+L($(ZeD0(..2BuUr0   r_   ztorch.deviceseq_lenr&   ztorch.Tensorc                    | j                   d   }t        | dd      xs | j                  | j                  z  }d}d|t	        j
                  d|dt        j                        j                  |t        j                        |z  z  z  }||fS )	a  
        Computes the inverse frequencies according to the original RoPE implementation
        Args:
            config ([`~transformers.PreTrainedConfig`]):
                The model configuration.
            device (`torch.device`):
                The device to use for initialization of the inverse frequencies.
            seq_len (`int`, *optional*):
                The current sequence length. Unused for this type of RoPE.
        Returns:
            Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
            post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
        
rope_thetahead_dimN      ?r   r9   dtyper_   rh   )	rZ   getattrhidden_sizenum_attention_headsr4   arangeint64tofloat)rO   r_   rb   baser;   attention_factorrN   s          r.   r[   z5FalconRotaryEmbedding.compute_default_rope_parametersz   s    & %%l3fj$/c63E3EIcIc3c U\\!S!5;;?BB&X]XcXcBdgjjk
 )))r0   c                 N   | j                   d d d d f   j                         j                  |j                  d   dd      j	                  |j
                        }|d d d d d f   j                         }t        |j
                  j                  t              r/|j
                  j                  dk7  r|j
                  j                  nd}t        |d      5  |j                         |j                         z  j                  dd      }t        j                  ||fd	      }|j                         | j                  z  }|j                         | j                  z  }	d d d        j	                  |j                   
      	j	                  |j                   
      fS # 1 sw Y   AxY w)Nr   r8   r    mpscpuF)device_typeenabledr9   r:   rg   )rN   rp   expandr<   ro   r_   
isinstancetypestrr   	transposer4   r=   rF   r\   rG   rh   )
r,   r>   position_idsinv_freq_expandedposition_ids_expandedrv   freqsembrF   rG   s
             r.   r/   zFalconRotaryEmbedding.forward   sR    !MM$4-8>>@GGHZHZ[\H]_acdehhijiqiqr ,QaZ 8 > > @'1!((--'E!((--[`J`ahhmmfkUC 	5&,,.1F1L1L1NNYYZ[]^_E))UEN3C'')d444C'')d444C		5 vvAGGv$cff177f&;;;	5 	5s   BFF$r(   )NNN)r1   r2   r3   r4   r5   __annotations__r!   rV   staticmethodr   inttuplerp   r[   no_gradr   r/   __classcell__ra   s   @r.   rM   rM   g   s    llV| V  &*+/"*t#*(* t* 
~u$	%	* *: U]]_<  <r0   rM   attention_mask	num_headsrh   r&   c                    | j                   \  }}dt        j                  t        j                  |            z  }t	        j
                  ddt        j                  |      dz
   z   z  | j                  t        j                        }t	        j                  dd|z   | j                  t        j                        }t	        j                  ||      }||k7  rt	        j
                  ddt        j                  d|z        dz
   z   z  | j                  t        j                        }	t        |||z
        }
t	        j                  ddd|
z  z   d| j                  t        j                        }t	        j                  |t	        j                  |	|      gd      }| j                  d      dz
  | z  d d d d d f   }|d   j                         |z  }|j                  ||z  d|      j!                  |      S )	Nr9   r   ri   r    r   r:   r8   ).N)r<   mathfloorlog2r4   tensorr_   float32rm   int32powminr=   cumsumbfloat16reshapero   )r   r   rh   
batch_size
seq_lengthclosest_power_of_2rq   powersslopes
extra_basenum_remaining_headsextra_powersarange_tensoralibis                 r.   build_alibi_tensorr      s   +11J
djj9)=>><<	tyy!34q899:;NDYDYafananD \\!Q!33N<Q<QY^YdYdeFYYtV$FY&\\A499Q);%;<q@AABCNLaLainiviv

 ""4iBT6TU||Aq1/B+B'BAnNcNckpkvkvwFEIIj,$GHaP %+++3a7>I1dTU:VM9&&(=8E==i/J?BB5IIr0   r>   residualprobtrainingc                 @    t        j                  | ||      }||z   }|S )a
  
    Dropout add function

    Args:
        x (`torch.tensor`):
            input tensor
        residual (`torch.tensor`):
            residual tensor
        prob (`float`):
            dropout probability
        training (`bool`):
            training mode
    )pr   )Fdropout)r>   r   r   r   outs        r.   dropout_addr      s$     ))A
1C
S.CJr0   c                       e Zd Zddef fdZdej                  deej                  ej                  ej                  f   fdZdej                  dej                  fdZ		 	 	 	 	 	 dd	ej                  d
ej                  dz  dej                  dej                  dz  dedz  dededej                  dz  deej                  ej                  f   dz  fdZ xZS )FalconAttentionNrO   c                    t         |           || _        |j                  | _        |j                  | _        | j                  | j
                  z  | _        | j                  | _        |j                  | _        |j                  | _	        d| _
        || _        |-t        j                  d| j                  j                   d       | j                  | j
                  z  | j                  k7  r&t!        d| j                   d| j
                   d      dt#        j$                  | j                        z  | _        | j&                  | _        |j*                  r*|j,                  dz  |j                  z   | j                  z  }n8|j.                  r| j                  d| j                  z  z   }nd	| j                  z  }t1        | j                  ||j2                  
      | _        |j*                  | _        |j.                  | _        t1        | j                  | j                  |j2                  
      | _        t9        j:                  |j<                        | _        | j*                  s| j.                  s|j,                  | _        y d| _        y )NTzInstantiating z without passing a `layer_idx` is not recommended and will lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` when creating this class.zA`hidden_size` must be divisible by num_heads (got `hidden_size`: z and `num_heads`: z).rf   r9   r   r+   r    )rU   rV   rO   rk   rl   r   re   
split_sizehidden_dropoutrW   	is_causal	layer_idxloggerwarning_oncera   r1   
ValueErrorr   sqrtinv_norm_factorbetanew_decoder_architecturenum_kv_headsmulti_queryr$   r+   query_key_valuedenser   Dropoutattention_dropout)r,   rO   r   qkv_out_dimra   s       r.   rV   zFalconAttention.__init__   s    !--33((DNN:**$33'-'E'E$" !8!8 9 :, , ==4>>)T-=-==STXTdTdSe fNN#2'   #TYYt}}%==((	**!..2V5O5OOSWS`S``K**Q->>Kd...K+D,<,<kPVP[P[\(.(G(G%!--!$"2"2D4D4D6;;W
!#F,D,D!E484Q4QY]YiYiF//pqr0   	fused_qkvr&   c                 l   | j                   r|j                  \  }}}|j                  ||d| j                  | j                  z  dz   | j
                        }|ddddddddf   }|dddddddgf   }|dddddddgf   }t        j                  ||j                        }t        j                  ||j                        }|||fD 	cg c]  }	|	j                  dd       c}	\  }}}|||fS | j                  sV|j                  \  }
}}|j                  |
|| j                  d| j
                        }|ddddf   |ddddf   |ddddf   fS |j                  \  }
}}|j                  |
|| j                  dz   | j
                        }|dddddf   |ddgddf   |ddgddf   fS c c}	w )	a  
        Split the last dimension into (num_heads, head_dim), results share same memory storage as `fused_qkv`

        Args:
            fused_qkv (`torch.tensor`): [batch_size, seq_length, num_heads * 3 * head_dim]

        Returns:
            query: [batch_size, seq_length, num_heads, head_dim] key: [batch_size, seq_length, num_heads, head_dim]
            value: [batch_size, seq_length, num_heads, head_dim]
        r8   r9   Nr   .r   r    )
r   r<   viewr   r   re   r4   broadcast_toflattenr   )r,   r   batchrb   _qkvquerykeyvaluer>   r   r   three_times_hidden_sizes                r.   _split_headszFalconAttention._split_heads  s    (( )E7A..T^^tGXGX5X[\5\^b^k^klC1a"%EaAtm$C1a"&E$$S%++6C&&uekk:E;@#u:M NQ1a NE3#u$$!!>Goo;J
$;!z:t~~qRVR_R_`IS!QY'319)=yaQR?SSS>Goo;J
$;!z:t~~PQ?QSWS`S`aIS#2#q[)9S2$\+BIcTVSWYZlD[[[ !Os   F1r>   c                    |j                   \  }}}|| j                  z  }|j                  || j                  || j                        }|j	                  dddd      }|j                  ||| j                  | j                  z        S )z
        Merge heads together over the last dimension

        Args:
            x (`torch.tensor`): [batch_size * num_heads, seq_length, head_dim]

        Returns:
            torch.tensor: [batch_size, seq_length, num_heads * head_dim]
        r   r9   r    r   )r<   r   r   re   permuter   )r,   r>   batch_size_and_num_headsr   r   r   s         r.   _merge_headszFalconAttention._merge_heads#  sy     34''/ *a-?
 FF:t~~z4==I IIaAq! yyZ$--1OPPr0   r-   r   r   r}   
layer_past	use_cacheoutput_attentionscache_positionposition_embeddingsc
                 h	   | j                  |      }
| j                  r| j                  n| j                  }| j	                  |
      \  }}}|j
                  \  }}}}|j                  dd      j                  || j                  || j                        }|j                  dd      j                  |||| j                        }|j                  dd      j                  |||| j                        }||	\  }}t        ||||      \  }}|;d|i}||j                  d       |j                  ||| j                  |      \  }}|j
                  d   }|B| j                  j                  dk(  rM|sK| j                  xr |d u xr |dkD  }t        j                   j"                  j%                  ||||d|      }d }na||j                  d	d      z  }|t'        j(                  | j                        z  }t+        j,                  ||z   d	|j.                  
      }||z  }|j1                  || j                  || j                        }|j3                  dddd      }|j                  ||| j                  | j                  z        }| j5                  |      }||fS | j                  j                  dk(  r|s| j                  xr |d u xr |dkD  }t        j                   j"                  j%                  ||||| j6                  r| j8                  j:                  nd|      }d }|j                  dd      }|j                  ||| j                  | j                  z        }| j5                  |      }||fS ||j                  d	d      z  }|j1                  || j                  ||      }|j.                  }|t        j<                  k(  s|t        j>                  k(  r|jA                  t        jB                        }||j1                  || j                  dd	      z   }|| jD                  z  }t+        j,                  ||z   d	|j.                  
      }| j9                  |      }|j1                  || j                  ||      }||z  jG                  dd      }| jI                  |      }| j5                  |      }||fS )Nr    r9   r   rG   rF   r   sdpa        )	attn_mask	dropout_pr   r8   )r;   rh   r   r   )%r   r   r   r   r   r<   r|   r   re   rK   updater   rO   _attn_implementationr   r4   r   r
   scaled_dot_product_attentionr   r   r   softmaxrh   r   r   r   r   r   r   float16r   ro   r   r   r   r   )r,   r-   r   r   r}   r   r   r   r   r   r   r   query_layer	key_layervalue_layerr   query_lengthr   rF   rG   cache_kwargs	kv_lengthr   attn_outputattention_scoresattention_probsmatmul_resultinput_dtypeattention_logitsattention_probs_reshapeds                                 r.   r/   zFalconAttention.forward<  s    ((7	)-)F)Ft~~DL]L]040A0A)0L-i)4):):&
L!Q!++Aq199*dnnVbdhdqdqr''1-55j,P\^b^k^kl	!++Aq199*lT`bfbobop=*HC%9+yRUWZ%["K!,n=L}##C$<=%/%6%6y+t~~_k%l"I{OOB'	={{//69BS
 !NNZ~/EZ,YZJZ	#hh11NN,!' O  $( #.1D1DR1L#L  DIIdmm$<< #$99-=-NTV^k^q^q#r .<%**:t~~|UYUbUbcK%--aAq9K%--j,Y]YfYfHfgK**[1K 000 {{//69BS !NNZ~/EZ,YZJZ	#hh11NN,:>--d4466S' O  #')33Aq9)11*lDNN]a]j]jLjk"jj5< //9 !,i.A.A"b.I I $1#5#5j$..R^`i#j  /44%--/;%..3P'7':':5=='I$#3ejjT^^]^`b6c#c  D$8$88 "#)),<~,MSU]j]p]p"q"&"8"8"I ,;+?+?
DNN\hjs+t(  8+ENNqRST #//<"jj5//r0   r(   NNFFNN)r1   r2   r3   r!   rV   r4   r5   r   r   r   
LongTensorr   boolr/   r   r   s   @r.   r   r      s#   (r| (rT\ell \uU\\5<<Y^YeYe=e7f \@Qell Qu|| Q< 15#'"'26HLr0||r0 ||d"r0 	r0
 &&-r0 DLr0 r0  r0 ((4/r0 #5<<#=>Er0r0   r   c                       e Zd ZdZ fdZ	 	 	 	 	 	 ddej                  dej                  dz  dej                  dej                  dz  dedz  d	e	d
e	dej                  dz  de
ej                  ej                  f   dz  fdZ xZS )FalconFlashAttention2aH  
    Falcon flash attention module. This module inherits from `FalconAttention` as the weights of the module stays
    untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
    flash attention and deal with padding tokens in case the input contains any of them.
    c                 B    t        |   |i | t               | _        y r(   )rU   rV   r   _flash_attn_uses_top_left_mask)r,   argskwargsra   s      r.   rV   zFalconFlashAttention2.__init__  s#    $)&)
 /P.Q+r0   Nr-   r   r   r}   r   r   r   r   r   c
                 V   | j                  |      }
| j                  r| j                  n| j                  }| j	                  |
      \  }}}|j
                  \  }}}}|j                  dd      j                  || j                  || j                        }|j                  dd      j                  |||| j                        }|j                  dd      j                  |||| j                        }||	\  }}t        ||||      \  }}|;d|i}||j                  d       |j                  ||| j                  |      \  }}|j                  dd      }|j                  dd      }|j                  dd      }|t        d      | j                  r| j                  j                  nd}|j                   }|j"                  j$                  dk7  r|j"                  j$                  nd}|t&        j(                  k(  rt'        j*                  |      rt'        j,                  |      }nMt/        | j                  d	      r| j                  j                   }n | j                   j0                  j                   }t2        j5                  d
| d       |j7                  |      }|j7                  |      }|j7                  |      }t9        |||||||| j:                  | j<                  	      }|j                  ||| j                  | j                  z        }| j?                  |      }|sd }||fS )Nr    r9   r   r   z6`alibi` is not supported when `use_flash_attn` is Truer   rt   ru   _is_quantizedzThe input hidden states seems to be silently casted in float32, this might be related to the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in .)r}   r   r   use_top_left_mask) r   r   r   r   r   r<   r|   r   re   rK   r   r   r   r   rO   r   rh   r_   rz   r4   r   is_autocast_enabledget_autocast_dtypehasattrr)   r   r   ro   r"   r   r   r   )r,   r-   r   r   r}   r   r   r   r   r   r   r   r   r   r   r   r   r   rF   rG   r   attn_dropoutr   rv   target_dtyper   attn_weightss                              r.   r/   zFalconFlashAttention2.forward  s    ((7	)-)F)Ft~~DL]L]040A0A)0L-i)4):):&
L!Q!++Aq199*dnnVbdhdqdqr''1-55j,P\^b^k^kl	!++Aq199*lT`bfbobop=*HC%9+yRUWZ%["K!,n=L}##C$<=%/%6%6y+t~~_k%l"I{ "++Aq1''1-	!++Aq1UVV8<t{{443
 "''1<1C1C1H1HE1Qk((--W\%--'((5$77Do6#{{00#33::@@ >$ &..6K!\2I%..6K.% nn"AA

 #**:|T^^VZVcVcEcdjj. LL((r0   r   )r1   r2   r3   __doc__rV   r4   r5   r   r   r   r   r/   r   r   s   @r.   r   r     s    R 15#'"'26HLV)||V) ||d"V) 	V)
 &&-V) DLV) V)  V) ((4/V) #5<<#=>EV)r0   r   c                   \     e Zd Zdef fdZdej                  dej                  fdZ xZS )	FalconMLPrO   c                 ,   t         |           |j                  }t        ||j                  |j
                        | _        t        |j                        | _	        t        |j                  ||j
                        | _
        |j                  | _        y )Nr   )rU   rV   rk   r$   ffn_hidden_sizer+   dense_h_to_4hr   
activationactdense_4h_to_hr   )r,   rO   rk   ra   s      r.   rV   zFalconMLP.__init__  su    (()+v7M7MTZT_T_`!&"3"34)&*@*@+TZT_T_`$33r0   r>   r&   c                 h    | j                  | j                  |            }| j                  |      }|S r(   )r
  r  r  )r,   r>   s     r.   r/   zFalconMLP.forward#  s0    HHT''*+q!r0   )	r1   r2   r3   r!   rV   r4   r5   r/   r   r   s   @r.   r  r    s*    4| 4 %,, r0   r  )eagerr   flash_attention_2c                   L    e Zd Zddef fdZ	 	 	 	 	 	 ddej                  dej                  dz  dej                  dej                  dz  dee	ej                  ej                  f   z  dz  d	e
d
e
dej                  dz  de	ej                  ej                  f   dz  fdZ xZS )FalconDecoderLayerNrO   c                 x   t         |           |j                  }|j                  | _        t        |j                     ||      | _        t        |      | _	        |j                  | _
        || _        |j                  |j                  rd|_        |j                  s9t        ||j                         | _        t        ||j                         | _        y |j                  dk(  r9t        ||j                         | _        t        ||j                         | _        y t        ||j                         | _        y )Nr9   eps)rU   rV   rk   rl   r   FALCON_ATTENTION_CLASSESr   self_attentionr  mlpr   rO   num_ln_in_parallel_attnr   parallel_attnr   layer_norm_epsilonpost_attention_layernorminput_layernormln_attnln_mlp)r,   rO   r   rk   ra   s       r.   rV   zFalconDecoderLayer.__init__1  s    ((336v7R7RSTZ\efV$$33))1f6U6U-.F*##,5kvG`G`,aD)#,[f>W>W#XD --2(&:S:ST'9R9RS'0&B[B['\$r0   r-   r   r   r}   r   r   r   r   r   c
                 T   |}| j                   j                  r<| j                   j                  dk(  r#| j                  |      }| j	                  |      }n| j                  |      }| j                  |||||||||		      \  }}| j                   j                  sW| j                   j                  r|}n>t        ||| j                   j                  | j                        }| j                  |      }| j                   j                  r1| j                   j                  r| j                   j                  dk(  r|}| j                        }| j                   j                  s| j                   j                  r||z  }t        ||| j                   j                  | j                        }||fS )Nr9   )r   r   r}   r   r   r   r   r   )r   r    )rO   r   r  r  r  r  r  r  r   r   r   r  r  r   )r,   r-   r   r   r}   r   r   r   r   r   r   r   attention_layernorm_outmlp_layernorm_outattention_outputr  
mlp_outputoutputs                     r.   r/   zFalconDecoderLayer.forwardJ  s}    !;;//DKK4W4W[\4\&*ll=&A# $M :&*&:&:=&I# *.)<)<#!)%/) 3 *= 
*
&, {{33{{(($;!&$h0M0MX\XeXe %)$A$A($K! KK00))33q8 7 XX/0
;;//4;;3L3L**JZ4;;3M3MX\XeXef|##r0   r(   r   )r1   r2   r3   r!   rV   r4   r5   r   r   r   r   r/   r   r   s   @r.   r  r  0  s    ]| ]< 15GK"'26HL:$||:$ ||d":$ 	:$
 &&-:$ E%,,"<==D:$ :$  :$ ((4/:$ #5<<#=>E:$r0   r  c                        e Zd ZU eed<   dZdZdgZdZdZ	dZ
 ej                         dej                  f fd       Zed	defd       Z xZS )
FalconPreTrainedModelrO   transformerTr  modulec                 
   t         |   |       t        |t              rct	        j
                  |j                  d| j                  j                         |j                   t	        j                  |j                         yyy)zInitialize the weights.r   )meanstdN)rU   _init_weightsry   r$   initnormal_r)   rO   initializer_ranger+   zeros_)r,   r'  ra   s     r.   r+  z#FalconPreTrainedModel._init_weights  s^     	f%fl+LLSdkk6S6ST{{&FKK( ' ,r0   hard_check_onlyc                 :    t        | dd      }|r|S |sd|_        |S )Nuse_bettertransformerFr   )rj   r   )clsrO   r0  _is_bettertransformers       r.   _check_and_enable_sdpaz,FalconPreTrainedModel._check_and_enable_sdpa  s*     '-De L M*0F'r0   )F)r1   r2   r3   r!   r   base_model_prefixsupports_gradient_checkpointing_no_split_modules_supports_flash_attn_supports_sdpa_can_compile_fullgraphr4   r   r   Moduler+  classmethodr   r5  r   r   s   @r.   r%  r%    so    %&*#-.N!U]]_)BII ) ) T  r0   r%  c                   b    e Zd Zdef fdZd Zdej                  fdZe		 	 	 	 	 	 	 	 	 	 ddej                  dz  dedz  d	ej                  dz  d
ej                  dz  dej                  dz  dedz  dedz  dedz  dedz  dej                  dz  deej                  df   ez  fd       Z xZS )FalconModelrO   c           	         t         |   |       |j                  | _        |j                  | _        |j                  | _        t        j                  |j                  | j                        | _        t        j                  t        |j                        D cg c]  }t        ||       c}      | _        t#        | j                  |j$                        | _        d| _        t+        |      | _        | j/                          y c c}w )N)r   r  FrO   )rU   rV   rk   	embed_dimrl   r   r   	use_alibir   	Embedding
vocab_sizeword_embeddings
ModuleListrangenum_hidden_layersr  hr   r  ln_fgradient_checkpointingrM   
rotary_emb	post_init)r,   rO   ira   s      r.   rV   zFalconModel.__init__  s     ++33  "||F,=,=t~~N QVW]WoWoQpqA 26Q Gqr dnn&2K2KL	&+#/v> 	  rs   Dc                     | j                   S r(   rF  )r,   s    r.   get_input_embeddingsz FalconModel.get_input_embeddings  s    ###r0   new_embeddingsc                     || _         y r(   rQ  r,   rS  s     r.   set_input_embeddingsz FalconModel.set_input_embeddings  s
    -r0   N	input_idspast_key_valuesr   r}   inputs_embedsr   r   output_hidden_statesreturn_dictr   r&   .c                    ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }|	|	n| j                   j                  }	|du |duz  rt        d      | j                  r%| j                  r|rt        j                  d       d}|| j                  |      }|r|t        | j                         }d}||j                         nd}|j                  \  }}}| j                  r[|5t        j                   |||z   f|j"                  t        j$                        n|}t'        || j(                  |j*                        }|
%t        j,                  |||z   |j"                  	      }
||
j/                  d      }t1        | j                   |||
|d
       }|||j2                  dk(  rt        j4                  |j*                        j6                  }|j*                  t        j8                  k(  rAt        j:                  |t        j<                  d|j"                  |j*                        |      } |j>                  |dg|j                  dd  }t        j@                  |tC        jD                  | j                   jF                  | j(                  z        z  |dk  |      }|}| jI                  ||      }|rdnd}|rdnd}tK        | jL                        D ]/  \  }}|r||fz   } |||||||||
|	      }|d   }|s'||d   fz   }1 | jO                  |      }|r||fz   }|	stQ        d ||||fD              S tS        ||||      S )  
        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else `past_key_values.get_seq_length()`
            (`sequence_length` of input past key value states). Indices of input sequence tokens in the vocabulary.

            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
            `input_ids`.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        Nz:You must specify exactly one of input_ids or inputs_embedszZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...FrA  r   ri   rg   r_   c                  L    t        j                  dt         j                        S )NTrg   )r4   r   r   )r   s    r.   <lambda>z%FalconModel.forward.<locals>.<lambda>  s    ELLUZZ,P r0   )rO   rY  r   r   rX  and_mask_function   r   r8   r    )r}   r6   )r   r   r}   r   r   r   r   r   c              3   &   K   | ]	  }||  y wr(   r6   ).0vs     r.   	<genexpr>z&FalconModel.forward.<locals>.<genexpr>G  s      ghgts   )last_hidden_staterX  r-   
attentions)*rO   r   rZ  r   use_return_dictr   rL  r   r   r   rF  r   get_seq_lengthr<   rC  r4   onesr_   longr   r   rh   rm   rC   r   ndimfinfor   r   wherer   r   masked_fillr   r   rk   rM  	enumeraterJ  rK  r   r   )r,   rW  rX  r   r}   rY  r   r   rZ  r[  r   r   r   past_key_values_lengthr   r   r   maskcausal_mask	min_dtyper-   r   all_self_attentionsall_hidden_statesrO  blockoutputss                              r.   r/   zFalconModel.forward  s   8 2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	%0%<k$++B]B]-t";<YZZ&&4==##p "	  00;M0*$++>O ETE`!?!?!Afg$1$7$7!
J>>
 ") 

.D!DEmNbNbjojtjt $  'tT^^=CVCVWE!"\\&(>(KTaThThN )33A6L(;;'))+P
 !8[=M=MQR=RM$7$78<<I   EJJ.#kkc+:L:LTaTgTg!hjs
 "EMM*bC5;;qr?CE++		$++"9"9T^^"KLLb K &"oom,oW$5b4"6BD!$&&) 	JHAu#$58H$H!**)#"3-$7
G $AJM &9WQZM&I#%	J* 		-0 1]4D D )?<MObc   9+++*	
 	
r0   )
NNNNNNNNNN)r1   r2   r3   r!   rV   rR  r4   r5   rV  r   r   r   r   r   r   r/   r   r   s   @r.   r?  r?    s.   | *$.5<< .  .2(,.20415!%)-,0#'26L
##d*L
 L
 t+	L

 &&-L
 ''$.L
 $;L
  $;L
 #TkL
 D[L
 ((4/L
 
u||S 	!$M	ML
 L
r0   r?  z
    The Falcon Model transformer with a language modeling head on top (linear layer with weights tied to the input embeddings).
    )custom_introc                       e Zd ZddiZdef fdZdej                  fdZe		 	 	 	 	 	 	 	 	 	 	 	 ddej                  dz  d	edz  d
ej                  dz  dej                  dz  dej                  dz  dej                  dz  dedz  dedz  dedz  dedz  dej                  dz  deej                  z  deej                     ez  fd       Z xZS )FalconForCausalLMzlm_head.weightz"transformer.word_embeddings.weightrO   c                     t         |   |       t        |      | _        t	        j
                  |j                  |j                  d      | _        | j                          y NFr   )
rU   rV   r?  r&  r   Linearrk   rE  lm_headrN  r,   rO   ra   s     r.   rV   zFalconForCausalLM.__init__[  sI     &v.yy!3!3V5F5FUS 	r0   rS  c                     || _         y r(   )r  rU  s     r.   set_output_embeddingsz'FalconForCausalLM.set_output_embeddingsc  s	    %r0   NrW  rX  r   r}   rY  labelsr   r   rZ  r[  r   logits_to_keepr&   c                    |
|
n| j                   j                  }
| j                  ||||||||	|
|
      }|d   }t        |t              rt        | d      n|}| j                  |dd|ddf         }d}|* | j                  ||fd| j                   j                  i|}|
s|f|dd z   }||f|z   S |S t        |||j                  |j                  |j                        S )a\  
        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else `past_key_values.get_seq_length()`
            (`sequence_length` of input past key value states). Indices of input sequence tokens in the vocabulary.

            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
            `input_ids`.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
            are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
        N)	rX  r   r}   rY  r   r   rZ  r[  r   r   rE  r    losslogitsrX  r-   rh  )rO   ri  r&  ry   r   slicer  loss_functionrE  r   rX  r-   rh  )r,   rW  rX  r   r}   rY  r  r   r   rZ  r[  r   r  r   transformer_outputsr-   slice_indices	lm_logitsr  r#  s                       r.   r/   zFalconForCausalLM.forwardf  s/   F &1%<k$++B]B]"..+)%'/!5#) / 
 ,A.8B>SV8W~ot4]kLLq-/B!CD	%4%%  ;;11 	D \$7$;;F)-)9TGf$EvE0/??-;;*55
 	
r0   )NNNNNNNNNNNr   )r1   r2   r3   _tied_weights_keysr!   rV   r4   r5   r  r   r   r   r   r   r   r   r/   r   r   s   @r.   r|  r|  S  s^    +,PQ| &ELL &  .2(,.204-1&*!%)-,0#'26-.H
##d*H
 H
 t+	H

 &&-H
 ||d*H
 t#H
 $;H
  $;H
 #TkH
 D[H
 ((4/H
 ell*H
 
u||	@	@H
 H
r0   r|  a  
    The Falcon Model transformer with a sequence classification head on top (linear layer).

    [`FalconForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-1) do.

    Since it does classification on the last token, it requires to know the position of the last token. If a
    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
    each row of the batch).
    c                       e Zd Zdef fdZe	 	 	 	 	 	 	 	 	 ddej                  dz  dedz  dej                  dz  dej                  dz  dej                  dz  d	e
dz  d
e
dz  de
dz  de
dz  deej                     ez  fd       Z xZS )FalconForSequenceClassificationrO   c                     t         |   |       |j                  | _        t        |      | _        t        j                  |j                  |j                  d      | _        | j                          y r~  )
rU   rV   
num_labelsr?  r&  r   r  rk   scorerN  r  s     r.   rV   z(FalconForSequenceClassification.__init__  sV      ++&v.YYv1163D3D5Q
 	r0   NrW  rX  r   rY  r  r   r   rZ  r[  r&   c
           
      r   |	|	n| j                   j                  }	| j                  ||||||||	      }|d   }| j                  |      }||j                  d   }n|j                  d   }| j                   j
                  |dk7  rt        d      | j                   j
                  d}n||| j                   j
                  k7  j                  |j                  t        j                        }t        j                  |j                  d   |j                  t        j                        }||z  j                  d      }n.d}t        j                  | j                  j                    d       |t        j                  ||j                  	      |f   }d}|^| j                   j"                  | j$                  dk(  rd
| j                   _        nl| j$                  dkD  rL|j&                  t        j(                  k(  s|j&                  t        j*                  k(  rd| j                   _        nd| j                   _        | j                   j"                  d
k(  rIt-               }| j$                  dk(  r& ||j/                         |j/                               }nc |||      }nY| j                   j"                  dk(  rt1               } |||      }n,| j                   j"                  dk(  rt3               } |||      }|	s|f|dd z   }||f|z   S |S t5        |||j6                  |j8                  |j:                        S )6  
        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else `past_key_values.get_seq_length()`
            (`sequence_length` of input past key value states). Indices of input sequence tokens in the vocabulary.

            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
            `input_ids`.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        NrX  r   rY  r   r   rZ  r[  r   r    z=Cannot handle batch sizes > 1 if no padding token is defined.r8   ri   z will not detect padding tokens in `inputs_embeds`. Results may be unexpected if using padding tokens in conjunction with `inputs_embeds.`r^  
regressionsingle_label_classificationmulti_label_classificationr  )rO   ri  r&  r  r<   pad_token_idr   ro   r_   r4   r   rm   argmaxr   r   ra   r1   problem_typer  rh   rl  r   r	   squeezer   r   r   rX  r-   rh  )r,   rW  rX  r   rY  r  r   r   rZ  r[  r   r  r-   r  r   last_non_pad_tokennon_pad_masktoken_indicespooled_logitsr  loss_fctr#  s                         r.   r/   z'FalconForSequenceClassification.forward  s   @ &1%<k$++B]B]"..+)'/!5# / 	
 ,A.M* "+J&,,Q/J;;##+
a\]];;##+!#"%)A)AAEEfmmUZU`U`aL!LL)<V]]Z_ZeZefM"/,">!F!Fr!J!#>>**+ ,Z Z
 u||Jv}}MOaab{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#M$9$9$;V^^=MND#M6:D))-JJ+-v6))-II,.v6#%(;AB(??F)-)9TGf$EvE/ /??-;;*55
 	
r0   	NNNNNNNNN)r1   r2   r3   r!   rV   r   r4   r   r   r5   r   r   r   r/   r   r   s   @r.   r  r    s    |   .2(,.2-1&*!%)-,0#'f
##d*f
 f
 t+	f

 ||d*f
 t#f
 $;f
  $;f
 #Tkf
 D[f
 
u||	?	?f
 f
r0   r  c                       e Zd Zdef fdZe	 	 	 	 	 	 	 	 	 ddej                  dz  dedz  dej                  dz  dej                  dz  dej                  dz  d	e
dz  d
e
dz  de
dz  de
dz  deej                     ez  fd       Z xZS )FalconForTokenClassificationrO   c                    t         |   |       |j                  | _        t        |      | _        t        |dd       |j                  }nt        |dd       |j                  }nd}t        j                  |      | _
        t        j                  |j                  |j                        | _        | j                          y )Nclassifier_dropoutr   g?)rU   rV   r  r?  r&  rj   r  r   r   r   r   r  rk   
classifierrN  )r,   rO   r  ra   s      r.   rV   z%FalconForTokenClassification.__init__6  s      ++&v.6/6B!'!:!:V-t4@!'!6!6!$zz"45))F$6$68I8IJ 	r0   NrW  rX  r   rY  r  r   r   rZ  r[  r&   c
           
         |	|	n| j                   j                  }	| j                  ||||||||	      }|d   }| j                  |      }| j	                  |      }d}|Q|j
                  \  }}t               } ||j                  ||z  | j                        |j                  ||z              }|	s|f|dd z   }||f|z   S |S t        |||j                  |j                        S )r  Nr  r   r9   )r  r  r-   rh  )rO   ri  r&  r   r  r<   r   r   r  r   r-   rh  )r,   rW  rX  r   rY  r  r   r   rZ  r[  r   r  r-   r  r  r   r   r  r#  s                      r.   r/   z$FalconForTokenClassification.forwardG  s   @ &1%<k$++B]B]"..+)'/!5# / 	
 ,A.]3/%+\\"J
')HJ3T__Ev{{S]`jSjGkD Y!4QR!88F)-)9TGf$EvE$-;;*55	
 	
r0   r  )r1   r2   r3   r!   rV   r   r4   r   r   r5   r   r   r   r/   r   r   s   @r.   r  r  4  s    | "  .2(,.2-1&*!%)-,0#'A
##d*A
 A
 t+	A

 ||d*A
 t#A
 $;A
  $;A
 #TkA
 D[A
 
u||	4	4A
 A
r0   r  c                        e Zd Z fdZe	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dedz  d	edz  d
edz  de	e
z  fd       Z xZS )FalconForQuestionAnsweringc                     t         |   |       t        |      | _        t	        j
                  |j                  d      | _        | j                          y )Nr9   )	rU   rV   r?  r&  r   r  rk   
qa_outputsrN  r  s     r.   rV   z#FalconForQuestionAnswering.__init__  sA     &v.))F$6$6: 	r0   NrW  r   rY  start_positionsend_positionsr   rZ  r[  r&   c	                 "   ||n| j                   j                  }| j                  ||||||      }
|
d   }| j                  |      }|j	                  dd      \  }}|j                  d      j                         }|j                  d      j                         }d}||t        |j                               dkD  r|j                  d      }t        |j                               dkD  r|j                  d      }|j                  d      }|j                  d|      }|j                  d|      }t        |      } |||      } |||      }||z   dz  }|s||f|
dd z   }||f|z   S |S t        ||||
j                  |
j                  	      S )
r]  N)r   rY  r   rZ  r[  r   r    r8   r:   )ignore_indexr9   )r  start_logits
end_logitsr-   rh  )rO   ri  r&  r  splitr  
contiguouslensizeclampr   r   r-   rh  )r,   rW  r   rY  r  r  r   rZ  r[  r   ry  sequence_outputr  r  r  
total_lossignored_indexr  
start_lossend_lossr#  s                        r.   r/   z"FalconForQuestionAnswering.forward  s   4 &1%<k$++B]B]"")'/!5# # 
 "!*1#)<<r<#: j#++B/::<''+668

&=+D?'')*Q."1"9"9""==%%'(1, - 5 5b 9(--a0M-33A}EO)//=AM']CH!,@J
M:H$x/14J"J/'!"+=F/9/EZMF*Q6Q+%!!//))
 	
r0   )NNNNNNNN)r1   r2   r3   rV   r   r4   r   FloatTensorr   r   r   r/   r   r   s   @r.   r  r    s      .237263715)-,0#'F
##d*F
 ))D0F
 ((4/	F

 ))D0F
 ''$.F
  $;F
 #TkF
 D[F
 
-	-F
 F
r0   r  )r|  r?  r%  r  r  r  )r    )Or  r   collections.abcr   typingr   r4   r   torch.nnr   r   r   r	   r
   r    r   r,  activationsr   cache_utilsr   r   
generationr   masking_utilsr   modeling_flash_attention_utilsr   r   modeling_layersr   modeling_outputsr   r   r   r   r   modeling_rope_utilsr   r   modeling_utilsr   utilsr   r   utils.genericr   configuration_falconr!   r"   
get_loggerr1   r   r  r$   rA   rK   r<  rM   r5   r   rh   r   rp   r   r   r   r   r  r  r  r%  r?  r|  r  r  r  __all__r6   r0   r.   <module>r     s:     $    L L $ & ) . ) / h 9  . , . J			H	%
)299 )(4><BII ><BJu|| J JEKK J\a\h\h J:5<< 5<< u PT Y^YeYe &V0bii V0re)O e)P		 " . T$3 T$n O  < i
' i
 i
X 
W
- W

W
t q
&; q
q
h T
#8 T
 T
n P
!6 P
 P
fr0   