
    qi                     t   d dl mZ d dlmZ d dlZd dlmZ ddlmZ ddl	m
Z
 ddlmZmZ dd	lmZ dd
lmZmZmZ ddlmZmZ ddlmZmZ ddlmZmZ ddlmZmZ ddl m!Z!m"Z" ddl#m$Z$ ddl%m&Z&m'Z'm(Z( ddl)m*Z*m+Z+ ddl,m-Z-m.Z. ddl/m0Z0  G d dejb                        Z2d Z3 ed      dAd       Z4dejj                  de6dejj                  fdZ7	 dBdejb                  d ejj                  d!ejj                  d"ejj                  d#ejj                  dz  d$e8d%e8d&e$e&   fd'Z9 ee4       G d( d)ejb                               Z: G d* d+ejv                  jx                        Z=e G d, d-ejb                               Z>dCd.Z? G d/ d0ej                        ZA G d1 d2ejb                        ZB G d3 d4e      ZCe' G d5 d6e"             ZDe' G d7 d8eD             ZE	 	 	 dDd9ejj                  eFejj                     z  dz  d:e6dz  d#ejj                  dz  dejj                  e6z  fd;ZGe' G d< d=eDe             ZH G d> d?eeD      ZIg d@ZJy)E    )Callable)OptionalN)nn   )initialization)ACT2FN)CacheDynamicCache)GenerationMixin)use_experts_implementationuse_kernel_func_from_hubuse_kernelized_func)create_causal_mask!create_sliding_window_causal_mask) GenericForSequenceClassificationGradientCheckpointingLayer)MoeCausalLMOutputWithPastMoeModelOutputWithPast)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuple)maybe_autocastmerge_with_config_defaults)OutputRecordercapture_outputs   )PhimoeConfigc                        e Zd ZU ej                  ed<   ddef fdZe	 	 	 ddedz  de	d   de
dz  ded	ef   fd
       Z ej                         edd              Z xZS )PhimoeRotaryEmbeddinginv_freqNconfigc                    t         |           |j                  | _        |j                  | _        || _        | j
                  j                  d   | _        | j                  | _	        | j                  dk7  rt        | j                     | _	        | j                  | j
                  |      \  }| _        | j                  d|d       | j                  d|j                         d       y )N	rope_typedefaultr%   F)
persistentoriginal_inv_freq)super__init__max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenr&   rope_parametersr(   compute_default_rope_parametersrope_init_fnr   attention_scalingregister_bufferclone)selfr&   devicer%   	__class__s       \/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/phimoe/modeling_phimoe.pyr-   zPhimoeRotaryEmbedding.__init__0   s    "("@"@$*$B$B!44[A&*&J&J>>Y& 3DNN CD+/+<+<T[[&+Q($(ZeD0(..2BuU    r8   ztorch.deviceseq_lenreturnztorch.Tensorc                    | j                   d   }t        | dd      xs | j                  | j                  z  }d}d|t	        j
                  d|dt        j                        j                  |t        j                        |z  z  z  }||fS )	a  
        Computes the inverse frequencies according to the original RoPE implementation
        Args:
            config ([`~transformers.PreTrainedConfig`]):
                The model configuration.
            device (`torch.device`):
                The device to use for initialization of the inverse frequencies.
            seq_len (`int`, *optional*):
                The current sequence length. Unused for this type of RoPE.
        Returns:
            Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
            post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
        
rope_thetahead_dimN      ?r      )dtype)r8   rC   )	r1   getattrhidden_sizenum_attention_headstorcharangeint64tofloat)r&   r8   r<   basedimattention_factorr%   s          r:   r2   z5PhimoeRotaryEmbedding.compute_default_rope_parameters@   s    & %%l3fj$/c63E3EIcIc3c U\\!S!5;;?BB&X]XcXcBdgjjk
 )))r;   c                    |%t        | j                  j                   d| d      d }t        j                  |      dz   }| j
                  j                  d   dk7  rP|rN|| j
                  j                  d   kD  r| j
                  j                  d   n| j
                  j                  d   }| j                  | j
                  |j                  |      \  }}||n|}|d d d d f   j                         j                  |j                  d	   d
d      j                  |j                        }|d d d d d f   j                         }	t        |j                  j                  t              r/|j                  j                  dk7  r|j                  j                  nd}
t!        |
d      5  |j                         |	j                         z  j#                  dd      }t        j$                  ||fd
      }|j'                         |z  }|j)                         |z  }d d d        j                  |j*                        j                  |j*                        fS # 1 sw Y   ?xY w)Nz3 does not support layer types, but got `layer_type=`r!   r(   r)    original_max_position_embeddingslong_mscaleshort_mscaler   mpscpuF)device_typeenabledrB   rM   )
ValueErrorr9   __name__rG   maxr&   r1   r3   r8   rK   expandshaperJ   
isinstancetypestrr   	transposecatcossinrC   )r7   xposition_ids
layer_typemscaler<   r%   r4   inv_freq_expandedposition_ids_expandedrW   freqsembrd   re   s                  r:   forwardzPhimoeRotaryEmbedding.forward^   s    !>>**++^_i^jjkl  ))L)A-;;&&{3y@W T[[889[\\ ++M:[[00@ 
 '+&7&7QXXw&W##&,n"&$T1d]399;BB<CUCUVWCXZ\^_`ccdedldlm ,QaZ 8 > > @'1!((--'E!((--[`J`ahhmmfkUC 	%&,,.1F1L1L1NNYYZ[]^_E))UEN3C'')f$C'')f$C		%
 vvaggqww//	% 	%s   )A1II!N)NNN)NN)r[   
__module____qualname__rG   Tensor__annotations__r"   r-   staticmethodr   inttuplerK   r2   no_gradr   rn   __classcell__r9   s   @r:   r$   r$   -   s    llV| V  &*+/"*t#*(* t* 
~u$	%	* *: U]]_0  0r;   r$   c                     | dd| j                   d   dz  f   }| d| j                   d   dz  df   }t        j                  | |fd      S )z*Rotates half the hidden dims of the input..NrT   rB   rY   )r^   rG   rc   )rf   x1x2s      r:   rotate_halfr}   |   sZ    	
3"!''"+"""	#B	
3q ""	#B99rc2YB''r;   rotary_pos_embc                     |j                  |      }|j                  |      }| |z  t        |       |z  z   }||z  t        |      |z  z   }||fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    )	unsqueezer}   )qkrd   re   unsqueeze_dimq_embedk_embeds          r:   apply_rotary_pos_embr      sY    & --
&C
--
&C3w;q>C/0G3w;q>C/0GGr;   hidden_statesn_repr=   c                     | j                   \  }}}}|dk(  r| S | dddddddddf   j                  |||||      } | j                  |||z  ||      S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r!   N)r^   r]   reshape)r   r   batchnum_key_value_headsslenr@   s         r:   	repeat_kvr      so    
 2?1D1D.Ehz!!Qa"23::5BUW\^bdlmM  (;e(CT8TTr;   modulequerykeyvalueattention_maskscalingdropoutkwargsc                    t        || j                        }t        || j                        }	t        j                  ||j	                  dd            |z  }
||
|z   }
t
        j                  j                  |
dt        j                        j                  |j                        }
t
        j                  j                  |
|| j                        }
t        j                  |
|	      }|j	                  dd      j                         }||
fS )NrB   r   rT   )rM   rC   )ptrainingr!   )r   num_key_value_groupsrG   matmulrb   r   
functionalsoftmaxfloat32rJ   rC   r   r   
contiguous)r   r   r   r   r   r   r   r   
key_statesvalue_statesattn_weightsattn_outputs               r:   eager_attention_forwardr      s     3 ; ;<JUF$?$?@L<<z';';Aq'ABWLL!#n4==((2U]](SVVW\WbWbcL==((6??([L,,|\:K''1-88:K$$r;   c                       e Zd ZdZdedef fdZ	 	 	 	 ddej                  de	ej                  ej                  f   dz  dej                  dz  d	e
dz  d
ej                  dz  dee   de	ej                  ej                  f   fdZ xZS )PhimoeAttentionz=Multi-headed attention from 'Attention Is All You Need' paperr&   	layer_idxc                 d   t         |           || _        || _        t	        |d|j
                  |j                  z        | _        |j                  |j                  z  | _	        | j                  dz  | _
        |j                  | _        d| _        t        j                  |j
                  |j                  | j                  z  |j                        | _        t        j                  |j
                  |j                  | j                  z  |j                        | _        t        j                  |j
                  |j                  | j                  z  |j                        | _        t        j                  |j                  | j                  z  |j
                  |j                        | _        y )Nr@   g      Tbias)r,   r-   r&   r   rD   rE   rF   r@   r   r   r   attention_dropout	is_causalr   Linearattention_biasq_projk_projv_projo_projr7   r&   r   r9   s      r:   r-   zPhimoeAttention.__init__   sM   "
F4F4F&JdJd4de$*$>$>&B\B\$\!}}d*!'!9!9ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii&&68J8JQWQfQf
r;   Nr   position_embeddingsr   past_key_valuescache_positionr   r=   c                    |j                   d d }g |d| j                  }| j                  |      j                  |      j	                  dd      }	| j                  |      j                  |      j	                  dd      }
| j                  |      j                  |      j	                  dd      }|\  }}t        |	|
||      \  }	}
|'|||d}|j                  |
|| j                  |      \  }
}t        j                  | j                  j                  t              } || |	|
||f| j                  sdn| j                   | j"                  d|\  }} |j$                  g |d j'                         }| j)                  |      }||fS )NrT   r!   rB   )re   rd   r           )r   r   )r^   r@   r   viewrb   r   r   r   updater   r   get_interfacer&   _attn_implementationr   r   r   r   r   r   r   )r7   r   r   r   r   r   r   input_shapehidden_shapequery_statesr   r   rd   re   cache_kwargsattention_interfacer   r   s                     r:   rn   zPhimoeAttention.forward   s    $))#2.88b8$--8{{=166|DNNqRST[[/44\BLLQPQR
{{=166|DNNqRST&S#7jRUWZ#[ j&#&snUL'6'='=j,X\XfXfht'u$J(?(M(MKK,,.E)
 %8	%
  $}}C$2H2HLL	%
 	%
!\ *k));;;;FFHkk+.L((r;   )NNNN)r[   rp   rq   __doc__r"   ru   r-   rG   rr   rv   r	   
LongTensorr   r   rn   rx   ry   s   @r:   r   r      s    G
| 
 
4 IM.2(,26))||)) #5<<#=>E)) t+	))
 )) ((4/)) +,)) 
u||U\\)	*))r;   r   c                       e Zd Zedej
                  dej
                  dej
                  dej
                  dej
                  f
d       Zedej
                  fd       Zy	)
PhimoeMultiplierscores
multiplierselected_expertsmasked_gatesmask_for_onec                 2    | j                  |||       ||z  S )a  
        Forward pass for the custom autograd function.

        Args:
            ctx: Context object to save information for backward computation.
            scores (torch.Tensor): Input scores tensor.
            multiplier (torch.Tensor): Multiplier tensor.
            selected_experts (torch.Tensor): Tensor of selected experts.
            masked_gates (torch.Tensor): Masked gates tensor.
            mask_for_one (torch.Tensor): Mask for one tensor.

        Returns:
            torch.Tensor: Result of the forward pass.
        )save_for_backward)ctxr   r   r   r   r   s         r:   rn   zPhimoeMultiplier.forward
  s"    . 	j*:LIL((r;   grad_at_outputc                     | j                   \  }}}||z  }||j                  d      z  }|j                  d||       |ddddfS )aB  
        Backward pass for the custom autograd function.

        Args:
            ctx: Context object with saved tensors from the forward pass.
            grad_at_output (torch.Tensor): Gradient at the output.

        Returns:
            tuple[torch.Tensor, None, None, None, None]: Gradients for the inputs.
        rT   )rM   indexsrcN)saved_tensorsmulscatter_add_)r   r   r   r   r   grad_at_scores_expandeds         r:   backwardzPhimoeMultiplier.backward$  sn     695F5F2
$l'*4".1C1CB1G"G,," 	- 	
 $
 	
r;   N)r[   rp   rq   rt   rG   rr   rn   r    r;   r:   r   r   	  sx    )) LL)  ,,	)
 ll) ll) )2 

 
r;   r   c                        e Zd ZdZdef fdZdej                  dej                  dej                  dej                  fdZ xZ	S )	PhimoeExpertsz2Collection of expert weights stored as 3D tensors.r&   c                    t         |           |j                  | _        |j                  | _        |j                  | _        t        j                  t        j                  | j                  d| j                  z  | j
                              | _        t        j                  t        j                  | j                  | j
                  | j                              | _        t        |j                     | _        y )NrB   )r,   r-   num_local_expertsnum_expertsrE   
hidden_dimintermediate_sizeintermediate_dimr   	ParameterrG   emptygate_up_proj	down_projr   
hidden_actact_fnr7   r&   r9   s     r:   r-   zPhimoeExperts.__init__K  s    !33 ,, & 8 8LLT5E5Eq4K`K`G`bfbqbq)rsekk$2B2BDOOUYUjUj&klV../r;   r   top_k_indextop_k_weightsr=   c                 f   t        j                  |      }t        j                         5  t         j                  j                  j                  || j                        }|j                  ddd      }t        j                  |j                  d      d      j                         }d d d        D ]  }|d   }|| j                  k(  rt        j                  |         \  }}	||	   }
t        j                  j                  |
| j                  |         j                  dd      \  }}| j                  |      |z  }t        j                  j                  || j                   |         }|||	|d f   z  }|j#                  d|	|j%                  |j&                                |S # 1 sw Y   xY w)N)num_classesrB   r!   r   )rT   rY   rT   )rG   
zeros_likerw   r   r   one_hotr   permutegreatersumnonzerowherelinearr   chunkr   r   
index_add_rJ   rC   )r7   r   r   r   final_hidden_statesexpert_mask
expert_hit
expert_idx	top_k_pos	token_idxcurrent_stategateupcurrent_hidden_statess                 r:   rn   zPhimoeExperts.forwardT  s    $..}=]]_ 	S((--55ktO_O_5`K%--aA6K{8'DaHPPRJ	S
 % 
	nJ#AJT---#(;;{:/F#G Iy))4M}}++M4;L;LZ;XY__`agi_jHD"$(KK$5$:!$&MM$8$89NPTP^P^_iPj$k!$9M)U^`dJd<e$e!**1i9N9Q9QReRkRk9lm
	n #"#	S 	Ss   A=F&&F0)
r[   rp   rq   r   r"   r-   rG   rr   rn   rx   ry   s   @r:   r   r   G  sM    <0| 0#||# \\# ||	#
 
#r;   r   c                 l   t        j                         5  | j                  dd      \  }}| j                         j	                  |      }|| z
  |z  d|z  kD  }ddd       | j                  t        d            }|rg|t        j                  |t         j                        j                         j                         z
  j                  d	      d
   j                  d      }n}t        j                  |d	      }|j                  d|      }	|r|j                  dd      \  }
}t        j                  ||k(  t        j                  |
      dkD        }t        j                   d|d      j#                  |      }t$        j'                  | |	|||      }n|	}t        j(                  | d|t        d            }t        j                         5  |j                  dd      \  }}| j                         j	                  |      }|| z
  |z  d|z  kD  }ddd       |j                  |t        d            }|rg|t        j                  |t         j                        j                         j                         z
  j                  d	      d
   j                  d      }n}t        j                  |d	      }|j                  d|      }|r|j                  dd      \  }
}t        j                  ||k(  t        j                  |
      j+                         dkD        }t        j                   d|d      j#                  |      }t$        j'                  | ||||      }n|}t        j,                  ||fd	      }t        j,                  ||fd	      }||fS # 1 sw Y   DxY w# 1 sw Y   xY w)ud  
    Sparse mixer function to select top-k experts and compute multipliers.
    Based on the paper: https://huggingface.co/papers/2409.12136
    We first replace the TopK(·) function as random sampling of discrete variables
    in model training. Then, following Liu et al. (2023a) and Liu et al. (2023b), we apply Heun's
    third order method to approximate the expert routing gradient and construct a modified
    back-propagation to give a mathematically sound gradient estimation for expert routing.

    Args:
        scores (torch.Tensor): Input scores tensor.
        jitter_eps (float): Jitter epsilon for numerical stability.
        training (bool): Flag indicating if the model is in training mode.
        top_k (int): Number of top experts to select.

    Returns:
        tuple[torch.Tensor, torch.Tensor]: Multiplier and selected experts tensors.
    rT   T)rM   keepdim)minrB   Nz-inf)memory_formatrY   r!   )rM   r   g      ?gioT?gK=U?)alpha)rG   rw   r\   absclampmasked_fillrK   
empty_likelegacy_contiguous_formatexponential_logr   r   gather
logical_or	rand_likeaddtype_asr   applyscatteruniform_concat)r   
jitter_epsr   top_kmask_logits_thresholdmax_indfactorr   r   multiplier_o
max_scoresr   r   masked_scoresmasked_gates_top2selected_experts_top2multiplier_top2_omask_for_one_top2multiplier_top2s                      r:   sparsemixerr"  o  s   $ 
 _)/D)I&w##(=#>"7&"@F!JqS]~ ^	_ %%&;U6]KL ""<u?]?]^kkmqqst SRS[	
 Yr] 	 # ==26L&&25E&FL*..2t.D
G'''OOJ'$.

 yyVDLL\Z%++

 "
 MM
f	M 
 _)6):):r4):)P&w##(=#>"7&"@F!JqS]~ ^	_ &112GvW """#4EDbDbc
 SRS[ Yr] 	 !(&7R@)00R?T0U/33D3I
G!,,!W,OOJ'002T9

 "IIf.?vNVVWhi*00!
 ,z?;DJ||%57L$MSUV 	 G_ _f_ _s   ANAN)N&)N3c                   ~     e Zd Zdef fdZdej                  deej                  ej                  f   f fdZ xZ	S )PhimoeTopKRouterr&   c                     t         |   |j                  |j                  d       |j                  | _        |j
                  | _        |j                  | _        y )NFr   )r,   r-   rE   r   router_jitter_noiseinput_jitter_noisenum_experts_per_tokr  r   s     r:   r-   zPhimoeTopKRouter.__init__  sL    ++V-E-EER#)#=#= "(";";//
r;   r   r=   c                 F   | j                   rQ| j                  dkD  rB|t        j                  |      j	                  d| j                  z
  d| j                  z         z  }t
        |   |      }t        || j                  | j                   | j                        \  }}|||fS )Nr   rA   )r  r   r  )
r   r'  rG   r  r  r,   rn   r"  r&  r  )r7   r   router_logitsrouting_weightsr   r9   s        r:   rn   zPhimoeTopKRouter.forward  s    ==T44q8U--m<EEd---sT5L5L/L M 6,7d&>&>^b^h^h-
)) o/???r;   )
r[   rp   rq   r"   r-   rG   rr   rv   rn   rx   ry   s   @r:   r$  r$    sA    0| 0	@U\\ 	@eELL%,,<V6W 	@ 	@r;   r$  c                   Z     e Zd ZdZ fdZdej                  dej                  fdZ xZS )PhimoeSparseMoeBlocka  
    This implementation is
    strictly equivalent to standard MoE with full capacity (no
    dropped tokens). It's faster since it formulates MoE operations
    in terms of block-sparse operations to accommodate imbalanced
    assignments of tokens to experts, whereas standard MoE either
    (1) drop tokens at the cost of reduced performance or (2) set
    capacity factor to number of experts and thus waste computation
    and memory on padding.
    c                    t         |           |j                  | _        |j                  | _        |j                  | _        |j                  | _	        t        |      | _        t        |      | _        |j                  | _        y ro   )r,   r-   rE   r   r   ffn_dimr   r   r(  r  r$  routerr   expertsr'  r   s     r:   r-   zPhimoeSparseMoeBlock.__init__	  si     ,,//!33//
&v.$V,"(";";r;   r   r=   c                    |j                   \  }}}| j                  rQ| j                  dkD  rB|t        j                  |      j                  d| j                  z
  d| j                  z         z  }|j                   \  }}}|j                  d|      }| j                  |      \  }}}| j                  |||      }|j                  |||      S )Nr   rA   rT   )	r^   r   r'  rG   r  r  r   r0  r1  )	r7   r   
batch_sizesequence_lengthr   _r+  r   r   s	            r:   rn   zPhimoeSparseMoeBlock.forward  s    2?2E2E/
OZ==T44q8U--m<EEd---sT5L5L/L M 3@2E2E/
OZ%--b*=/3{{=/I,?,"ll=:JO\"**:
SSr;   )	r[   rp   rq   r   r-   rG   rr   rn   rx   ry   s   @r:   r-  r-    s+    	<TU\\ Tell Tr;   r-  c                       e Zd Zdedef fdZ	 	 	 	 	 ddej                  deej                  ej                  f   dz  dej                  dz  dej                  dz  d	e
dz  d
ej                  dz  dee   dej                  fdZ xZS )PhimoeDecoderLayerr&   r   c                 J   t         |           |j                  | _        t        ||      | _        t        |      | _        t        j                  |j                  |j                  d      | _
        t        j                  |j                  |j                  d      | _        y )NTepselementwise_affine)r,   r-   rE   r   	self_attnr-  mlpr   	LayerNormrms_norm_epsinput_layernormpost_attention_layernormr   s      r:   r-   zPhimoeDecoderLayer.__init__"  s~    !--(;'/  "||F,>,>FDWDWlpq(*F$7$7D)
%r;   Nr   r   r   rg   r   r   r   r=   c           
          |}| j                  |      } | j                  d||||||d|\  }}	||z   }|}| j                  |      }| j                  |      }||z   }|S )N)r   r   r   rg   r   r   r   )r@  r<  rA  r=  )
r7   r   r   r   rg   r   r   r   residualr5  s
             r:   rn   zPhimoeDecoderLayer.forward0  s     !,,];)4>> 
' 3)%+)
 
q !=0 55mD/ =0r;   )NNNNN)r[   rp   rq   r"   ru   r-   rG   rr   rv   r   r	   r   r   rn   rx   ry   s   @r:   r7  r7  !  s    
| 
 
" IM.204(,26|| #5<<#=>E t+	
 &&-  ((4/ +, 
r;   r7  c                        e Zd ZU eed<   dZdZdgZdgZdZ	dZ
dZdZdZ eed      eedZ ej(                          fd	       Z xZS )
PhimoePreTrainedModelr&   modelTr7  r   r   )r   )r*  r   
attentionsc                 `   t         |   |       | j                  j                  }t	        |t
              rEt        j                  |j                  d|       t        j                  |j                  d|       y t	        |t              r#t        j                  |j                  d|       y y )Nr   )meanstd)r,   _init_weightsr&   initializer_ranger_   r   initnormal_r   r   r$  weight)r7   r   rJ  r9   s      r:   rK  z#PhimoePreTrainedModel._init_weights`  sy    f%kk++fm,LL,,3C@LL))= 01LLSc: 2r;   )r[   rp   rq   r"   rs   base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_supports_flex_attn_can_compile_fullgraph_supports_attention_backendr   r$  r7  r   _can_record_outputsrG   rw   rK  rx   ry   s   @r:   rE  rE  M  sy    &*#-.#4"5N!"&'(8B+% U]]_; ;r;   rE  c                       e Zd Zdef fdZeee	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dedz  dej                  dz  d	edz  d
ej                  dz  dee   defd                     Z xZS )PhimoeModelr&   c           	      "   t         |   |       |j                  | _        |j                  | _        t        j                  |j                  |j                  | j                        | _        t        j                  t        |j                        D cg c]  }t        ||       c}      | _        t        j                  |j                  |j                  d      | _        t#        |      | _        d| _        | j)                          y c c}w )NTr9  r&   F)r,   r-   pad_token_idpadding_idx
vocab_sizer   	EmbeddingrE   embed_tokens
ModuleListrangenum_hidden_layersr7  layersr>  r?  normr$   
rotary_embgradient_checkpointing	post_initr   s      r:   r-   zPhimoeModel.__init__m  s     !.. ++LL):):F<N<NPTP`P`ammDI&JbJbDcdy	2d
 LL!3!39L9Laef	/v>&+# 	 es   DN	input_idsr   rg   r   inputs_embeds	use_cacher   r   r=   c                    |d u |d uz  rt        d      |r|t        | j                        }|| j                  |      }|F||j	                         nd}	t        j                  |	|	|j                  d   z   |j                        }||j                  d      }| j                  j                  t        nt        }
 |
| j                  |||||      }|}| j                  ||      }| j                  d | j                  j                   D ]  } ||f||||||d|} | j!                  |      }t#        ||	      S )
Nz:You must specify exactly one of input_ids or inputs_embedsr]  r   r!   )r8   )r&   rl  r   r   r   rg   )rg   )r   rg   r   rm  r   r   )last_hidden_stater   )rZ   r
   r&   rb  get_seq_lengthrG   rH   r^   r8   r   sliding_windowr   r   rh  rf  re  rg  r   )r7   rk  r   rg   r   rl  rm  r   r   past_seen_tokensmask_functioncausal_maskr   r   decoder_layers                  r:   rn   zPhimoeModel.forward}  sx    -t";<YZZ0*$++>O  --i8M!CRC^==?de"\\ "2]5H5H5K"KTaThThN )33A6L.2kk.H.H.P*Vw#;;'))+%
 &"oom,oW![[)H4;;+H+HI 
	M)	*) /#-$7	 	M
	 		-0%++
 	
r;   )NNNNNNN)r[   rp   rq   r"   r-   r   r    r   rG   r   rr   r	   FloatTensorboolr   r   r   rn   rx   ry   s   @r:   r[  r[  k  s    |     .2.204(,26!%26:
##d*:
 t+:
 &&-	:

 :
 ((4/:
 $;:
 ((4/:
 +,:
 
 :
    :
r;   r[  gate_logitsr   c                    | t        | t              syt        | t              rC| d   j                  }t        j                  | D cg c]  }|j                  |       c}d      }t        j                  j                  j                  d      }t        j                  ||d      \  }}	t        j                  j                  j                  |	|      }
|>t        j                  |
j                         d      }t        j                  |d      }n|j                  \  }}|j                  d   ||z  z  }|dddddddf   j                  |||||f      j                  d||      j                        }t        j                   |
j                         |z  d      t        j                   |d      z  }|ddddddf   j                  ||||f      j                  d|      j                  |      }t        j                   ||z  d      t        j                   |d      z  }t        j                   ||j#                  d      z        }||z  S c c}w )a  
    Computes auxiliary load balancing loss as in Switch Transformer - implemented in Pytorch.

    See Switch Transformer (https://huggingface.co/papers/2101.03961) for more details. This function implements the loss
    function presented in equations (4) - (6) of the paper. It aims at penalizing cases where the routing between
    experts is too unbalanced.

    Args:
        gate_logits:
            Logits from the `gate`, should be a tuple of model.config.num_hidden_layers tensors of
            shape [batch_size X sequence_length, num_experts].
        num_experts:
            Number of experts
        top_k:
            The number of experts to route per-token, can be also interpreted as the `top-k` routing
            parameter.
        attention_mask (`torch.Tensor`, *optional*):
            The attention_mask used in forward function
            shape [batch_size X sequence_length] if not None.

    Returns:
        The auxiliary loss.
    Nr   rY   rT   )r_   rv   r8   rG   rc   rJ   r   r   r   topkr   rI  rK   r^   r]   r   r   r   )rx  r   r  r   compute_device
layer_gateconcatenated_gate_logitsr+  r5  r   r   tokens_per_expertrouter_prob_per_expertr3  r4  re  expert_attention_mask router_per_expert_attention_maskoverall_losss                      r:   load_balancing_loss_funcr    s9   : *[%"@+u%$Q..#(99^i-jPZjmmN.K-jpq#r hh))112JPR1SO**_eDA((%%--.>LK!JJ{'8'8':B "'O!C&4&:&:#
O4::1=*B^_ 4AtT12V&
OUKXYWR,R	 	 "IIk&7&7&9<Q&QWXY\a\e\e!q]
 
 4At+,V&
O[QRWR%R	 	) "'?=]+]cd!ehmhqhq,!i
 "
 99.1G1Q1QRS1TTUL+%%[ .ks   Ic                       e Zd ZddiZddiZddgdgfiZ fdZee	 	 	 	 	 	 	 	 	 	 dd	e	j                  dz  d
e	j                  dz  de	j                  dz  dedz  de	j                  dz  de	j                  dz  dedz  dedz  de	j                  dz  dee	j                  z  dee   defd              Z	 	 	 	 	 	 	 d fd	Z xZS )PhimoeForCausalLMzlm_head.weightzmodel.embed_tokens.weightlm_headcolwise_gather_outputr   logitsc                 v   t         |   |       t        |      | _        |j                  | _        t        j                  |j                  |j                  | j                  j                        | _
        |j                  | _        |j                  | _        |j                  | _        | j                          y )Nr   )r,   r-   r[  rF  r`  r   r   rE   r&   lm_head_biasr  router_aux_loss_coefr   r   r(  rj  r   s     r:   r-   zPhimoeForCausalLM.__init__  s      (
 ++yy!3!3V5F5FT[[MeMef$*$?$?!!33#)#=#=  	r;   Nrk  r   rg   r   rl  labelsrm  output_router_logitsr   logits_to_keepr   r=   c                 l   ||n| j                   j                  } | j                  d||||||||	d|}|j                  }t	        |
t
              rt        |
 d      n|
}| j                  |dd|ddf         }d}| | j                  ||| j                  fi |}d}|rYt        |j                  | j                  | j                  |      }|+|| j                  |j                  |j                         z  z  }t#        ||||j$                  |j&                  |j(                  |j                        S )az  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, PhimoeForCausalLM

        >>> model = PhimoeForCausalLM.from_pretrained("mistralai/Phimoe-8x7B-v0.1")
        >>> tokenizer = AutoTokenizer.from_pretrained("mistralai/Phimoe-8x7B-v0.1")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```N)rk  r   rg   r   rl  rm  r  r   )lossaux_lossr  r   r   rG  r*  r   )r&   r  rF  ro  r_   ru   slicer  loss_functionr`  r  r*  r   r(  r  rJ   r8   r   r   r   rG  )r7   rk  r   rg   r   rl  r  rm  r  r   r  r   outputsr   slice_indicesr  r  r  s                     r:   rn   zPhimoeForCausalLM.forward!  sX   P %9$D $++JjJj 	
 +5$** 
+
)%+'!5)
+
 
+
  118B>SV8W~ot4]kmA}a,?@A%4%%ffdooPPD/%%  ((	H !11HKK4LLL(#33!//))!//
 	
r;   c	                     |r_t        | j                  d      rI|j                  d   | j                  j                  dz   k\  r |d   }
|
| j                  j                  k  rd }t	        |   d||||||||d|	}|S )NrQ   r!   r   )rk  r   r   rl  r   rg   rm  r  r   )hasattrr&   r^   rQ   r,   prepare_inputs_for_generation)r7   rk  r   r   rl  r   rg   rm  r  r   past_lengthmodel_inputsr9   s               r:   r  z/PhimoeForCausalLM.prepare_inputs_for_generationx  s    $ %GH"dkk&R&RUV&VV(+KdkkJJJ"&w< 

+)')%)

 

 r;   )
NNNNNNNNNr   )NNNNNTN)r[   rp   rq   _tied_weights_keys_tp_plan_pp_planr-   r   r   rG   r   rr   r	   rv  rw  ru   r   r   r   rn   r  rx   ry   s   @r:   r  r    sp   *,GH23H_-z:;H
  .2.204(,26*.!%,026-.R
##d*R
 t+R
 &&-	R

 R
 ((4/R
   4'R
 $;R
 #TkR
 ((4/R
 ell*R
 +,R
 
#R
  R
p % %r;   r  c                       e Zd Zy)PhimoeForSequenceClassificationN)r[   rp   rq   r   r;   r:   r  r    s    r;   r  )rE  r[  r  r  )r!   )r   )rB   )NrB   N)Kcollections.abcr   typingr   rG   r    r   rM  activationsr   cache_utilsr	   r
   
generationr   integrationsr   r   r   masking_utilsr   r   modeling_layersr   r   modeling_outputsr   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   utils.genericr   r   utils.output_capturingr   r    configuration_phimoer"   Moduler$   r}   r   rr   ru   r   rK   r   r   autogradFunctionr   r   r"  r   r$  r-  r7  rE  r[  rv   r  r  r  __all__r   r;   r:   <module>r     s  , %    & ! . ) e e R [ Q K F & I I G E .L0BII L0^( *+ ,2	UU\\ 	U# 	U%,, 	U& %II%<<% 
% <<	%
 LL4'% % % '(%2 )*C)bii C) +C)L;
u~~.. ;
| $#BII $# $#Nxv@ryy @&!T299 !TH)3 )X ;O ; ;: N
' N
 N
f #
*.	O&ell 33d:O&tO& LL4'	O&
 \\CO&d M- M M` d&FH] c kr;   