
    qi=                        d Z ddlmZ ddlZddlmZ ddlmZ ddlmZ ddl	m
Z
 dd	lmZ d
dlmZ d
dlmZmZmZmZmZmZ ddlmZ  G d de      Z G d de      Z G d dej4                  j6                        Zd'dZ G d de      Z G d dej>                        Z  G d dejB                        Z" G d de      Z# G d de      Z$ G d  d!e      Z% G d" d#e      Z& G d$ d%ee$      Z'g d&Z(y)(zPyTorch Phimoe model.    )CallableN)nn   ) GenericForSequenceClassification)ROPE_INIT_FUNCTIONS)maybe_autocast)OutputRecorder   )LlamaAttention)MixtralDecoderLayerMixtralExpertsMixtralForCausalLMMixtralModelMixtralPreTrainedModelMixtralRotaryEmbedding   )PhimoeConfigc                   "    e Zd ZddefdZddZy)PhimoeRotaryEmbeddingNconfigc                    t         j                  j                          |j                  | _        |j                  | _        || _        | j                  j                  d   | _        | j                  | _
        | j                  dk7  rt        | j                     | _
        | j                  | j                  |      \  }| _        | j                  d|d       | j                  d|j                         d       y )N	rope_typedefaultinv_freqF)
persistentoriginal_inv_freq)r   Module__init__max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenr   rope_parametersr   compute_default_rope_parametersrope_init_fnr   attention_scalingregister_bufferclone)selfr   devicer   s       [/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/phimoe/modular_phimoe.pyr   zPhimoeRotaryEmbedding.__init__)   s    
		"("@"@$*$B$B!44[A&*&J&J>>Y& 3DNN CD+/+<+<T[[&+Q($(ZeD0(..2BuU    c                    |%t        | j                  j                   d| d      d }t        j                  |      dz   }| j
                  j                  d   dk7  rP|rN|| j
                  j                  d   kD  r| j
                  j                  d   n| j
                  j                  d   }| j                  | j
                  |j                  |      \  }}||n|}|d d d d f   j                         j                  |j                  d	   d
d      j                  |j                        }|d d d d d f   j                         }	t        |j                  j                  t              r/|j                  j                  dk7  r|j                  j                  nd}
t!        |
d      5  |j                         |	j                         z  j#                  dd      }t        j$                  ||fd
      }|j'                         |z  }|j)                         |z  }d d d        j                  |j*                        j                  |j*                        fS # 1 sw Y   ?xY w)Nz3 does not support layer types, but got `layer_type=`r   r   r    original_max_position_embeddingslong_mscaleshort_mscaler   mpscpuF)device_typeenabledr
   dim)
ValueError	__class____name__torchmaxr   r"   r$   r)   floatexpandshapeto
isinstancetypestrr   	transposecatcossindtype)r(   xposition_ids
layer_typemscaleseq_lenr   r%   inv_freq_expandedposition_ids_expandedr4   freqsembrF   rG   s                  r*   forwardzPhimoeRotaryEmbedding.forward9   s   !>>**++^_i^jjkl  ))L)A-;;&&{3y@W T[[889[\\ ++M:[[00@ 
 '+&7&7QXXw&W##&,n"&$T1d]399;BB<CUCUVWCXZ\^_`ccdedldlm ,QaZ 8 > > @'1!((--'E!((--[`J`ahhmmfkUC 	%&,,.1F1L1L1NNYYZ[]^_E))UEN3C'')f$C'')f$C		%
 vvaggqww//	% 	%s   )A1II!N)NN)r:   
__module____qualname__r   r   rR    r+   r*   r   r   (   s    V| V 0r+   r   c                       e Zd Zy)PhimoeAttentionNr:   rT   rU   rV   r+   r*   rX   rX   U       r+   rX   c                       e Zd Zedej
                  dej
                  dej
                  dej
                  dej
                  f
d       Zedej
                  fd       Zy	)
PhimoeMultiplierscores
multiplierselected_expertsmasked_gatesmask_for_onec                 2    | j                  |||       ||z  S )a  
        Forward pass for the custom autograd function.

        Args:
            ctx: Context object to save information for backward computation.
            scores (torch.Tensor): Input scores tensor.
            multiplier (torch.Tensor): Multiplier tensor.
            selected_experts (torch.Tensor): Tensor of selected experts.
            masked_gates (torch.Tensor): Masked gates tensor.
            mask_for_one (torch.Tensor): Mask for one tensor.

        Returns:
            torch.Tensor: Result of the forward pass.
        )save_for_backward)ctxr]   r^   r_   r`   ra   s         r*   rR   zPhimoeMultiplier.forwardZ   s"    . 	j*:LIL((r+   grad_at_outputc                     | j                   \  }}}||z  }||j                  d      z  }|j                  d||       |ddddfS )aB  
        Backward pass for the custom autograd function.

        Args:
            ctx: Context object with saved tensors from the forward pass.
            grad_at_output (torch.Tensor): Gradient at the output.

        Returns:
            tuple[torch.Tensor, None, None, None, None]: Gradients for the inputs.
        r1   )r7   indexsrcN)saved_tensorsmulscatter_add_)rd   re   r^   r_   r`   grad_at_scores_expandeds         r*   backwardzPhimoeMultiplier.backwardt   sn     695F5F2
$l'*4".1C1CB1G"G,," 	- 	
 $
 	
r+   N)r:   rT   rU   staticmethodr;   TensorrR   rm   rV   r+   r*   r\   r\   Y   sx    )) LL)  ,,	)
 ll) ll) )2 

 
r+   r\   c                 l   t        j                         5  | j                  dd      \  }}| j                         j	                  |      }|| z
  |z  d|z  kD  }ddd       | j                  t        d            }|rg|t        j                  |t         j                        j                         j                         z
  j                  d	      d
   j                  d      }n}t        j                  |d	      }|j                  d|      }	|r|j                  dd      \  }
}t        j                  ||k(  t        j                  |
      dkD        }t        j                   d|d      j#                  |      }t$        j'                  | |	|||      }n|	}t        j(                  | d|t        d            }t        j                         5  |j                  dd      \  }}| j                         j	                  |      }|| z
  |z  d|z  kD  }ddd       |j                  |t        d            }|rg|t        j                  |t         j                        j                         j                         z
  j                  d	      d
   j                  d      }n}t        j                  |d	      }|j                  d|      }|r|j                  dd      \  }
}t        j                  ||k(  t        j                  |
      j+                         dkD        }t        j                   d|d      j#                  |      }t$        j'                  | ||||      }n|}t        j,                  ||fd	      }t        j,                  ||fd	      }||fS # 1 sw Y   DxY w# 1 sw Y   xY w)ud  
    Sparse mixer function to select top-k experts and compute multipliers.
    Based on the paper: https://huggingface.co/papers/2409.12136
    We first replace the TopK(·) function as random sampling of discrete variables
    in model training. Then, following Liu et al. (2023a) and Liu et al. (2023b), we apply Heun's
    third order method to approximate the expert routing gradient and construct a modified
    back-propagation to give a mathematically sound gradient estimation for expert routing.

    Args:
        scores (torch.Tensor): Input scores tensor.
        jitter_eps (float): Jitter epsilon for numerical stability.
        training (bool): Flag indicating if the model is in training mode.
        top_k (int): Number of top experts to select.

    Returns:
        tuple[torch.Tensor, torch.Tensor]: Multiplier and selected experts tensors.
    r1   T)r7   keepdim)minr
   Nz-inf)memory_formatr6   r   )r7   rg   g      ?gioT?gK=U?)alpha)r;   no_gradr<   absclampmasked_fillr=   
empty_likelegacy_contiguous_formatexponential_log	unsqueezesoftmaxgather
logical_or	rand_likeaddtype_asr\   applyscatteruniform_concat)r]   
jitter_epstrainingtop_kmask_logits_thresholdmax_indfactorr`   r_   multiplier_o
max_scoresra   r^   masked_scoresmasked_gates_top2selected_experts_top2multiplier_top2_omask_for_one_top2multiplier_top2s                      r*   sparsemixerr      s   $ 
 _)/D)I&w##(=#>"7&"@F!JqS]~ ^	_ %%&;U6]KL ""<u?]?]^kkmqqst SRS[	
 Yr] 	 # ==26L&&25E&FL*..2t.D
G'''OOJ'$.

 yyVDLL\Z%++

 "
 MM
f	M 
 _)6):):r4):)P&w##(=#>"7&"@F!JqS]~ ^	_ &112GvW """#4EDbDbc
 SRS[ Yr] 	 !(&7R@)00R?T0U/33D3I
G!,,!W,OOJ'002T9

 "IIf.?vNVVWhi*00!
 ,z?;DJ||%57L$MSUV 	 G_ _f_ _s   ANAN)N&)N3c                       e Zd Zy)PhimoeExpertsNrY   rV   r+   r*   r   r     rZ   r+   r   c                   ~     e Zd Zdef fdZdej                  deej                  ej                  f   f fdZ xZ	S )PhimoeTopKRouterr   c                     t         |   |j                  |j                  d       |j                  | _        |j
                  | _        |j                  | _        y )NFbias)superr   hidden_sizenum_local_expertsrouter_jitter_noiseinput_jitter_noisenum_experts_per_tokr   r(   r   r9   s     r*   r   zPhimoeTopKRouter.__init__  sL    ++V-E-EER#)#=#= "(";";//
r+   hidden_statesreturnc                 F   | j                   rQ| j                  dkD  rB|t        j                  |      j	                  d| j                  z
  d| j                  z         z  }t
        |   |      }t        || j                  | j                   | j                        \  }}|||fS )Nr         ?)r   r   r   )
r   r   r;   ry   r   r   rR   r   r   r   )r(   r   router_logitsrouting_weightsr_   r9   s        r*   rR   zPhimoeTopKRouter.forward  s    ==T44q8U--m<EEd---sT5L5L/L M 6,7d&>&>^b^h^h-
)) o/???r+   )
r:   rT   rU   r   r   r;   ro   tuplerR   __classcell__r9   s   @r*   r   r     sA    0| 0	@U\\ 	@eELL%,,<V6W 	@ 	@r+   r   c                   Z     e Zd ZdZ fdZdej                  dej                  fdZ xZS )PhimoeSparseMoeBlocka  
    This implementation is
    strictly equivalent to standard MoE with full capacity (no
    dropped tokens). It's faster since it formulates MoE operations
    in terms of block-sparse operations to accommodate imbalanced
    assignments of tokens to experts, whereas standard MoE either
    (1) drop tokens at the cost of reduced performance or (2) set
    capacity factor to number of experts and thus waste computation
    and memory on padding.
    c                    t         |           |j                  | _        |j                  | _        |j                  | _        |j                  | _	        t        |      | _        t        |      | _        |j                  | _        y rS   )r   r   r   
hidden_dimintermediate_sizeffn_dimr   num_expertsr   r   r   routerr   expertsr   r   s     r*   r   zPhimoeSparseMoeBlock.__init__5  si     ,,//!33//
&v.$V,"(";";r+   r   r   c                    |j                   \  }}}| j                  rQ| j                  dkD  rB|t        j                  |      j                  d| j                  z
  d| j                  z         z  }|j                   \  }}}|j                  d|      }| j                  |      \  }}}| j                  |||      }|j                  |||      S )Nr   r   r1   )	r?   r   r   r;   ry   r   reshaper   r   )	r(   r   
batch_sizesequence_lengthr   _r   r_   final_hidden_statess	            r*   rR   zPhimoeSparseMoeBlock.forward?  s    2?2E2E/
OZ==T44q8U--m<EEd---sT5L5L/L M 3@2E2E/
OZ%--b*=/3{{=/I,?,"ll=:JO\"**:
SSr+   )	r:   rT   rU   __doc__r   r;   ro   rR   r   r   s   @r*   r   r   )  s+    	<TU\\ Tell Tr+   r   c                   (     e Zd Zdedef fdZ xZS )PhimoeDecoderLayerr   	layer_idxc                     t         |   ||       t        j                  |j                  |j
                  d      | _        t        j                  |j                  |j
                  d      | _        y NT)epselementwise_affine)r   r   r   	LayerNormr   rms_norm_epsinput_layernormpost_attention_layernorm)r(   r   r   r9   s      r*   r   zPhimoeDecoderLayer.__init__N  sZ    +  "||F,>,>FDWDWlpq(*F$7$7D)
%r+   )r:   rT   rU   r   intr   r   r   s   @r*   r   r   M  s    
| 
 
 
r+   r   c                   (    e Zd Z eed      eedZy)PhimoePreTrainedModelr   )rg   )r   r   
attentionsN)r:   rT   rU   r	   r   r   rX   _can_record_outputsrV   r+   r*   r   r   X  s    '(8B+%r+   r   c                   $     e Zd Zdef fdZ xZS )PhimoeModelr   c                     t         |   |       t        j                  |j                  |j
                  d      | _        y r   )r   r   r   r   r   r   normr   s     r*   r   zPhimoeModel.__init__a  s1     LL!3!39L9Laef	r+   )r:   rT   rU   r   r   r   r   s   @r*   r   r   `  s    g| g gr+   r   c                   8     e Zd Z fdZ	 	 	 	 	 	 	 d fd	Z xZS )PhimoeForCausalLMc                     t         |   |       t        j                  |j                  |j
                  | j                  j                        | _        y )Nr   )	r   r   r   Linearr   
vocab_sizer   lm_head_biaslm_headr   s     r*   r   zPhimoeForCausalLM.__init__g  s:     yy!3!3V5F5FT[[MeMefr+   c	                     |r_t        | j                  d      rI|j                  d   | j                  j                  dz   k\  r |d   }
|
| j                  j                  k  rd }t	        |   d||||||||d|	}|S )Nr.   r   r   )	input_idspast_key_valuesattention_maskinputs_embedscache_positionrJ   	use_cachelogits_to_keeprV   )hasattrr   r?   r.   r   prepare_inputs_for_generation)r(   r   r   r   r   r   rJ   r   r   kwargspast_lengthmodel_inputsr9   s               r*   r   z/PhimoeForCausalLM.prepare_inputs_for_generationl  s    $ %GH"dkk&R&RUV&VV(+KdkkJJJ"&w< 

+)')%)

 

 r+   )NNNNNTN)r:   rT   rU   r   r   r   r   s   @r*   r   r   f  s*    g % %r+   r   c                       e Zd Zy)PhimoeForSequenceClassificationNrY   rV   r+   r*   r   r     s    r+   r   )r   r   r   r   )r
   ))r   collections.abcr   r;   r   modeling_layersr   modeling_rope_utilsr   utils.genericr   utils.output_capturingr	   llama.modeling_llamar   mixtral.modeling_mixtralr   r   r   r   r   r   configuration_phimoer   r   rX   autogradFunctionr\   r   r   r   r   r   r   r   r   r   r   r   __all__rV   r+   r*   <module>r      s     $   7 + 4 1  /*02 *0Z	n 	;
u~~.. ;
|xv	N 	@ryy @&!T299 !TH
, 
2 g, g+* +\ d&FH] cr+   