
    qiX                        d dl mZ d dlmZ ddlmZ ddlmZ ddlm	Z	m
Z
mZ  e
       rd dlZ ej                  e      Z	 	 d+d	ej                   d
ej                   dej                   dz  dedej                   f
dZdej&                  j(                  dej                   dej                   dej                   dej                   f
dZd	ej                   d
ej                   dej                   dej                   fdZd	ej                   d
ej                   dej                   dej                   fdZd Zd Z e
       rXej4                  j7                  ded       ej4                  j9                  de       ej4                  j;                  dee       d	ej                   d
ej                   dej                   defdZd	ej                   d
ej                   dej                   dej                   fdZ	 	 d+d	ej                   d
ej                   dej                   dej                   dz  dedej                   fdZ dej&                  j(                  dej                   dej                   dej                   dej                   f
d Z! G d! d"e      Z" e"       Z#d#ej                   dej                   fd$Z$	 d,ddd%d&d'e%ej&                  j(                     dz  ded(ed)ede%ej&                  j(                     f
d*Z&y)-    )Callable)wraps   )logging)GeneralInterface)is_grouped_mm_availableis_torch_availableis_torchdynamo_compilingNFinputweightbiasis_transposedreturnc                     |r5t        j                  | j                  d      |      j                  d      }n4t        j                  || j                  d            j                  d      }|||z   }|S )a  Batched linear layer supporting optional bias and transposed weights.

    Args:
        input (`torch.Tensor`):
            Input tensor of shape (batch_size, input_dim).
        weight (`torch.Tensor`):
            Weight tensor of shape (batch_size, output_dim, input_dim) if transposed is `False`,
            else of shape (batch_size, input_dim, output_dim).
        bias (`torch.Tensor`, *optional*):
            Bias tensor of shape (batch_size, output_dim). Default is `None`.
        is_transposed (`bool`, *optional*, defaults to `False`):
            Whether the weight tensor is transposed.
    Returns:
        `torch.Tensor`: Output tensor of shape (batch_size, output_dim).
       )torchbmm	unsqueezesqueeze)r   r   r   r   outs        O/opt/pipecat/venv/lib/python3.12/site-packages/transformers/integrations/moe.py_batched_linearr   D   se    * ii*F3;;A> ii 34<<R@DjJ    selfhidden_statestop_k_indextop_k_weightsc                 ~   |j                   }|j                  d      }|j                  d      }|j                  d      }t        j                  ||      j	                  d      j                  d|      j                  d      }|j                  d      }	|j                  d      }
|
| j                  k\  }|
j                  d| j                  dz
        }
||   }| j                  r-| j                  |
   }| j                  r| j                  |
   nd }n,| j                  |
   }| j                  r| j                  |
   nd }t        |||| j                         }| j                  r| j#                  |      }n| j%                  |      }| j&                  |
   }| j                  r| j(                  |
   nd }t        |||| j                         }||	j	                  d      z  }|j+                  |j	                  d      d       |j-                  |||      j/                  d      }|j1                  |j2                        S )Nr   r   devicer   r   r   g        dim)r!   sizer   aranger   expandreshapenum_expertsclamphas_gategate_up_projhas_biasgate_up_proj_biasup_projup_proj_biasr   r   _apply_gateact_fn	down_projdown_proj_biasmasked_fill_viewsumtodtype)r   r   r   r   r!   	num_top_k
num_tokens
hidden_dim	token_idxsample_weights
expert_idsinvalid_maskselected_hidden_statesselected_weightsselected_biasesup_proj_outdown_proj_outweighted_outfinal_hidden_statess                      r   batched_mm_experts_forwardrH   f   s&    !!F  $I##A&J##B'J Z7AA!DKKBPYZbbcefI"**2.N$$R(J !1!11L!!!T%5%5%9:J +95 }},,Z8@D$00<SW<<
3;?==$++J7d " 0VZVhVhK
 }}&&{3 kk+. ~~j19=d))*5DO $%O4K]K]M
 !>#;#;B#??Ll44R8#> '++J	:NRRWXRY!!-"5"566r   offsc                 4   t        j                  | j                  d      |j                  d      | j                  | j                        }d}t        |j                               D ].  \  }}||k(  rt        j                  | || ||   |||        |}0 |S )a(  
    Fallback grouped matrix multiplication used when `torch.nn.functional.grouped_mm` and `torch._grouped_mm`
    are unavailable or incompatible with `torch.compile` (e.g. non-bfloat16 weights).

    Args:
        input (`torch.Tensor`): Input of shape (S, input_dim), sorted by expert id.
        weight (`torch.Tensor`): Expert weights of shape (num_experts, input_dim, output_dim).
        offs (`torch.Tensor`): Cumulative token counts per expert of shape (num_experts,).
    Returns:
        `torch.Tensor`: Output of shape (S, output_dim).
    r   r   r!   r9   r   )r   zerosr%   r!   r9   	enumeratetolistmm)r   r   rI   outputstartiends          r   _grouped_mm_fallbackrU      s     [[AAu||SXS^S^_FE DKKM* 3C<uS!6!9&s2CD	 Mr   c                 p   | j                         dk(  sJ dt        | j                                |j                         dk(  sJ dt        |j                                |j                         dk(  sJ dt        |j                                |j                  d      |j                  d      k(  s+J d|j                  d       d	|j                  d              | j                  d      |j                  d      k(  s+J d
| j                  d       d|j                  d              |j                  t
        j                  t
        j                  fv sJ d|j                          t        j                  | j                  d      |j                  d      | j                  | j                        S )zRShape/dtype inference stub for `_grouped_mm_fallback` required by `torch.compile`.r   z+input must be 2D (S, input_dim), got shape    zBweight must be 3D (num_experts, input_dim, output_dim), got shape r   z*offs must be 1D (num_experts,), got shape r   zoffs length z must match number of experts zinput_dim mismatch: input has z, weight has z$offs must be an integer tensor, got rK   )
r$   tupleshaper%   r9   r   int32int64emptyr!   r   r   rI   s      r   _grouped_mm_fallback_faker^      s   99;!_J5QVQ\Q\K]J^__::<1 
LUSYS_S_M`Lab 88:?\HtzzIZH[\\?99Q<6;;q>)v\$))A,Geflfqfqrsfteu+vv)::a=FKKN* 
(A}V[[QR^DTU* ::%++u{{33h7[\`\f\f[g5hh3;;uzz!}fkk!nU\\QVQ\Q\]]r   c                 H    | j                  |d   |d          |d   | _        y)zjSaves input and weight for backward; offs is stored directly as it is a non-differentiable integer tensor.r   r   r   N)save_for_backwardrI   )ctxinputsrQ   s      r   "_grouped_mm_fallback_setup_contextrc      s%    &)VAY/ayCHr   c                    | j                   \  }}t        j                  |      }t        j                  |      }d}t        | j                  j                               D ]c  \  }}||k(  rt        j                  ||| ||   j                  |||        t        j                  ||| j                  ||| ||          |}e ||dfS )zuBackward pass for `_grouped_mm_fallback`. Computes grad_input and grad_weight per expert group; offs has no gradient.r   rL   N)saved_tensorsr   
zeros_likerN   rI   rO   rP   T)	ra   grad_outputr   r   
grad_inputgrad_weightrR   rS   rT   s	            r   _grouped_mm_fallback_backwardrk      s    %%ME6!!%(J""6*KE CHHOO-. 3C<U3'*U3:OPuS!##[s%;QP {D((r   z!transformers::grouped_mm_fallback )mutates_args)setup_contextc                 f    t               r|j                  t        j                  k7  ryt	               S )a  
    Check if torch.nn.functional.grouped_mm or torch._grouped_mm can be used based on availability and compatibility with torch.compile.

    Args:
        input (`torch.Tensor`):
            Input tensor of shape (S, input_dim).
        weight (`torch.Tensor`):
            Weight tensor of shape (num_experts, input_dim, output_dim).
        offs (`torch.Tensor`):
            Offsets tensor indicating the boundaries of each group in the input tensor.
    Returns:
        `bool`: True if grouped_mm can be used, False otherwise.
    F)r
   r9   r   bfloat16r   r]   s      r   _can_use_grouped_mmrq      s%      !fllenn&D"$$r   c                    t        | ||      rt        t        j                  j                  d      rEt        j                  j                  j                  | j                  |j                        ||      S t        t        d      r1t        j                  | j                  |j                        ||      S t        j                  j                  j                  | ||      S )a  Grouped matrix multiplication dispatcher that uses torch.nn.functional.grouped_mm if available, else falls back to torch._grouped_mm.

    Args:
        input (`torch.Tensor`):
            Input tensor of shape (S, input_dim).
        weight (`torch.Tensor`):
            Weight tensor of shape (num_experts, input_dim, output_dim).
        offs (`torch.Tensor`):
            Offsets tensor indicating the boundaries of each group in the input tensor.
    Returns:
        `torch.Tensor`: Output tensor of shape (S, output_dim).
    
grouped_mmrI   _grouped_mm)rq   hasattrr   nn
functionalrs   r8   r9   ru   opstransformersgrouped_mm_fallbackr]   s      r   ru   ru     s    $ 5&$/
 588&&588&&11%((6<<2H&W[1\\UM*$$UXXfll%;V$OO99!!55eV$5OOr   c                 r    |rt        | ||      }nt        | |j                  dd      |      }|||z   }|S )a  Grouped linear layer supporting optional bias and transposed weights.

    Args:
        input (`torch.Tensor`):
            Input tensor of shape (S, input_dim).
        weight (`torch.Tensor`):
            Weight tensor of shape (num_experts, input_dim, output_dim) if `is_transposed`,
            else of shape (num_experts, output_dim, input_dim).
        offs (`torch.Tensor`):
            Offsets tensor indicating the boundaries of each group in the input tensor.
        bias (`torch.Tensor`, *optional*):
            Bias tensor of shape (num_experts, output_dim). Default is `None`.
        is_transposed (`bool`, *optional*, defaults to `False`):
            Whether the weight tensor is transposed.
    Returns:
        `torch.Tensor`: Output tensor of shape (S, output_dim).
    rt   r   )ru   	transpose)r   r   rI   r   r   r   s         r   _grouped_linearr   ,  sF    0 %d3 %!1!1"b!9EDjJr   c                 X   |j                   }|j                  d      }|j                  d      }|j                  d      }t        j                  ||      j	                  d      j                  d|      j                  d      }|j                  d      }	|j                  d      }
||   }t        j                  |
      }t        j                  |      }|
|   }|	|   }||   }|j                  dk(  r|j                         n|j                         }t        j                  || j                  d| j                  dz
        }t        j                  |dt        j                        }| j                  r*| j                   }| j"                  r| j$                  |   nd }n)| j&                  }| j"                  r| j(                  |   nd }t+        ||||| j,                        }| j                  r| j/                  |      }n| j1                  |      }| j2                  }| j"                  r| j4                  |   nd }t+        ||||| j,                        }||j	                  d      z  }||   }|j7                  |||      j9                  d	      }|j;                  |j<                        S )
Nr   r   r    r   cpu)binsminmax)r$   r9   r"   r#   )r!   r%   r   r&   r   r'   r(   argsorttypefloatinthistcr)   cumsumrZ   r+   r,   r-   r.   r/   r0   r   r   r1   r2   r3   r4   r6   r7   r8   r9   )r   r   r   r   r!   r:   r;   r<   r=   r>   r?   rA   perminv_permexpert_ids_gsample_weights_gselected_hidden_states_ghistc_inputnum_tokens_per_expertoffsetsrB   rC   rD   rE   rF   rG   s                             r   grouped_mm_experts_forwardr   R  s    !!F  $I##A&J##B'J Z7AA!DKKBPYZbbcefI"**2.N$$R(J +95 ==$D}}T"Hd#L%d+5d;
 +1++*>,$$&LDTDTDVK!KK$:J:JPQW[WgWgjkWklll0au{{KG }},,BF--$00>UY<<=A]]$++L9PT " "2G/aeasasK
 }}&&{3 kk+. ~~;?==d)),7dO $%w_TXTfTfM
 !#3#=#=b#AAL  )L '++J	:NRRWXRY!!-"5"566r   c                   :     e Zd ZdZeedZdededef fdZ	 xZ
S )ExpertsInterfacez9Interface for registering custom experts implementations.)
batched_mmrs   experts_implementationdefaultr   c                     |t         j                  d       n|dk7  r|| vrt        d| d      t        |   ||      S )zfReturn the requested `experts_implementation`. Also strictly check its validity, and raise if invalid.a
  You tried to access the `ExpertsInterface` with a `config._experts_implementation` set to `None`. This is expected if you use an Expert Module as a standalone Module. If this is not the case, something went wrong with the dispatch of `config._experts_implementation`eager`zL` is not a valid experts implementation registered in the `ExpertsInterface`)loggerwarning_onceKeyErrorsuperget)r   r   r   	__class__s      r   get_interfacezExpertsInterface.get_interface  s`    !)N
 $w.3IQU3U*++wx  w{17;;r   )__name__
__module____qualname____doc__rH   r   _global_mappingstrr   r   __classcell__)r   s   @r   r   r     s4    C 10O
<C <( <x < <r   r   gate_up_outc                 V    |j                  dd      \  }}| j                  |      |z  S )a  
    Default gating mechanism: splits the gate_up_out into gate and up parts,
    applies the activation function to the gate part, and multiplies it with the up part.
    Args:
        gate_up_out (`torch.Tensor`):
            The output tensor from the gate and up projection of shape (S, 2 * intermediate_dim).
    Returns:
        `torch.Tensor`: The gated output tensor of shape (S, intermediate_dim).
    r   r   r#   )chunkr2   )r   r   gateups       r   _default_apply_gater     s1        +HD";;tr!!r   T)r   r-   r+   experts_classr-   r+   c                    dt         t        j                  j                     dt         t        j                  j                     ffd}|  ||       S |S )aV  Decorator to modify experts class to support different experts implementations.

    Args:
        experts_class (`type[torch.nn.Module]`, *optional*):
            The experts class to modify. If not provided, returns a decorator that can be applied to the class.
        is_transposed (`bool`, *optional*, defaults to `False`):
            Whether the expert weights are stored in transposed format.
        has_bias (`bool`, *optional*, defaults to `False`):
            Whether the expert layers include bias terms.

    Returns:
        `type[torch.nn.Module]`: The modified experts class.
    r   r   c                     | j                   | j                  t              fd       }t              fd       }t        | d      st        | _        || _         || _        | S )Nc                 X     | |g|i | || _         | _        | _        | _        y N)configr+   r-   r   )r   r   argskwargsr-   r+   r   original_inits       r   __init__z=use_experts_implementation.<locals>.wrapper.<locals>.__init__  s4    $888 DK$DM$DM!.Dr   c                 p    t         j                  | j                  j                        } || g|i |S r   )ALL_EXPERTS_FUNCTIONSr   r   _experts_implementation)r   r   r   experts_forwardoriginal_forwards       r   forwardz<use_experts_implementation.<locals>.wrapper.<locals>.forward  s:    3AA335EO #49$9&99r   r1   )r   r   r   rv   r   r1   )r   r   r   r   r   r-   r+   r   s      @@r   wrapperz+use_experts_implementation.<locals>.wrapper  su    %..(00	}		/ 
	/ 
	 	: 
!	: }m4(;M%!) 'r   )r   r   rw   Module)r   r   r-   r+   r   s    ``` r   use_experts_implementationr     sH    *tEHHOO4 ehhoo9N 4  }%%Nr   )NFr   )'collections.abcr   	functoolsr   utilsr   utils.genericr   utils.import_utilsr   r	   r
   r   
get_loggerr   r   Tensorboolr   rw   r   rH   rU   r^   rc   rk   library	custom_opregister_fakeregister_autogradrq   ru   r   r   r   r   r   r   r   rl   r   r   <module>r      se   %   , f f 			H	%Z !%	<<LL ,,
 	
 \\D@7
((//@7<<@7 @7 <<	@7
 \\@7L ell %,, [`[g[g 4^U\\ ^5<< ^u|| ^`e`l`l ^)& 	MM?AUdfg	MM CE^_	MM##+%8 $ %u|| %U\\ % %Z^ %*P<<PLLP ,,P \\	PF !%#<<#LL# ,,# ,,
	#
 # \\#LO7
((//O7<<O7 O7 <<	O7
 \\O7d<' <. )* "5<< "ELL " 372  2(4/2 2 	2
 2 
%((//2r   