
    qi7                        d dl mZ ddlmZ ddlmZ ddlmZmZ ddl	m
Z
mZmZmZmZ  e       r
d dlZd dlmZ  e
       rd d	lmZ  e       Z e       resd dlZ ej,                  e      Z G d
 de      Z G d dej                   j4                        Z G d dej8                        Z ed      d        Z	 ddee    dz  fdZ!y)    )	lru_cache   )ACT2FN)ConversionOps)get_module_from_nameshould_convert_module)is_accelerate_availableis_fbgemm_gpu_availableis_torch_availableis_torch_xpu_availableloggingN)nn)init_empty_weightsc            	           e Zd Zd Z	 ddeeej                  eej                     z  f   dej                  j                  dz  deeej                  f   fdZy)FbgemmFp8Quantizec                     || _         y N)hf_quantizer)selfr   s     V/opt/pipecat/venv/lib/python3.12/site-packages/transformers/integrations/fbgemm_fp8.py__init__zFbgemmFp8Quantize.__init__-   s
    (    N
input_dictmodelreturnc                 z   t        |j                               d   \  }}|d   }ddlm} t	        ||      \  }}t        ||      r|dk(  r~|j                  dd      }	|	j                  }
|	j                  d|
d         }t        |      \  }}|j                  |
      }|j                  dd      }|j                  |
d   d|
d         }n|dk(  r|j                  dd      }	|	j                  }
|	j                  d|
d         }t        |      \  }}|j                  |
      }|j                  dd      }|j                  |
d   |
d   d      }nJt        |      \  }}t        j                  j                  |j                  |j                  d   d            }|t        j                  j                        | diS )	Nr   r   )FbgemmFp8Llama4TextExpertsgate_up_proj   	down_proj_scale)tupleitemsintegrationsr   r   
isinstance	transposeshapereshapequantize_fp8_per_rowtorchr   	Parameterview)r   r   r   kwargs
target_keyvaluer   moduletensor_nametransposed_paramoriginal_shapeflattened_paramnew_value_flatweight_scale_flat	new_valueweight_scales                   r   convertzFbgemmFp8Quantize.convert0   s    "*"2"2"45a8
Ea=25*Ef89n, $)??1a#8  "2!7!7"2":":2~b?Q"R 5I4Y1 1 +22>B	%//15	0889JA~^_O`a+ $)??1a#8  "2!7!7"2":":2~b?Q"R 5I4Y1 1 +22>B	%//15	0889JN[\L]_`a&:5&A#I| 88--l.?.?@R@RST@UWX.YZLEHH..y9j\;PR^__r   r   )__name__
__module____qualname__r   dictstrr+   Tensorlistr   Moduler:    r   r   r   r   ,   sh    ) )-2`ellT%,,-???@2` xx%2`
 
c5<<	 2`r   r   c                   <     e Zd Zej                  f fd	Zd Z xZS )FbgemmFp8Linearc                 r   t         |   |||       || _        || _        t        j
                  j                  t	        j                  ||f|            | _        t        j
                  j                  t	        j                  |dft        j                              | _
        | j                  dt	        j                  dgt        j                        d       |rRt        j
                  j                  t	        j                  | j                  t        j                              | _        y d | _        y )Ndtyper   input_scale_ubF
persistent)superr   in_featuresout_featuresr+   r   r,   zerosweightfloat32r9   register_bufferfloatbias)r   rM   rN   rT   rH   	__class__s        r   r   zFbgemmFp8Linear.__init__f   s    lD9&(hh((lK5PX])^_!HH..u{{L!;LTYTaTa/bc-u{{A3ekk/R_de**5;;8I8IRWR_R_+`aDIDIr   c                    g |j                   d d d}t        |j                  d|j                   d         j                         | j                        \  }}| j
                  j                  t        j                        }t        ret        j                  || j                  j                         |j                  d      |j                         |j                  | j                        }nUt        j                   j"                  j%                  || j                  ||d      }| j                  || j                  z   n|}|j                  |j&                        }|j)                  |      }~~|S )Nr    )scale_ub)scale_ascale_b	out_dtyperT   Tuse_fast_accum)r(   r*   r-   
contiguousrI   r9   tor+   rQ   _is_torch_xpu_available
_scaled_mmrP   t	unsqueezerH   rT   opsfbgemmf8f8bf16_rowwisedevicer)   )r   xoutput_shapex_quantizedx_scaleweight_scale_float32outputs          r   forwardzFbgemmFp8Linear.forwardt   s0   *"*r*  4AFF2qwwr{4K4V4V4XcgcvcvwW
  $0033EMMB"%%))"-,..0''YYF YY%%66T[['3GX\ 7 F ,099+@Vdii'fF188$-r   )r;   r<   r=   r+   float8_e4m3fnr   rm   __classcell__rU   s   @r   rE   rE   e   s    >C>Q>Q r   rE   c                   <     e Zd Zej                  f fd	Zd Z xZS )r   c                 n   t         |           |j                  | _        |j                  | _        |j
                  | _        | j                  | _        t        |j                     | _	        t        j                  j                  t        j                  | j                  | j
                  d| j                  z  ft        j                              | _        t        j                  j                  t        j                  | j                  d| j                  dz  ft        j                               | _        t        j                  j                  t        j                  | j                  | j                  | j
                  ft        j                              | _        t        j                  j                  t        j                  | j                  | j
                  dft        j                               | _        | j)                  dt        j                  dgt        j*                        d       y )Nr   rG   r   rI   FrJ   )rL   r   num_local_expertsnum_expertsintermediate_sizehidden_size
expert_dimr   
hidden_actact_fnr+   r   r,   rO   rn   r   rQ   gate_up_proj_scaler!   down_proj_scalerR   rS   )r   configrH   rU   s      r   r   z#FbgemmFp8Llama4TextExperts.__init__   s   !33!'!9!9!--00V../!HH..KK))4+;+;Q=PQY^YlYlm
 #((("4"4KK))1doo.AB%--X#
 ++KK))4??D<L<LMUZUhUhi
  %xx11KK))4+;+;Q?u}}U 
 	-u{{A3ekk/R_der   c           
      	   |j                  | j                  d| j                        }d}t        j                  |      }t        | j                        D ]D  }||   }|j                  d| j                        }t        ||| j                        \  }}| j                  j                  d   dz  }	| j                  j                  t        j                        }
t        rJt        j                  || j                  |   j!                  dd      d|	 j#                         j%                         |j'                  d      |
|   d   d|	 j                  dd      j#                         j%                         |j(                        }t        j                  || j                  |   j!                  dd      |	d j#                         j%                         |j'                  d      |
|   d   |	d j                  dd      j#                         j%                         |j(                        }nt        j*                  j,                  j/                  || j                  |   j!                  dd      d|	 j#                         ||
|   d   d|	 j                  dd      j#                         d      }t        j*                  j,                  j/                  || j                  |   j!                  dd      |	d j#                         ||
|   d   |	d j                  dd      j#                         d      }|| j1                  |      z  }t        ||| j                        \  }}| j2                  j                  t        j                        }t        rt        j                  || j4                  |   j!                  dd      j#                         |j'                  d      ||   j                  dd      j#                         j%                         |j(                        }nzt        j*                  j,                  j/                  || j4                  |   j!                  dd      j#                         |||   j                  dd      j#                         d      }|||<   G |j                  |j6                        }|j                  d| j                        S )	z
        Args:
            hidden_states (torch.Tensor): (batch_size * token_num, hidden_size)
        Returns:
            torch.Tensor: (batch_size * token_num, hidden_size)
        r    Nr   r   r   )rX   rY   rZ   Tr[   )r-   rt   rv   r+   
empty_likeranger)   r*   rI   r   r(   rz   r^   rQ   r_   r`   r'   r]   ra   rb   rH   rc   rd   re   ry   r{   r!   rf   )r   hidden_states
num_tokensnext_statesiexpert_hiddenexpert_hidden_reshapedexpert_quantizedexpert_scalesharded_expert_dimgate_up_proj_scale_float32gateup	activatedactivated_quantizedactivated_scaledown_proj_scale_float32expert_outputs                     r   rm   z"FbgemmFp8Llama4TextExperts.forward   st    &**4+;+;RAQAQR
 &&}5t''( @	+A)!,M%2%:%:2t?O?O%P"-A&
D4G4G.*l "&!2!2!8!8!<!A)-)@)@)C)CEMM)R&&''$%%a(221a89L:LMXXZ\\^(22266q9!<=P>PQVVWY[\]hhjlln+11 %%$%%a(221a89K9LMXXZ\\^(22266q9!<=O=PQVVWY[\]hhjlln+11 yy''88$%%a(221a89L:LMXXZ .q1!45H6HINNrSTU``b#' 9  YY%%66$%%a(221a89K9LMXXZ .q1!45G5HINNrSTU``b#' 7  T[[..I3G	S]_c_r_r3s0&*&:&:&=&=emm&L#& % 0 0'NN1%//15@@B+55b93A6;;BBMMOQQS+11! !&		 0 0 A A'NN1%//15@@B#+A.33B:EEG#' !B ! +KNA@	+B "nn]%9%9:D$4$455r   )r;   r<   r=   r+   rQ   r   rm   ro   rp   s   @r   r   r      s    %*]] f0P6r   r   r   )maxsizec                      t         rddlm}   | d      j                  S t        j
                  j                  j                  S )Nr   
get_kernelzkernels-community/fp8-fbgemm)r_   hub_kernelsr   r*   r+   rc   rd   r   s    r   get_quantize_fp8_per_rowr      s0    +89NNN99000r   modules_to_not_convertc                 t   t               ad}|ri nddi}| j                         D ]  \  }}t        ||      sd}	t	        d      5  |j
                  j                  dk(  r;t        | j                  d| j                        }
t        |
xs | j                        }	nYt        |t        j                        r?t        |j                  |j                  |j                   dufi |}	|	j#                  d       ddd       |	| j%                  ||	       d} |st&        j)                  d       | S # 1 sw Y   ;xY w)	a  
    A helper function to replace all `torch.nn.Linear` modules by `FbgemmFp8Linear` modules.
    This will enable running your models using high performance fp8 kernel from FBGEMM library.

    Parameters:
        model (`torch.nn.Module`):
            Input model or `torch.nn.Module` as the function is run recursively.
        modules_to_not_convert (`list[`str`]`, *optional*, defaults to `None`):
            Names of the modules to not convert. In practice we keep the `lm_head` in full precision for numerical stability reasons.
        quantization_config (`FbgemmFp8Config`):
            The quantization config object that contains the quantization parameters.
        pre_quantized (`book`, defaults to `False`):
            Whether the model is pre-quantized or not
    FrH   NT)include_buffersLlama4TextExpertstext_configzYou are loading your model using FP8 quantization but no linear modules were found in your model. Please double check your model architecture, or submit an issue on github if you think this is a bug.)r   r*   named_modulesr   r   rU   r;   getattrr|   r   r&   r   LinearrE   rM   rN   rT   requires_grad_set_submoduleloggerwarning)r   r   quantization_configpre_quantizedtp_planhas_been_replacedmodule_kwargsmodule_namer1   
new_moduler   s              r   replace_with_fbgemm_fp8_linearr   	  s;   $ 45'Bgt_M$224 !V$[2HI
5 	1((,??
 &ellM5<<P78Su||T
FBII.,&&''KKt+ $	
 ))%0	1" K4 5!8 	
 L=	1 	1s   B.D..D7	)NNFN)"	functoolsr   activationsr   core_model_loadingr   quantizers.quantizers_utilsr   r   utilsr	   r
   r   r   r   r+   r   
accelerater   r_   fbgemm_gpu.experimental.gen_ai
fbgemm_gpu
get_loggerr;   r   r   r   rE   rB   r   r   rA   r?   r   rC   r   r   <module>r      s        . U  -02 %<)			H	%6` 6`r,ehhoo ,^i6 i6X 11 1 tx:#'9t#3:r   