from typing import TYPE_CHECKING

from .base import HfQuantizer


if TYPE_CHECKING:
    from ..modeling_utils import PreTrainedModel

from ..utils import (
    is_accelerate_available,
    is_fbgemm_gpu_available,
    is_kernels_available,
    is_torch_available,
    is_torch_cuda_available,
    is_torch_xpu_available,
    logging,
)
from .quantizers_utils import get_module_from_name


if is_torch_available():
    import torch

logger = logging.get_logger(__name__)


class FbgemmFp8HfQuantizer(HfQuantizer):
    """
    FP8 quantization using fbgemm kernels
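
    Requires a CUDA GPU with compute capability >= 9.0 (e.g. H100) or an XPU device,
    with fbgemm-gpu (CUDA) or kernels (XPU) and accelerate installed. Weights are
    stored in FP8 (one byte per element) and the compute dtype is forced to bfloat16.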
    """

    requires_calibration = False

    def __init__(self, quantization_config, **kwargs):
        super().__init__(quantization_config, **kwargs)

    def validate_environment(self, *args, **kwargs):
        if not is_torch_cuda_available() and not is_torch_xpu_available():
            raise ImportError("Using fbgemm fp8 quantization requires a GPU or XPU")

        if is_torch_xpu_available() and not is_kernels_available():
            raise ImportError("Using FP8 fbgemm on XPU requires kernels (`pip install kernels`)")

        if is_torch_cuda_available() and not is_fbgemm_gpu_available():
            raise ImportError(
                "Loading an FP8 fbgemm quantized model on CUDA requires the fbgemm-gpu library. "
                "Please install the latest version of fbgemm-gpu by following: "
                "https://pytorch.org/FBGEMM/fbgemm_gpu-development/InstallationInstructions.html#fbgemm-gpu-install-libraries"
            )

        if not is_accelerate_available():
            raise ImportError("Loading an FP8 quantized model requires accelerate (`pip install --upgrade accelerate`)")

        if is_torch_cuda_available():
            compute_capability = torch.cuda.get_device_capability()
            major, _ = compute_capability
            if major < 9:
                raise ValueError(
                    "FP8 quantized models is only supported on GPUs with compute capability >= 9.0 (e.g H100)"
                )

        device_map = kwargs.get("device_map")
        if device_map is None:
            logger.warning_once(
                "You have loaded an FP8 model on CPU and have a CUDA/XPU device available, make sure to set "
                "your model on a GPU/XPU device in order to run your model. To remove this warning, pass "
                "device_map = 'cuda' or 'xpu' or 'auto'."
            )
        elif isinstance(device_map, dict) and not self.pre_quantized:
            if "cpu" in device_map.values() or "disk" in device_map.values():
                raise ValueError(
                    "You are attempting to load an FP8 model with a device_map that contains a CPU or disk device. "
                    "This is not supported when the model is quantized on the fly. Please use a quantized checkpoint "
                    "or remove the CPU or disk device from the device_map."
                )

    def update_dtype(self, dtype: "torch.dtype") -> "torch.dtype":
        if dtype != torch.bfloat16:
            logger.warning_once(
                f"Setting dtype to {dtype}, but only bfloat16 is supported right now. Overwriting torch_dtype to bfloat16."
            )
            dtype = torch.bfloat16
        return dtype

    def param_needs_quantization(self, model: "PreTrainedModel", param_name: str, **kwargs) -> bool:
        from ..integrations import FbgemmFp8Linear, FbgemmFp8Llama4TextExperts

        module, tensor_name = get_module_from_name(model, param_name)
        if isinstance(module, FbgemmFp8Linear):
            # Pre-quantized checkpoints and bias tensors are loaded as-is.
            if self.pre_quantized or tensor_name == "bias":
                return False
            return True
        elif isinstance(module, FbgemmFp8Llama4TextExperts):
            if self.pre_quantized or tensor_name == "bias":
                return False
            return True
        return False

    def param_element_size(self, model: "PreTrainedModel", param_name: str, param: "torch.Tensor", **kwargs) -> float:
        """Return the element size (in bytes) for `param_name`."""
        if self.param_needs_quantization(model, param_name):
            # FP8 weights take one byte per element.
            return 1
        return super().param_element_size(model, param_name, param)

    def _process_model_before_weight_loading(
        self,
        model: "PreTrainedModel",
        **kwargs,
    ):
        from ..integrations import replace_with_fbgemm_fp8_linear

        self.modules_to_not_convert = self.get_modules_to_not_convert(
            model, self.quantization_config.modules_to_not_convert, model._keep_in_fp32_modules
        )

        model = replace_with_fbgemm_fp8_linear(
            model,
            modules_to_not_convert=self.modules_to_not_convert,
            quantization_config=self.quantization_config,
            pre_quantized=self.pre_quantized,
            tp_plan=model._tp_plan,
        )

    def _process_model_after_weight_loading(self, model: "PreTrainedModel", **kwargs):
        """
        Force update the input scale upper bound after weight loading and device dispatch are complete.
        This resolves issues where persistent buffers are zeroed out or overwritten during the loading process.
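        The scale is refilled from `quantization_config.activation_scale_ub` on every fbgemm FP8
        module that exposes an `input_scale_ub` buffer.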
        """
        from ..integrations.fbgemm_fp8 import FbgemmFp8Linear, FbgemmFp8Llama4TextExperts

        for module in model.modules():
            if isinstance(module, (FbgemmFp8Linear, FbgemmFp8Llama4TextExperts)):
                if hasattr(module, "input_scale_ub"):
                    module.input_scale_ub.fill_(self.quantization_config.activation_scale_ub)

        return model

    def update_tp_plan(self, config):
        if "Llama4" in config.__class__.__name__:
            text_plan = {
                "layers.*.self_attn.q_proj.weight": "colwise",
                "layers.*.self_attn.q_proj.weight_scale": "colwise",
                "layers.*.self_attn.k_proj.weight": "colwise",
                "layers.*.self_attn.k_proj.weight_scale": "colwise",
                "layers.*.self_attn.v_proj.weight": "colwise",
                "layers.*.self_attn.v_proj.weight_scale": "colwise",
                "layers.*.self_attn.o_proj.weight": "rowwise",
                "layers.*.input_layernorm.weight": "sequence_parallel",
                "layers.*.post_attention_layernorm.weight": "sequence_parallel",
                "norm.weight": "sequence_parallel",
                "layers.*.feed_forward.shared_expert.gate_proj.weight": "colwise",
                "layers.*.feed_forward.shared_expert.gate_proj.weight_scale": "colwise",
                "layers.*.feed_forward.shared_expert.up_proj.weight": "colwise",
                "layers.*.feed_forward.shared_expert.up_proj.weight_scale": "colwise",
                "layers.*.feed_forward.shared_expert.down_proj.weight": "rowwise",
                "layers.*.feed_forward.experts.*.gate_proj.weight": "colwise",
                "layers.*.feed_forward.experts.*.gate_proj.weight_scale": "colwise",
                "layers.*.feed_forward.experts.*.up_proj.weight": "colwise",
                "layers.*.feed_forward.experts.*.up_proj.weight_scale": "colwise",
                "layers.*.feed_forward.experts.*.down_proj.weight": "rowwise",
                "layers.*.feed_forward.experts.gate_up_proj": "packed_rowwise",
                "layers.*.feed_forward.experts.gate_up_proj_scale": "packed_rowwise",
                "layers.*.feed_forward.experts.down_proj": "rowwise",
            }
            if config.get_text_config() is not None:
                config.get_text_config().base_model_tp_plan = text_plan
            else:
                config.base_model_tp_plan = text_plan
            return config
        return config

    def is_serializable(self):
        return True

    @property
    def is_trainable(self) -> bool:
        return False

    def get_quantize_ops(self):
        from ..integrations.fbgemm_fp8 import FbgemmFp8Quantize

        return FbgemmFp8Quantize(self)
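

# Usage sketch (illustrative only, not part of this module): the quantizer is selected
# automatically by `from_pretrained` when a `FbgemmFp8Config` is passed; the model id
# below is a placeholder assumption.
#
#     import torch
#     from transformers import AutoModelForCausalLM, FbgemmFp8Config
#
#     model = AutoModelForCausalLM.from_pretrained(
#         "meta-llama/Llama-3.1-8B",  # placeholder checkpoint
#         quantization_config=FbgemmFp8Config(),
#         device_map="auto",  # CPU/disk offload is rejected by validate_environment
#         torch_dtype=torch.bfloat16,  # any other dtype is coerced by update_dtype
#     )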