
    qiL1                         d dl mZ ddlmZ erddlmZ ddlmZmZm	Z	m
Z
mZ ddlmZ  e	       r
d dlZdd	lmZ  ej"                  e      ZdZ G d
 de      Zy)    )TYPE_CHECKING   )HfQuantizer   )PreTrainedModel)is_accelerate_availableis_kernels_availableis_torch_availableis_triton_availablelogging)get_module_from_nameN)WeightConverterc                        e Zd ZdZdZ fdZd Zd Zddded	e	fd
Z
ddZ	 dddde	fdZd Zd Zd Zd Zed	e	fd       Zd Zd Z xZS )Mxfp4HfQuantizerz/
    FP4 quantization using fbgemm kernels
    Fc                 4    t        |   |fi | d | _        y N)super__init__triton_kernels_hub)selfquantization_configkwargs	__class__s      Y/opt/pipecat/venv/lib/python3.12/site-packages/transformers/quantizers/quantizer_mxfp4.pyr   zMxfp4HfQuantizer.__init__0   s    ,77"&    c                     | j                    	 ddlm}  |d      | _         | j                   S | j                   S # t        $ r t        d      w xY w)z3Lazy import and initialize kernels only when neededr   )
get_kernelz(kernels-community/gpt-oss-triton-kernelsz2kernels package is required for MXFP4 quantization)r   integrations.hub_kernelsr   ImportError)r   r   s     r   _lazy_import_kernelsz%Mxfp4HfQuantizer._lazy_import_kernels4   s]    ""*XA*45_*`' &&&t&&&  X!"VWWXs	   9 Ac                    t               st        d      | j                  j                  ry t	               st        d      t
        j                  j                         xs t        j                  d      }|j                  dvrF| j                  r+t        j                  d| d       d| j                  _        y t        d| d	      t
        j                  j                         rd}t!        d
      xr
 t#               }nt
        j$                  j                         r;t
        j$                  j'                         }|dk\  }t!        d      xr
 t#               }n-|j                  dk(  rd}t!        d
      xr
 t#               }nd}d}| j                  rR|s't        j                  d       d| j                  _        y |sAt        j                  d       d| j                  _        y |st)        d      |st)        d      | j                  s| j+                          |j-                  d      }|<t/        |t0              r+| j                  sd|j3                         v rt)        d      y y y y )NzqUsing mxfp4 quantization requires torchPlease install the latest version of torch ( pip install --upgrade torch )z9Using mxfp4 requires Accelerate: `pip install accelerate`cpu)cudaxpur"   zGUsing MXFP4 quantized models requires model on cuda/xpu/cpu, but found zj, we will default to dequantizing the model to bf16. To use mxfp4, please disable the current accelerator.TzIQuantizing a model using MXFP4 requires model on cuda/xpu/cpu, but found z7. To use mxfp4, please disable the current accelerator.z3.5.0)      z3.4.0Fu   MXFP4 quantization is only supported on GPUs with compute capability >= 7.5 (e.g T4, A100, L4, H100, or B200) or XPUs (e.g Intel® Data Center GPU Max Series) We will default to dequantizing the model to bf16.zMXFP4 quantization requires Triton and kernels installed: CUDA requires Triton >= 3.4.0, XPU requires Triton >= 3.5.0, we will default to dequantizing the model to bf16u   MXFP4 quantization is only supported on GPUs with compute capability >= 7.5 (e.g T4, A100, L4, H100, or B200) or XPUs (e.g Intel® Data Center GPU Max Series) or CPUzyMXFP4 quantization requires Triton and kernels installed: CUDA requires Triton >= 3.4.0, XPU/CPU requires Triton >= 3.5.0
device_mapdiskzYou are attempting to load an FP4 model with a device_map that contains a disk device.This is not supported when the model is quantized on the fly. Please use a quantized checkpoint or remove the disk device from the device_map.)r
   r   r   
dequantizer   torchacceleratorcurrent_acceleratordevicetypepre_quantizedloggerwarning_onceRuntimeErrorr$   is_availabler   r	   r#   get_device_capability
ValueErrorr    get
isinstancedictvalues)r   argsr   r-   is_device_supported_mxfp4kernels_availablecompute_capabilityr'   s           r   validate_environmentz%Mxfp4HfQuantizer.validate_environment?   sp   !#] 
 ##..&(YZZ""668OELL<O;;44!!##]^d]e  fP  Q 7;((3"_`f_g  h_  `  99!!#(,% 3G < WAUAWZZ$$&!&!A!A!C(:f(D% 3G < WAUAW[[E!(,% 3G < WAUAW(-% %,##I 7;((3$##  7;((3* x  # L  !!%%'ZZ-
!jT&B%%&J4E4E4G*G g  +H% 'C!r   modelr   
param_namereturnc                 R    ddl m} t        ||      \  }}t        ||      r|dv ryyy)Nr   Mxfp4GptOssExperts)down_proj_biasgate_up_proj_biasFT)integrationsrD   r   r7   )r   r?   r@   r   rD   moduletensor_names          r   param_needs_quantizationz)Mxfp4HfQuantizer.param_needs_quantization   s3    525*Ef01EEr   c                     t         j                  j                         rt         j                  j                          y t         j                  j                         rt         j                  j                          y y r   )r*   r#   r3   empty_cacher$   )r   r?   r   s      r   #_process_model_after_weight_loadingz4Mxfp4HfQuantizer._process_model_after_weight_loading   sG    ::""$JJ""$YY##%II!!# &r   use_kernelsc                    ddl m} t        j                  j	                         xs t        j
                  d      }|r4|j                  dvr&t        j                  d       d| j                  _
        |s4|j                  dv r&t        j                  d       d| j                  _
        | j                  || j                  j                  |j                        | _         ||| j                  | j                        }y )	Nr   )replace_with_mxfp4_linearr"   )r"   zYou are using full precision kernels, we will dequantize the model to bf16. To use the quantized model with quantization kernels, please set use_kernels=FalseTzMXFP4 inference on CPU requires use_kernels=True, but use_kernels is disabled. We will dequantize the model to bf16. To run MXFP4 natively on CPU, please set use_kernels=True.)modules_to_not_convertr   )rG   rP   r*   r+   r,   r-   r.   r0   r1   r   r)   get_modules_to_not_convertrQ   _keep_in_fp32_modules)r   r?   rN   r   rP   r-   s         r   $_process_model_before_weight_loadingz5Mxfp4HfQuantizer._process_model_before_weight_loading   s     	= ""668OELL<O6;;g5e 37D$$/v{{g5s 37D$$/&*&E&E4++BBED_D_'
# *$*E*E[_[s[s
r   c                     d|j                   j                  v r-t        |dd        |j                  j	                  ddddd       |S )NGptOssConfigbase_model_tp_plangrouped_gemmz(layers.*.mlp.experts.gate_up_proj_blocksz(layers.*.mlp.experts.gate_up_proj_scalesz%layers.*.mlp.experts.down_proj_blocksz%layers.*.mlp.experts.down_proj_scales)r   __name__getattrrW   updater   configs     r   update_tp_planzMxfp4HfQuantizer.update_tp_plan   R    V--666v3T:F))00DRDRAOAO	 r   c                     d|j                   j                  v r-t        |dd        |j                  j	                  ddddd       |S )NrV   base_model_ep_planrX   rY   )r   rZ   r[   rb   r\   r]   s     r   update_ep_planzMxfp4HfQuantizer.update_ep_plan   r`   r   c                 0   ddl m} |j                         }t        |j                  dd      }t        |j                  dd      }|j                         D ]9  \  }}t        ||      rt        |d      rt        |d      s,d	D ]  }t        ||      }	t        || d
      }
|	j                  j                  j                  |	j                  j                        j                  dd      }|dk(  r|j                  |ddd      }n|j                  ||dd      }|
j                  j                  j                  j                  |
j                  j                  j                        j                  dd      }||| d| d<   ||| d| d<    < i }||fS )Nr   rC   num_local_experts    hidden_sizei@  gate_up_proj	down_proj)rh   ri   _precision_configZ      ._blocks_scales)rG   rD   
state_dictr[   r^   named_modulesr7   hasattrstoragelayoutunswizzle_datadata	transposereshapeweight_scale)r   r?   rD   rr   re   rg   namerH   projtriton_tensorprecision_configblocksscalesmetadatas                 r   get_state_dict_and_metadataz,Mxfp4HfQuantizer.get_state_dict_and_metadata   s   5%%'
#ELL2ErJellM4@!//1 	=LD&6#56FN3FK05 = ' 5#*6dV;L3M#N &..55DD]EZEZE_E_`jjkmoqr>)#^^,=r2rJF#^^,={BPRSF)66>>EETT$1199>>)B#  7=
dV1TF'236<
dV1TF'23=	=2 8##r   c                      y)NT r   s    r   is_serializablez Mxfp4HfQuantizer.is_serializable   s    r   c                 .    t         j                  d       y)NzMXFP4 quantization don't support training, please consider dequantizing the model first by passing quantization_config=Mxfp4Config(dequantize=True) to .from_pretrained()F)r0   r1   r   s    r   is_trainablezMxfp4HfQuantizer.is_trainable   s     x	
 r   c                     ddl m}  ||       S )Nr   )Mxfp4Quantize)integrations.mxfp4r   )r   r   s     r   get_quantize_opsz!Mxfp4HfQuantizer.get_quantize_ops  s    6T""r   c                    ddl m}m} | j                  rE| j                  j
                  r/t        ddgd ||       g      t        ddgd	g ||       g      gS t        ddgd	 ||       g      t        ddgd ||       g      gS )
Nr   )Mxfp4DequantizeMxfp4Deserializedown_proj_blocksdown_proj_scalesz
down_proj$)source_patternstarget_patterns
operationsgate_up_proj_blocksgate_up_proj_scaleszgate_up_proj$)r   r   r   r/   r   r)   r   )r   r   r   s      r   get_weight_conversionsz'Mxfp4HfQuantizer.get_weight_conversions  s    J$":":"E"E%79K$L$1 / 56
  %:<Q$R%4$5 / 56  !68M N 0,T23
 !35G H -,T23
 	
r   )r?   r   )F)rZ   
__module____qualname____doc__requires_calibrationr   r    r>   strboolrJ   rM   rT   r_   rc   r   r   propertyr   r   r   __classcell__)r   s   @r   r   r   )   s     !'	'M^.? S _c $ "
 
 
B!$F d  #

r   r   )typingr   baser   modeling_utilsr   utilsr   r	   r
   r   r   quantizers_utilsr   r*   core_model_loadingr   
get_loggerrZ   r0   r   r   r   r   r   <module>r      sX    !  0  3 4			H	% 
{ 
r   