
    qi1                     $   d dl mZmZ d dlmZmZ ddlmZmZ ddl	m
Z
mZ ddlmZ erddlmZ  e       rd d	lZd d
lmZ neZ ej(                  e      ZdefdZd Z G d de      Z G d de      Zdeej8                  ej:                  gdiZy	)    )ABCabstractmethod)TYPE_CHECKINGAny   )is_torch_availablelogging)QuantizationConfigMixinQuantizationMethod   )get_module_from_namePreTrainedModelN)
ModuleListreturnc                 6   t               }t        | j                        dkD  rGt        | j                  j                               t        | j                  j	                               z  }t        | j                               d   d   h}| j                         }| j                         D ch c]   \  }}|t        |      t        |      k(  r|" }}}||z  |z  }t        |D ch c]  }|j                  d       c}      }t        |      S c c}}w c c}w )z
    Function to automatically detect keys to not convert for usage like quantization. For example for CausalLM modules
    we may want to keep the lm_head in full precision for numerical stability reasons.
    r   z.weight)setlenall_tied_weights_keysvalueskeyslistnamed_parametersget_output_embeddingsnamed_modulesidremovesuffix)	model	tied_keyslast_module_keyoutput_emb_modulenamemoduleoutput_emb_keysmodules_to_not_convertks	            N/opt/pipecat/venv/lib/python3.12/site-packages/transformers/quantizers/base.pyget_keys_to_not_convertr)   "   s    I
5&&'!+33::<=ED_D_DdDdDf@gg	 E2245b9!<=O 335 "//1D&(RZ2>O;P-P 	O 
 '8?J!F\"]1>>)#<"]^&'' #^s   -%D&Dc                 v    ddl m} | j                         D ]   }t        ||      sd|j                  _        " y )Nr   r   T)modeling_utilsr   modules
isinstanceconfig_is_quantized)r   r   r$   s      r(   _assign_is_quantizedr0   =   s0    0--/ /fo.*.FMM'/    c            
          e Zd ZdZdZdefdZd*dZdee	e
f   dz  dee	e
f   dz  fd	Zd
dde	dddefdZdee	ee	z  f   dee	ee	z  f   fdZd
dde	defdZd Zd Zd Zd Zd+d,dZd,dZd,dZd Zd+dZd+dZde	de	fdZe	 	 	 d-d
ddee	   dz  dee	   dz  d efd!       Ze defd"       Z!e defd#       Z"d$ Z#e$d%        Z%e e$d&               Z&d' Z'd( Z(d) Z)y).HfQuantizera  
    Abstract class of the HuggingFace quantizer. Supports for now quantizing HF transformers models for inference and/or quantization.
    This class is used only for transformers.PreTrainedModel.from_pretrained and cannot be easily used outside the scope of that method
    yet.

    Attributes
        quantization_config (`transformers.utils.quantization_config.QuantizationConfigMixin`):
            The quantization config that defines the quantization parameters of your model that you want to quantize.
        requires_calibration (`bool`):
            Whether the quantization method requires to calibrate the model before using it.
    Fquantization_configc                     || _         |j                  dd      | _        | j                  s&| j                  rt	        d|j
                   d      y y )Npre_quantizedTzThe quantization method z does require the model to be pre-quantized. You explicitly passed `pre_quantized=False` meaning your model weights are not quantized. Make sure to pass `pre_quantized=True` while knowing what you are doing.)r4   popr6   requires_calibration
ValueErrorquant_method)selfr4   kwargss      r(   __init__zHfQuantizer.__init__T   s`    #6 #ZZ>!!d&?&?*+>+K+K*L MN O  '@!r1   r   c                     |S )aO  
        Some quantization methods require to explicitly set the dtype of the model to a
        target dtype. You need to override this method in case you want to make sure that behavior is
        preserved

        Args:
            dtype (`torch.dtype`):
                The input dtype that is passed in `from_pretrained`
         )r;   dtypes     r(   update_dtypezHfQuantizer.update_dtype_   s	     r1   
device_mapNc                     |S )a  
        Override this method if you want to pass a override the existing device map with a new
        one. E.g. for bitsandbytes, since `accelerate` is a hard requirement, if no device_map is
        passed, the device_map is set to `"auto"``

        Args:
            device_map (`Union[dict, str]`, *optional*):
                The device_map that is passed through the `from_pretrained` method.
        r?   )r;   rB   s     r(   update_device_mapzHfQuantizer.update_device_mapk   s
     r1   r   r   
param_nameparamtorch.Tensorc                 "    |j                         S N)element_size)r;   r   rE   rF   s       r(   param_element_sizezHfQuantizer.param_element_sizew   s    !!##r1   
max_memoryc                     |S )zaadjust max_memory argument for infer_auto_device_map() if extra memory is needed for quantizationr?   )r;   rL   s     r(   adjust_max_memoryzHfQuantizer.adjust_max_memoryz   s    r1   c                      y)zD
        Check whether a given param needs to be quantized.
        Fr?   )r;   r   rE   r<   s       r(   param_needs_quantizationz$HfQuantizer.param_needs_quantization~   s     r1   c                      y)a&  
        This method is used to potentially check for potential conflicts with arguments that are
        passed in `from_pretrained`. You need to define it for all future quantizers that are integrated with transformers.
        If no explicit check are needed, simply return nothing.
        Nr?   )r;   argsr<   s      r(   validate_environmentz HfQuantizer.validate_environment   s     	r1   c                     |S z"updates the tp plan for the scalesr?   r;   r.   s     r(   update_tp_planzHfQuantizer.update_tp_plan       r1   c                     |S rU   r?   rV   s     r(   update_ep_planzHfQuantizer.update_ep_plan   rX   r1   c                     |S rI   r?   r;   r   r<   s      r(   $_process_model_before_weight_loadingz0HfQuantizer._process_model_before_weight_loading       r1   c                     d|_         | j                  j                  |_        | j                  r| j                  |        | j                  |fi | y)aQ  
        Setting model attributes and/or converting model before weights loading. At this point
        the model should be initialized on the meta device so you can freely manipulate the skeleton
        of the model in order to replace modules in-place. Make sure to override the abstract method `_process_model_before_weight_loading`.

        Args:
            model (`~transformers.PreTrainedModel`):
                The model to quantize
            kwargs (`dict`, *optional*):
                The keyword arguments that are passed along `_process_model_before_weight_loading`.
        TN)is_quantizedr4   r:   quantization_methodr6   _convert_model_for_quantizationr]   )r;   r   r@   r<   s       r(   preprocess_modelzHfQuantizer.preprocess_model   sL     "$($<$<$I$I!007111%B6Br1   c                     |S rI   r?   r\   s      r(   #_process_model_after_weight_loadingz/HfQuantizer._process_model_after_weight_loading   r^   r1   c                     | j                   |j                  _         | j                  r)t        | j                   dd      r| j	                  |       nt        |        | j                  |fi |S )a  
        Post-process the model post weights loading.
        Make sure to override the abstract method `_process_model_after_weight_loading`.

        Args:
            model (`~transformers.PreTrainedModel`):
                The model to quantize
            kwargs (`dict`, *optional*):
                The keyword arguments that are passed along `_process_model_after_weight_loading`.
        
dequantizeF)r4   r.   r6   getattrremove_quantization_configr0   re   r\   s      r(   postprocess_modelzHfQuantizer.postprocess_model   s_     ,0+C+C('$*B*BLRW"X++E2 '7t77HHHr1   c                     t        |d      r|`t        |j                  d      r|j                  `t        |d      r|`d|_        y)z@
        Remove the quantization config from the model.
        hf_quantizerr4   ra   FN)hasattrrl   r.   r4   ra   r`   r;   r   s     r(   ri   z&HfQuantizer.remove_quantization_config   sF     5.)"5<<!6705/0)"r1   c                 ~    ||j                   j                  }| j                  ||      }| j                  |       |S )z
        Potentially dequantize the model to retrieve the original model, with some loss in accuracy / performance.
        Note not all quantization schemes support this.
        )r@   )r.   r@   _dequantizeri   r;   r   r@   s      r(   rg   zHfQuantizer.dequantize   sB    
 = LL&&E  e 4''.r1   c                 F    t        | j                  j                   d      )NzH has no implementation of `dequantize`, please raise an issue on GitHub.NotImplementedErrorr4   r:   rq   s      r(   rp   zHfQuantizer._dequantize   s'    !''4455}~
 	
r1   c                     |S )zN
        Override this method if you want to adjust the `param_name`.
        r?   )r;   rE   s     r(   get_param_namezHfQuantizer.get_param_name   s
     r1   skip_moduleskeep_in_fp32_modulesadd_default_skipsc                     ||rt        |       }ng }||j                  |       ||j                  |       t        t        |            }|S rI   )r)   extendr   r   )r   rw   rx   ry   r&   s        r(   get_modules_to_not_convertz&HfQuantizer.get_modules_to_not_convert   s^     #4%<U%C"%'"#")),7+"))*>?!%c*@&A!B%%r1   c                      y)zUFlag indicating whether the quantized model can carry out quantization aware trainingFr?   r;   s    r(   is_qat_trainablezHfQuantizer.is_qat_trainable        r1   c                      y)z;Flag indicating whether the quantized model can be compiledFr?   r~   s    r(   is_compileablezHfQuantizer.is_compileable   r   r1   c                 
    di fS )zcGet state dict and metadata. Useful when we need to modify a bit the state dict due to quantizationNr?   rn   s     r(   get_state_dict_and_metadataz'HfQuantizer.get_state_dict_and_metadata  s    Rxr1   c                      y rI   r?   r~   s    r(   is_serializablezHfQuantizer.is_serializable	  s    "r1   c                      y rI   r?   r~   s    r(   is_trainablezHfQuantizer.is_trainable  s    r1   c                    |j                         D ]  \  }}|j                  j                  }|t        v s%| j                  j
                  t        |   d   v sHt        j                  d      5  t        ||      \  }}t        |   d   |j                  j                               |j                  |<   d d d         y # 1 sw Y   xY w)Nquantization_methodsmetamodule_name)r   	__class____name__!MODULES_TO_PATCH_FOR_QUANTIZATIONr4   r:   torchdevicer   r.   get_text_config_modules)r;   r   r#   r$   module_class_nameparent_modules         r(   rb   z+HfQuantizer._convert_model_for_quantization  s    !//1 
	LD& & 0 0 9 9 $EE((5545FGH^_` \\&) *>ud*K'M43TUf3ghu3v4464M**40 
	 s   .AB??C	c                 F    t        | j                  j                   d      )Nz1 is not available yet and will be supported soon.rs   r~   s    r(   get_quantize_opszHfQuantizer.get_quantize_ops  s'    !''4455fg
 	
r1   c                     g S rI   r?   r~   s    r(   get_weight_conversionsz"HfQuantizer.get_weight_conversions"  s    	r1   )r@   torch.dtyper   r   rI   )r   r   )NNF)*r   
__module____qualname____doc__r8   r
   r=   rA   dictstrr   rD   floatrK   intrN   boolrP   rS   rW   rZ   r]   rc   re   rj   ri   rg   rp   rv   staticmethodr   r|   propertyr   r   r   r   r   r   rb   r   r   r?   r1   r(   r3   r3   E   s   
 !	,C 	

DcNT,A 
d3PS8nW[F[ 
$(9 $s $Sa $fk $DcCi,@ T#sUXy.EY .? S _c C$I(
#

    *.15"'	& &3i$&& #3i$.&  	& &* $      " "  

r1   r3   c                   2     e Zd ZdZ fdZ	 	 	 	 ddZ xZS )SequentialLlama4TextExpertsz
    A module that implements a compressed version of a list of expert modules.
    This is specifically designed to work with Llama4TextExperts in MoE layers.
    c                     ddl m} t        |   t	        |j
                        D cg c]
  } ||       c}       |j
                  | _        y c c}w )Nr   )Llama4TextMLP)*transformers.models.llama4.modeling_llama4r   superr=   rangenum_local_expertsnum_experts)r;   r.   r   _r   s       r(   r=   z$SequentialLlama4TextExperts.__init__,  sB    Lv?W?W9XYA-/YZ!33 Zs   Ac                     |j                  | j                  d|j                  d         }t        j                  |      }t        | j                        D ]  } | |   ||         ||<    |S )Nr   )reshaper   shaper   
zeros_liker   )r;   hidden_states
routed_out
expert_idxs       r(   forwardz#SequentialLlama4TextExperts.forward2  sw     &--d.>.>MDWDWXZD[\%%m4
 0 01 	QJ%5T*%5mJ6O%PJz"	Qr1   )r   rG   r   rG   )r   r   r   r   r=   r   __classcell__)r   s   @r(   r   r   &  s$    
4% 
r1   r   Llama4TextExperts)r   r   )abcr   r   typingr   r   utilsr   r	   utils.quantization_configr
   r   quantizers_utilsr   r+   r   r   torch.nnr   r   
get_logger__file__loggerr   r)   r0   r3   r   COMPRESSED_TENSORSBITS_AND_BYTESr   r?   r1   r(   <module>r      s    $ % / S 2 0#J			H	%(d (6/^# ^B* 0 211--!
% !r1   