
    qi
%                        d dl mZ d dlmZ ddlmZmZ ddlmZ ddl	m
Z
 ddlmZ  e       rd d	lZerdd
lmZ  ej                   e      Z G d de
      Zy	)    )annotations)TYPE_CHECKING   )is_torch_availablelogging)
SinqConfig   )HfQuantizer)get_module_from_nameN)PreTrainedModelc                       e Zd ZU dZdZded<   d fdZddZedd       Z	d Z
dd	Zdd
ZddZddZd Zd Z	 d	 	 	 ddZ	 	 ddZ xZS )SinqHfQuantizera  
    HF v5 quantizer for SINQ.

    Modes:
      - method="sinq" (default):
          * weight-only SINQ
          * param-level ConversionOps (`SinqQuantize`) during load for pure language models
            (each Linear.weight is turned into a SINQLinear module)
          * module-level quantization after load for multimodal models
      - method="asinq":
          * A-SINQ (activation-aware) SINQ quantization
    Tbool requires_parameters_quantizationc                B    t        |   |fi | d | _        d| _        y )NF)super__init___normalized_device_str_do_param_level_sinq)selfquantization_configkwargs	__class__s      X/opt/pipecat/venv/lib/python3.12/site-packages/transformers/quantizers/quantizer_sinq.pyr   zSinqHfQuantizer.__init__1   s&    ,7726#*/!    c                     yNT r   s    r   is_serializablezSinqHfQuantizer.is_serializable7   s    r   c                     yr   r   r   s    r   is_trainablezSinqHfQuantizer.is_trainable:   s    r   c                    |\t         j                  j                         r!dt         j                  j                         i}nddi}t        j                  d| d       |S )N cpuz:The device_map was not initialized. Setting device_map to zJ. If you want to use the model for inference, please set device_map='auto')torchcudais_availablecurrent_deviceloggerinfo)r   
device_maps     r   update_device_mapz!SinqHfQuantizer.update_device_map>   sb    zz&&( %**";";"=>
 %[
KK))3 5[[
 r   c                8    |t         j                  }|| _        |S N)r&   bfloat16dtype)r   r1   s     r   update_dtypezSinqHfQuantizer.update_dtypeK   s    =NNE
r   c                   ddl m}  |       st        d      t        j                  j                         st        j                  d       |j                  d      }t        |t              r?t        |j                               }t        |      dkD  rt        dt        |       d      | j                   j"                  d	k(  r| j$                  st'        d
      y y )Nr   )is_sinq_availablezMThe 'sinq' package is not installed. Please install it with: pip install sinqzNo CUDA device is available. Quantization and inference will run on the CPU. Please note that this will significantly slow down inference speed and increase quantization time.r,   r	   zkSinqHfQuantizer: multi-GPU device_map detected, but SINQ currently supports only a single CUDA device. Got z. Please use device_map=None.asinqzYou are using `method='asinq'` in the quantization config. Right now the calibrated version of SINQ is not supported in Hugging Face, please refer and use the official SINQ repository `to quantize a model with this method. )utilsr4   ImportErrorr&   r'   r(   r*   warningget
isinstancedictsetvalueslenRuntimeErrorsortedr   methodpre_quantized
ValueError)r   argsr   r4   r,   device_map_valuess         r   validate_environmentz$SinqHfQuantizer.validate_environmentQ   s    - "mnnzz&&(NN B ZZ-
j$' #J$5$5$7 8$%)"##)*;#<"==Z\ 
 ##**g5d>P>P:  ?Q5r   c                    ddl m} |j                  } |t        |j                        |j
                  t        |j
                        ndddddt        |j                        |      S )zI
        Build the dict that SINQLinear expects as quant_config.
        r   )sinq_base_quant_configNFr	   )nbits
group_size
quant_zeroquant_scaleview_as_floataxistiling_moderA   )sinq.sinqlinear_hfrH   rA   intrI   rJ   strrO   )r   cfgsinq_base_quant_config_fnrA   s       r   _build_sinq_quant_dictz&SinqHfQuantizer._build_sinq_quant_dictm   s[     	[(cii..1nn.Hs3>>*dCOO,	
 		
r   c                    ddl m} | j                  ry| j                  j                  dk(  ry| j
                  syt        ||      \  }}|dk7  ryt        ||      }t        |dd      }|xr | }	|	S )a-  
        Called per-parameter to decide whether to run `SinqQuantize` on it.

        - If `self.pre_quantized`, we do *not* quantize again (handled by SinqDeserialize instead).
        - For method="asinq": return False (ASINQ is not supported in Hugging Face).
        - For method="sinq": True only for SINQLinear.weight not in modules_to_not_convert.

        Note: After _process_model_before_weight_loading(), the modules are already SINQLinear,
        not nn.Linear. We check for SINQLinear modules that are not yet quantized (ready=False).
        r   )
SINQLinearFr5   weightreadyT)	rP   rW   rB   r   rA   r   r   r:   getattr)
r   model
param_namer   rW   moduletensor_nameis_sinqis_readyresults
             r   param_needs_quantizationz(SinqHfQuantizer.param_needs_quantization   s     	2##**g5 ((25*E(" VZ067D1)\r   c                    ddl m}  ||       S )z
        Return the ConversionOps used for param-level quantization (Sinq).
        The actual SINQLinear construction is in integrations/sinq.py.
        r   )SinqQuantize)integrations.sinqrd   )r   rd   s     r   get_quantize_opsz SinqHfQuantizer.get_quantize_ops   s    
 	5D!!r   c                b    ddl m} | j                  rddlm}  |g ddg ||       g      gS g S )a4  
        If `pre_quantized=True`, interpret a checkpoint produced by SINQLinear.state_dict:

            <prefix>.W_q
            <prefix>.bias
            <prefix>.meta

        via a WeightConverter + SinqDeserialize so that we reconstruct a SINQLinear
        module instead of a plain nn.Linear.
        r   )WeightConverter)SinqDeserialize)z.W_qz.metaz.biasz.weight)source_patternstarget_patterns
operations)core_model_loadingrh   rB   re   ri   )r   rh   ri   s      r   get_weight_conversionsz&SinqHfQuantizer.get_weight_conversions   sE     	9;  %
 &/K / 56
 
 	r   c                b   ddl m} | j                  || j                  j                  xs g |      | _        | j                  j
                  dk(  xr | j                   | _        | j                  rdn| j                  | j                        }t        |t              rEt        t        |j                               d      }t        |t              rd| }n.t        |      }n"t         j"                  j%                         rdnd} ||| j                  || j&                  || j                  	      }y)
a  
        Called on meta-initialized model, before loading any weights.

        For SINQ, we replace nn.Linear modules with empty SINQLinear modules here.
        The actual quantization happens later in SinqQuantize.convert() when weights are loaded.
        r   )replace_with_sinq_linearsinqNr   zcuda:zcuda:0r%   )modules_to_not_convertquant_configcompute_dtypedevicerB   )re   rp   get_modules_to_not_convertr   rr   rA   rB   r   rU   r:   r;   nextiterr=   rQ   rR   r&   r'   r(   r1   )	r   r[   r,   keep_in_fp32_modulesr   rp   sinq_quant_dictfirst_device
device_strs	            r   $_process_model_before_weight_loadingz4SinqHfQuantizer._process_model_before_weight_loading   s    	A&*&E&ED,,CCIrL`'
#
 %)$<$<$C$Cv$M$hVZVhVhRh!"&"4"4$$:U:UVZVnVn:o j$'Z%6%6%8 91=L,,$\N3
 .
%*ZZ%<%<%>EJ(#'#>#>(**,,
r   c                     ddl m}  |        |S )aq  
        Called after *all* weights have been loaded.

        For SINQ:
        1. Move non-SINQLinear modules to GPU (embeddings, norms, lm_head, etc.)
           - SINQLinear modules already have GemLite buffers on GPU
           - We skip moving SINQLinear's W_q/meta to avoid memory duplication
        2. Patch HF save/load methods for SINQ serialization
        r   )patch_hf_pretrained_io)
sinq.hf_ior   )r   r[   r   r   s       r   #_process_model_after_weight_loadingz3SinqHfQuantizer._process_model_after_weight_loading   s     	6 	 r   )r   r   )returnr   )r1   torch.dtyper   r   )r   None)rS   r   r   r;   )r[   r   r\   rR   r   r   r/   )r[   r   ry   zlist[str] | None)r[   r   )__name__
__module____qualname____doc__r   __annotations__r   r    propertyr"   r-   r2   rF   rU   rb   rf   rn   r}   r   __classcell__)r   s   @r   r   r   !   s     .2$d10  8
$ D"B 26	)
)
 /	)
Vr   r   )
__future__r   typingr   r6   r   r   utils.quantization_configr   baser
   quantizers_utilsr   r&   modeling_utilsr   
get_loggerr   r*   r   r   r   r   <module>r      sJ    #   / 2  2 0			H	%dk dr   