
    qi(              
       z   d Z ddlmZ ddlmZ ddlmZmZ  e       r
ddlZddl	m
Z
  ej                  e      Zdad Z G d d	e
j                         Z	 	 	 dd
ee   dz  defdZdej,                  dedefdZdej,                  dej,                  dej,                  dedef
dZ G d de      Z G d de      Zy)a  
Metal affine quantization integration for transformers.

This module provides:
  - ``MetalLinear``: a drop-in replacement for ``nn.Linear`` that stores weights
    as affine-quantized uint32 packed tensors and uses the ``quantization-mlx``
    Metal kernels for the forward pass.
  - ``replace_with_metal_linear``: walks a model and swaps every eligible
    ``nn.Linear`` with ``MetalLinear``.
  - ``MetalQuantize`` / ``MetalDequantize``: weight conversion operations that
    participate in the new ``WeightConverter`` pipeline.

Weight layout (transposed, matching ``affine_qmm_t``):
  - ``weight``: ``[N, K_packed]`` (``uint32``) -- K is the packed dimension.
  - ``scales``:  ``[N, K // group_size]`` (``float16 / bfloat16``)
  - ``qbiases``: ``[N, K // group_size]`` (same dtype as scales)

The kernel call is ``affine_qmm_t(x, weight, scales, qbiases, group_size, bits)``
which computes ``y = x @ dequant(weight).T``, identical to ``nn.Linear``.
   )ConversionOps)should_convert_module)is_torch_availablelogging    Nc                      t         	 ddlm}   | d      a t         S t         S # t        $ r}t	        d| d      |d}~ww xY w)z>Lazily load the quantization-mlx kernel from Hugging Face Hub.N   )
get_kernelz0kernels-community/mlx-quantization-metal-kernelsz9Failed to load the quantization-mlx kernel from the Hub: zm. Make sure you have `kernels` installed (`pip install kernels`) and are running on an Apple Silicon machine.)_metal_kernelhub_kernelsr
   	ExceptionImportError)r
   es     ^/opt/pipecat/venv/lib/python3.12/site-packages/transformers/integrations/metal_quantization.py_get_metal_kernelr   3   s`     		/&'YZM =  	KA3 O? ? 		s   " 	A ;A c                       e Zd ZdZdej
                  ddfdedededed	ef
d
Zdej                  dej                  fdZ
y)MetalLinearz
    A quantized linear layer that stores weights in affine uint32 packed format
    and uses the ``quantization-mlx`` Metal kernels for the forward pass.

    Parameters match ``nn.Linear`` with additional quantization metadata.
    F      in_featuresout_featuresbiasbits
group_sizec                 :   t         j                  j                  |        || _        || _        || _        || _        d|z  }||z  }||z  }	|t        j                  k(  rAt        j                  t        j                  ||t        j                        d      | _        n2t        j                  t        j                  |||      d      | _        |t        j                  k(  rt        j                  nd }
t        j                  t        j                  ||	|
      d      | _        t        j                  t        j                  ||	|
      d      | _        |r.t        j                  t        j                  |            | _        y | j!                  dd        y )N    )dtypeF)requires_gradr   )nnModule__init__r   r   r   r   torchuint32	Parameterzerosweightfloat32scalesqbiasesr   register_parameter)selfr   r   r   r   r   r   elems_per_intk_packedn_groupsscales_dtypes              r   r!   zMetalLinear.__init__Q   s'    			4 &(	$d
-/*,ELL ,,u{{<QVQ]Q]'^nstDK,,u{{<TY'ZjopDK(-(=u}}4ll5;;|X\#Zjop||EKKhl$[kpqU[[%>?DI##FD1    inputreturnc                    | j                   j                  t        j                  k7  r5t        j
                  j                  || j                   | j                        S t               }|j                  || j                   | j                  j                  |j                        | j                  j                  |j                        | j                  | j                        }| j                  || j                  z   }|S N)r&   r   r"   r#   r   
functionallinearr   r   affine_qmm_tr(   tor)   r   r   )r+   r1   kerneloutputs       r   forwardzMetalLinear.forwards   s    ;;,==''t{{DIIFF"$$$KKKKNN5;;'LLOOEKK(OOII
 99 dii'Fr0   N)__name__
__module____qualname____doc__r"   r#   intboolr!   Tensorr;    r0   r   r   r   I   sj     ll 2 2  2 	 2  2  2DU\\ ell r0   r   modules_to_not_convertpre_quantizedc           
         |j                   r| S |j                  }|j                  }d}| j                         D ]z  \  }}t	        ||      st        |t        j                        s.|ri nddi}	t        d|j                  |j                  |j                  du||d|	}
| j                  ||
       d}| |st        j                  d       | S )a`  
    Replace every eligible ``nn.Linear`` with ``MetalLinear``.

    Args:
        model: the ``PreTrainedModel`` (on the meta device at this point).
        modules_to_not_convert: module names to leave untouched.
        quantization_config: the ``MetalConfig`` instance.
        pre_quantized: ``True`` when loading from a quantized checkpoint.
    Fr   N)r   r   r   r   r   TzYou are loading a model with Metal quantization but no nn.Linear modules were found. Please double check your model architecture.rC   )
dequantizer   r   named_modulesr   
isinstancer   Linearr   r   r   r   set_submoduleloggerwarning)modelrD   quantization_configrE   r   r   has_been_replacedmodule_namemodulemodule_kwargs
new_modules              r   replace_with_metal_linearrU      s     %%##D$//J$224 %V$[2HIfbii("/Bgt_M$ "..#00[[,%  J Z8 $!%$ ;	

 Lr0   r&   r   r   c                 
   | j                   \  }}d|z  }d|z  dz
  }||z  }| j                         j                  |||      }|j                  d      j                  }	|j                  d      j                  }
|
|	z
  |z  j                  d      }|	}||j                  d      z
  |j                  d      z  }|j                         j                  d|      j                  t        j                        j                  ||      }||z  }t        j                  ||t        j                  | j                        }t        |      D ]  }||d	d	|d	|f   ||z  z  z  } |j                  t        j                        ||fS )
aP  
    Quantize a 2-D float weight ``[N, K]`` into packed uint32 + scales + biases.

    Returns ``(w_packed, scales, biases)`` with:
      - ``w_packed``: ``[N, K // (32 // bits)]`` uint32
      - ``scales``:   ``[N, K // group_size]`` float32/float16/bfloat16
      - ``biases``:   ``[N, K // group_size]`` float32/float16/bfloat16
    r   r	   )dimg:0yE>)minr   r   deviceN)shapefloatreshaperY   valuesmaxclamp	unsqueezeroundr8   r"   int32r%   r[   ranger#   )r&   r   r   NKr,   max_valr.   	w_groupedw_minw_maxr(   biasesw_intr-   w_packedis                    r   _affine_quantize_tensorrp      sm    <<DAq$JMDyAoGJH&&q(J?IMMbM!((EMMbM!((Eu}'..4.8FF))"--1A1A"1EEEKKM7+..u{{;CCAqIE M!H{{1hekk&--PH=! =E!Q---.4!8<<= ;;u||$ff44r0   rn   r(   rl   c                 2   | j                   d   }d|z  }d|z  dz
  }| j                   d   |z  }| j                  t        j                        }	t        j                  ||t        j
                  | j                        }
t        |      D ]%  }|	||z  z	  |z  j                         |
dd|d|f<   ' |
j                  |d|      }||j                         j                  d      z  |j                         j                  d      z   }|j                  ||      S )zv
    Dequantize a packed uint32 weight ``[N, K_packed]`` back to float.

    Returns a ``[N, K]`` float32 tensor.
    r   r   r	   rZ   NrW   )r\   r8   r"   rd   r%   r'   r[   re   r]   r^   rb   )rn   r(   rl   r   r   rf   r,   rh   rg   
w_packed_iw_flatro   ri   w_deqs                 r   _affine_dequantize_tensorru      s    	qA$JMDyAoGqM)AU[[)J[[AU]]8??KF=! U(2tax(@G'K&R&R&Tq!"]""#U q"j1I0044v||~7O7OPR7SSE==Ar0   c                   &    e Zd ZdZd ZdedefdZy)MetalQuantizez
    Quantize a full-precision weight tensor into (weight, scales, qbiases).

    Used during quantize-on-the-fly.  The float ``weight`` is replaced in-place
    by the packed uint32 tensor.
    c                     || _         y r4   hf_quantizerr+   rz   s     r   r!   zMetalQuantize.__init__   
    (r0   
input_dictr2   c                    t        t        |j                                     \  }}t        |t              r|d   n|}| j
                  j                  j                  }| j
                  j                  j                  }t        |||      \  }}}	d|v r|j                  dd      d   nd}
|
r|
 dnd}|
r|
 dnd}|j                  }||||j                  |      ||	j                  |      iS )	Nr   .r	    z.scalesr(   z.qbiasesr)   )nextiteritemsrI   listrz   rO   r   r   rp   rsplitr   r8   )r+   r}   kwargs
target_keyvaluer   r   rn   r(   rl   base	scale_keybias_key
orig_dtypes                 r   convertzMetalQuantize.convert   s     j&6&6&8!9:
E&ud3a  4499&&::EE
#:5*d#S &&/2j/@z  a(+b(,tfG$(	(,dV8$)[[
vyy,fii
+
 	
r0   N)r<   r=   r>   r?   r!   dictr   rC   r0   r   rw   rw      s    )
$ 
T 
r0   rw   c                   2    e Zd ZdZd Zddededz  defdZy)	MetalDequantizez
    Dequantize (weight, scales, qbiases) back to a full-precision tensor.

    Used when ``dequantize=True`` is set in the config to fall back to a normal
    ``nn.Linear`` on devices without MPS.
    c                     || _         y r4   ry   r{   s     r   r!   zMetalDequantize.__init__  r|   r0   Nr}   full_layer_namer2   c                 4   | j                   j                  j                  }| j                   j                  j                  }t	        |      dk  r||d   iS |d   d   }|d   d   }|d   d   }t        |||||      }	||	j                  |j                        iS )Nr   zweight$r   r(   r)   )rz   rO   r   r   lenru   r8   r   )
r+   r}   r   r   r   r   	quantizedr(   r)   rt   s
             r   r   zMetalDequantize.convert  s      4499&&::EE
z?Q#Z	%:;;y)!,	H%a(Y'*))VWjRVW&,,!788r0   r4   )r<   r=   r>   r?   r!   r   strr   rC   r0   r   r   r     s+    )9$ 9t 9Y] 9r0   r   )NNF)r?   core_model_loadingr   quantizers.quantizers_utilsr   utilsr   r   r"   torch.nnr   
get_loggerr<   rL   r   r   rJ   r   r   r   rA   rU   rB   r@   rp   ru   rw   r   rC   r0   r   <module>r      s   * / ? /  
		H	%,;")) ;@ 04	/ I,/ 	/d5ELL 5c 5 5Bll$)LL:?,,TW_b.
M 
@9m 9r0   