
    qiGO                        d dl mZ d dlZd dlmZ ddlmZ ddlm	Z	 ddl
mZmZ ddlmZmZ ddlmZ dd	lmZmZmZ d
dlmZmZmZmZmZmZmZ d
dlmZ d
dl m!Z!m"Z"  ejF                  e$      Z% G d de	      Z& G d de      Z' G d de!      Z( G d de      Z) G d de      Z* G d de      Z+ G d de      Z,e G d de             Z- G d de      Z. G d  d!e      Z/g d"Z0y)#    )CallableN   )initialization)PreTrainedConfig)BaseModelOutputBaseModelOutputWithPooling)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringlogging   )CLIPMLPCLIPAttentionCLIPEncoderCLIPEncoderLayerCLIPVisionEmbeddingsCLIPVisionModelCLIPVisionTransformer)eager_attention_forward)VisionRotaryEmbeddingapply_rotary_pos_emb_visionc                   F     e Zd ZdZdZdZ	 	 	 	 	 	 	 	 	 	 	 	 	 d fd	Z xZS )MLCDVisionConfiga  
    This is the configuration class to store the configuration of a [`MLCDVisionModel`]. It is used to instantiate a MLCD
    vision encoder according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the vision encoder of the MLCD
    [DeepGlint-AI/mlcd-vit-bigG-patch14-336](https://huggingface.co/DeepGlint-AI/mlcd-vit-bigG-patch14-336) architecture.

    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PreTrainedConfig`] for more information.

    Args:
        hidden_size (`int`, *optional*, defaults to 1664):
            Dimensionality of the encoder layers and the pooler layer.
        intermediate_size (`int`, *optional*, defaults to 8192):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
        projection_dim (`int`, *optional*, defaults to 1024):
            Dimensionality of text and vision projection layers.
        num_hidden_layers (`int`, *optional*, defaults to 48):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 16):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_channels (`int`, *optional*, defaults to 3):
            The number of input channels.
        image_size (`int`, *optional*, defaults to 336):
            The size (resolution) of each image.
        patch_size (`int`, *optional*, defaults to 14):
            The size (resolution) of each patch.
        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
        layer_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the layer normalization layers.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        initializer_factor (`float`, *optional*, defaults to 1.0):
            A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
            testing).

    Example:

    ```python
    >>> from transformers import MLCDVisionConfig, MLCDVisionModel

    >>> # Initializing a MLCDVisionConfig with DeepGlint-AI/mlcd-vit-bigG-patch14-336 style configuration
    >>> configuration = MLCDVisionConfig()

    >>> # Initializing a MLCDVisionModel (with random weights) from the DeepGlint-AI/mlcd-vit-bigG-patch14-336 style configuration
    >>> model = MLCDVisionModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```mlcd_vision_modelvision_configc                     t        |   di | || _        || _        || _        || _        || _        || _        || _        || _	        || _
        || _        || _        |
| _        |	| _        y )N )super__init__hidden_sizeintermediate_sizenum_hidden_layersnum_attention_headsnum_key_value_groupsnum_channels
patch_size
image_sizeinitializer_rangeinitializer_factorattention_dropoutlayer_norm_eps
hidden_act)selfr"   r#   r$   r%   r&   r'   r)   r(   r.   r-   r,   r*   r+   kwargs	__class__s                  W/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/mlcd/modular_mlcd.pyr!   zMLCDVisionConfig.__init__c   sz    " 	"6"&!2!2#6 $8!($$!2"4!2,$    )i  i    0         r   iP     gelugh㈵>        g{Gz?      ?)__name__
__module____qualname____doc__
model_typebase_config_keyr!   __classcell__r1   s   @r2   r   r   )   sH    4l %J%O % %r3   r   c                       e Zd Zy)MLCDMLPN)r;   r<   r=   r   r3   r2   rD   rD      s    r3   rD   c                   4    e Zd Zdededej
                  fdZy)MLCDRotaryEmbeddingnum_patches_heightnum_patches_widthreturnc                    t        j                  || j                  j                        j	                  d      j                  d|      }t        j                  || j                  j                        j	                  d      j                  |d      }t        j                  |j                         |j                         gd      }t        ||      }t        j                  || j                  j                  | j                  j                        }t        j                  || j                        }||   j                  d      }	|	S )a}  
        Calculate the Rotary Position Embedding (RoPE) for MLCDVisionModel based on the grid size.

        Args:
            num_patches_height (int): Number of patches in the height dimension.
            num_patches_width (int): Number of patches in the width dimension.

        Returns:
            torch.Tensor: Rotary positional embeddings for the given grid size.
        )devicer6   r   dim)rK   dtype)torcharangeinv_freqrK   	unsqueezeexpandstackflattenmaxrO   outer)
r/   rG   rH   hpos_idswpos_idspos_idsmax_grid_sizeseqrotary_pos_emb_fullrotary_pos_embs
             r2   forwardzMLCDRotaryEmbedding.forward   s	    LL+DMM4H4HISSTUV]]^`bst 	 LL*4==3G3GHRRSTU\\]oqst 	
 ++x//183C3C3EFBO .0ABll=1E1ET]]M`M`a#kk#t}}= -W5==a@r3   N)r;   r<   r=   intrP   Tensorr`   r   r3   r2   rF   rF      s     # # %,, r3   rF   c                   \     e Zd Zdef fdZdej                  dej                  fdZ xZ	S )MLCDVisionEmbeddingsconfigc                 (    t         |   |       | `y N)r    r!   position_embeddingr/   re   r1   s     r2   r!   zMLCDVisionEmbeddings.__init__   s     #r3   pixel_valuesrI   c                 T   |j                   d   }| j                  j                  j                  }| j                  |j	                  |            }|j                  d      j                  dd      }| j                  j                  |dd      }t        j                  ||gd      }|S )Nr   rO   r   r6   rL   rM   )shapepatch_embeddingweightrO   torV   	transposeclass_embeddingrT   rP   cat)r/   rj   
batch_sizetarget_dtypepatch_embedsclass_embeds
embeddingss          r2   r`   zMLCDVisionEmbeddings.forward   s    !''*
++2288++LOO,O,OP#++A.88A>++22:q"EYYl;C
r3   )
r;   r<   r=   r   r!   rP   FloatTensorrb   r`   rA   rB   s   @r2   rd   rd      s-    $/ $
E$5$5 
%,, 
r3   rd   c                        e Zd ZdZdef fdZ	 ddej                  deej                  ej                  f   dej                  dz  de	e
   d	eej                  ej                  dz  f   f
d
Z xZS )MLCDAttentionzMulti-headed attention with RoPE. Refer to papers:
    - Attention is all you need:
        https://huggingface.co/papers/1706.03762
    - RoFormer: Enhanced Transformer with Rotary Position Embedding:
        https://huggingface.co/papers/2104.09864
    re   c                 T    t         |   |       |j                  | _        d| _        y )NF)r    r!   r&   	is_causalri   s     r2   r!   zMLCDAttention.__init__   s%     $*$?$?!r3   Nhidden_statesposition_embeddingsattention_maskr0   rI   c                    |j                   d d \  }}| j                  |      j                  ||| j                  | j                  f      }| j                  |      j                  ||| j                  | j                  f      }| j                  |      j                  ||| j                  | j                  f      }	|d   j                  d      j                         }
|d   j                  d      j                         }t        |||
|      \  }}|j                  dddd      j                         }|j                  dddd      j                         }|	j                  dddd      j                         }	t        j                  | j                  j                  t               } || |||	|f| j"                  sdn| j$                  | j&                  | j(                  d|\  }}|j                  dddd      j                         }|j+                  ||d      }| j-                  |      }|j                  ddd      j                         }||fS )NrL   r   r6   r   r   r9   )dropoutscalingr}   )rm   q_projreshape	num_headshead_dimk_projv_projrS   floatr   permute
contiguousr	   get_interfacere   _attn_implementationr   trainingr   scaler}   viewout_proj)r/   r~   r   r   r0   rt   
seq_lengthquery_states
key_statesvalue_statescossinattention_interfaceattn_outputattn_weightss                  r2   r`   zMLCDAttention.forward   s;    "/!4!4Sb!9
J {{=199:zSWSaSacgcpcp:qr[[/77ZQUQ_Q_aeanan8op
{{=199:zSWSaSacgcpcp:qr "!$..q1779!!$..q1779#>|ZY\^a#b j $++Aq!Q7BBD''1a3>>@
#++Aq!Q7BBD(?(M(MKK,,.E)
 %8
%
  $}}C$,,JJnn
%
 
%
!\ "))!Q15@@B!&&z:rBmmK0!))!Q2==?L((r3   rg   )r;   r<   r=   r>   r   r!   rP   rb   tupler   r   r`   rA   rB   s   @r2   r{   r{      s    /  /3	,)||,) #5<<#=>,) t+	,)
 +,,) 
u||U\\D00	1,)r3   r{   c                        e Zd Zdef fdZ	 d
dej                  deej                  ej                  f   dej                  dz  dee	   deej                     f
d	Z xZS )MLCDEncoderLayerre   c                 D    t         |   |       t        |      | _        y rg   )r    r!   r{   	self_attnri   s     r2   r!   zMLCDEncoderLayer.__init__   s     &v.r3   Nr~   r   r   r0   rI   c                     |}| j                  |      } | j                  d|||d|\  }}||z   }|}| j                  |      }| j                  |      }||z   }|S )a  
        Args:
            hidden_states (`torch.FloatTensor`):
                Input to the layer of shape `(batch, seq_len, embed_dim)`.
                Represents the hidden states from the previous layer or the input embeddings.
            position_embeddings (`tuple[torch.Tensor, torch.Tensor]`):
                A tuple of two tensors, each of shape `(batch, seq_len, embed_dim)`.
                Represents absolute positional embeddings for the query and key in the attention mechanism.
            attention_mask (`torch.FloatTensor`):
                Attention mask of shape `(batch, 1, q_len, k_v_seq_len)` where padding elements are indicated by very large negative values.
        )r~   r   r   r   )layer_norm1r   layer_norm2mlp)r/   r~   r   r   r0   residual_s          r2   r`   zMLCDEncoderLayer.forward   s    $ !((7)4>> 
' 3)
 	
q !=0 ((7/ =0r3   rg   )r;   r<   r=   r   r!   rP   rb   r   r   r   ry   r`   rA   rB   s   @r2   r   r      sz    // / /3	"||" #5<<#=>" t+	"
 +," 
u  	!"r3   r   c                        e Zd ZdZdef fdZ	 ddej                  deej                  ej                  f   dej                  dz  de
e   d	eez  f
d
Z xZS )MLCDEncoderz
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`MLCDEncoderLayer`].

    Args:
        config: MLCDVisionConfig
    re   c                 $    t         |   |       y)z3Overwrite dummy `MLCDConfig` to `MLCDVisionConfig`.N)r    r!   ri   s     r2   r!   zMLCDEncoder.__init__,  s     r3   Ninputs_embedsr   r   r0   rI   c                 V    |}| j                   D ]  } ||||fi |} t        |      S )a=  
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            position_embeddings (`tuple[torch.Tensor, torch.Tensor]`):
                A tuple of two tensors, each of shape `(batch, seq_len, embed_dim)`.
                Represents absolute positional embeddings for the query and key in the attention mechanism.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.
                [What are attention masks?](../glossary#attention-mask)
        )last_hidden_state)layersr   )r/   r   r   r   r0   r~   encoder_layers          r2   r`   zMLCDEncoder.forward0  sK    , &![[ 	M)# 	M	 +
 	
r3   rg   )r;   r<   r=   r>   r   r!   rP   ry   r   rb   r   r   r   r`   rA   rB   s   @r2   r   r   #  s{    !/ ! /3	!
((!
 #5<<#=>!
 t+	!

 +,!
 
	 !
r3   r   c                   l    e Zd ZU eed<   dZdZdZdZdZ	dZ
dZeedZ ej                          d        Zy)MLCDPreTrainedModelre   mlcdTF)r~   
attentionsc                 &	   | j                   j                  }t        |t              r| j                   j                  }t	        j
                  |j                  d|j                  dz  |z         t	        j
                  |j                  j                  |j                   j                  |z         t	        j                  |j                  t        j                  |j                  j                  d         j!                  d             yt        |t"              r| j                   j                  }|j                  dz  d|j                   j$                  z  dz  z  |z  }|j                  dz  |z  }t	        j
                  |j&                  j                  |       t	        j
                  |j(                  j                  |       t	        j
                  |j*                  j                  |       t	        j
                  |j,                  j                  |       yt        |t.              r| j                   j                  }|j                   j0                  dz  d|j                   j$                  z  dz  z  |z  }d|j                   j0                  z  dz  |z  }t	        j
                  |j2                  j                  |       t	        j
                  |j4                  j                  |       yt        |t6              ro| j                   j                  }|j                   j0                  |j                   j8                  z  dz  dz  |z  }t	        j
                  |j:                  d|       yt        |t<        j>                        r?t	        j@                  |jB                         t	        jD                  |j                         yt        |t<        jF                        r,|jB                   t	        j@                  |jB                         yt        |tH              rod	|jJ                  t        j                  d
|jL                  dt        jN                        |jL                  z  z  z  }t	        j                  |jP                  |       yy)zInitialize the weightsr9   g      )meanstd)r   rL   )r6   rL   r   Nr:   r   rl   ))re   r+   
isinstancerd   initnormal_rr   	embed_dimrn   ro   r*   copy_position_idsrP   rQ   rm   rT   r{   r$   r   r   r   r   rD   r"   fc1fc2MLCDVisionTransformerr%   class_pos_embnn	LayerNormzeros_biasones_LinearrF   thetarN   r   rR   )r/   modulefactorin_proj_stdout_proj_stdfc_stdpos_emb_stdrR   s           r2   _init_weightsz!MLCDPreTrainedModel._init_weightsc  s#    //f23[[33FLL//cv?O?OQU?UX^?^_LL//66FMM<[<[^d<deJJv**ELL9L9L9R9RSU9V,W,^,^_f,gh.[[33F!++T1q6==;Z;Z7Z_c6cdgmmK",,d2f<LLL--;?LL--;?LL--;?LL//\B([[33F!==44d:FMMDcDc@chl?lmpvvK&--333<vEFLL**7LL**< 56[[33F!==448Y8YY]^^cggjppKLL--C[I-KK$JJv}}%		*v{{/FKK$ 34fllu||Avzz1TYT_T_/`cicmcm/mnoHJJv1 5r3   N)r;   r<   r=   r   __annotations__base_model_prefixsupports_gradient_checkpointingaccepts_loss_kwargs_supports_flash_attn_supports_sdpa_supports_flex_attn_supports_attention_backendr   r{   _can_record_outputsrP   no_gradr   r   r3   r2   r   r   T  s[    &*#N"&)#
 U]]_!2 !2r3   r   c                   b     e Zd Zdef fdZ	 ddej                  dz  dee   de	e
z  fdZ xZS )	r   re   c                    t         |   |       t        |j                  |j                  z  dz        | _        t        j                  t        j                  d|j                  |j                  z  dz              | _
        y )Nr   r6   )r    r!   rF   r"   r%   vision_rotary_embeddingr   	ParameterrP   randnr   ri   s     r2   r!   zMLCDVisionTransformer.__init__  sh     ':6;M;MQWQkQk;kop;p'q$\\%++a9K9KvOiOi9imn9n*opr3   Nrj   r0   rI   c                    |t        d      |j                  d   | j                  j                  z  }|j                  d   | j                  j                  z  }| j	                  ||      }|j                  | j                  j                        }t        j                  | j                  |gd      }t        j                  ||fd      }|j                         |j                         f}| j                  |      }| j                  |      } | j                  d||d|}	|	d   }
|
d d dd d f   }| j                  |      }t!        |
|      S )	Nz You have to specify pixel_valuesrL   r   rM   )r   r   )r   pooler_outputr   )
ValueErrorrm   re   r(   r   rp   r   rK   rP   rs   r   r   rx   pre_layrnormencoderpost_layernormr   )r/   rj   r0   rG   rH   r_   embr   r~   encoder_outputsr   pooled_outputs               r2   r`   zMLCDVisionTransformer.forward  sP   
 ?@@)//3t{{7M7MM(..r2dkk6L6LL556HJ[\'**4+=+=+D+DED$6$6#GQOii8bA"wwy#'')45))-8&$,, 
' 3
 
 ,A.)!Q'2++M:)/'
 	
r3   rg   )r;   r<   r=   r   r!   rP   ry   r   r   r   r   r`   rA   rB   s   @r2   r   r     sO    q/ q 26 
''$. 
 +, 
 
+	+	 
r3   r   c                   J    e Zd Z	 ddej                  dz  dee   deez  fdZ	y)MLCDVisionModelNrj   r0   rI   c                 *     | j                   dd|i|S )a  
        Example:

        ```python
        >>> import httpx
        >>> from io import BytesIO
        >>> from PIL import Image
        >>> from transformers import AutoProcessor, MLCDVisionModel
        >>> model = MLCDVisionModel.from_pretrained("DeepGlint-AI/mlcd-vit-bigG-patch14-448")
        >>> processor = AutoProcessor.from_pretrained("DeepGlint-AI/mlcd-vit-bigG-patch14-448")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))
        >>> inputs = processor(images=image, return_tensors="pt")

        >>> with torch.no_grad():
        ...     outputs = model(**inputs, output_attentions=True)

        >>> features = outputs.last_hidden_state
        >>> print(f"Extracted features shape: {features.shape}")
        >>> print(f"Number of attention layers: {len(outputs.attentions)}")
        >>> print(f"Attention shape: {outputs.attentions[0].shape}")
        ```rj   r   )vision_model)r/   rj   r0   s      r2   r`   zMLCDVisionModel.forward  s)    : !t   
%

 	
r3   rg   )
r;   r<   r=   rP   ry   r   r   r   r   r`   r   r3   r2   r   r     s?     26 
''$. 
 +, 
 
+	+	 
r3   r   )r   r   r   )1collections.abcr   rP   torch.nnr    r   r   configuration_utilsr   modeling_outputsr   r   modeling_utilsr	   r
   processing_utilsr   utilsr   r   r   clip.modeling_clipr   r   r   r   r   r   r   llama.modeling_llamar   qwen2_vl.modeling_qwen2_vlr   r   
get_loggerr;   loggerr   rD   rF   rd   r{   r   r   r   r   r   __all__r   r3   r2   <module>r      s    %   & 3 K F & @ @   ; [ 
		H	%Y%' Y%x	g 	/ D/ $9)M 9)x'' 'T.
+ .
b 02/ 02 02f&
1 &
R!
o !
Hr3   