
    qiD              	          d Z ddlZddlmZ ddlmZ ddlmZ ddlm	Z	 ddl
mZmZmZmZ dd	lmZ dd
lmZmZ ddlmZ ddlmZ  ej0                  e      Zd-dej6                  dededej6                  fdZ G d dej>                        Z  G d dej>                        Z! G d dejD                        Z# G d dej>                        Z$ G d dej>                        Z% G d dej>                        Z& G d d ej>                        Z'e G d! d"e             Z(e G d# d$e(             Z) ed%&       G d' d(e(             Z* ed)&       G d* d+e	e(             Z+g d,Z,y).zPyTorch ConvNextV2 model.    N)nn   )initialization)ACT2FN)BackboneMixin)BackboneOutputBaseModelOutputWithNoAttention(BaseModelOutputWithPoolingAndNoAttention$ImageClassifierOutputWithNoAttention)PreTrainedModel)auto_docstringlogging)can_return_tuple   )ConvNextV2Configinput	drop_probtrainingreturnc                    |dk(  s|s| S d|z
  }| j                   d   fd| j                  dz
  z  z   }|t        j                  || j                  | j
                        z   }|j                          | j                  |      |z  }|S )zc
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

            r   r   )r   )dtypedevice)shapendimtorchrandr   r   floor_div)r   r   r   	keep_probr   random_tensoroutputs          d/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/convnextv2/modeling_convnextv2.py	drop_pathr$   &   s    
 CxII[[^

Q 77E

5ELL YYMYYy!M1FM    c                   x     e Zd ZdZd	dedz  ddf fdZdej                  dej                  fdZde	fdZ
 xZS )
ConvNextV2DropPathzXDrop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).Nr   r   c                 0    t         |           || _        y N)super__init__r   )selfr   	__class__s     r#   r+   zConvNextV2DropPath.__init__9   s    "r%   hidden_statesc                 D    t        || j                  | j                        S r)   )r$   r   r   )r,   r.   s     r#   forwardzConvNextV2DropPath.forward=   s    FFr%   c                      d| j                    S )Nzp=)r   )r,   s    r#   
extra_reprzConvNextV2DropPath.extra_repr@   s    DNN#$$r%   r)   )__name__
__module____qualname____doc__floatr+   r   Tensorr0   strr2   __classcell__r-   s   @r#   r'   r'   6   sG    b#%$, #$ #GU\\ Gell G%C %r%   r'   c                   `     e Zd ZdZdef fdZdej                  dej                  fdZ xZ	S )ConvNextV2GRNz)GRN (Global Response Normalization) layerdimc                     t         |           t        j                  t	        j
                  ddd|            | _        t        j                  t	        j
                  ddd|            | _        y )Nr   )r*   r+   r   	Parameterr   zerosweightbias)r,   r>   r-   s     r#   r+   zConvNextV2GRN.__init__G   sL    ll5;;q!Q#<=LLQ1c!:;	r%   r.   r   c                     t         j                  j                  |ddd      }||j                  dd      dz   z  }| j                  ||z  z  | j
                  z   |z   }|S )N   )r   rE   T)ordr>   keepdim)r>   rG   ư>)r   linalgvector_normmeanrB   rC   )r,   r.   global_featuresnorm_featuress       r#   r0   zConvNextV2GRN.forwardL   si    ,,22=aV]a2b'?+?+?BPT+?+UX\+\]}}'DE		QTaar%   )
r3   r4   r5   r6   intr+   r   FloatTensorr0   r:   r;   s   @r#   r=   r=   D   s1    3<C <
U%6%6 5;L;L r%   r=   c                   f     e Zd ZdZddd fd
Zdej                  dej                  f fdZ xZS )	ConvNextV2LayerNormaA  LayerNorm that supports two data formats: channels_last (default) or channels_first.
    The ordering of the dimensions in the inputs. channels_last corresponds to inputs with shape (batch_size, height,
    width, channels) while channels_first corresponds to inputs with shape (batch_size, channels, height, width).
    rI   channels_lastepsdata_formatc                \    t        |   |fd|i| |dvrt        d|       || _        y )NrU   )rS   channels_firstzUnsupported data format: )r*   r+   NotImplementedErrorrV   )r,   normalized_shaperU   rV   kwargsr-   s        r#   r+   zConvNextV2LayerNorm.__init__\   s?    )=s=f=AA%(A+&OPP&r%   featuresr   c                     | j                   dk(  r9|j                  dddd      }t        |   |      }|j                  dddd      }|S t        |   |      }|S )z
        Args:
            features: Tensor of shape (batch_size, channels, height, width) OR (batch_size, height, width, channels)
        rX   r   rE   r   r   )rV   permuter*   r0   )r,   r\   r-   s     r#   r0   zConvNextV2LayerNorm.forwardb   sj    
 //''1a3Hwx0H''1a3H  wx0Hr%   	r3   r4   r5   r6   r+   r   r8   r0   r:   r;   s   @r#   rR   rR   V   s4    
 15/ '   r%   rR   c                   Z     e Zd ZdZ fdZdej                  dej                  fdZ xZ	S )ConvNextV2EmbeddingszThis class is comparable to (and inspired by) the SwinEmbeddings class
    found in src/transformers/models/swin/modeling_swin.py.
    c                    t         |           t        j                  |j                  |j
                  d   |j                  |j                        | _        t        |j
                  d   dd      | _	        |j                  | _        y )Nr   kernel_sizestriderI   rX   rT   )
r*   r+   r   Conv2dnum_channelshidden_sizes
patch_sizepatch_embeddingsrR   	layernormr,   configr-   s     r#   r+   zConvNextV2Embeddings.__init__v   sr     "		!4!4Q!7VEVEV_e_p_p!
 -V-@-@-C[kl"//r%   pixel_valuesr   c                     |j                   d   }|| j                  k7  rt        d      | j                  |      }| j	                  |      }|S )Nr   zeMake sure that the channel dimension of the pixel values match with the one set in the configuration.)r   rg   
ValueErrorrj   rk   )r,   rn   rg   
embeddingss       r#   r0   zConvNextV2Embeddings.forward~   sV    #))!,4,,,w  **<8
^^J/
r%   )
r3   r4   r5   r6   r+   r   rP   r8   r0   r:   r;   s   @r#   ra   ra   q   s*    0E$5$5 %,, r%   ra   c                   \     e Zd ZdZd fd	Zdej                  dej                  fdZ xZS )ConvNextV2Layera5  This corresponds to the `Block` class in the original implementation.

    There are two equivalent implementations: [DwConv, LayerNorm (channels_first), Conv, GELU,1x1 Conv]; all in (N, C,
    H, W) (2) [DwConv, Permute to (N, H, W, C), LayerNorm (channels_last), Linear, GELU, Linear]; Permute back

    The authors used (2) as they find it slightly faster in PyTorch.

    Args:
        config ([`ConvNextV2Config`]): Model configuration class.
        dim (`int`): Number of input channels.
        drop_path (`float`): Stochastic depth rate. Default: 0.0.
    c                    t         |           t        j                  ||dd|      | _        t        |d      | _        t        j                  |d|z        | _        t        |j                     | _        t        d|z        | _        t        j                  d|z  |      | _        |dkD  rt        |      | _        y t        j                          | _        y )N   r   )rd   paddinggroupsrI   rU      r   )r*   r+   r   rf   dwconvrR   rk   Linearpwconv1r   
hidden_actactr=   grnpwconv2r'   Identityr$   )r,   rm   r>   r$   r-   s       r#   r+   zConvNextV2Layer.__init__   s    iiSa3O,Sd;yya#g.&++, S)yyS#.:Cc/+I6r{{}r%   r\   r   c                 N   |}| j                  |      }|j                  dddd      }| j                  |      }| j                  |      }| j	                  |      }| j                  |      }| j                  |      }|j                  dddd      }|| j                  |      z   }|S )Nr   rE   r   r   )rz   r^   rk   r|   r~   r   r   r$   )r,   r\   residuals      r#   r0   zConvNextV2Layer.forward   s    ;;x(##Aq!Q/>>(+<<)88H%88H%<<)##Aq!Q/dnnX66r%   )r   r_   r;   s   @r#   rs   rs      s)    
]  r%   rs   c                   \     e Zd ZdZd fd	Zdej                  dej                  fdZ xZS )ConvNextV2Stagea  ConvNeXTV2 stage, consisting of an optional downsampling layer + multiple residual blocks.

    Args:
        config ([`ConvNextV2Config`]): Model configuration class.
        in_channels (`int`): Number of input channels.
        out_channels (`int`): Number of output channels.
        depth (`int`): Number of residual blocks.
        drop_path_rates(`list[float]`): Stochastic depth rates for each layer.
    c                    t         	|           ||k7  s|dkD  r@t        j                  t	        |dd      t        j
                  ||||      g      | _        nt        j                         | _        |xs dg|z  }t        j                  t        |      D cg c]  }t        ||||          c}      | _	        y c c}w )Nr   rI   rX   rT   rc   r   )r>   r$   )
r*   r+   r   
ModuleListrR   rf   downsampling_layerrangers   layers)
r,   rm   in_channelsout_channelsrd   re   depthdrop_path_ratesjr-   s
            r#   r+   zConvNextV2Stage.__init__   s    ,&&1*&(mm'K[\IIk<[Y_`'D# ')mmoD#):cUU]mm^cdi^jkYZ_VQRASTk
ks   B>r\   r   c                 j    | j                   D ]
  } ||      } | j                  D ]
  } ||      } |S r)   )r   r   )r,   r\   layers      r#   r0   zConvNextV2Stage.forward   sA    ,, 	'EXH	'[[ 	'EXH	'r%   )rE   rE   rE   Nr_   r;   s   @r#   r   r      s(    
"  r%   r   c                   P     e Zd Z fdZ	 ddej
                  dedz  defdZ xZ	S )ConvNextV2Encoderc           
      ,   t         |           t        j                         | _        t        j                  d|j                  t        |j                        d      j                  |j                        D cg c]  }|j                          }}|j                  d   }t        |j                        D ]V  }|j                  |   }t        ||||dkD  rdnd|j                  |   ||         }| j                  j!                  |       |}X y c c}w )Nr   cpu)r   rE   r   )r   r   re   r   r   )r*   r+   r   r   stagesr   linspacedrop_path_ratesumdepthssplittolistrh   r   
num_stagesr   append)	r,   rm   xr   prev_chsiout_chsstager-   s	           r#   r+   zConvNextV2Encoder.__init__   s    mmo ^^Av'<'<c&-->PY^_eeflfsfst
 HHJ
 
 &&q)v(() 	A))!,G#$$EqqmmA& / 2E KKu%H	
s   :Dr.   output_hidden_statesNr   c                     |r|gnd }| j                   D ]  } ||      }||j                  |         t        ||      S )N)last_hidden_stater.   )r   r   r	   )r,   r.   r   all_hidden_stateslayer_modules        r#   r0   zConvNextV2Encoder.forward   sR     0D]O KK 	8L(7M ,!((7	8
 .]noor%   )F)
r3   r4   r5   r+   r   r8   boolr	   r0   r:   r;   s   @r#   r   r      s8    , PU
p"\\
pAE
p	'
pr%   r   c                   d     e Zd ZU eed<   dZdZdZdgZ e	j                          fd       Z xZS )ConvNextV2PreTrainedModelrm   
convnextv2rn   )imagers   c                     t         |   |       t        |t              r?t	        j
                  |j                         t	        j
                  |j                         yy)zInitialize the weightsN)r*   _init_weights
isinstancer=   initzeros_rB   rC   )r,   moduler-   s     r#   r   z'ConvNextV2PreTrainedModel._init_weights  sA     	f%fm,KK&KK$ -r%   )r3   r4   r5   r   __annotations__base_model_prefixmain_input_nameinput_modalities_no_split_modulesr   no_gradr   r:   r;   s   @r#   r   r      s=    $$O!*+U]]_% %r%   r   c            	       j     e Zd Z fdZee	 ddej                  dz  dedz  de	fd              Z
 xZS )ConvNextV2Modelc                     t         |   |       || _        t        |      | _        t        |      | _        t        j                  |j                  d   |j                        | _        | j                          y )NrH   rx   )r*   r+   rm   ra   rq   r   encoderr   	LayerNormrh   layer_norm_epsrk   	post_initrl   s     r#   r+   zConvNextV2Model.__init__  s`     .v6(0 f&9&9"&=6CXCXY 	r%   Nrn   r   r   c                     || j                   j                  }|t        d      | j                  |      }| j	                  ||      }|j
                  }| j                  |j                  ddg            }t        |||j                        S )Nz You have to specify pixel_valuesr   rH   )r   pooler_outputr.   )
rm   r   rp   rq   r   r   rk   rL   r
   r.   )r,   rn   r   r[   embedding_outputencoder_outputsr   pooled_outputs           r#   r0   zConvNextV2Model.forward  s    
  '#';;#C#C ?@@??<8:>,,3G ;G ;
 ,== '8'='=r2h'GH7/')77
 	
r%   NN)r3   r4   r5   r+   r   r   r   rP   r   r
   r0   r:   r;   s   @r#   r   r     sP     ae
!--4
SWZ^S^
	1
  
r%   r   z
    ConvNextV2 Model with an image classification head on top (a linear layer on top of the pooled features), e.g. for
    ImageNet.
    )custom_introc            	            e Zd ZdZ fdZee	 ddej                  dz  dej                  dz  de
fd              Z xZS )	 ConvNextV2ForImageClassificationFc                 <   t         |   |       |j                  | _        t        |      | _        |j                  dkD  r3t        j                  |j                  d   |j                        | _        nt        j                         | _        | j                          y )Nr   rH   )r*   r+   
num_labelsr   r   r   r{   rh   
classifierr   r   rl   s     r#   r+   z)ConvNextV2ForImageClassification.__init__C  su      ++)&1 q  ii(;(;B(?ARARSDO kkmDO 	r%   Nrn   labelsr   c                      | j                   |fi |}|j                  }| j                  |      }d}|| j                  ||| j                        }t        |||j                        S )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        N)r   pooled_logitsrm   )losslogitsr.   )r   r   r   loss_functionrm   r   r.   )r,   rn   r   r[   outputsr   r   r   s           r#   r0   z(ConvNextV2ForImageClassification.forwardR  sy     =LDOOL<c\b<c--/%%V6RVR]R]%^D3!//
 	
r%   r   )r3   r4   r5   accepts_loss_kwargsr+   r   r   r   rP   
LongTensorr   r0   r:   r;   s   @r#   r   r   9  s^       _c
!--4
EJEUEUX\E\
	-
  
r%   r   zT
    ConvNeXT V2 backbone, to be used with frameworks like DETR and MaskFormer.
    c            	       h     e Zd ZdZ fdZee	 ddej                  de	dz  de
fd              Z xZS )	ConvNextV2BackboneFc                 p   t         |   |       t        |      | _        t	        |      | _        |j                  d   g|j                  z   | _        i }t        | j                  | j                        D ]  \  }}t        |d      ||<    t        j                  |      | _        | j                          y )Nr   rX   )rV   )r*   r+   ra   rq   r   r   rh   num_featureszipout_featureschannelsrR   r   
ModuleDicthidden_states_normsr   )r,   rm   r   r   rg   r-   s        r#   r+   zConvNextV2Backbone.__init__u  s     .v6(0#0034v7J7JJ !#&t'8'8$--#H 	iE<)<\Wg)h&	i#%==1D#E  	r%   Nrn   r   r   c                 z   || j                   j                  }| j                  |      }| j                  |d      }|j                  }g }t        | j                  |      D ]:  \  }}	|| j                  v s | j                  |   |	      }	|j                  |	       < t        t        |      |r|      S d      S )a  
        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, AutoBackbone
        >>> import torch
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> processor = AutoImageProcessor.from_pretrained("facebook/convnextv2-tiny-1k-224")
        >>> model = AutoBackbone.from_pretrained("facebook/convnextv2-tiny-1k-224")

        >>> inputs = processor(image, return_tensors="pt")
        >>> outputs = model(**inputs)
        ```NTr   )feature_mapsr.   )rm   r   rq   r   r.   r   stage_namesr   r   r   r   tuple)
r,   rn   r   r[   r   r   r.   r   r   hidden_states
             r#   r0   zConvNextV2Backbone.forward  s    2  '#';;#C#C ??<8<@LLIYptL<u--#&t'7'7#G 	2E<)))>t77>|L##L1	2
 |,+?-
 	
EI
 	
r%   r)   )r3   r4   r5   has_attentionsr+   r   r   r   r8   r   r   r0   r:   r;   s   @r#   r   r   l  sN     N  NR'
!LL'
@Dt'
	'
  '
r%   r   )r   r   r   r   )r   F)-r6   r   r    r   r   activationsr   backbone_utilsr   modeling_outputsr   r	   r
   r   modeling_utilsr   utilsr   r   utils.genericr   configuration_convnextv2r   
get_loggerr3   loggerr8   r7   r   r$   Moduler'   r=   r   rR   ra   rs   r   r   r   r   r   r   __all__ r%   r#   <module>r      sv       & ! +  . , - 6 
		H	%U\\ e T V[VbVb  % %BII $",, 6299 0(bii (X!bii !J p		  pF % % %  &
/ &
 &
R )
'@ )
)
X <
(A <
<
~ ur%   