
    qi                        d Z ddlZddlZddlmZ ddlmZ ddlmZ ddl	Z	ddl	m
Z
 ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZmZ ddlmZmZ ddlmZ ddlmZmZmZm Z m!Z!m"Z" ddl#m$Z$m%Z%m&Z&  e!jN                  e(      Z)de	jT                  de	jT                  fdZ+de	jT                  de	jT                  fdZ,ee G d de                    Z-ee G d de                    Z.ee G d de                    Z/ G d de
j`                        Z1 G d d e
j`                        Z2	 dFd!e
j`                  d"e	jT                  d#e	jT                  d$e	jT                  d%e	jT                  dz  d&e3d'e3fd(Z4 G d) d*e
j`                        Z5 G d+ d,e
j`                        Z6 G d- d.e      Z7e G d/ d0e             Z8 G d1 d2e
j`                        Z9 G d3 d4e8      Z: G d5 d6e8      Z; G d7 d8e
j`                        Z< G d9 d:e8      Z=e G d; d<e8             Z> G d= d>e
j`                        Z? G d? d@e8      Z@ edAB       G dC dDe8             ZAg dEZBy)GzPyTorch CLIPSeg model.    N)Callable)	dataclass)Any)nn   )initialization)ACT2FN)create_causal_mask)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPooling)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)ModelOutputTransformersKwargsauto_docstringcan_return_tuplelogging	torch_int   )CLIPSegConfigCLIPSegTextConfigCLIPSegVisionConfiglogitsreturnc                     t         j                  j                  | t        j                  t        |       | j                              S )Ndevice)r   
functionalcross_entropytorcharangelenr   )r   s    ^/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/clipseg/modeling_clipseg.pycontrastive_lossr&   )   s/    ==&&vu||CKPVP]P]/^__    
similarityc                 Z    t        |       }t        | j                               }||z   dz  S )Ng       @)r&   t)r(   caption_loss
image_losss      r%   clipseg_lossr-   .   s,    #J/L!*,,.1J:%,,r'   c                      e Zd ZU dZdZej                  dz  ed<   dZej                  dz  ed<   dZ	ej                  dz  ed<   dZ
ej                  dz  ed<   dZej                  dz  ed<   dZeed<   dZeed	<   d
ee   fdZy)CLIPSegOutputa  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
        Contrastive loss for image-text similarity.
    logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
        The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
        similarity scores.
    logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
        The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
        similarity scores.
    text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
        The text embeddings obtained by applying the projection layer to the pooled output of [`CLIPSegTextModel`].
    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
        The image embeddings obtained by applying the projection layer to the pooled output of [`CLIPSegVisionModel`].
    text_model_output (`BaseModelOutputWithPooling`):
        The output of the [`CLIPSegTextModel`].
    vision_model_output (`BaseModelOutputWithPooling`):
        The output of the [`CLIPSegVisionModel`].
    Nlosslogits_per_imagelogits_per_texttext_embedsimage_embedstext_model_outputvision_model_outputr   c                 H     t         fd j                         D              S )Nc              3   d   K   | ]'  }|d vr|   nt        |      j                          ) yw))r5   r6   Ngetattrto_tuple.0kselfs     r%   	<genexpr>z)CLIPSegOutput.to_tuple.<locals>.<genexpr>T   s=      
  LLDGRYZ^`aRbRkRkRmm
   -0tuplekeysr?   s   `r%   r;   zCLIPSegOutput.to_tupleS   #     
YY[
 
 	
r'   )__name__
__module____qualname____doc__r0   r"   FloatTensor__annotations__r1   r2   r3   r4   r5   r   r6   rC   r   r;    r'   r%   r/   r/   4   s    & &*D%

d
")15e''$.504OU&&-4,0K""T)0-1L%##d*148186:3:
%* 
r'   r/   c                       e Zd ZU dZdZej                  dz  ed<   dZe	ej                     dz  ed<   dZ
e	ej                     dz  ed<   y)CLIPSegDecoderOutputz|
    logits (`torch.FloatTensor` of shape `(batch_size, height, width)`):
        Classification scores for each pixel.
    Nr   hidden_states
attentions)rG   rH   rI   rJ   r   r"   rK   rL   rP   rC   rQ   rM   r'   r%   rO   rO   Z   sR    
 (,FE$+59M5**+d2926Je''(4/6r'   rO   c                       e Zd ZU dZdZej                  dz  ed<   dZej                  dz  ed<   dZ	ej                  dz  ed<   dZ
ej                  dz  ed<   dZeed<   dZeed<   d	ee   fd
Zy)CLIPSegImageSegmentationOutputa  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Binary cross entropy loss for segmentation.
    logits (`torch.FloatTensor` of shape `(batch_size, height, width)`):
        Classification scores for each pixel.
    conditional_embeddings (`torch.FloatTensor` of shape `(batch_size, projection_dim)`):
        Conditional embeddings used for segmentation.
    pooled_output (`torch.FloatTensor` of shape `(batch_size, embed_dim)`):
        Pooled output of the [`CLIPSegVisionModel`].
    vision_model_output (`BaseModelOutputWithPooling`):
        The output of the [`CLIPSegVisionModel`].
    decoder_output (`CLIPSegDecoderOutput`):
        The output of the [`CLIPSegDecoder`].
    Nr0   r   conditional_embeddingspooled_outputr6   decoder_outputr   c                 H     t         fd j                         D              S )Nc              3   d   K   | ]'  }|d vr|   nt        |      j                          ) yw))r6   rV   Nr9   r<   s     r%   r@   z:CLIPSegImageSegmentationOutput.to_tuple.<locals>.<genexpr>   s<      
  IIDGwW[]^O_OhOhOjj
rA   rB   rE   s   `r%   r;   z'CLIPSegImageSegmentationOutput.to_tuple   rF   r'   )rG   rH   rI   rJ   r0   r"   rK   rL   r   rT   rU   r6   r   rV   rO   rC   r   r;   rM   r'   r%   rS   rS   g   s     &*D%

d
")'+FE$+7;E--4;.2M5$$t+26:3:+/N(/
%* 
r'   rS   c                        e Zd Zdef fdZdej                  dededej                  fdZd
dej                  dej                  fd	Z
 xZS )CLIPSegVisionEmbeddingsconfigc                    t         |           || _        |j                  | _        |j
                  | _        |j                  | _        t        j                  t        j                  | j                              | _        t        j                  |j                  | j                  | j                  | j                  d      | _        | j
                  | j                  z  dz  | _        | j                  dz   | _        t        j"                  | j                   | j                        | _        | j'                  dt        j(                  | j                         j+                  d      d       y )NF)in_channelsout_channelskernel_sizestridebias   r   position_idsr   
persistent)super__init__r[   hidden_size	embed_dim
image_size
patch_sizer   	Parameterr"   randnclass_embeddingConv2dnum_channelspatch_embeddingnum_patchesnum_positions	Embeddingposition_embeddingregister_bufferr#   expandr?   r[   	__class__s     r%   ri   z CLIPSegVisionEmbeddings.__init__   s	   ++ ++ ++!||EKK,GH!yy++?? 
 !OOt>1D!--1"$,,t/A/A4>>"R^U\\$:L:L-M-T-TU\-]jopr'   
embeddingsheightwidthr   c                    |j                   d   dz
  }| j                  j                  j                  d      }|j                   d   dz
  }t        j
                  j                         s%||k(  r ||k(  r| j                  | j                        S |ddddf   }|ddddf   }|j                   d   }	|| j                  z  }
|| j                  z  }t        |dz        }|j                  d|||	      }|j                  dddd      }t        j                  j                  ||
|fdd	
      }|j                  dddd      j                  dd|	      }t	        j                   ||fd      S )a   
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r   r   Nre   g      ?r   rb   bicubicF)sizemodealign_cornersdim)shaperw   weight	unsqueezer"   jit
is_tracingrc   rm   r   reshapepermuter   r    interpolateviewcat)r?   r|   r}   r~   rt   rw   ru   class_pos_embedpatch_pos_embedr   
new_height	new_widthsqrt_num_positionss                r%   interpolate_pos_encodingz0CLIPSegVisionEmbeddings.interpolate_pos_encoding   sv    !&&q)A-!44;;EEaH*003a7 yy##%+*F6UZ?**4+<+<==,QU3,QU3r"t.
T__,	&}c'9:)11!5GI[]`a)11!Q1=--33i(	 4 
 *11!Q1=BB1b#Nyy/?;CCr'   pixel_valuesc                     |j                   \  }}}}|sJ|| j                  k7  s|| j                  k7  r,t        d| d| d| j                   d| j                   d	      | j                  |      }|j	                  d      j                  dd      }| j                  j                  |dd      }t        j                  ||gd      }	|r|	| j                  |	||      z   }	|	S |	| j                  | j                        z   }	|	S )	NzInput image size (*z) doesn't match model ().rb   r   re   r   )r   rl   
ValueErrorrs   flatten	transposerp   ry   r"   r   r   rw   rc   )
r?   r   r   
batch_size_r}   r~   patch_embedsclass_embedsr|   s
             r%   forwardzCLIPSegVisionEmbeddings.forward   s   '3'9'9$
Avu'Vt-F%SWSbSbJb$VHAeW4KDOOK\\]^b^m^m]nnpq  ++L9#++A.88A>++22:q"EYYl;C
##d&C&CJPVX]&^^J  $d&=&=d>O>O&PPJr'   T)rG   rH   rI   r   ri   r"   Tensorintr   rK   r   __classcell__r{   s   @r%   rZ   rZ      se    q2 q,'D5<< 'D 'DUX 'D]b]i]i 'DRE$5$5 Y^YeYe r'   rZ   c            	            e Zd Zdef fdZ	 	 	 d	dej                  dz  dej                  dz  dej                  dz  dej                  fdZ	 xZ
S )
CLIPSegTextEmbeddingsr[   c                 N   t         |           |j                  }t        j                  |j
                  |      | _        t        j                  |j                  |      | _        | j                  dt        j                  |j                        j                  d      d       y )Nrc   rd   Frf   )rh   ri   rj   r   rv   
vocab_sizetoken_embeddingmax_position_embeddingsrw   rx   r"   r#   ry   r?   r[   rk   r{   s      r%   ri   zCLIPSegTextEmbeddings.__init__   s    &&	!||F,=,=yI"$,,v/M/My"Y 	ELL)G)GHOOPWXej 	 	
r'   N	input_idsrc   inputs_embedsr   c                 8   ||j                   d   n|j                   d   }| j                  j                  j                   d   }||kD  rt        d| d|       || j                  d d d |f   }|| j                  |      }| j                  |      }||z   }|S )Nre   r   zRSequence length must be less than max_position_embeddings (got `sequence length`: z and max_position_embeddings: )r   rw   r   r   rc   r   )r?   r   rc   r   
seq_lengthmax_position_embeddingposition_embeddingsr|   s           r%   r   zCLIPSegTextEmbeddings.forward   s     -6,AY__R(}GZGZ[]G^
!%!8!8!?!?!E!Ea!H..d,<=S<TV 
 ,,Q^<L  00;M"55lC"%88
r'   )NNN)rG   rH   rI   r   ri   r"   
LongTensorrK   r   r   r   r   s   @r%   r   r      sk    

0 

 .20426	##d* &&- ((4/	
 
r'   r   modulequerykeyvalueattention_maskscalingdropoutc                    t        j                  ||j                  dd            |z  }|||z   }t        j                  j                  |dt         j                        j                  |j                        }t        j                  j                  ||| j                        }t        j                  ||      }	|	j                  dd      j                         }	|	|fS )Nre   r   )r   dtype)ptrainingr   rb   )r"   matmulr   r   r    softmaxfloat32tor   r   r   
contiguous)
r   r   r   r   r   r   r   kwargsattn_weightsattn_outputs
             r%   eager_attention_forwardr     s     <<s}}R'<=GL!#n4==((2U]](SVVW\WbWbcL==((6??([L,,|U3K''1-88:K$$r'   c                        e Zd ZdZdeez  f fdZ	 	 d
dej                  dej                  dz  de	dz  de
ej                  ej                  dz  f   fd	Z xZS )CLIPSegAttentionz=Multi-headed attention from 'Attention Is All You Need' paperr[   c                    t         |           || _        |j                  | _        |j
                  | _        | j                  | j                  z  | _        | j                  | j                  z  | j                  k7  r&t        d| j                   d| j                   d      | j                  dz  | _	        |j                  | _        d| _        t        j                  | j                  | j                        | _        t        j                  | j                  | j                        | _        t        j                  | j                  | j                        | _        t        j                  | j                  | j                        | _        y )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: r         F)rh   ri   r[   rj   rk   num_attention_heads	num_headshead_dimr   scaleattention_dropoutr   	is_causalr   Lineark_projv_projq_projout_projrz   s     r%   ri   zCLIPSegAttention.__init__  s   ++33$..8==4>>)T^^;MdnnM] ^NN#2'  ]]D(
//ii?ii?ii?		$..$..Ar'   NrP   r   output_attentionsr   c                    |j                   \  }}}| j                  |      }| j                  |      }	| j                  |      }
|j	                  ||| j
                  | j                        j                  dd      }|	j	                  ||| j
                  | j                        j                  dd      }	|
j	                  ||| j
                  | j                        j                  dd      }
t        j                  | j                  j                  t              } || ||	|
|f| j                  | j                  sdn| j                  d|\  }}|j!                  |||      j#                         }| j%                  |      }|sd}||fS )z#Input shape: Batch x Time x Channelr   rb           )r   r   N)r   r   r   r   r   r   r   r   r   get_interfacer[   _attn_implementationr   r   r   r   r   r   r   )r?   rP   r   r   r   r   r   rk   queriesrD   valuesattention_interfacer   r   s                 r%   r   zCLIPSegAttention.forward2  si    -:,?,?)
J	++m,{{=)]+,,z:t~~t}}U__`acdeyyZOYYZ[]^_ZT^^T]]S]]^_abc(?(M(MKK,,.E)
 %8	%
 JJ#}}C$,,	%
 	%
!\ "))*j)LWWYmmK0 LL((r'   NF)rG   rH   rI   rJ   r   r   ri   r"   r   boolrC   r   r   r   s   @r%   r   r     sv    GB25FF B. /3).	')||') t+')  $;	') 
u||U\\D00	1')r'   r   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )
CLIPSegMLPc                    t         |           || _        t        |j                     | _        t        j                  |j                  |j                        | _
        t        j                  |j                  |j                        | _        y N)rh   ri   r[   r	   
hidden_actactivation_fnr   r   rj   intermediate_sizefc1fc2rz   s     r%   ri   zCLIPSegMLP.__init__^  sd    #F$5$5699V//1I1IJ99V55v7I7IJr'   rP   r   c                 l    | j                  |      }| j                  |      }| j                  |      }|S r   )r   r   r   )r?   rP   s     r%   r   zCLIPSegMLP.forwarde  s4    /**=9/r'   )rG   rH   rI   ri   r"   r   r   r   r   s   @r%   r   r   ]  s$    KU\\ ell r'   r   c                        e Zd Zdef fdZ	 d
dej                  dej                  dedz  dee	   de
ej                     f
d	Z xZS )CLIPSegEncoderLayerr[   c                 D   t         |           |j                  | _        t	        |      | _        t        j                  | j                  |j                        | _	        t        |      | _        t        j                  | j                  |j                        | _        y N)epsrh   ri   rj   rk   r   	self_attnr   	LayerNormlayer_norm_epslayer_norm1r   mlplayer_norm2rz   s     r%   ri   zCLIPSegEncoderLayer.__init__n  m    ++)&1<<F<Q<QRf%<<F<Q<QRr'   rP   r   r   Nr   r   c                     |}| j                  |      } | j                  d|||d|\  }}||z   }|}| j                  |      }| j                  |      }||z   }|f}|r||fz  }|S )I  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
                `(config.encoder_attention_heads,)`.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        )rP   r   r   rM   )r   r   r   r   )r?   rP   r   r   r   residualr   outputss           r%   r   zCLIPSegEncoderLayer.forwardv  s    " !((7&4dnn '
')/'
 	'
#| !=0 ((7/ =0 "&Gr'   F)rG   rH   rI   r   ri   r"   r   r   r   r   rC   rK   r   r   r   s   @r%   r   r   m  sh    S} S */	&||& &  $;	&
 +,& 
u  	!&r'   r   c                   R    e Zd ZU eed<   dZdZdZ ej                         d        Z
y)CLIPSegPreTrainedModelr[   clip)imagetextTc                 
   | j                   j                  }t        |t              rt	        j
                  |j                  j                  d|dz         t	        j
                  |j                  j                  d|dz         t	        j                  |j                  t        j                  |j                  j                  d         j                  d             nt        |t              r| j                   j                  }t	        j
                  |j                   d|j"                  dz  |z         t	        j
                  |j$                  j                  |j                   j&                  |z         t	        j
                  |j                  j                  |j                   j&                  |z         t	        j                  |j                  t        j                  |j(                        j                  d             nt        |t*              r| j                   j                  }|j"                  dz  d|j                   j,                  z  dz  z  |z  }|j"                  dz  |z  }t	        j
                  |j.                  j                  |       t	        j
                  |j0                  j                  |       t	        j
                  |j2                  j                  |       t	        j
                  |j4                  j                  |       nt        |t6              r| j                   j                  }|j                   j8                  dz  d|j                   j,                  z  dz  z  |z  }d|j                   j8                  z  dz  |z  }t	        j
                  |j:                  j                  |       t	        j
                  |j<                  j                  |       nt        |t>              rt	        j
                  |j@                  j                  |jB                  dz  | j                   j                  z         t	        j
                  |jD                  j                  |jF                  dz  | j                   j                  z         t        |tH        jJ                        r>t	        jL                  |jN                         t	        jP                  |j                         t        |tH        jR                        r-|jN                   t	        jL                  |jN                         y	y	y	)
zInitialize the weightsr   g{Gz?)meanstdre   rd   r   )r  rb   N)*r[   initializer_factor
isinstancer   initnormal_r   r   rw   copy_rc   r"   r#   r   ry   rZ   rp   rk   rs   initializer_rangeru   r   num_hidden_layersr   r   r   r   r   rj   r   r   CLIPSegModeltext_projectiontext_embed_dimvisual_projectionvision_embed_dimr   r   zeros_ra   ones_r   )r?   r   factorin_proj_stdout_proj_stdfc_stds         r%   _init_weightsz$CLIPSegPreTrainedModel._init_weights  s    //f34LL//66SftmTLL2299RVWJJv**ELL9L9L9R9RSU9V,W,^,^_f,gh 78[[33FLL//cv?O?OQU?UX^?^_LL//66FMM<[<[^d<deLL2299v}}?^?^ag?ghJJv**ELL9M9M,N,U,UV],^_ 01[[33F!++T1q6==;Z;Z7Z_c6cdgmmK",,d2f<LLL--;?LL--;?LL--;?LL//\B
+[[33F!==44d:FMMDcDc@chl?lmpvvK&--333<vEFLL**7LL**<-LL&&--))4/$++2P2PP LL((//++T1DKK4R4RR
 fbll+KK$JJv}}%fbii(V[[-DKK$ .E(r'   N)rG   rH   rI   r   rL   base_model_prefixinput_modalitiessupports_gradient_checkpointingr"   no_gradr  rM   r'   r%   r   r     s4    (&*#U]]_)% )%r'   r   c                        e Zd ZdZdef fdZe	 	 	 	 ddej                  dz  de	dz  de	dz  de	dz  d	e
e   d
eez  fd       Z xZS )CLIPSegEncoderz
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`CLIPSegEncoderLayer`].

    Args:
        config: CLIPSegConfig
    r[   c                     t         |           || _        t        j                  t        |j                        D cg c]  }t        |       c}      | _        d| _	        y c c}w r   )
rh   ri   r[   r   
ModuleListranger  r   layersgradient_checkpointing)r?   r[   r   r{   s      r%   ri   zCLIPSegEncoder.__init__  sP    mm%PVPhPhJi$jQ%8%@$jk&+# %ks   A#Nr   r   output_hidden_statesreturn_dictr   r   c                 j   ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }|rdnd}|rdnd}|}	t	        | j
                        D ]*  \  }
}|r||	fz   } ||	|fd|i|}|d   }	|s"||d   fz   }, |r||	fz   }t        |	||      S )ad  
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        NrM   r   r   r   )last_hidden_staterP   rQ   )r[   r   r%  use_return_dict	enumerater#  r   )r?   r   r   r   r%  r&  r   encoder_statesall_attentionsrP   idxencoder_layerlayer_outputss                r%   r   zCLIPSegEncoder.forward  s   @ 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]30d%"+DKK"8 	FC#!/=2B!B) #4 	M *!,M !/=3C2E!E	F  +}.>>N+>Vd
 	
r'   )NNNN)rG   rH   rI   rJ   r   ri   r   r"   r   r   r   r   rC   r   r   r   r   s   @r%   r  r    s    ,} ,  /3)-,0#'=
 t+=
  $;	=

 #Tk=
 D[=
 +,=
 
	 =
 =
r'   r  c                        e Zd Zdef fdZe	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dedz  dedz  d	edz  d
e	e
z  fd       Z xZS )CLIPSegTextTransformerr[   c                 
   t         |   |       |j                  }t        |      | _        t        |      | _        t        j                  ||j                        | _
        |j                  | _        | j                          y r   )rh   ri   rj   r   r|   r  encoderr   r   r   final_layer_normeos_token_id	post_initr   s      r%   ri   zCLIPSegTextTransformer.__init__%  sf     &&	/7%f- "YF<Q<Q R #//r'   Nr   r   rc   r   r%  r&  r   c           
         ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }|t	        d      |j                         }|j                  d|d         }| j                  ||      }	t        | j                   |	|t        j                  |	j                  d   |	j                        d       }|j                  dd         | j                  d|	||||dd	|}
|
d
   }| j                  |      }| j                   dk(  rm|t        j                  |j                  d
   |j                        |j#                  t        j$                  |j                        j'                  d      f   }n|t        j                  |j                  d
   |j                        |j#                  t        j$                  |j                        | j                   k(  j%                         j'                  d      f   }|s
||f|
dd  z   S t)        |||
j*                  |
j,                        S )NzYou have to specify input_idsre   )r   rc   r   r   )r[   r   r   cache_positionpast_key_valuesr   T)r   r   r   r%  r&  r   r   rb   )r   r   r   r(  pooler_outputrP   rQ   rM   )r[   r   r%  r)  r   r   r   r|   r
   r"   r#   r   r   popr3  r4  r5  r   r   argmaxr   rP   rQ   )r?   r   r   rc   r   r%  r&  r   input_shaperP   encoder_outputsr(  rU   s                r%   r   zCLIPSegTextTransformer.forward2  sH    2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]<==nn&NN2{27	),W+;;') <<(;(;A(>}G[G[\ 
 	

;%&$,, 
')/!5#
 
 ,A. 112CD! ..44Q7@Q@X@XY5995F5M5MNUUZ\U]_M ..44Q7@Q@X@XY EII6G6N6NOSWSdSddB!M %}58KKK)/')77&11	
 	
r'   NNNNNN)rG   rH   rI   r   ri   r   r"   r   r   rC   r   r   r   r   s   @r%   r1  r1  $  s    0   *..2,0)-,0#'L
<<$&L
 t+L
 llT)	L

  $;L
 #TkL
 D[L
 
+	+L
 L
r'   r1  c                       e Zd ZU eed<   dZddgZdef fdZdej                  fdZ
d Ze	 	 	 	 	 	 dd
ej                  d	z  dej                  d	z  dej                  d	z  ded	z  ded	z  ded	z  deez  fd       Z xZS )CLIPSegTextModelr[   )r  r   r   c                 d    t         |   |       t        |      | _        | j	                          y r   )rh   ri   r1  
text_modelr6  rz   s     r%   ri   zCLIPSegTextModel.__init__  s&     08r'   r   c                 B    | j                   j                  j                  S r   rD  r|   r   rE   s    r%   get_input_embeddingsz%CLIPSegTextModel.get_input_embeddings  s    ))999r'   c                 :    || j                   j                  _        y r   rF  )r?   r   s     r%   set_input_embeddingsz%CLIPSegTextModel.set_input_embeddings  s    5:""2r'   Nr   r   rc   r   r%  r&  c                 0    | j                  ||||||      S )a;  
        Examples:

        ```python
        >>> from transformers import AutoTokenizer, CLIPSegTextModel

        >>> tokenizer = AutoTokenizer.from_pretrained("CIDAS/clipseg-rd64-refined")
        >>> model = CLIPSegTextModel.from_pretrained("CIDAS/clipseg-rd64-refined")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled (EOS token) states
        ```r   r   rc   r   r%  r&  )rD  )r?   r   r   rc   r   r%  r&  r   s           r%   r   zCLIPSegTextModel.forward  s,    4 )%/!5#  
 	
r'   r@  )rG   rH   rI   r   rL   r  _no_split_modulesri   r   ModulerG  rI  r   r"   r   r   rC   r   r   r   r   s   @r%   rB  rB    s     02GH0 :bii :;  *..2,0)-,0#' 
<<$& 
 t+ 
 llT)	 

  $; 
 #Tk 
 D[ 
 
+	+ 
  
r'   rB  c                        e Zd Zdef fdZe	 	 	 	 ddej                  dz  dedz  dedz  dedz  dedz  d	e	e
z  fd
       Z xZS )CLIPSegVisionTransformerr[   c                     t         |           || _        |j                  }t	        |      | _        t        j                  ||j                        | _	        t        |      | _        t        j                  ||j                        | _        y r   )rh   ri   r[   rj   rZ   r|   r   r   r   pre_layrnormr  r3  post_layernormr   s      r%   ri   z!CLIPSegVisionTransformer.__init__  sj    &&	1&9LL8M8MN%f- ll9&:O:OPr'   Nr   r   r%  r&  r   r   c                    ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }| j	                  ||      }| j                  |      }| j                  ||||      }|d   }|d d dd d f   }	| j                  |	      }	|s
||	f|dd  z   S t        ||	|j                  |j                        S )N)r   )r   r   r%  r&  r   r   r:  )r[   r   r%  r)  r|   rQ  r3  rR  r   rP   rQ   )
r?   r   r   r%  r&  r   rP   r?  r(  rU   s
             r%   r   z CLIPSegVisionTransformer.forward  s     2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]Ogh))-8,,'/!5#	 ' 
 ,A.)!Q'2++M:%}58KKK)/')77&11	
 	
r'   )NNNT)rG   rH   rI   r   ri   r   r"   rK   r   rC   r   r   r   r   s   @r%   rO  rO    s    Q2 Q  *.,0#'04$
''$.$
  $;$
 #Tk	$

 D[$
 #'+$
 
+	+$
 $
r'   rO  c                        e Zd ZU eed<   dZdZdef fdZdej                  fdZ
e	 	 	 	 	 ddej                  dz  dedz  d	edz  d
edz  dedz  deez  fd       Z xZS )CLIPSegVisionModelr[   r   )r  c                 d    t         |   |       t        |      | _        | j	                          y r   )rh   ri   rO  vision_modelr6  rz   s     r%   ri   zCLIPSegVisionModel.__init__  s'     4V<r'   r   c                 B    | j                   j                  j                  S r   )rW  r|   rs   rE   s    r%   rG  z'CLIPSegVisionModel.get_input_embeddings  s      ++;;;r'   Nr   r%  r   r&  c                 .    | j                  |||||      S )a+  
        Examples:

        ```python
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import AutoProcessor, CLIPSegVisionModel

        >>> processor = AutoProcessor.from_pretrained("CIDAS/clipseg-rd64-refined")
        >>> model = CLIPSegVisionModel.from_pretrained("CIDAS/clipseg-rd64-refined")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled CLS states
        ```r   r   r%  r   r&  )rW  )r?   r   r   r%  r   r&  r   s          r%   r   zCLIPSegVisionModel.forward  s,    @   %/!5%=# ! 
 	
r'   )NNNTN)rG   rH   rI   r   rL   main_input_namer  ri   r   rM  rG  r   r"   rK   r   rC   r   r   r   r   s   @r%   rU  rU    s    $O!2 <bii <  26)-,004#'%
''$.%
  $;%
 #Tk	%

 #'+%
 D[%
 
+	+%
 %
r'   rU  c                       e Zd ZU eed<   def fdZee	 	 ddej                  dej                  dz  dej                  dz  de
e   deez  f
d	              Zee	 dd
ej                  dede
e   deez  fd              Ze	 	 	 	 	 	 	 	 	 ddej$                  dz  d
ej                  dz  dej                  dz  dej$                  dz  dedz  dedz  dedz  dededz  deez  fd       Z xZS )r  r[   c                    t         |   |       t        |j                  t              s"t        dt        |j                         d      t        |j                  t              s"t        dt        |j                         d      |j                  }|j                  }|j                  |_	        |j                  |_	        |j                  | _
        |j                  | _        |j                  | _        t        |      | _        t!        |      | _        t%        j&                  | j                  | j                  d      | _        t%        j&                  | j                  | j                  d      | _        t%        j,                  t/        j0                  | j2                  j4                              | _        | j9                          y )NzNconfig.text_config is expected to be of type CLIPSegTextConfig but is of type .zRconfig.vision_config is expected to be of type CLIPSegVisionConfig but is of type F)ra   )rh   ri   r  text_configr   	TypeErrortypevision_configr   r   projection_dimrj   r  r  r1  rD  rO  rW  r   r   r  r  rn   r"   tensorr[   logit_scale_init_valuelogit_scaler6  )r?   r[   r_  rb  r{   s       r%   ri   zCLIPSegModel.__init__'  ss    &,,.?@++,-Q0 
 &..0CD--./q2 
 ((,,+1+F+F(-3-H-H*$33)55 - 9 90=4]C!#4+@+@$BUBU\a!b!yy)<)<d>Q>QX]^<<T[[5W5W(XY 	r'   Nr   r   rc   r   r   c                 x     | j                   d|||dd|}|j                  }| j                  |      |_        |S )a  
        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoTokenizer, CLIPSegModel

        >>> tokenizer = AutoTokenizer.from_pretrained("CIDAS/clipseg-rd64-refined")
        >>> model = CLIPSegModel.from_pretrained("CIDAS/clipseg-rd64-refined")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
        >>> with torch.inference_mode():
        ...     text_features = model.get_text_features(**inputs)
        ```T)r   r   rc   r&  rM   )rD  r;  r  )r?   r   r   rc   r   text_outputsrU   s          r%   get_text_featureszCLIPSegModel.get_text_featuresK  sV    . 4C4?? 4
)%	4

 4
 %22%)%9%9-%H"r'   r   r   c                 v     | j                   d||dd|}|j                  }| j                  |      |_        |S )a  
        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, CLIPSegModel
        >>> from transformers.image_utils import load_image

        >>> processor = AutoProcessor.from_pretrained("CIDAS/clipseg-rd64-refined")
        >>> model = CLIPSegModel.from_pretrained("CIDAS/clipseg-rd64-refined")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = load_image(url)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> with torch.inference_mode():
        ...     image_features = model.get_image_features(**inputs)
        ```T)r   r   r&  rM   )rW  r;  r  )r?   r   r   r   vision_outputsrU   s         r%   get_image_featureszCLIPSegModel.get_image_featuresn  sU    6 6GT5F5F 6
%%=6
 	6
 '44'+'='=m'L$r'   return_lossr   r%  r&  c
           	         ||n| j                   j                  }||n| j                   j                  }|	|	n| j                   j                  }	| j	                  |||||	      }| j                  ||||||	      }|d   }| j                  |      }|d   }| j                  |      }||j                  ddd      z  }||j                  ddd      z  }| j                  j                         }t        j                  ||j                               |z  }|j                         }d}|rt        |      }|	s||||||f}||f|z   S |S t        |||||||	      S )
a  
        return_loss (`bool`, *optional*):
            Whether or not to return the contrastive loss.

        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, CLIPSegModel
        >>> from transformers.image_utils import load_image

        >>> processor = AutoProcessor.from_pretrained("CIDAS/clipseg-rd64-refined")
        >>> model = CLIPSegModel.from_pretrained("CIDAS/clipseg-rd64-refined")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = load_image(url)

        >>> inputs = processor(
        ...     text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True
        ... )

        >>> with torch.inference_mode():
        ...     outputs = model(**inputs)
        >>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
        >>> probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
        ```NrZ  rK  r   rb   re   T)r   r   keepdim)r0   r1   r2   r3   r4   r5   r6   )r[   r   r%  r)  rW  rD  r  r  normrf  expr"   r   r*   r-   r/   )r?   r   r   r   rc   rm  r   r%  r   r&  r   rk  rh  r4   r3   rf  r2   r1   r0   outputs                       r%   r   zCLIPSegModel.forward  s   R 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]**%/!5%=# + 
 )%/!5# ' 
 &a(--l;"1o**;7 $l&7&7!T&7&RR!K$4$4qb$$4$OO &&**,,,{LNN4DES*,,.0D&lT`bpqF)-)9TGf$EvE-+#%* .
 	
r'   )NNr   )	NNNNNNNTN)rG   rH   rI   r   rL   ri   r   r   r"   r   r   r   rC   r   ri  rK   r   rl  r   r/   r   r   r   s   @r%   r  r  #  s   "} "H  /3,0	<< t+ llT)	
 +, 
+	+  B  *."''" #'" +,	"
 
+	+"  "H  .215.204#')-,0)-#'^
##d*^
 ''$.^
 t+	^

 &&-^
 D[^
  $;^
 #Tk^
 #'^
 D[^
 
	^
 ^
r'   r  c                        e Zd ZdZdef fdZ	 ddej                  dej                  dej                  dedz  d	e	ej                     f
d
Z xZS )CLIPSegDecoderLayerz
    CLIPSeg decoder layer, which is identical to `CLIPSegEncoderLayer`, except that normalization is applied after
    self-attention/MLP, rather than before.
    r[   c                 D   t         |           |j                  | _        t	        |      | _        t        j                  | j                  |j                        | _	        t        |      | _        t        j                  | j                  |j                        | _        y r   r   rz   s     r%   ri   zCLIPSegDecoderLayer.__init__  r   r'   rP   r   causal_attention_maskr   Nr   c                     |}| j                  ||||      \  }}||z   }| j                  |      }|}| j                  |      }||z   }| j                  |      }|f}|r||fz  }|S )r   )rP   r   rv  r   )r   r   r   r   )r?   rP   r   rv  r   r   r   r   s           r%   r   zCLIPSegDecoderLayer.forward  s    " !&*nn')"7/	 '5 '
#| !=0((7 / =0((7 "&Gr'   r   )rG   rH   rI   rJ   r   ri   r"   r   r   rC   rK   r   r   r   s   @r%   rt  rt    sk    S} S */'||' '  %||	'
  $;' 
u  	!'r'   rt  c                        e Zd Zdef fdZ	 	 	 d
deej                     dej                  dedz  dedz  dedz  f
d	Z	 xZ
S )CLIPSegDecoderr[   c                 &   t         |   |       |j                  | _        t        j                  |j
                  |j                        | _        t        j                  |j
                  |j                        | _        |j                  r|j                  j                  dz  |j                  j                  dz  f}t        j                  t        j                  |j                  |j                  dd      t        j                         t        j                  |j                  |j                  dz  |d   |d         t        j                         t        j                  |j                  dz  d|d   |d               | _        nPt        j                  |j                  d|j                  j                  |j                  j                        | _        t#        |j$                        }t        j&                  t)        |      D cg c]6  }t        j                  |j                  j*                  |j                        8 c}      | _        t/        j0                  |j                        }|j                  |_        |j2                  |_        |j6                  |_        d	|_        t        j&                  t)        t#        |j$                              D cg c]  }t=        |       c}      | _        | jA                          y c c}w c c}w )
N   r   r   )r_   paddingrb   r   )r_   r`   )r`   relu)!rh   ri   conditional_layerr   r   rc  
reduce_dimfilm_mulfilm_add"use_complex_transposed_convolutionrb  rm   
Sequentialrq   ReLUConvTranspose2dtransposed_convolutionr$   extract_layersr!  r"  rj   reducescopydeepcopydecoder_num_attention_headsr   decoder_intermediate_sizer   r   rt  r#  r6  )r?   r[   transposed_kernelsdepthr   decoder_configr{   s         r%   ri   zCLIPSegDecoder.__init__0  s[    !'!9!9		&"7"79J9JK		&"7"79J9JK44"("6"6"A"AQ"FH\H\HgHgklHl!m*,--		&++V->->AWXY	""%%%%* 21 5-a0	 	""%%*A;Ma;PYklmYn+D' +-*<*<!!1f&:&:&E&EfNbNbNmNm+D' F))*}}UZ[`UabPQRYYv++779J9JKb
 v';';<%+%6%6"-3-O-O*+1+K+K($*!mmRWX[\b\q\qXrRs$tQ%8%H$tu c %us   ;L	LNrP   rT   r   r%  r&  c                 @   |rdnd }|rdnd }|d d d   }	d }
t        t        |	| j                  | j                              D ]  \  }\  }}}|
 ||      |
z   }
n ||      }
|| j                  k(  rJ| j                  |      |
j                  ddd      z  | j                  |      z   }
|
j                  ddd      }
 ||
d d |      }|d   }
|r||
fz  }|s||d   fz  } |
d d dd d d f   j                  ddd      }
t        t        j                  |
j                  d               }|j                  d   }|
j                  ||
j                  d   ||      }
| j                  |
      j                  d      }|st        d |||fD              S t!        |||      S )	NrM   re   r   r   rb   )r   rv  r   c              3   &   K   | ]	  }||  y wr   rM   )r=   vs     r%   r@   z)CLIPSegDecoder.forward.<locals>.<genexpr>  s     aqSTS`as   )r   rP   rQ   )r*  zipr#  r  r~  r  r   r  r   mathsqrtr   r   r  squeezerC   rO   )r?   rP   rT   r   r%  r&  r   all_hidden_statesr,  activationsrr  i
activationlayerreducer/  r   r   r   s                      r%   r   zCLIPSegDecoder.forward\  s    #7BD0d#DbD).7KVZVbVb8c.d 	6*A*
E6!
+f4
+D***'=>PQSTVWAXX[_[h[h*\   1a0!t4[lM #1%F#!fY.! =#3"55-	60 12q!))!Q2499V\\!_-.+11!4
Za$E,,V4<<Q?aV->$Oaaa#+%
 	
r'   )NNT)rG   rH   rI   r   ri   rC   r"   r   r   r   r   r   s   @r%   ry  ry  /  si    *} *` *.,0#'7
U\\*7
 !&7
  $;	7

 #Tk7
 D[7
r'   ry  zn
    CLIPSeg model with a Transformer-based decoder on top for zero-shot and one-shot image segmentation.
    )custom_introc                       e Zd ZU eed<   def fdZ	 	 	 	 	 ddedz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  f
d	Z	e
	 	 	 	 	 	 	 	 	 	 	 ddej                  dz  d
ej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dedz  dedz  dededz  deez  fd       Z xZS )CLIPSegForImageSegmentationr[   c                     t         |   |       || _        t        |      | _        |j
                  | _        t        |      | _        | j                          y r   )	rh   ri   r[   r  r  r  ry  decoderr6  rz   s     r%   ri   z$CLIPSegForImageSegmentation.__init__  sI      (	$33%f- 	r'   Nr   r   r   rc   conditional_pixel_valuesc                    |`t        |      |k7  rt        d      t        j                         5  | j                  j                  |||      j                  }d d d        |S |]t        |      |k7  rt        d      t        j                         5  | j                  j                  |      j                  }d d d        |S t        d      # 1 sw Y   S xY w# 1 sw Y   S xY w)Nz@Make sure to pass as many prompt texts as there are query images)r   rc   zAMake sure to pass as many prompt images as there are query imagesz[Invalid conditional, should be either provided as `input_ids` or `conditional_pixel_values`)r$   r   r"   r  r  ri  r;  rl  )r?   r   r   r   rc   r  rT   s          r%   get_conditional_embeddingsz6CLIPSegForImageSegmentation.get_conditional_embeddings  s      9~+ !cdd  )-)D)Dn< *E *- '  &% &1+,
: !dee n)-)E)EF^)_)m)m&n &%	 m   &%n &%s   )C&CCC$r   rT   labelsr   r%  r   r&  r   c                 "   ||n| j                   j                  }t        j                         5  | j                  j                  ||d|
|      }| j                  j                  |d         }|r|j                  n|d   }| j                  D cg c]
  }||dz       }}|r<t        |j                  |j                  |	r|j                  nd|j                        }n|	s|dd |dd z   n|}ddd       |$| j                  |j                  d   ||||	      }n[|j                  d   |j                  d   k7  rt        d
      |j                  d   | j                   j                   k7  rt        d      | j#                  |||	|      }|r|j$                  n|d   }d}|8|j'                  |j(                        }t+        j,                         } |||      }|s|||f}||f|z   S |S t/        ||||      S c c}w # 1 sw Y   xY w)a~  
        conditional_pixel_values (`torch.FloatTensor`, *optional*):
            The pixel values of the conditional images.
        conditional_embeddings (`torch.FloatTensor` of shape `(batch_size, config.projection_dim)`, *optional*):
            The conditional embeddings for the query images. If provided, the model will use this instead of computing
            the embeddings from the conditional_pixel_values.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, CLIPSegForImageSegmentation
        >>> from transformers.image_utils import load_image

        >>> processor = AutoProcessor.from_pretrained("CIDAS/clipseg-rd64-refined")
        >>> model = CLIPSegForImageSegmentation.from_pretrained("CIDAS/clipseg-rd64-refined")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = load_image(url)

        >>> texts = ["a cat", "a remote", "a blanket"]
        >>> inputs = processor(text=texts, images=[image] * len(texts), padding=True, return_tensors="pt")

        >>> with torch.inference_mode():
        ...     outputs = model(**inputs)

        >>> logits = outputs.logits
        >>> print(logits.shape)
        torch.Size([3, 352, 352])
        ```NTrZ  r   rb   r:  r   r   )r   r   r   rc   r  zWMake sure to pass as many conditional embeddings as there are query images in the batchzcMake sure that the feature dimension of the conditional embeddings matches `config.projection_dim`.)r   r%  r&  )r0   r   rT   rU   r6   rV   )r[   r)  r"   r  r  rW  r  rP   r  r   r(  r;  rQ   r  r   r   rc  r  r   r   r   r   BCEWithLogitsLossrS   )r?   r   r   r  rT   r   rc   r  r   r%  r   r&  r   rk  rU   rP   r  r  decoder_outputsr   r0   loss_fnrr  s                          r%   r   z#CLIPSegForImageSegmentation.forward  sn   d &1%<k$++B]B] ]]_ 	!YY33)"3%))A' 4 N !II77q8IJM<GN88^\]M^M9=9L9LMA=Q/MKM !;&4&F&F"0">">BV.">">\`-88	" DXN2A&);;]k /	8 ")%)%D%D'--a0#-))A &E &" &++A.,2D2DQ2GG m  &++A.$++2L2LL 0  ,,"/!5# ' 
 ,7''OA<NYYv}}-F**,G66*D4m^UdeF)-)9TGf$EvE-#9' .*
 	
q N	 	s   A HG?AH?HH)NNNNN)NNNNNNNNNTN)rG   rH   rI   r   rL   ri   r   r"   r   r  r   rK   r   r   rC   r/   r   r   r   s   @r%   r  r    s    }  "&)-.2,08<&$J& <<$&& t+	&
 llT)& #(,,"5&:  /315=A;?.204*.)-,0)-#'
$$t+
 ''$.
 #("3"3d":	

 !& 1 1D 8
 t+
 &&-
   4'
  $;
 #Tk
 #'
 D[
 
	
 
r'   r  )r  r   rB  rU  r  )r   )CrJ   r  r  collections.abcr   dataclassesr   typingr   r"   r    r   r	  r  r	   masking_utilsr
   modeling_layersr   modeling_outputsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r   r   configuration_clipsegr   r   r   
get_loggerrG   loggerr   r&   r-   r/   rO   rS   rM  rZ   r   floatr   r   r   r   r   r  r1  rB  rO  rU  r  rt  ry  r  __all__rM   r'   r%   <module>r     s      $ !    & ! / 9 K F & j j X X 
		H	%
`U\\ `ell `
-U\\ -ell -  
K  
   
F 7; 7  7 
[ 
  
<Pbii Ph%BII %` %II%<<% 
% <<	%
 LL4'% % %.>)ryy >)D  /4 /d 0%_ 0% 0%hM
RYY M
`[
3 [
|3
- 3
l1
ryy 1
h4
/ 4
n O
) O
 O
d6")) 6rd
+ d
N 
m
"8 m

m
`r'   