
    qi              	          d Z ddlZddlmZ ddlZddlmZ ddlmZ	 ddl
mZ ddlmZ dd	lmZ dd
lmZ ddlmZmZmZmZ ddlmZ  ej4                  e      Ze ed       G d de                    Ze ed       G d de                    Z G d dej>                        Z  G d dej>                        Z! G d dej>                        Z" G d dej>                        Z#d>dejH                  de%de&d ejH                  fd!Z' G d" d#ej>                        Z( G d$ d%e      Z) G d& d'ej>                        Z* G d( d)ejV                        Z, G d* d+ej>                        Z- G d, d-ej>                        Z.e G d. d/e             Z/e G d0 d1e/             Z0d2ejH                  d3e1d ejH                  fd4Z2d2ejH                  d5e1d6e1d ejH                  fd7Z3 G d8 d9ej>                        Z4 ed:       G d; d<e/             Z5g d=Z6y)?zPyTorch SegGpt model.    N)	dataclass)nn)
functional   )initialization)ACT2FN)GradientCheckpointingLayer)PreTrainedModel)ModelOutputauto_docstringlogging	torch_int   )SegGptConfigz1
    Output type of [`SegGptEncoderOutput`].
    )custom_introc                       e Zd ZU dZej
                  ed<   dZeej
                     dz  ed<   dZ	eej
                     dz  ed<   dZ
eej
                     dz  ed<   y)SegGptEncoderOutputay  
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, patch_height, patch_width, hidden_size)`):
        Sequence of hidden-states at the output of the last layer of the model.
    hidden_states (`tuple[torch.FloatTensor]`, `optional`, returned when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
        of shape `(batch_size, patch_height, patch_width, hidden_size)`.
    attentions (`tuple[torch.FloatTensor]`, `optional`, returned when `config.output_attentions=True`):
        Tuple of *torch.FloatTensor* (one for each layer) of shape
        `(batch_size, num_heads, seq_len, seq_len)`.
    intermediate_hidden_states (`tuple[torch.FloatTensor]`, *optional*, returned when `config.intermediate_hidden_state_indices` is set):
        Tuple of `torch.FloatTensor` of shape `(batch_size, patch_height, patch_width, hidden_size)`.
        Each element in the Tuple corresponds to the output of the layer specified in `config.intermediate_hidden_state_indices`.
        Additionally, each feature passes through a LayerNorm.
    last_hidden_stateNhidden_states
attentionsintermediate_hidden_states)__name__
__module____qualname____doc__torchFloatTensor__annotations__r   tupler   r        \/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/seggpt/modeling_seggpt.pyr   r   "   sd     (((59M5**+d2926Je''(4/6BFe&7&7 84 ?Fr!   r   z;
    Output type of [`SegGptImageSegmentationOutput`].
    c                       e Zd ZU dZdZej                  dz  ed<   dZej                  dz  ed<   dZ	e
ej                     dz  ed<   dZe
ej                     dz  ed<   y)SegGptImageSegmentationOutputa  
    loss (`torch.FloatTensor`, *optional*, returned when `labels` is provided):
        The loss value.
    pred_masks (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
        The predicted masks.
    hidden_states (`tuple[torch.FloatTensor]`, `optional`, returned when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
        of shape `(batch_size, patch_height, patch_width, hidden_size)`.
    attentions (`tuple[torch.FloatTensor]`, `optional`, returned when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape
        `(batch_size, num_heads, seq_len, seq_len)`.
    Nloss
pred_masksr   r   )r   r   r   r   r%   r   r   r   r&   r   r   r   r    r!   r"   r$   r$   >   sg     &*D%

d
")+/J!!D(/59M5**+d2926Je''(4/6r!   r$   c                   (     e Zd ZdZ fdZd Z xZS )SegGptPatchEmbeddingsz
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
    c                    t         |           |j                  |j                  }}|j                  |j
                  }}t        |t        j                  j                        r|n||f}t        |t        j                  j                        r|n||f}|d   |d   z  |d   |d   z  z  }|| _        || _        || _        || _
        t        j                  ||||      | _        y )Nr   r   )kernel_sizestride)super__init__
image_size
patch_sizenum_channelshidden_size
isinstancecollectionsabcIterablenum_patchesr   Conv2d
projection)selfconfigr.   r/   r0   r1   r6   	__class__s          r"   r-   zSegGptPatchEmbeddings.__init__`   s    !'!2!2F4E4EJ
$*$7$79K9Kk#-j+//:R:R#SZZdfpYq
#-j+//:R:R#SZZdfpYq
!!}
15*Q-:VW=:XY$$(&))L+:^hir!   c                 N   |j                   \  }}}}|| j                  k7  rt        d      || j                  d   k7  s|| j                  d   k7  r2t        d| d| d| j                  d    d| j                  d    d	      | j	                  |      j                  ddd	d      }|S )
NzeMake sure that the channel dimension of the pixel values match with the one set in the configuration.r   r   zInput image size (*z) doesn't match model ().   r   )shaper0   
ValueErrorr.   r8   permute)r9   pixel_values
batch_sizer0   heightwidth
embeddingss          r"   forwardzSegGptPatchEmbeddings.forwardn   s    2>2D2D/
L&%4,,,w  T__Q''5DOOA4F+F$VHAeW4KDOO\]L^K__`aeapapqras`ttvw  __\2::1aAF
r!   )r   r   r   r   r-   rH   __classcell__r;   s   @r"   r(   r(   Y   s    jr!   r(   c                        e Zd ZdZdeddf fdZdededej                  fdZ		 	 dd	ej                  d
ej                  dej                  dz  dedz  dej                  f
dZ xZS )SegGptEmbeddingszX
    Construct the embeddings from patch, position embeddings for input and prompt.
    r:   returnNc                 ~   t         |           t        j                  t	        j
                  ddd|j                              | _        t        j                  t	        j
                  ddd|j                              | _        t        j                  t	        j
                  ddd|j                              | _	        t        j                  t	        j
                  ddd|j                              | _
        t        j                  t	        j
                  ddd|j                              | _        t        |      | _        |j                  |j                  z  dz  dz   }t        j                  t	        j                   d||j                              | _        t        j$                  |j&                        | _        y )Nr   r?   )r,   r-   r   	Parameterr   zerosr1   
mask_tokensegment_token_inputsegment_token_prompttype_token_semantictype_token_instancer(   patch_embeddingspretrain_image_sizer/   randnposition_embeddingsDropouthidden_dropout_probdropout)r9   r:   num_positionsr;   s      r"   r-   zSegGptEmbeddings.__init__   s3   ,,u{{1aF<N<N'OP#%<<Aq!VEWEW0X#Y $&LLQ1fFXFX1Y$Z!#%<<Aq!VEWEW0X#Y #%<<Aq!VEWEW0X#Y  5f =33v7H7HHQNQRR#%<<A}fN`N`0a#b zz&"<"<=r!   rE   rF   c                    | j                   d d dd f   }|j                  d   }t        |dz        }t        j                  j                         s
||k7  s||k7  rSt        j                  |j                  d||d      j                  dddd      ||fdd	      }|j                  dddd      S |j                  d||d      S )
Nr         ?r   r   r?   bicubicF)sizemodealign_corners)
rY   r@   r   r   jit
is_tracingFinterpolatereshaperB   )r9   rE   rF   patch_pos_embedr6   pretrain_patch_sizes         r"   interpolate_pos_encodingz)SegGptEmbeddings.interpolate_pos_encoding   s    221ab59%++A.'S(89 99!%8F%BFY]bFbmm''+>@SUWX``abdeghjkle_#	O #**1aA66"**1feR@@r!   rC   prompt_pixel_valuesbool_masked_posembedding_typec                 R   | j                  |      }| j                  |      }|j                  \  }}}	}
| j                  j                  |||	d      }|j	                  d      j                  |      j                  d||	d      }|d|z
  z  ||z  z   }||nd}| j                  ||	      }|| j                  z   }|| j                  z   }||z   }||z   }|dk(  r| j                  }n |dk(  r| j                  }nt        d|       ||z   }||z   }t        j                  ||fd      }|S )Nr`   r   instancesemanticzBEmbedding type should be either 'semantic' or 'instance', but got r   dim)rV   r@   rQ   expand	unsqueezetype_asri   rl   rR   rS   rT   rU   rA   r   cat)r9   rC   rm   rn   ro   input_embeddingsprompt_embeddingsrD   patch_heightpatch_width_rQ   w	pos_embedtype_embeddingrG   s                   r"   rH   zSegGptEmbeddings.forward   sh     00> 112EF3C3I3I0
L+q__++JkSUV
%%b)11*=EEb,Xcefg-Q7*q.H+9+E: 11,L	 ,d.F.FF-0I0II ,i7-	9 Z'!55Nz)!55Nabpaqrss+n<->YY 02CD!L
r!   )NN)r   r   r   r   r   r-   intr   Tensorrl   
BoolTensorstrrH   rI   rJ   s   @r"   rL   rL   |   s    >| > > As A3 A5<< A, 48%)+ll+ #\\+ ))D0	+
 d
+ 
+r!   rL   c                   8    e Zd ZdZ fdZdededej                  dej                  fdZdej                  d	ej                  d
ej                  dej                  de	eef   de	eef   dej                  fdZ
ddej                  dej                  fdZ xZS )SegGptAttentionz=Multi-head Attention block with relative position embeddings.c                    t         |           |j                  |j                  }}t	        |t
        j                  j                        r|n||f}t	        |t
        j                  j                        r|n||f}|d   |j                  z  |d   |j                  z  f}|j                  |j                  z  }|j                  | _	        |dz  | _
        t        j                  |j                  |j                  dz  |j                        | _        t        j                  |j                  |j                        | _        |j                   | _        | j                   r||t#        d      t        j$                  t'        j(                  d|d   z  dz
  |            | _        t        j$                  t'        j(                  d|d   z  dz
  |            | _        y y )Nr   r   g      r   biaszBInput size must be provided if using relative positional encoding.r?   )r,   r-   r.   r/   r2   r3   r4   r5   r1   num_attention_headsscaler   Linearqkv_biasqkvproj use_relative_position_embeddingsrA   rO   r   rP   	rel_pos_h	rel_pos_w)r9   r:   r.   r/   
input_sizehead_dimr;   s         r"   r-   zSegGptAttention.__init__   s   !'!2!2F4E4EJ
#-j+//:R:R#SZZdfpYq
#-j+//:R:R#SZZdfpYq
 mv'8'88*Q-6K\K\:\]
%%)C)CC#)#=#= t^
99V//1C1Ca1Gfoo^IIf00&2D2DE	060W0W-00! !eff  \\%++a*Q-6G!6KX*VWDN\\%++a*Q-6G!6KX*VWDN 1r!   q_sizek_sizerel_posrM   c                    t        dt        ||      z  dz
        }t        j                  |j	                  d|j
                  d   d      j                  ddd      |d      }|j	                  d|      j                  dd      }t        j                  |      dddf   t        ||z  d      z  }t        j                  |      dddf   t        ||z  d      z  }||z
  |dz
  t        ||z  d      z  z   }||j                            S )	a  
        Get relative positional embeddings according to the relative positions of
            query and key sizes.

        Args:
            q_size (int):
                size of the query.
            k_size (int):
                size of key k.
            rel_pos (`torch.Tensor`):
                relative position embeddings (L, channel).

        Returns:
            Extracted positional embeddings according to relative positions.
        r?   r   r   r`   linear)rb   rc   Ng      ?)
r   maxrg   rh   ri   r@   rB   r   arangelong)	r9   r   r   r   max_rel_distrel_pos_resizedq_coordsk_coordsrelative_coordss	            r"   get_rel_poszSegGptAttention.get_rel_pos   s     1s6622Q67--OOAw}}Q/4<<Q1E

 *11"lCKKAqQ <<'403v3LL<<'a03v3LL#h.6A:Vf_VYAZ2ZZ33566r!   attnqueryr   r   c                    |\  }}|\  }	}
| j                  ||	|      }| j                  ||
|      }|j                  \  }}}|j                  ||||      }t        j                  d||      }t        j                  d||      }|j                  ||||	|
      }||dddddddddf   z   |dddddddddf   z   }|j                  |||z  |	|
z        }|S )a  
        Calculate decomposed Relative Positional Embeddings from :paper:`mvitv2`.
        https://github.com/facebookresearch/mvit/blob/19786631e330df9f3622e5402b4a419a263a2c80/mvit/models/attention.py

        Args:
            attn (`torch.Tensor`):
                attention map.
            query (`torch.Tensor`):
                query q in the attention layer with shape (batch_size, query_height * query_width, channel).
            rel_pos_h (`torch.Tensor`):
                relative position embeddings (Lh, channel) for height axis.
            rel_pos_w (`torch.Tensor`):
                relative position embeddings (Lw, channel) for width axis.
            q_size (tuple):
                spatial sequence size of query q with (query_height, query_width).
            k_size (tuple):
                spatial sequence size of key k with (key_height, key_width).

        Returns:
            attn (`torch.Tensor`):
                attention map with added relative positional embeddings.
        zbhwc,hkc->bhwkzbhwc,wkc->bhwkN)r   r@   ri   r   einsum)r9   r   r   r   r   r   r   query_heightquery_width
key_height	key_widthrelative_position_heightrelative_position_widthrD   r}   rt   reshaped_queryrel_hrel_ws                      r"   add_decomposed_rel_posz&SegGptAttention.add_decomposed_rel_pos  s    > %+!k &
I#'#3#3L*i#X "&"2"2;	9"U"[[
Asz<cR-~?WX-~?VW||Jk:yYeAq!Q,--aAtQ6F0GG||J{(BJQZDZ[r!   r   c           	         |j                   \  }}}}| j                  |      j                  |||z  d| j                  d      j	                  ddddd      }|j                  d|| j                  z  ||z  d      j                  d      \  }}	}
|| j                  z  |	j                  dd      z  }| j                  r.| j                  ||| j                  | j                  ||f||f      }t        j                  j                  j                  |t        j                   d      j#                  |j$                        }|rE|j'                  || j                  ||z  d      }|j'                  || j                  z  ||z  d      }nd }||
z  j                  || j                  ||d      }|j	                  ddddd      j                  |||d      }| j)                  |      }||fS )	Nr   r`   r?   r   r      )dtypert   )r@   r   ri   r   rB   unbindr   	transposer   r   r   r   r   r   r   softmaxfloat32tor   viewr   )r9   r   output_attentionsrD   rE   rF   r}   r   r   keyvalueattn_weightsattn_weights_reshapedattn_outputs                 r"   rH   zSegGptAttention.forward9  s   '4':':$
FE1 HH]#WZ%D4L4LbQWQ1a# 	  KK:8P8P+PRX[`R`bdellmnosE

*cmmB.CC0066eT^^T^^fe_W]_dVeL xx**22<u}}Z\2]``afalalm
 %1$5$5j$BZBZ\bej\jln$o!055j4C[C[6[]cfk]kmopL$(!#e+44ZAYAY[achjlm!))!Q1a8@@VUZ\^_ii,233r!   )F)r   r   r   r   r-   r   r   r   r   r   r   rH   rI   rJ   s   @r"   r   r      s    GX07# 7s 7U\\ 7ell 7@+ll+ ||+ <<	+
 <<+ c3h+ c3h+ 
+Z#4U\\ #4u|| #4r!   r   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )	SegGptMlpc                    t         |           t        j                  |j                  |j
                        | _        t        j                  |j
                  |j                        | _        t        |j                     | _
        y N)r,   r-   r   r   r1   mlp_dimlin1lin2r   
hidden_actactr9   r:   r;   s     r"   r-   zSegGptMlp.__init__a  sX    IIf00&..A	IIfnnf.@.@A	&++,r!   r   rM   c                 l    | j                  |      }| j                  |      }| j                  |      }|S r   )r   r   r   r9   r   s     r"   rH   zSegGptMlp.forwardg  s2    		-0/		-0r!   )r   r   r   r-   r   r   rH   rI   rJ   s   @r"   r   r   `  s#    -U\\ ell r!   r   input	drop_probtrainingrM   c                    |dk(  s|s| S d|z
  }| j                   d   fd| j                  dz
  z  z   }|t        j                  || j                  | j
                        z   }|j                          | j                  |      |z  }|S )zc
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

            r   r   )r   r   device)r@   ndimr   randr   r   floor_div)r   r   r   	keep_probr@   random_tensoroutputs          r"   	drop_pathr   o  s    
 CxII[[^

Q 77E

5ELL YYMYYy!M1FMr!   c                   x     e Zd ZdZd	dedz  ddf fdZdej                  dej                  fdZde	fdZ
 xZS )
SegGptDropPathzXDrop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).Nr   rM   c                 0    t         |           || _        y r   )r,   r-   r   )r9   r   r;   s     r"   r-   zSegGptDropPath.__init__  s    "r!   r   c                 D    t        || j                  | j                        S r   )r   r   r   r   s     r"   rH   zSegGptDropPath.forward  s    FFr!   c                      d| j                    S )Nzp=)r   r9   s    r"   
extra_reprzSegGptDropPath.extra_repr  s    DNN#$$r!   r   )r   r   r   r   floatr-   r   r   rH   r   r   rI   rJ   s   @r"   r   r     sG    b#%$, #$ #GU\\ Gell G%C %r!   r   c                        e Zd Zdededdf fdZ	 	 ddej                  dede	d	e	de
ej                  ej                  f   e
ej                     z  f
d
Z xZS )SegGptLayerr:   drop_path_raterM   Nc                 t   t         |           t        |      | _        t	        |      | _        |dkD  rt        |      nt        j                         | _	        t        j                  |j                  |j                        | _        t        j                  |j                  |j                        | _        y )Nr   eps)r,   r-   r   	attentionr   mlpr   r   Identityr   	LayerNormr1   layer_norm_epslayernorm_beforelayernorm_after)r9   r:   r   r;   s      r"   r-   zSegGptLayer.__init__  s    (0V$;IC;O7UWU`U`Ub "V-?-?VEZEZ [!||F,>,>FDYDYZr!   r   ensemble_condfeature_ensembler   c                    | j                  | j                  |      |      }|d   }|dd  }|r|j                  d   dz  |k\  r|j                  |j                  d   dz  d      \  }}	|dk(  ra|j                  d   dz  }
|	j	                  d|
d      }	|	j                  dd      j                  |	      }	 |	j                  |j                   }	n"|	j                  dd      j                  |	      }	t        j                  ||	gd      }| j                  |      |z   }|}| j                  |      }| j                  |      }|| j                  |      z   }|f|z   }|S )	N)r   r   r   r?   rs   r`   T)rt   keepdim)r   r   r@   splitri   mean	expand_asr   rx   r   r   r   )r9   r   r   r   r   self_attention_outputsattention_outputoutputspromptinputsnum_promptsresiduals               r"   rH   zSegGptLayer.forward  sz    "&!!-0/ "0 "
 2!4(, 0 6 6q 9Q >- O-334D4J4J14MQR4RXY3ZNFF!.44Q71<;;D9CCFK'6D9CCFK$yy&&)9qA '78=H ,,];/ 4>>-#@@ "W,r!   )FF)r   r   r   r   r   r-   r   r   r   boolr   rH   rI   rJ   s   @r"   r   r     s    [| [U [t [ "'"'#||# # 	#
  # 
u||U\\)	*U5<<-@	@#r!   r   c                   l     e Zd Zdeddf fdZ	 	 	 	 ddej                  dededed	edee	z  fd
Z
 xZS )SegGptEncoderr:   rM   Nc           
         t         |           || _        t        j                  d|j
                  |j                  d      D cg c]  }|j                          }}t        j                  t        |j                        D cg c]  }t        |||          c}      | _        t        j                  |j                  |j                        | _        d| _        y c c}w c c}w )Nr   cpu)r   r   F)r,   r-   r:   r   linspacer   num_hidden_layersitemr   
ModuleListranger   layersr   r1   r   	layernormgradient_checkpointing)r9   r:   xdprir;   s        r"   r-   zSegGptEncoder.__init__  s    !&63H3H&JbJbkp!qrAqvvxrrmm%PVPhPhJi$jQ[Q%@$jkf&8&8f>S>ST&+# s$js   CC$r   r   r   output_hidden_statesreturn_dictc                 6   |rdnd }|rdnd }g }t        | j                        D ]  \  }	}
|r||fz   }| j                  j                  |	kD  rdnd} |
||||      }|d   }|	| j                  j                  k(  r.|d |j                  d   dz   ||j                  d   dz  d  z   dz  }|	| j                  j
                  v r |j                  | j                  |             |s||d   fz   } |r||fz   }|st        d ||||fD              S t        ||||      S )Nr    r?   r   r   r_   c              3   $   K   | ]  }|| 
 y wr   r    ).0vs     r"   	<genexpr>z(SegGptEncoder.forward.<locals>.<genexpr>  s      = s   )r   r   r   r   )
	enumerater	  r:   merge_indexr@   !intermediate_hidden_state_indicesappendr
  r   r   )r9   r   r   r   r  r  all_hidden_statesall_self_attentionsr   r  layer_moduler   layer_outputss                r"   rH   zSegGptEncoder.forward  sr    #7BD$5b4%'"(5 	POA|#$58H$H! "&!8!81!<A!M(GWYjkM)!,MDKK+++!"?M$7$7$:a$?@=Q^QdQdefQgklQlQnCoo! DKKAAA*11$..2OP &9]1=M<O&O#)	P,   1]4D D '):<OQkl  
 #++*'A	
 	
r!   )FFFT)r   r   r   r   r-   r   r   r   r   r   rH   rI   rJ   s   @r"   r  r    sm    ,| , , "'"'%* 0
||0
 0
  	0

 #0
 0
 
$	$0
r!   r  c                   f     e Zd ZdZddd fd
Zdej                  dej                  f fdZ xZS )	SegGptLayerNormaA  LayerNorm that supports two data formats: channels_last (default) or channels_first.
    The ordering of the dimensions in the inputs. channels_last corresponds to inputs with shape (batch_size, height,
    width, channels) while channels_first corresponds to inputs with shape (batch_size, channels, height, width).
    gư>channels_last)r   data_formatc                \    t        |   |fd|i| |dvrt        d|       || _        y )Nr   )r   channels_firstzUnsupported data format: )r,   r-   NotImplementedErrorr!  )r9   normalized_shaper   r!  kwargsr;   s        r"   r-   zSegGptLayerNorm.__init__  s?    )=s=f=AA%(A+&OPP&r!   featuresrM   c                     | j                   dk(  r9|j                  dddd      }t        |   |      }|j                  dddd      }|S t        |   |      }|S )z
        Args:
            features: Tensor of shape (batch_size, channels, height, width) OR (batch_size, height, width, channels)
        r#  r   r?   r   r   )r!  rB   r,   rH   )r9   r'  r;   s     r"   rH   zSegGptLayerNorm.forward  sj    
 //''1a3Hwx0H''1a3H  wx0Hr!   )	r   r   r   r   r-   r   r   rH   rI   rJ   s   @r"   r  r    s4    
 15/ '   r!   r  c                   >     e Zd Z fdZdej
                  fdZ xZS )SegGptDecoderHeadc                 T   t         |           t        j                  |j                  |j                  dd      | _        t        |j                  |j                  d      | _        t        |j                     | _        t        j                  |j                  ddd      | _        y )Nr   r   )r*   paddingr#  )r%  r   r!  T)r*   r   )r,   r-   r   r7   decoder_hidden_sizeconvr  r   r
  r   r   act_fctheadr   s     r"   r-   zSegGptDecoderHead.__init__  s    II&&&&	
	 )#77V=R=R`p
 f//0IIf88!QUV	r!   r   c                     | j                  |      }| j                  |      }| j                  |      }| j                  |      }|S r   )r.  r
  r/  r0  r   s     r"   rH   zSegGptDecoderHead.forward"  s@    		-0}5]3		-0r!   )r   r   r   r-   r   r   rH   rI   rJ   s   @r"   r*  r*    s    WU%6%6 r!   r*  c                   v     e Zd Z fdZdej
                  dej
                  fdZdej
                  fdZ xZS )SegGptDecoderc                 B   t         |           t        j                  |j                  t        |j                        z  |j                  dz  |j                  z  d      | _	        t        |      | _        |j                  | _        |j                  | _        || _        y )Nr?   Tr   )r,   r-   r   r   r1   lenr  r/   r-  decoder_embedr*  decoder_predr:   r   s     r"   r-   zSegGptDecoder.__init__,  s    YYV%M%M!NNq 6#=#==

 .f5 ++#)#=#= r!   r   rM   c                    |j                   \  }}}}|j                  |||| j                  | j                  | j                        }|j	                  dddddd      }|j                  |d|| j                  z  || j                  z  f      }|S )	Nr      r   r   r?   r   r`   r@   )r@   ri   r/   r-  rB   )r9   r   rD   r{   r|   r}   s         r"   _reshape_hidden_statesz$SegGptDecoder._reshape_hidden_states8  s    3@3F3F0
L+q%--k4??DOOUYUmUm
 &--aAq!Q?%--r<$//#A;QUQ`Q`C`a . 
 r!   c                 l    | j                  |      }| j                  |      }| j                  |      }|S r   )r6  r;  r7  r   s     r"   rH   zSegGptDecoder.forwardD  s8    **=933MB))-8r!   )	r   r   r   r-   r   r   r;  rH   rI   rJ   s   @r"   r3  r3  +  s9    

E4E4E 
%J[J[ 
U%6%6 r!   r3  c                   |    e Zd ZU eed<   dZdZdZdZddgZ	 e
j                         dej                  d	d
fd       Zy
)SegGptPreTrainedModelr:   modelrC   )imageTrL   r   modulerM   Nc                 :   | j                   j                  }t        |t        j                  t        j
                  f      rOt        j                  |j                  d|       |j                   t        j                  |j                         yyt        |t        j                  t        f      r?t        j                  |j                         t        j                  |j                         yt        |t              rEt        j                  |j                  d|       t        j                  |j                   d|       yt        |t"              rt        j                  |j$                  d|       t        j&                  |j(                  |       t        j&                  |j*                  |       t        j&                  |j,                  |       t        j&                  |j.                  |       t        j&                  |j0                  |       yy)zInitialize the weightsr   )r   stdN)rC  )r:   initializer_ranger2   r   r   r7   inittrunc_normal_weightr   zeros_r   r  ones_r   r   r   rL   rY   normal_rQ   rR   rS   rT   rU   )r9   rA  rC  s      r"   _init_weightsz#SegGptPreTrainedModel._init_weightsU  sX    kk++fryy"))45v}}3C@{{&FKK( ' ?@KK$JJv}}%0v//csCv//csC 01v99MLL**4LL33=LL44#>LL33=LL33= 2r!   )r   r   r   r   r   base_model_prefixmain_input_nameinput_modalitiessupports_gradient_checkpointing_no_split_modulesr   no_gradr   ModulerK  r    r!   r"   r>  r>  L  sV    $O!&*#+];U]]_>BII >$ > >r!   r>  c                       e Zd Zdef fdZdefdZe	 	 	 	 	 	 	 ddej                  dej                  dej                  d	ej                  dz  d
edz  dedz  dej                  dz  dedz  dedz  dedz  deez  fd       Z xZS )SegGptModelr:   c                     t         |   |       || _        t        |      | _        t        |      | _        | j                          y r   )r,   r-   r:   rL   rG   r  encoder	post_initr   s     r"   r-   zSegGptModel.__init__n  s;     *62$V, 	r!   rM   c                 .    | j                   j                  S r   )rG   rV   r   s    r"   get_input_embeddingsz SegGptModel.get_input_embeddingsx  s    ///r!   NrC   rm   prompt_masksrn   r   ro   labelsr   r  r  c                    ||n| j                   j                  }|	|	n| j                   j                  }	|
|
n| j                   j                  }
||nd}| j                  j
                  j                  j                  j                  }|j                  |      }|j                  |      }t        j                  ||fd      }|t        j                  ||fd      nt        j                  ||fd      }||t        j                  d       || j                  j
                  j                  }t        j                  |dz  t        j                   |j"                        }t        j$                  ||dz  z
  t        j                   |j"                        }t        j                  ||g      }|j'                  d      }| j	                  ||||      }| j)                  ||||	|
      }|S )	ax  
        prompt_pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Prompt pixel values. Prompt pixel values can be obtained using [`AutoImageProcessor`]. See
            [`SegGptImageProcessor.__call__`] for details.
        prompt_masks (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Prompt mask. Prompt mask can be obtained using [`AutoImageProcessor`]. See [`SegGptImageProcessor.__call__`] for
            details.
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
        feature_ensemble (`bool`, *optional*):
            Boolean indicating whether to use feature ensemble or not. If `True`, the model will use feature ensemble
            if we have at least two prompts. If `False`, the model will not use feature ensemble. This argument should
            be considered when doing few-shot inference on an input image i.e. more than one prompt for the same image.
        embedding_type (`str`, *optional*):
            Embedding type. Indicates whether the prompt is a semantic or instance embedding. Can be either
            instance or semantic.
        labels (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`, `optional`):
            Ground truth mask for input images.

        Examples:

        ```python
        >>> from transformers import SegGptImageProcessor, SegGptModel
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO

        >>> image_input_url = "https://raw.githubusercontent.com/baaivision/Painter/main/SegGPT/SegGPT_inference/examples/hmbb_2.jpg"
        >>> image_prompt_url = "https://raw.githubusercontent.com/baaivision/Painter/main/SegGPT/SegGPT_inference/examples/hmbb_1.jpg"
        >>> mask_prompt_url = "https://raw.githubusercontent.com/baaivision/Painter/main/SegGPT/SegGPT_inference/examples/hmbb_1_target.png"

        >>> with httpx.stream("GET", image_input_url) as response:
        ...     image_input = Image.open(BytesIO(response.read()))

        >>> with httpx.stream("GET", image_prompt_url) as response:
        ...     image_prompt = Image.open(BytesIO(response.read()))

        >>> with httpx.stream("GET", mask_prompt_url) as response:
        ...     mask_prompt = Image.open(BytesIO(response.read())).convert("L")

        >>> checkpoint = "BAAI/seggpt-vit-large"
        >>> model = SegGptModel.from_pretrained(checkpoint)
        >>> image_processor = SegGptImageProcessor.from_pretrained(checkpoint)

        >>> inputs = image_processor(images=image_input, prompt_images=image_prompt, prompt_masks=mask_prompt, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> list(outputs.last_hidden_state.shape)
        [1, 56, 28, 1024]
        ```
        Fr?   rs   zLabels were provided, but bool_masked_pos were not. It will be set to default value. If you're training the model, make sure to provide a bool_masked_pos.r   r   )ro   rn   )r   r   r  r  )r:   r   r  use_return_dictrG   rV   r8   rG  r   r   r   rx   loggerwarning_oncer6   rP   r   r   onesrv   rV  )r9   rC   rm   rZ  rn   r   ro   r[  r   r  r  r&  expected_dtyper6   bool_masked_pos_zerosbool_masked_pos_onesembedding_outputencoder_outputss                     r"   rH   zSegGptModel.forward{  s   D 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]/?/K+QV99DDKKQQ#~6144^D yy"5|!D!L ~ II|\2:L&1q9 	 "v'9 m "//::FFK$)KKq0@

[g[n[n$o!#(::kQ..ejjI\I\$  $ii)>@T(UVO-77:O??-n^m + 
 ,,-/!5# ' 
 r!   NNNNNNN)r   r   r   r   r-   r(   rY  r   r   r   r   r   r   r   r   r   rH   rI   rJ   s   @r"   rT  rT  l  s    | 0&; 0  48(,%)+/)-,0#'rllr #\\r ll	r
 ))D0r +r d
r !!D(r  $;r #Tkr D[r 
$	$r rr!   rT  tensorr/   c                     | j                   \  }}}}||z  }||z  }| j                  ||||||f      } | j                  dddddd      } | j                  |||z  |dz  dz  f      } | S )Nr:  r   r?   r   r   r9  r   )r@   ri   rB   )rg  r/   rD   r0   rE   rF   r{   r|   s           r"   patchifyri    s    .4ll+JfeZ'L:%K^^:|\:Wbdn"o^pF^^Aq!Q1-F^^:|k/I:WX=[\K\"]^^FMr!   r{   r|   c           	      b   | j                   d   }t        | j                   d   dz  dz        }||z  | j                   d   k7  r"t        d| j                   d    d| d| d	      | j                  |||||df
      } | j	                  dddddd      } | j                  |d||z  ||z  f
      } | S )Nr   r`   r   r_   r   zNumber of patches z does not match patch height (z) and width (r>   r:  r9  r?   r   )r@   r   rA   ri   rB   )rg  r{   r|   rD   r/   s        r"   
unpatchifyrk    s    aJfll2&*s23Jk!V\\!_4 a 11OP\~]jkvjwwyz
 	
 ^^:|[*V`bc"d^eF^^Aq!Q1-F^^:q,2K[[eMe"f^gFMr!   c                        e Zd Z fdZdej
                  dej
                  dej
                  dej                  fdZ xZS )
SegGptLossc                 f    t         |           |j                  | _        |j                  | _        y r   )r,   r-   betar/   r   s     r"   r-   zSegGptLoss.__init__  s&    KK	 ++r!   rZ  r&   r[  rn   c                    t        j                  ||fd      }|dddddf   j                  dd| j                  dz  dz        }t	        ||j
                  d   | j                  z  |j
                  d   | j                  z        }t        j                  ||d| j                        }||z  j                         |j                         z  }|S )aN  Computes the L1 loss between the predicted masks and the ground truth masks.

        Args:
            prompt_masks (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
                Pixel values from mask prompt.

            pred_masks (`torch.FloatTensor` of shape `(batch_size, num_channels, 2*height, width)`):
                Predicted masks.

            labels (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
                Ground truth mask for input images.

            bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`):
                Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).

        Returns:
            `torch.FloatTensor`: The mean L1 loss between the predicted masks and the ground truth masks.
        r?   rs   Nr   r   none)	reductionro  )
r   rx   repeatr/   rk  r@   rg   smooth_l1_lossro  sum)r9   rZ  r&   r[  rn   ground_truthmaskr%   s           r"   rH   zSegGptLoss.forward  s    2 yy,!7Q?q!Tz*11!Q8JQ8NO$ 2 21 5 H,J\J\]^J_cgcrcrJrs
LFQUQZQZ[t  "TXXZ/r!   )	r   r   r   r-   r   r   r   rH   rI   rJ   s   @r"   rm  rm    sK    ,
!''! %%! !!	!
 ))!r!   rm  zM
    SegGpt model with a decoder on top for one-shot image segmentation.
    c                       e Zd Zdef fdZe	 	 	 	 	 	 	 ddej                  dej                  dej                  dej                  dz  de	dz  d	e
dz  d
ej                  dz  de	dz  de	dz  de	dz  deez  fd       Z xZS )SegGptForImageSegmentationr:   c                     t         |   |       || _        t        |      | _        t        |      | _        | j                          y r   )r,   r-   r:   rT  r?  r3  decoderrW  r   s     r"   r-   z#SegGptForImageSegmentation.__init__<  s;      (
$V, 	r!   NrC   rm   rZ  rn   r   ro   r[  r   r  r  rM   c                    ||n| j                   j                  }|	|	n| j                   j                  }	|
|
n| j                   j                  }
|| j                  j
                  j                  j                  }t        j                  |dz  t        j                  |j                        }t        j                  ||dz  z
  t        j                  |j                        }t        j                  ||g      }|j                  d      }| j	                  |||||||||	|

      }|
r|j                  n|d   }t        j                  |d      }| j!                  |      }d}| t#        | j                         } |||||      }|
s)|f}|	r	||d   fz   }|r|	rdnd}|||   fz   }||f|z   }|S t%        |||j&                  |j(                  	      S )
a  
        prompt_pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Prompt pixel values. Prompt pixel values can be obtained using [`AutoImageProcessor`]. See
            [`SegGptImageProcessor.__call__`] for details.
        prompt_masks (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Prompt mask. Prompt mask can be obtained using [`AutoImageProcessor`]. See [`SegGptImageProcessor.__call__`] for
            details.
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
        feature_ensemble (`bool`, *optional*):
            Boolean indicating whether to use feature ensemble or not. If `True`, the model will use feature ensemble
            if we have at least two prompts. If `False`, the model will not use feature ensemble. This argument should
            be considered when doing few-shot inference on an input image i.e. more than one prompt for the same image.
        embedding_type (`str`, *optional*):
            Embedding type. Indicates whether the prompt is a semantic or instance embedding. Can be either
            instance or semantic.
        labels (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`, `optional`):
            Ground truth mask for input images.

        Examples:

        ```python
        >>> from transformers import SegGptImageProcessor, SegGptForImageSegmentation
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO

        >>> image_input_url = "https://raw.githubusercontent.com/baaivision/Painter/main/SegGPT/SegGPT_inference/examples/hmbb_2.jpg"
        >>> image_prompt_url = "https://raw.githubusercontent.com/baaivision/Painter/main/SegGPT/SegGPT_inference/examples/hmbb_1.jpg"
        >>> mask_prompt_url = "https://raw.githubusercontent.com/baaivision/Painter/main/SegGPT/SegGPT_inference/examples/hmbb_1_target.png"

        >>> with httpx.stream("GET", image_input_url) as response:
        ...     image_input = Image.open(BytesIO(response.read()))

        >>> with httpx.stream("GET", image_prompt_url) as response:
        ...     image_prompt = Image.open(BytesIO(response.read()))

        >>> with httpx.stream("GET", mask_prompt_url) as response:
        ...     mask_prompt = Image.open(BytesIO(response.read())).convert("L")

        >>> checkpoint = "BAAI/seggpt-vit-large"
        >>> model = SegGptForImageSegmentation.from_pretrained(checkpoint)
        >>> image_processor = SegGptImageProcessor.from_pretrained(checkpoint)

        >>> inputs = image_processor(images=image_input, prompt_images=image_prompt, prompt_masks=mask_prompt, return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> result = image_processor.post_process_semantic_segmentation(outputs, target_sizes=[(image_input.height, image_input.width)])[0]
        >>> print(list(result.shape))
        [170, 297]
        ```
        Nr?   r   r   )
rC   rm   rZ  rn   r   ro   r[  r   r  r  r`   rs   r   )r%   r&   r   r   )r:   r   r  r]  r?  rG   rV   r6   r   rP   r   r   r`  rx   rv   r   r{  rm  r$   r   r   )r9   rC   rm   rZ  rn   r   ro   r[  r   r  r  r&  r6   rb  rc  r   r   r&   r%   loss_fnr   idxs                         r"   rH   z"SegGptForImageSegmentation.forwardF  s   D 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]"**//@@LLK$)KKq0@

[g[n[n$o!#(::kQ..ejjI\I\$  $ii)>@T(UVO-77:O**% 3%+-)/!5#  
 LWW%G%G\cdf\g"%*YY/Ir%R"\\"<=
 -G<V_MD ]F#71:-/ /aQ73</16)M,!!//))	
 	
r!   rf  )r   r   r   r   r-   r   r   r   r   r   r   r   r   r$   rH   rI   rJ   s   @r"   ry  ry  6  s    |   48(,%)+/)-,0#'x
llx
 #\\x
 ll	x

 ))D0x
 +x
 d
x
 !!D(x
  $;x
 #Tkx
 D[x
 
.	.x
 x
r!   ry  )rT  r>  ry  )r   F)7r   collections.abcr3   dataclassesr   r   r   torch.nnr   rg    r   rE  activationsr   modeling_layersr	   modeling_utilsr
   utilsr   r   r   r   configuration_seggptr   
get_loggerr   r^  r   r$   rR  r(   rL   r   r   r   r   r   r   r   r   r  r   r  r*  r3  r>  rT  r   ri  rk  rm  ry  __all__r    r!   r"   <module>r     s;     !   $ & ! 9 - D D . 
		H	% 
G+ G G, 
7K 7 7* BII  FRryy RjK4bii K4^		 U\\ e T V[VbVb  %RYY %,, ,^9
BII 9
zbll 4		 0BII B >O > >> A' A AH	U\\ 	s 	u|| 	u|| 3 S U\\ ' 'T 
D
!6 D

D
N Qr!   