
    qi                       d Z ddlZddlmZ ddlmZ ddlZddlmZ ddl	m
Z ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZmZ ddlmZmZ ddlmZ ddlmZ ddlm Z m!Z!m"Z"m#Z# ddl$m%Z%m&Z& ddl'm(Z( ddl)m*Z*  e#jV                  e,      Z-e e"d       G d de                    Z.e e"d       G d de                    Z/e e"d       G d de                     Z0e e"d       G d d e                     Z1 G d! d"ejd                        Z3d# Z4 G d$ d%ejd                        Z5 G d& d'ejd                        Z6 G d( d)ejd                        Z7	 	 dWd*ejd                  d+ejp                  d,ejp                  d-ejp                  d.ejp                  dz  d/e9dz  d0e9d1ee!   fd2Z: G d3 d4ejd                        Z; G d5 d6ejd                        Z< G d7 d8ejd                        Z= G d9 d:e      Z> G d; d<e      Z? G d= d>ejd                        Z@ G d? d@ejd                        ZA G dA dBejd                        ZB G dC dDejd                        ZCe" G dE dFe             ZD G dG dHeD      ZE G dI dJeD      ZF e"dK       G dL dMeD             ZG G dN dOejd                        ZH e"dP       G dQ dReD             ZI e"dS       G dT dUeD             ZJg dVZKy)XzPyTorch DETR model.    N)Callable)	dataclass   )initialization)ACT2FN)load_backbone)create_bidirectional_mask)GradientCheckpointingLayer)BaseModelOutput"BaseModelOutputWithCrossAttentionsSeq2SeqModelOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)#compile_compatible_method_lru_cache)ModelOutputTransformersKwargsauto_docstringlogging)can_return_tuplemerge_with_config_defaults)capture_outputs   )
DetrConfigaU  
    Base class for outputs of the DETR decoder. This class adds one attribute to BaseModelOutputWithCrossAttentions,
    namely an optional stack of intermediate decoder activations, i.e. the output of each decoder layer, each of them
    gone through a layernorm. This is useful when training the model with auxiliary decoding losses.
    )custom_introc                   :    e Zd ZU dZdZej                  dz  ed<   y)DetrDecoderOutputa&  
    cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the attention softmax,
        used to compute the weighted average in the cross-attention heads.
    intermediate_hidden_states (`torch.FloatTensor` of shape `(config.decoder_layers, batch_size, num_queries, hidden_size)`, *optional*, returned when `config.auxiliary_loss=True`):
        Intermediate decoder activations, i.e. the output of each decoder layer, each of them gone through a
        layernorm.
    Nintermediate_hidden_states__name__
__module____qualname____doc__r   torchFloatTensor__annotations__     X/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/detr/modeling_detr.pyr   r   2   s      <@ 1 1D 8?r(   r   aS  
    Base class for outputs of the DETR encoder-decoder model. This class adds one attribute to Seq2SeqModelOutput,
    namely an optional stack of intermediate decoder activations, i.e. the output of each decoder layer, each of them
    gone through a layernorm. This is useful when training the model with auxiliary decoding losses.
    c                   :    e Zd ZU dZdZej                  dz  ed<   y)DetrModelOutputa  
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        Sequence of hidden-states at the output of the last layer of the decoder of the model.
    intermediate_hidden_states (`torch.FloatTensor` of shape `(config.decoder_layers, batch_size, sequence_length, hidden_size)`, *optional*, returned when `config.auxiliary_loss=True`):
        Intermediate decoder activations, i.e. the output of each decoder layer, each of them gone through a
        layernorm.
    Nr   r   r'   r(   r)   r+   r+   H   s      <@ 1 1D 8?r(   r+   z4
    Output type of [`DetrForObjectDetection`].
    c                      e Zd ZU dZdZej                  dz  ed<   dZe	dz  ed<   dZ
ej                  dz  ed<   dZej                  dz  ed<   dZee	   dz  ed<   dZej                  dz  ed<   dZeej                     dz  ed	<   dZeej                     dz  ed
<   dZeej                     dz  ed<   dZej                  dz  ed<   dZeej                     dz  ed<   dZeej                     dz  ed<   y)DetrObjectDetectionOutputa@  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided)):
        Total loss as a linear combination of a negative log-likehood (cross-entropy) for class prediction and a
        bounding box loss. The latter is defined as a linear combination of the L1 loss and the generalized
        scale-invariant IoU loss.
    loss_dict (`Dict`, *optional*):
        A dictionary containing the individual losses. Useful for logging.
    logits (`torch.FloatTensor` of shape `(batch_size, num_queries, num_classes + 1)`):
        Classification logits (including no-object) for all queries.
    pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
        Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These
        values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding
        possible padding). You can use [`~DetrImageProcessor.post_process_object_detection`] to retrieve the
        unnormalized bounding boxes.
    auxiliary_outputs (`list[Dict]`, *optional*):
        Optional, only returned when auxiliary losses are activated (i.e. `config.auxiliary_loss` is set to `True`)
        and labels are provided. It is a list of dictionaries containing the two above keys (`logits` and
        `pred_boxes`) for each decoder layer.
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
        Sequence of hidden-states at the output of the last layer of the decoder of the model.
    Nloss	loss_dictlogits
pred_boxesauxiliary_outputslast_hidden_statedecoder_hidden_statesdecoder_attentionscross_attentionsencoder_last_hidden_stateencoder_hidden_statesencoder_attentions)r    r!   r"   r#   r.   r$   r%   r&   r/   dictr0   r1   r2   listr3   r4   tupler5   r6   r7   r8   r9   r'   r(   r)   r-   r-   \   s   , &*D%

d
")!Itd{!'+FE$++/J!!D(/+/tDzD(/26u((4/6=A5!2!23d:A:>e//047>8<eE--.5<:>u0047>=A5!2!23d:A:>e//047>r(   r-   z1
    Output type of [`DetrForSegmentation`].
    c                      e Zd ZU dZdZej                  dz  ed<   dZe	dz  ed<   dZ
ej                  dz  ed<   dZej                  dz  ed<   dZej                  dz  ed<   dZee	   dz  ed<   dZej                  dz  ed	<   dZeej                     dz  ed
<   dZeej                     dz  ed<   dZeej                     dz  ed<   dZej                  dz  ed<   dZeej                     dz  ed<   dZeej                     dz  ed<   y)DetrSegmentationOutputa  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided)):
        Total loss as a linear combination of a negative log-likehood (cross-entropy) for class prediction and a
        bounding box loss. The latter is defined as a linear combination of the L1 loss and the generalized
        scale-invariant IoU loss.
    loss_dict (`Dict`, *optional*):
        A dictionary containing the individual losses. Useful for logging.
    logits (`torch.FloatTensor` of shape `(batch_size, num_queries, num_classes + 1)`):
        Classification logits (including no-object) for all queries.
    pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
        Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These
        values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding
        possible padding). You can use [`~DetrImageProcessor.post_process_object_detection`] to retrieve the
        unnormalized bounding boxes.
    pred_masks (`torch.FloatTensor` of shape `(batch_size, num_queries, height/4, width/4)`):
        Segmentation masks logits for all queries. See also
        [`~DetrImageProcessor.post_process_semantic_segmentation`] or
        [`~DetrImageProcessor.post_process_instance_segmentation`]
        [`~DetrImageProcessor.post_process_panoptic_segmentation`] to evaluate semantic, instance and panoptic
        segmentation masks respectively.
    auxiliary_outputs (`list[Dict]`, *optional*):
        Optional, only returned when auxiliary losses are activated (i.e. `config.auxiliary_loss` is set to `True`)
        and labels are provided. It is a list of dictionaries containing the two above keys (`logits` and
        `pred_boxes`) for each decoder layer.
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
        Sequence of hidden-states at the output of the last layer of the decoder of the model.
    Nr.   r/   r0   r1   
pred_masksr2   r3   r4   r5   r6   r7   r8   r9   )r    r!   r"   r#   r.   r$   r%   r&   r/   r:   r0   r1   r?   r2   r;   r3   r4   r<   r5   r6   r7   r8   r9   r'   r(   r)   r>   r>      s2   8 &*D%

d
")!Itd{!'+FE$++/J!!D(/+/J!!D(/+/tDzD(/26u((4/6=A5!2!23d:A:>e//047>8<eE--.5<:>u0047>=A5!2!23d:A:>e//047>r(   r>   c                   2     e Zd ZdZ fdZ fdZd Z xZS )DetrFrozenBatchNorm2dz
    BatchNorm2d where the batch statistics and the affine parameters are fixed.

    Copy-paste from torchvision.misc.ops with added eps before rqsrt, without which any other models than
    torchvision.models.resnet[18,34,50,101] produce nans.
    c                 J   t         |           | j                  dt        j                  |             | j                  dt        j
                  |             | j                  dt        j
                  |             | j                  dt        j                  |             y )Nweightbiasrunning_meanrunning_var)super__init__register_bufferr$   oneszeros)selfn	__class__s     r)   rH   zDetrFrozenBatchNorm2d.__init__   sn    Xuzz!}5VU[[^4^U[[^<]EJJqM:r(   c           	      H    |dz   }||v r||= t         	|   |||||||       y )Nnum_batches_tracked)rG   _load_from_state_dict)
rL   
state_dictprefixlocal_metadatastrictmissing_keysunexpected_keys
error_msgsnum_batches_tracked_keyrN   s
            r)   rQ   z+DetrFrozenBatchNorm2d._load_from_state_dict   s?     #)+@"@"j023%oWa	
r(   c                 B   | j                   j                  dddd      }| j                  j                  dddd      }| j                  j                  dddd      }| j                  j                  dddd      }d}|||z   j                         z  }|||z  z
  }||z  |z   S )Nr   gh㈵>)rC   reshaperD   rF   rE   rsqrt)rL   xrC   rD   rF   rE   epsilonscales           r)   forwardzDetrFrozenBatchNorm2d.forward   s     $$QAq1yy  B1-&&..q"a;((00B1=+/6688lU**5y4r(   )r    r!   r"   r#   rH   rQ   ra   __classcell__rN   s   @r)   rA   rA      s    ;	

 r(   rA   c                    | j                         D ]6  \  }}t        |t        j                        rt	        |j
                        }|j                  j                  t        j                  d      k7  r|j                  j                  |j                         |j                  j                  |j                         |j                  j                  |j                         |j                  j                  |j                         || j                  |<   t        t        |j!                                     dkD  s,t#        |       9 y)z
    Recursively replace all `torch.nn.BatchNorm2d` with `DetrFrozenBatchNorm2d`.

    Args:
        model (torch.nn.Module):
            input model
    metar   N)named_children
isinstancennBatchNorm2drA   num_featuresrC   devicer$   copy_rD   rE   rF   _moduleslenr;   childrenreplace_batch_norm)modelnamemodule
new_modules       r)   rp   rp      s     ,,. 'ffbnn-.v/B/BCJ}}##u||F';;!!''6%%fkk2''--f.A.AB&&,,V-?-?@#-ENN4 tFOO%&'!+v&'r(   c                   Z     e Zd ZdZ fdZdej                  dej                  fdZ xZS )DetrConvEncoderz
    Convolutional backbone, using either the AutoBackbone API or one from the timm library.

    nn.BatchNorm2d layers are replaced by DetrFrozenBatchNorm2d as defined above.

    c                    t         |           || _        t        |      }|j                  | _        t        j                         5  t        |       d d d        d}t        |d      r|j                  }d}|| _        |j                  j                  }d|v rf| j                  j                         D ]H  \  }}|r!d|vsd|vsd|vs|j                  d       )d|vs.d	|vs3d
|vs8|j                  d       J y y # 1 sw Y   xY w)NF	_backboneTresnetlayer2layer3layer4zstage.1zstage.2zstage.3)rG   rH   configr   channelsintermediate_channel_sizesr$   no_gradrp   hasattrrx   rq   backbone_config
model_typenamed_parametersrequires_grad_)rL   r}   backboneis_timm_modelbackbone_model_typerr   	parameterrN   s          r)   rH   zDetrConvEncoder.__init__   s
    (*2*;*;' ]]_ 	)x(	)
 8[)))H M
$44??**#'::#>#>#@ 8i t+0DY]I]!007 ,$1F9\`K`!0078 +	) 	)s   C??Dpixel_values
pixel_maskc                 V   | j                  |      }t        |t              r|j                  }g }|D ]t  }t        j
                  j                  |d    j                         |j                  dd        j                  t        j                        d   }|j                  ||f       v |S )Nsizer   )rq   rg   r:   feature_mapsrh   
functionalinterpolatefloatshapetor$   boolappend)rL   r   r   featuresoutfeature_mapmasks          r)   ra   zDetrConvEncoder.forward  s    ::l+h%,,H# 	,K==,,Z-=-C-C-EKL]L]^`^aLb,cffglgqgqrstuDJJT*+	, 
r(   )	r    r!   r"   r#   rH   r$   Tensorra   rb   rc   s   @r)   rv   rv      s(    8<ELL ell r(   rv   c                        e Zd ZdZ	 	 	 	 ddededededz  f fdZ ed	      	 dd
e	j                  de	j                  ez  de	j                  de	j                  dz  de	j                  f
d       Z xZS )DetrSinePositionEmbeddingz
    This is a more standard version of the position embedding, very similar to the one used by the Attention is all you
    need paper, generalized to work on images.
    Nnum_position_featurestemperature	normalizer`   c                     t         |           ||du rt        d      || _        || _        || _        |dt        j                  z  | _        y || _        y )NFz+normalize should be True if scale is passed   )	rG   rH   
ValueErrorr   r   r   mathpir`   )rL   r   r   r   r`   rN   s        r)   rH   z"DetrSinePositionEmbedding.__init__2  sY     	e!3JKK%:"&"$)MQ[
u
r(   r   maxsizer   rk   dtyper   returnc           
         |2t        j                  |d   |d   |d   f|t         j                        }|j                  d|      }|j                  d|      }| j                  rDd}||d d dd d d f   |z   z  | j
                  z  }||d d d d dd f   |z   z  | j
                  z  }t        j                  | j                  t         j                  |	      j                  |      }| j                  dt        j                  |dd
      z  | j                  z  z  }|d d d d d d d f   |z  }	|d d d d d d d f   |z  }
t        j                  |	d d d d d d dd df   j                         |	d d d d d d dd df   j                         fd      j                  d      }	t        j                  |
d d d d d d dd df   j                         |
d d d d d d dd df   j                         fd      j                  d      }
t        j                   |
|	fd      j#                  dddd      }|j                  d      j#                  ddd      }|S )Nr   r   r   rk   r   r   )r   gư>r[   )r   rk   floor)rounding_mode   dim)r$   rK   r   cumsumr   r`   aranger   int64r   r   divstacksincosflattencatpermute)rL   r   rk   r   r   y_embedx_embedepsdim_tpos_xpos_yposs               r)   ra   z!DetrSinePositionEmbedding.forwardA  s0    <;;a%(E!H=fTYT^T^_D++au+-++au+->>CBC!3c!9:TZZGGArs!3c!9:TZZGGT77u{{SYZ]]^cd  Q5!7)S%SVZVpVp%pq1a&.1a&.U1aADqD=1557q!Q1}9M9Q9Q9STZ[\ddefgU1aADqD=1557q!Q1}9M9Q9Q9STZ[\ddefgiiA.66q!QB kk!n$$Q1-
r(   )@   i'  FNN)r    r!   r"   r#   intr   r   rH   r   r$   Sizerk   strr   r   ra   rb   rc   s   @r)   r   r   ,  s     &( "="= = 	=
 t|= )3 %)zz s" {{	
 llT! 
 4r(   r   c                        e Zd ZdZd fd	Z ed      	 ddej                  dej                  e	z  dej                  d	ej                  dz  fd
       Z xZS )DetrLearnedPositionEmbeddingzN
    This module learns positional embeddings up to a fixed maximum size.
    c                     t         |           t        j                  d|      | _        t        j                  d|      | _        y )N2   )rG   rH   rh   	Embeddingrow_embeddingscolumn_embeddings)rL   embedding_dimrN   s     r)   rH   z%DetrLearnedPositionEmbedding.__init__e  s4     ll2}=!#b-!@r(   r   r   Nr   rk   r   r   c                 "   |dd  \  }}t        j                  ||      }t        j                  ||      }| j                  |      }	| j                  |      }
t        j                  |	j                  d      j                  |dd      |
j                  d      j                  d|d      gd      }|j                  ddd      }|j                  d      }|j                  |d   ddd      }|j                  d      j                  ddd      }|S )Nr   rk   r   r   r[   r   r   )	r$   r   r   r   r   	unsqueezerepeatr   r   )rL   r   rk   r   r   heightwidthwidth_valuesheight_valuesx_emby_embr   s               r)   ra   z$DetrLearnedPositionEmbedding.forwardj  s     bc
||E&9VF;&&|4##M2ii+2261a@%//RSBTB[B[\]_dfgBhioqrkk!Q"mmAjjq1a+ kk!n$$Q1-
r(   )   r   )r    r!   r"   r#   rH   r   r$   r   rk   r   r   r   ra   rb   rc   s   @r)   r   r   `  sl    A
 )3 %)zz s" {{	
 llT! 4r(   r   rs   querykeyvalueattention_maskscalingdropoutkwargsc                    ||j                  d      dz  }t        j                  ||j                  dd            |z  }|||z   }t        j
                  j                  |d      }t        j
                  j                  ||| j                        }t        j                  ||      }	|	j                  dd      j                         }	|	|fS )Nr[         r   r   r   ptrainingr   )
r   r$   matmul	transposerh   r   softmaxr   r   
contiguous)
rs   r   r   r   r   r   r   r   attn_weightsattn_outputs
             r)   eager_attention_forwardr     s     **R.D( <<s}}Q':;gEL!#n4==((2(>L==((6??([L,,|U3K''1-88:K$$r(   c                        e Zd ZdZ	 	 ddededededef
 fdZ	 	 dd	e	j                  d
e	j                  dz  de	j                  dz  dee   dee	j                  e	j                  f   f
dZ xZS )DetrSelfAttentionz
    Multi-headed self-attention from 'Attention Is All You Need' paper.

    In DETR, position embeddings are added to both queries and keys (but not values) in self-attention.
    r}   hidden_sizenum_attention_headsr   rD   c                 p   t         |           || _        ||z  | _        | j                  dz  | _        || _        d| _        t        j                  |||      | _	        t        j                  |||      | _
        t        j                  |||      | _        t        j                  |||      | _        y Nr   FrD   rG   rH   r}   head_dimr   attention_dropout	is_causalrh   Lineark_projv_projq_projo_projrL   r}   r   r   r   rD   rN   s         r)   rH   zDetrSelfAttention.__init__       	#'::}}d*!(ii[tDii[tDii[tDii[tDr(   Nhidden_statesr   position_embeddingsr   r   c                    |j                   dd }g |d| j                  }|||z   n|}| j                  |      j                  |      j	                  dd      }| j                  |      j                  |      j	                  dd      }	| j                  |      j                  |      j	                  dd      }
t        j                  | j                  j                  t              } || ||	|
|f| j                  sdn| j                  | j                  d|\  }} |j                  g |d j!                         }| j#                  |      }||fS )zZ
        Position embeddings are added to both queries and keys (but not values).
        Nr[   r   r           r   r   r   r   r   viewr   r   r   r   get_interfacer}   _attn_implementationr   r   r   r   r\   r   r   )rL   r   r   r  r   input_shapehidden_shapequery_key_inputquery_states
key_statesvalue_statesattention_interfacer   r   s                 r)   ra   zDetrSelfAttention.forward  s_    $))#2.88b8$--8ATA`-*==fs{{?388FPPQRTUV[[166|DNNqRST
{{=166|DNNqRST(?(M(MKK,,.E)
 %8	%
  $}}C$2H2HLL	%
 	%
!\ *k));;;;FFHkk+.L((r(   r  TNNr    r!   r"   r#   r   r   r   r   rH   r$   r   r   r   r<   ra   rb   rc   s   @r)   r   r     s     EE E !	E
 E E. /337	$)||$) t+$) #\\D0	$)
 +,$) 
u||U\\)	*$)r(   r   c                       e Zd ZdZ	 	 ddededededef
 fdZ	 	 	 dd	e	j                  d
e	j                  de	j                  dz  de	j                  dz  de	j                  dz  dee   dee	j                  e	j                  f   fdZ xZS )DetrCrossAttentionz
    Multi-headed cross-attention from 'Attention Is All You Need' paper.

    In DETR, queries get their own position embeddings, while keys get encoder position embeddings.
    Values don't get any position embeddings.
    r}   r   r   r   rD   c                 p   t         |           || _        ||z  | _        | j                  dz  | _        || _        d| _        t        j                  |||      | _	        t        j                  |||      | _
        t        j                  |||      | _        t        j                  |||      | _        y r   r   r   s         r)   rH   zDetrCrossAttention.__init__  r   r(   Nr   key_value_statesr   r  encoder_position_embeddingsr   r   c                    |j                   dd }g |d| j                  }|j                   dd }	g |	d| j                  }
|||z   n|}|||z   n|}| j                  |      j                  |      j	                  dd      }| j                  |      j                  |
      j	                  dd      }| j                  |      j                  |
      j	                  dd      }t        j                  | j                  j                  t              } || ||||f| j                  sdn| j                  | j                  d|\  }} |j                  g |d j!                         }| j#                  |      }||fS )z
        Position embeddings logic:
        - Queries get position_embeddings
        - Keys get encoder_position_embeddings
        - Values don't get any position embeddings
        Nr[   r   r   r  r  r  )rL   r   r  r   r  r  r   query_input_shapequery_hidden_shapekv_input_shapekv_hidden_shapequery_input	key_inputr  r  r  r  r   r   s                      r)   ra   zDetrCrossAttention.forward  s    *//4D0D"DdmmD)//4>N>B>>=P=\m&99bo +6 ::! 	 {{;/445GHRRSTVWX[[+00AKKAqQ
{{#3499/JTTUVXYZ(?(M(MKK,,.E)
 %8	%
  $}}C$2H2HLL	%
 	%
!\ *k))A+<AbALLNkk+.L((r(   r  NNNr  rc   s   @r)   r  r    s     EE E !	E
 E E0 /337;?1)||1)  ,,1) t+	1)
 #\\D01) &+\\D%81) +,1) 
u||U\\)	*1)r(   r  c                   d     e Zd Zdededef fdZdej                  dej                  fdZ xZ	S )DetrMLPr}   r   intermediate_sizec                    t         |           t        j                  ||      | _        t        j                  ||      | _        t        |j                     | _        |j                  | _	        |j                  | _
        y r   )rG   rH   rh   r   fc1fc2r   activation_functionactivation_fnactivation_dropoutr   )rL   r}   r   r"  rN   s       r)   rH   zDetrMLP.__init__1  s`    99[*;<99.<#F$>$>?"(";";~~r(   r   r   c                 @   | j                  | j                  |            }t        j                  j	                  || j
                  | j                        }| j                  |      }t        j                  j	                  || j                  | j                        }|S )Nr   )r'  r$  rh   r   r   r(  r   r%  )rL   r   s     r)   ra   zDetrMLP.forward9  s}    **488M+BC--mt?V?Vaeanan-o/--mt||VZVcVc-dr(   )
r    r!   r"   r   r   rH   r$   r   ra   rb   rc   s   @r)   r!  r!  0  s9    &z & &PS &U\\ ell r(   r!  c                        e Zd Zdef fdZ	 d
dej                  dej                  dej                  dz  dee   dej                  f
d	Z	 xZ
S )DetrEncoderLayerr}   c                    t         |           |j                  | _        t	        || j                  |j
                  |j                        | _        t        j                  | j                        | _
        |j                  | _        t        || j                  |j                        | _        t        j                  | j                        | _        y N)r}   r   r   r   )rG   rH   d_modelr   r   encoder_attention_headsr   	self_attnrh   	LayerNormself_attn_layer_normr   r!  encoder_ffn_dimmlpfinal_layer_normrL   r}   rN   s     r)   rH   zDetrEncoderLayer.__init__B  s    !>>*(( & > >,,	
 %'LL1A1A$B!~~64#3#3V5K5KL "T-=-= >r(   Nr   r   spatial_position_embeddingsr   r   c                    |} | j                   d|||d|\  }}t        j                  j                  || j                  | j                        }||z   }| j                  |      }|}| j                  |      }||z   }| j                  |      }| j                  rht        j                  |      j                         sEt        j                  |j                        j                  dz
  }t        j                  || |      }|S )a[  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, hidden_size)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, target_len, source_len)` where padding elements are indicated by very large negative
                values.
            spatial_position_embeddings (`torch.FloatTensor`, *optional*):
                Spatial position embeddings (2D positional encodings of image locations), to be added to both
                the queries and keys in self-attention (but not to values).
        )r   r   r  r   i  )minmaxr'   )r0  rh   r   r   r   r2  r4  r5  r$   isfiniteallfinfor   r:  clamp)rL   r   r   r7  r   residual_clamp_values           r)   ra   zDetrEncoderLayer.forwardP  s    " !)4>> 
') ;
 	
q --mt||VZVcVc-d =011-@ / =0--m<==>>-0446#kk-*=*=>BBTI %M|Q\ ]r(   r   r    r!   r"   r   rH   r$   r   r   r   ra   rb   rc   s   @r)   r+  r+  A  se    ?z ?$ <@	'||' ' &+\\D%8	'
 +,' 
'r(   r+  c                       e Zd Zdef fdZ	 	 	 	 	 ddej                  dej                  dz  dej                  dz  dej                  dz  dej                  dz  d	ej                  dz  d
ee   dej                  fdZ	 xZ
S )DetrDecoderLayerr}   c                 R   t         |           |j                  | _        t	        || j                  |j
                  |j                        | _        |j                  | _        t        j                  | j                        | _        t        || j                  |j
                  |j                        | _        t        j                  | j                        | _        t        || j                  |j                         | _        t        j                  | j                        | _        y r-  )rG   rH   r.  r   r   decoder_attention_headsr   r0  r   rh   r1  r2  r  encoder_attnencoder_attn_layer_normr!  decoder_ffn_dimr4  r5  r6  s     r)   rH   zDetrDecoderLayer.__init__{  s    !>>*(( & > >,,	
 ~~$&LL1A1A$B!.(( & > >,,	
 (*||D4D4D'E$64#3#3V5K5KL "T-=-= >r(   Nr   r   r7  "object_queries_position_embeddingsr8   encoder_attention_maskr   r   c           	         |} | j                   d|||d|\  }}	t        j                  j                  || j                  | j                        }||z   }| j                  |      }|i|} | j                  d|||||d|\  }}	t        j                  j                  || j                  | j                        }||z   }| j                  |      }|}| j                  |      }||z   }| j                  |      }|S )a  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, hidden_size)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, target_len, source_len)` where padding elements are indicated by very large negative
                values.
            spatial_position_embeddings (`torch.FloatTensor`, *optional*):
                Spatial position embeddings (2D positional encodings from encoder) that are added to the keys only
                in the cross-attention layer (not to values).
            object_queries_position_embeddings (`torch.FloatTensor`, *optional*):
                Position embeddings for the object query slots. In self-attention, these are added to both queries
                and keys (not values). In cross-attention, these are added to queries only (not to keys or values).
            encoder_hidden_states (`torch.FloatTensor`):
                cross attention input to the layer of shape `(batch, seq_len, hidden_size)`
            encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size
                `(batch, 1, target_len, source_len)` where padding elements are indicated by very large negative
                values.
        )r   r  r   r   )r   r  r   r  r  r'   )
r0  rh   r   r   r   r2  rG  rH  r4  r5  )
rL   r   r   r7  rJ  r8   rK  r   r?  r@  s
             r)   ra   zDetrDecoderLayer.forward  s3   8 ! *4>> 
' B)
 	
q --mt||VZVcVc-d =011-@ !,$H0t00  +!65$F,G   M1 MM11-4<<Z^ZgZg1hM$}4M 88GM !/ =0--m<r(   )NNNNNrB  rc   s   @r)   rD  rD  z  s    ?z ?4 /3;?BF596:A||A t+A &+\\D%8	A
 -2LL4,?A  %||d2A !&t 3A +,A 
Ar(   rD  c                   j     e Zd ZdZd	dededef fdZdej                  dej                  fdZ	 xZ
S )
DetrConvBlockz5Basic conv block: Conv3x3 -> GroupNorm -> Activation.in_channelsout_channels
activationc                     t         |           t        j                  ||dd      | _        t        j
                  t        d|      |      | _        t        |   | _	        y )Nr   r   kernel_sizepadding   )
rG   rH   rh   Conv2dconv	GroupNormr9  normr   rQ  )rL   rO  rP  rQ  rN   s       r)   rH   zDetrConvBlock.__init__  sJ    IIk<QPQR	LLQ!5|D	 ,r(   r^   r   c                 `    | j                  | j                  | j                  |                  S r   )rQ  rZ  rX  )rL   r^   s     r)   ra   zDetrConvBlock.forward  s"    tyy1677r(   relur    r!   r"   r#   r   r   rH   r$   r   ra   rb   rc   s   @r)   rN  rN    s;    ?-C -s - -8 8%,, 8r(   rN  c            	            e Zd ZdZddedededef fdZdej                  dej                  d	ej                  fd
Z	 xZ
S )DetrFPNFusionStagez\Single FPN fusion stage combining low-resolution features with high-resolution FPN features.fpn_channelscurrent_channelsoutput_channelsrQ  c                     t         |           t        j                  ||d      | _        t        |||      | _        y )Nr   rT  )rG   rH   rh   rW  fpn_adapterrN  refine)rL   ra  rb  rc  rQ  rN   s        r)   rH   zDetrFPNFusionStage.__init__  s6    99\3CQRS#$4ozRr(   r   fpn_featuresr   c                     | j                  |      }t        j                  j                  ||j                  dd d      }| j                  ||z         S )a?  
        Args:
            features: Current features to upsample, shape (B*Q, current_channels, H_in, W_in)
            fpn_features: FPN features at target resolution, shape (B*Q, fpn_channels, H_out, W_out)

        Returns:
            Fused and refined features, shape (B*Q, output_channels, H_out, W_out)
        r   Nnearest)r   mode)rf  rh   r   r   r   rg  )rL   r   rh  s      r)   ra   zDetrFPNFusionStage.forward  sQ     ''5==,,XL<N<Nrs<SZc,d{{<(233r(   r\  r^  rc   s   @r)   r`  r`    sT    fSS SC SRU Scf S
4 4ELL 4U\\ 4r(   r`  c            	            e Zd ZdZ	 ddedee   dedef fdZdej                  dej                  d	eej                     d
ej                  fdZ
 xZS )DetrMaskHeadSmallConva+  
    Segmentation mask head that generates per-query masks using FPN-based progressive upsampling.

    Combines attention maps (spatial localization) with encoder features (semantics) and progressively
    upsamples through multiple scales, fusing with FPN features for high-resolution detail.
    input_channelsra  r   r&  c           
         t         |           |dz  dk7  rt        d|       t        |||      | _        t        ||dz  |      | _        t        j                  t        |d   |dz  |dz  |      t        |d   |dz  |dz  |      t        |d   |dz  |dz  |      g      | _	        t        j                  |dz  ddd	      | _        y )
NrV  r   z+input_channels must be divisible by 8, got r   r   r      r   rS  )rG   rH   r   rN  conv1conv2rh   
ModuleListr`  
fpn_stagesrW  output_conv)rL   rn  ra  r   r&  rN   s        r)   rH   zDetrMaskHeadSmallConv.__init__  s     	A"J>JZ[\\">>CVW
">;!3CEXY
 --"<?K14DkUVFVXkl"<?K14DkUVFVXkl"<?K14DkUWFWYlm
 99[B%6qRSTr(   r   attention_masksrh  r   c           
         |j                   d   }|j                  d      j                  d|ddd      j                  dd      }|j                  dd      }|D cg c]6  }|j                  d      j                  d|ddd      j                  dd      8 }}t	        j
                  ||gd      }| j                  |      }| j                  |      }t        | j                  |      D ]  \  }} |||      } | j                  |      S c c}w )a  
        Args:
            features: Encoder output features, shape (batch_size, hidden_size, H, W)
            attention_masks: Cross-attention maps from decoder, shape (batch_size, num_queries, num_heads, H, W)
            fpn_features: List of 3 FPN features from low to high resolution, each (batch_size, C, H, W)

        Returns:
            Predicted masks, shape (batch_size * num_queries, 1, output_H, output_W)
        r   r[   r   r   )r   r   expandr   r$   r   rq  rr  ziprt  ru  )rL   r   rv  rh  num_queriesfpn_featr   	fpn_stages           r)   ra   zDetrMaskHeadSmallConv.forward  s    &++A. %%a(//KRLTTUVXYZ)11!Q7dp
X`Hq!((["b"EMMaQRS
 
 		8_"=1E

=1

=1#&t#E 	?Ix%mX>M	? ..
s   ;D
r\  )r    r!   r"   r#   r   r;   r   rH   r$   r   ra   rb   rc   s   @r)   rm  rm    s     $*UU 3iU 	U
 !U2/,,/ / 5<<(	/
 
/r(   rm  c            	            e Zd ZdZ	 	 ddedededef fdZ	 ddej                  d	ej                  d
ej                  dz  fdZ
 xZS )DetrMHAttentionMapzdThis is a 2D attention module, which only returns the attention softmax (no multiplication by value)r   r   r   rD   c                     t         |           ||z  | _        | j                  dz  | _        || _        t        j                  |||      | _        t        j                  |||      | _        y )Nr   r   )	rG   rH   r   r   r   rh   r   r   r   )rL   r   r   r   rD   rN   s        r)   rH   zDetrMHAttentionMap.__init__?  s]     	#'::}}d*!(ii[tDii[tDr(   Nr  r  r   c                    g |j                   d d d| j                  }|j                   d   d| j                  g|j                   dd  }| j                  |      j                  |      }t        j
                  j                  || j                  j                  j                  d      j                  d      | j                  j                        j                  |      }|j                   \  }}}}	|j                   \  }
}
}
}}||z  ||	f}||z  ||z  |	f}|||||f}|j                  dd      j                         j                  |      }|j                  ddddd      j                         j                  |      }t        j                  || j                   z  |j                  dd            j                  |      j                  dd      }|||z   }t        j
                  j#                  |j%                  d      d      j                  |j'                               }t        j
                  j)                  || j*                  | j,                  	      }|S )
Nr[   r   r   r   r   r   r   r   r   )r   r   r   r  rh   r   conv2dr   rC   r   rD   r   r   r   r$   r   r   r   r   r   r   r   r   )rL   r  r  r   r  key_hidden_shape
batch_sizerz  	num_headsr   r@  r   r   query_shape	key_shapeattn_weights_shaper   r   r   s                      r)   ra   zDetrMHAttentionMap.forwardN  s;    K|11#26JJDMMJ&,,Q/T]][ZEUEUVXVYEZ[{{<0556HI]]))**44R8BB2FHXHX

$
  	 8D7I7I4
KH!+!1!11a!I-{HE)+Ve^XF	()[&%P&&q!,779>>{K  Aq!Q/::<AA)L \\%$,,.a0CDJJK]^hhijlmn 	 %'.8L}},,\-A-A!-D",MRRS_SdSdSfg}},,\T=S=S^b^k^k,lr(   r  r   )r    r!   r"   r#   r   r   r   rH   r$   r   ra   rb   rc   s   @r)   r~  r~  <  sv    n EE !E 	E
 E  ko!LL6;llTYT`T`cgTgr(   r~  c                   t    e Zd ZU eed<   dZdZdZg dZdZ	dZ
dZdZdZdgZ ej                          d        Zy	)
DetrPreTrainedModelr}   rq   r   )image)rv   r+  rD  TzMdetr\.model\.backbone\.model\.layer\d+\.0\.downsample\.1\.num_batches_trackedc                    | j                   j                  }| j                   j                  }t        |t              r|j                         D ]k  }t        |t        j                        st        j                  |j                  d       |j                  Lt        j                  |j                  d       m y t        |t              rt        j                  |j                  j                         t        j                  |j                   j                         t        j"                  |j                  j                  |       t        j"                  |j                   j                  |       y t        |t$              rSt        j&                  |j(                  j                         t        j&                  |j*                  j                         y t        |t        j,                  t        j                  f      rOt        j.                  |j                  d|       |j                   t        j                  |j                         y y t        |t        j0                        rtt        j.                  |j                  d|       |j2                  Et5        |j                  dd      s-t        j                  |j                  |j2                            y y y t        |t        j6                  t        j8                  f      r?t        j:                  |j                         t        j                  |j                         y y )	Nr   )ar   )gainr  )meanstd_is_hf_initializedF)r}   init_stdinit_xavier_stdrg   rm  modulesrh   rW  initkaiming_uniform_rC   rD   	constant_r~  zeros_r   r   xavier_uniform_r   uniform_r   r   r   normal_r   padding_idxgetattrr1  rY  ones_)rL   rs   r  
xavier_stdms        r)   _init_weightsz!DetrPreTrainedModel._init_weights  s   kk""[[00
f34^^% 2a+))!((a8vv)qvvq1	2
  23KK**+KK**+  !5!5JG  !5!5JG <=MM&//667MM&2299:BII 67LLSc:{{&FKK( '-LLSc:!!-gfmmMach6iFMM&*<*<=> 7j-r|| <=JJv}}%KK$ >r(   N)r    r!   r"   r   r&   base_model_prefixmain_input_nameinput_modalities_no_split_modulessupports_gradient_checkpointing_supports_sdpa_supports_flash_attn_supports_attention_backend_supports_flex_attn"_keys_to_ignore_on_load_unexpectedr$   r   r  r'   r(   r)   r  r  o  sc    $O!V&*#N"&X*& U]]_% %r(   r  c                   d     e Zd ZdZeedZdef fdZe	e
	 	 	 ddee   defd              Z xZS )	DetrEncoderz
    Transformer encoder that processes a flattened feature map from a vision backbone, composed of a stack of
    [`DetrEncoderLayer`] modules.

    Args:
        config (`DetrConfig`): Model configuration object.
    )r   
attentionsr}   c                     t         |   |       |j                  | _        t        j                  t        |j                        D cg c]  }t        |       c}      | _        | j                          y c c}w r   )
rG   rH   r   rh   rs  rangeencoder_layersr+  layers	post_initrL   r}   r@  rN   s      r)   rH   zDetrEncoder.__init__  sY     ~~mmuVMbMbGc$d!%5f%=$de 	 %es   A7r   r   c                     |}t         j                  j                  || j                  | j                        }t	        | j
                  ||      }| j                  D ]  } |||fd|i|} t        |      S )a  
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Flattened feature map (output of the backbone + projection layer) that is passed to the encoder.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding pixel features. Mask values selected in `[0, 1]`:

                - 1 for pixel features that are real (i.e. **not masked**),
                - 0 for pixel features that are padding (i.e. **masked**).

                [What are attention masks?](../glossary#attention-mask)
            spatial_position_embeddings (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Spatial position embeddings (2D positional encodings) that are added to the queries and keys in each self-attention layer.
        r   r}   inputs_embedsr   r7  )r3   )rh   r   r   r   r	   r}   r  r   )rL   r  r   r7  r   r   encoder_layers          r)   ra   zDetrEncoder.forward  s    . &--mt||VZVcVc-d2;;')
 "[[ 	M)~KfjpM	 ??r(   r  )r    r!   r"   r#   r+  r   _can_record_outputsr   rH   r   r   r   r   r   ra   rb   rc   s   @r)   r  r    sg     -=L]^z    $(	$@
 +,$@ 
$@   $@r(   r  c                   l     e Zd ZdZeeedZdef fdZ	e
e	 	 	 	 	 	 ddee   defd              Z xZS )	DetrDecodera   
    Transformer decoder that refines a set of object queries. It is composed of a stack of [`DetrDecoderLayer`] modules,
    which apply self-attention to the queries and cross-attention to the encoder's outputs.

    Args:
        config (`DetrConfig`): Model configuration object.
    )r   r  r6   r}   c                 @   t         |   |       |j                  | _        t        j                  t        |j                        D cg c]  }t        |       c}      | _        t        j                  |j                        | _        | j                          y c c}w r   )rG   rH   r   rh   rs  r  decoder_layersrD  r  r1  r.  	layernormr  r  s      r)   rH   zDetrDecoder.__init__  sm     ~~mmuVMbMbGc$d!%5f%=$defnn5 	 %es   Br   r   c                    ||}|t        | j                  |      }||t        | j                  ||      }| j                  j                  rdnd}	t        | j                        D ]B  \  }
} |||||fd|i|}| j                  j                  s,| j                  |      }|	|fz  }	D | j                        }| j                  j                  rt        j                  |	      }	t        ||	      S )a
  
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                The query embeddings that are passed into the decoder.

            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on certain queries. Mask values selected in `[0, 1]`:

                - 1 for queries that are **not masked**,
                - 0 for queries that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
                Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
                of the decoder.
            encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
                Mask to avoid performing cross-attention on padding pixel_values of the encoder. Mask values selected
                in `[0, 1]`:

                - 1 for pixels that are real (i.e. **not masked**),
                - 0 for pixels that are padding (i.e. **masked**).

            spatial_position_embeddings (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
                Spatial position embeddings (2D positional encodings from encoder) that are added to the keys in each cross-attention layer.
            object_queries_position_embeddings (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*):
                Position embeddings for the object query slots that are added to the queries and keys in each self-attention layer.
        Nr  )r}   r  r   r8   r'   rK  )r3   r   )	r	   r}   auxiliary_loss	enumerater  r  r$   r   r   )rL   r  r   r8   rK  r7  rJ  r   r   intermediateidxdecoder_layers               r)   ra   zDetrDecoder.forward  s   P $)M%6{{+-N !,1G1S%>{{+5&;	&" "[[77rT #,DKK"8 	1C)+2% (> M {{)) $} = 00	1  }5 ;;%% ;;|4L =]ijjr(   NNNNNN)r    r!   r"   r#   rD  r   r  r  r   rH   r   r   r   r   r   ra   rb   rc   s   @r)   r  r    sy     *'.	z 	   "#$(+/Tk +,Tk 
Tk   Tkr(   r  z
    The bare DETR Model (consisting of a backbone and encoder-decoder Transformer) outputting raw hidden-states without
    any specific head on top.
    c                   <    e Zd Zdef fdZd Zd Zee	 	 	 	 	 	 dde	j                  dz  de	j                  dz  de	j                  dz  d	e	j                  dz  d
e	j                  dz  de	j                  dz  dee   dee	j                     ez  fd              Z xZS )	DetrModelr}   c                 f   t         |   |       t        |      | _        |j                  dk(  r t        |j                  dz  d      | _        nE|j                  dk(  rt        |j                  dz        | _        nt        d|j                         t        j                  |j                  |j                        | _        t        j                  | j                  j                  d   |j                  d	      | _        t#        |      | _        t'        |      | _        | j+                          y )
Nsiner   T)r   learnedzNot supported r[   r   re  )rG   rH   rv   r   position_embedding_typer   r.  position_embeddingr   r   rh   r   rz  query_position_embeddingsrW  r   input_projectionr  encoderr  decoderr  r6  s     r)   rH   zDetrModel.__init__X  s     '/))V3&?RS@S_c&dD#++y8&B6>>UVCV&WD#~f.L.L-MNOO)+f6H6H&..)Y& "		$--*R*RSU*VX^XfXftu v"6*"6* 	r(   c                 ~    | j                   j                  j                         D ]  \  }}|j                  d        y )NFr   rq   r   r   rL   r@  params      r)   freeze_backbonezDetrModel.freeze_backbonel  s6    ++<<> 	(HAu  '	(r(   c                 ~    | j                   j                  j                         D ]  \  }}|j                  d        y )NTr  r  s      r)   unfreeze_backbonezDetrModel.unfreeze_backbonep  s6    ++<<> 	'HAu  &	'r(   Nr   r   decoder_attention_maskencoder_outputsr  decoder_inputs_embedsr   r   c           
         ||t        d      ||j                  \  }}	}
}|j                  }|t        j                  ||
|f|      }| j                  ||      }|d   \  }}| j                  |      }|j                  d      j                  ddd      }| j                  |j                  ||j                  |      }|j                  d      }n|j                  d   }|j                  }|}|j                  d   }t        |d	z        }| j                  t        j                  || j                  j                  ||g      ||j                  
      }|ft        j                   j#                  |d   j%                         ||f      j'                  t        j(                        d   }|j                  d      }n(t        j                  ||f|t        j*                        }| | j,                  d|||d|}| j.                  j0                  j3                  d      j5                  |dd      }||}nt        j6                  |      } | j8                  d|||||j:                  |d|}t=        |j:                  |j>                  |j@                  |jB                  |j:                  |j>                  |j@                  |jD                        S )a  
        decoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, num_queries)`, *optional*):
            Mask to avoid performing attention on certain object queries in the decoder. Mask values selected in `[0, 1]`:

            - 1 for queries that are **not masked**,
            - 0 for queries that are **masked**.
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing the flattened feature map (output of the backbone + projection layer), you
            can choose to directly pass a flattened representation of an image. Useful for bypassing the vision backbone.
        decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*):
            Optionally, instead of initializing the queries with a tensor of zeros, you can choose to directly pass an
            embedded representation. Useful for tasks that require custom query initialization.

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, DetrModel
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> image_processor = AutoImageProcessor.from_pretrained("facebook/detr-resnet-50")
        >>> model = DetrModel.from_pretrained("facebook/detr-resnet-50")

        >>> # prepare image for the model
        >>> inputs = image_processor(images=image, return_tensors="pt")

        >>> # forward pass
        >>> outputs = model(**inputs)

        >>> # the last hidden states are the final query embeddings of the Transformer decoder
        >>> # these are of shape (batch_size, num_queries, hidden_size)
        >>> last_hidden_states = outputs.last_hidden_state
        >>> list(last_hidden_states.shape)
        [1, 100, 256]
        ```Nz8You have to specify either pixel_values or inputs_embedsr   r[   r   r   r   r   rk   r   r   g      ?)r   rk   r   r   r   r  r   r7  r  r   r7  rJ  r8   rK  )r3   r4   r5   r6   r7   r8   r9   r   r'   )#r   r   rk   r$   rJ   r   r  r   r   r  r   r   r   r}   r.  rh   r   r   r   r   r   longr  r  rC   r   r   
zeros_liker  r3   r+   r   r  r6   r   )rL   r   r   r  r  r  r  r   r  num_channelsr   r   rk   vision_featuresr   r   projected_feature_mapflattened_featuresr7  flattened_maskseq_lenfeat_dimrJ  queriesdecoder_outputss                            r)   ra   zDetrModel.forwardt  s   h M$9WXX 6B6H6H3Jfe!((F!"ZZ*fe)DfU
"mmL*EO / 3K %)$9$9+$F!!6!>!>q!A!I!I!QPQ!R*.*A*A!''l>P>PW[ +B +' "\\!_N&,,Q/J"))F!. $))!,G7C<(H*.*A*Ajj*dkk.A.A8X!VW#)) +B +' %}}00D1A1G1G1IQY[cPd0ehhinisistuvw!%a "'Z,A&X]XbXb!c"*dll 0-,G 	O .2-K-K-R-R-\-\]^-_-f-f1.
*
 !,+G&&'IJG '$,, 
!1(C/Q"1"C"C#1
 
 -??"1"?"?.99,==&5&G&G"1"?"?.99'6'Q'Q	
 		
r(   r  )r    r!   r"   r   rH   r  r  r   r   r$   r%   
LongTensorr   r   r<   r+   ra   rb   rc   s   @r)   r  r  Q  s    z (('  26.2;?4826:>B
''$.B
 $$t+B
 !& 1 1D 8	B

 **T1B
 ((4/B
  %0047B
 +,B
 
u  	!O	3B
  B
r(   r  c                   (     e Zd ZdZ fdZd Z xZS )DetrMLPPredictionHeadz
    Very simple multi-layer perceptron (MLP, also called FFN), used to predict the normalized center coordinates,
    height and width of a bounding box w.r.t. an image.

    c                     t         |           || _        |g|dz
  z  }t        j                  d t        |g|z   ||gz         D              | _        y )Nr   c              3   N   K   | ]  \  }}t        j                  ||        y wr   )rh   r   ).0rM   ks      r)   	<genexpr>z1DetrMLPPredictionHead.__init__.<locals>.<genexpr>  s     #g1BIIaO#gs   #%)rG   rH   
num_layersrh   rs  ry  r  )rL   	input_dim
hidden_dim
output_dimr  hrN   s         r)   rH   zDetrMLPPredictionHead.__init__  sS    $LJN+mm#gYKRSOUVZdYeUe@f#ggr(   c                     t        | j                        D ]D  \  }}|| j                  dz
  k  r%t        j                  j                   ||            n ||      }F |S )Nr   )r  r  r  rh   r   r]  )rL   r^   ilayers       r)   ra   zDetrMLPPredictionHead.forward  sT    !$++. 	VHAu01DOOa4G0G""58,USTXA	Vr(   )r    r!   r"   r#   rH   ra   rb   rc   s   @r)   r  r    s    hr(   r  z
    DETR Model (consisting of a backbone and encoder-decoder Transformer) with object detection heads on top, for tasks
    such as COCO detection.
    c                   :    e Zd Zdef fdZee	 	 	 	 	 	 ddej                  dej                  dz  dej                  dz  dej                  dz  dej                  dz  d	ej                  dz  d
e
e   dz  dee   deej                     ez  fd              Z xZS )DetrForObjectDetectionr}   c                    t         |   |       t        |      | _        t	        j
                  |j                  |j                  dz         | _        t        |j                  |j                  dd      | _
        | j                          y )Nr   r   r   )r  r  r  r  )rG   rH   r  rq   rh   r   r.  
num_labelsclass_labels_classifierr  bbox_predictorr  r6  s     r)   rH   zDetrForObjectDetection.__init__  sr      v&
 (*yyNNF--1(
$ 4nnAZ[

 	r(   Nr   r   r  r  r  r  labelsr   r   c                 t    | j                   |f|||||d|}	|	d   }
| j                  |
      }| j                  |
      j                         }d\  }}}|d\  }}| j                  j
                  r<|	j                  }| j                  |      }| j                  |      j                         }| j                  ||| j                  || j                  ||      \  }}}t        ||||||	j                  |	j                  |	j                  |	j                  |	j                  |	j                  |	j                         S )aB  
        decoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, num_queries)`, *optional*):
            Mask to avoid performing attention on certain object queries in the decoder. Mask values selected in `[0, 1]`:

            - 1 for queries that are **not masked**,
            - 0 for queries that are **masked**.
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing the flattened feature map (output of the backbone + projection layer), you
            can choose to directly pass a flattened representation of an image. Useful for bypassing the vision backbone.
        decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*):
            Optionally, instead of initializing the queries with a tensor of zeros, you can choose to directly pass an
            embedded representation. Useful for tasks that require custom query initialization.
        labels (`list[Dict]` of len `(batch_size,)`, *optional*):
            Labels for computing the bipartite matching loss. List of dicts, each dictionary containing at least the
            following 2 keys: 'class_labels' and 'boxes' (the class labels and bounding boxes of an image in the batch
            respectively). The class labels themselves should be a `torch.LongTensor` of len `(number of bounding boxes
            in the image,)` and the boxes a `torch.FloatTensor` of shape `(number of bounding boxes in the image, 4)`.

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, DetrForObjectDetection
        >>> import torch
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> image_processor = AutoImageProcessor.from_pretrained("facebook/detr-resnet-50")
        >>> model = DetrForObjectDetection.from_pretrained("facebook/detr-resnet-50")

        >>> inputs = image_processor(images=image, return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> # convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax)
        >>> target_sizes = torch.tensor([image.size[::-1]])
        >>> results = image_processor.post_process_object_detection(outputs, threshold=0.9, target_sizes=target_sizes)[
        ...     0
        ... ]

        >>> for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
        ...     box = [round(i, 2) for i in box.tolist()]
        ...     print(
        ...         f"Detected {model.config.id2label[label.item()]} with confidence "
        ...         f"{round(score.item(), 3)} at location {box}"
        ...     )
        Detected remote with confidence 0.998 at location [40.16, 70.81, 175.55, 117.98]
        Detected remote with confidence 0.996 at location [333.24, 72.55, 368.33, 187.66]
        Detected couch with confidence 0.995 at location [-0.02, 1.15, 639.73, 473.76]
        Detected cat with confidence 0.999 at location [13.24, 52.05, 314.02, 470.93]
        Detected cat with confidence 0.999 at location [345.4, 23.85, 640.37, 368.72]
        ```)r   r  r  r  r  r   r  r  )r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   )rq   r  r  sigmoidr}   r  r   loss_functionrk   r-   r3   r4   r5   r6   r7   r8   r9   )rL   r   r   r  r  r  r  r  r   outputssequence_outputr0   r1   r.   r/   r2   outputs_classoutputs_coordr  s                      r)   ra   zDetrForObjectDetection.forward&  sW   L $**
!#9+'"7
 
 "!* --o>((9AAC
-=*i*+5(M={{))&AA $ < <\ J $ 3 3L A I I K151C1CZmUb2.D). )!/%77")"?"?&99$55&-&G&G")"?"?&99
 	
r(   r  )r    r!   r"   r   rH   r   r   r$   r%   r  r;   r:   r   r   r<   r-   ra   rb   rc   s   @r)   r  r    s    z "  /3;?4826:>$(l
''l
 $$t+l
 !& 1 1D 8	l

 **T1l
 ((4/l
  %0047l
 T
T!l
 +,l
 
u  	!$=	=l
  l
r(   r  z
    DETR Model (consisting of a backbone and encoder-decoder Transformer) with a segmentation head on top, for tasks
    such as COCO panoptic.
    c                       e Zd Zi ddddddddd	d
dddddddddddddddddddddd Zd!ef fd"Zee	 	 	 	 	 	 d.d$ej                  d%ej                  d#z  d&ej                  d#z  d'ej                  d#z  d(ej                  d#z  d)ej                  d#z  d*ee   d#z  d+ee   d,eej                     ez  fd-              Z xZS )/DetrForSegmentationzbbox_attention.q_linearzbbox_attention.q_projzbbox_attention.k_linearzbbox_attention.k_projzmask_head.lay1zmask_head.conv1.convzmask_head.gn1zmask_head.conv1.normzmask_head.lay2zmask_head.conv2.convzmask_head.gn2zmask_head.conv2.normzmask_head.adapter1z"mask_head.fpn_stages.0.fpn_adapterzmask_head.lay3z"mask_head.fpn_stages.0.refine.convzmask_head.gn3z"mask_head.fpn_stages.0.refine.normzmask_head.adapter2z"mask_head.fpn_stages.1.fpn_adapterzmask_head.lay4z"mask_head.fpn_stages.1.refine.convzmask_head.gn4z"mask_head.fpn_stages.1.refine.normzmask_head.adapter3z"mask_head.fpn_stages.2.fpn_adapterzmask_head.lay5z"mask_head.fpn_stages.2.refine.convzmask_head.gn5z"mask_head.fpn_stages.2.refine.normzmask_head.out_layzmask_head.output_convr}   c                 b   t         |   |       t        |      | _        |j                  |j
                  }}| j                  j                  j                  j                  }t        ||z   |d d d   dd  ||j                        | _        t        ||d      | _        | j                          y )Nr[   )rn  ra  r   r&  r  )r   )rG   rH   r  detrr.  r/  rq   r   r   rm  r&  	mask_headr~  bbox_attentionr  )rL   r}   r   number_of_headsr   rN   s        r)   rH   zDetrForSegmentation.__init__  s      +62	 (.~~v7U7U_%)YY__%=%=%X%X".&83DbD9"#># & : :	
 1oWZ[r(   Nr   r   r  r  r  r  r  r   r   c                    |j                   \  }	}
}}|j                  }|t        j                  |	||f|      }| j                  j
                  j                  ||      }|d   \  }}| j                  j
                  j                  |      }|j                  d      j                  ddd      }| j                  j
                  j                  |j                   ||j                  |      }|j                  d      }|* | j                  j
                  j                  d|||d|}| j                  j
                  j                  j                  j                  d      j!                  |	dd      }||}nt        j"                  |      } | j                  j
                  j$                  d|||||j&                  |d	|}|d   }| j                  j)                  |      }| j                  j+                  |      j-                         }|j                   d
d \  }}|j&                  j                  ddd      j/                  |	| j0                  j2                  ||      }|j/                  |	||      }|t        j4                  |j                        j6                  }t        j8                  |j                  d      j                  d      t        j:                  d|j                  |j                        |      }| j=                  |||      }| j?                  |||d   d   |d   d   |d   d   g      }|j/                  |	| j                  j0                  j@                  |j                   d
   |j                   d         } d\  }!}"}#|d\  }$}%| j0                  jB                  rP|jD                  }&| j                  j)                  |&      }$| j                  j+                  |&      j-                         }%| jG                  ||||| | j0                  |$|%      \  }!}"}#tI        |!|"||| |#|j&                  |jJ                  |jL                  |jN                  |j&                  |jJ                  |jL                        S )aN  
        decoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, num_queries)`, *optional*):
            Mask to avoid performing attention on certain object queries in the decoder. Mask values selected in `[0, 1]`:

            - 1 for queries that are **not masked**,
            - 0 for queries that are **masked**.
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Kept for backward compatibility, but cannot be used for segmentation, as segmentation requires
            multi-scale features from the backbone that are not available when bypassing it with inputs_embeds.
        decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*):
            Optionally, instead of initializing the queries with a tensor of zeros, you can choose to directly pass an
            embedded representation. Useful for tasks that require custom query initialization.
        labels (`list[Dict]` of len `(batch_size,)`, *optional*):
            Labels for computing the bipartite matching loss, DICE/F-1 loss and Focal loss. List of dicts, each
            dictionary containing at least the following 3 keys: 'class_labels', 'boxes' and 'masks' (the class labels,
            bounding boxes and segmentation masks of an image in the batch respectively). The class labels themselves
            should be a `torch.LongTensor` of len `(number of bounding boxes in the image,)`, the boxes a
            `torch.FloatTensor` of shape `(number of bounding boxes in the image, 4)` and the masks a
            `torch.FloatTensor` of shape `(number of bounding boxes in the image, height, width)`.

        Examples:

        ```python
        >>> import io
        >>> import httpx
        >>> from io import BytesIO
        >>> from PIL import Image
        >>> import torch
        >>> import numpy

        >>> from transformers import AutoImageProcessor, DetrForSegmentation
        >>> from transformers.image_transforms import rgb_to_id

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> image_processor = AutoImageProcessor.from_pretrained("facebook/detr-resnet-50-panoptic")
        >>> model = DetrForSegmentation.from_pretrained("facebook/detr-resnet-50-panoptic")

        >>> # prepare image for the model
        >>> inputs = image_processor(images=image, return_tensors="pt")

        >>> # forward pass
        >>> outputs = model(**inputs)

        >>> # Use the `post_process_panoptic_segmentation` method of the `image_processor` to retrieve post-processed panoptic segmentation maps
        >>> # Segmentation results are returned as a list of dictionaries
        >>> result = image_processor.post_process_panoptic_segmentation(outputs, target_sizes=[(300, 500)])

        >>> # A tensor of shape (height, width) where each value denotes a segment id, filled with -1 if no segment is found
        >>> panoptic_seg = result[0]["segmentation"]
        >>> panoptic_seg.shape
        torch.Size([300, 500])
        >>> # Get prediction score and segment_id to class_id mapping of each segment
        >>> panoptic_segments_info = result[0]["segments_info"]
        >>> len(panoptic_segments_info)
        5
        ```Nr   r[   r   r   r   r  r  r  r   r  r   )r   )r   rv  rh  r  r  )r.   r/   r0   r1   r?   r2   r3   r4   r5   r6   r7   r8   r9   r'   )(r   rk   r$   rJ   r  rq   r   r  r   r   r  r   r  r  rC   r   r   r  r  r3   r  r  r  r  r}   r.  r=  r9  wheretensorr  r  rz  r  r   r  r>   r   r  r6   )'rL   r   r   r  r  r  r  r  r   r  r  r   r   rk   r  r   r   r  r  r7  r  rJ  r  r  r  r0   r1   memoryr   	min_dtype	bbox_mask	seg_masksr?   r.   r/   r2   r  r  r  s'                                          r)   ra   zDetrForSegmentation.forward  sI   R 3?2D2D/
L&%$$Z$?OJ))//22<L+B/T !%		 @ @ M2::1=EEaAN&*iioo&H&H##F,:L:LSW 'I '
# a"5diioo55 0-,G 	O .2YY__-V-V-]-]-g-ghi-j-q-q1.
*
 !,+G&&'IJG1$))//11 
!1(C/Q"1"C"C#1
 
 *!,22?CYY--o>FFH
#))"#. 22::1aCHH++VU
 (,,ZG%FLL155I"[[((+55a8SfllKN ''P^'_	NN*%)!,Q/1CA1FXYHZ[\H]^ # 
	 ^^J		0@0@0L0Lioo^`Naclcrcrsucvw
-=*i*+5(M={{)).II $		 A A, O $		 8 8 F N N P151C1C
J]\i2.D). &!!/-??"1"?"?.99,==&5&G&G"1"?"?.99
 	
r(   r  )r    r!   r"   _checkpoint_conversion_mappingr   rH   r   r   r$   r%   r  r;   r:   r   r   r<   r>   ra   rb   rc   s   @r)   r  r    s   &!#:&!#:& 	0	&
 	/& 	0& 	/& 	B& 	>& 	=& 	B& 	>& 	=& 	B& 	>&  	=!&" 	4#&"(z *  /3;?4826:>$(j
''j
 $$t+j
 !& 1 1D 8	j

 **T1j
 ((4/j
  %0047j
 T
T!j
 +,j
 
u  	!$:	:j
  j
r(   r  )r  r  r  r  )Nr  )Lr#   r   collections.abcr   dataclassesr   r$   torch.nnrh    r   r  activationsr   backbone_utilsr   masking_utilsr	   modeling_layersr
   modeling_outputsr   r   r   modeling_utilsr   r   processing_utilsr   pytorch_utilsr   utilsr   r   r   r   utils.genericr   r   utils.output_capturingr   configuration_detrr   
get_loggerr    loggerr   r+   r-   r>   ModulerA   rp   rv   r   r   r   r   r   r   r  r!  r+  rD  rN  r`  rm  r~  r  r  r  r  r  r  r  __all__r'   r(   r)   <module>r+     sT     $ !   & ! + 6 9 
 G & @  J 5 * 
		H	% @: @ @ 	@( 	@ 	@ 
"? "? "?J 
)?[ )? )?X$ BII $ N'01bii 1h1		 1h299 P !%II%<<% 
% <<	%
 LL4'% T\% % '(%8?)		 ?)DM) M)`bii "61 6rY1 Yx
8BII 
84 4,@/BII @/F0 0f .%/ .% .%b:@% :@zpk% pkf a
# a
a
HBII & @
0 @
@
F V
- V
V
rr(   