
    qij                     D   d Z ddlZddlmZ ddlmZ ddlZddlmZ ddlm	Z	 ddl
mZ dd	lmZmZ dd
lmZmZ ddlmZ ddlmZmZmZmZ ddlmZmZ ddlmZ ddlmZ  ej@                  e!      Z"e ed       G d de                    Z# G d dejH                        Z% G d dejH                        Z& G d dejH                        Z' G d dejH                        Z(	 	 d@dejH                  dejR                  dejR                  d ejR                  d!ejR                  dz  d"e*dz  d#e*d$ee   fd%Z+ G d& d'ejH                        Z, G d( d)ejH                        Z- G d* d+ejH                        Z. G d, d-ejH                        Z/ G d. d/ejH                        Z0 G d0 d1e      Z1 G d2 d3ejH                        Z2e G d4 d5e             Z3e G d6 d7e3             Z4 G d8 d9ejH                        Z5 G d: d;ejH                        Z6 ed<       G d= d>e3             Z7g d?Z8y)AzPyTorch YOLOS model.    N)Callable)	dataclass)nn   )ACT2FN)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPooling)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)ModelOutputTransformersKwargsauto_docstringlogging)can_return_tuplemerge_with_config_defaults)capture_outputs   )YolosConfigz5
    Output type of [`YolosForObjectDetection`].
    )custom_introc                   <   e Zd ZU dZdZej                  dz  ed<   dZe	dz  ed<   dZ
ej                  dz  ed<   dZej                  dz  ed<   dZee	   dz  ed<   dZej                  dz  ed<   dZeej                     dz  ed	<   dZeej                     dz  ed
<   y)YolosObjectDetectionOutputa0  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided)):
        Total loss as a linear combination of a negative log-likehood (cross-entropy) for class prediction and a
        bounding box loss. The latter is defined as a linear combination of the L1 loss and the generalized
        scale-invariant IoU loss.
    loss_dict (`Dict`, *optional*):
        A dictionary containing the individual losses. Useful for logging.
    logits (`torch.FloatTensor` of shape `(batch_size, num_queries, num_classes + 1)`):
        Classification logits (including no-object) for all queries.
    pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
        Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These
        values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding
        possible padding). You can use [`~YolosImageProcessor.post_process`] to retrieve the unnormalized bounding
        boxes.
    auxiliary_outputs (`list[Dict]`, *optional*):
        Optional, only returned when auxiliary losses are activated (i.e. `config.auxiliary_loss` is set to `True`)
        and labels are provided. It is a list of dictionaries containing the two above keys (`logits` and
        `pred_boxes`) for each decoder layer.
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
        Sequence of hidden-states at the output of the last layer of the decoder of the model.
    Nloss	loss_dictlogits
pred_boxesauxiliary_outputslast_hidden_statehidden_states
attentions)__name__
__module____qualname____doc__r   torchFloatTensor__annotations__r   dictr   r   r   listr   r    tupler!        Z/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/yolos/modeling_yolos.pyr   r   %   s    , &*D%

d
")!Itd{!'+FE$++/J!!D(/+/tDzD(/26u((4/659M5**+d2926Je''(4/6r-   r   c                   d     e Zd ZdZdeddf fdZdej                  dej                  fdZ xZ	S )YolosEmbeddingszT
    Construct the CLS token, detection tokens, position and patch embeddings.

    configreturnNc                 n   t         |           t        j                  t	        j
                  dd|j                              | _        t        j                  t	        j
                  d|j                  |j                              | _	        t        |      | _        | j                  j                  }t        j                  t	        j
                  d||j                  z   dz   |j                              | _        t        j                  |j                        | _        t#        |      | _        || _        y Nr   )super__init__r   	Parameterr&   zeroshidden_size	cls_tokennum_detection_tokensdetection_tokensYolosPatchEmbeddingspatch_embeddingsnum_patchesposition_embeddingsDropouthidden_dropout_probdropout$InterpolateInitialPositionEmbeddingsinterpolationr1   )selfr1   r?   	__class__s      r.   r6   zYolosEmbeddings.__init__R   s    ekk!Q8J8J&KL "U[[F<W<WY_YkYk-l m 4V <++77#%<<KK;)D)DDqH&J\J\]$
  zz&"<"<=A&Ir-   pixel_valuesc                    |j                   \  }}}}| j                  |      }|j                         \  }}}| j                  j	                  |dd      }	| j
                  j	                  |dd      }
t        j                  |	||
fd      }| j                  | j                  ||f      }||z   }| j                  |      }|S )Nr   dim)shaper>   sizer:   expandr<   r&   catrE   r@   rC   )rF   rH   
batch_sizenum_channelsheightwidth
embeddingsseq_len_
cls_tokensr<   r@   s               r.   forwardzYolosEmbeddings.forwarda   s    2>2D2D/
L&%**<8
!+!2
GQ ^^**:r2>
0077
BKYY
J8HIqQ
 #001I1IFTY?["55
\\*-
r-   
r"   r#   r$   r%   r   r6   r&   TensorrY   __classcell__rG   s   @r.   r0   r0   L   s6    
{ t ELL U\\ r-   r0   c                   B     e Zd Zd fdZddej
                  fdZ xZS )rD   r2   c                 0    t         |           || _        y Nr5   r6   r1   rF   r1   rG   s     r.   r6   z-InterpolateInitialPositionEmbeddings.__init__w       r-   c                    |d d dd d f   }|d d d f   }|d d | j                   j                   d d d f   }|d d d| j                   j                   d d f   }|j                  dd      }|j                  \  }}}| j                   j                  d   | j                   j
                  z  | j                   j                  d   | j                   j
                  z  }
}	|j                  |||	|
      }|\  }}|| j                   j
                  z  || j                   j
                  z  }}t        j                  j                  |||fdd      }|j                  d      j                  dd      }t        j                  |||fd      }|S )Nr   r      bicubicFrN   modealign_cornersrK   )r1   r;   	transposerM   
image_size
patch_sizeviewr   
functionalinterpolateflattenr&   rP   )rF   	pos_embedimg_sizecls_pos_embeddet_pos_embedpatch_pos_embedrQ   r9   rV   patch_heightpatch_widthrS   rT   new_patch_heightnew_patch_widthscale_pos_embeds                   r.   rY   z,InterpolateInitialPositionEmbeddings.forward{   s   !!Q'*%ag.!!dkk&F&F%F%H!"KL#AqDKK,L,L+L'La$OP)33Aq9+:+@+@(
K KK""1%)?)??KK""1%)?)?? " *..z;Vab ,2dkk6L6L,LeW[WbWbWmWmNm/--33#3_"EIej 4 
 *11!4>>q!D))]O]$SYZ[r-   r2   N)i   i@  r"   r#   r$   r6   r&   r[   rY   r\   r]   s   @r.   rD   rD   v   s    %,, r-   rD   c                   B     e Zd Zd fdZddej
                  fdZ xZS ) InterpolateMidPositionEmbeddingsr2   c                 0    t         |           || _        y r`   ra   rb   s     r.   r6   z)InterpolateMidPositionEmbeddings.__init__   rc   r-   c                 v   |d d d d dd d f   }|d d d f   }|d d d d | j                   j                   d d d f   }|d d d d d| j                   j                   d d f   }|j                  dd      }|j                  \  }}}}	| j                   j                  d   | j                   j
                  z  | j                   j                  d   | j                   j
                  z  }}
|j                  ||z  ||
|      }|\  }}|| j                   j
                  z  || j                   j
                  z  }}t        j                  j                  |||fdd      }|j                  d      j                  dd      j                         j                  ||||z  |      }t        j                  |||fd      }|S )	Nr   r   re   r   rf   Frg   rK   )r1   r;   rj   rM   rk   rl   rm   r   rn   ro   rp   
contiguousr&   rP   )rF   rq   rr   rs   rt   ru   depthrQ   r9   rV   rv   rw   rS   rT   rx   ry   rz   s                    r.   rY   z(InterpolateMidPositionEmbeddings.forward   s   !!Q1*-%ag.!!Q)I)I(I(KQ"NO#Aq!t{{/O/O.O*OQR$RS)33Aq92A2G2G/z; KK""1%)?)??KK""1%)?)?? " *..uz/A;P\^ij ,2dkk6L6L,LeW[WbWbWmWmNm/--33#3_"EIej 4 
 ##A&Yq!_Z\T%%5%GU	 	  ))]O]$SYZ[r-   r{   r|   r}   r]   s   @r.   r   r      s    %,, r-   r   c                   Z     e Zd ZdZ fdZdej                  dej                  fdZ xZS )r=   z
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
    c                    t         |           |j                  |j                  }}|j                  |j
                  }}t        |t        j                  j                        r|n||f}t        |t        j                  j                        r|n||f}|d   |d   z  |d   |d   z  z  }|| _        || _        || _        || _
        t        j                  ||||      | _        y )Nr   r   )kernel_sizestride)r5   r6   rk   rl   rR   r9   
isinstancecollectionsabcIterabler?   r   Conv2d
projection)rF   r1   rk   rl   rR   r9   r?   rG   s          r.   r6   zYolosPatchEmbeddings.__init__   s    !'!2!2F4E4EJ
$*$7$79K9Kk#-j+//:R:R#SZZdfpYq
#-j+//:R:R#SZZdfpYq
!!}
15*Q-:VW=:XY$$(&))L+:^hir-   rH   r2   c                     |j                   \  }}}}|| j                  k7  rt        d      | j                  |      j	                  d      j                  dd      }|S )NzeMake sure that the channel dimension of the pixel values match with the one set in the configuration.re   r   )rM   rR   
ValueErrorr   rp   rj   )rF   rH   rQ   rR   rS   rT   rU   s          r.   rY   zYolosPatchEmbeddings.forward   sb    2>2D2D/
L&%4,,,w  __\2::1=GG1M
r-   )	r"   r#   r$   r%   r6   r&   r[   rY   r\   r]   s   @r.   r=   r=      s)    jELL U\\ r-   r=   modulequerykeyvalueattention_maskscalingrC   kwargsc                    ||j                  d      dz  }t        j                  ||j                  dd            |z  }|||z   }t        j
                  j                  |d      }t        j
                  j                  ||| j                        }t        j                  ||      }	|	j                  dd      j                         }	|	|fS )NrJ         re   r   rK   )ptrainingr   )
rN   r&   matmulrj   r   rn   softmaxrC   r   r   )
r   r   r   r   r   r   rC   r   attn_weightsattn_outputs
             r.   eager_attention_forwardr      s     **R.D( <<s}}Q':;gEL!#n4==((2(>L==((6??([L,,|U3K''1-88:K$$r-   c                   z     e Zd Zdef fdZdej                  deej                  ej                  f   fdZ xZ	S )YolosSelfAttentionr1   c                 2   t         |           |j                  |j                  z  dk7  r2t	        |d      s&t        d|j                   d|j                   d      || _        |j                  | _        t        |j                  |j                  z        | _        | j                  | j                  z  | _	        |j                  | _        | j                  dz  | _        d| _        t        j                  |j                  | j                  |j                         | _        t        j                  |j                  | j                  |j                         | _        t        j                  |j                  | j                  |j                         | _        y )	Nr   embedding_sizezThe hidden size z4 is not a multiple of the number of attention heads .r   F)bias)r5   r6   r9   num_attention_headshasattrr   r1   intattention_head_sizeall_head_sizeattention_probs_dropout_probdropout_probr   	is_causalr   Linearqkv_biasr   r   r   rb   s     r.   r6   zYolosSelfAttention.__init__   sF    : ::a?PVXhHi"6#5#5"6 7334A7 
 #)#=#= #&v'9'9F<V<V'V#W !558P8PP"??//5YYv1143E3EFOO\
99V//1C1C&//ZYYv1143E3EFOO\
r-   r    r2   c           
         |j                   d   }|d| j                  | j                  f} | j                  |      j                  | j                  dd      } | j                  |      j                  | j                  dd      } | j                  |      j                  | j                  dd      }t        j                  | j                  j                  t              } || |||d | j                  | j                  | j                  sdn| j                         \  }}	|j#                         d d | j$                  fz   }
|j'                  |
      }||	fS )Nr   rJ   r   re           )r   r   rC   )rM   r   r   r   rm   rj   r   r   r   get_interfacer1   _attn_implementationr   r   r   r   r   rN   r   reshape)rF   r    rQ   	new_shape	key_layervalue_layerquery_layerattention_interfacecontext_layerattention_probsnew_context_layer_shapes              r.   rY   zYolosSelfAttention.forward  sF   "((+
D$<$<d>V>VV	0DHH]+00)<FFq!L	4djj/44i@JJ1aP4djj/44i@JJ1aP(?(M(MKK,,.E)
 *=nnLL#}}C$2C2C	*
& #0"4"4"6s";t?Q?Q>S"S%--.EFo--r-   )
r"   r#   r$   r   r6   r&   r[   r+   rY   r\   r]   s   @r.   r   r      s:    ]{ ](.U\\ .eELL%,,<V6W .r-   r   c                   x     e Zd ZdZdef fdZdej                  dej                  dej                  fdZ xZ	S )YolosSelfOutputz
    The residual connection is defined in YolosLayer instead of here (as is the case with other models), due to the
    layernorm applied before each block.
    r1   c                     t         |           t        j                  |j                  |j                        | _        t        j                  |j                        | _        y r`   )	r5   r6   r   r   r9   denserA   rB   rC   rb   s     r.   r6   zYolosSelfOutput.__init__,  sB    YYv1163E3EF
zz&"<"<=r-   r    input_tensorr2   c                 J    | j                  |      }| j                  |      }|S r`   r   rC   rF   r    r   s      r.   rY   zYolosSelfOutput.forward1  s$    

=1]3r-   rZ   r]   s   @r.   r   r   &  s=    
>{ >
U\\  RWR^R^ r-   r   c                   \     e Zd Zdef fdZdej                  dej                  fdZ xZS )YolosAttentionr1   c                 b    t         |           t        |      | _        t	        |      | _        y r`   )r5   r6   r   	attentionr   outputrb   s     r.   r6   zYolosAttention.__init__9  s&    +F3%f-r-   r    r2   c                 R    | j                  |      \  }}| j                  ||      }|S r`   )r   r   )rF   r    self_attn_outputrW   r   s        r.   rY   zYolosAttention.forward>  s,    "nn];!-}=r-   	r"   r#   r$   r   r6   r&   r[   rY   r\   r]   s   @r.   r   r   8  s*    .{ .
U\\ ell r-   r   c                   \     e Zd Zdef fdZdej                  dej                  fdZ xZS )YolosIntermediater1   c                    t         |           t        j                  |j                  |j
                        | _        t        |j                  t              rt        |j                     | _        y |j                  | _        y r`   )r5   r6   r   r   r9   intermediate_sizer   r   
hidden_actstrr   intermediate_act_fnrb   s     r.   r6   zYolosIntermediate.__init__F  s]    YYv1163K3KL
f''-'-f.?.?'@D$'-'8'8D$r-   r    r2   c                 J    | j                  |      }| j                  |      }|S r`   )r   r   )rF   r    s     r.   rY   zYolosIntermediate.forwardN  s&    

=100?r-   r   r]   s   @r.   r   r   E  s*    9{ 9U\\ ell r-   r   c                   t     e Zd Zdef fdZdej                  dej                  dej                  fdZ xZS )YolosOutputr1   c                     t         |           t        j                  |j                  |j
                        | _        t        j                  |j                        | _	        y r`   )
r5   r6   r   r   r   r9   r   rA   rB   rC   rb   s     r.   r6   zYolosOutput.__init__V  sB    YYv779K9KL
zz&"<"<=r-   r    r   r2   c                 T    | j                  |      }| j                  |      }||z   }|S r`   r   r   s      r.   rY   zYolosOutput.forward[  s.    

=1]3%4r-   r   r]   s   @r.   r   r   U  s8    >{ >
U\\  RWR^R^ r-   r   c                   `     e Zd ZdZdef fdZdej                  dej                  fdZ xZ	S )
YolosLayerz?This corresponds to the Block class in the timm implementation.r1   c                 r   t         |           |j                  | _        d| _        t	        |      | _        t        |      | _        t        |      | _	        t        j                  |j                  |j                        | _        t        j                  |j                  |j                        | _        y )Nr   eps)r5   r6   chunk_size_feed_forwardseq_len_dimr   r   r   intermediater   r   r   	LayerNormr9   layer_norm_epslayernorm_beforelayernorm_afterrb   s     r.   r6   zYolosLayer.__init__f  s    '-'E'E$'/-f5!&) "V-?-?VEZEZ [!||F,>,>FDYDYZr-   r    r2   c                     | j                  |      }| j                  |      }||z   }| j                  |      }| j                  |      }| j	                  ||      }|S r`   )r   r   r   r   r   )rF   r    hidden_states_normattention_outputlayer_outputs        r.   rY   zYolosLayer.forwardp  si    !22=A>>*<= )=8 ++M:((6 {{<?r-   rZ   r]   s   @r.   r   r   c  s/    I[{ [U\\ ell r-   r   c                   T     e Zd Zdeddf fdZdej                  dededefdZ	 xZ
S )	YolosEncoderr1   r2   Nc                 @   t         |           || _        t        j                  t        |j                        D cg c]  }t        |       c}      | _        d| _	        d|j                  d   |j                  d   z  |j                  dz  z  z   |j                  z   }|j                  rBt        j                  t        j                   |j                  dz
  d||j"                              nd | _        |j                  rt'        |      | _        y d | _        y c c}w )NFr   r   re   )r5   r6   r1   r   
ModuleListrangenum_hidden_layersr   layergradient_checkpointingrk   rl   r;   use_mid_position_embeddingsr7   r&   r8   r9   mid_position_embeddingsr   rE   )rF   r1   rW   
seq_lengthrG   s       r.   r6   zYolosEncoder.__init__  s   ]]fF^F^@_#`1Jv$6#`a
&+# ""1%(9(9!(<<@Q@QST@TTUX^XsXss 	 11 LL,,q0&&	  	$ JPIkIk=fEqu' $as   Dr    rS   rT   c                 D   | j                   j                  r| j                  | j                  ||f      }t	        | j
                        D ]I  \  }} ||      }| j                   j                  s%|| j                   j                  dz
  k  sB||   z   }K t        |      S )Nr   )r   )r1   r   rE   r   	enumerater   r   r	   )rF   r    rS   rT   $interpolated_mid_position_embeddingsilayer_modules          r.   rY   zYolosEncoder.forward  s     ;;22373E3EdFbFbekmrds3t0(4 	\OA|(7M{{66559:$14XYZ4[$[M	\ ??r-   )r"   r#   r$   r   r6   r&   r[   r   r	   rY   r\   r]   s   @r.   r   r     sM    v{ vt v0@||@ @ 	@
 
@r-   r   c                   F    e Zd ZU eed<   dZdZdZdZg Z	dZ
dZdZdZeedZy)YolosPreTrainedModelr1   vitrH   )imageT)r    r!   N)r"   r#   r$   r   r(   base_model_prefixmain_input_nameinput_modalitiessupports_gradient_checkpointing_no_split_modules_supports_sdpa_supports_flash_attn_supports_flex_attn_supports_attention_backendr   r   _can_record_outputsr,   r-   r.   r   r     sJ    $O!&*#N"&#(r-   r   c            
            e Zd Zddedef fdZdefdZe e	d      e
	 dd	ej                  dz  d
ee   defd                     Z xZS )
YolosModelr1   add_pooling_layerc                    t         |   |       || _        t        |      | _        t        |      | _        t        j                  |j                  |j                        | _        |rt        |      nd| _        | j                          y)zv
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        r   N)r5   r6   r1   r0   rU   r   encoderr   r   r9   r   	layernormYolosPoolerpooler	post_init)rF   r1   r
  rG   s      r.   r6   zYolosModel.__init__  sk    
 	 )&1#F+f&8&8f>S>ST->k&)D 	r-   r2   c                 .    | j                   j                  S r`   )rU   r>   )rF   s    r.   get_input_embeddingszYolosModel.get_input_embeddings  s    ///r-   F)tie_last_hidden_statesNrH   r   c                    |t        d      | j                  |      }|j                  dd  \  }}| j                  |||      }|j                  }| j                  |      }| j                  | j                  |      nd }t        ||      S )Nz You have to specify pixel_valuesr   )rS   rT   )r   pooler_output)r   rU   rM   r  r   r  r  r
   )	rF   rH   r   embedding_outputrS   rT   encoder_outputssequence_outputpooled_outputs	            r.   rY   zYolosModel.forward  s     ?@@??<8$**23/+/<<8HQW_d<+e);;..98<8OO4UY)O[hiir-   )Tr`   )r"   r#   r$   r   boolr6   r=   r  r   r   r   r&   r[   r   r   r
   rY   r\   r]   s   @r.   r	  r	    s    { t "0&: 0  E2 -1jllT)j +,j 
$	j  3  jr-   r	  c                   \     e Zd Zdef fdZdej                  dej                  fdZ xZS )r  r1   c                     t         |           t        j                  |j                  |j                        | _        t        j                         | _        y r`   )r5   r6   r   r   r9   r   Tanh
activationrb   s     r.   r6   zYolosPooler.__init__  s9    YYv1163E3EF
'')r-   r    r2   c                 \    |d d df   }| j                  |      }| j                  |      }|S )Nr   )r   r  )rF   r    first_token_tensorr  s       r.   rY   zYolosPooler.forward  s6     +1a40

#566r-   r   r]   s   @r.   r  r    s*    ${ $
U\\ ell r-   r  c                   (     e Zd ZdZ fdZd Z xZS )YolosMLPPredictionHeadz
    Very simple multi-layer perceptron (MLP, also called FFN), used to predict the normalized center coordinates,
    height and width of a bounding box w.r.t. an image.

    c                     t         |           || _        |g|dz
  z  }t        j                  d t        |g|z   ||gz         D              | _        y )Nr   c              3   N   K   | ]  \  }}t        j                  ||        y wr`   )r   r   ).0nks      r.   	<genexpr>z2YolosMLPPredictionHead.__init__.<locals>.<genexpr>  s     #g1BIIaO#gs   #%)r5   r6   
num_layersr   r   ziplayers)rF   	input_dim
hidden_dim
output_dimr)  hrG   s         r.   r6   zYolosMLPPredictionHead.__init__  sS    $LJN+mm#gYKRSOUVZdYeUe@f#ggr-   c                     t        | j                        D ]D  \  }}|| j                  dz
  k  r%t        j                  j                   ||            n ||      }F |S r4   )r   r+  r)  r   rn   relu)rF   xr   r   s       r.   rY   zYolosMLPPredictionHead.forward  sT    !$++. 	VHAu01DOOa4G0G""58,USTXA	Vr-   )r"   r#   r$   r%   r6   rY   r\   r]   s   @r.   r"  r"    s    hr-   r"  zy
    YOLOS Model (consisting of a ViT encoder) with object detection heads on top, for tasks such as COCO detection.
    c                        e Zd Zdef fdZd Zee	 d
dej                  de
e   dz  dee   defd	              Z xZS )YolosForObjectDetectionr1   c                 "   t         |   |       t        |d      | _        t	        |j
                  |j
                  |j                  dz   d      | _        t	        |j
                  |j
                  dd      | _        | j                          y )NF)r
  r   r   )r,  r-  r.  r)     )
r5   r6   r	  r   r"  r9   
num_labelsclass_labels_classifierbbox_predictorr  rb   s     r.   r6   z YolosForObjectDetection.__init__  s      f> (>((V5G5GTZTeTehiTivw(
$ 5((V5G5GTUbc

 	r-   c                 ^    t        |d d |d d       D cg c]
  \  }}||d c}}S c c}}w )NrJ   )r   r   )r*  )rF   outputs_classoutputs_coordabs        r.   _set_aux_lossz%YolosForObjectDetection._set_aux_loss'  s7    ;>}Sb?QS`adbdSe;fg41a1A.gggs   )NrH   labelsr   r2   c           
      d    | j                   |fi |}|j                  }|dd| j                  j                   dddf   }| j	                  |      }| j                  |      j                         }d\  }}	}
|d\  }}| j                  j                  r<|j                  }| j	                  |      }| j                  |      j                         }| j                  ||| j                  || j                  ||      \  }}	}
t        ||	|||
|j                  |j                  |j                        S )a`	  
        labels (`list[Dict]` of len `(batch_size,)`, *optional*):
            Labels for computing the bipartite matching loss. List of dicts, each dictionary containing at least the
            following 2 keys: `'class_labels'` and `'boxes'` (the class labels and bounding boxes of an image in the
            batch respectively). The class labels themselves should be a `torch.LongTensor` of len `(number of bounding
            boxes in the image,)` and the boxes a `torch.FloatTensor` of shape `(number of bounding boxes in the image,
            4)`.

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, AutoModelForObjectDetection
        >>> import torch
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> image_processor = AutoImageProcessor.from_pretrained("hustvl/yolos-tiny")
        >>> model = AutoModelForObjectDetection.from_pretrained("hustvl/yolos-tiny")

        >>> inputs = image_processor(images=image, return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> # convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax)
        >>> target_sizes = torch.tensor([image.size[::-1]])
        >>> results = image_processor.post_process_object_detection(outputs, threshold=0.9, target_sizes=target_sizes)[
        ...     0
        ... ]

        >>> for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
        ...     box = [round(i, 2) for i in box.tolist()]
        ...     print(
        ...         f"Detected {model.config.id2label[label.item()]} with confidence "
        ...         f"{round(score.item(), 3)} at location {box}"
        ...     )
        Detected remote with confidence 0.991 at location [46.48, 72.78, 178.98, 119.3]
        Detected remote with confidence 0.908 at location [336.48, 79.27, 368.23, 192.36]
        Detected cat with confidence 0.934 at location [337.18, 18.06, 638.14, 373.09]
        Detected cat with confidence 0.979 at location [10.93, 53.74, 313.41, 470.67]
        Detected remote with confidence 0.974 at location [41.63, 72.23, 178.09, 119.99]
        ```N)NNN)NN)r   r   r   r   r   r   r    r!   )r   r   r1   r;   r8  r9  sigmoidauxiliary_lossr    loss_functiondevicer   r!   )rF   rH   r@  r   outputsr  r   r   r   r   r   r;  r<  r   s                 r.   rY   zYolosForObjectDetection.forward*  s;   n /7dhh|.Nv.N!33 *!dkk.N.N-N-PRS*ST --o>((9AAC
-=*i*+5(M={{))&44 $ < <\ J $ 3 3L A I I K151C1CZmUb2.D). *!/%77!//))	
 		
r-   r`   )r"   r#   r$   r   r6   r?  r   r   r&   r'   r*   r)   r   r   r   rY   r\   r]   s   @r.   r4  r4    sw    { &h  %)S
''S
 T
T!S
 +,	S

 
$S
  S
r-   r4  )r4  r	  r   )Nr   )9r%   collections.abcr   r   dataclassesr   r&   r   activationsr   modeling_layersr   modeling_outputsr	   r
   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   utils.genericr   r   utils.output_capturingr   configuration_yolosr   
get_loggerr"   loggerr   Moduler0   rD   r   r=   r[   floatr   r   r   r   r   r   r   r   r   r	  r  r"  r4  __all__r,   r-   r.   <module>rW     s6     $ !   ! 9 K F & M M I 5 , 
		H	% 
7 7 7B'bii 'T299 :ryy B299 P !%II%<<% 
% <<	%
 LL4'% T\% % '(%:/. /.fbii $	RYY 			  
")) 
+ <)@299 )@X ?  " (j% (j (jV"))  RYY & 
l
2 l

l
^ Lr-   