
    qi,                        d Z ddlmZ ddlZddlmZ ddlmZ ddlm	Z	 ddl
mZ dd	lmZ dd
lmZ ddlmZmZmZmZ ddlmZ ddlmZ  ej2                  e      Ze ed       G d de                    Ze G d de             ZddZ G d dej>                        Z  G d dej>                        Z! ed       G d de             Z"ddgZ#y)zPyTorch VitPose model.    )	dataclassN)nn   )initialization)load_backbone)BackboneOutput)PreTrainedModel)Unpack)ModelOutputTransformersKwargsauto_docstringlogging)can_return_tuple   )VitPoseConfigz6
    Class for outputs of pose estimation models.
    )custom_introc                       e Zd ZU dZdZej                  dz  ed<   dZej                  dz  ed<   dZ	e
ej                  df   dz  ed<   dZe
ej                  df   dz  ed<   y)VitPoseEstimatorOutputaH  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Loss is not supported at this moment. See https://github.com/ViTAE-Transformer/ViTPose/tree/main/mmpose/models/losses for further detail.
    heatmaps (`torch.FloatTensor` of shape `(batch_size, num_keypoints, height, width)`):
        Heatmaps as predicted by the model.
    hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
        one for the output of each stage) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states
        (also called feature maps) of the model at the output of each stage.
    Nlossheatmaps.hidden_states
attentions)__name__
__module____qualname____doc__r   torchFloatTensor__annotations__r   r   tupler        ^/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/vitpose/modeling_vitpose.pyr   r   $   sq    	 &*D%

d
"))-He$&-:>M5**C/047>7;Je'',-4;r"   r   c                       e Zd ZU eed<   dZdZdZdZ e	j                         dej                  ej                  z  ej                  z  fd       Zy)	VitPosePreTrainedModelconfigvitpixel_values)imageTmodulec                    t        |t        j                  t        j                  f      rct	        j
                  |j                  d| j                  j                         |j                   t	        j                  |j                         yyt        |t        j                        r?t	        j                  |j                         t	        j                  |j                         yy)zInitialize the weightsg        )meanstdN)
isinstancer   LinearConv2dinittrunc_normal_weightr&   initializer_rangebiaszeros_	LayerNormones_)selfr*   s     r#   _init_weightsz$VitPosePreTrainedModel._init_weightsD   s     fryy"))45v}}3DKK<Y<YZ{{&FKK( '-KK$JJv}}% .r"   N)r   r   r   r   r   base_model_prefixmain_input_nameinput_modalitiessupports_gradient_checkpointingr   no_gradr   r/   r0   r7   r:   r!   r"   r#   r%   r%   <   sV    $O!&*#U]]_&BII		$9BLL$H & &r"   r%   c                    |dvrt        d      | j                  dk7  rt        d      | j                  \  }}}}d}|dk(  rd}| dddddd	f    | dddddd	f<   | j                  |d
|||      } | j	                         }|j                         D ])  \  }	}
| dd|
d	f   |dd|	d	f<   | dd|	d	f   |dd|
d	f<   + |j                  ||||f      }|j                  d
      }|S )a  Flip the flipped heatmaps back to the original form.

    Args:
        output_flipped (`torch.tensor` of shape `(batch_size, num_keypoints, height, width)`):
            The output heatmaps obtained from the flipped images.
        flip_pairs (`torch.Tensor` of shape `(num_keypoints, 2)`):
            Pairs of keypoints which are mirrored (for example, left ear -- right ear).
        target_type (`str`, *optional*, defaults to `"gaussian-heatmap"`):
            Target type to use. Can be gaussian-heatmap or combined-target.
            gaussian-heatmap: Classification target with gaussian distribution.
            combined-target: The combination of classification target (response map) and regression target (offset map).
            Paper ref: Huang et al. The Devil is in the Details: Delving into Unbiased Data Processing for Human Pose Estimation (CVPR 2020).

    Returns:
        torch.Tensor: heatmaps that flipped back to the original image
    )gaussian-heatmapcombined-targetz9target_type should be gaussian-heatmap or combined-target   zCoutput_flipped should be [batch_size, num_keypoints, height, width]r   rB   r   N.)
ValueErrorndimshapereshapeclonetolistflip)output_flipped
flip_pairstarget_type
batch_sizenum_keypointsheightwidthchannelsoutput_flipped_backleftrights              r#   	flip_backrW   P   s8   " AATUUa^__/=/C/C,JvuH''(6q!$Q$|(D'Dq!$Q$|$#++JHfeTN(..0 "((* Je,:1eS=,IAtSL)-;AtSL-IAucM*J .55z=RXZ_6`a-2226r"   c                        e Zd ZdZdef fdZd	dej                  dej                  dz  dej                  fdZ xZ	S )
VitPoseSimpleDecoderz
    Simple decoding head consisting of a ReLU activation, 4x upsampling and a 3x3 convolution, turning the
    feature maps into heatmaps.
    r&   c                    t         |           t        j                         | _        t        j
                  |j                  dd      | _        t        j                  |j                  j                  |j                  ddd      | _        y )NbilinearF)scale_factormodealign_cornersr   r   kernel_sizestridepadding)super__init__r   ReLU
activationUpsampler\   
upsamplingr0   backbone_confighidden_size
num_labelsconvr9   r&   	__class__s     r#   rd   zVitPoseSimpleDecoder.__init__~   se    '')++63F3FZglmII""..0A0AqYZde
	r"   Nhidden_staterM   returnc                     | j                  |      }| j                  |      }| j                  |      }|t        ||      }|S N)rf   rh   rl   rW   r9   ro   rM   r   s       r#   forwardzVitPoseSimpleDecoder.forward   sC    |4|499\*! :6Hr"   rr   
r   r   r   r   r   rd   r   Tensorrt   __classcell__rn   s   @r#   rY   rY   x   sB    

} 
	ELL 	ellT>Q 	]b]i]i 	r"   rY   c                   h     e Zd ZdZdef fdZddej                  dej                  dz  fdZ xZ	S )	VitPoseClassicDecoderz
    Classic decoding head consisting of a 2 deconvolutional blocks, followed by a 1x1 convolution layer,
    turning the feature maps into heatmaps.
    r&   c                    t         |           t        j                  |j                  j
                  ddddd      | _        t        j                  d      | _        t        j                         | _
        t        j                  dddddd      | _        t        j                  d      | _        t        j                         | _        t        j                  d|j                  ddd      | _        y )	N   rC      r   F)r`   ra   rb   r5   r   r_   )rc   rd   r   ConvTranspose2dri   rj   deconv1BatchNorm2d
batchnorm1re   relu1deconv2
batchnorm2relu2r0   rk   rl   rm   s     r#   rd   zVitPoseClassicDecoder.__init__   s    ))""..1VW^c
 ..-WWY
))#s!UV]bc..-WWY
IIc6#4#4!AWXY	r"   Nro   rM   c                    | j                  |      }| j                  |      }| j                  |      }| j                  |      }| j	                  |      }| j                  |      }| j                  |      }|t        ||      }|S rr   )r   r   r   r   r   r   rl   rW   rs   s       r#   rt   zVitPoseClassicDecoder.forward   s{    ||L1|4zz,/||L1|4zz,/99\*! :6Hr"   rr   ru   rx   s   @r#   rz   rz      s6    
Z} ZELL ellT>Q r"   rz   z?
    The VitPose model with a pose estimation head on top.
    c                        e Zd Zdef fdZee	 	 	 ddej                  dej                  dz  dej                  dz  dej                  dz  de	e
   d	efd
              Z xZS )VitPoseForPoseEstimationr&   c                    t         |   |       t        |      | _        t	        | j                  j
                  d      st        d      t	        | j                  j
                  d      st        d      t	        | j                  j
                  d      st        d      |j                  rt        |      n
t        |      | _
        | j                          y )Nrj   z0The backbone should have a hidden_size attribute
image_sizez0The backbone should have an image_size attribute
patch_sizez/The backbone should have a patch_size attribute)rc   rd   r   backbonehasattrr&   rE   use_simple_decoderrY   rz   head	post_initrm   s     r#   rd   z!VitPoseForPoseEstimation.__init__   s     %f- t}}++];OPPt}}++\:OPPt}}++\:NOO4:4M4M(0ShioSp	 	r"   Nr(   dataset_indexrM   labelskwargsrp   c                    d}|t        d       | j                  j                  |fd|i|}|j                  d   }|j                  d   }	| j
                  j                  j                  d   | j
                  j                  j                  d   z  }
| j
                  j                  j                  d   | j
                  j                  j                  d   z  }|j                  ddd      }|j                  |	d|
|      j                         }| j                  ||      }t        |||j                  |j                  	      S )
a  
        dataset_index (`torch.Tensor` of shape `(batch_size,)`):
            Index to use in the Mixture-of-Experts (MoE) blocks of the backbone.

            This corresponds to the dataset index used during training, e.g. For the single dataset index 0 refers to the corresponding dataset. For the multiple datasets index 0 refers to dataset A (e.g. MPII) and index 1 refers to dataset B (e.g. CrowdPose).
        flip_pairs (`torch.tensor`, *optional*):
            Whether to mirror pairs of keypoints (for example, left ear -- right ear).

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, VitPoseForPoseEstimation
        >>> import torch
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO

        >>> processor = AutoImageProcessor.from_pretrained("usyd-community/vitpose-base-simple")
        >>> model = VitPoseForPoseEstimation.from_pretrained("usyd-community/vitpose-base-simple")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))
        >>> boxes = [[[412.8, 157.61, 53.05, 138.01], [384.43, 172.21, 15.12, 35.74]]]
        >>> inputs = processor(image, boxes=boxes, return_tensors="pt")

        >>> with torch.no_grad():
        ...     outputs = model(**inputs)
        >>> heatmaps = outputs.heatmaps
        ```NzTraining is not yet supportedr   rD   r   r   r}   )rM   )r   r   r   r   )NotImplementedErrorr   forward_with_filtered_kwargsfeature_mapsrG   r&   ri   r   r   permuterH   
contiguousr   r   r   r   )r9   r(   r   rM   r   r   r   outputssequence_outputrO   patch_heightpatch_widthr   s                r#   rt   z VitPoseForPoseEstimation.forward   s>   R %&EFF"L$--"L"L#
'#
 #
 "..r2$**1-
{{22==a@DKKD_D_DjDjklDmmkk11<<Q?4;;C^C^CiCijkCll)11!Q:)11*b,P[\ggi99_9D%!//))	
 	
r"   )NNN)r   r   r   r   rd   r   r   r   rv   r
   r   r   rt   rw   rx   s   @r#   r   r      s    } $  .2*.&*@
ll@
 ||d*@
 LL4'	@

 t#@
 +,@
 
 @
  @
r"   r   )rA   )$r   dataclassesr   r   r    r   r1   backbone_utilsr   modeling_outputsr   modeling_utilsr	   processing_utilsr
   utilsr   r   r   r   utils.genericr   configuration_vitposer   
get_loggerr   loggerr   r%   rW   ModulerY   rz   r   __all__r!   r"   r#   <module>r      s     !   & + . - & M M - 0 
		H	%
 
<[ < <$ &_ & &&%P299 6#BII #L 
U
5 U

U
p $%?
@r"   