
    qi                        d Z ddlZddlmZ ddlmZ ddlZddlmZ ddlm	Z	 ddl
mZ dd	lmZ dd
lmZ ddlmZ ddlmZmZmZ ddlmZmZ ddlmZ ddlmZmZmZm Z m!Z! ddl"m#Z#m$Z$ ddl%m&Z& ddl'm(Z(  e jR                  e*      Z+e ed       G d de                    Z,e ed       G d de                    Z- G d dej\                        Z/ G d dej\                        Z0 G d d ej\                        Z1	 	 dWd!ej\                  d"ejd                  d#ejd                  d$ejd                  d%ejd                  dz  d&e3dz  d'e3d(ee   fd)Z4 G d* d+ej\                        Z5 G d, d-ej\                        Z6 G d. d/ej\                        Z7 G d0 d1ej\                        Z8 G d2 d3ej\                        Z9 G d4 d5e      Z: G d6 d7ej\                        Z; G d8 d9ej\                        Z<d: Z= G d; d<ej\                        Z> G d= d>ej\                        Z? G d? d@ej\                        Z@ G dA dBej\                        ZAe G dC dDe             ZBe G dE dFeB             ZC G dG dHej\                        ZD G dI dJej\                        ZE G dK dLej\                        ZF edM       G dN dOeB             ZG G dP dQej\                        ZH G dR dSej\                        ZIe G dT dUeB             ZJg dVZKy)XzPyTorch DPT (Dense Prediction Transformers) model.

This implementation is heavily inspired by OpenMMLab's implementation, found here:
https://github.com/open-mmlab/mmsegmentation/blob/master/mmseg/models/decode_heads/dpt_head.py.

    N)Callable)	dataclass)nn)CrossEntropyLoss   )initialization)ACT2FN)load_backbone)GradientCheckpointingLayer)BaseModelOutputDepthEstimatorOutputSemanticSegmenterOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)ModelOutputTransformersKwargsauto_docstringlogging	torch_int)can_return_tuplemerge_with_config_defaults)capture_outputs   )	DPTConfigz
    Base class for model's outputs that also contains intermediate activations that can be used at later stages. Useful
    in the context of Vision models.:
    )custom_introc                   l    e Zd ZU dZdZej                  dz  ed<   dZe	ej                  df   dz  ed<   y)*BaseModelOutputWithIntermediateActivationsak  
    last_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        Sequence of hidden-states at the output of the last layer of the model.
    intermediate_activations (`tuple(torch.FloatTensor)`, *optional*):
        Intermediate activations that can be used to compute hidden states of the model at various layers.
    Nlast_hidden_states.intermediate_activations)
__name__
__module____qualname____doc__r   torchFloatTensor__annotations__r    tuple     V/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/dpt/modeling_dpt.pyr   r   -   s?     48))D07EIeE$5$5s$:;dBIr*   r   z
    Base class for model's outputs that also contains a pooling of the last hidden states as well as intermediate
    activations that can be used by the model at later stages.
    c                       e Zd ZU dZdZej                  dz  ed<   dZej                  dz  ed<   dZ	e
ej                  df   dz  ed<   dZe
ej                  df   dz  ed<   dZe
ej                  df   dz  ed<   y)	4BaseModelOutputWithPoolingAndIntermediateActivationsa  
    pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
        Last layer hidden-state of the first token of the sequence (classification token) after further processing
        through the layers used for the auxiliary pretraining task. E.g. for BERT-family of models, this returns
        the classification token after processing through a linear layer and a tanh activation function. The linear
        layer weights are trained from the next sentence prediction (classification) objective during pretraining.
    intermediate_activations (`tuple(torch.FloatTensor)`, *optional*):
        Intermediate activations that can be used to compute hidden states of the model at various layers.
    Nlast_hidden_statepooler_output.hidden_states
attentionsr    )r!   r"   r#   r$   r.   r%   r&   r'   r/   r0   r(   r1   r    r)   r*   r+   r-   r-   @   s     37u((4/6.2M5$$t+2:>M5**C/047>7;Je'',-4;EIeE$5$5s$:;dBIr*   r-   c                   r     e Zd ZdZddedeeef   dz  f fdZddZ	 dde	j                  ded	efd
Z xZS )DPTViTHybridEmbeddingsz
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
    Nconfigfeature_sizec                 b   t         
|           |j                  |j                  }}|j                  |j
                  }}t        |t        j                  j                        r|n||f}t        |t        j                  j                        r|n||f}|d   |d   z  |d   |d   z  z  }t        |      | _        | j                  j                  d   }t        | j                  j                        dk7  r+t        dt        | j                  j                               ddg| _        ||j                   }	|	dd  }|	d   }nCt        |t        j                  j                        r|n||f}| j                  j                  d   }|| _        |d   | _        || _        t#        j$                  ||d      | _        t#        j(                  t+        j,                  dd|j
                              | _        t#        j(                  t+        j,                  d|dz   |j
                              | _        y )Nr   r   r   z1Expected backbone to have 3 output features, got kernel_size)super__init__
image_size
patch_sizenum_channelshidden_size
isinstancecollectionsabcIterabler
   backbonechannelslen
ValueErrorresidual_feature_map_indexbackbone_featmap_shaper   Conv2d
projection	Parameterr%   zeros	cls_tokenposition_embeddings)selfr4   r5   r=   r>   r?   r@   num_patchesfeature_dimfeat_map_shape	__class__s             r+   r<   zDPTViTHybridEmbeddings.__init__`   s   !'!2!2F4E4EJ
$*$7$79K9Kk#-j+//:R:R#SZZdfpYq
#-j+//:R:R#SZZdfpYq
!!}
15*Q-:VW=:XY%f-mm,,R0t}}%%&!+PQTUYUbUbUkUkQlPmnoo+,a&'#::N)"#.L(+K !+<9Q9Q RYegsXt  --004K$$Q-())K!Lekk!Q8J8J&KL#%<<A{QPVPbPb0c#d r*   c                 r   |d d d |f   }|d|d f   }t        t        |      dz        }|j                  d||d      j                  dddd      }t        j
                  j                  |||fd      }|j                  dddd      j                  d||z  d      }t        j                  ||gd	      }|S 
Nr         ?r   r7   r      bilinear)sizemodedim)	r   rG   reshapepermuter   
functionalinterpolater%   catrQ   posembgrid_size_heightgrid_size_widthstart_index
posemb_tokposemb_gridold_grid_sizes           r+   _resize_pos_embedz(DPTViTHybridEmbeddings._resize_pos_embed   s    A||O,
Q_-!#k"2c"9:!))!]M2NVVWXZ[]^`abmm//CSUdBelv/w!))!Q15==aAQTcAceghJ4!<r*   pixel_valuesinterpolate_pos_encodingreturnc                    |j                   \  }}}}|| j                  k7  rt        d      |sV|| j                  d   k7  s|| j                  d   k7  r2t        d| d| d| j                  d    d| j                  d    d	      | j	                  | j
                  || j                  z  || j                  z        }| j                  |      }|j                  d   }	| j                  D 
cg c]  }
|j                  |
    }}
| j                  |	      j                  d	      j                  dd	      }| j                  j                  |dd      }t        j                   ||fd
      }||z   }t#        ||      S c c}
w )NeMake sure that the channel dimension of the pixel values match with the one set in the configuration.r   r   zInput image size (*z) doesn't match model (z).r7   rY   r]   )r   r    )shaper?   rH   r=   rl   rP   r>   rE   feature_mapsrI   rL   flatten	transposerO   expandr%   rc   r   )rQ   rm   rn   
batch_sizer?   heightwidthrP   backbone_outputfeaturesindexoutput_hidden_states
embeddings
cls_tokenss                 r+   forwardzDPTViTHybridEmbeddings.forward   s    3?2D2D/
L&%4,,,w  (++u8J/J (% 9+,Adooa.@-AE 
 #44$$f&?$//AY
 --5"//3 RVQpQpq < <U Cqq__X.66q9CCAqI
^^**:r2>
YY
J7Q?
  "55
 :)%9
 	
  rs   )E9Nr   F)r!   r"   r#   r$   r   r(   intr<   rl   r%   Tensorboolr   r   __classcell__rU   s   @r+   r3   r3   Y   sY     ey  ec3h$8N  eD LQ&
!LL&
DH&
	3&
r*   r3   c                   N     e Zd ZdZ fdZddZdej                  defdZ	 xZ
S )DPTViTEmbeddingszB
    Construct the CLS token, position and patch embeddings.

    c                    t         |           t        j                  t	        j
                  dd|j                              | _        t        |      | _	        | j                  j                  }t        j                  t	        j
                  d|dz   |j                              | _        t        j                  |j                        | _        || _        y )Nr   )r;   r<   r   rM   r%   rN   r@   rO   DPTViTPatchEmbeddingspatch_embeddingsrR   rP   Dropouthidden_dropout_probdropoutr4   )rQ   r4   rR   rU   s      r+   r<   zDPTViTEmbeddings.__init__   s    ekk!Q8J8J&KL 5f =++77#%<<A{QPVPbPb0c#d zz&"<"<=r*   c                 ~   |d d d |f   }|d|d f   }t        |j                  d      dz        }|j                  d||d      j                  dddd      }t        j
                  j                  |||fd      }|j                  dddd      j                  d||z  d      }t        j                  ||gd	      }|S rW   )	r   r[   r_   r`   r   ra   rb   r%   rc   rd   s           r+   rl   z"DPTViTEmbeddings._resize_pos_embed   s    A||O,
Q_-!+"2"21"5"<=!))!]M2NVVWXZ[]^`abmm//CSUdBelv/w!))!Q15==aAQTcAceghJ4!<r*   rm   ro   c                    |j                   \  }}}}| j                  j                  }| j                  | j                  ||z  ||z        }| j                  |      }|j                         \  }}	}
| j                  j                  |dd      }t        j                  ||fd      }||z   }| j                  |      }t        |      S )Nr7   r   r]   )r   )rs   r4   r>   rl   rP   r   r[   rO   rw   r%   rc   r   r   )rQ   rm   rx   r?   ry   rz   r>   rP   r   seq_len_r   s               r+   r   zDPTViTEmbeddings.forward   s    2>2D2D/
L&% [[++
"44$$f
&:EZ<O
 **<8
!+!2
GQ ^^**:r2>
YY
J7Q?
  "55
\\*-
9ZXXr*   r   )r!   r"   r#   r$   r<   rl   r%   r   r   r   r   r   s   @r+   r   r      s-    
YELL Y5_ Yr*   r   c                   `     e Zd ZdZdef fdZdej                  dej                  fdZ xZ	S )r   z$
    Image to Patch Embedding.

    r4   c                    t         |           |j                  |j                  }}|j                  |j
                  }}t        |t        j                  j                        r|n||f}t        |t        j                  j                        r|n||f}|d   |d   z  |d   |d   z  z  }|| _        || _        || _        || _
        t        j                  ||||      | _        y )Nr   r   )r:   stride)r;   r<   r=   r>   r?   r@   rA   rB   rC   rD   rR   r   rK   rL   )rQ   r4   r=   r>   r?   r@   rR   rU   s          r+   r<   zDPTViTPatchEmbeddings.__init__   s    !'!2!2F4E4EJ
$*$7$79K9Kk#-j+//:R:R#SZZdfpYq
#-j+//:R:R#SZZdfpYq
!!}
15*Q-:VW=:XY$$(&))L+:^hir*   rm   ro   c                     |j                   \  }}}}|| j                  k7  rt        d      | j                  |      j	                  d      j                  dd      }|S )Nrq   rY   r   )rs   r?   rH   rL   ru   rv   )rQ   rm   rx   r?   ry   rz   r   s          r+   r   zDPTViTPatchEmbeddings.forward  sb    2>2D2D/
L&%4,,,w  __\2::1=GG1M
r*   
r!   r"   r#   r$   r   r<   r%   r   r   r   r   s   @r+   r   r      s1    
jy jELL U\\ r*   r   modulequerykeyvalueattention_maskscalingr   kwargsc                    ||j                  d      dz  }t        j                  ||j                  dd            |z  }|||z   }t        j
                  j                  |d      }t        j
                  j                  ||| j                        }t        j                  ||      }	|	j                  dd      j                         }	|	|fS )Nr7         rY   r   r]   )ptrainingr   )
r[   r%   matmulrv   r   ra   softmaxr   r   
contiguous)
r   r   r   r   r   r   r   r   attn_weightsattn_outputs
             r+   eager_attention_forwardr     s     **R.D( <<s}}Q':;gEL!#n4==((2(>L==((6??([L,,|U3K''1-88:K$$r*   c                   z     e Zd Zdef fdZdej                  deej                  ej                  f   fdZ xZ	S )DPTSelfAttentionr4   c                 2   t         |           |j                  |j                  z  dk7  r2t	        |d      s&t        d|j                   d|j                   d      || _        |j                  | _        t        |j                  |j                  z        | _        | j                  | j                  z  | _	        |j                  | _        | j                  dz  | _        d| _        t        j                  |j                  | j                  |j                         | _        t        j                  |j                  | j                  |j                         | _        t        j                  |j                  | j                  |j                         | _        y )	Nr   embedding_sizezThe hidden size z4 is not a multiple of the number of attention heads .r   F)bias)r;   r<   r@   num_attention_headshasattrrH   r4   r   attention_head_sizeall_head_sizeattention_probs_dropout_probdropout_probr   	is_causalr   Linearqkv_biasr   r   r   rQ   r4   rU   s     r+   r<   zDPTSelfAttention.__init__.  sF    : ::a?PVXhHi"6#5#5"6 7334A7 
 #)#=#= #&v'9'9F<V<V'V#W !558P8PP"??//5YYv1143E3EFOO\
99V//1C1C&//ZYYv1143E3EFOO\
r*   r0   ro   c           
         |j                   d   }|d| j                  | j                  f} | j                  |      j                  | j                  dd      } | j                  |      j                  | j                  dd      } | j                  |      j                  | j                  dd      }t        j                  | j                  j                  t              } || |||d | j                  | j                  | j                  sdn| j                         \  }}	|j#                         d d | j$                  fz   }
|j'                  |
      }||	fS )Nr   r7   r   rY           )r   r   r   r8   )rs   r   r   r   viewrv   r   r   r   get_interfacer4   _attn_implementationr   r   r   r   r   r[   r   r_   )rQ   r0   rx   	new_shape	key_layervalue_layerquery_layerattention_interfacecontext_layerattention_probsnew_context_layer_shapes              r+   r   zDPTSelfAttention.forwardB  sF   "((+
D$<$<d>V>VV	0DHH]+00)<FFq!L	4djj/44i@JJ1aP4djj/44i@JJ1aP(?(M(MKK,,.E)
 *=nnLL#}}C$2C2C	*
& #0"4"4"6s";t?Q?Q>S"S%--.EFo--r*   )
r!   r"   r#   r   r<   r%   r   r(   r   r   r   s   @r+   r   r   -  s:    ]y ](.U\\ .eELL%,,<V6W .r*   r   c                   x     e Zd ZdZdef fdZdej                  dej                  dej                  fdZ xZ	S )DPTViTSelfOutputz
    The residual connection is defined in ViTLayer instead of here (as is the case with other models), due to the
    layernorm applied before each block.
    r4   c                     t         |           t        j                  |j                  |j                        | _        t        j                  |j                        | _        y r   )	r;   r<   r   r   r@   denser   r   r   r   s     r+   r<   zDPTViTSelfOutput.__init__f  sB    YYv1163E3EF
zz&"<"<=r*   r0   input_tensorro   c                 J    | j                  |      }| j                  |      }|S r   r   r   rQ   r0   r   s      r+   r   zDPTViTSelfOutput.forwardk  s$    

=1]3r*   r   r   s   @r+   r   r   `  s=    
>y >
U\\  RWR^R^ r*   r   c                   \     e Zd Zdef fdZdej                  dej                  fdZ xZS )DPTViTAttentionr4   c                 b    t         |           t        |      | _        t	        |      | _        y r   )r;   r<   r   	attentionr   outputr   s     r+   r<   zDPTViTAttention.__init__s  s&    )&1&v.r*   r0   ro   c                 R    | j                  |      \  }}| j                  ||      }|S r   )r   r   )rQ   r0   self_attn_outputr   r   s        r+   r   zDPTViTAttention.forwardx  s,    "nn];!-}=r*   	r!   r"   r#   r   r<   r%   r   r   r   r   s   @r+   r   r   r  s*    /y /
U\\ ell r*   r   c                   \     e Zd Zdef fdZdej                  dej                  fdZ xZS )DPTViTIntermediater4   c                    t         |           t        j                  |j                  |j
                        | _        t        |j                  t              rt        |j                     | _        y |j                  | _        y r   )r;   r<   r   r   r@   intermediate_sizer   rA   
hidden_actstrr	   intermediate_act_fnr   s     r+   r<   zDPTViTIntermediate.__init__  s]    YYv1163K3KL
f''-'-f.?.?'@D$'-'8'8D$r*   r0   ro   c                 J    | j                  |      }| j                  |      }|S r   )r   r   )rQ   r0   s     r+   r   zDPTViTIntermediate.forward  s&    

=100?r*   r   r   s   @r+   r   r     s*    9y 9U\\ ell r*   r   c                   t     e Zd Zdef fdZdej                  dej                  dej                  fdZ xZS )DPTViTOutputr4   c                     t         |           t        j                  |j                  |j
                        | _        t        j                  |j                        | _	        y r   )
r;   r<   r   r   r   r@   r   r   r   r   r   s     r+   r<   zDPTViTOutput.__init__  sB    YYv779K9KL
zz&"<"<=r*   r0   r   ro   c                 T    | j                  |      }| j                  |      }||z   }|S r   r   r   s      r+   r   zDPTViTOutput.forward  s.    

=1]3%4r*   r   r   s   @r+   r   r     s8    >y >
U\\  RWR^R^ r*   r   c                   `     e Zd ZdZdef fdZdej                  dej                  fdZ xZ	S )DPTViTLayerz?This corresponds to the Block class in the timm implementation.r4   c                 r   t         |           |j                  | _        d| _        t	        |      | _        t        |      | _        t        |      | _	        t        j                  |j                  |j                        | _        t        j                  |j                  |j                        | _        y )Nr   eps)r;   r<   chunk_size_feed_forwardseq_len_dimr   r   r   intermediater   r   r   	LayerNormr@   layer_norm_epslayernorm_beforelayernorm_afterr   s     r+   r<   zDPTViTLayer.__init__  s    '-'E'E$(0.v6"6* "V-?-?VEZEZ [!||F,>,>FDYDYZr*   r0   ro   c                     | j                  |      }| j                  |      }||z   }| j                  |      }| j                  |      }| j	                  ||      }|S r   )r   r   r   r   r   )rQ   r0   hidden_states_normattention_outputlayer_outputs        r+   r   zDPTViTLayer.forward  si    !22=A>>*<= )=8 ++M:((6 {{<?r*   r   r   s   @r+   r   r     s/    I[y [U\\ ell r*   r   c                   N     e Zd Zdef fdZddej                  dedefdZ	 xZ
S )DPTViTEncoderr4   c                     t         |           || _        t        j                  t        |j                        D cg c]  }t        |       c}      | _        d| _	        y c c}w )NF)
r;   r<   r4   r   
ModuleListrangenum_hidden_layersr   layergradient_checkpointingrQ   r4   r   rU   s      r+   r<   zDPTViTEncoder.__init__  sN    ]]vG_G_A`#aAK$7#ab
&+# $bs   A#r0   r~   ro   c                     |r|gnd }t        | j                        D ]!  \  }} ||      }|s|j                  |       # t        ||rt	        |            S d       S )N)r.   r0   )	enumerater   appendr   r(   )rQ   r0   r~   all_hidden_statesilayer_modules         r+   r   zDPTViTEncoder.forward  so    /C]O(4 	8OA|(7M !((7	8
 +6G% 12
 	
MQ
 	
r*   r   )r!   r"   r#   r   r<   r%   r   r   r   r   r   r   s   @r+   r   r     s.    ,y ,

U\\ 

 

Zi 

r*   r   c                   t     e Zd ZdZ fdZd Zd Zddeej                     deej                     fdZ
 xZS )	DPTReassembleStagea@  
    This class reassembles the hidden states of the backbone into image-like feature representations at various
    resolutions.

    This happens in 3 stages:
    1. Map the N + 1 tokens to a set of N tokens, by taking into account the readout ([CLS]) token according to
       `config.readout_type`.
    2. Project the channel dimension of the hidden states according to `config.neck_hidden_sizes`.
    3. Resizing the spatial dimensions (height, width).

    Args:
        config (`[DPTConfig]`):
            Model configuration class defining the model architecture.
    c                     t         |           || _        t        j                         | _        |j                  r| j                  |       n| j                  |       |j                  | _	        y r   )
r;   r<   r4   r   r   layers	is_hybrid_init_reassemble_dpt_hybrid_init_reassemble_dptneck_ignore_stagesr   s     r+   r<   zDPTReassembleStage.__init__  sU    mmo,,V4%%f-"(";";r*   c           	      v   t        t        t        |j                              |j                        D ]r  \  }}|dk  r.| j
                  j                  t        j                                9|dkD  s?| j
                  j                  t        ||j                  |   |             t |j                  dk7  rt        d|j                   d      t        j                         | _        t        |      }t        t        |j                              D ]  }|dk  rA| j                  j                  t        j                  t        j                                      I|dkD  sO| j                  j                  t        j                  t        j                   d|z  |      t"        |j$                                   y)a   "
        For DPT-Hybrid the first 2 reassemble layers are set to `nn.Identity()`, please check the official
        implementation: https://github.com/isl-org/DPT/blob/f43ef9e08d70a752195028a51be5e1aff227b913/dpt/vit.py#L438
        for more details.
        r   rF   factorprojectzReadout type z! is not supported for DPT-Hybrid.rY   N)zipr   rG   neck_hidden_sizesreassemble_factorsr  r   r   IdentityDPTReassembleLayerreadout_typerH   r   readout_projects_get_backbone_hidden_size
Sequentialr   r	   r   )rQ   r4   r  r  r@   s        r+   r	  z.DPTReassembleStage._init_reassemble_dpt_hybrid  sX    U3v'?'?#@A6C\C\] 	tIAvAv""2;;=1Q""#5fvG_G_`aGbkq#rs		t )+}V-@-@,AAbcdd !#/7s63345 	AAv%%,,R]]2;;=-IJQ%%,,MM"))AO["I6RXRcRcKde		r*   c           	      <   t        t        t        |j                              |j                        D ]9  \  }}| j
                  j                  t        ||j                  |   |             ; |j                  dk(  rt        j                         | _        t        |      }t        t        |j                              D ]Y  }| j                  j                  t        j                  t        j                  d|z  |      t        |j                                   [ y y )Nr  r  rY   )r  r   rG   r  r  r  r   r  r  r   r   r  r  r  r   r	   r   )rQ   r4   r  r  r@   r   s         r+   r
  z'DPTReassembleStage._init_reassemble_dpt  s    U3v'?'?#@A6C\C\] 	pIAvKK1&6C[C[\]C^gmno	p )+$&MMOD!3F;K3v7789 %%,,MM"))AO["I6RXRcRcKde ,r*   r0   ro   c                    g }t        |      D ]  \  }}|| j                  vr|dddf   |ddddf   }}|j                  \  }}	}
|||j                  ||||
      }n"t	        |	dz        }|j                  ||||
      }|j                  dddd      j                         }|j                  }| j                  j                  dk(  r|j                  d      j                  d      }|j                  d      j                  |      } | j                  |   t        j                  ||fd	            }|j                  ddd      j                  |      }nM| j                  j                  d
k(  r4|j                  d      |j                  d	      z   }|j                  |      } | j                  |   |      }|j!                  |        |S )z
        Args:
            hidden_states (`list[torch.FloatTensor]`, each of shape `(batch_size, sequence_length + 1, hidden_size)`):
                List of hidden states from the backbone.
        Nr   r   rX   r   rY   r  )r   rY   r   r7   add)r   r  rs   r_   r   r`   r   r4   r  ru   	unsqueeze	expand_asr  r%   rc   r  r   )rQ   r0   patch_heightpatch_widthoutr  hidden_staterO   rx   sequence_lengthr?   r[   feature_shapereadouts                 r+   r   zDPTReassembleStage.forward  s    (7 	%OA|///*6q!t*<l1ab5>Q<	<H<N<N9
O\+0G#/#7#7
LR]_k#lL$_c%9:D#/#7#7
D$P\#]L+33Aq!Q?JJL , 2 2;;++y8#/#7#7#:#B#B9#ML'11!4>>|LG#;4#8#8#;EII|U\F]_a<b#cL#/#7#71a#@#H#H#WL[[--6#/#7#7#:Y=P=PQS=T#TL#/#7#7#FL-t{{1~l;JJ|$3	%6 
r*   NN)r!   r"   r#   r$   r<   r	  r
  listr%   r   r   r   r   s   @r+   r  r    s@    
<4
#T%,,%7 #aefkfrfras #r*   r  c                     | j                   ,t        | j                   d      r| j                   j                  S | j                  S )Nr@   )backbone_configr   r@   )r4   s    r+   r  r  8  s;    )gf6L6Lm.\%%111!!!r*   c                   2     e Zd Zdededef fdZd Z xZS )r  r4   rF   r  c           	      \   t         |           t        |      }t        j                  ||d      | _        |dkD  r t        j                  ||||d      | _        y |dk(  rt        j                         | _        y |dk  r,t        j                  ||dt        d|z        d      | _        y y )Nr   )in_channelsout_channelsr:   r   r:   r   paddingr   )
r;   r<   r  r   rK   rL   ConvTranspose2dresizer  r   )rQ   r4   rF   r  r@   rU   s        r+   r<   zDPTReassembleLayer.__init__@  s    /7))(`ab A:,,XxV\blmnDKq[++-DKaZ))HhAcRSV\R\oghiDK r*   c                 J    | j                  |      }| j                  |      }|S r   )rL   r0  )rQ   r!  s     r+   r   zDPTReassembleLayer.forwardO  s$    |4{{<0r*   )r!   r"   r#   r   r   r<   r   r   r   s   @r+   r  r  ?  s&    jy jC j jr*   r  c                   *     e Zd Zdef fdZd Z xZS )DPTFeatureFusionStager4   c                     t         |           t        j                         | _        t        t        |j                              D ]&  }| j                  j                  t        |             ( y r   )
r;   r<   r   r   r  r   rG   r  r   DPTFeatureFusionLayerr   s      r+   r<   zDPTFeatureFusionStage.__init__V  sR    mmos63345 	>AKK4V<=	>r*   c                     |d d d   }g }d }t        || j                        D ]*  \  }}|	 ||      }n	 |||      }|j                  |       , |S )Nr7   )r  r  r   )rQ   r0   fused_hidden_statesfused_hidden_stater!  r   s         r+   r   zDPTFeatureFusionStage.forward\  sq    %dd+ !#&}dkk#B 	;L%!)%*<%8"%*+=|%L"&&'9:	; #"r*   )r!   r"   r#   r   r<   r   r   r   s   @r+   r3  r3  U  s    >y >#r*   r3  c                   `     e Zd ZdZdef fdZdej                  dej                  fdZ xZ	S )DPTPreActResidualLayerz
    ResidualConvUnit, pre-activate residual unit.

    Args:
        config (`[DPTConfig]`):
            Model configuration class defining the model architecture.
    r4   c                 l   t         |           |j                  | _        |j                  |j                  n| j                   }t        j                         | _        t        j                  |j                  |j                  ddd|      | _
        t        j                         | _        t        j                  |j                  |j                  ddd|      | _        | j                  rIt        j                  |j                        | _        t        j                  |j                        | _        y y )Nr   r   )r:   r   r.  r   )r;   r<   !use_batch_norm_in_fusion_residualuse_batch_normuse_bias_in_fusion_residualr   ReLUactivation1rK   fusion_hidden_sizeconvolution1activation2convolution2BatchNorm2dbatch_norm1batch_norm2)rQ   r4   r>  rU   s      r+   r<   zDPTPreActResidualLayer.__init__v  s   $FF 11= ..((( 	$ 779II%%%%,
 779II%%%%,
 !~~f.G.GHD!~~f.G.GHD r*   r!  ro   c                    |}| j                  |      }| j                  |      }| j                  r| j                  |      }| j	                  |      }| j                  |      }| j                  r| j                  |      }||z   S r   )r@  rB  r=  rF  rC  rD  rG  rQ   r!  residuals      r+   r   zDPTPreActResidualLayer.forward  s    ''5((6++L9L''5((6++L9Lh&&r*   r   r   s   @r+   r:  r:  m  s2     Iy  ID'ELL 'U\\ 'r*   r:  c                        e Zd ZdZd
dedef fdZddej                  dej                  dz  dej                  fd	Z	 xZ
S )r5  a3  Feature fusion layer, merges feature maps from different stages.

    Args:
        config (`[DPTConfig]`):
            Model configuration class defining the model architecture.
        align_corners (`bool`, *optional*, defaults to `True`):
            The align_corner setting for bilinear upsample.
    r4   align_cornersc                     t         |           || _        t        j                  |j
                  |j
                  dd      | _        t        |      | _        t        |      | _	        y )Nr   T)r:   r   )
r;   r<   rL  r   rK   rA  rL   r:  residual_layer1residual_layer2)rQ   r4   rL  rU   s      r+   r<   zDPTFeatureFusionLayer.__init__  sT    *))F$=$=v?X?Xfgnrs5f=5f=r*   Nr!  rJ  ro   c                    |l|j                   |j                   k7  r?t        j                  j                  ||j                   d   |j                   d   fdd      }|| j	                  |      z   }| j                  |      }t        j                  j                  |dd| j                        }| j                  |      }|S )NrY   r   rZ   Fr[   r\   rL  scale_factorr\   rL  )rs   r   ra   rb   rN  rO  rL  rL   rI  s      r+   r   zDPTFeatureFusionLayer.forward  s    !!X^^3==44L$6$6q$9<;M;Ma;P#QXbrw 5  ($*>*>x*HHL++L9}}00qzI[I[ 1 
 |4r*   Tr   )r!   r"   r#   r$   r   r   r<   r%   r   r   r   r   s   @r+   r5  r5    sI    >y > >ELL ELL4<O [`[g[g r*   r5  c                   z     e Zd ZU eed<   dZdZdZdZdZ	dZ
dZdZdeiZ ej                           fd       Z xZS )DPTPreTrainedModelr4   dptrm   )imageTr1   c                     t         |   |       t        |t        t        f      r?t        j                  |j                         t        j                  |j                         yy)zInitialize the weightsN)	r;   _init_weightsrA   r   r3   initzeros_rO   rP   )rQ   r   rU   s     r+   rZ  z DPTPreTrainedModel._init_weights  sM     	f%f/1GHIKK(()KK223 Jr*   )r!   r"   r#   r   r'   base_model_prefixmain_input_nameinput_modalitiessupports_gradient_checkpointing_supports_sdpa_supports_flash_attn_supports_flex_attn_supports_attention_backendr   _can_record_outputsr%   no_gradrZ  r   r   s   @r+   rV  rV    sa    $O!&*#N"&& U]]_4 4r*   rV  c            
            e Zd Zddedef fdZd Ze ed      e		 dde
j                  d	edz  d
efd                     Z xZS )DPTModelr4   add_pooling_layerc                 T   t         |   |       || _        |j                  rt	        |      | _        nt        |      | _        t        |      | _        t        j                  |j                  |j                        | _        |rt        |      nd| _        | j!                          y)zv
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        r   N)r;   r<   r4   r  r3   r   r   r   encoderr   r   r@   r   	layernormDPTViTPoolerpooler	post_init)rQ   r4   ri  rU   s      r+   r<   zDPTModel.__init__  s    
 	  4V<DO.v6DO$V,f&8&8f>S>ST.?l6*T 	r*   c                 r    | j                   j                  r| j                  S | j                  j                  S r   )r4   r  r   r   )rQ   s    r+   get_input_embeddingszDPTModel.get_input_embeddings  s)    ;;  ??"??333r*   F)tie_last_hidden_statesNrm   r~   ro   c                 P   || j                   j                  }| j                  |      }|j                  }| j	                  ||      }|j
                  }| j                  |      }| j                  | j                  |      nd }t        |||j                  |j                        S )N)r~   )r.   r/   r    r0   )r4   r~   r   r   rk  r.   rl  rn  r-   r    r0   )	rQ   rm   r~   r   embedding_outputembedding_last_hidden_statesencoder_outputssequence_outputpooled_outputs	            r+   r   zDPTModel.forward  s      '#';;#C#C GKWcGd'7'J'J$+/<<(?S ,8 ,
 *;;..98<8OO4UYC-'%5%N%N)77	
 	
r*   rT  r   )r!   r"   r#   r   r   r<   rq  r   r   r   r%   r&   r-   r   r   r   s   @r+   rh  rh    so    y T *4  E2 -1
''
 #Tk

 
>
  3  
r*   rh  c                   \     e Zd Zdef fdZdej                  dej                  fdZ xZS )rm  r4   c                     t         |           t        j                  |j                  |j
                        | _        t        |j                     | _	        y r   )
r;   r<   r   r   r@   pooler_output_sizer   r	   
pooler_act
activationr   s     r+   r<   zDPTViTPooler.__init__%  s>    YYv1163L3LM
 !2!23r*   r0   ro   c                 \    |d d df   }| j                  |      }| j                  |      }|S )Nr   )r   r}  )rQ   r0   first_token_tensorrx  s       r+   r   zDPTViTPooler.forward*  s6     +1a40

#566r*   r   r   s   @r+   rm  rm  $  s*    4y 4
U\\ ell r*   rm  c            
            e Zd ZdZdef fdZ	 	 d
deej                     de	dz  de	dz  deej                     fd	Z
 xZS )DPTNecka;  
    DPTNeck. A neck is a module that is normally used between the backbone and the head. It takes a list of tensors as
    input and produces another list of tensors as output. For DPT, it includes 2 stages:

    * DPTReassembleStage
    * DPTFeatureFusionStage.

    Args:
        config (dict): config dict.
    r4   c           
         t         |           || _        |j                  !|j                  j                  dk(  rd | _        nt        |      | _        t        j                         | _	        |j                  D ]?  }| j                  j                  t        j                  ||j                  ddd             A t        |      | _        y )Nswinv2r   r   Fr:   r.  r   )r;   r<   r4   r(  
model_typereassemble_stager  r   r   convsr  r   rK   rA  r3  fusion_stage)rQ   r4   channelrU   s      r+   r<   zDPTNeck.__init__?  s     !!-&2H2H2S2SW_2_$(D!$6v$>D!]]_
// 	sGJJbii1J1JXYcdkpqr	s 2&9r*   Nr0   r  r  ro   c                    t        |t        t        f      st        d      t	        |      t	        | j
                  j                        k7  rt        d      | j                  | j                  |||      }t        |      D cg c]  \  }} | j                  |   |       }}}| j                  |      }|S c c}}w )z
        Args:
            hidden_states (`list[torch.FloatTensor]`, each of shape `(batch_size, sequence_length, hidden_size)` or `(batch_size, hidden_size, height, width)`):
                List of hidden states from the backbone.
        z2hidden_states should be a tuple or list of tensorszOThe number of hidden states should be equal to the number of neck hidden sizes.)rA   r(   r&  	TypeErrorrG   r4   r  rH   r  r   r  r  )rQ   r0   r  r  r  featurer|   r   s           r+   r   zDPTNeck.forwardP  s     -%7PQQ}T[[%B%B!CCnoo   , 11-{[M=F}=UVzq'MDJJqM'*VV ""8, Ws   B:r%  )r!   r"   r#   r$   r   r<   r&  r%   r   r   r   r   r   s   @r+   r  r  3  sa    	:y :( $("&	ELL) Dj 4Z	
 
ell	r*   r  c                   f     e Zd ZdZdef fdZdeej                     dej                  fdZ	 xZ
S )DPTDepthEstimationHeada	  
    Output head consisting of 3 convolutional layers. It progressively halves the feature dimension and upsamples
    the predictions to the input resolution after the first convolutional layer (details can be found in the paper's
    supplementary material).
    r4   c                    t         |           || _        d | _        |j                  rt        j                  ddddd      | _        |j                  }t        j                  t        j                  ||dz  ddd      t        j                  ddd	
      t        j                  |dz  dddd      t        j                         t        j                  ddddd      t        j                               | _        y )N   )r   r   )r   r   r-  rY   r   r   rZ   TrR      r   )r;   r<   r4   rL   add_projectionr   rK   rA  r  Upsampler?  headrQ   r4   r|   rU   s      r+   r<   zDPTDepthEstimationHead.__init__t  s       iiSfV]cdDO,,MMIIhA1QPQRKKQZtLIIh!mRQq!LGGIIIb!1a@GGI
	r*   r0   ro   c                     || j                   j                     }| j                  +| j                  |      } t        j                         |      }| j                  |      }|j                  d      }|S )Nr   r]   )r4   head_in_indexrL   r   r?  r  squeeze)rQ   r0   predicted_depths      r+   r   zDPTDepthEstimationHead.forward  sg    %dkk&?&?@??& OOM:M%BGGIm4M))M2)11a18r*   )r!   r"   r#   r$   r   r<   r&  r%   r   r   r   r   s   @r+   r  r  m  s4    
y 
&T%,,%7 ELL r*   r  zu
    DPT Model with a depth estimation head on top (consisting of 3 convolutional layers) e.g. for KITTI, NYUv2.
    c                        e Zd Z fdZee	 	 ddej                  dej                  dz  de	dz  de
fd              Z xZS )	DPTForDepthEstimationc                    t         |   |       d | _        |j                  du r|j                  t        |      | _        nt        |d      | _        t        |      | _	        t        |      | _        | j                          y NF)ri  )r;   r<   rE   r  r(  r
   rh  rW  r  neckr  r  ro  r   s     r+   r<   zDPTForDepthEstimation.__init__  sq     u$)?)?)K)&1DM%@DH FO	 +62	 	r*   Nrm   labelsr~   ro   c                     | j                   j                  }d}|t        d      d|d<    j                  *  j                  j                  |fi |}|j
                  }n  j                  |fi |}|j                  } j                   j                  s:t        |dd       D 	cg c]   \  }}	| j                   j                  v s|	" }}}	n4|j                  }
|
j                   fdt        |dd       D               |
}d\  }} j                   j                  S j                   j                  du r;|j                  \  }}}} j                   j                  j                  }||z  }||z  } j!                  |||      } j#                  |      }t%        |||r|j                  nd|j&                  	      S c c}	}w )
a  
        labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
            Ground truth depth estimation maps for computing the loss.

        Examples:
        ```python
        >>> from transformers import AutoImageProcessor, DPTForDepthEstimation
        >>> import torch
        >>> import numpy as np
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> image_processor = AutoImageProcessor.from_pretrained("Intel/dpt-large")
        >>> model = DPTForDepthEstimation.from_pretrained("Intel/dpt-large")

        >>> # prepare image for the model
        >>> inputs = image_processor(images=image, return_tensors="pt")

        >>> with torch.no_grad():
        ...     outputs = model(**inputs)

        >>> # interpolate to original size
        >>> post_processed_output = image_processor.post_process_depth_estimation(
        ...     outputs,
        ...     target_sizes=[(image.height, image.width)],
        ... )

        >>> # visualize the prediction
        >>> predicted_depth = post_processed_output[0]["predicted_depth"]
        >>> depth = predicted_depth * 255 / predicted_depth.max()
        >>> depth = depth.detach().cpu().numpy()
        >>> depth = Image.fromarray(depth.astype("uint8"))
        ```NzTraining is not implemented yetTr~   r   c              3   ^   K   | ]$  \  }}|j                   j                  d d v r| & ywrY   Nr4   backbone_out_indices.0idxr  rQ   s      r+   	<genexpr>z0DPTForDepthEstimation.forward.<locals>.<genexpr>  s6      .$Wdkk>>qrBB .s   *-r%  F)lossr  r0   r1   )r4   r~   NotImplementedErrorrE   forward_with_filtered_kwargsrt   rW  r0   r  r   r  r    extendr(  rs   r>   r  r  r   r1   )rQ   rm   r  r~   r   r  outputsr0   r  r  backbone_hidden_statesr  r  r   ry   rz   r>   r  s   `                 r+   r   zDPTForDepthEstimation.forward  s   `  '#';;#C#C %&GHH)-%&==$@dmm@@XQWXG#00Mdhh|6v6G#11M ;;((09-:K0L! ,WPSW[WbWbWwWwPwG! ! *1)I)I&&-- .(1-2C(D. 
 !7$.!k;;&&2t{{7L7LPU7U"."4"4Aq&%44??J!Z/L:-K		-{K))M2#+3G'//T))	
 	
-!s   * G G r%  )r!   r"   r#   r<   r   r   r%   r&   
LongTensorr   r   r   r   r   s   @r+   r  r    sm    $  +/,0	Z
''Z
   4'Z
 #Tk	Z
 
Z
  Z
r*   r  c                   b     e Zd Zdef fdZdeej                     dej                  fdZ xZ	S )DPTSemanticSegmentationHeadr4   c                    t         |           || _        |j                  }t	        j
                  t	        j                  ||ddd      t	        j                  |      t	        j                         t	        j                  |j                        t	        j                  ||j                  d      t	        j                  ddd	            | _        y )
Nr   r   Fr  r9   rY   rZ   TrR  )r;   r<   r4   rA  r   r  rK   rE  r?  r   semantic_classifier_dropout
num_labelsr  r  r  s      r+   r<   z$DPTSemanticSegmentationHead.__init__  s    ,,MMIIhaONN8$GGIJJv99:IIh 1 1qAKKQZtL
	r*   r0   ro   c                 Z    || j                   j                     }| j                  |      }|S r   )r4   r  r  rQ   r0   logitss      r+   r   z#DPTSemanticSegmentationHead.forward  s)    %dkk&?&?@=)r*   )
r!   r"   r#   r   r<   r&  r%   r   r   r   r   s   @r+   r  r    s/    
y 
T%,,%7 ELL r*   r  c                   \     e Zd Zdef fdZdej                  dej                  fdZ xZS )DPTAuxiliaryHeadr4   c                 X   t         |           |j                  }t        j                  t        j
                  ||ddd      t        j                  |      t        j                         t        j                  dd      t        j
                  ||j                  d            | _
        y )Nr   r   Fr  g?r9   )r;   r<   rA  r   r  rK   rE  r?  r   r  r  r  s      r+   r<   zDPTAuxiliaryHead.__init__#  sv    ,,MMIIhaONN8$GGIJJsE"IIh 1 1qA
	r*   r0   ro   c                 (    | j                  |      }|S r   )r  r  s      r+   r   zDPTAuxiliaryHead.forward/  s    =)r*   r   r   s   @r+   r  r  "  s*    

y 

U\\ ell r*   r  c                        e Zd Zdef fdZee	 	 	 d	dej                  dz  dej                  dz  de
dz  defd              Z xZS )
DPTForSemanticSegmentationr4   c                     t         |   |       t        |d      | _        t	        |      | _        t        |      | _        |j                  rt        |      nd | _
        | j                          y r  )r;   r<   rh  rW  r  r  r  r  use_auxiliary_headr  auxiliary_headro  r   s     r+   r<   z#DPTForSemanticSegmentation.__init__6  s^     Fe< FO	 07	:@:S:S.v6Y] 	r*   Nrm   r  r~   ro   c                     | j                   j                  }|$ j                   j                  dk(  rt        d      d|d<     j                  |fi |}|j
                  } j                   j                  s:t        |dd       D cg c]   \  }}| j                   j                  v s|" }}}n4|j                  }	|	j                   fdt        |dd       D               |	} j                  |      } j                  |      }
d} j                   j                  |d         }d}|t        j                  j!                  |
|j"                  d	d d
d      }|0t        j                  j!                  ||j"                  d	d d
d      }t%         j                   j&                        } |||      } ||      }| j                   j(                  |z  z   }t+        ||
|r|j
                  nd|j,                        S c c}}w )a  
        labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
            Ground truth semantic segmentation maps for computing the loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels > 1`, a classification loss is computed (Cross-Entropy).

        Examples:
        ```python
        >>> from transformers import AutoImageProcessor, DPTForSemanticSegmentation
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> image_processor = AutoImageProcessor.from_pretrained("Intel/dpt-large-ade")
        >>> model = DPTForSemanticSegmentation.from_pretrained("Intel/dpt-large-ade")

        >>> inputs = image_processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> logits = outputs.logits
        ```Nr   z/The number of labels should be greater than oneTr~   c              3   `   K   | ]%  \  }}|j                   j                  d d v s"| ' ywr  r  r  s      r+   r  z5DPTForSemanticSegmentation.forward.<locals>.<genexpr>x  s7      *(CCSWS^S^SsSstutvSwLw*s   #..)r0   r7   r8   rZ   FrQ  )ignore_index)r  r  r0   r1   )r4   r~   r  rH   rW  r0   r  r   r  r    r  r  r  r  r   ra   rb   rs   r   semantic_loss_ignore_indexauxiliary_loss_weightr   r1   )rQ   rm   r  r~   r   r  r0   r  r  r  r  auxiliary_logitsr  upsampled_logitsupsampled_auxiliary_logitsloss_fct	main_lossauxiliary_losss   `                 r+   r   z"DPTForSemanticSegmentation.forwardE  s$   B  '#';;#C#C $++"8"8A"=NOO)-%&HPQ]HhagHh-- {{$$,5mAB6G,H(CCSWS^S^SsSsLsM  &-%E%E"")) *,5mAB6G,H*  3M			>=)*#22=3DE!}}88V\\"#.Zu  9    +-/]]-F-F$6<<+<:]b .G .* (T[[5[5[\H !16:I%&@&INt{{@@>QQD&3G'//T))	
 	
Es    H,H)NNN)r!   r"   r#   r   r<   r   r   r%   r&   r  r   r   r   r   r   s   @r+   r  r  4  s|    y   26*.,0	S
''$.S
   4'S
 #Tk	S
 
!S
  S
r*   r  )r  r  rh  rV  )Nr   )Lr$   collections.abcrB   r   dataclassesr   r%   r   torch.nnr    r   r[  activationsr	   backbone_utilsr
   modeling_layersr   modeling_outputsr   r   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r   utils.genericr   r   utils.output_capturingr   configuration_dptr   
get_loggerr!   loggerr   r-   Moduler3   r   r   r   floatr   r   r   r   r   r   r   r   r  r  r  r3  r:  r5  rV  rh  rm  r  r  r  r  r  r  __all__r)   r*   r+   <module>r     s    $ !   % & ! + 9 ^ ^ F & X X I 5 ( 
		H	% 	J 	J 	J J; J J$]
RYY ]
@4Yryy 4YnBII L !%II%<<% 
% <<	%
 LL4'% T\% % '(%:/.ryy /.fryy $	bii 	  
299 
, >
BII 
(e eP" ,#BII #0:'RYY :'z"BII "J 4 4 4. 8
! 8
 8
x299 7bii 7t%RYY %P 
o
. o

o
d")) ,ryy $ e
!3 e
 e
P dr*   