
    qi                        d dl Zd dlZd dl mZ d dlmZ d dlZd dlZd dl	m
c mZ d dlmZm
Z
 ddlmZ ddlmZ ddlmZmZmZ dd	lmZ dd
lmZmZ ddlmZ ddlmZm Z m!Z! ddl"m#Z# ddl$m%Z% ddl&m'Z'  e       rd dl(m)Z)  e!       rd dl*m+Z+ d dl,m-Z- e e d       G d de                    Z.	 dPdej                  dej                  dej                  fdZ/dededefdZ0dej                  dej                  dej                  fdZ1 G d  d!e
jd                        Z3deded"e4defd#Z5dej                  dej                  d"e4dej                  fd$Z6 G d% d&e
jd                        Z7 G d' d(e
jd                        Z8 G d) d*e
jd                        Z9	 dQd+e
jd                  d,ej                  d-ej                  d.ej                  d/ej                  dz  d0e:d1e:fd2Z; G d3 d4e
jd                        Z< G d5 d6e
jd                        Z=dRd7ej                  d8e:d9e>dej                  fd:Z? G d; d<e
jd                        Z@ G d= d>e
jd                        ZA G d? d@e
jd                        ZB G dA dBe      ZC G dC dDe
j                        ZE G dE dFe
jd                        ZF G dG dHe
jd                        ZG G dI dJe
jd                        ZHe  G dK dLe             ZI e dM       G dN dOeI             ZJdLdOgZKy)S    N)Callable)	dataclass)Tensornn   )initialization)ACT2FN)ModelOutputis_scipy_availablerequires_backends)GradientCheckpointingLayer)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringis_accelerate_available)merge_with_config_defaults)capture_outputs   )
EomtConfig)linear_sum_assignment)PartialState)reducea  
    Class for outputs of [`EomtForUniversalSegmentationOutput`].

    This output can be directly passed to [`~EomtImageProcessor.post_process_semantic_segmentation`] or
    [`~EomtImageProcessor.post_process_instance_segmentation`] or
    [`~EomtImageProcessor.post_process_panoptic_segmentation`] to compute final segmentation maps. Please, see
    [`~EomtImageProcessor] for details regarding usage.
    )custom_introc                   <   e Zd ZU dZdZej                  dz  ed<   dZej                  dz  ed<   dZ	ej                  dz  ed<   dZ
ej                  dz  ed<   dZeej                     dz  ed<   dZeej                     dz  ed<   dZeej                      dz  ed	<   y)
"EomtForUniversalSegmentationOutputa*  
    loss (`torch.Tensor`, *optional*):
        The computed loss, returned when labels are present.
    class_queries_logits (`torch.FloatTensor`):
        A tensor of shape `(batch_size, num_queries, num_labels + 1)` representing the proposed classes for each
        query. Note the `+ 1` is needed because we incorporate the null class.
    masks_queries_logits (`torch.FloatTensor`):
        A tensor of shape `(batch_size, num_queries, height, width)` representing the proposed masks for each
        query.
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
        Last hidden states (final feature map) of the last layer.
    hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
        shape `(batch_size, sequence_length, hidden_size)`. Hidden-states all layers of the model.
    attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `tuple(torch.FloatTensor)` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`. Self and Cross Attentions weights from transformer decoder.
    patch_offsets (`list[torch.Tensor]`, *optional*):
        list of tuples indicating the image index and start and end positions of patches for semantic segmentation.
    Nlossclass_queries_logitsmasks_queries_logitslast_hidden_statehidden_states
attentionspatch_offsets)__name__
__module____qualname____doc__r   torchFloatTensor__annotations__r   r    r!   r"   tupler#   r$   listr        X/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/eomt/modeling_eomt.pyr   r   3   s    * &*D%

d
")59%++d2959%++d2926u((4/659M5**+d2926Je''(4/6/3M4%,3r/   r   input_featurespoint_coordinatesreturnc                     |j                         dk(  rd}|j                  d      }t        j                  j                  j
                  | d|z  dz
  fi |}|r|j                  d      }|S )a(  
    A wrapper around `torch.nn.functional.grid_sample` to support 3D point_coordinates tensors.

    Args:
        input_features (`torch.Tensor` of shape (batch_size, channels, height, width)):
            A tensor that contains features map on a height * width grid
        point_coordinates (`torch.Tensor` of shape (batch_size, num_points, 2) or (batch_size, grid_height, grid_width,:
        2)):
            A tensor that contains [0, 1] * [0, 1] normalized point coordinates
        add_dim (`bool`):
            boolean value to keep track of added dimension

    Returns:
        point_features (`torch.Tensor` of shape (batch_size, channels, num_points) or (batch_size, channels,
        height_grid, width_grid):
            A tensor that contains features for points in `point_coordinates`.
    r   T   g       @      ?)dim	unsqueezer)   r   
functionalgrid_samplesqueeze)r1   r2   add_dimkwargspoint_featuress        r0   sample_pointr?   ^   st    ( !#-77: XX((44^SK\E\_bEbmflmN'//2r/   inputslabelsc                    | j                         j                  d      } dt        j                  | |j                        z  }| j                  d      dddf   |j                  d      dddf   z   }d|dz   |dz   z  z
  }|S )a  
    A pair wise version of the dice loss, see `dice_loss` for usage.

    Args:
        inputs (`torch.Tensor`):
            A tensor representing a mask
        labels (`torch.Tensor`):
            A tensor with the same shape as inputs. Stores the binary classification labels for each element in inputs
            (0 for the negative class and 1 for the positive class).

    Returns:
        `torch.Tensor`: The computed loss between each pairs.
    r   r5   N)sigmoidflattenr)   matmulTsum)r@   rA   	numeratordenominatorr   s        r0   pair_wise_dice_lossrK   ~   s|     ^^%%a(FELL22I**R.D)FJJrN47,CCK	A+/22DKr/   c                 \   | j                   d   }t        j                  d      } || t        j                  |             } || t        j
                  |             }t        j                  ||z  |j                        }t        j                  ||z  d|z
  j                        }||z   }|S )a  
    A pair wise version of the cross entropy loss, see `sigmoid_cross_entropy_loss` for usage.

    Args:
        inputs (`torch.Tensor`):
            A tensor representing a mask.
        labels (`torch.Tensor`):
            A tensor with the same shape as inputs. Stores the binary classification labels for each element in inputs
            (0 for the negative class and 1 for the positive class).

    Returns:
        loss (`torch.Tensor`): The computed loss between each pairs.
    r   none	reduction)shaper   BCEWithLogitsLossr)   	ones_like
zeros_likerF   rG   )	r@   rA   height_and_width	criterioncross_entropy_loss_poscross_entropy_loss_negloss_posloss_negr   s	            r0   $pair_wise_sigmoid_cross_entropy_lossrZ      s     ||A$$v6I&vuv/FG&vu/?/?/GH||25EEvxxPH||25EEF
~~VHhDKr/   c                        e Zd ZdZ	 ddedededef fdZ ej                         dej                  dej                  d	ej                  d
ej                  de
ee	      f
d       Z xZS )EomtHungarianMatcheraq  This class computes an assignment between the labels and the predictions of the network.

    For efficiency reasons, the labels don't include the no_object. Because of this, in general, there are more
    predictions than labels. In this case, we do a 1-to-1 matching of the best predictions, while the others are
    un-matched (and thus treated as non-objects).
    
cost_class	cost_mask	cost_dice
num_pointsc                     t         |           |dk(  r|dk(  r|dk(  rt        d      || _        || _        || _        || _        y)aH  Creates the matcher

        Params:
            cost_class (`float`, *optional*, defaults to 1.0):
                Relative weight of the classification error in the matching cost.
            cost_mask (`float`, *optional*,  defaults to 1.0):
                This is the relative weight of the focal loss of the binary mask in the matching cost.
            cost_dice (`float`, *optional*, defaults to 1.0):
                This is the relative weight of the dice loss of the binary mask in the matching cost.
            num_points (`int`, *optional*, defaults to 12544):
                No. of points to sample on which the mask loss will be calculated. The same set of K points are
                uniformly sampled for all prediction and ground truth masks to construct the cost matrix for bipartite
                matching.
        r   zAll costs can't be 0N)super__init__
ValueErrorr`   r]   r^   r_   )selfr]   r^   r_   r`   	__class__s        r0   rc   zEomtHungarianMatcher.__init__   sK    " 	?yA~)q.344$$""r/   r    r   mask_labelsclass_labelsr3   c           	         g }|j                   d   }t        |      D ]  }||   j                  d      }||   }	|dd||   f    }
||   j                  |	      }|dddf   }|	dddf   }	t	        j
                  d| j                  d|	j                        }|j                  |j                   d   dd      }t        ||d      j                  d      }|j                  |	j                   d   dd      }t        |	|d      j                  d      }	t        |	|      }t        |	|      }| j                  |z  | j                  |
z  z   | j                  |z  z   }t	        j                   |t	        j"                  d	            }t	        j$                  |t	        j"                  d
            }t	        j&                  |d      }t)        |j+                               }|j-                  |        |D cg c]O  \  }}t	        j.                  |t        j0                        t	        j.                  |t        j0                        fQ }}}|S c c}}w )ao  
        Params:
            masks_queries_logits (`torch.Tensor`):
                A tensor of dim `batch_size, num_queries, num_labels` with the classification logits.
            class_queries_logits (`torch.Tensor`):
                A tensor of dim `batch_size, num_queries, height, width` with the predicted masks.
            class_labels (`torch.Tensor`):
                A tensor of dim `num_target_boxes` (where num_target_boxes is the number of ground-truth objects in the
                target) containing the class labels.
            mask_labels (`torch.Tensor`):
                A tensor of dim `num_target_boxes, height, width` containing the target masks.

        Returns:
            matched_indices (`list[tuple[Tensor]]`): A list of size batch_size, containing tuples of (index_i, index_j)
            where:
                - index_i is the indices of the selected predictions (in order)
                - index_j is the indices of the corresponding selected labels (in order)
            For each batch element, it holds:
                len(index_i) = len(index_j) = min(num_queries, num_target_boxes).
        r   rC   Nr   r5   deviceFalign_cornersg    _Bg    _©dtype)rP   rangesoftmaxtor)   randr`   rk   repeatr?   r;   rZ   rK   r^   r]   r_   minimumtensormaximum
nan_to_numr   cpuappend	as_tensorint64)re   r    r   rg   rh   indices
batch_sizei
pred_probs	pred_maskr]   target_maskr2   target_coordinatespred_coordinatesr^   r_   cost_matrixassigned_indicesjmatched_indicess                        r0   forwardzEomtHungarianMatcher.forward   s/   8 *, *//2
z" 	-A-a088<J,Q/I %QQ%788J%a.++I6K%ag.K!!T'*I !&

1dooqIYIY Z!2!9!9+:K:KA:NPQST!U&{4FV[\ddefgK077	8JAqQ$Y0@PUV^^_`aI =YTI+I{CI..94t7SSVZVdVdgpVppK--U\\$5GHK--U\\%5HIK**;:K0EkooFW0XNN+,?	-F ho
_c_`bcU__Qekk2EOOAU[[4YZ
 
 
s   5AI)r6   r6   r6   i 1  )r%   r&   r'   r(   floatintrc   r)   no_gradr   r-   r,   r   __classcell__rf   s   @r0   r\   r\      s     jo##27#JO#cf#4 U]]_D#llD $llD \\	D
 llD 
eFm	D Dr/   r\   	num_masksc                     | j                         j                  d      }d||z  j                  d      z  }|j                  d      |j                  d      z   }d|dz   |dz   z  z
  }|j                         |z  }|S )a4  
    Compute the DICE loss, similar to generalized IOU for masks as follows:

    $$ \mathcal{L}_{\text{dice}(x, y) = 1 - \frac{2 * x \cap y }{x \cup y + 1}} $$

    In practice, since `labels` is a binary mask, (only 0s and 1s), dice can be computed as follow

    $$ \mathcal{L}_{\text{dice}(x, y) = 1 - \frac{2 * x * y }{x + y + 1}} $$

    Args:
        inputs (`torch.Tensor`):
            A tensor representing a mask.
        labels (`torch.Tensor`):
            A tensor with the same shape as inputs. Stores the binary classification labels for each element in inputs
            (0 for the negative class and 1 for the positive class).
        num_masks (`int`):
            The number of masks present in the current batch, used for normalization.

    Returns:
        `torch.Tensor`: The computed loss.
    r   r5   rC   )rD   rE   rH   )r@   rA   r   probsrI   rJ   r   s          r0   	dice_lossr     sz    , NN$$Q'EUV^((,,I))B-&**R.0K	A+/22D88:	!DKr/   c                     t        j                  d      } || |      }|j                  d      j                         |z  }|S )a|  
    Args:
        inputs (`torch.Tensor`):
            A float tensor of arbitrary shape.
        labels (`torch.Tensor`):
            A tensor with the same shape as inputs. Stores the binary classification labels for each element in inputs
            (0 for the negative class and 1 for the positive class).

    Returns:
        loss (`torch.Tensor`): The computed loss.
    rM   rN   r   )r   rQ   meanrH   )r@   rA   r   rU   cross_entropy_lossr   s         r0   sigmoid_cross_entropy_lossr   8  sD     $$v6I"662""1%))+i7DKr/   c                       e Zd Zdedeeef   f fdZdeee	      dee	   fdZ
dee   deeef   fdZd	ed
ee   deej                     deeef   fdZdej                  deej                     deej                     de	deeej                  f   f
dZd Zd Zdej                  dej                  fdZdej                  de	de	dedej                  f
dZ	 ddej                  d	ej                  deej                     d
eej                     deeej                  f   dz  deeej                  f   fdZd
ej                  dej0                  dej                  fdZ xZS )EomtLossconfigweight_dictc                    t         |           t        | dg       |j                  | _        || _        |j
                  | _        t        j                  | j                  dz         }| j                  |d<   | j                  d|       |j                  | _        |j                  | _        |j                  | _        t        |j                  |j                   |j"                  | j                        | _        y)aH  
        The Eomt Loss. The loss is computed very similar to DETR. The process happens in two steps: 1) we
        compute hungarian assignment between ground truth masks and the outputs of the model 2) we supervise each pair
        of matched ground-truth / prediction (supervise class and mask)

        Args:
            config (`EomtConfig`):
                The configuration for Eomt model also containing loss calculation specific parameters.
            weight_dict (`dict[str, float]`):
                A dictionary of weights to be applied to the different losses.
        scipyr   rC   empty_weight)r]   r_   r^   r`   N)rb   rc   r   
num_labelsr   no_object_weighteos_coefr)   onesregister_buffertrain_num_pointsr`   oversample_ratioimportance_sample_ratior\   class_weightdice_weightmask_weightmatcher)re   r   r   r   rf   s       r0   rc   zEomtLoss.__init__M  s     	$	* ++& //zz$//A"56==R^\: !11 & 7 7'-'E'E$+**((((	
r/   sizesr3   c                 n    |d   }|dd  D ]'  }t        |      D ]  \  }}t        ||   |      ||<    ) |S )Nr   r   )	enumeratemax)re   r   maxessublistindexitems         r0   _max_by_axiszEomtLoss._max_by_axisp  sS    aQRy 	7G(1 7t"5<6e7	7 r/   tensorsc                 `   | j                  |D cg c]  }t        |j                         c}      }t        |      g|z   }|\  }}}}|d   j                  }	|d   j
                  }
t        j                  ||	|
      }t        j                  |||ft        j                  |
      }t        |||      D ]o  \  }}}|d |j                  d   d |j                  d   d |j                  d   f   j                  |       d|d |j                  d   d |j                  d   f<   q ||fS c c}w )Nr   ro   rk   r   r5   F)r   r-   rP   lenro   rk   r)   zerosr   boolzipcopy_)re   r   rv   max_sizebatch_shaper~   _heightwidthro   rk   padded_tensorspadding_maskspadded_tensorpadding_masks                  r0   _pad_images_to_max_in_batchz$EomtLoss._pad_images_to_max_in_batchx  s7   $$w%OVd6<<&8%OP7|nx/'2$
Avu
  ""[fM

J#>ejjY_`36wP]3^ 	G/FM<+FLLO+->v||A->@Q&,,q/@QQRXXY_`AFL*6<<?*,=fll1o,==>	G },, &Ps   D+r   rh   r}   c           	         |}|j                   \  }}}t        j                  | j                        }| j	                  |      }	t        j                  t        ||      D 
cg c]  \  }
\  }}|
|    c}}}
      }t        j                  ||f| j                  t
        j                  |j                        }|||	<   |j                  dd      } |||      }d|i}|S c c}}}
w )a  Compute the losses related to the labels using cross entropy.

        Args:
            class_queries_logits (`torch.Tensor`):
                A tensor of shape `batch_size, num_queries, num_labels`
            class_labels (`list[torch.Tensor]`):
                List of class labels of shape `(labels)`.
            indices (`tuple[np.array])`:
                The indices computed by the Hungarian matcher.

        Returns:
            `dict[str, Tensor]`: A dict of `torch.Tensor` containing the following key:
            - **loss_cross_entropy** -- The loss computed using cross entropy on the predicted and ground truth labels.
        )weight)
fill_valuero   rk   r   r5   loss_cross_entropy)rP   r   CrossEntropyLossr   $_get_predictions_permutation_indicesr)   catr   fullr   r|   rk   	transpose)re   r   rh   r}   pred_logitsr~   num_queriesr   rU   idxtargetr   target_classes_otarget_classespred_logits_transposedloss_celossess                    r0   loss_labelszEomtLoss.loss_labels  s    " +%0%6%6"
K''t/@/@A	77@ 99-0w-GHH>66AqVAYH
 %$//]h]o]o
 /s!,!6!6q!!<2NC&0 Is   #C!r    rg   r   c                      j                  |      } j                  |      }||   } j                  |      \  }}	||   }|dddf   }|dddf   }t        j                         5   j                  | fd j                   j                   j                        }
t        ||
d      j                  d      }ddd       t        |
d      j                  d      }t        ||      t        |||      d}~~|S # 1 sw Y   ExY w)a  Compute the losses related to the masks using sigmoid_cross_entropy_loss and dice loss.

        Args:
            masks_queries_logits (`torch.Tensor`):
                A tensor of shape `(batch_size, num_queries, height, width)`.
            mask_labels (`torch.Tensor`):
                List of mask labels of shape `(labels, height, width)`.
            indices (`tuple[np.array])`:
                The indices computed by the Hungarian matcher.
            num_masks (`int)`:
                The number of masks, used for normalization.

        Returns:
            losses (`dict[str, Tensor]`): A dict of `torch.Tensor` containing two keys:
            - **loss_mask** -- The loss computed using sigmoid cross entropy loss on the predicted and ground truth.
              masks.
            - **loss_dice** -- The loss computed using dice loss on the predicted on the predicted and ground truth,
              masks.
        Nc                 &    j                  |       S N)calculate_uncertainty)logitsre   s    r0   <lambda>z%EomtLoss.loss_masks.<locals>.<lambda>  s    t99&A r/   Frl   r   )	loss_mask	loss_dice)r    _get_targets_permutation_indicesr   r)   r   sample_points_using_uncertaintyr`   r   r   r?   r;   r   r   )re   r    rg   r}   r   src_idxtgt_idx
pred_maskstarget_masksr   r2   point_labelspoint_logitsr   s   `             r0   
loss_maskszEomtLoss.loss_masks  s,   4 ;;GD77@)'2
 ::;Ga#G,  4(
#AtG, ]]_ 		i $ D DA%%,,! (6GW\]eefghL		i $J0AQVW__`ab 4L,PYZ"<yI

 )		i 		is   (AD  D	c                    t        j                  t        |      D cg c]  \  }\  }}t        j                  ||        c}}}      }t        j                  |D cg c]  \  }}|	 c}}      }||fS c c}}}w c c}}w r   r)   r   r   	full_like)re   r}   r   srcr   batch_indicespredictions_indicess          r0   r   z-EomtLoss._get_predictions_permutation_indices  sj    		iX_N`"a"a{q(35??3#:"ab#iiW(E#q(EF111 #b(E   #A7A>
c                    t        j                  t        |      D cg c]  \  }\  }}t        j                  ||        c}}}      }t        j                  |D cg c]  \  }}|	 c}}      }||fS c c}}}w c c}}w r   r   )re   r}   r   r   tgtr   target_indicess          r0   r   z)EomtLoss._get_targets_permutation_indices  sh    		iX_N`"a"a{q(1c5??3#:"ab#@HQC#@An,, #b#@r   r   c                 2    t        j                  |       }|S )a  
        In Eomt paper, uncertainty is estimated as L1 distance between 0.0 and the logit prediction in 'logits'
        for the foreground class in `classes`.

        Args:
            logits (`torch.Tensor`):
            A tensor of shape (R, 1, ...) for class-specific or class-agnostic, where R is the total number of predicted masks in all images and C is:
            the number of foreground classes. The values are logits.

        Returns:
            scores (`torch.Tensor`): A tensor of shape (R, 1, ...) that contains uncertainty scores with the most
            uncertain locations having the highest uncertainty score.
        )r)   abs)re   r   uncertainty_scoress      r0   r   zEomtLoss.calculate_uncertainty  s      %yy01!!r/   r`   r   r   c           	         |j                   d   }t        ||z        }t        j                  ||d|j                        }t        ||d      }	 ||	      }
t        ||z        }||z
  }t        j                  |
dddddf   |d      d   }|t        j                  |t        j                  |j                  	      z  }||dddf   z  }|j                  d
d      |j                  d
      ddf   j                  ||d      }|dkD  r:t        j                  |t        j                  ||d|j                        gd      }|S )a  
        This function is meant for sampling points in [0, 1] * [0, 1] coordinate space based on their uncertainty. The
        uncertainty is calculated for each point using the passed `uncertainty function` that takes points logit
        prediction as input.

        Args:
            logits (`float`):
                Logit predictions for P points.
            uncertainty_function:
                A function that takes logit predictions for P points and returns their uncertainties.
            num_points (`int`):
                The number of points P to sample.
            oversample_ratio (`int`):
                Oversampling parameter.
            importance_sample_ratio (`float`):
                Ratio of points that are sampled via importance sampling.

        Returns:
            point_coordinates (`torch.Tensor`):
                Coordinates for P sampled points.
        r   r5   rj   Frl   Nr   )kr7   r   rC   r7   )rP   r   r)   rs   rk   r?   topkarangelongviewr   )re   r   uncertainty_functionr`   r   r   	num_boxesnum_points_sampledr2   r   point_uncertaintiesnum_uncertain_pointsnum_random_pointsr   shifts                  r0   r   z(EomtLoss.sample_points_using_uncertainty  sI   < LLO	 .>!>? "JJy2DaPVP]P]^#F,=US2<@"#:Z#GH&)==jj,Q1W59MSTUVWX"U\\)5::V\VcVc%dduQW~-222q9#((2,/JOOPY[oqrsq  %		"EJJy:KQW]WdWd$ef! ! r/   Nauxiliary_predictionsc                    | j                  ||||      }| j                  ||d   j                        }i | j                  ||||      | j	                  |||      }|jt        |      D ]\  \  }	}
|
d   }|
d   }| j                  ||||      }|j                         D ci c]  \  }}| d|	 | }}}|j                  |       ^ |S c c}}w )a  
        This performs the loss computation.

        Args:
            masks_queries_logits (`torch.Tensor`):
                A tensor of shape `(batch_size, num_queries, height, width)`.
            class_queries_logits (`torch.Tensor`):
                A tensor of shape `(batch_size, num_queries, num_labels)`.
            mask_labels (`torch.Tensor`):
                List of mask labels of shape `(labels, height, width)`.
            class_labels (`list[torch.Tensor]`):
                List of class labels of shape `(labels)`.
            auxiliary_predictions (`dict[str, torch.Tensor]`, *optional*):
                if `use_auxiliary_loss` was set to `true` in [`EomtConfig`], then it contains the logits from
                the inner layers of the EomtMaskedAttentionDecoder.

        Returns:
            losses (`dict[str, Tensor]`): A dict of `torch.Tensor` containing three keys:
            - **loss_cross_entropy** -- The loss computed using cross entropy on the predicted and ground truth labels.
            - **loss_mask** -- The loss computed using sigmoid cross_entropy loss on the predicted and ground truth
              masks.
            - **loss_dice** -- The loss computed using dice loss on the predicted on the predicted and ground truth
              masks.
            if `use_auxiliary_loss` was set to `true` in [`EomtConfig`], the dictionary contains additional
            losses for each auxiliary predictions.
        r   rj   r    r   r   )	r   get_num_masksrk   r   r   r   r   itemsupdate)re   r    r   rg   rh   r  r}   r   r   r   aux_outputs	loss_dictkeyvalues                 r0   r   zEomtLoss.forward=  s
   H ,,35I;Xde&&|LO<R<R&S	%
oo2K)T%
3\7K%

 !,$-.C$D ) ['23I'J$'23I'J$ LL)=?SU`bno	EN__EVWzsEuAcU^U2W	Wi()  Xs   "Crk   c                 &   t        d |D              }t        j                  |t        j                  |      }d}t	               r2t
        j                  i k7  rt        |      }t               j                  }t        j                  ||z  d      }|S )zk
        Computes the average number of target masks across the batch, for normalization purposes.
        c              3   2   K   | ]  }t        |        y wr   )r   ).0classess     r0   	<genexpr>z)EomtLoss.get_num_masks.<locals>.<genexpr>x  s     AGAs   r   r   )min)
rH   r)   r{   r   r   r   _shared_stater   num_processesclamp)re   rh   rk   r   
world_sizes        r0   r  zEomtLoss.get_num_maskst  su     ALAA	OOIU[[P	
"$))R/"9-	)^99
KK	J 6A>	r/   r   )r%   r&   r'   r   dictstrr   rc   r-   r   r   r   r,   r   nparrayr   r)   r   r   r   r   r   r   rk   r  r   r   s   @r0   r   r   L  s   !
z !
S%Z8H !
F$tCy/ d3i -4< -E&RX.DY -" $* :>v, QVWYW_W_Q` 	c6k	 D<#ll< %,,'< rxx	<
 < 
c5<<	 <|2-"ELL "U\\ ""5!5! 	5!
 5! "'5! 
5!z AE5#ll5 $ll5 %,,'	5
 5<<(5  $C$56=5 
c5<<	 5n%,,  QVQ]Q] r/   r   c                   Z     e Zd ZdZ fdZdej                  dej                  fdZ xZS )EomtPatchEmbeddingsz
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
    c                    t         |           |j                  |j                  }}|j                  |j
                  }}t        |t        j                  j                        r|n||f}t        |t        j                  j                        r|n||f}|d   |d   z  |d   |d   z  z  }|| _        || _        || _        || _
        t        j                  ||||      | _        y )Nr   r   kernel_sizestride)rb   rc   
image_size
patch_sizenum_channelshidden_size
isinstancecollectionsabcIterablenum_patchesr   Conv2d
projection)re   r   r!  r"  r#  r$  r)  rf   s          r0   rc   zEomtPatchEmbeddings.__init__  s    !'!2!2F4E4EJ
$*$7$79K9Kk#-j+//:R:R#SZZdfpYq
#-j+//:R:R#SZZdfpYq
!!}
15*Q-:VW=:XY$$(&))L+:^hir/   pixel_valuesr3   c                     |j                   d   }|| j                  k7  rt        d| j                   d| d      | j                  |      j	                  d      j                  dd      }|S )Nr   zoMake sure that the channel dimension of the pixel values match with the one set in the configuration. Expected z	 but got .r5   )rP   r#  rd   r+  rE   r   )re   r,  r#  
embeddingss       r0   r   zEomtPatchEmbeddings.forward  sz    #))!,4,,,!../yaI  __\2::1=GG1M
r/   )	r%   r&   r'   r(   rc   r)   r   r   r   r   s   @r0   r  r    s)    jELL U\\ r/   r  c                   d     e Zd ZdZdeddf fdZdej                  dej                  fdZ xZ	S )EomtEmbeddingszM
    Construct the CLS token, mask token, position and patch embeddings.
    r   r3   Nc                    t         |           || _        |j                  | _        t	        j
                  t        j                  dd|j                              | _	        t	        j
                  t        j                  d|j                  |j                              | _        t        |      | _        | j                  j                  }t	        j                   |j"                        | _        d|j                  z   | _        t	        j(                  ||j                        | _        | j-                  dt        j.                  |      j1                  d      d       y )Nr   position_idsr   rC   F)
persistent)rb   rc   r   r"  r   	Parameterr)   randnr$  	cls_tokenr   num_register_tokensregister_tokensr  patch_embeddingsr)  Dropouthidden_dropout_probdropoutnum_prefix_tokens	Embeddingposition_embeddingsr   r   expand)re   r   r)  rf   s      r0   rc   zEomtEmbeddings.__init__  s     ++ekk!Q8J8J&KL!||EKK6;U;UW]WiWi,jk 3F ;++77zz&"<"<=!"V%?%?!?#%<<V=O=O#P ^U\\+-F-M-Mg-Vchir/   r,  c                    |j                   \  }}}}| j                  j                  j                  j                  }| j                  |j                  |            }| j                  j                  |dd      }| j                  j                  |dd      }|| j                  | j                        z   }t        j                  |||gd      }| j                  |      }|S )Nrn   rC   r   r   )rP   r;  r+  r   ro   rr   r8  rB  r:  rA  r3  r)   r   r>  )re   r,  r~   r   target_dtyper/  
cls_tokensr:  s           r0   r   zEomtEmbeddings.forward  s    *00
Aq!,,77>>DD**<???+NO
^^**:r2>
..55j"bI$":":4;L;L"MM
YY
OZHaP
\\*-
r/   
r%   r&   r'   r(   r   rc   r)   r   r   r   r   s   @r0   r1  r1    s9    jz jd j ELL U\\ r/   r1  modulequeryr  r  attention_maskscalingr>  c                    t        j                  ||j                  dd            |z  }|||z   }t        j                  j                  |dt         j                        j                  |j                        }t        j                  j                  ||| j                        }t        j                  ||      }	|	j                  dd      j                         }	|	|fS )NrC   )r7   ro   )ptrainingr   r5   )r)   rF   r   r   r9   rq   float32rr   ro   r>  rN  
contiguous)
rG  rH  r  r  rI  rJ  r>  r=   attn_weightsattn_outputs
             r0   eager_attention_forwardrS    s     <<s}}R'<=GL!#n4==((2U]](SVVW\WbWbcL==((6??([L,,|U3K''1-88:K$$r/   c            
            e Zd ZdZ fdZ	 ddej                  dej                  dz  deej                  ej                  dz  f   fdZ xZ	S )	EomtAttentionz=Multi-headed attention from 'Attention Is All You Need' paperc                    t         |           || _        |j                  | _        |j
                  | _        | j                  | j                  z  | _        | j                  | j                  z  | j                  k7  r&t        d| j                   d| j                   d      | j                  dz  | _	        |j                  | _        d| _        t        j                  | j                  | j                        | _        t        j                  | j                  | j                        | _        t        j                  | j                  | j                        | _        t        j                  | j                  | j                        | _        y )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      F)rb   rc   r   r$  	embed_dimnum_attention_heads	num_headshead_dimrd   scaleattention_dropoutr>  	is_causalr   Lineark_projv_projq_projout_projre   r   rf   s     r0   rc   zEomtAttention.__init__  s   ++33$..8==4>>)T^^;MdnnM] ^NN#2'  ]]D(
//ii?ii?ii?		$..$..Ar/   Nr"   rI  r3   c           
         |j                   \  }}}| j                  |      }| j                  |      }| j                  |      }	|j	                  ||| j
                  | j                        j                  dd      }|j	                  ||| j
                  | j                        j                  dd      }|	j	                  ||| j
                  | j                        j                  dd      }	t        j                  | j                  j                  t              }
 |
| |||	|| j                  | j                  | j                  sdn| j                         \  }}|j#                  |||      j%                         }| j'                  |      }||fS )z#Input shape: Batch x Time x Channelr   r5           )r]  rJ  r>  )rP   ra  r_  r`  r   rY  rZ  r   r   get_interfacer   _attn_implementationrS  r]  r[  rN  r>  reshaperP  rb  )re   r"   rI  r=   r~   
seq_lengthrW  querieskeysvaluesattention_interfacerR  rQ  s                r0   r   zEomtAttention.forward  sW    -:,?,?)
J	++m,{{=)]+,,z:t~~t}}U__`acdeyyZOYYZ[]^_ZT^^T]]S]]^_abc(?(M(MKK,,.E)
 %8nnJJ#}}C$,,	%
!\ "))*j)LWWYmmK0L((r/   r   )
r%   r&   r'   r(   rc   r)   r   r,   r   r   r   s   @r0   rU  rU    sV    GB. /3$)||$) t+$)
 
u||U\\D00	1$)r/   rU  c                   X     e Zd Zd fdZdej
                  dej
                  fdZ xZS )EomtLayerScaler3   c                     t         |           t        j                  |j                  t        j                  |j                        z        | _        y r   )	rb   rc   r   r6  layerscale_valuer)   r   r$  lambda1rc  s     r0   rc   zEomtLayerScale.__init__   s8    ||F$;$;ejjI[I[>\$\]r/   hidden_statec                      || j                   z  S r   )rr  re   rs  s     r0   r   zEomtLayerScale.forward$  s    dll**r/   r3   Nr%   r&   r'   rc   r)   r   r   r   r   s   @r0   ro  ro    s$    ^+ELL +U\\ +r/   ro  input	drop_probrN  c                    |dk(  s|s| S d|z
  }| j                   d   fd| j                  dz
  z  z   }|t        j                  || j                  | j
                        z   }|j                          | j                  |      |z  }|S )zc
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    re  r   r   )r   r   )rP   ndimr)   rs   ro   rk   floor_div)rx  ry  rN  	keep_probrP   random_tensoroutputs          r0   	drop_pathr  (  s    
 CxII[[^

Q 77E

5ELL YYMYYy!M1FMr/   c                   x     e Zd ZdZd	dedz  ddf fdZdej                  dej                  fdZde	fdZ
 xZS )
EomtDropPathzXDrop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).Nry  r3   c                 0    t         |           || _        y r   )rb   rc   ry  )re   ry  rf   s     r0   rc   zEomtDropPath.__init__:  s    "r/   r"   c                 D    t        || j                  | j                        S r   )r  ry  rN  re   r"   s     r0   r   zEomtDropPath.forward>  s    FFr/   c                      d| j                    S )Nzp=)ry  re   s    r0   
extra_reprzEomtDropPath.extra_reprA  s    DNN#$$r/   r   )r%   r&   r'   r(   r   rc   r)   r   r   r  r  r   r   s   @r0   r  r  7  sG    b#%$, #$ #GU\\ Gell G%C %r/   r  c                   X     e Zd Zd fdZdej
                  dej
                  fdZ xZS )EomtMLPr3   c                 ~   t         |           |j                  x}}t        |j                  |j                  z        }t        j                  ||d      | _        t        |j                  t              rt        |j                     | _        n|j                  | _        t        j                  ||d      | _        y )NTbias)rb   rc   r$  r   	mlp_ratior   r^  fc1r%  
hidden_actr  r	   
activationfc2re   r   in_featuresout_featureshidden_featuresrf   s        r0   rc   zEomtMLP.__init__F  s    %+%7%77lf0063C3CCD99[/Ef''-$V%6%67DO$//DO99_lFr/   rs  c                 l    | j                  |      }| j                  |      }| j                  |      }|S r   )r  r  r  ru  s     r0   r   zEomtMLP.forwardQ  s2    xx-|4xx-r/   rv  rw  r   s   @r0   r  r  E  s$    	GELL U\\ r/   r  c                   X     e Zd Zd fdZdej
                  dej
                  fdZ xZS )EomtSwiGLUFFNr3   c                 0   t         |           |j                  x}}t        |j                  |j                  z        }t        |dz  dz        dz   dz  dz  }t        j                  |d|z  d      | _        t        j                  ||d      | _        y )Nr5   r         Tr  )	rb   rc   r$  r   r  r   r^  
weights_inweights_outr  s        r0   rc   zEomtSwiGLUFFN.__init__Y  s    %+%7%77lf0063C3CCD2Q67!;AAE))K_1D4P99_lNr/   rs  c                     | j                  |      }|j                  dd      \  }}t        j                  j	                  |      |z  }| j                  |      S )Nr5   rC   r   )r  chunkr   r9   silur  )re   rs  x1x2hiddens        r0   r   zEomtSwiGLUFFN.forwardb  sS    |4##A2#.B##B'",''r/   rv  rw  r   s   @r0   r  r  X  s$    O(ELL (U\\ (r/   r  c                        e Zd ZdZdeddf fdZ	 d	dej                  dej                  dz  dej                  fdZ xZ	S )
	EomtLayerzCThis corresponds to the Block class in the original implementation.r   r3   Nc                    t         |           t        j                  |j                  |j
                        | _        t        |      | _        t        |      | _
        |j                  dkD  rt        |j                        nt        j                         | _        t        j                  |j                  |j
                        | _        |j                   rt#        |      | _        nt'        |      | _        t        |      | _        y )Nepsre  )rb   rc   r   	LayerNormr$  layer_norm_epsnorm1rU  	attentionro  layer_scale1drop_path_rater  Identityr  norm2use_swiglu_ffnr  mlpr  layer_scale2rc  s     r0   rc   zEomtLayer.__init__l  s    \\&"4"4&:O:OP
&v.*62@F@U@UX[@[f&;&;<acalalan\\&"4"4&:O:OP
  $V,DHvDH*62r/   r"   rI  c                 *   | j                  |      }| j                  ||      \  }}| j                  |      }| j                  |      |z   }| j	                  |      }| j                  |      }| j                  |      }| j                  |      |z   }|S r   )r  r  r  r  r  r  r  )re   r"   rI  hidden_states_normself_attention_outputr   layer_outputs          r0   r   zEomtLayer.forward|  s    
 "ZZ6#'>>2Dn#U q $ 1 12G H '<=M zz-0xx-((6 ~~l3mCr/   r   rF  r   s   @r0   r  r  i  sP    M3z 3d 3& /3|| t+ 
	r/   r  c                   X     e Zd Zd fd	Zdej
                  dej
                  fdZ xZS )EomtLayerNorm2dc                 *    t         |   |||       y )N)r  elementwise_affine)rb   rc   )re   r#  r  affinerf   s       r0   rc   zEomtLayerNorm2d.__init__  s    36Jr/   rs  r3   c                     |j                  dddd      }t        j                  || j                  | j                  | j
                  | j                        }|j                  dddd      }|S )Nr   r5   r   r   )permuteF
layer_normnormalized_shaper   r  r  ru  s     r0   r   zEomtLayerNorm2d.forward  sb    #++Aq!Q7||L$2G2GVZV_V_aeaiaij#++Aq!Q7r/   )gư>Trw  r   s   @r0   r  r    s$    KELL U\\ r/   r  c                   \     e Zd Zdef fdZdej                  dej                  fdZ xZS )EomtScaleLayerr   c                    t         |           |j                  }t        j                  ||dd      | _        t        |j                     | _        t        j                  ||dd|d      | _
        t        |      | _        y )Nr5   r  r   r   F)r  paddinggroupsr  )rb   rc   r$  r   ConvTranspose2dconv1r	   r  r  r*  conv2r  layernorm2dre   r   r$  rf   s      r0   rc   zEomtScaleLayer.__init__  su    ((''[aXYZ
 !2!23YY

 +;7r/   r"   r3   c                     | j                  |      }| j                  |      }| j                  |      }| j                  |      }|S r   )r  r  r  r  r  s     r0   r   zEomtScaleLayer.forward  sB    

=16

=1((7r/   	r%   r&   r'   r   rc   r)   r   r   r   r   s   @r0   r  r    s*    8z 8 U\\ ell r/   r  c                   \     e Zd Zdef fdZdej                  dej                  fdZ xZS )EomtScaleBlockr   c                     t         |           |j                  | _        t	        j
                  t        | j                        D cg c]  }t        |       c}      | _        y c c}w r   )	rb   rc   num_upscale_blocks
num_blocksr   
ModuleListrp   r  blockre   r   r   rf   s      r0   rc   zEomtScaleBlock.__init__  sG     33]]E$//DZ#[qN6$:#[\
#[s   A&r"   r3   c                 8    | j                   D ]
  } ||      } |S r   )r  )re   r"   r  s      r0   r   zEomtScaleBlock.forward  s%    ZZ 	1E!-0M	1r/   r  r   s   @r0   r  r    s,    ]z ]
U\\ ell r/   r  c                   \     e Zd Zdef fdZdej                  dej                  fdZ xZS )EomtMaskHeadr   c                    t         |           |j                  }t        j                  ||      | _        t        j                  ||      | _        t        j                  ||      | _        t        |j                     | _
        y r   )rb   rc   r$  r   r^  r  r  fc3r	   r  r  r  s      r0   rc   zEomtMaskHead.__init__  sa    ((99[+699[+699[+6 !2!23r/   r"   r3   c                     | j                  | j                  |            }| j                  | j                  |            }| j                  |      }|S r   )r  r  r  r  r  s     r0   r   zEomtMaskHead.forward  sD    (?@(?@/r/   r  r   s   @r0   r  r    s*    4z 4U\\ ell r/   r  c                       e Zd ZU dZeed<   dZdZdZdZ	dgZ
dZeed	Z ej                          d
ej$                  ddfd       Zy)EomtPreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    r   eomtr,  )imageFr  T)r"   r#   rG  r3   Nc                     | j                   j                  }t        |t        j                  t        j
                  t        j                  f      rt        j                  |j                  t        j                  d             |j                  xt        j                  j                  j                  |j                        \  }}|dkD  rdt        j                  |      z  nd}t        j                  |j                  | |       y y t        |t        j                         r?t        j"                  |j                         t        j$                  |j                         y t        |t        j&                        rtt        j(                  |j                  dd       |j*                  Et-        |j                  dd      s-t        j$                  |j                  |j*                            y y y t        |t.              rBt1        |d	      r5t        j2                  |j4                  | j                   j6                         y y t        |t8              rt        j:                  |j<                  d|       t        j$                  |j>                         t        j@                  |jB                  t        jD                  |jB                  jF                  d
         jI                  d             y t        |tJ              rRt        jL                  |jN                  dz         }|jP                  |d
<   t        j@                  |jR                  |       y t        |tT              r t        j"                  |jV                         y y )N   )ar   r   re  )r   std_is_hf_initializedFrr  rC   r4  ),r   initializer_ranger%  r   r^  r*  r  initkaiming_uniform_r   mathsqrtr  r)   _calculate_fan_in_and_fan_outuniform_r  ones_zeros_r@  normal_padding_idxgetattrro  hasattr	constant_rr  rq  r1  trunc_normal_r8  r:  r   r3  r   rP   rB  r   r   r   r   r   EomtForUniversalSegmentationattn_mask_probs)re   rG  r  fan_inr   boundr   s          r0   _init_weightsz!EomtPreTrainedModel._init_weights  sJ   kk++fryy"))R5G5GHI!!&--499Q<@{{&!HHMMGGV	17!DIIf--fkkE659 ' -JJv}}%KK$-LLSa8!!-gfmmMach6iFMM&*<*<=> 7j-/vy)v~~t{{/K/KL */v//csCKK../JJv**ELL9L9L9R9RSU9V,W,^,^_f,gh) ::f&7&7!&;<L%LJJv**L9 <=JJv--. >r/   )r%   r&   r'   r(   r   r+   base_model_prefixmain_input_nameinput_modalitiessupports_gradient_checkpointing_no_split_modules_supports_sdpar  rU  _can_record_outputsr)   r   r   Moduler  r.   r/   r0   r  r    so    
 $O!&+#$N"#
 U]]_/BII /$ / /r/   r  zV
    The EoMT Model with head on top for instance/semantic/panoptic segmentation.
    c                       e Zd ZdZdef fdZdededededeeef   d	eeef   fd
Z	deeef   d	efdZ
eee	 	 	 ddedee   dz  dee   dz  dee   dz  dee   d	efd                     Zd Zdej                  fdZed        Z xZS )r  r,  r   c                    t         |   |       || _        |j                  | _        t	        |      | _        t        j                  |j                  |j                        | _
        t        j                  |j                  |j                        | _        t        j                  t        |j                        D cg c]  }t!        |       c}      | _        t%        |      | _        t)        |      | _        t        j,                  |j                  |j.                  dz         | _        |j2                  |j4                  z  |j2                  |j4                  z  f| _        |j8                  |j:                  |j<                  d| _        tA        || j>                        | _!        | jE                  dtG        jH                  |jJ                               | jM                          y c c}w )Nr  r   )r   r   r   )r   r   r  )'rb   rc   r   num_hidden_layersr1  r/  r   r  r$  r  	layernormr@  r   rH  r  rp   r  layersr  upscale_blockr  	mask_headr^  r   class_predictorr!  r"  	grid_sizer   r   r   r   r   rU   r   r)   r   r  	post_initr  s      r0   rc   z%EomtForUniversalSegmentation.__init__  sp    !'!9!9(0f&8&8f>S>ST\\&"4"4f6H6HI
mmfF^F^@_$`1Yv%6$`a+F3%f-!yy););V=N=NQR=RS ++v/@/@@&BSBSW]WhWhBhi"("5"5++++.
 "T=M=MN.

6;L;L0MN% %as   >G*r    r   rg   rh   r  r3   c                     | j                  |||||      }| j                  j                         D ]'  \  }}|j                         D ]  \  }	}
||	v s|
|z  }
 ) |S )Nr    r   rg   rh   r  )rU   r   r  )re   r    r   rg   rh   r  r
  r  r   loss_keyr   s              r0   get_loss_dictz*EomtForUniversalSegmentation.get_loss_dict+  s|     (,~~!5!5#%"7 (6 (
	  ++113 	#KC"+//"3 #$(?FND#	#
 r/   r
  c                 4    t        |j                               S r   )rH   rl  )re   r
  s     r0   get_lossz%EomtForUniversalSegmentation.get_lossC  s    9##%&&r/   Nr$   r=   c                 (   d\  }}d}|t        d      | j                  |      }	t        | j                        D ]  \  }
}|
| j                  | j
                  j                  z
  k(  rp| j                  j                  dddddf   j                  |	j                  d   dd      j                  |	j                        }t        j                  ||	fd      }	|
| j                  | j
                  j                  z
  k\  r| j                  s7| j                   |
| j                  z
  | j
                  j                  z      dkD  r| j#                  |	      }| j%                  |      \  }}||fz  }||fz  }t        j&                  |	j                  d   |	j                  d   |	j                  d   |	j                  t        j(                        }t+        j,                  || j.                  d	
      }|j1                  |j3                  d      |j3                  d      d      }| j
                  j4                  }|| j                  j6                  z   }|dkD  |ddd||df<   | j9                  || j                   |
| j                  z
  | j
                  j                  z      |||j                        }|ddddf   j                  d| j
                  j:                  dd      }|j=                         j?                  | d      } ||	|      }	 | j#                  |	      }| j%                  |      \  }}||fz  }||fz  }d}|B|@d}tA        ||      D ]/  \  }}| jC                  ||||d      }|| jE                  |      z  }1 tG        |||||      S )ag  
        mask_labels (`list[torch.Tensor]`, *optional*):
            list of mask labels of shape `(num_labels, height, width)` to be fed to a model
        class_labels (`list[torch.LongTensor]`, *optional*):
            list of target class labels of shape `(num_labels, height, width)` to be fed to a model. They identify the
            labels of `mask_labels`, e.g. the label of `mask_labels[i][j]` if `class_labels[i][j]`.
        patch_offsets (`list[torch.Tensor]`, *optional*):
            list of tuples indicating the image index and start and end positions of patches for semantic segmentation.
        )r.   r.   Nz You have to specify pixel_valuesr   rC   r   r   )rk   ro   bilinear)sizemode)probnum_query_tokensencoder_start_tokensrk   .g    ere  r  )r   r    r   r!   r$   )$rd   r/  r   r  r  r   r  rH  r   rB  rP   rr   rk   r)   r   rN  r  r  predictr   r   r  interpolater  r   r  r   r?  _disable_attention_maskrX  r   masked_fillr   r  r  r   )re   r,  rg   rh   r$   r=   masks_queries_logits_per_layerclass_queries_logits_per_layerrI  r"   r   layer_modulerH  norm_hidden_statesr    r   interpolated_logitsr  r  sequence_outputr   r
  s                         r0   r   z$EomtForUniversalSegmentation.forwardF  s   * JPF&(F?@@5!*4;;!7 .	HCd,,t{{/E/EEE

))$1*5<<]=P=PQR=SUWY[\__`m`t`tu %		5-*@a Hd,,t{{/E/EEE!5!5cD<R<R6RUYU`U`UkUk6k!lop!p%)^^M%B"=A\\J\=]:$&:.3G2II..3G2II.!&!''*!''*!''*(//**" '(mm4Ht~~dn&o#&9&>&>',,Q/1D1I1I!1Lb'# $(;;#:#: '7$//:[:['[$ ObdeNeq"3#3"35I5JJK "&!=!="--cD4J4J.JT[[McMc.cd%5)=)00 "> " "04!=!D!DRIhIhjlnp!q!/!5!5!7!C!C^OUY!Z(GM].	H` ..759\\/5R22&+?*AA&&+?*AA&"|'?D>A.0N? 
1:$&: !..)=)= +!-*. / 	 i00
1 2!5!5-'
 	
r/   c                 .    | j                   j                  S r   )r/  r;  r  s    r0   get_input_embeddingsz1EomtForUniversalSegmentation.get_input_embeddings  s    ///r/   r   c                    |d d d | j                   j                  d d f   }| j                  |      }|d d | j                   j                  | j                  j                  z   d d d f   }|j                  dd      } |j                  |j                  d   dg| j                   }| j                  |      }| j                  |      }t        j                  d||      }||fS )Nr   r5   r   rC   zbqc, bchw -> bqhw)r   r   r
  r/  r?  r   rh  rP   r  r	  r  r)   einsum)re   r   query_tokensclass_logitsprefix_tokensmask_logitss         r0   r  z$EomtForUniversalSegmentation.predict  s    a!:4;;#:#:!:A=>++L9q$++"9"9DOO<]<]"]"_abbc%//15---m.A.A!.DbZ4>>Z~~l3**=9ll#6mTL((r/   c                     |dk  r9t        j                  | j                  d   ||      |kD  }d| d d d ||d f   |<   | S )Nr   r   rj   )r)   rs   rP   )	attn_maskr  r  r  rk   random_queriess         r0   r  z4EomtForUniversalSegmentation._disable_attention_mask  sW    !8"ZZ	(:<LU[\_ccN VWIa***,@,AAB>Rr/   )NNN)r%   r&   r'   r  r   rc   r   r  r  r  r  r   r   r   r-   r   r   r   r   r%  r)   r  staticmethodr  r   r   s   @r0   r  r    s?    %Oz 8$ % 	
   $CK0 
c6k	0'$sF{"3 ' '   ,0,0-1e
e
 &\D(e
 6lT)	e

 F|d*e
 +,e
 
,e
    e
N0)ell )   r/   r  )F)re  )re  F)Lcollections.abcr&  r  r   dataclassesr   numpyr  r)   torch.nn.functionalr   r9   r  r    r   r  activationsr	   
file_utilsr
   r   r   modeling_layersr   modeling_utilsr   r   processing_utilsr   utilsr   r   r   utils.genericr   utils.output_capturingr   configuration_eomtr   scipy.optimizer   
accelerater   accelerate.utilsr   r   r?   rK   rZ   r  r\   r   r   r   r   r  r1  r   rS  rU  ro  r   r  r  r  r  r  r  r  r  r  r  r  r  __all__r.   r/   r0   <module>rB     s  *   $ !      & ! L L 9 F & P P 7 5 * 4'' 	4 4	 4B LQLL5:\\
\\@  6 , u|| X]XdXd 8g299 gTf f   <u|| U\\ VY ^c^j^j (uryy up	")) B"RYY "X %II%<<% 
% <<	%
 LL4'% % %.;)BII ;)|+RYY +U\\ e T V[VbVb %299 %bii &(BII ("'* 'Tbll RYY 2	RYY 	299 " /// // //d 
@#6 @
@F !"@
Ar/   