
    qi              	           d Z ddlZddlZddlmZ ddlZddlmZ ddlm	Z
 ddlmZ ddlmZ dd	lmZ dd
lmZmZmZmZ ddlmZ  ej0                  e      Ze ed       G d de                    Ze ed       G d de                    Ze ed       G d de                    Zd Zd Z G d dej@                        Z! G d dej@                        Z" G d dej@                        Z#d>dejH                  d e%d!e&d"ejH                  fd#Z' G d$ d%ej@                        Z( G d& d'ej@                        Z) G d( d)ej@                        Z* G d* d+ej@                        Z+ G d, d-ej@                        Z, G d. d/ej@                        Z- G d0 d1ej@                        Z. G d2 d3e      Z/ G d4 d5ej@                        Z0e G d6 d7e             Z1e G d8 d9e1             Z2 ed:       G d; d<e1             Z3g d=Z4y)?zPyTorch Donut Swin Transformer model.

This implementation is identical to a regular Swin Transformer, without final layer norm on top of the final hidden
states.    N)	dataclass)nn   )initialization)ACT2FN)GradientCheckpointingLayer)PreTrainedModel)ModelOutputauto_docstringlogging	torch_int   )DonutSwinConfigzS
    DonutSwin encoder's outputs, with potential hidden states and attentions.
    )custom_introc                       e Zd ZU dZdZej                  dz  ed<   dZe	ej                  df   dz  ed<   dZ
e	ej                  df   dz  ed<   dZe	ej                  df   dz  ed<   y)DonutSwinEncoderOutputa  
    reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
        shape `(batch_size, hidden_size, height, width)`.

        Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
        include the spatial dimensions.
    Nlast_hidden_state.hidden_states
attentionsreshaped_hidden_states)__name__
__module____qualname____doc__r   torchFloatTensor__annotations__r   tupler   r        _/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/donut/modeling_donut_swin.pyr   r   %   s}     37u((4/6:>M5**C/047>7;Je'',-4;CGE%"3"3S"89D@Gr    r   z[
    DonutSwin model's outputs that also contains a pooling of the last hidden states.
    c                       e Zd ZU dZdZej                  dz  ed<   dZej                  dz  ed<   dZ	e
ej                  df   dz  ed<   dZe
ej                  df   dz  ed<   dZe
ej                  df   dz  ed<   y)	DonutSwinModelOutputa  
    pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`, *optional*, returned when `add_pooling_layer=True` is passed):
        Average pooling of the last layer hidden-state.
    reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
        shape `(batch_size, hidden_size, height, width)`.

        Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
        include the spatial dimensions.
    Nr   pooler_output.r   r   r   )r   r   r   r   r   r   r   r   r$   r   r   r   r   r   r    r!   r#   r#   <   s    	 37u((4/6.2M5$$t+2:>M5**C/047>7;Je'',-4;CGE%"3"3S"89D@Gr    r#   z5
    DonutSwin outputs for image classification.
    c                       e Zd ZU dZdZej                  dz  ed<   dZej                  dz  ed<   dZ	e
ej                  df   dz  ed<   dZe
ej                  df   dz  ed<   dZe
ej                  df   dz  ed<   y)	DonutSwinImageClassifierOutputa7  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Classification (or regression if config.num_labels==1) loss.
    logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
        Classification (or regression if config.num_labels==1) scores (before SoftMax).
    reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
        shape `(batch_size, hidden_size, height, width)`.

        Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
        include the spatial dimensions.
    Nlosslogits.r   r   r   )r   r   r   r   r'   r   r   r   r(   r   r   r   r   r   r    r!   r&   r&   V   s     &*D%

d
")'+FE$+:>M5**C/047>7;Je'',-4;CGE%"3"3S"89D@Gr    r&   c                     | j                   \  }}}}| j                  |||z  |||z  ||      } | j                  dddddd      j                         j                  d|||      }|S )z2
    Partitions the given input into windows.
    r   r   r            shapeviewpermute
contiguous)input_featurewindow_size
batch_sizeheightwidthnum_channelswindowss          r!   window_partitionr:   s   s}     /<.A.A+J|!&&Fk);8Lk[gM ##Aq!Q15@@BGGKYdfrsGNr    c                     | j                   d   }| j                  d||z  ||z  |||      } | j                  dddddd      j                         j                  d|||      } | S )z?
    Merges windows to produce higher resolution features.
    r-   r   r   r   r*   r+   r,   r.   )r9   r4   r6   r7   r8   s        r!   window_reverser<      sn     ==$Lll2v4e{6JKYdfrsGooaAq!Q/::<AA"feUabGNr    c            
            e Zd ZdZd fd	Zdej                  dededej                  fdZ	 	 dd	ej                  dz  d
ej                  dz  dedeej                     fdZ xZS )DonutSwinEmbeddingszW
    Construct the patch and position embeddings. Optionally, also the mask token.
    c                 ~   t         |           t        |      | _        | j                  j                  }| j                  j
                  | _        |r4t        j                  t        j                  dd|j                              nd | _        |j                  r=t        j                  t        j                  d|dz   |j                              | _        nd | _        t        j                  |j                        | _        t        j"                  |j$                        | _        |j(                  | _        || _        y )Nr   )super__init__DonutSwinPatchEmbeddingspatch_embeddingsnum_patches	grid_size
patch_gridr   	Parameterr   zeros	embed_dim
mask_tokenuse_absolute_embeddingsposition_embeddings	LayerNormnormDropouthidden_dropout_probdropout
patch_sizeconfig)selfrS   use_mask_tokenrD   	__class__s       r!   rA   zDonutSwinEmbeddings.__init__   s     8 @++77//99O]",,u{{1a9I9I'JKcg))')||EKK;QR?TZTdTd4e'fD$'+D$LL!1!12	zz&"<"<= ++r    
embeddingsr6   r7   returnc                    |j                   d   dz
  }| j                  j                   d   dz
  }t        j                  j	                         s||k(  r||k(  r| j                  S | j                  ddddf   }| j                  ddddf   }|j                   d   }|| j
                  z  }	|| j
                  z  }
t        |dz        }|j                  d|||      }|j                  dddd      }t        j                  j                  ||	|
fdd	
      }|j                  dddd      j                  dd|      }t        j                  ||fd      S )a   
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r   Nr-   g      ?r   r   r*   bicubicF)sizemodealign_cornersdim)r/   rL   r   jit
is_tracingrR   r   reshaper1   r   
functionalinterpolater0   cat)rT   rW   r6   r7   rD   num_positionsclass_pos_embedpatch_pos_embedr_   
new_height	new_widthsqrt_num_positionss               r!   interpolate_pos_encodingz,DonutSwinEmbeddings.interpolate_pos_encoding   s`    !&&q)A-0066q9A= yy##%+*F6UZ?+++221bqb59221ab59r"t.
T__,	&}c'9:)11!5GI[]`a)11!Q1=--33i(	 4 
 *11!Q1=BB1b#Nyy/?;CCr    Npixel_valuesbool_masked_posrl   c                    |j                   \  }}}}| j                  |      \  }}	| j                  |      }|j                         \  }
}}|K| j                  j                  |
|d      }|j                  d      j                  |      }|d|z
  z  ||z  z   }| j                  (|r|| j                  |||      z   }n|| j                  z   }| j                  |      }||	fS )Nr-   g      ?)r/   rC   rN   r[   rJ   expand	unsqueezetype_asrL   rl   rQ   )rT   rm   rn   rl   _r8   r6   r7   rW   output_dimensionsr5   seq_lenmask_tokensmasks                 r!   forwardzDonutSwinEmbeddings.forward   s     *6););&<(,(=(=l(K%
%YYz*
!+!2
GQ&//00WbIK",,R088ED#sTz2[45GGJ##/''$*G*G
TZ\a*bb
'$*B*BB
\\*-
,,,r    )FNF)r   r   r   r   rA   r   Tensorintrl   r   
BoolTensorboolr   rx   __classcell__rV   s   @r!   r>   r>      s    &&D5<< &D &DUX &D]b]i]i &DV 48).	-''$.- ))D0- #'	-
 
u||	-r    r>   c                   v     e Zd ZdZ fdZd Zdej                  dz  deej                  ee
   f   fdZ xZS )rB   z
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
    c                    t         |           |j                  |j                  }}|j                  |j
                  }}t        |t        j                  j                        r|n||f}t        |t        j                  j                        r|n||f}|d   |d   z  |d   |d   z  z  }|| _        || _        || _        || _
        |d   |d   z  |d   |d   z  f| _        t        j                  ||||      | _        y )Nr   r   )kernel_sizestride)r@   rA   
image_sizerR   r8   rI   
isinstancecollectionsabcIterablerD   rE   r   Conv2d
projection)rT   rS   r   rR   r8   hidden_sizerD   rV   s          r!   rA   z!DonutSwinPatchEmbeddings.__init__   s    !'!2!2F4E4EJ
$*$7$79I9Ik#-j+//:R:R#SZZdfpYq
#-j+//:R:R#SZZdfpYq
!!}
15*Q-:VW=:XY$$(&$Q-:a=8*Q-:VW=:XY))L+:^hir    c                 n   || j                   d   z  dk7  rDd| j                   d   || j                   d   z  z
  f}t        j                  j                  ||      }|| j                   d   z  dk7  rFddd| j                   d   || j                   d   z  z
  f}t        j                  j                  ||      }|S )Nr   r   )rR   r   rc   pad)rT   rm   r6   r7   
pad_valuess        r!   	maybe_padz"DonutSwinPatchEmbeddings.maybe_pad   s    4??1%%*T__Q/%$//!:L2LLMJ==,,\:FLDOOA&&!+Q4??1#5QRAS8S#STJ==,,\:FLr    rm   NrX   c                     |j                   \  }}}}| j                  |||      }| j                  |      }|j                   \  }}}}||f}|j                  d      j	                  dd      }||fS )Nr*   r   )r/   r   r   flatten	transpose)rT   rm   rs   r8   r6   r7   rW   rt   s           r!   rx   z DonutSwinPatchEmbeddings.forward  s}    )5););&<~~lFEB__\2
(..1fe#UO''*44Q:
,,,r    )r   r   r   r   rA   r   r   r   r   rz   r{   rx   r~   r   s   @r!   rB   rB      sF    j	-E$5$5$< 	-u||UZ[^U_G_A` 	-r    rB   c            	            e Zd ZdZej
                  fdee   dedej                  ddf fdZ	d Z
d	ej                  d
eeef   dej                  fdZ xZS )DonutSwinPatchMerginga'  
    Patch Merging Layer.

    Args:
        input_resolution (`tuple[int]`):
            Resolution of input feature.
        dim (`int`):
            Number of input channels.
        norm_layer (`nn.Module`, *optional*, defaults to `nn.LayerNorm`):
            Normalization layer class.
    input_resolutionr_   
norm_layerrX   Nc                     t         |           || _        || _        t	        j
                  d|z  d|z  d      | _         |d|z        | _        y )Nr+   r*   Fbias)r@   rA   r   r_   r   Linear	reductionrN   )rT   r   r_   r   rV   s       r!   rA   zDonutSwinPatchMerging.__init__!  sI     01s7AG%@q3w'	r    c                     |dz  dk(  xs |dz  dk(  }|r.ddd|dz  d|dz  f}t         j                  j                  ||      }|S )Nr*   r   r   )r   rc   r   )rT   r3   r6   r7   
should_padr   s         r!   r   zDonutSwinPatchMerging.maybe_pad(  sU    qjAo:519>
Q519a!<JMM--mZHMr    r3   input_dimensionsc                    |\  }}|j                   \  }}}|j                  ||||      }| j                  |||      }|d d dd ddd dd d f   }|d d dd ddd dd d f   }	|d d dd ddd dd d f   }
|d d dd ddd dd d f   }t        j                  ||	|
|gd      }|j                  |dd|z        }| j                  |      }| j                  |      }|S )Nr   r*   r   r-   r+   )r/   r0   r   r   re   rN   r   )rT   r3   r   r6   r7   r5   r_   r8   input_feature_0input_feature_1input_feature_2input_feature_3s               r!   rx   zDonutSwinPatchMerging.forward0  s   ((5(;(;%
C%**:vulS}feD'14a4Aq(89'14a4Aq(89'14a4Aq(89'14a4Aq(89		?O_Ve"fhjk%**:r1|;KL		-0}5r    )r   r   r   r   r   rM   r   r{   ModulerA   r   r   rz   rx   r~   r   s   @r!   r   r     sr    
 XZWcWc (s (# (299 (hl (U\\ U3PS8_ Y^YeYe r    r   input	drop_probtrainingrX   c                    |dk(  s|s| S d|z
  }| j                   d   fd| j                  dz
  z  z   }|t        j                  || j                  | j
                        z   }|j                          | j                  |      |z  }|S )zc
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

            r   r   )r   dtypedevice)r/   ndimr   randr   r   floor_div)r   r   r   	keep_probr/   random_tensoroutputs          r!   	drop_pathr   K  s    
 CxII[[^

Q 77E

5ELL YYMYYy!M1FMr    c                   x     e Zd ZdZd	dedz  ddf fdZdej                  dej                  fdZde	fdZ
 xZS )
DonutSwinDropPathzXDrop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).Nr   rX   c                 0    t         |           || _        y N)r@   rA   r   )rT   r   rV   s     r!   rA   zDonutSwinDropPath.__init__^  s    "r    r   c                 D    t        || j                  | j                        S r   )r   r   r   rT   r   s     r!   rx   zDonutSwinDropPath.forwardb  s    FFr    c                      d| j                    S )Nzp=)r   rT   s    r!   
extra_reprzDonutSwinDropPath.extra_repre  s    DNN#$$r    r   )r   r   r   r   floatrA   r   rz   rx   strr   r~   r   s   @r!   r   r   [  sG    b#%$, #$ #GU\\ Gell G%C %r    r   c            
            e Zd Z fdZ	 	 d	dej
                  dej                  dz  dedz  deej
                     fdZ	d Z
 xZS )
DonutSwinSelfAttentionc                    t         |           ||z  dk7  rt        d| d| d      || _        t	        ||z        | _        | j                  | j
                  z  | _        t        |t        j                  j                        r|n||f| _        t        j                  t        j                  d| j                  d   z  dz
  d| j                  d   z  dz
  z  |            | _        | j#                  d| j%                                t        j&                  | j                  | j                  |j(                        | _        t        j&                  | j                  | j                  |j(                        | _        t        j&                  | j                  | j                  |j(                        | _        t        j0                  |j2                        | _        y )	Nr   zThe hidden size (z6) is not a multiple of the number of attention heads ()r*   r   relative_position_indexr   )r@   rA   
ValueErrornum_attention_headsr{   attention_head_sizeall_head_sizer   r   r   r   r4   r   rG   r   rH   relative_position_bias_tableregister_buffercreate_relative_position_indexr   qkv_biasquerykeyvaluerO   attention_probs_dropout_probrQ   rT   rS   r_   	num_headsr4   rV   s        r!   rA   zDonutSwinSelfAttention.__init__k  s   ?a#C5(^_h^iijk  $- #&sY#7 !558P8PP%k;??3K3KLKS^`kRl 	 -/LLKKT--a0014T=M=Ma=P9PST9TUW`a-
) 	68[8[8]^YYt1143E3EFOO\
99T//1C1C&//ZYYt1143E3EFOO\
zz&"E"EFr    Nr   attention_maskoutput_attentionsrX   c                    |j                   \  }}}||d| j                  f}| j                  |      j                  |      j	                  dd      }| j                  |      j                  |      j	                  dd      }	| j                  |      j                  |      j	                  dd      }
t        j                  ||	j	                  dd            }|t        j                  | j                        z  }| j                  | j                  j                  d         }|j                  | j                  d   | j                  d   z  | j                  d   | j                  d   z  d      }|j                  ddd      j                         }||j!                  d      z   }|r|j                   d   }|j                  ||z  || j"                  ||      }||j!                  d      j!                  d      z   }|j                  d| j"                  ||      }t$        j&                  j)                  |d      }| j+                  |      }t        j                  ||
      }|j                  dddd      j                         }|j-                         d d | j.                  fz   }|j                  |      }|r||f}|S |f}|S )Nr-   r   r*   r   r^   r   )r/   r   r   r0   r   r   r   r   matmulmathsqrtr   r   r4   r1   r2   rq   r   r   rc   softmaxrQ   r[   r   )rT   r   r   r   r5   r_   r8   hidden_shapequery_layer	key_layervalue_layerattention_scoresrelative_position_bias
mask_shapeattention_probscontext_layernew_context_layer_shapeoutputss                     r!   rx   zDonutSwinSelfAttention.forward  s    )6(;(;%
C"CT-E-EFjj/44\BLLQPQRHH]+00>HHAN	jj/44\BLLQPQR !<<Y5H5HR5PQ+dii8P8P.QQ!%!B!B4C_C_CdCdegCh!i!7!<!<Q$"2"21"55t7G7G7JTM]M]^_M`7`bd"
 "8!?!?1a!H!S!S!U+.D.N.Nq.QQ%'--a0J/44j(*d6N6NPSUX   0.2J2J12M2W2WXY2ZZ/44R9Q9QSVX[\ --//0@b/I ,,7_kB%--aAq9DDF"/"4"4"6s";t?Q?Q>S"S%**+BC6G=/2 O\M]r    c                    t        j                  | j                  d         }t        j                  | j                  d         }t        j                  t        j                  ||gd            }t        j
                  |d      }|d d d d d f   |d d d d d f   z
  }|j                  ddd      j                         }|d d d d dfxx   | j                  d   dz
  z  cc<   |d d d d dfxx   | j                  d   dz
  z  cc<   |d d d d dfxx   d| j                  d   z  dz
  z  cc<   |j                  d      }|S )Nr   r   ij)indexingr*   r-   )	r   aranger4   stackmeshgridr   r1   r2   sum)rT   coords_hcoords_wcoordscoords_flattenrelative_coordsr   s          r!   r   z5DonutSwinSelfAttention.create_relative_position_index  s-   << 0 0 34<< 0 0 34U^^Xx,@4PQvq1(At4~aqj7QQ)11!Q:EEG1a D$4$4Q$7!$;; 1a D$4$4Q$7!$;; 1a A(8(8(;$;a$?? "1"5"5b"9&&r    ry   )r   r   r   rA   r   rz   r   r}   r   rx   r   r~   r   s   @r!   r   r   j  s^    G: 48).	1||1 ))D01  $;	1
 
u||	1f'r    r   c                   n     e Zd Z fdZdej
                  dej
                  dej
                  fdZ xZS )DonutSwinSelfOutputc                     t         |           t        j                  ||      | _        t        j
                  |j                        | _        y r   )r@   rA   r   r   denserO   r   rQ   rT   rS   r_   rV   s      r!   rA   zDonutSwinSelfOutput.__init__  s6    YYsC(
zz&"E"EFr    r   input_tensorrX   c                 J    | j                  |      }| j                  |      }|S r   r   rQ   )rT   r   r   s      r!   rx   zDonutSwinSelfOutput.forward  s$    

=1]3r    r   r   r   rA   r   rz   rx   r~   r   s   @r!   r   r     s2    G
U\\  RWR^R^ r    r   c            
            e Zd Z fdZ	 	 ddej
                  dej                  dz  dedz  deej
                     fdZ	 xZ
S )	DonutSwinAttentionc                 j    t         |           t        ||||      | _        t	        ||      | _        y r   )r@   rA   r   rT   r   r   r   s        r!   rA   zDonutSwinAttention.__init__  s.    *63	;O	)&#6r    Nr   r   r   rX   c                 h    | j                  |||      }| j                  |d   |      }|f|dd  z   }|S )Nr   r   )rT   r   )rT   r   r   r   self_outputsattention_outputr   s          r!   rx   zDonutSwinAttention.forward  sE     yy@QR;;|AF#%QR(88r    ry   )r   r   r   rA   r   rz   r   r}   r   rx   r~   r   s   @r!   r   r     sW    7 48).		||	 ))D0	  $;		
 
u||		r    r   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )DonutSwinIntermediatec                    t         |           t        j                  |t	        |j
                  |z              | _        t        |j                  t              rt        |j                     | _        y |j                  | _        y r   )r@   rA   r   r   r{   	mlp_ratior   r   
hidden_actr   r   intermediate_act_fnr   s      r!   rA   zDonutSwinIntermediate.__init__  sa    YYsC(8(83(>$?@
f''-'-f.?.?'@D$'-'8'8D$r    r   rX   c                 J    | j                  |      }| j                  |      }|S r   )r   r  r   s     r!   rx   zDonutSwinIntermediate.forward  s&    

=100?r    r   r   s   @r!   r  r    s#    9U\\ ell r    r  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )DonutSwinOutputc                     t         |           t        j                  t	        |j
                  |z        |      | _        t        j                  |j                        | _	        y r   )
r@   rA   r   r   r{   r  r   rO   rP   rQ   r   s      r!   rA   zDonutSwinOutput.__init__  sF    YYs6#3#3c#9:C@
zz&"<"<=r    r   rX   c                 J    | j                  |      }| j                  |      }|S r   r   r   s     r!   rx   zDonutSwinOutput.forward  s$    

=1]3r    r   r   s   @r!   r	  r	    s#    >
U\\ ell r    r	  c                        e Zd Zd fd	Zd Zd Zd Z	 	 ddej                  de	e
e
f   dedz  d	edz  d
e	ej                  ej                  f   f
dZ xZS )DonutSwinLayerc                    t         |           |j                  | _        || _        |j                  | _        || _        t        j                  ||j                        | _	        t        |||| j                        | _        |dkD  rt        |      nt        j                         | _        t        j                  ||j                        | _        t!        ||      | _        t%        ||      | _        y )N)eps)r4   r   )r@   rA   chunk_size_feed_forward
shift_sizer4   r   r   rM   layer_norm_epslayernorm_beforer   	attentionr   Identityr   layernorm_afterr  intermediater	  r   )rT   rS   r_   r   r   drop_path_rater  rV   s          r!   rA   zDonutSwinLayer.__init__  s    '-'E'E$$!-- 0 "Sf6K6K L+FCPTP`P`a>Ls>R*>:XZXcXcXe!||CV5J5JK1&#>%fc2r    c                    t        |      | j                  k  rgt        d      | _        t        j
                  j                         r(t	        j                   t	        j                  |            n
t        |      | _        y y Nr   )minr4   r   r  r   r`   ra   tensor)rT   r   s     r!   set_shift_and_window_sizez(DonutSwinLayer.set_shift_and_window_size  s\     D$4$44'lDO=BYY=Q=Q=S		%,,'789Y\]mYn  5r    c           	         | j                   dkD  rht        j                  d||df||      }t        d| j                         t        | j                   | j                          t        | j                    d       f}t        d| j                         t        | j                   | j                          t        | j                    d       f}d}|D ]  }	|D ]  }
||d d |	|
d d f<   |dz  }  t        || j                        }|j                  d| j                  | j                  z        }|j                  d      |j                  d      z
  }|j                  |dk7  d      j                  |dk(  d      }|S d }|S )Nr   r   r   r-   r*   g      Yr   )	r  r   rH   slicer4   r:   r0   rq   masked_fill)rT   r6   r7   r   r   img_maskheight_sliceswidth_slicescountheight_slicewidth_slicemask_windows	attn_masks                r!   get_attn_maskzDonutSwinLayer.get_attn_mask  s   ??Q{{Avua#8fUHa$***+t'''$//)9:t&-M a$***+t'''$//)9:t&-L
 E - #/ K@EHQk1<=QJE
 ,Hd6F6FGL',,R1A1ADDTDT1TUL$..q1L4J4J14MMI!--i1nfEQQR[_`R`befI  Ir    c                     | j                   || j                   z  z
  | j                   z  }| j                   || j                   z  z
  | j                   z  }ddd|d|f}t        j                  j                  ||      }||fS r  )r4   r   rc   r   )rT   r   r6   r7   	pad_right
pad_bottomr   s          r!   r   zDonutSwinLayer.maybe_pad8  s    %%0@0@(@@DDTDTT	&&$2B2B)BBdFVFVV
Ay!Z8
))-Dj((r    r   r   r   Nalways_partitionrX   c                    |s| j                  |       n	 |\  }}|j                         \  }}}	|}
| j                  |      }|j                  ||||	      }| j	                  |||      \  }}|j
                  \  }}}}| j                  dkD  r1t        j                  || j                   | j                   fd      }n|}t        || j                        }|j                  d| j                  | j                  z  |	      }| j                  |||j                  |j                        }| j                  |||      }|d   }|j                  d| j                  | j                  |	      }t        || j                  ||      }| j                  dkD  r/t        j                  || j                  | j                  fd      }n|}|d   dkD  xs |d   dkD  }|r|d d d |d |d d f   j!                         }|j                  |||z  |	      }|
| j#                  |      z   }| j%                  |      }| j'                  |      }|| j)                  |      z   }|r	||d	   f}|S |f}|S )
Nr   )r   r*   )shiftsdimsr-   r   )r   r   r,   r   )r  r[   r  r0   r   r/   r  r   rollr:   r4   r)  r   r   r  r<   r2   r   r  r  r   )rT   r   r   r   r-  r6   r7   r5   rs   channelsshortcutr   
height_pad	width_padshifted_hidden_stateshidden_states_windowsr(  attention_outputsr   attention_windowsshifted_windows
was_paddedlayer_outputlayer_outputss                           r!   rx   zDonutSwinLayer.forward?  s     **+;<("/"4"4"6
Ax --m<%**:vuhO %)NN=&%$P!z&3&9&9#:y!??Q$)JJ}tFVY]YhYhXhEipv$w!$1! !11FHXHX Y 5 : :2t?O?ORVRbRb?bdl m&&	)<)<EZEaEa ' 
	 !NN+@)_pNq,Q/,11"d6F6FHXHXZbc():D<L<LjZcd ??Q %

?DOOUYUdUdCelr s /]Q&;*Q-!*;
 1!WfWfufa2G H S S U-22:v~xX 4>>2C#DD++M:((6$t{{<'@@@Q'8';< YeWfr    )r   r   FF)r   r   r   rA   r  r)  r   r   rz   r   r{   r}   rx   r~   r   s   @r!   r  r    sz    38) */(->||>  S/>  $;	>
 +> 
u||U\\)	*>r    r  c                        e Zd Z fdZ	 	 d	dej
                  deeef   dedz  dedz  deej
                     f
dZ	 xZ
S )
DonutSwinStagec                 h   t         	|           || _        || _        t	        j
                  t        |      D cg c]-  }t        ||||||   |dz  dk(  rdn|j                  dz        / c}      | _	        |& |||t        j                        | _        d| _        y d | _        d| _        y c c}w )Nr*   r   )rS   r_   r   r   r  r  )r_   r   F)r@   rA   rS   r_   r   
ModuleListranger  r4   blocksrM   
downsamplepointing)
rT   rS   r_   r   depthr   r   rE  irV   s
            r!   rA   zDonutSwinStage.__init__  s    mm u
  !%5'#,Q<%&UaZqf6H6HA6M

 !()9sr||\DO  #DO'
s   2B/r   r   r   Nr-  rX   c                    |\  }}t        | j                        D ]  \  }} |||||      }	|	d   } |}
| j                  )|dz   dz  |dz   dz  }}||||f}| j                  |
|      }n||||f}||
|f}|r|	dd  z  }|S )Nr   r   r*   )	enumeraterD  rE  )rT   r   r   r   r-  r6   r7   rH  layer_moduler=  !hidden_states_before_downsamplingheight_downsampledwidth_downsampledrt   stage_outputss                  r!   rx   zDonutSwinStage.forward  s     )(5 	-OA|(8HJ[]mnM)!,M	-
 -:)??&5;aZA4EPQ	VWGW 1!'0BDU V OO,MO_`M!' >&(IK\]]12..Mr    r>  )r   r   r   rA   r   rz   r   r{   r}   rx   r~   r   s   @r!   r@  r@    sb    < */(-||  S/  $;	
 + 
u||	r    r@  c                        e Zd Z fdZ	 	 	 	 	 ddej
                  deeef   dedz  dedz  dedz  dedz  d	edz  d
ee	z  fdZ
 xZS )DonutSwinEncoderc                    t         |           t        |j                        | _        || _        t        j                  d|j                  t        |j                        d      D cg c]  }|j                          }}t        j                  t        | j                        D cg c]  }t        |t        |j                   d|z  z        |d   d|z  z  |d   d|z  z  f|j                  |   |j"                  |   |t        |j                  d |       t        |j                  d |dz           || j                  dz
  k  rt$        nd        c}      | _        d| _        y c c}w c c}w )Nr   cpu)r   r*   r   )rS   r_   r   rG  r   r   rE  F)r@   rA   lendepths
num_layersrS   r   linspacer  r   itemr   rB  rC  r@  r{   rI   r   r   layersgradient_checkpointing)rT   rS   rE   xdpri_layerrV   s         r!   rA   zDonutSwinEncoder.__init__  sM   fmm,!&63H3H#fmmJ\ej!klAqvvxllmm  %T__5  !F,,q'z9:&/lq'z&BIaLUVX_U_D`%a --0$..w7!#fmmHW&=">V]]S`U\_`U`EaAbc9@4??UVCV9V4]a
 ',#! ms   )E&(B*E+r   r   r   Noutput_hidden_states(output_hidden_states_before_downsamplingr-  return_dictrX   c                    |rdnd }|rdnd }	|rdnd }
|rE|j                   \  }}} |j                  |g|| }|j                  dddd      }||fz  }|	|fz  }	t        | j                        D ]  \  }} |||||      }|d   }|d   }|d   }|d   |d   f}|rP|rN|j                   \  }}} |j                  |g|d   |d   f| }|j                  dddd      }||fz  }|	|fz  }	nI|rG|sE|j                   \  }}} |j                  |g|| }|j                  dddd      }||fz  }|	|fz  }	|s|
|dd  z  }
 |st        d |||
fD              S t        |||
|		      S )
Nr   r   r   r   r*   r   r-   c              3   &   K   | ]	  }||  y wr   r   ).0vs     r!   	<genexpr>z+DonutSwinEncoder.forward.<locals>.<genexpr>  s     mq_`_lms   )r   r   r   r   )r/   r0   r1   rJ  rY  r   r   )rT   r   r   r   r^  r_  r-  r`  all_hidden_statesall_reshaped_hidden_statesall_self_attentionsr5   rs   r   reshaped_hidden_staterH  rK  r=  rL  rt   s                       r!   rx   zDonutSwinEncoder.forward  s0    #7BD+?RT"$5b4)6)<)<&J;$6M$6$6z$bDT$bVa$b!$9$A$A!Q1$M!-!11&+@*BB&(5 	9OA|(8HJ[]mnM)!,M0=a0@- -a 0 1" 57H7LM#(P-N-T-T*
A{ )O(I(N(N)"3A"68I!8L!M)OZ)% )>(E(EaAq(Q%!&G%II!*/D.FF*%.V-:-@-@*
A{(:(:(::(fHX(fZe(f%(=(E(EaAq(Q%!m%55!*/D.FF* #}QR'88#9	9< m]4EGZ$[mmm%++*#=	
 	
r    )FFFFT)r   r   r   rA   r   rz   r   r{   r}   r   rx   r~   r   s   @r!   rQ  rQ    s    ,4 */,1@E(-#'<
||<
  S/<
  $;	<

 #Tk<
 37+<
 +<
 D[<
 
'	'<
r    rQ  c                   h     e Zd ZU eed<   dZdZdZdZdgZ	 e
j                          fd       Z xZS )DonutSwinPreTrainedModelrS   donutrm   )imageTr@  c                    t         |   |       t        |t              rX|j                  t        j                  |j                         |j                   t        j                  |j                         yyt        |t              rNt        j                  |j                         t        j                  |j                  |j                                yy)zInitialize the weightsN)r@   _init_weightsr   r>   rJ   initzeros_rL   r   r   copy_r   r   )rT   modulerV   s     r!   ro  z&DonutSwinPreTrainedModel._init_weights  s     	f%f12  ,F--.))5F667 6 67KK;;<JJv55v7\7\7^_ 8r    )r   r   r   r   r   base_model_prefixmain_input_nameinput_modalitiessupports_gradient_checkpointing_no_split_modulesr   no_gradro  r~   r   s   @r!   rk  rk    sG     $O!&*#)*U]]_
` 
`r    rk  c                        e Zd Zd fd	Zd Ze	 	 	 	 	 	 ddej                  dz  dej                  dz  de	dz  de	dz  de	d	e	dz  d
e
ez  fd       Z xZS )DonutSwinModelc                    t         |   |       || _        t        |j                        | _        t        |j                  d| j
                  dz
  z  z        | _        t        ||      | _
        t        || j                  j                        | _        |rt        j                  d      nd| _        | j#                          y)z
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        use_mask_token (`bool`, *optional*, defaults to `False`):
            Whether to use a mask token for masked image modeling.
        r*   r   )rU   N)r@   rA   rS   rT  rU  rV  r{   rI   num_featuresr>   rW   rQ  rF   encoderr   AdaptiveAvgPool1dpooler	post_init)rT   rS   add_pooling_layerrU   rV   s       r!   rA   zDonutSwinModel.__init__)  s     	 fmm, 0 0119L3M MN-f^T'0J0JK1Bb**1- 	r    c                 .    | j                   j                  S r   )rW   rC   r   s    r!   get_input_embeddingsz#DonutSwinModel.get_input_embeddings=  s    ///r    Nrm   rn   r   r^  rl   r`  rX   c                    ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }|t	        d      | j                  |||      \  }}	| j                  ||	|||      }
|
d   }d}| j                  7| j                  |j                  dd            }t        j                  |d      }|s||f|
dd z   }|S t        |||
j                  |
j                  |
j                        S )	z
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
        Nz You have to specify pixel_values)rn   rl   )r   r^  r`  r   r   r*   )r   r$   r   r   r   )rS   r   r^  use_return_dictr   rW   r~  r  r   r   r   r#   r   r   r   )rT   rm   rn   r   r^  rl   r`  kwargsembedding_outputr   encoder_outputssequence_outputpooled_outputr   s                 r!   rx   zDonutSwinModel.forward@  s<    2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]?@@-1__/Tl .= .
** ,,/!5# ' 
 *!,;;" KK(A(A!Q(GHM!MM-;M%}58KKFM#-')77&11#2#I#I
 	
r    )TFNNNNFN)r   r   r   rA   r  r   r   r   r|   r}   r   r#   rx   r~   r   s   @r!   r{  r{  '  s    (0  2637)-,0).#'5
''$.5
 ))D05
  $;	5

 #Tk5
 #'5
 D[5
 
%	%5
 5
r    r{  a  
    DonutSwin Model transformer with an image classification head on top (a linear layer on top of the final hidden state of
    the [CLS] token) e.g. for ImageNet.

    <Tip>

        Note that it's possible to fine-tune DonutSwin on higher resolution images than the ones it has been trained on, by
        setting `interpolate_pos_encoding` to `True` in the forward of the model. This will interpolate the pre-trained
        position embeddings to the higher resolution.

    </Tip>
    c                        e Zd Z fdZe	 	 	 	 	 	 ddej                  dz  dej                  dz  dedz  dedz  dededz  d	e	e
z  fd
       Z xZS )DonutSwinForImageClassificationc                 >   t         |   |       |j                  | _        t        |      | _        |j                  dkD  r4t        j                  | j                  j                  |j                        nt        j                         | _	        | j                          y r  )r@   rA   
num_labelsr{  rl  r   r   r}  r  
classifierr  )rT   rS   rV   s     r!   rA   z(DonutSwinForImageClassification.__init__  sx      ++#F+
 FLEVEVYZEZBIIdjj--v/@/@A`b`k`k`m 	
 	r    Nrm   labelsr   r^  rl   r`  rX   c                 V   ||n| j                   j                  }| j                  |||||      }|d   }	| j                  |	      }
d}|| j	                  ||
| j                         }|s|
f|dd z   }||f|z   S |S t        ||
|j                  |j                  |j                        S )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        N)r   r^  rl   r`  r   r*   )r'   r(   r   r   r   )	rS   r  rl  r  loss_functionr&   r   r   r   )rT   rm   r  r   r^  rl   r`  r  r   r  r(   r'   r   s                r!   rx   z'DonutSwinForImageClassification.forward  s    " &1%<k$++B]B]**/!5%=#  
  
/%%ffdkkBDY,F)-)9TGf$EvE-!//))#*#A#A
 	
r    r  )r   r   r   rA   r   r   r   
LongTensorr}   r   r&   rx   r~   r   s   @r!   r  r  y  s       26*.)-,0).#',
''$.,
   4',
  $;	,

 #Tk,
 #',
 D[,
 
/	/,
 ,
r    r  )r{  rk  r  )r   F)5r   collections.abcr   r   dataclassesr   r   r    r   rp  activationsr   modeling_layersr   modeling_utilsr	   utilsr
   r   r   r   configuration_donut_swinr   
get_loggerr   loggerr   r#   r&   r:   r<   r   r>   rB   r   rz   r   r}   r   r   r   r   r   r  r	  r  r@  rQ  rk  r{  r  __all__r   r    r!   <module>r     sC  
   !   & ! 9 - D D 5 
		H	% H[ H H  H; H H& H[ H H,	Y-")) Y-z(-ryy (-X3BII 3nU\\ e T V[VbVb  %		 %Z'RYY Z'|
")) 
 &BII  	bii 	wRYY wv4/ 4pS
ryy S
l ` ` `, N
- N
 N
b <
&> <
<
~ \r    