
"""PyTorch LayoutLMv3 model."""

import collections
import math

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ... import initialization as init
from ...activations import ACT2FN
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import (
    BaseModelOutput,
    QuestionAnsweringModelOutput,
    SequenceClassifierOutput,
    TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import apply_chunking_to_forward
from ...utils import auto_docstring, logging, torch_int
from .configuration_layoutlmv3 import LayoutLMv3Config


logger = logging.get_logger(__name__)


class LayoutLMv3PatchEmbeddings(nn.Module):
    """LayoutLMv3 image (patch) embeddings. This class also automatically interpolates the position embeddings for varying
    image sizes."""

    def __init__(self, config):
        super().__init__()

        image_size = (
            config.input_size
            if isinstance(config.input_size, collections.abc.Iterable)
            else (config.input_size, config.input_size)
        )
        patch_size = (
            config.patch_size
            if isinstance(config.patch_size, collections.abc.Iterable)
            else (config.patch_size, config.patch_size)
        )
        self.patch_shape = (image_size[0] // patch_size[0], image_size[1] // patch_size[1])
        self.proj = nn.Conv2d(config.num_channels, config.hidden_size, kernel_size=patch_size, stride=patch_size)

    def forward(self, pixel_values, position_embedding=None):
        embeddings = self.proj(pixel_values)

        if position_embedding is not None:
            # interpolate the position embedding to the corresponding size of the input
            position_embedding = position_embedding.view(1, self.patch_shape[0], self.patch_shape[1], -1)
            position_embedding = position_embedding.permute(0, 3, 1, 2)
            patch_height, patch_width = embeddings.shape[2], embeddings.shape[3]
            position_embedding = F.interpolate(position_embedding, size=(patch_height, patch_width), mode="bicubic")
            embeddings = embeddings + position_embedding

        embeddings = embeddings.flatten(2).transpose(1, 2)
        return embeddings


class LayoutLMv3TextEmbeddings(nn.Module):
    """
    LayoutLMv3 text embeddings. Same as `RobertaEmbeddings` but with added spatial (layout) embeddings.
    """

    def __init__(self, config):
        super().__init__()
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)

        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

        self.register_buffer(
            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
        )

        self.padding_idx = config.pad_token_id
        self.position_embeddings = nn.Embedding(
            config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx
        )

        self.x_position_embeddings = nn.Embedding(config.max_2d_position_embeddings, config.coordinate_size)
        self.y_position_embeddings = nn.Embedding(config.max_2d_position_embeddings, config.coordinate_size)
        self.h_position_embeddings = nn.Embedding(config.max_2d_position_embeddings, config.shape_size)
        self.w_position_embeddings = nn.Embedding(config.max_2d_position_embeddings, config.shape_size)

    def calculate_spatial_position_embeddings(self, bbox):
        try:
            left_position_embeddings = self.x_position_embeddings(bbox[:, :, 0])
            upper_position_embeddings = self.y_position_embeddings(bbox[:, :, 1])
            right_position_embeddings = self.x_position_embeddings(bbox[:, :, 2])
            lower_position_embeddings = self.y_position_embeddings(bbox[:, :, 3])
        except IndexError as e:
            raise IndexError("The `bbox` coordinate values should be within 0-1000 range.") from e

        h_position_embeddings = self.h_position_embeddings(torch.clip(bbox[:, :, 3] - bbox[:, :, 1], 0, 1023))
        w_position_embeddings = self.w_position_embeddings(torch.clip(bbox[:, :, 2] - bbox[:, :, 0], 0, 1023))

        # LayoutLMv3 concatenates the six spatial embeddings (LayoutLM v1 summed them)
        spatial_position_embeddings = torch.cat(
            [
                left_position_embeddings,
                upper_position_embeddings,
                right_position_embeddings,
                lower_position_embeddings,
                h_position_embeddings,
                w_position_embeddings,
            ],
            dim=-1,
        )
        return spatial_position_embeddings

    def create_position_ids_from_input_ids(self, input_ids, padding_idx):
        """
        Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding
        symbols are ignored. This is modified from fairseq's `utils.make_positions`.
        """
        mask = input_ids.ne(padding_idx).int()
        incremental_indices = torch.cumsum(mask, dim=1).type_as(mask) * mask
        return incremental_indices.long() + padding_idx

    def create_position_ids_from_inputs_embeds(self, inputs_embeds):
        """
        We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.
        """
        input_shape = inputs_embeds.size()[:-1]
        sequence_length = input_shape[1]

        position_ids = torch.arange(
            self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device
        )
        return position_ids.unsqueeze(0).expand(input_shape)

    def forward(
        self,
        input_ids=None,
        bbox=None,
        token_type_ids=None,
        position_ids=None,
        inputs_embeds=None,
    ):
        if position_ids is None:
            if input_ids is not None:
                # Create the position ids from the input token ids. Any padded tokens remain padded.
                position_ids = self.create_position_ids_from_input_ids(input_ids, self.padding_idx).to(
                    input_ids.device
                )
            else:
                position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds)

        if input_ids is not None:
            input_shape = input_ids.size()
        else:
            input_shape = inputs_embeds.size()[:-1]

        if token_type_ids is None:
            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)

        if inputs_embeds is None:
            inputs_embeds = self.word_embeddings(input_ids)
        token_type_embeddings = self.token_type_embeddings(token_type_ids)

        embeddings = inputs_embeds + token_type_embeddings
        position_embeddings = self.position_embeddings(position_ids)
        embeddings += position_embeddings

        spatial_position_embeddings = self.calculate_spatial_position_embeddings(bbox)
        embeddings = embeddings + spatial_position_embeddings

        embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings
S )LayoutLMv3PreTrainedModelr)   
layoutlmv3)imagetextc                 h   t         |   |       t        |t              r| j                  j
                  r>t        j                  |j                         t        j                  |j                         t        |d      rGt        j                  |j                  |j                  |j                  |j                  f             yyt        |t              rZt        j                  |j                   t#        j$                  |j                   j&                  d         j)                  d             yy)zInitialize the weightsvisual_bboxr*   r/   rP   N)r   _init_weightsr   LayoutLMv3Modelr)   visual_embedinitzeros_	cls_token	pos_embedhasattrcopy_r   create_visual_bboxr2   rJ   rO   r_   r`   r6   rb   )r(   moduler+   s     r,   r   z'LayoutLMv3PreTrainedModel._init_weights   s     	f%fo.{{''F,,-F,,-v}-

6--v/H/HU[U`U`bhbmbmTn/H/op . 89JJv**ELL9L9L9R9RSU9V,W,^,^_f,gh :r-   )rC   rD   rE   r   __annotations__base_model_prefixinput_modalitiesr_   no_gradr   rG   rH   s   @r,   r   r      s1    $(U]]_
i 
ir-   r   c                   6     e Zd Z fdZddZ	 	 	 	 ddZ xZS )LayoutLMv3SelfAttentionc                    t         |           |j                  |j                  z  dk7  r2t	        |d      s&t        d|j                   d|j                   d      |j                  | _        t        |j                  |j                  z        | _        | j                  | j                  z  | _        t        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        t        j                  |j                        | _        |j"                  | _        |j$                  | _        y )Nr   embedding_sizezThe hidden size (z6) is not a multiple of the number of attention heads ())r   r   r&   num_attention_headsr   
ValueErrorr|   attention_head_sizeall_head_sizer#   Linearquerykeyvaluer[   attention_probs_dropout_probr]   has_relative_attention_biashas_spatial_attention_biasrk   s     r,   r   z LayoutLMv3SelfAttention.__init__   s8    : ::a?PVXhHi#F$6$6#7 8 445Q8 
 $*#=#= #&v'9'9F<V<V'V#W !558P8PPYYv1143E3EF
99V//1C1CDYYv1143E3EF
zz&"E"EF+1+M+M(*0*K*K'r-   c                     ||z  }|j                  d      j                  d      }||z
  |z  } t        j                  d      |      S )a  
        https://huggingface.co/papers/2105.13290 Section 2.4 Stabilization of training: Precision Bottleneck Relaxation
        (PB-Relax). A replacement of the original nn.Softmax(dim=-1)(attention_scores). Seems the new attention_probs
        will result in a slower speed and a little bias. Can use torch.allclose(standard_attention_probs,
        cogview_attention_probs, atol=1e-08) for comparison. The smaller atol (e.g., 1e-08), the better.
        r/   rm   )amaxr   r#   Softmax)r(   attention_scoresalphascaled_attention_scores	max_valuenew_attention_scoress         r,   cogview_attentionz)LayoutLMv3SelfAttention.cogview_attention   sT     #3U":+00b0:DDRH	 7) CuL!rzzb!"677r-   c                 x   |j                   \  }}}| j                  |      j                  |d| j                  | j                        j                  dd      }	| j                  |      j                  |d| j                  | j                        j                  dd      }
| j                  |      j                  |d| j                  | j                        j                  dd      }t        j                  |	t        j                  | j                        z  |
j                  dd            }| j                  r5| j                  r)|||z   t        j                  | j                        z  z  }n1| j                  r%||t        j                  | j                        z  z  }|||z   }| j                  |      }| j                  |      }t        j                  ||      }|j!                  dddd      j#                         }|j%                         d d | j&                  fz   } |j                  | }|r||f}|S |f}|S )Nr/   r   r0   r   r   )r6   r   r4   r   r   r:   r   r   r_   matmulmathsqrtr   r   r   r]   r5   
contiguousr2   r   )r(   hidden_statesattention_maskoutput_attentionsrel_pos
rel_2d_pos
batch_size
seq_length_query_layer	key_layervalue_layerr   attention_probscontext_layernew_context_layer_shapeoutputss                    r,   r@   zLayoutLMv3SelfAttention.forward   s    %2$7$7!
JJJ}%T*b$":":D<T<TUYq!_ 	 HH]#T*b$":":D<T<TUYq!_ 	 JJ}%T*b$":":D<T<TUYq!_ 	 !<<dii@X@X6Y(Y[d[n[noqsu[vw++0O0O:!54C[C[9\ \\--$))D4L4L*M MM%/.@ 001AB ,,7_kB%--aAq9DDF"/"4"4"6s";t?Q?Q>S"S***,CD6G=/2 O\M]r-   )    NFNN)rC   rD   rE   r   r   r@   rG   rH   s   @r,   r   r      s!    L(
8 7r-   r   c                   n     e Zd Z fdZdej
                  dej
                  dej
                  fdZ xZS )LayoutLMv3SelfOutputc                 (   t         |           t        j                  |j                  |j                        | _        t        j                  |j                  |j                        | _        t        j                  |j                        | _
        y NrM   )r   r   r#   r   r&   denserY   rZ   r[   r\   r]   rk   s     r,   r   zLayoutLMv3SelfOutput.__init__8  s`    YYv1163E3EF
f&8&8f>S>STzz&"<"<=r-   r   input_tensorreturnc                 r    | j                  |      }| j                  |      }| j                  ||z         }|S rA   r   r]   rY   r(   r   r   s      r,   r@   zLayoutLMv3SelfOutput.forward>  7    

=1]3}|'CDr-   rC   rD   rE   r   r_   Tensorr@   rG   rH   s   @r,   r   r   7  1    >U\\  RWR^R^ r-   r   c                   .     e Zd Z fdZ	 	 	 	 ddZ xZS )LayoutLMv3Attentionc                 b    t         |           t        |      | _        t	        |      | _        y rA   )r   r   r   r(   r   outputrk   s     r,   r   zLayoutLMv3Attention.__init__G  s&    +F3	*62r-   c                 n    | j                  |||||      }| j                  |d   |      }|f|dd  z   }|S )Nr   r   r   r   )r(   r   )	r(   r   r   r   r   r   self_outputsattention_outputr   s	            r,   r@   zLayoutLMv3Attention.forwardL  sV     yy! ! 
  ;;|AF#%QR(88r-   r   )rC   rD   rE   r   r@   rG   rH   s   @r,   r   r   F  s    3 r-   r   c                   4     e Zd Z fdZ	 	 	 	 ddZd Z xZS )LayoutLMv3Layerc                     t         |           |j                  | _        d| _        t	        |      | _        t        |      | _        t        |      | _	        y Nr   )
r   r   chunk_size_feed_forwardseq_len_dimr   	attentionLayoutLMv3IntermediateintermediateLayoutLMv3Outputr   rk   s     r,   r   zLayoutLMv3Layer.__init__b  sI    '-'E'E$,V426:&v.r-   c                     | j                  |||||      }|d   }|dd  }t        | j                  | j                  | j                  |      }	|	f|z   }|S )N)r   r   r   r   r   )r   r   feed_forward_chunkr   r   )
r(   r   r   r   r   r   self_attention_outputsr   r   layer_outputs
             r,   r@   zLayoutLMv3Layer.forwardj  sy     "&/! "0 "
 2!4(,0##T%A%A4CSCSUe
  /G+r-   c                 L    | j                  |      }| j                  ||      }|S rA   )r   r   )r(   r   intermediate_outputr   s       r,   r   z"LayoutLMv3Layer.feed_forward_chunk  s,    "//0@A{{#68HIr-   r   )rC   rD   rE   r   r@   r   rG   rH   s   @r,   r   r   a  s     / 4r-   r   c                   J     e Zd Z fdZddZd Zd Z	 	 	 	 	 	 	 	 ddZ xZS )LayoutLMv3Encoderc                    t         |           || _        t        j                  t        |j                        D cg c]  }t        |       c}      | _        d| _	        |j                  | _
        |j                  | _        | j                  rS|j                  | _        |j                  | _        t        j                  | j                  |j                  d      | _        | j                  r|j"                  | _        |j$                  | _        t        j                  | j$                  |j                  d      | _        t        j                  | j$                  |j                  d      | _        y y c c}w )NF)bias)r   r   r)   r#   
ModuleListrangenum_hidden_layersr   layergradient_checkpointingr   r   rel_pos_binsmax_rel_posr   r   rel_pos_biasmax_rel_2d_posrel_2d_pos_binsrel_pos_x_biasrel_pos_y_bias)r(   r)   r   r+   s      r,   r   zLayoutLMv3Encoder.__init__  s   ]]U6KcKcEd#eOF$;#ef
&+#+1+M+M(*0*K*K'++ & 3 3D%11D "		$*;*;V=W=W^c dD**"("7"7D#)#9#9D "$))D,@,@&B\B\ch"iD"$))D,@,@&B\B\ch"iD	 + $fs   E5c                 6   d}|r4|dz  }||dkD  j                         |z  z  }t        j                  |      }n*t        j                  | t        j                  |            }|dz  }||k  }|t        j
                  |j                         |z        t        j
                  ||z        z  ||z
  z  j                  t        j                         z   }	t        j                  |	t        j                  |	|dz
              }	|t        j                  |||	      z  }|S )Nr   r0   r   )r   r_   absmax
zeros_likelogfloatr   r   min	full_likewhere)
r(   relative_positionbidirectionalnum_bucketsmax_distanceretn	max_exactis_smallval_if_larges
             r,   relative_position_bucketz*LayoutLMv3Encoder.relative_position_bucket  s   AK%)//1K??C		+,A		,,e.>.>?P.QRA  1$	y= !IIaggi)+,txxy8P/QQU`clUlm
"UZZ. yyu|[[\_/]^u{{8Q55
r-   c                    |j                  d      |j                  d      z
  }| j                  || j                  | j                        }t	        j
                         5  | j                  j                  j                         |   j                  dddd      }d d d        |j                         }|S # 1 sw Y   xY w)Nr   r/   r  r  r   r   r   r0   )r   r   r  r  r_   r   r	  weighttr5   r   )r(   rO   rel_pos_matr   s       r,   _cal_1d_pos_embz!LayoutLMv3Encoder._cal_1d_pos_emb  s    ",,R0<3I3I"3MM//)))) 0 
 ]]_ 	P''..0027;CCAq!QOG	P$$&	P 	Ps    :B44B=c                    |d d d d df   }|d d d d df   }|j                  d      |j                  d      z
  }|j                  d      |j                  d      z
  }| j                  || j                  | j                        }| j                  || j                  | j                        }t	        j
                         5  | j                  j                  j                         |   j                  dddd      }| j                  j                  j                         |   j                  dddd      }d d d        |j                         }|j                         }||z   }|S # 1 sw Y   0xY w)Nr   r   r   r/   r"  r   r0   )r   r   r  r
  r_   r   r  r#  r$  r5   r  r   )	r(   rr   position_coord_xposition_coord_yrel_pos_x_2d_matrel_pos_y_2d_mat	rel_pos_x	rel_pos_yr   s	            r,   _cal_2d_pos_embz!LayoutLMv3Encoder._cal_2d_pos_emb  st   1a=1a=+55b9<L<V<VWY<ZZ+55b9<L<V<VWY<ZZ11,,,, 2 
	
 11,,,, 2 
	 ]]_ 	V++22446yAII!QPQSTUI++22446yAII!QPQSTUI	V ((*	((*	*
	V 	Vs   A3E%%E.c
                 v   |rdnd }
|rdnd }| j                   r| j                  |      nd }| j                  r| j                  |      nd }t	        | j
                        D ]+  \  }}|r|
|fz   }
 ||||||      }|d   }|s#||d   fz   }- |r|
|fz   }
|st        d ||
|fD              S t        ||
|      S )N r   r   r   c              3   $   K   | ]  }|| 
 y wrA   r0  ).0vs     r,   	<genexpr>z,LayoutLMv3Encoder.forward.<locals>.<genexpr>  s      
 = s   last_hidden_stater   
attentions)r   r&  r   r.  	enumerater  tupler
   )r(   r   rr   r   r   output_hidden_statesreturn_dictrO   r>   r?   all_hidden_statesall_self_attentionsr   r   ilayer_modulelayer_outputss                    r,   r@   zLayoutLMv3Encoder.forward  s    #7BD$5b48<8X8X$&&|4^b373R3RT))$/X\
(4 	POA|#$58H$H!(!%M *!,M &9]1=M<O&O#	P    1]4D D  "%'   ++*
 	
r-   )Tr      )NNFFTNNN)	rC   rD   rE   r   r   r&  r.  r@   rG   rH   s   @r,   r   r     s7    j(."< "3
r-   r   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )r   c                    t         |           t        j                  |j                  |j
                        | _        t        |j                  t              rt        |j                     | _        y |j                  | _        y rA   )r   r   r#   r   r&   intermediate_sizer   r   
hidden_actstrr   intermediate_act_fnrk   s     r,   r   zLayoutLMv3Intermediate.__init__  s]    YYv1163K3KL
f''-'-f.?.?'@D$'-'8'8D$r-   r   r   c                 J    | j                  |      }| j                  |      }|S rA   )r   rG  )r(   r   s     r,   r@   zLayoutLMv3Intermediate.forward"  s&    

=100?r-   r   rH   s   @r,   r   r     s#    9U\\ ell r-   r   c                   n     e Zd Z fdZdej
                  dej
                  dej
                  fdZ xZS )r   c                 (   t         |           t        j                  |j                  |j
                        | _        t        j                  |j
                  |j                        | _        t        j                  |j                        | _        y r   )r   r   r#   r   rD  r&   r   rY   rZ   r[   r\   r]   rk   s     r,   r   zLayoutLMv3Output.__init__*  s`    YYv779K9KL
f&8&8f>S>STzz&"<"<=r-   r   r   r   c                 r    | j                  |      }| j                  |      }| j                  ||z         }|S rA   r   r   s      r,   r@   zLayoutLMv3Output.forward0  r   r-   r   rH   s   @r,   r   r   )  r   r-   r   c                   `    e Zd Z fdZd Zd ZddZd Zd Ze		 	 	 	 	 	 	 	 	 	 dde
j                  dz  d	e
j                  dz  d
e
j                  dz  de
j                  dz  de
j                  dz  de
j                  dz  de
j                  dz  dedz  dedz  dedz  deez  fd       Z xZS )r   c                 @   t         |   |       || _        |j                  rt	        |      | _        |j                  rt        |      | _        t        |j                  |j                  z        | _        t        j                  t        j                   dd|j"                              | _        t        j                  t        j                   d| j                  | j                  z  dz   |j"                              | _        t        j(                  d      | _        t        j,                  |j"                  |j.                        | _        t        j(                  |j0                        | _        | j                  j4                  s| j                  j6                  r:| j9                  d| j;                  | j                  | j                  f      d       t        j,                  |j"                  d	      | _        t?        |      | _         | jC                          y )
Nr   g        )prM   r   r   FrQ   gư>)"r   r   r)   
text_embedrJ   r=   r   r   patch_embedr|   r   r!   r2   r#   	Parameterr_   r   r&   r   r   r[   pos_droprY   rZ   r\   r]   r   r   r^   r   normr   encoder	post_initrk   s     r,   r   zLayoutLMv3Model.__init__9  s    6v>DO  9@DF--0A0AABDI\\%++aF<N<N*OPDN\\%++aTYY9NQR9RTZTfTf*ghDNJJ-DM\\&*<*<&BWBWXDN::f&@&@ADL{{66$++:`:`$$!4#:#:tyyRVR[R[F\#:#]jo %  V%7%7TBDI(0r-   c                 .    | j                   j                  S rA   r=   rV   r(   s    r,   get_input_embeddingsz$LayoutLMv3Model.get_input_embeddingsX  s    ...r-   c                 &    || j                   _        y rA   rW  r(   r   s     r,   set_input_embeddingsz$LayoutLMv3Model.set_input_embeddings[  s    */'r-   c           	         t        j                  t        j                  d||d   dz   z  |      |d   d      }t        j                  t        j                  d||d   dz   z  |      |d   d      }t        j                  |dd j	                  |d   d      |dd j	                  |d   d      j                  dd      |dd j	                  |d   d      |dd j	                  |d   d      j                  dd      gd      j                  dd      }t        j                  dd|dz
  |dz
  gg      }t        j                  ||gd      S )	zJ
        Create the bounding boxes for the visual (patch) tokens.
        r   r   trunc)rounding_modeNr/   rm      )	r_   divr`   stackrepeatr:   r4   tensorrq   )r(   r*   max_lenvisual_bbox_xvisual_bbox_yr   cls_token_boxs          r,   r   z"LayoutLMv3Model.create_visual_bbox^  s\    		LLGz!}q'897CZPQ]bi
 		LLGz!}q'897CZPQ]bi
 kkcr"))*Q-;cr"))*Q-;EEaKab!((A:ab!((A:DDQJ	 
 $r1+ 	 ueWq['A+&N%OPyy-51==r-   c                     | j                   j                  |dd      }|j                  |      j                  |      }|S r   )r   rc  r   type)r(   r   r   r   r   s        r,   calculate_visual_bboxz%LayoutLMv3Model.calculate_visual_bboxu  s;    &&--j!Q?!nnV,11%8r-   c                 6   | j                  |      }|j                         \  }}}| j                  j                  |dd      }t	        j
                  ||fd      }| j                  || j                  z   }| j                  |      }| j                  |      }|S )Nr/   r   rm   )	rP  r2   r   rb   r_   rq   r   rR  rS  )r(   r;   r=   r   seq_lenr   
cls_tokenss          r,   forward_imagezLayoutLMv3Model.forward_imagez  s    %%l3
 ",!2
GQ^^**:r2>
YY
J7Q?
 >>%#dnn4J]]:.
YYz*
r-   Nr   rr   r   r   rO   r   r;   r   r:  r;  r   c                 	   ||n| j                   j                  }|	|	n| j                   j                  }	|
|
n| j                   j                  }
|"|j	                         }|\  }}|j
                  }nL|%|j	                         dd }|\  }}|j
                  }n%|t        |      }|j
                  }nt        d      |||t        j                  |f|      }|&t        j                  t        j                  |      }|<t        j                  t        t              dgz         t        j                  |      }| j                  |||||      }dx}}dx}}|&t        |j                   d   | j                   j"                  z        t        |j                   d	   | j                   j"                  z        }}| j%                  |      }t        j                  ||j                   d
   ft        j                  |      }|t        j&                  ||gd
      }n|}| j                   j(                  s| j                   j*                  r| j                   j*                  r@| j-                  |t        j                  |      }|t        j&                  ||gd
      }n|}t        j.                  d|j                   d
   t        j                  |      j1                  |d
      }||Ut        j.                  dd
   |      j3                  d      }|j5                  |      }t        j&                  ||gd
      }n|}||t        j&                  |gd
      }n|}| j7                  |      }| j9                  |      }n| j                   j(                  s| j                   j*                  rc| j                   j*                  r|}| j                   j(                  r5| j                  j:                  dddd
   f   }|j=                  |      }|}| j?                  |dj@                        }| jC                  ||||||	|
||	      }|d   }|
s	|f|d
d z   S tE        ||jF                  |jH                        S )a  
        input_ids (`torch.LongTensor` of shape `(batch_size, token_sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Note that `sequence_length = token_sequence_length + patch_sequence_length + 1` where `1` is for [CLS]
            token. See `pixel_values` for `patch_sequence_length`.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        bbox (`torch.LongTensor` of shape `(batch_size, token_sequence_length, 4)`, *optional*):
            Bounding boxes of each input sequence tokens. Selected in the range `[0,
            config.max_2d_position_embeddings-1]`. Each bounding box should be a normalized version in (x0, y0, x1, y1)
            format, where (x0, y0) corresponds to the position of the upper left corner in the bounding box, and (x1,
            y1) represents the position of the lower right corner.

            Note that `sequence_length = token_sequence_length + patch_sequence_length + 1` where `1` is for [CLS]
            token. See `pixel_values` for `patch_sequence_length`.
        token_type_ids (`torch.LongTensor` of shape `(batch_size, token_sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            Note that `sequence_length = token_sequence_length + patch_sequence_length + 1` where `1` is for [CLS]
            token. See `pixel_values` for `patch_sequence_length`.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`torch.LongTensor` of shape `(batch_size, token_sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            Note that `sequence_length = token_sequence_length + patch_sequence_length + 1` where `1` is for [CLS]
            token. See `pixel_values` for `patch_sequence_length`.

            [What are position IDs?](../glossary#position-ids)
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, token_sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
            model's internal embedding lookup matrix.

        Examples:

        ```python
        >>> from transformers import AutoProcessor, AutoModel
        >>> from datasets import load_dataset

        >>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)
        >>> model = AutoModel.from_pretrained("microsoft/layoutlmv3-base")

        >>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train")
        >>> example = dataset[0]
        >>> image = example["image"]
        >>> words = example["tokens"]
        >>> boxes = example["bboxes"]

        >>> encoding = processor(image, words, boxes=boxes, return_tensors="pt")

        >>> outputs = model(**encoding)
        >>> last_hidden_states = outputs.last_hidden_state
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is not None:
            input_shape = input_ids.size()
            batch_size, seq_length = input_shape
            device = input_ids.device
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
            batch_size, seq_length = input_shape
            device = inputs_embeds.device
        elif pixel_values is not None:
            batch_size = len(pixel_values)
            device = pixel_values.device
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds or pixel_values")

        if input_ids is not None or inputs_embeds is not None:
            if attention_mask is None:
                attention_mask = torch.ones((batch_size, seq_length), device=device)
            if token_type_ids is None:
                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
            if bbox is None:
                bbox = torch.zeros(tuple(list(input_shape) + [4]), dtype=torch.long, device=device)

            embedding_output = self.embeddings(
                input_ids=input_ids,
                bbox=bbox,
                position_ids=position_ids,
                token_type_ids=token_type_ids,
                inputs_embeds=inputs_embeds,
            )

        final_bbox = final_position_ids = None
        patch_height = patch_width = None
        if pixel_values is not None:
            patch_height, patch_width = (
                torch_int(pixel_values.shape[2] / self.config.patch_size),
                torch_int(pixel_values.shape[3] / self.config.patch_size),
            )
            visual_embeddings = self.forward_image(pixel_values)
            visual_attention_mask = torch.ones(
                (batch_size, visual_embeddings.shape[1]), dtype=torch.long, device=device
            )
            if attention_mask is not None:
                attention_mask = torch.cat([attention_mask, visual_attention_mask], dim=1)
            else:
                attention_mask = visual_attention_mask

            if self.config.has_relative_attention_bias or self.config.has_spatial_attention_bias:
                if self.config.has_spatial_attention_bias:
                    visual_bbox = self.calculate_visual_bbox(device, dtype=torch.long, batch_size=batch_size)
                    if bbox is not None:
                        final_bbox = torch.cat([bbox, visual_bbox], dim=1)
                    else:
                        final_bbox = visual_bbox

                visual_position_ids = torch.arange(
                    0, visual_embeddings.shape[1], dtype=torch.long, device=device
                ).repeat(batch_size, 1)
                if input_ids is not None or inputs_embeds is not None:
                    position_ids = torch.arange(0, input_shape[1], device=device).unsqueeze(0)
                    position_ids = position_ids.expand(input_shape)
                    final_position_ids = torch.cat([position_ids, visual_position_ids], dim=1)
                else:
                    final_position_ids = visual_position_ids

            if input_ids is not None or inputs_embeds is not None:
                embedding_output = torch.cat([embedding_output, visual_embeddings], dim=1)
            else:
                embedding_output = visual_embeddings

            embedding_output = self.LayerNorm(embedding_output)
            embedding_output = self.dropout(embedding_output)
        elif self.config.has_relative_attention_bias or self.config.has_spatial_attention_bias:
            if self.config.has_spatial_attention_bias:
                final_bbox = bbox
            if self.config.has_relative_attention_bias:
                position_ids = self.embeddings.position_ids[:, : input_shape[1]]
                position_ids = position_ids.expand_as(input_ids)
                final_position_ids = position_ids

        extended_attention_mask = self.get_extended_attention_mask(
            attention_mask, None, dtype=embedding_output.dtype
        )

        encoder_outputs = self.encoder(
            embedding_output,
            bbox=final_bbox,
            position_ids=final_position_ids,
            attention_mask=extended_attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            patch_height=patch_height,
            patch_width=patch_width,
        )

        sequence_output = encoder_outputs[0]

        if not return_dict:
            return (sequence_output,) + encoder_outputs[1:]

        return BaseModelOutput(
            last_hidden_state=sequence_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


class LayoutLMv3ClassificationHead(nn.Module):
    """
    Head for sentence-level classification tasks. Reference: RobertaClassificationHead
    """

    def __init__(self, config, pool_feature=False):
        super().__init__()
        self.pool_feature = pool_feature
        if pool_feature:
            self.dense = nn.Linear(config.hidden_size * 3, config.hidden_size)
        else:
            self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        classifier_dropout = (
            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
        )
        self.dropout = nn.Dropout(classifier_dropout)
        self.out_proj = nn.Linear(config.hidden_size, config.num_labels)

    def forward(self, x):
        x = self.dropout(x)
        x = self.dense(x)
        x = torch.tanh(x)
        x = self.dropout(x)
        x = self.out_proj(x)
        return x


@auto_docstring(
    custom_intro="""
    LayoutLMv3 Model with a token classification head on top (a linear layer on top of the final hidden states) e.g.
    for sequence labeling (information extraction) tasks such as [FUNSD](https://guillaumejaume.github.io/FUNSD/),
    [SROIE](https://rrc.cvc.uab.es/?ch=13), [CORD](https://github.com/clovaai/cord) and
    [Kleister-NDA](https://github.com/applicaai/kleister-nda).
    """
)
class LayoutLMv3ForTokenClassification(LayoutLMv3PreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.layoutlmv3 = LayoutLMv3Model(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        if config.num_labels < 10:
            self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        else:
            self.classifier = LayoutLMv3ClassificationHead(config, pool_feature=False)

        self.post_init()

    def get_input_embeddings(self):
        return self.layoutlmv3.get_input_embeddings()

    def set_input_embeddings(self, value):
        self.layoutlmv3.set_input_embeddings(value)

    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor | None = None,
        bbox: torch.LongTensor | None = None,
        attention_mask: torch.FloatTensor | None = None,
        token_type_ids: torch.LongTensor | None = None,
        position_ids: torch.LongTensor | None = None,
        inputs_embeds: torch.FloatTensor | None = None,
        labels: torch.LongTensor | None = None,
        output_attentions: bool | None = None,
        output_hidden_states: bool | None = None,
        return_dict: bool | None = None,
        pixel_values: torch.FloatTensor | None = None,
    ) -> tuple | TokenClassifierOutput:
        r"""
        bbox (`torch.LongTensor` of shape `(batch_size, sequence_length, 4)`, *optional*):
            Bounding boxes of each input sequence tokens. Selected in the range `[0,
            config.max_2d_position_embeddings-1]`. Each bounding box should be a normalized version in (x0, y0, x1, y1)
            format, where (x0, y0) corresponds to the position of the upper left corner in the bounding box, and (x1,
            y1) represents the position of the lower right corner.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.

        Examples:

        ```python
        >>> from transformers import AutoProcessor, AutoModelForTokenClassification
        >>> from datasets import load_dataset

        >>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)
        >>> model = AutoModelForTokenClassification.from_pretrained("microsoft/layoutlmv3-base", num_labels=7)

        >>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train")
        >>> example = dataset[0]
        >>> image = example["image"]
        >>> words = example["tokens"]
        >>> boxes = example["bboxes"]
        >>> word_labels = example["ner_tags"]

        >>> encoding = processor(image, words, boxes=boxes, word_labels=word_labels, return_tensors="pt")

        >>> outputs = model(**encoding)
        >>> loss = outputs.loss
        >>> logits = outputs.logits
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.layoutlmv3(
            input_ids,
            bbox=bbox,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            pixel_values=pixel_values,
        )
        if input_ids is not None:
            input_shape = input_ids.size()
        else:
            input_shape = inputs_embeds.size()[:-1]

        seq_length = input_shape[1]
        # only take the text part of the output representations
        sequence_output = outputs[0][:, :seq_length]
        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:
            output = (logits,) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


@auto_docstring
class LayoutLMv3ForQuestionAnswering(LayoutLMv3PreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.layoutlmv3 = LayoutLMv3Model(config)
        self.qa_outputs = LayoutLMv3ClassificationHead(config, pool_feature=False)

        self.post_init()

    def get_input_embeddings(self):
        return self.layoutlmv3.get_input_embeddings()

    def set_input_embeddings(self, value):
        self.layoutlmv3.set_input_embeddings(value)

    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor | None = None,
        attention_mask: torch.FloatTensor | None = None,
        token_type_ids: torch.LongTensor | None = None,
        position_ids: torch.LongTensor | None = None,
        inputs_embeds: torch.FloatTensor | None = None,
        start_positions: torch.LongTensor | None = None,
        end_positions: torch.LongTensor | None = None,
        output_attentions: bool | None = None,
        output_hidden_states: bool | None = None,
        return_dict: bool | None = None,
        bbox: torch.LongTensor | None = None,
        pixel_values: torch.FloatTensor | None = None,
    ) -> tuple | QuestionAnsweringModelOutput:
        r"""
        bbox (`torch.LongTensor` of shape `(batch_size, sequence_length, 4)`, *optional*):
            Bounding boxes of each input sequence tokens. Selected in the range `[0,
            config.max_2d_position_embeddings-1]`. Each bounding box should be a normalized version in (x0, y0, x1, y1)
            format, where (x0, y0) corresponds to the position of the upper left corner in the bounding box, and (x1,
            y1) represents the position of the lower right corner.

        Examples:

        ```python
        >>> from transformers import AutoProcessor, AutoModelForQuestionAnswering
        >>> from datasets import load_dataset
        >>> import torch

        >>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)
        >>> model = AutoModelForQuestionAnswering.from_pretrained("microsoft/layoutlmv3-base")

        >>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train")
        >>> example = dataset[0]
        >>> image = example["image"]
        >>> question = "what's his name?"
        >>> words = example["tokens"]
        >>> boxes = example["bboxes"]

        >>> encoding = processor(image, question, words, boxes=boxes, return_tensors="pt")
        >>> start_positions = torch.tensor([1])
        >>> end_positions = torch.tensor([3])

        >>> outputs = model(**encoding, start_positions=start_positions, end_positions=end_positions)
        >>> loss = outputs.loss
        >>> start_scores = outputs.start_logits
        >>> end_scores = outputs.end_logits
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.layoutlmv3(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            bbox=bbox,
            pixel_values=pixel_values,
        )

        sequence_output = outputs[0]

        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1).contiguous()
        end_logits = end_logits.squeeze(-1).contiguous()

        total_loss = None
        if start_positions is not None and end_positions is not None:
            # If we are on multi-GPU, split adds a dimension
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)
            # sometimes the start/end positions are outside our model inputs, we ignore these terms
            ignored_index = start_logits.size(1)
            start_positions = start_positions.clamp(0, ignored_index)
            end_positions = end_positions.clamp(0, ignored_index)

            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2

        if not return_dict:
            output = (start_logits, end_logits) + outputs[1:]
            return ((total_loss,) + output) if total_loss is not None else output

        return QuestionAnsweringModelOutput(
            loss=total_loss,
            start_logits=start_logits,
            end_logits=end_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
  


@auto_docstring(
    custom_intro="""
    LayoutLMv3 Model with a sequence classification head on top (a linear layer on top of the final hidden state of the
    [CLS] token) e.g. for document image classification tasks such as the
    [RVL-CDIP](https://www.cs.cmu.edu/~aharley/rvl-cdip/) dataset.
    """
)
class LayoutLMv3ForSequenceClassification(LayoutLMv3PreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.config = config
        self.layoutlmv3 = LayoutLMv3Model(config)
        self.classifier = LayoutLMv3ClassificationHead(config, pool_feature=False)

        self.post_init()

    def get_input_embeddings(self):
        return self.layoutlmv3.get_input_embeddings()

    def set_input_embeddings(self, value):
        self.layoutlmv3.set_input_embeddings(value)

    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor | None = None,
        attention_mask: torch.FloatTensor | None = None,
        token_type_ids: torch.LongTensor | None = None,
        position_ids: torch.LongTensor | None = None,
        inputs_embeds: torch.FloatTensor | None = None,
        labels: torch.LongTensor | None = None,
        output_attentions: bool | None = None,
        output_hidden_states: bool | None = None,
        return_dict: bool | None = None,
        bbox: torch.LongTensor | None = None,
        pixel_values: torch.FloatTensor | None = None,
    ) -> tuple | SequenceClassifierOutput:
        r"""
        bbox (`torch.LongTensor` of shape `(batch_size, sequence_length, 4)`, *optional*):
            Bounding boxes of each input sequence tokens. Selected in the range `[0,
            config.max_2d_position_embeddings-1]`. Each bounding box should be a normalized version in (x0, y0, x1, y1)
            format, where (x0, y0) corresponds to the position of the upper left corner in the bounding box, and (x1,
            y1) represents the position of the lower right corner.

        Examples:

        ```python
        >>> from transformers import AutoProcessor, AutoModelForSequenceClassification
        >>> from datasets import load_dataset
        >>> import torch

        >>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)
        >>> model = AutoModelForSequenceClassification.from_pretrained("microsoft/layoutlmv3-base")

        >>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train")
        >>> example = dataset[0]
        >>> image = example["image"]
        >>> words = example["tokens"]
        >>> boxes = example["bboxes"]

        >>> encoding = processor(image, words, boxes=boxes, return_tensors="pt")
        >>> sequence_label = torch.tensor([1])

        >>> outputs = model(**encoding, labels=sequence_label)
        >>> loss = outputs.loss
        >>> logits = outputs.logits
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.layoutlmv3(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            bbox=bbox,
            pixel_values=pixel_values,
        )

        sequence_output = outputs[0][:, 0, :]
        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        if not return_dict:
            output = (logits,) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


__all__ = [
    "LayoutLMv3ForQuestionAnswering",
    "LayoutLMv3ForSequenceClassification",
    "LayoutLMv3ForTokenClassification",
    "LayoutLMv3Model",
    "LayoutLMv3PreTrainedModel",
]