
    qi                      L   d Z ddlZddlmZ ddlmZ ddlZddlmZ ddlm	Z
 ddlmZ dd	lmZmZ dd
lmZ ddlmZ ddlmZ ddlmZ ddlmZmZmZmZ ddlmZmZ ddl m!Z! ddl"m#Z# ddl$m%Z%m&Z&m'Z'm(Z(m)Z)m*Z* ddl+m,Z, ddl-m.Z.m/Z/  e)j`                  e1      Z2e e'd       G d de%                    Z3dejh                  dz  dejh                  dz  dedz  fdZ5 e,ddd !      	 	 	 	 d]d"ed ejh                  d#ejh                  dz  d$ejh                  d%edz  d&ejh                  dz  dejh                  dz  d'ejl                  dz  d(e7d)e7dz  de8fd*       Z9 G d+ d,ejt                        Z; G d- d.ejt                        Z< G d/ d0ejt                        Z=d1e<iZ> G d2 d3ejt                        Z? G d4 d5ejt                        Z@ G d6 d7ejt                        ZA G d8 d9e      ZB G d: d;ejt                        ZCe' G d< d=e             ZD G d> d?ejt                        ZE G d@ dAejt                        ZF	 d^dBejt                  dCejh                  dDejh                  dEejh                  d#ejh                  dz  dFeGdGeGfdHZH G dI dJejt                        ZI G dK dLe      ZJ G dM dNejt                        ZK G dO dPejt                        ZL e'dQ       G dR dSeD             ZM G dT dUejt                        ZN e'dV       G dW dXeD             ZO e'dY       G dZ d[eDe             ZPg d\ZQy)_zPyTorch GIT model.    N)Callable)	dataclass)nn   )initialization)ACT2FN)CacheDynamicCache)PreTrainedConfig)GenerationMixin)create_masks_for_generate)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPastBaseModelOutputWithPoolingCausalLMOutputWithPast)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)apply_chunking_to_forward)ModelOutputTransformersKwargsauto_docstringcan_return_tuplelogging	torch_int)deprecate_kwarg   )	GitConfigGitVisionConfigz}
    Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states.
    )custom_introc                       e Zd ZU dZdZej                  dz  ed<   dZej                  dz  ed<   dZ	e
ej                  df   dz  ed<   dZe
ej                  df   dz  ed<   y)GitVisionModelOutputz
    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
        The image embeddings obtained by applying the projection layer to the pooler_output.
    Nimage_embedslast_hidden_state.hidden_states
attentions)__name__
__module____qualname____doc__r$   torchFloatTensor__annotations__r%   r&   tupler'        V/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/git/modeling_git.pyr#   r#   7   sr    
 .2L%##d*126u((4/6:>M5**C/047>7;Je'',-4;r1   r#   token_type_idsimage_group_idsreturnc           
      Z      ydt         dt         dt         dt         dt        f
 fd}|S )z
    This function adds the correct offsets to the `q_idx` and `kv_idx` as the torch API can only accept lengths,
    not start and end indices.
    N	batch_idxhead_idxq_idxkv_idxr5   c                 :   t        j                  |j                  d   k  |d      }t        j                  |j                  d   k  |d      }| |f   }t        j                  |j                  d   k  |d      }| |f   }t        j                  |j                  d   k  |d      }| |f   }t        j                  |j                  d   k  |d      }| |f   }	t        j                  |j                  d   k  |	d      }	|dk(  |dk(  z  }
||	k(  }|
|z  S )Nr   r   )r,   whereshape)r7   r8   r9   r:   
safe_q_idxsafe_kv_idxtoken_type_ids_at_q_idxtoken_type_ids_at_kv_idximage_group_ids_at_q_idximage_group_ids_at_kv_idxis_image_blocksame_image_blockr4   r3   s               r2   
inner_maskz0token_type_ids_mask_function.<locals>.inner_maskW   sM    [[)=)=a)@!@%K
kk&>+?+?+B"BFAN"0J1F"G"'++en6J6J16M.MOfhi"j#1)[2H#I #(;;v8L8LQ8O/OQikl#m #29j3H#I #(;;u7L7LQ7O/OQikm#n $3I{4J$K!$)KK9N9Nq9Q0QSlnp$q!1Q6;SWX;XY37PP  000r1   )intbool)r3   r4   rG   s   `` r2   token_type_ids_mask_functionrJ   K   s>     1c 1S 1 1c 1d 12 r1   input_embedsz5.6.0inputs_embeds)versionnew_nameconfigattention_maskcache_positionpast_key_valuesposition_idspixel_valuesis_trainingis_first_iterationc
                    |r|t        d      | j                         |||||d}|	|	n|du xs |j                   xs |du}	||	r|dk(  j                  |j                        }t
        j                  j                  |dd      ddddf   }|| z  }t        j                  |j                         d	      dz
  }t        j                  ||d      }t        |j                  |j                        |      |d
<   t        di |S )a  
    Overwrites the base `create_masks_for_generate` with `token_type_ids` masking to create the causal mask mapping
    for all kinds of forward passes. Gemma3 uses a bidirectional mask for images.

    Uses `pixel_values` as an optional input to disambiguate edge cases.
    Nz;`token_type_ids` is required as a model input when training)rO   rL   rP   rQ   rR   rS   r   )r   r   r   )valuer<   dimor_mask_functionr0   )
ValueErrorget_text_configis_initializedtodevicer   
functionalpadr,   cumsumrH   r=   rJ   r   )rO   rL   rP   rQ   rR   rS   r3   rT   rU   rV   kwargsmask_kwargsis_imageis_previous_imagenew_image_startr4   s                   r2   create_causal_mask_mappingri   s   s4   * ~-VWW ((*&((*$K ) 	%g_-K-K)Kg|cgOg 
 !&8 #a'++N,A,ABMM--ha-HCRCP"&7%77,,':':'<!DqH++hD*Fn334o+
&' %3{33r1   c                        e Zd ZdZ fdZ	 	 	 	 d
dej                  dz  dej                  dz  dej                  dz  dedej                  f
d	Z
 xZS )GitEmbeddingsz;Construct the embeddings from word and position embeddings.c                    t         |           t        j                  |j                  |j
                  |j                        | _        t        j                  |j                  |j
                        | _	        t        j                  |j
                  |j                        | _
        t        j                  |j                        | _        | j                  dt!        j"                  |j                        j%                  d      d       y )N)padding_idxepsrS   r   r<   F
persistent)super__init__r   	Embedding
vocab_sizehidden_sizepad_token_idword_embeddingsmax_position_embeddingsposition_embeddings	LayerNormlayer_norm_epsDropouthidden_dropout_probdropoutregister_bufferr,   arangeexpandselfrO   	__class__s     r2   rt   zGitEmbeddings.__init__   s    !||F,=,=v?Q?Q_e_r_rs#%<<0N0NPVPbPb#c f&8&8f>S>STzz&"<"<=ELL)G)GHOOPWXej 	 	
r1   N	input_idsrS   rL   past_key_values_lengthr5   c                 ,   ||j                         }n|j                         d d }|d   }|| j                  d d |||z   f   }|| j                  |      }n|}| j                  |      }||z  }| j	                  |      }| j                  |      }|S )Nr<   r   )sizerS   ry   r{   r|   r   )	r   r   rS   rL   r   input_shape
seq_length
embeddingsr{   s	            r2   forwardzGitEmbeddings.forward   s      #..*K',,.s3K ^
,,Q0FVlIl0l-lmL --i8J&J"66|D))
^^J/
\\*-
r1   )NNNr   )r(   r)   r*   r+   rt   r,   
LongTensorr-   rH   Tensorr   __classcell__r   s   @r2   rk   rk      ss    E

 .20426&'##d* &&- ((4/	
 !$ 
r1   rk   c                        e Zd Zd	 fd	Z	 	 	 d
dej
                  dej                  dz  dedz  dej
                  dz  deej
                     f
dZ	 xZ
S )GitSelfAttentionNc                    t         |           |j                  |j                  z  dk7  r2t	        |d      s&t        d|j                   d|j                   d      || _        |-t        j                  d| j                  j                   d       |j                  | _        t        |j                  |j                  z        | _        | j                  | j                  z  | _        t        |j                  j                  |j                  j                   z  dz  d	z         | _        |j$                  | xj"                  |j$                  z  c_        t'        j(                  |j                  | j                        | _        t'        j(                  |j                  | j                        | _        t'        j(                  |j                  | j                        | _        t'        j0                  |j2                        | _        y )
Nr   embedding_sizezThe hidden size (z6) is not a multiple of the number of attention heads ()zInstantiating z without passing a `layer_idx` is not recommended and will lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` when creating this class.   r   )rs   rt   rw   num_attention_headshasattrr\   	layer_idxloggerwarning_oncer   r(   rH   attention_head_sizeall_head_sizevision_config
image_size
patch_sizeimage_patch_tokensnum_image_with_embeddingr   LinearquerykeyrX   r~   attention_probs_dropout_probr   r   rO   r   r   s      r2   rt   zGitSelfAttention.__init__   s    : ::a?PVXhHi#F$6$6#7 8 445Q8  # !8!8 9 :, , $*#=#= #&v'9'9F<V<V'V#W !558P8PP"%v';';'F'FI]I]IhIh'hmn&nqr&r"s**6##v'F'FF#YYv1143E3EF
99V//1C1CDYYv1143E3EF
zz&"E"EFr1   r&   rP   rR   rQ   r5   c                    |j                   d   }| j                  |      j                  |d| j                  | j                        j                  dd      }| j                  |      j                  |d| j                  | j                        j                  dd      }| j                  |      j                  |d| j                  | j                        j                  dd      }|$|j                  ||| j                  d|i      \  }}t        j                  ||j                  dd            }	|	t        j                  | j                        z  }	||	|z   }	t        j                  j!                  |	d      }
| j#                  |
      }
t        j                  |
|      }|j%                  dddd	      j'                         }|j)                         d d | j*                  fz   }|j                  |      }||
fS )
Nr   r<   r   r   rQ   )cache_kwargsrY   r   )r>   r   viewr   r   	transposer   rX   updater   r,   matmulmathsqrtr   ra   softmaxr   permute
contiguousr   r   )r   r&   rP   rR   rQ   
batch_sizequery_layer	key_layervalue_layerattention_scoresattention_probscontext_layernew_context_layer_shapes                r2   r   zGitSelfAttention.forward   s    #((+
JJ}%T*b$":":D<T<TUYq!_ 	 HH]#T*b$":":D<T<TUYq!_ 	 JJ}%T*b$":":D<T<TUYq!_ 	
 &%4%;%;;FVXfEg &< &"I{
 !<<Y5H5HR5PQ+dii8P8P.QQ%/.@ --//0@b/I ,,7_kB%--aAq9DDF"/"4"4"6s";t?Q?Q>S"S%**+BCo--r1   N)NNN)r(   r)   r*   rt   r,   r   r-   r	   r/   r   r   r   s   @r2   r   r      sn    G> 48(,.22.||2. ))D02. 	2.
 t+2. 
u||	2.r1   r   c                   n     e Zd Z fdZdej
                  dej
                  dej
                  fdZ xZS )GitSelfOutputc                 (   t         |           t        j                  |j                  |j                        | _        t        j                  |j                  |j                        | _        t        j                  |j                        | _
        y Nrn   )rs   rt   r   r   rw   denser|   r}   r~   r   r   r   s     r2   rt   zGitSelfOutput.__init__.  s`    YYv1163E3EF
f&8&8f>S>STzz&"<"<=r1   r&   input_tensorr5   c                 r    | j                  |      }| j                  |      }| j                  ||z         }|S r   r   r   r|   r   r&   r   s      r2   r   zGitSelfOutput.forward4  7    

=1]3}|'CDr1   r(   r)   r*   rt   r,   r   r   r   r   s   @r2   r   r   -  1    >U\\  RWR^R^ r1   r   eagerc                        e Zd Zd
 fd	Z	 	 	 	 ddej
                  dej                  dz  dedz  dej
                  dz  dedz  de	ej
                     fd	Z
 xZS )GitAttentionNc                     t         |           t        |j                     ||      | _        t        |      | _        y )Nr   )rs   rt   GIT_SELF_ATTENTION_CLASSES_attn_implementationr   r   outputr   s      r2   rt   zGitAttention.__init__A  s4    .v/J/JKF^gh	#F+r1   r&   rP   rR   rQ   output_attentionsr5   c                 ^    | j                  ||||      \  }}| j                  ||      }||fS )N)rQ   )r   r   )	r   r&   rP   rR   rQ   r   attn_outputself_attn_weightsattention_outputs	            r2   r   zGitAttention.forwardF  sH     *.)	 *3 *
&&  ;;{MB!222r1   r   NNNF)r(   r)   r*   rt   r,   r   r-   r	   rI   r/   r   r   r   s   @r2   r   r   @  s~    , 48(,.2).3||3 ))D03 	3
 t+3  $;3 
u||	3r1   r   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )GitIntermediatec                    t         |           t        j                  |j                  |j
                        | _        t        |j                  t              rt        |j                     | _        y |j                  | _        y r   )rs   rt   r   r   rw   intermediate_sizer   
isinstance
hidden_actstrr   intermediate_act_fnr   s     r2   rt   zGitIntermediate.__init__Z  s]    YYv1163K3KL
f''-'-f.?.?'@D$'-'8'8D$r1   r&   r5   c                 J    | j                  |      }| j                  |      }|S r   )r   r   r   r&   s     r2   r   zGitIntermediate.forwardb  s&    

=100?r1   r   r   s   @r2   r   r   Y  s#    9U\\ ell r1   r   c                   n     e Zd Z fdZdej
                  dej
                  dej
                  fdZ xZS )	GitOutputc                 (   t         |           t        j                  |j                  |j
                        | _        t        j                  |j
                  |j                        | _        t        j                  |j                        | _        y r   )rs   rt   r   r   r   rw   r   r|   r}   r~   r   r   r   s     r2   rt   zGitOutput.__init__j  s`    YYv779K9KL
f&8&8f>S>STzz&"<"<=r1   r&   r   r5   c                 r    | j                  |      }| j                  |      }| j                  ||z         }|S r   r   r   s      r2   r   zGitOutput.forwardp  r   r1   r   r   s   @r2   r   r   i  r   r1   r   c                        e Zd Zd fd	Z	 	 	 	 ddej
                  dej                  dz  dedz  dej
                  dz  dedz  de	ej
                     fd	Z
d
 Z xZS )GitLayerNc                     t         |           |j                  | _        d| _        t	        ||      | _        t        |      | _        t        |      | _	        y )Nr   r   )
rs   rt   chunk_size_feed_forwardseq_len_dimr   	attentionr   intermediater   r   r   s      r2   rt   zGitLayer.__init__x  sK    '-'E'E$%f	B+F3'r1   r&   rP   rR   rQ   r   r5   c                     | j                  |||||      \  }}t        | j                  | j                  | j                  |      }||fS )N)r   rR   rQ   )r   r   feed_forward_chunkr   r   )	r   r&   rP   rR   rQ   r   r   self_attention_weightslayer_outputs	            r2   r   zGitLayer.forward  sd     48>>/+) 4B 4
00 1##T%A%A4CSCSUe
 333r1   c                 L    | j                  |      }| j                  ||      }|S r   )r   r   )r   r   intermediate_outputr   s       r2   r   zGitLayer.feed_forward_chunk  s,    "//0@A{{#68HIr1   r   r   )r(   r)   r*   rt   r,   r   r-   r	   rI   r/   r   r   r   r   s   @r2   r   r   w  s    ( 48(,.2).4||4 ))D04 	4
 t+4  $;4 
u||	4,r1   r   c                        e Zd Z fdZ	 	 	 	 	 	 	 ddej
                  dej                  dz  dedz  dedz  dedz  dedz  d	edz  d
ej
                  dz  de	ej
                     e
z  fdZ xZS )
GitEncoderc           	          t         |           || _        t        j                  t        |j                        D cg c]  }t        ||       c}      | _        d| _	        y c c}w NF)
rs   rt   rO   r   
ModuleListrangenum_hidden_layersr   layergradient_checkpointing)r   rO   ir   s      r2   rt   zGitEncoder.__init__  sP    ]]vG_G_A`#aAHVQ$7#ab
&+# $bs   A$Nr&   rP   rR   	use_cacher   output_hidden_statesreturn_dictrQ   r5   c	                    | j                   r%| j                  r|rt        j                  d       d}|r|t	        | j
                        }|rdnd }	|rdnd }
t        | j                        D ]*  \  }}|r|	|fz   }	 ||||||      }|d   }|s"|
|d   fz   }
, |r|	|fz   }	|st        d |||	|
fD              S t        |||	|
      S )	NzZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...F)rO   r0   r   r   c              3   $   K   | ]  }|| 
 y wr   r0   ).0vs     r2   	<genexpr>z%GitEncoder.forward.<locals>.<genexpr>  s      	 = 	s   r%   rR   r&   r'   )
r   trainingr   r   r
   rO   	enumerater   r/   r   )r   r&   rP   rR   r   r   r   r   rQ   all_hidden_statesall_self_attentionsr   layer_modulelayer_outputss                 r2   r   zGitEncoder.forward  s    &&4==##p "	0*$++>O"6BD$5b4(4 	POA|#$58H$H!(!M *!,M &9]1=M<O&O#	P    1]4D D 	 "#%'		 	 	 '+++*	
 	
r1   )NNNFFTN)r(   r)   r*   rt   r,   r   r-   r	   rI   r/   r   r   r   r   s   @r2   r   r     s    , 48(,!%).,1#'.2:
||:
 ))D0:
 	:

 $;:
  $;:
 #Tk:
 D[:
 t+:
 
u||	6	6:
r1   r   c                   R    e Zd ZU eed<   dZdZdZ ej                         d        Z
y)GitPreTrainedModelrO   git)imagetextTc                    t        |t              rt        j                  |j                  d| j
                  j                         t        j                  |j                  j                  | j
                  j                         t        j                  |j                  j                  | j
                  j                         t        j                  |j                  t        j                  |j                  j                  d         j                  d             t        |t         j"                        rct        j                  |j                  d| j
                  j                         |j$                   t        j&                  |j$                         yyt        |t         j(                        rt        j                  |j                  d| j
                  j                         |j*                  Et-        |j                  dd      s-t        j&                  |j                  |j*                            yyyt        |t         j.                        r?t        j&                  |j$                         t        j0                  |j                         yt        |t2              rZt        j                  |j                  t        j                  |j                  j                  d         j                  d             yy)	zInitialize the weights        )meanstd)r  r<   rp   N_is_hf_initializedF)r   GitVisionEmbeddingsinitnormal_class_embeddingrO   initializer_rangepatch_embeddingweightposition_embeddingcopy_rS   r,   r   r>   r   r   r   biaszeros_ru   rm   getattrr|   ones_rk   )r   modules     r2   _init_weightsz GitPreTrainedModel._init_weights  s    f12LL//ct{{?\?\]LL//66DKK<Y<YZLL2299t{{?\?\]JJv**ELL9L9L9R9RSU9V,W,^,^_f,ghfbii(LLSdkk6S6ST{{&FKK( '-LLSdkk6S6ST!!-gfmmMach6iFMM&*<*<=> 7j--KK$JJv}}%.JJv**ELL9L9L9R9RSU9V,W,^,^_f,gh /r1   N)r(   r)   r*   r   r.   base_model_prefixinput_modalitiessupports_gradient_checkpointingr,   no_gradr%  r0   r1   r2   r  r    s6    (&*#U]]_i ir1   r  c                        e Zd Zdef fdZdej                  dededej                  fdZd
dej                  dej                  fd	Z
 xZS )r  rO   c                    t         |           || _        |j                  | _        |j
                  | _        |j                  | _        t        j                  t        j                  | j                              | _        t        j                  |j                  | j                  | j                  | j                  d      | _        | j
                  | j                  z  dz  | _        | j                  dz   | _        t        j"                  | j                   | j                        | _        | j'                  dt        j(                  | j                         j+                  d      d       y )NF)in_channelsout_channelskernel_sizestrider   r   r   rS   rp   rq   )rs   rt   rO   rw   	embed_dimr   r   r   	Parameterr,   randnr  Conv2dnum_channelsr  num_patchesnum_positionsru   r  r   r   r   r   s     r2   rt   zGitVisionEmbeddings.__init__  s	   ++ ++ ++!||EKK,GH!yy++?? 
 !OOt>1D!--1"$,,t/A/A4>>"R^U\\$:L:L-M-T-TU\-]jopr1   r   heightwidthr5   c                    |j                   d   dz
  }| j                  j                  j                  d      }|j                   d   dz
  }t        j
                  j                         s%||k(  r ||k(  r| j                  | j                        S |ddddf   }|ddddf   }|j                   d   }	|| j                  z  }
|| j                  z  }t        |dz        }|j                  d|||	      }|j                  dddd      }t        j                  j                  ||
|fdd	
      }|j                  dddd      j                  dd|	      }t	        j                   ||fd      S )a   
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r   r   Nr<   g      ?r   r   bicubicF)r   modealign_cornersrY   )r>   r  r  	unsqueezer,   jit
is_tracingrS   r   r   reshaper   r   ra   interpolater   cat)r   r   r7  r8  r5  r  r6  class_pos_embedpatch_pos_embedrZ   
new_height	new_widthsqrt_num_positionss                r2   interpolate_pos_encodingz,GitVisionEmbeddings.interpolate_pos_encoding  sv    !&&q)A-!44;;EEaH*003a7 yy##%+*F6UZ?**4+<+<==,QU3,QU3r"t.
T__,	&}c'9:)11!5GI[]`a)11!Q1=--33i(	 4 
 *11!Q1=BB1b#Nyy/?;CCr1   rT   c                 `   |j                   \  }}}}|sJ|| j                  k7  s|| j                  k7  r,t        d| d| d| j                   d| j                   d	      | j                  j                  j
                  }| j                  |j                  |            }|j                  d      j                  dd      }| j                  j                  |dd      }	t        j                  |	|gd	      }
|r|
| j                  |
||      z   }
|
S |
| j                  | j                        z   }
|
S )
NzInput image size (*z) doesn't match model ().dtyper   r   r<   rY   )r>   r   r\   r  r  rM  r_   flattenr   r  r   r,   rB  rH  r  rS   )r   rT   rH  r   _r7  r8  target_dtypepatch_embedsclass_embedsr   s              r2   r   zGitVisionEmbeddings.forward@  s6   '3'9'9$
Avu'Vt-F%SWSbSbJb$VHAeW4KDOOK\\]^b^m^m]nnpq  ++2288++LOO,O,OP#++A.88A>++22:q"EYYl;C
##d&C&CJPVX]&^^J  $d&=&=d>O>O&PPJr1   F)r(   r)   r*   r    rt   r,   r   rH   rH  r-   r   r   r   s   @r2   r  r     sd    q q,'D5<< 'D 'DUX 'D]b]i]i 'DRE$5$5 Z_ZfZf r1   r  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )GitVisionMLPc                    t         |           || _        t        |j                     | _        t        j                  |j                  |j                        | _
        t        j                  |j                  |j                        | _        y r   )rs   rt   rO   r   r   activation_fnr   r   rw   r   fc1fc2r   s     r2   rt   zGitVisionMLP.__init__T  sd    #F$5$5699V//1I1IJ99V55v7I7IJr1   r&   r5   c                 l    | j                  |      }| j                  |      }| j                  |      }|S r   )rX  rW  rY  r   s     r2   r   zGitVisionMLP.forward[  s4    /**=9/r1   r   r   s   @r2   rU  rU  S  s$    KU\\ ell r1   rU  r$  r   r   rX   scalingr   c                    t        j                  ||j                  dd            |z  }|||z   }t        j                  j                  |dt         j                        j                  |j                        }t        j                  j                  ||| j                        }t        j                  ||      }	|	j                  dd      j                         }	|	|fS )Nr<   r   )rZ   rM  )pr  r   r   )r,   r   r   r   ra   r   float32r_   rM  r   r  r   )
r$  r   r   rX   rP   r[  r   rd   attn_weightsr   s
             r2   eager_attention_forwardr`  c  s     <<s}}R'<=GL!#n4==((2U]](SVVW\WbWbcL==((6??([L,,|U3K''1-88:K$$r1   c                        e Zd ZdZ fdZ	 	 d	dej                  dej                  dz  dedz  deej                  ej                  dz  f   fdZ	 xZ
S )
GitVisionAttentionz=Multi-headed attention from 'Attention Is All You Need' paperc                    t         |           || _        |j                  | _        |j
                  | _        | j                  | j                  z  | _        | j                  | j                  z  | j                  k7  r&t        d| j                   d| j                   d      | j                  dz  | _	        |j                  | _        d| _        t        j                  | j                  | j                        | _        t        j                  | j                  | j                        | _        t        j                  | j                  | j                        | _        t        j                  | j                  | j                        | _        y )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: rK  g      F)rs   rt   rO   rw   r0  r   	num_headshead_dimr\   scaleattention_dropoutr   	is_causalr   r   k_projv_projq_projout_projr   s     r2   rt   zGitVisionAttention.__init__}  s   ++33$..8==4>>)T^^;MdnnM] ^NN#2'  ]]D(
//ii?ii?ii?		$..$..Ar1   Nr&   rP   r   r5   c           
      &   |j                   \  }}}| j                  |      }| j                  |      }| j                  |      }	|j	                  ||| j
                  | j                        j                  dd      }|j	                  ||| j
                  | j                        j                  dd      }|	j	                  ||| j
                  | j                        j                  dd      }	t        j                  | j                  j                  t              }
 |
| |||	|| j                  | j                  | j                  sdn| j                         \  }}|j#                  |||      j%                         }| j'                  |      }|sd}||fS )z#Input shape: Batch x Time x Channelr   r   r  )rh  r[  r   N)r>   rk  ri  rj  r   rd  re  r   r   get_interfacerO   r   r`  rh  rf  r  r   r@  r   rl  )r   r&   rP   r   r   r   r0  querieskeysvaluesattention_interfacer   r_  s                r2   r   zGitVisionAttention.forward  s_    -:,?,?)
J	++m,{{=)]+,,z:t~~t}}U__`acdeyyZOYYZ[]^_ZT^^T]]S]]^_abc(?(M(MKK,,.E)
 %8nnJJ#}}C$,,	%
!\ "))*j)LWWYmmK0 LL((r1   r   )r(   r)   r*   r+   rt   r,   r   rI   r/   r   r   r   s   @r2   rb  rb  z  sg    GB. /3).	%)||%) t+%)  $;	%)
 
u||U\\D00	1%)r1   rb  c                        e Zd Zdef fdZ	 d
dej                  dej                  dedz  dee	   de
ej                     f
d	Z xZS )GitVisionEncoderLayerrO   c                 D   t         |           |j                  | _        t	        |      | _        t        j                  | j                  |j                        | _	        t        |      | _        t        j                  | j                  |j                        | _        y r   )rs   rt   rw   r0  rb  	self_attnr   r|   r}   layer_norm1rU  mlplayer_norm2r   s     r2   rt   zGitVisionEncoderLayer.__init__  sm    +++F3<<F<Q<QR'<<F<Q<QRr1   r&   rP   r   Nrd   r5   c                     |}| j                  |      } | j                  d|||d|\  }}||z   }|}| j                  |      }| j                  |      }||z   }|f}|r||fz  }|S )aI  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
                `(config.encoder_attention_heads,)`.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        )r&   rP   r   r0   )rw  rv  ry  rx  )r   r&   rP   r   rd   residualr_  outputss           r2   r   zGitVisionEncoderLayer.forward  s    " !((7&4dnn '
')/'
 	'
#| !=0 ((7/ =0 "&Gr1   rS  )r(   r)   r*   r    rt   r,   r   rI   r   r   r/   r-   r   r   r   s   @r2   rt  rt    sh    S S */	&||& &  $;	&
 +,& 
u  	!&r1   rt  c                        e Zd ZdZdef fdZe	 	 	 	 ddej                  dz  de	dz  de	dz  de	dz  d	e
e   d
eez  fd       Z xZS )GitVisionEncoderz
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`GitVisionEncoderLayer`].

    Args:
        config: GitVisionConfig
    rO   c                     t         |           || _        t        j                  t        |j                        D cg c]  }t        |       c}      | _        d| _	        y c c}w r   )
rs   rt   rO   r   r   r   r   rt  layersr   )r   rO   rO  r   s      r2   rt   zGitVisionEncoder.__init__  sP    mmERXRjRjLk$lq%:6%B$lm&+# %ms   A#NrP   r   r   r   rd   r5   c                 j   ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }|rdnd}|rdnd}|}	t	        | j
                        D ]*  \  }
}|r||	fz   } ||	|fd|i|}|d   }	|s"||d   fz   }, |r||	fz   }t        |	||      S )ad  
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        Nr0   r   r   r   r%   r&   r'   )rO   r   r   use_return_dictr  r  r   )r   rL   rP   r   r   r   rd   encoder_statesall_attentionsr&   idxencoder_layerr  s                r2   r   zGitVisionEncoder.forward  s   @ 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]30d%"+DKK"8 	FC#!/=2B!B) #4 	M *!,M !/=3C2E!E	F  +}.>>N+>Vd
 	
r1   )NNNN)r(   r)   r*   r+   r    rt   r   r,   r   rI   r   r   r/   r   r   r   r   s   @r2   r~  r~    s    , ,  /3)-,0#'=
 t+=
  $;	=

 #Tk=
 D[=
 +,=
 
	 =
 =
r1   r~  c                        e Zd Zdef fdZe	 	 	 	 	 ddej                  dz  dedz  dedz  dedz  dedz  d	e	e
z  fd
       Z xZS )GitVisionTransformerrO   c                     t         |           || _        |j                  }t	        |      | _        t        j                  ||j                        | _	        t        |      | _        t        j                  ||j                        | _        y r   )rs   rt   rO   rw   r  r   r   r|   r}   pre_layrnormr~  encoderpost_layernorm)r   rO   r0  r   s      r2   rt   zGitVisionTransformer.__init__?  sj    &&	-f5LL8M8MN'/ ll9&:O:OPr1   NrT   r   r   rH  r   r5   c                    ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }|t	        d      | j                  ||      }| j                  |      }| j                  ||||      }|d   }| j                  |      }|s	|f|dd  z   S t        ||j                  |j                        S )Nz You have to specify pixel_valuesrH  )rL   r   r   r   r   r   r  )rO   r   r   r  r\   r   r  r  r  r   r&   r'   )	r   rT   r   r   rH  r   r&   encoder_outputsr%   s	            r2   r   zGitVisionTransformer.forwardI  s     2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]?@@Ogh))-8,,'/!5#	 ' 
 ,A. //0AB%'/!"*===/)77&11
 	
r1   NNNFN)r(   r)   r*   r    rt   r   r,   r-   rI   r/   r   r   r   r   s   @r2   r  r  =  s    Q Q  26)-,005#'&
''$.&
  $;&
 #Tk	&

 #'+&
 D[&
 
	 &
 &
r1   r  zY
    The vision model from CLIP, used in GIT, without any head or projection on top.
    c                        e Zd ZU eed<   dZdZdef fdZdej                  fdZ
e	 	 	 	 	 ddej                  dz  dedz  d	edz  d
ededz  deez  fd       Z xZS )GitVisionModelrO   rT   )r  c                 d    t         |   |       t        |      | _        | j	                          y r   )rs   rt   r  vision_model	post_initr   s     r2   rt   zGitVisionModel.__init__~  s'     08r1   r5   c                 B    | j                   j                  j                  S r   )r  r   r  r   s    r2   get_input_embeddingsz#GitVisionModel.get_input_embeddings  s      ++;;;r1   Nr   r   rH  r   c                 b    ||n| j                   j                  }| j                  |||||      S )a  
        Examples:

        ```python
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import AutoProcessor, GitVisionModel

        >>> processor = AutoProcessor.from_pretrained("microsoft/git-base")
        >>> model = GitVisionModel.from_pretrained("microsoft/git-base")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        ```)rT   r   r   rH  r   )rO   r  r  )r   rT   r   r   rH  r   rd   s          r2   r   zGitVisionModel.forward  sA    > &1%<k$++B]B]  %/!5%=# ! 
 	
r1   r  )r(   r)   r*   r    r.   main_input_namer'  rt   r   Moduler  r   r,   r-   rI   r/   r   r   r   r   s   @r2   r  r  s  s     $O! <bii <  26)-,0).#'&
''$.&
  $;&
 #Tk	&

 #'&
 D[&
 
	 &
 &
r1   r  c                   \     e Zd Zdef fdZdej                  dej                  fdZ xZS )GitProjectionrO   c                 0   t         |           || _        t        j                  t        j
                  |j                  j                  |j                        t        j                  |j                  |j                  j                              | _
        y r   )rs   rt   rO   r   
Sequentialr   r   rw   r|   r}   visual_projectionr   s     r2   rt   zGitProjection.__init__  sf    !#IIf**668J8JKLL++1E1E1T1TU"
r1   r   r5   c                 $    | j                  |      S r   )r  )r   r   s     r2   r   zGitProjection.forward  s    %%j11r1   )	r(   r)   r*   r   rt   r,   r   r   r   r   s   @r2   r  r    s*    
y 
2%,, 25<< 2r1   r  zy
    The bare GIT Model transformer consisting of a CLIP image encoder and text decoder outputting raw hidden-states
    c                   d    e Zd Z fdZd Zd Ze	 	 	 	 	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  d	ej                  dz  d
e	dz  de
dz  de
dz  de
dz  de
de
dz  dej                  dz  deej                     ez  fd       Z xZS )GitModelc                 l   t         |          | _        t              | _        t        j                        | _        t              | _	        t              | _        j                  6t        j                  fdt        j                        D              | _        | j#                          y )Nc              3      K   | ]B  }t        j                  t        j                  d d j                  j
                               D yw)r   N)r   r1  r,   zerosr   rw   )r  rO  rO   s     r2   r  z$GitModel.__init__.<locals>.<genexpr>  s;      ; U[[Av/C/C/O/OPQ;s   AA)rs   rt   rO   rk   r   r  r   image_encoderr   r  r  r  r   r   ParameterListr   img_temporal_embeddingr  r   s    `r2   rt   zGitModel.__init__  s     '/+F,@,@A!&)!.v!6**6*,*:*: ;v>>?; +D' 	r1   c                 .    | j                   j                  S r   r   ry   r  s    r2   r  zGitModel.get_input_embeddings  s    ...r1   c                 &    || j                   _        y r   r  )r   rX   s     r2   set_input_embeddingszGitModel.set_input_embeddings  s    */'r1   Nr   rP   rS   rT   rL   rR   r   r   r   rH  r   rQ   r5   c           
      Z   ||n| j                   j                  }|	|	n| j                   j                  }	||n| j                   j                  }||n| j                   j                  }||t        d      d}|0t        |t              s|j                         n|j                         }|||j                  d   dk(  r||z   }| j                  ||||      }|2t        j                  |||j                  d   z   |j                        }t        j                  |t        j                        d   }||j                   d	k(  r| j#                  ||

      j$                  }n|j                   dk(  rg }t'        |j                  d         D ]O  }| j#                  |dd|ddddf   |

      j$                  }|| j(                  |   z  }|j+                  |       Q t        j,                  |d      }nt        d      | j/                  |      }|j1                  |j3                  d      |j3                  d      z  dd      }t        j,                  ||fd      }t        j4                  |t        j                        d   }t        j,                  ||gd      }t        j                  |j                  d   |j                  t        j                        }|t        j,                  t        j4                  |      |gd      }n||j                  d   dk(  rt        j6                  |g|j8                  |j                        }t        j:                  |j                  d   ||j                  d   z
  dz   f|j8                  |j                        }t        j,                  ||gd      }t=        | j                   ||||d||      }|}| j?                  ||||||	||      }|d   }|s	|f|dd z   S tA        ||jB                  |jD                  |jF                        S )a   
        Examples:

        ```python
        >>> from transformers import AutoProcessor, AutoModel
        >>> import httpx
        >>> from io import BytesIO
        >>> from PIL import Image

        >>> processor = AutoProcessor.from_pretrained("microsoft/git-base")
        >>> model = AutoModel.from_pretrained("microsoft/git-base")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> text = "this is an image of two cats"

        >>> inputs = processor(images=image, text=text, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        ```NzDYou cannot specify both input_ids and inputs_embeds at the same timer   r   )r   rS   rL   r   )r`   rL  ).r      r     rY   z#pixel_values must be of rank 4 or 5r<   )r`   rM  )rM  r`   )rP   rR   r   r   r   r   rQ   r  )$rO   r   r   r   r  r\   r   r	   get_seq_lengthr>   r   r,   r   r`   
zeros_likerH   ndimr  r%   r   r  appendrB  r  repeatr   	ones_liketensorrM  onesri   r  r   rR   r&   r'   )r   r   rP   rS   rT   rL   rR   r   r   r   rH  r   rQ   rd   r   embedding_outputr3   visual_features	frame_idxvisual_features_frameprojected_visual_featuresimage_token_type_idsextended_attention_maskcausal_maskr&   r  sequence_outputs                              r2   r   zGitModel.forward  sw   P 2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	%0%<k$++B]B] ]%>cdd "#& "/59  ..0$335 # O$?IOOTUDVZ[D['*@@L??%'#9	 + 
 !"\\&&)9)?)?)BB'..N ))*:%))LVT#  A%"&"4"4 ;S #5 ###   ""a'"$!&|'9'9!'<!= BI,0,>,>$Q	1a%78Sk -? -'' * *T-H-H-SS)#**+@AB #())O"C !!FGG(,(>(>(O% )B(H(H %%a(,E,J,J1,MMqRS)%
  %yy*CEU)V\]^#(??3LTYT]T]#^_e#f "YY(<n'MSUVN"\\*:*@*@*CL\LcLckpktktuN)!&EOO<P,QSa+bhj!k(Y__Q-?1-D #\\'(0D0D^MbMbN ',jj%%a(*@>CWCWXYCZ*Z]^*^_$**%,,'#
 #YY(?'PVXYN 1KK	
 ),,&+/!5#) ' 	
 *!,#%(;;;&-+;;)77&11	
 	
r1   )NNNNNNNNNFNN)r(   r)   r*   rt   r  r  r   r,   r   r	   rI   r/   r   r   r   r   s   @r2   r  r    s4   &/0  *..2,0,0-1(,!%)-,0).#'.2c
<<$&c
 t+c
 llT)	c

 llT)c
 ||d*c
 c
 $;c
  $;c
 #Tkc
 #'c
 D[c
 t+c
 
u||	9	9c
 c
r1   r  z`
    GIT Model with a `language modeling` head on top for autoregressive language modeling.
    c            !           e Zd ZddiZ fdZd Zd Ze	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  d	ej                  dz  d
ej                  dz  dej                  dz  dej                  dz  de
dz  dedz  dedz  dedz  dededz  deej                  z  dej                  dz  deej                     ez  fd       Z	 	 	 	 	 	 d fd	Z xZS )GitForCausalLMzoutput.weightz%git.embeddings.word_embeddings.weightc                     t         |   |       t        |      | _        t	        j
                  |j                  |j                        | _        | j                          y r   )
rs   rt   r  r  r   r   rw   rv   r   r  r   s     r2   rt   zGitForCausalLM.__init__  sF     F#ii 2 2F4E4EF 	r1   c                     | j                   S r   r   r  s    r2   get_output_embeddingsz$GitForCausalLM.get_output_embeddings  s    {{r1   c                     || _         y r   r  )r   new_embeddingss     r2   set_output_embeddingsz$GitForCausalLM.set_output_embeddings  s	    $r1   Nr   rP   rS   rT   rL   labelsrR   r   r   r   rH  r   logits_to_keeprQ   r5   c                 *   ||n| j                   j                  }|d}| j                  ||||||||	|
|||      }|d   }t        |t              rt        | d      n|}| j                  |dd|ddf         }d}|| j                  j                  j                  d   j                  j                  j                  }|dd|dddf   j                         }|ddddf   j                         } | j                  |j                  d| j                   j                        |j                  d      fd| j                   j                  i|}|s|f|dd z   }||f|z   S |S t!        |||j"                  |j$                  |j&                        S )	a0  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
            `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
            ignored (masked), the loss is only computed for the tokens with labels n `[0, ..., config.vocab_size]`

        Examples:

        Image captioning example:

        ```python
        >>> from transformers import AutoProcessor, AutoModelForCausalLM
        >>> import httpx
        >>> from io import BytesIO
        >>> from PIL import Image

        >>> processor = AutoProcessor.from_pretrained("microsoft/git-base-coco")
        >>> model = AutoModelForCausalLM.from_pretrained("microsoft/git-base-coco")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> pixel_values = processor(images=image, return_tensors="pt").pixel_values

        >>> generated_ids = model.generate(pixel_values=pixel_values, max_length=50)
        >>> generated_caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        >>> print(generated_caption)
        two cats sleeping on a pink blanket next to remotes.
        ```

        Visual question answering (VQA) example:

        ```python
        >>> from transformers import AutoProcessor, AutoModelForCausalLM
        >>> from huggingface_hub import hf_hub_download
        >>> from PIL import Image

        >>> processor = AutoProcessor.from_pretrained("microsoft/git-base-textvqa")
        >>> model = AutoModelForCausalLM.from_pretrained("microsoft/git-base-textvqa")

        >>> file_path = hf_hub_download(repo_id="nielsr/textvqa-sample", filename="bus.png", repo_type="dataset")
        >>> image = Image.open(file_path).convert("RGB")

        >>> pixel_values = processor(images=image, return_tensors="pt").pixel_values

        >>> question = "what does the front of the bus say at the top?"

        >>> input_ids = processor(text=question, add_special_tokens=False).input_ids
        >>> input_ids = [processor.tokenizer.cls_token_id] + input_ids
        >>> input_ids = torch.tensor(input_ids).unsqueeze(0)

        >>> generated_ids = model.generate(pixel_values=pixel_values, input_ids=input_ids, max_length=50)
        >>> print(processor.batch_decode(generated_ids, skip_special_tokens=True))
        ['what does the front of the bus say at the top? special']
        ```

        Video captioning example:

        ```python
        >>> import av
        >>> import numpy as np
        >>> from PIL import Image
        >>> from huggingface_hub import hf_hub_download
        >>> from transformers import AutoProcessor, AutoModelForCausalLM

        >>> processor = AutoProcessor.from_pretrained("microsoft/git-base-vatex")
        >>> model = AutoModelForCausalLM.from_pretrained("microsoft/git-base-vatex")

        >>> # set seed for reproducibility
        >>> np.random.seed(45)


        >>> def read_video_pyav(container, indices):
        ...     '''
        ...     Decode the video with PyAV decoder.
        ...     Args:
        ...         container (`av.container.input.InputContainer`): PyAV container.
        ...         indices (`list[int]`): List of frame indices to decode.
        ...     Returns:
        ...         result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
        ...     '''
        ...     frames = []
        ...     container.seek(0)
        ...     start_index = indices[0]
        ...     end_index = indices[-1]
        ...     for i, frame in enumerate(container.decode(video=0)):
        ...         if i > end_index:
        ...             break
        ...         if i >= start_index and i in indices:
        ...             frames.append(frame)
        ...     return np.stack([x.to_ndarray(format="rgb24") for x in frames])


        >>> def sample_frame_indices(clip_len, frame_sample_rate, seg_len):
        ...     '''
        ...     Sample a given number of frame indices from the video.
        ...     Args:
        ...         clip_len (`int`): Total number of frames to sample.
        ...         frame_sample_rate (`int`): Sample every n-th frame.
        ...         seg_len (`int`): Maximum allowed index of sample's last frame.
        ...     Returns:
        ...         indices (`list[int]`): List of sampled frame indices
        ...     '''
        ...     converted_len = int(clip_len * frame_sample_rate)
        ...     end_idx = np.random.randint(converted_len, seg_len)
        ...     start_idx = end_idx - converted_len
        ...     indices = np.linspace(start_idx, end_idx, num=clip_len)
        ...     indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64)
        ...     return indices


        >>> # load video
        >>> file_path = hf_hub_download(
        ...     repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
        ... )
        >>> container = av.open(file_path)

        >>> # sample frames
        >>> num_frames = model.config.num_image_with_embedding
        >>> indices = sample_frame_indices(
        ...     clip_len=num_frames, frame_sample_rate=4, seg_len=container.streams.video[0].frames
        ... )
        >>> frames = read_video_pyav(container, indices)

        >>> pixel_values = processor(images=list(frames), return_tensors="pt").pixel_values

        >>> generated_ids = model.generate(pixel_values=pixel_values, max_length=50)

        >>> print("Generated caption:", processor.batch_decode(generated_ids, skip_special_tokens=True))
        Generated caption: ['a woman is sitting at a table and she is talking about the food she is holding.']
        ```
        NF)rP   rS   rT   rL   rR   r   r   r   rH  r   rQ   r   r<   r   rv   )losslogitsrR   r&   r'   )rO   r  r  r   rH   slicer   r  r   r   r   r   r   loss_functionr   rv   r   rR   r&   r'   )r   r   rP   rS   rT   rL   r  rR   r   r   r   rH  r   r  rQ   rd   r|  r&   slice_indicesr  r  num_image_tokensshifted_logitsr   s                           r2   r   zGitForCausalLM.forward  s   p &1%<k$++B]B]I(()%%'+/!5%=#)  
  
8B>SV8W~ot4]k]1mQ+>?@#xx//55a8BBGGZZ#A'7':A$=>IIKNAqrE]--/F%4%%##B(>(>?B  ;;11 	D Y,F)-)9TGf$EvE%#33!//))
 	
r1   c           	      F    t        
|   |f|||||d|}	|s|s||	d<   |	S )N)rR   rP   r   rQ   rV   rT   )rs   prepare_inputs_for_generation)r   r   rR   rT   rP   r   rQ   rV   rd   model_inputsr   s             r2   r  z,GitForCausalLM.prepare_inputs_for_generationd  sL     w<
+))1
 
 Y+7L(r1   )NNNNNNNNNNFNr   N)NNNNNF)r(   r)   r*   _tied_weights_keysrt   r  r  r   r,   r   r	   rI   rH   r/   r   r   r  r   r   s   @r2   r  r    s    *+RS%  *..2,0,0-1&*(,!%)-,0).#'-..2F
<<$&F
 t+F
 llT)	F

 llT)F
 ||d*F
 t#F
 F
 $;F
  $;F
 #TkF
 #'F
 D[F
 ell*F
 t+F
" 
u||	5	5#F
 F
V   r1   r  )r  r  r  r  )NNFN)r  )Rr+   r   collections.abcr   dataclassesr   r,   r    r   r  activationsr   cache_utilsr	   r
   configuration_utilsr   
generationr   masking_utilsr   modeling_layersr   modeling_outputsr   r   r   r   modeling_utilsr   r   processing_utilsr   pytorch_utilsr   utilsr   r   r   r   r   r   utils.deprecationr   configuration_gitr   r    
get_loggerr(   r   r#   r   rJ   r-   rI   dictri   r  rk   r   r   r   r   r   r   r   r   r  r  rU  floatr`  rb  rt  r~  r  r  r  r  r  __all__r0   r1   r2   <module>r     s     $ !   & ! . 3 ) 6 9  G & 6  1 9 
		H	% 	<; 	< 	<%LL4'%\\D(% _%P ?K +/-1&*5454<<54 LL4'54 LL	54
 T\54 ,,%54 LL4'54 ##d*54 54 t54 
54 L54p*BII *ZO.ryy O.fBII   
3299 32bii  		 ") "JA
 A
H i i i>P")) Pf299 . %II%<<% 
% <<	%
 LL4'% % %.<) <)@/6 /fM
ryy M
`3
299 3
l 
6
' 6

6
r
2BII 
2 
~
! ~

~
B 
u' u
up Qr1   