
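# PyTorch BLIP text model: the BERT-style text encoder/decoder used by BLIP,
# adapted from https://github.com/salesforce/BLIP (models/med.py).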
import math

import torch
from torch import Tensor, device, nn
from torch.nn import CrossEntropyLoss

from ... import initialization as init
from ...activations import ACT2FN
from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache
from ...generation import GenerationMixin
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import (
    BaseModelOutputWithPastAndCrossAttentions,
    BaseModelOutputWithPoolingAndCrossAttentions,
    CausalLMOutputWithCrossAttentions,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import apply_chunking_to_forward
from ...utils import logging
from .configuration_blip import BlipTextConfig


logger = logging.get_logger(__name__)


class BlipTextEmbeddings(nn.Module):
    """Construct the embeddings from word and position embeddings."""

    def __init__(self, config):
        super().__init__()
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)

        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

        # position_ids (1, len position emb) is contiguous in memory and exported when serialized
        self.register_buffer(
            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
        )

        self.config = config

    def forward(
        self,
        input_ids: torch.LongTensor | None = None,
        position_ids: torch.LongTensor | None = None,
        inputs_embeds: torch.FloatTensor | None = None,
        past_key_values_length: int = 0,
    ) -> torch.Tensor:
        if input_ids is not None:
            input_shape = input_ids.size()
        else:
            input_shape = inputs_embeds.size()[:-1]

        seq_length = input_shape[1]

        if position_ids is None:
            position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length]

        if inputs_embeds is None:
            inputs_embeds = self.word_embeddings(input_ids)

        embeddings = inputs_embeds
        position_embeddings = self.position_embeddings(position_ids)
        embeddings += position_embeddings
        embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings


class BlipTextSelfAttention(nn.Module):
    def __init__(self, config, is_cross_attention, layer_idx=None):
        super().__init__()
        self.config = config
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            raise ValueError(
                "The hidden size (%d) is not a multiple of the number of attention heads (%d)"
                % (config.hidden_size, config.num_attention_heads)
            )
        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size
        self.layer_idx = layer_idx

        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        if is_cross_attention:
            self.key = nn.Linear(config.encoder_hidden_size, self.all_head_size)
            self.value = nn.Linear(config.encoder_hidden_size, self.all_head_size)
        else:
            self.key = nn.Linear(config.hidden_size, self.all_head_size)
            self.value = nn.Linear(config.hidden_size, self.all_head_size)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)

    def save_attn_gradients(self, attn_gradients):
        self.attn_gradients = attn_gradients

    def get_attn_gradients(self):
        return self.attn_gradients

    def save_attention_map(self, attention_map):
        self.attention_map = attention_map

    def get_attention_map(self):
        return self.attention_map

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.FloatTensor | None = None,
        encoder_hidden_states: torch.FloatTensor | None = None,
        encoder_attention_mask: torch.FloatTensor | None = None,
        past_key_values: Cache | None = None,
        output_attentions: bool | None = False,
        cache_position: torch.Tensor | None = None,
    ) -> tuple[torch.Tensor]:
        batch_size, seq_length, _ = hidden_states.shape
        query_layer = (
            self.query(hidden_states)
            .view(batch_size, -1, self.num_attention_heads, self.attention_head_size)
            .transpose(1, 2)
        )

        # If this is instantiated as a cross-attention module, the keys and values come from an encoder;
        # the attention mask needs to be such that the encoder's padding tokens are not attended to.
        is_cross_attention = encoder_hidden_states is not None
        if is_cross_attention and encoder_attention_mask is not None:
            attention_mask = encoder_attention_mask

        is_updated = False
        if past_key_values is not None:
            if isinstance(past_key_values, EncoderDecoderCache):
                is_updated = past_key_values.is_updated.get(self.layer_idx)
                if is_cross_attention:
                    curr_past_key_values = past_key_values.cross_attention_cache
                else:
                    curr_past_key_values = past_key_values.self_attention_cache
            else:
                curr_past_key_values = past_key_values

        current_states = encoder_hidden_states if is_cross_attention else hidden_states
        if is_cross_attention and past_key_values is not None and is_updated:
            # reuse the cross-attention keys and values computed on the first decoding step
            key_layer = curr_past_key_values.layers[self.layer_idx].keys
            value_layer = curr_past_key_values.layers[self.layer_idx].values
        else:
            key_layer = (
                self.key(current_states)
                .view(batch_size, -1, self.num_attention_heads, self.attention_head_size)
                .transpose(1, 2)
            )
            value_layer = (
                self.value(current_states)
                .view(batch_size, -1, self.num_attention_heads, self.attention_head_size)
                .transpose(1, 2)
            )

            if past_key_values is not None:
                # save the key/value states to the cache so they can be reused for fast auto-regressive generation
                cache_position = cache_position if not is_cross_attention else None
                key_layer, value_layer = curr_past_key_values.update(
                    key_layer, value_layer, self.layer_idx, {"cache_position": cache_position}
                )
                # flag that the cross-attention states of this layer are cached so they can be reused in later calls
                if is_cross_attention and isinstance(past_key_values, EncoderDecoderCache):
                    past_key_values.is_updated[self.layer_idx] = True

        # Take the dot product between "query" and "key" to get the raw attention scores.
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
        attention_scores = attention_scores / math.sqrt(self.attention_head_size)

        if attention_mask is not None:
            # Apply the attention mask (precomputed for all layers in the BlipTextModel forward() function)
            attention_scores = attention_scores + attention_mask.to(attention_scores.device)

        # Normalize the attention scores to probabilities.
        attention_probs = nn.Softmax(dim=-1)(attention_scores)

        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
        attention_probs_dropped = self.dropout(attention_probs)

        context_layer = torch.matmul(attention_probs_dropped, value_layer)

        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(*new_context_layer_shape)

        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
        return outputs


class BlipTextSelfOutput(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states


class BlipTextAttention(nn.Module):
    def __init__(self, config, is_cross_attention=False, layer_idx=None):
        super().__init__()
        self.self = BlipTextSelfAttention(config, is_cross_attention, layer_idx=layer_idx)
        self.output = BlipTextSelfOutput(config)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.FloatTensor | None = None,
        encoder_hidden_states: torch.FloatTensor | None = None,
        past_key_values: Cache | None = None,
        output_attentions: bool | None = False,
        cache_position: torch.Tensor | None = None,
    ) -> tuple[torch.Tensor]:
        self_outputs = self.self(
            hidden_states,
            attention_mask=attention_mask,
            encoder_hidden_states=encoder_hidden_states,
            past_key_values=past_key_values,
            output_attentions=output_attentions,
            cache_position=cache_position,
        )
        attention_output = self.output(self_outputs[0], hidden_states)
        outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
        return outputs


class BlipTextIntermediate(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.intermediate_act_fn(hidden_states)
        return hidden_states


class BlipTextOutput(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states


class BlipTextLayer(GradientCheckpointingLayer):
    def __init__(self, config, layer_num):
        super().__init__()
        self.config = config
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        self.seq_len_dim = 1
        self.attention = BlipTextAttention(config, layer_idx=layer_num)
        self.layer_num = layer_num
        if self.config.is_decoder:
            self.crossattention = BlipTextAttention(
                config, is_cross_attention=self.config.is_decoder, layer_idx=layer_num
            )
        self.intermediate = BlipTextIntermediate(config)
        self.output = BlipTextOutput(config)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.FloatTensor | None = None,
        encoder_hidden_states: torch.FloatTensor | None = None,
        encoder_attention_mask: torch.FloatTensor | None = None,
        past_key_values: Cache | None = None,
        output_attentions: bool | None = False,
        cache_position: torch.Tensor | None = None,
    ) -> tuple[torch.Tensor]:
        self_attention_outputs = self.attention(
            hidden_states,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            past_key_values=past_key_values,
            cache_position=cache_position,
        )
        attention_output = self_attention_outputs[0]

        outputs = self_attention_outputs[1:]

        if encoder_hidden_states is not None:
            # for cross-attention, the mask over the encoder tokens takes the place of the self-attention mask
            cross_attention_outputs = self.crossattention(
                attention_output,
                attention_mask=encoder_attention_mask,
                encoder_hidden_states=encoder_hidden_states,
                past_key_values=past_key_values,
                output_attentions=output_attentions,
            )
            attention_output = cross_attention_outputs[0]
            outputs = outputs + cross_attention_outputs[1:]  # add cross attentions if we output attention weights

        layer_output = apply_chunking_to_forward(
            self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
        )
        outputs = (layer_output,) + outputs

        return outputs

    def feed_forward_chunk(self, attention_output):
        intermediate_output = self.intermediate(attention_output)
        layer_output = self.output(intermediate_output, attention_output)
        return layer_output


class BlipTextEncoder(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.layer = nn.ModuleList([BlipTextLayer(config, i) for i in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.FloatTensor | None = None,
        encoder_hidden_states: torch.FloatTensor | None = None,
        encoder_attention_mask: torch.FloatTensor | None = None,
        past_key_values: Cache | None = None,
        use_cache: bool | None = None,
        output_attentions: bool | None = False,
        output_hidden_states: bool | None = False,
        return_dict: bool | None = True,
        cache_position: torch.Tensor | None = None,
    ) -> tuple[torch.Tensor] | BaseModelOutputWithPastAndCrossAttentions:
        if self.gradient_checkpointing and self.training and use_cache:
            logger.warning(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
            )
            use_cache = False

        if use_cache and not isinstance(past_key_values, Cache):
            past_key_values = EncoderDecoderCache(DynamicCache(), DynamicCache())

        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None
        all_cross_attentions = () if output_attentions and self.config.is_decoder else None

        for i in range(self.config.num_hidden_layers):
            layer_module = self.layer[i]
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            layer_outputs = layer_module(
                hidden_states,
                attention_mask,
                encoder_hidden_states,
                encoder_attention_mask,
                past_key_values,
                output_attentions,
                cache_position,
            )

            hidden_states = layer_outputs[0]
            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)
                if encoder_hidden_states is not None:
                    all_cross_attentions = all_cross_attentions + (layer_outputs[2],)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(
                v
                for v in [
                    hidden_states,
                    past_key_values,
                    all_hidden_states,
                    all_self_attentions,
                    all_cross_attentions,
                ]
                if v is not None
            )
        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
            cross_attentions=all_cross_attentions,
        )


class BlipTextPooler(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.activation = nn.Tanh()

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # We "pool" the model by simply taking the hidden state corresponding to the first token.
        first_token_tensor = hidden_states[:, 0]
        pooled_output = self.dense(first_token_tensor)
        pooled_output = self.activation(pooled_output)
        return pooled_output


class BlipTextPredictionHeadTransform(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        if isinstance(config.hidden_act, str):
            self.transform_act_fn = ACT2FN[config.hidden_act]
        else:
            self.transform_act_fn = config.hidden_act
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.transform_act_fn(hidden_states)
        hidden_states = self.LayerNorm(hidden_states)
        return hidden_states


class BlipTextLMPredictionHead(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.transform = BlipTextPredictionHeadTransform(config)

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=True)
        self.bias = nn.Parameter(torch.zeros(config.vocab_size))

    def forward(self, hidden_states):
        hidden_states = self.transform(hidden_states)
        hidden_states = self.decoder(hidden_states)
        return hidden_states


class BlipTextOnlyMLMHead(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.predictions = BlipTextLMPredictionHead(config)

    def forward(self, sequence_output: torch.Tensor) -> torch.Tensor:
        prediction_scores = self.predictions(sequence_output)
        return prediction_scores


class BlipTextPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config: BlipTextConfig
    base_model_prefix = "bert"
    _no_split_modules = []

    def _init_weights(self, module):
        super()._init_weights(module)
        if isinstance(module, BlipTextEmbeddings):
            init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))


class BlipTextModel(BlipTextPreTrainedModel):
    """
    The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
    cross-attention is added between the self-attention layers, following the architecture described in [Attention is
    all you need](https://huggingface.co/papers/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
    Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin. To behave as a decoder, the model needs to be
    initialized with the `is_decoder` argument set to `True`; an `encoder_hidden_states` is then expected as an input
    to the forward pass.
    """

    def __init__(self, config, add_pooling_layer=True):
        super().__init__(config)
        self.config = config

        self.embeddings = BlipTextEmbeddings(config)
        self.encoder = BlipTextEncoder(config)
        self.pooler = BlipTextPooler(config) if add_pooling_layer else None

        self.post_init()

    def get_input_embeddings(self):
        return self.embeddings.word_embeddings

    def set_input_embeddings(self, value):
        self.embeddings.word_embeddings = value

    def get_extended_attention_mask(
        self, attention_mask: Tensor, input_shape: tuple[int], device: device, is_decoder: bool
    ) -> Tensor:
        """
        Makes broadcastable attention and causal masks so that future and masked tokens are ignored.

        Arguments:
            attention_mask (`torch.Tensor`):
                Mask with ones indicating tokens to attend to, zeros for tokens to ignore.
            input_shape (`tuple[int]`):
                The shape of the input to the model.
            device (`torch.device`):
                The device of the input to the model.

        Returns:
            `torch.Tensor` The extended attention mask, with the same dtype as `attention_mask.dtype`.
        """
        # We can provide a self-attention mask of dimensions
        # [batch_size, from_seq_length, to_seq_length] ourselves,
        # in which case we just need to make it broadcastable to all heads.
        if attention_mask.dim() == 3:
            extended_attention_mask = attention_mask[:, None, :, :]
        elif attention_mask.dim() == 2:
            # Provided a padding mask of dimensions [batch_size, seq_length]
            # - if the model is a decoder, apply a causal mask in addition to the padding mask
            # - if the model is an encoder, make the mask broadcastable to
            #   [batch_size, num_heads, seq_length, seq_length]
            if is_decoder:
                batch_size, seq_length = input_shape
                seq_ids = torch.arange(seq_length, device=device)
                causal_mask = seq_ids[None, None, :].repeat(batch_size, seq_length, 1) <= seq_ids[None, :, None]
                causal_mask = causal_mask.to(attention_mask.dtype)

                # in case past_key_values are used, we need to add a prefix ones mask to the causal mask
                if causal_mask.shape[1] < attention_mask.shape[1]:
                    prefix_seq_len = attention_mask.shape[1] - causal_mask.shape[1]
                    causal_mask = torch.cat(
                        [
                            torch.ones(
                                (batch_size, seq_length, prefix_seq_len), device=device, dtype=causal_mask.dtype
                            ),
                            causal_mask,
                        ],
                        axis=-1,
                    )

                extended_attention_mask = causal_mask[:, None, :, :] * attention_mask[:, None, None, :]
            else:
                extended_attention_mask = attention_mask[:, None, None, :]
        else:
            raise ValueError(
                f"Wrong shape for input_ids (shape {input_shape}) or attention_mask (shape {attention_mask.shape})"
            )

        # Since attention_mask is 1.0 for positions we want to attend and 0.0 for masked positions, this operation
        # creates a tensor which is 0.0 for positions we want to attend and -10000.0 for masked positions. Since we
        # are adding it to the raw scores before the softmax, this is effectively the same as removing them entirely.
        extended_attention_mask = extended_attention_mask.to(dtype=self.dtype)  # fp16 compatibility
        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
        return extended_attention_mask

    def forward(
        self,
        input_ids: torch.Tensor | None = None,
        attention_mask: torch.Tensor | None = None,
        position_ids: torch.Tensor | None = None,
        inputs_embeds: torch.Tensor | None = None,
        encoder_embeds: torch.Tensor | None = None,
        encoder_hidden_states: torch.Tensor | None = None,
        encoder_attention_mask: torch.Tensor | None = None,
        past_key_values: Cache | None = None,
        use_cache: bool | None = None,
        output_attentions: bool | None = None,
        output_hidden_states: bool | None = None,
        return_dict: bool | None = None,
        is_decoder: bool | None = False,
        cache_position: torch.Tensor | None = None,
    ) -> tuple[torch.Tensor] | BaseModelOutputWithPoolingAndCrossAttentions:
        r"""
        encoder_hidden_states (`torch.FloatTensor`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
            the model is configured as a decoder.
        encoder_attention_mask (`torch.FloatTensor`, *optional*):
            Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
            the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
        past_key_values (`Cache`, *optional*):
            Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
        use_cache (`bool`, *optional*):
            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
            `past_key_values`).
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if is_decoder:
            use_cache = use_cache if use_cache is not None else self.config.use_cache
        else:
            use_cache = False

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
            input_shape = input_ids.size()
            batch_size, seq_length = input_shape
            device = input_ids.device
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
            batch_size, seq_length = input_shape
            device = inputs_embeds.device
        elif encoder_embeds is not None:
            input_shape = encoder_embeds.size()[:-1]
            batch_size, seq_length = input_shape
            device = encoder_embeds.device
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds or encoder_embeds")

        past_key_values_length = 0 if past_key_values is None else past_key_values.get_seq_length()

        if attention_mask is None:
            attention_mask = torch.ones((batch_size, seq_length + past_key_values_length)).to(device)

        # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] ourselves,
        # in which case we just need to make it broadcastable to all heads.
        extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape, device, is_decoder)

        # If a 2D or 3D attention mask is provided for the cross-attention,
        # we need to make it broadcastable to [batch_size, num_heads, seq_length, seq_length]
        if encoder_hidden_states is not None:
            if isinstance(encoder_hidden_states, list):
                encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states[0].size()
            else:
                encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
            encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)

            if isinstance(encoder_attention_mask, list):
                encoder_extended_attention_mask = [self.invert_attention_mask(mask) for mask in encoder_attention_mask]
            elif encoder_attention_mask is None:
                encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
                encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
            else:
                encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
        else:
            encoder_extended_attention_mask = None

        if encoder_embeds is None:
            embedding_output = self.embeddings(
                input_ids=input_ids,
                position_ids=position_ids,
                inputs_embeds=inputs_embeds,
                past_key_values_length=past_key_values_length,
            )
        else:
            embedding_output = encoder_embeds

        encoder_outputs = self.encoder(
            embedding_output,
            attention_mask=extended_attention_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_extended_attention_mask,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
        )
        sequence_output = encoder_outputs[0]
        pooled_output = self.pooler(sequence_output) if self.pooler is not None else None

        if not return_dict:
            return (sequence_output, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPoolingAndCrossAttentions(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            past_key_values=encoder_outputs.past_key_values,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
            cross_attentions=encoder_outputs.cross_attentions,
        )


class BlipTextLMHeadModel(BlipTextPreTrainedModel, GenerationMixin):
    _tied_weights_keys = {
        "cls.predictions.decoder.bias": "cls.predictions.bias",
        "cls.predictions.decoder.weight": "bert.embeddings.word_embeddings.weight",
    }

    def __init__(self, config):
        super().__init__(config)

        self.bert = BlipTextModel(config, add_pooling_layer=False)
        self.cls = BlipTextOnlyMLMHead(config)
        self.label_smoothing = config.label_smoothing

        self.post_init()

    def get_input_embeddings(self):
        return self.bert.get_input_embeddings()

    def set_input_embeddings(self, new_embeddings):
        self.bert.set_input_embeddings(new_embeddings)

    def get_output_embeddings(self):
        return self.cls.predictions.decoder

    def set_output_embeddings(self, new_embeddings):
        self.cls.predictions.decoder = new_embeddings
        self.cls.predictions.bias = new_embeddings.bias

    def forward(
        self,
        input_ids: torch.Tensor | None = None,
        attention_mask: torch.Tensor | None = None,
        position_ids: torch.Tensor | None = None,
        inputs_embeds: torch.Tensor | None = None,
        encoder_hidden_states: torch.Tensor | None = None,
        encoder_attention_mask: torch.Tensor | None = None,
        labels: torch.Tensor | None = None,
        past_key_values: Cache | None = None,
        use_cache: bool | None = None,
        output_attentions: bool | None = None,
        output_hidden_states: bool | None = None,
        return_dict: bool | None = None,
        return_logits: bool | None = False,
        is_decoder: bool | None = True,
        reduction: str | None = "mean",
        cache_position: torch.Tensor | None = None,
        logits_to_keep: int | torch.Tensor = 0,
    ) -> tuple[torch.Tensor] | CausalLMOutputWithCrossAttentions:
        r"""
        encoder_hidden_states (`torch.FloatTensor`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
            the model is configured as a decoder.
        encoder_attention_mask (`torch.FloatTensor`, *optional*):
            Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
            the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
        labels (`torch.LongTensor`, *optional*):
            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
            `[-100, 0, ..., config.vocab_size]` (see the `input_ids` docstring). Tokens with indices set to `-100` are
            ignored (masked); the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        past_key_values (`Cache`, *optional*):
            Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
        use_cache (`bool`, *optional*):
            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
            `past_key_values`).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        if labels is not None:
            use_cache = False

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            is_decoder=is_decoder,
            cache_position=cache_position,
        )

        sequence_output = outputs[0]

        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
        prediction_scores = self.cls(sequence_output[:, slice_indices, :])

        if return_logits:
            return prediction_scores[:, :-1, :].contiguous()

        lm_loss = None
        if labels is not None:
            # we are doing next-token prediction; shift prediction scores and input ids by one
            shifted_prediction_scores = prediction_scores[:, :-1, :].contiguous()
            labels = labels[:, 1:].contiguous().to(shifted_prediction_scores.device)
            loss_fct = CrossEntropyLoss(reduction=reduction, label_smoothing=self.label_smoothing)
            lm_loss = loss_fct(shifted_prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
            if reduction == "none":
                lm_loss = lm_loss.view(prediction_scores.size(0), -1).sum(1)

        if not return_dict:
            output = (prediction_scores,) + outputs[2:]
            return ((lm_loss,) + output) if lm_loss is not None else output

        return CausalLMOutputWithCrossAttentions(
            loss=lm_loss,
            logits=prediction_scores,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            cross_attentions=outputs.cross_attentions,
        )

    def prepare_inputs_for_generation(self, input_ids, past_key_values=None, attention_mask=None, **model_kwargs):
        model_inputs = super().prepare_inputs_for_generation(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            **model_kwargs,
        )
        model_inputs["is_decoder"] = True
        return model_inputs


__all__ = ["BlipTextModel", "BlipTextLMHeadModel", "BlipTextPreTrainedModel"]