
    qi                        d dl Z d dlmZ d dlmZ d dlmZ d dlZd dlm	Z	 d dl
m	c mZ d dlmZ ddlmZ ddlmZ dd	lmZmZ dd
lmZ ddlmZ ddlmZmZ ddlmZ ddl m!Z! ddl"m#Z#m$Z$m%Z%m&Z&m'Z' ddl(m)Z)m*Z* ddl+m,Z,m-Z- ddl.m/Z/ ddl0m1Z1m2Z2m3Z3m4Z4m5Z5 ddl6m7Z7m8Z8 ddl9m:Z: ddl;m<Z<m=Z=  e4j|                  e?      Z@ G d de	j                        ZB G d de	j                        ZC G d dej                  j                        ZD G d de	j                        ZE G d  d!e	j                        ZG ed"       G d# d"e	j                               ZH G d$ d%e	j                        ZId&ej                  d'ej                  d(ej                  d)eKej                  ej                  f   fd*ZLd+ej                  d,eMd)ej                  fd-ZN	 d`d.e	j                  d/ej                  d0ej                  d1ej                  d2ej                  dz  d3eOd4eOfd5ZP	 d`d.e	j                  d/ej                  d0ej                  d1ej                  d2ej                  dz  d3eOd4eOfd6ZQ G d7 d8e	j                        ZR G d9 d:e!      ZSe2 G d; d<e-             ZTe2 G d= d>eT             ZU G d? d@eTe      ZVe e2dAB       G dC dDe'                    ZW G dE dFej                  j                        ZX G dG dHe	j                        ZYdI ZZ G dJ dKe	j                        Z[dLej                  d/ej                  fdMZ\d/ej                  d0ej                  dLej                  d)eKej                  ej                  f   fdNZ] G dO dPe	j                        Z^ G dQ dRe	j                        Z_ G dS dTe!      Z` G dU dVe	j                        Za G dW dXe	j                        Zb G dY dZe	j                        Zc G d[ d\eT      Zd G d] d^eTe      Zeg d_Zfy)a    N)Callable)	dataclass)Optional)Llama4VisionConfig   )initialization)ACT2FN)CacheDynamicCache)GenerationMixin)use_kernel_forward_from_hub)create_causal_maskcreate_chunked_causal_mask)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPastBaseModelOutputWithPoolingCausalLMOutputWithPastModelOutput)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tupleloggingtorch_compilable_check)maybe_autocastmerge_with_config_defaults)capture_outputs   )Llama4ConfigLlama4TextConfigc                   \     e Zd Zdef fdZdej                  dej                  fdZ xZS )Llama4TextExpertsconfigc                    t         |           |j                  | _        |j                  | _        |j
                  | _        | j                  | _        t        j                  t        j                  | j                  | j
                  d| j                  z              | _        t        j                  t        j                  | j                  | j                  | j
                  f            | _        t        |j                     | _        y N   )super__init__num_local_expertsnum_expertsintermediate_sizehidden_size
expert_dimnn	Parametertorchzerosgate_up_projempty	down_projr	   
hidden_actact_fnselfr)   	__class__s     \/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/llama4/modeling_llama4.pyr.   zLlama4TextExperts.__init__9   s    !33!'!9!9!--00LLT5E5EtGWGWYZ]a]l]lYl)mnekk43C3CT__VZVfVf2g&hiV../    hidden_statesreturnc                 v   |j                  | j                  j                  d   d| j                        }t	        j
                  || j                        }|j                  dd      \  }}t	        j
                  || j                  |      z  | j                        }|j                  d| j                        }|S )a2  
        This should really not be run on a single machine, as we are reaching compute bound:
        - the inputs are expected to be "sorted" per expert already.
        - the weights are viewed with another dim, to match num_expert, 1, shape * num_tokens, shape

        Args:
            hidden_states (torch.Tensor): (batch_size * token_num, hidden_size)
            selected_experts (torch.Tensor): (batch_size * token_num, top_k)
            routing_weights (torch.Tensor): (batch_size * token_num, top_k)
        Returns:
            torch.Tensor
        r   r,   dim)	viewr8   shaper2   r6   bmmchunkr<   r:   )r>   rB   gate_upgateupnext_statess         r@   forwardzLlama4TextExperts.forwardC   s     &**4+<+<+B+B1+Er4K[K[\))M4+<+<====+biidkk$&7!7$..I!&&r4+;+;<rA   )	__name__
__module____qualname__r&   r.   r6   TensorrP   __classcell__r?   s   @r@   r(   r(   8   s+    0/ 0U\\ ell rA   r(   c                   &     e Zd Zd fd	Zd Z xZS )Llama4TextMLPc                 f   t         |           ||j                  }|| _        t	        j
                  |j                  |d      | _        t	        j
                  |j                  |d      | _        t	        j
                  ||j                  d      | _	        t        |j                     | _        y NFbias)r-   r.   r1   r)   r4   Linearr2   	gate_projup_projr:   r	   r;   activation_fn)r>   r)   r1   r?   s      r@   r.   zLlama4TextMLP.__init__Z   s    $ & 8 86#5#57HuUyy!3!35FUS#4f6H6HuU#F$5$56rA   c                     | j                  | j                  |            | j                  |      z  }| j                  |      S N)r`   r^   r_   r:   )r>   xr:   s      r@   rP   zLlama4TextMLP.forwardf   s7    &&t~~a'89DLLOK	~~i((rA   rb   rQ   rR   rS   r.   rP   rU   rV   s   @r@   rX   rX   Y   s    
7)rA   rX   c                   8     e Zd Zddef fdZd Zd Zd Z xZS )Llama4TextL2Normepsc                 0    t         |           || _        y rb   )r-   r.   rg   )r>   rg   r?   s     r@   r.   zLlama4TextL2Norm.__init__l   s    rA   c                     |t        j                  |j                  d      j                  dd      | j                  z         z  S Nr,   rE   T)keepdimr6   rsqrtpowmeanrg   r>   rc   s     r@   _normzLlama4TextL2Norm._normp   4    5;;quuQx}}R}>IJJJrA   c                 ^    | j                  |j                               j                  |      S rb   )rq   floattype_asrp   s     r@   rP   zLlama4TextL2Norm.forwards   s"    zz!'')$,,Q//rA   c                      d| j                    S )Nzeps=rg   r>   s    r@   
extra_reprzLlama4TextL2Norm.extra_reprv   s    dhhZ  rA   )gư>)	rQ   rR   rS   rt   r.   rq   rP   ry   rU   rV   s   @r@   rf   rf   k   s    E K0!rA   rf   c                   2     e Zd Zd fd	Zd Zd Zd Z xZS )Llama4TextRMSNormc                     t         |           || _        t        j                  t        j                  |            | _        y)z<
        Llama4RMSNorm is equivalent to T5LayerNorm
        N)r-   r.   rg   r4   r5   r6   onesweight)r>   r2   rg   r?   s      r@   r.   zLlama4TextRMSNorm.__init__{   s0     	ll5::k#:;rA   c                     |t        j                  |j                  d      j                  dd      | j                  z         z  S rj   rl   rp   s     r@   rq   zLlama4TextRMSNorm._norm   rr   rA   c                 |    | j                  |j                               j                  |      }|| j                  z  S rb   )rq   rt   ru   r~   )r>   rc   outputs      r@   rP   zLlama4TextRMSNorm.forward   s0    AGGI&..q1##rA   c                 ^    t        | j                  j                         d| j                   S )Nz, eps=)tupler~   rI   rg   rx   s    r@   ry   zLlama4TextRMSNorm.extra_repr   s'    ))*+6$((<<rA   )gh㈵>)rQ   rR   rS   r.   rq   rP   ry   rU   rV   s   @r@   r{   r{   z   s    <K$=rA   r{   c                   (     e Zd Z fdZ fdZ xZS )Llama4Routerc                     t         |   |j                  |j                  d       |j                  | _        |j
                  | _        y rZ   )r-   r.   r2   r/   r0   num_experts_per_toktop_kr=   s     r@   r.   zLlama4Router.__init__   s>    ++V-E-EER!33//
rA   c                 t   t         |   |      }t        j                  || j                  d      \  }}t        j
                  |t        d            j                  d||      }t        j                  j                  j                  |j                               j                  |j                        }||fS )Nr$   rF   z-inf)r-   rP   r6   topkr   	full_likert   scatter_r4   
functionalsigmoidtodtype)r>   rB   router_logitsrouter_top_valuerouter_indicesrouter_scoresr?   s         r@   rP   zLlama4Router.forward   s    6+0::mTZZUV+W(.uV}ENNqR`brs++33M4G4G4IJMMmNaNabm++rA   rd   rV   s   @r@   r   r      s    0
, ,rA   r   Llama4TextMoec                   $     e Zd Z fdZd Z xZS )r   c                     t         |           |j                  | _        |j                  | _        |j                  | _        t        |      | _	        t        |      | _        t        |      | _        y rb   )r-   r.   r   r   r2   
hidden_dimr/   r0   r(   expertsr   routerrX   shared_expertr=   s     r@   r.   zLlama4TextMoe.__init__   s[    //
 ,,!33(0"6**62rA   c                    |j                  d| j                        }| j                  |      \  }}|j                  |j                  d   d      }||j                  dd      j                  dd      z  }| j                  |      }| j                  |      }|j                  |j                  |j                  d   d|j                  d         j                  d             ||fS )NrE   r$   r   rF   )
reshaper   r   repeatrI   	transposer   r   add_sum)r>   rB   r   r   	routed_in
routed_outouts          r@   rP   zLlama4TextMoe.forward   s    %--b$//B'+{{='A$}!(()<)<Q)?C	 7 71 = E Eb! LL	\\),
  /##M$7$7$:B
@P@PQS@TUYY^_Y`aM!!rA   rd   rV   s   @r@   r   r      s    3"rA   c                        e Zd ZU ej                  ed<   ddef fdZe	 	 	 ddedz  de	d   de
dz  ded	ef   fd
       Z ej                         ed               Z xZS )Llama4TextRotaryEmbeddinginv_freqNr)   c                    t         |           |j                  | _        |j                  | _        || _        | j
                  j                  d   | _        | j                  }| j                  dk7  rt        | j                     } || j
                  |      \  }| _
        | j                  d|d       | j                  d|j                         d       y )N	rope_typedefaultr   F)
persistentoriginal_inv_freq)r-   r.   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenr)   rope_parametersr   compute_default_rope_parametersr   attention_scalingregister_bufferclone)r>   r)   devicerope_init_fnr   r?   s        r@   r.   z"Llama4TextRotaryEmbedding.__init__   s    "("@"@$*$B$B!44[A!%!E!E>>Y&.t~~>L+7V+L($(ZeD0(..2BuUrA   r   ztorch.deviceseq_lenrC   ztorch.Tensorc                    | j                   d   }t        | dd      xs | j                  | j                  z  }d}d|t	        j
                  d|dt        j                        j                  |t        j                        |z  z  z  }||fS )	a  
        Computes the inverse frequencies according to the original RoPE implementation
        Args:
            config ([`~transformers.PreTrainedConfig`]):
                The model configuration.
            device (`torch.device`):
                The device to use for initialization of the inverse frequencies.
            seq_len (`int`, *optional*):
                The current sequence length. Unused for this type of RoPE.
        Returns:
            Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
            post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
        
rope_thetahead_dimN      ?r   r,   r   )r   r   )	r   getattrr2   num_attention_headsr6   arangeint64r   rt   )r)   r   r   baserG   attention_factorr   s          r@   r   z9Llama4TextRotaryEmbedding.compute_default_rope_parameters   s    & %%l3fj$/c63E3EIcIc3c U\\!S!5;;?BB&X]XcXcBdgjjk
 )))rA   c                 v   | j                   d d d d f   j                         j                  |j                  d   dd      }|d d d d d f   j                         }t	        |j
                  j                  t              r/|j
                  j                  dk7  r|j
                  j                  nd}t        |d      5  |j                  |j
                        |z  j                  dd      }t        j                  t        j                  |      |      }|| j                  z  }d d d        |S # 1 sw Y   S xY w)	Nr   rE   r$   mpscpuF)device_typeenabledr,   )r   rt   expandrI   
isinstancer   typestrr!   r   r   r6   polar	ones_liker   )r>   rc   position_idsinv_freq_expandedposition_ids_expandedr   freqs	freqs_ciss           r@   rP   z!Llama4TextRotaryEmbedding.forward   s    !MM$4-8>>@GGHZHZ[\H]_acde ,QaZ 8 > > @'1!((--'E!((--[`J`ahhmmfkUC 	;&))!((36KKVVWXZ[\EEOOE$:EBI!D$:$::I	;
 	;
 s   =A'D..D8rb   )NNN)rQ   rR   rS   r6   rT   __annotations__r&   r.   staticmethodr   intr   rt   r   no_gradr   rP   rU   rV   s   @r@   r   r      s    llV/ V  *.+/"* 4'*(* t* 
~u$	%	* *< U]]_
  
rA   r   xqxkr   rC   c           	      &   t        j                   | j                         j                  g | j                  d d dd       }t        j                   |j                         j                  g |j                  d d dd       }t        j
                  ||d d d d d d d f   z        j                  d      }t        j
                  ||d d d d d d d f   z        j                  d      }|j                  |       |j                  |      fS )NrE   r,   r   )r6   view_as_complexrt   r   rI   view_as_realflattenru   )r   r   r   xq_xk_xq_outxk_outs          r@   apply_rotary_embr      s    
 

 2
 2 2 IBHHSbM I2 Iq I
JC


 2
 2 2 IBHHSbM I2 Iq I
JCi1dA&> >?GGJFi1dA&> >?GGJF>>"v~~b111rA   rB   n_repc                     | j                   \  }}}}|dk(  r| S | dddddddddf   j                  |||||      } | j                  |||z  ||      S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r$   N)rI   r   r   )rB   r   batchnum_key_value_headsslenr   s         r@   	repeat_kvr     so    
 2?1D1D.Ehz!!Qa"23::5BUW\^bdlmM  (;e(CT8TTrA   modulequerykeyvalueattention_maskscalingdropoutc                    t        || j                        }t        || j                        }	t        j                  ||j	                  dd            |z  }
||
|z   }
t
        j                  j                  |
d      }
t
        j                  j                  |
|| j                        }
t        j                  |
|	      }|j	                  dd      j                         }||
fS )Nr,   r   rE   rF   ptrainingr$   )r   num_key_value_groupsr6   matmulr   r4   r   softmaxr   r   
contiguousr   r   r   r   r   r   r   kwargs
key_statesvalue_statesattn_weightsattn_outputs               r@   eager_attention_forwardr     s     3 ; ;<JUF$?$?@L<<z';';Aq'ABWLL!#n4==((2(>L==((6??([L,,|\:K''1-88:K$$rA   c                    t        || j                        }t        || j                        }	t        j                  ||j	                  dd            | j
                  dz  z  }
||
|z   }
t        j                  j                  |
d      }
t        j                  j                  |
|| j                        }
t        j                  |
|	      }|j	                  dd      j                         }||
fS )Nr,   r         rE   rF   r   r$   )r   r   r6   r   r   r   r4   r   r   r   r   r   r   s               r@   vision_eager_attention_forwardr   (  s     3 ; ;<JUF$?$?@L<<z';';Aq'ABV__VZEZZL!#n4==((2(>L==((6??([L,,|\:K''1-88:K$$rA   c                   2    e Zd ZdZdef fdZ	 	 ddej                  deej                  ej                  f   dej                  dz  de	dz  d	ej                  dz  d
ee   deej                  ej                  dz  eej                     dz  f   fdZ xZS )Llama4TextAttentionz=Multi-headed attention from 'Attention Is All You Need' paperr)   c                    t         |           || _        || _        t	        |d|j
                  |j                  z        | _        |j                  | _        |j                  |j                  z  | _	        |j                  | _        | j                  dz  | _
        |j                  | _        |j                  | _        |j                  | _        |j                  | _        d| _        |j                   |   | _        t%        j&                  |j
                  |j                  | j                  z  |j(                        | _        t%        j&                  |j
                  |j                  | j                  z  |j(                        | _        t%        j&                  |j
                  |j                  | j                  z  |j(                        | _        t%        j&                  |j                  | j                  z  |j
                  |j(                        | _        | j                  j2                  r(| j"                  rt5        |j6                        | _        y y y )Nr   r   Tr[   )r-   r.   r)   	layer_idxr   r2   r   r   r   r   r   
attn_scalefloor_scaleattn_temperature_tuningattention_dropout	is_causalno_rope_layersuse_roper4   r]   attention_biasq_projk_projv_projo_projuse_qk_normrf   rms_norm_epsqk_normr>   r)   r  r?   s      r@   r.   zLlama4TextAttention.__init__D  s   "
F4F4F&JdJd4de#)#=#= $*$>$>&B\B\$\!#)#=#= }}d* ++!--'-'E'E$!'!9!9--i8ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii&&68J8JQWQfQf
 ;;""t}}+F,?,?@DL (5"rA   NrB   position_embeddingsr   past_key_valuescache_positionr   rC   c                 H   |j                   d d }g |d| j                  }| j                  |      j                  |      }	 | j	                  |      j                  g |d| j                   }
| j                  |      j                  |      j                  dd      }| j                  r)t        |	|
|j                  |	j                              \  }	}
t        | d      r"| j                  |	      }	| j                  |
      }
| j                  r| j                  st        j                  t        j                   |j#                         dz   | j$                  z              | j&                  z  dz   }|j                  d|d   ddf      j)                  g |dd      }|	|z  j                  |	j*                        }	|	j                  dd      }	|
j                  dd      }
|%d|i}|j-                  |
|| j.                  |      \  }
}t1        j2                  | j4                  j6                  t8              } || |	|
||f| j:                  sdn| j<                  | j>                  d|\  }} |j@                  g |d jC                         }| jE                  |      }||fS )	NrE   r$   r,   r  r   r          )r   r   )#rI   r   r  rH   r  r  r   r  r   r   r   hasattrr  r  r6   log1pfloorrt   r  r  r   r   updater  r   get_interfacer)   _attn_implementationr   r   r  r   r   r   r  )r>   rB   r  r   r  r  r   input_shapehidden_shapequery_statesr   r   attn_scalescache_kwargsattention_interfacer   r   s                    r@   rP   zLlama4TextAttention.forwardb  s    $))#2.88b8$--8{{=166|D4T[[/44UkU2Ut}}U
{{=166|DNNqRST=='7j*=*@*@ATAT*U($L* 4#<<5Lj1J ''EKK)=)=)?#)EIYIY(YZ[^b^m^mmpss  &**A{21+EFMMNbP[Nb]^Nb`aNbcK(;6::<;M;MNL#--a3))!Q/
&,n=L'6'='=j,X\XfXfht'u$J(?(M(MKK,,.E)
 %8	%
  $}}C$2H2HLL	%
 	%
!\ *k));;;;FFHkk+.L((rA   NN)rQ   rR   rS   __doc__r&   r.   r6   rT   r   r
   
LongTensorr   r   rP   rU   rV   s   @r@   r  r  A  s    GA/ AF )-269)||9) #5<<#=>9) t+	9)
 9) ((4/9) -.9) 
u||U\\D0%2E2LL	M9)rA   r  c                   Z    e Zd Z fdZ	 	 	 	 	 	 ddej
                  dej
                  dz  dej                  dz  dedz  dedz  dej                  dz  d	e	ej
                  ej
                  f   dz  d
e
e   de	ej                  e	ej                  ej                  f   dz  f   fdZ xZS )Llama4TextDecoderLayerc                    t         |           |j                  | _        || _        |j                  |   | _        t        ||      | _        ||j                  v | _	        | j                  rt        |      | _        nt        ||j                        | _        t        |j                  |j                        | _        t        |j                  |j                        | _        y )N)r1   rw   )r-   r.   r2   r  layer_typesattention_typer  	self_attn
moe_layersis_moe_layerr   feed_forwardrX   intermediate_size_mlpr{   r  input_layernormpost_attention_layernormr  s      r@   r.   zLlama4TextDecoderLayer.__init__  s    !--"$00;,VY?%):):: -f 5D -fHdHd eD01C1CI\I\](9&:L:LRXReRe(f%rA   NrB   r   r   r  	use_cacher  r  r   rC   c           
         |}	| j                  |      } | j                  d||||||d|\  }
}|	|
z   }|}	| j                  |      }| j                  |      }| j                  r|\  }}|	|j                  |	j                        z   }|S )N)rB   r  r   r  r5  r   )r3  r.  r4  r1  r0  rH   rI   )r>   rB   r   r   r  r5  r  r  r   residualattention_states_s               r@   rP   zLlama4TextDecoderLayer.forward  s     !,,]; -dnn 
' 3)+)
 
! !#33 !55mD))-8,M1 =#5#5hnn#EErA   )NNNFNN)rQ   rR   rS   r.   r6   rT   r(  r
   boolr   r   r   FloatTensorrP   rU   rV   s   @r@   r*  r*    s    g$ /304(,!&26HL"||" t+" &&-	"
 " $;" ((4/" #5<<#=>E" -." 
u  %(9(95;L;L(L"MPT"TT	U"rA   r*  c                   t     e Zd ZU eed<   dZdZdgZdZdZ	dZ
dZdZ ej                          fd       Z xZS )Llama4PreTrainedModelr)   )imagetextTr  Fc                 4   t         |   |       t        | j                  d      r| j                  j                  n| j                  j
                  j                  }t        |t              rEt        j                  |j                  d|       t        j                  |j                  d|       y t        |t              rWt        j                  |j                  |j                         t        j                  |j                  |j                         y y )Ninitializer_ranger  )ro   std)rC  )r-   _init_weightsr  r)   rB  text_configr   r(   initnormal_r8   r:   Llama4VisionModelclass_embeddingscalepositional_embedding_vlm)r>   r   rC  r?   s      r@   rD  z#Llama4PreTrainedModel._init_weights  s    f% t{{$78 KK))((:: 	
 f/0LL,,3C@LL))= 12LL//V\\BLL88fllK 3rA   )rQ   rR   rS   r%   r   input_modalitiessupports_gradient_checkpointing_skip_keys_device_placement_supports_flash_attn_supports_sdpa_supports_flex_attn_can_compile_fullgraph_supports_attention_backendr6   r   rD  rU   rV   s   @r@   r>  r>    sW    (&*##4"5 N!"&U]]_L LrA   r>  c                   H    e Zd ZU dgZdZdZeed<   ee	e
dZdef fdZeeee	 	 	 	 	 	 	 ddej$                  dz  d	ej&                  dz  d
ej$                  dz  dedz  dej*                  dz  dedz  dej$                  dz  dee   deez  fd                            Z xZS )Llama4TextModelr*  model)r@  r)   )
attentionsrB   r   c           	         t         |   |       |j                  | _        |j                  | _        t        j                  |j                  |j                  | j                        | _        t        j                  t        |j                        D cg c]  }t        ||       c}      | _        t        |j                  |j                        | _        t#        |      | _        d| _        | j)                          y c c}w )Nrw   r)   F)r-   r.   pad_token_idpadding_idx
vocab_sizer4   	Embeddingr2   embed_tokens
ModuleListrangenum_hidden_layersr*  layersr{   r  normr   
rotary_embgradient_checkpointing	post_initr  s      r@   r.   zLlama4TextModel.__init__  s     !.. ++LL):):F<N<NPTP`P`ammHMfNfNfHgh9#FI6h
 &f&8&8f>Q>QR	36B&+# 	 is   DN	input_idsr   r   r  inputs_embedsr5  r  r   rC   c                    |d u |d uz  rt        d      |>| j                  |j                  | j                  j                  j                              }|r|t        | j                        }|F||j                         nd}	t        j                  |	|	|j                  d   z   |j                        }||j                  d      }t        |x}
t              s*| j                  |||||d}t        d
i |t        d
i |d}
|}| j!                  ||      }| j"                  d | j                  j$                   D ]  } ||f|
|j&                     |||||d|}! | j)                  |      }t+        ||r|	      S d 	      S )N:You must specify exactly one of input_ids or inputs_embedsrY  r   r$   )r   )r)   rh  r   r  r  r   )full_attentionchunked_attention)r   r   r  r5  r  r  )last_hidden_stater  r7  )
ValueErrorr^  r   r~   r   r   r)   get_seq_lengthr6   r   rI   	unsqueezer   dictr   r   rd  rb  ra  r-  rc  r   )r>   rg  r   r   r  rh  r5  r  r   past_seen_tokenscausal_mask_mappingmask_kwargsrB   freq_cisdecoder_layers                  r@   rP   zLlama4TextModel.forward  s    -t";<YZZ  --ill4;L;L;S;S;Z;Z.[\M0*$++>O!CRC^==?de"\\ "2]5H5H5K"KTaThThN )33A6L ?-F ++!."0"0#2 ,K #5"C{"C%?%N+%N#
 & ??=,?![[)H4;;+H+HI 
	M)	2=3O3OP) /#-$,	 	M
	 		-0&+/8O
 	
>B
 	
rA   )NNNNNNN)rQ   rR   rS   _no_split_modulesbase_model_prefixrL  r&   r   r  r*  r   _can_record_outputsr.   r   r"   r#   r   r6   r(  rT   r
   r<  r;  r   r   r   r   rP   rU   rV   s   @r@   rU  rU    s+   12 )/&/    .2.204(,26!%26C
##d*C
 t+C
 &&-	C

 C
 ((4/C
 $;C
 ((4/C
 +,C
 
(	(C
     C
rA   rU  c                   t    e Zd ZU dgZdZddiZddiZeed<   def fdZ	e
e	 	 	 	 	 	 	 	 	 dd
ej                  d	z  dej                  d	z  dej                  d	z  ded	z  dej                   d	z  dej                  d	z  ded	z  dej                  d	z  deej                  z  dee   deez  fd              Z xZS )Llama4ForCausalLMr*  language_modelzlm_head.weightzmodel.embed_tokens.weightlm_headcolwise_gather_outputr)   c                     t         |   |       t        |      | _        |j                  | _        t        j                  |j                  |j                  d      | _        | j                          y rZ   )
r-   r.   rU  rV  r\  r4   r]   r2   r}  rf  r=   s     r@   r.   zLlama4ForCausalLM.__init__]  sU     $V,
 ++yy!3!3V5F5FUS 	rA   Nrg  r   r   r  rh  labelsr5  r  logits_to_keepr   rC   c
                 l    | j                   d|||||||d|
}|d   }t        |	t              rt        |	 d      n|	}| j	                  |dd|ddf         }d}|* | j
                  d||| j                  j                  d|
}t        |||j                  |j                  |j                        S )az  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, Llama4ForCausalLM

        >>> model = Llama4ForCausalLM.from_pretrained("meta-llama4/Llama4-2-7b-hf")
        >>> tokenizer = AutoTokenizer.from_pretrained("meta-llama4/Llama4-2-7b-hf")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```)rg  r   r   r  rh  r5  r  r   N)logitsr  r\  )lossr  r  rB   rW  r7  )rV  r   r   slicer}  loss_functionr)   r\  r   r  rB   rW  )r>   rg  r   r   r  rh  r  r5  r  r  r   outputsrB   slice_indicesr  r  s                   r@   rP   zLlama4ForCausalLM.forwardf  s    J $** 	
)%+')	
 	
  
8B>SV8W~ot4]kmA}a,?@A%4%%pVFt{{OeOepiopD%#33!//))
 	
rA   )	NNNNNNNNr   )rQ   rR   rS   rw  rx  _tied_weights_keys_tp_planr&   r   r.   r   r   r6   r(  rT   r
   r<  r;  r   r   r   r   r   rP   rU   rV   s   @r@   r{  r{  V  s>   12(*,GH23H/   .2.204(,26*.!%26-.<
##d*<
 t+<
 &&-	<

 <
 ((4/<
   4'<
 $;<
 ((4/<
 ell*<
 +,<
 
'	'<
  <
rA   r{  zQ
    Base class for Llava causal language model (or autoregressive) outputs.
    custom_introc                       e Zd ZU dZdZej                  dz  ed<   dZej                  dz  ed<   dZ	e
dz  ed<   dZeej                     dz  ed<   dZeej                     dz  ed<   dZej                  dz  ed<   y)	Llama4CausalLMOutputWithPasta3  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size (batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    Nr  r  r  rB   rW  image_hidden_states)rQ   rR   rS   r'  r  r6   r<  r   r  r  r
   rB   r   rW  r  r7  rA   r@   r  r    s     &*D%

d
")'+FE$+$(OUT\(59M5**+d2926Je''(4/648**T18rA   r  c                   $     e Zd Z fdZd Z xZS )Llama4VisionMLP2c                 ~   t         |           |j                  | _        |j                  | _        t	        j
                  | j                  |j                  d      | _        t	        j
                  |j                  |j                  d      | _	        t	        j                         | _        |j                  | _        y rZ   )r-   r.   r2   r1   r4   r]   projector_input_dimfc1projector_output_dimfc2GELUr`   projector_dropoutr   r=   s     r@   r.   zLlama4VisionMLP2.__init__  s    !--!'!9!999T33V5O5OV[\99V88&:U:U\abWWY//rA   c                     | j                  |      }| j                  |      }t        j                  || j                  | j                        }| j                  | j                  |            S )Nr   )r  r`   Fr   r   r  r>   rB   s     r@   rP   zLlama4VisionMLP2.forward  sT    /**=9		-4<<$--X!!$((="9::rA   rd   rV   s   @r@   r  r    s    0;rA   r  c                   $     e Zd Z fdZd Z xZS )Llama4MultiModalProjectorc                     t         |           t        j                  |j                  j
                  |j                  j                  d      | _        y rZ   )	r-   r.   r4   r]   vision_configvision_output_dimrE  r2   linear_1r=   s     r@   r.   z"Llama4MultiModalProjector.__init__  s?    		  22**
rA   c                 (    | j                  |      }|S rb   )r  )r>   image_featuresrB   s      r@   rP   z!Llama4MultiModalProjector.forward  s    n5rA   rd   rV   s   @r@   r  r    s    
rA   r  c           
      J   | j                   \  }}}t        t        j                  |            }| j	                  |||d      } | j                         \  }}}}| j	                  ||t        ||z        t        ||z              }|j                  dddd      j                         }|j	                  |t        ||z        t        ||z        t        ||dz  z              }|j                  dddd      j                         }|j	                  |d|j                   d         }	|	S )NrE   r   r,   r$   r   )rI   r   mathsqrtrH   sizepermuter   )
input_tensorshuffle_ratio
batch_sizenum_patcheschannels
patch_sizeheightwidthreshaped_tensoroutput_tensors
             r@   pixel_shuffler    s%   (4(:(:%JXTYY{+,J$$ZZLL*6*;*;*='Jx"''
FC@U<VX[\dgt\tXuvO%--aAq9DDFO%**C./U]5J1KSQY]jlm]mQnMoO &--aAq9DDFO#((R9N9Nr9RSMrA   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )Llama4VisionPixelShuffleMLPc                     t         |           |j                  | _        t        |j                  | j                  dz  z        | _        |j                  | _        t        |      | _	        y r+   )
r-   r.   pixel_shuffle_ratior   r  	inner_dimr  
output_dimr  mlpr=   s     r@   r.   z$Llama4VisionPixelShuffleMLP.__init__  sX    #)#=#= V77D<T<TVW<WXY 55#F+rA   encoded_patchesrC   c                 P    t        || j                        }| j                  |      S rb   )r  r  r  )r>   r  s     r@   rP   z#Llama4VisionPixelShuffleMLP.forward   s#    '9Q9QRxx((rA   rQ   rR   rS   r.   r6   rT   rP   rU   rV   s   @r@   r  r    s#    ,)u|| ) )rA   r  freqs_cic                     |j                   }t        |j                        D cg c]  \  }}|dk(  s||dz
  k(  r|nd }}} | j                  | S c c}}w )Nr$   )ndim	enumeraterI   rH   )r  r   r  idrI   s         r@   reshape_for_broadcastr    sW    ::D=Fu{{=STTQ!q&AMQq0TET8==%   Us   Ac                 B   t        j                   | j                         j                  g | j                  d d dd       }t        j                   |j                         j                  g |j                  d d dd       }t        ||      }|j                  |j                        }t        j                  ||z        j                  d      }t        j                  ||z        j                  d      }|j                  |       |j                  |      fS )NrE   r,   )r  r   r   )r6   r   rt   r   rI   r  r   r   r   r   ru   )r   r   r  query_key_	query_outkey_outs          r@   vision_apply_rotary_embr    s    
 ""#85;;=#8#8#R%++cr:J#RB#RPQ#RSF  !4!4!4!Lciin!Lb!L!!LMD$hfEH{{6==)H""6H#45==a@I  199!<GU#W__S%999rA   c                        e Zd Zdef fdZ	 	 ddej                  dej                  dej                  dz  dedz  dee	   d	e
ej                  ej                  dz  e
ej                     dz  f   fd
Z xZS )Llama4VisionAttentionr)   c                    t         |           || _        |j                  | _        |j
                  | _        |j                  |j
                  z  | _        d| _        |j                  | _	        | j                  dz  | _
        t        j                  | j                  | j                  | j                  z  d      | _        t        j                  | j                  | j                  | j                  z  d      | _        t        j                  | j                  | j                  | j                  z  d      | _        t        j                  | j                  | j                  z  | j                  d      | _        y )Nr$   r   Tr[   )r-   r.   r)   r2   	embed_dimr   	num_headsr   r   r  r   r4   r]   r  r  r  r  r=   s     r@   r.   zLlama4VisionAttention.__init__  s   ++33**f.H.HH$%!!'!9!9}}d*ii0NUYZii0NUYZii0NUYZii >UYZrA   NrB   r  r   r  r   rC   c                    |j                   d d }g |d| j                  }| j                  |      j                  |      }| j	                  |      j                  |      }	| j                  |      j                  |      }
t        ||	|      \  }}	|j                  dd      }|	j                  dd      }	|
j                  dd      }
t        j                  | j                  j                  t              } || ||	|
d f| j                  sdn| j                  d dd|\  }} |j                  g |d j!                         }| j#                  |      }||fS )NrE   )r  r$   r,   r  F)r   r   r	  )rI   r   r  rH   r  r  r  r   r   r  r)   r  r   r   r  r   r   r  )r>   rB   r  r   r  r   r   r!  r"  r   r   r%  r   r   s                 r@   rP   zLlama4VisionAttention.forward*  sj    $))#2.88b8$--8{{=166|D[[/44\B
{{=166|D#:<^f#g j#--a3))!Q/
#--a3(?(M(MKK,,.L)
 %8
%
  $}}C$2H2H
%
 
%
!\ *k));;;;FFHkk+.L((rA   r&  )rQ   rR   rS   r   r.   r6   rT   r
   r   r   r   rP   rU   rV   s   @r@   r  r    s    [1 [& /3(,')||') ,,') t+	')
 ') -.') 
u||U\\D0%2E2LL	M')rA   r  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )Llama4VisionMLPc                 &   t         |           || _        t        j                         | _        t        j                  |j                  |j                  d      | _	        t        j                  |j                  |j                  d      | _
        y )NTr[   )r-   r.   r)   r4   r  r`   r]   r2   r1   r  r  r=   s     r@   r.   zLlama4VisionMLP.__init__U  se    WWY99V//1I1IPTU99V55v7I7IPTUrA   rB   rC   c                 l    | j                  |      }| j                  |      }| j                  |      }|S rb   )r  r`   r  r  s     r@   rP   zLlama4VisionMLP.forward\  s4    /**=9/rA   r  rV   s   @r@   r  r  T  s$    VU\\ ell rA   r  c            
            e Zd Zdef fdZ	 	 d	dej                  dej                  dej                  dz  dedz  fdZ xZ	S )
Llama4VisionEncoderLayerr)   c                    t         |           |j                  | _        t        |      | _        t        |      | _        t        j                  |j                        | _	        t        j                  |j                        | _
        y rb   )r-   r.   r2   r  r.  r  r  r4   	LayerNormr3  r4  r=   s     r@   r.   z!Llama4VisionEncoderLayer.__init__d  sb    !--.v6"6*!||F,>,>?(*V5G5G(H%rA   Nhidden_stater  r   output_attentionsc                     |}| j                  |      }| j                  |||      \  }}||z   }|}| j                  |      }| j                  |      }||z   }|f}|r||fz  }|S )N)r  r   )r3  r.  r4  r  )r>   r  r  r   r  r8  r   r  s           r@   rP   z Llama4VisionEncoderLayer.forwardn  s      ++L9%)^^) &4 &
"l
  ,.  44\Bxx-,./&GrA   r&  )
rQ   rR   rS   r   r.   r6   rT   r;  rP   rU   rV   s   @r@   r  r  c  sZ    I1 I /3)-ll ,, t+	
  $;rA   r  c                        e Zd ZdZdef fdZ	 	 	 	 ddej                  dej                  dej                  dz  dedz  d	edz  d
edz  de	e
z  fdZ xZS )Llama4VisionEncoderz
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`Llama4VisionEncoderLayer`].

    Args:
        config: Llama4VisionConfig
    r)   c                     t         |           || _        t        j                  t        |j                        D cg c]  }t        |       c}      | _        d| _	        || _        y c c}w )NF)
r-   r.   r)   r4   r_  r`  ra  r  rb  re  )r>   r)   r:  r?   s      r@   r.   zLlama4VisionEncoder.__init__  sW    mmuU[UmUmOn$o!%=f%E$op&+# %ps   A*NrB   r  r   r  output_hidden_statesreturn_dictrC   c                 z   ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }|rdnd}|rdnd}| j                  D ]&  }	|r||fz   } |	||||      }
|r	||
d   fz   }|
d   }( |r||fz   }|st        d |||fD              S t        |||      S )ad  
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        Nr7  )r  r   r  r  r$   r   c              3   &   K   | ]	  }||  y wrb   r7  .0vs     r@   	<genexpr>z.Llama4VisionEncoder.forward.<locals>.<genexpr>  s     eqWXWde   rm  rB   rW  )r)   r  r  use_return_dictrb  r   r   )r>   rB   r  r   r  r  r  encoder_statesall_attentionsencoder_layerlayer_outputss              r@   rP   zLlama4VisionEncoder.forward  s    > 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]30d![[ 	-M#!/=2B!B)*-"3!	M !!/=3C2E!E)!,M	-   +}.>>Ne]NN$Seee+>Vd
 	
rA   NNNN)rQ   rR   rS   r'  r   r.   r6   rT   r;  r   r   rP   rU   rV   s   @r@   r  r    s    1  /3)-,0#'?
||?
 ,,?
 t+	?

  $;?
 #Tk?
 D[?
 
	 ?
rA   r  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )Llama4UnfoldConvolutionc                 <   t         |           |j                  }t        |t              r||f}t
        j                  j                  ||j                        | _        t        j                  |j                  |d   z  |d   z  |j                  d      | _        y )N)kernel_sizestrider   r$   Fr[   )r-   r.   r  r   r   r6   r4   Unfoldunfoldr]   num_channelsr2   linear)r>   r)   r  r?   s      r@   r.   z Llama4UnfoldConvolution.__init__  s    ''k3'&4Khhoo+fFWFWoXii+a.0;q>A
rA   rB   rC   c                 p    | j                  |      }|j                  ddd      }| j                  |      }|S )Nr   r,   r$   )r  r  r  r  s     r@   rP   zLlama4UnfoldConvolution.forward  s8    M2%--aA6M2rA   r  rV   s   @r@   r  r    s#    

U\\ ell rA   r  c                   *     e Zd Zdef fdZd Z xZS )Llama4VisionRotaryEmbeddingr)   c                    t         |           |j                  |j                  z  }t	        j
                  |dz  t        j                        j                  |dz  d      }t	        j                  ||d d gd      }d|d<   ||z  }||z  }|j                  |j                  z  dz  }d|j                  d	   t	        j
                  d|d      d |dz   j                         |z  z  z  }|dz   d
   |d d d d f   z  j                  dd      }|dz   d
   |d d d d f   z  j                  dd      }	t	        j                  ||	gd      j                         j                         dd d df   }
|
j                  |j                  ddd      dk  d      }
t	        j                   t	        j"                  t	        j$                  |
      t	        j&                  |
      gd            }|| _        y )Nr,   r   r$   r   rF   )rE   rE   r   r   ).NrE   .)r-   r.   
image_sizer  r6   r   int32r   catr2   r   r   rt   repeat_interleaver   masked_fillr   stackcossinr  )r>   r)   idximg_idxfrequencies_xfrequencies_yfreq_dim	rope_freqfreqs_xfreqs_yr   ru  r?   s               r@   r.   z$Llama4VisionRotaryEmbedding.__init__  s   6#4#44,,sAvU[[9AA#q&!L))Wgbqk2:#3%%)C)CCqH""<0Q!,->A?EEG(RT
	 "A%y1IdD!m4LL__`agi_j!A%y1IdD!m4LL__`agi_j		7G,"5;;=HHJ3PSRSPS8T!!'//"a";a"?C((eii6F		RWHX5Y_a)bc rA   c                 L    | j                   j                  |j                        S rb   )r  r   r   r  s     r@   rP   z#Llama4VisionRotaryEmbedding.forward
  s    }} 4 455rA   )rQ   rR   rS   r   r.   rP   rU   rV   s   @r@   r  r    s    !1 !(6rA   r  c                        e Zd ZU dZdZdgZeed<   def fdZd Z		 	 	 	 dde
j                  d	e
j                  dz  d
edz  dedz  dedz  deee
j                  df   z  fdZ xZS )rH  vision_model)r?  r  r)   c                 r   t         |   |       |j                  | _        |j                  | _        |j                  | _        |j
                  | _        | j                  | j                  z  dz  dz   | _        |j                  dz  | _        t        |      | _	        t        j                  | j                  t        j                  | j                        z        | _        t        j                  | j                  t        j                  | j                  | j                        z        | _        t!        |      | _        t        j$                  | j                        | _        t        j$                  | j                        | _        t+        |      | _        t/        |      | _        | j3                          y )Nr,   r$   r   )r-   r.   r  r  r2   r  r  rJ  r  patch_embeddingr4   r5   r6   randnrI  rK  r  rotary_embeddingr  layernorm_prelayernorm_postr  rV  r  vision_adapterrf  r=   s     r@   r.   zLlama4VisionModel.__init__  sA     ++ ++!--"// OOt>1DqH''-
6v>!||DJJTEUEU9V,VW(*TZZ%++dN^N^`d`p`pBq5q(r% ;F C  \\$*:*:; ll4+;+;< )0
9&ArA   c                     | j                   S )zg
        This function is used to fetch the first embedding layer to activate grads on inputs.
        )r  rx   s    r@   get_input_embeddingsz&Llama4VisionModel.get_input_embeddings-  s     ###rA   Npixel_valuesr   r  r  r  rC   .c                    ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }|j                  \  }}}	}
d}d}| j                  |      }|j                  \  }}}|j                  ||z  |z  ||      }| j                  j                  |j                  d   d|j                  d         }t        j                  ||gd      }|dz  }|j                  ||z  |||      }| j                  j                  |j                  |j                        }||z   }| j                  |      }|j!                  |d|      }| j#                  |      }| j%                  |d|||      }|j&                  }| j)                  |      }|ddddddf   }| j+                  |      }|r|j,                  nd}|r|d   }nd}|st/        d	 |||fD              S t1        |||
      S )a  

        Example:

        ```python
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import AutoProcessor, MllamaVisionModel

        >>> checkpoint = "meta-llama/Llama-3.2-11B-Vision"
        >>> model = MllamaVisionModel.from_pretrained(checkpoint)
        >>> processor = AutoProcessor.from_pretrained(checkpoint)

        >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))
        >>> inputs = processor(images=image, return_tensors="pt")

        >>> output = model(**inputs)

        >>> print(output.last_hidden_state.shape)
        torch.Size([1, 1, 4, 1025, 7680])
        ```
        Nr$   r   rE   rF   r   r   )r   r  r  r  r,   c              3   &   K   | ]	  }||  y wrb   r7  r  s     r@   r  z,Llama4VisionModel.forward.<locals>.<genexpr>  s     _qQRQ^_r  r  )r)   r  r  r  rI   r  r   rI  r   r6   r  rK  r   r   r   r  rH   r  rV  rm  r  r  rB   r   r   )r>   r  r   r  r  r  r   batch_size_times_num_tilesr  r  r  num_concurrent_media
num_chunksr  r:  r  r   rI  positional_embeddingr  r   rB   rW  s                          r@   rP   zLlama4VisionModel.forward3  sT   D 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B] COBTBT?"L&% 
++L9%1%7%7";
 $++&)==
JKYc
 ..55l6H6H6KQP\PbPbcePfgyy,!@aHq $++&)==z;Xb
  $<<??lFXFXamatat?u#&::)),7#(()CRT((6!5/  
 //**<8#AssAI. **<80D,,$JJ_\=*$M___)*'!
 	
rA   r  )rQ   rR   rS   rx  rL  rw  r   r   r.   r  r6   rT   r;  r   r   rP   rU   rV   s   @r@   rH  rH    s    &!341 2$ /3)-,0#'b
llb
 t+b
  $;	b

 #Tkb
 D[b
 
$eELL#,=&>	>b
rA   rH  c            %           e Zd ZU ddgZi ZdZeed<   def fdZd Z	d Z
d Zd	 Zd
 Zd Ze ed       ed      dej&                  dedee   deez  fd                     Zdej4                  dej&                  dej&                  fdZe ed      e	 	 	 	 	 	 	 	 	 	 	 	 	 	 d&dej4                  dz  dej&                  dz  dej8                  dz  dej4                  dz  dedz  dej&                  dz  dedz  dej4                  dz  dedz  dedz  d edz  d!edz  d"ej4                  dz  d#eej8                  z  dee   dee z  f d$                     Z!	 	 	 	 	 	 	 d'd%Z" xZ#S )(Llama4ForConditionalGenerationr*  r  rV  r)   c                    t         |   |       t        |j                        | _        t        |      | _        t        |j                        | _	        |j                  j                  | _
        t        | j                  d      r| j                  j                  | _        n)| j                  j                  j                  xs d| _        | j                          y )NrZ  rE   )r-   r.   rH  r  r  r  multi_modal_projectorr{  rE  r|  r\  r  r)   rZ  rf  r=   s     r@   r.   z'Llama4ForConditionalGeneration.__init__  s     -f.B.BC%>v%F"/0B0BC ,,774;;/ $ 8 8D $ 7 7 D D JDrA   c                 6    | j                   j                         S rb   )r|  r  rx   s    r@   r  z3Llama4ForConditionalGeneration.get_input_embeddings  s    ""7799rA   c                 :    | j                   j                  |       y rb   )r|  set_input_embeddings)r>   r   s     r@   r'  z3Llama4ForConditionalGeneration.set_input_embeddings  s    007rA   c                 6    | j                   j                         S rb   )r|  get_output_embeddingsrx   s    r@   r)  z4Llama4ForConditionalGeneration.get_output_embeddings  s    ""88::rA   c                 :    | j                   j                  |       y rb   )r|  set_output_embeddings)r>   new_embeddingss     r@   r+  z4Llama4ForConditionalGeneration.set_output_embeddings  s    11.ArA   c                 :    | j                   j                  |       y rb   )r|  set_decoder)r>   decoders     r@   r.  z*Llama4ForConditionalGeneration.set_decoder  s    ''0rA   c                 6    | j                   j                         S rb   )r|  get_decoderrx   s    r@   r1  z*Llama4ForConditionalGeneration.get_decoder  s    ""..00rA   F)tie_last_hidden_stateszOObtains image last hidden states from the vision tower and apply al projection.r  r  vision_feature_select_strategyr   rC   c                     |j                         D ci c]  \  }}|	|| }}} | j                  |fi |S c c}}w )aj  
        pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`)
            The tensors corresponding to the input images.
        vision_feature_select_strategy (`str`):
            The feature selection strategy used to select the vision feature from the vision backbone.
            Can be one of `"default"` or `"full"`
        )itemsr  )r>   r  r3  r   kr  s         r@   get_image_featuresz1Llama4ForConditionalGeneration.get_image_features  sG      $*<<>C41aQ]!Q$CC t  888 Ds   
::rg  rh  r  c                 *   |m| | j                         t        j                  | j                  j                  t        j
                  |j                              k(  }|j                  d      }n|| j                  j                  k(  }|j                         }|j                  d      j                  |      j                  |j                        }t        ||   j                         |j                         k(  d| d|j                  d           |S )z
        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
        equal to the length of multimodal features. If the lengths are different, an error is raised.
        r  rE   z6Image features and image tokens do not match, tokens: z, features: r   )r  r6   tensorr)   image_token_idlongr   allr   rp  	expand_asr   r    numelrI   )r>   rg  rh  r  special_image_maskn_image_tokenss         r@   get_placeholder_maskz3Llama4ForConditionalGeneration.get_placeholder_mask  s    !.2M$2K2K2MT[[77uzzR_RfRfg3 " "4!7!7!;!*dkk.H.H!H+//1/99"=GGVYYZgZnZno,-3359M9M9OOD^DTT`aoauauvwax`yz	
 "!rA   Nr   r   r  r  r5  r  r  r  r  r  c                    |
|
n| j                   j                  }
||n| j                   j                  }||n| j                   j                  }|du |duz  rt	        d      ||t	        d      | | j                         |      }|| j                  ||d      j                  }|j                  d|j                  d            }| j                  |      j                  |j                  |j                        }| j                  |||      }|j                  ||      } | j                   d|||||	|
||||d
|}|d	   }d}|<||dd|j"                  d
   d
z
   df   j                  |j                        }|dddddf   |j                  |j                        d	k7     j%                         }|dd
df   |j                  |j                        d	k7     j%                         }n1|dddddf   j%                         }|dd
df   j%                         }t'        j(                         } ||j                  d|j                  d            |j                  d      j                  |j                              }|s|f|d
d z   }||f|z   S |S t+        |||j,                  |j.                  |j0                  |      S d      S )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import AutoProcessor, LlavaForConditionalGeneration

        >>> model = LlavaForConditionalGeneration.from_pretrained("llava-hf/llava-1.5-7b-hf")
        >>> processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf")

        >>> prompt = "USER: <image>\nWhat's the content of the image? ASSISTANT:"
        >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> inputs = processor(images=image, text=prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(**inputs, max_new_tokens=15)
        >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "USER:  \nWhat's the content of the image? ASSISTANT: The image features a busy city street with a stop sign prominently displayed"
        ```Nrj  zdYou cannot specify both pixel_values and inputs_embeds at the same time, and must specify either oneT)r  r3  r  rE   )rh  r  )
r   r   r  rh  r5  r  r  r  r  r  r   r$   .)r  r  r  rB   rW  r  r7  )r)   r  r  r  rn  r  r7  rm  rH   r  r$  r   r   r   rA  masked_scatterr|  rI   r   r4   CrossEntropyLossr  r  rB   rW  )r>   rg  r  r   r   r  rh  r3  r  r5  r  r  r  r  r  r   r  vision_flatprojected_vision_flatr?  r  r  r  shift_attention_maskshift_logitsshift_labelsloss_fctr   s                               r@   rP   z&Llama4ForConditionalGeneration.forward  sT   f 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]-t";<YZZ#(Av   7D557	BM#!44)/M  5   	  )--b.2E2Eb2IJK$($>$>{$K$N$N$$m&9&9%! "&!:!:G\ "; " *889KMbcM%$%% 
)%+'/!5#))
 
 ) (6a6<<?Q;N9O9Q6Q'R'U'UV\VcVc'd$%c3B3k23G3J3J6==3Y]^3^_jjl%c12g/C/F/Fv}}/UYZ/Z[ffh%c3B3k2==?%c12g99;**,H!!"l&7&7&;<l>O>OPR>S>V>VWcWjWj>kD Y,F'+'7D7V#CVC+#33!//))2>2J
 	
 QU
 	
rA   c	           
           | j                   j                  |f||||||d|	}
|s|	j                  dd      s||
d<   |
S )N)r  rh  r   r  r  is_first_iterationr5  Tr  )r|  prepare_inputs_for_generationget)r>   rg  r  rh  r  r   r  r  rL  r   model_inputss              r@   rM  z<Llama4ForConditionalGeneration.prepare_inputs_for_generationk  se     It**HH	
+')))1	
 	
 VZZT%B
 ,8L(rA   )NNNNNNNNNNNNNr   )NNNNNNF)$rQ   rR   rS   rw  r  rx  r%   r   r.   r  r'  r)  r+  r.  r1  r"   r#   r   r6   r<  r   r   r   r   r   r7  r(  rA  rT   r
   r;  r   r  rP   rM  rU   rV   s   @r@   r"  r"    s|   13MNH| :8;B11  E2!rs9''9 ),9 +,	9
 
+	+9 t 3  9 "))":?:K:K"]b]n]n".  E2 .215.204(,2659*.!%)-,0#'26-.~
##d*~
 ''$.~
 t+	~

 &&-~
 ~
 ((4/~
 ),d
~
   4'~
 $;~
  $;~
 #Tk~
 D[~
 ((4/~
 ell*~
  +,!~
" 
-	-#~
  3  ~
F   rA   r"  )r>  rU  rH  r{  r"  )r  )gr  collections.abcr   dataclassesr   typingr   r6   torch.nnr4   torch.nn.functionalr   r  /transformers.models.llama4.configuration_llama4r    r   rF  activationsr	   cache_utilsr
   r   
generationr   integrationsr   masking_utilsr   r   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   r   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r    utils.genericr!   r"   utils.output_capturingr#   configuration_llama4r%   r&   
get_loggerrQ   loggerModuler(   rX   rf   r{   r]   r   r   r   rT   r   r   r   r   rt   r   r   r  r*  r>  rU  r{  r  r  r  r  r  r  r  r  r  r  r  r  r  rH  r"  __all__r7  rA   r@   <module>rj     s    $ !      N & ! . ) 7 K B 9  G & j j G 5 @ 
		H	%		 B)BII )$!uxx !=		 =(,299 , _-"BII " .",?		 ?D	2	2	2 ||	2 5<<%&		2	UU\\ 	U# 	U%,, 	U( %II%<<% 
% <<	%
 LL4'% % %B %II%<<% 
% <<	%
 LL4'% % %2Z)")) Z)z27 2j LO L L8 b
+ b
 b
JN
- N
b 
9; 9 90;uxx ;"		 (
)")) 
)!ELL ! !:<<:	: ll: 5<<%&	:7)BII 7)tbii )9 )XO
")) O
dbii (6")) 62G
- G
Ts%:O slrA   