
    qil                       d dl Z d dlmZ d dlmZ d dlmZ d dlmZ d dl	Z	d dl
mZ d dlmc mZ ddlmZ ddlmZ dd	lmZmZ dd
lmZ ddlmZmZmZ ddlmZ ddlm Z  ddl!m"Z"m#Z#m$Z$ ddl%m&Z&m'Z' ddl(m)Z)m*Z* ddl+m,Z, ddl-m.Z.m/Z/m0Z0m1Z1 ddl2m3Z3m4Z4 ddl5m6Z6 ddl7m8Z8m9Z9m:Z: ee/ G d de#                    Z;d Z< ed      ded       Z=de	j|                  de?de	j|                  fdZ@	 dfd ej                  d!e	j|                  d"e	j|                  d#e	j|                  d$e	j|                  dz  d%eBd&eBd'e,e.   fd(ZC ee=       G d) d*ej                               ZD ed+       G d, d-ej                               ZE G d. d/ej                        ZF G d0 d1e       ZG G d2 d3ej                        ZH G d4 d5ej                        ZI G d6 d7ej                        ZJ G d8 d9ej                        ZK G d: d;ej                        ZL G d< d=ej                        ZM G d> d?ej                        ZN G d@ dAej                        ZO G dB dCej                        ZP G dD dEej                        ZQ G dF dGej                        ZS G dH dIej                        ZT G dJ dKej                        ZU G dL dMej                        ZV G dN dOej                        ZW G dP dQej                        ZX e/dRS       G dT dUe*             ZY G dV dW      ZZe/ G dX dYe*             Z[ G dZ d[ej                        Z\e/ G d\ d]e[             Z]e/ G d^ d_e[e             Z^ G d` dae[      Z_ G db dce[e      Z`g ddZay)g    N)Callable)	dataclass)cached_property)Optional   )initialization)ACT2FN)CacheDynamicCache)GenerationMixin)use_kernel_forward_from_hubuse_kernel_func_from_hubuse_kernelized_func)create_causal_mask)GradientCheckpointingLayer)BaseModelOutputWithPastBaseModelOutputWithPoolingCausalLMOutputWithPast)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tupletorch_compilable_check)maybe_autocastmerge_with_config_defaults)capture_outputs   )
Emu3ConfigEmu3TextConfigEmu3VQVAEConfigc                   :    e Zd ZU dZdZej                  dz  ed<   y)Emu3VQVAEModelOutputz
    image_tokens (`torch.LongTensor` of shape `(batch_size, config.vocab_size`):
        Indices of the image tokens predicted by the VQ-VAE model.
    Nimage_tokens)__name__
__module____qualname____doc__r'   torch
LongTensor__annotations__     X/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/emu3/modeling_emu3.pyr&   r&   1   s    
 -1L%""T)0r0   r&   c                     | dd| j                   d   dz  f   }| d| j                   d   dz  df   }t        j                  | |fd      S )z*Rotates half the hidden dims of the input..N   dim)shaper,   cat)xx1x2s      r1   rotate_halfr<   <   sZ    	
3"!''"+"""	#B	
3q ""	#B99rc2YB''r0   rotary_pos_embc                     |j                  |      }|j                  |      }| |z  t        |       |z  z   }||z  t        |      |z  z   }||fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    )	unsqueezer<   )qkcossinunsqueeze_dimq_embedk_embeds          r1   apply_rotary_pos_embrG   C   sY    & --
&C
--
&C3w;q>C/0G3w;q>C/0GGr0   hidden_statesn_repreturnc                     | j                   \  }}}}|dk(  r| S | dddddddddf   j                  |||||      } | j                  |||z  ||      S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r!   N)r7   expandreshape)rH   rI   batchnum_key_value_headsslenhead_dims         r1   	repeat_kvrR   ]   so    
 2?1D1D.Ehz!!Qa"23::5BUW\^bdlmM  (;e(CT8TTr0   modulequerykeyvalueattention_maskscalingdropoutkwargsc                    t        || j                        }t        || j                        }	t        j                  ||j	                  dd            |z  }
||
|z   }
t
        j                  j                  |
dt        j                        j                  |j                        }
t
        j                  j                  |
|| j                        }
t        j                  |
|	      }|j	                  dd      j                         }||
fS )Nr4   r   r3   )r6   dtype)ptrainingr!   )rR   num_key_value_groupsr,   matmul	transposenn
functionalsoftmaxfloat32tor\   rY   r^   
contiguous)rS   rT   rU   rV   rW   rX   rY   rZ   
key_statesvalue_statesattn_weightsattn_outputs               r1   eager_attention_forwardrl   i   s     3 ; ;<JUF$?$?@L<<z';';Aq'ABWLL!#n4==((2U]](SVVW\WbWbcL==((6??([L,,|\:K''1-88:K$$r0   c                       e Zd ZdZdedef fdZ	 	 	 	 ddej                  de	ej                  ej                  f   dz  dej                  dz  d	e
dz  d
ej                  dz  dee   de	ej                  ej                  f   fdZ xZS )Emu3Attention=Multi-headed attention from 'Attention Is All You Need' paperconfig	layer_idxc                 d   t         |           || _        || _        t	        |d|j
                  |j                  z        | _        |j                  |j                  z  | _	        | j                  dz  | _
        |j                  | _        d| _        t        j                  |j
                  |j                  | j                  z  |j                        | _        t        j                  |j
                  |j                  | j                  z  |j                        | _        t        j                  |j
                  |j                  | j                  z  |j                        | _        t        j                  |j                  | j                  z  |j
                  |j                        | _        y )NrQ         Tbias)super__init__rp   rq   getattrhidden_sizenum_attention_headsrQ   rO   r_   rX   attention_dropout	is_causalrb   Linearattention_biasq_projk_projv_projo_projselfrp   rq   	__class__s      r1   rw   zEmu3Attention.__init__   sM   "
F4F4F&JdJd4de$*$>$>&B\B\$\!}}d*!'!9!9ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii&&68J8JQWQfQf
r0   NrH   position_embeddingsrW   past_key_valuescache_positionrZ   rJ   c                    |j                   d d }g |d| j                  }| j                  |      j                  |      j	                  dd      }	| j                  |      j                  |      j	                  dd      }
| j                  |      j                  |      j	                  dd      }|\  }}t        |	|
||      \  }	}
|'|||d}|j                  |
|| j                  |      \  }
}t        j                  | j                  j                  t              } || |	|
||f| j                  sdn| j                   | j"                  d|\  }} |j$                  g |d j'                         }| j)                  |      }||fS )Nr3   r!   r4   )rC   rB   r           )rY   rX   )r7   rQ   r   viewra   r   r   rG   updaterq   r   get_interfacerp   _attn_implementationrl   r^   r{   rX   rM   rg   r   )r   rH   r   rW   r   r   rZ   input_shapehidden_shapequery_statesrh   ri   rB   rC   cache_kwargsattention_interfacerk   rj   s                     r1   forwardzEmu3Attention.forward   s    $))#2.88b8$--8{{=166|DNNqRST[[/44\BLLQPQR
{{=166|DNNqRST&S#7jRUWZ#[ j&#&snUL'6'='=j,X\XfXfht'u$J(?(M(MKK,,.E)
 %8	%
  $}}C$2H2HLL	%
 	%
!\ *k));;;;FFHkk+.L((r0   )NNNN)r(   r)   r*   r+   r"   intrw   r,   Tensortupler
   r-   r   r   r   __classcell__r   s   @r1   rn   rn      s    G
z 
c 
4 IM.2(,26))||)) #5<<#=>E)) t+	))
 )) ((4/)) +,)) 
u||U\\)	*))r0   rn   RMSNormc                   h     e Zd Zddeddf fdZdej                  dej                  fdZd Z xZ	S )	Emu3RMSNormepsrJ   Nc                     t         |           t        j                  t	        j
                  |            | _        || _        y)z:
        Emu3RMSNorm is equivalent to T5LayerNorm
        N)rv   rw   rb   	Parameterr,   onesweightvariance_epsilon)r   ry   r   r   s      r1   rw   zEmu3RMSNorm.__init__   s1     	ll5::k#:; #r0   rH   c                 "   |j                   }|j                  t        j                        }|j	                  d      j                  dd      }|t        j                  || j                  z         z  }| j                  |j                  |      z  S )Nr4   r3   T)keepdim)	r\   rf   r,   re   powmeanrsqrtr   r   )r   rH   input_dtypevariances       r1   r   zEmu3RMSNorm.forward   sy    #))%((7 $$Q',,R,>%Ht?T?T4T(UU{{]--k:::r0   c                 ^    t        | j                  j                         d| j                   S )Nz, eps=)r   r   r7   r   r   s    r1   
extra_reprzEmu3RMSNorm.extra_repr   s*    ))*+6$2G2G1HIIr0   )ư>)
r(   r)   r*   floatrw   r,   r   r   r   r   r   s   @r1   r   r      s7    $ $$ $;U\\ ;ell ;Jr0   r   c                   $     e Zd Z fdZd Z xZS )Emu3MLPc                    t         |           || _        |j                  | _        |j                  | _        t        j                  | j                  | j                  |j                        | _        t        j                  | j                  | j                  |j                        | _	        t        j                  | j                  | j                  |j                        | _
        t        |j                     | _        y )Nrt   )rv   rw   rp   ry   intermediate_sizerb   r}   mlp_bias	gate_projup_proj	down_projr	   
hidden_actact_fnr   rp   r   s     r1   rw   zEmu3MLP.__init__   s    !--!'!9!94#3#3T5K5KRXRaRabyy!1!143I3IPVP_P_`4#9#94;K;KRXRaRabV../r0   c                     | j                  | j                  | j                  |            | j                  |      z        }|S N)r   r   r   r   )r   r9   r   s      r1   r   zEmu3MLP.forward   s6    NN4;;t~~a/@#ADLLQRO#ST	r0   r(   r)   r*   rw   r   r   r   s   @r1   r   r      s    0r0   r   c                   "    e Zd Zdedef fdZ	 	 	 	 	 	 ddej                  dej                  dz  dej                  dz  de	dz  d	e
dz  d
ej                  dz  deej                  ej                  f   dz  dee   dej                  fdZ xZS )Emu3DecoderLayerrp   rq   c                 h   t         |           |j                  | _        t        ||      | _        t        |      | _        t        |j                  |j                        | _	        t        |j                  |j                        | _
        t        j                  |j                        | _        y )N)rp   rq   r   )rv   rw   ry   rn   	self_attnr   mlpr   rms_norm_epsinput_layernormpost_attention_layernormrb   Dropoutr{   rY   r   s      r1   rw   zEmu3DecoderLayer.__init__   s    !--&f	J6?*6+=+=6CVCVW(3F4F4FFL_L_(`%zz&":":;r0   NrH   rW   position_idsr   	use_cacher   r   rZ   rJ   c                     |}	| j                  |      } | j                  d|||||||d|\  }}
|	| j                  |      z   }|}	| j                  |      }| j	                  |      }|	| j                  |      z   }|S )N)rH   rW   r   r   r   r   r   r/   )r   r   rY   r   r   )r   rH   rW   r   r   r   r   r   rZ   residual_s              r1   r   zEmu3DecoderLayer.forward   s     !,,];)4>> 	
')%+) 3	
 	
q !4<<#>> 55mD/ 4<<#>>r0   )NNNFNN)r(   r)   r*   r"   r   rw   r,   r   r-   r
   boolr   r   r   r   r   r   s   @r1   r   r      s    	<z 	<c 	< /304(,!&26HL|| t+ &&-	
  $; ((4/ #5<<#=>E +, 
r0   r   c                   H     e Zd ZdZdef fdZdej                  fdZ xZ	S )Emu3VQVAEVectorQuantizera  
    A module for vector quantization using learned embedding vectors.

    This module implements the quantization process similar to te one described in
    the VQ-VAE (Vector Quantized Variational AutoEncoder) paper. It quantizes continuous
    input vectors into discrete codebook vectors, which are learned during training.
    Current implementation improves over previous ones by avoiding costly matrix multiplications
    and allowing for post-hoc remapping of indices.
    rp   c                    t         |           t        j                  |j                  |j
                        | _        | j                  j                  j                  j                  d|j                  z  d|j                  z         y )Ng            ?)
rv   rw   rb   	Embeddingcodebook_size	embed_dim	embeddingr   datauniform_r   s     r1   rw   z!Emu3VQVAEVectorQuantizer.__init__&  sb    f&:&:F<L<LM""++D63G3G,GvOcOcIcdr0   hidden_statec                    |j                   \  }}}}}|j                  ddddd      j                         }|j                  d|      }t	        j
                  |dz  dd      }t	        j
                  | j                  j                  dz  d	      }	dt	        j                  || j                  j                  j                  dd            z  }
||	z   |
z
  }
t	        j                  |
d	      }|j                  ||||      }|S )
Nr   r!   r      r4   r3   T)r6   r   r5   )r7   permuterg   r   r,   sumr   r   r`   ra   argmin)r   r   
batch_sizetemporalchannelsheightwidthhidden_state_flattenedhidden_state_sumembedding_sum	distancesmin_encoding_indicess               r1   r   z Emu3VQVAEVectorQuantizer.forward+  s    8D8J8J5
Hh#++Aq!Q:EEG!-!2!22x!@ !99%;Q%>AtT		$.."7"7":B %;T^^=R=R=\=\]^`a=bcc	$}4y@	$||I1=388XvW\]##r0   )
r(   r)   r*   r+   r$   rw   r,   r   r   r   r   s   @r1   r   r     s&    e e
$ELL $r0   r   c                   $     e Zd Z fdZd Z xZS )Emu3VQVAEEncoderConvDownsamplec                 `    t         |           t        j                  ||ddd      | _        y )Nr   r4   r   kernel_sizestridepaddingrv   rw   rb   Conv2dconvr   in_channelsr   s     r1   rw   z'Emu3VQVAEEncoderConvDownsample.__init__>  '    IIk;AaYZ[	r0   c                 Z    t        j                  |ddd      }| j                  |      }|S )N)r   r!   r   r!   constantr   )padmoderV   )Fr   r   r   rH   s     r1   r   z&Emu3VQVAEEncoderConvDownsample.forwardB  s+    mJVWX		-0r0   r   r   s   @r1   r   r   =  s    \r0   r   c                   $     e Zd Z fdZd Z xZS )Emu3VQVAEEncoderConvUpsamplec                 `    t         |           t        j                  ||ddd      | _        y )Nr   r!   r   r   r   s     r1   rw   z%Emu3VQVAEEncoderConvUpsample.__init__J  r   r0   c                 X    t        j                  |dd      }| j                  |      }|S )N       @nearestscale_factorr   )r   interpolater   r   s     r1   r   z$Emu3VQVAEEncoderConvUpsample.forwardN  s(    m#IV		-0r0   r   r   s   @r1   r   r   I  s    \r0   r   c            	       \     e Zd Zdededee   dee   f fdZdej                  fdZ xZ	S )Emu3VQVAEConv3d
in_channelout_channelr   r   c                 P   t         	|           t        |dd  |dd        D cg c]
  \  }}||z
   }}}d| _        |d d d   D ]%  }| xj                  |dz  |dz  z   |dz  fz  c_        ' | xj                  dz  c_        t	        j
                  ||||      | _        y c c}}w )Nr!   r/   r3   r4   )r4   r   )r   )rv   rw   zipr   rb   Conv3dr   )
r   r  r  r   r   
one_kernel
one_stridepadding_sizespad_sizer   s
            r1   rw   zEmu3VQVAEConv3d.__init__U  s     	ORS^_`_aSbdjklkmdnOop5KZj0pp%dd+ 	JHLLX]X\98q=IIL	JII	
	 qs   B"rH   c                 h    t        j                  || j                        }| j                  |      }|S r   )r   r   r   r   r   s     r1   r   zEmu3VQVAEConv3d.forwardk  s*    mT\\:		-0r0   )
r(   r)   r*   r   r   rw   r,   r   r   r   r   s   @r1   r  r  T  sF    

 
 3Z	

 c

,U\\ r0   r  c                   `     e Zd Zdedef fdZdej                  dej                  fdZ xZS )Emu3VQVAESpatialNormr   out_channelsc                     t         |           t        j                  |ddd      | _        t        j
                  ||ddd      | _        t        j
                  ||ddd      | _        y )N    r   Tnum_channels
num_groupsr   affiner!   r   r   )rv   rw   rb   	GroupNorm
norm_layerr   conv_yconv_br   r   r  r   s      r1   rw   zEmu3VQVAESpatialNorm.__init__r  sn    
 	,,%	
 ii
 ii
r0   rH   quant_statesc                     t        j                  ||j                  dd  d      }| j                  |      }|| j	                  |      z  | j                  |      z   }|S )Nr   )sizer   )r   r  r7   r  r  r  )r   rH   r  s      r1   r   zEmu3VQVAESpatialNorm.forward  sX    }}\8K8KBC8PW`a6%L(AADKKP\D]]r0   	r(   r)   r*   r   rw   r,   r   r   r   r   s   @r1   r  r  q  s5    

 
8U\\  r0   r  c                   H     e Zd Zdedef fdZdej                  fdZ xZS )Emu3VQVAETemporalUpsampler  r  c                 J    t         |           t        ||dd      | _        y )Nr   r   r   r!   r!   r!   r   r   rv   rw   r  r   r   r  r  r   s      r1   rw   z"Emu3VQVAETemporalUpsample.__init__  (    
 	#!	
	r0   rH   c                 P   |j                   \  }}}}}|j                  ddddd      j                         j                  |d|      }t	        j
                  |dd	      }|j                  ||||d      j                  ddddd      j                         }| j                  |      }|S )
Nr   r!   r   r   r4   r3   r   r   r   )r7   r   rg   r   r   r  r   )r   rH   r   r   r   r   r   s          r1   r   z!Emu3VQVAETemporalUpsample.forward  s    8E8K8K5
Hh%--aAq!<GGINNz[]_ghm#IV%**:xPRS[[\]_`bcefhijuuw		-0r0   r!  r   s   @r1   r#  r#    s*    

 
U\\ r0   r#  c                   H     e Zd Zdedef fdZdej                  fdZ xZS )Emu3VQVAETemporalDownsampler  r  c                 J    t         |           t        ||dd      | _        y )N)r   r   r   )r4   r!   r!   r'  r(  r)  s      r1   rw   z$Emu3VQVAETemporalDownsample.__init__  r*  r0   rH   c                 (    | j                  |      }|S r   )r   r   s     r1   r   z#Emu3VQVAETemporalDownsample.forward  s    		-0r0   r!  r   s   @r1   r-  r-    s*    

 
U\\ r0   r-  c                   (     e Zd Z	 d fd	Zd Z xZS )Emu3VQVAETemporalResnetBlockc                 p   t         |           || _        ||n|| _        t	        j
                  |      | _        t        ||dd      | _        t	        j
                  |      | _	        t        ||dd      | _
        | j                  | j                  k7  r t	        j                  ||ddd      | _        y y )Nr%  r&  r'  r!   r   r   )rv   rw   r   r  rb   BatchNorm3dnorm1r  conv1norm2conv2r	  nin_shortcutr  s      r1   rw   z%Emu3VQVAETemporalResnetBlock.__init__  s    
 	&+7+?K\^^K0
$!	

 ^^L1
$!	

 t000 "		!D 1r0   c                 L   |}| j                  |      }|t        j                  |      z  }| j                  |      }| j	                  |      }|t        j                  |      z  }| j                  |      }| j                  | j                  k7  r| j                  |      }||z   S r   )	r4  r,   sigmoidr5  r6  r7  r   r  r8  )r   rH   r   s      r1   r   z$Emu3VQVAETemporalResnetBlock.forward  s     

=1}55

=1

=1}55

=1t000((2H-''r0   r   r   r   s   @r1   r1  r1    s     @(r0   r1  c                   ~     e Zd Z	 	 ddededz  dedz  f fdZd	dej                  dej                  dz  fdZ xZS )
Emu3VQVAEResnetBlockNr   r  quant_channelsc                    t         |           || _        ||n|}|| _        || _        |=t        j                  |ddd      | _        t        j                  |ddd      | _        n"t        ||      | _        t        ||      | _        t        j                  ||ddd      | _        t        j                  ||ddd      | _        | j                  | j                  k7  r t        j                  ||ddd      | _        y y )	Nr  r   Tr  r   r!   r   r   )rv   rw   r   r  r=  rb   r  r4  r6  r  r   r5  r7  r8  )r   r   r  r=  r   s       r1   rw   zEmu3VQVAEResnetBlock.__init__  s    	&&2&:{(,!;2SW`deDJ<BTXaefDJ-nkJDJ-nlKDJYY

 YY

 t000 "		!D 1r0   rH   c                 v   | j                   dn|f}|} | j                  |g| }|t        j                  |      z  }| j	                  |      } | j
                  |g| }|t        j                  |      z  }| j                  |      }| j                  | j                  k7  r| j                  |      }||z   S Nr/   )
r=  r4  r,   r:  r5  r6  r7  r   r  r8  )r   rH   r=  	norm_argsr   s        r1   r   zEmu3VQVAEResnetBlock.forward  s    --5BN;L	 "

==9=}55

=1"

==9=}55

=1t000((2H-''r0   )NNr   r!  r   s   @r1   r<  r<    sV     $(%)	** Dj* d
	*X(U\\ (5<<RVCV (r0   r<  c            
            e Zd ZdZdef fdZ	 d	dej                  dej                  dz  deej                  ej                  dz  f   fdZ	 xZ
S )
Emu3VQVAEAttentionBlockro   rp   c                 &   t         |           || _        |j                  | _        |j
                  | _        | j                  | j                  z  | _        | j                  | j                  z  | j                  k7  r&t        d| j                   d| j                   d      | j                  dz  | _	        |j                  | _        d| _        t        j                  | j                  | j                        | _        t        j                  | j                  | j                        | _        t        j                  | j                  | j                        | _        t        j                  | j                  | j                        | _        d| _        y )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).rs   Fr!   )rv   rw   rp   ry   r   rz   	num_headsrQ   
ValueErrorscaler{   rY   r|   rb   r}   r   r   r   out_projr_   r   s     r1   rw   z Emu3VQVAEAttentionBlock.__init__2  s$   ++33$..8==4>>)T^^;MdnnM] ^NN#2'  ]]D(
//ii?ii?ii?		$..$..A %&!r0   NrH   rW   rJ   c           
         |j                   \  }}}| j                  |      }| j                  |      }| j                  |      }	|j	                  ||| j
                  | j                        j                  dd      }|j	                  ||| j
                  | j                        j                  dd      }|	j	                  ||| j
                  | j                        j                  dd      }	t        j                  | j                  j                  t              }
 |
| |||	|| j                  | j                  | j                  sdn| j                         \  }}|j#                  |||      j%                         }| j'                  |      }||fS )z#Input shape: Batch x Time x Channelr!   r4   r   )r|   rX   rY   )r7   r   r   r   r   rE  rQ   ra   r   r   rp   r   rl   r|   rG  r^   rY   rM   rg   rH  )r   rH   rW   rZ   r   
seq_lengthr   querieskeysvaluesr   rk   rj   s                r1   r   zEmu3VQVAEAttentionBlock.forwardI  sW    -:,?,?)
J	++m,{{=)]+,,z:t~~t}}U__`acdeyyZOYYZ[]^_ZT^^T]]S]]^_abc(?(M(MKK,,.E)
 %8nnJJ#}}C$,,	%
!\ "))*j)LWWYmmK0L((r0   r   )r(   r)   r*   r+   r$   rw   r,   r   r   r   r   r   s   @r1   rC  rC  /  s\    G& &4 /3$)||$) t+$)
 
u||U\\D00	1$)r0   rC  c                   *     e Zd ZdZ fdZddZ xZS )Emu3VQVAEGroupNormz
    Same as the torch GroupNorm with the only difference that this ones accepts
    an optional kwarg `quant_states` which is not used. This class makes it easier to
    use SpatialNorm or GroupNorm without conditionals
    c                 $    t        |   di | y r@  )rv   rw   )r   rZ   r   s     r1   rw   zEmu3VQVAEGroupNorm.__init__w  s    "6"r0   c                     t        j                  || j                  | j                  | j                  | j
                        S r   )r   
group_normr  r   ru   r   )r   inputr  s      r1   r   zEmu3VQVAEGroupNorm.forwardz  s)    ||E4??DKKDHHUUr0   r   )r(   r)   r*   r+   rw   r   r   r   s   @r1   rO  rO  p  s    #Vr0   rO  c                   `     e Zd Zd fd	Zddej
                  dej
                  dz  fdZ xZS )Emu3VQVAEMiddleBlockNc                     t         |           t        |||      | _        t	        |      | _        |t        |ddd      | _        nt        ||      | _        t        |||      | _	        y )Nr   r  r=  r  r   Tr  )
rv   rw   r<  block_1rC  attn_1rO  	attn_normr  block_2)r   rp   r   r=  r   s       r1   rw   zEmu3VQVAEMiddleBlock.__init__  so    +#$)

 .f5!/[UW]ajnoDN1.+NDN+#$)
r0   rH   r  c                 b   | j                  ||      }|}| j                  ||      }|j                  \  }}}}|j                  ||||z        j	                  dd      }| j                  |      d   }|j                  ||||      j                  dddd      }||z   }| j                  ||      }|S )Nr!   r4   r   r   )	rX  rZ  r7   r   ra   rY  rM   r   r[  )r   rH   r  r   r   r   r   r   s           r1   r   zEmu3VQVAEMiddleBlock.forward  s    ]LA }lC.;.A.A+
Hfe%**:x%PZZ[\^_`M215%--j&%RZZ[\^_abdef =0]LAr0   r   r(   r)   r*   rw   r,   FloatTensorr   r   r   s   @r1   rU  rU  ~  s-    
(
U%6%6 
eFWFWZ^F^ 
r0   rU  c                   >     e Zd Z fdZdej
                  fdZ xZS )Emu3VQVAEDownBlockc           
         t         |           t        |j                        | _        |j
                  | _        |j                  }|j                  }dt        |      z   }|| _        t        j                         | _        t        | j                        D ]K  }t        j                         }t        j                         }t        j                         }|||   z  }	|||   z  }
t        | j
                        D ]~  }|j                  t        |	|
             |
}	|j                  .||j                  v s=|j                  t!        |             |j                  t        j"                  |	ddd              t        j$                         }||_        ||_        ||_        || j                  dz
  k7  rt-        |	      |_        | j                  j                  |       N y )Nr!   r   r  r  r   Tr  r!   )rv   rw   lenchannel_multipliernum_resolutionsnum_res_blocksbase_channelsr   in_channel_multiplierrb   
ModuleListdownrangeappendr<  attn_resolutionsrC  r  Moduleblockattn
attn_normsr   
downsample)r   rp   rh  re  ri  i_levelrp  rq  rr  block_in	block_outi_blockrk  r   s                r1   rw   zEmu3VQVAEDownBlock.__init__  s   "6#<#<=$33,,#66 $u-?'@ @%:"MMO	T112 	#GMMOE==?DJ$'<W'EEH%(:7(CCI !4!45 
q($,%. %**67fF]F];]KK 7 ?@%%bllUW]ajn&op
q 99;DDJDI(DO$..22"@"JIIT"1	#r0   rH   c                 >   t        | j                        D ]  \  }}t        | j                        D ]  } |j                  |   |      }t        |j                        dkD  s1|} |j                  |   |      }|j                  \  }}}}	|j                  ||||	z        j                  dd      } |j                  |   |      d   }|j                  |||	|      j                  dddd      }||z   } || j                  dz
  k7  s|j                  |      } |S )Nr   r!   r4   r   )	enumeraterk  rl  rg  rp  rd  rq  rr  r7   r   ra   rM   r   rf  rs  )
r   rH   rt  blocksrw  r   r   r   r   r   s
             r1   r   zEmu3VQVAEDownBlock.forward  s5   (3 	AOGV !4!45 = 5W 5m Dv{{#a',H$>F$5$5g$>}$MM:G:M:M7J&%$1$6$6z8VV[^$\$f$fghjk$lM$8FKK$8$G$JM$1$9$9*feU]$^$f$fghjkmnpq$rM$,}$<M= $..22 & 1 1- @	A" r0   r]  r   s   @r1   r`  r`    s    ##JU%6%6 r0   r`  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )Emu3VQVAEUpBlockc           	         t         |           t        |j                        | _        |j
                  | _        |j                  }|j                  |j                  d   z  }t        j                         | _
        t        t        | j                              D ]5  }t        j                         }t        j                         }t        j                         }|j                  |j                  |   z  }t        | j
                  dz         D ]e  }	|j                  t        |||             |}||j                  v s1|j                  t!        |             |j                  t#        ||             g t        j$                         }
||
_        ||
_        ||
_        |dk7  rt-        |      |
_        | j                  j1                  d|
       8 y )Nr3   r!   rW  r   )rv   rw   rd  re  rf  rg  r   rh  rb   rj  upreversedrl  rm  r<  rn  rC  r  ro  rp  rq  rr  r   upsampleinsert)r   rp   r=  ru  rt  rp  rq  rr  rv  rw  r~  r   s              r1   rw   zEmu3VQVAEUpBlock.__init__  s   "6#<#<=$33))''&*C*CB*GG--/d&:&: ;< 	"GMMOE==?DJ,,v/H/H/QQI !4!4q!89 V($,%.'5 %f555KK 7 ?@%%&:>8&TUV BBHBG&BM!|:8DGGNN1b!3	"r0   rH   r  c                 h   t        | j                  d d d         D ]  \  }}t        | j                  dz         D ]  } |j                  |   ||      }t        |j                        dkD  s2|} |j                  |   ||      }|j                  \  }}}	}
|j                  |||	|
z        j                  dd      } |j                  |   |      d   }|j                  ||	|
|      j                  dddd      }||z   } |t        | j                        dz
  k7  s|j                  |      } |S )Nr3   r!   r   r4   r   )ry  r~  rl  rg  rp  rd  rq  rr  r7   r   ra   rM   r   r  )r   rH   r  rt  rz  rw  r   r   r   r   r   s              r1   r   zEmu3VQVAEUpBlock.forward  sD   (27 	?OGV !4!4q!89 = 5W 5m\ Rv{{#a',H$>F$5$5g$>}l$[M:G:M:M7J&%$1$6$6z8VV[^$\$f$fghjk$lM$8FKK$8$G$JM$1$9$9*feU]$^$f$fghjkmnpq$rM$,}$<M= #dgg,** & >	?  r0   r]  r   s   @r1   r|  r|    s(    #"JU%6%6 eFWFW r0   r|  c                   >     e Zd Z fdZdej
                  fdZ xZS )Emu3VQVAEEncoderc                    t         |           |j                  }|j                  }|j                  }|j
                  }|j                  }|rd|z  n|}||d   z  }t        j                  j                  ||ddd      | _
        t        |      | _        t        ||      | _        t        j                  j                  d|dd	      | _        t        j                  j                  ||ddd      | _        t%        t'        j(                  |j*                              }	t        j,                         | _        t        j,                         | _        t3        |	      D ])  }
t5        ||      }| j.                  j7                  |       + t3        |j8                        D ]*  }t;        ||
      }| j0                  j7                  |       , y )Nr4   r3   r   r!   r   r  r   T)r  r  r   r  rc  )rv   rw   rh  r   double_latentlatent_channelsre  r,   rb   r   conv_inr`  
down_blockrU  middle_blockr  norm_outconv_outr   mathlog2temporal_downsample_factorrj  	time_convtime_res_stackrl  r-  rm  rg  r1  )r   rp   rh  r   r  r  re  r  ru  temporal_down_blocksir   r   time_res_convr   s                 r1   rw   zEmu3VQVAEEncoder.__init__  s   ,,((,, 00#66.;q?* #5b#99xx{MqYZdef,V40B**bxUYbf*g ( 
  #499V-N-N#OP mmo+, 	(A.|\JDNN!!$'	( v,,- 	6A8()M &&}5	6r0   pixel_valuesc                 h   |j                   d   } |j                  dg|j                   dd   }| j                  |      }| j                  |      }| j	                  |      }| j                  |      }|t        j                  |      z  }| j                  |      } |j                  d|g|j                   dd   }|j                  ddddd      }| j                  D ]"  } ||      }|t        j                  |      z  }$ | j                  D ]
  } ||      } |j                  ddddd      }|S )Nr!   r3   r4   r   r   r   )r7   rM   r  r  r  r  r,   r:  r  r   r  r  )r   r  temporal_dimrH   r   layers         r1   r   zEmu3VQVAEEncoder.forward=  sH   #))!,+|++BH1C1CAB1GH \26))-8 m4}55m4---b,YATATUVUWAXY%--aAq!< NN 	:D /MU]]=99M	: (( 	1E!-0M	1 &--aAq!<r0   )r(   r)   r*   rw   r,   r-   r   r   r   s   @r1   r  r    s    %6NE$4$4 r0   r  c                   \     e Zd Zdef fdZdej                  dej                  fdZ xZS )Emu3VQVAEDecoderrp   c                    t         	|           |j                  }|j                  |j                  d   z  }t        j                         | _        t        |j                        D ]>  }t        |j                  |j                        }| j                  j                  |       @ t        t        j                  |j                               }t        j                         | _        t        |      D ]=  }t%        |j                  |j                        }| j"                  j                  |       ? t        j&                  |j                  |ddd      | _        t+        |||      | _        t/        |      | _        |j                  |j                  d   z  }t3        ||      | _        t        j&                  ||j6                  ddd      | _        y )Nr3   rc  r   r!   r   )r=  r   )rv   rw   r   rh  re  rb   rj  r  rl  rg  r1  r  rm  r   r  r  r  r  r#  r   r  rU  r  r|  up_blockr  r  r  r  )
r   rp   r=  ru  r   r  temp_upsample_block_numr  r   r   s
            r1   rw   zEmu3VQVAEDecoder.__init__\  s   ))''&*C*CB*GG mmov,,- 	6A8"22AWAWM &&}5		6 #&dii0Q0Q&R"S./ 	(A,V-C-CVE[E[\DNN!!$'	( yy""
 1R`a(0''&*C*CA*FF,^XF		
r0   rH   r  c                    t        j                  ||fd      }|j                  ddddd      }| j                  D ]
  } ||      } | j                  D ]"  } ||      }|t        j
                  |      z  }$ |j                  ddddd      }t        j                  |dd      \  }} |j                  dg|j                  dd   } |j                  dg|j                  dd   }| j                  |      }| j                  ||      }| j                  ||      }| j                  ||      }|t        j
                  |      z  }| j                  |      }|S )Nr   r5   r4   r!   r   r   r3   )r,   r8   r   r  r  r:  chunkrM   r7   r  r  r  r  r  )r   rH   r  hidden_quant_statesr  s        r1   r   zEmu3VQVAEDecoder.forward  sp   #ii(E1M199!Q1aH (( 	=E"'(;"<	= ^^ 	FE"'(;"<5==1D#EE	F 299!Q1aH&+kk2Eqa&P#|---bK=3F3Fqr3JK+|++BH1C1CAB1GH]3 ))-Fm\Bm\B}55m4r0   )	r(   r)   r*   r$   rw   r,   r   r   r   r   s   @r1   r  r  [  s+    %
 %
NU\\  r0   r  aR  
    The VQ-VAE model used in Emu3 for encoding/decoding images into discrete tokens.
    This model follows the "Make-a-scene: Scene-based text-to-image generation with human priors" paper from
    [ Oran Gafni, Adam Polyak, Oron Ashual, Shelly Sheynin, Devi Parikh, and Yaniv
    Taigman](https://huggingface.co/papers/2203.13131).
    custom_introc            
       
    e Zd ZU eed<   dZdZdZdZdZ	dZ
dZg dZeegedZ ej$                         d        Zdef fd	Zeedej.                  d
ej.                  dee   defd              Zdej.                  fdZ xZS )	Emu3VQVAErp   
emuvideovqr  )imageT)r1  rC  r<  r   rH   
attentionsc                    t        |t        j                  t        j                  f      rt	        j
                  |j                  dd       |j                  qt        j                  j                  j                  |j                        \  }}dt        j                  |      z  }t	        j                  |j                  | |       y y t        |t        j                        rt	        j                  |j                  t        j                  d             |j                  xt        j                  j                  j                  |j                        \  }}|dkD  rdt        j                  |      z  nd}t	        j                  |j                  | |       y y t        |t        j                  t        j                   t        j"                  f      rt	        j$                  |j                  d       t	        j$                  |j                  d	       t'        |d
d       ^t	        j(                  |j*                         t	        j,                  |j.                         t	        j(                  |j0                         y y t        |t        j2                        rqt	        j4                  |j                         |j6                  Et'        |j                  dd      s-t	        j(                  |j                  |j6                            y y y y )Nfan_outrelu)r   nonlinearityr!      )ar   r   r   running_mean_is_hf_initializedF)
isinstancerb   r   r	  initkaiming_normal_r   ru   r,   _calculate_fan_in_and_fan_outr  sqrtr   r}   kaiming_uniform_BatchNorm2dr3  r  	constant_rx   zeros_r  ones_running_varnum_batches_trackedr   normal_padding_idx)r   rS   fan_inr   bounds        r1   _init_weightszEmu3VQVAE._init_weights  s   fryy"))45  YVT{{&!HHMMGGV	DIIf--fkkE659 ' 		*!!&--499Q<@{{&!HHMMGGV	17!DIIf--fkkE659 '  NONN6==#.NN6;;,v~t4@F//0

6--.F667 A -LL'!!-gfmmMach6iFMM&*<*<=> 7j- .r0   c                    t         |   |       || _        t        |      | _        t        |      | _        t        |      | _        dt        |j                        dz
  z  | _        t        |j                  |j                  dd      | _        t        |j                  |j                  dd      | _        dt        |j                        dz
  z  | _        | j%                          | j'                          y )Nr4   r!   )r   r!   r!   r&  r'  )rv   rw   rp   r  encoderr  decoderr   quantizerd  re  vision_spatial_factorr  r  r   
quant_convpost_quant_convspatial_scale_factoreval	post_initr   s     r1   rw   zEmu3VQVAE.__init__  s     '/'/08%&3v/H/H+IA+M%N")""F$4$4)T]
  /f44)T] 
 %&#f.G.G*H1*L$M!		r0   image_sizesrZ   rJ   c                    |j                   dk(  }|rL| j                  j                  }|j                  \  }}}}	|j	                  d      j                  d|ddd      }n|j                  \  }}}}}	| j                  |      }
|
j                  ddddd      }| j                  |      }|j                  ddddd      }| j                  |      }|r|j                  d      n|}t        ||      D cg c]B  \  }}|d t        |d   | j                  z        d t        |d   | j                  z        f   D }}}t        |
|      S c c}}w )Nr   r!   r   r4   r   )last_hidden_stater'   )ndimrp   r  r7   r?   repeatr  r   r  r  squeezer  r   r  r&   )r   r  r  rZ   is_imager   r   r   r   r   rH   conv_hidden_statescodesr'   single_imager   s                   r1   encodezEmu3VQVAE.encode  sl   
  $$){{==H2>2D2D/J&%'11!4;;AxAqQL<H<N<N9J(FE\2 +221aAqA!__-?@ 0771aAF01+3u}}Q' '*,&D
"d D3tAw)C)CCDDFqDQRGVZVpVpLpHqFqqr
 

 $+%
 	

s   1AErH   c                    |j                   dk(  }|r|j                  d      }|j                  \  }}}}| j                  j	                  |j                               }|j                  d   }|j                  |||||      j                  ddddd      j                         }| j                  |      }	|j                  ddddd      }|	j                  ddddd      }	| j                  |	|      }
|
j                  ||| j                  j                  z  | j                  j                  || j                  z  || j                  z        }
|r	|
d d df   S |
S )Nr   r!   r3   r   r   r4   )r  r?   r7   r  r   flattenr   r   rg   r  r  rM   rp   r  r  r  )r   rH   r  r   r   r   r   quantr   
post_quantvideos              r1   decodezEmu3VQVAE.decode  sK    %%*)33A6M.;.A.A+
Hfe''(=(=(?@;;r?

:xIQQRSUVXY[\^_`kkm))%0
aAq!,''1aA6
Z/t{{===KK$$T...D---
 'uQT{1E1r0   )r(   r)   r*   r$   r.   base_model_prefixmain_input_nameinput_modalities_supports_sdpa_supports_flash_attn_supports_flex_attn_supports_attention_backend_no_split_modulesr<  r1  rC  _can_record_outputsr,   no_gradr  rw   r   r    r   r   r   r&   r  r  r   r   s   @r1   r  r    s     $$O!N"& /0LM-
 U]]_? ?4 *  
!LL
7<||
OUVhOi
	
   
B2ELL 2r0   r  c                       e Zd ZdZd Zed        Zed        Zed        Zed        Z	ed        Z
ed        Zd	eej                     d
ej                  fdZd	ej                  d
ej                  fdZy)Emu3ImageVocabularyMappingzM
    A class for mapping discrete image tokens from VQGAN to BPE tokens.
    c                 j    || _         |j                  d      | _        |j                  d      | _        y )Nz<|extra_200|>z<image>)	vocab_mapgeteol_token_idimage_token_id)r   r  s     r1   rw   z#Emu3ImageVocabularyMapping.__init__/  s+    "%MM/:'mmI6r0   c           	          t        | j                  j                         D cg c]  \  }}|j                  d      s| c}}      S c c}}w Nz<|visual tokensortedr  items
startswithr   namevals      r1   r'   z'Emu3ImageVocabularyMapping.image_tokens4  s8    DNN,@,@,BhytSdooVfFgshiih
   A	
A	
c           	          t        | j                  j                         D cg c]  \  }}|j                  d      s| c}}      S c c}}w r  r  r  s      r1   image_tokens_strz+Emu3ImageVocabularyMapping.image_tokens_str8  s8    T^^-A-A-Ci	ctWgGhtijjir  c                 t    | j                   D ci c]  }t        |dd       | j                  |     c}S c c}w )Nir  )r  r   r  )r   tokens     r1   img2bpez"Emu3ImageVocabularyMapping.img2bpe<  s5    FJF[F[\UE"RL!4>>%#88\\\s   #5c                 j    | j                   j                         D ci c]  \  }}||
 c}}S c c}}w r   )r  r  )r   rA   vs      r1   bpe2imgz"Emu3ImageVocabularyMapping.bpe2img@  s+    !%!3!3!56A1666s   /c                     t        j                  t        | j                  j	                               dz   t         j
                        }| j                  j                         D ]
  \  }}|||<    |S Nr!   r\   )r,   zerosmaxr  rL  r   r  r   mappingrA   r  s       r1   bpe2img_mapping_tensorz1Emu3ImageVocabularyMapping.bpe2img_mapping_tensorD  [    ++c$,,"3"3"56:%))LLL&&( 	DAqGAJ	r0   c                     t        j                  t        | j                  j	                               dz   t         j
                        }| j                  j                         D ]
  \  }}|||<    |S r  )r,   r  r  r  rL  r   r  r  s       r1   img2bpe_mapping_tensorz1Emu3ImageVocabularyMapping.img2bpe_mapping_tensorK  r  r0   	img_batchrJ   c                 ,   |j                   }t        j                  |j                  d   dft        j                        | j
                  z  }| j                  |j                  d         }t        j                  ||gd      }|j                  |      S )Nr   r!   r  cpur3   r5   )	devicer,   r   r7   r   r  r
  rf   r8   )r   r  r  eol_row
img_tokenss        r1   convert_img2bpez*Emu3ImageVocabularyMapping.convert_img2bpeR  sw    !!**iooa0!4EIIFIZIZZ00e1DE
YY
G4"=
}}V$$r0   c                     |j                   }|dd df   }| j                  |j                  d         }|j                  |      S )N.r3   r  )r  r  rf   )r   r  r  r  s       r1   convert_bpe2imgz*Emu3ImageVocabularyMapping.convert_bpe2imgY  sG    !!c3B3h'	00e1DE
}}V$$r0   N)r(   r)   r*   r+   rw   r   r'   r  r  r  r  r
  listr,   r   r  r  r/   r0   r1   r  r  *  s    7
 j j k k ] ] 7 7    %ell); % %% %%,, %r0   r  c                   P    e Zd ZU eed<   dZdZdZdgZddgZ	dZ
dZdZdZdZeedZy	)
Emu3PreTrainedModelrp   modelr  textTr   r   causal_maskr  N)r(   r)   r*   r"   r.   r  r  supports_gradient_checkpointingr  _skip_keys_device_placementr  r  _can_compile_fullgraphr  r  r   rn   r  r/   r0   r1   r  r  `  s]    (&*# $5m"DN!"&)#r0   r  c                        e Zd ZU ej                  ed<   ddef fdZe	 	 	 ddedz  de	d   de
dz  ded	ef   fd
       Z ej                         ed               Z xZS )Emu3RotaryEmbeddinginv_freqNrp   c                    t         |           |j                  | _        |j                  | _        || _        | j
                  j                  d   | _        | j                  }| j                  dk7  rt        | j                     } || j
                  |      \  }| _
        | j                  d|d       | j                  d|j                         d       y )N	rope_typedefaultr   F)
persistentoriginal_inv_freq)rv   rw   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenrp   rope_parametersr"  compute_default_rope_parametersr   attention_scalingregister_bufferclone)r   rp   r  rope_init_fnr   r   s        r1   rw   zEmu3RotaryEmbedding.__init__y  s    "("@"@$*$B$B!44[A!%!E!E>>Y&.t~~>L+7V+L($(ZeD0(..2BuUr0   r  ztorch.deviceseq_lenrJ   ztorch.Tensorc                    | j                   d   }t        | dd      xs | j                  | j                  z  }d}d|t	        j
                  d|dt        j                        j                  |t        j                        |z  z  z  }||fS )	a  
        Computes the inverse frequencies according to the original RoPE implementation
        Args:
            config ([`~transformers.PreTrainedConfig`]):
                The model configuration.
            device (`torch.device`):
                The device to use for initialization of the inverse frequencies.
            seq_len (`int`, *optional*):
                The current sequence length. Unused for this type of RoPE.
        Returns:
            Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
            post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
        
rope_thetarQ   Nr   r   r4   r  )r  r\   )	r)  rx   ry   rz   r,   arangeint64rf   r   )rp   r  r/  baser6   attention_factorr   s          r1   r*  z3Emu3RotaryEmbedding.compute_default_rope_parameters  s    & %%l3fj$/c63E3EIcIc3c U\\!S!5;;?BB&X]XcXcBdgjjk
 )))r0   c                 N   | j                   d d d d f   j                         j                  |j                  d   dd      j	                  |j
                        }|d d d d d f   j                         }t        |j
                  j                  t              r/|j
                  j                  dk7  r|j
                  j                  nd}t        |d      5  |j                         |j                         z  j                  dd      }t        j                  ||fd	      }|j                         | j                  z  }|j                         | j                  z  }	d d d        j	                  |j                   
      	j	                  |j                   
      fS # 1 sw Y   AxY w)Nr   r3   r!   mpsr  F)device_typeenabledr4   r5   r  )r   r   rL   r7   rf   r  r  typestrr   ra   r,   r8   rB   r+  rC   r\   )
r   r9   r   inv_freq_expandedposition_ids_expandedr8  freqsembrB   rC   s
             r1   r   zEmu3RotaryEmbedding.forward  sR    !MM$4-8>>@GGHZHZ[\H]_acdehhijiqiqr ,QaZ 8 > > @'1!((--'E!((--[`J`ahhmmfkUC 	5&,,.1F1L1L1NNYYZ[]^_E))UEN3C'')d444C'')d444C		5 vvAGGv$cff177f&;;;	5 	5s   BFF$r   )NNN)r(   r)   r*   r,   r   r.   r"   rw   staticmethodr   r   r   r   r*  r  r   r   r   r   s   @r1   r  r  v  s    llVz V  $(+/"*T!*(* t* 
~u$	%	* *: U]]_<  <r0   r  c                       e Zd Zdef fdZeee	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dedz  dej                  dz  d	ej                  dz  d
edz  dee   defd                     Z xZS )Emu3TextModelrp   c           	         t         |   |       |j                  | _        |j                  | _        t        j                  |j                  |j                  | j                        | _        t        j                  t        |j                        D cg c]  }t        ||       c}      | _        t        |j                  |j                        | _        t#        |      | _        d| _        | j)                          y c c}w )Nr   rp   F)rv   rw   pad_token_idr  
vocab_sizerb   r   ry   embed_tokensrj  rl  num_hidden_layersr   layersr   r   normr  
rotary_embgradient_checkpointingr  r   s      r1   rw   zEmu3TextModel.__init__  s     !.. ++LL):):F<N<NPTP`P`ammBGH`H`BabYfi0b
   2 28K8KL	-V<&+# 	 cs   DN	input_idsrW   r   r   inputs_embedsr   r   rZ   rJ   c                 D   |d u |d uz  rt        d      || j                  |      }|r|t        | j                        }|E||j	                         nd}	t        j                  |j                  d   |j                        |	z   }||j                  d      }t        | j                  |||||      }
|}| j                  ||      }| j                  d | j                  j                   D ]  } ||f|
|||||d|} | j                  |      }t        ||	      S )
Nz:You must specify exactly one of input_ids or inputs_embedsrD  r   r!   )r  )rp   rN  rW   r   r   r   )r   )rW   r   r   r   r   r   )r  r   )rF  rG  r   rp   get_seq_lengthr,   r2  r7   r  r?   r   rK  rI  rH  rJ  r   )r   rM  rW   r   r   rN  r   r   rZ   past_seen_tokensr  rH   r   decoder_layers                 r1   r   zEmu3TextModel.forward  s]    -t";<YZZ *.*;*;I*FM0*$++>O!CRC^==?de]003M<P<PQTdd  )33A6L(;;'))+%
 &"oom,oW![[)H4;;+H+HI 
	M)	*$7) /#-	 	M
	 		-0&++
 	
r0   )NNNNNNN)r(   r)   r*   r"   rw   r   r    r   r,   r-   r   r
   r^  r   r   r   r   r   r   r   s   @r1   rB  rB    s    z     .2.204(,2626!%9
##d*9
 t+9
 &&-	9

 9
 ((4/9
 ((4/9
 $;9
 +,9
 
!9
    9
r0   rB  c                   n    e Zd ZU ddiZddiZddgdgfiZeed<    fdZe	e
	 	 	 	 	 	 	 	 	 dd
ej                  d	z  dej                  d	z  dej                  d	z  ded	z  dej                  d	z  dej                  d	z  ded	z  dej                  d	z  deej                  z  dee   defd              Z xZS )Emu3ForCausalLMlm_head.weightzmodel.embed_tokens.weightlm_headcolwise_gather_outputrH   logitsrp   c                     t         |   |       t        |      | _        |j                  | _        t        j                  |j                  |j                  d      | _        | j                          y NFrt   )
rv   rw   rB  r  rF  rb   r}   ry   rV  r  r   s     r1   rw   zEmu3ForCausalLM.__init__  sU     "6*
 ++yy!3!3V5F5FUS 	r0   NrM  rW   r   r   rN  labelsr   r   logits_to_keeprZ   rJ   c
                 z    | j                   d|||||||d|
}|j                  }t        |	t              rt	        |	 d      n|	}| j                  |dd|ddf         }d}|* | j                  d||| j                  j                  d|
}t        |||j                  |j                  |j                        S )a  
        Example:

        ```python
        >>> from transformers import Emu3Processor, Emu3ForConditionalGeneration
        >>> import torch
        >>> import httpx
        >>> from io import BytesIO
        >>> from PIL import Image

        >>> model = Emu3ForCausalLM.from_pretrained("BAAI/Emu3-Chat-hf", dtype=torch.bfloat16)
        >>> processor = Emu3Processor.from_pretrained("BAAI/Emu3-Chat-hf")

        >>> inputs = processor(text=["Can you write me a poem about winter."], return_tensors="pt").to(model.device)

        >>> generated_ids = model.generate(**inputs, max_new_tokens=100, do_sample=False)
        >>> processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        ```rM  rW   r   r   rN  r   r   NrX  r[  rF  lossrX  r   rH   r  r/   )r  r  r  r   slicerV  loss_functionrp   rF  r   r   rH   r  )r   rM  rW   r   r   rN  r[  r   r   r\  rZ   outputsrH   slice_indicesrX  ra  s                   r1   r   zEmu3ForCausalLM.forward  s    B ,64:: 	,
)%+')	,
 	,
  118B>SV8W~ot4]kmA}a,?@A%4%%pVFt{{OeOepiopD%#33!//))
 	
r0   )	NNNNNNNNr   )r(   r)   r*   _tied_weights_keys_tp_plan_pp_planr#   r.   rw   r   r   r,   r-   r   r
   r^  r   r   r   r   r   r   r   r   s   @r1   rT  rT    s5   *,GH23H_-z:;H  .2.204(,26*.!%26-.9
##d*9
 t+9
 &&-	9

 9
 ((4/9
   4'9
 $;9
 ((4/9
 ell*9
 +,9
 
 9
  9
r0   rT  c                       e Zd ZddiZ fdZd Zd Zdej                  dej                  dej                  fd	Z
e ed
      dej                  dej                  dee   deez  fd              Z ej$                         dej                  dedefd       Zdej                  dej                  dej                  fdZee	 	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej,                  dz  dej,                  dz  dej                  dz  dedz  dej                  dz  dedz  dej                  dz  dee   deez  fd              Z xZS )	Emu3Modelztext_model.model
text_modelc                     t         |   |       t        j                  |j                        | _        t        |j                        | _        t        |j                        | _        | j                          y r   )rv   rw   rB  _from_configtext_configrk  r  	vq_configvqmodelr  vocabulary_mapvocabulary_mappingr  r   s     r1   rw   zEmu3Model.__init__Y  sY     '44V5G5GH !1!12"<V=R=R"S 	r0   c                 6    | j                   j                         S r   )rk  get_input_embeddingsr   s    r1   rt  zEmu3Model.get_input_embeddingsb  s    3355r0   c                 :    | j                   j                  |       y r   )rk  set_input_embeddingsr   rV   s     r1   rv  zEmu3Model.set_input_embeddingse  s    ,,U3r0   r  r  rJ   c                     | j                   j                  ||d      }|j                  D cg c]+  }| j                  j	                  |      j                         - }}t        j                  |      }|S c c}w )a  
        Tokenizes images into discrete tokens with VQGAN module. Converts
        obtained image tokens into BPE tokens and wraps with "boi" and "eoi"
        special tokens.

        Args:
            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
                The tensors corresponding to the input images.
            image_sizes (`torch.LongTensor` of shape `(batch_size, 2)`):
                The sizes of the images in the batch, being (height, width) for each image.
        T)return_dict)rp  r  r'   rr  r  r  r,   r8   )r   r  r  vqmodel_outputstokensbpe_tokens_list
bpe_tokenss          r1   get_image_tokenszEmu3Model.get_image_tokensh  sv     150C0CLR]ko0C0pTcTpTp
JPD##33F;CCE
 
 YY/
	
s   0A6zbTokenizes images into discrete tokens with VQGAN module and embeds them with text embeddings layerr  rZ   c                     | j                   j                  ||fddi|}|D cg c];  \  }}|| j                   j                  z  || j                   j                  z  dz   z  = }}}|j                  D cg c]+  }| j                  j                  |      j                         - }	}t        j                  |	      }
 | j                         |
      }t        j                  ||      }||_        |S c c}}w c c}w )z
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)):
            The tensors corresponding to the input images.
        ry  Tr!   )rp  r  r  r'   rr  r  r  r,   r8   rt  splitpooler_output)r   r  r  rZ   rz  r   r   split_sizesr{  r|  r}  image_embeddingsimage_featuress                r1   get_image_featureszEmu3Model.get_image_features{  s	    1D0C0C+1
371
;A1

 "-
 t||999et||GiGi>ilm>mn
 

 UdTpTp
JPD##33F;CCE
 
 YY/
64446zB%5{C(6%

s   A C370C9r'   r   r   c                     |ddddf   j                  d||dz         }| j                  j                  |      }| j                  j	                  |      }|S )a  
        Decodes generated image tokens from language model to continuous pixel values
        with VQGAN module via upsampling.

        Args:
            image_tokens (`torch.LongTensor` of shape `(batch_size, num_of_tokens)`):
                The tensors corresponding to the input images.
            height (`int`):
                Height of the generated image before upsampling.
            width (`int`):
                Width of the generated image before upsampling.
        Nr3   r!   )r   rr  r  rp  r  )r   r'   r   r   	sequencesr  s         r1   decode_image_tokenszEmu3Model.decode_image_tokens  sX     !CRC(--b&%!)D	..>>yI##L1r0   rM  rN  r  c                 N   |m| | j                         t        j                  | j                  j                  t        j
                  |j                              k(  }|j                  d      }n|| j                  j                  k(  }|j                         }|j                  d   |j                  d   z  }|j                  d      j                  |      j                  |j                        }t        ||   j                         |j                         k(  d| d|        |S )z
        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
        equal to the length of multimodal features. If the lengths are different, an error is raised.
        )r\   r  r3   r   r!   z6Image features and image tokens do not match, tokens: z, features: )rt  r,   tensorrr  r  longr  allr   r7   r?   	expand_asrf   r   numel)r   rM  rN  r  special_image_maskn_image_tokensn_image_featuress          r1   get_placeholder_maskzEmu3Model.get_placeholder_mask  s    !.2M$2K2K2MT44CC5::^k^r^rs3 " "4!7!7!;!*d.E.E.T.T!T+//1)//2^5I5I!5LL/99"=GGVYYZgZnZno,-3359M9M9OOD^DTT`aq`rs	
 "!r0   NrW   r   r   r   r   c
           
      F   |du |duz  rt        d      | | j                         |      }|Y| j                  ||      j                  }t	        j
                  |d      }| j                  |||      }|j                  ||      } | j                  d||||||	d|
}|S )ap  
        image_sizes (`torch.LongTensor` of shape `(batch_size, 2)`):
            The sizes of the images in the batch, being (height, width) for each image. Image sizes can be obtained using
            [`AutoImageProcessor`]. See [`Emu3ImageProcessor.__call__`] for details ([]`Emu3Processor`] uses
            [`Emu3ImageProcessor`] for processing images).
        NzaYou cannot specify both input_ids and inputs_embeds at the same time, and must specify either oner   r5   )rN  r  )rW   r   r   rN  r   r   r/   )	rF  rt  r  r  r,   r8   r  masked_scatterrk  )r   rM  r  r  rW   r   r   rN  r   r   rZ   r  r  rd  s                 r1   r   zEmu3Model.forward  s    * -t";<s   7D557	BM#!44\;O]]N"YY~1=N!%!:!:~ "; " *889K^\M "$// 
)%+')
 
 r0   )	NNNNNNNNN)r(   r)   r*   _checkpoint_conversion_mappingrw   rt  rv  r,   r^  r-   r~  r   r   r   r   r   r&   r  r  r   r  r  r   r
   r   r   r   r   r   s   @r1   rj  rj  V  s   &8,%G"64U->-> UM]M] bgbrbr & y!--<A<L<LX^_qXr	%	% 0 U]]_0@0@ # VY  $"))":?:K:K"]b]n]n"0  .215+/.204(,26!%26.##d*. ''$.. \\D(	.
 t+. &&-. . ((4/. $;. ((4/. +,. 
'	'.  .r0   rj  c                       e Zd ZdZddiZddddZ fdZd	 Zd
 Zde	j                  fdZd Zee	 	 	 	 	 	 	 	 	 	 	 ddej                   dz  dej"                  dz  dej$                  dz  dej$                  dz  dej                   dz  dedz  dej"                  dz  dedz  dej                   dz  dej                   dz  deej$                  z  dee   deez  fd              Z	 	 	 	 	 	 	 	 d fd	Z xZS )Emu3ForConditionalGenerationr  rU  z$model.text_model.embed_tokens.weightzmodel.text_modelzmodel.vqmodelrV  )z^text_model.modelz^vqmodelz^text_model.lm_headc                     t         |   |       t        |      | _        t	        j
                  |j                  j                  |j                  j                  d      | _	        | j                          y rZ  )rv   rw   rj  r  rb   r}   rn  ry   rF  rV  r  r   s     r1   rw   z%Emu3ForConditionalGeneration.__init__  sS     v&
yy!3!3!?!?ASASA^A^ejkr0   c                 6    | j                   j                         S r   )r  rt  r   s    r1   rt  z1Emu3ForConditionalGeneration.get_input_embeddings  s    zz..00r0   c                 :    | j                   j                  |       y r   )r  rv  rw  s     r1   rv  z1Emu3ForConditionalGeneration.set_input_embeddings  s    

''.r0   rJ   c                     | j                   S r   )rV  r   s    r1   get_output_embeddingsz2Emu3ForConditionalGeneration.get_output_embeddings  s    ||r0   c                 :     | j                   j                  di |S r@  )r  r  )r   rZ   s     r1   r  z0Emu3ForConditionalGeneration.decode_image_tokens  s    -tzz--777r0   NrM  r  r  rW   r   r   rN  r   r   r[  r\  rZ   c                     | j                   d|||||||	d|}|d   }t        |t              rt        | d      n|}| j	                  |dd|ddf         }d}|
4 | j
                  d||
| j                  j                  j                  d|}t        |||j                  |j                  |j                        S )a  
        image_sizes (`torch.LongTensor` of shape `(batch_size, 2)`):
            The sizes of the images in the batch, being (height, width) for each image. Image sizes can be obtained using
            [`AutoImageProcessor`]. See [`Emu3ImageProcessor.__call__`] for details ([]`Emu3Processor`] uses
            [`Emu3ImageProcessor`] for processing images).
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import Emu3Processor, Emu3ForConditionalGeneration
        >>> import torch
        >>> import httpx
        >>> from io import BytesIO
        >>> from PIL import Image

        >>> model = Emu3ForConditionalGeneration.from_pretrained("BAAI/Emu3-Chat-hf", dtype=torch.bfloat16)
        >>> processor = Emu3Processor.from_pretrained("BAAI/Emu3-Chat-hf")

        >>> conversation = [
        ...     {
        ...     "role": "system",
        ...     "content": [
        ...         {"type": "text", "text": "You are a helpful assistant."},
        ...         ],
        ...     },
        ...     {
        ...     "role": "user",
        ...     "content": [
        ...         {"type": "image"},
        ...         {"type": "text", "text": "Please describe the image."},
        ...         ],
        ...     },
        ... ]

        >>> prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
        >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> inputs = processor(images=[image], text=[prompt], return_tensors="pt").to(model.device, torch.bfloat16)

        >>> generated_ids = model.generate(**inputs, max_new_tokens=100, do_sample=False)
        >>> processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        ```r^  r   Nr_  r`  r/   )r  r  r   rb  rV  rc  rp   rn  rF  r   r   rH   r  )r   rM  r  r  rW   r   r   rN  r   r   r[  r\  rZ   rd  rH   re  rX  ra  s                     r1   r   z$Emu3ForConditionalGeneration.forward  s    B $** 	
)%+')	
 	
  
8B>SV8W~ot4]kmA}a,?@A%4%% f9P9P9[9[_eD &#33!//))
 	
r0   c
                 L    t        |   |f||||||||	d|
}|	s|rd |d<   |S )N)r   rW   rN  r   r   r  r   is_first_iterationr  )rv   prepare_inputs_for_generation)r   rM  r   rW   rN  r   r   r   r  r  rZ   model_inputsr   s               r1   r  z:Emu3ForConditionalGeneration.prepare_inputs_for_generationp  sU     w<
+)')%%1
 
 "i+/L(r0   )NNNNNNNNNNr   )NNNNNTNF)r(   r)   r*   output_modalitiesrf  r  rw   rt  rv  rb   ro  r  r  r   r   r,   r-   r^  r   r
   r   r   r   r   r   r   r   r  r   r   s   @r1   r  r    s   )*,RS/#(&"1/ryy 8  .215+/.204(,26!%26*.-.[
##d*[
 ''$.[
 \\D(	[

 t+[
 &&-[
 [
 ((4/[
 $;[
 ((4/[
   4'[
 ell*[
 +,[
 
'	'[
  [
@   r0   r  )r  rT  rB  r  r  rj  rb  )r   )br  collections.abcr   dataclassesr   	functoolsr   typingr   r,   torch.nnrb   torch.nn.functionalrc   r    r   r  activationsr	   cache_utilsr
   r   
generationr   integrationsr   r   r   masking_utilsr   modeling_layersr   modeling_outputsr   r   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   utils.genericr   r   utils.output_capturingr    configuration_emu3r"   r#   r$   r&   r<   rG   r   r   rR   ro  r   rl   rn   r   r   r   r   r   r   r  r  r#  r-  r1  r<  rC  r  rO  rU  r`  r|  r  r  r  r  r  r  rB  rT  rj  r  __all__r/   r0   r1   <module>r     s  ,  $ ! %      & ! . ) f f / 9 k k K F & a a G 5 K K 15 1  1( *+ ,2	UU\\ 	U# 	U%,, 	U& %II%<<% 
% <<	%
 LL4'% % % '(%2 )*C)BII C) +C)L Y'J")) J (J(bii  *1 *Z$ryy $D	RYY 	299 bii :!299 !H		 .")) &.(299 .(b<(299 <(~>)bii >)BV V299 D8 8v7ryy 7tCryy CLCryy CL ~2 ~2~2B3% 3%l /  *><")) ><B M
' M
 M
` J
)? J
 J
Z\# \~Z#6 Zzr0   