
    qi                     |   d dl mZ d dlmZ d dlmZ d dlZd dlmZ ddlm	Z
 ddlmZ dd	lmZmZ dd
lmZ ddlmZmZmZ ddlmZ ddlmZ ddlmZ ddlmZmZmZm Z  ddl!m"Z"m#Z# ddl$m%Z%m&Z& ddl'm(Z( ddl)m*Z*m+Z+m,Z,m-Z- ddl.m/Z/m0Z0 ddl1m2Z2 ddl3m4Z4 ddl5m6Z6m7Z7  ed       G d dejp                               Z9 G d dejp                        Z: G d d ejp                        Z; G d! d"ejp                        Z< G d# d$ejp                        Z=d% Z> G d& d'ejp                        Z? G d( d)ejp                        Z@ G d* d+ejp                        ZAd, ZB ed-      dXd.       ZCd/ej                  d0eEd1ej                  fd2ZF	 dYd3ejp                  d4ej                  d5ej                  d6ej                  d7ej                  dz  d8eGd9eGd:e(e*   fd;ZH eeC       G d< d=ejp                               ZI G d> d?e      ZJe+ G d@ dAe&             ZKe+ G dB dCe&             ZL G dD dEejp                        ZMe+ G dF dGeK             ZNe+ G dH dIeKe             ZOe e+dJK       G dL dMe                     ZPe e+dNK       G dO dPe                    ZQ e+dQK       G dR dSeL             ZR e+dTK       G dU dVeLe             ZSg dWZTy)Z    )Callable)	dataclass)OptionalN)nn   )initialization)ACT2FN)CacheDynamicCache)GenerationMixin)use_kernel_forward_from_hubuse_kernel_func_from_hubuse_kernelized_func)create_causal_mask)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutputWithPastBaseModelOutputWithPoolingCausalLMOutputWithPastModelOutput)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tupletorch_compilable_check)maybe_autocastmerge_with_config_defaults)capture_outputs   )	AutoModel   )
AriaConfigAriaTextConfigRMSNormc                   h     e Zd Zddeddf fdZdej                  dej                  fdZd Z xZ	S )	AriaTextRMSNormepsreturnNc                     t         |           t        j                  t	        j
                  |            | _        || _        y)z>
        AriaTextRMSNorm is equivalent to T5LayerNorm
        N)super__init__r   	Parametertorchonesweightvariance_epsilon)selfhidden_sizer+   	__class__s      X/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/aria/modeling_aria.pyr/   zAriaTextRMSNorm.__init__5   s1     	ll5::k#:; #    hidden_statesc                 "   |j                   }|j                  t        j                        }|j	                  d      j                  dd      }|t        j                  || j                  z         z  }| j                  |j                  |      z  S )Nr#   T)keepdim)	dtypetor1   float32powmeanrsqrtr4   r3   )r5   r:   input_dtypevariances       r8   forwardzAriaTextRMSNorm.forward=   sy    #))%((7 $$Q',,R,>%Ht?T?T4T(UU{{]--k:::r9   c                 ^    t        | j                  j                         d| j                   S )Nz, eps=)tupler3   shaper4   r5   s    r8   
extra_reprzAriaTextRMSNorm.extra_reprD   s*    ))*+6$2G2G1HIIr9   )gư>)
__name__
__module____qualname__floatr/   r1   TensorrF   rK   __classcell__r7   s   @r8   r*   r*   3   s7    $ $$ $;U\\ ;ell ;Jr9   r*   c                   (     e Zd ZdZ fdZd Z xZS )AriaProjectorMLPa!  
    Feed-Forward Network module for the Aria Projector.

    Args:
        in_features (`int`):
            Input embedding dimension.
        hidden_features (`int`):
            Hidden dimension of the feed-forward network.
        output_dim (`int`):
            Output dimension.
    c                     t         |           t        j                  ||d      | _        t        j                  ||d      | _        t        d   | _        y )NFbiasgelu_new)r.   r/   r   Linear	linear_in
linear_outr	   act)r5   in_featureshidden_features
output_dimr7   s       r8   r/   zAriaProjectorMLP.__init__U   sB    ;eL))OZeL*%r9   c                 h    | j                  | j                  |            }| j                  |      }|S N)r\   rZ   r[   )r5   r:   s     r8   rF   zAriaProjectorMLP.forward[   s-    !>?6r9   rL   rM   rN   __doc__r/   rF   rQ   rR   s   @r8   rT   rT   H   s    
&r9   rT   c                   6     e Zd ZdZddedef fdZddZ xZS )AriaCrossAttentionzv
    Aria Cross-Attention module.

    Args:
        config (`AriaConfig`):
            The configuration to use.
    configdropout_ratec                 B   t         |           |j                  j                  }|j                  j                  }|| _        t        j                  ||d      | _        t        j                  ||d      | _	        t        j                  ||d      | _
        t        j                  ||d      | _        t        j                  ||      | _        t        j                  |      | _        t        j                   |      | _        t        j                   |      | _        y )NFrV   T)batch_first)r.   r/   vision_configr6   num_attention_heads	num_headsr   rY   q_projk_projv_projMultiheadAttentionmultihead_attnlinearDropoutdropout	LayerNorm
layer_normlayer_norm_kv)r5   rf   rg   r6   rl   r7   s        r8   r/   zAriaCrossAttention.__init__j   s    **66((<<	"ii[uEii[uEii[uE !33KX\]ii[9zz,/,,{3\\+6r9   c                    | j                  | j                  |            }| j                  |      }| j                  |      }| j	                  |      }| j                  ||||      \  }}| j                  | j                  |            }|S )a  
        Forward pass of the AriaCrossAttention module.

        Args:
            key_value_states (`torch.Tensor`):
                Input tensor for key and value.
            hidden_states (`torch.Tensor`):
                Input tensor for query.
            attn_mask (`torch.Tensor`, *optional*, defaults to None):
                Attention mask.

        Returns:
            torch.Tensor:
                Output tensor after cross-attention.
        	attn_mask)rm   rv   rw   rn   ro   rq   rt   rr   )	r5   key_value_statesr:   rz   querykeyvalueattn_output_s	            r8   rF   zAriaCrossAttention.forward{   s      DOOM:;--.>?kk*+,-,,UC),TQll4;;{#;<r9   )r   ra   )	rL   rM   rN   rc   r&   rO   r/   rF   rQ   rR   s   @r8   re   re   a   s     7z 7 7"r9   re   c                   h     e Zd ZdZdef fdZddej                  dej                  dz  fdZ xZ	S )	AriaProjectora  
    Aria Projector module.

    This module projects vision features into the language model's embedding space, enabling interaction between vision and language components.

    Args:
        config (`AriaConfig`):
            Configuration object for the model.
    rf   c                    t         |           |j                  | _        |j                  j
                  | _        |j                  j                  | _        |j                  j
                  | _	        |j                  j
                  | _        |j                  j
                  | _        t        j                  t        j                   |j"                  | j                              | _        t'        |      | _        t        j*                  | j                        | _        t/        | j                  | j                  | j                        | _        y ra   )r.   r/   projector_patch_to_query_dictpatch_to_query_dictrj   r6   r]   rk   rl   kv_dimtext_configr^   r_   r   r0   r1   zeros'max_value_projector_patch_to_query_dictr|   re   
cross_attnru   rv   rT   feed_forwardr5   rf   r7   s     r8   r/   zAriaProjector.__init__   s     	#)#G#G !//;;--AA**66%11== ,,88\\%++f.\.\^b^n^n"op
,V4,,t'7'78,T-=-=t?S?SUYUdUder9   Nr{   rz   c                 4   |j                   d   |j                   d   }}|| j                  vr*t        d| d| j                  j                          d      | j                  |   }| j                  d| j                  d      j                  |dd      }|M|j                  | j                  d      }|j                  d      j                  d|j                  d      d      }| j                  |||      }| j                  | j                  |            }|S )	a  
        Forward pass of the Projector module.

        Args:
            key_value_states (`torch.Tensor`):
                Input tensor of shape (batch_size, num_patches, kv_dim).
            attn_mask (`torch.Tensor`, *optional*, default is None):
                Attention mask.

        Returns:
            `torch.Tensor`: Output tensor of shape (batch_size, query_number, output_dim).
        r   r%   zNumber of patches z: not found in patch_to_query_dict amongst possible values .Nr<   ry   )rI   r   KeyErrorkeysr|   	unsqueezerepeatrepeat_interleaverl   expandsizer   r   rv   )	r5   r{   rz   
batch_sizenum_patches	query_numqueriesattention_outouts	            r8   rF   zAriaProjector.forward   s6    #3"8"8";=M=S=STU=VK
d666$[M1klp  mE  mE  mJ  mJ  mL  lM  MN  O  ,,[9	**Zi(2215<<ZAN !33DNNAFI!++A.55b',,q/2NI(8'YW >?
r9   ra   )
rL   rM   rN   rc   r&   r/   r1   rP   rF   rQ   rR   s   @r8   r   r      s8    ff( PTAT r9   r   c                   .     e Zd ZdZdef fdZd Z xZS )AriaSharedExpertsMLPa/  
    Shared Expert MLP for shared experts.

    Unlike routed experts, shared experts process all tokens without routing.
    This class reconfigures the intermediate size in comparison to the LlamaMLP.

    Args:
        config (`AriaTextConfig`): Configuration object for the Aria language model.
    rf   c                     t         |           || _        |j                  | _        |j                  |j
                  z  | _        t        j                  | j                  | j                  |j                        | _	        t        j                  | j                  | j                  |j                        | _
        t        j                  | j                  | j                  |j                        | _        t        |j                     | _        y )NrV   )r.   r/   rf   r6   intermediate_sizemoe_num_shared_expertsr   rY   mlp_bias	gate_projup_proj	down_projr	   
hidden_actact_fnr   s     r8   r/   zAriaSharedExpertsMLP.__init__   s    !--!'!9!9F<Y<Y!Y4#3#3T5K5KRXRaRabyy!1!143I3IPVP_P_`4#9#94;K;KRXRaRabV../r9   c                     | j                  | j                  | j                  |            | j                  |      z        }|S ra   )r   r   r   r   )r5   xr   s      r8   rF   zAriaSharedExpertsMLP.forward   s6    NN4;;t~~a/@#ADLLQRO#ST	r9   )rL   rM   rN   rc   r'   r/   rF   rQ   rR   s   @r8   r   r      s    0~ 0r9   r   c                    | j                   d   }|j                   d   }t        j                  ||| j                  | j                        }t        j
                  |d      }t        j                  dt        j                  |j                        }t        j                  ||f      }t        |j                   d         D ]2  }||   }	||dz      }
| |	|
 }t        j                  |||         }|||	|
 4 |S )a*  
    Compute the matrix multiplication (GEMM) for each expert sequentially. This approach is computationally inefficient, especially when dealing with a large number of experts.

    Args:
        token_states (torch.Tensor): Input tensor of shape (num_tokens, in_features).
        expert_weights (torch.Tensor): Weight tensor of shape (num_experts, in_features, out_features).
        tokens_per_expert (torch.Tensor): Number of tokens assigned to each expert.

    Returns:
        torch.Tensor: Output tensor of shape (num_tokens, out_features).
    r   r<   r>   devicedimr%   )
rI   r1   r   r>   r   cumsumlongcatrangematmul)token_statesexpert_weightstokens_per_expert
num_tokensout_featuresoutputcumsum_num_tokenszero_tensor
expert_numstartendtokensr   s                r8   sequential_experts_gemmr      s     ##A&J!''+L[[\9K9KT`TgTghF%6A>++auzz:K:R:RSK		;0A"BCN0034  
!*-
Q/eC(ll6>*#=>uS  Mr9   c                   (     e Zd ZdZ fdZd Z xZS )AriaGroupedExpertsGemmaP  
    Grouped GEMM (General Matrix Multiplication) module for efficient expert computation.
    This module utilizes the grouped_gemm library (https://github.com/fanshiqing/grouped_gemm)
    for optimized performance. If the grouped_gemm library is not installed, it gracefully
    falls back to a sequential GEMM implementation, which may be slower but ensures
    functionality.

    Args:
        in_features (`int`):
            Number of input features.
        out_features (`int`):
            Number of output features.
        groups (`int`):
            Number of expert groups.
    c                     t         |           || _        || _        || _        t        j                  t        j                  |||            | _	        y ra   )
r.   r/   r]   r   groupsr   r0   r1   emptyr3   )r5   r]   r   r   r7   s       r8   r/   zAriaGroupedExpertsGemm.__init__#  sB    &(ll5;;v{L#QRr9   c                 L    t        || j                  |j                               S )au  
        Perform grouped matrix multiplication.

        Args:
            input (`torch.Tensor`):
                Input tensor of shape (num_tokens, in_features).
            tokens_per_expert (`torch.Tensor`):
                Number of tokens assigned to each expert.

        Returns:
            torch.Tensor: Output tensor of shape (num_tokens, out_features).
        )r   r3   cpu)r5   inputr   s      r8   rF   zAriaGroupedExpertsGemm.forward*  s'     'KK!!#
 	
r9   rb   rR   s   @r8   r   r     s     S
r9   r   c                   N     e Zd Zdeddf fdZd Zdej                  fdZ xZ	S )AriaExpertsrf   r,   Nc                     t         |           || _        t        |j                  |j
                  dz  |j                        | _        t        |j
                  |j                  |j                        | _        y )Nr#   )	r.   r/   rf   r   r6   r   moe_num_expertsfc1fc2r   s     r8   r/   zAriaExperts.__init__?  sa    )&*<*<f>V>VYZ>Z\b\r\rs)&*B*BFDVDVX^XnXnor9   c                     t        j                  || j                  j                  d      \  }}t        j
                  j                  |d      }||fS )Nr%   )kr   r<   r   )r1   topkrf   moe_topkr   
functionalsoftmax)r5   router_logits
top_logitstop_indicesscoress        r8   route_tokens_to_expertsz#AriaExperts.route_tokens_to_expertsE  sH    "'**]dkk>R>RXY"Z
K&&zr&:F""r9   c                 B   | j                  |      \  }}|j                  }t        j                  |j	                         j                  t        j                        | j                  j                  d| j                  j                  dz
        j                  |      }|}|j                  d      }t        j                  |      }	|j                  d|	| j                  j                  z        }
| j                  |
|      }t        j                  |dd      \  }}t        j                   j#                  |      |z  }| j%                  ||      }t        j&                  |j(                  d   | j                  j                  z  |j+                  d      f|j                  |j,                        }|j/                  d|	|       |j                  d| j                  j                  |j+                  d            }||j1                  d      z  j3                  d      }|S )Nr   r%   )binsminmaxr<   r#   r   r   )r   r>   r1   histcflattenr?   r@   rf   r   viewargsortindex_selectr   r   chunkr   r   silur   r   rI   r   r   index_copy_r   sum)r5   r:   r   top_k_indextop_k_weightsoriginal_dtyper   indicesflatten_indicessorted_indicespermuted_tokens
fc1_output
projectiongateexpert_outputunpermuted_tokensr   s                    r8   rF   zAriaExperts.forwardJ  s   %)%A%A-%P"]$**!KK!$$U]]3,,++a/	

 "^
 	 !,,r*7'44Q$++J^J^8^_XXo/@A
 ;;z1"=
D]]''
3d:
->?!KK  #dkk&:&::M<N<Nq<QR%% ''

 	%%aG-222t{{7K7K]M_M_`aMbc#m&=&=b&AAFF1FMr9   )
rL   rM   rN   r'   r/   r   r1   rP   rF   rQ   rR   s   @r8   r   r   >  s.    p~ p$ p#
u|| r9   r   c                   \     e Zd Zdef fdZdej                  dej                  fdZ xZS )AriaTextMoELayerrf   c                     t         |           t        j                  |j                  |j
                  d      | _        t        |      | _        t        |      | _
        || _        y NFrV   )r.   r/   r   rY   r6   r   routerr   expertsr   shared_expertsrf   r   s     r8   r/   zAriaTextMoELayer.__init__k  sO    ii 2 2F4J4JQVW"6*26:r9   r:   r,   c                 
   |j                   }|j                  d|j                  d            }| j                  |      }| j	                  ||      j                  |      }| j                  |j                  |            }||z   S Nr<   )rI   r   r   r   r   r   )r5   r:   original_shaper   r   shared_expert_outputs         r8   rF   zAriaTextMoELayer.forwardr  s{    &,,%**2}/A/A"/EFM2]MBGGW#22=3E3En3UV333r9   )	rL   rM   rN   r'   r/   r1   rP   rF   rQ   rR   s   @r8   r   r   j  s*    ~ 4U\\ 4ell 4r9   r   c                     | dd| j                   d   dz  f   }| d| j                   d   dz  df   }t        j                  | |fd      S )z*Rotates half the hidden dims of the input..Nr<   r#   r   )rI   r1   r   )r   x1x2s      r8   rotate_halfr   {  sZ    	
3"!''"+"""	#B	
3q ""	#B99rc2YB''r9   rotary_pos_embc                     |j                  |      }|j                  |      }| |z  t        |       |z  z   }||z  t        |      |z  z   }||fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    )r   r   )qr   cossinunsqueeze_dimq_embedk_embeds          r8   apply_rotary_pos_embr	    sY    & --
&C
--
&C3w;q>C/0G3w;q>C/0GGr9   r:   n_repr,   c                     | j                   \  }}}}|dk(  r| S | dddddddddf   j                  |||||      } | j                  |||z  ||      S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r%   N)rI   r   reshape)r:   r
  batchnum_key_value_headsslenhead_dims         r8   	repeat_kvr    so    
 2?1D1D.Ehz!!Qa"23::5BUW\^bdlmM  (;e(CT8TTr9   moduler|   r}   r~   attention_maskscalingrt   kwargsc                    t        || j                        }t        || j                        }	t        j                  ||j	                  dd            |z  }
||
|z   }
t
        j                  j                  |
dt        j                        j                  |j                        }
t
        j                  j                  |
|| j                        }
t        j                  |
|	      }|j	                  dd      j                         }||
fS )Nr#   r   r<   )r   r>   )ptrainingr%   )r  num_key_value_groupsr1   r   	transposer   r   r   r@   r?   r>   rt   r  
contiguous)r  r|   r}   r~   r  r  rt   r  
key_statesvalue_statesattn_weightsr   s               r8   eager_attention_forwardr    s     3 ; ;<JUF$?$?@L<<z';';Aq'ABWLL!#n4==((2U]](SVVW\WbWbcL==((6??([L,,|\:K''1-88:K$$r9   c                       e Zd ZdZdedef fdZ	 	 	 	 ddej                  de	ej                  ej                  f   dz  dej                  dz  d	e
dz  d
ej                  dz  dee   de	ej                  ej                  f   fdZ xZS )AriaTextAttentionz=Multi-headed attention from 'Attention Is All You Need' paperrf   	layer_idxc                 d   t         |           || _        || _        t	        |d|j
                  |j                  z        | _        |j                  |j                  z  | _	        | j                  dz  | _
        |j                  | _        d| _        t        j                  |j
                  |j                  | j                  z  |j                        | _        t        j                  |j
                  |j                  | j                  z  |j                        | _        t        j                  |j
                  |j                  | j                  z  |j                        | _        t        j                  |j                  | j                  z  |j
                  |j                        | _        y )Nr  g      TrV   )r.   r/   rf   r"  getattrr6   rk   r  r  r  r  attention_dropout	is_causalr   rY   attention_biasrm   rn   ro   o_projr5   rf   r"  r7   s      r8   r/   zAriaTextAttention.__init__  sM   "
F4F4F&JdJd4de$*$>$>&B\B\$\!}}d*!'!9!9ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii&&68J8JQWQfQf
r9   Nr:   position_embeddingsr  past_key_valuescache_positionr  r,   c                    |j                   d d }g |d| j                  }| j                  |      j                  |      j	                  dd      }	| j                  |      j                  |      j	                  dd      }
| j                  |      j                  |      j	                  dd      }|\  }}t        |	|
||      \  }	}
|'|||d}|j                  |
|| j                  |      \  }
}t        j                  | j                  j                  t              } || |	|
||f| j                  sdn| j                   | j"                  d|\  }} |j$                  g |d j'                         }| j)                  |      }||fS )Nr<   r%   r#   )r  r  r,          )rt   r  )rI   r  rm   r   r  rn   ro   r	  updater"  r   get_interfacerf   _attn_implementationr  r  r%  r  r  r  r(  )r5   r:   r*  r  r+  r,  r  input_shapehidden_shapequery_statesr  r  r  r  cache_kwargsattention_interfacer   r  s                     r8   rF   zAriaTextAttention.forward  s    $))#2.88b8$--8{{=166|DNNqRST[[/44\BLLQPQR
{{=166|DNNqRST&S#7jRUWZ#[ j&#&snUL'6'='=j,X\XfXfht'u$J(?(M(MKK,,.E)
 %8	%
  $}}C$2H2HLL	%
 	%
!\ *k));;;;FFHkk+.L((r9   )NNNN)rL   rM   rN   rc   r'   intr/   r1   rP   rH   r
   
LongTensorr   r   rF   rQ   rR   s   @r8   r!  r!    s    G
~ 
# 
4 IM.2(,26))||)) #5<<#=>E)) t+	))
 )) ((4/)) +,)) 
u||U\\)	*))r9   r!  c                   &    e Zd ZdZdedef fdZ	 	 	 	 	 	 ddej                  dej                  dz  dej                  dz  d	e
dz  d
edz  dej                  dz  deej                  ej                  f   dz  dee   dej                  fdZ xZS )AriaTextDecoderLayerag  
    Aria Text Decoder Layer.

    This class defines a single decoder layer in the language model, incorporating self-attention and Mixture of Experts (MoE) feed-forward network.

    Args:
        config (`AriaTextConfig`):
            Configuration object for the text component of the model.
        layer_idx (`int`):
            Index of the layer.
    rf   r"  c                     t         |           |j                  | _        t        ||      | _        t        |      | _        t        |j                  |j                        | _	        t        |j                  |j                        | _
        y )N)rf   r"  r+   )r.   r/   r6   r!  	self_attnr   mlpr*   rms_norm_epsinput_layernormpost_attention_layernormr)  s      r8   r/   zAriaTextDecoderLayer.__init__  sm    !--*&IN#F+.v/A/AvGZGZ[(78J8JPVPcPc(d%r9   Nr:   r  position_idsr+  	use_cacher,  r*  r  r,   c                     |}	| j                  |      } | j                  d|||||||d|\  }}
|	|z   }|}	| j                  |      }| j                  |      }|	|z   }|S )N)r:   r  rB  r+  rC  r,  r*   )r@  r=  rA  r>  )r5   r:   r  rB  r+  rC  r,  r*  r  residualr   s              r8   rF   zAriaTextDecoderLayer.forward  s     !,,];)4>> 	
')%+) 3	
 	
q !=0 !55mD/ =0r9   )NNNFNN)rL   rM   rN   rc   r'   r7  r/   r1   rP   r8  r
   boolrH   r   r   rF   rQ   rR   s   @r8   r:  r:    s    
e~ e# e /304(,!&26HL|| t+ &&-	
  $; ((4/ #5<<#=>E +, 
r9   r:  c                        e Zd ZU eed<   dZdZddgZdZdZ	dZ
dZdZeedZ ej"                          fd	       Z xZS )
AriaTextPreTrainedModelrf   model)imagetextr:  r   Tr+  r:   
attentionsc                     t         |   |       t        |t              r7t	        j
                  |j                  d| j                  j                         y y )Nr.  )rB   std)	r.   _init_weights
isinstancer   initnormal_r3   rf   initializer_ranger5   r  r7   s     r8   rQ  z%AriaTextPreTrainedModel._init_weightsQ  s>    f%f45LLSdkk6S6ST 6r9   )rL   rM   rN   r'   __annotations__base_model_prefixinput_modalities_no_split_modulessupports_gradient_checkpointing_skip_keys_device_placement_supports_flash_attn_supports_sdpa_supports_attention_backendr:  r!  _can_record_outputsr1   no_gradrQ  rQ   rR   s   @r8   rI  rI  @  sj    (/1IJ&*#"3N"&-'
 U]]_U Ur9   rI  c                        e Zd ZU eed<   dZdZdgZdgZdZ	dZ
dZdZdZeedZ ej$                          fd       Z xZS )	AriaPreTrainedModelrf   rJ  TAriaDecoderLayerr+  FrM  c                     t         |   |       t        |t              r6t	        j
                  |j                  | j                  j                         y y )N)rP  )	r.   rQ  rR  r   rS  trunc_normal_r|   rf   rU  rV  s     r8   rQ  z!AriaPreTrainedModel._init_weightsi  s=    f%fm,v||1N1NO -r9   )rL   rM   rN   r&   rW  rX  r[  rZ  r\  r]  r^  _supports_flex_attn_can_compile_fullgraphr_  r:  r!  r`  r1   ra  rQ  rQ   rR   s   @r8   rc  rc  X  sp    &*#+,#4"5N""&-'
 U]]_P Pr9   rc  c                        e Zd ZU ej                  ed<   ddef fdZe	 	 	 ddedz  de	d   de
dz  ded	ef   fd
       Z ej                         ed               Z xZS )AriaTextRotaryEmbeddinginv_freqNrf   c                    t         |           |j                  | _        |j                  | _        || _        | j
                  j                  d   | _        | j                  }| j                  dk7  rt        | j                     } || j
                  |      \  }| _
        | j                  d|d       | j                  d|j                         d       y )N	rope_typedefaultrk  F)
persistentoriginal_inv_freq)r.   r/   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenrf   rope_parametersrm  compute_default_rope_parametersr   attention_scalingregister_bufferclone)r5   rf   r   rope_init_fnrk  r7   s        r8   r/   z AriaTextRotaryEmbedding.__init__s  s    "("@"@$*$B$B!44[A!%!E!E>>Y&.t~~>L+7V+L($(ZeD0(..2BuUr9   r   ztorch.deviceseq_lenr,   ztorch.Tensorc                    | j                   d   }t        | dd      xs | j                  | j                  z  }d}d|t	        j
                  d|dt        j                        j                  |t        j                        |z  z  z  }||fS )	a  
        Computes the inverse frequencies according to the original RoPE implementation
        Args:
            config ([`~transformers.PreTrainedConfig`]):
                The model configuration.
            device (`torch.device`):
                The device to use for initialization of the inverse frequencies.
            seq_len (`int`, *optional*):
                The current sequence length. Unused for this type of RoPE.
        Returns:
            Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
            post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
        
rope_thetar  Ng      ?r   r#   r>   )r   r>   )	rt  r$  r6   rk   r1   arangeint64r?   rO   )rf   r   rz  baser   attention_factorrk  s          r8   ru  z7AriaTextRotaryEmbedding.compute_default_rope_parameters  s    & %%l3fj$/c63E3EIcIc3c U\\!S!5;;?BB&X]XcXcBdgjjk
 )))r9   c                 N   | j                   d d d d f   j                         j                  |j                  d   dd      j	                  |j
                        }|d d d d d f   j                         }t        |j
                  j                  t              r/|j
                  j                  dk7  r|j
                  j                  nd}t        |d      5  |j                         |j                         z  j                  dd      }t        j                  ||fd	      }|j                         | j                  z  }|j                         | j                  z  }	d d d        j	                  |j                   
      	j	                  |j                   
      fS # 1 sw Y   AxY w)Nr   r<   r%   mpsr   F)device_typeenabledr#   r   r}  )rk  rO   r   rI   r?   r   rR  typestrr    r  r1   r   r  rv  r  r>   )
r5   r   rB  inv_freq_expandedposition_ids_expandedr  freqsembr  r  s
             r8   rF   zAriaTextRotaryEmbedding.forward  sR    !MM$4-8>>@GGHZHZ[\H]_acdehhijiqiqr ,QaZ 8 > > @'1!((--'E!((--[`J`ahhmmfkUC 	5&,,.1F1L1L1NNYYZ[]^_E))UEN3C'')d444C'')d444C		5 vvAGGv$cff177f&;;;	5 	5s   BFF$ra   )NNN)rL   rM   rN   r1   rP   rW  r'   r/   staticmethodr   r7  rH   rO   ru  ra  r   rF   rQ   rR   s   @r8   rj  rj  p  s    llV~ V  (,+/"*%*(* t* 
~u$	%	* *: U]]_<  <r9   rj  c                       e Zd Zdef fdZeee	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dedz  dej                  dz  d	ej                  dz  d
edz  dee   defd                     Z xZS )AriaTextModelrf   c           	         t         |   |       |j                  | _        |j                  | _        t        j                  |j                  |j                  | j                        | _        t        j                  t        |j                        D cg c]  }t        ||       c}      | _        t        |j                  |j                        | _        t#        |      | _        d| _        | j)                          y c c}w )Nr<  rf   F)r.   r/   pad_token_idpadding_idx
vocab_sizer   	Embeddingr6   embed_tokens
ModuleListr   num_hidden_layersr:  layersr*   r?  normrj  
rotary_embgradient_checkpointing	post_initr)  s      r8   r/   zAriaTextModel.__init__  s     !.. ++LL):):F<N<NPTP`P`ammFKFLdLdFef!&)4f
 $F$6$6F<O<OP	1@&+# 	 gs   DN	input_idsr  rB  r+  inputs_embedsr,  rC  r  r,   c                 D   |d u |d uz  rt        d      || j                  |      }|r|t        | j                        }|E||j	                         nd}	t        j                  |j                  d   |j                        |	z   }||j                  d      }t        | j                  |||||      }
|}| j                  ||      }| j                  d | j                  j                   D ]  } ||f|
|||||d|} | j                  |      }t        ||	      S )
Nz:You must specify exactly one of input_ids or inputs_embedsr  r   r%   )r   )rf   r  r  r,  r+  rB  )rB  )r  r*  rB  r+  rC  r,  )last_hidden_stater+  )
ValueErrorr  r   rf   get_seq_lengthr1   r~  rI   r   r   r   r  r  r  r  r   )r5   r  r  rB  r+  r  r,  rC  r  past_seen_tokenscausal_maskr:   r*  decoder_layers                 r8   rF   zAriaTextModel.forward  s]    -t";<YZZ *.*;*;I*FM0*$++>O!CRC^==?de]003M<P<PQTdd  )33A6L(;;'))+%
 &"oom,oW![[)H4;;+H+HI 
	M)	*$7) /#-	 	M
	 		-0&++
 	
r9   )NNNNNNN)rL   rM   rN   r'   r/   r!   r"   r   r1   r8  rP   r
   FloatTensorrG  r   r   r   rF   rQ   rR   s   @r8   r  r    s    ~     .2.204(,2626!%9
##d*9
 t+9
 &&-	9

 9
 ((4/9
 ((4/9
 $;9
 +,9
 
!9
    9
r9   r  c                   ^    e Zd ZddiZddiZddgdgfiZdef fdZe	 	 	 	 	 	 	 	 	 dd
e	j                  d	z  de	j                  d	z  de	j                  d	z  ded	z  de	j                  d	z  de	j                  d	z  ded	z  de	j                  d	z  dee	j                  z  dee   defd       Z xZS )AriaTextForCausalLMlm_head.weightzmodel.embed_tokens.weightlm_headcolwise_gather_outputr:   logitsrf   c                     t         |   |       t        |      | _        |j                  | _        t        j                  |j                  |j                  d      | _        | j                          y r   )
r.   r/   r  rJ  r  r   rY   r6   r  r  r   s     r8   r/   zAriaTextForCausalLM.__init__  sU     "6*
 ++yy!3!3V5F5FUS 	r9   Nr  r  rB  r+  r  labelsrC  r,  logits_to_keepr  r,   c
                 z    | j                   d|||||||d|
}|j                  }t        |	t              rt	        |	 d      n|	}| j                  |dd|ddf         }d}|* | j                  d||| j                  j                  d|
}t        |||j                  |j                  |j                        S )a  
        Example:

        ```python
        >>> from transformers import AutoTokenizer, AriaTextForCausalLM

        >>> model = AriaTextForCausalLM.from_pretrained("meta-aria_text/AriaText-2-7b-hf")
        >>> tokenizer = AutoTokenizer.from_pretrained("meta-aria_text/AriaText-2-7b-hf")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```)r  r  rB  r+  r  rC  r,  Nr  r  r  lossr  r+  r:   rN  rE  )rJ  r  rR  r7  slicer  loss_functionrf   r  r   r+  r:   rN  )r5   r  r  rB  r+  r  r  rC  r,  r  r  outputsr:   slice_indicesr  r  s                   r8   rF   zAriaTextForCausalLM.forward  s    > ,64:: 	,
)%+')	,
 	,
  118B>SV8W~ot4]kmA}a,?@A%4%%pVFt{{OeOepiopD%#33!//))
 	
r9   )	NNNNNNNNr   )rL   rM   rN   _tied_weights_keys_tp_plan_pp_planr'   r/   r   r1   r8  rP   r
   r  rG  r7  r   r   r   rF   rQ   rR   s   @r8   r  r    s.   *,GH23H_-z:;H~   .2.204(,26*.!%26-.8
##d*8
 t+8
 &&-	8

 8
 ((4/8
   4'8
 $;8
 ((4/8
 ell*8
 +,8
 
 8
 8
r9   r  zP
    Base class for Aria causal language model (or autoregressive) outputs.
    custom_introc                       e Zd ZU dZdZej                  dz  ed<   dZej                  dz  ed<   dZ	e
dz  ed<   dZeej                     dz  ed<   dZeej                     dz  ed<   dZej                  dz  ed<   y)	AriaCausalLMOutputWithPasta4  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    Nr  r  r+  r:   rN  image_hidden_states)rL   rM   rN   rc   r  r1   r  rW  r  r+  r
   r:   rH   rN  r  rE  r9   r8   r  r  M  s     &*D%

d
")'+FE$+$(OUT\(59M5**+d2926Je''(4/648**T18r9   r  zI
    Base class for Aria outputs, with hidden states and attentions.
    c                   :    e Zd ZU dZdZej                  dz  ed<   y)AriaModelOutputWithPasta  
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    Nr  )rL   rM   rN   rc   r  r1   r  rW  rE  r9   r8   r  r  k  s    	 59**T18r9   r  zt
    The Aria model which consists of a vision backbone and a language model, without a language modeling head.
    c                   L    e Zd ZddiZdef fdZd Zd Zee	 e
d      	 	 	 dd
ej                  dej                  d	z  deded	z  dee   deez  fd                     Zdej(                  dej                  dej                  fdZee
	 	 	 	 	 	 	 	 	 ddej(                  d	z  d
ej                  d	z  dej(                  d	z  dej,                  d	z  dej(                  d	z  ded	z  dej                  d	z  ded	z  dej(                  d	z  dee   deez  fd              Zd Z xZS )	AriaModel^language_model.modellanguage_modelrf   c                     t         |   |       t        j                  |j                        | _        t        |      | _        t        j                  |j                        | _	        | j                          y ra   )r.   r/   r$   from_configrj   vision_towerr   multi_modal_projectorr   r  r  r   s     r8   r/   zAriaModel.__init__  sY     %11&2F2FG%26%:"'33F4F4FGr9   c                 6    | j                   j                         S ra   )r  get_input_embeddingsrJ   s    r8   r  zAriaModel.get_input_embeddings  s    ""7799r9   c                 :    | j                   j                  |       y ra   )r  set_input_embeddingsr5   r~   s     r8   r  zAriaModel.set_input_embeddings  s    007r9   zWObtains image last hidden states from the vision tower and apply multimodal projection.r  Npixel_values
pixel_maskvision_feature_layeroutput_hidden_statesr  r,   c                     | j                  |      } | j                  |f|ddd|}d }|&|j                  d      }	t        j                  |	      }|j
                  |   }
| j                  |
|      |_        |S )NT)patch_attention_maskr  return_dictr%   ry   )_create_patch_attention_maskr  r   r1   logical_notr:   r  pooler_output)r5   r  r  r  r  r  r  image_outputsimage_attn_maskflattened_maskselected_image_features              r8   get_image_featureszAriaModel.get_image_features  s      $@@L)))
!5!%	

 
 +199!<N#//?O!.!<!<=Q!R&*&@&@AWcr&@&s#r9   r  r  image_featuresc                 N   |m| | j                         t        j                  | j                  j                  t        j
                  |j                              k(  }|j                  d      }n|| j                  j                  k(  }|j                         }|j                  d   |j                  d   z  }|j                  d      j                  |      j                  |j                        }t        ||   j                         |j                         k(  d| d|        |S )z
        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
        equal to the length of multimodal features. If the lengths are different, an error is raised.
        r   r<   r   r%   z6Image features and image tokens do not match, tokens: z, features: )r  r1   tensorrf   image_token_idr   r   allr   rI   r   	expand_asr?   r   numel)r5   r  r  r  special_image_maskn_image_tokensn_image_featuress          r8   get_placeholder_maskzAriaModel.get_placeholder_mask  s    !.2M$2K2K2MT[[77uzzR_RfRfg3 " "4!7!7!;!*dkk.H.H!H+//1)//2^5I5I!5LL/99"=GGVYYZgZnZno,-3359M9M9OOD^DTT`aq`rs	
 "!r9   r  rB  r+  rC  r,  c
           
         | | j                         |      }||j                  d   dk7  r| j                  ||| j                  j                  d      j
                  }|j                  |j                  |j                        }| j                  |||      }|j                  ||      } | j                  d||||||	d|
}t        |j                  |r|j                  nd |j                  |j                   |      S d       S )Nr%   T)r  r  r  r  )r  r  )r  rB  r+  r  rC  r,  )r  r+  r:   rN  r  rE  )r  rI   r  rf   r  r  r?   r   r>   r  masked_scatterr  r  r  r+  r:   rN  )r5   r  r  r  r  rB  r+  r  rC  r,  r  r  r  r  s                 r8   rF   zAriaModel.forward  sC     7D557	BM #(;(;A(>!(C!44)%%)[[%E%E 	 5 
 m  ,..}/C/C]EXEXYN!%!:!:~ "; " *889K^\M%$%% 
)%+')
 
 '%777@G33d!//))2>2J
 	

 QU
 	
r9   c                    |y |j                  d| j                  j                  j                  | j                  j                  j                        }|j                  d| j                  j                  j                  | j                  j                  j                        }|j	                  d      dkD  j                         S )Nr%   )	dimensionr   stepr#   )r<   r   r   )unfoldr  rf   
patch_sizer   rG  )r5   r  patches_subgrids      r8   r  z&AriaModel._create_patch_attention_mask   s    $++""))44""))44 , 

 *00""))44""))44 1 

  ###1A5;;==r9   )Nr<   N)	NNNNNNNNN)rL   rM   rN   _checkpoint_conversion_mappingr&   r/   r  r  r   r!   r   r1   r  r7  rG  r   r   rH   r   r  r8  r  rP   r
   r   r  rF   r  rQ   rR   s   @r8   r  r    s    	!"2&"z :8 n 04$&,0'' %%, "	
 #Tk +, 
+	+   
4"))":?:K:K"]b]n]n"0  .215.2.204(,26!%26.
##d*.
 ''$..
 $$t+	.

 t+.
 &&-.
 .
 ((4/.
 $;.
 ((4/.
 -..
 
(	(.
  .
`>r9   r  z
    Aria model for conditional generation tasks.

    This model combines a vision tower, a multi-modal projector, and a language model
    to perform tasks that involve both image and text inputs.
    c                   R    e Zd ZdddddZddiZdef fd	Zd
 Zd Zde	j                  fdZe	 	 ddej                  dej                  dz  dedee   deez  f
d       Zee	 	 	 	 	 	 	 	 	 	 	 d dej,                  dz  dej                  dz  dej,                  dz  dej.                  dz  dej,                  dz  dedz  dej                  dz  dej,                  dz  dedz  deej.                  z  dej,                  dz  dee   deez  fd              Z	 	 	 	 	 	 	 	 d! fd	Z xZS )"AriaForConditionalGenerationzmodel.language_modelzmodel.vision_towerzmodel.multi_modal_projectorr  )r  z^vision_towerz^multi_modal_projectorz^language_model.lm_headr  z(model.language_model.embed_tokens.weightrf   c                     t         |   |       t        |      | _        t	        j
                  |j                  j                  |j                  j                  d      | _	        | j                          y r   )r.   r/   r  rJ  r   rY   r   r6   r  r  r  r   s     r8   r/   z%AriaForConditionalGeneration.__init__"  sS     v&
yy!3!3!?!?ASASA^A^ejkr9   c                 6    | j                   j                         S ra   )rJ  r  rJ   s    r8   r  z1AriaForConditionalGeneration.get_input_embeddings(  s    zz..00r9   c                 :    | j                   j                  |       y ra   )rJ  r  r  s     r8   r  z1AriaForConditionalGeneration.set_input_embeddings+  s    

''.r9   r,   c                     | j                   S ra   )r  rJ   s    r8   get_output_embeddingsz2AriaForConditionalGeneration.get_output_embeddings.  s    ||r9   Nr  r  r  r  c                 B     | j                   j                  d|||d|S )N)r  r  r  rE  )rJ  r  )r5   r  r  r  r  s        r8   r  z/AriaForConditionalGeneration.get_image_features1  s5     -tzz,, 
%!!5
 	
 	
r9   r  r  rB  r+  r  r  rC  r  r,  c                     | j                   d||||||||	|d	|}|d   }t        |
t              rt        |
 d      n|
}| j	                  |dd|ddf         }d}|4 | j
                  d||| j                  j                  j                  d|}t        |||j                  |j                  |j                        S )a{  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or `model.image_token_id` (where `model` is your instance of `AriaForConditionalGeneration`).
            Tokens with indices set to `model.image_token_id` are ignored (masked), the loss is only
            computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> import httpx
        >>> from io import BytesIO
        >>> import torch
        >>> from PIL import Image
        >>> from io import BytesIO

        >>> from transformers import AutoProcessor, AutoModel
        >>> from transformers.image_utils import load_image

        >>> # Note that passing the image urls (instead of the actual pil images) to the processor is also possible
        >>> image1 = load_image("https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg")
        >>> image2 = load_image("https://cdn.britannica.com/59/94459-050-DBA42467/Skyline-Chicago.jpg")
        >>> image3 = load_image("https://cdn.britannica.com/68/170868-050-8DDE8263/Golden-Gate-Bridge-San-Francisco.jpg")

        >>> processor = AutoProcessor.from_pretrained("Rhymes-AI/Aria")
        >>> model = AutoModel.from_pretrained("Rhymes-AI/Aria", dtype=torch.bfloat16, device_map="auto")

        >>> # Create inputs
        >>> messages = [
        ...     {
        ...         "role": "user",
        ...         "content": [
        ...             {"type": "image"},
        ...             {"type": "text", "text": "In this image, we can see the city of New York, and more specifically the Statue of Liberty."},
        ...             {"type": "image"},
        ...             {"type": "text", "text": "What can we see in this image?"},
        ...         ]
        ...     },
        ...     {
        ...         "role": "user",
        ...         "content": [
        ...             {"type": "image"},
        ...             {"type": "text", "text": "In which city is that bridge located?"},
        ...         ]
        ...     }
        ... ]

        >>> prompts = [processor.apply_chat_template([message], add_generation_prompt=True) for message in messages]
        >>> images = [[image1, image2], [image3]]
        >>> inputs = processor(text=prompts, images=images, padding=True, return_tensors="pt").to(model.device)

        >>> # Generate
        >>> generated_ids = model.generate(**inputs, max_new_tokens=256)
        >>> generated_texts = processor.batch_decode(generated_ids, skip_special_tokens=True)

        >>> print(generated_texts[0])
        Assistant: There are buildings, trees, lights, and water visible in this image.

        >>> print(generated_texts[1])
        Assistant: The bridge is in San Francisco.
        ```)	r  r  r  r  rB  r+  r  rC  r,  r   Nr  r  rE  )rJ  rR  r7  r  r  r  rf   r   r  r  r+  r:   rN  )r5   r  r  r  r  rB  r+  r  r  rC  r  r,  r  r  r:   r  r  r  s                     r8   rF   z$AriaForConditionalGeneration.forward@  s    \ $** 
%!)%+')
 
  
8B>SV8W~ot4]kmA}a,?@A%4%% f9P9P9[9[_eD *#33!//))
 	
r9   c
           
      r    t        |   |f||||||	d|
}|	s|
j                  dd      s
||d<   ||d<   |S )N)r+  r  r  r,  r  is_first_iterationrC  Tr  r  )r.   prepare_inputs_for_generationget)r5   r  r+  r  r  r  r  r,  r  r  r  model_inputsr7   s               r8   r  z:AriaForConditionalGeneration.prepare_inputs_for_generation  sf     w<	
+')))1	
 	
 VZZT%B
 ,8L()3L&r9   r   )NNNNNNNNNr   N)NNNNNNNF)rL   rM   rN   r  r  r&   r/   r  r  r   Moduler  r   r1   r  r7  r   r   rH   r   r  r   r8  rP   r
   rG  r  rF   r  rQ   rR   s   @r8   r  r    s    #9.#@$-	&" +,VWz 1/ryy   04$&	
''
 %%,
 "	

 +,
 
+	+
 
  .215.2.204(,26*.!%-.26j
##d*j
 ''$.j
 $$t+	j

 t+j
 &&-j
 j
 ((4/j
   4'j
 $;j
 ell*j
 ((4/j
 +,j
 
+	+j
  j
^     r9   r  )r  rc  rI  r  r  r  )r%   )r.  )Ucollections.abcr   dataclassesr   typingr   r1   r    r   rS  activationsr	   cache_utilsr
   r   
generationr   integrationsr   r   r   masking_utilsr   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   utils.genericr    r!   utils.output_capturingr"   autor$   configuration_ariar&   r'   r  r*   rT   re   r   r   r   r   r   r   r   r	  rP   r7  r  rO   r  r!  r:  rI  rc  rj  r  r  r  r  r  r  __all__rE  r9   r8   <module>r     s1  ( % !    & ! . ) f f / B 9  L F & a a G 5  : Y'Jbii J (J(ryy 24 4n>BII >B299 4>)
RYY )
X)")) )X4ryy 4"( *+ ,2	UU\\ 	U# 	U%,, 	U& %II%<<% 
% <<	%
 LL4'% % % '(%2 )*C)		 C) +C)L55 5p Uo U U. P/ P P.><bii ><B M
+ M
 M
` G
1? G
 G
T 
9 9 90 
95 9 9 
I># I>
I>X u#6 uupr9   