
    qiV                         d dl mZmZ d dlmZ d dlmZ  ej                  e      Z	 G d de      Z
 G d de      Z G d d	e      Zg d
Zy)   )PreTrainedConfiglayer_type_validation)RopeParameters)loggingc            *       6    e Zd ZdZddddddddZdZdZ	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 dd	edz  d
edz  dedz  dedz  dedz  dedz  dedz  dedz  dedz  de	dz  dedz  de	dz  de	dz  dedz  dedz  de
dz  de	dz  de	dz  deeeef   z  dz  f& fdZ xZS )Llama4VisionConfigaB  
    This is the configuration class to store the configuration of a [`Llama4VisionModel`]. It is used to instantiate a
    Llama4 vision model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the Llama4 109B.

    e.g. [meta-llama/Llama-4-Scout-17B-16E](https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E)

    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PreTrainedConfig`] for more information.

    Args:
        hidden_size (`int`, *optional*, defaults to 768):
            Dimensionality of the encoder layers and the pooler layer.
        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
        num_hidden_layers (`int`, *optional*, defaults to 34):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 16):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_channels (`int`, *optional*, defaults to 3):
            Number of channels in the input image.
        intermediate_size (`int`, *optional*, defaults to 5632):
            Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
        vision_output_dim (`int`, *optional*, defaults to 7680):
            Dimensionality of the vision model output. Includes output of transformer
            encoder with intermediate layers and global transformer encoder.
        image_size (`int`, *optional*, defaults to 448):
            The size (resolution) of each image *tile*.
        patch_size (`int`, *optional*, defaults to 14):
            The size (resolution) of each patch.
        norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the layer normalization layers.
        vision_feature_select_strategy (`str`, *optional*, defaults to `"default"`):
            Controls which vision tokens are kept from the backbone. `"default"` drops the CLS token and `"full"` keeps all tokens.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        pixel_shuffle_ratio (`float`, *optional*, defaults to 0.5):
            Pixel-shuffle ratio for downsampling patch tokens. Smaller values produce fewer tokens (more downsampling).
        projector_input_dim (`int`, *optional*, defaults to 4096):
            Width of the vision adapter MLP before pixel shuffle. Larger value increases capacity and compute.
        projector_output_dim (`int`, *optional*, defaults to 4096):
            Output width of the vision adapter. Larger value yields higher-dimensional image features.
        multi_modal_projector_bias (`bool`, *optional*, defaults to `False`):
            Whether to use bias in the multi-modal projector layers.
        projector_dropout (`float`, *optional*, defaults to 0.0):
            Dropout rate inside the vision adapter MLP. Higher value adds more regularization.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            Dropout rate on vision attention probabilities. Higher value adds more regularization.
        rope_parameters (`RopeParameters`, *optional*):
            RoPE Parameters
    colwiserowwisecolwise_gather_output)zmodel.layers.*.self_attn.q_projzmodel.layers.*.self_attn.k_projzmodel.layers.*.self_attn.v_projzmodel.layers.*.self_attn.o_projzvision_adapter.mlp.fc1zvision_adapter.mlp.fc2zpatch_embedding.linearllama4_vision_modelvision_configNhidden_size
hidden_actnum_hidden_layersnum_attention_headsnum_channelsintermediate_sizevision_output_dim
image_size
patch_sizenorm_epsvision_feature_select_strategyinitializer_rangepixel_shuffle_ratioprojector_input_dimprojector_output_dimmulti_modal_projector_biasprojector_dropoutattention_dropoutrope_parametersc                 .   || _         || _        || _        || _        || _        || _        || _        |	| _        |
| _        || _	        || _
        || _        || _        || _        || _        || _        || _        || _        || _        t'        | P  di | y )N )r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r    super__init__)selfr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r    kwargs	__class__s                        a/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/llama4/configuration_llama4.pyr$   zLlama4VisionConfig.__init__[   s    . '$!2(!2$!2$ #6 !2#6 #6 $8!*D'!2!2.L+."6"    )i   gelu"      r   i   i   i     h㈵>default{Gz?g      ?   r1   F        r2   N)__name__
__module____qualname____doc__base_model_tp_plan
model_typebase_config_keyintstrfloatboolr   dictr$   __classcell__r'   s   @r(   r   r      s   3l ,5+4+4+4"+"+"9 'J%O #&!'(**,#$(,(,!$!#!%5>*.,/*.+/27*-*-MQ),#4Z,# $J,# :	,#
 !4Z,# Dj,# :,# :,# $J,# $J,# $,,# ),d
,# !4<,# #T\,# !4Z,#  "Dj!,#" %)4K#,#$ !4<%,#& !4<',#( ($sN/B*CCdJ),# ,#r)   r   c                        e Zd ZdZdZdgZdZdddddddddddddZddddd	d	dddd
d
Z	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 dde	e
ee	f   z  dz  f fdZ xZS )Llama4TextConfiga)  
    This is the configuration class to store the configuration of a [`Llama4TextModel`]. It is used to instantiate a
    Llama4 text model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the Llama4 109B.

    e.g. [meta-llama/Llama-4-Scout-17B-16E](https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E)

    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PreTrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 202048):
            Vocabulary size of the Llama4 text model. Defines the maximum number of different tokens that can be represented
            by the `inputs_ids` passed when calling [`Llama4TextModel`].
        hidden_size (`int`, *optional*, defaults to 5120):
            Dimensionality of the embeddings and hidden states.
        intermediate_size (`int`, *optional*, defaults to 8192):
            Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
        intermediate_size_mlp (`int`, *optional*, defaults to 16384):
            Intermediate size of dense MLP layers. Larger value increases FFN capacity and compute.
        num_hidden_layers (`int`, *optional*, defaults to 48):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 40):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_key_value_heads (`int`, *optional*, defaults to 8):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If not
            specified, will default to `num_attention_heads`.
        head_dim (`int`, *optional*, defaults to 128):
            Per-head attention dimension. Larger value increases head width and compute.
        hidden_act (`str` or `Callable`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the encoder and pooler.
        max_position_embeddings (`int`, *optional*, defaults to 131072):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions.
        pad_token_id (`int`, *optional*, defaults to 128004):
            The id of the padding token.
        bos_token_id (`int`, *optional*, defaults to 1):
            The id of the beginning of sentence token.
        eos_token_id (`int`, *optional*, defaults to 2):
            The id of the end of sentence token.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether to tie weight embeddings
        attention_dropout (`float`, *optional*, defaults to 0.0):
            Dropout rate on vision attention probabilities. Higher value adds more regularization.
        num_experts_per_tok (`int`, *optional*, defaults to 1):
            Top-k experts routed per token. Higher value uses more experts per token and more compute.
        num_local_experts (`int`, *optional*, defaults to 16):
            Number of experts in each MoE layer. Higher value increases capacity and routing choices.
        moe_layers (`list[int]`, *optional*):
            List of layer indices that use MoE. Overrides `interleave_moe_layer_step` when set.
        interleave_moe_layer_step (`int`, *optional*, defaults to 1):
            Spacing between MoE layers when `moe_layers` is `None`. Larger value means fewer MoE layers.
        use_qk_norm (`bool`, *optional*, defaults to `True`):
            Whether to L2-normalize queries/keys on RoPE layers. Can stabilize attention when enabled.
        output_router_logits (`bool`, *optional*, defaults to `False`):
            Whether to return router logits (and auxiliary loss) in outputs.
        router_aux_loss_coef (`float`, *optional*, defaults to 0.001):
            Weight for the router auxiliary loss. Higher value makes routing loss contribute more to total loss.
        router_jitter_noise (`float`, *optional*, defaults to 0.0):
            Amount of noise added to router logits during training. Higher value increases exploration.
        rope_parameters (`RopeParameters`, *optional*):
            Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain
            a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE
            with longer `max_position_embeddings`.
        no_rope_layers (`list[int]`, *optional*):
            List with at least the same length as the number of layers in the model.
            A `1` at an index position indicates that the corresponding layer will use RoPE,
            while a `0` indicates that it's a NoPE layer.
        no_rope_layer_interval (`int`, *optional*, defaults to 4):
            If `no_rope_layers` is `None`, it will be created using a NoPE layer every
            `no_rope_layer_interval` layers.
        attention_chunk_size (`int`, *optional*, defaults to 8192):
            Chunk size for the attention computation. Smaller value enforces more local attention and lowers memory.
        layer_types (`list`, *optional*):
            Attention pattern for each layer.
        attn_temperature_tuning (`bool`, *optional*, defaults to `True`):
            Whether to dynamically scale the attention temperature for each query token based on sequence length.
            Recommended for long sequences (e.g., >32k tokens) to maintain stable output results.
        floor_scale (`int`, *optional*, defaults to 8192):
            Base scale (in tokens) for attention temperature tuning. Larger value delays scaling to longer positions.
        attn_scale (`float`, *optional*, defaults to 0.1):
            Strength of attention temperature tuning. Larger value increases scaling at long positions.

    Example:
    llama4_textpast_key_valuesg    Ar	   r
   packed_rowwise)layers.*.self_attn.q_projlayers.*.self_attn.k_projlayers.*.self_attn.v_projlayers.*.self_attn.o_projz-layers.*.feed_forward.shared_expert.gate_projz+layers.*.feed_forward.shared_expert.up_projz-layers.*.feed_forward.shared_expert.down_proj*layers.*.feed_forward.experts.gate_up_proj'layers.*.feed_forward.experts.down_projlayers.*.feed_forward.gate_projlayers.*.feed_forward.up_projlayers.*.feed_forward.down_projgrouped_gemm	ep_router)
rF   rG   rH   rI   rJ   rK   rL   rM   rN   zlayers.*.feed_forward.routerNr    c#                    || _         || _        || _        || _        | | _        |"| _        |!| _        || _        |
| _        || _	        || _
        || _        || _        || _        d| _        ||}|| _        |	| _        || _        || _        || _        || _        ||n| j                  | j                  z  | _        || _        || _        || _        || _        || _        || _        |g k(  rd }t9        | j                        D $cg c]  }$t;        |$dz   |z  dk7         }%}$|r|n|%| _        || _        ||ntA        t9        |dz
  ||            | _!        || _"        || _#        |#| j<                  D &cg c]  }&|&rdnd
 c}&| _#        tI        | jF                  | j                         || _%        tM        '|   di |# y c c}$w c c}&w )NF       chunked_attentionfull_attentionr"   )(tie_word_embeddingspad_token_idbos_token_ideos_token_idattn_temperature_tuning
attn_scalefloor_scale
vocab_sizemax_position_embeddingsr   r   intermediate_size_mlpr   r   attention_biasnum_key_value_headsr   r   rms_norm_eps	use_cacher   head_dimuse_qk_normnum_experts_per_toknum_local_expertsoutput_router_logitsrouter_aux_loss_coefrouter_jitter_noiseranger:   no_rope_layersinterleave_moe_layer_steplist
moe_layersattention_chunk_sizelayer_typesr   r    r#   r$   )(r%   r]   r   r   r_   r   r   ra   rd   r   r^   r   rb   rc   rW   rX   rY   rV   r   rf   rg   ro   rm   re   rh   ri   rj   r    rl   no_rope_layer_intervalrp   rq   rZ   r\   r[   r&   	layer_idxdefault_no_rope_layersno_roper'   s(                                          r(   r$   zLlama4TextConfig.__init__  s   L $7 ((('>$$&$'>$&!2%:"!2#6 #&"5#6 $!2("!2$,$8d>N>NRVRjRj>j&#6 !2$8!$8!#6  R!N QVVZVlVlPm"
CLCQ"88A=>"
 "
 1?nDZ)B& % -1%- 	 %9!&TXTgTg IPw#4DD D 	d..0F0FG."6"9"
, s   F>2G)"i@ i       i @  0   (         silui   r0   r.   TNrR      Fr2   rR   r,   NrR   TFgMbP?r2   NN   rv   NTrv   g?)r3   r4   r5   r6   r8   keys_to_ignore_at_inferencedefault_thetar7   base_model_ep_planr   r>   r;   r$   r?   r@   s   @r(   rB   rB      s   Yv J#4"5M%.%.%.%.9B7@9B6F3<+4)2+4 &/%.%.%.6D3A+4)2+4(3 # )!"#""MQ ! $Gh#8 ($sN/B*CCdJ9h# h#r)   rB   c                   R     e Zd ZdZdZddddZeedZdd	iZ		 	 	 	 	 	 d fd
	Z
 xZS )Llama4Configa  
    This is the configuration class to store the configuration of a [`Llama4Model`]. It is used to instantiate an
    Llama4 model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the Llama4 109B.

    e.g. [meta-llama/Llama-4-Scout-17B-16E](https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E)

    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PreTrainedConfig`] for more information.


    Args:
        vision_config (`Llama4VisionConfig`, *optional*):
            The Llama4 Vision config.
        text_config (`Llama4TextConfig`, *optional*):
            The Llama4 Text config.
        boi_token_index (`int`, *optional*, defaults to 200080):
            The begin-of-image token index to wrap the image prompt.
        eoi_token_index (`int`, *optional*, defaults to 200081):
            The end-of-image token index to wrap the image prompt.
        image_token_index (`int`, *optional*, defaults to 200092):
            The image token index to encode the image prompt.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether the model's input and output word embeddings should be tied.

    ```python
    >>> from transformers import Llama4Model, Llama4Config

    >>> # Initializing a Llama4 7B style configuration
    >>> configuration = Llama4Config()

    >>> # Initializing a model from the Llama4 7B style configuration
    >>> model = Llama4Model(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```llama4image_token_indexboi_token_indexeoi_token_index)image_token_idboi_token_ideoi_token_id)text_configr   zmulti_modal_projector.linear_1colwise_repc                    |%t               | _        t        j                  d       n8t	        |t
              rt        di || _        nt	        |t               r|| _        || _        || _        || _        |%t               | _
        t        j                  d       n8t	        |t
              rt        di || _
        nt	        |t              r|| _
        || _        t        | 4  di | y )Nz9vision_config is None, using default llama4 vision configz5text_config is None, using default llama4 text configr"   )r   r   loggerinfo
isinstancer>   r   r   r   rB   r   rV   r#   r$   )	r%   r   r   r   r   r   rV   r&   r'   s	           r(   r$   zLlama4Config.__init__  s      !3!5DKKSTt,!3!Dm!DD'9:!.D..!2/1DKKOPT*/>+>D%56*D#6 "6"r)   )NNi i i F)r3   r4   r5   r6   r8   attribute_maprB   r   sub_configsr7   r$   r?   r@   s   @r(   r   r   o  sX    $L J-))M
 #3EWXK(-  !# #r)   r   )r   rB   r   N)configuration_utilsr   r   modeling_rope_utilsr   utilsr   
get_loggerr3   r   r   rB   r   __all__r"   r)   r(   <module>r      s]   " K 1  
		H	%n#) n#bb#' b#JP## P#f Er)   