
    qis                        d Z ddlZddlmc mZ ddlmZ ddlmZ ddl	m
Z
 ddlmZmZ ddlmZmZ dd	lmZmZ dd
lmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlmZm Z  ddl!m"Z" ddl#m$Z$m%Z% ddl&m'Z' ddl(m)Z)m*Z*m+Z+m,Z,m-Z-m.Z.m/Z/m0Z0m1Z1m2Z2m3Z3  e jh                  e5      Z6 G d de      Z7 G d de1      Z8 G d de      Z9 G d dejt                        Z; G d de'      Z< G d d e)      Z= G d! d"e3      Z> G d# d$e2      Z? G d% d&e*e      Z@ G d' d(e0      ZA G d) d*e/      ZB G d+ d,e+      ZC G d- d.e-      ZD G d/ d0e.      ZE G d1 d2e,      ZFg d3ZGy)4zPyTorch MiniMax model.    N)nn   )initialization)ACT2FN)CacheDynamicCache)PreTrainedConfiglayer_type_validation)create_causal_mask!create_sliding_window_causal_mask)FlashAttentionKwargs)GradientCheckpointingLayer)MoeModelOutputWithPast)RopeParameters)Unpack)TransformersKwargslogging)merge_with_config_defaults)OutputRecordercapture_outputs   )Gemma2RotaryEmbedding)MixtralAttentionMixtralDecoderLayerMixtralForCausalLMMixtralForQuestionAnswering MixtralForSequenceClassificationMixtralForTokenClassificationMixtralModelMixtralPreTrainedModelMixtralRMSNormMixtralSparseMoeBlockMixtralTopKRouterc            B           e Zd ZdZdZdgZdZdddddddd	Zd
gdgfddgdgfdgdgfdZddiZ		 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d2de
dz  de
dz  de
dz  de
dz  de
dz  de
dz  de
dz  dedz  de
dz  dedz  de
dz  dedz  de
dz  de
dz  d e
dz  d!edz  d"e
dz  d#edz  d$e
dz  de
dz  d%edz  d&edz  d'edz  d(eeeef   z  dz  d)ee   dz  d*e
dz  d+e
dz  d,e
dz  d-e
dz  d.e
dz  d/e
dz  d0e
dz  f@ fd1Z xZS )3MiniMaxConfiga  
    This is the configuration class to store the configuration of a [`MiniMaxModel`]. It is used to instantiate an
    MiniMax model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the MiniMax.

    [MiniMaxAI/MiniMax-Text-01-hf](https://huggingface.co/MiniMaxAI/MiniMax-Text-01-hf)

    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PreTrainedConfig`] for more information.


    Args:
        vocab_size (`int`, *optional*, defaults to 32000):
            Vocabulary size of the MiniMax model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`MiniMaxModel`]
        hidden_size (`int`, *optional*, defaults to 4096):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 14336):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_key_value_heads (`int`, *optional*, defaults to 8):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details, check out [this
            paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to `8`.
        head_dim (`int`, *optional*, defaults to `hidden_size // num_attention_heads`):
            The attention head dimension.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to `4096*32`):
            The maximum sequence length that this model might ever be used with. MiniMax's sliding window attention
            allows sequence of up to 4096*32 tokens.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        pad_token_id (`int`, *optional*):
            The id of the padding token.
        bos_token_id (`int`, *optional*, defaults to 1):
            The id of the "beginning-of-sequence" token.
        eos_token_id (`int`, *optional*, defaults to 2):
            The id of the "end-of-sequence" token.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether the model's input and output word embeddings should be tied.
        sliding_window (`int`, *optional*):
            Sliding window attention window size. If not specified, will default to `4096`.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        num_experts_per_tok (`int`, *optional*, defaults to 2):
            The number of experts to route per-token, can be also interpreted as the `top-k` routing
            parameter
        num_local_experts (`int`, *optional*, defaults to 8):
            Number of experts per Sparse MLP layer.
        output_router_logits (`bool`, *optional*, defaults to `False`):
            Whether or not the router logits should be returned by the model. Enabling this will also
            allow the model to output the auxiliary loss. See [here]() for more details
        router_aux_loss_coef (`float`, *optional*, defaults to 0.001):
            The aux loss factor for the total loss.
        router_jitter_noise (`float`, *optional*, defaults to 0.0):
            Amount of noise to add to the router.
        rope_parameters (`RopeParameters`, *optional*):
            Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain
            a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE
            with longer `max_position_embeddings`.
        layer_types (`list`, *optional*):
            Attention pattern for each layer.
        block_size (`int`, *optional*, defaults to 256):
            The length of each attention block, determining how queries, keys, and values
            are grouped and processed for intra- and inter-block attention.
        full_attn_alpha_factor (`float`, *optional*, defaults to 1):
            Weight for residual value in residual connection after normal attention.
        full_attn_beta_factor (`float`, *optional*, defaults to 1):
            Weight for hidden state value in residual connection after normal attention.
        linear_attn_alpha_factor (`float`, *optional*, defaults to 1):
            Weight for residual value in residual connection after lightning attention.
        linear_attn_beta_factor (`float`, *optional*, defaults to 1):
            Weight for hidden state value in residual connection after lightning attention.
        mlp_alpha_factor (`float`, *optional*, defaults to 1):
            Weight for residual value in residual connection after MLP.
        mlp_beta_factor (`float`, *optional*, defaults to 1):
            Weight for hidden state value in residual connection after MLP.

    ```python
    >>> from transformers import MiniMaxModel, MiniMaxConfig

    >>> # Initializing a MiniMax style configuration
    >>> configuration = MiniMaxConfig()

    >>> # Initializing a model from the MiniMax style configuration
    >>> model = MiniMaxModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```minimaxpast_key_valuesg    .Acolwiserowwisepacked_colwisemoe_tp_experts)zlayers.*.self_attn.q_projzlayers.*.self_attn.k_projzlayers.*.self_attn.v_projzlayers.*.self_attn.o_projz!layers.*.mlp.experts.gate_up_projzlayers.*.mlp.experts.down_projzlayers.*.mlp.experts	input_idsinputs_embedshidden_statesattention_mask)embed_tokenslayersnormnum_expertsnum_local_expertsN
vocab_sizehidden_sizeintermediate_sizenum_hidden_layersnum_attention_headsnum_key_value_headshead_dim
hidden_actmax_position_embeddingsinitializer_rangerms_norm_eps	use_cachepad_token_idbos_token_ideos_token_idtie_word_embeddingssliding_windowattention_dropoutnum_experts_per_tokoutput_router_logitsrouter_aux_loss_coefrouter_jitter_noiserope_parameterslayer_types
block_sizefull_attn_alpha_factorfull_attn_beta_factorlinear_attn_alpha_factorlinear_attn_beta_factormlp_alpha_factormlp_beta_factorc!                    || _         |	| _        || _        || _        || _        || _        || _        ||}|| _        || _        |
| _	        || _
        || _        || _        || _        || _        || _        || _        || _        || _        || _        || _        || _        || _        || _        || _        || _        || _        || _        || _        || _        | | _        | j.                  ;t?        | j                        D "cg c]  }"tA        |"dz   dz        rdnd c}"| _        tC        | j.                  | j                         || _"        tG        #|   di |! y c c}"w )N   r   full_attentionlinear_attention )%r5   r=   r6   r7   r8   r9   rE   r:   r<   r>   r?   r@   rF   r;   rG   r4   rH   rI   rJ   rD   rA   rB   rC   rL   rM   rN   rO   rP   rQ   rR   rS   rangeboolr
   rK   super__init__)$selfr5   r6   r7   r8   r9   r:   r;   r<   r=   r>   r?   r@   rA   rB   rC   rD   rE   rF   rG   r4   rH   rI   rJ   rK   rL   rM   rN   rO   rP   rQ   rR   rS   kwargsi	__class__s$                                      ]/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/minimax/modular_minimax.pyr\   zMiniMaxConfig.__init__   s   H %'>$&!2!2#6 , &"5#6 $!2("!2 #6 !2$8!$8!#6 #6 (((&$&<#%:"(@%'>$ 0.#W\]a]s]sWt RSD!a%1$5 ;MM D 	d..0F0FG."6" s   E) i }  i   i 8      rb      Nsilui   g{Gz?h㈵>TNrU   r   FN        r   rc   FgMbP?rf   NN   rU   rU   rU   rU   rU   rU   )__name__
__module____qualname____doc__
model_typekeys_to_ignore_at_inferencedefault_thetabase_model_tp_planbase_model_pp_planattribute_mapintstrfloatrZ   r   dictlistr\   __classcell__r`   s   @ra   r%   r%   5   s   eN J#4"5M%.%.%.%.-=*3 0 &(9:#%568IJ!"_$56 	*M "'"&(-(**,*+#!'.7*.#'!%#'#$#$+0%)*-*+(),1-2,/MQ(,!$-.,-/0./'(&'CR#$JR# 4ZR# :	R#
 :R# !4ZR# !4ZR# *R# $JR# "%tR# !4<R# DjR# $;R# DjR# DjR#  Dj!R#" "D[#R#$ d
%R#& !4<'R#( !4Z)R#* :+R#, #Tk-R#. $dl/R#0 #T\1R#2 ($sN/B*CCdJ3R#4 #Y%5R#6 $J7R#8 !$d
9R#:  #Tz;R#< #&*=R#> "%t?R#@ *AR#B tCR# R#    r%   c                       e Zd Zy)MiniMaxRMSNormNrh   ri   rj   rX   ry   ra   r{   r{         ry   r{   c                   r     e Zd Z fdZd ZdefdZ fdZdefdZde	j                  fd	Zd
efdZ xZS )MiniMaxCachec                 0    t         |           g | _        y N)r[   r\   linear_cacher]   r`   s    ra   r\   zMiniMaxCache.__init__  s    02ry   c                     t        t        | j                        |dz         D ]  }| j                  j                  g         || j                  |<   y )NrU   )rY   lenr   append)r]   	layer_idxr   _s       ra   set_linear_cachezMiniMaxCache.set_linear_cache  sK    s4,,-y1}= 	)A$$R(	)'3)$ry   r   c                 >    |t        |       k  r| j                  |   S y r   )r   r   )r]   r   s     ra   get_linear_cachezMiniMaxCache.get_linear_cache  s"    s4y $$Y//ry   c                 Z    t        t        | 	         t        | j                              S r   )maxr[   __len__r   r   r   s    ra   r   zMiniMaxCache.__len__  s"    57?$c$*;*;&<==ry   repeatsc                     t        t        |             D ]`  }| j                  |   g k7  r.| j                  |   j                  |d      | j                  |<   C| j                  |   j                  |       b y )Nr   dim)rY   r   r   repeat_interleaver1   batch_repeat_interleave)r]   r   r   s      ra   r   z$MiniMaxCache.batch_repeat_interleave  ss    s4y) 	HI  +r1/3/@/@/K/]/]^ekl/]/m!!),I&>>wG		Hry   indicesc                     t        t        |             D ]T  }| j                  |   g k7  r"| j                  |   |df   | j                  |<   7| j                  |   j	                  |       V y )N.)rY   r   r   r1   batch_select_indices)r]   r   r   s      ra   r   z!MiniMaxCache.batch_select_indices%  sk    s4y) 	EI  +r1/3/@/@/KGUXL/Y!!),I&;;GD		Ery   
max_lengthc                     t        d      )Nz*MiniMaxCache doesnot support `crop` method)RuntimeError)r]   r   s     ra   cropzMiniMaxCache.crop,  s    GHHry   )rh   ri   rj   r\   r   rr   r   r   r   torchTensorr   r   rw   rx   s   @ra   r   r     sL    34# 
>Hs HEELL EIs Iry   r   c                   >    e Zd Zdedef fdZd Zd Z	 	 ddej                  de
ej                  ej                  f   d	ej                  dz  d
edz  dej                  dz  dee   de
ej                  ej                  dz  e
ej                     dz  f   fdZ xZS )MiniMaxLightningAttentionconfigr   c                    t         |           || _        t        |dd       xs |j                  |j
                  z  | _        |j
                  | _        |j                  | _        |j                  | _        t        |j                     | _        t        | j                  | j
                  z        | _        t        j                  |j                  | j
                  | j                  z  dz  d      | _        t        j                  | j
                  | j                  z  |j                  d      | _        t        j                  |j                  | j
                  | j                  z  d      | _        | j'                         }| j)                  |      \  }}}| j+                  d|       | j+                  d|       | j+                  d|       | j+                  d|       y )	Nr;   r   F)bias
slope_ratequery_decay	key_decaydiagonal_decay)r[   r\   r   getattrr6   r9   r;   r8   rM   r   r<   act_fnr{   r2   r   Linearqkv_projout_projoutput_gateget_slope_ratedecay_factorsregister_buffer)r]   r   r   r   r   r   r   r`   s          ra   r\   z"MiniMaxLightningAttention.__init__1  s   "
D9mV=O=OSYSmSm=m#)#=#= !'!9!9 ++V../"4==43K3K#KL			&"4"4d6N6NQUQ^Q^6^ab6bino		$":":T]]"JFL^L^ejk99V%7%79Q9QTXTaTa9ahmn((*
151C1CJ1O.Y\:6]K8[)4-~>ry   c                     ddd| j                   z  z  z  }t        j                  | j                         dz   }d| j                  | j                  dz
  dz   z  z
  dz   }||z  }||z  }|d d d d f   }|S )NrU   r   rc   re   )r9   r   aranger   r8   )r]   baseexponentfactorrates        ra   r   z(MiniMaxLightningAttention.get_slope_rateG  s    A!d66678<< 8 89A=T^^t'='='AD'HIIDPX~f}AtTM"ry   c                    t        j                  | j                        dz   }t        j                  | |d d d f   z        }t        j                  | | j                  |d d d f   z
  z        }|d d d f   |d d d f   z
  }|d d d d d d f   }||z  }t        j                  |dk\  | t        d            }t        j                  |      }|||fS )NrU   r   z-inf)r   r   rM   expwherert   )r]   r   block_size_ranger   r   r   s         ra   r   z'MiniMaxLightningAttention.decay_factorsR  s     <<81<ii.>q$w.G GHIIzkT__?OPQSWPW?X-XYZ	)!T'25EdAg5NN'dAq(89#n4^q%8>/5QW=Y>2I~55ry   Nr.   position_embeddingsr/   r'   cache_positionr^   returnc                    |j                   \  }}}	|| j                  z   dz
  | j                  z  }
| j                  | j                  |            }|j	                  ||| j
                  d| j                  z        }t        j                  || j                  d      \  }}}|j                  dd      }|j                  dd      }|j                  dd      }d }||j                  | j                        }|t        j                  || j
                  | j                  | j                        j                  |      }|Q|j                  t        j                        }|j                  |j!                  d      j!                  d       d      }g }t#        |
      D ]b  }|| j                  z  }t%        || j                  z   |      }||z
  }|d d d d ||f   }|d d d d ||f   }|d d d d ||f   }| j&                  d d d |f   }| j(                  d d | d f   }| j*                  d d d d d |d |f   }t        j,                  | j.                   |z        }t        j0                  ||j                  dd            }t        j0                  ||z  |      }t        j0                  ||z  |      }||z   }|j3                  |       t        j0                  ||z  j                  dd      |      } ||z  | z   }e nt        j,                  | j.                         }!g }t#        |      D ]  }|d d d d ||dz   f   }|d d d d ||dz   f   }|d d d d ||dz   f   }t        j0                  |j                  dd      |      }"|!|z  |"z   }t        j0                  ||      }|j3                  |        t        j4                  |d      }|j                  dd      }|j	                  ||| j
                  | j                  z        }| j7                  |      }t9        j:                  | j=                  |            |z  }| j?                  |      }||jA                  | j                  |       ||fS )	NrU   r   r   r   )dtyper   )!shaperM   r   r   reshaper9   r;   r   split	transposer   r   zerostorZ   masked_fill	unsqueezerY   minr   r   r   r   r   matmulr   catr2   Fsigmoidr   r   r   )#r]   r.   r   r/   r'   r   r^   
batch_sizeseq_lenr6   
num_blocks
qkv_statesquery_states
key_statesvalue_statesattn_weights_interattn_outputr_   	start_idxend_idxcurrent_block_sizecurrent_query_statescurrent_key_statescurrent_value_statescurrent_query_decaycurrent_key_decaycurrent_diagonal_decayblock_decayattn_weights_intraattn_output_intraattn_output_intercurrent_attn_outputnext_attn_weights_interratiocurrent_attn_weights_inters#                                      ra   forwardz!MiniMaxLightningAttention.forward`  s    ,9+>+>(
G[/!3G
[[}!=>
''
GT=U=UWX[_[h[hWhi
16Z\]1^.j,#--a3))!Q/
#--a3 "&!0!A!A$..!Q%!&Z9Q9QSWS`S`bfbobo!p!s!s"
 )!/!2!2!2!D+779Q9QRS9T9^9^_a9b8bdefK:& `/	i$//97C%,y%8"'3Aq)G:K4K'L$%/1i6G0G%H"'3Aq)G:K4K'L$&*&6&6q:M;M:M7M&N#$(NN17I6I6J3J$K!)-)<)<QCVDVCVXkYkXk=k)l&#ii(8;M(MN &+\\2FHZHdHdegikHl%m"$)LL1CF\1\^r$s! %*LL1EH[1[]o$p! '8:K&K#""#67 +0,,'*;;FFr2NPd+' &8+%EH_%_";`@ IIt./EK7^ 	8'3Aq!a!e)O'D$%/1a!a%i%@"'3Aq!a!e)O'D$-2\\:L:V:VWY[]:^`t-u*%*-?%?B\%\"&+ll3GI[&\#""#67	8 ii4 "++Aq1!))*gt?W?WZ^ZgZg?ghii,ii 0 0 ?@;NmmK0 &,,T^^=OP...ry   )NN)rh   ri   rj   r%   rr   r\   r   r   r   r   tupler   
LongTensorr   r   r   rw   rx   s   @ra   r   r   0  s    ?} ? ?,	6& )-26`/||`/ #5<<#=>`/ t+	`/
 `/ ((4/`/ -.`/ 
u||U\\D0%2E2LL	M`/ry   r   c                       e Zd Zy)MiniMaxRotaryEmbeddingNr|   rX   ry   ra   r   r     r}   ry   r   c                       e Zd Zy)MiniMaxAttentionNr|   rX   ry   ra   r   r     r}   ry   r   c                       e Zd Zy)MiniMaxTopKRouterNr|   rX   ry   ra   r   r     r}   ry   r   c                       e Zd Zy)MiniMaxSparseMoeBlockNr|   rX   ry   ra   r   r     r}   ry   r   c                   d    e Zd Zdedef fdZ	 	 	 	 	 	 ddej                  deej                  ej                  f   dz  dej                  dz  dej                  dz  d	e
dz  d
edz  dej                  dz  dee   deej                  eej                  ej                  f   dz  f   fdZ xZS )MiniMaxDecoderLayerr   r   c                    t         |   ||       || _        t        |d      r|j                  |   nd | _        |j                  | _        |j                  | _        | `t        |      | _        | j
                  dk(  r4t        ||      | _        |j                  | _        |j                  | _        y t!        ||      | _        |j"                  | _        |j$                  | _        y )NrL   rW   )r[   r\   r   hasattrrL   
layer_typerR   rS   mlpr   r   	self_attnrP   attn_alpha_factorrQ   attn_beta_factorr   rN   rO   )r]   r   r   r`   s      ra   r\   zMiniMaxDecoderLayer.__init__  s    +";B6=;Y&,,Y7_c & 7 7%55H(0??006vyIDN%+%D%DD"$*$B$BD!-fi@DN%+%B%BD"$*$@$@D!ry   Nr.   r   r/   position_idsr'   r@   r   r^   r   c                 *   | j                  |      }|}	 | j                  d|||||||d|\  }}
|	| j                  z  || j                  z  z   }| j	                  |      }|}	| j                  |      }|	| j                  z  || j                  z  z   }|S )N)r.   r   r/   r   r'   r@   r   rX   )input_layernormr   r   r   post_attention_layernormr   rR   rS   )r]   r.   r   r/   r   r'   r@   r   r^   residualr   s              ra   r   zMiniMaxDecoderLayer.forward  s     ,,]; )4>> 	
' 3)%+)	
 	
q !4#9#99MDLaLa<aa55mD / 4#8#88=4K_K_;__ry   )NNNNFN)rh   ri   rj   r%   rr   r\   r   r   r   r   r   rZ   r   r   FloatTensorr   rw   rx   s   @ra   r   r     s    A} A A* IM.204(,!&26|| #5<<#=>E t+	
 &&-  $; ((4/ -. 
u  %(9(95;L;L(L"MPT"TT	Ury   r   c                   D     e Zd ZdZ eedd      eeegdZ	 fdZ
 xZS )MiniMaxPreTrainedModelFzmlp.gater   )
layer_nameindex)router_logitsr.   
attentionsc                    t         |   |       t        |t              r|j	                         }|j                  |      \  }}}t        j                  |j                  |       t        j                  |j                  |       t        j                  |j                  |       t        j                  |j                  |       y y r   )r[   _init_weights
isinstancer   r   r   initcopy_r   r   r   r   )r]   moduler   r   r   r   r`   s         ra   r	  z$MiniMaxPreTrainedModel._init_weights  s    f%f78..0J5;5I5I*5U2KNJJv((*5JJv));7JJv''3JJv,,n= 9ry   )rh   ri   rj   _can_compile_fullgraphr   r   r   r   r   _can_record_outputsr	  rw   rx   s   @ra   r  r    s5    "'(9jXYZ,')BC> >ry   r  c                       e Zd Zee	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dedz  dej                  dz  de
dz  dej                  dz  d	ee   d
eez  fd              Zy)MiniMaxModelNr,   r/   r   r'   r-   r@   r   r^   r   c                    |d u |d uz  rt        d      |r|t               }n*|r(t        |t              st        dt        |       d      || j	                  |      }|F||j                         nd}	t        j                  |	|	|j                  d   z   |j                        }||j                  d      }| j                  j                  t        nt        }
 |
| j                  |||||      }|}| j                  ||      }| j                   D ]&  }|j"                  dk(  r|}n|} ||f||||||d	|}( | j%                  |      }t'        ||
      S )Nz:You must specify exactly one of input_ids or inputs_embedszSMiniMax uses cache of its own and is not compatible with `past_key_values` of type .r   rU   )device)r   r-   r/   r   r'   r   rV   )r/   r   r   r'   r@   r   )last_hidden_stater'   )
ValueErrorr   r
  typer0   get_seq_lengthr   r   r   r  r   r   rE   r   r   
rotary_embr1   r   r2   r   )r]   r,   r/   r   r'   r-   r@   r   r^   past_seen_tokensmask_functioncausal_maskr.   r   decoder_layerinput_attention_masks                   ra   r   zMiniMaxModel.forward  s    -t";<YZZ0*nOz/<Hefjkzf{e||}~    --i8M!CRC^==?de"\\ "2]5H5H5K"KTaThThN )33A6L.2kk.H.H.P*Vw#;;'))+%
 &"oom\J![[ 	M''+;;'2$ (6$)	3$7) /#-	 	M	$ 		-0%++
 	
ry   )NNNNNNN)rh   ri   rj   r   r   r   r   r   r   r  rZ   r   r   r   r   r   rX   ry   ra   r  r    s     .2.204/326!%26D
##d*D
 t+D
 &&-	D

 &,D
 ((4/D
 $;D
 ((4/D
 +,D
 
'	'D
   D
ry   r  c                        e Zd Z fdZ xZS )MiniMaxForCausalLMc                 "    t        |   di |S )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, MiniMaxForCausalLM

        >>> model = MiniMaxForCausalLM.from_pretrained("MiniMaxAI/MiniMax-Text-01-hf")
        >>> tokenizer = AutoTokenizer.from_pretrained("MiniMaxAI/MiniMax-Text-01-hf")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```rX   )r[   r   )r]   super_kwargsr`   s     ra   r   zMiniMaxForCausalLM.forwardd  s    . w...ry   )rh   ri   rj   r   rw   rx   s   @ra   r   r   c  s    / /ry   r   c                       e Zd Zy) MiniMaxForSequenceClassificationNr|   rX   ry   ra   r$  r$  ~  r}   ry   r$  c                       e Zd Zy)MiniMaxForTokenClassificationNr|   rX   ry   ra   r&  r&    r}   ry   r&  c                       e Zd Zy)MiniMaxForQuestionAnsweringNr|   rX   ry   ra   r(  r(    r}   ry   r(  )r%   r  r  r   r$  r&  r(  )Hrk   r   torch.nn.functionalr   
functionalr    r   r  activationsr   cache_utilsr   r   configuration_utilsr	   r
   masking_utilsr   r   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   modeling_rope_utilsr   processing_utilsr   utilsr   r   utils.genericr   utils.output_capturingr   r   gemma2.modeling_gemma2r   mixtral.modeling_mixtralr   r   r   r   r   r   r   r    r!   r"   r#   
get_loggerrh   loggerr%   r{   r   Moduler   r   r   r   r   r   r  r  r   r$  r&  r(  __all__rX   ry   ra   <module>r>     sB        & ! . J R B 9 6 1 & 0 7 E :    
		H	%O#$ O#d	^ 	"I< "IJP/		 P/f	2 		' 		) 		1 	0-/I 0f>3 >&G
< G
T/+ /6	'G 		$A 		"= 	ry   