
    qi5                         d dl mZ ddlmZ ddlmZmZ ddlmZ ddl	m
Z
mZmZmZ ddlmZ  G d	 d
e      Z G d de      Z G d de
      Z G d de      Z G d de      Z G d de      Zg dZy)    N   )RopeParameters)auto_docstringcan_return_tuple   )LlamaConfig)LlamaDecoderLayerLlamaForCausalLM
LlamaModelLlamaPreTrainedModel)NemotronMLPc            ,       <    e Zd ZdZdZdddddddZ	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddedz  dedz  d	edz  d
edz  dedz  dedz  dedz  dedz  dedz  dedz  de	dz  dedz  dedz  dedz  de	dz  de	dz  dedz  de	dz  dedz  de
eee
f   z  dz  f( fdZ xZS )Jais2Configa
  
    This is the configuration class to store the configuration of a [`Jais2Model`]. It is used to instantiate a Jais2
    model according to the specified arguments, defining the model architecture.
    [inceptionai/Jais-2-8B-Chat](https://huggingface.co/inceptionai/Jais-2-8B-Chat).

    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PreTrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 150272):
            Vocabulary size of the Jais2 model.
        hidden_size (`int`, *optional*, defaults to 3328):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 26624):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer decoder.
        num_attention_heads (`int`, *optional*, defaults to 26):
            Number of attention heads for each attention layer.
        num_key_value_heads (`int`, *optional*):
            Number of key_value heads for Grouped Query Attention.
        hidden_act (`str`, *optional*, defaults to `"relu2"`):
            The non-linear activation function in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 8192):
            The maximum sequence length.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer.
        layer_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether to return last key/values attentions.
        pad_token_id (`int`, *optional*):
            Padding token id.
        bos_token_id (`int`, *optional*, defaults to 0):
            Beginning of stream token id.
        eos_token_id (`int`, *optional*, defaults to 150024):
            End of stream token id.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether to tie weight embeddings.
        attention_bias (`bool`, *optional*, defaults to `True`):
            Whether to use a bias in the query, key, value and output projection layers.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        mlp_bias (`bool`, *optional*, defaults to `True`):
            Whether to use a bias in up_proj, down_proj and gate_proj layers.
        head_dim (`int`, *optional*):
            The attention head dimension.
        rope_parameters (`dict`, *optional*):
            The RoPE parameters.
    jais2colwiserowwise)zlayers.*.self_attn.q_projzlayers.*.self_attn.k_projzlayers.*.self_attn.v_projzlayers.*.self_attn.o_projzlayers.*.mlp.up_projzlayers.*.mlp.down_projN
vocab_sizehidden_sizeintermediate_sizenum_hidden_layersnum_attention_headsnum_key_value_heads
hidden_actmax_position_embeddingsinitializer_rangelayer_norm_eps	use_cachepad_token_idbos_token_ideos_token_idtie_word_embeddingsattention_biasattention_dropoutmlp_biashead_dimrope_parametersc                     t        |   di d|d|d|d|d|d|d|d|d	|	d
|d|d|d|d|d|d|d|d|d|| |
| _        | `| `y )Nr   r   r   r   r   r   r   r   r   r   r   r   r    r!   r"   r#   r$   r%   r&    )super__init__r   rms_norm_epspretraining_tp)selfr   r   r   r   r   r   r   r   r   r   r   r   r   r    r!   r"   r#   r$   r%   r&   kwargs	__class__s                         Y/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/jais2/modular_jais2.pyr*   zJais2Config.__init__]   s    0 	 	
!	
#	
 0	
 0		

 !4	
 !4	
 "	
 %<	
 0	
  	
 &	
 &	
 &	
 !4	
 *	
  0!	
" #	
$ %	
& ,)	
, -    )i K i   i h         Nrelu2i    g{Gz?gh㈵>TNr   iJ FTg        TNN)__name__
__module____qualname____doc__
model_typebase_model_tp_planintstrfloatboolr   dictr*   __classcell__r/   s   @r0   r   r      s   1f J &/%.%.%. )"+ "("&(-(**,*.!(.2*.'+!%#'#$#)+0&**- $#MQ+0 $J0  4Z0  :	0 
 :0  !4Z0  !4Z0  $J0  "%t0  !4<0  0  $;0  Dj0  Dj0  Dj0   "D[!0 " t#0 $ !4<%0 & +'0 ( *)0 * ($sN/B*CCdJ+0  0 r1   r   c                       e Zd Zy)Jais2MLPNr5   r6   r7   r(   r1   r0   rC   rC          r1   rC   c                   (     e Zd Zdedef fdZ xZS )Jais2DecoderLayerconfig	layer_idxc                     t         |   ||       t        j                  |j                  |j
                        | _        t        j                  |j                  |j
                        | _        y N)eps)r)   r*   nn	LayerNormr   r   input_layernormpost_attention_layernorm)r-   rH   rI   r/   s      r0   r*   zJais2DecoderLayer.__init__   sP    +!||F,>,>FDYDYZ(*V5G5GVMbMb(c%r1   )r5   r6   r7   r   r;   r*   r@   rA   s   @r0   rG   rG      s    d{ ds d dr1   rG   c                       e Zd Zy)Jais2PreTrainedModelNrD   r(   r1   r0   rR   rR      rE   r1   rR   c                   $     e Zd Zdef fdZ xZS )
Jais2ModelrH   c                     t         |   |       t        j                  |j                  |j
                        | _        y rK   )r)   r*   rM   rN   r   r   norm)r-   rH   r/   s     r0   r*   zJais2Model.__init__   s.     LL!3!39N9NO	r1   )r5   r6   r7   r   r*   r@   rA   s   @r0   rT   rT      s    P{ P Pr1   rT   c                   2     e Zd Zee fd              Z xZS )Jais2ForCausalLMc                 "    t        |   di |S )a  
        Example:

        ```python
        >>> from transformers import AutoTokenizer, Jais2ForCausalLM

        >>> model = Jais2ForCausalLM.from_pretrained("inceptionai/Jais-2-8B-Chat")
        >>> tokenizer = AutoTokenizer.from_pretrained("inceptionai/Jais-2-8B-Chat")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```r(   )r)   forward)r-   super_kwargsr/   s     r0   rZ   zJais2ForCausalLM.forward   s    ( w...r1   )r5   r6   r7   r   r   rZ   r@   rA   s   @r0   rX   rX      s    /  /r1   rX   )r   rT   rX   rR   )torch.nnrM   modeling_rope_utilsr   utilsr   r   llama.configuration_llamar   llama.modeling_llamar	   r
   r   r   nemotron.modeling_nemotronr   r   rC   rG   rR   rT   rX   __all__r(   r1   r0   <module>rc      s{      1 5 3  5o + o d	{ 	d) d	/ 	P P/' /0r1   