
    qi#                        d dl Z d dlmZ d dlZd dlmZ ddlmZ ddl	m
Z
mZ ddlmZ ddlmZmZ ddlmZmZ dd	lmZ dd
lmZmZ ddlmZ ddlmZ ddlmZmZm Z m!Z!m"Z"m#Z# ddl$m%Z% ddl&m'Z' ddl(m)Z)  G d de%      Z* G d de!      Z+d Z, G d de'      Z- G d de      Z. G d de      Z/e G d de              Z0e G d  d!e             Z1e G d" d#e             Z2g d$Z3y)%    N)Callable   )initialization)CacheDynamicCache)create_causal_mask)BaseModelOutputWithPastCausalLMOutputWithPast)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstring   )CLIPMLP)Gemma2ForCausalLM)LlamaDecoderLayer
LlamaModelLlamaPreTrainedModelLlamaRotaryEmbeddingapply_rotary_pos_embeager_attention_forward)Llama4TextL2Norm)Qwen3Attention   )NanoChatConfigc                       e Zd Zy)NanoChatRMSNormN__name__
__module____qualname__     _/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/nanochat/modular_nanochat.pyr   r   +       r$   r   c                       e Zd Zy)NanoChatRotaryEmbeddingNr   r#   r$   r%   r(   r(   /   r&   r$   r(   c                     | dd| j                   d   dz  f   }| d| j                   d   dz  df   }t        j                  || fd      S )zJRotates half the hidden dims of the input with flipped signs for NanoChat..Nr   )dim)shapetorchcat)xx1x2s      r%   rotate_halfr2   3   sZ    	
3"!''"+"""	#B	
3q ""	#B99b2#YB''r$   c                       e Zd Zdedef fdZ	 	 	 	 ddej                  deej                  ej                  f   dz  dej                  dz  de	dz  d	ej                  dz  d
ee   deej                  ej                  dz  f   fdZ xZS )NanoChatAttentionconfig	layer_idxc                     t         |   ||       | `| `t	        |j
                        | _        t	        |j
                        | _        y N)eps)super__init__sliding_window
layer_typer   rms_norm_epsq_normk_normselfr5   r6   	__class__s      r%   r;   zNanoChatAttention.__init__;   sA    +O%&*=*=>%&*=*=>r$   Nhidden_statesposition_embeddingsattention_maskpast_key_valuescache_positionkwargsreturnc                 \   |j                   d d }g |d| j                  }| j                  |      j                  |      j	                  dd      }	| j                  |      j                  |      j	                  dd      }
| j                  |      j                  |      j	                  dd      }|\  }}t        |	|
||      \  }	}
| j                  |	      }	| j                  |
      }
|'|||d}|j                  |
|| j                  |      \  }
}t        j                  | j                  j                  t               } || |	|
||f| j"                  sdn| j$                  | j&                  d|\  }} |j(                  g |d j+                         }| j-                  |      }||fS )Nr*   r   r   )sincosrH           )dropoutscaling)r,   head_dimq_projview	transposek_projv_projr   r?   r@   updater6   r   get_interfacer5   _attn_implementationr   trainingattention_dropoutrP   reshape
contiguouso_proj)rB   rD   rE   rF   rG   rH   rI   input_shapehidden_shapequery_states
key_statesvalue_statesrM   rL   cache_kwargsattention_interfaceattn_outputattn_weightss                     r%   forwardzNanoChatAttention.forwardC   s    $))#2.88b8$--8{{=166|DNNqRST[[/44\BLLQPQR
{{=166|DNNqRST&S#7jRUWZ#[ j {{<0[[,
&#&snUL'6'='=j,X\XfXfht'u$J(?(M(MKK,,.E)
 %8	%
  $}}C$2H2HLL	%
 	%
!\ *k));;;;FFHkk+.L((r$   )NNNN)r    r!   r"   r   intr;   r-   Tensortupler   
LongTensorr   r   rh   __classcell__rC   s   @r%   r4   r4   :   s    ?~ ?# ? IM.2(,26-)||-) #5<<#=>E-) t+	-)
 -) ((4/-) +,-) 
u||U\\D00	1-)r$   r4   c                        e Zd Z fdZ xZS )NanoChatMLPc                     t         |   |       t        j                  |j                  |j
                  d      | _        t        j                  |j
                  |j                  d      | _        y )NF)bias)r:   r;   nnLinearhidden_sizeintermediate_sizefc1fc2rB   r5   rC   s     r%   r;   zNanoChatMLP.__init__t   sR     99V//1I1IPUV99V55v7I7IPUVr$   )r    r!   r"   r;   rm   rn   s   @r%   rp   rp   s   s    W Wr$   rp   c                   (     e Zd Zdedef fdZ xZS )NanoChatDecoderLayerr5   r6   c                     t         |           t        |j                        | _        t        |j                        | _        y r8   )r:   r;   r   r>   input_layernormpost_attention_layernormrA   s      r%   r;   zNanoChatDecoderLayer.__init__{   s4    .63F3FG(7F<O<O(P%r$   )r    r!   r"   r   ri   r;   rm   rn   s   @r%   r{   r{   z   s    Q~ Q# Q Qr$   r{   c                   0    e Zd Zdej                  ddfdZy)NanoChatPreTrainedModelmodulerJ   Nc           	      ,   t        j                  | |       t        |t              rnt	        j
                  |j                  j                  d| j                  j                  t        j                  d| j                  j                  z        z         y y )NrN   r   )meanstd)r   _init_weights
isinstancer4   initnormal_r^   weightr5   initializer_rangemathsqrtnum_hidden_layers)rB   r   s     r%   r   z%NanoChatPreTrainedModel._init_weights   sh    %%dF3f/0LL$$KK11DIIa$++B_B_>_4`` 1r$   )r    r!   r"   rs   Moduler   r#   r$   r%   r   r      s    BII $ r$   r   c                        e Zd Zdef fdZ	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dedz  dej                  dz  d	ej                  dz  d
e
dz  dee   defdZ xZS )NanoChatModelr5   c                 Z    t         |   |       t        |j                        | _        y r8   )r:   r;   r   r>   normry   s     r%   r;   zNanoChatModel.__init__   s"     #(;(;<	r$   N	input_idsrF   position_idsrG   inputs_embedsrH   	use_cacherI   rJ   c           
      d   |d u |d uz  rt        d      || j                  |      }|r|t        | j                        }|E||j	                         nd}	t        j                  |j                  d   |j                        |	z   }||j                  d      }t        | j                  |||||      }
|}| j                  ||      }| j                  |      }| j                  d | j                  j                   D ]  } ||f|
||||d|} | j                  |      }t        ||	      S )
Nz:You must specify exactly one of input_ids or inputs_embeds)r5   r   r   )device)r5   r   rF   rH   rG   r   )r   )rF   rE   r   rG   rH   )last_hidden_staterG   )
ValueErrorembed_tokensr   r5   get_seq_lengthr-   aranger,   r   	unsqueezer   
rotary_embr   layersr   r	   )rB   r   rF   r   rG   r   rH   r   rI   past_seen_tokenscausal_maskrD   rE   decoder_layers                 r%   rh   zNanoChatModel.forward   sh    -t";<YZZ *.*;*;I*FM0*$++>O!CRC^==?de]003M<P<PQTdd  )33A6L(;;'))+%
 &"oom,oW		-0![[)H4;;+H+HI 		M)*$7) /- M		 		-0&++
 	
r$   )NNNNNNN)r    r!   r"   r   r;   r-   rl   rj   r   FloatTensorboolr   r   r	   rh   rm   rn   s   @r%   r   r      s    =~ = .2.204(,2626!%9
##d*9
 t+9
 &&-	9

 9
 ((4/9
 ((4/9
 $;9
 +,9
 
!9
r$   r   c                   ,     e Zd ZddiZdef fdZ xZS )NanoChatForCausalLMlm_headcolwise_gather_outputrJ   c                 $    t        |   di | y)ak  
        Example:

        ```python
        >>> from transformers import AutoTokenizer, AutoModelForCausalLM

        >>> model = AutoModelForCausalLM.from_pretrained("karpathy/nanochat-d32")

        >>> tokenizer = AutoTokenizer.from_pretrained("karpathy/nanochat-d32")

        >>> conversation = [
                {"role": "user", "content": "What is the capital of France?"},
            ]

        >>> inputs = tokenizer.apply_chat_template(
                conversation, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt"
            ).to(device)

        >>> with torch.no_grad():
        >>>     outputs = model.generate(**inputs, max_new_tokens=64, do_sample=False)

        >>> generated_tokens = outputs[0, inputs["input_ids"].shape[1] :]
        >>> output = tokenizer.decode(generated_tokens, skip_special_tokens=True)
        ```Nr#   )r:   rh   )rB   super_kwargsrC   s     r%   rh   zNanoChatForCausalLM.forward   s    2 	','r$   )r    r!   r"   _tp_planr
   rh   rm   rn   s   @r%   r   r      s     23H()? ( (r$   r   )r   r   r   )4r   collections.abcr   r-   torch.nnrs    r   r   cache_utilsr   r   masking_utilsr   modeling_outputsr	   r
   modeling_utilsr   r   processing_utilsr   utilsr   r   clip.modeling_clipr   gemma2.modeling_gemma2r   llama.modeling_llamar   r   r   r   r   r   llama4.modeling_llama4r   qwen3.modeling_qwen3r   configuration_nanochatr   r   r(   r2   r4   rp   r{   r   r   r   __all__r#   r$   r%   <module>r      s     $   & . / O F & 7 ( 6  6 1 2	& 		2 	(6) 6)rW' WQ, Q 2   ?
J ?
 ?
D (+ ( (>r$   