
    qi                        d dl Z ddlmZ ddlmZ ddlmZ ddlmZm	Z	m
Z
mZmZ ddlmZmZmZmZmZmZ d	d
lmZ  ej,                  e      Z G d de      Z G d de      Z G d de      Z G d de	      Z G d de      Z G d de
      Z G d de      Z G d de      Z  G d de      Z! G d de      Z"g dZ#y)     N   )CausalLMOutputWithPast)Unpack)logging   )DeepseekV3DecoderLayerDeepseekV3MLPDeepseekV3MoEDeepseekV3PreTrainedModelDeepseekV3TopkRouter)Qwen3AttentionQwen3ForCausalLM
Qwen3ModelQwen3RMSNormQwen3RotaryEmbeddingTransformersKwargs   )Dots1Configc                       e Zd Zy)Dots1RMSNormN__name__
__module____qualname__     Y/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/dots1/modular_dots1.pyr   r   (       r   r   c                       e Zd Zy)Dots1RotaryEmbeddingNr   r   r   r   r    r    ,   r   r   r    c                       e Zd Zy)Dots1AttentionNr   r   r   r   r"   r"   0   r   r   r"   c                       e Zd Zy)Dots1MLPNr   r   r   r   r$   r$   4   r   r   r$   c                       e Zd Zy)Dots1TopkRouterNr   r   r   r   r&   r&   8   r   r   r&   c                       e Zd Zd Zy)Dots1MoEc                    |j                         }|| j                  j                  z   }|j                  d| j                  | j
                  | j                  z        j                  dd      d   j                  d      }t        j                  || j                  dd      d   }t        j                  |      }|j                  d|d       |j                  d      j                  d| j                  | j
                  | j                  z        j                  d| j
                        }|j                  |j!                          d      }t        j                  || j"                  dd      d   }|j%                  d|      }	| j&                  r|	j                  dd	
      dz   }
|	|
z  }	|	| j(                  z  }	||	fS )Nr   )dimr   F)kr+   sortedr   g        T)r+   keepdimg#B;)sigmoidgatee_score_correction_biasviewn_groupn_routed_expertstopksumtorch
topk_group
zeros_likescatter_	unsqueezeexpandreshapemasked_fillbooltop_kgathernorm_topk_probrouted_scaling_factor)selfrouter_logitsrouter_logits_for_choicegroup_scores	group_idx
group_mask
score_maskscores_for_choicetopk_indicestopk_weightsdenominators              r   route_tokens_to_expertsz Dots1MoE.route_tokens_to_experts=   s   %--/#04993T3T#T $))"dllD<Q<QUYUaUa<abT!T_Q SRS[ 	
 JJ|tBuUVWX	%%l3
Ay!,  $VBd&;&;t||&KLWR../ 	
 5@@*//BSASUXYzz"3tzzrRWXYZ[$++A|<&**r4*@5HKK'L#d&@&@@\))r   N)r   r   r   rO   r   r   r   r(   r(   <   s    *r   r(   c                   (     e Zd Zdedef fdZ xZS )Dots1DecoderLayerconfig	layer_idxc                 N    t         |   ||       |j                  |   | _        y )N)super__init__layer_typesattention_type)rD   rR   rS   	__class__s      r   rV   zDots1DecoderLayer.__init__X   s%    +$00;r   )r   r   r   r   intrV   __classcell__rY   s   @r   rQ   rQ   W   s    <{ <s < <r   rQ   c                       e Zd ZdZy)Dots1PreTrainedModelN)r   r   r   "_keys_to_ignore_on_load_unexpectedr   r   r   r^   r^   ]   s    )-&r   r^   c                       e Zd Zy)
Dots1ModelNr   r   r   r   ra   ra   a   r   r   ra   c                   .     e Zd Zdee   def fdZ xZS )Dots1ForCausalLMsuper_kwargsreturnc                 "    t        |   di |S )a~  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, Dots1ForCausalLM

        >>> model = Dots1ForCausalLM.from_pretrained("rednote-hilab/dots1.llm1.inst")
        >>> tokenizer = AutoTokenizer.from_pretrained("rednote-hilab/dots1.llm1.inst")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```r   )rU   forward)rD   rd   rY   s     r   rg   zDots1ForCausalLM.forwardf   s    4 w...r   )r   r   r   r   r   r   rg   r[   r\   s   @r   rc   rc   e   s%    /12/ 
 / /r   rc   )r^   ra   rc   )$r7   modeling_outputsr   processing_utilsr   utilsr    deepseek_v3.modeling_deepseek_v3r   r	   r
   r   r   qwen3.modeling_qwen3r   r   r   r   r   r   configuration_dots1r   
get_loggerr   loggerr   r    r"   r$   r&   r(   rQ   r^   ra   rc   __all__r   r   r   <module>rq      s     6 &    - 
		H	%	< 		/ 		^ 		} 		* 	*} *6<. <.4 .	 	/' /<r   