
    qiM                        d Z ddlmZ ddlZddlmZ ddlmZ ddlm	Z	 ddl
mZ dd	lmZ dd
lmZmZ ddlmZ ddlmZ ddlmZmZmZmZmZmZmZmZ ddlmZ  ej@                  e!      Z"dZ# G d de      Z$ G d de      Z% G d de      Z& G d de      Z' G d de      Z( G d de      Z) G d de      Z* G d  d!e      Z+g d"Z,y)#zPyTorch Qwen3 model.    )CallableN   )Cache)FlashAttentionKwargs)CausalLMOutputWithPast)ALL_ATTENTION_FUNCTIONS)Unpack)TransformersKwargslogging   )GemmaMLP)LlamaAttention)Qwen2ForCausalLMQwen2ForQuestionAnsweringQwen2ForSequenceClassificationQwen2ForTokenClassificationQwen2RMSNormQwen2RotaryEmbeddingapply_rotary_pos_embeager_attention_forward   )Qwen3ConfigzQwen/Qwen3-8Bc                       e Zd Zy)Qwen3RMSNormN__name__
__module____qualname__     Y/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/qwen3/modular_qwen3.pyr   r   0       r    r   c                       e Zd Zy)Qwen3MLPNr   r   r    r!   r$   r$   4   r"   r    r$   c                       e Zd Zy)Qwen3RotaryEmbeddingNr   r   r    r!   r&   r&   8   r"   r    r&   c                       e Zd Zdedef fdZ	 	 ddej                  deej                  ej                  f   dej                  dz  de	dz  d	ej                  dz  d
ee   deej                  ej                  dz  f   fdZ xZS )Qwen3Attentionconfig	layer_idxc                 R   t        |d      r|j                  |   nd | _        t        |   ||       t        | j                  |j                        | _        t        | j                  |j                        | _	        | j                  dk(  r|j                  | _
        y d | _
        y )Nlayer_types)epssliding_attention)hasattrr,   
layer_typesuper__init__r   head_dimrms_norm_epsq_normk_normsliding_window)selfr)   r*   	__class__s      r!   r2   zQwen3Attention.__init__=   s    ;B6=;Y&,,Y7_c+"4==f6I6IJ"4==f6I6IJ7;J]7]f33cgr    Nhidden_statesposition_embeddingsattention_maskpast_key_valuescache_positionkwargsreturnc                 j   |j                   d d }g |d| j                  }| j                  | j                  |      j	                  |            j                  dd      }	| j                  | j                  |      j	                  |            j                  dd      }
| j                  |      j	                  |      j                  dd      }|\  }}t        |	|
||      \  }	}
|'|||d}|j                  |
|| j                  |      \  }
}t        j                  | j                  j                  t               } || |	|
||f| j"                  sdn| j$                  | j&                  | j(                  d|\  }} |j*                  g |d j-                         }| j/                  |      }||fS )Nr   r   )sincosr>   g        )dropoutscalingr7   )shaper3   r5   q_projview	transposer6   k_projv_projr   updater*   r   get_interfacer)   _attn_implementationr   trainingattention_dropoutrF   r7   reshape
contiguouso_proj)r8   r:   r;   r<   r=   r>   r?   input_shapehidden_shapequery_states
key_statesvalue_statesrD   rC   cache_kwargsattention_interfaceattn_outputattn_weightss                     r!   forwardzQwen3Attention.forwardD   s    $))#2.88b8$--8{{4;;}#=#B#B<#PQ[[\]_`a[[]!;!@!@!NOYYZ[]^_
{{=166|DNNqRST&S#7jRUWZ#[ j&#&snUL'6'='=j,X\XfXfht'u$J(?(M(MKK,,.E)
 %8
%
  $}}C$2H2HLL..
%
 
%
!\ *k));;;;FFHkk+.L((r    )NN)r   r   r   r   intr2   torchTensortupler   
LongTensorr	   r   r^   __classcell__r9   s   @r!   r(   r(   <   s    h{ hs h )-26*)||*) #5<<#=>*) t+	*)
 *) ((4/*) -.*) 
u||U\\D00	1*)r    r(   c                   .     e Zd Zdee   def fdZ xZS )Qwen3ForCausalLMsuper_kwargsr@   c                 "    t        |   di |S )a^  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, Qwen3ForCausalLM

        >>> model = Qwen3ForCausalLM.from_pretrained("Qwen/Qwen3-8B")
        >>> tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-8B")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```r   )r1   r^   )r8   rh   r9   s     r!   r^   zQwen3ForCausalLM.forwardr   s    4 w...r    )r   r   r   r	   r
   r   r^   rd   re   s   @r!   rg   rg   q   s%    /12/ 
 / /r    rg   c                       e Zd Zy)Qwen3ForSequenceClassificationNr   r   r    r!   rk   rk      r"   r    rk   c                       e Zd Zy)Qwen3ForTokenClassificationNr   r   r    r!   rm   rm      r"   r    rm   c                       e Zd Zy)Qwen3ForQuestionAnsweringNr   r   r    r!   ro   ro      r"   r    ro   )rg   ro   Qwen3PreTrainedModel
Qwen3Modelrk   rm   )-__doc__collections.abcr   r`   cache_utilsr   modeling_flash_attention_utilsr   modeling_outputsr   modeling_utilsr   processing_utilsr	   utilsr
   r   gemma.modeling_gemmar   llama.modeling_llamar   qwen2.modeling_qwen2r   r   r   r   r   r   r   r   configuration_qwen3r   
get_loggerr   logger_CHECKPOINT_FOR_DOCr   r$   r&   r(   rg   rk   rm   ro   __all__r   r    r!   <module>r      s     $    B 6 5 & 0 +	 	 	 - 
		H	%% 	< 		x 		/ 	2)^ 2)j/' /<	%C 		"= 		 9 	r    