
    qiJ7                        d dl mZ d dlZd dlmZ ddlmZ ddlmZ ddl	m
Z
 ddlmZmZ dd	lmZ dd
lmZ ddlmZmZmZ ddlmZ ddlmZ ddlmZ ddlmZmZmZ ddl m!Z! ddl"m#Z#m$Z$m%Z%m&Z&m'Z'm(Z(m)Z)m*Z*m+Z+m,Z, ddl-m.Z.  ej^                  e0      Z1 G d de$      Z2 G d de      Z3 G d de      Z4 G d de'      Z5 G d de!      Z6 G d d e)      Z7 G d! d"e%      Z8 G d# d$e*      Z9 G d% d&e(      Z: G d' d(e&      Z;g d)Z<y)*    )CallableN)nn   )initialization)Cache)create_causal_mask)BaseModelOutputWithPastMoeModelOutputWithPast)ALL_ATTENTION_FUNCTIONS)Unpack)TransformersKwargsauto_docstringlogging)merge_with_config_defaults)capture_outputs   )BambaConfig)
BambaMixerBambaRMSNormGated HybridMambaAttentionDynamicCache)Gemma2RotaryEmbedding)
GraniteFlashAttentionKwargsGraniteMoeSharedAttentionGraniteMoeSharedDecoderLayerGraniteMoeSharedForCausalLMGraniteMoeSharedMLPGraniteMoeSharedModelGraniteMoeSharedMoEGraniteMoeSharedPreTrainedModelapply_rotary_pos_embeager_attention_forward   )GraniteMoeHybridConfigc                       e Zd Zdedef fdZ	 	 	 ddej                  dej                  dz  dedz  dej                  dz  d	e
ej                  ej                  f   dz  d
ee   de
ej                  ej                  f   fdZ xZS )GraniteMoeHybridAttentionconfig	layer_idxc                 &    t         |   ||       y Nsuper__init__selfr&   r'   	__class__s      o/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/granitemoehybrid/modular_granitemoehybrid.pyr,   z"GraniteMoeHybridAttention.__init__3   s    +    Nhidden_statesattention_maskpast_key_valuescache_positionposition_embeddingskwargsreturnc                    |j                   d d }g |d| j                  }| j                  |      j                  |      j	                  dd      }	| j                  |      j                  |      j	                  dd      }
| j                  |      j                  |      j	                  dd      }||\  }}t        |	|
||      \  }	}
|%d|i}|j                  |
|| j                  |      \  }
}t        j                  | j                  j                  t              } || |	|
||f| j                  sdn| j                   | j"                  d|\  }} |j$                  g |d j'                         }| j)                  |      }||fS )Nr"   r   r5   g        )dropoutscaling)shapehead_dimq_projview	transposek_projv_projr    updater'   r   get_interfacer&   _attn_implementationr!   trainingattention_dropoutr<   reshape
contiguouso_proj)r.   r2   r3   r4   r5   r6   r7   input_shapehidden_shapequery_states
key_statesvalue_statescossincache_kwargsattention_interfaceattn_outputattn_weightss                     r0   forwardz!GraniteMoeHybridAttention.forward6   s    $))#2.88b8$--8{{=166|DNNqRST[[/44\BLLQPQR
{{=166|DNNqRST**HC';L*VY[^'_$L*&,n=L'6'='=j,X\XfXfht'u$J(?(M(MKK,,.E)
 %8	%
  $}}C$2H2HLL	%
 	%
!\ *k));;;;FFHkk+.L((r1   )NNN)__name__
__module____qualname__r#   intr,   torchTensorr   
LongTensortupler   r   rW   __classcell__r/   s   @r0   r%   r%   2   s    ,5 ,# , )-26HL))||)) t+)) 	))
 ((4/)) #5<<#=>E)) +,)) 
u||U\\)	*))r1   r%   c                   (     e Zd Zdedef fdZ xZS )GraniteMoeHybridMambaLayerr&   r'   c                 8    t         |   t        |      |       y r)   )r+   r,   r   r-   s      r0   r,   z#GraniteMoeHybridMambaLayer.__init__c   s    V,i8r1   )rX   rY   rZ   r#   r[   r,   r`   ra   s   @r0   rc   rc   b   s    95 9# 9 9r1   rc   c                         e Zd Zd fd	Z xZS )GraniteMoeHybridRMSNormGatedc                 &    t         |   ||       y r)   r*   )r.   hidden_sizeepsr/   s      r0   r,   z%GraniteMoeHybridRMSNormGated.__init__h   s    c*r1   )gư>)rX   rY   rZ   r,   r`   ra   s   @r0   rf   rf   g   s    + +r1   rf   c                   $     e Zd Zdef fdZ xZS )GraniteMoeHybridMLPr&   c                 $    t         |   |       y r)   r*   r.   r&   r/   s     r0   r,   zGraniteMoeHybridMLP.__init__m   s     r1   )rX   rY   rZ   r#   r,   r`   ra   s   @r0   rk   rk   l   s    !5 ! !r1   rk   c                       e Zd Zy)GraniteMoeHybridRotaryEmbeddingNrX   rY   rZ    r1   r0   ro   ro   q       r1   ro   c                       e Zd Zy)GraniteMoeHybridMoENrp   rq   r1   r0   rt   rt   u   rr   r1   rt   c                   N    e Zd Zdedef fdZe	 	 	 	 	 ddej                  dej                  dz  de	dz  de
dz  d	ej                  dz  d
eej                  ej                  f   dz  dee   deej                  eej                  ej                  f   dz  f   fd       Z xZS )GraniteMoeHybridDecoderLayerr&   r'   c                 `   t         |   ||       t        |      | _        d | _        d | _        |j                  |   dk(  rt        ||      | _        nt        ||      | _        |j                  |   | _	        |j                  dkD  rt        |      nd | _        t        |dd      dkD  | _        y )Nmambar   num_local_experts)r+   r,   rk   
shared_mlp	self_attnrx   layers_block_typerc   r%   
layer_typery   rt   block_sparse_moegetattrhas_expertsr-   s      r0   r,   z%GraniteMoeHybridDecoderLayer.__init__z   s    +-f5
##I.'93FIFDJ6vyIDN 229= @F?W?WZ[?[ 3F ;ae #6+>BQFr1   Nr2   r3   r4   	use_cacher5   r6   r7   r8   c           
         |}| j                  |      }| j                   | j                  d||||d|}n | j                  d||||||d|\  }}	||| j                  z  z   }|}| j	                  |      }| j
                  r&| j                  |      }
|
| j                  |      z   }n| j                  |      }||| j                  z  z   }|S )N)r2   r5   cache_paramsr3   )r2   r3   r4   r   r5   r6   rq   )input_layernormrx   r{   residual_multiplierpost_attention_layernormr   r~   rz   )r.   r2   r3   r4   r   r5   r6   r7   residual_moe_hidden_statess              r0   rW   z$GraniteMoeHybridDecoderLayer.forward   s    !,,];::!&DJJ +-,-	
 M  .t~~  +- /#-$7   M1 !=43K3K#KK 55mD $ 5 5m D-0NNM OOM:M =43K3K#KKr1   )NNFNN)rX   rY   rZ   r#   r[   r,   r   r\   r]   r   boolr^   r_   r   r   FloatTensorrW   r`   ra   s   @r0   rv   rv   y   s    G5 G# G&  /3(,!&26HL+||+ t++ 	+
 $;+ ((4/+ #5<<#=>E+ 45+ 
u  %(9(95;L;L(L"MPT"TT	U+ +r1   rv   c                   \     e Zd ZU eed<   dgZdZ ej                          fd       Z	 xZ
S )GraniteMoeHybridPreTrainedModelr&   rv   Tc           
         t         |   |       t        |t              rt	        j
                  |j                         t	        j                  |j                  t        j                  t        j                  d|j                  dz                      t	        j
                  |j                         y t        |t              r t	        j
                  |j                         y y )Nr"   )r+   _init_weights
isinstancerc   initones_dt_biascopy_A_logr\   logarange	num_headsDrf   weight)r.   moduler/   s     r0   r   z-GraniteMoeHybridPreTrainedModel._init_weights   s    f%f89JJv~~&JJv||UYYu||Av?O?ORS?S/T%UVJJvxx  <=JJv}}% >r1   )rX   rY   rZ   r#   __annotations___no_split_modules_is_statefulr\   no_gradr   r`   ra   s   @r0   r   r      s1    ""78LU]]_& &r1   r   c                       e Zd Zdef fdZeee	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dedz  dej                  dz  d	edz  d
ej                  dz  dee   deez  fd                     Zd Z xZS )GraniteMoeHybridModelr&   c           	      (   t         |   |       t        j                  t	        |j
                        D cg c]  }t        ||       c}      | _        |j                  | _        |j                  dk(  rt        |      | _        y d | _        y c c}w )Nrope)r+   r,   r   
ModuleListrangenum_hidden_layersrv   layersembedding_multiplierposition_embedding_typero   
rotary_embr-   s      r0   r,   zGraniteMoeHybridModel.__init__   sz     mmNSTZTlTlNmn)&)<n
 %+$?$?!EKEcEcgmEm9&Asw os   BN	input_idsr3   position_idsr4   inputs_embedsr   r5   r7   r8   c           
         |d u |d uz  rt        d      || j                  |      }|| j                  z  }|F||j                         nd}	t	        j
                  |	|	|j                  d   z   |j                        }||j                  d      }t        | j                  ||||      }
| j                  ||      }|}d }| j                  | j                  ||      }| j                  D ]$  }|j                  dk(  r|n|
} ||f|||||d|}& | j                  |      }|r|j                   sd|_        t#        ||      S )	Nz:You must specify exactly one of input_ids or inputs_embedsr   r"   devicerx   )r3   r4   r   r5   r6   T)last_hidden_stater4   )
ValueErrorembed_tokensr   get_seq_lengthr\   r   r=   r   	unsqueezer   r&   _update_mamba_maskr   r   r}   normhas_previous_stater
   )r.   r   r3   r   r4   r   r   r5   r7   past_seen_tokenscausal_mask
mamba_maskr2   r6   decoder_layer
layer_masks                   r0   rW   zGraniteMoeHybridModel.forward   s    -t";<YZZ  --i8M%(A(AA!CRC^==?de"\\ "2]5H5H5K"KTaThThN )33A6L(KK
 ,,^^L
 &"??&"&//-"N![[ 	M'4'?'?7'JP[J)) /#-$7 M		 		-0?#E#E15O.%++
 	
r1   c                 R    |}|d   dkD  s|t        j                  |dk(        rd}|S )zv
        No need for zeroing states when
            1. Cached forward
            2. Attending to all inputs
        r   Nr"   )r\   all)r.   r3   r5   r   s       r0   r   z(GraniteMoeHybridModel._update_mamba_mask  s7     $
!q ^%?EIIn`aNaDbJr1   )NNNNNNN)rX   rY   rZ   r#   r,   r   r   r   r\   r^   r]   r   r   r   r   r   r_   r	   rW   r   r`   ra   s   @r0   r   r      s    x5 x  .2.204(,26!%26@
##d*@
 t+@
 &&-	@

 @
 ((4/@
 $;@
 ((4/@
 45@
 
(	(@
    @
D	r1   r   c                   P     e Zd ZddiZdef fdZ fdZ	 	 	 	 	 	 	 d fd	Z xZS )GraniteMoeHybridForCausalLMzlm_head.weightzmodel.embed_tokens.weightr&   c                 d    t         |   |       t        |      | _        | j	                          y r)   )r+   r,   r   model	post_initrm   s     r0   r,   z$GraniteMoeHybridForCausalLM.__init__)  s&     *62
r1   c                 "    t        |   di |S )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, GraniteMoeHybridForCausalLM

        >>> model = GraniteMoeHybridForCausalLM.from_pretrained("ibm-granite/granite-4.0-h-tiny")
        >>> tokenizer = AutoTokenizer.from_pretrained("ibm-granite/granite-4.0-h-tiny")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```rq   )r+   rW   )r.   super_kwargsr/   s     r0   rW   z#GraniteMoeHybridForCausalLM.forward/  s    . w...r1   c	                     |<|r:t        | j                  |j                  d   | j                  | j                        }t        |   |f|||||||d|	}
|
S )Nr   r   )r4   r3   r   r5   r   r   is_first_iteration)r   r&   r=   dtyper   r+   prepare_inputs_for_generation)r.   r   r4   r3   r   r5   r   r   r   r7   model_inputsr/   s              r0   r   z9GraniteMoeHybridForCausalLM.prepare_inputs_for_generationH  su     "y>Y__Q/DKKO w<

+)')%1

 

 r1   )NNNNNTF)	rX   rY   rZ   _tied_weights_keysr#   r,   rW   r   r`   ra   s   @r0   r   r   &  sB    *,GH5 /8   r1   r   )r   r   r   )=collections.abcr   r\   r    r   r   cache_utilsr   masking_utilsr   modeling_outputsr	   r
   modeling_utilsr   processing_utilsr   utilsr   r   r   utils.genericr   utils.output_capturingr   bamba.configuration_bambar   bamba.modeling_bambar   r   r   gemma2.modeling_gemma2r   *granitemoeshared.modeling_granitemoesharedr   r   r   r   r   r   r   r   r    r!   configuration_granitemoehybridr#   
get_loggerrX   loggerr%   rc   rf   rk   ro   rt   rv   r   r   r   __all__rq   r1   r0   <module>r      s    %   &   / O 5 & @ @ 7 5 3 b b :   C 
		H	%-) 9 -)`9 9
+#4 +
!- !
	&; 		- 	@#? @F&&E & W1 WtA"= AH fr1   