
    qi4                        d dl Z d dl mZ ddlmZ ddlmZ ddlmZm	Z	 ddl
mZ ddlmZmZ dd	lmZ dd
lmZ ddlmZmZ ddlmZmZ ddlmZ ddlmZmZ ddlmZm Z  ddl!m"Z"m#Z# ddl$m%Z%m&Z&m'Z'm(Z( ddl)m*Z*  G d de      Z+ G d de      Z, G d de      Z- G d de       Z. G d dej^                        Z0 G d d e"      Z1 G d! d"e%      Z2e G d# d$e#e             Z3e G d% d&e'             Z4 G d' d(e&      Z5g d)Z6y)*    N)nn   )initialization)ACT2FN)CacheDynamicCache)create_causal_mask)MoeCausalLMOutputWithPastMoeModelOutputWithPast)PreTrainedModel)Unpack)TransformersKwargsauto_docstring)can_return_tuplemerge_with_config_defaults)capture_outputs   )GraniteRMSNormGraniteRotaryEmbedding)JetMoeParallelExpertsJetMoeTopKGating)LlamaAttentionLlamaPreTrainedModel)MixtralDecoderLayerMixtralForCausalLMMixtralModelload_balancing_loss_func   )GraniteMoeConfigc                       e Zd Zy)GraniteMoeRMSNormN__name__
__module____qualname__     c/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/granitemoe/modular_granitemoe.pyr!   r!   $       r'   r!   c                       e Zd Zy)GraniteMoeRotaryEmbeddingNr"   r&   r'   r(   r+   r+   (   r)   r'   r+   c                       e Zd Zy)GraniteMoeParallelExpertsNr"   r&   r'   r(   r-   r-   ,   r)   r'   r-   c                       e Zd Zy)GraniteMoeTopKGatingNr"   r&   r'   r(   r/   r/   0   r)   r'   r/   c                   .     e Zd ZdZdef fdZd Z xZS )GraniteMoeMoEz
    A Sparsely gated mixture of experts layer with 1-layer Feed-Forward networks as experts.

    Args:
        config:
            Configuration object with model hyperparameters.
    configc                    t         |           |j                  | _        |j                  | _        t
        |j                     | _        t        |j                  | j                  | j                  dz        | _
        t        |j                  | j                  | j                        | _        t        | j                  |j                  |j                        | _        y )Nr   )
input_sizenum_expertstop_k)super__init__hidden_sizer4   intermediate_sizer   
hidden_act
activationr-   num_local_expertsinput_linearoutput_linearr/   num_experts_per_tokrouterselfr2   	__class__s     r(   r8   zGraniteMoeMoE.__init__=   s     ,,!33 !2!235f6N6NPTP_P_aeaqaqtuauv6v7O7OQUQaQacgcrcrs*00,,
r'   c                    |j                         \  }}}|j                  d|      }| j                  |      \  }}}}}||   }	| j                  |	|      }
|
j	                  dd      }| j                  |d         |d   z  }
| j                  |
|      }||d d d f   z  }t        j                  ||z  | j                  f|j                  |j                        }|j                  d||      }|j                  ||| j                        }|S )Nr   )dimr   r   )dtypedevice)sizereshaperA   r>   chunkr<   r?   torchzerosr4   rH   rI   	index_addview)rC   layer_inputbszlengthemb_size_batch_indexbatch_gatesexpert_sizeexpert_inputshidden_stateschunked_hidden_statesexpert_outputsrN   layer_outputs                  r(   forwardzGraniteMoeMoE.forwardL   s    + 0 0 2VX!))"h76:kk+6N3;[!#K0))-E - 3 3A2 3 >(=a(@ADYZ[D\\++M;G'+ag*>>S6\4??;>CWCW`n`u`uvq+~F#((fdooFr'   )r#   r$   r%   __doc__r   r8   r^   __classcell__rD   s   @r(   r1   r1   4   s    
/ 
r'   r1   c                   (     e Zd Zdedef fdZ xZS )GraniteMoeAttentionr2   	layer_idxc                 J    t         |   | ||       |j                  | _        y N)r7   r8   attention_multiplierscalingrC   r2   rd   rD   s      r(   r8   zGraniteMoeAttention.__init__`   s!    vy122r'   )r#   r$   r%   r   intr8   r`   ra   s   @r(   rc   rc   _   s    3/ 3C 3 3r'   rc   c                        e Zd Zdedef fdZ	 	 	 	 ddej                  dej                  dz  dedz  dej                  dz  d	e
ej                  ej                  f   dz  d
ej                  fdZ xZS )GraniteMoeDecoderLayerr2   rd   c                 H   t         |   ||       t        ||      | _        t	        |      | _        t        |j                  |j                        | _	        t        |j                  |j                        | _
        | `t	        |      | _        |j                  | _        y )N)r2   rd   eps)r7   r8   rc   	self_attnr1   block_sparse_moer!   r9   rms_norm_epsinput_layernormpost_attention_layernormmlpresidual_multiplierri   s      r(   r8   zGraniteMoeDecoderLayer.__init__f   s    +,FiP -f 501C1CI\I\](9&:L:LRXReRe(f%H -f 5#)#=#= r'   NrZ   attention_maskpast_key_valuescache_positionposition_embeddingsreturnc           	          |}| j                  |      } | j                  d|||||d|\  }}||| j                  z  z   }|}| j                  |      }| j	                  |      }||| j                  z  z   }|S )N)rZ   rw   rx   ry   rz   r&   )rs   rp   rv   rt   rq   )	rC   rZ   rw   rx   ry   rz   kwargsresidualrU   s	            r(   r^   zGraniteMoeDecoderLayer.forwardp   s     !,,];)4>> 
')+) 3
 
q !=43K3K#KK 55mD--m< =43K3K#KKr'   )NNNN)r#   r$   r%   r   rj   r8   rM   Tensorr   
LongTensortupler^   r`   ra   s   @r(   rl   rl   e   s    >/ >C > /3(,26HL|| t+ 	
 ((4/ #5<<#=>E 
r'   rl   c                   f    e Zd ZU eed<   dZdZdgZdgZdZ	dZ
dZ ej                         d        Zy)	GraniteMoePreTrainedModelr2   modelTrl   rx   Fc                     t        j                  | |       t        |t              r7t	        j
                  |j                  d| j                  j                         y y )Ng        )meanstd)	r   _init_weights
isinstancer-   initnormal_weightr2   initializer_range)rC   modules     r(   r   z'GraniteMoePreTrainedModel._init_weights   s@    %%dF3f78LLSdkk6S6ST 9r'   N)r#   r$   r%   r   __annotations__base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_can_compile_fullgraphrM   no_gradr   r&   r'   r(   r   r      sS    &*#12#4"5N"U]]_U Ur'   r   c                       e Zd Zdef fdZeee	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dedz  dej                  dz  d	edz  d
ej                  dz  dee   defd                     Z xZS )GraniteMoeModelr2   c           	      &   t         |   |       t        j                  t	        |j
                        D cg c]  }t        ||       c}      | _        t        |j                  |j                        | _        |j                  | _        y c c}w )Nrn   )r7   r8   r   
ModuleListrangenum_hidden_layersrl   layersr!   r9   rr   normembedding_multiplierri   s      r(   r8   zGraniteMoeModel.__init__   sq     mmHMfNfNfHgh9#FI6h
 &f&8&8f>Q>QR	$*$?$?! is   BN	input_idsrw   position_idsrx   inputs_embeds	use_cachery   r}   r{   c                 b   |d u |d uz  rt        d      |r|t        | j                        }|| j                  |      }|F||j	                         nd}	t        j                  |	|	|j                  d   z   |j                        }||j                  d      }t        | j                  |||||      }
|| j                  z  }|}| j                  ||      }| j                  d | j                  j                   D ]  } ||f||
||||d|} | j                  |      }t!        ||      S )	Nz:You must specify exactly one of input_ids or inputs_embeds)r2   r   r   )rI   )r2   r   rw   ry   rx   r   )rz   rw   r   rx   r   ry   )last_hidden_staterx   )
ValueErrorr   r2   embed_tokensget_seq_lengthrM   arangeshaperI   	unsqueezer	   r   
rotary_embr   r   r   r   )rC   r   rw   r   rx   r   r   ry   r}   past_seen_tokenscausal_maskrZ   rz   decoder_layers                 r(   r^   zGraniteMoeModel.forward   sp    -t";<YZZ0*$++>O  --i8M!CRC^==?de"\\ "2]5H5H5K"KTaThThN )33A6L(;;'))+%
 &(A(AA% #oom\J![[)H4;;+H+HI 
	M)	$7*) /#-	 	M
	 		-0%++
 	
r'   )NNNNNNN)r#   r$   r%   r   r8   r   r   r   rM   r   r   r   FloatTensorboolr   r   r   r^   r`   ra   s   @r(   r   r      s    @/ @   .2.204(,26!%26;
##d*;
 t+;
 &&-	;

 ;
 ((4/;
 $;;
 ((4/;
 +,;
 
 ;
    ;
r'   r   c                   D    e Zd Zdef fdZee	 	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  de
dz  dej                  dz  d	ej                  dz  d
edz  dej                  dz  deej                  z  deez  fd              Z xZS )GraniteMoeForCausalLMr2   c                 f    t         |   |       t        |      | _        |j                  | _        y rf   )r7   r8   r   r   logits_scalingrB   s     r(   r8   zGraniteMoeForCausalLM.__init__   s*     $V,
$33r'   Nr   rw   r   rx   r   labelsoutput_router_logitsry   logits_to_keepr{   c
           
         ||n| j                   j                  } | j                  d||||||d|
}|j                  }t	        |	t
              rt        |	 d      n|	}| j                  |dd|ddf         }|| j                   j                  z  }d}|* | j                  ||fd| j                   j                  i|
}d}|rYt        |j                  | j                  | j                  |      }|+|| j                  |j!                  |j"                        z  z  }t%        ||||j&                  |j(                  |j*                  |j                        S )al  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, GraniteMoeForCausalLM

        >>> model = GraniteMoeForCausalLM.from_pretrained("ibm/PowerMoE-3b")
        >>> tokenizer = AutoTokenizer.from_pretrained("ibm/PowerMoE-3b")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```N)r   rw   r   rx   r   ry   
vocab_size)lossaux_losslogitsrx   rZ   
attentionsrouter_logitsr&   )r2   r   r   r   r   rj   slicelm_headr   loss_functionr   r   r   r5   r@   router_aux_loss_coeftorI   r
   rx   rZ   r   )rC   r   rw   r   rx   r   r   r   ry   r   r}   outputsrZ   slice_indicesr   r   r   s                    r(   r^   zGraniteMoeForCausalLM.forward   s   L %9$D $++JjJj 	 $** 
)%+')
 
  118B>SV8W~ot4]kmA}a,?@A$++444%4%%  ;;11 	D /%%  ((	H !11HKK4LLL(#33!//))!//
 	
r'   )	NNNNNNNNr   )r#   r$   r%   r   r8   r   r   rM   r   r   r   r   r   rj   r   r
   r^   r`   ra   s   @r(   r   r      s   4/ 4
  .2.204(,26*.,026-.S
##d*S
 t+S
 &&-	S

 S
 ((4/S
   4'S
 #TkS
 ((4/S
 ell*S
 
*	*S
  S
r'   r   )r   r   r   )7rM   r    r   r   activationsr   cache_utilsr   r   masking_utilsr	   modeling_outputsr
   r   modeling_utilsr   processing_utilsr   utilsr   r   utils.genericr   r   utils.output_capturingr   granite.modeling_graniter   r   jetmoe.modeling_jetmoer   r   llama.modeling_llamar   r   mixtral.modeling_mixtralr   r   r   r   configuration_granitemoer   r!   r+   r-   r/   Moduler1   rc   rl   r   r   r   __all__r&   r'   r(   <module>r      s       & ! . / Q - & 7 I 5 M L G v v 6	 		 6 		 5 		+ 	(BII (V3. 3#0 #L U 4o U U" G
l G
 G
T[
. [
| Tr'   