
    qiX                        d Z ddlmZ ddlmZ ddlZddlmZ ddlmZ	 ddl
mZ dd	lmZmZ dd
lmZ ddlmZ ddlmZ ddlmZmZ ddlmZmZ ddlmZ ddlmZmZm Z  ddl!m"Z" ddl#m$Z$ ddl%m&Z&m'Z'm(Z( ddl)m*Z* ddl+m,Z,  G d de&      Z- G d dej\                        Z/ G d dej\                        Z0 G d dej\                        Z1 G d  d!ej\                        Z2 G d" d#ej\                        Z3 G d$ d%ej\                        Z4 G d& d'e      Z5 G d( d)e      Z6e G d* d+e6             Z7 G d, d-e6e      Z8g d.Z9y)/z"Modular components for DBRX model.    )Callable)AnyN)nn   )initialization)ACT2FN)CacheDynamicCache)GenerationMixin)create_causal_mask)GradientCheckpointingLayer)MoeCausalLMOutputWithPastMoeModelOutputWithPast)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuple)merge_with_config_defaults)capture_outputs   )LlamaRotaryEmbeddingapply_rotary_pos_embeager_attention_forward)load_balancing_loss_func   )
DbrxConfigc                       e Zd Zy)DbrxRotaryEmbeddingN)__name__
__module____qualname__     W/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/dbrx/modular_dbrx.pyr    r    -   s    r%   r    c                        e Zd ZdZ	 ddedz  f fdZ	 	 	 	 ddej                  dej                  dz  dej                  dz  de	dz  d	ej                  dz  d
e
ej                  ej                  f   fdZ xZS )DbrxAttentionzYModular DBRX attention component that can be reused across different model architectures.N	layer_idxc                    t         |           || _        |j                  | _        |j
                  | _        | j                  | j                  z  | _        |j                  | _	        || _
        |j                  }|j                  | _        |j                  | _        |j                  | _        | j                  | j                   z  | _        | j                  dz  | _        |j&                  | _        d| _        t+        j,                  | j                  | j                  d| j                   z  | j                  z  z   d      | _        t+        j,                  | j                  | j                  d      | _        y )Ng      Tr   Fbias)super__init__configd_modelhidden_sizen_heads	num_headshead_dimmax_seq_lenmax_position_embeddingsr)   attn_config
attn_pdropattention_dropoutclip_qkv
kv_n_headsnum_key_value_headsnum_key_value_groupsscaling
rope_theta	is_causalr   LinearWqkvout_proj)selfr/   r)   kwargsr7   	__class__s        r&   r.   zDbrxAttention.__init__4   s*    	!>>((DNN:'-'9'9$"((!,!7!7#,,#.#9#9 $(NNd6N6N$N!}}d*%00IId..T5M5M1MPTP]P]1]]di
	 		$"2"2D4D4D5Qr%   hidden_statesattention_maskposition_embeddingspast_key_valuescache_positionreturnc                    |j                   d d }g |d| j                  }| j                  |      }	| j                  | j                   nd }
|	j	                  |
| j                        }	|	j                  | j                  | j                  | j                  z  | j                  | j                  z  gd      \  }}}|j                  |      j                  dd      }|j                  |      j                  dd      }|j                  |      j                  dd      }|\  }}t        ||||      \  }}|'|||d}|j                  ||| j                  |      \  }}t        j                  | j                  j                   t"              } || ||||f| j$                  sdn| j&                  | j(                  d|\  }} |j*                  g |d j-                         }| j/                  |      }||fS )	N)minmaxr   dimr   )sincosrK           )dropoutr>   )shaper4   rB   r:   clampsplitr1   r<   view	transposer   updater)   r   get_interfacer/   _attn_implementationr   trainingr9   r>   reshape
contiguousrC   )rD   rG   rH   rI   rJ   rK   rE   input_shapehidden_shape
qkv_statesmin_valquery_states
key_statesvalue_statesrT   rS   cache_kwargsattention_interfaceattn_outputattn_weightss                       r&   forwardzDbrxAttention.forwardP   s    $))#2.88b8$--8YY}-
$(MM$=4==.4%%'t}}%E
1;1A1A  ((4==8((4==8
  2B 2
.j, $((6@@AF__\2<<QB
#((6@@AF&S#7jRUWZ#[ j&#&snUL'6'='=j,X\XfXfht'u$J(?(M(MKK,,.E)
 %8	%
  $}}C$2H2HLL	%
 	%
!\ *k));;;;FFHmmK0L((r%   NNNNN)r!   r"   r#   __doc__intr.   torchTensor
LongTensorr	   tuplerm   __classcell__rF   s   @r&   r(   r(   1   s    c
 !%R :R> /37;(,266)||6) t+6) #--4	6)
 6) ((4/6) 
u||U\\)	*6)r%   r(   c            
            e Zd Z fdZdej
                  dej
                  dej
                  dej
                  dej
                  f
dZ xZS )DbrxExpertGLUc                    t         |           |j                  | _        |j                  | _        |j                  | _        t        j                  t        j                  | j                  | j                  z  | j                              | _	        t        j                  t        j                  | j                  | j                  z  | j                              | _
        t        j                  t        j                  | j                  | j                  z  | j                              | _        |j                  j                  dd      }t        |   | _        y )Nnamesilu)r-   r.   r1   ffn_hidden_sizemoe_num_expertsr   	Parameterrr   emptyw1v1w2
ffn_act_fngetr   activation_fn)rD   r/   act_fn_namerF   s      r&   r.   zDbrxExpertGLU.__init__   s    !--%55%55,,u{{4+?+?$BVBV+VX\XhXhij,,u{{4+?+?$BVBV+VX\XhXhij,,u{{4+?+?$BVBV+VX\XhXhij''++FF;#K0r%   x	expert_w1	expert_v1	expert_w2rL   c                     |j                  |      }|j                  |      }| j                  |      }||z  }|j                  |j                               }|S rn   )matmulr   t)	rD   r   r   r   r   	gate_projup_projintermediate_states	down_projs	            r&   rm   zDbrxExpertGLU.forward   sW     HHY'	((9%&&y1	''1'..y{{}=	r%   r!   r"   r#   r.   rr   rs   rm   rv   rw   s   @r&   ry   ry      sK    1*/,,CH<<\a\h\h	r%   ry   c                        e Zd Z fdZdej
                  dej
                  dej
                  dej
                  fdZ xZS )DbrxExpertsc                     t         |           t        |      | _        |j                  | _        |j
                  | _        |j                  | _        y rn   )r-   r.   ry   mlpr1   r}   r~   num_expertsrD   r/   rF   s     r&   r.   zDbrxExperts.__init__   sD     (!--%55!11r%   rG   top_k_indextop_k_weightsrL   c                    |j                   d   }|j                  d| j                        }t        j                  ||j
                  |j                        }t        j                         5  t        j                  j                  j                  || j                        }|j                  ddd      }t        j                  |j                  d      d      j                         }d d d        d| j                  | j                   f}D ]  }	|	d   }	t        j                         5  t        j"                  |	         \  }
}d d d        | j$                  j&                  j)                  |      |	   }| j$                  j*                  j)                  |      |	   }| j$                  j,                  j)                  |      |	   }| j%                  |   |||      }|j)                  d| j                        ||
d f   z  }|j/                  d||       
 |j)                  |d| j                        }|S # 1 sw Y   OxY w# 1 sw Y   xY w)	Nr   rN   )dtypedevice)num_classesr   r   )rN   rQ   )rW   r`   r}   rr   
zeros_liker   r   no_gradr   
functionalone_hotr   permutegreatersumnonzeror1   wherer   r   rZ   r   r   
index_add_)rD   rG   r   r   
batch_sizenext_statesexpert_mask
expert_hitsplit_expert_shape
expert_idxidx	token_idxr   r   r   statess                   r&   rm   zDbrxExperts.forward   s    #((+
%--b$2F2FG&&}M<O<OXeXlXlm]]_ 	S((--55ktO_O_5`K%--aA6K{8'DaHPPRJ	S
 !$"6"68H8HI$ 		9J#AJ F!&[-D!EYF!!"45jAB!!"45jAB!!"45jABXXmI6BCF[[T%9%9:]9VY[_K_=``F""1i8		9 "&&z2t7K7KL%	S 	SF Fs   ,A=H6)I6I I	r   rw   s   @r&   r   r      sC    2|| \\ ||	
 
r%   r   c                        e Zd Z fdZdej
                  deej
                  ej
                  ej                  f   fdZ xZ	S )
DbrxRouterc                     t         |           |j                  | _        |j                  | _        t        j                  | j                  |j                  d      | _        y NFr+   )	r-   r.   r}   r1   moe_jitter_epsr   rA   r~   layerr   s     r&   r.   zDbrxRouter.__init__   sJ    !11$33YYt//1G1GeT
r%   rG   rL   c                    | j                   rN| j                  B|t        j                  |      j	                  d| j                  z
  d| j                  z         z  }|j                  d|j                  d         }| j                  |      }|S )Ng      ?rN   )r_   r   rr   
empty_likeuniform_rZ   rW   r   )rD   rG   router_logitss      r&   rm   zDbrxRouter.forward   s    ==T00<U--m<EEd)))31D1D+D M &**2}/B/B2/FG

=1r%   )
r!   r"   r#   r.   rr   rs   ru   rt   rm   rv   rw   s   @r&   r   r      s;    UU\\ eELL%,,X]XhXh<h6i r%   r   c                   ~     e Zd ZdZ fdZd Zdej                  deej                  ej                  f   fdZ	 xZ
S )DbrxFFNz0Modular DBRX MLP/FFN component with MoE support.c                     t         |           t        |j                        | _        t        |j                        | _        |j                  j                  | _        |j                  j                  | _	        y rn   )
r-   r.   r   
ffn_configrouterr   expertsmoe_normalize_expert_weights	moe_top_ktop_k)rD   r/   rE   rF   s      r&   r.   zDbrxFFN.__init__   sY     !2!23"6#4#45,2,=,=,Z,Z)&&00
r%   c                 $   t         j                  j                  j                  |d|j                        }t        j
                  || j                  d      \  }}| j                  &|t        j                  || j                  dd      z  }||fS )Nr   )rR   r   rN   rQ   T)prR   keepdim)	rr   r   r   softmaxr   topkr   r   norm)rD   r   router_top_valuerouter_indicess       r&   route_tokens_to_expertszDbrxFFN.route_tokens_to_experts   s    ++33MqP]PcPc3d+0::mTZZUW+X(.,,8/%** D$E$E2W[3    //r%   rG   rL   c                 v    | j                  |      }| j                  |      \  }}| j                  |||      }|S rn   )r   r   r   )rD   rG   r   r   r   outputs         r&   rm   zDbrxFFN.forward   s<    M2%)%A%A-%P"{m[-Hr%   )r!   r"   r#   rp   r.   r   rr   rs   ru   rm   rv   rw   s   @r&   r   r      s9    :10U\\ eELL%,,<V6W r%   r   c                        e Zd Zddededz  f fdZ	 	 	 ddej                  dej                  dej                  dz  de	dz  d	ej                  dz  d
e
deej                  ej                  f   fdZ xZS )DbrxNormAttentionNormNr/   r)   c                    t         |           || _        |j                  | _        t	        j
                  |j                  d      | _        t        ||      | _	        t	        j
                  |j                  d      | _
        y )NFr+   r/   r)   )r-   r.   r)   resid_pdropr   	LayerNormr0   norm_1r(   attnnorm_2rD   r/   r)   rF   s      r&   r.   zDbrxNormAttentionNorm.__init__   sc    "!--ll6>>>!
	 ll6>>>r%   rG   rI   rH   rJ   rK   rE   rL   c           	      f   |}| j                  |      j                  |j                        } | j                  d|||||d|\  }}t        j
                  j                  || j                  | j                        }||z   }|}| j                  |      j                  |j                        }||fS N)rG   rH   rI   rJ   rK   )r   r_   r$   )
r   tor   r   r   r   rV   r   r_   r   )	rD   rG   rI   rH   rJ   rK   rE   residual_states_s	            r&   rm   zDbrxNormAttentionNorm.forward  s     (M255m6I6IJ$499 
') 3+)
 
q --mt?O?OZ^ZgZg-h%7'M255m6I6IJ--r%   rn   )NNN)r!   r"   r#   r   rq   r.   rr   rs   rt   r	   r   ru   rm   rv   rw   s   @r&   r   r      s    	?z 	?cDj 	? /3(,26.||. #--. t+	.
 . ((4/. . 
u||U\\)	*.r%   r   c                        e Zd Zdedef fdZ	 	 	 	 ddej                  dej                  dz  dej                  dz  de	dz  d	ej                  dz  d
e
fdZ xZS )	DbrxBlockr/   r)   c                     t         |           |j                  | _        |j                  | _        || _        t        ||      | _        t        |      | _	        y )Nr   r/   )
r-   r.   r0   r1   r   r)   r   norm_attn_normr   ffnr   s      r&   r.   zDbrxBlock.__init__   sP    !>>!--"3
 &)r%   NrG   rH   rI   rJ   rK   rE   c           	           | j                   d|||||d|\  }}| j                  |      }t        j                  j	                  || j
                  | j                        }||z   }|S r   )r   r   r   r   rV   r   r_   )rD   rG   rH   rI   rJ   rK   rE   resid_statess           r&   rm   zDbrxBlock.forward+  s     ':d&9&9 '
') 3+)'
 '
#m /--mt?O?OZ^ZgZg-h$}4r%   ro   )r!   r"   r#   r   rq   r.   rr   rs   rt   r	   r   rm   rv   rw   s   @r&   r   r     s    	*z 	*c 	* /37;(,26|| t+ #--4	
  ((4/ r%   r   c                        e Zd ZU eed<   dZdZdgZdgZdZ	dZ
dZdZdZeedZ ej$                         dej(                  f fd	       Z xZS )
DbrxPreTrainedModelr/   transformerTr   rJ   F)rG   
attentionsmodulec                 >   t         |   |       | j                  j                  }t	        |t
              rgt        j                  |j                  d|       t        j                  |j                  d|       t        j                  |j                  d|       y y )NrU   )meanstd)r-   _init_weightsr/   initializer_range
isinstancery   initnormal_r   r   r   )rD   r   r   rF   s      r&   r   z!DbrxPreTrainedModel._init_weightsS  sj    f%kk++fm,LL#6LL#6LL#6 -r%   )r!   r"   r#   r   __annotations__base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flex_attn_supports_attention_backend_supports_flash_attn_supports_sdpa_can_compile_fullgraphr   r(   _can_record_outputsrr   r   r   Moduler   rv   rw   s   @r&   r   r   C  sx    %&*#$#4"5"&N""#
 U]]_7BII 7 7r%   r   c                   V    e Zd ZdZdef fdZdej                  fdZdej                  fdZ	e
ee	 	 	 	 	 	 	 dd	ej                  dz  d
ej                  dz  dej                  dz  dedz  dej"                  dz  dedz  dej                  dz  dee   defd                     Z xZS )	DbrxModela  Transformer decoder consisting of *config.num_hidden_layers*. Each layer is a [`DbrxBlock`] layer.

    Args:
        config ([`DbrxConfig`]): Model configuration class with all parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
    r/   c           	      ,   t         |   |       |j                  | _        |j                  | _        |j
                  | _        t        |      | _        t        j                  |j                  |j                  | j                        | _        t        j                  t        |j                        D cg c]  }t        ||       c}      | _        t        j"                  |j                  d      | _        d| _        | j)                          y c c}w r   )r-   r.   pad_token_idpadding_idx
vocab_size	emb_pdropr    
rotary_embr   	Embeddingr0   wte
ModuleListrangen_layersr   blocksr   norm_fgradient_checkpointing	post_initr   s      r&   r.   zDbrxModel.__init__g  s     !.. ++))-f5<< 1 16>>4CSCSTmmSXY_YhYhSi$jiYvy%A$jkll6>>>&+# 	 %ks   4DrL   c                     | j                   S rn   r  rD   s    r&   get_input_embeddingszDbrxModel.get_input_embeddingsu  s    xxr%   valuec                     || _         y rn   r  rD   r  s     r&   set_input_embeddingszDbrxModel.set_input_embeddingsx  s	    r%   N	input_idsrH   position_idsrJ   inputs_embeds	use_cacherK   rE   c                 D   |d u |d uz  rt        d      |r|t        | j                        }|| j                  |      }|F||j	                         nd}	t        j                  |	|	|j                  d   z   |j                        }||j                  d      }t        | j                  |||||      }
|}| j                  ||      }| j                  d | j                  j                   D ]  } ||f||
||||d|} | j                  |      }t        ||      S )	Nz:You must specify exactly one of input_ids or inputs_embedsr   r   r   )r   )r/   r  rH   rK   rJ   r  )rI   rH   r  rJ   r  rK   )last_hidden_staterJ   )
ValueErrorr
   r/   r  get_seq_lengthrr   arangerW   r   	unsqueezer   r	  r  num_hidden_layersr  r   )rD   r  rH   r  rJ   r  r  rK   rE   past_seen_tokenscausal_maskrG   rI   decoder_layers                 r&   rm   zDbrxModel.forward{  s^    -t";<YZZ0*$++>O  HHY/M!CRC^==?de"\\ "2]5H5H5K"KTaThThN )33A6L(;;'))+%
 & #oom\J![[)H4;;+H+HI 
	M)	$7*) /#-	 	M
	 M2%++
 	
r%   )NNNNNNN)r!   r"   r#   rp   r   r.   r   r
  r  r  r   r   r   rr   rt   rs   r	   FloatTensorboolr   r   r   rm   rv   rw   s   @r&   r  r  ]  s
   z bll ",,    .2.204(,26!%26;
##d*;
 t+;
 &&-	;

 ;
 ((4/;
 $;;
 ((4/;
 +,;
 
 ;
    ;
r%   r  c                       e Zd ZddiZddiZddgdgfiZdef fdZd	ej                  fd
Z
dej                  fdZd	ej                  fdZdej                  fdZdefdZd	efdZee	 	 	 	 	 	 	 	 	 	 d dej*                  dz  dej,                  dz  dej*                  dz  dedz  dej0                  dz  dej*                  dz  dedz  dedz  dej*                  dz  deej,                  z  dee   d	efd              Z xZS )!DbrxForCausalLMzlm_head.weightztransformer.wte.weightlm_headcolwise_gather_outputrG   logitsr/   c                    t         |   |       t        |      | _        |j                  | _        t        j                  |j                  |j                  d      | _        |j                  j                  | _        |j                  j                  | _        |j                  j                  | _        | j!                          y r   )r-   r.   r  r   r  r   rA   r1   r-  r   moe_loss_weightrouter_aux_loss_coefr~   r   r   num_experts_per_tokr  r   s     r&   r.   zDbrxForCausalLM.__init__  s     $V, ++yy!3!3V5F5FUS$*$5$5$E$E!!,,<<#)#4#4#>#> r%   rL   c                 6    | j                   j                         S rn   )r   r  r  s    r&   r  z$DbrxForCausalLM.get_input_embeddings  s    4466r%   r  c                 :    | j                   j                  |       y rn   )r   r  r  s     r&   r  z$DbrxForCausalLM.set_input_embeddings  s    --e4r%   c                     | j                   S rn   r-  r  s    r&   get_output_embeddingsz%DbrxForCausalLM.get_output_embeddings  s    ||r%   new_embeddingsc                     || _         y rn   r7  )rD   r9  s     r&   set_output_embeddingsz%DbrxForCausalLM.set_output_embeddings  s	    %r%   decoderc                     || _         y rn   r   )rD   r<  s     r&   set_decoderzDbrxForCausalLM.set_decoder  s
    "r%   c                     | j                   S rn   r>  r  s    r&   get_decoderzDbrxForCausalLM.get_decoder  s    r%   Nr  rH   r  rJ   r  labelsr  output_router_logitsrK   logits_to_keeprE   c                 l   ||n| j                   j                  } | j                  d||||||||	d|}|j                  }t	        |
t
              rt        |
 d      n|
}| j                  |dd|ddf         }d}| | j                  ||| j                  fi |}d}|rYt        |j                  | j                  | j                  |      }|+|| j                  |j                  |j                         z  z  }t#        ||||j$                  |j&                  |j(                  |j                        S )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >> from transformers import AutoTokenizer, DbrxForCausalLM

        >> model = DbrxForCausalLM.from_pretrained("transformers-community/dbrx-instruct")
        >> tokenizer = AutoTokenizer.from_pretrained("transformers-community/dbrx-instruct")

        >> prompt = "Hey, are you conscious? Can you talk to me?"
        >> inputs = tokenizer(prompt, return_tensors="pt")

        >> # Generate
        >> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```
        N)r  rH   r  rJ   r  r  rC  rK   )lossaux_lossr/  rJ   rG   r   r   r$   )r/   rC  r   r   r   rq   slicer-  loss_functionr  r   r   r   r3  r2  r   r   r   rJ   rG   r   )rD   r  rH   r  rJ   r  rB  r  rC  rK   rD  rE   outputsrG   slice_indicesr/  rF  rG  s                     r&   rm   zDbrxForCausalLM.forward  sZ   P %9$D $++JjJj 	
 +;$*:*: 
+
)%+'!5)
+
 
+
  118B>SV8W~ot4]kmA}a,?@A%4%%ffdooPPD/%%  ((	H !11HKK4LLL(#33!//))!//
 	
r%   )
NNNNNNNNNr   ) r!   r"   r#   _tied_weights_keys_tp_plan_pp_planr   r.   r   r
  r  r  rA   r8  r;  r  r?  rA  r   r   rr   rt   rs   r	   r)  r*  rq   r   r   r   rm   rv   rw   s   @r&   r,  r,    s   *,DE23H_-z:;Hz 7bll 75",, 5ryy &BII &#9 # Y    .2.204(,26*.!%,026-.R
##d*R
 t+R
 &&-	R

 R
 ((4/R
   4'R
 $;R
 #TkR
 ((4/R
 ell*R
 +,R
 
#R
  R
r%   r,  )r,  r  r   ):rp   collections.abcr   typingr   rr   r    r   r   activationsr   cache_utilsr	   r
   
generationr   masking_utilsr   modeling_layersr   modeling_outputsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   utils.genericr   utils.output_capturingr   llama.modeling_llamar   r   r   mixtral.modeling_mixtralr   configuration_dbrxr   r    r  r(   ry   r   r   r   r   r   r   r  r,  __all__r$   r%   r&   <module>ra     s   ) $    & ! . ) / R F & I I 7 5 
 @ *	. 	U)BII U)pBII 2$")) $N "bii 6'.BII '.T!* !H7/ 74 [
# [
 [
|u
)? u
p Br%   