
    qi                     &   d dl mZ d dlmZmZ d dlZd dlmZ ddlmZ	 ddl
mZ ddlmZmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZmZ ddlmZmZ ddlmZmZ ddl m!Z! ddl"m#Z#m$Z$m%Z% ddl&m'Z'm(Z( ddl)m*Z* ddl+m,Z,  G d dejZ                        Z.d Z/ ed      d@d       Z0dejb                  de2dejb                  fdZ3	 dAdejZ                  d ejb                  d!ejb                  d"ejb                  d#ejb                  dz  d$e4d%e4d&e!e#   fd'Z5 G d( d)ejZ                        Z6 G d* d+ejZ                        Z7 G d, d-ejZ                        Z8 G d. d/ejZ                        Z9 G d0 d1ejZ                        Z: G d2 d3ejZ                        Z; G d4 d5e      Z< G d6 d7e      Z=e$ G d8 d9e=             Z>	 	 	 dBd:ejb                  e?ejb                     z  dz  d;e2dz  d#ejb                  dz  dejb                  e2z  fd<Z@ G d= d>e=e      ZAg d?ZBy)C    )Callable)AnyOptionalN)nn   )initialization)ACT2FN)CacheDynamicCache)GenerationMixin)use_kernel_func_from_hub)create_causal_mask)GradientCheckpointingLayer)MoeCausalLMOutputWithPastMoeModelOutputWithPast)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuple)maybe_autocastmerge_with_config_defaults)capture_outputs   )
DbrxConfigc                        e Zd ZU ej                  ed<   ddef fdZe	 	 	 ddedz  de	d   de
dz  ded	ef   fd
       Z ej                         ed               Z xZS )DbrxRotaryEmbeddinginv_freqNconfigc                    t         |           |j                  | _        |j                  | _        || _        | j
                  j                  d   | _        | j                  }| j                  dk7  rt        | j                     } || j
                  |      \  }| _
        | j                  d|d       | j                  d|j                         d       y )N	rope_typedefaultr!   F)
persistentoriginal_inv_freq)super__init__max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenr"   rope_parametersr$   compute_default_rope_parametersr   attention_scalingregister_bufferclone)selfr"   devicerope_init_fnr!   	__class__s        X/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/dbrx/modeling_dbrx.pyr)   zDbrxRotaryEmbedding.__init__/   s    "("@"@$*$B$B!44[A!%!E!E>>Y&.t~~>L+7V+L($(ZeD0(..2BuU    r3   ztorch.deviceseq_lenreturnztorch.Tensorc                    | j                   d   }t        | dd      xs | j                  | j                  z  }d}d|t	        j
                  d|dt        j                        j                  |t        j                        |z  z  z  }||fS )	a  
        Computes the inverse frequencies according to the original RoPE implementation
        Args:
            config ([`~transformers.PreTrainedConfig`]):
                The model configuration.
            device (`torch.device`):
                The device to use for initialization of the inverse frequencies.
            seq_len (`int`, *optional*):
                The current sequence length. Unused for this type of RoPE.
        Returns:
            Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
            post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
        
rope_thetahead_dimN      ?r      dtype)r3   r@   )	r-   getattrhidden_sizenum_attention_headstorcharangeint64tofloat)r"   r3   r8   basedimattention_factorr!   s          r6   r.   z3DbrxRotaryEmbedding.compute_default_rope_parameters?   s    & %%l3fj$/c63E3EIcIc3c U\\!S!5;;?BB&X]XcXcBdgjjk
 )))r7   c                 N   | j                   d d d d f   j                         j                  |j                  d   dd      j	                  |j
                        }|d d d d d f   j                         }t        |j
                  j                  t              r/|j
                  j                  dk7  r|j
                  j                  nd}t        |d      5  |j                         |j                         z  j                  dd      }t        j                  ||fd	      }|j                         | j                  z  }|j                         | j                  z  }	d d d        j	                  |j                   
      	j	                  |j                   
      fS # 1 sw Y   AxY w)Nr   r   mpscpuF)device_typeenabledr>   rJ   r?   )r!   rH   expandshaperG   r3   
isinstancetypestrr   	transposerD   catcosr/   sinr@   )
r2   xposition_idsinv_freq_expandedposition_ids_expandedrP   freqsembrZ   r[   s
             r6   forwardzDbrxRotaryEmbedding.forward]   sR    !MM$4-8>>@GGHZHZ[\H]_acdehhijiqiqr ,QaZ 8 > > @'1!((--'E!((--[`J`ahhmmfkUC 	5&,,.1F1L1L1NNYYZ[]^_E))UEN3C'')d444C'')d444C		5 vvAGGv$cff177f&;;;	5 	5s   BFF$NNNN)__name__
__module____qualname__rD   Tensor__annotations__r   r)   staticmethodr   inttuplerH   r.   no_gradr   rb   __classcell__r5   s   @r6   r    r    ,   s    llVz V  $(+/"*T!*(* t* 
~u$	%	* *: U]]_<  <r7   r    c                     | dd| j                   d   dz  f   }| d| j                   d   dz  df   }t        j                  | |fd      S )z*Rotates half the hidden dims of the input..NrM   r>   rR   )rT   rD   rY   )r\   x1x2s      r6   rotate_halfrs   m   sZ    	
3"!''"+"""	#B	
3q ""	#B99rc2YB''r7   rotary_pos_embc                     |j                  |      }|j                  |      }| |z  t        |       |z  z   }||z  t        |      |z  z   }||fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    )	unsqueezers   )qkrZ   r[   unsqueeze_dimq_embedk_embeds          r6   apply_rotary_pos_embr|   t   sY    & --
&C
--
&C3w;q>C/0G3w;q>C/0GGr7   hidden_statesn_repr9   c                     | j                   \  }}}}|dk(  r| S | dddddddddf   j                  |||||      } | j                  |||z  ||      S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r   N)rT   rS   reshape)r}   r~   batchnum_key_value_headsslenr<   s         r6   	repeat_kvr      so    
 2?1D1D.Ehz!!Qa"23::5BUW\^bdlmM  (;e(CT8TTr7   modulequerykeyvalueattention_maskscalingdropoutkwargsc                    t        || j                        }t        || j                        }	t        j                  ||j	                  dd            |z  }
||
|z   }
t
        j                  j                  |
dt        j                        j                  |j                        }
t
        j                  j                  |
|| j                        }
t        j                  |
|	      }|j	                  dd      j                         }||
fS )Nr>   r   rM   rJ   r@   ptrainingr   )r   num_key_value_groupsrD   matmulrX   r   
functionalsoftmaxfloat32rG   r@   r   r   
contiguous)r   r   r   r   r   r   r   r   
key_statesvalue_statesattn_weightsattn_outputs               r6   eager_attention_forwardr      s     3 ; ;<JUF$?$?@L<<z';';Aq'ABWLL!#n4==((2U]](SVVW\WbWbcL==((6??([L,,|\:K''1-88:K$$r7   c                        e Zd ZdZ	 ddedz  f fdZ	 	 	 	 ddej                  dej                  dz  dej                  dz  de	dz  d	ej                  dz  d
e
ej                  ej                  f   fdZ xZS )DbrxAttentionzYModular DBRX attention component that can be reused across different model architectures.N	layer_idxc                    t         |           || _        |j                  | _        |j
                  | _        | j                  | j                  z  | _        |j                  | _	        || _
        |j                  }|j                  | _        |j                  | _        |j                  | _        | j                  | j                   z  | _        | j                  dz  | _        |j&                  | _        d| _        t+        j,                  | j                  | j                  d| j                   z  | j                  z  z   d      | _        t+        j,                  | j                  | j                  d      | _        y )Ng      Tr>   Fbias)r(   r)   r"   d_modelrB   n_heads	num_headsr<   max_seq_lenr*   r   attn_config
attn_pdropattention_dropoutclip_qkv
kv_n_headsr   r   r   r;   	is_causalr   LinearWqkvout_proj)r2   r"   r   r   r   r5   s        r6   r)   zDbrxAttention.__init__   s*    	!>>((DNN:'-'9'9$"((!,!7!7#,,#.#9#9 $(NNd6N6N$N!}}d*%00IId..T5M5M1MPTP]P]1]]di
	 		$"2"2D4D4D5Qr7   r}   r   position_embeddingspast_key_valuescache_positionr9   c                    |j                   d d }g |d| j                  }| j                  |      }	| j                  | j                   nd }
|	j	                  |
| j                        }	|	j                  | j                  | j                  | j                  z  | j                  | j                  z  gd      \  }}}|j                  |      j                  dd      }|j                  |      j                  dd      }|j                  |      j                  dd      }|\  }}t        ||||      \  }}|'|||d}|j                  ||| j                  |      \  }}t        j                  | j                  j                   t"              } || ||||f| j$                  sdn| j&                  | j(                  d|\  }} |j*                  g |d j-                         }| j/                  |      }||fS )	NrM   )minmaxr>   rR   r   )r[   rZ   r           )r   r   )rT   r<   r   r   clampsplitrB   r   viewrX   r|   updater   r   get_interfacer"   _attn_implementationr   r   r   r   r   r   r   )r2   r}   r   r   r   r   r   input_shapehidden_shape
qkv_statesmin_valquery_statesr   r   rZ   r[   cache_kwargsattention_interfacer   r   s                       r6   rb   zDbrxAttention.forward   s    $))#2.88b8$--8YY}-
$(MM$=4==.4%%'t}}%E
1;1A1A  ((4==8((4==8
  2B 2
.j, $((6@@AF__\2<<QB
#((6@@AF&S#7jRUWZ#[ j&#&snUL'6'='=j,X\XfXfht'u$J(?(M(MKK,,.E)
 %8	%
  $}}C$2H2HLL	%
 	%
!\ *k));;;;FFHmmK0L((r7   rc   NNNN)re   rf   rg   __doc__rk   r)   rD   rh   
LongTensorr
   rl   rb   rn   ro   s   @r6   r   r      s    c
 !%R :R> /37;(,266)||6) t+6) #--4	6)
 6) ((4/6) 
u||U\\)	*6)r7   r   c            
            e Zd Z fdZdej
                  dej
                  dej
                  dej
                  dej
                  f
dZ xZS )DbrxExpertGLUc                    t         |           |j                  | _        |j                  | _        |j                  | _        t        j                  t        j                  | j                  | j                  z  | j                              | _	        t        j                  t        j                  | j                  | j                  z  | j                              | _
        t        j                  t        j                  | j                  | j                  z  | j                              | _        |j                  j                  dd      }t        |   | _        y )Nnamesilu)r(   r)   rB   ffn_hidden_sizemoe_num_expertsr   	ParameterrD   emptyw1v1w2
ffn_act_fngetr	   activation_fn)r2   r"   act_fn_namer5   s      r6   r)   zDbrxExpertGLU.__init__  s    !--%55%55,,u{{4+?+?$BVBV+VX\XhXhij,,u{{4+?+?$BVBV+VX\XhXhij,,u{{4+?+?$BVBV+VX\XhXhij''++FF;#K0r7   r\   	expert_w1	expert_v1	expert_w2r9   c                     |j                  |      }|j                  |      }| j                  |      }||z  }|j                  |j                               }|S rc   )r   r   t)	r2   r\   r   r   r   	gate_projup_projintermediate_states	down_projs	            r6   rb   zDbrxExpertGLU.forward  sW     HHY'	((9%&&y1	''1'..y{{}=	r7   re   rf   rg   r)   rD   rh   rb   rn   ro   s   @r6   r   r     sK    1*/,,CH<<\a\h\h	r7   r   c                        e Zd Z fdZdej
                  dej
                  dej
                  dej
                  fdZ xZS )DbrxExpertsc                     t         |           t        |      | _        |j                  | _        |j
                  | _        |j                  | _        y rc   )r(   r)   r   mlprB   r   r   num_expertsr2   r"   r5   s     r6   r)   zDbrxExperts.__init__%  sD     (!--%55!11r7   r}   top_k_indextop_k_weightsr9   c                    |j                   d   }|j                  d| j                        }t        j                  ||j
                  |j                        }t        j                         5  t        j                  j                  j                  || j                        }|j                  ddd      }t        j                  |j                  d      d      j                         }d d d        d| j                  | j                   f}D ]  }	|	d   }	t        j                         5  t        j"                  |	         \  }
}d d d        | j$                  j&                  j)                  |      |	   }| j$                  j*                  j)                  |      |	   }| j$                  j,                  j)                  |      |	   }| j%                  |   |||      }|j)                  d| j                        ||
d f   z  }|j/                  d||       
 |j)                  |d| j                        }|S # 1 sw Y   OxY w# 1 sw Y   xY w)	Nr   rM   )r@   r3   )num_classesr>   r   )rM   rR   )rT   r   r   rD   
zeros_liker@   r3   rm   r   r   one_hotr   permutegreatersumnonzerorB   wherer   r   r   r   r   
index_add_)r2   r}   r   r   
batch_sizenext_statesexpert_mask
expert_hitsplit_expert_shape
expert_idxidx	token_idxr   r   r   statess                   r6   rb   zDbrxExperts.forward,  s    #((+
%--b$2F2FG&&}M<O<OXeXlXlm]]_ 	S((--55ktO_O_5`K%--aA6K{8'DaHPPRJ	S
 !$"6"68H8HI$ 		9J#AJ F!&[-D!EYF!!"45jAB!!"45jAB!!"45jABXXmI6BCF[[T%9%9:]9VY[_K_=``F""1i8		9 "&&z2t7K7KL%	S 	SF Fs   ,A=H6)I6I I	r   ro   s   @r6   r   r   $  sC    2|| \\ ||	
 
r7   r   c                        e Zd Z fdZdej
                  deej
                  ej
                  ej                  f   fdZ xZ	S )
DbrxRouterc                     t         |           |j                  | _        |j                  | _        t        j                  | j                  |j                  d      | _        y NFr   )	r(   r)   r   rB   moe_jitter_epsr   r   r   layerr   s     r6   r)   zDbrxRouter.__init__L  sJ    !11$33YYt//1G1GeT
r7   r}   r9   c                    | j                   rN| j                  B|t        j                  |      j	                  d| j                  z
  d| j                  z         z  }|j                  d|j                  d         }| j                  |      }|S )Nr=   rM   )r   r  rD   
empty_likeuniform_r   rT   r  )r2   r}   router_logitss      r6   rb   zDbrxRouter.forwardR  s    ==T00<U--m<EEd)))31D1D+D M &**2}/B/B2/FG

=1r7   )
re   rf   rg   r)   rD   rh   rl   r   rb   rn   ro   s   @r6   r   r   K  s;    UU\\ eELL%,,X]XhXh<h6i r7   r   c                   ~     e Zd ZdZ fdZd Zdej                  deej                  ej                  f   fdZ	 xZ
S )DbrxFFNz0Modular DBRX MLP/FFN component with MoE support.c                     t         |           t        |j                        | _        t        |j                        | _        |j                  j                  | _        |j                  j                  | _	        y rc   )
r(   r)   r   
ffn_configrouterr   expertsmoe_normalize_expert_weights	moe_top_ktop_k)r2   r"   r   r5   s      r6   r)   zDbrxFFN.__init___  sY     !2!23"6#4#45,2,=,=,Z,Z)&&00
r7   c                 $   t         j                  j                  j                  |d|j                        }t        j
                  || j                  d      \  }}| j                  &|t        j                  || j                  dd      z  }||fS )Nr   r   rM   rR   T)r   rJ   keepdim)	rD   r   r   r   r@   topkr  r  norm)r2   r  router_top_valuerouter_indicess       r6   route_tokens_to_expertszDbrxFFN.route_tokens_to_expertsg  s    ++33MqP]PcPc3d+0::mTZZUW+X(.,,8/%** D$E$E2W[3    //r7   r}   r9   c                 v    | j                  |      }| j                  |      \  }}| j                  |||      }|S rc   )r  r  r  )r2   r}   r  r   r   outputs         r6   rb   zDbrxFFN.forwardp  s<    M2%)%A%A-%P"{m[-Hr7   )re   rf   rg   r   r)   r  rD   rh   rl   rb   rn   ro   s   @r6   r  r  \  s9    :10U\\ eELL%,,<V6W r7   r  c                        e Zd Zddededz  f fdZ	 	 	 ddej                  dej                  dej                  dz  de	dz  d	ej                  dz  d
e
deej                  ej                  f   fdZ xZS )DbrxNormAttentionNormNr"   r   c                    t         |           || _        |j                  | _        t	        j
                  |j                  d      | _        t        ||      | _	        t	        j
                  |j                  d      | _
        y )NFr   r"   r   )r(   r)   r   resid_pdropr   	LayerNormr   norm_1r   attnnorm_2r2   r"   r   r5   s      r6   r)   zDbrxNormAttentionNorm.__init__x  sc    "!--ll6>>>!
	 ll6>>>r7   r}   r   r   r   r   r   r9   c           	      f   |}| j                  |      j                  |j                        } | j                  d|||||d|\  }}t        j
                  j                  || j                  | j                        }||z   }|}| j                  |      j                  |j                        }||fS N)r}   r   r   r   r   r    )
r  rG   r@   r   r   r   r   r  r   r!  )	r2   r}   r   r   r   r   r   residual_states_s	            r6   rb   zDbrxNormAttentionNorm.forward  s     (M255m6I6IJ$499 
') 3+)
 
q --mt?O?OZ^ZgZg-h%7'M255m6I6IJ--r7   rc   rd   )re   rf   rg   r   rk   r)   rD   rh   r   r
   r   rl   rb   rn   ro   s   @r6   r  r  w  s    	?z 	?cDj 	? /3(,26.||. #--. t+	.
 . ((4/. . 
u||U\\)	*.r7   r  c                        e Zd Zdedef fdZ	 	 	 	 ddej                  dej                  dz  dej                  dz  de	dz  d	ej                  dz  d
e
fdZ xZS )	DbrxBlockr"   r   c                     t         |           |j                  | _        |j                  | _        || _        t        ||      | _        t        |      | _	        y )Nr  r"   )
r(   r)   r   rB   r  r   r  norm_attn_normr  ffnr"  s      r6   r)   zDbrxBlock.__init__  sP    !>>!--"3
 &)r7   Nr}   r   r   r   r   r   c           	           | j                   d|||||d|\  }}| j                  |      }t        j                  j	                  || j
                  | j                        }||z   }|S r$  )r,  r-  r   r   r   r  r   )r2   r}   r   r   r   r   r   resid_statess           r6   rb   zDbrxBlock.forward  s     ':d&9&9 '
') 3+)'
 '
#m /--mt?O?OZ^ZgZg-h$}4r7   r   )re   rf   rg   r   rk   r)   rD   rh   r   r
   r   rb   rn   ro   s   @r6   r)  r)    s    	*z 	*c 	* /37;(,26|| t+ #--4	
  ((4/ r7   r)  c                        e Zd ZU eed<   dZdZdgZdgZdZ	dZ
dZdZdZeedZ ej$                         dej(                  f fd	       Z xZS )
DbrxPreTrainedModelr"   transformerTr)  r   F)r}   
attentionsr   c                 >   t         |   |       | j                  j                  }t	        |t
              rgt        j                  |j                  d|       t        j                  |j                  d|       t        j                  |j                  d|       y y )Nr   )meanstd)r(   _init_weightsr"   initializer_rangerU   r   initnormal_r   r   r   )r2   r   r6  r5   s      r6   r7  z!DbrxPreTrainedModel._init_weights  sj    f%kk++fm,LL#6LL#6LL#6 -r7   )re   rf   rg   r   ri   base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flex_attn_supports_attention_backend_supports_flash_attn_supports_sdpa_can_compile_fullgraphr)  r   _can_record_outputsrD   rm   r   Moduler7  rn   ro   s   @r6   r1  r1    sx    %&*#$#4"5"&N""#
 U]]_7BII 7 7r7   r1  c                   V    e Zd ZdZdef fdZdej                  fdZdej                  fdZ	e
ee	 	 	 	 	 	 	 dd	ej                  dz  d
ej                  dz  dej                  dz  dedz  dej"                  dz  dedz  dej                  dz  dee   defd                     Z xZS )	DbrxModela  Transformer decoder consisting of *config.num_hidden_layers*. Each layer is a [`DbrxBlock`] layer.

    Args:
        config ([`DbrxConfig`]): Model configuration class with all parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
    r"   c           	      ,   t         |   |       |j                  | _        |j                  | _        |j
                  | _        t        |      | _        t        j                  |j                  |j                  | j                        | _        t        j                  t        |j                        D cg c]  }t        ||       c}      | _        t        j"                  |j                  d      | _        d| _        | j)                          y c c}w r   )r(   r)   pad_token_idpadding_idx
vocab_size	emb_pdropr    
rotary_embr   	Embeddingr   wte
ModuleListrangen_layersr)  blocksr  norm_fgradient_checkpointing	post_initr"  s      r6   r)   zDbrxModel.__init__  s     !.. ++))-f5<< 1 16>>4CSCSTmmSXY_YhYhSi$jiYvy%A$jkll6>>>&+# 	 %ks   4Dr9   c                     | j                   S rc   rO  r2   s    r6   get_input_embeddingszDbrxModel.get_input_embeddings  s    xxr7   r   c                     || _         y rc   rX  r2   r   s     r6   set_input_embeddingszDbrxModel.set_input_embeddings  s	    r7   N	input_idsr   r]   r   inputs_embeds	use_cacher   r   c                 D   |d u |d uz  rt        d      |r|t        | j                        }|| j                  |      }|F||j	                         nd}	t        j                  |	|	|j                  d   z   |j                        }||j                  d      }t        | j                  |||||      }
|}| j                  ||      }| j                  d | j                  j                   D ]  } ||f||
||||d|} | j                  |      }t        ||      S )	Nz:You must specify exactly one of input_ids or inputs_embedsr+  r   r   )r3   )r"   r_  r   r   r   r]   )r   r   r]   r   r`  r   )last_hidden_stater   )
ValueErrorr   r"   rO  get_seq_lengthrD   rE   rT   r3   rv   r   rM  rS  num_hidden_layersrT  r   )r2   r^  r   r]   r   r_  r`  r   r   past_seen_tokenscausal_maskr}   r   decoder_layers                 r6   rb   zDbrxModel.forward  s^    -t";<YZZ0*$++>O  HHY/M!CRC^==?de"\\ "2]5H5H5K"KTaThThN )33A6L(;;'))+%
 & #oom\J![[)H4;;+H+HI 
	M)	$7*) /#-	 	M
	 M2%++
 	
r7   )NNNNNNN)re   rf   rg   r   r   r)   r   rN  rZ  r]  r   r   r   rD   r   rh   r
   FloatTensorboolr   r   r   rb   rn   ro   s   @r6   rG  rG    s
   z bll ",,    .2.204(,26!%26;
##d*;
 t+;
 &&-	;

 ;
 ((4/;
 $;;
 ((4/;
 +,;
 
 ;
    ;
r7   rG  gate_logitsr   c                    | t        | t              syt        | t              rC| d   j                  }t        j                  | D cg c]  }|j                  |       c}d      }t        j                  j                  j                  d      }t        j                  ||d      \  }}	t        j                  j                  j                  |	|      }
|>t        j                  |
j                         d      }t        j                  |d      }n|j                  \  }}|j                  d   ||z  z  }|dddddddf   j                  |||||f      j                  d||      j                        }t        j                   |
j                         |z  d      t        j                   |d      z  }|ddddddf   j                  ||||f      j                  d|      j                  |      }t        j                   ||z  d      t        j                   |d      z  }t        j                   ||j#                  d      z        }||z  S c c}w )a  
    Computes auxiliary load balancing loss as in Switch Transformer - implemented in Pytorch.

    See Switch Transformer (https://huggingface.co/papers/2101.03961) for more details. This function implements the loss
    function presented in equations (4) - (6) of the paper. It aims at penalizing cases where the routing between
    experts is too unbalanced.

    Args:
        gate_logits:
            Logits from the `gate`, should be a tuple of model.config.num_hidden_layers tensors of
            shape [batch_size X sequence_length, num_experts].
        num_experts:
            Number of experts
        top_k:
            The number of experts to route per-token, can be also interpreted as the `top-k` routing
            parameter.
        attention_mask (`torch.Tensor`, *optional*):
            The attention_mask used in forward function
            shape [batch_size X sequence_length] if not None.

    Returns:
        The auxiliary loss.
    Nr   rR   rM   )rU   rl   r3   rD   rY   rG   r   r   r   r  r   r5  rH   rT   rS   r   r   rv   )rk  r   r  r   compute_device
layer_gateconcatenated_gate_logitsrouting_weightsr'  selected_expertsr   tokens_per_expertrouter_prob_per_expertr   sequence_lengthre  expert_attention_mask router_per_expert_attention_maskoverall_losss                      r6   load_balancing_loss_funcrx  >  s9   : *[%"@+u%$Q..#(99^i-jPZjmmN.K-jpq#r hh))112JPR1SO**_eDA((%%--.>LK!JJ{'8'8':B "'O!C&4&:&:#
O4::1=*B^_ 4AtT12V&
OUKXYWR,R	 	 "IIk&7&7&9<Q&QWXY\a\e\e!q]
 
 4At+,V&
O[QRWR%R	 	) "'?=]+]cd!ehmhqhq,!i
 "
 99.1G1Q1QRS1TTUL+%%[ .ks   Ic                       e Zd ZddiZddiZddgdgfiZdef fdZd	ej                  fd
Z
dej                  fdZd	ej                  fdZdej                  fdZdefdZd	efdZee	 	 	 	 	 	 	 	 	 	 d dej*                  dz  dej,                  dz  dej*                  dz  dedz  dej0                  dz  dej*                  dz  dedz  dedz  dej*                  dz  deej,                  z  dee   d	efd              Z xZS )!DbrxForCausalLMzlm_head.weightztransformer.wte.weightlm_headcolwise_gather_outputr}   logitsr"   c                    t         |   |       t        |      | _        |j                  | _        t        j                  |j                  |j                  d      | _        |j                  j                  | _        |j                  j                  | _        |j                  j                  | _        | j!                          y r   )r(   r)   rG  r2  rK  r   r   rB   r{  r
  moe_loss_weightrouter_aux_loss_coefr   r   r  num_experts_per_tokrV  r   s     r6   r)   zDbrxForCausalLM.__init__  s     $V, ++yy!3!3V5F5FUS$*$5$5$E$E!!,,<<#)#4#4#>#> r7   r9   c                 6    | j                   j                         S rc   )r2  rZ  rY  s    r6   rZ  z$DbrxForCausalLM.get_input_embeddings  s    4466r7   r   c                 :    | j                   j                  |       y rc   )r2  r]  r\  s     r6   r]  z$DbrxForCausalLM.set_input_embeddings  s    --e4r7   c                     | j                   S rc   r{  rY  s    r6   get_output_embeddingsz%DbrxForCausalLM.get_output_embeddings  s    ||r7   new_embeddingsc                     || _         y rc   r  )r2   r  s     r6   set_output_embeddingsz%DbrxForCausalLM.set_output_embeddings  s	    %r7   decoderc                     || _         y rc   r2  )r2   r  s     r6   set_decoderzDbrxForCausalLM.set_decoder  s
    "r7   c                     | j                   S rc   r  rY  s    r6   get_decoderzDbrxForCausalLM.get_decoder  s    r7   Nr^  r   r]   r   r_  labelsr`  output_router_logitsr   logits_to_keepr   c                 l   ||n| j                   j                  } | j                  d||||||||	d|}|j                  }t	        |
t
              rt        |
 d      n|
}| j                  |dd|ddf         }d}| | j                  ||| j                  fi |}d}|rYt        |j                  | j                  | j                  |      }|+|| j                  |j                  |j                         z  z  }t#        ||||j$                  |j&                  |j(                  |j                        S )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >> from transformers import AutoTokenizer, DbrxForCausalLM

        >> model = DbrxForCausalLM.from_pretrained("transformers-community/dbrx-instruct")
        >> tokenizer = AutoTokenizer.from_pretrained("transformers-community/dbrx-instruct")

        >> prompt = "Hey, are you conscious? Can you talk to me?"
        >> inputs = tokenizer(prompt, return_tensors="pt")

        >> # Generate
        >> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```
        N)r^  r   r]   r   r_  r`  r  r   )lossaux_lossr}  r   r}   r3  r  r%  )r"   r  r2  rb  rU   rk   slicer{  loss_functionrK  rx  r  r   r  r  rG   r3   r   r   r}   r3  )r2   r^  r   r]   r   r_  r  r`  r  r   r  r   outputsr}   slice_indicesr}  r  r  s                     r6   rb   zDbrxForCausalLM.forward  sZ   P %9$D $++JjJj 	
 +;$*:*: 
+
)%+'!5)
+
 
+
  118B>SV8W~ot4]kmA}a,?@A%4%%ffdooPPD/%%  ((	H !11HKK4LLL(#33!//))!//
 	
r7   )
NNNNNNNNNr   ) re   rf   rg   _tied_weights_keys_tp_plan_pp_planr   r)   r   rN  rZ  r]  r   r  r  rG  r  r  r   r   rD   r   rh   r
   ri  rj  rk   r   r   r   rb   rn   ro   s   @r6   rz  rz    s   *,DE23H_-z:;Hz 7bll 75",, 5ryy &BII &#9 # Y    .2.204(,26*.!%,026-.R
##d*R
 t+R
 &&-	R

 R
 ((4/R
   4'R
 $;R
 #TkR
 ((4/R
 ell*R
 +,R
 
#R
  R
r7   rz  )rz  rG  r1  )r   )r   )Nr>   N)Ccollections.abcr   typingr   r   rD   r    r   r9  activationsr	   cache_utilsr
   r   
generationr   integrationsr   masking_utilsr   modeling_layersr   modeling_outputsr   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   utils.genericr   r   utils.output_capturingr   configuration_dbrxr   rE  r    rs   r|   rh   rk   r   rH   r   r   r   r   r   r  r  r)  r1  rG  rl   rx  rz  __all__r%  r7   r6   <module>r     s.  * %     & ! . ) 4 / 9 Q K F & I I G 5 *><")) ><B( *+ ,2	UU\\ 	U# 	U%,, 	U& %II%<<% 
% <<	%
 LL4'% % % '(%2U)BII U)pBII 2$")) $N "bii 6'.BII '.T!* !H7/ 74 [
# [
 [
@ #
*.	O&ell 33d:O&tO& LL4'	O&
 \\CO&du
)? u
p Br7   