
    qiH                        d dl mZ d dlZd dlmZ ddlmZmZ ddlm	Z	m
Z
 ddlmZmZ ddlmZ ddlmZmZ dd	lmZ dd
lmZ ddlmZmZ ddlmZ ddlmZmZmZm Z m!Z!m"Z"m#Z#m$Z$ ddl%m&Z&  ejN                  e(      Z) G d de	      Z* G d de"      Z+ G d de       Z, G d de      Z- G d de      Z. G d de!      Z/ G d de&      Z0 G d de      Z1g d Z2y)!    )CallableN   )CacheDynamicCache)PreTrainedConfiglayer_type_validation)create_causal_mask!create_sliding_window_causal_mask)BaseModelOutputWithPast)RopeParametersdynamic_rope_update)ALL_ATTENTION_FUNCTIONS)Unpack)TransformersKwargslogging)maybe_autocast   )CohereAttentionCohereDecoderLayerCohereForCausalLMCohereLayerNormCoherePreTrainedModelCohereRotaryEmbeddingapply_rotary_pos_embeager_attention_forward)Gemma2Modelc            ,       |    e Zd ZdZdZdgZddddddddZdgdgfd	d
gd	gfd	gd	gfdZ	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d#dedz  dedz  dedz  de	dz  dedz  dedz  dedz  de
dz  dedz  de	dz  dedz  dedz  dedz  dedz  dedz  dedz  deee
ef   z  dz  dedz  de	dz  d edz  d!ee
   dz  f* fd"Z xZS )$Cohere2Configac  
    This is the configuration class to store the configuration of a [`CohereModel`]. It is used to instantiate an Cohere
    model according to the specified arguments, defining the model architecture.

    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PreTrainedConfig`] for more information. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the [CohereForAI/c4ai-command-r-v01](https://huggingface.co/CohereForAI/c4ai-command-r-v01) model.


    Args:
        vocab_size (`int`, *optional*, defaults to 256000):
            Vocabulary size of the Cohere model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`CohereModel`]
        hidden_size (`int`, *optional*, defaults to 8192):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 22528):
            Dimension of the MLP representations.
        logit_scale (`float`, *optional*, defaults to 0.0625):
            The scaling factor for the output logits.
        num_hidden_layers (`int`, *optional*, defaults to 40):
            Number of hidden layers in the Transformer decoder.
        num_attention_heads (`int`, *optional*, defaults to 64):
            Number of attention heads for each attention layer in the Transformer decoder.
        num_key_value_heads (`int`, *optional*):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details, check out [this
            paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to
            `num_attention_heads`.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 8192):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        layer_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the layer normalization.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        pad_token_id (`int`, *optional*, defaults to 0):
            Padding token id.
        bos_token_id (`int`, *optional*, defaults to 5):
            Beginning of stream token id.
        eos_token_id (`int`, *optional*, defaults to 255001):
            End of stream token id.
        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
            Whether to tie weight embeddings
        rope_parameters (`RopeParameters`, *optional*):
            Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain
            a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE
            with longer `max_position_embeddings`.
        attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        sliding_window (`int`, *optional*, defaults to 4096):
            Size of the sliding window attention context.
        layer_types (`list`, *optional*):
            Attention pattern for each layer.

    ```python
    >>> from transformers import Cohere2Model, Cohere2Config

    >>> # Initializing a Cohere Nextmodel configuration
    >>> configuration = Cohere2Config()

    >>> # Initializing a model from the Cohere2 configuration
    >>> model = Cohere2Model(configuration) # doctest: +SKIP

    >>> # Accessing the model configuration
    >>> configuration = model.config # doctest: +SKIP
    ```
    cohere2past_key_valuescolwiserowwise)zlayers.*.self_attn.q_projzlayers.*.self_attn.k_projzlayers.*.self_attn.v_projzlayers.*.self_attn.o_projzlayers.*.mlp.gate_projzlayers.*.mlp.up_projzlayers.*.mlp.down_proj	input_idsinputs_embedshidden_statesattention_mask)embed_tokenslayersnormN
vocab_sizehidden_sizeintermediate_sizelogit_scalenum_hidden_layersnum_attention_headsnum_key_value_heads
hidden_actmax_position_embeddingsinitializer_rangelayer_norm_eps	use_cachepad_token_idbos_token_ideos_token_idtie_word_embeddingsrope_parametersattention_biasattention_dropoutsliding_windowlayer_typesc                    || _         |	| _        || _        || _        || _        || _        || _        ||}|| _        || _        |
| _	        || _
        || _        || _        || _        || _        || _        ||z  | _        || _        || _        || _        || _        |j+                  dd      | _        | j                  Wt/        | dd      | _        t1        | j
                        D cg c]!  }t3        |dz   | j,                  z        rdnd# c}| _        t5        | j                  | j
                         || _        t9        | t  di | y c c}w )Nsliding_window_pattern      sliding_attentionfull_attention )r*   r2   r+   r-   r,   r.   r/   r0   r1   r3   r4   r5   r;   r<   r=   r>   head_dimr6   r7   r8   r9   get_sliding_window_patterngetattrrangeboolr   r:   super__init__)selfr*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   r;   r<   r=   r>   kwargsi	__class__s                           ]/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/cohere2/modular_cohere2.pyrM   zCohere2Config.__init__   sn   2 %'>$&&!2!2#6  &"5#6 $!2,",!2,& $'::(((#6  (.zz2JA'N$#+249QST+UD( t556  (,QUd6R6R,R'S#Yii D 	d..0F0FG."6" s   (&E)i      i X  g      ?(   @   NsilurS   g{Gz?gh㈵>Tr      i TNF        i   N)__name__
__module____qualname____doc__
model_typekeys_to_ignore_at_inferencebase_model_tp_planbase_model_pp_planintfloatstrrK   r   dictlistrM   __classcell__rQ   s   @rR   r   r   0   s   KZ J#4"5%.%.%.%."+ )"+ &(9:#%568IJ!"_$56 "("&(-$*(**,*.!'.2*.%) $#$#$#)+/MQ&+*-%)(,-D#$JD# 4ZD# :	D#
 T\D# :D# !4ZD# !4ZD# $JD# "%tD# !4<D# d
D# :D# DjD# DjD#  Dj!D#" "D[#D#$ ($sN/B*CCdJ%D#& t'D#( !4<)D#* d
+D#, #Y%-D# D#    r   c                   D    e Zd Z ej                         ed               Zy)Cohere2RotaryEmbeddingc                    | j                   d d d d f   j                         j                  |j                  d   dd      }|d d d d d f   j                         }t	        |j
                  j                  t              r/|j
                  j                  dk7  r|j
                  j                  nd}t        |d      5  |j                         |j                         z  j                  dd      }t        j                  |dd	      }|j                         | j                  z  }|j                         | j                  z  }	d d d        j                  |j                   
      	j                  |j                   
      fS # 1 sw Y   AxY w)Nr   rB   mpscpuF)device_typeenabledr   )dim)dtype)inv_freqrb   expandshape
isinstancedevicetyperc   r   	transposetorchrepeat_interleavecosattention_scalingsintorr   )
rN   xposition_idsinv_freq_expandedposition_ids_expandedro   freqsembr|   r~   s
             rR   forwardzCohere2RotaryEmbedding.forward   s@    !MM$4-8>>@GGHZHZ[\H]_acde ,QaZ 8 > > @'1!((--'E!((--[`J`ahhmmfkUC 	5&,,.1F1L1L1NNYYZ[]^_E))%;C'')d444C'')d444C		5 vvAGGv$cff177f&;;;	5 	5s   =BFF
N)rY   rZ   r[   rz   no_gradr   r   rE   rh   rR   rj   rj      s$    U]]_<  <rh   rj   c                       e Zd Zy)Cohere2LayerNormNrY   rZ   r[   rE   rh   rR   r   r          rh   r   c                   2   e Zd ZdZddededz  fdZ	 	 ddej                  de	ej                  ej                  f   dej                  dz  d	e
dz  d
ej                  dz  dee   de	ej                  ej                  dz  e	ej                     dz  f   fdZy)Cohere2Attentionz=Multi-headed attention from 'Attention Is All You Need' paperNconfig	layer_idxc                    t         j                  j                  |        || _        || _        t        |d|j                  |j                  z        | _        |j                  |j                  z  | _
        | j                  dz  | _        |j                  | _        d| _        t        |d      r|j                  |   nd }|dk(  r|j                   nd | _        t        j"                  |j                  |j                  | j                  z  |j$                        | _        t        j"                  |j                  |j                  | j                  z  |j$                        | _        t        j"                  |j                  |j                  | j                  z  |j$                        | _        t        j"                  |j                  | j                  z  |j                  |j$                        | _        y )NrF   g      Tr>   rC   )bias)nnModulerM   r   r   rI   r+   r/   rF   r0   num_key_value_groupsscalingr<   	is_causalhasattrr>   r=   Linearr;   q_projk_projv_projo_proj)rN   r   r   
layer_types       rR   rM   zCohere2Attention.__init__   s   
		4 "
F4F4F&JdJd4de$*$>$>&B\B\$\!}}d*!'!9!96=fm6TV''	2Z^
7AEX7Xf33^bii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii&&68J8JQWQfQf
rh   r%   position_embeddingsr&   r    cache_positionrO   returnc                 F   |j                   d d }g |d| j                  }| j                  |      j                  |      j	                  dd      }	| j                  |      j                  |      j	                  dd      }
| j                  |      j                  |      j	                  dd      }|\  }}| j                  t        |	|
||      \  }	}
|'|||d}|j                  |
|| j                  |      \  }
}t        j                  | j                  j                  t              } || |	|
||f| j                   sdn| j"                  | j$                  | j                  d|\  }} |j&                  g |d j)                         }| j+                  |      }||fS )Nrl   rB   r   )r~   r|   r   rX   )dropoutr   r=   )ru   rF   r   viewry   r   r   r=   r   updater   r   get_interfacer   _attn_implementationr   trainingr<   r   reshape
contiguousr   )rN   r%   r   r&   r    r   rO   input_shapehidden_shapequery_states
key_statesvalue_statesr|   r~   cache_kwargsattention_interfaceattn_outputattn_weightss                     rR   r   zCohere2Attention.forward  s    $))#2.88b8$--8{{=166|DNNqRST[[/44\BLLQPQR
{{=166|DNNqRST&S*';L*VY[^'_$L*&#&snUL'6'='=j,X\XfXfht'u$J(?(M(MKK,,.E)
 %8
%
  $}}C$2H2HLL..
%
 
%
!\ *k));;;;FFHkk+.L((rh   N)NN)rY   rZ   r[   r\   r   ra   rM   rz   Tensortupler   
LongTensorr   r   r   rE   rh   rR   r   r      s    G
} 
t 
< )-26*)||*) #5<<#=>*) t+	*)
 *) ((4/*) +,*) 
u||U\\D0%2E2LL	M*)rh   r   c                   D    e Zd Zdedef fdZ	 	 	 	 	 ddej                  deej                  ej                  f   dz  dej                  dz  de	dz  d	e
dz  d
ej                  dz  dee   deej                  eej                  ej                  f   dz  f   fdZ xZS )Cohere2DecoderLayerr   r   c                 N    t         |   ||       |j                  |   | _        y r   )rL   rM   r>   attention_type)rN   r   r   rQ   s      rR   rM   zCohere2DecoderLayer.__init__5  s%    +$00;rh   Nr%   r   r&   r    r5   r   rO   r   c           
          |}| j                  |      } | j                  d||||||d|\  }	}
| j                  |      }||	z   |z   }|S )N)r%   r   r&   r    r5   r   rE   )input_layernorm	self_attnmlp)rN   r%   r   r&   r    r5   r   rO   residualhidden_states_attention_hidden_states_mlps               rR   r   zCohere2DecoderLayer.forward9  sx     !,,];%3T^^ &
' 3)+)&
 &
" !HH]3 #::=NNrh   )NNNFN)rY   rZ   r[   r   ra   rM   rz   r   r   r   rK   r   r   r   FloatTensorr   rf   rg   s   @rR   r   r   4  s    <} < < IM.2(,!&26|| #5<<#=>E t+	
  $; ((4/ +, 
u  %(9(95;L;L(L"MPT"TT	Urh   r   c                       e Zd ZU eed<   y)Cohere2PreTrainedModelr   N)rY   rZ   r[   r   __annotations__rE   rh   rR   r   r   T  s    rh   r   c                        e Zd Zdef fdZ	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dedz  dej                  dz  d	e
dz  d
ej                  dz  dee   defdZ xZS )Cohere2Modelr   c                 p    t         |   |       t        |j                  |j                        | _        y )N)r+   eps)rL   rM   r   r+   r4   r)   )rN   r   rQ   s     rR   rM   zCohere2Model.__init__Y  s*     $&2D2D6K`K`a	rh   Nr#   r&   r   r    r$   r5   r   rO   r   c                 r   |d u |d uz  rt        d      || j                  |      }|r|t        | j                        }|F||j	                         nd}	t        j                  |	|	|j                  d   z   |j                        }||j                  d      }t        |x}
t              s*| j                  |||||d}t        d
i |t        d
i |d}
|}| j                  ||      }| j                  D ]  } ||f|
|j                      |||||d|}! | j#                  |      }t%        ||	      S )Nz:You must specify exactly one of input_ids or inputs_embeds)r   r   rB   )rw   )r   r$   r&   r   r    r   )rD   rC   )r&   r   r    r5   r   r   )last_hidden_stater    rE   )
ValueErrorr'   r   r   get_seq_lengthrz   arangeru   rw   	unsqueezerv   rd   r	   r
   
rotary_embr(   r   r)   r   )rN   r#   r&   r   r    r$   r5   r   rO   past_seen_tokenscausal_mask_mappingmask_kwargsr%   r   decoder_layers                  rR   r   zCohere2Model.forward]  s    -t";<YZZ  --i8M0*$++>O!CRC^==?de"\\ "2]5H5H5K"KTaThThN )33A6L?-F++!."0"0#2 ,K #5"C{"C%F%U%U#
 &"oom\J![[ 
	M)	2=3O3OP$7 /#-)	 	M
	 		-0&++
 	
rh   )NNNNNNN)rY   rZ   r[   r   rM   rz   r   r   r   r   rK   r   r   r   r   rf   rg   s   @rR   r   r   X  s    b} b .2.204(,26!%26=
##d*=
 t+=
 &&-	=

 =
 ((4/=
 $;=
 ((4/=
 +,=
 
!=
rh   r   c                       e Zd Zy)Cohere2ForCausalLMNr   rE   rh   rR   r   r     r   rh   r   )r   r   r   r   )3collections.abcr   rz   torch.nnr   cache_utilsr   r   configuration_utilsr   r   masking_utilsr	   r
   modeling_outputsr   modeling_rope_utilsr   r   modeling_utilsr   processing_utilsr   utilsr   r   utils.genericr   cohere.modeling_coherer   r   r   r   r   r   r   r   gemma2.modeling_gemma2r   
get_loggerrY   loggerr   rj   r   r   r   r   r   r   __all__rE   rh   rR   <module>r      s    %   . J R 7 6 & 0 +	 	 	 1 
		H	%c#$ c#L<2 <"	 	F) F)R, @2 B
; B
J	* 	 \rh   