
    Iui                     0   d Z ddlmZmZmZ ddlZddlmZm	Z	m
Z
  G d de
j                        Z G d de
j                        Z G d	 d
e
j                        Z G d de
j                        Z G d de
j                        Z G d de
j                        Z G d de
j$                        Z G d de
j(                        Z G d de
j,                        Z G d de
j0                        Z G d de
j,                        Z G d de
j0                        Zy)z0Declares specification of the Transformer model.    )OptionalTupleUnionN)attention_speccommon_spec
model_specc            /       L   e Zd Zddej                  j
                  dej                  j                  ddddddddddddddddfdedede	d	e	d
ej                  dedej                  de	de	de	de	de	de	de
e   de
e   de
e   de	de
ej                     dedede
e   de
e	   de	f.dZy)TransformerEncoderSpecTF   N'  
num_layers	num_headspre_normno_final_norm
activationnum_source_embeddingsembeddings_mergelayernorm_embeddingrelative_positionrelative_attention_biasffn_glurms_normmulti_query_attentionnum_heads_kvhead_dim
rotary_dimrotary_interleaverotary_scaling_typerotary_scaling_factorrotary_basesliding_windowqk_normpre_post_layer_normc                     |r||dk7  rt        d      d}|| _        t        j                  d      j	                  |      | _        || _        t        j                  d      j	                  |      | _        t        j                  d      j	                  |      | _        t        |      D cg c]  }t        j                          c}| _        d| _        |	s|
st               | _        |r|st        j                   |      | _        |rt        j                   |      | _        |)t        j                  d      j	                  |      | _        t        |      D cg c]  }t)        |	|
|||||||||||	       c}| _        yc c}w c c}w )
a'  Initializes a Transformer encoder specification.

        Args:
          num_layers: Number of layers.
          num_heads: Number of attention heads.
          pre_norm: Enable the pre-norm Transformer architecture.
          no_final_norm: Disable the final layer norm in the pre-norm architecture.
          activation: Activation to apply in the feed-forward network.
          num_source_embeddings: Number of source embeddings.
          embeddings_merge: When :obj:`num_source_embeddings` > 1, specify how the
            embeddings are merged.
          layernorm_embedding: Apply layer normalization after the embedding layer.
          relative_position: Use relative position representations in the self-attention
            layers as described in https://arxiv.org/abs/1803.02155.
          relative_attention_bias: Use relative attention bias in the self-attention
            layers as described in the T5 paper https://arxiv.org/abs/1910.10683.
          ffn_glu: Use gated linear units in the FFN layers as described in
            https://arxiv.org/abs/2002.05202.
          rms_norm: Use the root mean square layer normalization.
          multi_query_attention: Use multi-query attention (alias for num_heads_kv=1).
          num_heads_kv: Number of attention heads for the key and value.
          head_dim: Number of dimensions per attention head.
          rotary_dim: Apply rotary embeddings to these first N dimensions. If 0, rotary
            embeddings are applied to all dimensions.
          rotary_interleave: Interleave the head dimensions when rotary embeddings are applied.
            Otherwise the head dimensions are sliced in half.
          rotary_scaling_type: Type of RoPE scaling.
          rotary_scaling_factor: Factor used in the RoPE scaling.
          rotary_base: The base period of the rotary embeddings.
          sliding_window: Max sequence length to retain in KV Cache.
          qk_norm: Apply layer normalization to the query and key projections.
          pre_post_layer_norm: Add post layer norm for each pre norm layer.
        Nr   5Enabling multi_query_attention implies num_heads_kv=1int16int8Tr   int32)r   r   r   r   r   r   r   r   r   r   r    r"   r#   )
ValueErrorr   npdtypetyper   r   r   r   ranger   EmbeddingsSpec
embeddingsscale_embeddingsPositionEncoderSpecposition_encodingsLayerNormSpec
layer_normr   r!   TransformerEncoderLayerSpeclayer)selfr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r    r!   r"   r#   _s                            T/opt/pipecat/venv/lib/python3.12/site-packages/ctranslate2/specs/transformer_spec.py__init__zTransformerEncoderSpec.__init__   sw   x !'LA,= K  L%:"'*//	: ((6*//
; " 0 5 56F G278M2N
-.K&&(
 !% )@&9&;D#M)77JDO'2'@'@('SD$%"$((7"3"8"8"HD$ :&!
   ("3(?!)!%"3$7&;'$7



s   -FF)__name__
__module____qualname__r   
ActivationRELUEmbeddingsMergeCONCATintboolr   r   RotaryScalingTypefloatr;        r:   r
   r
   
   s   
 #-8-C-C-H-H%&8C8S8S8Z8Z$)"'(-&+&*"&$("&JN'("(,"'$)1f
f
 f
 	f

 f
  **f
  #f
 &55f
 "f
  f
 "&f
 f
 f
  $f
 smf
  3-!f
" SM#f
$  %f
& &n&F&FG'f
(  %)f
* +f
, !-f
. $/f
0 "1f
rH   r
   c            J          e Zd Zdej                  j
                  ddddddddddddddddddddddddddddddddf"ded	ed
edej                  dedededededededededededededee   dedee	j                     dedededed ed!ed"ed#ed$ee   d%ee   d&ee   d'eej                     d(ee   d)ee   d*ed+ee   fHd,Zed-        Zy).TransformerDecoderSpecTFr   Nr   r   r   r   r   r   r   with_encoder_attentionr   project_in_outr   r   alignment_layeralignment_headsr   r   alibialibi_use_positive_positionsscale_alibir   r   r   r   r     original_max_position_embeddingsmax_position_embeddingsparallel_residualshared_layer_normr#   r   r   r   r!   
quant_typequant_group_size
quant_bitsr"    external_pre_post_encoder_layersc%           	      l   t               | _        |r|st        d      |rt        d      |r||dk7  rt        d      d}t        j                  d      j                  |      | _        || _        t        j                  d      j                  |      | _        t        j                  d      j                  |      | _	        t        j                  d      j                  |      | _
        t        j                         | _        d| _        t        j                   | _        || _        || _        || _        |)t        j                  d	      j                  |      | _        |	s|
s|s|t-               | _        |r|st        j0                  |
      | _        |rt        j0                  |
      | _        t        j6                         | _        t;        |      D %cg c]G  }%t=        d$i d|d|	d|
d|d|d|d|d|d|d|d|d|d|d|d|d|d|d|d|#d|$I c}%| _        d| _         |xs ||k7  | j                  d <   |r2t        j6                         | _!        t        j6                         | _"        | r.| | j                  d!<   |"| j                  d"<   |!| j                  d#<   yyc c}%w )%a.  Initializes a Transformer decoder specification.

        Args:
          num_layers: Number of layers.
          num_heads: Number of attention heads.
          pre_norm: Enable the pre-norm Transformer architecture.
          activation: Activation to apply in the feed-forward network.
          layernorm_embedding: Apply layer normalization after the embedding layer.
          with_encoder_attention: Enable the encoder attention sublayers.
          no_final_norm: Disable the final layer norm in the pre-norm architecture.
          project_in_out: Add linear transformations after the embedding layer and before
            the final layer.
          relative_position: Use relative position representations in the self-attention
            layers as described in https://arxiv.org/abs/1803.02155.
          relative_attention_bias: Use relative attention bias in the self-attention
            layers as described in the T5 paper https://arxiv.org/abs/1910.10683.
          alignment_layer: Layer index selected for alignment.
          alignment_heads: Number of attention heads selected for alignment.
          ffn_glu: Use gated linear units in the FFN layers as described in
            https://arxiv.org/abs/2002.05202.
          rms_norm: Use the root mean square layer normalization.
          alibi: Use attention with linear biases.
          alibi_use_positive_positions: Use positive positions in the ALiBi definition.
          scale_alibi: Apply the dot product scale factor to ALiBi.
          rotary_dim: Apply rotary embeddings to these first N dimensions. If 0, rotary
            embeddings are applied to all dimensions.
          rotary_interleave: Interleave the head dimensions when rotary embeddings are applied.
            Otherwise the head dimensions are sliced in half.
          rotary_scaling_type: Type of RoPE scaling.
          rotary_scaling_factor: Factor used in the RoPE scaling.
          rotary_base: The base period of the rotary embeddings.
          original_max_position_embeddings: The original max position embeddings
            for Su rope embeddings
          max_position_embeddings: The max position embeddings for Su rope embeddings
          parallel_residual: Use parallel residual connections in each layer block, as used
            by the GPT-J and GPT-NeoX models.
          shared_layer_norm: When using parallel residual, share the input and post
            attention layer norms.
          pre_post_layer_norm: Add post layer norm for each pre norm layer
          multi_query_attention: Use multi-query attention (alias for num_heads_kv=1).
          num_heads_kv: Number of attention heads for the key and value.
          sliding_window: Max sequence length to retain in KV Cache.
          quant_type: quantization type used (like awq... for lower bit quantization)
          quant_group_size: group size of the lower bit quantization
          quant_bits: number of bit of the quantization (ex: 4bit)
          external_pre_post_encoder_layers: if the encoder attention pre and processing
            is done outside the attention.
        z/The GPT-J block expects a pre-norm architecturez-The GPT-J block does not have cross attentionNr   r%   r&   r'   Tr)   r(   rL   r   r   r   r   r   r   r   r   r    rS   rT   rU   rV   r#   r   r   r!   r"   rZ   Fr   quantization_typequantization_bitsquantization_group_sizerG   )#dict_configr*   r+   r,   r-   r   r   r   rN   rO   r   r/   r0   r1   r   OPTIONALscale_outputsrP   rQ   rR   r!   r2   r3   r4   r5   r   
LinearSpec
projectionr.   TransformerDecoderLayerSpecr7   start_from_zero_embedding
project_inproject_out)&r8   r   r   r   r   r   rL   r   rM   r   r   rN   rO   r   r   rP   rQ   rR   r   r   r   r   r    rS   rT   rU   rV   r#   r   r   r   r!   rW   rX   rY   r"   rZ   r9   s&                                         r:   r;   zTransformerDecoderSpec.__init__u   s   p v !RSS% !PQQ 'LA,= K  L'*//	: ((6*//
;!xx055oF!xx055oF%446 $'00
,H)&%"$((7"3"8"8"HD!+"&9&;D#M)77JDO'2'@'@('SD$%0020 :&/
. - ( '="3 )@  	
 " & #4 %8 '< ( 2R )@ #4 #4 %8  *!" "#$  .%&  '( 2R)

2 */&0E 1
I% 	,- )446DO*557D0:DLL,-0:DLL,-6FDLL23 E
s   AJ1c                     | j                   S N)r`   r8   s    r:   configzTransformerDecoderSpec.config  s    ||rH   )r<   r=   r>   r   r?   r@   rC   rD   r   r   rE   rF   Quantizationr;   propertyrl   rG   rH   r:   rJ   rJ   t   s   
 -8-C-C-H-H$)'+#$"'(-! -2!$("&JN'("01'("'"'$)&+&*"&(,9=*.$(;@KdGdG dG 	dG
  **dG "dG !%dG dG dG  dG "&dG dG dG dG dG  !dG" '+#dG$ %dG& SM'dG(  )dG* &n&F&FG+dG,  %-dG. /dG0 +.1dG2 "%3dG4  5dG6  7dG8 "9dG:  $;dG< sm=dG> 3-?dG@ !AdGB [556CdGD #3-EdGF SMGdGH IdGJ +34.KdGL  rH   rJ   c                   j    e Zd Z	 	 	 	 	 	 	 	 	 	 	 	 	 	 d	dee   dedeej                     dededefdZ	y)
r6   Nr   r   r   r   r    r#   c                    t        j                  d||||||||	|
|||      | _        t        ||      | _        |rt        j                  |      | _        t        j                  |      | _        t        j                  |      | _	        t        j                  |      | _
        t        | j                  d       t        | j                  d       y y )NT)self_attentionr   r   r   r   r   r!   r   r   r   r   r    r"   glur   r(   r5   )r   MultiHeadAttentionSpecrq   FeedForwardSpecffnr   r4   input_layer_normpost_attention_layer_normpre_feedforward_layer_normpost_feedforward_layer_normdelattr)r8   r   r   r   r   r   r   r!   r   r   r   r   r    r"   r#   s                  r:   r;   z$TransformerEncoderLayerSpec.__init__!  s    " -CC/$;%)!/ 3"7#
 #wB$/$=$=x$PD!-8-F-F!.D* /:.G.G!/D+ 0;/H/H!0D, D''6DHHl+ rH   )FFFFNNNNTNr   r   FF)
r<   r=   r>   r   rC   rD   r   rE   rF   r;   rG   rH   r:   r6   r6      s~       %$("&JN'("$)/, SM/,  /, &n&F&FG/,  %/, /, "/,rH   r6   c                   <    e Zd Z	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddZy)re   Nc                 d   t        j                  d|||||||	|
||||||      | _        |r"t        j                  ||||||du       | _        t	        ||      | _        |rz|rt        j                         | _        n2t        j                         | _	        t        j                         | _
        t        | j                  d       t        | j
                  d       |rt        j                  |      | _	        t        j                  |      | _
        |r8|r6t        j                  |      | _        t        j                  |      | _        t        j                  |      | _        t        j                  |      | _        t        | j                  d       t        | j
                  d       y y )NT)rq   r   r   r   r   r   r   r   r    rS   rT   r   r   r!   r"   F)r   r   r   r!   r"   has_normrr   r5   r(   )r   rt   rq   	attentionru   rv   r   r4   rV   rw   rx   r{   *external_post_encoder_attention_layer_norm)external_pre_encoder_attention_layer_normry   rz   )r8   rL   r   r   r   r   r   r   r   r   r    rS   rT   rU   rV   r#   r   r   r!   r"   rZ   s                        r:   r;   z$TransformerDecoderLayerSpec.__init__T  s   . -CC/$;!/ 3"7#-M$;%)
$ "+BB!)!-9UBDN #wB )4)B)B)D&(3(A(A(C%1<1J1J1L.D''6DHHl+$/$=$=x$PD!-8-F-F!.D* &*J--x@ ?  --x@ >
 /:.G.G!/D+ 0;/H/H!0D, D''6DHHl+1 rH   )TFFFFNTNr   r   r   r   FFFNNNFFr<   r=   r>   r;   rG   rH   r:   re   re   S  sF      $ % )* !!).+W,rH   re   c                       e Zd ZddZy)ru   c                     t        j                  |      | _        t        j                         | _        t        j                         | _        |rt        j                         | _        y y )Nr(   )r   r4   r5   rc   linear_0linear_1linear_0_noact)r8   rs   r   s      r:   r;   zFeedForwardSpec.__init__  sM    %33XF#..0#..0"-"8"8":D rH   N)FFr   rG   rH   r:   ru   ru     s    ;rH   ru   c                       e Zd Zd Zy)r2   c                 .    t         j                  | _        y rj   )r   ra   	encodingsrk   s    r:   r;   zPositionEncoderSpec.__init__  s    #,,rH   Nr   rG   rH   r:   r2   r2     s    -rH   r2   c                   0     e Zd ZdZddee   f fdZ xZS )TransformerConfigz%Configuration for Transformer models.layer_norm_epsilonc                 (    t        |   dd|i| y)zInitializes the configuration for Transformer models.

        Args:
          layer_norm_epsilon: The layer norm epsilon value.
          **kwargs: Additional configuration.
        r   NrG   superr;   r8   r   kwargs	__class__s      r:   r;   zTransformerConfig.__init__       	I,>I&IrH   rj   r<   r=   r>   __doc__r   rF   r;   __classcell__r   s   @r:   r   r     s    /J8E? J JrH   r   c                    T    e Zd ZdZdedef fdZedddej                  j                  dddej                  j                  dddddfd	eeeeef   f   d
ededededej                  dedededej                  dededededefd       Zed        Zed        Zd Zd Zd Z xZS )TransformerSpeczDescribes a Transformer model.

    The specification is invariant to hidden dimensions but requires to
    explicitly set the number of layers and attention heads.
    encoderdecoderc                 
   t        |t              st        d      t        |t              st        d      t        |           || _        || _        | j                  j                  d| j                  j                         y)zInitializes a Transformer model specification.

        Args:
          encoder: The encoder specification.
          decoder: The decoder specification.
        1encoder argument must be a TransformerEncoderSpec1decoder argument must be a TransformerDecoderSpecr   N)
isinstancer
   	TypeErrorrJ   r   r;   r   r   r`   add_attributer   )r8   r   r   r   s      r:   r;   zTransformerSpec.__init__  sm     '#9:OPP'#9:OPP""#T\\%G%G	
rH   FTrK   r   r   r   with_relative_positionr   r   r   rN   rO   r   r   r   r   r   r   r   c                     t        |t        t        f      r|\  }}n||}}t        ||||||	|
||||||      }t	        |||||||||||||      } | ||      S )a  Creates a Transformer model specification.

        Args:
          num_layers: Number of encoder and decoder layers, or a 2-tuple if the
            number is different.
          num_heads: Number of attention heads.
          with_relative_position: Use relative position representations in the self-attention
            layers as described in https://arxiv.org/abs/1803.02155.
          pre_norm: Enable the pre-norm Transformer architecture.
          no_final_norm: Disable the final layer norm in the pre-norm architecture.
          activation: Activation to apply in the feed-forward network.
          alignment_layer: Layer index selected for alignment.
          alignment_heads: Number of attention heads selected for alignment.
          num_source_embeddings: Number of source embeddings.
          embeddings_merge: When :obj:`num_source_embeddings` > 1, specify how the
            embeddings are merged.
          layernorm_embedding: Apply layer normalization after the embedding layer.
          relative_attention_bias: Use relative attention bias in the self-attention
            layers as described in the T5 paper https://arxiv.org/abs/1910.10683.
          ffn_glu: Use gated linear units in the FFN layer as described in
            https://arxiv.org/abs/2002.05202.
          rms_norm: Use the root mean square layer normalization.
          multi_query_attention: Use multi-query attention.
        )r   r   r   r   r   r   r   r   r   r   r   )r   r   r   r   r   r   rN   rO   r   r   r   )r   listtupler
   rJ   )clsr   r   r   r   r   r   rN   rO   r   r   r   r   r   r   r   num_encoder_layersnum_decoder_layersr   r   s                       r:   from_configzTransformerSpec.from_config  s    V j4-05?2 25? 2('!"7- 34$;"7
  )'! 34$;++"7
  7G$$rH   c                      y)Nr   rG   rk   s    r:   namezTransformerSpec.name7  s     rH   c                      y)N   rG   rk   s    r:   revisionzTransformerSpec.revision;      rH   c                     t               S rj   )r   rk   s    r:   get_default_configz"TransformerSpec.get_default_config?  s     ""rH   c                     | j                   j                  D cg c]  }|j                  j                  d    c}S c c}w Nr   r   r0   weightshape)r8   specs     r:   get_source_vocabulary_sizez*TransformerSpec.get_source_vocabulary_sizeB  s/    151H1HI!!!$IIIs    <c                 \    | j                   j                  j                  j                  d   S r   r   r0   r   r   rk   s    r:   get_target_vocabulary_sizez*TransformerSpec.get_target_vocabulary_sizeE  #    ||&&--33A66rH   )r<   r=   r>   r   r
   rJ   r;   classmethodr   r?   r@   rA   rB   r   rC   r   rD   r   rn   r   r   r   r   r   r   r   s   @r:   r   r     sn   
-
8N
* 
 (-#-8-C-C-H-H! %&8C8S8S8Z8Z$)(-&+!O%#uS#X./O% O% !%	O%
 O% O%  **O% O% O%  #O% &55O% "O% "&O% O% O%   $!O% O%b ! !  #J7rH   r   c                   0     e Zd ZdZddee   f fdZ xZS )TransformerDecoderModelConfigz-Configuration for Transformer decoder models.r   c                 (    t        |   dd|i| y)zInitializes the configuration for Transformer decoder models.

        Args:
          layer_norm_epsilon: The layer norm epsilon value.
          **kwargs: Additional configuration.
        r   NrG   r   r   s      r:   r;   z&TransformerDecoderModelConfig.__init__L  r   rH   rj   r   r   s   @r:   r   r   I      7J8E? J JrH   r   c            @           e Zd ZdZdef fdZedej                  j                  ddddddddddddddd	d	dddddddddddfd
e
de
dedej                  dedededededededededee
   dedeej                     dedede
de
deded ed!ed"ee
   d#ee
   d$ee
   d%eej                      d&ee
   d'ee
   d(ef>d)       Zed*        Zed+        Zd, Zd- Z xZS ).TransformerDecoderModelSpecz3Describes a Transformer decoder model (e.g. GPT-2).r   c                     t        |t              st        d      t        |           || _        | j
                  j                  j                         D ]!  \  }}| j                  j                  ||       # y)z|Initializes a Transformer decoder model specification.

        Args:
          decoder: The decoder specification.
        r   N)
r   rJ   r   r   r;   r   rl   itemsr`   r   )r8   r   keyvaluer   s       r:   r;   z$TransformerDecoderModelSpec.__init__Y  sh     '#9:OPP,,--335 	3JCLL&&sE2	3rH   TFNr   r   r   r   r   r   r   r   r   rM   r   r   r   rP   rQ   rR   r   r   r   r   r    rS   rT   rU   rV   r#   r   r   r   r!   rW   rX   rY   r"   c                      t        ||fi d|d|d|ddd|d|d|d	|	d
|
d|d|d|d|d|d|d|d|d|d|d|d|d|d|d|d|d|d|d|d|d|}  | |       S ) a!
  Creates a Transformer decoder model specification.

        Args:
          num_layers: Number of decoder layers.
          num_heads: Number of attention heads.
          pre_norm: Enable the pre-norm Transformer architecture.
          activation: Activation to apply in the feed-forward network.
          layernorm_embedding: Apply layer normalization after the embedding layer.
          no_final_norm: Do not apply layer normalization after the last decoder block.
          project_in_out: Add a linear layer after the embedding layer and another one
            before the final output projection.
          with_relative_position: Enable relative position representations modules.
          ffn_glu: Use gated linear units in the FFN layers as described in
            https://arxiv.org/abs/2002.05202.
          rms_norm: Use the root mean square layer normalization.
          alibi: Use attention with linear biases.
          alibi_use_positive_positions: Use positive positions in the ALiBi definition.
          scale_alibi: Apply the dot product scale factor to ALiBi.
          rotary_dim: Apply rotary embeddings to these first N dimensions. If 0, rotary
            embeddings are applied to all dimensions.
          rotary_interleave: Interleave the head dimensions when rotary embeddings are applied.
            Otherwise the head dimensions are sliced in half.
          rotary_scaling_type: Type of RoPE scaling.
          rotary_scaling_factor: Factor used in the RoPE scaling.
          rotary_base: The base period of the rotary embeddings.
          original_max_position_embeddings: The original max position embeddings
            for Su rope embeddings
          max_position_embeddings: The max position embeddings for Su rope embeddings
          parallel_residual: Use parallel residual connections in each layer block, as used
            by the GPT-J and GPT-NeoX models.
          shared_layer_norm: When using parallel residual, share the input and post
            attention layer norms.
          pre_post_layer_norm: add post layer norm for each pre norm layer
          multi_query_attention: Use multi-query attention (alias for num_heads_kv=1).
          num_heads_kv: Number of attention heads for the key and value.
          head_dim: Number of head
          sliding_window: max sequence length to retain KV cache
          quant_type: quantization type used (like awq... for lower bit quantization)
          quant_group_size: group size of the lower bit quantization
          quant_bits: number of bit of the quantization (ex: 4bit)
        r   r   r   rL   Fr   rM   r   r   r   rP   rQ   rR   r   r   r   r   r    rS   rT   rU   rV   r#   r   r   r   r!   rW   rX   rY   r"   )rJ   )!r   r   r   r   r   r   r   rM   r   r   r   rP   rQ   rR   r   r   r   r   r    rS   rT   rU   rV   r#   r   r   r   r!   rW   rX   rY   r"   r   s!                                    r:   r   z'TransformerDecoderModelSpec.from_configg  sR   X )!
 !
 "	!

 !4!
 $)!
 (!
 *!
 5!
 !
 !
 !
 *F!
 $!
 "!
  0!!
" !4#!
$ #8%!
& $'!
( .N)!
* %<+!
, 0-!
. 0/!
0 !41!
2 #83!
4 &5!
6 7!
8 *9!
: ";!
< .=!
> "?!
@ A!
F 7|rH   c                      y)NrJ   rG   rk   s    r:   r   z TransformerDecoderModelSpec.name      'rH   c                      y)N   rG   rk   s    r:   r   z$TransformerDecoderModelSpec.revision  r   rH   c                     t               S rj   )r   rk   s    r:   r   z.TransformerDecoderModelSpec.get_default_config      ,..rH   c                 \    | j                   j                  j                  j                  d   S r   r   rk   s    r:   get_vocabulary_sizez/TransformerDecoderModelSpec.get_vocabulary_size  r   rH   )r<   r=   r>   r   rJ   r;   r   r   r?   r@   rC   rD   r   r   rE   rF   rm   r   rn   r   r   r   r   r   r   s   @r:   r   r   V  sK   =3 6 3 
 -8-C-C-H-H$)#$',-2!$("&JN'("01'("'"'$)&+&*"&(,9=*.$(Ann n 	n
  **n "n n n !%n n n n '+n n SMn   !n" &n&F&FG#n$  %%n& 'n( +.)n* "%+n,  -n.  /n0 "1n2  $3n4 sm5n6 3-7n8 !9n: [556;n< #3-=n> SM?n@ An n` ( (  /7rH   r   c                   0     e Zd ZdZddee   f fdZ xZS )TransformerEncoderModelConfigz-Configuration for Transformer encoder models.r   c                 (    t        |   dd|i| y)zInitializes the configuration for Transformer encoder models.

        Args:
          layer_norm_epsilon: The layer norm epsilon value.
          **kwargs: Additional configuration.
        r   NrG   r   r   s      r:   r;   z&TransformerEncoderModelConfig.__init__  r   rH   rj   r   r   s   @r:   r   r     r   rH   r   c                        e Zd ZdZdej
                  j                  fdededej
                  f fdZ	e
d        Ze
d        Zd	 Zd
 Z xZS )TransformerEncoderModelSpecz2Describes a Transformer encoder model (e.g. BERT).Fr   pooling_layerpooling_activationc                 P   t        |t              st        d      t        |           || _        | j                  j                  d| j
                  j                         |rCt        j                         | _        t        j                  d      j                  |      | _        yy)zInitializes a Transformer encoder model specification.

        Args:
          encoder: The encoder specification.
          pooling_layer: Add the pooling layer.
          pooling_activation: The activation to apply after the pooling layer.
        r   r   r'   N)r   r
   r   r   r;   r   r`   r   r   r   rc   pooler_denser+   r,   r-   pooler_activation)r8   r   r   r   r   s       r:   r;   z$TransformerEncoderModelSpec.__init__  s     '#9:OPP""#T\\%G%G	
  + 6 6 8D%'XXf%5%:%:;M%ND" rH   c                      y)Nr
   rG   rk   s    r:   r   z TransformerEncoderModelSpec.name  r   rH   c                      y)Nr   rG   rk   s    r:   r   z$TransformerEncoderModelSpec.revision  r   rH   c                     t               S rj   )r   rk   s    r:   r   z.TransformerEncoderModelSpec.get_default_config  r   rH   c                 b    | j                   j                  d   j                  j                  d   S r   r   rk   s    r:   r   z/TransformerEncoderModelSpec.get_vocabulary_size  s(    ||&&q)0066q99rH   )r<   r=   r>   r   r   r?   Tanhr
   rD   r;   rn   r   r   r   r   r   r   s   @r:   r   r     sw    <
 $5@5K5K5P5P	O'O O (22	O4 ( (  /:rH   r   )r   typingr   r   r   numpyr+   ctranslate2.specsr   r   r   	LayerSpecr
   rJ   r6   re   ru   r2   SequenceToSequenceModelConfigr   SequenceToSequenceModelSpecr   LanguageModelConfigr   LanguageModelSpecr   r   r   rG   rH   r:   <module>r      s   6 ) )  E Eg
Z11 g
TiZ11 iX0,*"6"6 0,fX,*"6"6 X,v;j** ;-*.. -

J
@@ 
J}7j<< }7@
JJ$B$B 
JN7*">"> N7b
JJ$B$B 
J):*">"> ):rH   