
    qi\e                     V   d dl Z d dlmZ d dlZd dlmZmZmZmZm	Z	 d dl
mZ d dlmZ ddlmZ ddlmZ dd	lmZmZ dd
lmZ ddlmZ ddlmZmZmZ ddlmZ ddlmZ ddl m!Z!m"Z"m#Z#m$Z$ ddl%m&Z&m'Z' ddl(m)Z)m*Z*m+Z+m,Z, ddl-m.Z. ddl/m0Z0  G d de0e      Z1 G d de.      Z2 G d de'      Z3 G d de&      Z4 G d dejj                        Z6 G d d e"      Z7 G d! d"e!      Z8 G d# d$e*      Z9 G d% d&e)      Z: G d' d(e,      Z; ed)*       G d+ d,e;             Z< G d- d.e+      Z=g d/Z>y)0    N)Callable)	Tokenizerdecodersnormalizerspre_tokenizers
processors)Unigram)nn   )create_bidirectional_mask)BaseModelOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TokenizersBackend)TransformersKwargsauto_docstringcan_return_tuple)merge_with_config_defaults)capture_outputs   )LlamaAttentionLlamaRotaryEmbeddingapply_rotary_pos_embeager_attention_forward)ParakeetCTCConfigParakeetEncoderConfig)ParakeetEncoderBlock ParakeetEncoderConvolutionModuleParakeetForCTCParakeetPreTrainedModel)ParakeetProcessor)T5Tokenizerc                   Z    e Zd Z	 	 	 	 	 	 	 	 d	dZ	 	 	 d
deee   z  dededz  dedef
dZy)LasrTokenizerNc	           	      R   || _         |q|D 
cg c]  }
dt        |
      v s|
 }}
t        |      dk  r!|t        |      D cg c]  }d| d
 c}z  }nC|dkD  r>|t        |      k7  r0t	        d| d| d      t        |      D cg c]  }d| d
 }}|}||| _        nbt        |      dft        |      dft        |      dfd	g| _        t        |dz
  d
d
      D ]#  }| j
                  j                  d| ddf       % t        t        | j
                  dd            | _	        |$t        j                  |      | j                  _        t        j                  t        j                         t        j                   ddd      g      | j                  _        t%        j                   ddd      | j                  _        t)        j*                  d|||||d|	 t-        j.                  ddgg dd| j0                  fg      | j                  _        y c c}
w c c}w c c}w )Nz
<extra_id_   >r   zBoth extra_ids (z!) and additional_special_tokens (zm) are provided to LasrTokenizer. In this case the additional_special_tokens must include the extra_ids tokens        )   ▁g       r   F)unk_idbyte_fallbackr*   alwaysT)replacementprepend_schemesplit)	eos_token	unk_token	pad_token	extra_idsadditional_special_tokens$A</s>)r7   r8   z$Br8   )singlepairspecial_tokens )
_extra_idsstrlenrange
ValueError_vocab_scoresappendr   r	   
_tokenizerr   Precompiled
normalizerr   SequenceWhitespaceSplit	Metaspacepre_tokenizerr   decoderr   __init__r   TemplateProcessingeos_token_idpost_processor)selfr2   r3   r4   _spm_precompiled_charsmapr5   r6   vocab
vocab_filekwargsxextra_tokensis                W/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/lasr/modular_lasr.pyrL   zLasrTokenizer.__init__,   s]    $ %0'@[!LTWXYTZDZA[L[< 1$)yIY-ZA
1#Q.?-ZZ)Q9L0A#A &yk1RSlRm n   8=Y7GH!j1-HLH(4% !&D Y%Y%Y%	"D 9q="b1 D""))Zs!+<c*BCD#""#
 %0)4)@)@AZ)[DOO&(6(?(?..0((U8[_`)
% #+"4"4W_gk"l"" 	
&?	
 	
 *4)F)F&>-**+*
&k \-Z Is   HHHH$	token_idsskip_special_tokensclean_up_tokenization_spacesgroup_tokensreturnc                     t        |t              r|g}|r%t        j                  |      D cg c]  }|d   	 }}|D cg c]  }|| j                  k7  s| }}t        j                  | f|||d|S c c}w c c}w )Nr   )rY   rZ   r[   )
isinstanceint	itertoolsgroupbypad_token_idr   _decode)rP   rY   rZ   r[   r\   rT   token_grouptokens           rX   rd   zLasrTokenizer._decodey   s     i%"I;D;L;LY;WXKQXIX )2PuUd>O>O5OUP	P ((
 3)E	

 
 	
 Y Qs   A4A9A9)r8   z<unk>z<pad>Nd   NNN)FNT)	__name__
__module____qualname__rL   r`   listboolr>   rd   r<       rX   r%   r%   +   so     "&"&K
` %*48!
c?
 "
 '+Tk	

 
 

rm   r%   c                       e Zd Zy)LasrProcessorNrh   ri   rj   r<   rm   rX   ro   ro      s    rm   ro   c                   \     e Zd ZdZddddddddd	d
ddddddddddddgddgddf fd	Z xZS )LasrEncoderConfiga  
    This is the configuration class to store the configuration of a [`LasrEncoder`]. It is used to instantiate a
    `LasrEncoder` model according to the specified arguments, defining the model architecture.

    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PreTrainedConfig`] for more information.

    Args:
            hidden_size (`int`, *optional*, defaults to 512):
                Dimension of the layers and the hidden states.
            num_hidden_layers (`int`, *optional*, defaults to 17):
                Number of hidden layers in the Transformer encoder.
            num_attention_heads (`int`, *optional*, defaults to 8):
                Number of attention heads for each attention layer in the Transformer encoder.
            intermediate_size (`int`, *optional*, defaults to 2048):
                Dimension of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
            hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
                The non-linear activation function (function or string) in the encoder and pooler.
            attention_bias (`bool`, *optional*, defaults to `False`):
                Whether to use bias in the attention layers.
            convolution_bias (`bool`, *optional*, defaults to `False`):
                Whether to use bias in convolutions of the conformer's convolution module.
            conv_kernel_size (`int`, *optional*, defaults to 32):
                The kernel size of the convolution layers in the Conformer block.
            subsampling_conv_channels (`int`, *optional*, defaults to 256):
                The number of channels in the subsampling convolution layers.
            subsampling_conv_kernel_size (`int`, *optional*, defaults to 5):
                The kernel size of the subsampling convolution layers.
            subsampling_conv_stride (`int`, *optional*, defaults to 2):
                The stride of the subsampling convolution layers.
            num_mel_bins (`int`, *optional*, defaults to 128):
                Number of mel features.
            dropout (`float`, *optional*, defaults to 0.1):
                The dropout ratio for all fully connected layers in the embeddings, encoder, and pooler.
            dropout_positions (`float`, *optional*, defaults to 0.0):
                The dropout ratio for the positions in the input sequence.
            layerdrop (`float`, *optional*, defaults to 0.1):
                The dropout ratio for the layers in the encoder.
            activation_dropout (`float`, *optional*, defaults to 0.1):
                The dropout ratio for activations inside the fully connected layer.
            attention_dropout (`float`, *optional*, defaults to 0.1):
                The dropout ratio for the attention layers.
            max_position_embeddings (`int`, *optional*, defaults to 10000):
                The maximum sequence length that this model might ever be used with.
            initializer_range (`float`, *optional*, defaults to 0.02):
                The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
            layer_norm_eps (`float`, *optional*, defaults to 1e-06):
                The epsilon used by the layer normalization layers.
            feed_forward_residual_weights (`tuple[float, float]`, *optional*, defaults to `[1.5, 0.5]`):
                The residual weights for the feed forward layers.
            conv_residual_weights (`tuple[float, float]`, *optional*, defaults to `[2.0, 1.0]`):
                The residual weights for the convolution layers.
            batch_norm_momentum (`float`, *optional*, defaults to 0.01):
                The momentum for the batch normalization layers.
            rope_parameters (`RopeParameters`, *optional*):
                Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain
                a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE
                with longer `max_position_embeddings`.

    Example:
        ```python
        >>> from transformers import LasrEncoderModel, LasrEncoderConfig

        >>> # Initializing a `LasrEncoder` configuration
        >>> configuration = LasrEncoderConfig()

        >>> # Initializing a model from the configuration
        >>> model = LasrEncoderModel(configuration)

        >>> # Accessing the model configuration
        >>> configuration = model.config
        ```

    This configuration class is based on the LasrEncoder architecture from Google Health AI. You can find more details
    and pre-trained models at [TODO/TODO](https://huggingface.co/TODO/TODO).
             i   siluF          r      g?r)   i'  g{Gz?gư>g      ?g      ?g       @g      ?g{Gz?Nc                     || _         || _        || _        || _        || _        t        |   di d|d|d|d|d|d|d|d|d	|	d
|d|
d|d|d|d|d|d|d|d|| | `| `y )Nhidden_sizenum_hidden_layersnum_attention_headsintermediate_size
hidden_actattention_biasconvolution_biasconv_kernel_sizesubsampling_conv_channelsnum_mel_binssubsampling_conv_kernel_sizesubsampling_conv_stridedropoutdropout_positions	layerdropactivation_dropoutattention_dropoutmax_position_embeddingsinitializer_ranger<   )	rope_parameterslayer_norm_epsfeed_forward_residual_weightsconv_residual_weightsbatch_norm_momentumsuperrL   subsampling_factorscale_input)rP   r|   r}   r~   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rT   	__class__s                             rX   rL   zLasrEncoderConfig.__init__   s   8  /,-J*%:"#6  	
#	
/	
 !4	
 0		

 "	
 *	
 .	
 .	
 '@	
 &	
 *F	
 %<	
 	
 0	
  	
   2!	
" 0#	
$ %<%	
& 0)	
. #rm   )rh   ri   rj   __doc__rL   __classcell__r   s   @rX   rr   rr      sf    K^ "%%& ! %'*Cj"Cj 3: :rm   rr   c                   J     e Zd ZdZ	 	 	 	 	 ddeez  f fdZed        Z xZ	S )LasrCTCConfiga  
    This is the configuration class to store the configuration of a [`LasrForCTC`]. It is used to instantiate a
    Lasr CTC model according to the specified arguments, defining the model architecture.
    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PreTrainedConfig`] for more information.
    Args:
            vocab_size (`int`, *optional*, defaults to 512):
                Vocabulary size of the model.
            ctc_loss_reduction (`str`, *optional*, defaults to `"mean"`):
                Specifies the reduction to apply to the output of `torch.nn.CTCLoss`. Only relevant when training an
                instance of [`LasrForCTC`].
            ctc_zero_infinity (`bool`, *optional*, defaults to `True`):
                Whether to zero infinite losses and the associated gradients of `torch.nn.CTCLoss`. Infinite losses mainly
                occur when the inputs are too short to be aligned to the targets. Only relevant when training an instance
                of [`LasrForCTC`].
            encoder_config (`Union[dict, LasrEncoderConfig]`, *optional*):
                The config object or dictionary of the encoder.
            pad_token_id (`int`, *optional*, defaults to 0):
                Padding token id. Also used as blank token id.
    Example:
        ```python
        >>> from transformers import LasrForCTC, LasrCTCConfig
        >>> # Initializing a Lasr configuration
        >>> configuration = LasrCTCConfig()
        >>> # Initializing a model from the configuration
        >>> model = LasrForCTC(configuration)
        >>> # Accessing the model configuration
        >>> configuration = model.config
        ```
    This configuration class is based on the Lasr CTC architecture from Google Health AI. You can find more details
    and pre-trained models at [TODO/TODO](https://huggingface.co/TODO/TODO).
    encoder_configc           	      0    t        |   d|||||d| y )N)
vocab_sizectc_loss_reductionctc_zero_infinityr   rc   r<   )r   rL   )rP   r   r   r   r   rc   rT   r   s          rX   rL   zLasrCTCConfig.__init__C  s0     	 	
!1/)%	
 	
rm   c                 4    | j                   j                  dz  S )Nr   )r   r   )rP   s    rX   inputs_to_logits_ratioz$LasrCTCConfig.inputs_to_logits_ratioU  s    ""::A==rm   )rs   meanTNr   )
rh   ri   rj   r   dictrr   rL   propertyr   r   r   s   @rX   r   r   !  sC    F !37

 00
$ > >rm   r   c                   \     e Zd Zdef fdZdej                  dej                  fdZ xZS )LasrEncoderSubsamplingconfigc                 (   t         |           t        j                  |j                  |j
                        | _        t        j                  |j
                  |j
                  |j                  |j                        | _
        t        j                  |j
                  |j                  |j                  |j                        | _        t        j                  |j                  |j
                        | _        t        j                         | _        y )N)kernel_sizestride)r   rL   r
   Linearr   r|   dense_0Conv1dr   r   conv_0r   conv_1dense_1ReLUact_fn)rP   r   r   s     rX   rL   zLasrEncoderSubsampling.__init__[  s    yy!4!4f6H6HIii;;11	
 ii,,;;11	
 yy!A!A6CUCUVggirm   input_featuresr]   c                 ,   | j                  | j                  |            }|j                  dd      }| j                  | j                  |            }| j                  | j	                  |            }|j                  dd      }| j                  |      S )Nr'   r   )r   r   	transposer   r   r   )rP   r   hidden_statess      rX   forwardzLasrEncoderSubsampling.forwardm  sz    DLL$@A%//15DKK$>?DKK$>?%//15||M**rm   )	rh   ri   rj   rr   rL   torchTensorr   r   r   s   @rX   r   r   Z  s+     0  $+ell +u|| +rm   r   c                       e Zd Zy)LasrEncoderRotaryEmbeddingNrp   r<   rm   rX   r   r   v  s    rm   r   c                        e Zd Zdedef fdZ	 	 ddej                  deej                  ej                  f   dz  dej                  dz  de	e
   d	eej                  ej                  f   f
d
Z xZS )LasrEncoderAttentionr   	layer_idxc                 4    t         |   ||       d| _        y )NF)r   rL   	is_causalrP   r   r   r   s      rX   rL   zLasrEncoderAttention.__init__z  s    +rm   Nr   position_embeddingsattention_maskrT   r]   c                    |j                   d d }g |d| j                  }| j                  |      j                  |      j	                  dd      }| j                  |      j                  |      j	                  dd      }| j                  |      j                  |      j	                  dd      }	|\  }
}t        |||
|      \  }}t        j                  | j                  j                  t              } || |||	|f| j                  sdn| j                  | j                  d|\  }} |j                   g |d j#                         }| j%                  |      }||fS )Nr+   r'   r   r)   )r   scaling)shapehead_dimq_projviewr   k_projv_projr   r   get_interfacer   _attn_implementationr   trainingr   r   reshape
contiguouso_proj)rP   r   r   r   rT   input_shapehidden_shapequery_states
key_statesvalue_statescossinattention_interfaceattn_outputattn_weightss                  rX   r   zLasrEncoderAttention.forward~  sk    $))#2.88b8$--8{{=166|DNNqRST[[/44\BLLQPQR
{{=166|DNNqRST&S#7jRUWZ#[ j(?(M(MKK,,.E)
 %8	%
  $}}C$2H2HLL	%
 	%
!\ *k));;;;FFHkk+.L((rm   NN)rh   ri   rj   rr   r`   rL   r   r   tupler   r   r   r   r   s   @rX   r   r   y  s    0 S  IM.2	")||") #5<<#=>E") t+	")
 +,") 
u||U\\)	*")rm   r   c                   &     e Zd Zddef fdZ xZS )LasrEncoderConvolutionModuler   c                     t         |   ||       d| _        t        j                  |j
                  |j                        | _        y )Nsame)momentum)r   rL   paddingr
   BatchNorm1dr|   r   norm)rP   r   module_configr   s      rX   rL   z%LasrEncoderConvolutionModule.__init__  s7    /NN6#5#5@Z@Z[	rm   N)rh   ri   rj   rr   rL   r   r   s   @rX   r   r     s    \0 \ \rm   r   c                        e Zd Zdedef fdZ	 	 ddej                  dej                  dz  dej                  dz  dee	   d	ej                  f
d
Z
 xZS )LasrEncoderBlockr   r   c                 T   t         |   ||       |j                  | _        |j                  | _        t	        j
                  |j                  |j                  d      | _        t	        j
                  |j                  |j                  d      | _	        t	        j
                  |j                  |j                  d      | _
        t	        j
                  |j                  |j                  d      | _        t	        j
                  |j                  |j                  d      | _        y )NF)bias)r   rL   r   r   r
   	LayerNormr|   r   norm_feed_forward1norm_self_att	norm_convnorm_feed_forward2norm_outr   s      rX   rL   zLasrEncoderBlock.__init__  s    +-3-Q-Q*%+%A%A""$,,v/A/A6CXCX_d"e\\&*<*<f>S>SZ_`f&8&8&:O:OV[\"$,,v/A/A6CXCX_d"eV%7%79N9NUZ[rm   Nr   r   r   rT   r]   c                 0   |}| j                  | j                  |            }| j                  d   |z  | j                  d   |z  z   }| j                  |      } | j                  d|||d|\  }}||z   }| j                  | j                  |      |      }	| j                  d   |z  | j                  d   |	z  z   }|}| j                  | j                  |            }| j                  d   |z  | j                  d   |z  z   }| j                  |      }|S )Nr   r'   )r   r   r   )r   r<   )feed_forward1r   r   r   	self_attnconvr   r   feed_forward2r   r   )
rP   r   r   r   rT   residualnormalized_hidden_statesr   _conv_outputs
             rX   r   zLasrEncoderBlock.forward  sJ    !**4+B+B=+QR..q1H<t?a?abc?dgt?tt 	 $(#5#5m#D ' 
2) 3
 	
Q &3ii} =ni]2215EHbHbcdHehsHss **4+B+B=+QR..q1H<t?a?abc?dgt?tt 	 m4rm   r   )rh   ri   rj   rr   r`   rL   r   r   r   r   r   r   r   s   @rX   r   r     sw    
\0 
\S 
\ /337	!||! t+! #\\D0	!
 +,! 
!rm   r   c                   6    e Zd ZdZd Zdej                  fdZy)LasrPreTrainedModelFc                 .    t        j                  |       y r   )r   _init_weights)rP   modules     rX   r   z!LasrPreTrainedModel._init_weights  s    %%f-rm   input_lengthsc                     t        | j                  t              r| j                  j                  n| j                  }|j                  }|j
                  }d}t        |      D ]  }||z
  |z  dz   } |S )Nr   r'   )r_   r   r   r   r   r   r@   )rP   r  r   r   r   
num_layersr   s          rX   _get_subsampling_output_lengthz2LasrPreTrainedModel._get_subsampling_output_length  st    7A$++}7]33cgcncn$AA77
z" 	HA*[8VCaGM	H rm   N)rh   ri   rj   _supports_flex_attnr   r   r   r  r<   rm   rX   r   r     s    .	ELL 	rm   r   zh
    The LasrEncoder model, based on the Conformer architecture](https://arxiv.org/abs/2005.08100).
    )custom_introc                        e Zd ZU eed<   dZdef fdZeee	e
	 d
dej                  dej                  dz  dee   defd	                            Z xZS )LasrEncoderr   encoderc           	         t         |   |       d| _        |j                  | _        |j                  | _        |j
                  | _        t        |      | _        t        |      | _	        t        j                  t        |j                        D cg c]  }t        ||       c}      | _        t        j                   |j"                  |j$                  d      | _        | j)                          y c c}w )NF)epsr   )r   rL   gradient_checkpointingr   r   r   r   
subsamplerr   
rotary_embr
   
ModuleListr@   r}   r   layersr   r|   r   out_norm	post_initr   s      rX   rL   zLasrEncoder.__init__  s     &+#~~!'!9!9))084V<mmBGH`H`BabYfi0b
 V%7%7V=R=RY^_	 cs   C2Nr   r   rT   r]   c                 b   | j                  |      }| j                  |t        j                  |j                  d   |j
                        j                  d            \  }}t        j                  j                  || j                  | j                        }t        j                  j                  || j                  | j                        }t        j                  j                  || j                  | j                        }| | j                  ||j                  d         }t        | j                  ||      }| j                  D ]G  }d}| j                  r&t        j                   g       }	|	| j"                  k  rd}|r: ||f|||fd	|}I | j%                  |      }t'        |
      S )a  
        Example:

        ```python
        >>> from transformers import AutoProcessor, LasrEncoder
        >>> from datasets import load_dataset, Audio

        >>> model_id = TODO
        >>> processor = AutoProcessor.from_pretrained(model_id)
        >>> encoder = ParakeetEncoder.from_pretrained(model_id)

        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
        >>> ds = ds.cast_column("audio", Audio(sampling_rate=processor.feature_extractor.sampling_rate))

        >>> inputs = processor(ds[0]["audio"]["array"])
        >>> encoder_outputs = encoder(**inputs)

        >>> print(encoder_outputs.last_hidden_state.shape)
        ```
        r'   )devicer   )pr   )target_length)r   inputs_embedsr   FT)r   r   )last_hidden_state)r  r  r   aranger   r  	unsqueezer
   
functionalr   r   r   _get_output_attention_maskr   r   r  randr   r  r   )
rP   r   r   rT   r   r   r   encoder_layerto_dropdropout_probabilitys
             rX   r   zLasrEncoder.forward  s   > 7??5<<(;(;A(>}G[G[\ffghi
S --mt||VZVcVc-dmm##C4+A+ADMM#Zmm##C4+A+ADMM#Z%!<<^[h[n[nop[q<rN2;;')
 "[[ 	MG}}&+jjn#&7"G -!!#1),c
! 	!	  m4??rm   r   )rh   ri   rj   rr   __annotations__base_model_prefixrL   r   r   r   r   r   r   r   r   r   r   r   r   s   @rX   r	  r	    s     !0 "  /3?@?@ t+?@ +,	?@
 
?@     ?@rm   r	  c                        e Zd Z fdZ xZS )
LasrForCTCc                  8     t               j                  di | S )a  
        Example:

        ```python
        >>> from transformers import AutoProcessor, LasrForCTC
        >>> from datasets import load_dataset, Audio

        >>> model_id = TODO
        >>> processor = AutoProcessor.from_pretrained(model_id)
        >>> model = LasrForCTC.from_pretrained(model_id)

        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
        >>> ds = ds.cast_column("audio", Audio(sampling_rate=processor.feature_extractor.sampling_rate))

        >>> inputs = processor(ds[0]["audio"]["array"], text=ds[0]["text"])
        >>> predicted_ids = model.generate(**inputs)
        >>> transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)

        >>> print(transcription)
        ```
        r<   )r   generate)super_kwargsr   s    rX   r'  zLasrForCTC.generateO  s    ,  uw/,//rm   )rh   ri   rj   r'  r   r   s   @rX   r%  r%  N  s    0 0rm   r%  )r%  r	  r   ro   rr   r   r%   )?ra   collections.abcr   r   
tokenizersr   r   r   r   r   tokenizers.modelsr	   r
   masking_utilsr   modeling_outputsr   modeling_utilsr   r   processing_utilsr   tokenization_utils_tokenizersr   utilsr   r   r   utils.genericr   utils.output_capturingr   llama.modeling_llamar   r   r   r   parakeet.configuration_parakeetr   r   parakeet.modeling_parakeetr   r   r    r!   parakeet.processing_parakeetr"   t5.tokenization_t5r#   r%   ro   rr   r   Moduler   r   r   r   r   r   r	  r%  __all__r<   rm   rX   <module>r;     s!    $  S S %  6 / F & > I I 7 5 v v V  = ,d
K!2 d
N	% 	H- HV6>% 6>r+RYY +8 <!5 ;')> ')T\#C \.+ .b1 & 
X@% X@
X@v0 04rm   