
    qi                          d Z ddlmZ ddlmZ ddlmZ ddlmZ ddl	m
Z
  ej                  e      Zd	d
iZ ed       G d de             ZdgZy)z Tokenization class for SpeechT5.    )Any   )SentencePieceBackend)logging)requires   )EnglishNumberNormalizer
vocab_filezspm_char.model)sentencepiece)backendsc            
           e Zd ZdZeZddgZdZ	 	 	 	 	 	 ddee	e
f   dz  ddf fdZdd	Zed
        Zej                  d        Zddee   fdZ	 ddee   dee   dz  dedee   f fdZ	 ddee   dee   dz  dee   fdZ xZS )SpeechT5Tokenizera	  
    Construct a SpeechT5 tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
            contains the vocabulary necessary to instantiate a tokenizer.
        bos_token (`str`, *optional*, defaults to `"<s>"`):
            The begin of sequence token.
        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end of sequence token.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
        normalize (`bool`, *optional*, defaults to `False`):
            Whether to convert numeric quantities in the text to their spelt-out english counterparts.
        sp_model_kwargs (`dict`, *optional*):
            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
            SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
            to set:

            - `enable_sampling`: Enable subword regularization.
            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.

              - `nbest_size = {0,1}`: No sampling is performed.
              - `nbest_size > 1`: samples from the nbest_size results.
              - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
                using forward-filtering-and-backward-sampling algorithm.

            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
              BPE-dropout.

    Attributes:
        sp_model (`SentencePieceProcessor`):
            The *SentencePiece* processor that is used for every conversion (string, tokens and IDs).
    	input_idsattention_maskFNsp_model_kwargsreturnc           
      \    || _         d | _        |||d<   t        	|   d||||||d| y )Nr   )r
   	bos_token	eos_token	unk_token	pad_token	normalize )r   _normalizersuper__init__)
selfr
   r   r   r   r   r   r   kwargs	__class__s
            d/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/speecht5/tokenization_speecht5.pyr   zSpeechT5Tokenizer.__init__M   sW     # &(7F$% 	 	
!	
 	
    c                 v    |j                  d| j                        }|rd|z   }|r| j                  |      }||fS )Nr    )popr   
normalizer)r   textis_split_into_wordsr   r   s        r    prepare_for_tokenizationz*SpeechT5Tokenizer.prepare_for_tokenizationj   s=    JJ{DNN;	:D??4(Df~r!   c                 P    | j                   t               | _         | j                   S N)r   r	   )r   s    r    r%   zSpeechT5Tokenizer.normalizerr   s%    #68Dr!   c                     || _         y r*   )r   )r   values     r    r%   zSpeechT5Tokenizer.normalizerx   s
     r!   c                 L    ||| j                   gz   S ||z   | j                   gz   S )z=Build model inputs from a sequence by appending eos_token_id.)eos_token_id)r   token_ids_0token_ids_1s      r     build_inputs_with_special_tokensz2SpeechT5Tokenizer.build_inputs_with_special_tokens|   s5    $"3"3!444[(D,=,=+>>>r!   r/   r0   already_has_special_tokensc                     |rt         |   ||d      S dg}|dgt        |      z  |z   S dgt        |      z  dgt        |      z  z   |z   S )NT)r/   r0   r2   r   r   )r   get_special_tokens_masklen)r   r/   r0   r2   suffix_onesr   s        r    r4   z)SpeechT5Tokenizer.get_special_tokens_mask   su     &72'[]a 3   cC#k**k99c+&&A3[1A+AB[PPr!   c                 n    | j                   g}|t        ||z         dgz  S t        ||z   |z         dgz  S )a  
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. SpeechT5 does not
        make use of token type ids, therefore a list of zeros is returned.

        Args:
            token_ids_0 (`list[int]`):
                List of IDs.
            token_ids_1 (`list[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `list[int]`: List of zeros.
        r   )r.   r5   )r   r/   r0   eoss       r    $create_token_type_ids_from_sequencesz6SpeechT5Tokenizer.create_token_type_ids_from_sequences   sK        !{S()QC//;,s23qc99r!   )z<s>z</s>z<unk>z<pad>FN)Fr*   )NF)__name__
__module____qualname____doc__VOCAB_FILES_NAMESvocab_files_namesmodel_input_namesis_fastdictstrr   r   r(   propertyr%   setterlistintr1   boolr4   r9   __classcell__)r   s   @r    r   r      s   (T *$&67G
 15
 c3h$.
 

:    
 ! !?QUVYQZ ? puQ9Q379t3CQhlQ	cQ GK:9:379t3C:	c:r!   r   N)r=   typingr    tokenization_utils_sentencepiecer   utilsr   utils.import_utilsr   number_normalizerr	   
get_loggerr:   loggerr>   r   __all__r   r!   r    <module>rR      si    '  D  * 6 
		H	%!#34  
%&E:, E: 'E:P 
r!   