
    qi                     n    d dl Z d dlZd dlmZmZmZmZmZ d dlm	Z	 ddl
mZ dddZ G d	 d
e      Zd
gZy)    N)	Tokenizerdecodersnormalizerspre_tokenizers
processors)Unigram   )TokenizersBackendzspiece.modelztokenizer.json)
vocab_filetokenizer_filec                        e Zd ZdZeZddgZeZ	 	 	 	 	 	 	 	 d fd	Z	d Z
d Z	 	 	 ddeee   z  d	ed
edz  dedef
 fdZ xZS )LasrTokenizera  
    Construct a LASR tokenizer (backed by HuggingFace's *tokenizers* library). Based on
    [Unigram](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=unigram#models).

    This tokenizer inherits from [`TokenizersBackend`] which contains most of the main methods. Users should
    refer to this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`, *optional*):
            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
            contains the vocabulary necessary to instantiate a tokenizer.
        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end of sequence token.

            <Tip>

            When building a sequence using special tokens, this is not the token that is used for the end of sequence.
            The token used is the `sep_token`.

            </Tip>

        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
        extra_ids (`int`, *optional*, defaults to 100):
            Add a number of extra ids added to the vocabulary for use as sentinels. These tokens are accessible as
            "<extra_id_{%d}>" where "{%d}" is a number between 0 and extra_ids-1. These tokens can be retrieved by
            calling get_sentinel_tokens method and token ids can be by calling get_sentinel_token_ids method
        additional_special_tokens (`list[str]`, *optional*):
            Additional special tokens used by the tokenizer.
        vocab (`str`, `dict` or `list`, *optional*):
            Custom vocabulary dict. If not provided, a minimal vocabulary is created using the special tokens.
    	input_idsattention_maskNc	           	      H   || _         |q|D 
cg c]  }
dt        |
      v s|
 }}
t        |      dk  r!|t        |      D cg c]  }d| d
 c}z  }nC|dkD  r>|t        |      k7  r0t	        d| d| d      t        |      D cg c]  }d| d
 }}|}||| _        nbt        |      dft        |      dft        |      dfd	g| _        t        |dz
  d
d
      D ]#  }| j
                  j                  d| ddf       % t        t        | j
                  dd            | _	        |$t        j                  |      | j                  _        t        j                  t        j                         t        j                   ddd      g      | j                  _        t%        j                   ddd      | j                  _        t)        | T  d|||||d|	 t-        j.                  ddgg dd| j0                  fg      | j                  _        y c c}
w c c}w c c}w )Nz
<extra_id_   >r   zBoth extra_ids (z!) and additional_special_tokens (zm) are provided to LasrTokenizer. In this case the additional_special_tokens must include the extra_ids tokensg        )   ▁g       r	   F)unk_idbyte_fallbackr   alwaysT)replacementprepend_schemesplit)	eos_token	unk_token	pad_token	extra_idsadditional_special_tokens$A</s>)r!   r"   z$Br"   )singlepairspecial_tokens )
_extra_idsstrlenrange
ValueError_vocab_scoresappendr   r   
_tokenizerr   Precompiled
normalizerr   SequenceWhitespaceSplit	Metaspacepre_tokenizerr   decodersuper__init__r   TemplateProcessingeos_token_idpost_processor)selfr   r   r   _spm_precompiled_charsmapr   r    vocabr   kwargsxextra_tokensi	__class__s                \/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/lasr/tokenization_lasr.pyr7   zLasrTokenizer.__init__J   s]    $ %0'@[!LTWXYTZDZA[L[< 1$)yIY-ZA
1#Q.?-ZZ)Q9L0A#A &yk1RSlRm n   8=Y7GH!j1-HLH(4% !&D Y%Y%Y%	"D 9q="b1 D""))Zs!+<c*BCD#""#
 %0)4)@)@AZ)[DOO&(6(?(?..0((U8[_`)
% #+"4"4W_gk"l 	
&?	
 	
 *4)F)F&>-**+*
&k \-Z Is   HHH	Hc                 T    t        t        t        d | j                                    S )zQGet the list of sentinel tokens (extra_id tokens) from additional_special_tokens.c                 D    t        t        j                  d|             d uS )Nz<extra_id_\d+>)boolresearch)r?   s    rC   <lambda>z3LasrTokenizer.get_sentinel_tokens.<locals>.<lambda>   s    bii0A1&E!Fd!R     )listsetfilterr    )r;   s    rC   get_sentinel_tokensz!LasrTokenizer.get_sentinel_tokens   s&    RTXTrTrst
 	
rJ   c                 f    | j                         D cg c]  }| j                  |       c}S c c}w )z&Get the token IDs for sentinel tokens.)rN   convert_tokens_to_ids)r;   tokens     rC   get_sentinel_token_idsz$LasrTokenizer.get_sentinel_token_ids   s*    ?C?W?W?YZe**51ZZZs   .	token_idsskip_special_tokensclean_up_tokenization_spacesgroup_tokensreturnc                     t        |t              r|g}|r%t        j                  |      D cg c]  }|d   	 }}|D cg c]  }|| j                  k7  s| }}t        |   d|||d|S c c}w c c}w )Nr   )rS   rT   rU   r&   )
isinstanceint	itertoolsgroupbypad_token_idr6   _decode)	r;   rS   rT   rU   rV   r>   token_grouprQ   rB   s	           rC   r^   zLasrTokenizer._decode   s     i%"I;D;L;LY;WXKQXIX )2PuUd>O>O5OUP	Pw 
 3)E
 	
 	
 Y Qs   A. A3A3)r"   z<unk>z<pad>Nd   NNN)FNT)__name__
__module____qualname____doc__VOCAB_FILES_NAMESvocab_files_namesmodel_input_namesr   modelr7   rN   rR   rZ   rK   rF   r(   r^   __classcell__)rB   s   @rC   r   r   !   s    "H *$&67E "&"&K
Z
[ %*48!
c?
 "
 '+Tk	

 
 

 
rJ   r   )r[   rG   
tokenizersr   r   r   r   r   tokenizers.modelsr   tokenization_utils_tokenizersr
   re   r   __all__r&   rJ   rC   <module>rn      sA   *  	 S S % > $2EUV U
% U
p 
rJ   