
    qiM                         d dl mZmZmZmZmZ d dlmZ ddlm	Z	 ddl
mZ  ej                  e      ZdddZ G d	 d
e	      Zd
gZy)    )	Tokenizerdecodersnormalizerspre_tokenizers
processors)BPE   )TokenizersBackend)loggingz
vocab.jsonz
merges.txt)
vocab_filemerges_filec                        e Zd ZdZeZddgZeZ	 	 	 	 	 	 	 	 	 dde	e
e	ef   z  dz  de	ee	   z  dz  de	de	d	e	d
e	de	de	dz  de	dz  f fdZ xZS )HerbertTokenizera  
    Construct a BPE tokenizer for HerBERT (backed by HuggingFace's tokenizers library).

    Peculiarities:

    - uses BERT's pre-tokenizer: BertPreTokenizer splits tokens on spaces, and also on punctuation. Each occurrence of
      a punctuation character will be treated separately.

    This tokenizer inherits from [`TokenizersBackend`] which contains most of the methods. Users should refer to the
    superclass for more information regarding methods.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        merges_file (`str`):
            Path to the merges file.
        cls_token (`str`, *optional*, defaults to `"<s>"`):
            The classifier token.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The padding token.
        mask_token (`str`, *optional*, defaults to `"<mask>"`):
            The mask token.
        sep_token (`str`, *optional*, defaults to `"</s>"`):
            The separator token.
        vocab (`str`, `dict` or `list`, *optional*):
            Custom vocabulary dictionary.
        merges (`str` or `list[str]`, *optional*):
            Custom merges list.
    	input_idsattention_maskNvocabmerges	cls_token	unk_token	pad_token
mask_token	sep_tokenr   r   c
           
      D   ||nt        |      di| _        |xs g | _        t        t	        | j                  | j                  d t        |      d            | _        t        j                  dddd      | j
                  _        t        j                         | j
                  _        t        j                  d      | j
                  _        t        | @  d|||||d|
 t#        j$                  | j&                  d	f| j(                  df
      | j
                  _        y )Nr   z</w>)r   r   dropoutr   end_of_word_suffixFT)	lowercasestrip_accents
clean_texthandle_chinese_chars)suffix)r   r   r   r   r      )sepcls )str_vocab_mergesr   r   
_tokenizerr   BertNormalizer
normalizerr   BertPreTokenizerpre_tokenizerr   
BPEDecoderdecodersuper__init__r   BertProcessingr   r   post_processor)selfr   r   r   r   r   r   r   r   r   kwargs	__class__s              b/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/herbert/tokenization_herbert.pyr0   zHerbertTokenizer.__init__A   s     %0es9~q6I|#kk||i.#)
 &1%?%?5TX\&
" )7(G(G(I%"*"5"5V"D 	
!	
 	
 *4)B)B##*
&    )	NNz<s>z<unk>z<pad>z<mask>z</s>NN)__name__
__module____qualname____doc__VOCAB_FILES_NAMESvocab_files_namesmodel_input_namesr   modelr%   dictintlistr0   __classcell__)r5   s   @r6   r   r      s    @ *$&67E .2)-  "!%"&+
T#s(^#d*+
 d3i$&+
 	+

 +
 +
 +
 +
 $J+
 4Z+
 +
r7   r   N)
tokenizersr   r   r   r   r   tokenizers.modelsr   tokenization_utils_tokenizersr
   utilsr   
get_loggerr8   loggerr<   r   __all__r$   r7   r6   <module>rK      sP     T S ! >  
		H	%#/M P
( P
f 
r7   