
    qi                         d dl mZmZmZmZ d dlmZ ddlmZ ddl	m
Z
  e
j                  e      ZddiZ G d d	e      Zd	gZy
)    )	Tokenizerdecodersnormalizerspre_tokenizers)BPE   )TokenizersBackend)loggingtokenizer_fileztokenizer.jsonc                        e Zd ZdZeZdZddgZeZ		 	 	 	 	 	 	 dde
ee
ef   z  dz  de
ee
   z  dz  de
d	e
d
e
de
de
f fdZ xZS )GemmaTokenizeru  
    Construct a fast Gemma tokenizer (backed by HuggingFace's tokenizers library).

    This tokenizer uses a BPE model with byte fallback, no prefix space, and a normalizer that replaces
    spaces with "▁".

    Args:
        tokenizer_file (`str`, optional):
            A tokenizers JSON file containing the serialization of a tokenizer.
        unk_token (`str`, optional, defaults to "<unk>"):
            The unknown token.
        bos_token (`str`, optional, defaults to "<bos>"):
            The beginning of sequence token.
        eos_token (`str`, optional, defaults to "<eos>"):
            The end of sequence token.
        pad_token (`str`, optional, defaults to "<pad>"):
            The padding token.
        mask_token (`str`, optional, defaults to "<mask>"):
            The mask token.
        add_bos_token (`bool`, optional, defaults to True):
            Whether or not to add a `bos_token` at the start of sequences.
        add_eos_token (`bool`, optional, defaults to False):
            Whether or not to add an `eos_token` at the end of sequences.
        vocab (`str` or `dict[str, int]`, optional):
            Custom vocabulary dict. If not provided, a minimal vocabulary is created using the special tokens.
    left	input_idsattention_maskNvocabmerges	unk_token	bos_token	eos_token	pad_token
mask_tokenc                    |9t        |      dt        |      dt        |      dt        |      dt        |      di}|| _        |xs g | _        t        t	        | j                  | j                  dt        |      d d            | _        t        j                  dd	d
      | j
                  _        t        j                  t        j                  dd      t        j                         t        j                         g      | j
                  _        t        j                  dd      | j
                  _        t#        	| H  d|||||d| y )Nr         r      T)r   r   fuse_unkr   dropoutbyte_fallback merged_with_previousF)patternbehaviorinvertu   ▁)r   r   r   r   r    )str_vocab_mergesr   r   
_tokenizerr   Splitpre_tokenizerr   SequenceReplaceByteFallbackFusedecoderr   
normalizersuper__init__)
selfr   r   r   r   r   r   r   kwargs	__class__s
            ^/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/gemma/tokenization_gemma.pyr2   zGemmaTokenizer.__init__;   s*    =IIIIJE |#kk||i."	
 )7(<(<"8)
% #+"3"3eS)8+@+@+BHMMOT#
 &1%8%8e%D" 	
!	
 	
    )NNz<unk>z<bos>z<eos>z<pad>z<mask>)__name__
__module____qualname____doc__VOCAB_FILES_NAMESvocab_files_namespadding_sidemodel_input_namesr   modelr%   dictintlistr2   __classcell__)r5   s   @r6   r   r      s    6 *L$&67E .2)-    "/
T#s(^#d*/
 d3i$&/
 	/

 /
 /
 /
 /
 /
r7   r   N)
tokenizersr   r   r   r   tokenizers.modelsr   tokenization_utils_tokenizersr	   utilsr
   
get_loggerr8   loggerr<   r   __all__r$   r7   r6   <module>rL      sQ    H G ! >  
		H	%%'78 P
& P
f 
r7   