
    qi                         d Z ddlmZmZmZmZmZmZ ddlm	Z	 ddl
mZ ddlmZ  ej                  e      Zddd	d
ZddiZdZ G d de      ZdgZy)zTokenization classes for Qwen2.    )
AddedTokenRegex	Tokenizerdecodersnormalizerspre_tokenizers)BPE   )TokenizersBackend)loggingz
vocab.jsonz
merges.txtztokenizer.json)
vocab_filemerges_filetokenizer_filezqwen/qwen-tokenizeri   zn(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+c                   |     e Zd ZeZddgZeZ	 	 	 	 	 	 	 d
dee	ee
f   z  dz  deee   z  dz  dededef
 fd	Z xZS )Qwen2Tokenizer	input_idsattention_maskNvocabmerges	unk_token	eos_token	pad_tokenc                    ||nd| _         ||nddi| _        |xs g | _        t        t	        | j                  | j                  d d dddd            | _        t        j                         | j
                  _        t        j                         | j
                  _        t        j                  t        j                  t        t               dd      t        j                  | j                   d      g      | j
                  _        t%        
| L  d|||||d	| | j)                  | j*                  D 	cg c]  }	t-        |	d
       c}	       y c c}	w )NF<|endoftext|>r    )r   r   dropoutr   continuing_subword_prefixend_of_word_suffixfuse_unkbyte_fallbackisolated)behaviorinvert)add_prefix_space	use_regex)r   	bos_tokenr   r   r$   T)special )r$   _vocab_mergesr   r	   
_tokenizerr   	ByteLeveldecoderr   NFC
normalizerr   SequenceSplitr   PRETOKENIZE_REGEXpre_tokenizersuper__init__
add_tokensall_special_tokensr   )selfr   r   r   r&   r   r   r$   kwargstoken	__class__s             ^/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/qwen2/tokenization_qwen2.pyr5   zQwen2Tokenizer.__init__)   sM    5E4P 0V[      	 |#kk||*,#%#	
 #+"4"4"6%0__%6"(6(?(?$$+,' 
 ((%)%:%:#
)
% 	 	
-	
 	
 	dF]F]^UE48^_^s   6E)NNr   Nr   r   N)__name__
__module____qualname__VOCAB_FILES_NAMESvocab_files_namesmodel_input_namesr	   modelstrdictintlistr5   __classcell__)r;   s   @r<   r   r   $   s    )$&67E .2)-(((9`T#s(^#d*9` d3i$&9` 	9` 9` 9` 9`    r   N)__doc__
tokenizersr   r   r   r   r   r   tokenizers.modelsr	   tokenization_utils_tokenizersr   utilsr   
get_loggerr=   loggerr@   MAX_MODEL_INPUT_SIZESr2   r   __all__r(   rI   r<   <module>rS      sp    & Z Z ! >  
		H	% &  /6  J >`& >`B 
rI   