
    qiE                     X    d dl mZmZmZmZ d dlmZ ddlmZ ddiZ	 G d de      Z
dgZy	)
    )	Tokenizerdecodersnormalizerspre_tokenizers)BPE   )TokenizersBackendtokenizer_fileztokenizer.jsonc                        e Zd ZdZeZdZddgZeZ		 	 	 	 	 	 	 dde
ee
ef   z  dz  de
ee
   z  dz  de
d	e
d
e
de
de
f fdZ xZS )Siglip2TokenizerzN
    Gemma tokenizer + SigLIP2 training default: lowercase normalization.
    left	input_idsattention_maskNvocabmerges	unk_token	bos_token	eos_token	pad_token
mask_tokenc                    |9t        |      dt        |      dt        |      dt        |      dt        |      di}|| _        |xs g | _        t        t	        | j                  | j                  dt        |      d d            | _        t        j                  dd	d
      | j
                  _        t        j                  t        j                  dd      t        j                         t        j                         g      | j
                  _        t        j                  dd      | j
                  _        t#        
| H  d|||||d| t'        | d      rJt)        | j*                  t,              r0| j*                  j/                  d| j0                  j2                         t5        | dd       }	|	F|	j                   9t        j                  t        j6                         |	j                   g      |	_        y y y )Nr         r      T)r   r   fuse_unkr   dropoutbyte_fallback merged_with_previousF)patternbehaviorinvertu   ▁)r   r   r   r   r   init_kwargstokenizer_class
_tokenizer )str_vocab_mergesr   r   r%   r   Splitpre_tokenizerr   SequenceReplaceByteFallbackFusedecoderr   
normalizersuper__init__hasattr
isinstancer#   dict
setdefault	__class____name__getattr	Lowercase)selfr   r   r   r   r   r   r   kwargsbackendr8   s             b/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/siglip2/tokenization_siglip2.pyr3   zSiglip2Tokenizer.__init__(   s    =IIIIJE |#kk||i."	
 )7(<(<"8)
% #+"3"3eS)8+@+@+BHMMOT#
 &1%8%8e%D" 	
!	
 	
 4'Jt7G7G,N''(94>>;R;RS$d37#5#5#A!,!5!5{7L7L7NPWPbPb6c!dG $B    )NNz<unk>z<bos>z<eos>z<pad>z<mask>)r9   
__module____qualname____doc__VOCAB_FILES_NAMESvocab_files_namespadding_sidemodel_input_namesr   modelr'   r6   intlistr3   __classcell__)r8   s   @r?   r   r      s     *L$&67E .2)-    "7eT#s(^#d*7e d3i$&7e 	7e
 7e 7e 7e 7e 7er@   r   N)
tokenizersr   r   r   r   tokenizers.modelsr   tokenization_utils_tokenizersr	   rD   r   __all__r&   r@   r?   <module>rP      s>   * H G ! > &'78 Ae( AeH 
r@   