
    qiR                         d Z ddlmZmZmZmZmZmZ ddlm	Z	 ddl
mZ ddlmZ  ej                  e      Zddd	d
Z G d de      ZdgZy)zTokenization classes for CLIP.    )Regex	Tokenizerdecodersnormalizerspre_tokenizers
processors)BPE   )TokenizersBackend)loggingz
vocab.jsonz
merges.txtztokenizer.json)
vocab_filemerges_filetokenizer_filec                        e Zd ZdZeZddgZeZ	 	 	 	 	 	 dde	e
e	ef   z  dz  de	ee	   z  dz  de	de	d	e	d
e	f fdZd Z xZS )CLIPTokenizerav  
    Construct a CLIP tokenizer (backed by HuggingFace's *tokenizers* library). Based on byte-level
    Byte-Pair-Encoding.

    This tokenizer inherits from [`TokenizersBackend`] which contains most of the main methods. Users should
    refer to this superclass for more information regarding those methods.

    Args:
        vocab (`str`, `dict` or `list`, *optional*):
            Vocabulary dict to use for the tokenizer.
        merges (`str` or `list`, *optional*):
            Merges list to use for the BPE tokenizer.
        unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        bos_token (`str`, *optional*, defaults to `"<|startoftext|>"`):
            The beginning of sequence token.
        eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            The end of sequence token.
        pad_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            The token used for padding, for example when batching sequences of different lengths.
    	input_idsattention_maskNvocabmerges	unk_token	bos_token	eos_token	pad_tokenc                    ||n"t        |      dt        |      dt        |      di}|xs g | _        t        t        || j                  d dddt        |                  | _        t        j                  t        j                         t        j                  t        d      d	      t        j                         g      | j                  _        t        j                  t        j                  t        d
      dd      t        j                  d      g      | j                  _        t!        j                         | j                  _        t%        	| L  d||||d| t)        j*                  t        |      | j,                  ft        |      | j.                  fdd      | j                  _        | j3                          y )Nr          z</w>F)r   r   dropoutcontinuing_subword_prefixend_of_word_suffixfuse_unkr   z\s+ z[<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+removedT)behaviorinvert)add_prefix_space)r   r   r   r   )sepclsr&   trim_offsets )str_mergesr   r	   
_tokenizerr   SequenceNFCReplacer   	Lowercase
normalizerr   Split	ByteLevelpre_tokenizerr   decodersuper__init__r   RobertaProcessingeos_token_idbos_token_idpost_processor%_wrap_decode_method_backend_tokenizer)
selfr   r   r   r   r   r   kwargs_vocab	__class__s
            \/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/clip/tokenization_clip.pyr8   zCLIPTokenizer.__init__8   s       III 	 |#||*,#)i.

 &1%9%9__ 3 3E&M3 GI^I^I`a&
" )7(?(?$$z ' ((%@	)
% #+"4"4"6 	
		

 	
 *4)E)EY!2!23Y!2!23"	*
& 	224    c                     | j                   j                  | j                   j                  j                  fd}|| j                   _        y )Nc                  X     | i |}|j                  d      j                         }|S )Nr"   )replacestrip)argsr?   textr    orig_decode_methods      rB   new_decode_methodzNCLIPTokenizer._wrap_decode_method_backend_tokenizer.<locals>.new_decode_method   s1    %t6v6D<< 2C8>>@DKrC   )backend_tokenizerdecodemodelr    )r>   rK   r    rJ   s     @@rB   r=   z3CLIPTokenizer._wrap_decode_method_backend_tokenizer   sD    !33:: "3399LL	
 ):%rC   )NN<|endoftext|>z<|startoftext|>rO   rO   )__name__
__module____qualname____doc__VOCAB_FILES_NAMESvocab_files_namesmodel_input_namesr	   rN   r+   dictintlistr8   r=   __classcell__)rA   s   @rB   r   r      s    . *$&67E .2)-(*((E5T#s(^#d*E5 d3i$&E5 	E5
 E5 E5 E5N:rC   r   N)rS   
tokenizersr   r   r   r   r   r   tokenizers.modelsr	   tokenization_utils_tokenizersr   utilsr   
get_loggerrP   loggerrT   r   __all__r*   rC   rB   <module>rb      sU    % Z Z ! >  
		H	%#/`pq o:% o:d 
rC   