
    qi7                         d Z ddlZddlZddlZddlZddlmZ ddlmZm	Z	 ddl
ZddlmZ ddlmZ erddlmZ dd	lmZmZ dd
lmZ  ej,                  e      ZddiZdZ ed       G d de             ZdgZy)z$Tokenization class for SigLIP model.    N)copyfile)TYPE_CHECKINGAny   )
AddedToken)SentencePieceBackend)	TextInput)loggingrequires_backends)requires
vocab_filezspiece.modelu   ▁)sentencepiece)backendsc            
           e Zd ZdZeZddgZ	 	 	 	 	 	 	 d"deee	f   dz  ddf fdZ
ed        Zd	 Z	 d#d
ee   dee   dz  dedee   f fdZdee   dee   fdZ	 d$d
ee   dee   dz  dee   fdZ	 d$d
ee   dee   dz  dee   fdZd Zd ZdedefdZdddZd%dddee   f fdZed        Zd Zd Zd Zd Zd$ded edz  dee   fd!Z  xZ!S )&SiglipTokenizera  
    Construct a Siglip tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
            contains the vocabulary necessary to instantiate a tokenizer.
        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end of sequence token.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        pad_token (`str`, *optional*, defaults to `"</s>"`):
            The token used for padding, for example when batching sequences of different lengths.
        additional_special_tokens (`list[str]`, *optional*):
            Additional special tokens used by the tokenizer.
        sp_model_kwargs (`dict`, *optional*):
            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
            SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
            to set:

            - `enable_sampling`: Enable subword regularization.
            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.

              - `nbest_size = {0,1}`: No sampling is performed.
              - `nbest_size > 1`: samples from the nbest_size results.
              - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
                using forward-filtering-and-backward-sampling algorithm.

            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
              BPE-dropout.
        model_max_length (`int`, *optional*, defaults to 64):
            The maximum length (in number of tokens) for model inputs.
        do_lower_case (`bool`, *optional*, defaults to `True`):
            Whether or not to lowercase the input when tokenizing.
    	input_idsattention_maskNsp_model_kwargsreturnc	                 R   t        | d       t        |t              rt        |dddd      n|}t        |t              rt        |dddd      n|}t        |t              rt        |dddd      n|}|i n|| _        || _        t        
|   d|||||| j                  ||d|	 y )NprotobufTF)rstriplstrip
normalizedspecial)r   	eos_token	unk_token	pad_tokenadditional_special_tokensr   model_max_lengthdo_lower_case )r   
isinstancestrr   r   r!   super__init__)selfr   r   r   r   r   r   r    r!   kwargs	__class__s             `/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/siglip/tokenization_siglip.pyr&   zSiglipTokenizer.__init__X   s     	$
+ )S) yduVZ[ 	 )S) yduVZ[ 	 )S) yduVZ[ 	 &5%<r/* 
	
!&? 00-'
	
 
	
    c                 6    | j                   j                         S N)sp_modelget_piece_sizer'   s    r*   
vocab_sizezSiglipTokenizer.vocab_size   s    }}++--r+   c                     t        | j                        D ci c]  }| j                  |      | }}|j                  | j                         |S c c}w r-   )ranger1   convert_ids_to_tokensupdateadded_tokens_encoder)r'   ivocabs      r*   	get_vocabzSiglipTokenizer.get_vocab   sK    ;@;QRa++A.1RRT../ Ss   Atoken_ids_0token_ids_1already_has_special_tokensc                     |rt         |   ||d      S |dgt        |      z  dgz   S dgt        |      z  dgz   dgt        |      z  z   dgz   S )a  
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`list[int]`):
                List of IDs.
            token_ids_1 (`list[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `list[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        T)r:   r;   r<   r      )r%   get_special_tokens_masklen)r'   r:   r;   r<   r)   s       r*   r?   z'SiglipTokenizer.get_special_tokens_mask   sy    $ &72'[]a 3  
 C#k**qc11c+&&1#-!s;7G1GHA3NNr+   	token_idsc                     t        |      dkD  r7|d   | j                  k(  r%t        j                  d| j                   d       |S || j                  gz   S )z.Do not add eos again if user already added it.r   zThis sequence already has zQ. In future versions this behavior may lead to duplicated eos tokens being added.)r@   eos_token_idwarningswarnr   )r'   rA   s     r*   _add_eos_if_not_presentz'SiglipTokenizer._add_eos_if_not_present   s]    y>A)B-43D3D"DMM,T^^,< =+ +  1 1222r+   c                 t    | j                   g}|t        ||z         dgz  S t        ||z   |z   |z         dgz  S )a  
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. T5 does not make
        use of token type ids, therefore a list of zeros is returned.

        Args:
            token_ids_0 (`list[int]`):
                List of IDs.
            token_ids_1 (`list[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `list[int]`: List of zeros.
        r   )rD   r@   )r'   r:   r;   eoss       r*   $create_token_type_ids_from_sequencesz4SiglipTokenizer.create_token_type_ids_from_sequences   sP        !{S()QC//;${2S89QC??r+   c                 X    | j                  |      }||S | j                  |      }||z   S )a  
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
        adding special tokens. A sequence has the following format:

        - single sequence: `X </s>`
        - pair of sequences: `A </s> B </s>`

        Args:
            token_ids_0 (`list[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`list[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `list[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        )rG   )r'   r:   r;   s      r*    build_inputs_with_special_tokensz0SiglipTokenizer.build_inputs_with_special_tokens   s;    & 22;?66{CK,,r+   c                 D    | j                   j                         }d |d<   |S )Nr.   )__dict__copy)r'   states     r*   __getstate__zSiglipTokenizer.__getstate__   s#    ""$ jr+   c                     || _         t        | d      si | _        t        j                  di | j                  | _        | j
                  j                  | j                         y )Nr   r"   )rN   hasattrr   spmSentencePieceProcessorr.   Loadr   )r'   ds     r*   __setstate__zSiglipTokenizer.__setstate__   sO     t./#%D 22JT5I5IJ4??+r+   textc                 j    |j                  t        j                  ddt        j                              S )N )	translater$   	maketransstringpunctuation)r'   rY   s     r*   remove_punctuationz"SiglipTokenizer.remove_punctuation   s$    ~~cmmBF4F4FGHHr+   keep_punctuation_exact_stringc                     j                   r|j                         }|r*|j                   fd|j                  |      D              }n j	                  |      }t        j                  dd|      }|j                         }|S )a  Returns canonicalized `text` (puncuation removed).

        Args:
            text (`str`):
                String to be canonicalized.
            keep_punctuation_exact_string (`str`, *optional*):
                If provided, then this exact string is kept. For example providing '{}' will keep any occurrences of '{}'
                (but will still remove '{' and '}' that appear separately).
        c              3   @   K   | ]  }j                  |        y wr-   )r`   ).0partr'   s     r*   	<genexpr>z4SiglipTokenizer.canonicalize_text.<locals>.<genexpr>  s!      626''-6s   z\s+ )r!   lowerjoinsplitr`   resubstrip)r'   rY   rb   s   `  r*   canonicalize_textz!SiglipTokenizer.canonicalize_text   sw     ::<D(055 6:>**Eb:c6 D **40Dvvfc4(zz|r+   r	   c                     t        |   t        |j                  t        d      z   fi |}t	        |      dkD  r"|d   t        k(  r|d   | j
                  v r|dd }|S )z8
        Converts a string to a list of tokens.
        rh   r>   r   N)r%   tokenizeSPIECE_UNDERLINEreplacer@   all_special_tokens)r'   rY   add_special_tokensr(   tokensr)   s        r*   rq   zSiglipTokenizer.tokenize  se     !"2T\\BRTW5X"Xc\bcv;?vay,<<dNeNeAeABZFr+   c                 p    t        | j                  j                  t        | j                                    S r-   )r@   r.   encoder$   r   r0   s    r*   unk_token_lengthz SiglipTokenizer.unk_token_length  s%    4==''DNN(;<==r+   c                    | j                  |d      }| j                  j                  |t              }| j                  j                  | j                  |z   t              }t        |      | j                  k\  r|| j                  d S |S )u*  
        Returns a tokenized string.

        We de-activated the `add_dummy_prefix` option, thus the sentencepiece internals will always strip any
        SPIECE_UNDERLINE.

        For example: `self.sp_model.encode(f"{SPIECE_UNDERLINE}Hey", out_type = str)` will give `['H', 'e', 'y']` instead of `['▁He', 'y']`.

        Thus we always encode `f"{unk_token}text"` and strip the `unk_token`. Here is an example with `unk_token = "<unk>"` and `unk_token_length = 4`.
        `self.tokenizer.sp_model.encode("<unk> Hey", out_type = str)[4:]`.
        Nra   )out_type)ro   r.   rx   r$   r   r@   ry   )r'   rY   r(   rv   s       r*   	_tokenizezSiglipTokenizer._tokenize  s     %%d$%O%%dS%9 %%dnnt&;c%J25f+AVAV2Vvd++-.b\bbr+   c                 8    | j                   j                  |      S )z0Converts a token (str) in an id using the vocab.)r.   piece_to_id)r'   tokens     r*   _convert_token_to_idz$SiglipTokenizer._convert_token_to_id2  s    }}((//r+   c                 <    | j                   j                  |      }|S )z=Converts an index (integer) in a token (str) using the vocab.)r.   	IdToPiece)r'   indexr   s      r*   _convert_id_to_tokenz$SiglipTokenizer._convert_id_to_token6  s    ''.r+   c                    g }d}d}|D ]P  }|| j                   v r-|s|dz  }|| j                  j                  |      |z   z  }d}g }>|j                  |       d}R || j                  j                  |      z  }|j	                         S )z:Converts a sequence of tokens (string) in a single string.r[   Frh   T)rt   r.   decodeappendrn   )r'   rv   current_sub_tokens
out_stringprev_is_specialr   s         r*   convert_tokens_to_stringz(SiglipTokenizer.convert_tokens_to_string;  s    
 
	(E///&#%Jdmm223EFNN
"&%'""))%0"'
	( 	dmm**+=>>
!!r+   save_directoryfilename_prefixc                    t         j                  j                  |      st        j	                  d| d       y t         j                  j                  ||r|dz   ndt        d   z         }t         j                  j                  | j                        t         j                  j                  |      k7  rBt         j                  j                  | j                        rt        | j                  |       |fS t         j                  j                  | j                        sCt        |d      5 }| j                  j                         }|j                  |       d d d        |fS |fS # 1 sw Y   |fS xY w)NzVocabulary path (z) should be a directory-r[   r   wb)ospathisdirloggererrorrj   VOCAB_FILES_NAMESabspathr   isfiler   openr.   serialized_model_protowrite)r'   r   r   out_vocab_fileficontent_spiece_models         r*   save_vocabularyzSiglipTokenizer.save_vocabularyN  s%   ww}}^,LL,^,<<STUo_s22QbcoQpp
 77??4??+rww~/NNSUSZSZSaSabfbqbqSrT__n5    0nd+ /r'+}}'K'K'M$-./     	/   s   +,E%%E0)</s>z<unk>r   NN@   T)NFr-   )F)"__name__
__module____qualname____doc__r   vocab_files_namesmodel_input_namesdictr$   r   r&   propertyr1   r9   listintboolr?   rG   rJ   rL   rQ   rX   r`   ro   rq   ry   r|   r   r   r   tupler   __classcell__)r)   s   @r*   r   r   +   s   &P *$&67
 "&15+
 c3h$.+
 
+
Z . . puO9O379t3COhlO	cO8	3c 	3tCy 	3 GK@9@379t3C@	c@. GK-9-379t3C-	c-4
,Is Is I HL 0[ QUVYQZ  > >c(0
"&!c !C$J !Z_`cZd !r+   r   )r   r   rl   r^   rE   shutilr   typingr   r   r   rT   tokenization_utils_baser    tokenization_utils_sentencepiecer   r	   utilsr
   r   utils.import_utilsr   
get_loggerr   r   r   rr   r   __all__r"   r+   r*   <module>r      s    + 	 	    %  1 D 4 / * 
		H	%!>2    
%&q!* q! 'q!h	 
r+   