
    qi                     2   d Z ddlZddlZddlmZ ddlmZmZ ddlm	Z	m
Z
mZmZmZmZmZmZ ddlmZmZmZmZ  ej,                  e      ZdZd	Zd
Z G d d      Z G d de      Zd Zd Zd Z d Z!d Z"de#e$   de$fdZ% ee	       G d de             Z&e&Z'y)z
Tokenization classes for python tokenizers. For fast tokenizers (provided by HuggingFace's tokenizers library) see
tokenization_utils_tokenizers.py
    N)OrderedDict)Anyoverload   )INIT_TOKENIZER_DOCSTRING
AddedTokenBatchEncodingEncodedInputPreTokenizedInputPreTrainedTokenizerBase	TextInputTruncationStrategy)PaddingStrategy
TensorTypeadd_end_docstringsloggingzspecial_tokens_map.jsonzadded_tokens.jsonztokenizer_config.jsonc                   D    e Zd ZdZd Zd ZdefdZdedee   fdZ	d	 Z
y
)Triez
    Trie in Python. Creates a Trie out of a list of words. The trie is used to split on `added_tokens` in one pass
    Loose reference https://en.wikipedia.org/wiki/Trie
    c                 \    i | _         t               | _        d| _         | j                  |  y )N )dataset_tokens_termination_charupdate)selfargss     R/opt/pipecat/venv/lib/python3.12/site-packages/transformers/tokenization_python.py__init__zTrie.__init__3   s(    	u!#T    c                 @    t        | D ]  }| j                  |        y)z
        Updates the Trie with new tokens provided as arguments.

        Args:
            *args: Variable number of words to be added to the Trie.
        N)tupleadd)r   r   tokens      r   r   zTrie.update9   s"     D\ 	EHHUO	r    wordc                     |sy| j                   j                  |       | j                  }|D ]  }|j                  |i       ||<   ||   } d|| j                  <   y)u  
        Passes over every char (utf-8 char) on word and recursively adds it to the internal `data` trie representation.
        The special key `""` in `self._termination_char` is used to represent termination.

        This function is idempotent, adding twice the same word will leave the trie unchanged

        Example:

        ```python
        >>> trie = Trie()
        >>> trie.add("Hello 友達")
        >>> trie.data
        {"H": {"e": {"l": {"l": {"o": {" ": {"友": {"達": {"": 1}}}}}}}}}

        >>> trie.add("Hello")
        >>> trie.data
        {"H": {"e": {"l": {"l": {"o": {"": 1, " ": {"友": {"達": {"": 1}}}}}}}}}
        ```
        Nr   )r   r#   r   
setdefaultr   )r   r%   refchars       r   r#   zTrie.addC   sc    ( ii 	DtR0CId)C	 '(D""#r    textreturnc                 h   t               }dg}d}t        |      D ]8  \  }}|r||k  rt               }d}|j                         D ]  \  }	}
d|
v r|j                         D ]q  \  }}||	kD  r ng||	k  r|dz   }|dz   }n|}|}|t	        |      k  r||   nd}d|v r|}	|}|}||v sE||   }|dz  }d|v r|}	|}|}|t	        |      k(  rh||   }||v r-s |j                  |	       |j                         d} n"||
v r|
|   }
|
||	<   |j                  |	        |ri }n
|D ]  }	||	=  ||k\  s|| j                  v s'| j                  |   ||<   ; |j                         D ]8  \  }	}
d|
v st	        |      }|j                  |	       |j                  |        n | j                  ||      S )aY  
        Will look for the words added to the trie within `text`. Output is the original string split along the
        boundaries of the words found.

        This trie will match the longest possible word first !

        Example:

        ```python
        >>> trie = Trie()
        >>> trie.split("[CLS] This is a extra_id_100")
        ["[CLS] This is a extra_id_100"]

        >>> trie.add("[CLS]")
        >>> trie.add("extra_id_1")
        >>> trie.add("extra_id_100")
        >>> trie.split("[CLS] This is a extra_id_100")
        ["[CLS]", " This is a ", "extra_id_100"]
        ```
        r   Fr   r   NT)	r   	enumerater   itemslenappendr#   r   cut_text)r   r*   statesoffsetsskipcurrentcurrent_char	to_removeresetstarttrie_pointer	lookstartlooktrie_pointerlookahead_indexend	next_chars                   r   splitz
Trie.splitb   sL   B 
 #
 %.t_ \	:!G\$  I E (.||~ ?)#|% 8>||~  >3	#3$u,!&. /6kO")A+C
 /6O")C=LsSWy=XD$9^b	!11$-E"1C#2D'+;;/?	/J,+q0O!%55(1&5'6.#d); %(,_(=I (+;;+ >H NN5)NN3' E!\1 $0#=L %1F5M MM%(?)F & &Eu&
 $<499#<"&))L"9wy\	:~ $*<<> 		E<\! $iu%s# 		 }}T7++r    c                     |j                  t        |             g }d}|D ]9  }||kD  rt        j                  d       ||k(  r$|j                  |||        |}; |S )Nr   zbThere was a bug in Trie algorithm in tokenization. Attempting to recover. Please report it anyway.)r0   r/   loggererror)r   r*   r3   tokensr9   r>   s         r   r1   zTrie.cut_text   st     	s4y! 	Cs{ # MM$uS/*E	 r    N)__name__
__module____qualname____doc__r   r   strr#   listr@   r1    r    r   r   r   -   s=    
( (>W,# W,$s) W,rr    r   c                   J     e Zd Z fdZdefdZdedefdZdedefdZ	 xZ
S )	ExtensionsTriec                     t        |   |  y N)superr   )r   r   	__class__s     r   r   zExtensionsTrie.__init__  s    $r    prefixc                 v    | j                  |      }| j                  |      }|D cg c]  }||z   	 c}S c c}w )aC  
        Generates all extensions of a given prefix token in the Trie.

        Example:

        ```python
        >>> trie = Trie()
        >>> trie.add("apple")
        >>> trie.add("app")
        >>> trie.add("application")
        >>> trie.extensions("app")
        ['app', 'apple', 'application']
        ```
        )	_get_node_collect_tokens)r   rR   prefix_noderetr$   s        r   
extensionszExtensionsTrie.extensions  s:     nnV,"";/,/05000s   6r$   r+   c                 D    | j                   }|D ]  }||vr |S ||   } |S )a  
        Retrieves the node corresponding to the given token in the Trie.

        Args:
            token (str): The token for which the corresponding node needs to be retrieved.

        Returns:
            dict: The node in the Trie corresponding to the given token.
        )r   )r   r$   noder)   s       r   rT   zExtensionsTrie._get_node+  s@     yy 	D4  :D		
 r    rZ   c                     | j                   |v r| j                   gng }|j                         D ]H  \  }}|| j                   k7  s| j                  |      }|j                  |D cg c]  }||z   	 c}       J |S c c}w )a  
        Generates all tokens in the Trie starting from a given node.

        Args:
            node (dict): The node in the Trie from which tokens need to be generated.

        Returns:
            list: List of tokens generated from the given node.
        )r   r.   rU   extend)r   rZ   rD   r$   subtrie_head	subtokenssubtokens          r   rU   zExtensionsTrie._collect_tokens=  s     .2-C-Ct-K$(()QS#'::< 	LE<... 00>		JHux/JK	L  Ks   $A;
)rE   rF   rG   r   rI   rX   dictrT   rJ   rU   __classcell__rQ   s   @r   rM   rM     s:     1 1&s t $D T r    rM   c                 d    | dk(  s| dk(  s
| dk(  s| dk(  ryt        j                  |       }|dk(  ryy)z0Checks whether `char` is a whitespace character. 	
TZsF)unicodedatacategoryr)   cats     r   _is_whitespacerm   O  s=     s{ddlddlddl


t
$C
d{r    c                 r    | dk(  s
| dk(  s| dk(  ryt        j                  |       }|j                  d      ryy)z-Checks whether `char` is a control character.re   rf   rg   FCT)ri   rj   
startswithrk   s     r   _is_controlrq   [  s<     t|tt|tt|


t
$C
~~cr    c                     t        |       }|dk\  r|dk  s|dk\  r|dk  s|dk\  r|dk  s
|dk\  r|dk  ry	t        j                  |       }|j                  d
      ry	y)z1Checks whether `char` is a punctuation character.!   /   :   @   [   `   {   ~   TPF)ordri   rj   rp   )r)   cprl   s      r   _is_punctuationr~   g  sj    	TB
 	bR2X28bbBh2QS8Y[_bYbgimpgp


t
$C
~~cr    c                 d    | d   }t        t        |      t        |      z  t        |      z        S )zcChecks whether the last character in text is one of a punctuation, control or whitespace character.boolrq   r~   rm   )r*   	last_chars     r   _is_end_of_wordr   v  s0    RII&)CCnU^F__``r    c                 d    | d   }t        t        |      t        |      z  t        |      z        S )zdChecks whether the first character in text is one of a punctuation, control or whitespace character.r   r   )r*   
first_chars     r   _is_start_of_wordr   |  s0    aJJ'/**EEWaHbbccr    
token_list	new_tokenc                     t        j                  | |      }|t        |       k  r	| |   |k(  ry| j                  ||       y)zm
    Inserts one token to an ordered list if it does not already exist. Note: token_list must be sorted.
    N)bisectbisect_leftr/   insert)r   r   insertion_idxs      r   !_insert_one_token_to_ordered_listr     sA     &&z9=Ms:&:m+D	+Q-3r    c            %           e Zd ZdZ fdZedefd       Zedee	e
f   fd       Zedee
ef   fd       Zej                  dee
ee	z  f   dee
ef   fd       Zdee	e
f   fd	Zd
 Zd ZdKdee	   ee   z  dede
fdZdLdee	   dz  fdZdKdede
fdZdedee	   fdZd Zd Zd Zddej8                  ej<                  ddddddddddddfdeez  e z  deez  e z  dz  dededed e
dz  d!e
d"ed#e
dz  d$e	dz  d%e	e!z  dz  d&edz  d'edz  d(ed)ed*ed+ede"f$d,Z#	 dKde	d"ede$e	ee	e%f   f   fd-Z&	 dLd.ee
   d/ee
   dz  dee
   fd0Z'	 dMd.ed/edz  d1edee
   f fd2Z(e)dKd3e
d4ede	fd5       Z*e)dKd3ee
   d4edee	   fd6       Z*dKd3e
ee
   z  d4ede	ee	   z  fd7Z*d8e
de	fd9Z+d:ee	   de	fd;Z,	 	 dNd<e
ee
   z  d4ed=edz  de	fd>Z-	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 dOd3ee
   d?ee
   dz  ded@ee	z  ez  dAee	z  ez  d e
dz  d!e
d#e
dz  d$e	dz  d%e	e!z  dz  d&edz  d'edz  d(ed)ed*ed+edBede"f$dCZ.	 	 	 	 dPd3ee
   d?ee
   dz  dDe
de	ez  d!e
de$ee
   ee
   ee
   f   fdEZ/	 dLd.ee
   d/ee
   dz  dee
   fdFZ0dLdGe	dHe	dz  de$e	dIf   fdJZ1 xZ2S )QPythonBackenda  
    Base class for all slow tokenizers.

    Inherits from [`~tokenization_utils_base.PreTrainedTokenizerBase`].

    Handle all the shared methods for tokenization and special tokens as well as methods downloading/caching/loading
    pretrained tokenizers as well as adding tokens to the vocabulary.

    This class also contain the added tokens in a unified way on top of all tokenizers so we don't have to handle the
    specific vocabulary augmentation methods of the various underlying dictionary structures (BPE, sentencepiece...).
    c                 l   t               | _        d| _        t        | d      si | _        | j                  j                  |j                  di              | j                  j                         D ci c]  \  }}|j                  | c}}| _	        |j                  dd      | _
        |j                  dd      | _        |j                  dd       | _        d	|vrd
|d	<   t        | 8  di | | j                  | j                   D cg c]  }|| j                  vs| c}d       y c c}}w c c}w )Nr   _added_tokens_decoderadded_tokens_decodertoken_type_ids_pattern
bert_style%token_type_ids_include_special_tokensTspecial_tokens_patternbackendcustom)special_tokensrK   )r   tokens_trietotal_vocab_sizehasattrr   r   popr.   content_added_tokens_encoderr   r   r   rP   r   _add_tokensall_special_tokens)r   kwargsvkr$   rQ   s        r   r   zPythonBackend.__init__  s*     6 !" t45@BD& 	""))&**5KR*PQOSOiOiOoOoOq5rtq!aiil5r" '-jj1I<&X#5;ZZ@gim5n2 '-jj1I4&P# F" (F9 	"6"
 	 $ 7 7cu5HbHb;bUc 	 	
+ 6s, ds   3D+	D1D1r+   c                      yNFrK   r   s    r   is_fastzPythonBackend.is_fast  s    r    c                     t        | j                  j                         d       D ci c]  \  }}|j                  | c}}S c c}}w )z
        Returns the sorted mapping from string to index. The added tokens encoder is cached for performance
        optimisation in `self._added_tokens_encoder` for the slow tokenizers.
        c                     | d   S Nr   rK   items    r   <lambda>z4PythonBackend.added_tokens_encoder.<locals>.<lambda>  s    eijkel r    key)sortedr   r.   r   )r   r   r   s      r   added_tokens_encoderz"PythonBackend.added_tokens_encoder  s;     *00J0J0P0P0RXl)mnA		1nnns   Ac                 `    t        t        | j                  j                         d             S )z
        Returns the added tokens in the vocabulary as a dictionary of index to AddedToken.

        Returns:
            `dict[str, int]`: The added tokens.
        c                     | d   S r   rK   r   s    r   r   z4PythonBackend.added_tokens_decoder.<locals>.<lambda>  s    PTUVPW r    r   )r`   r   r   r.   r   s    r   r   z"PythonBackend.added_tokens_decoder  s&     F455;;=CWXYYr    valuec           	         |j                         D ]  \  }}t        |t        t        f      rt        |t              s8t        d|j                  |j                  f dt        t        t        z  f       t        |t              rt        |      n|| j                  |<   || j                  t        |      <    | j                          y )Nz;The provided `added_tokens_decoder` has an element of type z, should be a dict of )
r.   
isinstancerI   r   int	TypeErrorrQ   r   r   _update_total_vocab_size)r   r   indexr$   s       r   r   z"PythonBackend.added_tokens_decoder  s     "KKM 	;LE5ec:%67z%QT?UQRWRaRachcrcrRrQs  tJ  KN  PZ  ]`  P`  K`  Ja  b  FPPUWZE[
50AafD&&u-5:D&&s5z2	; 	%%'r    c                     | j                   S )aX  
        Returns the added tokens in the vocabulary as a dictionary of token to index. Results might be different from
        the fast call because for now we always add the tokens even if they are already in the vocabulary. This is
        something we should change.

        Returns:
            `dict[str, int]`: The added tokens.
        )r   r   s    r   get_added_vocabzPythonBackend.get_added_vocab  s     )))r    c                 X    | j                   dk(  r| j                          | j                   S )zD
        Size of the full vocabulary with the added tokens.
        r   )r   r   r   s    r   __len__zPythonBackend.__len__  s*    
   A%))+$$$r    c                 @    t        | j                               | _        y)a!  
        Update the size of the full vocabulary with the added tokens. Counts the `keys` and not the `values` because
        otherwise if there is a hole in the vocab, we will add tokenizers at a wrong index. This operation is slow and
        is only updated when adding tokens.
        N)r/   	get_vocabr   r   s    r   r   z&PythonBackend._update_total_vocab_size  s     !$DNN$4 5r    F
new_tokensr   c           	      *   d}||S | j                         j                         }t        |      }|D ]  }t        |t        t
        f      st        d| dt        |       d      t	        |      dk(  rDt        |t              r3|| j                  v rc|| j                  v xs |}t        |dd| |      }n |r|j                  d|j                  d	       || j                  v r|j                  s8|j                  r,t        | d
d      r|j                  j!                         |_        |j                  |vr||z   }|||j                  <   |dz  }n||j                     }|j                  r2t	        |      | j                  vr| j"                  j%                  |       || j                  |<   || j                  |j                  <   | j&                  st(        j+                  d| d        | j-                          | j/                          |S )a  
        Add a list of new tokens to the tokenizer class. If the new tokens are not in the vocabulary, they are added to
        it with indices starting from length of the current vocabulary. Special tokens are sometimes already in the
        vocab which is why they have to be handled specifically.

        Args:
            new_tokens (`list[str]`or `list[tokenizers.AddedToken]`):
                Token(s) to add in vocabulary. A token is counted as added if it's not already in the vocabulary
                (tested by checking if the tokenizer assign the index of the `unk_token` to them). If a token is part
                of the vocabulary then we simply mark this token as an `AddedToken` which allows to control the
                stripping and normalization of this token. This is NOT possible in `tokenizers`.
            special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the tokens should be added as special tokens.

        Returns:
            `int`: The number of tokens actually added to the vocabulary.

        Examples:

        ```python
        # Let's see how to increase the vocabulary of Bert model and tokenizer
        tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")
        model = BertModel.from_pretrained("google-bert/bert-base-uncased")

        num_added_toks = tokenizer.add_tokens(["new_tok1", "my_new-tok2"])
        print("We have added", num_added_toks, "tokens")
        # Note: resize_token_embeddings expects to receive the full size of the new vocabulary, i.e. the length of the tokenizer.
        model.resize_token_embeddings(len(tokenizer))
        ```r   zToken z is not a string but a .r   F)rstriplstrip
normalizedspecialT)r   r   do_lower_caser   zAdding z to the vocabulary)r   copyr/   r   rI   r   r   typer   r   __setstate__r   r   r   getattrr   lower_extra_special_tokensr0   verboserB   info_update_trier   )	r   r   r   added_tokenscurrent_vocabnew_idxr$   
is_specialtoken_indexs	            r   r   zPythonBackend._add_tokens  s   < (--/m$ $	AEec:%67&/FtE{mST UVV5zR%%D666 "'$*A*A!A!S^J&eE*n^hE   ""t5CSCS#TU222==U%5%5'$Y^:_ % 3 3 5}}M1%4/:emm,!+EMM:}}U43J3J!J**11%86;D&&{38CD&&u}}5||geW,>?@I$	AL 	%%'r    Nunique_no_split_tokensc                 P   | j                   j                         D ]J  }|j                  | j                  j                  vs&| j                  j                  |j                         L |xs g D ]6  }|| j                  j                  vs| j                  j                  |       8 y rO   )r   valuesr   r   r   r#   )r   r   r$   s      r   r   zPythonBackend._update_trieR  s    //668 	4E}}D$4$4$<$<<  $$U]]3	4 ,1r 	,ED,,444  $$U+	,r    pairc                 X    g }g }t        | j                  ||r
|            S d            S )aG  
        Returns the number of added tokens when encoding a sequence with special tokens.

        <Tip>

        This encodes a dummy input and checks the number of added tokens, and is therefore not efficient. Do not put
        this inside your training loop.

        </Tip>

        Args:
            pair (`bool`, *optional*, defaults to `False`):
                Whether the number of added tokens should be computed in the case of a sequence pair or a single
                sequence.

        Returns:
            `int`: Number of special tokens added to sequences.
        N)r/    build_inputs_with_special_tokens)r   r   token_ids_0token_ids_1s       r   num_special_tokens_to_addz'PythonBackend.num_special_tokens_to_addZ  s5    & 488UYkdee_cdeer    r*   c                    |j                  d| j                        } | j                  |fi |\  }}|r| j                  |      S | j                  j                  |      }| j                  j                         }t        |      D ]  \  }}||v s| j                  j                  | j                  |         }|dkD  r||dz
     nd}	|t        |      dz
  k  r||dz      nd}
t        |t              so|j                  r|
r|
j                         ||dz   <   |j                  r|	r|	j                         ||dz
  <   |j                   s|	r|	d   dk7  r||dz
  xx   |z  cc<   d||<   |
s|
d   dk7  s|||dz      z   ||dz   <   d||<    g }t#        | j$                        }|D ]?  }|s||v s||v r|j'                  |        |j)                  | j                  |             A |S )a,  
        Converts a string into a sequence of tokens, using the tokenizer.

        Args:
            text: The sequence to be encoded.
            **kwargs: Passed along to the model-specific `prepare_for_tokenization` preprocessing method.

        Returns:
            The list of tokens.
        split_special_tokensr   r   Nr   rd   r   )r   r   prepare_for_tokenization	_tokenizer   r@   r   keysr-   r   getr/   r   r   r   r   single_wordr   r   r0   r\   )r   r*   r   r   rD   no_split_tokenir$   tok_extendedleftrightresultall_special_tokens_sets                r   tokenizezPythonBackend.tokenizeq  s     &zz*@$B[B[\4t44TDVDf>>$'' !!''-3388: "&) 	+HAu&#99==d>X>XY^>_`()Ava!e}4)*S[1_)<q1u$lJ7#**u(-q1u#**t(,q1u#//DHO"1q5MU2M(*F1I"uQx3,1F1q5M,AF1q5M(*F1I#	+( !$T%<%<!= 	5E&%3I*Ie$dnnU34	5 r    c                     t         )a  
        Converts a string into a sequence of tokens (string), using the tokenizer. Split in words for word-based
        vocabulary or sub-words for sub-word-based vocabularies (BPE/SentencePieces/WordPieces).

        Do NOT take care of added tokens.
        NotImplementedError)r   r*   r   s      r   r   zPythonBackend._tokenize  s
     "!r    c                 ^    || j                   v r| j                   |   S | j                  |      S rO   )r   _convert_token_to_idr   r$   s     r   #_convert_token_to_id_with_added_vocz1PythonBackend._convert_token_to_id_with_added_voc  s2    D---,,U33((//r    c                     t         rO   r   r   s     r   r   z"PythonBackend._convert_token_to_id      !!r    Tr   	text_pairadd_special_tokenspadding_strategytruncation_strategy
max_lengthstrideis_split_into_wordspad_to_multiple_ofpadding_sidereturn_tensorsreturn_token_type_idsreturn_attention_maskreturn_overflowing_tokensreturn_special_tokens_maskreturn_lengthr   c                     t        |t        t        f      xrT | xr  xsJ |xr xr t        |d   t        t        f      xs' |xr#  xr t        |d   t        t        t        f      }|r|8t        |t        t        f      rt	        |      t	        |      k7  rt        d      ||nd gt	        |      z  }i }t        ||      D ]c  \  }}t        |t        t        f      r|rt        |d   t              s|t	        |      dk(  xr^ t        |d   t              xs t        |d   t        t        f      xr. t        |d   t              xs t        |d   t        t        f      }|r|\  }}n.t	        |      dk(  r|d   }nst        dt	        |       d        j                  d i d|d|d	|d
t        j                  d|d|d|ddd dd dd d|ddd|d|d|d|}|j                         D ]&  \  }}|j                  |g       j                  |       ( f |r&|r$|j                  dd        |j                  dd         j                  ||j                   ||	|
|      }t#        ||      S  fd} ||      }| ||      nd } j%                  ||||j                   |j                   |||	|
|d||||||      S )!Nr   zJIf `text` is a batch, `text_pair` must also be a batch of the same length.   r   z"Expected a pair of sequences, got z sequences.r*   r   r   r   r   r   r   r  r  r  r  r  r  Fr  r  r	  r   overflowing_tokensnum_truncated_tokens)paddingr   r  r  r  )tensor_typec                    t        | t              r"j                   j                  | fi       S t        | t        t
        f      rv| rtt        | d   t              r| S t        | d   t              rLr9j                  | D cg c]  } j                  |fi D ]  }|  c}}      S j                  |       S t        dt        |              c c}}w )Nr   z?Input must be a string, list of strings, or list of ints, got: )	r   rI   convert_tokens_to_idsr   rJ   r"   r   
ValueErrorr   )r*   r%   tokr  r   r   s      r   get_input_idsz1PythonBackend._encode_plus.<locals>.get_input_ids  s    $$11-$--2O2OPP$u.4d1gs+Kd1gs+*#99-1[T]T]]4=ZSY=Z[cS[S[    55d;;^_cdh_i^jkll \s   !C
T)pair_idsr   r  
truncationr   r   r  r  r  prepend_batch_axisr  r  r  r  r	  r   rK   )r   rJ   r"   rI   r/   r  zipr   _encode_plusr   
DO_NOT_PADr.   r'   r0   r   padr   r	   prepare_for_model)r   r*   r   r   r   r   r   r   r  r  r  r  r  r  r  r  r	  r   r   
is_batchedpairsbatch_outputscurrent_textcurrent_pairis_paircurrent_outputr   r   r  	first_ids
second_idss   `       `         `            r   r  zPythonBackend._encode_plus  s   ,  tUm4 
X111 ^S,SDGdE]1S^\00\ZQ#tUZI[5\ 	 $!)dE];s9~QTUYQZ?Z$%qrr!*!6ITFSY<NEM.1$.> -D*l
 |dE];$&|A<$, L)Q. m'Q=kLYZO^bdi]jAkm'Q=kLYZO^bdi]jAk 
 5A2l\*a/'3A0(+McR^N_M``k)lmm!2!2!2 "%"*" (:" &5%?%?	"
 )<"  *" "" )<" (," "&" $(" +@" +0" /H" 0J"  #0!"" $%"( #1"6"6"8 DJC!,,S"5<<UCDY-Db ";!!"6=!!"8$? HH(..%#5)&; % M !NKK	m  "$'	1:1F]9-D
%%1$***00!1%)#"7"7&?'A'# & 
 	
r    c                 
    ||fS )a  
        Performs any necessary transformations before tokenization.

        This method should pop the arguments from kwargs and return the remaining `kwargs` as well. We test the
        `kwargs` at the end of the encoding process to be sure all the arguments have been used.

        Args:
            text (`str`):
                The text to prepare.
            is_split_into_words (`bool`, *optional*, defaults to `False`):
                Whether or not the input is already pre-tokenized (e.g., split into words). If set to `True`, the
                tokenizer assumes the input is already split into words (for instance, by splitting it on whitespace)
                which it will tokenize. This is useful for NER or token classification.
            kwargs (`dict[str, Any]`, *optional*):
                Keyword arguments to use for the tokenization.

        Returns:
            `tuple[str, dict[str, Any]]`: The prepared text and the unused kwargs.
        rK   )r   r*   r  r   s       r   r   z&PythonBackend.prepare_for_tokenizationD  s    , f~r    r   r   c                    | j                   dk(  r| j                  2| j                  &t        d| j                   d| j                   d      || j                  g|z   | j                  gz   S | j                  g|z   | j                  gz   |z   | j                  gz   S | j                   dk(  rX| j                  t        d| j                   d      ||| j                  gz   S || j                  gz   |z   | j                  gz   S | j                   dk(  rX| j
                  t        d	| j
                   d      || j
                  g|z   S | j
                  g|z   | j
                  gz   |z   S | j                   d
k(  r| j
                  2| j                  &t        d| j
                   d| j                   d      || j
                  g|z   | j                  gz   S | j
                  g|z   | j                  gz   |z   | j                  gz   S | j                   dk(  r| j                  2| j                  &t        d| j                   d| j                   d      || j                  g|z   | j                  gz   S | j                  g|z   | j                  | j                  gz   |z   | j                  gz   S | j                   dk(  r/t        | dg       }t        | dg       }|||z   |z   S ||z   |z   |z   S ||S ||z   S )a	  
        Build model inputs from a sequence or a pair of sequences by adding special tokens.

        This method dynamically builds inputs based on the tokenizer's `special_tokens_pattern`:
        - `"none"`: No special tokens
        - `"cls_sep"`: [CLS] seq0 [SEP] or [CLS] seq0 [SEP] seq1 [SEP]
        - `"eos"`: seq0 [EOS] or seq0 [EOS] seq1 [EOS]
        - `"bos"`: [BOS] seq0 or [BOS] seq0 [BOS] seq1
        - `"bos_eos"`: [BOS] seq0 [EOS] or [BOS] seq0 [EOS] seq1 [EOS]
        - `"cls_double_sep"`: [CLS] seq0 [SEP] or [CLS] seq0 [SEP] [SEP] seq1 [SEP]
        - `"prefix_suffix"`: `<prefix_tokens> seq0 [seq1] <suffix_tokens>` (custom prefix/suffix stored on the tokenizer)

        Args:
            token_ids_0 (`list[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`list[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `list[int]`: List of input IDs with the appropriate special tokens.
        cls_sepzzCannot add special tokens following 'cls_sep' pattern because one or several special tokens are not defined (cls_token_id=z; sep_token_id=zZ)Set the required special tokens in tokenizer or update `tokenizer.special_tokens_pattern`eoszaCannot add special tokens following 'eos' pattern because eos token is not defined (eos_token_id=z[).Set the required special tokens in tokenizer or update `tokenizer.special_tokens_pattern`boszaCannot add special tokens following 'bos' pattern because bos token is not defined (bos_token_id=bos_eoszzCannot add special tokens following 'bos_eos' pattern because one or several special tokens are not defined (bos_token_id=z; eos_token_id=cls_double_sepzCannot add special tokens following 'cls_double_sep' pattern because one or several special tokens are not defined (cls_token_id=prefix_suffixprefix_tokenssuffix_tokens)r   cls_token_idsep_token_idr  eos_token_idbos_token_idr   )r   r   r   r.  r/  s        r   r   z.PythonBackend.build_inputs_with_special_tokens\  s   0 &&)3  (T->->-F 5595F5F4GW[WhWhVi jpp 
 "))*[8D<M<M;NNN%%&48I8I7JJ[X\`\m\m[nnn((E1  ( %%)%6%6$7 8pp 
 ""d&7&7%888$"3"3!44{BdFWFWEXXX((E1  ( %%)%6%6$7 8pp 
 "))*[88%%&48I8I7JJ[XX((I5  (T->->-F 5595F5F4GW[WhWhVi jpp  "))*[8D<M<M;NNN%%&48I8I7JJ[X\`\m\m[nnn((,<<  (T->->-F 5595F5F4GW[WhWhVi jpp 
 "))*[8D<M<M;NNN""#$$d&7&789  $$%	& ((O;#D/2>M#D/2>M"${2]BB ;.<}LL """,,r    already_has_special_tokensc                    |r|t        d      t        | 	  ||d      S | j                  dk(  rD|dgdgt	        |      z  z   dgz   S dgdgt	        |      z  z   dgz   dgt	        |      z  z   dgz   S | j                  dk(  r<|dgt	        |      z  dgz   S dgt	        |      z  dgz   dgt	        |      z  z   dgz   S | j                  dk(  r<|dgdgt	        |      z  z   S dgdgt	        |      z  z   dgz   dgt	        |      z  z   S | j                  d	k(  rD|dgdgt	        |      z  z   dgz   S dgdgt	        |      z  z   dgz   dgt	        |      z  z   dgz   S | j                  d
k(  rE|dgdgt	        |      z  z   dgz   S dgdgt	        |      z  z   ddgz   dgt	        |      z  z   dgz   S | j                  dk(  rat	        t        | dg             }t	        t        | dg             }dg|z  dgt	        |      z  z   }||dgt	        |      z  z  }|dg|z  z  }|S dg|rt	        |      ndt	        |      z   z  S )a"  
        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` or `encode_plus` methods.

        This method dynamically builds the special tokens mask based on the tokenizer's `special_tokens_pattern`:
        - `"none"`: No special tokens (default, returns all 0s)
        - `"cls_sep"`: [CLS] seq0 [SEP] or [CLS] seq0 [SEP] seq1 [SEP]
        - `"eos"`: seq0 [EOS] or seq0 [EOS] seq1 [EOS]
        - `"bos"`: [BOS] seq0 or [BOS] seq0 [BOS] seq1
        - `"bos_eos"`: [BOS] seq0 [EOS] or [BOS] seq0 [EOS] seq1 [EOS]
        - `"cls_double_sep"`: [CLS] seq0 [SEP] or [CLS] seq0 [SEP] [SEP] seq1 [SEP]
        - `"prefix_suffix"`: `<prefix_tokens> seq0 [seq1] <suffix_tokens>`

        Args:
            token_ids_0 (`list[int]`):
                List of ids of the first sequence.
            token_ids_1 (`list[int]`, *optional*):
                List of ids of the second sequence.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        zYou should not supply a second sequence if the provided sequence of ids is already formatted with special tokens for the model.T)r   r   r4  r(  r   r   r)  r*  r+  r,  r-  r.  r/  )r  rP   get_special_tokens_maskr   r/   r   )r   r   r   r4  
prefix_len
suffix_lenmaskrQ   s          r   r6  z%PythonBackend.get_special_tokens_mask  s   6 && R 
 72'[]a 3   &&)3"sqcC$445;;31#K 001QC7A3[AQ;QRVWUXXX((E1"c+..1#55C#k**qc1aS3{;K5KLPQsRR((E1"sqcC$445531#K 001QC7A3[AQ;QRR((I5"sqcC$445;;31#K 001QC7A3[AQ;QRVWUXXX((,<<"sqcC$445;;31#K 001QF:qcCDT>TUYZX[[[((O;WT?B?@JWT?B?@J3#sS-='=>D&c+...QC*$$DK 3{3{+SEUUVVr    idsskip_special_tokensc                      y rO   rK   r   r:  r;  s      r   convert_ids_to_tokensz#PythonBackend.convert_ids_to_tokens  s    Y\r    c                      y rO   rK   r=  s      r   r>  z#PythonBackend.convert_ids_to_tokens  s    ehr    c                 n   t        |t              r8|| j                  v r| j                  |   j                  S | j	                  |      S g }|D ]e  }t        |      }|r|| j
                  v r|j                  || j                  v r| j                  |   j                  n| j	                  |             g |S )a  
        Converts a single index or a sequence of indices in a token or a sequence of tokens, using the vocabulary and
        added tokens.

        Args:
            ids (`int` or `list[int]`):
                The token id (or token ids) to convert to tokens.
            skip_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not to remove special tokens in the decoding.

        Returns:
            `str` or `list[str]`: The decoded token(s).
        )r   r   r   r   _convert_id_to_tokenall_special_idsr0   )r   r:  r;  rD   r   s        r   r>  z#PythonBackend.convert_ids_to_tokens  s     c3 $444 **3/77 ..s3  	EJE"u0D0D'DMMD666 **5199..u5		 r    r   c                     t         rO   r   )r   r   s     r   rA  z"PythonBackend._convert_id_to_token<  r   r    rD   c                 $    dj                  |      S )Nrd   )join)r   rD   s     r   convert_tokens_to_stringz&PythonBackend.convert_tokens_to_string?  s    xxr    	token_idsclean_up_tokenization_spacesc                     | j                  ||      }t        |t              r|g}| j                  |      }||n| j                  }|r| j                  |      }|S )zDecode token ids to string.)r;  )r>  r   rI   rF  rH  clean_up_tokenization)r   rG  r;  rH  r   filtered_tokensr*   s          r   _decodezPythonBackend._decodeB  sq     44YTg4hos+./O,,_=
 ,7 )22 	%
 (--d3Dr    r  r  r  r  c           	          | j                   d|||||d|\  }}}}|r |t        j                  k(  r|t        d      |d| j                  v }|d| j                  v }|du}|r| j                  |      nd}t        |      t        |xs g       z   |z   }g }|t        j                  k7  r$|r"||kD  r| j                  ||||z
  ||      \  }}}|r%| j                  ||      }| j                  ||      }n||r|ng z   }dgt        |      z  }d	|i}|r||d<   |r&|r| j                  ||      ndgt        |      z  |d
<   |r|
s|r||d<   |r||z
  nd|d<   | j                  |d	   ||       |t        j                  k7  s|r!| j                  |||j                   ||	|      }|rt        |d	         |d<   t#        ||
|      S )a  
        Prepares a sequence of input ids so it can be used by the model. Adds special tokens, truncates, and pads.

        Args:
            ids: Tokenized input ids of the first sequence.
            pair_ids: Tokenized input ids of the second sequence (optional).
        )r  r  r   r  r   NzNot possible to return overflowing tokens for pair of sequences with the `longest_first`. Please select another truncation strategy than `longest_first`, for instance `only_second` or `only_first`.token_type_idsattention_mask)r   r   )r  num_tokens_to_remover   r   	input_idsspecial_tokens_maskr  r  )r   r  r  r  r  length)r  r  rK   )"_get_padding_truncation_strategiesr   LONGEST_FIRSTr  model_input_namesr   r/   DO_NOT_TRUNCATEtruncate_sequencesr   $create_token_type_ids_from_sequencesr6  &_eventual_warn_about_too_long_sequencer   r  r  r   r	   )r   r:  r  r   r  r  r   r   r  r  r  r  r  r  r  r	  r   r  r   r   r   _r   num_special	total_lenr  sequencerN  encoded_inputss                                r   r  zPythonBackend.prepare_for_model[  sq   : @gt?f?f @
!!1@
 @
<-z1 &#'9'G'GG$>  !($48N8N$N! ($48N8N$N! t#CUd44$4?[\Hs8>r22[@	"4"D"DDXadnXn040G0G!%.%;$7 1H 1-C- <<S(KH!FFsHUN(h;HS3x=0N &x0 /=N+,%?Q,,S(;XYWZ]`ai]jWj 01 %^@R3EN/0OYY5K_`N12 	33N;4OQ[]de 999=R!XX%(..#5)&; & N '*>++F'GN8$^\noor    rP  c                    |dk  r||g fS t        |t              st        |      }g }|t        j                  k(  s|t        j                  k(  rC|At	        t        |      ||z         }| j                  dk(  r|d| }||d }n=|| d }|d|  }n/|t        j                  k(  rt        j                  dt        j                  j                   d       t        |      |rt        |      nd}	}t	        t        |	|z
        |      }
||
z
  }||	kD  r|
|dz  z   }||dz  z
  }n|dz  }|
|z   |dz  z
  }| j                  dk(  r|dkD  r|d|  n|}|r|dkD  r|d|  n|}nb||d }|r||d nd}nS|t        j                  k(  r@|r>t	        t        |      ||z         }| j                  dk(  r|| d }|d|  }n
|d| }||d }|||fS )z8Truncates sequences according to the specified strategy.r   Nr   zmBe aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'zg' truncation strategy. So the returned list will always be empty even if some tokens have been removed.r  r   )r   r   
ONLY_FIRSTrU  minr/   truncation_siderB   warningr   absONLY_SECOND)r   r:  r  rP  r   r   r  
window_lenlen_idslen_pairfirst_removesecond_removeids_to_movepair_ids_to_moves                 r   rX  z PythonBackend.truncate_sequences  sZ     1$"$$-/AB"45H"I "4"?"??#5#C#CCHXSXv0D'DEJ##v-%(*%5"./0%(*%6"0001 !$6$D$DDNN22D2R2R2X2X1Y Z,, !$C8#h-XGs8g#568LML0<?M!*]a-??#0=A3E#E +q0#/-#?=TUCU#V ##w.,7!Oc-K<(;CHX[\H\8$6&6%67bj+,':B8$4$56 !$6$B$BBxS]F5I,IJJ##w.%-zkl%;"#$:&:%:;%-kz%:"#$8$9:H000r    c                    | j                   r^|| j                  |      }t        |      }d}nW| j                  ||      }| j                  |      }t        |      }t        |      |z
  }nt        |      }|t        |      nd}| j                  dk(  rQt        t	        | dg             t        |      z   }||t        |      z  }|t        t	        | dg             z  }dg|z  S | j
                  dk(  r|dg|z  dg|z  z   S dg||z   z  S )a  
        Create a mask from the two sequences passed to be used in a sequence-pair classification task.

        This method dynamically builds the token type IDs based on the tokenizer's configuration attributes:
        - `token_type_ids_pattern`: Pattern to use ("all_zeros" or "bert_style")
        - `token_type_ids_include_special_tokens`: Whether to account for special tokens in length calculation

        Args:
            token_ids_0 (`list[int]`):
                List of IDs.
            token_ids_1 (`list[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `list[int]`: Token type IDs according to the configured pattern.

        Examples:
            ```python
            # All zeros pattern (default, used by RoBERTa, BART, etc.)
            tokenizer.token_type_ids_pattern = "all_zeros"
            # Returns: [0, 0, 0, ...] for both sequences

            # BERT-style pattern (first sequence gets 0s, second gets 1s)
            tokenizer.token_type_ids_pattern = "bert_style"
            # Returns: [0, 0, 0, ..., 1, 1, 1, ...] for sequence pairs
            ```
        r   r-  r.  r/  r   r   )r   r   r/   r   r   r   )	r   r   r   r^  seq0_lenseq1_lenfull_sequenceseq0_with_specialr]  s	            r   rY  z2PythonBackend.create_token_type_ids_from_sequences  s2   > 55"@@Mx= $ E EkS^ _ %)$I$I+$V!01}-8 ;'H+6+Bs;'H &&/9GD/2>?#kBRRI&S--	WT?B?@@I3?"&&,6;;R3>QC(N22 3(X-..r    save_directoryfilename_prefix.c           	         ddl }ddl}t        | dd      xs t        | dd      }|y|j                  j	                  |      st
        j                  d| d       yt        | di       }|r| d	nd
}|j                  j                  |||j                  dd      z         }t        |dd      5 }	|	j                  |j                  |ddd      dz          ddd       t        | dd      }
|
|fS |j                  j                  |||j                  dd      z         }t        |dd      5 }t        | dd      r|j                  d       d}t        |
j                         d       D ]M  \  }}||k7  rt
        j                  d| d       |}|j                  dj                  |      dz          |dz  }O 	 ddd       ||fS # 1 sw Y   xY w# 1 sw Y   ||fS xY w) a  
        Default implementation for common vocabulary saving patterns.
        Saves self.encoder/self.vocab as JSON, optionally with self.bpe_ranks as merges.
        Returns empty tuple if no vocabulary exists.

        Override this method if your tokenizer needs custom saving logic (e.g., SentencePiece models,
        multiple vocabulary files, or special file formats).

        Args:
            save_directory (`str`):
                The directory in which to save the vocabulary.
            filename_prefix (`str`, *optional*):
                An optional prefix to add to the named of the saved files.

        Returns:
            `tuple[str, ...]`: Paths to the files saved, or empty tuple if no files saved.
        r   NencodervocabrK   zVocabulary path (z) should be a directoryvocab_files_names-r   
vocab_filez
vocab.jsonwzutf-8)encodingr  TF)indent	sort_keysensure_asciirf   	bpe_ranksmerges_filez
merges.txtadd_bpe_version_headerz#version: 0.2
c                     | d   S )Nr   rK   )kvs    r   r   z/PythonBackend.save_vocabulary.<locals>.<lambda>|  s    TVWXTY r    r   zSaving vocabulary to zZ: BPE merge indices are not consecutive. Please check that the tokenizer is not corrupted!rd   r   )jsonosr   pathisdirrB   rC   rE  r   openwritedumpsr   r.   rd  )r   rs  rt  r  r  
vocab_attrrx  rR   rz  fr  
merge_filewriterr   
bpe_tokensr   s                   r   save_vocabularyzPythonBackend.save_vocabularyL  s   $ 	T9d3SwtWd7S
ww}}^,LL,^,<<STU#D*=rB*9O$A&r WW\\.&;L;P;PQ]_k;l2lm
*cG4 	aGGDJJz!tRWJX[__`	a D+t4	= WW\\.&;L;P;PQ^`l;m2mn
*cG4 	t5u=./E+1)//2CIY+Z '
KK'NN/
| <M M (ESXXj1D89
	 J''1	a 	a	 J''s   0(G	/BG	GG!)FrO   r   )FN)NTFFNr   NNNNNFFFTF)Nr   longest_firstr   )3rE   rF   rG   rH   r   propertyr   r   r`   rI   r   r   r   r   setterr   r   r   rJ   r   r   r   r   r   r   r   r   r   r  r   rW  r   r
   r   r	   r  r"   r   r   r   r6  r   r>  rA  rF  rL  r  rX  rY  r  ra   rb   s   @r   r   r     sW   
&
P    od38n o o Zd3
?&; Z Z   
($sJ4D/D*E 
($sT^J_ 
( !
(	*c3h 	*%6Ld3i$z2B&B LTX Leh L\,49t3C ,fd fs f.5Y 5T#Y 5n"0
" JN#',;,F,F2D2T2T!%$))-#'26-1-1*/+0#%I
++l:I
 00<?$FI
 !	I

 *I
 0I
 $JI
 I
 "I
  $JI
 DjI
 j(4/I
  $d{I
  $d{I
 $(I
  %)!I
" #I
$ %I
( 
)I
X 6;.2	sDcN"	#2 GKg-9g-379t3Cg-	cg-T fkNWNW.2TkNW^bNW	cNW` \\4\TW\ \hchhZ^_bZch htCy t `cfjknfo`o B"# "# " tCy  S   %*48	c? " '+Tk	 
8 &*#'056;!%)-#'26-1-1*/+0##(%lp#Ylp s)d"lp !	lp
 o-lp 3J!33lp $Jlp lp  $Jlp Djlp j(4/lp  $d{lp  $d{lp $(lp %)lp  !lp" #lp$ !%lp( 
)lpb &*$%8GA1#YA1 s)d"A1 "	A1
 !#55A1 A1 
tCy$s)T#Y.	/A1H GK>/9>/379t3C>/	c>/@:(c :(C$J :(Z_`ceh`hZi :(r    r   )(rH   r   ri   collectionsr   typingr   r   tokenization_utils_baser   r   r	   r
   r   r   r   r   utilsr   r   r   r   
get_loggerrE   rB   SPECIAL_TOKENS_MAP_FILEADDED_TOKENS_FILETOKENIZER_CONFIG_FILEr   rM   rm   rq   r~   r   r   rJ   rI   r   r   PreTrainedTokenizerrK   r    r   <module>r     s   
   #  	 	 	 L K 
		H	% 4 ' / d dN8T 8v		ad
4$s) 
4 
4 ,-v(+ v( .v(t $ r    