
    qi,                        U d Z ddlmZmZ ddlmZ  ej                  e      ZdZ	dZ
dZdZdZd	Zd
Zedededede
dediZeeef   ed<   ej+                         D  ci c]  \  } }|| 
 c}} Zeeef   ed<    G d de      ZdgZyc c}} w )z Tokenization classes for CANINE.   )
AddedTokenPreTrainedTokenizer)loggingi       i   i  i  i  i  z[CLS]z[SEP]z[BOS]z[MASK]z[PAD]z
[RESERVED]SPECIAL_CODEPOINTSSPECIAL_CODEPOINTS_BY_NAMEc                        e Zd ZdZg dZ ee       ee       ee       ee       ee       ee	      ddf fd	Z
edefd       Zd Zd	edee   fd
ZdedefdZdedefdZd Z xZS )CanineTokenizera  
    Construct a CANINE tokenizer (i.e. a character splitter). It turns text into a sequence of characters, and then
    converts each character into its Unicode code point.

    [`CanineTokenizer`] inherits from [`PreTrainedTokenizer`].

    Refer to superclass [`PreTrainedTokenizer`] for usage examples and documentation concerning parameters.

    Args:
        model_max_length (`int`, *optional*, defaults to 2048):
                The maximum sentence length the model accepts.
    )	input_idsattention_masktoken_type_idsFi   c	                    t        |t              rt        |dd      n|}t        |t              rt        |dd      n|}t        |t              rt        |dd      n|}t        |t              rt        |dd      n|}t        |t              rt        |dd      n|}t        |t              rt        |dd      n|}i | _        t        j                         D ]  \  }
}|
| j                  |<    | j                  j                         D 
ci c]  \  }}
|
|
 c}
}| _        t        | _        t        | j                        | _
        t        | 0  d||||||||dddd|	 y c c}
}w )NF)lstriprstripT	all_zeroscls_sep)	bos_token	eos_token	sep_token	cls_token	pad_token
mask_tokenadd_prefix_spacemodel_max_lengthtoken_type_ids_pattern%token_type_ids_include_special_tokensspecial_tokens_pattern )
isinstancestrr   _special_codepointsr   items_special_codepoint_stringsUNICODE_VOCAB_SIZE_unicode_vocab_sizelen_num_special_tokenssuper__init__)selfr   r   r   r   r   r   r   r   kwargs	codepointname	__class__s               `/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/canine/tokenization_canine.pyr)   zCanineTokenizer.__init__G   sy    JTT]_bIcJyuEir	IST]_bIcJyuEir	IST]_bIcJyuEir	IST]_bIcJyuEir	IST]_bIcJyuEir	 KUU_adJeZ
4Fku
 46 1779 	7OIt-6D$$T*	7
 483K3K3Q3Q3S;
 /iItO;
' $6 #&t'?'?#@  	
!--#.26#,	
 	
;
s   E(returnc                     | j                   S N)r%   )r*   s    r/   
vocab_sizezCanineTokenizer.vocab_sizex   s    '''    c                     t        | j                        D ci c]  }t        |      | }}|j                  | j                         |S c c}w r2   )ranger3   chrupdateadded_tokens_encoder)r*   ivocabs      r/   	get_vocabzCanineTokenizer.get_vocab|   sB    $)$//$:;qQ;;T../ <s   A
textc                     t        |      S )z5Tokenize a string (i.e. perform character splitting).)list)r*   r=   s     r/   	_tokenizezCanineTokenizer._tokenize   s    Dzr4   tokenc                 R    	 t        |      S # t        $ r t        d| d      w xY w)zaConverts a token (i.e. a Unicode character) in an id (i.e. its integer Unicode code point value).zinvalid token: '')ord	TypeError
ValueError)r*   rA   s     r/   _convert_token_to_idz$CanineTokenizer._convert_token_to_id   s5    	:u: 	:/wa899	:s   
 &indexc                 r    	 |t         v r	t         |   S t        |      S # t        $ r t        d|       w xY w)z
        Converts a Unicode code point (integer) in a token (str). In case it's a special code point, convert to
        human-readable format.
        zinvalid id: )r   r7   rE   rF   )r*   rH   s     r/   _convert_id_to_tokenz$CanineTokenizer._convert_id_to_token   sF    
	5**)%00u: 	5|E7344	5s    
 6c                 $    dj                  |      S )N )join)r*   tokenss     r/   convert_tokens_to_stringz(CanineTokenizer.convert_tokens_to_string   s    wwvr4   )__name__
__module____qualname____doc__model_input_namesr7   CLSSEPPADMASKr)   propertyintr3   r<   r    r?   r@   rG   rJ   rO   __classcell__)r.   s   @r/   r
   r
   7   s     J c(c(c(c(c(t9/
b (C ( (
c d3i :# :# :
5# 
5# 
5r4   r
   N)rS   tokenization_pythonr   r   utilsr   
get_loggerrP   loggerr$   rW   rU   rV   BOSrX   RESERVEDr   dictrZ   r    __annotations__r"   r   r
   __all__)r,   r-   s   00r/   <module>re      s    ' B  
		H	%    (l& DcN   VhUmUmUo-p/)TdIo-p DcN pb) bJ 
Q .qs   B