
    qiA                         d Z ddlmZmZmZmZmZmZ ddlm	Z	 ddl
mZ ddlmZ  ej                  e      ZddiZ G d	 d
e      Zd
gZy)zTokenization classes for XGLM.    )Regex	Tokenizerdecodersnormalizerspre_tokenizers
processors)Unigram   )TokenizersBackend)loggingtokenizer_fileztokenizer.jsonc                        e Zd ZdZeZddgZeZ	 	 	 	 	 	 	 	 dde	e
ee	ef      z  dz  de	de	de	d	e	d
e	de	def fdZ xZS )XGLMTokenizeraW  
    Construct a XGLM tokenizer (backed by HuggingFace's tokenizers library). Based on BPE.

    This tokenizer inherits from [`TokenizersBackend`] which contains most of the main methods. Users should
    refer to this superclass for more information regarding those methods.

    Args:
        tokenizer_file (`str`, *optional*):
            Path to a tokenizers JSON file containing the serialization of a tokenizer.
        bos_token (`str`, *optional*, defaults to `"<s>"`):
            The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end of sequence token.
        sep_token (`str`, *optional*, defaults to `"</s>"`):
            The separator token, which is used when building a sequence from multiple sequences.
        cls_token (`str`, *optional*, defaults to `"<s>"`):
            The classifier token which is used when doing sequence classification.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding.
        vocab (`str`, `dict` or `list`, *optional*):
            Custom vocabulary dictionary. If not provided, a minimal vocabulary is created.
        merges (`list[tuple[str, str]]`, *optional*):
            Custom merge rules for BPE. If not provided, merges are generated from the vocabulary.
        add_prefix_space (`bool`, *optional*, defaults to `True`):
            Whether to add a prefix space before encoding.
    	input_idsattention_maskNvocab	bos_token	eos_token	sep_token	cls_token	unk_token	pad_tokenadd_prefix_spacec	                    d| _         t        | j                         D 
cg c]  }
d|
 d
 }}
|	j                  dg       xs g |	d<   |	dxx   |D cg c]  }||	d   vs| c}z  cc<   || _        ||| _        n7t        |      dft        |      dft        |      dft        |      dfg| _        t        t        | j                  dd            | _        t        j                  t        j                  t        d	      d
      t        j                         t        j                  t        d      d
      g      | j                  _        |rdnd}t        j                   d|      | j                  _        t%        j                   d|      | j                  _        t)        | T  d|||||||d|	 t-        j.                  | j0                   d| j0                   | j0                   d| j0                   d
| j0                   d| j0                   | j2                  | j4                  f| j0                  | j6                  fg      | j                  _        y c c}
w c c}w )N   z<madeupword>additional_special_tokensg        r
   F)r   unk_idbyte_fallbackz[\n\r\t] z {2,}alwaysneveru   ▁)replacementprepend_scheme)r   r   r   r   r   r   r   z $A z $B )singlepairspecial_tokens )num_madeup_wordsrangegetr   _vocabstrr   r	   
_tokenizerr   SequenceReplacer   NFKC
normalizerr   	Metaspacepre_tokenizerr   decodersuper__init__r   TemplateProcessingr   r   bos_token_ideos_token_idpost_processor)selfr   r   r   r   r   r   r   r   kwargsimadeup_wordswordr$   	__class__s                 \/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/xglm/tokenization_xglm.pyr7   zXGLMTokenizer.__init__>   sF    !"49$:O:O4PQq+aS*QQ.4jj9TVX.Y._]_*+*+)0
T@[9\-\D0
 	
+ !1DK Y%Y%Y%Y%	DK $G$++aW\$]^%0%9%9##E+$6<  "##E(OS9&
" &67(6(@(@Ucq(r%"*"4"4We"f 		
-		
 		
 *4)F)Fnn%T$..)9:NN#4'7q8HT^^L\]!2!23!2!23*
&S R0
s   II
I
)N<s></s>rD   rC   z<unk>z<pad>T)__name__
__module____qualname____doc__VOCAB_FILES_NAMESvocab_files_namesmodel_input_namesr	   modelr-   listtuplefloatboolr7   __classcell__)rA   s   @rB   r   r      s    : *$&67E 7;  !%=
T%U
+,,t3=
 =
 	=

 =
 =
 =
 =
 =
 =
    r   N)rH   
tokenizersr   r   r   r   r   r   tokenizers.modelsr	   tokenization_utils_tokenizersr   utilsr   
get_loggerrE   loggerrI   r   __all__r(   rR   rB   <module>rZ      sS    % Z Z % >  
		H	%%'78 _
% _
D 
rR   