
    qin                        U d Z ddlZddlZddlZddlmZ ddlmZ ddlm	Z	 ddl
mZ ddlmZmZ dd	lmZ dd
lmZ ddlmZmZmZmZmZ ddlmZ ddlmZ ddlmZ ddlm Z m!Z!m"Z"m#Z#m$Z$  e       rddl%m&Z& ndZ& e       rddl'm(Z( ndZ( ejR                  e*      Z+i Z,e-e.e/e   f   e0d<   i Z1e-e.e/e   f   e0d<    ee.e.dz  f   g d e       rdndfd e       rdndfd e       rdndfd e       rdndfd e       rdndfd  e       rdndfd! e       rd"ndfd# e       rd$ndfd%d& e       rdndfd' e       rd(ndfd)d*d+ e       rd,ndfd- e       rd.ndfd/d0 e       rd1ndfd2d3 e       rdndfd4 e       rd5ndfd6d7 e       rdndfd8d9 e       rd:ndfd;d< e       rdndfd=d> e       rdndfd? e       rdndfd@dA e       rdBndfdC e       rd5ndfdD e       rdndfdE e       rdndfdF e       rdndfdG e       rdndfdH e       rdIndfdJdKdLdMdN e       rd5ndfdO e       rdPndfdQ e       rdRndfdSdT e       rdndfdU e       rdVndfdW e       rdndfdX e       rd5ndfdY e       rdndfdZd[ e       rd\ndfd] e       rd^ndfd_d` e       rdndfda e       rd5ndfdb e       rdcndfdd e       rdendfdfdg e       rdhndfdi e       rdjndfdk e       rdjndfdl e       rdjndfdm e       rdjndfdn e       rdjndfdo e       rdjndfdp e       rdndfdq e       rdrndfds e       rdrndfdt e       rdrndfdu e       rdrndfdv e       rdrndfdw e       rdrndfdx e       rdrndfdy e       rdrndfdz e       rdrndfd{ e       rd|ndfd} e       rd5ndfd~ e       rd5ndfd e       rd5ndfd e       rd\ndfdd e       rd5ndfddddd e       rdndfd e       rdndfd e       rdndfddd e       rdndfd e       rdndfd e       rd5ndfd e       rd5ndfd e       rdndfd e       rd5ndfd e       rdndfd e       rdndfd e       rdndfd e       rdndfd e       rdndfd e       rdndfd e       rdndfd e       rdndfd e       rdndfd e       rd"ndfd e       rd"ndfdd e       rdndfd e       rdndfd e       rd\ndfd e       rd\ndfd e       rdndfd e       rdndfd e       rdndfd e       rdndfdd e       rdndfd e       rdndfdd e	       rdn
 e       rdrndfd e	       rdn
 e       rdrndfd e	       rdn
 e       rdrndfd e	       rdn
 e       rdrndfd e	       rdn
 e       rdrndfd e       rdndfd e       rdndfd e       rdndfd e       rdndfd e       rd\ndfdǑd e       rdndfd e       rdndfd e       rdndfd e       rdndfdΑd e       rdndfd e       rdndfd e       rdndfd e       rdndfd e       rdndfd e       rd\ndfd e       rd\ndfd e       rdrndfd e       rdrndfd e       rd\ndfd e       rdndfd e       rdndfd e       rdndfd e       rd5ndfd e       rdndfd e       rdndfd e       rdndfd e       rd.ndfd e       rd.ndfdd e       rd5ndfdd e       rdndfd e	       rdn
 e       rdrndfd e       rdndfdd e       rdndfd e       rdndfd e       rdndfd e       rdndfd e       rdndfd e       rdndfd e       rdndfd e       rdndfd e       rdndfd e       rdndfd e       rdndfd e       rdndfd e       rdndfd e       rdndfd e       rdndfdd e       rdndfd e       rdjndfd  e       rdndfd e       rdndfd e       rdndfdddd e       rd	ndfd
 e       rd\ndfd e       rdndfd e       rdndfd e       rdndfd e       rdndfd e       rdjndfd e       rdndfd e       rdndfd e       rdndfd e       rdndfdd e       rdndfd e       rd\ndfd e       rd5ndfd e       rdndfd e       rdndfd e       rdjndfd d! e       rdndfd" e       rdndfd# e       rd$ndfd% e       rdndfd&d'd( e       rdndfd) e       rdndfd*d+ e	       rdn
 e       rdrndfd, e	       rdn
 e       rdrndfd-d.d/d0d1 e       rd2ndfd3 e       rdndfd4 e       rd5ndfd6d7 e       rdndfd8 e       rdndfd9 e       rd:ndfd; e       rd\ndfd< e       rdndfd= e       rdndf      Z2h d>Z3e4e.   e0d?<   e3D ]  Z5e5e2vs e       rdrnde2e5<     ee e2      Z6 e jn                         D  ci c]  \  } }|| 
 c}} Z8d@ Z9dA Z:dBe.dCe/e   dz  fdDZ;	 	 	 	 	 	 	 dQdEe.ejx                  e.   z  dFe.ejx                  e.   z  dz  dGe=dHe-e.e.f   dz  dIe=e.z  dz  dJe.dz  dKe=dLe.dCe-e.ef   fdMZ> G dN dO      Z?dPdOgZ@yc c}} w (R  zAuto Tokenizer class.    N)OrderedDict)Any)is_mistral_common_available   )PreTrainedConfig)get_class_from_dynamic_moduleresolve_trust_remote_code)load_gguf_checkpoint)TOKENIZER_CONFIG_FILE)extract_commit_hashis_g2p_en_availableis_sentencepiece_availableis_tokenizers_availablelogging)cached_file   )EncoderDecoderConfig   )_LazyAutoMapping)CONFIG_MAPPING_NAMES
AutoConfigconfig_class_to_model_typemodel_type_to_module_name!replace_list_option_in_docstrings)TokenizersBackend)SentencePieceBackendREGISTERED_TOKENIZER_CLASSESREGISTERED_FAST_ALIASESaimv2CLIPTokenizeralbertAlbertTokenizeralignBertTokenizeraudioflamingo3Qwen2Tokenizer
aya_visionCohereTokenizerbarkbartRobertaTokenizerbarthezBarthezTokenizer)bartphoBartphoTokenizerbertzbert-generationBertGenerationTokenizer)zbert-japaneseBertJapaneseTokenizer)bertweetBertweetTokenizerbig_birdBigBirdTokenizerbigbird_pegasusPegasusTokenizer)biogptBioGptTokenizer
blenderbotBlenderbotTokenizer)zblenderbot-smallBlenderbotSmallTokenizerblipzblip-2GPT2Tokenizer)bridgetowerr+   bros)byt5ByT5Tokenizer	camembertCamembertTokenizer)canineCanineTokenizerchinese_clip)clapr+   clipclipseg)clvpClvpTokenizer
code_llamaCodeLlamaTokenizercodegencoherecohere2colqwen2convbertcpmCpmTokenizer)cpmantCpmAntTokenizer)ctrlCTRLTokenizer)zdata2vec-audioWav2Vec2CTCTokenizer)zdata2vec-textr+   dbrxdebertaDebertaTokenizerz
deberta-v2DebertaV2Tokenizer)diaDiaTokenizer
distilbertdprDPRQuestionEncoderTokenizerelectraemu3ernie)esmEsmTokenizerfalcon_mambaGPTNeoXTokenizerfastspeech2_conformerFastSpeech2ConformerTokenizer)flaubertFlaubertTokenizerflava	flex_olmo	florence2BartTokenizerfnetFNetTokenizer)fsmtFSMTTokenizerfunnelFunnelTokenizergemmaGemmaTokenizergemma2gemma3gemma3_textgemma3ngemma3n_textgitglmr   glm4glm4_moeglm4_moe_liteglm4v	glm4v_moe	glm_imageglmasrgot_ocr2zgpt-sw3GPTSw3Tokenizergpt2gpt_bigcodegpt_neogpt_neox)gpt_neox_japaneseGPTNeoXJapaneseTokenizergptj)graniter?   )
granitemoer?   )granitemoehybridr?   )granitemoesharedr?   zgrounding-dinogroupvitherbertHerbertTokenizer)hubertr[   )ibertr+   ideficsLlamaTokenizeridefics2instructblipinstructblipvideointernvljais2zkosmos-2XLMRobertaTokenizerlasr_ctcParakeetTokenizerlasr_encoderlayoutlm
layoutlmv2LayoutLMv2Tokenizer
layoutlmv3LayoutLMv3Tokenizer	layoutxlmLayoutXLMTokenizerledLEDTokenizerlighton_ocrQwen2TokenizerFastlilt
longformer)lukeLukeTokenizerlxmertLxmertTokenizerm2m_100M2M100Tokenizermambamamba2marianMarianTokenizermarkuplmMarkupLMTokenizermbartMBartTokenizermbart50MBart50Tokenizer)megar+   zmegatron-bert
metaclip_2)zmgp-strMgpstrTokenizer	ministralMistralCommonBackend
ministral3mistralmistral3mixtralmlukeMLukeTokenizerzmm-grounding-dino
mobilebertMobileBertTokenizermpnetMPNetTokenizermpt)mrar+   mt5T5Tokenizermusicgenmusicgen_melodymvpMvpTokenizer)myt5MyT5TokenizernezhanllbNllbTokenizerznllb-moenougatNougatTokenizernystromformerolmoolmo2olmo3olmo_hybridolmoezomdet-turbo	oneformerz
openai-gptOpenAIGPTTokenizeroptovis2owlv2owlvitpegasus	pegasus_x)	perceiverPerceiverTokenizerphi)phobertPhobertTokenizer
pix2structpixtralplbartPLBartTokenizer)
prophetnetProphetNetTokenizerqdqbertqwen2qwen2_5_omni
qwen2_5_vlqwen2_audio	qwen2_moeqwen2_vlqwen3qwen3_5Qwen3_5Tokenizerqwen3_5_moe	qwen3_moe
qwen3_nextqwen3_omni_moeqwen3_vlqwen3_vl_moe)ragRagTokenizerrealmrecurrent_gemmareformerReformerTokenizerrembertRemBertTokenizer	retribert)robertar+   )zroberta-prelayernormr+   )roc_bertRoCBertTokenizerroformerRoFormerTokenizerrwkvsam3
sam3_videoseamless_m4tSeamlessM4TTokenizerseamless_m4t_v2shieldgemma2siglipSiglipTokenizersiglip2Siglip2Tokenizerspeech_to_textSpeech2TextTokenizerspeecht5SpeechT5Tokenizer)splinterSplinterTokenizersqueezebertstablelm
starcoder2switch_transformerst5t5gemma)tapasTapasTokenizertrocrtvpudopUdopTokenizerumt5)	unispeechr[   )zunispeech-satr[   viltvisual_bert)vitsVitsTokenizervoxtralvoxtral_realtime)wav2vec2r[   )zwav2vec2-bertr[   )zwav2vec2-conformerr[   )wav2vec2_phonemeWav2Vec2PhonemeCTCTokenizerwhisperWhisperTokenizerxclipxglmXGLMTokenizer)xlmXLMTokenizerzxlm-robertazxlm-roberta-xlxlnetXLNetTokenizerxlstmxmodyoso>   fuyuphi3jambajanusllavaarcticopencuastep3p5vipllava	internlm2
llava_nextdeepseek_vldeepseek_vl_v2hyperclovax_vlmdeepseek_vl_hybrid)MODELS_WITH_INCORRECT_HUB_TOKENIZER_CLASSc                 t    t        | dd      5 }t        j                  |      cddd       S # 1 sw Y   yxY w)z*Loads a vocabulary file into a dictionary.rutf-8encodingN)openjsonload)
vocab_filereaders     \/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/auto/tokenization_auto.py
load_vocabrh  q  s1    	j#	0 !Fyy ! ! !s   .7c                     g }t        | dd      5 }|D ]O  }|j                         }|s|j                  d      r(|j                  t	        |j                                      Q 	 ddd       |S # 1 sw Y   |S xY w)z Loads a merges file into a list.r^  r_  r`  #N)rb  strip
startswithappendtuplesplit)merges_filemergesrf  lines       rg  load_mergesrs  w  sr    F	k3	1 3V 	3D::<DDOOC0eDJJL12	33
 M3
 Ms   A1A1*A11A;
class_namereturnc                 @   | dv rt         S | t        v r	t        |    S | t        v r	t        |    S | dk(  rt         S t        j	                         D ]]  \  }}|| k(  st        |      }|dv r| dk(  rt        j                  dd      }nt        j                  d| d      }	 t        ||       c S  t        j                  j                         D ]  }t        |d	d       | k(  s|c S  t        j                  d      }t        ||       rt        ||       S y # t        $ r Y w xY w)
N>   BloomTokenizerBloomTokenizerFastr   )r   r   r   r   r   r   r<  r   z.tokenization_mistral_commontransformers.ztransformers.models__name__)r   r   r   TOKENIZER_MAPPING_NAMESitemsr   	importlibimport_modulegetattrAttributeErrorTOKENIZER_MAPPING_extra_contentvalueshasattr)rt  module_nametokenizer_classmodule	tokenizermain_modules         rg  tokenizer_class_from_namer    sC   ==  ,,&z2211+J77((   )@(E(E(G $_j(3K@Krr"88"001OQ_`"001[M1BDYZvz22 '55<<> 	9j$/:= )).9K{J'{J// " s   D	DDpretrained_model_name_or_path	cache_dirforce_downloadproxiestokenrevisionlocal_files_only	subfolderc                 "   |j                  d      }	t        | t        |||||||ddd|	      }
|
t        j	                  d       i S t        |
|	      }	t        |
d      5 }t        j                  |      }ddd       |	d<   |S # 1 sw Y   xY w)aY  
    Loads the tokenizer configuration from a pretrained model tokenizer configuration.

    Args:
        pretrained_model_name_or_path (`str` or `os.PathLike`):
            This can be either:

            - a string, the *model id* of a pretrained model configuration hosted inside a model repo on
              huggingface.co.
            - a path to a *directory* containing a configuration file saved using the
              [`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`.

        cache_dir (`str` or `os.PathLike`, *optional*):
            Path to a directory in which a downloaded pretrained model configuration should be cached if the standard
            cache should not be used.
        force_download (`bool`, *optional*, defaults to `False`):
            Whether or not to force to (re-)download the configuration files and override the cached versions if they
            exist.
        proxies (`dict[str, str]`, *optional*):
            A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
            'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request.
        token (`str` or *bool*, *optional*):
            The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated
            when running `hf auth login` (stored in `~/.huggingface`).
        revision (`str`, *optional*, defaults to `"main"`):
            The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
            git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
            identifier allowed by git.
        local_files_only (`bool`, *optional*, defaults to `False`):
            If `True`, will only try to load the tokenizer configuration from local files.
        subfolder (`str`, *optional*, defaults to `""`):
            In case the tokenizer config is located inside a subfolder of the model repo on huggingface.co, you can
            specify the folder name here.

    <Tip>

    Passing `token=True` is required when you want to use a private model.

    </Tip>

    Returns:
        `dict`: The configuration of the tokenizer.

    Examples:

    ```python
    # Download configuration from huggingface.co and cache.
    tokenizer_config = get_tokenizer_config("google-bert/bert-base-uncased")
    # This model does not have a tokenizer config so the result will be an empty dict.
    tokenizer_config = get_tokenizer_config("FacebookAI/xlm-roberta-base")

    # Save a pretrained tokenizer locally and you can reload its config
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")
    tokenizer.save_pretrained("tokenizer-test")
    tokenizer_config = get_tokenizer_config("tokenizer-test")
    ```_commit_hashF)r  r  r  r  r  r  r   _raise_exceptions_for_gated_repo%_raise_exceptions_for_missing_entries'_raise_exceptions_for_connection_errorsr  Nz\Could not locate the tokenizer configuration file, will try to use the model config instead.r_  r`  )	getr   r   loggerinfor   rb  rc  rd  )r  r  r  r  r  r  r  r  kwargscommit_hashresolved_config_filerf  results                rg  get_tokenizer_configr    s    J **^,K&%%))..305  #rs	%&:KHK	"W	5 #6"#(F>M# #s    BBc                   \    e Zd ZdZd Ze ee      dee	z  fd              Z
e	 dd       Zy)AutoTokenizera  
    This is a generic tokenizer class that will be instantiated as one of the tokenizer classes of the library when
    created with the [`AutoTokenizer.from_pretrained`] class method.

    This class cannot be instantiated directly using `__init__()` (throws an error).
    c                     t        d      )Nz}AutoTokenizer is designed to be instantiated using the `AutoTokenizer.from_pretrained(pretrained_model_name_or_path)` method.)OSError)selfs    rg  __init__zAutoTokenizer.__init__  s    _
 	
    ru  c           	         |j                  dd      }d|d<   |j                  dd      }|j                  dd      }|j                  dd      }|j                  d      }|vt        j                  |d      }	|	,t        d	| d
dj	                  d t        D               d      t        |	      }
|
t        d|	 d       |
j                  |g|i |S |r3t        ||fi |}t        |d      d   }t        j                  d(i |}n|	 t        j                  |fd|i|}|j                  }t        |fi |}|j                  dd      }d}d|v r4t        |d   t         t"        f      r|d   }n|d   j                  dd      }||||dk7  rt        j                  |      wt        j                  |      j%                  dd      |j%                  dd      k7  r?t&        	 t'        j                  |g|i |S  t        |      j                  |g|i |S d|v r|d   |d<   |r|j%                  dd      }|du}t-        |      t.        v xs% |duxr t        |      duxs t        |dz         du}|r|t0        v rd}d}|r:|d   |d   }n|d   }d|v r|j3                  d      d   }nd}t5        |||||      }|rI|rGt7        |fi |}
|j                  dd      }|
j9                           |
j                  |g|d|i|S |c|}t        |      }
|
|j;                  d      st        |dz         }
|
|
j<                  dk(  rt&        }
|
t&        }
 |
j                  |g|i |S t?        |dd      rC|j@                  }d|vr|j%                  dd      }t        |      }
 |
j                  |g|i |S t        |tB              rzt-        |jD                        t-        |jF                        urDt(        jI                  d|jF                  jJ                   d |jD                  jJ                   d!       |jF                  }tM        t-        |      j<                        xs t?        |d"d      }|;t.        j                  t-        |      t&              }
|
 |
j                  |g|i |S |j                  dd      }|o|d#k7  r	d|v r|dd$ }t        |      }
|
|j;                  d      st        |dz         }
|
|
j<                  dk(  rt&        }
|
t&        }
 |
j                  |g|i |S t        d%|jJ                   d&dj	                  d' t.        D               d      # t        $ r t        j                  |fi |}Y pw xY w# t        $ r#}t(        j+                  d|        Y d}~d}~ww xY w))a  
        Instantiate one of the tokenizer classes of the library from a pretrained model vocabulary.

        The tokenizer class to instantiate is selected based on the `model_type` property of the config object (either
        passed as an argument or loaded from `pretrained_model_name_or_path` if possible), or when it's missing, by
        falling back to using pattern matching on `pretrained_model_name_or_path`:

        List options

        Params:
            pretrained_model_name_or_path (`str` or `os.PathLike`):
                Can be either:

                    - A string, the *model id* of a predefined tokenizer hosted inside a model repo on huggingface.co.
                    - A path to a *directory* containing vocabulary files required by the tokenizer, for instance saved
                      using the [`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`.
                    - A path or url to a single saved vocabulary file if and only if the tokenizer only requires a
                      single vocabulary file (like Bert or XLNet), e.g.: `./my_model_directory/vocab.txt`. (Not
                      applicable to all derived classes)
            inputs (additional positional arguments, *optional*):
                Will be passed along to the Tokenizer `__init__()` method.
            config ([`PreTrainedConfig`], *optional*)
                The configuration object used to determine the tokenizer class to instantiate.
            cache_dir (`str` or `os.PathLike`, *optional*):
                Path to a directory in which a downloaded pretrained model configuration should be cached if the
                standard cache should not be used.
            force_download (`bool`, *optional*, defaults to `False`):
                Whether or not to force the (re-)download the model weights and configuration files and override the
                cached versions if they exist.
            proxies (`dict[str, str]`, *optional*):
                A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
                'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
            revision (`str`, *optional*, defaults to `"main"`):
                The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
                git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
                identifier allowed by git.
            subfolder (`str`, *optional*):
                In case the relevant files are located inside a subfolder of the model repo on huggingface.co (e.g. for
                facebook/rag-token-base), specify it here.
            tokenizer_type (`str`, *optional*):
                Tokenizer type to be loaded.
            backend (`str`, *optional*, defaults to `"tokenizers"`):
                Backend to use for tokenization. Valid options are:
                - `"tokenizers"`: Use the HuggingFace tokenizers library backend (default)
                - `"sentencepiece"`: Use the SentencePiece backend
            trust_remote_code (`bool`, *optional*, defaults to `False`):
                Whether or not to allow for custom models defined on the Hub in their own modeling files. This option
                should only be set to `True` for repositories you trust and in which you have read the code, as it will
                execute code present on the Hub on your local machine.
            kwargs (additional keyword arguments, *optional*):
                Will be passed to the Tokenizer `__init__()` method. Can be used to set special tokens like
                `bos_token`, `eos_token`, `unk_token`, `sep_token`, `pad_token`, `cls_token`, `mask_token`,
                `additional_special_tokens`. See parameters in the `__init__()` for more details.

        Examples:

        ```python
        >>> from transformers import AutoTokenizer

        >>> # Download vocabulary from huggingface.co and cache.
        >>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")

        >>> # Download vocabulary from huggingface.co (user-uploaded) and cache.
        >>> tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-german-cased")

        >>> # If vocabulary files are in a directory (e.g. tokenizer was saved using *save_pretrained('./test/saved_model/')*)
        >>> # tokenizer = AutoTokenizer.from_pretrained("./test/bert_saved_model/")

        >>> # Download vocabulary from huggingface.co and define model-specific arguments
        >>> tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base", add_prefix_space=True)

        >>> # Explicitly use the tokenizers backend
        >>> tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/llama-tokenizer", backend="tokenizers")

        >>> # Explicitly use the sentencepiece backend
        >>> tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/llama-tokenizer", backend="sentencepiece")
        ```configNT
_from_autouse_fasttokenizer_typetrust_remote_code	gguf_filezPassed `tokenizer_type` z3 does not exist. `tokenizer_type` should be one of z, c              3       K   | ]  }|  y wN .0cs     rg  	<genexpr>z0AutoTokenizer.from_pretrained.<locals>.<genexpr>~  s      Dq Ds   rz  zTokenizer class z is not currently imported.F)return_tensorsr  auto_mapr   Fastz!Failed to use TokenizersBackend: r  r   r   z--code_revisionPythonBackendPreTrainedTokenizerFastz The encoder model config class: z3 is different from the decoder model config class: z. It is not recommended to use the `AutoTokenizer.from_pretrained()` method in this case. Please use the encoder and decoder specific tokenizer classes.
model_typer   z!Unrecognized configuration class z8 to build an AutoTokenizer.
Model type should be one of c              3   4   K   | ]  }|j                     y wr  )r{  r  s     rg  r  z0AutoTokenizer.from_pretrained.<locals>.<genexpr>  s     4[AQZZ4[s   r  )'popr  r|  
ValueErrorjoinr  from_pretrainedr   r
   r   	for_model	Exceptionr   r  r  
isinstancern  listreplacer   r  debugtyper  r\  ro  r	   r   register_for_auto_classendswithr{  r  r  r   decoderencoderwarning	__class__r   )clsr  inputsr  r  _r  r  r  tokenizer_class_namer  	gguf_pathconfig_dictconfig_model_typetokenizer_configtokenizer_config_classtokenizer_auto_mapehas_remote_codehas_local_code	class_refupstream_repotokenizer_class_candidate_classr  s                            rg  r  zAutoTokenizer.from_pretrained  s   d Hd+#| JJz4($4d;"JJ':DAJJ{+	 %#:#>#>~t#T #+ .~.>>qyy D,C DDEQH 
 88LMO& #34H3IId!eff2?223PdSYd]cdd#$A9WPVWI.yOPXYK))8K8F^c#331EVZ` #-- 00MXQWX!1!5!56G!N "))*:6F%5j%A"%5j%A%E%EoW[%\"
 &&2!-!R''++,=>J'++,=>FFvrR%--fb9: !,J,<<=Zn]cngmnn U,-CDTT-06:@  --%5n%EF>"!%;%C%CFB%O",D8f):: 
"$. )*@AM Z,-Cf-LMUYY	 	 04]]#O!%!!$0.q1	.q1	y  ) 5a 8 $ 9!#@.Racp! 0;IGdohnoO

?D1A3352?22-06J[_e  $/(>%78QRO&/H/Q/QRX/Y";<UX^<^"_*/G/G?/Z"3&"32?223PdSYd]cddV.5++F(637?O2?223PdSYd]cdd f23FNN#4+??6v~~7O7O6P Q%%+^^%=%=$> ?22 ^^F/V0E0EFm'RXZfhlJm
!/33DLBSTO*6667ThW]haghh "2!5!56G!N!-%)<<KaAa)?)D&78NOO&/E/N/Nv/V";<RU[<["\*/G/G?/Z"3&"32?223PdSYd]cdd/0@0@/A B++/994[IZ4[+[*\\]_
 	
I  c)99:Wb[abcB ! JLL#DQC!HIIJs*   U5 *V 5VV	W$WWNc                     |||}n||}nt        d      |||fD ]  }||t        |j                  <    |||t        |j                  <   t        j                  | ||       y)a  
        Register a new tokenizer in this mapping.

        Args:
            config_class ([`PreTrainedConfig`]):
                The configuration corresponding to the model to register.
            tokenizer_class: The tokenizer class to register (V5 - preferred parameter).
            slow_tokenizer_class: (Deprecated) The slow tokenizer to register.
            fast_tokenizer_class: (Deprecated) The fast tokenizer to register.
        Nz$You need to pass a `tokenizer_class`)exist_ok)r  r   r{  r   r  register)config_classr  slow_tokenizer_classfast_tokenizer_classr  	candidates         rg  r  zAutoTokenizer.register  s     "#/"6%1"6 !GHH.0DoV 	MI$CL,Y-?-?@	M  +0D0PEY#$8$A$AB""<8"Tr  )NNNF)r{  
__module____qualname____doc__r  classmethodr   r|  r   r   r  staticmethodr  r  r  rg  r  r    sZ    
 &'>?z
	1	1z
 @ z
x kpU Ur  r  r  )NFNNNFr  )Ar  r~  rc  oscollectionsr   typingr   transformers.utils.import_utilsr   configuration_utilsr   dynamic_module_utilsr   r	   modeling_gguf_pytorch_utilsr
   tokenization_utils_baser   utilsr   r   r   r   r   	utils.hubr   encoder_decoderr   auto_factoryr   configuration_autor   r   r   r   r   tokenization_utils_tokenizersr    tokenization_utils_sentencepiecer   
get_loggerr{  r  r   dictstrr  __annotations__r   r|  r\  setr  r  r}  CONFIG_TO_TYPErh  rs  r  PathLikeboolr  r  __all__)kvs   00rg  <module>r     s%      	 #  G 3 \ ? <  % 2 *  BH			H	% 68 d3S	>2 702 c49n- 26+c3:o6Q	%<%>/DIQ	(?(A$tLQ 
%<%>/DIQ 
/F/H+dS	Q
 
,C,E(4PQ 
$;$=4HQ 
'>'@#dKQ 
*A*C&NQ 	(Q 
$;$=4HQ 
9S9U5[_`Q 	3Q 	*Q 
+B+D'$OQ 
2I2K.QUVQ  	&!Q" 
0G0I,tT#Q$ 	9%Q& 
$;$=4H'Q( 
&=&??TJ)Q* 	,+Q, 
$;$=4H-Q. 	"/Q0 
.E.G*TR1Q2 	&3Q4 
,C,E4P5Q6 	%7Q8 
$;$=4H9Q: 
'>'@OdK;Q< 	"=Q> 
/F/H+dS?Q@ 
'>'@OdKAQB 
(?(A$tLCQD 
)@)B%MEQF 
)@)B%MGQH 
(?(A_tLIQJ 
"9";FKQL 	&MQN 	"OQP 	3QQR 	.SQT 
$;$=4HUQV 
*A*C&NWQX 
/F/H+dSYQZ 	 [Q\ 
*A*CN]Q^ 
1H1J-PTU_Q` 
'>'@OdKaQb 
$;$=4HcQd 
%<%>/DIeQf 	 gQh 
/F/H+dSiQj 
!EXEZ"A`dekQl 	*mQn 
%<%>/DIoQp 
)@)BoMqQr 
)@)BoMsQt 
$;$=4HuQv 	"wQx 
(?(A$tLyQz 
&=&?"TJ{Q| 
'>'@#dK}Q~ 
'>'@#dKQ@ 
,C,E(4PAQB 
(?(A$tLCQD 
-D-F)DQEQF 
#:#<$GGQH 
'>'@#dKIQJ 
(?(A$tLKQL 
,C,E(4PMQN 
1H1J-PTUOQP 
)@)B%MQQR 
-D-F)DQSQT 
-D-F)DQUQV 
*A*C&NWQX 
,C,E(4PYQZ 
)C)E%4P[Q\ 
$;$=4H]Q^ 
+B+D$O_Q` 
'>'@OdKaQb 
+B+D'$OcQd 	:eQf 
$;$=4HgQh 	%iQj 	(kQl 	.mQn 	.oQp 
.E.G?TRqQr 
(?(A_tLsQt 
*A*C&NuQv 	+wQx 	&yQz 
(?(A$tL{Q| 
)@)B%M}Q~ 
,C,E4PQ@ 
1H1JoPTUAQB 
)@)B%MCQD 
%<%>/DIEQF 
.E.G*TRGQH 
,C,E(4PIQJ 
0G0I,tTKQL 
(?(A_tLMQN 
0G0I,tTOQP 
0G0I,tTQQR 
.E.G*TRSQT 
"9";FUQV 
0G0I,tTWQX 
'>'@#dKYQZ 
-D-F)DQ[Q\ 	"]Q^ 
(?(A$tL_Q` 
)C)E%4PaQb 
(?(A$tLcQd 
)@)B%MeQf 
(B(D$$OgQh 
,C,E(4PiQj 
&=&?"TJkQl 
*A*C&NmQn 	%oQp 
-D-F/DQqQr 
0G0I,tTsQt 	'uQx *, #)@)B%		
wQD *, #)@)B%		
CQP *, #)@)B%		
OQ\ *, #)@)B%		
[Qh *, #)@)B%		
gQr 
&@&B"MsQt 
1H1JoPTUuQv 
0G0I,tTwQx 
&=&?"TJyQz 
&=&?"TJ{Q| 	$}Q~ 
!8!:EQ@ 
&=&?]TJAQB 
-D-FMDQCQD 
"9";FEQF 	"GQH 
%<%>/DIIQJ 
$;$=4HKQL 
(?(A_tLMQN 
(?(A$tLOQP 
/F/H+dSQQR 
'>'@#dKSQT 
(?(A$tLUQV 
)@)B%MWQX 
/F/H+dSYQZ 
(?(A$tL[Q\ 
+B+D$O]Q^ 
)@)BoM_Q` 
/F/H+dSaQb 
#:#<$GcQd 
&=&?"TJeQf 
%<%>/DIgQh 
&=&??TJiQj 
*A*C&NkQl 
,C,E(4PmQn 	,oQp 
#:#<$GqQr 	(sQt 
(?(A}tLuQx *, #)@)B%		
wQB 
(?(A$tLCQD 	.EQF 
'>'@OdKGQH 
&=&?"TJIQJ 
-D-F)DQKQL 
+B+D'$OMQN 
,C,E(4POQP 
*A*C&NQQR 
)@)B%MSQT 
&=&?"TJUQV 
*A*C&NWQX 
.E.G*TRYQZ 
*A*C&N[Q\ 
+B+D'$O]Q^ 
/F/H+dS_Q` 
)@)B%MaQb 
-D-F)DQcQd 	 eQf 
%<%>/DIgQh 
0G0I,tTiQj 
,C,E(4PkQl 
*A*C&NmQn 
)@)BoMoQp 	(qQr 	5sQt 	)uQv 
,C,E(4PwQx 
'>'@#dKyQz 
$;$=4H{Q| 
*A*CN}Q~ 
3J3L/RVWQ@ 
6M6O2UYZAQB 
-D-F)DQCQD 
(B(D$$OEQF 
*A*C&NGQH 
5O5Q1W[\IQJ 
,F,H(dSKQL 	*MQN 
+B+D$OOQP 
+B+D'$OQQR 
*A*CNSQT 
1H1JPTUUQV 
 7 9}tDWQX 
(?(A$tLYQZ 	$[Q\ 
+B+D'$O]Q^ 
#:#<$G_Q` 
$;$=4HaQb 
"9";FcQd 	.eQf 	2gQh 
$;$=4HiQj 
+B+D$OkQl 	"mQp *, #)@)B%		
oQ| *, #)@)B%		
{QF 	-GQH 	2IQJ 	7KQL 	<MQN 
*A*C&NOQP 
%<%>/DIQQR 
$;$=4HSQT 	 UQV 
1H1J-PTUWQX 
4K4M0SWXYQZ 
&=&?"TJ[Q\ 
(?(A$tL]Q^ 
*A*C&N_Q` 
&=&?"TJaQS n7 )3s8 $ < iJ00E\E^.Adh
+i %%9;RS #=#7#=#=#?@41a!Q$@!)# )$s)d2B )\ 04 %)#"]#&S)9#9]R[[%%,] ] #s(^d"	]
 #:] Dj] ] ] 
#s(^]@jU jUZ	 
0[ As   o8