
    qi`9                         d dl mZmZmZmZmZmZ d dlmZ ddl	m
Z
mZ ddlmZ ddlmZ  ej                   e      Zddd	Zg d
Z G d de      ZdgZy)    )Regex	Tokenizerdecodersnormalizerspre_tokenizers
processors)BPE   )
AddedTokenBatchEncoding)TokenizersBackend)loggingzsentencepiece.bpe.modelztokenizer.json)
vocab_filetokenizer_file)ace_Arabace_Latnacm_Arabacq_Arabaeb_Arabafr_Latnajp_Arabaka_Latnamh_Ethiapc_Arabarb_Arabars_Arabary_Arabarz_Arabasm_Bengast_Latnawa_Devaayr_Latnazb_Arabazj_Latnbak_Cyrlbam_Latnban_Latnbel_Cyrlbem_Latnben_Bengbho_Devabjn_Arabbjn_Latnbod_Tibtbos_Latnbug_Latnbul_Cyrlcat_Latnceb_Latnces_Latncjk_Latnckb_Arabcrh_Latncym_Latndan_Latndeu_Latndik_Latndyu_Latndzo_Tibtell_Grekeng_Latnepo_Latnest_Latneus_Latnewe_Latnfao_Latnpes_Arabfij_Latnfin_Latnfon_Latnfra_Latnfur_Latnfuv_Latngla_Latngle_Latnglg_Latngrn_Latnguj_Gujrhat_Latnhau_Latnheb_Hebrhin_Devahne_Devahrv_Latnhun_Latnhye_Armnibo_Latnilo_Latnind_Latnisl_Latnita_Latnjav_Latnjpn_Jpankab_Latnkac_Latnkam_Latnkan_Kndakas_Arabkas_Devakat_Georknc_Arabknc_Latnkaz_Cyrlkbp_Latnkea_Latnkhm_Khmrkik_Latnkin_Latnkir_Cyrlkmb_Latnkon_Latnkor_Hangkmr_Latnlao_Laoolvs_Latnlij_Latnlim_Latnlin_Latnlit_Latnlmo_Latnltg_Latnltz_Latnlua_Latnlug_Latnluo_Latnlus_Latnmag_Devamai_Devamal_Mlymmar_Devamin_Latnmkd_Cyrlplt_Latnmlt_Latnmni_Bengkhk_Cyrlmos_Latnmri_Latnzsm_Latnmya_Mymrnld_Latnnno_Latnnob_Latnnpi_Devanso_Latnnus_Latnnya_Latnoci_Latngaz_Latnory_Oryapag_Latnpan_Gurupap_Latnpol_Latnpor_Latnprs_Arabpbt_Arabquy_Latnron_Latnrun_Latnrus_Cyrlsag_Latnsan_Devasat_Bengscn_Latnshn_Mymrsin_Sinhslk_Latnslv_Latnsmo_Latnsna_Latnsnd_Arabsom_Latnsot_Latnspa_Latnals_Latnsrd_Latnsrp_Cyrlssw_Latnsun_Latnswe_Latnswh_Latnszl_Latntam_Tamltat_Cyrltel_Telutgk_Cyrltgl_Latntha_Thaitir_Ethitaq_Latntaq_Tfngtpi_Latntsn_Latntso_Latntuk_Latntum_Latntur_Latntwi_Latntzm_Tfnguig_Arabukr_Cyrlumb_Latnurd_Arabuzn_Latnvec_Latnvie_Latnwar_Latnwol_Latnxho_Latnydd_Hebryor_Latnyue_Hantzho_Hanszho_Hantzul_Latnc                       e Zd ZU dZeZddgZeZg Z	e
e   ed<   g Ze
e   ed<   	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddeeeef   z  dz  dee
e   z  dz  d	edz  f fd
Zedefd       Zej&                  deddfd       Zdededz  dedz  fdZ	 	 	 	 	 	 	 	 d de
e   dede
e   dz  dededz  dedz  dededz  dedefdZd Zd Zd!dZdeddfdZ xZS )"NllbTokenizera	  
    Construct an NLLB tokenizer (backed by HuggingFace's *tokenizers* library). Based on
    [Unigram](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=unigram#models).

    This tokenizer inherits from [`TokenizersBackend`] which contains most of the main methods. Users should
    refer to this superclass for more information regarding those methods.

    The tokenization method is `<tokens> <eos> <language code>` for source language documents, and `<language code>
    <tokens> <eos>` for target language documents.

    Examples:

    ```python
    >>> from transformers import NllbTokenizer

    >>> tokenizer = NllbTokenizer.from_pretrained(
    ...     "facebook/nllb-200-distilled-600M", src_lang="eng_Latn", tgt_lang="fra_Latn"
    ... )
    >>> example_english_phrase = " UN Chief Says There Is No Military Solution in Syria"
    >>> expected_translation_french = "Le chef de l'ONU affirme qu'il n'y a pas de solution militaire en Syrie."
    >>> inputs = tokenizer(example_english_phrase, text_target=expected_translation_french, return_tensors="pt")
    ```

    Args:
        vocab_file (`str`, *optional*):
            Path to the vocabulary file.
        bos_token (`str`, *optional*, defaults to `"<s>"`):
            The beginning of sequence token that was used during pretraining.
        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end of sequence token.
        sep_token (`str`, *optional*, defaults to `"</s>"`):
            The separator token.
        cls_token (`str`, *optional*, defaults to `"<s>"`):
            The classifier token.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding.
        mask_token (`str`, *optional*, defaults to `"<mask>"`):
            The token used for masking values.
        src_lang (`str`, *optional*):
            The language to use as source language for translation.
        tgt_lang (`str`, *optional*):
            The language to use as target language for translation.
        legacy_behaviour (`bool`, *optional*, defaults to `False`):
            Whether to use legacy behaviour (suffix pattern) or new behaviour (prefix pattern).
    	input_idsattention_maskprefix_tokenssuffix_tokensNvocabmerges_spm_precompiled_charsmapc                     ||}n|t         }t        |	t              rt        |	ddd      n|	}	|| _        |.t        |      dt        |      dt        |      dt        |      di}|| _        |xs g | _        t        t        | j
                  | j                  d t        |      dd            | _	        |Vt        j                  t        j                  |      t        j                  t        d	      d
      g      | j                  _        t!        j"                  ddd      | j                  _        t'        j"                  ddd      | j                  _        t+        | X  d|||||||
||	||d| d| _        ddddd| _        | j0                  j3                         D ci c]  \  }}||
 c}}| _        |
|
nd| _        | j9                  | j6                        | _        || _        | j?                  | j6                         y c c}}w )NT)
normalizedlstripspecialr         r
   F)r   r   dropout	unk_tokenfuse_unkbyte_fallbackz {2,} u   ▁always)replacementprepend_schemesplit)	bos_token	eos_token	sep_token	cls_tokenr   	pad_tokensrc_langtgt_lang
mask_tokenextra_special_tokenslegacy_behaviour)<s><pad></s><unk>r?    ) FAIRSEQ_LANGUAGE_CODES
isinstancestrr   r   _vocab_mergesr   r	   
_tokenizerr   SequencePrecompiledReplacer   
normalizerr   	Metaspacepre_tokenizerr   decodersuper__init__fairseq_offsetfairseq_tokens_to_idsitemsfairseq_ids_to_tokens	_src_langconvert_tokens_to_idscur_lang_coder   set_src_lang_special_tokens)selfr   r   r   r   r   r   r   r   r   r   r   r   additional_special_tokensr   r   kwargskv	__class__s                      \/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/nllb/tokenization_nllb.pyr  zNllbTokenizer.__init__Y   s   *  +(<%&.(>% *c* zd4N 	
 !1=IIII	E |#kk||i.#	
 %0)4)=)=++,EF''h=*DOO& )7(@(@Ucksw(x%"*"4"4W_gk"l 	
!!:-	
 	
   	&
" 8<7Q7Q7W7W7Y%Ztq!ad%Z"%-%9z!77G ((8 &[s   &H
returnc                     | j                   S N)r  r  s    r  r   zNllbTokenizer.src_lang   s    ~~    new_src_langc                 H    || _         | j                  | j                          y r"  )r  r  )r  r%  s     r  r   zNllbTokenizer.src_lang   s    %((8r$  return_tensorsr   r   c                 v    ||t        d      || _         | |fd|d|}| j                  |      }||d<   |S )zIUsed by translation pipeline, to prepare inputs for the generate functionzATranslation requires a `src_lang` and a `tgt_lang` for this modelT)add_special_tokensr'  forced_bos_token_id)
ValueErrorr   r  )r  
raw_inputsr'  r   r   extra_kwargsinputstgt_lang_ids           r  _build_translation_inputsz'NllbTokenizer._build_translation_inputs   sY     x/`aa jiT.i\hi00:(3$%r$  	src_texts	tgt_texts
max_lengthmax_target_lengthpadding
truncationc
           	          || _         || _        || j                  } | |fd||||	d|
}||S ||}| j                           | |fd||||	d|
}|d   |d<   | j	                          |S )NT)r)  r'  r3  r5  r6  )r)  r'  r5  r3  r6  r   labels)r   r   model_max_length_switch_to_target_mode_switch_to_input_mode)r  r1  r   r2  r   r3  r4  r5  r'  r6  r  model_inputsr8  s                r  prepare_seq2seq_batchz#NllbTokenizer.prepare_seq2seq_batch   s     ! ..J
#)!!
 
  $ * 	##%
#)(!
 
 "(!4X 	""$r$  c                 8    | j                  | j                        S r"  )r  r   r#  s    r  r;  z#NllbTokenizer._switch_to_input_mode  s    //>>r$  c                 r    | j                   | j                  | _         | j                  | j                         S r"  )r   r  set_tgt_lang_special_tokensr#  s    r  r:  z$NllbTokenizer._switch_to_target_mode  s,    ==  NNDM//>>r$  c                 v   | j                  |      | _        |}| j                  rg | _        | j                  | j                  g| _        t        j                  d| j                  |gdd| j                  |g| j                  | j                  f|| j                  fg      | j                  _
        y| j                  g| _        | j                  g| _        t        j                  |d| j                  g|dd| j                  g| j                  | j                  f|| j                  fg      | j                  _
        y)zReset the special tokens to the source lang setting.
        - In legacy mode: No prefix and suffix=[eos, src_lang_code].
        - In default mode: Prefix=[src_lang_code], suffix = [eos]
        $A$Bsinglepairspecial_tokensNr  r  r   r   eos_token_idr   r   TemplateProcessingr   r  post_processor)r  r   lang_code_tokens      r  r  z)NllbTokenizer.set_src_lang_special_tokens  s   
 "77A"  !#D"&"3"3T5G5G!HD-7-J-Jdnno>D$../B!%1B1B CoW[WiWiEjk.DOO* #'"4"4!5D"&"3"3!4D-7-J-J't~~>%tT4>>B!%1B1B CoW[WiWiEjk.DOO*r$  langc                 v   | j                  |      | _        |}| j                  rg | _        | j                  | j                  g| _        t        j                  d| j                  |gdd| j                  |g| j                  | j                  f|| j                  fg      | j                  _
        y| j                  g| _        | j                  g| _        t        j                  |d| j                  g|dd| j                  g| j                  | j                  f|| j                  fg      | j                  _
        y)zReset the special tokens to the target lang setting.
        - In legacy mode: No prefix and suffix=[eos, tgt_lang_code].
        - In default mode: Prefix=[tgt_lang_code], suffix = [eos]
        rB  rC  rD  NrH  )r  rM  rL  s      r  r@  z)NllbTokenizer.set_tgt_lang_special_tokens$  s   
 "77=  !#D"&"3"3T5G5G!HD-7-J-Jdnno>D$../B!%1B1B CoW[WiWiEjk.DOO* #'"4"4!5D"&"3"3!4D-7-J-J't~~>%tT4>>B!%1B1B CoW[WiWiEjk.DOO*r$  )NNr   r   r   r   r   r   z<mask>NNNNNF)r?   NrI   NNlongestNT)r   N)__name__
__module____qualname____doc__VOCAB_FILES_NAMESvocab_files_namesmodel_input_namesr	   modelr   listint__annotations__r   r  dictr  propertyr   setterr0  boolr   r=  r;  r:  r  r@  __classcell__)r  s   @r  r   r   !   s   .` *$&67E!M49!!M49! .2)-04"&!!]9T#s(^#d*]9 d3i$&]9 $':]9~ #   __9S 9T 9 9
*-
9<t
ORUYz
 #&*"!%(, %)494 4 9t#	4
 4 $J4 :4 4 d
4 4 
4l??
2  r$  r   N)
tokenizersr   r   r   r   r   r   tokenizers.modelsr	   tokenization_pythonr   r   tokenization_utils_tokenizersr   utilsr   
get_loggerrP  loggerrT  r  r   __all__r  r$  r  <module>rh     s_     [ Z ! < >  
		H	% $=P`a  R& Z% Zz 
r$  