
    qi!                         d dl mZmZmZmZ d dlmZ ddlmZ ddl	m
Z
 ddlmZ  ej                  e      Zddd	Zg d
Z G d de
      ZdgZy)    )	Tokenizerdecoderspre_tokenizers
processors)Unigram   )
AddedToken)TokenizersBackend)loggingzsentencepiece.bpe.modelztokenizer.json)
vocab_filetokenizer_file)ar_ARcs_CZde_DEen_XXes_XXet_EEfi_FIfr_XXgu_INhi_INit_ITja_XXkk_KZko_KRlt_LTlv_LVmy_MMne_NPnl_XXro_ROru_RUsi_LKtr_TRvi_VNzh_CNc                       e Zd ZU dZeZddgZeZg Z	e
e   ed<   g Ze
e   ed<   	 	 	 	 	 	 	 	 	 	 	 ddeez  e
z  dz  f fdZed	efd
       Zej&                  ded	dfd       Zdededz  dedz  fdZd Zd ZddZded	dfdZ xZS )MBartTokenizeruC  
    Construct an MBART tokenizer (backed by HuggingFace's *tokenizers* library). Based on
    [Unigram](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=unigram#models).

    This tokenizer inherits from [`TokenizersBackend`] which contains most of the main methods. Users should
    refer to this superclass for more information regarding those methods.

    The tokenization method is `<tokens> <eos> <language code>` for source language documents, and `<language code>
    <tokens> <eos>` for target language documents.

    Examples:

    ```python
    >>> from transformers import MBartTokenizer

    >>> tokenizer = MBartTokenizer.from_pretrained(
    ...     "facebook/mbart-large-en-ro", src_lang="en_XX", tgt_lang="ro_RO"
    ... )
    >>> example_english_phrase = " UN Chief Says There Is No Military Solution in Syria"
    >>> expected_translation_romanian = "Şeful ONU declară că nu există o soluţie militară în Siria"
    >>> inputs = tokenizer(example_english_phrase, text_target=expected_translation_romanian, return_tensors="pt")
    ```	input_idsattention_maskprefix_tokenssuffix_tokensNvocabc                 l   t        |t              rt        |dd      n|}t        j	                         }|$|j                  |D cg c]	  }||vs| c}       |rt        |      dft        |      dft        |      dft        |      dfg}|dgz  }t        D ]  }|j                  |df        |j                  t        |      df       || _        t        t        | j                  dd            | _
        d | j                  _        t        j                  t        j                         t        j                  dd	d
      g      | j                  _        t#        j                  dd	d
      | j                  _        t'        | P  d||||||||	|
|d
| t        D ci c]  }|| j+                  |       c}| _        d| _        ddddd| _        | j0                  j3                  | j,                         | j+                  t        |            | j0                  d<   | j0                  j5                         D ci c]  \  }}||
 c}}| _        |	|	nd| _        | j+                  | j8                        | _        |
| _        | j?                  | j8                         y c c}w c c}w c c}}w )NTF)lstriprstripg        )   ▁g       r   )unk_idbyte_fallbackr1   always)replacementprepend_schemesplit)
	bos_token	eos_token	sep_token	cls_token	unk_token	pad_token
mask_tokensrc_langtgt_langadditional_special_tokens   r      )<s><pad></s><unk><mask>r    ) 
isinstancestrr	   FAIRSEQ_LANGUAGE_CODEScopyextendappend_vocabr   r   
_tokenizer
normalizerr   SequenceWhitespaceSplit	Metaspacepre_tokenizerr   decodersuper__init__convert_tokens_to_idslang_code_to_idfairseq_offsetfairseq_tokens_to_idsupdateitemsfairseq_ids_to_tokens	_src_langcur_lang_coder@   set_src_lang_special_tokens)selfr-   r8   r9   r:   r;   r<   r=   r>   r?   r@   rA   kwargs_additional_special_tokenst	lang_codekv	__class__s                     ^/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/mbart/tokenization_mbart.pyrY   zMBartTokenizer.__init__@   s    KUU_adJeZ
4Fku
%;%@%@%B"$0&--5]qB\9\] =Y%Y%Y%Y%	E m_$E3 /	i-./LL#j/3/0#GDKKQV$WX%)"(6(?(?..0((U8[_`)
% #+"4"4W_gk"l 	
!&@	
 	
 Oe 
AJIt11)<< 
   	&
" 	""))$*>*>?/3/I/I#j//Z""8,7;7Q7Q7W7W7Y%Ztq!ad%Z"%-%9w!77G ((8} ^V 
 &[s   	J&J&%J+J0returnc                     | j                   S N)ra   rd   s    rl   r?   zMBartTokenizer.src_lang   s    ~~    new_src_langc                 H    || _         | j                  | j                          y ro   )ra   rc   )rd   rr   s     rl   r?   zMBartTokenizer.src_lang   s    %((8rq   return_tensorsr?   r@   c                 v    ||t        d      || _         | |fd|d|}| j                  |      }||d<   |S )zIUsed by translation pipeline, to prepare inputs for the generate functionzATranslation requires a `src_lang` and a `tgt_lang` for this modelT)add_special_tokensrt   forced_bos_token_id)
ValueErrorr?   rZ   )rd   
raw_inputsrt   r?   r@   extra_kwargsinputstgt_lang_ids           rl   _build_translation_inputsz(MBartTokenizer._build_translation_inputs   sY     x/`aa jiT.i\hi00:(3$%rq   c                 8    | j                  | j                        S ro   )rc   r?   rp   s    rl   _switch_to_input_modez$MBartTokenizer._switch_to_input_mode   s    //>>rq   c                 r    | j                   | j                  | _         | j                  | j                         S ro   )r@   ra   set_tgt_lang_special_tokensrp   s    rl   _switch_to_target_modez%MBartTokenizer._switch_to_target_mode   s,    ==  NNDM//>>rq   c                    | j                  |      | _        g | _        | j                  | j                  g| _        | j                  | j                        }| j                  | j                        }t        j                  |dgz   |z   |ddgz   |z   t        t        ||z   | j                  | j                  z                     | j                  _        y)z_Reset the special tokens to the source lang setting. No prefix and suffix=[eos, src_lang_code].$A$Bsinglepairspecial_tokensNrZ   rb   r+   eos_token_idr,   convert_ids_to_tokensr   TemplateProcessinglistziprQ   post_processor)rd   r?   prefix_tokens_strsuffix_tokens_strs       rl   rc   z*MBartTokenizer.set_src_lang_special_tokens   s    !77A"//1C1CD 66t7I7IJ 66t7I7IJ)3)F)F$v-0AA"dD\14EE$58I$I4K]K]`d`r`rKr st*
&rq   langc                    | j                  |      | _        g | _        | j                  | j                  g| _        | j                  | j                        }| j                  | j                        }t        j                  |dgz   |z   |ddgz   |z   t        t        ||z   | j                  | j                  z                     | j                  _        y)zcReset the special tokens to the target language setting. No prefix and suffix=[eos, tgt_lang_code].r   r   r   Nr   )rd   r   r   r   s       rl   r   z*MBartTokenizer.set_tgt_lang_special_tokens   s    !77="//1C1CD 66t7I7IJ 66t7I7IJ)3)F)F$v-0AA"dD\14EE$58I$I4K]K]`d`r`rKr st*
&rq   )NrD   rF   rF   rD   rG   rE   rH   NNN)rm   N)__name__
__module____qualname____doc__VOCAB_FILES_NAMESvocab_files_namesmodel_input_namesr   modelr+   r   int__annotations__r,   rK   dictrY   propertyr?   setterr}   r   r   rc   r   __classcell__)rk   s   @rl   r(   r(   !   s   . *$&67E!M49!!M49! +/"&R9TzD 4'R9h #   __9S 9T 9 9
*-
9<t
ORUYz
??


 
 
rq   r(   N)
tokenizersr   r   r   r   tokenizers.modelsr   tokenization_pythonr	   tokenization_utils_tokenizersr
   utilsr   
get_loggerr   loggerr   rL   r(   __all__rI   rq   rl   <module>r      s_     G F % - >  
		H	% $=P`a  { l
& l
^ 
rq   