
    qiR=                         d dl mZ ddlmZ ddlmZ ddlmZ ddlm	Z	 ddl
mZ  e	j                  e      ZdZd	d
dZg dg ddZddddddddZ ed       G d de             ZdgZy)    )Any   )BatchEncoding)
AddedToken)SentencePieceBackend)logging)requiresu   ▁zsentencepiece.bpe.modelztokenizer.json)
vocab_filetokenizer_file)__java__
__python__	__en_XX__)r   r   r   __javascript____php____ruby____go__)basemultir   r   r   r   r   r   r   )javapythonen_XX
javascriptphprubygo)sentencepiece)backendsc                       e Zd ZU dZeZddgZg Zee	   e
d<   g Zee	   e
d<   	 	 	 	 	 	 	 	 	 	 	 	 	 ddeeef   dz  f fdZed	        Zd
 Zedefd       Zej(                  deddfd       Zdededz  dedz  fdZd Zd Z	 	 	 d dee   dedee   dz  dedef
 fdZd Zd Zd!dZdeddfdZdedefdZd" fd	Z xZ S )#PLBartTokenizera  
    Construct an PLBART tokenizer.

    Adapted from [`RobertaTokenizer`] and [`XLNetTokenizer`]. Based on
    [SentencePiece](https://github.com/google/sentencepiece).

    The tokenization method is `<tokens> <eos> <language code>` for source language documents, and `<language code>
    <tokens> <eos>` for target language documents.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        src_lang (`str`, *optional*):
            A string representing the source language.
        tgt_lang (`str`, *optional*):
            A string representing the target language.
        bos_token (`str`, *optional*, defaults to `"<s>"`):
            The start of sequence token.
        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end of sequence token.
        sep_token (`str`, *optional*, defaults to `"</s>"`):
            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
            sequence classification or for a text and a question for question answering. It is also used as the last
            token of a sequence built with special tokens.
        cls_token (`str`, *optional*, defaults to `"<s>"`):
            The cls token, which is a special token used as the first token for all tasks.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
        mask_token(`str`, *optional*, defaults to `"<mask>"`):
            The token used for masking values. This is the token used when training this model with masking tasks. This
            is only used in the `"base"` tokenizer type. For `"multi"` tokenizer, masking is never done for the
            downstream tasks.
        language_codes (`str`, *optional*, defaults to `"base"`):
            What language codes to use. Should be one of `"base"` or `"multi"`.
        sp_model_kwargs (`dict`, *optional*):
            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
            SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
            to set:
            - `enable_sampling`: Enable subword regularization.
            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
              - `nbest_size = {0,1}`: No sampling is performed.
              - `nbest_size > 1`: samples from the nbest_size results.
              - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
                using forward-filtering-and-backward-sampling algorithm.
            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
              BPE-dropout.

    Examples:

    ```python
    >>> from transformers import PLBartTokenizer

    >>> tokenizer = PLBartTokenizer.from_pretrained("uclanlp/plbart-python-en_XX", src_lang="python", tgt_lang="en_XX")
    >>> example_python_phrase = "def maximum(a,b,c):NEW_LINE_INDENTreturn max([a,b,c])"
    >>> expected_translation_english = "Returns the maximum value of a b c."
    >>> inputs = tokenizer(example_python_phrase, text_target=expected_translation_english, return_tensors="pt")
    ```	input_idsattention_maskprefix_tokenssuffix_tokensNsp_model_kwargsc                 r   t        |t              rt        |dd      n|}|i n|| _        | j	                  |
      }
| j	                  |      }|	| _        t        | j
                     }|| _        i | _        i | _	        ddddd| _
        | j                  j                         D ci c]  \  }}||
 c}}| _        d| _        t        |      }|$|j                  |D cg c]	  }||vs| c}       t!        | D  d i d	|d
|d|d|d|d|d|d|d|
d|d|d| j                  d|d|	dddd| t%        | j&                        | _        t+        |      D ci c]"  \  }}|| j(                  |z   | j                  z   $ c}}| _        | j                  j                         D ci c]  \  }}||
 c}}| _	        ddddd| _
        | j
                  dk(  rEt%        | j&                        t%        | j                        z   | j                  z   | j                  d<   | j                  j-                  | j                         | j                  j                         D ci c]  \  }}||
 c}}| _        h d}|j-                  t        | j
                            d}|D ]?  }| j.                  j1                  |d       }|"| j2                  j1                  |d        d}A |r | j5                          | j7                          d}| j.                  j                         D ]3  \  }}|| j2                  v rt        |dddd      | j2                  |<   d}5 |r | j5                          | j7                          | j
                  dk(  r>|
| _        | j8                  | j                  | j8                     n| j8                  | _        n)|
|
nd| _        | j                  | j8                     | _        || _        | j?                  | j8                         y c c}}w c c}w c c}}w c c}}w c c}}w )!NTF)lstriprstripr         r   )<s><pad></s><unk>r
   	bos_token	eos_token	unk_token	sep_token	cls_token	pad_token
mask_tokensrc_langtgt_langadditional_special_tokensr$   clean_up_tokenization_spaceslanguage_codesspecial_tokens_patternprefix_suffixtoken_type_ids_pattern	all_zerosr   <mask>>   r*   r,   r+   r-   r>   )special
normalizedr&   r'   r    ) 
isinstancestrr   r$   !_convert_lang_code_special_formatr9   FAIRSEQ_LANGUAGE_CODESr
   lang_code_to_idid_to_lang_codefairseq_tokens_to_idsitemsfairseq_ids_to_tokensfairseq_offsetlistextendsuper__init__lensp_modelsp_model_size	enumerateupdate_added_tokens_encoderpop_added_tokens_decoder_update_trie_update_total_vocab_size	_src_langcur_lang_code_idr6   set_src_lang_special_tokens)selfr
   r.   r/   r1   r2   r0   r3   r4   r9   r5   r6   r$   r7   r8   kwargsfairseq_language_codeskv_additional_special_tokensticodereserved_tokensremovedtokenidxsynced	__class__s                               `/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/plbart/tokenization_plbart.pyrO   zPLBartTokenizer.__init__t   sl   & KUU_adJeZ
4Fku
%4%<r/99(C99(C,!78K8K!L %!!-.APQ%R"7;7Q7Q7W7W7Y%Ztq!ad%Z"%)*@%A"$0&--5]qB\9\] 	 	
!	
	
  	
  		

  	
  	
  	
 "	
 	
 	
 'A	
 !00	
 *F	
 *	
 $3	
  $/#	
* !/NWXnNo 
CJ1dD$$$q(4+>+>>> 
 261E1E1K1K1MNA1N-.APQ%R"&(36t}}3EDL`L`Ha3adhdwdw3wD&&x0""))$*>*>?7;7Q7Q7W7W7Y%Ztq!ad%Z"E5d6I6IJK$ 	E,,00=C**..sD9		
 ))+44::< 	JE3d000.8teE/D&&s+ F	 ))+&(%DN8<8R$$T^^4X\XfXf ! *2)=X;DN$($8$8$HD! ((8[ &[ ^2 
  O &[s$   'P 	P"*P"&'P'3P-+P3c                     t        t        | di             }t        | dd      }t        | d      rt        | j                        nd}t        | dd      dk(  r||z   |z   dz   S ||z   |z   S )NrF   rK   r(   rQ   r   r9   r   )rP   getattrhasattrrQ   )r]   lang_code_countrK   
base_vocabs       rl   
vocab_sizezPLBartTokenizer.vocab_size   su    gd,=rBC '7;+24+DS'!
4)62f</.@1DDO+n<<    c                    | j                   j                         }t        | j                  j	                               D ]G  }| j                  j                  |      }|dk(  r| j                  n|| j                  z   }||vsC|||<   I |j                  | j                  j                         D ci c]  \  }}||vs|| c}}       |S c c}}w )z,Override to use fairseq vocabulary structurer   )rH   copyrangerQ   get_piece_size	IdToPieceunk_token_idrK   rT   rU   rI   )r]   vocabrd   sp_tokenvocab_idrh   ri   s          rl   	get_vocabzPLBartTokenizer.get_vocab   s    **//1t}}3356 	+A}}..q1H,-Ft((T=P=P9PHu$"*h	+ 	43M3M3S3S3UlZUCY^fkYkeSjlm ms   0C
=C
returnc                     | j                   S N)rZ   r]   s    rl   r5   zPLBartTokenizer.src_lang   s    ~~rs   new_src_langc                 j    | j                  |      }|| _        | j                  | j                         y r   )rD   rZ   r\   )r]   r   s     rl   r5   zPLBartTokenizer.src_lang  s+    ==lK%((8rs   return_tensorsr5   r6   c                     ||t        d      | j                  |      | _        | j                  |      | _         | |fd|d|}| j	                  | j                        }||d<   |S )zIUsed by translation pipeline, to prepare inputs for the generate functionzATranslation requires a `src_lang` and a `tgt_lang` for this modelT)add_special_tokensr   forced_bos_token_id)
ValueErrorrD   r5   r6   convert_tokens_to_ids)r]   
raw_inputsr   r5   r6   extra_kwargsinputstgt_lang_ids           rl   _build_translation_inputsz)PLBartTokenizer._build_translation_inputs	  sz     x/`aa>>xH>>xHjiT.i\hi00?(3$%rs   c                     || j                   v r| j                   |   S | j                  j                  |      }|r|| j                  z   S | j                  S )z0Converts a token (str) in an id using the vocab.)rH   rQ   	PieceToIdrK   ry   )r]   rh   spm_ids      rl   _convert_token_to_idz$PLBartTokenizer._convert_token_to_id  sU    D...--e44((/ 06v+++L4;L;LLrs   c                     || j                   v r| j                   |   S | j                  j                  || j                  z
        S )z=Converts an index (integer) in a token (str) using the vocab.)rJ   rQ   rx   rK   )r]   indexs     rl   _convert_id_to_tokenz$PLBartTokenizer._convert_id_to_token  sA    D...--e44}}&&ut/B/B'BCCrs   	src_texts	tgt_textsc                 ~    | j                  |      | _        | j                  |      | _        t        |   ||fi |S r   )rD   r5   r6   rN   prepare_seq2seq_batch)r]   r   r5   r   r6   r^   rk   s         rl   r   z%PLBartTokenizer.prepare_seq2seq_batch%  s@     >>xH>>xHw,Y	LVLLrs   c                 8    | j                  | j                        S r   )r\   r5   r   s    rl   _switch_to_input_modez%PLBartTokenizer._switch_to_input_mode1      //>>rs   c                 8    | j                  | j                        S r   )set_tgt_lang_special_tokensr6   r   s    rl   _switch_to_target_modez&PLBartTokenizer._switch_to_target_mode4  r   rs   c                     | j                  |      }|| j                  |   nd| _        g | _        | j                  | j                  | j                  g| _        y| j                  g| _        y)z_Reset the special tokens to the source lang setting. No prefix and suffix=[eos, src_lang_code].NrD   rF   cur_lang_coder"   eos_token_idr#   )r]   r5   s     rl   r\   z+PLBartTokenizer.set_src_lang_special_tokens7  sk    99(C?G?ST11(;Y])"&"3"3T5G5G!HD"&"3"3!4Drs   langc                     | j                  |      }|| j                  |   nd| _        g | _        | j                  | j                  | j                  g| _        y| j                  g| _        y)zcReset the special tokens to the target language setting. No prefix and suffix=[eos, tgt_lang_code].Nr   r]   r   s     rl   r   z+PLBartTokenizer.set_tgt_lang_special_tokensA  sk    55d;;?;KT11$7QU)"&"3"3T5G5G!HD"&"3"3!4Drs   c                 2    t         j                  ||      }|S )z;Convert Language Codes to format tokenizer uses if required)FAIRSEQ_LANGUAGE_CODES_MAPgetr   s     rl   rD   z1PLBartTokenizer._convert_lang_code_special_formatL  s    )--dD9rs   c                 >    t        |   d||| j                  d|S )zOOverride to use self.clean_up_tokenization_spaces as default for batched input.)	token_idsskip_special_tokensr8   rA   )rN   decoder8   )r]   r   r   r8   r^   rk   s        rl   r   zPLBartTokenizer.decodeQ  s2    w~ 
 3)-)J)J
 	
 	
rs   )r*   r,   r,   r*   r-   r+   r>   r   NNNNT)r   Nr   )r~   N)FN)!__name__
__module____qualname____doc__VOCAB_FILES_NAMESvocab_files_namesmodel_input_namesr"   rL   int__annotations__r#   dictrC   r   rO   propertyrr   r}   r5   setterr   r   r   r   r   r   r   r\   r   rD   r   __classcell__)rk   s   @rl   r   r   /   s   ;z *$&67!M49!!M49!
 15"&%)s9 c3h$.s9j = = #   __9S 9T 9 9
*-9<tORUYzMD  &* 
M9
M 
M 9t#	
M
 
M 

M??5	5 	5 	5c c 

 
rs   r   N)typingr   tokenization_pythonr   tokenization_utils_baser    tokenization_utils_sentencepiecer   utilsr   utils.import_utilsr	   
get_loggerr   loggerSPIECE_UNDERLINEr   rE   r   r   __all__rA   rs   rl   <module>r      s     0 1 D  * 
		H	% #<P`a  4g  "
  
%&h
* h
 'h
V	 
rs   