
    qiF                     0   d dl Z d dlZd dlZd dlmZ d dlmZ d dlmZ d dl	Z	ddl
mZ ddlmZ ddlmZ  ej                   e      Zd	d
ddddZdZ ed       G d de             Zdedeeef   de	j0                  fdZdeddfdZdedeez  fdZdgZy)    N)Path)copyfile)Any   )PreTrainedTokenizer)logging)requiresz
source.spmz
target.spmz
vocab.jsonztarget_vocab.jsonztokenizer_config.json)
source_spm
target_spmvocabtarget_vocab_filetokenizer_config_fileu   ▁)sentencepiece)backendsc            
           e Zd ZdZeZddgZ	 	 	 	 	 	 	 	 	 d+deee	f   dz  ddf fdZ
d Zd	edefd
Zd ZdefdZdedee   fdZdedefdZ fdZ fdZ	 	 d,dededz  def fdZdee   defdZd-dee   fdZd Zd Zedefd       Zd-dededz  dee   fdZdefdZd  Z d! Z!defd"Z"d#eddfd$Z#d% Z$d& Z%	 d.d'ed(edz  d)edee   fd*Z& xZ'S )/MarianTokenizeraB  
    Construct a Marian tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        source_spm (`str`):
            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a .spm extension) that
            contains the vocabulary for the source language.
        target_spm (`str`):
            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a .spm extension) that
            contains the vocabulary for the target language.
        source_lang (`str`, *optional*):
            A string representing the source language.
        target_lang (`str`, *optional*):
            A string representing the target language.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end of sequence token.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
        model_max_length (`int`, *optional*, defaults to 512):
            The maximum sentence length the model accepts.
        additional_special_tokens (`list[str]`, *optional*, defaults to `["<eop>", "<eod>"]`):
            Additional special tokens used by the tokenizer.
        sp_model_kwargs (`dict`, *optional*):
            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
            SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
            to set:

            - `enable_sampling`: Enable subword regularization.
            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.

              - `nbest_size = {0,1}`: No sampling is performed.
              - `nbest_size > 1`: samples from the nbest_size results.
              - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
                using forward-filtering-and-backward-sampling algorithm.

            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
              BPE-dropout.

    Examples:

    ```python
    >>> from transformers import MarianForCausalLM, MarianTokenizer

    >>> model = MarianForCausalLM.from_pretrained("Helsinki-NLP/opus-mt-en-de")
    >>> tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-de")
    >>> src_texts = ["I am a small frog.", "Tom asked his teacher for advice."]
    >>> tgt_texts = ["Ich bin ein kleiner Frosch.", "Tom bat seinen Lehrer um Rat."]  # optional
    >>> inputs = tokenizer(src_texts, text_target=tgt_texts, return_tensors="pt", padding=True)

    >>> outputs = model(**inputs)  # should work
    ```	input_idsattention_maskNsp_model_kwargsreturnc                    |i n|| _         t        |      j                         s
J d|        || _        t	        |      | _        t        |      | j
                  vrt        d      |rKt	        |      | _        | j                  j                         D ci c]  \  }}||
 c}}| _
        g | _        nv| j
                  j                         D ci c]  \  }}||
 c}}| _
        | j
                  D cg c](  }|j                  d      s|j                  d      s'|* c}| _        || _        || _        ||g| _        t#        || j                         | _        t#        || j                         | _        | j$                  | _        | j
                  | _        | j-                          d| _        t1        | d  d|||||	|
| j                   ||d	| y c c}}w c c}}w c c}w )Nzcannot find spm source z <unk> token must be in the vocab>><<F)	source_langtarget_lang	unk_token	eos_token	pad_tokenmodel_max_lengthr   r   separate_vocabs )r   r   existsr    	load_jsonencoderstrKeyErrortarget_encoderitemsdecodersupported_language_codes
startswithendswithr   r   	spm_filesload_spm
spm_source
spm_targetcurrent_spmcurrent_encoder_setup_normalizer_decode_use_source_tokenizersuper__init__)selfr
   r   r   r   r   r   r   r   r   r   r   r    kwargskv	__class__s                   `/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/marian/tokenization_marian.pyr6   zMarianTokenizer.__init__k   s     &5%<r/J&&(P,CJ<*PP(. 'y>-=>>"+,=">D-1-@-@-F-F-HITQAqDIDL,.D)-1\\-?-?-ABTQAqDBDL>Bll2vall[_N`efeoeopteu12vD)&&$j1 #:t/C/CD":t/C/CD??#|| 	 ,1) 	
##- 00/+	
 	
- J C2vs   GG7G"G" G"c                     	 ddl m}  || j                        j                  | _        y # t
        t        f$ r  t        j                  d       d | _        Y y w xY w)Nr   )MosesPunctNormalizerz$Recommended: pip install sacremoses.c                     | S Nr!   )xs    r<   <lambda>z3MarianTokenizer._setup_normalizer.<locals>.<lambda>   s    Q     )	
sacremosesr>   r   	normalizepunc_normalizerImportErrorFileNotFoundErrorwarningswarn)r7   r>   s     r<   r3   z!MarianTokenizer._setup_normalizer   sM    	/7#78H8H#I#S#SD ./ 	/MM@A#.D 	/s   '* ,AArA   c                 ,    |r| j                  |      S dS )zHCover moses empty string edge case. They return empty list for '' input! )rF   )r7   rA   s     r<   rE   zMarianTokenizer.normalize   s    *+t##A&33rC   c                 n    || j                   v r| j                   |   S | j                   | j                     S r@   )r2   r   )r7   tokens     r<   _convert_token_to_idz$MarianTokenizer._convert_token_to_id   s8    D(((''.. ##DNN33rC   textc                     g }|j                  d      r5|j                  d      x}dk7  r|j                  |d|dz           ||dz   d }||fS )z6Remove language codes like >>fr<< before sentencepiecer   r   N   )r+   findappend)r7   rP   codeend_locs       r<   remove_language_codez$MarianTokenizer.remove_language_code   sX    ??4 4&@gR%GKK]w{+,!&DTzrC   c                 v    | j                  |      \  }}| j                  j                  |t              }||z   S )N)out_type)rX   r1   encoder%   )r7   rP   rV   piecess       r<   	_tokenizezMarianTokenizer._tokenize   s;    ..t4
d!!(((<f}rC   indexc                     || j                   v r| j                   |   S | j                  r| j                  n| j                  }|j	                  |      }|r|S | j
                  S )z?Converts an index (integer) in a token (str) using the decoder.)r)   r4   r/   r0   	IdToPiecer   )r7   r^   	spm_modelpieces       r<   _convert_id_to_tokenz$MarianTokenizer._convert_id_to_token   sU    DLL <<&&'+'H'HDOOdoo	##E*u14>>1rC   c                 $    t        |   |fi |S )ad  
        Convert a list of lists of token ids into a list of strings by calling decode.

        Args:
            sequences (`Union[list[int], list[list[int]], np.ndarray, torch.Tensor]`):
                List of tokenized input ids. Can be obtained using the `__call__` method.
            skip_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not to remove special tokens in the decoding.
            clean_up_tokenization_spaces (`bool`, *optional*):
                Whether or not to clean up the tokenization spaces. If `None`, will default to
                `self.clean_up_tokenization_spaces` (available in the `tokenizer_config`).
            use_source_tokenizer (`bool`, *optional*, defaults to `False`):
                Whether or not to use the source tokenizer to decode sequences (only applicable in sequence-to-sequence
                problems).
            kwargs (additional keyword arguments, *optional*):
                Will be passed to the underlying model specific decode method.

        Returns:
            `list[str]`: The list of decoded sentences.
        )r5   batch_decode)r7   	sequencesr8   r;   s      r<   re   zMarianTokenizer.batch_decode   s    * w#I888rC   c                 $    t        |   |fi |S )a  
        Converts a sequence of ids in a string, using the tokenizer and vocabulary with options to remove special
        tokens and clean up tokenization spaces.

        Similar to doing `self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))`.

        Args:
            token_ids (`Union[int, list[int], np.ndarray, torch.Tensor]`):
                List of tokenized input ids. Can be obtained using the `__call__` method.
            skip_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not to remove special tokens in the decoding.
            clean_up_tokenization_spaces (`bool`, *optional*):
                Whether or not to clean up the tokenization spaces. If `None`, will default to
                `self.clean_up_tokenization_spaces` (available in the `tokenizer_config`).
            use_source_tokenizer (`bool`, *optional*, defaults to `False`):
                Whether or not to use the source tokenizer to decode sequences (only applicable in sequence-to-sequence
                problems).
            kwargs (additional keyword arguments, *optional*):
                Will be passed to the underlying model specific decode method.

        Returns:
            `str`: The decoded sentence.
        )r5   decode)r7   	token_idsr8   r;   s      r<   rh   zMarianTokenizer.decode   s    0 w~i2622rC   skip_special_tokensclean_up_tokenization_spacesc                 r    | j                    }|j                  d|      | _        t        |   d|||d|S )zCInternal decode method that handles use_source_tokenizer parameter.use_source_tokenizer)ri   rj   rk   r!   )r    popr4   r5   _decode)r7   ri   rj   rk   r8   default_use_sourcer;   s         r<   ro   zMarianTokenizer._decode  sR     "&!5!55,2JJ7MOa,b)w 
 3)E
 	
 	
rC   tokensc                 L   | j                   r| j                  n| j                  }g }d}|D ]>  }|| j                  v r||j	                  |      |z   dz   z  }g }.|j                  |       @ ||j	                  |      z  }|j                  t        d      }|j                         S )zQUses source spm if _decode_use_source_tokenizer is True, and target spm otherwiserL    )	r4   r/   r0   all_special_tokensdecode_piecesrU   replaceSPIECE_UNDERLINEstrip)r7   rq   sp_modelcurrent_sub_tokens
out_stringrN   s         r<   convert_tokens_to_stringz(MarianTokenizer.convert_tokens_to_string  s    &*&G&G4??T__
 	1E///h445GH5PSVVV
%'""))%0	1 	h,,-?@@
''(8#>
!!rC   c                 L    ||| j                   gz   S ||z   | j                   gz   S )z=Build model inputs from a sequence by appending eos_token_id.)eos_token_id)r7   token_ids_0token_ids_1s      r<    build_inputs_with_special_tokensz0MarianTokenizer.build_inputs_with_special_tokens&  s5    $"3"3!444[(D,=,=+>>>rC   c                 H    | j                   | _        | j                  | _        y r@   )r/   r1   r$   r2   r7   s    r<   _switch_to_input_modez%MarianTokenizer._switch_to_input_mode-  s    ??#||rC   c                 b    | j                   | _        | j                  r| j                  | _        y y r@   )r0   r1   r    r'   r2   r   s    r<   _switch_to_target_modez&MarianTokenizer._switch_to_target_mode1  s*    ??#'#6#6D   rC   c                 ,    t        | j                        S r@   )lenr$   r   s    r<   
vocab_sizezMarianTokenizer.vocab_size6  s    4<<  rC   save_directoryfilename_prefixc                 z   t         j                  j                  |      st        j	                  d| d       y g }| j
                  rt         j                  j                  ||r|dz   ndt        d   z         }t         j                  j                  ||r|dz   ndt        d   z         }t        | j                  |       t        | j                  |       |j                  |       |j                  |       nXt         j                  j                  ||r|dz   ndt        d   z         }t        | j                  |       |j                  |       t        t        d   t        d   g| j                  | j                  | j                  g      D ]  \  }}}	t         j                  j                  ||r|dz   nd|z         }
t         j                  j!                  |      t         j                  j!                  |
      k7  r=t         j                  j#                  |      rt%        ||
       |j                  |
       t         j                  j#                  |      rt'        |
d	      5 }|	j)                         }|j+                  |       d d d        |j                  |
        t-        |      S # 1 sw Y   (xY w)
NzVocabulary path (z) should be a directory-rL   r   r   r
   r   wb)ospathisdirloggererrorr    joinVOCAB_FILES_NAMES	save_jsonr$   r'   rU   zipr-   r/   r0   abspathisfiler   openserialized_model_protowritetuple)r7   r   r   saved_filesout_src_vocab_fileout_tgt_vocab_fileout_vocab_filespm_save_filenamespm_orig_pathra   spm_save_pathficontent_spiece_models                r<   save_vocabularyzMarianTokenizer.save_vocabulary:  sL   ww}}^,LL,^,<<STU!#*93&rEVW^E__" "$*93&rEVWjEkk" dll$67d))+=>1212WW\\/3!6rUfgnUo oN dllN3~.;>|,.?.MNNN__doo.<
 	27}i
 GGLL/3!6rUf fM ww}-1OOTVT[T[TbTbcpTq6""=1WW^^M2-. 3"+4+K+K+M(HH123 ""=1	2" [!!3 3s   ("J11J:	c                 "    | j                         S r@   )get_src_vocabr   s    r<   	get_vocabzMarianTokenizer.get_vocabg  s    !!##rC   c                 B    t        | j                  fi | j                  S r@   )dictr$   added_tokens_encoderr   s    r<   r   zMarianTokenizer.get_src_vocabj  s    DLL>D$=$=>>rC   c                 B    t        | j                  fi | j                  S r@   )r   r'   added_tokens_decoderr   s    r<   get_tgt_vocabzMarianTokenizer.get_tgt_vocabm  s    D''E4+D+DEErC   c                     | j                   j                         }|j                  t        j	                  g d             |S )N)r/   r0   r1   rF   r   )__dict__copyupdater   fromkeys)r7   states     r<   __getstate__zMarianTokenizer.__getstate__p  s4    ""$MMmn	
 rC   dc                      | _         t         d      si  _        t         d      sd _         fd j                  D        \   _         _         j
                   _         j                          y )Nr   r4   Fc              3   J   K   | ]  }t        |j                          y wr@   )r.   r   ).0fr7   s     r<   	<genexpr>z/MarianTokenizer.__setstate__.<locals>.<genexpr>  s     +fRSHQ8L8L,M+fs    #)	r   hasattrr   r4   r-   r/   r0   r1   r3   )r7   r   s   ` r<   __setstate__zMarianTokenizer.__setstate__w  sd     t./#%D t;<05D-+fW[WeWe+f(?? rC   c                      y)zJust EOS   r!   )r7   argsr8   s      r<   num_special_tokens_to_addz)MarianTokenizer.num_special_tokens_to_add  s    rC   c                     t        | j                        }|j                  | j                         |D cg c]
  }||v rdnd c}S c c}w )Nr   r   )setall_special_idsremoveunk_token_id)r7   seqr   rA   s       r<   _special_token_maskz#MarianTokenizer._special_token_mask  sF    d223t001:=>QQ/)q0>>>s   Ar   r   already_has_special_tokensc                     |r| j                  |      S || j                  |      dgz   S | j                  ||z         dgz   S )zCGet list where entries are [1] if a token is [eos] or [pad] else 0.r   )r   )r7   r   r   r   s       r<   get_special_tokens_maskz'MarianTokenizer.get_special_tokens_mask  sS     &++K88 ++K8A3>>++K+,EF!LLrC   )	NNNz<unk>z</s>z<pad>i   NF)FNr@   )NF)(__name__
__module____qualname____doc__r   vocab_files_namesmodel_input_namesr   r%   r   r6   r3   rE   rO   rX   listr]   intrc   re   rh   boolro   r|   r   r   r   propertyr   r   r   r   r   r   r   r   r   r   r   __classcell__)r;   s   @r<   r   r   ,   s   8t *$&67 15=
 c3h$.=
 
=
~/43 43 44 c d3i 
2# 2# 29.3: %*48	
 "
 '+Tk	
 

""tCy "S " ?QUVYQZ ?,7
 !C ! !+"c +"C$J +"Z_`cZd +"Z$4 $?Fd !d !t !? fk	M	M.2Tk	M^b	M	c	MrC   r   r   r   r   c                 R    t        j                  di |}|j                  |        |S )Nr!   )r   SentencePieceProcessorLoad)r   r   spms      r<   r.   r.     s%    

.
.
A
ACHHTNJrC   c                 v    t        |d      5 }t        j                  | |d       d d d        y # 1 sw Y   y xY w)NwrS   )indent)r   jsondump)datar   r   s      r<   r   r     s2    	dC %A		$!$% % %s   /8c                 p    t        | d      5 }t        j                  |      cd d d        S # 1 sw Y   y xY w)Nr)r   r   load)r   r   s     r<   r#   r#     s-    	dC Ayy|  s   ,5)r   r   rI   pathlibr   shutilr   typingr   r   tokenization_pythonr   utilsr   utils.import_utilsr	   
get_loggerr   r   r   rw   r   r%   r   r   r.   r   r   r#   __all__r!   rC   r<   <module>r      s     	      6  * 
		H	% ,4   
 
%&iM) iM 'iMX3 c3h M<`<` %# %$ %
C D4K 
 
rC   