
    qiZ<                         d Z ddlZddlZddlZddlmZ ddlmZmZ ddl	m
Z
  e
j                  e      ZddiZ G d	 d
      Z G d de      ZdgZy)z"Tokenization class for model MyT5.    N)defaultdict   )
AddedTokenPreTrainedTokenizer)logging
vocab_filezbyte_maps.jsonc                       e Zd ZdZdZdeeeef   z  fdZdeeeee   z  f   dedefdZ	deeef   d	eeeee   z  f   fd
Z
dee   d	dee   z  fdZddee   d	ee   fdZy)ByteRewriteraZ  
    Byte rewriter class for MyT5 tokenizer.
    This class is used to rewrite bytes using a hash tree. The hash tree is constructed from a set of rewriting rules.

    Args:
        rewriting_rules (`str` or `dict[str, str]`):
            A path to a json file containing the rewriting rules or a dictionary containing the rewriting rules.

    z[LEAF]rewriting_rulesc                    t        |t              r+t        |d      5 }t        j                  |      }d d d        n't        |t
              st        dt        |             | j                  |      | _	        |j                         D ci c]  \  }}||
 }}}| j                  |      | _        y # 1 sw Y   YxY wc c}}w )NrzDrewriting_rules should be either a path to json file or a dict, got )
isinstancestropenjsonloaddict	TypeErrortypeconstruct_hash_tree	hash_treeitemsreverse_hash_tree)selfr   fkvreverse_rewriting_ruless         \/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/myt5/tokenization_myt5.py__init__zByteRewriter.__init__,   s    os+os+ /q"&))A,/ /OT2VW[\kWlVmn  11/B4C4I4I4K"LDAq1a4"L"L!%!9!9:Q!R/ / #Ms   B3B?3B<r   byte_in_sequencebyte_out_sequencec                     |j                  d      }|j                  d      }|}|D ]  }||vri ||<   ||   } ||| j                  <   y)zL
        Add a leaf with the output byte sequence to the hash tree.
         N)splitLEAF)r   r   r!   r"   byte_in_listbyte_out_listtree_pointerbs           r   add_leafzByteRewriter.add_leaf9   sb     (--c2)//4  	+A$"$Q'?L	+
 #0TYY    returnc                     t        t              }d t        d      D        D ]  }|g||   | j                  <    |j	                         D ]  \  }}| j                  |||        |S )zE
        Construct a hash tree for rewritten byte sequences.
        c              3   $   K   | ]  }|d  
 yw)02xN ).0xs     r   	<genexpr>z3ByteRewriter.construct_hash_tree.<locals>.<genexpr>M   s     1QsG*1s      )r   r   ranger&   r   r+   )r   r   r   r*   in_sequenceout_sequences         r   r   z ByteRewriter.construct_hash_treeH   ss      %	1eCj1 	*A'(cIaL#	* *9)>)>)@ 	@%KMM)[,?	@ r,   byte_sequenceNc                 \    | j                   }|D ]  }||v r||   } y || j                     S )zW
        Search the hash tree and return the rewritten byte sequence if found.
        N)r   r&   )r   r9   r)   r*   s       r   search_hash_treezByteRewriter.search_hash_treeU   sA     ~~ 	AL +A		 DII&&r,   in_bytesc                 Z   g }d}d}|t        |      k  r|s| j                  n| j                  }t        |t        |            D ]?  }||   }||v r||   }n||k(  r|g}	|} n$ n"| j                  |v s/|| j                     }	|}A |j                  	       |dz   }|t        |      k  r|S )a6  
        Rewrite a sequence of bytes using the hash tree.

        Args:
            in_bytes (`list[str]`): A list of bytes to be rewritten.
            reverse (`bool`): If True, decoding is performed with the reverse hash tree.
        Returns:
            `list[str]`: The rewritten byte sequence.
        r      )lenr   r   r6   r&   extend)
r   r<   reverse	out_bytesb_startb_endr)   jr*   cur_leafs
             r   rewrite_byteszByteRewriter.rewrite_bytesb   s     	H%184>>d>T>TL7CM2 QK$#/?L'\ !sHE99,+DII6HE X&aiG! H%$ r,   )F)__name__
__module____qualname____doc__r&   r   r   r    listr+   r   r;   rG   r1   r,   r   r
   r
      s     DSd38n(< S0$sD49,<'<"= 0QT 0il 04S> d3PTW[\_W`P`K`Fa 'd3i 'D49<L ' d3i  49  r,   r
   c            
           e Zd ZdZddgZeZ	 	 	 	 	 d	 d fdZed        Z	d Z
	 dd	ee   d
ee   dz  dedee   f fdZdee   dee   fdZ	 dd	ee   d
ee   dz  dee   fdZ	 dd	ee   d
ee   dz  dee   fdZdedee   fdZd Zd Zdee   dee   fdZdee   dee   fdZd Zddededz  dee   fdZ xZS ) MyT5Tokenizera  
    Construct a MyT5 tokenizer.

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`): The file containing the byte rewriting rules.
        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end of sequence token.

        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
        extra_ids (`int`, *optional*, defaults to 125):
            Add a number of extra ids added to the end of the vocabulary for use as sentinels. These tokens are
            accessible as "<extra_id_{%d}>" where "{%d}" is a number between 0 and extra_ids-1. Extra tokens are
            indexed from the end of the vocabulary up to beginning ("<extra_id_0>" is the last token in the vocabulary
            like in ByT5 preprocessing see
            [here](https://github.com/google-research/text-to-text-transfer-transformer/blob/9fd7b14a769417be33bc6c850f9598764913c833/t5/data/preprocessors.py#L2117)).
        additional_special_tokens (`list[str]`, *optional*):
            Additional special tokens used by the tokenizer.
    	input_idsattention_maskNr-   c           	         |dkD  r|t        |      D cg c]  }d| d
 }}nK|dkD  rF|Dt        |      dkD  r6t        t        t        d |                  }	|	|k7  rt	        d| d| d      t        |t              rt        |dd	      n|}t        |t              rt        |dd	      n|}t        |t              rt        |dd	      n|}|||d
| _        t        | j                        | _	        d| _
        t        j                  t        |d            | _        t        | j                  d         | _        t        | j                  d         | _        t%        
| L  d|||d|d| y c c}w )Nr   z
<extra_id_>c                 .    t        dt        |       v       S )Nextra_id)boolr   )r3   s    r   <lambda>z(MyT5Tokenizer.__init__.<locals>.<lambda>   s    Ds1v9M4N r,   zBoth extra_ids (z!) and additional_special_tokens (zm) are provided to MyT5Tokenizer. In this case the additional_special_tokens must include the extra_ids tokensT)lstriprstrip)r   r>      r5   r   decompose_map	merge_map)	eos_token	unk_token	pad_token	extra_idsadditional_special_tokensr1   )r6   r?   setfilter
ValueErrorr   r   r   _added_tokens_decoderoffset_utf_vocab_sizer   r   r   	byte_mapsr
   decompose_rewritermerge_rewritersuperr    )r   r   r\   r]   r^   r_   r`   kwargsiextra_tokens	__class__s             r   r    zMyT5Tokenizer.__init__   s    q=6>DI)DT(Uq:aS):(U%(U]8DMfIgjkIks6*NPi#jklLy( &yk1RSlRm n( (  HRR[]`GaJydCgp	GQR[]`GaJydCgp	GQR[]`GaJydCgp	)2yY%O"$445# 4
C#89".t~~o/N"O*4>>++FG 	
&?	
 	
3 )Vs   E1c                     | j                   S N)rf   )r   s    r   
vocab_sizezMyT5Tokenizer.vocab_size   s    ###r,   c                     t        | j                  | j                  z         D ci c]  }| j                  |      | }}|j	                  | j
                         |S c c}w rp   )r6   rq   re   convert_ids_to_tokensupdateadded_tokens_encoder)r   rl   vocabs      r   	get_vocabzMyT5Tokenizer.get_vocab   sW    ;@SWS^S^A^;_`a++A.1``T../ as   Atoken_ids_0token_ids_1already_has_special_tokensc                     |rt         |   ||d      S |dgt        |      z  dgz   S dgt        |      z  dgz   dgt        |      z  z   dgz   S )a  
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`list[int]`):
                List of IDs.
            token_ids_1 (`list[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `list[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        T)rx   ry   rz   r   r>   )rj   get_special_tokens_maskr?   )r   rx   ry   rz   rn   s       r   r|   z%MyT5Tokenizer.get_special_tokens_mask   sy    $ &72'[]a 3  
 C#k**qc11c+&&1#-!s;7G1GHA3NNr,   	token_idsc                     t        |      dkD  r7|d   | j                  k(  r%t        j                  d| j                   d       |S || j                  gz   S )z.Do not add eos again if user already added it.r   zThis sequence already has zQ. In future versions this behavior may lead to duplicated eos tokens being added.)r?   eos_token_idwarningswarnr\   )r   r}   s     r   _add_eos_if_not_presentz%MyT5Tokenizer._add_eos_if_not_present   s]    y>A)B-43D3D"DMM,T^^,< =+ +  1 1222r,   c                 t    | j                   g}|t        ||z         dgz  S t        ||z   |z   |z         dgz  S )a  
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. MyT5 does not
        make use of token type ids, therefore a list of zeros is returned.

        Args:
            token_ids_0 (`list[int]`):
                List of IDs.
            token_ids_1 (`list[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `list[int]`: List of zeros.
        r   )r   r?   )r   rx   ry   eoss       r   $create_token_type_ids_from_sequencesz2MyT5Tokenizer.create_token_type_ids_from_sequences  sP        !{S()QC//;${2S89QC??r,   c                 X    | j                  |      }||S | j                  |      }||z   S )a  
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
        adding special tokens. A sequence has the following format:

        - single sequence: `X </s>`
        - pair of sequences: `A </s> B </s>`

        Args:
            token_ids_0 (`list[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`list[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `list[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        )r   )r   rx   ry   s      r    build_inputs_with_special_tokensz.MyT5Tokenizer.build_inputs_with_special_tokens  s;    & 22;?66{CK,,r,   textc                 r    |j                  d      D cg c]  }|d }}| j                  |      }|S c c}w )zTake as input a string and return a list of strings (tokens) for words/sub-words.
        Represents tokens in two character hex formatutf-8r0   )encodemorphological_encode)r   r   rk   rl   tokenss        r   	_tokenizezMyT5Tokenizer._tokenize4  s@     '+kk'&:;QsG*;;**62 <s   4c                 \    t        |      dk7  rd}|S t        |d      | j                  z   }|S )z0Converts a token (str) in an id using the vocab.rY   N   )r?   intre   )r   tokentoken_ids      r   _convert_token_to_idz"MyT5Tokenizer._convert_token_to_id<  s6     u:?H  5"~3Hr,   c                 (    || j                   z
  d}|S )z=Converts an index (integer) in a token (str) using the vocab.r0   )re   )r   indexr   s      r   _convert_id_to_tokenz"MyT5Tokenizer._convert_id_to_tokenF  s    4;;&s+r,   indicesc                 z    | j                   j                  |d      }| j                  j                  |d      }|S )NFrA   )rh   rG   ri   r   r   s     r   r   z"MyT5Tokenizer.morphological_encodeK  s=    ))777O%%33GU3Kr,   c                 z    | j                   j                  |d      }| j                  j                  |d      }|S )NTr   )ri   rG   rh   r   s     r   morphological_decodez"MyT5Tokenizer.morphological_decodeQ  s=    %%33GT3J))777Nr,   c                    d}g }|D ]`  }|| j                   v r|j                  | j                   |          0|| j                  v r|j                  |       P|j                  |       b | j                  |      }t	        | j                   j                               t	        | j                        z  }|D ].  }||v r|t        |d      z  }|t        j                  |      z  }0 |j                  dd      }|S )z:Converts a sequence of tokens (string) in a single string.r,   r   ignore)errors)	added_tokens_decoderappendru   r   ra   valuesbytesfromhexdecode)r   r   bstring
out_tokensr   _added_tokensstrings          r   convert_tokens_to_stringz&MyT5Tokenizer.convert_tokens_to_stringW  s    
 	)E111!!$";";E"BC$333!!%(!!%(	) ..z:
D55<<>?#dF_F_B`` 	0E%5005==//		0
 9r,   save_directoryfilename_prefixc                 n   t         j                  j                  |      r2t         j                  j                  ||r|dz   ndt        d   z         }n|r|dz   nd|z   }t        |dd      5 }|j                  t        j                  | j                  dd	             d d d        |fS # 1 sw Y   |fS xY w)
N- r   wr   )encodingrY   F)indentensure_ascii)
ospathisdirjoinVOCAB_FILES_NAMESr   writer   dumpsrg   )r   r   r   r   writers        r   save_vocabularyzMyT5Tokenizer.save_vocabularyn  s    77==(/3!6rUfgsUt tJ 4C/C/n\J*cG4 	SLLDNN15QR	S}	S}s   ,2B))B4)z</s>z<unk>z<pad>}   N)r-   N)NFrp   )rH   rI   rJ   rK   model_input_namesr   vocab_files_namesr    propertyrq   rw   rL   r   rU   r|   r   r   r   r   r   r   r   r   r   r   tupler   __classcell__)rn   s   @r   rN   rN      s   4 %&67)
 "&,
 
,
\ $ $ puO9O379t3COhlO	cO8	3c 	3tCy 	3 GK@9@379t3C@	c@0 GK-9-379t3C-	c-4c S	 
DI $s) DI $s) .	c 	C$J 	Z_`cZd 	r,   rN   )rK   r   r   r   collectionsr   tokenization_pythonr   r   utilsr   
get_loggerrH   loggerr   r
   rN   __all__r1   r,   r   <module>r      se    )  	  # B  
		H	% "#34 c cLr' rj 
r,   