
    qi                     d    d Z ddlmZmZ ddlmZ  ej                  e      Z G d de      Z	dgZ
y)zTokenization class for Dia.   )
AddedTokenPreTrainedTokenizer)loggingc            	            e Zd ZdZddgZ	 	 	 	 ddedz  dedz  dedz  def fd	Zed
        Z	d Z
dedee   fdZd Zd Zdee   defdZ xZS )DiaTokenizera  
    Construct a Dia tokenizer. Dia simply uses raw bytes utf-8 encoding except for special tokens `[S1]` and `[S2]`.

    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
    refer to this superclass for more information regarding those methods.

    Args:
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
        unk_token (`str`, *optional*, defaults to `"<pad>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        max_length (`int`, *optional*, defaults to 1024):
            The maximum length of the sequences when encoding. Sequences longer than this will be truncated.
        offset (`int`, *optional*, defaults to 0):
            The offset of the tokenizer.
    	input_idsattention_mask	pad_tokenN	unk_token
max_lengthoffsetc                     t        |t              rt        |      n|}t        |t              rt        |      n|}d| _        |t        d      t        d      d| _        || _        t        |   d	||||dddd| y )
N   z[S1]z[S2])          	all_zerosTnone)r   r
   r   r   token_type_ids_pattern%token_type_ids_include_special_tokensspecial_tokens_pattern )
isinstancestrr   _utf_vocab_size_added_tokens_decoderr   super__init__)selfr
   r   r   r   kwargs	__class__s         Z/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/dia/tokenization_dia.pyr   zDiaTokenizer.__init__,   s     .8	3-GJy)Y	-7	3-GJy)Y	#)2z&7IjY_N`%a" 		
!#.26#)		
 		
    c                     | j                   S N)r   )r   s    r"   
vocab_sizezDiaTokenizer.vocab_sizeF   s    ###r#   c                     t        | j                  | j                  z         D ci c]  }| j                  |      | }}|j	                  | j
                         |S c c}w r%   )ranger&   r   convert_ids_to_tokensupdateadded_tokens_encoder)r   ivocabs      r"   	get_vocabzDiaTokenizer.get_vocabJ   sW    ;@SWS^S^A^;_`a++A.1``T../ as   Atextreturnc                 ^    |j                  d      D cg c]  }t        |       }}|S c c}w )zPTake as input a string and return a list of strings (tokens) for words/sub-wordsutf-8)encodechr)r   r/   r,   tokenss       r"   	_tokenizezDiaTokenizer._tokenizeO   s,    "&++g"67Q#a&77 8s   *c                 Z    t        |      dk7  rd}|S t        |      | j                  z   }|S )z0Converts a token (str) in an id using the vocab.r   N)lenordr   )r   tokentoken_ids      r"   _convert_token_to_idz!DiaTokenizer._convert_token_to_idT   s4     u:?H  5zDKK/Hr#   c                 6    t        || j                  z
        }|S )z=Converts an index (integer) in a token (str) using the vocab.)r4   r   )r   indexr:   s      r"   _convert_id_to_tokenz!DiaTokenizer._convert_id_to_token^   s    EDKK'(r#   r5   c                    d}|D ]p  }|| j                   v r*| j                   |   }t        |      j                  d      }n1|| j                  v r|j                  d      }n|j                  d      }||z  }r |j	                  dd      }|S )z:Converts a sequence of tokens (string) in a single string.r#   r2   ignore)errors)added_tokens_decoderr   r3   r+   decode)r   r5   bstringr:   added_token_obj
tok_stringstrings          r"   convert_tokens_to_stringz%DiaTokenizer.convert_tokens_to_stringc   s     	"E111"&";";E"B 188A
$333"\\'2
"\\'2
z!G	" 9r#   )<pad>rJ   i   r   )__name__
__module____qualname____doc__model_input_namesr   intr   propertyr&   r.   listr6   r<   r?   rI   __classcell__)r!   s   @r"   r   r      s    $ %&67 !( '!%
:
 :
 $J	

 
4 $ $
c d3i 

tCy S r#   r   N)rN   tokenization_pythonr   r   utilsr   
get_loggerrK   loggerr   __all__r   r#   r"   <module>rY      s>    " B  
		H	%Y& Yx 
r#   