
    qi                     v    d Z ddlZddlmZ ddlmZ  ej                  e      ZddiZ	d Z
 G d	 d
e      Zd
gZy)zTokenization classes for ESM.    N   )PreTrainedTokenizer)logging
vocab_file	vocab.txtc                     t        | d      5 }|j                         j                         }|D cg c]  }|j                          c}cd d d        S c c}w # 1 sw Y   y xY w)Nr)openread
splitlinesstrip)r   flinesls       Z/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/esm/tokenization_esm.pyload_vocab_filer      sS    	j#	 *!##%#()a	)* *)* *s   #AAAAA!c            
            e Zd ZdZeZddgZ	 	 	 	 	 d fd	Zdede	fdZ
de	defd	Zd
 Zd Zde	defdZdede	fdZ	 ddee   dee   dz  dee   fdZ	 ddededz  dedee   fdZd Zedefd       Z xZS )EsmTokenizerz&
    Constructs an ESM tokenizer.
    	input_idsattention_maskc           	      V   t        |      | _        t        t        | j                              | _        t        | j                        D 	ci c]  \  }}	|	|
 c}	}| _        t        
|   d|||||d| | j                  | _        | j                  | j                         y c c}	}w )N)	unk_token	cls_token	pad_token
mask_token	eos_token )
r   
all_tokensdict	enumerate_id_to_token_token_to_idsuper__init__unique_no_split_tokens_update_trie)selfr   r   r   r   r   r   kwargsindtok	__class__s             r   r$   zEsmTokenizer.__init__)   s     **5 4??!;<6?6PQ(#sS#XQ 	
!	
 	
 '+oo#$556 Rs   B%indexreturnc                 N    | j                   j                  || j                        S Nr!   getr   r'   r,   s     r   _convert_id_to_tokenz!EsmTokenizer._convert_id_to_tokenE         $$UDNN;;    tokenc                     | j                   j                  || j                   j                  | j                              S r/   r"   r1   r   r'   r6   s     r   _convert_token_to_idz!EsmTokenizer._convert_token_to_idH   0      $$UD,=,=,A,A$..,QRRr5   c                 "    |j                         S r/   )split)r'   textr(   s      r   	_tokenizezEsmTokenizer._tokenizeK   s    zz|r5   c                 p    | j                   j                         }|j                  | j                         |S r/   )r"   copyupdateadded_tokens_encoder)r'   
base_vocabs     r   	get_vocabzEsmTokenizer.get_vocabN   s0    &&++-
$334r5   c                     | j                   j                  || j                   j                  | j                              S r/   r8   r9   s     r   token_to_idzEsmTokenizer.token_to_idS   r;   r5   c                 N    | j                   j                  || j                        S r/   r0   r2   s     r   id_to_tokenzEsmTokenizer.id_to_tokenV   r4   r5   Ntoken_ids_0token_ids_1c                     | j                   g}| j                  g}|| j                  ||z   S ||z   |z   S | j                  t        d      ||z   |z   |z   |z   S )Nz=Cannot tokenize multiple sequences when EOS token is not set!)cls_token_ideos_token_id
ValueError)r'   rJ   rK   clsseps        r    build_inputs_with_special_tokensz-EsmTokenizer.build_inputs_with_special_tokensY   s       !  !  ([(([(3..&\]][ 3&4s::r5   already_has_special_tokensc                     |r-|t        d      |D cg c]  }|| j                  v rdnd c}S dgdgt        |      z  z   dgz   }||dgt        |      z  dgz   z  }|S c c}w )a  
        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` or `encode_plus` methods.

        Args:
            token_ids_0 (`list[int]`):
                List of ids of the first sequence.
            token_ids_1 (`list[int]`, *optional*):
                List of ids of the second sequence.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        zYou should not supply a second sequence if the provided sequence of ids is already formatted with special tokens for the model.   r   )rO   all_special_idslen)r'   rJ   rK   rS   r6   masks         r   get_special_tokens_maskz$EsmTokenizer.get_special_tokens_maskg   s    $ && R 
 LWW%$"6"66AA=WWsqcC,,-3"QC#k**aS00D	 Xs   A!c                     t         j                  j                  ||r|dz   nddz         }t        |d      5 }|j	                  dj                  | j
                               d d d        |fS # 1 sw Y   |fS xY w)N- r   w
)ospathjoinr
   writer   )r'   save_directoryfilename_prefixr   r   s        r   save_vocabularyzEsmTokenizer.save_vocabulary   sk    WW\\.O?S3Hacgr2rs
*c" 	0aGGDIIdoo./	0}	0}s   +A--A8c                 ,    t        | j                        S r/   )rW   r   )r'   s    r   
vocab_sizezEsmTokenizer.vocab_size   s    4??##r5   )z<unk>z<cls>z<pad>z<mask>z<eos>r/   )NF)__name__
__module____qualname____doc__VOCAB_FILES_NAMESvocab_files_namesmodel_input_namesr$   intstrr3   r:   r?   rE   rG   rI   listrR   boolrY   re   propertyrg   __classcell__)r+   s   @r   r   r   !   s    *$&67
 78<# <# <S# S# S
S S S< < < GK;9;379t3C;	c; fk.2Tk^b	c> $C $ $r5   r   )rk   r_   tokenization_pythonr   utilsr   
get_loggerrh   loggerrl   r   r   __all__r   r5   r   <module>rz      sQ    $ 	 6  
		H	%!;/ *m$& m$` 
r5   