
    qi9                         d Z ddlZddlZddlmZmZ  e       rddlZddlmZ ddl	m
Z
  e
j                  e      ZddiZd	 Z G d
 d      Z G d de      ZdgZy)z Tokenization classes for CPMAnt.    N)is_rjieba_availablerequires_backends   )PreTrainedTokenizer)logging
vocab_filez	vocab.txtc                     t        j                         }t        | dd      5 }|j                         }ddd       t	              D ]  \  }}|j                  d      }|||<    |S # 1 sw Y   4xY w)z*Loads a vocabulary file into a dictionary.rutf-8encodingN
)collectionsOrderedDictopen	readlines	enumeraterstrip)r   vocabreadertokensindextokens         `/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/cpmant/tokenization_cpmant.py
load_vocabr   "   sw    ##%E	j#	0 $F!!#$!&) uT"e L$ $s   A''A0c                       e Zd ZddZd Zy)WordpieceTokenizerc                 .    || _         || _        || _        y N)r   	unk_tokenmax_input_chars_per_word)selfr   r    r!   s       r   __init__zWordpieceTokenizer.__init__.   s    
"(@%    c                    t        |      }t        |      | j                  kD  r| j                  gS d}g }|t        |      k  rt        |      }d }||k  r0dj	                  |||       }|| j
                  v r|}n|dz  }||k  r0|!|j                  | j                         |dz  }n|j                  |       |}|t        |      k  r|S )Nr       )listlenr!   r    joinr   append)r"   r   charsstart
sub_tokensend
cur_substrsubstrs           r   tokenizezWordpieceTokenizer.tokenize3   s    Uu:555NN##
c%j e*CJ#+uS!12TZZ'!'Jq #+ !!!$..1
!!*- c%j   r$   N)<unk>   )__name__
__module____qualname__r#   r2    r$   r   r   r   -   s    A
r$   r   c                        e Zd ZdZeZddgZdZ	 	 	 	 	 	 	 	 	 d fd	Ze	d        Z
e	d        Ze	d        Ze	d	efd
       Zd Zd Z fdZd Zdee   d	efdZd Zd Zddededz  d	ee   fdZ xZS )CpmAntTokenizera  
    Construct a CPMAnt tokenizer. Based on byte-level Byte-Pair-Encoding.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        bod_token (`str`, *optional*, defaults to `"<d>"`):
            The beginning of document token.
        eod_token (`str`, *optional*, defaults to `"</d>"`):
            The end of document token.
        bos_token (`str`, *optional*, defaults to `"<s>"`):
            The beginning of sequence token.
        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end of sequence token.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token.
        line_token (`str`, *optional*, defaults to `"</n>"`):
            The line token.
        space_token (`str`, *optional*, defaults to `"</_>"`):
            The space token.
    	input_idsattention_maskFc                    t        | dg       || _        || _        t        |      | _        | j                  |	   | j                  d<   | j                  |   | j                  d<   | j                  |	= | j                  |= t        j                  t        | j                  j                         d             | _        | j                  j                         D ci c]  \  }}||
 c}}| _	        t        | j                  |      | _        t        | 4  d||||||||	|
ddd	d
| |	|fD ]=  }| j                  j                  |d       }|"| j                   j                  |d        ? | j#                          y c c}}w )Nrjieba r   c                     | d   S Nr'   r8   xs    r   <lambda>z*CpmAntTokenizer.__init__.<locals>.<lambda>       Z[\]Z^ r$   key)r   r    	all_zerosTbos)	bod_token	eod_token	bos_token	eos_token	pad_tokenr    
line_tokenspace_tokenpadding_sidetoken_type_ids_pattern%token_type_ids_include_special_tokensspecial_tokens_patternr8   )r   rJ   rK   r   encoderr   r   sorteditemsdecoderr   wordpiece_tokenizersuperr#   added_tokens_encoderpop_added_tokens_decoder_update_total_vocab_size)r"   r   rJ   rK   rL   rM   rN   r    rO   rP   rQ   kwargskvspecial_tokentoken_id	__class__s                   r   r#   zCpmAntTokenizer.__init__j   sp    	$
+""!*- LL5S!\\*5TLL%LL$"..vdll6H6H6JP^/_`)-););)=>A1>#5DLLT]#^  	
!#%#.26#(	
 	
 *:6 	?M0044]DIH#**..x>	? 	%%'/ ?s   E=c                 4    | j                   | j                     S r   )rU   rJ   r"   s    r   bod_token_idzCpmAntTokenizer.bod_token_id       ||DNN++r$   c                 4    | j                   | j                     S r   )rU   rK   rf   s    r   eod_token_idzCpmAntTokenizer.eod_token_id   rh   r$   c                      | j                   d   S )Nr   rU   rf   s    r   
newline_idzCpmAntTokenizer.newline_id   s    ||D!!r$   returnc                 ,    t        | j                        S r   )r)   rU   rf   s    r   
vocab_sizezCpmAntTokenizer.vocab_size   s    4<<  r$   c                 B    t        | j                  fi | j                  S r   )dictrU   r[   rf   s    r   	get_vocabzCpmAntTokenizer.get_vocab   s    DLL>D$=$=>>r$   c                     g }t        j                  |d      D ],  }|j                  | j                  j	                  |             . |S )zTokenize a string.F)r>   cutextendrY   r2   )r"   textoutput_tokensrC   s       r   	_tokenizezCpmAntTokenizer._tokenize   sH    D%( 	GA  !9!9!B!B1!EF	Gr$   c                     |D cg c]
  }|dk\  s	| }}|D cg c]4  }|| j                   k7  s|| j                  k7  s#|| j                  k7  s3|6 }}t        |   |fi |S c c}w c c}w )zDecode ids into a string.r   )pad_token_ideos_token_idbos_token_idrZ   _decode)r"   	token_idsr_   irC   rd   s        r   r~   zCpmAntTokenizer._decode   s     )41Q!VQ4	4 
A):):$:qDDUDU?UZ[_c_p_pZpA
	 
 wy3F33	 5
s    
A&A&A+A+ A+A+c                     || j                   v S r   rl   r"   r   s     r   checkzCpmAntTokenizer.check   s    $$r$   r   c                 $    dj                  |      S )Nr&   )r*   )r"   r   s     r   convert_tokens_to_stringz(CpmAntTokenizer.convert_tokens_to_string   s    wwvr$   c                     | j                   j                  || j                   j                  | j                              S )z0Converts a token (str) in an id using the vocab.)rU   getr    r   s     r   _convert_token_to_idz$CpmAntTokenizer._convert_token_to_id   s,    ||t||'7'7'GHHr$   c                 N    | j                   j                  || j                        S )z=Converts an index (integer) in a token (str) using the vocab.)rX   r   r    )r"   r   s     r   _convert_id_to_tokenz$CpmAntTokenizer._convert_id_to_token   s    ||t~~66r$   Nsave_directoryfilename_prefixc                     t         j                  j                  |      r2t         j                  j                  ||r|dz   ndt        d   z         }n|r|dz   nd|z   }d}d| j
                  v r)| j
                  d   | j
                  d<   | j
                  d= d| j
                  v r)| j
                  d   | j
                  d<   | j
                  d= t        j                  t        | j
                  j                         d	 
            | _        t        |dd      5 }| j
                  j                         D ]>  \  }}||k7  rt        j                  d| d       |}|j                  |dz          |dz  }@ 	 d d d        |fS # 1 sw Y   |fS xY w)N-r&   r   r   r?   </_>r   </n>c                     | d   S rA   r8   rB   s    r   rD   z1CpmAntTokenizer.save_vocabulary.<locals>.<lambda>   rE   r$   rF   wr   r   zSaving vocabulary to z\: vocabulary indices are not consecutive. Please check that the vocabulary is not corrupted!r'   )ospathisdirr*   VOCAB_FILES_NAMESrU   r   r   rV   rW   r   loggerwarningwrite)r"   r   r   r   r   writerr   token_indexs           r   save_vocabularyzCpmAntTokenizer.save_vocabulary   sx   77==(/3!6rUfgsUt tJ 4C/C/n\J$,,#'<<#4DLL S!4<<#'<<#5DLL T""..vdll6H6H6JP^/_`*cG4 		&*ll&8&8&: "{K'NN/
| <N N (EUT\*
		 }		 }s   AFF)	z<d>z</d>z<s>z</s>z<pad>r3   r   r   leftr   )r5   r6   r7   __doc__r   vocab_files_namesmodel_input_namesadd_prefix_spacer#   propertyrg   rj   rm   intrp   rs   ry   r~   r   r(   strr   r   r   tupler   __classcell__)rd   s   @r   r:   r:   M   s    0 *$&67
 0(d , , , , " " !C ! !?4%tCy S I7c C$J Z_`cZd r$   r:   )r   r   r   transformers.utilsr   r   r>   tokenization_pythonr   utilsr   
get_loggerr5   r   r   r   r   r:   __all__r8   r$   r   <module>r      sm    '  	 E  6  
		H	%!;/  @X) Xv 
r$   