
    qi	                         d Z ddlmZmZmZmZ ddlmZ ddlm	Z	 ddl
mZ  ej                  e      Zddd	d
Z G d de	      ZdgZy)z$Tokenization classes for OpenAI GPT.    )	Tokenizerdecodersnormalizerspre_tokenizers)BPE   )TokenizersBackend)loggingz
vocab.jsonz
merges.txtztokenizer.json)
vocab_filemerges_filetokenizer_filec                        e Zd ZdZeZddgZeZ	 	 	 d
de	e
e	ef   z  dz  de	ee	   z  dz  de	f fdZed	        Z xZS )OpenAIGPTTokenizera  
    Construct a GPT Tokenizer (backed by HuggingFace's *tokenizers* library). Based on Byte-Pair-Encoding with
    the following peculiarities:

    - lower case all inputs
    - uses BERT's BasicTokenizer for pre-BPE tokenization

    This tokenizer inherits from [`TokenizersBackend`] which contains most of the main methods. Users should
    refer to this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`, *optional*):
            Path to the vocabulary file.
        merges_file (`str`, *optional*):
            Path to the merges file.
        tokenizer_file (`str`, *optional*):
            Path to a tokenizers JSON file containing the serialization of a tokenizer.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        vocab (`str` or `dict[str, int]`, *optional*):
            Custom vocabulary dictionary. If not provided, a blank vocabulary is initialized.
        merges (`str` or `list[str]`, *optional*):
            Custom merges list. If not provided, an empty list is used.
    	input_idsattention_maskNvocabmerges	unk_tokenc                    ||nt        |      di| _        |xs g | _        t        t	        | j                  | j                  d dddt        |                  | _        t        j                  d      | j
                  _        t        j                         | j
                  _        t        j                  d      | j
                  _        t        | @  d
d	|i| y )Nr    z</w>F)r   r   dropoutcontinuing_subword_prefixend_of_word_suffixfuse_unkr   T)	lowercase)suffixr    )str_vocab_mergesr   r   
_tokenizerr   BertNormalizer
normalizerr   BertPreTokenizerpre_tokenizerr   
BPEDecoderdecodersuper__init__)selfr   r   r   kwargs	__class__s        `/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/openai/tokenization_openai.pyr)   zOpenAIGPTTokenizer.__init__;   s      %0es9~q6I|#kk||*,#)i.

 &1%?%?$%O"(6(G(G(I%"*"5"5V"D 	
	
	
    c                      y)NTr   )r*   s    r-   do_lower_casez OpenAIGPTTokenizer.do_lower_case]   s    r.   )NNz<unk>)__name__
__module____qualname____doc__VOCAB_FILES_NAMESvocab_files_namesmodel_input_namesr   modelr   dictintlistr)   propertyr0   __classcell__)r,   s   @r-   r   r      s    4 *$&67E .2)- 	 
T#s(^#d* 
 d3i$& 
 	 
D  r.   r   N)r4   
tokenizersr   r   r   r   tokenizers.modelsr   tokenization_utils_tokenizersr	   utilsr
   
get_loggerr1   loggerr5   r   __all__r   r.   r-   <module>rE      sU    + G G ! >  
		H	%#/`pq C* CL  
 r.   