
    qi                        d Z ddlZddlZddlmZ ddlmZ  ej                  e	      Z
dddZi d	d
dddddddddddddddddddddd d!d"d#d$d%d&d'd(d)d*i d+d,d-d.d/d0d1d2d3d4d5d6d7d8d9d:d;d<d=d>d?d@dAdBdCdDdEdFdGdHdIdJdKdLi dMdNdOdPdQdRdSdTdUdVdWdXdYdZd[d\d]d^d_d`dadbdcdddedfdgdhdidjdkdldmdndodpdqdrdsZdt Z G du dve      ZdvgZy)wz)Tokenization classes for Salesforce CTRL.    N   )PreTrainedTokenizer)loggingz
vocab.jsonz
merges.txt)
vocab_filemerges_file	Pregnancyi Christianityi  Explaini Fitnessi  Savingi  Aski#j  Assiv Jokei~ 	Questionsi6  Thoughtsi  Retailiv  Feminismi Writingi.  Atheismi Netflixi  	Computingiך  Opinioniͨ  Alonei  Funnyi%  Gamingi  Humani  Indiai3  JokeriR- Dietin  LegaliS.  NormaniK  Tipi Weightiw  Moviesi  Runningi[  Sciencei*  Horrori  
Confessioni  Financei/  Politicsi?  Scaryi Supportin1  Technologiesi  Teenageip Eventi  Learnedi Notioni 	Wikipediaiϒ  Booksi	  Extracti) Confessionsi- 
Conspiracyi( Linksi  	NarcissusiK Relationshipi  Relationshipsi iǢ  i  ih  i )ReviewsNewsTranslationmultilingualc                 x    t               }| d   }| dd D ]  }|j                  ||f       |} t        |      }|S )z
    Return set of symbol pairs in a word.

    Word is represented as tuple of symbols (symbols being variable-length strings).
    r      N)setadd)wordpairs	prev_charchars       \/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/ctrl/tokenization_ctrl.py	get_pairsrH   [   sO     EEQIQR 		9d#$	 JEL    c                   `     e Zd ZdZeZeZd
 fd	Ze	d        Z
d Zd Zd Zd Zd Zd	 Z xZS )CTRLTokenizera`  
    Construct a CTRL tokenizer. Based on Byte-Pair-Encoding.

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        merges_file (`str`):
            Path to the merges file.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
    c           
      j   t        |d      5 }t        j                  |      | _        d d d        | j                  j	                         D ci c]  \  }}||
 c}}| _        t        |d      5 }|j                         j                  d      dd }	d d d        	D 
cg c]  }
t        |
j                                }	}
t        t        |	t        t        |	                        | _        i | _        d| _        t!        | D  d
|dddd	| y # 1 sw Y   xY wc c}}w # 1 sw Y   xY wc c}
w )Nzutf-8)encoding
r@   T	all_zerosnone)	unk_tokentoken_type_ids_pattern%token_type_ids_include_special_tokensspecial_tokens_pattern )openjsonloadencoderitemsdecoderreadsplittupledictziprangelen	bpe_rankscacheadd_bpe_version_headersuper__init__)selfr   r   rR   kwargsvocab_handlekvmerges_handlemergesmerge	__class__s              rG   rh   zCTRLTokenizer.__init__   s   *w/ 	3<99\2DL	3)-););)=>A1>+0 	<M"'')//5a;F	<4:;5%&;;c&%F*<=>
&*# 	
#.26#)		

 	
	3 	3>	< 	<;s#   DD2#D$! D0D$D-c                 ,    t        | j                        S N)rc   rZ   ri   s    rG   
vocab_sizezCTRLTokenizer.vocab_size   s    4<<  rI   c                 B    t        | j                  fi | j                  S rs   )r`   rZ   added_tokens_encoderrt   s    rG   	get_vocabzCTRLTokenizer.get_vocab   s    DLL>D$=$=>>rI   c                 $    | j                   v r j                   |   S t        |      }t        t        |d d       |d   dz   gz         }t        |      }|s|S 	 t	        | fd      }| j
                  vrn|\  }}g }d}|t        |      k  r	 |j                  ||      }	|j                  |||	        |	}||   |k(  r6|t        |      dz
  k  r%||dz      |k(  r|j                  ||z          |dz  }n|j                  ||          |dz  }|t        |      k  rt        |      }|}t        |      dk(  rnt        |      }dj                  |      }|d d	 }| j                   |<   |S # t        $ r |j                  ||d         Y nw xY w)
NrO   z</w>c                 N    j                   j                  | t        d            S )Ninf)rd   getfloat)pairri   s    rG   <lambda>z#CTRLTokenizer.bpe.<locals>.<lambda>   s    1C1CD%PU,1W rI   )keyr   r@      @@ )re   r_   listrH   minrd   rc   indexextend
ValueErrorappendjoin)
ri   tokenrC   rD   bigramfirstsecondnew_wordijs
   `         rG   bpezCTRLTokenizer.bpe   s   DJJ::e$$U|T$s)_R6(9'::;$L$WXFT^^+"ME6HAc$i-

5!,A
 OOD1I.A7e#CIM(9d1q5kV>SOOEFN3FAOODG,FA c$i-  XHD4yA~!$9 : zz$CRy 

5- " OODH-s   E/ /FFc                     g }t        j                  d|      }|D ]:  }|j                  t        | j	                  |      j                  d                   < |S )zTokenize a string.z\S+\n? )refindallr   r   r   r^   )ri   textsplit_tokenswordsr   s        rG   	_tokenizezCTRLTokenizer._tokenize   sT    

9d+ 	BETXXe_%:%:3%? @A	BrI   c                     | j                   j                  || j                   j                  | j                              S )z0Converts a token (str) in an id using the vocab.)rZ   r|   rR   )ri   r   s     rG   _convert_token_to_idz"CTRLTokenizer._convert_token_to_id   s,    ||t||'7'7'GHHrI   c                 N    | j                   j                  || j                        S )z=Converts an index (integer) in a token (str) using the vocab.)r\   r|   rR   )ri   r   s     rG   _convert_id_to_tokenz"CTRLTokenizer._convert_id_to_token   s    ||t~~66rI   c                 d    dj                  |      j                  dd      j                         }|S )z:Converts a sequence of tokens (string) in a single string.r   r    )r   replacestrip)ri   tokens
out_strings      rG   convert_tokens_to_stringz&CTRLTokenizer.convert_tokens_to_string   s,    XXf%--eR8>>@
rI   )z<unk>)__name__
__module____qualname____doc__VOCAB_FILES_NAMESvocab_files_namesCONTROL_CODEScontrol_codesrh   propertyru   rx   r   r   r   r   r   __classcell__)rq   s   @rG   rK   rK   k   sN      *!M
$ ! !?*XI7rI   rK   )r   rX   regexr   tokenization_pythonr   utilsr   
get_loggerr   loggerr   r   rH   rK   __all__rV   rI   rG   <module>r      s   0   6  
		H	%  88D8 v8 u	8
 e8 
58 
58 F8 8 8 e8 8 u8 v8 u8  !8" u#8$ U%8& U'8( e)8* T+8, T-8. U/80 E182 U384 d586 
5788 e98: e;8< u=8> t?8@ eA8B %C8D uE8F G8H VI8J uK8L EM8N uO8P UQ8R uS8T fU8V W8X TY8Z u[8\ 6]8^ %_8` Ua8b c8d Ee8f Vg8h o8v n' nn 
rI   