
    qiC_              	       0   d Z ddlZddlZddlZddlZddlmZ ddlmZ  ej                  e
      ZdddZd	 Z G d
 de      Z	 dZdZededdddddf	Z ej$                  ddj'                  e      z  ej(                  ej*                  z  ej,                  z        Z ej$                  d      Z ej$                  eej(                  ej*                  z  ej,                  z        Z ej$                  d      Zd dZd!dZ G d d      Zd Zd Zd"dZ dgZ!y)#z!Tokenization classes for BERTweet    N   )PreTrainedTokenizer)logging	vocab.txt	bpe.codes)
vocab_filemerges_filec                 x    t               }| d   }| dd D ]  }|j                  ||f       |} t        |      }|S )z
    Return set of symbol pairs in a word.

    Word is represented as tuple of symbols (symbols being variable-length strings).
    r      N)setadd)wordpairs	prev_charchars       d/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/bertweet/tokenization_bertweet.py	get_pairsr   #   sO     EEQIQR 		9d#$	 JEL    c            	            e Zd ZdZeZ	 	 	 	 	 	 	 	 d fd	Zed        Zd Z	d Z
d Zd Zd Zd	 Zd
 Zd Zddededz  deedf   fdZd Z xZS )BertweetTokenizera	  
    Constructs a BERTweet tokenizer, using Byte-Pair-Encoding.

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        merges_file (`str`):
            Path to the merges file.
        normalization (`bool`, *optional*, defaults to `False`):
            Whether or not to apply a normalization preprocess.
        bos_token (`str`, *optional*, defaults to `"<s>"`):
            The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.

            <Tip>

            When building a sequence using special tokens, this is not the token that is used for the beginning of
            sequence. The token used is the `cls_token`.

            </Tip>

        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end of sequence token.

            <Tip>

            When building a sequence using special tokens, this is not the token that is used for the end of sequence.
            The token used is the `sep_token`.

            </Tip>

        sep_token (`str`, *optional*, defaults to `"</s>"`):
            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
            sequence classification or for a text and a question for question answering. It is also used as the last
            token of a sequence built with special tokens.
        cls_token (`str`, *optional*, defaults to `"<s>"`):
            The classifier token which is used when doing sequence classification (classification of the whole sequence
            instead of per-token classification). It is the first token of the sequence when built with special tokens.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
        mask_token (`str`, *optional*, defaults to `"<mask>"`):
            The token used for masking values. This is the token used when training this model with masked language
            modeling. This is the token which the model will try to predict.
    c                    	 ddl m} || _        || _        || _        i | _        d| j                  t        |      <   d| j                  t        |	      <   d| j                  t        |      <   d| j                  t        |      <   | j                  |       | j                  j                         D ci c]  \  }}||
 c}}| _        t        |d      5 }|j                         j                  d	      d d
 }d d d        D cg c]  }t!        |j                         d d
         }}t#        t%        |t'        t)        |                        | _        i | _        || _        t1               | _        ddd| _        t7        | p  d|||||||	|
dddd| y # t        $ r  t        j                  d       d | _        Y w xY wc c}}w # 1 sw Y   xY wc c}w )Nr   )demojizezsemoji is not installed, thus not converting emoticons or emojis into text. Install emoji: pip3 install emoji==0.6.0r      r   utf-8encoding
'z...)u   ’u   …	all_zerosTcls_double_sep)normalization	bos_token	eos_token	sep_token	cls_token	unk_token	pad_token
mask_tokentoken_type_ids_pattern%token_type_ids_include_special_tokensspecial_tokens_pattern )emojir   	demojizerImportErrorloggerwarningr   r	   encoderstradd_from_fileitemsdecoderopenreadsplittupledictziprangelen	bpe_rankscacher"   TweetTokenizertweetPreprocessorspecial_punctssuper__init__)selfr   r	   r"   r#   r$   r%   r&   r'   r(   r)   kwargsr   kvmerges_handlemergesmerge	__class__s                     r   rF   zBertweetTokenizer.__init__h   s   		"&%DN %&'(S^$'(S^$'(S^$'(S^$:&)-););)=>A1>+0 	;M"'')//5cr:F	;9?@%cr*+@@c&%F*<=>
*!/!1&)%8 	
'!#.26#3	
 	
=  	"NN( "DN	"$ ?	; 	;@s)   F 3G#G#G%GGGc                 ,    t        | j                        S N)r?   r3   rG   s    r   
vocab_sizezBertweetTokenizer.vocab_size   s    4<<  r   c                 B    t        | j                  fi | j                  S rP   )r<   r3   added_tokens_encoderrQ   s    r   	get_vocabzBertweetTokenizer.get_vocab   s    DLL>D$=$=>>r   c                 $    | j                   v r j                   |   S t        |      }t        t        |d d       |d   dz   gz         }t        |      }|s|S 	 t	        | fd      }| j
                  vrn|\  }}g }d}|t        |      k  r	 |j                  ||      }	|j                  |||	        |	}||   |k(  r6|t        |      dz
  k  r%||dz      |k(  r|j                  ||z          |dz  }n|j                  ||          |dz  }|t        |      k  rt        |      }|}t        |      dk(  rnt        |      }dj                  |      }|d d	 }| j                   |<   |S # t        $ r |j                  ||d         Y nw xY w)
Nr   z</w>c                 N    j                   j                  | t        d            S )Ninf)r@   getfloat)pairrG   s    r   <lambda>z'BertweetTokenizer.bpe.<locals>.<lambda>   s    1C1CD%PU,1W r   keyr   r   r   @@ )rA   r;   listr   minr@   r?   indexextend
ValueErrorappendjoin)
rG   tokenr   r   bigramfirstsecondnew_wordijs
   `         r   bpezBertweetTokenizer.bpe   s   DJJ::e$$U|T$s)_R6(9'::;$L$WXFT^^+"ME6HAc$i-

5!,A
 OOD1I.A7e#CIM(9d1q5kV>SOOEFN3FAOODG,FA c$i-  XHD4yA~!$9 : zz$CRy 

5- " OODH-s   E/ /FFc                     | j                   r| j                  |      }g }t        j                  d|      }|D ]:  }|j	                  t        | j                  |      j                  d                   < |S )zTokenize a string.z\S+\n? )r"   normalizeTweetrefindallrd   ra   ro   r:   )rG   textsplit_tokenswordsrh   s        r   	_tokenizezBertweetTokenizer._tokenize   sm    &&t,D

9d+ 	BETXXe_%:%:3%? @A	Br   c                 (   | j                   D ]!  }|j                  || j                   |         }# | j                  j                  |      }dj	                  |D cg c]  }| j                  |       c}      }|j                  dd      j                  dd      j                  dd      j                  dd      j                  d	d
      }|j                  dd      j                  dd      j                  dd      j                  dd      j                  dd      j                  dd      }|j                  dd      j                  dd      j                  dd      j                  dd      }dj	                  |j                               S c c}w )z'
        Normalize a raw Tweet
        rq   zcannot zcan not zn't z n't zn 't zca n'tzcan'tzai n'tzain'tz'm z 'm z're z 're z's z 's z'll z 'll z'd z 'd z've z 've z p . m .z  p.m.z p . m z p.m z a . m .z a.m.z a . m z a.m )rD   replacerC   tokenizerg   normalizeTokenr:   )rG   tweetpuncttokensrh   	normTweets         r   rr   z BertweetTokenizer.normalizeTweet   se    (( 	EEMM%)<)<U)CDE	E ''007HHfMUd11%8MN	 i4WVW%WWg&WXw'WXw' 	 eV,WVW%WUF#WVW%WUF#WVW% 	 j(3WY(WZ)WY(	 	 xx	)**1 Ns   Fc                 $   |j                         }|j                  d      ry|j                  d      s|j                  d      ryt        |      dk(  r<|| j                  v r| j                  |   S | j                  | j	                  |      S |S |S )z-
        Normalize tokens in a Tweet
        @z@USERhttpwwwHTTPURLr   )lower
startswithr?   rD   r/   )rG   rh   lowercased_tokens      r   r|   z BertweetTokenizer.normalizeToken  s     !;;=C ((04D4O4OPU4VZ1_+++**511~~)~~e,,Lr   c                     | j                   j                  || j                   j                  | j                              S )z0Converts a token (str) in an id using the vocab.)r3   rY   r'   )rG   rh   s     r   _convert_token_to_idz&BertweetTokenizer._convert_token_to_id  s,    ||t||'7'7'GHHr   c                 N    | j                   j                  || j                        S )z=Converts an index (integer) in a token (str) using the vocab.)r7   rY   r'   )rG   rc   s     r   _convert_id_to_tokenz&BertweetTokenizer._convert_id_to_token  s    ||t~~66r   c                 d    dj                  |      j                  dd      j                         }|S )z:Converts a sequence of tokens (string) in a single string.rq   r_    )rg   rz   strip)rG   r   
out_strings      r   convert_tokens_to_stringz*BertweetTokenizer.convert_tokens_to_string#  s,    XXf%--eR8>>@
r   Nsave_directoryfilename_prefixreturn.c                    t         j                  j                  |      st        j	                  d| d       yt        | di       }|r| dnd}t         j                  j                  |||j                  dd      z         }t        |d	d
      5 }t        | j                  j                         d       D ]"  \  }}|dk\  s|j                  | d| d       $ 	 ddd       t         j                  j                  |||j                  dd      z         }	t        |	d	d
      5 }
|
j                  d t        | j                  j                         d       D               ddd       ||	fS # 1 sw Y   xY w# 1 sw Y   ||	fS xY w)zF
        Save the vocabulary and merges files to a directory.
        zVocabulary path (z) should be a directoryr-   vocab_files_names-r   r   r   wr   r   c                     | d   S Nr   r-   kvs    r   r\   z3BertweetTokenizer.save_vocabulary.<locals>.<lambda>=  s    rRSu r   r]      rq   r   Nr	   r   c              3   J   K   | ]  \  }}d j                  |      dz     yw)rq   r   N)rg   ).0
bpe_tokenstoken_indexs      r   	<genexpr>z4BertweetTokenizer.save_vocabulary.<locals>.<genexpr>E  s*      +J $t+s   !#c                     | d   S r   r-   r   s    r   r\   z3BertweetTokenizer.save_vocabulary.<locals>.<lambda>G  s    ]_`a]b r   )ospathisdirr1   errorgetattrrg   rY   r8   sortedr3   r6   write
writelinesr@   )rG   r   r   r   prefixr   frh   token_id
merge_filewriters              r   save_vocabularyz!BertweetTokenizer.save_vocabulary.  sy    ww}}^,LL,^,<<STU#D*=rB*9O$A&r WW\\.&;L;P;PQ]_j;k2kl
*cG4 	5#)$,,*<*<*>DT#U 5xq=GGugQxj345	5 WW\\.&;L;P;PQ^`k;l2lm
*cG4 	 /5dnn6J6J6LRb/c 	 J''	5 	5	 J''s   2E0E0'=E<0E9<Fc                    t        |t              r*	 t        |dd      5 }| j                  |       ddd       y|j                         }|D ]Z  }|j                         }|j                  d      }|dk(  rt        d	      |d| }t        | j                        | j                  |<   \ y# 1 sw Y   yxY w# t        $ r}|d}~wt
        $ r t        d| d      w xY w)
zi
        Loads a pre-existing dictionary from a text file and adds its symbols to this instance.
        rr   r   NzIncorrect encoding detected in z, please rebuild the datasetrq   r   z5Incorrect dictionary format, expected '<token> <cnt>')
isinstancer4   r8   r5   FileNotFoundErrorUnicodeError	Exception	readlinesr   rfindre   r?   r3   )	rG   r   fdfnfelineslineTmplineidxr   s	            r   r5   zBertweetTokenizer.add_from_fileL  s     ac!S73 +r&&r*+  	3G==?D**S/Cby !XYY:D!$T\\!2DLL	3+ 	 % 
 c"A!D` abbcs3   B7 B+B7 +B40B7 4B7 7	C CC)F<s></s>r   r   z<unk>z<pad>z<mask>rP   )__name__
__module____qualname____doc__VOCAB_FILES_NAMESr   rF   propertyrR   rU   ro   rx   rr   r|   r   r   r   r4   r;   r   r5   __classcell__)rN   s   @r   r   r   3   s    0d * >
@ ! !?*X	 +D&I7(c (C$J (Z_`ceh`hZi (<3r   r   ac  
    (?:
      [<>]?
      [:;=8]                     # eyes
      [\-o\*\']?                 # optional nose
      [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth
      |
      [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth
      [\-o\*\']?                 # optional nose
      [:;=8]                     # eyes
      [<>]?
      |
      <3                         # heart
    )u  			# Capture 1: entire matched URL
  (?:
  https?:				# URL protocol and colon
    (?:
      /{1,3}				# 1-3 slashes
      |					#   or
      [a-z0-9%]				# Single letter or digit or '%'
                                       # (Trying not to match e.g. "URI::Escape")
    )
    |					#   or
                                       # looks like domain name followed by a slash:
    [a-z0-9.\-]+[.]
    (?:[a-z]{2,13})
    /
  )
  (?:					# One or more:
    [^\s()<>{}\[\]]+			# Run of non-space, non-()<>{}[]
    |					#   or
    \([^\s()]*?\([^\s()]+\)[^\s()]*?\) # balanced parens, one level deep: (...(...)...)
    |
    \([^\s]+?\)				# balanced parens, non-recursive: (...)
  )+
  (?:					# End with:
    \([^\s()]*?\([^\s()]+\)[^\s()]*?\) # balanced parens, one level deep: (...(...)...)
    |
    \([^\s]+?\)				# balanced parens, non-recursive: (...)
    |					#   or
    [^\s`!()\[\]{};:'".,<>?«»“”‘’]	# not a space or one of these punct chars
  )
  |					# OR, the following to match naked domains:
  (?:
    (?<!@)			        # not preceded by a @, avoid matching foo@_gmail.com_
    [a-z0-9]+
    (?:[.\-][a-z0-9]+)*
    [.]
    (?:[a-z]{2,13})
    \b
    /?
    (?!@)			        # not succeeded by a @,
                            # avoid matching "foo.na" in "foo.na@example.com"
  )
a	  
    (?:
      (?:            # (international)
        \+?[01]
        [ *\-.\)]*
      )?
      (?:            # (area code)
        [\(]?
        \d{3}
        [ *\-.\)]*
      )?
      \d{3}          # exchange
      [ *\-.\)]*
      \d{4}          # base
    )z	<[^>\s]+>z[\-]+>|<[\-]+z(?:@[\w_]+)z(?:\#+[\w_]+[\w\'_\-]*[\w_]+)z#[\w.+-]+@[\w-]+\.(?:[\w-]\.?)+[\w-]a  
    (?:[^\W\d_](?:[^\W\d_]|['\-_])+[^\W\d_]) # Words with apostrophes or dashes.
    |
    (?:[+\-]?\d+[,/.:-]\d+[+\-]?)  # Numbers, including fractions, decimals.
    |
    (?:[\w_]+)                     # Words without apostrophes or dashes.
    |
    (?:\.(?:\s*\.){1,})            # Ellipsis dots.
    |
    (?:\S)                         # Everything else that isn't whitespace.
    z(%s)|z([^a-zA-Z0-9])\1{3,}z&(#?(x?))([^&;\s]+);c                 R    |d}t        | t              r| j                  ||      S | S )Nr   )r   bytesdecode)ru   r   errorss      r   _str_to_unicoder     s-    ${{8V,,Kr   c                 R    fd}t         j                  |t        | |            S )u  
    Remove entities from text by converting them to their corresponding unicode character.

    Args:
        text:
            A unicode string or a byte string encoded in the given *encoding* (which defaults to 'utf-8').
        keep (list):
            List of entity names which should not be replaced. This supports both numeric entities (`&#nnnn;` and
            `&#hhhh;`) and named entities (such as `&nbsp;` or `&gt;`).
        remove_illegal (bool):
            If `True`, entities that can't be converted are removed. Otherwise, entities that can't be converted are
            kept "as is".
    Returns: A unicode string with the entities removed.

    See https://github.com/scrapy/w3lib/blob/master/w3lib/html.py

    Examples:

    ```python
    >>> from nltk.tokenize.casual import _replace_html_entities

    >>> _replace_html_entities(b"Price: &pound;100")
    'Price: \xa3100'

    >>> print(_replace_html_entities(b"Price: &pound;100"))
    Price: £100
    ```c                     | j                  d      }| j                  d      rU	 | j                  d      rt        |d      }nt        |d      }d|cxk  rdk  rn nt        |f      j                  d      S n>|v r| j                  d	      S t
        j                  j                  j                  |      }|	 t        |      S rd
S | j                  d	      S # t        $ r d }Y 0w xY w# t        t        f$ r Y 7w xY w)Nr   r   r      
         cp1252r   r   )groupintr   r   re   htmlentitiesname2codepointrY   chrOverflowError)matchentity_bodynumberkeepremove_illegals      r   _convert_entityz/_replace_html_entities.<locals>._convert_entity;  s    kk!n;;q>;;q> b1F b1F
 6)T) &+228<< d"{{1~%5599+F6{" $r7Q7   . s$   AC :
C+ C('C(+C=<C=)ENT_REsubr   )ru   r   r   r   r   s    ``  r   _replace_html_entitiesr     s"    :8: ::otX'FGGr   c                       e Zd ZdZddZd Zy)rB   a  
    Examples:

    ```python
    >>> # Tokenizer for tweets.
    >>> from nltk.tokenize import TweetTokenizer

    >>> tknzr = TweetTokenizer()
    >>> s0 = "This is a cooool #dummysmiley: :-) :-P <3 and some arrows < > -> <--"
    >>> tknzr.tokenize(s0)
    ['This', 'is', 'a', 'cooool', '#dummysmiley', ':', ':-)', ':-P', '<3', 'and', 'some', 'arrows', '<', '>', '->', '<--']

    >>> # Examples using *strip_handles* and *reduce_len parameters*:
    >>> tknzr = TweetTokenizer(strip_handles=True, reduce_len=True)
    >>> s1 = "@remy: This is waaaaayyyy too much for you!!!!!!"
    >>> tknzr.tokenize(s1)
    [':', 'This', 'is', 'waaayyy', 'too', 'much', 'for', 'you', '!', '!', '!']
    ```c                 .    || _         || _        || _        y rP   preserve_case
reduce_lenstrip_handles)rG   r   r   r   s       r   rF   zTweetTokenizer.__init__r  s    *$*r   c                 X   t        |      }| j                  rt        |      }| j                  rt	        |      }t
        j                  d|      }t        j                  |      }| j                  s4|D cg c])  }t        j                  |      r|n|j                         + }}|S c c}w )z
        Args:
            text: str

        Returns: list(str) A tokenized list of strings; concatenating this list returns the original string if
        `preserve_case=False`
        \1\1\1)r   r   remove_handlesr   reduce_lengtheningHANG_REr   WORD_RErt   r   EMOTICON_REsearchr   )rG   ru   	safe_textrw   xs        r   r{   zTweetTokenizer.tokenizew  s     &d+!$'D??%d+DKK	40		*!!HMN1+,,Q/QQWWY>NEN Os   5.B'NTFF)r   r   r   r   rF   r{   r-   r   r   rB   rB   ^  s    &+
r   rB   c                 P    t        j                  d      }|j                  d|       S )za
    Replace repeated character sequences of length 3 or greater with sequences of length 3.
    z	(.)\1{2,}r   regexcompiler   ru   patterns     r   r   r     s#     mmL)G;;y$''r   c                 P    t        j                  d      }|j                  d|       S )z4
    Remove Twitter username handles from text.
    zv(?<![A-Za-z0-9_!@#\$%&*])@(([A-Za-z0-9_]){20}(?!@))|(?<![A-Za-z0-9_!@#\$%&*])@(([A-Za-z0-9_]){1,19})(?![A-Za-z0-9_]*@)rq   r   r   s     r   r   r     s+     mm 	BG ;;sD!!r   c                 <    t        |||      j                  |       S )z:
    Convenience function for wrapping the tokenizer.
    r   )rB   r{   )ru   r   r   r   s       r   casual_tokenizer     s$     *\ijss r   )Nstrict)r-   Tr   r   )"r   r   r   rs   r   tokenization_pythonr   utilsr   
get_loggerr   r1   r   r   r   	EMOTICONSURLSREGEXPSr   rg   VERBOSEIUNICODEr   r   r   r   r   r   rB   r   r   r   __all__r-   r   r   <module>r     sQ   (  	 	  6  
		H	%   n3+ n3x	L		$)\ 		  (.
A+` %--chhw&779PSXS`S`9`
a %--/
0 emmIu}}uww'>'NO 
.	/:H@0 0p("  
r   