
    qi,                     D   U d Z ddlZddlmZ ddlmZmZmZmZm	Z	 ddl
Z
ddlmZ ddlmZ 	 e
j                  j!                  d        eh d      Zee   ed<    eh d      Zee   ed<   eez
  Zee   ed<   e	eef   Zdedej<                  dededef
dZdede fdZ!dedee   dee   de de	ee   e f   f
dZ"e G d d             Z#dee#   defd Z$y# e$ rC 	  e
j$                  dd	
       n*# eef$ r Z ej,                  de d       Y dZ[ndZ[ww xY wY w xY w)!a  Text processing utilities for sentence boundary detection and tag parsing.

This module provides utilities for natural language text processing including
sentence boundary detection, email and number pattern handling, and XML-style
tag parsing for structured text content.

Dependencies:
    This module uses NLTK (Natural Language Toolkit) for robust sentence
    tokenization. NLTK is licensed under the Apache License 2.0.
    See: https://www.nltk.org/
    Source: https://www.nltk.org/api/nltk.tokenize.punkt.html
    N)	dataclass)	FrozenSetListOptionalSequenceTuple)logger)sent_tokenizeztokenizers/punkt_tab	punkt_tabT)quietz4Failed to download NLTK 'punkt_tab' tokenizer data: aE  . This data is required for sentence tokenization features. The download failed due to filesystem permissions. To resolve: pre-install the data in a location with appropriate read permissions, or set the NLTK_DATA environment variable to point to a writable directory. See https://www.nltk.org/data.html for more information.>      ՜   ՞   ։   ؏   ؛   ؟   ۔   ।   ॥   ໌   །   ༎   ၊   ။   ።   ፧   ፨   ។   ៕   。   ！   ．   ；   ？   ｡   …!.;?SENTENCE_ENDING_PUNCTUATION>   r&   r'   r(   r)   r*   "_LATIN_SENTENCE_ENDING_PUNCTUATION'UNAMBIGUOUS_SENTENCE_ENDING_PUNCTUATIONtextmatcholdnewreturnc                     |j                         }|j                         }| || j                  ||      }| d| |z   | |d z   } | S )a  Replace occurrences of a substring within a matched section of text.

    Args:
        text: The input text in which replacements will be made.
        match: A regex match object representing the section of text to modify.
        old: The substring to be replaced.
        new: The substring to replace `old` with.

    Returns:
        The modified text with the specified replacements made within the matched section.
    N)startendreplace)r.   r/   r0   r1   r4   r5   replacements          F/opt/pipecat/venv/lib/python3.12/site-packages/pipecat/utils/string.pyreplace_matchr9   j   sR     KKME
))+CuS/))#s3K<+%ST
2DK    c                    | j                         } | syt        |       }|sy|d   }t        |      dk(  r@|| k(  r;| r| d   t        v rt        |       S t	        |       D ]  \  }}|t
        v s|dz   c S  yt        |      dkD  rt        |      S y)a  Find the position of the end of a sentence in the provided text.

    This function uses NLTK's sentence tokenizer to detect sentence boundaries
    in the input text, combined with punctuation verification to ensure that
    single tokens without proper sentence endings aren't considered complete sentences.

    Args:
        text: The input text in which to find the end of the sentence.

    Returns:
        The position of the end of the sentence if found, otherwise 0.
    r      )rstripr
   lenr+   	enumerater-   )r.   	sentencesfirst_sentenceichs        r8   match_endofsentencerE   }   s     ;;=D d#Iq\N 9~~5DH ;;t9 t_ 	EAr<<1u	  9~>"" r:   tagscurrent_tagcurrent_tag_indexc                     |r|\  }}|| |d v rdt        |       fS ||fS |D ]g  \  }}| |d j                  |      }| |d j                  |      }|dk(  r|dk(  rd|fc S ||kD  r||ft        |       fc S ||k(  sZdt        |       fc S  d|fS )a  Parse text to identify start and end tag pairs.

    If a start tag was previously found (i.e., current_tag is valid), wait for
    the corresponding end tag. Otherwise, wait for a start tag.

    This function returns the index in the text where parsing should continue
    in the next call and the current or new tags.

    Args:
        text: The text to be parsed.
        tags: List of tuples containing start and end tags.
        current_tag: The currently active tags, if any.
        current_tag_index: The current index in the text.

    Returns:
        A tuple containing None or the current tag and the index of the text.
    Nr   )r?   count)	r.   rF   rG   rH   _end_tag	start_tagstart_tag_countend_tag_counts	            r8   parse_start_end_tagsrP      s    0  
7d,-..#d)$$.// # %	701288C./066w?aMQ$6+,,},(#d)44-#d)$$% #$$r:   c                   ,    e Zd ZU dZeed<   eed<   d Zy)TextPartForConcatenationa  Class representing a part of text for concatenation with concatenate_aggregated_text.

    Parameters:
        text: The text content.
        includes_inter_part_spaces: Whether any necessary inter-frame
            (leading/trailing) spaces are already included in the text.
    r.   includes_inter_part_spacesc                 T    | j                    d| j                   d| j                   dS )Nz(text: [z], includes_inter_part_spaces: ))namer.   rS   )selfs    r8   __str__z TextPartForConcatenation.__str__   s,    ))HTYYK/NtOnOnNoopqqr:   N)__name__
__module____qualname____doc__str__annotations__boolrX    r:   r8   rR   rR      s     I $$rr:   rR   
text_partsc                 r   dd| sS dt         ffd}| D ]  }|j                  ss	 ||       |j                  rr	 ||       2|j                  ssdz   ||       Nd   j                         s"|j                  d   j                         sdz   ||        j	                         S )a  Concatenate a list of text parts into a single string.

    This function joins the provided list of text parts into a single string,
    taking into account whether or not the parts already contain spacing.

    This function is useful for aggregating text segments received from LLMs or
    transcription services.

    Args:
        text_parts: A list of text parts to concatenate.

    Returns:
        A single concatenated string.
     Fpartc                 <    | j                   z  | j                  y )N)r.   rS   )rd   last_includes_inter_part_spacesresults    r8   append_partz0concatenate_aggregated_text.<locals>.append_part  s     	$))*.*I*I'r:    r=   r   )rR   r.   rS   isspacestrip)ra   rh   rd   rf   rg   s      @@r8   concatenate_aggregated_textrl      s     F&+#J2 J   yy **/N 009X cMF ":%%'		!0D0D0F#5D \\^FMr:   )%r\   redataclassesr   typingr   r   r   r   r   nltklogurur	   nltk.tokenizer
   datafindLookupErrordownloadOSErrorPermissionErroreerror	frozensetr+   r]   r^   r,   r-   StartEndTagsMatchr9   intrE   rP   rR   rl   r`   r:   r8   <module>r      s   
 ! = =   '
IINN)* /8,./ Ys^ .d 6??Z5[ "IcN [  "DD (3  S#X BHH 3 S S &3c 3c 3l)%
)%
<
 )% ,')% 	)%
 8L!3&')%X r r r ?D1I,J ?s ?c  


k._% 
B1# FG G	
 	


s;   C DC21D2D9DDDDD