
    qi                     R    d Z ddlmZmZ ddlmZmZ ddlmZm	Z	m
Z
  G d de
      Zy)zSimple text aggregator for basic sentence-boundary text processing.

This module provides a straightforward text aggregator that accumulates text
until it finds an end-of-sentence marker, making it suitable for basic TTS
text processing scenarios.
    )AsyncIteratorOptional)SENTENCE_ENDING_PUNCTUATIONmatch_endofsentence)AggregationAggregationTypeBaseTextAggregatorc                        e Zd ZdZ fdZedefd       Zdede	e   fdZ
dedee   fdZdee   fd	Zd
 Zd Z xZS )SimpleTextAggregatora7  Simple text aggregator that accumulates text until sentence boundaries.

    This aggregator provides basic functionality for accumulating text tokens
    and releasing them when an end-of-sentence marker is detected. It's the
    most straightforward implementation of text aggregation for TTS processing.
    c                 @    t        |   di | d| _        d| _        y)zInitialize the simple text aggregator.

        Creates an empty text buffer ready to begin accumulating text tokens.

        Args:
            **kwargs: Additional arguments passed to BaseTextAggregator (e.g. aggregation_type).
         FN )super__init___text_needs_lookahead)selfkwargs	__class__s     [/opt/pipecat/venv/lib/python3.12/site-packages/pipecat/utils/text/simple_text_aggregator.pyr   zSimpleTextAggregator.__init__   s$     	"6"
&+    returnc                 j    t        | j                  j                  d      t        j                        S )z{Get the currently aggregated text.

        Returns:
            The text that has been accumulated in the buffer.
         texttype)r   r   stripr   SENTENCEr   s    r   r   zSimpleTextAggregator.text(   s&     

 0 0 5O<T<TUUr   r   c                  K   | j                   t        j                  k(  r!|rt        |t        j                         y|D ]8  }| xj                  |z  c_        | j                  |       d{   }|s4| : y7 w)a  Aggregate text and yield completed aggregations.

        In SENTENCE mode, processes the input text character-by-character. When
        sentence-ending punctuation is detected, it waits for non-whitespace
        lookahead before calling NLTK.

        In TOKEN mode, yields the text immediately without buffering.

        Args:
            text: Text to aggregate.

        Yields:
            Aggregation objects (sentences in SENTENCE mode, tokens in TOKEN mode).
        r   N)_aggregation_typer   TOKENr   r   _check_sentence_with_lookahead)r   r   charresults       r   	aggregatezSimpleTextAggregator.aggregate1   sw      !!_%:%::!t/2G2GHH  	DJJ$J  >>tDDF	 Es   A-B/A?0B7	Br%   c                 n  K   | j                   r}|j                         rld| _         t        | j                        }|rM| j                  d| }| j                  |d | _        t	        |j                  d      t
        j                        S yy| j                  r| j                  d   t        v rd| _         yw)a  Check for sentence boundaries using lookahead logic.

        This method implements the core sentence detection logic with lookahead.
        When sentence-ending punctuation is detected, it waits for the next
        non-whitespace character before calling NLTK. This disambiguates cases
        like "$29." (not a sentence) vs "$29. Next" (sentence ends at period).
        Whitespace alone is not meaningful lookahead since it appears in both
        cases. Instead, the first non-whitespace character after the punctuation
        is used to confirm the sentence boundary.

        Subclasses can call this via super() to reuse the lookahead behavior
        while adding their own logic (e.g., tag handling, pattern matching).

        Args:
            char: The most recently added character (used for lookahead check).

        Returns:
            Aggregation if sentence found, None otherwise.
        FNr   r   T)r   r   r   r   r   r   r   r   )r   r%   
eos_markerr&   s       r   r$   z3SimpleTextAggregator._check_sentence_with_lookaheadN   s     *   zz|(-%0<
!ZZ4F!%JK!8DJ&FLL,=OD\D\]] ::$**R.,GG$(D!s   B3B5c                    K   | j                   t        j                  k(  ry| j                  rN| j                  }| j	                          d{    t        |j                  d      t        j                        S y7 /w)ai  Flush any remaining text in the buffer.

        Returns any text remaining in the buffer. This is called at the end
        of a stream to ensure all text is processed. In TOKEN mode, returns
        None since tokens are yielded immediately.

        Returns:
            Any remaining text as a sentence, or None if buffer is empty or in TOKEN mode.
        Nr   r   )r"   r   r#   r   resetr   r   r   )r   r&   s     r   flushzSimpleTextAggregator.flush{   sb      !!_%:%::::ZZF**,FLL$5O<T<TUU s   A
A>A<0A>c                 (   K   d| _         d| _        yw)zHandle interruptions by clearing the text buffer.

        Called when an interruption occurs in the processing pipeline,
        discarding any partially accumulated text.
        r   FNr   r   r    s    r   handle_interruptionz(SimpleTextAggregator.handle_interruption         
 %   c                 (   K   d| _         d| _        yw)zClear the internally aggregated text.

        Resets the aggregator to its initial empty state, discarding
        any accumulated text content.
        r   FNr/   r    s    r   r,   zSimpleTextAggregator.reset   r1   r2   )__name__
__module____qualname____doc__r   propertyr   r   strr   r'   r   r$   r-   r0   r,   __classcell__)r   s   @r   r   r      sw    
, Vk V VC M+,F :+ ++AV +ZXk2 (&&r   r   N)r7   typingr   r   pipecat.utils.stringr   r   'pipecat.utils.text.base_text_aggregatorr   r   r	   r   r   r   r   <module>r>      s'    + Q d dK&- K&r   