
    qiH                         d Z ddlZddlmZ ddlmZmZmZmZm	Z	m
Z
 ddlmZ ddlmZmZ ddlmZ  G d d	e      Z G d
 de      Z G d de      Zy)a<  Pattern pair aggregator for processing structured content in streaming text.

This module provides an aggregator that identifies and processes content between
pattern pairs (like XML tags or custom delimiters) in streaming text, with
support for custom handlers and configurable actions for when a pattern is found.
    N)Enum)AsyncIterator	AwaitableCallableListOptionalTuple)logger)AggregationAggregationType)SimpleTextAggregatorc                       e Zd ZdZdZdZdZy)MatchActiona  Actions to take when a pattern pair is matched.

    Parameters:
        REMOVE: The text along with its delimiters will be removed from the streaming text.
            Sentence aggregation will continue on as if this text did not exist.
        KEEP: The delimiters will be removed, but the content between them will be kept.
            Sentence aggregation will continue on with the internal text included.
        AGGREGATE: The delimiters will be removed and the content between will be treated
            as a separate aggregation. Any text before the start of the pattern will be
            returned early, whether or not a complete sentence was found. Then the pattern
            will be returned. Then the aggregation will continue on sentence matching after
            the closing delimiter is found. The content between the delimiters is not
            aggregated by sentence. It is aggregated as one single block of text.
    removekeep	aggregateN)__name__
__module____qualname____doc__REMOVEKEEP	AGGREGATE     \/opt/pipecat/venv/lib/python3.12/site-packages/pipecat/utils/text/pattern_pair_aggregator.pyr   r      s     FDIr   r   c                   <     e Zd ZdZdededef fdZdefdZ xZS )PatternMatcha6  Represents a matched pattern pair with its content.

    A PatternMatch object is created when a complete pattern pair is found
    in the text. It contains information about which pattern was matched,
    the full matched text (including start and end patterns), and the
    content between the patterns.
    contenttype
full_matchc                 6    t         |   ||       || _        y)au  Initialize a pattern match.

        Args:
            type: The type of the matched pattern pair. It should be representative
                   of the content type (e.g., 'sentence', 'code', 'speaker', 'custom').
            full_match: The complete text including start and end patterns.
            content: The text content between the start and end patterns.
        textr    N)super__init__r!   )selfr   r    r!   	__class__s       r   r&   zPatternMatch.__init__6   s     	gD1$r   returnc                 V    d| j                    d| j                   d| j                   dS )zReturn a string representation of the pattern match.

        Returns:
            A descriptive string showing the pattern type and content.
        zPatternMatch(type=z, text=z, full_match=))r    r$   r!   )r'   s    r   __str__zPatternMatch.__str__B   s-     $DII;gdii[dooM^^_``r   )r   r   r   r   strr&   r,   __classcell__r(   s   @r   r   r   -   s0    
% 
%3 
%C 
%a ar   r   c                   $    e Zd ZdZ fdZedefd       Zej                  fde
de
de
dedd f
d	Z	 dd
e
de
de
defdZde
deeged   f   dd fdZ	 dde
dedeee   e
f   fdZde
deeeef      fdZde
dee   f fdZ fdZ fdZ xZS )PatternPairAggregatora  Aggregator that identifies and processes content between pattern pairs.

    This aggregator buffers text until it can identify complete pattern pairs
    (defined by start and end patterns), processes the content between these
    patterns using registered handlers. By default, its aggregation method
    returns text at sentence boundaries, and remove the content found between
    any matched patterns. However, matched patterns can also be configured to
    returned as a separate aggregation object containing the content between
    their start and end patterns or left in, so that only the delimiters are
    removed and a callback can be triggered.

    This aggregator is particularly useful for processing structured content in
    streaming text, such as XML tags, markdown formatting, or custom delimiters.

    The aggregator ensures that patterns spanning multiple text chunks are
    correctly identified.
    c                 N    t        |   di | i | _        i | _        d| _        y)aA  Initialize the pattern pair aggregator.

        Creates an empty aggregator with no patterns or handlers registered.
        Text buffering and pattern detection will begin when text is aggregated.

        Args:
            **kwargs: Additional arguments passed to SimpleTextAggregator (e.g. aggregation_type).
        r   Nr   )r%   r&   	_patterns	_handlers_last_processed_position)r'   kwargsr(   s     r   r&   zPatternPairAggregator.__init__^   s+     	"6"()%r   r)   c                     | j                  | j                        }| j                  j                         }|r#|d   j                  dt        j
                        nt        j
                  }t        ||      S )z{Get the currently aggregated text.

        Returns:
            The text that has been accumulated in the buffer.
           r    r#   )_match_start_of_pattern_textstripgetr   SENTENCEr   )r'   pattern_startstripped_textr    s       r   r$   zPatternPairAggregator.textl   si     44TZZ@

((*  !  )A)AB )) 	
 D99r   r    start_patternend_patternactionc                     |t         j                  t         j                  t         j                  fv rt	        d| d      ||||d| j
                  |<   | S )a  Add a pattern pair to detect in the text.

        Registers a new pattern pair with a unique identifier. The aggregator
        will look for text that starts with the start pattern and ends with
        the end pattern, and treat the content between them as a match.

        Args:
            type: Identifier for this pattern pair. Should be unique and ideally descriptive.
                (e.g., 'code', 'speaker', 'custom'). type can not be 'sentence' or 'word' as
                those are reserved for the default behavior.
            start_pattern: Pattern that marks the beginning of content.
            end_pattern: Pattern that marks the end of content.
            action: What to do when a complete pattern is matched.

                - MatchAction.REMOVE: Remove the matched pattern from the text.
                - MatchAction.KEEP: Keep the matched pattern in the text and treat it as normal text. This allows you to register handlers for the pattern without affecting the aggregation logic.
                - MatchAction.AGGREGATE: Return the matched pattern as a separate aggregation object.

        Returns:
            Self for method chaining.
        zThe aggregation type 'zK' is reserved for default behavior and can not be used for custom patterns.)startendr    rB   )r   r=   WORDTOKEN
ValueErrorr3   )r'   r    r@   rA   rB   s        r   add_patternz!PatternPairAggregator.add_pattern|   sd    8 O,,o.B.BODYDYZZ(.yz  #	 
t r   
pattern_idremove_matchc                    ddl }|j                         5  |j                  d       |j                  dt        d       ddd       |rt
        j                  nt
        j                  }| j                  ||||      S # 1 sw Y   @xY w)a&  Add a pattern pair to detect in the text.

        .. deprecated:: 0.0.95
            This function is deprecated and will be removed in a future version.
            Use `add_pattern` with a type and MatchAction instead.

            This method calls `add_pattern` setting type with the provided pattern_id and action
            to either MatchAction.REMOVE or MatchAction.KEEP based on `remove_match`.

        Args:
            pattern_id: Identifier for this pattern pair. Should be unique and ideally descriptive.
                        (e.g., 'code', 'speaker', 'custom'). pattern_id can not be 'sentence' or 'word'
                        as those arereserved for the default behavior.
            start_pattern: Pattern that marks the beginning of content.
            end_pattern: Pattern that marks the end of content.
            remove_match: If True, the matched pattern will be removed from the text. (Same as MatchAction.REMOVE)
                          If False, it will be kept and treated as normal text. (Same as MatchAction.KEEP)
        r   Noncezadd_pattern_pair with a pattern_id or remove_match is deprecated and will be removed in a future version. Use add_pattern with a type and MatchAction instead   )
stacklevel)r    r@   rA   rB   )	warningscatch_warningssimplefilterwarnDeprecationWarningr   r   r   rI   )r'   rJ   r@   rA   rK   rP   rB   s          r   add_pattern_pairz&PatternPairAggregator.add_pattern_pair   s    * 	$$& 	!!&)MMd"	  	 (4##9I9I'#	   
 	
	 	s   *A>>BhandlerNc                 $    || j                   |<   | S )a  Register a handler for when a pattern pair is matched.

        The handler will be called whenever a complete match for the
        specified type is found in the text.

        Args:
            type: The type of the pattern pair to trigger the handler.
            handler: Async function to call when pattern is matched.
                     The function should accept a PatternMatch object.

        Returns:
            Self for method chaining.
        )r4   )r'   r    rV   s      r   on_pattern_matchz&PatternPairAggregator.on_pattern_match   s       'tr   r$   last_processed_positionc           	      "  K   g }|}| j                   j                         D ]3  \  }}t        j                  |d         }t        j                  |d         }|d   }	| d| }
t        j                  |
|t        j
                        }t        |      }|D ]  }|j                  d      }|j                  d      }t        |j                         ||      }|j                         |k  }|s,|| j                  v r	  | j                  |   |       d{    |	t        j                   k(  r|r|j#                  |dd      }|j%                  |        6 ||fS 7 H# t        $ r%}t        j                  d	| d
|        Y d}~od}~ww xY ww)a  Process newly complete pattern pairs in the text.

        Searches for pattern pairs that have been completed since last_processed_position,
        calls the appropriate handlers, and optionally removes the matches.

        Args:
            text: The text to process for pattern matches.
            last_processed_position: The position in text that was already processed.
                Only patterns that end at or after this position will be processed.

        Returns:
            Tuple of (all_matches, processed_text) where:

            - all_matches is a list of all pattern matches found. Note: There really should only ever be 1.
            - processed_text is the text after processing patterns. If no patterns are found, it will be the same as input text.
        rD   rE   rB   z(.*?)r8   r   r   r    r!   NzError in pattern handler for z:  )r3   itemsreescapefinditerDOTALLlistgroupr   r;   rE   r4   	Exceptionr
   errorr   r   replaceappend)r'   r$   rY   all_matchesprocessed_textr    pattern_inforD   rE   rB   regex
match_itermatchesmatchr   r!   pattern_matchalready_processedes                      r   _process_complete_patternsz0PatternPairAggregator._process_complete_patterns   s    & "&.."6"6"8 (	6D,IIl734E))L/0C!(+F gU3%(E UNBIIFJ:&G  6++a."[[^
 !-#MMO$:!
 %*IIK3J$J! )TT^^-CR2dnnT2=AAA
 [///,)7)?)?
BPQ)R  &&}556(	6T N** B$ R'DTF"QC%PQQRsH   C8F;EEEF..FE	F'FFFFc                     | j                   j                         D ]M  \  }}|d   }|d   }|j                  |      }|j                  |      }||kD  s8|j                  |      }||gc S  y)a  Check if text contains incomplete pattern pairs.

        Determines whether the text contains any start patterns without
        matching end patterns, which would indicate incomplete content.

        Args:
            text: The text to check for incomplete patterns.

        Returns:
            A tuple of (start_index, pattern_info) if an incomplete pattern is found,
            or None if no patterns are found or all patterns are complete.
        rD   rE   N)r3   r]   countfind)	r'   r$   r    rj   rD   rE   start_count	end_countstart_indexs	            r   r9   z-PatternPairAggregator._match_start_of_pattern!  s~     #'.."6"6"8 	3D, )Eu%C **U+K

3I Y&"ii.#\22	3  r   c                  K   |D ]`  }| xj                   |z  c_         | j                  | j                   | j                         d{   \  }}t        | j                         | _        || _         t        |      dkD  rt        |      dkD  r1t	        j
                  d|D cg c]  }|j                   c} d       | j                  |d   j                     j                  dt        j                        }|t        j                  k(  rd| _         |d    | j                  | j                         }||d   dk(  s4|d   j                  dt        j                        t        j                  k7  rq| j                   d|d    }| j                   |d   d | _         | j                  t        j                  k(  rt        j                  nt        j                   }	t#        |j%                         |	|       | j                  t        j                  k7  st&        | Q  |       d{   }
|
s2t#        |
j*                  |
j                  |
j*                         c | j                  t        j                  k(  rd| j                   rW| j                  | j                         ;t#        | j                   t        j                  | j                          d| _         yyyy7 c c}w 7 ĭw)	a#  Aggregate text and process pattern pairs.

        Processes the input text character-by-character, handles pattern pairs,
        and uses the parent's lookahead logic for sentence detection when no
        patterns are active.

        In TOKEN mode, pattern detection still works but non-pattern text is
        yielded as TOKEN aggregations instead of waiting for sentence boundaries.

        Args:
            text: Text to aggregate.

        Yields:
            PatternMatch objects as patterns complete or sentences are detected.
        Nr   r8   zMultiple patterns matched: z*. Only the first pattern will be returned.rB   r\   r[   )r:   rr   r5   lenr
   warningr    r3   r<   r   r   r   r9   _aggregation_typer   rG   r=   r   r;   r%   _check_sentence_with_lookaheadr$   )r'   r$   charpatternsri   prB   r>   resultagg_typeaggregationr(   s              r   r   zPatternPairAggregator.aggregate@  s    "  ;	DJJ$J .2-L-L

D99. ($Hn -0

OD)'DJ8}q x=1$NN5x6P!qvv6P5QQ{| (8(89==hHZHZ[[222!#DJ"1+% !88DM( "!$)$Q'++Hk6H6HI[MbMbb $6mA&67!ZZa(8(:;
 --1F1FF $))(11 
 #6<<>U[\\%%)>)>>$)G$J4$PP& + 0 0(--#.#3#3 o;	~ !!_%:%::tzz++DJJ7?" JJ(..#zz 
  
 @ @J:u( 7QB Qs?   AK6K,	AK6!K/4E'K6K6/K40K68B5K6/K6c                 L   K   t         |           d{    d| _        y7 w)zHandle interruptions by clearing the buffer and pattern state.

        Called when an interruption occurs in the processing pipeline,
        to reset the state and discard any partially aggregated text.
        Nr   )r%   handle_interruptionr5   r'   r(   s    r   r   z)PatternPairAggregator.handle_interruption  s'      g)+++()% 	,   $"$c                 L   K   t         |           d{    d| _        y7 w)zClear the internally aggregated text.

        Resets the aggregator to its initial state, discarding any
        buffered text and clearing pattern tracking state.
        Nr   )r%   resetr5   r   s    r   r   zPatternPairAggregator.reset  s%      gmo()% 	r   )T)r   )r   r   r   r   r&   propertyr   r$   r   r   r-   rI   boolrU   r   r   r   rX   intr	   r   rr   r   dictr9   r   r   r   r   r.   r/   s   @r   r1   r1   K   sE   $* :k : :( *00&& & 	&
 & 
!&R [_&
&
.1&
@C&
SW&
P"*L>9T?+J"K	 ( 9:@+@+25@+	tL!3&	'@+DC HU39=M4N >W C W M,,G W r** *r   r1   )r   r^   enumr   typingr   r   r   r   r   r	   logurur
   'pipecat.utils.text.base_text_aggregatorr   r   )pipecat.utils.text.simple_text_aggregatorr   r   r   r1   r   r   r   <module>r      sJ    
  L L  P J$ *a; a<_*0 _*r   