Ë
    ±q±i¨H  ã                   ó¦   — d Z ddlZddlmZ ddlmZmZmZmZm	Z	m
Z
 ddlmZ ddlmZmZ ddlmZ  G d„ d	e«      Z G d
„ de«      Z G d„ de«      Zy)a<  Pattern pair aggregator for processing structured content in streaming text.

This module provides an aggregator that identifies and processes content between
pattern pairs (like XML tags or custom delimiters) in streaming text, with
support for custom handlers and configurable actions for when a pattern is found.
é    N)ÚEnum)ÚAsyncIteratorÚ	AwaitableÚCallableÚListÚOptionalÚTuple)Úlogger)ÚAggregationÚAggregationType)ÚSimpleTextAggregatorc                   ó   — e Zd ZdZdZdZdZy)ÚMatchActiona¯  Actions to take when a pattern pair is matched.

    Parameters:
        REMOVE: The text along with its delimiters will be removed from the streaming text.
            Sentence aggregation will continue on as if this text did not exist.
        KEEP: The delimiters will be removed, but the content between them will be kept.
            Sentence aggregation will continue on with the internal text included.
        AGGREGATE: The delimiters will be removed and the content between will be treated
            as a separate aggregation. Any text before the start of the pattern will be
            returned early, whether or not a complete sentence was found. Then the pattern
            will be returned. Then the aggregation will continue on sentence matching after
            the closing delimiter is found. The content between the delimiters is not
            aggregated by sentence. It is aggregated as one single block of text.
    ÚremoveÚkeepÚ	aggregateN)Ú__name__Ú
__module__Ú__qualname__Ú__doc__ÚREMOVEÚKEEPÚ	AGGREGATE© ó    ú\/opt/pipecat/venv/lib/python3.12/site-packages/pipecat/utils/text/pattern_pair_aggregator.pyr   r      s   „ ñð €FØ€DØIr   r   c                   ó<   ‡ — e Zd ZdZdededefˆ fd„Zdefd„Zˆ xZS )ÚPatternMatcha6  Represents a matched pattern pair with its content.

    A PatternMatch object is created when a complete pattern pair is found
    in the text. It contains information about which pattern was matched,
    the full matched text (including start and end patterns), and the
    content between the patterns.
    ÚcontentÚtypeÚ
full_matchc                 ó6   •— t         ‰|   ||¬«       || _        y)au  Initialize a pattern match.

        Args:
            type: The type of the matched pattern pair. It should be representative
                   of the content type (e.g., 'sentence', 'code', 'speaker', 'custom').
            full_match: The complete text including start and end patterns.
            content: The text content between the start and end patterns.
        ©Útextr    N)ÚsuperÚ__init__r!   )Úselfr   r    r!   Ú	__class__s       €r   r&   zPatternMatch.__init__6   s   ø€ ô 	‰Ñ˜g¨DÐÔ1Ø$ˆr   Úreturnc                 óV   — d| j                   › d| j                  › d| j                  › dS )z–Return a string representation of the pattern match.

        Returns:
            A descriptive string showing the pattern type and content.
        zPatternMatch(type=z, text=z, full_match=ú))r    r$   r!   )r'   s    r   Ú__str__zPatternMatch.__str__B   s-   € ð $ D§I¡I ;¨g°d·i±i°[ÀÈdÏoÉoÐM^Ð^_Ð`Ð`r   )r   r   r   r   Ústrr&   r,   Ú__classcell__©r(   s   @r   r   r   -   s0   ø„ ñð
% ð 
%¨3ð 
%¸Cõ 
%ða˜÷ ar   r   c                   ó$  ‡ — e Zd ZdZˆ fd„Zedefd„«       Zej                  fde
de
de
dedd f
d	„Z	 dd
e
de
de
defd„Zde
deeged   f   dd fd„Z	 dde
dedeee   e
f   fd„Zde
deeeef      fd„Zde
dee   fˆ fd„Zˆ fd„Zˆ fd„Zˆ xZS )ÚPatternPairAggregatoraž  Aggregator that identifies and processes content between pattern pairs.

    This aggregator buffers text until it can identify complete pattern pairs
    (defined by start and end patterns), processes the content between these
    patterns using registered handlers. By default, its aggregation method
    returns text at sentence boundaries, and remove the content found between
    any matched patterns. However, matched patterns can also be configured to
    returned as a separate aggregation object containing the content between
    their start and end patterns or left in, so that only the delimiters are
    removed and a callback can be triggered.

    This aggregator is particularly useful for processing structured content in
    streaming text, such as XML tags, markdown formatting, or custom delimiters.

    The aggregator ensures that patterns spanning multiple text chunks are
    correctly identified.
    c                 óN   •— t        ‰|   di |¤Ž i | _        i | _        d| _        y)aA  Initialize the pattern pair aggregator.

        Creates an empty aggregator with no patterns or handlers registered.
        Text buffering and pattern detection will begin when text is aggregated.

        Args:
            **kwargs: Additional arguments passed to SimpleTextAggregator (e.g. aggregation_type).
        r   Nr   )r%   r&   Ú	_patternsÚ	_handlersÚ_last_processed_position)r'   Úkwargsr(   s     €r   r&   zPatternPairAggregator.__init__^   s+   ø€ ô 	‰ÑÑ"˜6Ò"ØˆŒØˆŒØ()ˆÕ%r   r)   c                 óð   — | j                  | j                  «      }| j                  j                  «       }|r#|d   j                  dt        j
                  «      nt        j
                  }t        ||¬«      S )z{Get the currently aggregated text.

        Returns:
            The text that has been accumulated in the buffer.
        é   r    r#   )Ú_match_start_of_patternÚ_textÚstripÚgetr   ÚSENTENCEr   )r'   Úpattern_startÚstripped_textr    s       r   r$   zPatternPairAggregator.textl   si   € ð ×4Ñ4°T·Z±ZÓ@ˆØŸ
™
×(Ñ(Ó*ˆñ ð ˜!Ñ× Ñ  ¬×)AÑ)AÔBä ×)Ñ)ð 	ô
  °DÔ9Ð9r   r    Ústart_patternÚend_patternÚactionc                 ó®   — |t         j                  t         j                  t         j                  fv rt	        d|› d«      ‚||||dœ| j
                  |<   | S )aš  Add a pattern pair to detect in the text.

        Registers a new pattern pair with a unique identifier. The aggregator
        will look for text that starts with the start pattern and ends with
        the end pattern, and treat the content between them as a match.

        Args:
            type: Identifier for this pattern pair. Should be unique and ideally descriptive.
                (e.g., 'code', 'speaker', 'custom'). type can not be 'sentence' or 'word' as
                those are reserved for the default behavior.
            start_pattern: Pattern that marks the beginning of content.
            end_pattern: Pattern that marks the end of content.
            action: What to do when a complete pattern is matched.

                - MatchAction.REMOVE: Remove the matched pattern from the text.
                - MatchAction.KEEP: Keep the matched pattern in the text and treat it as normal text. This allows you to register handlers for the pattern without affecting the aggregation logic.
                - MatchAction.AGGREGATE: Return the matched pattern as a separate aggregation object.

        Returns:
            Self for method chaining.
        zThe aggregation type 'zK' is reserved for default behavior and can not be used for custom patterns.)ÚstartÚendr    rB   )r   r=   ÚWORDÚTOKENÚ
ValueErrorr3   )r'   r    r@   rA   rB   s        r   Úadd_patternz!PatternPairAggregator.add_pattern|   sd   € ð8 ”O×,Ñ,¬o×.BÑ.BÄO×DYÑDYÐZÑZÜØ(¨¨Ð.yÐzóð ð #ØØØñ	 
ˆ‰tÑð ˆr   Ú
pattern_idÚremove_matchc                 ó  — ddl }|j                  «       5  |j                  d«       |j                  dt        d¬«       ddd«       |rt
        j                  nt
        j                  }| j                  ||||¬«      S # 1 sw Y   Œ@xY w)a&  Add a pattern pair to detect in the text.

        .. deprecated:: 0.0.95
            This function is deprecated and will be removed in a future version.
            Use `add_pattern` with a type and MatchAction instead.

            This method calls `add_pattern` setting type with the provided pattern_id and action
            to either MatchAction.REMOVE or MatchAction.KEEP based on `remove_match`.

        Args:
            pattern_id: Identifier for this pattern pair. Should be unique and ideally descriptive.
                        (e.g., 'code', 'speaker', 'custom'). pattern_id can not be 'sentence' or 'word'
                        as those arereserved for the default behavior.
            start_pattern: Pattern that marks the beginning of content.
            end_pattern: Pattern that marks the end of content.
            remove_match: If True, the matched pattern will be removed from the text. (Same as MatchAction.REMOVE)
                          If False, it will be kept and treated as normal text. (Same as MatchAction.KEEP)
        r   NÚoncezadd_pattern_pair with a pattern_id or remove_match is deprecated and will be removed in a future version. Use add_pattern with a type and MatchAction insteadé   )Ú
stacklevel)r    r@   rA   rB   )	ÚwarningsÚcatch_warningsÚsimplefilterÚwarnÚDeprecationWarningr   r   r   rI   )r'   rJ   r@   rA   rK   rP   rB   s          r   Úadd_pattern_pairz&PatternPairAggregator.add_pattern_pair¤   sŽ   € ó* 	à×$Ñ$Ó&ñ 	Ø×!Ñ! &Ô)ØM‰Mðdä"Øð	 ô ÷	ñ (4”×#Ò#¼×9IÑ9IˆØ×ÑØØ'Ø#Øð	  ó 
ð 	
÷	ð 	ús   •*A>Á>BÚhandlerNc                 ó$   — || j                   |<   | S )aÌ  Register a handler for when a pattern pair is matched.

        The handler will be called whenever a complete match for the
        specified type is found in the text.

        Args:
            type: The type of the pattern pair to trigger the handler.
            handler: Async function to call when pattern is matched.
                     The function should accept a PatternMatch object.

        Returns:
            Self for method chaining.
        )r4   )r'   r    rV   s      r   Úon_pattern_matchz&PatternPairAggregator.on_pattern_matchÌ   s   € ð   'ˆ‰tÑØˆr   r$   Úlast_processed_positionc           	   ƒ   ó"  K  — g }|}| j                   j                  «       D ]3  \  }}t        j                  |d   «      }t        j                  |d   «      }|d   }	|› d|› }
t        j                  |
|t        j
                  «      }t        |«      }|D ]¼  }|j                  d«      }|j                  d«      }t        |j                  «       ||¬«      }|j                  «       |k  }|s,|| j                  v r	  | j                  |   |«      ƒ d{  –—†  |	t        j                   k(  r|rŒ˜|j#                  |dd«      }Œ¬|j%                  |«       Œ¾ Œ6 ||fS 7 ŒH# t        $ r%}t        j                  d	|› d
|› «       Y d}~Œod}~ww xY w­w)a  Process newly complete pattern pairs in the text.

        Searches for pattern pairs that have been completed since last_processed_position,
        calls the appropriate handlers, and optionally removes the matches.

        Args:
            text: The text to process for pattern matches.
            last_processed_position: The position in text that was already processed.
                Only patterns that end at or after this position will be processed.

        Returns:
            Tuple of (all_matches, processed_text) where:

            - all_matches is a list of all pattern matches found. Note: There really should only ever be 1.
            - processed_text is the text after processing patterns. If no patterns are found, it will be the same as input text.
        rD   rE   rB   z(.*?)r8   r   ©r   r    r!   NzError in pattern handler for z: Ú )r3   ÚitemsÚreÚescapeÚfinditerÚDOTALLÚlistÚgroupr   r;   rE   r4   Ú	Exceptionr
   Úerrorr   r   ÚreplaceÚappend)r'   r$   rY   Úall_matchesÚprocessed_textr    Úpattern_inforD   rE   rB   ÚregexÚ
match_iterÚmatchesÚmatchr   r!   Úpattern_matchÚalready_processedÚes                      r   Ú_process_complete_patternsz0PatternPairAggregator._process_complete_patternsß   s–  è ø€ ð& ˆØˆà"&§.¡.×"6Ñ"6Ó"8ó (	6ÑˆD,ä—I‘I˜l¨7Ñ3Ó4ˆEÜ—)‘)˜L¨Ñ/Ó0ˆCØ! (Ñ+ˆFð g˜U 3 %Ð(ˆEô Ÿ™ U¨N¼B¿I¹IÓFˆJÜ˜:Ó&ˆGà ò 6ØŸ+™+ a›.Ø"Ÿ[™[¨›^
ô !-Ø#ŸM™M›O°$À:ô!ð
 %*§I¡I£KÐ3JÑ$JÐ!ñ )¨T°T·^±^Ñ-CðRØ2˜dŸn™n¨TÑ2°=ÓA×AÐAð
 œ[×/Ñ/Ò/â,Ø)7×)?Ñ)?À
ÈBÐPQÓ)R™ð  ×&Ñ& }Õ5ò56ð(	6ðT ˜NÐ*Ð*ð BùÜ$ò RÜŸ™Ð'DÀTÀFÈ"ÈQÈCÐ%P×QÑQûðRüsH   ‚C8FÃ;EÄEÄEÄFÄ..FÅEÅ	FÅ'FÆFÆFÆFc                 óØ   — | j                   j                  «       D ]M  \  }}|d   }|d   }|j                  |«      }|j                  |«      }||kD  sŒ8|j                  |«      }||gc S  y)aÉ  Check if text contains incomplete pattern pairs.

        Determines whether the text contains any start patterns without
        matching end patterns, which would indicate incomplete content.

        Args:
            text: The text to check for incomplete patterns.

        Returns:
            A tuple of (start_index, pattern_info) if an incomplete pattern is found,
            or None if no patterns are found or all patterns are complete.
        rD   rE   N)r3   r]   ÚcountÚfind)	r'   r$   r    rj   rD   rE   Ústart_countÚ	end_countÚstart_indexs	            r   r9   z-PatternPairAggregator._match_start_of_pattern!  s~   € ð #'§.¡.×"6Ñ"6Ó"8ò 	3ÑˆD,Ø  Ñ)ˆEØ˜uÑ%ˆCð Ÿ*™* UÓ+ˆKØŸ
™
 3›ˆIð ˜YÓ&Ø"Ÿi™i¨Ó.Ø# \Ð2Ò2ð	3ð  r   c                óð  •K  — |D ]`  }| xj                   |z  c_         | j                  | j                   | j                  «      ƒ d{  –—† \  }}t        | j                   «      | _        || _         t        |«      dkD  rt        |«      dkD  r1t	        j
                  d|D cg c]  }|j                  ‘Œ c}› d«       | j                  |d   j                     j                  dt        j                  «      }|t        j                  k(  rd| _         |d   ­–— Œ| j                  | j                   «      }|Å|d   dk(  s4|d   j                  dt        j                  «      t        j                  k7  rŒq| j                   d|d    }| j                   |d   d | _         | j                  t        j                  k(  rt        j                  nt        j                   }	t#        |j%                  «       |	|¬«      ­–— Œø| j                  t        j                  k7  sŒt&        ‰| Q  |«      ƒ d{  –—† }
|
sŒ2t#        |
j*                  |
j                  |
j*                  ¬«      ­–— Œc | j                  t        j                  k(  rd| j                   rW| j                  | j                   «      €;t#        | j                   t        j                  | j                   ¬«      ­–— d| _         yyyy7 Œ¤c c}w 7 ŒÄ­w)	a#  Aggregate text and process pattern pairs.

        Processes the input text character-by-character, handles pattern pairs,
        and uses the parent's lookahead logic for sentence detection when no
        patterns are active.

        In TOKEN mode, pattern detection still works but non-pattern text is
        yielded as TOKEN aggregations instead of waiting for sentence boundaries.

        Args:
            text: Text to aggregate.

        Yields:
            PatternMatch objects as patterns complete or sentences are detected.
        Nr   r8   zMultiple patterns matched: z*. Only the first pattern will be returned.rB   r\   r[   )r:   rr   r5   Úlenr
   Úwarningr    r3   r<   r   r   r   r9   Ú_aggregation_typer   rG   r=   r   r;   r%   Ú_check_sentence_with_lookaheadr$   )r'   r$   ÚcharÚpatternsri   ÚprB   r>   ÚresultÚagg_typeÚaggregationr(   s              €r   r   zPatternPairAggregator.aggregate@  sœ  øè ø€ ð" ó ;	ˆDØJŠJ˜$ÑJð .2×-LÑ-LØ—
‘
˜D×9Ñ9ó.÷ (Ñ$ˆHnô -0°·
±
«OˆDÔ)à'ˆDŒJä8‹}˜qÒ Üx“= 1Ò$Ü—N‘NØ5ÀxÖ6PÀ!°q·v³vÒ6PÐ5QÐQ{Ð|ôð Ÿ™¨°©×(8Ñ(8Ñ9×=Ñ=¸hÌ×HZÑHZÓ[Øœ[×2Ñ2Ò2Ø!#D”JØ" 1™+Ó%Ùð !×8Ñ8¸¿¹ÓDˆMØÐ(ð " !Ñ$¨Ò)Ø$ QÑ'×+Ñ+¨H´k×6HÑ6HÓIÌ[×MbÑMbÒbáð Ÿ™Ð$6 m°AÑ&6Ð7Ø!ŸZ™Z¨°aÑ(8Ð(:Ð;”
ð ×-Ñ-´×1FÑ1FÒFô $×)Ò)ä(×1Ñ1ð ô
 #¨6¯<©<«>ÀÐU[Ô\Ó\Ùà×%Ñ%¬×)>Ñ)>Ô>ä$)¡GÑ$JÈ4Ó$P×PÛä&Ø +× 0Ñ 0Ø(×-Ñ-Ø#.×#3Ñ#3ôõ ðo;	ð~ ×!Ñ!¤_×%:Ñ%:Ò:¸t¿zºzØ×+Ñ+¨D¯J©JÓ7Ð?Ü"Ø ŸJ™JÜ(×.Ñ.Ø#Ÿz™zôó ð
  •
ð @ð @JÐ:ðu(ûò 7QðB Qús?   ƒAK6ÁK,Á	AK6Â!K/Â4E'K6ÈK6È/K4È0K6È8B5K6Ë/K6c              ƒ   óL   •K  — t         ‰|   «       ƒ d{  –—†  d| _        y7 Œ­w)zÕHandle interruptions by clearing the buffer and pattern state.

        Called when an interruption occurs in the processing pipeline,
        to reset the state and discard any partially aggregated text.
        Nr   )r%   Úhandle_interruptionr5   ©r'   r(   s    €r   r…   z)PatternPairAggregator.handle_interruption™  s'   øè ø€ ô ‰gÑ)Ó+×+Ð+Ø()ˆÕ%ð 	,úó   ƒ$•"–$c              ƒ   óL   •K  — t         ‰|   «       ƒ d{  –—†  d| _        y7 Œ­w)z­Clear the internally aggregated text.

        Resets the aggregator to its initial state, discarding any
        buffered text and clearing pattern tracking state.
        Nr   )r%   Úresetr5   r†   s    €r   r‰   zPatternPairAggregator.reset£  s%   øè ø€ ô ‰g‰m‹o×ÐØ()ˆÕ%ð 	úr‡   )T)r   )r   r   r   r   r&   Úpropertyr   r$   r   r   r-   rI   ÚboolrU   r   r   r   rX   Úintr	   r   rr   r   Údictr9   r   r   r…   r‰   r.   r/   s   @r   r1   r1   K   sE  ø„ ñô$*ð ð:kò :ó ð:ð( *×0Ñ0ñ&àð&ð ð&ð ð	&ð
 ð&ð 
!ó&ðR [_ñ&
Øð&
Ø.1ð&
Ø@Cð&
ØSWó&
ðPØðØ"*¨L¨>¸9ÀT¹?Ð+JÑ"Kðà	 óð( 9:ñ@+Øð@+Ø25ð@+à	ˆtLÑ! 3Ð&Ñ	'ó@+ðD¨Cð °H¸UÀ3ÈÀ9Ñ=MÑ4Nó ð>W  Cð W ¨M¸,Ñ,Gõ W ôr*÷*ð *r   r1   )r   r^   Úenumr   Útypingr   r   r   r   r   r	   Úlogurur
   Ú'pipecat.utils.text.base_text_aggregatorr   r   Ú)pipecat.utils.text.simple_text_aggregatorr   r   r   r1   r   r   r   ú<module>r“      sJ   ðñó 
Ý ß L× Lå ç PÝ Jô$ô ô*a;ô aô<_*Ð0õ _*r   