#
# Copyright (c) 2024-2026, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#

"""Vision image frame aggregation for Pipecat.

This module provides frame aggregation functionality to combine text and image
frames into vision frames for multimodal processing.
"""

from pipecat.frames.frames import Frame, InputImageRawFrame, TextFrame
from pipecat.processors.aggregators.openai_llm_context import (
    OpenAILLMContext,
    OpenAILLMContextFrame,
)
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor


class VisionImageFrameAggregator(FrameProcessor):
    """Aggregates consecutive text and image frames into vision frames.

    .. deprecated:: 0.0.85
        VisionImageRawFrame has been removed in favor of context frames
        (LLMContextFrame or OpenAILLMContextFrame), so this aggregator is not
        needed anymore. See the 12* examples for the new recommended pattern.

    This aggregator waits for a consecutive TextFrame and an InputImageRawFrame.
    After the InputImageRawFrame arrives it will output a VisionImageRawFrame
    combining both the text and image data for multimodal processing.
    """

    def __init__(self):
        """Initialize the vision image frame aggregator.

        The aggregator starts with no cached text, waiting for the first
        TextFrame to arrive before it can create vision frames.
        """
        import warnings

        warnings.warn(
            "VisionImageFrameAggregator is deprecated. "
            "VisionImageRawFrame has been removed in favor of context frames "
            "(LLMContextFrame or OpenAILLMContextFrame), so this aggregator is "
            "not needed anymore. See the 12* examples for the new recommended "
            "pattern.",
            DeprecationWarning,
            stacklevel=2,
        )
        super().__init__()
        self._describe_text = None

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        """Process incoming frames and aggregate text with images.

        Caches TextFrames and combines them with subsequent InputImageRawFrames
        to create VisionImageRawFrames. Other frames are passed through unchanged.

        Args:
            frame: The incoming frame to process.
            direction: The direction of frame flow in the pipeline.
        """
        await super().process_frame(frame, direction)

        if isinstance(frame, TextFrame):
            self._describe_text = frame.text
        elif isinstance(frame, InputImageRawFrame):
            if self._describe_text:
                context = OpenAILLMContext()
                context.add_image_frame_message(
                    text=self._describe_text,
                    image=frame.image,
                    size=frame.size,
                    format=frame.format,
                )
                frame = OpenAILLMContextFrame(context)
                await self.push_frame(frame)
                self._describe_text = None
        else:
            await self.push_frame(frame, direction)