
    qil`                     x   d dl Z d dlmZmZmZ d dlZddlmZ ddl	m
Z
 ddlmZ ddlmZmZ ddlmZmZmZmZmZ dd	lmZ d
dlmZmZ  e       rd dlZddlmZ d
dlmZ  e       rd dl m!Z!  ejD                  e#      Z$ G d de jJ                        Z& G d d      Z' e ed             G d de             Z(y)    N)AnyUnionoverload   )
AudioInput)GenerationConfig)
ImageInput)ProcessingKwargsUnpack)add_end_docstringsis_torch_availableis_vision_availableloggingrequires_backends)
VideoInput   )Pipelinebuild_pipeline_init_args)%MODEL_FOR_MULTIMODAL_LM_MAPPING_NAMES)
KeyDataset)Imagec                       e Zd ZdZdZdZy)
ReturnTyper   r   r   N)__name__
__module____qualname__TENSORSNEW_TEXT	FULL_TEXT     S/opt/pipecat/venv/lib/python3.12/site-packages/transformers/pipelines/any_to_any.pyr   r   /   s    GHIr!   r   c                   "    e Zd ZdZdee   fdZy)Chata   This class is intended to just be used internally in this pipeline and not exposed to users. We convert chats
    to this format because the rest of the pipeline code tends to assume that lists of messages are
    actually a batch of samples rather than messages in the same conversation.messagesc                 F    |D ]  }d|v rd|v rt        d       || _        y )NrolecontentzQWhen passing chat dicts as input, each dict must have a 'role' and 'content' key.)
ValueErrorr%   )selfr%   messages      r"   __init__zChat.__init__:   s7     	vGg%)w*> !tuu	v !r!   N)r   r   r   __doc__listdictr,   r    r!   r"   r$   r$   5   s    R!d !r!   r$   T)has_processorc                   |    e Zd ZdZdZdZdZdZdZ e	d      Z
 fdZ	 	 	 	 	 	 	 	 	 	 	 ddee   fd	Ze	 	 	 	 dd
edz  deedf   dz  deeddf   dz  deedf   dz  dedeeeef      fd       Ze	 	 	 	 dd
ee   dz  dee   ed   z  dz  dee   ed   z  ed   z  dz  dee   ed   z  dz  dedeeeeef         fd       Z	 	 	 dd
eee   z  ee   z  deee   z  eee      z  ez  dz  deee   z  ez  dz  deee   z  ez  dz  deeeef      eeeeef         z  f
 fdZddZddZ	 	 	 ddZ xZS )AnyToAnyPipelinea
  
    Multimodal Generation pipeline using an `AutoModelForMultimodalLM`. This pipeline generates text given any
    combination of multimodal data and text.When the underlying model is a conversational model, it can also
    accept one or more chats, in which case the pipeline will operate in chat mode and will continue the
    chat(s) by adding its response(s). Each chat takes the form of a list of dicts, where each dict contains
    "role" and "content" keys.

    Unless the model you're using explicitly sets these generation parameters in its configuration files
    (`generation_config.json`), the following default values will be used:
    - max_new_tokens: 256

    Example:

    ```python
    >>> from transformers import pipeline

    >>> pipe = pipeline(task="any-to-any", model="google/gemma-3n-E4B-it")
    >>> pipe("https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png", text="A photo of")
    [{'generated_text': 'a photo of two birds'}]
    ```

    ```python
    >>> from transformers import pipeline

    >>> pipe = pipeline("any-to-any", model="google/gemma-3n-E4B-it")
    >>> messages = [
    >>>     {
    >>>         "role": "user",
    >>>         "content": [
    >>>             {
    >>>                 "type": "image",
    >>>                 "url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg",
    >>>             },
    >>>             {"type": "text", "text": "Describe this image."},
    >>>         ],
    >>>     },
    >>>     {
    >>>         "role": "assistant",
    >>>         "content": [
    >>>             {"type": "text", "text": "There is a dog and"},
    >>>         ],
    >>>     },
    >>> ]
    >>> pipe(text=messages, max_new_tokens=20, return_full_text=False)
    [{'input_text': [{'role': 'user',
        'content': [{'type': 'image',
        'url': 'https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg'},
        {'type': 'text', 'text': 'Describe this image.'}]},
    {'role': 'assistant',
        'content': [{'type': 'text', 'text': 'There is a dog and'}]}],
    'generated_text': ' a person in the image. The dog is sitting on the sand, and the person is sitting on'}]
    ```

    Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)

    This multimodal pipeline can currently be loaded from pipeline() using the following task identifier:
    "any-to-any".

    See the list of available models on
    [huggingface.co/models](https://huggingface.co/models?pipeline_tag=any-to-any).
    TF   )max_new_tokensc                 &   t        |   |i | d| j                  j                  v sd| j                  j                  v rt	        | d       t	        | d       d| j                  j                  v rt	        | d       | j                  t               y )Nimagevideovisiontorchvisionaudiolibrosa)superr,   modelinput_modalitiesr   check_model_typer   )r*   argskwargs	__class__s      r"   r,   zAnyToAnyPipeline.__init__   sw    $)&)djj111W

@[@[5[dH-dM2djj111dI.CDr!   NrA   c                    i }i }i }|j                  |       |||d<   |	|	|d<   |xs i |d<   ||dk7  r||d   d<   |j                  d      rd|d   d<   |7t        |t              r|g}||d   d	<   | j                  j
                  |d   d
<   ||d|v rt        d      ||d   d<   |2|0|t        d      |rt        j                  nt        j                  }n7||t        j                  }n"| |dv rt        | d      rt        j                  }|dvr|t        d| d| d      |dvrt        d| d      |^|| j                  j                  vrFt        d| d| j                  j                  j                   d| j                  j                         |||d<   |	|	|d<   |||d<   |
|
|d<   ||d<   |||fS )Ntimeoutcontinue_final_messagegenerate_kwargstextgeneration_modeload_audio_from_videoTuse_audio_in_videostop_strings	tokenizerr4   zp'max_new_tokens' is defined twice, once in 'generate_kwargs' and once as a direct argument. Please use only one.z>`return_full_text` is mutually exclusive with `return_tensors`)NrG   _postprocess_paramsz`return_type` cannot be set to z when generation_mode=z2. Set `return_type=None` or generation_mode='text')NrG   r6   r:   z[`generation_mode` can be only one of the `text`, `audio`, `image` but got generation_mode[=]z`generation_mode=z` is not supported for z6. The model can only output the following modalities: return_typeclean_up_tokenization_spacesskip_special_tokens)updateget
isinstancestr	processorrL   r)   r   r   r   r   hasattrr=   output_modalitiesrB   r   )r*   r4   rF   rD   return_full_textreturn_tensorsrO   rP   stop_sequencerE   rQ   rH   rA   forward_kwargspreprocess_paramspostprocess_paramss                   r"   _sanitize_parametersz%AnyToAnyPipeline._sanitize_parameters   s     	  (+2i(!-:P67 -<,Ar()&?f+DCRN,-.?@::-.FJN,-.BC$--!.@MN,-n==A^^=U=UN,-k:%*/??/R F  CQN,-.>?'K,?) !abb2B*..
H[H[K'K,?$,,K  _%F7SWYnKo$..K .0[5L1+>TUdTe fC C  "BBmn}m~~  A  (_DJJD`D`-`#O#44KDJJL`L`LiLiKj kGGKzzGcGcFdf 
 "0;}-!-;Q78'3A]=>*8K450?,- .2DDDr!   rG   imageszImage.Imagevideosz
np.ndarrayztorch.Tensorr:   returnc                      y Nr    r*   rG   r`   ra   r:   rA   s         r"   __call__zAnyToAnyPipeline.__call__   s      #r!   c                      y rd   r    re   s         r"   rf   zAnyToAnyPipeline.__call__   s     &)r!   c                    ||t        d      t        |t        t        t        f      rt        |d   t        t        t
        f      rt        |d   t
              r d|d   v rt        |   t        |      fi |S t        |d   t        t        f      rHt        |d   d   t
              r2d|d   d   v r(|D cg c]  }t        |       }}t        |   |fi |S |Dt        |t              s4t        |t              rt        |d   t              s	 t        |   |fi |S t        | j                  dd      t        j                  d       t        |   ||||dfi |S c c}w )a  
        Generate a text given text and optionally multimodal data passed as inputs.

        Args:
            text (`str`, `list[str]`, `list[dict]`):
                The text to be used for generation. If a list of strings is passed, the length of the list should be
                the same as the number of images. Text can also follow the chat format: a list of dictionaries where
                each dictionary represents a message in a conversation. Each dictionary should have two keys: 'role'
                and 'content'. 'role' should be one of 'user', 'system' or 'assistant'. 'content' should be a list of
                dictionary containing the text of the message and the type of the message.
            images (`str`, `list[str]`, `ImageInput`):
                The pipeline handles three types of images:

                - A string containing a HTTP(s) link pointing to an image
                - A string containing a local path to an image
                - An image loaded in PIL directly

                The pipeline accepts either a single image or a batch of images. Finally, this pipeline also supports
                the chat format (see `text`) containing images and text in this argument.
            videos (`str`, `list[str]`, `VideoInput`):
                The pipeline handles three types of videos:

                - A string containing a HTTP(s) link pointing to a video
                - A string containing a local path to a video
                - A video loaded and decoded to array format

                The pipeline accepts either a single video or a batch of videos. Finally, this pipeline also supports
                the chat format (see `text`) containing videos and text in this argument.
            audio (`str`, `list[str]`, `AudioInput`):
                The pipeline handles three types of audios:

                - A string containing a HTTP(s) link pointing to an audio
                - A string containing a local path to an audio
                - An audio loaded in PIL directly

                The pipeline accepts either a single audios or a batch of audios. Finally, this pipeline also supports
                the chat format (see `text`) containing audios and text in this argument.
            return_tensors (`bool`, *optional*, defaults to `False`):
                Returns the tensors of predictions (as token indices) in the outputs. If set to
                `True`, the decoded text is not returned.
            return_text (`bool`, *optional*):
                Returns the decoded texts in the outputs.
            return_full_text (`bool`, *optional*, defaults to `True`):
                If set to `False` only added text is returned, otherwise the full text is returned. Cannot be
                specified at the same time as `return_text`.
            clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`):
                Whether or not to clean up the potential extra spaces in the text output.
            continue_final_message( `bool`, *optional*): This indicates that you want the model to continue the
                last message in the input chat rather than starting a new one, allowing you to "prefill" its response.
                By default this is `True` when the final message in the input chat has the `assistant` role and
                `False` otherwise, but you can manually override that behaviour by setting this flag.

        Return:
            A list or a list of list of `dict`: Each result comes as a dictionary with the following key (cannot
            return a combination of both `generated_text` and `generated_token_ids`):

            - **generated_text** (`str`, present when `return_text=True` and `generation_mode="text"`) -- The generated text.
            - **generated_audio** (`np.ndarray`, present when `generation_mode="audio"`) -- The generated audio.
            - **generated_image** (`PIL.Image.Image`, present when `generation_mode="image"`) -- The generated image.
            - **generated_token_ids** (`torch.Tensor`, present when `return_tensors=True` and `generation_mode="text"`) -- The token
                ids of the generated text.
            - **input_text** (`str`) -- The input text.
        Nz0You must at least provide either text or images.r   r'   chat_templatea
  The input data was not formatted as a chat with dicts containing 'role' and 'content' keys, even though this model supports chat. Consider using the chat format for better results. For more information, see https://huggingface.co/docs/transformers/en/chat_templating)rG   r`   r7   r:   )r)   rT   r.   tupler   r/   r<   rf   r$   rU   getattrrV   loggerwarning_once)	r*   rG   r`   ra   r:   rA   chatchatsrB   s	           r"   rf   zAnyToAnyPipeline.__call__   sp   N >dlOPPdT5*56:d1gPTV[]aOb;c$q'4(VtAw->w'T
=f==DGdE]3
471:t8TY_cghicjklcmYm045d55w'888Zc%:z$PT?UZdeijkelnqZr 7#D3F33 4>>?D9E_ w&[` alekll+ 6s   :E%c                    t        |t              r||j                  d   d   dk(  }d| i|}| j                  j                  j
                  j                  dk(  r(|j                         D ci c]  \  }}|dv s|| }}} | j                  j                  |j                  f|dddd	|j                  | j                  
      }||d<   |S t        |t        t        t        f      r|}	i }nt|j                         }|j                  d      }	|j!                  dd       At#        | j                  d      r+| j                  j$                  j'                  |d         |d<   t        |	t        t        f      r t)        |	      dkD  r|j+                  dd        | j                  dd|	i|ddi|j                  | j                  
      }|	|d<   |S c c}}w )Nr'   	assistantadd_generation_promptMistralCommonBackend)padding
truncation
max_lengthptT)rE   rZ   tokenizereturn_dict)dtyperG   r:   feature_extractorr   ru   rZ   r    )rT   r$   r%   rV   rL   rB   r   itemsapply_chat_templatetor{   r.   rj   rU   copypoprS   rW   r|   fetch_audiolen
setdefault)
r*   inputsrD   rE   processing_kwargschat_template_kwargskvmodel_inputsrG   s
             r"   
preprocesszAnyToAnyPipeline.preprocessc  s   fd# &-)/)<V)D)S& %<AW=W#m[l#m ~~''11::>TT%9%?%?%A(!QQJqEqAqD($ ( >4>>=='=#  ' btzzb"  $*L  ftUC01DF[[]F::f%D zz'4(4Qd9e"&.."B"B"N"NvV]"_w dT5M*s4y1}((D9 &t~~d4d6d$dRcdgg** h 
  $VI(s   /G<Gc                     |i n|}|j                  d      }|j                  d|j                  d            }d|vr| j                  |d<    | j                  j                  di ||}|||dS )NrG   	input_idsdecoder_input_idsgeneration_config)generated_sequenceprompt_textr   r    )r   rS   r   r=   generate)r*   r   rF   r   r   r   s         r"   _forwardzAnyToAnyPipeline._forward  s     / 7"_"&&v. $$[,2B2BCV2WX	 o5373I3IO/00TZZ00S<S?S&8clmmr!   c           	      
   |d   }t        |t        t        f      r|gn|}|d   }|d   }|t        j                  k(  r+t        t        |            D 	cg c]  }	||	   ||	   d c}	S ||nd}|d   xs d}
|
dk(  rTt        | j                  d	      r>| j                  j                  |j                  | j                  j                              } | j                  j                  |fd
|i|}|t        j                  t        j                  hv rg }d|d<    | j                  j                  |fd
|i|}t!        ||      D ]V  \  }}|j#                  |      }d|cxk  rdk  r$n n!|j%                  ||t        |      z   d         F|j%                  |       X |}|t        j                  k(  rg }t!        ||      D ]  \  }}t        |t              r||z   }nt        |t              r||j&                  d   d   dk(  }|rt)        |j&                  d   d   d   j+                               }|dxx   |z  cc<   t-        |j&                        d d |j&                  d   d   |j&                  d   d   d d |gz   dgz   }nt-        |j&                        d|dgz   }|j%                  |        |}t!        ||      D cg c]*  \  }}dt        |t              r|j&                  n|d|
 |i, }}}|S c c}	w c c}}w )Nr   r   r   )
input_textgenerated_token_idsTrH   rG   r6   decode_image_tokensrQ   r   r   rq   r'   rr   r(   )r'   r(   r   
generated_)rT   rU   r$   r   r   ranger   rW   r=   r   r   devicerV   post_process_multimodal_outputr   r   zipfindappendr%   r/   r}   r.   )r*   model_outputsrO   rE   rQ   postprocess_kwargsinput_textsr   r   irH   generated_outputsnew_generated_textsdecoded_inputstext_generateddecoded_inputindex_input_text
full_textsr   generated_textnew_textr   generated_outputrecordss                           r"   postprocesszAnyToAnyPipeline.postprocess  s    $M2'1+T{'K{mQ\*+?@!+.	*,,, s;/0  +1~FXYZF[\  6I5T1Z^,->?I6g%'$**>S*T!%!?!?@R@U@UVZV`V`VgVg@h!iIDNNII
4G
K]

 :..
0D0DEE #%4:01JT^^JJ/BFXN 255F1W ?-#1#6#6}#E (-A-'..~>NQTUbQc>c>e/fg'..~>? !4*...J/2;@Q/R 2+^k3/%0>%ANT2-5 2=1E1Eb1I&1QU`1`.-#'(<(<R(@(KB(O(U(U(W#X (N:()-k.B.B)CCR)H(3(<(<R(@(H+6+?+?+CI+NsPR+SW_V`+`L * *.k.B.B)C%0^LG * !!.1/20 !+ 14KAR0S

 -
, Z
D5Qj11Wa_-.0@
 
 Kz
s   K:/K?)NNNNNNNNNNN)NNNN)NNNrd   )r   r   r   r-   _load_processor_load_image_processor_load_feature_extractor_load_tokenizer_pipeline_calls_generater   _default_generation_configr,   r   r
   r_   r   rU   r   r   r.   r/   rf   r	   r   r   r   r   r   __classcell__)rB   s   @r"   r2   r2   A   s   <| O!#O#!1"E %)# RE )*REh   37BF15#Dj# c=()D0# c<784?	#
 S,&'$.# # 
d38n	# #  "&9=OS7;)3i$) S	D//$6) S	D..n1EEL	)
 Cy4--4) ) 
d4S>"	#) ) IM6:59dmDIoT
*dm d3i$tCy/1J>Edm d3i*,t3	dm
 T#Y+d2dm 
d38n	T$sCx.%9 :	:dmL.`
n # Rr!   r2   ))enumtypingr   r   r   numpynpaudio_utilsr   
generationr   image_utilsr	   processing_utilsr
   r   utilsr   r   r   r   r   video_utilsr   baser   r   torchmodels.auto.modeling_autor   pt_utilsr   PILr   
get_loggerr   rl   Enumr   r$   r2   r    r!   r"   <module>r      s     ' '  $ ) $ 7  % 4 Q$			H	% 	! 	! ,4@Aox o Bor!   