
    qi6              
       (   d Z ddlZddlmZ ddlmZmZ ddlm	Z	m
Z
mZ ddlmZmZ ddlmZ  G d	 d
e	d      Zdee   dedeee      fdZdeeee         deee      dededej*                  f
dZdedededefdZe G d de
             ZdgZy)zProcessor class for Mllama.    N   )BatchFeature)
ImageInputmake_nested_list_of_images)ProcessingKwargsProcessorMixinUnpack)PreTokenizedInput	TextInput)auto_docstringc                       e Zd ZdddiiZy)MllamaProcessorKwargsimage_kwargsmax_image_tiles   N)__name__
__module____qualname__	_defaults     ^/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/mllama/processing_mllama.pyr   r      s    q
Ir   r   F)total	input_idsimage_token_idreturnc                    t        |       D cg c]  \  }}||k(  s| }}}t        |      dk(  rg S t        |      dk(  r|d   dggS t        |dd |dd       D cg c]	  \  }}||g }}}|j                  |d   t        |       g       |d   d   }|ddd   D ]  }	|	d   |	d   dz
  k(  r||	d<   |	d   } |S c c}}w c c}}w )a  
    Generate a cross-attention token mask for image tokens in the input sequence.

    This function identifies the positions of image tokens in the input sequence and creates
    a mask that defines which subsequent tokens each image token should attend to.

    Args:
        input_ids (list[int]): A list of token ids representing the input sequence.
        image_token_id (int): The id of the token used to represent images in the sequence.

    Returns:
        list[list[int]]: A list of [start, end] pairs, where each pair represents the range
        of tokens an image token should attend to.

    Notes:
        - If no image tokens are present, an empty list is returned.
        - For a single image token, it attends to all subsequent tokens until the end of the sequence.
        - For multiple image tokens, each attends to tokens up to the next image token or the end of the sequence.
        - Consecutive image tokens are treated as a group and attend to all subsequent tokens together.
    r      N)	enumeratelenzipappend)
r   r   itokenimage_token_locationsloc1loc2vision_maskslast_mask_endvision_masks
             r   get_cross_attention_token_maskr,   "   s   , 09/C_81euP^G^Q__
 !Q&	  !Q&&q)2.//367LSb7QShijikSl3mnZT4T4LnLn .r2C	NCD
 !$Q'M#DbD) 'q>[^a//*KN#A'
 / ` os   B>B>Ccross_attention_token_mask	num_tilesmax_num_tileslengthc           	      z   t        |       }t        d | D              }t        j                  ||||ft        j                        }t        t        | |            D ]\  \  }\  }}	t        t        ||	            D ]<  \  }
\  }}t        |      dk(  s|\  }}t        ||      }|dk(  r|}d|||||
d|f<   > ^ |S )a  
    Convert the cross attention mask indices to a cross attention mask 4D array.

    This function takes a sparse representation of cross attention masks and converts it to a dense 4D numpy array.
    The sparse representation is a nested list structure that defines attention ranges for each image in each batch item.

    Args:
        cross_attention_token_mask (list[list[list[int]]]): A nested list structure where:
            - The outer list represents the batch dimension.
            - The middle list represents different images within each batch item.
            - The inner list contains pairs of integers [start, end] representing token ranges for each image.
        num_tiles (list[list[int]]): A nested list structure specifying the number of tiles for each image in each batch item.
        max_num_tiles (int): The maximum possible number of tiles.
        length (int): The total sequence length of the input.

    Returns:
        np.ndarray: A 4D numpy array of shape (batch_size, length, max_num_images, max_num_tiles)
            The array contains `1` where attention is allowed and `0` where it is not.

    Note:
        - Special handling is done for cases where the end token is -1, which is interpreted as attending to the end of the sequence.
    c              3   2   K   | ]  }t        |        y wNr!   ).0maskss     r   	<genexpr>z?convert_sparse_cross_attention_mask_to_dense.<locals>.<genexpr>p   s     LUL   )shapedtype   r   r   N)r!   maxnpzerosint64r    r"   min)r-   r.   r/   r0   
batch_sizemax_num_imagescross_attention_mask
sample_idxsample_maskssample_num_tilesmask_idx	locationsmask_num_tilesstartends                  r   ,convert_sparse_cross_attention_mask_to_denserL   R   s    : /0JL1KLLN886>=Ahh
 9B#F`bkBl8m [4
4\#35>s<Qa?b5c 	[1H1y.9~"&
s#v&"9 CYZ$ZsHo~o%UV	[[  r   prompt	bos_tokenimage_tokenc                     || v r| S d}| j                  |      r%| t        |      d } |dz  }| j                  |      r%||z   | |  S )a\  
    Builds a string from the input prompt by adding `bos_token` if not already present.

    Args:
        prompt (`str`):
            The input prompt string.
        bos_token (`str`):
            The beginning of sentence token to be added.
        image_token (`str`):
            The image token used to identify the start of an image sequence.

    Returns:
        str: The modified prompt string with the `bos_token` added if necessary.

    Examples:
        >>> build_string_from_input("Hello world", "<begin_of_text>", "<|image|>")
        '<begin_of_text>Hello world'

        >>> build_string_from_input("<|image|>Hello world", "<begin_of_text>", "<|image|>")
        '<|image|><begin_of_text>Hello world'

        >>> build_string_from_input("<begin_of_text>Hello world", "<begin_of_text>", "<|image|>")
        '<begin_of_text>Hello world'
    r   Nr   )
startswithr!   )rM   rN   rO   num_image_tokens_on_starts       r   build_string_from_inputrS      sn    4 F !


K
(K(*+!Q&! 

K
( 556yk&JJr   c            
            e Zd Zd
 fd	Ze	 	 ddedz  deez  ee   z  ee   z  dz  de	e
   defd       Z	 ddZed	        Z xZS )MllamaProcessorNc                 F   t        |d      s(d| _        |j                  | j                        | _        n"|j                  | _        |j                  | _        d| _        |j                  | j                        | _        |j                  | _        t        | !  |||       y )NrO   z	<|image|>z<|python_tag|>)chat_template)	hasattrrO   convert_tokens_to_idsr   python_tokenpython_token_idrN   super__init__)selfimage_processor	tokenizerrW   	__class__s       r   r]   zMllamaProcessor.__init__   s    y-0*D"+"A"A$BRBR"SD(44D"+":":D,(>>t?P?PQ",,)=Qr   imagestextkwargsr   c           
         ||t        d       | j                  t        fd| j                  j                  i|}|d   j                  dd      }i }|t        |t              r|g}n3t        |t        t        f      rt        d |D              st        d      |D cg c]  }|j                  | j                         }}|D 	cg c]#  }	t        |	| j                  | j                        % }}	 | j                  |fi |d   }
| j                  ||
dg	       |
d
   D cg c]  }|j                  | j                          }}|j#                  |
       dg}|>| j$                  j'                  |      }t)        |      }|D cg c]  }t+        |       }}|t-        d D              rt        d |D              st        d      t/        |      dkD  rS||k7  s|k7  rI|t        d      d}t/        |      t/        |      k(  r||k7  rd}n|k7  rd}t        d| d| d|       |8 | j$                  |fi |d   }|j                  d      }|j#                  |       |c|a
d
   D cg c]  }t1        || j                          }}t3        || j$                  j4                  t7        d |
d
   D                    }||d<   t9        ||      S c c}w c c}	w c c}w c c}w c c}w )a  
        Returns:
            [`BatchFeature`]: A [`BatchFeature`] with the following fields:

            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
              `None`).
            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
            TODO: add aspect_ratio_ids and aspect_ratio_mask and cross_attention_mask
        Nz'You must specify either text or images.tokenizer_init_kwargstext_kwargsreturn_tensorsc              3   <   K   | ]  }t        |t                y wr3   )
isinstancestr)r5   ts     r   r7   z+MllamaProcessor.__call__.<locals>.<genexpr>   s     =_UVjC>P=_s   zAInvalid input text. Please provide a string, or a list of stringsimage)
modalitiesr   r   c              3   &   K   | ]	  }|d k(    ywr   Nr   r5   	batch_imgs     r   r7   z+MllamaProcessor.__call__.<locals>.<genexpr>   s     Di9>D   c              3   &   K   | ]	  }|d k(    ywrp   r   rq   s     r   r7   z+MllamaProcessor.__call__.<locals>.<genexpr>   s      Q#,	QQrs   zaIf a batch of text is provided, there should be either no images or at least one image per samplez@No image were provided, but there are image tokens in the prompt zZMake sure to pass your images as a nested list, where each sub-list holds images per batchzhIf you activated truncation with `max_length`, increase the `max_length` so image tokens aren't cropped.z)The number of image tokens in each text (zA) should be the same as the number of provided images per batch (z). images_kwargsr.   c              3   2   K   | ]  }t        |        y wr3   r4   )r5   r   s     r   r7   z+MllamaProcessor.__call__.<locals>.<genexpr>  s     Qi3y>Qr8   )r.   r/   r0   rC   )datatensor_type)
ValueError_merge_kwargsr   r`   init_kwargspoprj   rk   listtupleallcountrO   rS   rN   _check_special_mm_tokensr   updater_   fetch_imagesr   r!   anysumr,   rL   r   r<   r   )r^   rb   rc   rd   output_kwargsrh   rx   rl   n_images_in_text	text_itemencoding	token_idsn_images_in_idsn_images_in_imagessampleadd_messageimage_featuresr.   r-   rC   s                       r   __call__zMllamaProcessor.__call__   sd   $ <FNFGG***!
"&.."<"<
 

 '}599:JDQ$$v e}5#=_Z^=_:_ !deeCGHa(8(8 9HHjno]f+It~~tGWGWXoDo%t~~dKmM.JKH))$gY)OU]^iUjk	yt/B/BCkOkKK!S))66v>F/7F<B!C&#f+!C!CD3CDDS Q0@Q N !w  #$q("&66/M_:_>$%ghh"$K-.#6F2GGL^brLr 'C(,>> 'Q$CDTCU V@@R?SSVWbVce 
 1T11&[M/<Z[N&**;7IKK' $"2`hit`u*S\.y$:M:MN*& * $P*#"22BBQ8K;PQQ	$  ,@D'(>BBo  Io l "DB*s   ""K
(K!&"K&
K+5K0c                 B     | j                   j                  |f||d|S )a  
        Post-process the output of the model to decode the text.

        Args:
            generated_outputs (`torch.Tensor` or `np.ndarray`):
                The output of the model `generate` function. The output is expected to be a tensor of shape `(batch_size, sequence_length)`
                or `(sequence_length,)`.
            skip_special_tokens (`bool`, *optional*, defaults to `True`):
                Whether or not to remove special tokens in the output. Argument passed to the tokenizer's `batch_decode` method.
            clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
                Whether or not to clean up the tokenization spaces. Argument passed to the tokenizer's `batch_decode` method.
            **kwargs:
                Additional arguments to be passed to the tokenizer's `batch_decode method`.

        Returns:
            `list[str]`: The decoded text.
        )skip_special_tokensclean_up_tokenization_spaces)r`   batch_decode)r^   generated_outputsr   r   rd   s        r   post_process_image_text_to_textz/MllamaProcessor.post_process_image_text_to_text  s5    ( +t~~**
 3)E
 	
 	
r   c                     | j                   j                  }| j                  j                  }|D cg c]
  }|dk7  s	| }}t        ||z   dgz         S c c}w )Nr.   rC   )r`   model_input_namesr_   r~   )r^   tokenizer_input_namesimage_processor_input_namesnames       r   r   z!MllamaProcessor.model_input_names,  se     $ @ @&*&:&:&L&L# 9T&kW[_jWjt&k#&k),GGKaJbbcc 'ls
   
AAr3   )NN)TF)r   r   r   r]   r   r   r   r
   r~   r	   r   r   r   r   propertyr   __classcell__)ra   s   @r   rU   rU      s    R  %)aeXCT!XC ++d9o=EV@WWZ^^XC ./	XC
 
XC XCv Y^
6 d dr   rU   )__doc__numpyr=   feature_extraction_utilsr   image_utilsr   r   processing_utilsr   r   r	   tokenization_utils_baser
   r   utilsr   r   r~   intr,   ndarrayrL   rk   rS   rU   __all__r   r   r   <module>r      s    "  4 A H H C #,E -d3i - -QUVZ[^V_Q` -`-  $T$s)_ 5- DI-  -  	- 
 ZZ- `"KC "KC "Kc "Kc "KJ Ldn Ld Ld^ 
r   