
    qis              	          d Z ddlZddlZddlZddlmZ ddlmZ ddlm	Z	m
Z
mZmZmZ ddlmZ ddlmZmZ dd	lmZ eeeef      eeeeeef      z  eeeeef         z  eeeeeef         z  Zeedz  eedz  eedz  eedz     z     z     z     Z G d
 de	d      Z G d ded      Z G d de
d      Ze G d de             Zdeeeeef   dedeeef   fdZdededefdZ d Z!d Z"d Z#ddZ$dgZ%y) zProcessor class for KOSMOS-2.    N   )BatchFeature)
ImageInput)ImagesKwargsProcessingKwargsProcessorMixin
TextKwargsUnpack)
AddedToken)BatchEncoding	TextInput)auto_docstringc                   <    e Zd ZU dZedz  ed<   eed<   edz  ed<   y)Kosmos2ImagesKwargsa  
    bboxes (`Union[list[tuple[int]], list[tuple[float]], list[list[tuple[int]]], list[list[tuple[float]]]]`, *optional*):
        The bounding bboxes associated to `texts`.
    num_image_tokens (`int`, *optional* defaults to 64):
        The number of (consecutive) places that are used to mark the placeholders to store image information.
        This should be the same as `latent_query_num` in the instance of `Kosmos2Config` you are using.
    first_image_token_id (`int`, *optional*):
        The token id that will be used for the first place of the subsequence that is reserved to store image
        information. If unset, will default to `self.tokenizer.unk_token_id + 1`.
    Nbboxesnum_image_tokensfirst_image_token_id)__name__
__module____qualname____doc__
NestedList__annotations__int     `/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/kosmos2/processing_kosmos2.pyr   r   '   s$    	 *$r   r   F)totalc                       e Zd ZU dZeed<   y)Kosmos2TextKwargsz
    add_eos_token (`bool`, defaults to `False`):
    Whether or not to include `EOS` token id in the encoding when `add_special_tokens=True`.
    add_eos_tokenN)r   r   r   r   boolr   r   r   r   r    r    8   s    
 r   r    c            
       D    e Zd ZU eed<   eed<   dddddddddd	ddid	Zy
)Kosmos2ProcessorKwargstext_kwargsimages_kwargsTFr   )	add_special_tokenspaddingstridereturn_overflowing_tokensreturn_special_tokens_maskreturn_offsets_mappingreturn_token_type_idsverboser!   r   @   )r%   r&   N)r   r   r   r    r   r   	_defaultsr   r   r   r$   r$   A   sC    ""&& #').*/&+%*"

 
Ir   r$   c                   >    e Zd Zd fd	Ze	 	 ddedz  deee   z  dee	   de
fd       Zd Zd	 Z	 	 	 dd
eee   z  dedz  dededz  deee   z  f
dZddZddZed        Zdedeee      eee      z  defdZdeeef   eeeeef   z  deeef   fdZ xZS )Kosmos2Processorc                    d|_         d| _        d| _        d| _        d| _        d| _        d| _        d| _        d	| _        d
| _	        d| _
        d| _        | j                  | j                  | j                  | j                  | j
                  | j                  | j                  | j                  | j                  | j                  | j                  g| _        || _        t        | j                        D cg c]   }dt        |      j!                  d       d" }}g }| j                  |z   D ]   }|j#                  t%        |ddd             " |j'                  |       t(        	| U  ||       yc c}w )z
        num_patch_index_tokens (`int`, *optional*, defaults to 1024):
            The number of tokens that represent patch indices.
        Fz</doc>z<image>z</image>z</chunk>z</line>z<phrase>z	</phrase>z<object>z	</object></delimiter_of_multi_objects/>z<grounding><patch_index_   >T)lstriprstrip
normalizedN)r-   	eod_token	boi_token	eoi_token	eoc_token	eol_token	bop_token	eop_token	boo_token	eoo_token	dom_token	grd_token
tag_tokensnum_patch_index_tokensrangestrzfillappendr   
add_tokenssuper__init__)
selfimage_processor	tokenizerrG   kwargsxpatch_index_tokenstokens_to_addtoken	__class__s
            r   rN   zKosmos2Processor.__init__X   sO   
 +0	'!"##"#$#$9& NNNNNNNNNNNNNNNNNNNNNN
 '=#JOPTPkPkJlmQc!fll1o->a@mm__'99 	aE  E$uY^!_`	a]+)4 ns   4%E1NimagestextrR   returnc           
      <
   ||t        d       | j                  t        fd| j                  j                  i|}|d   j                  dd       }|d   j                  dd      }|d   j                  dd       }|d   j                  d	d
      }|d   d   }	|d   d   }
|d   j                  dd       }t               }|' | j                  |fi |d   }|j                  |       || j                  ||||      }|	rd|sbt        |t              r| j                  j                   | }n7t        |t              r'|D cg c]  }| j                  j                   |  }}|d   d   xr ||d   d<   ||
nd
|d   d<   ||nd |d   d<    | j                  dd|i|d   }|j                  |       |	|d   d<   |
|d   d<   ||d   d<   | ||| j                  j                  dz   }|	}t!        |      dz   }t        t#        |||z               }dgdg|z  z   dgz   }g }g }|d   }t        |t              r|g}|d   g|d<   |D ]p  }|d | |z   |||z   d  z   }|j%                  |       t'        j&                  |      }|rdg|z   }|dgt)        |      t)        |      z
  z  z  }|j%                  |       r t        |t              rt+        t-        j.                        D cg c]  \  }}|t)        |      f c}}d       }|d   \  }}|d   \  }}|d   d   xr ||d   d<   d |d   d<    | j                  dd||   gi|d   }t)        |j.                  d         }||k7  r5| j                  j0                  dk(  r|D cg c]+  }|| j                  j2                  g|t)        |      z
  z  z   - }}|D cg c]  }|dg|t)        |      z
  z  z    }}|d   D cg c]  }|dg|t)        |      z
  z  z    c}|d<   n| j                  j0                  dk(  r|D cg c]+  }| j                  j2                  g|t)        |      z
  z  |z   - }}|D cg c]  }dg|t)        |      z
  z  |z    }}|d   D cg c]  }dg|t)        |      z
  z  |z    c}|d<   t        |t              r||d   }|d   d   |d<   |d   }|j                  t5        ||d   |d|             |S c c}w c c}}w c c}w c c}w c c}w c c}w c c}w c c}w )Nz*You have to specify either images or text.tokenizer_init_kwargsr&   r   r   r/   r   r%   r!   Fr'   r(   return_tensors)r   rY      r   	input_idsattention_maskc                     | d   S Nr   )rS   s    r   <lambda>z+Kosmos2Processor.__call__.<locals>.<lambda>   s    defhdi r   )keyrc   rightleft)r_   r`   image_embeds_position_mask)datatensor_typer   )
ValueError_merge_kwargsr$   rQ   init_kwargspop
setdefaultr   rP   updatepreprocess_examples
isinstancerI   	bos_tokenlistunk_token_idr   rH   rK   copylensorted	enumerater_   padding_sidepad_token_idr   )rO   rX   rY   rR   output_kwargsr   r   r   r!   r'   r(   r]   encodingimage_encodingstext_encodingwith_bosstart_indeximage_token_idsbase_image_embeds_position_maskr_   rh   all_input_idstext_idsmaskidxrS   sorted_length_min_len_not_paddedmax_len_paddeds                                  r   __call__zKosmos2Processor.__call__   sY    >dlIJJ***"
"&.."<"<
 
 /33HdC(9==>PRTU,_=AABXZ^_%m488%P*=9:NO.y9&}5@@AQSWX>1T11&[M/<Z[NOON+++D&&Sc+dD!-dC("nn667v>Dd+FJKt~~778<KDKm,-ABT} -()=> BHgUZM-(3OU~^cgM-()9:*DNNUUm8TUMOOM*=Om$%9:29m$Y/9Gm$%56 2#+'+~~'B'BQ'F$ *H h-!+K #5)=?SVf?f#ghO/0cQC:J4J.JaS.P+ I)+&$[1M$$!..67G.H-I)*) 8#L[1OCh{]mOmOoFpp  *yy!@A3:Ds8}s4y899*11$78 $% &1:=;R;R1STvsAc3q6]TZi! )6a(8%%&r*Q!-01EFX= m,-AB BFm,-=> . `T#YK `=Q^C_ `!$]%<%<Q%?!@%7~~22g=lu$vghQ$..*E*E)F.[^_`[aJa)b%b$v	$vIc6DEA~A'> ??62 6 JRRbIc6DEA~A'> ??6!12 44>lu$vghdnn&A&A%BnWZ[\W]F]%^ab%b$v	$vIc6DEQC>CF#:;a?62 6 JRRbIc6DEQC>CF#:;a?6!12
 $$)?%aL	-56F-G-J)*-G-J* OO%.*23C*D6P
 !/	 K Lj U %w66 %w66s0   !S5/S:
0T T,T
,0T"TTc                 @   |yt        |t              st        d      |D ]{  }|t        |t              s|g}|D ]^  }t        |t              rBt	        |      dk(  rt        d |D              r4t	        |      dk(  rt        d |D              rUt        d       } y)a  
        Check `bboxes` for a single text example. It could be
            - `None`: no bounding box associated to a text.
            - A list with each element being the bounding boxes associated to one `<phrase> ... </phrase>` pair found
              in a text. This could be:
                  - `None`: no bounding box associated to a `<phrase> ... </phrase>` pair.
                  - A tuple of 2 integers: A single bounding box specified by patch indices.
                  - A tuple of 4 float point number: A single bounding box specified by (normalized) coordinates.
                  - A list containing the above 2 tuple types: Multiple bounding boxes for a
                   `<phrase> ... </phrase>` pair.
        Nz@`bboxes` (for a single text example) should be `None` or a list.   c              3   <   K   | ]  }t        |t                y wN)rr   r   .0rS   s     r   	<genexpr>zAKosmos2Processor._check_bboxes_for_single_text.<locals>.<genexpr>/  s     .Saz!S/A.S   r6   c              3   <   K   | ]  }t        |t                y wr   )rr   floatr   s     r   r   zAKosmos2Processor._check_bboxes_for_single_text.<locals>.<genexpr>0  s     1X1*Q2F1Xr   a'  Each element in `bboxes` (for a single text example) should be either `None`, a tuple containing 2 integers or 4 float point numbers, or a list containing such tuples. Also make sure the arguments `texts` and `bboxes` passed to `preprocess_text` are both in batches or both for a single example.)rr   rt   rk   tuplerw   all)rO   r   bboxelements       r   _check_bboxes_for_single_textz.Kosmos2Processor._check_bboxes_for_single_text  s     >FD)_``  	D|d+v 
!'51\Q&3.S7.S+SG)c1XPW1X.X$@ 
	r   c                 \    |j                         }|| d| }| j                  ||      }|S )N )strip_insert_patch_index_tokens)rO   rY   imager   img_info_tokenss        r   _preprocess_single_examplez+Kosmos2Processor._preprocess_single_example9  s=    zz|%&av.D ..tV<r   textsr   r   c                     | j                   g|z  }dj                  | j                   g|z   | j                  gz         }d}t        |t              rd}|g}|dgt        |      z  }nt        |t              s|g}t        |      t        |      k7  r$t        dt        |       dt        |       d      |s| j                  |       |g}nE|4t        |t              st        d      |D ]  }| j                  |        ndgt        |      z  }t        |      t        |      k7  r$t        d	t        |       dt        |       d      t        |||      D 	
cg c]  \  }	}
}| j                  |	|
||       }}
}	}|s|d
   }|S c c}}
}	w )a-  Add image and bounding box information to `texts` as image and patch index tokens.

        Args:
            texts (`Union[TextInput, list[TextInput]]`): The texts to be processed.
            images (`ImageInput`, *optional*): The images associated to `texts`.
            bboxes (`Union[list[tuple[int]], list[tuple[float]], list[list[tuple[int]]], list[list[tuple[float]]]]`, *optional*):
                The bounding bboxes associated to `texts`.
            num_image_tokens (`int`, *optional*, defaults to 64):
                The number of image tokens (used as latent queries). This should corresponds to the `latent_query_num`
                attribute in `Kosmos2Config`.

        Returns:
            `Union[TextInput, list[TextInput]]`: The processed texts with image and patch index tokens.
        r   TFNzGThe number of examples in `texts` and `images` should be the same. Got  v.s. 	 instead.zS`bboxes` should be `None` or a list (as a batch) when `texts` is passed as a batch.zGThe number of examples in `texts` and `bboxes` should be the same. Got r   )r<   joinr=   rr   rI   rw   rt   rk   r   zipr   )rO   r   rX   r   r   
img_tokensr   batchedrS   rY   r   r   results                r   rq   z$Kosmos2Processor.preprocess_examplesC  s   , nn%(88
((DNN#3j#@DNNCS#ST eS!GGE>Vc%j(FFD)XFu:V$YZ]^cZdYeeklopvlwkx  yB  C  ..v6XFfd+ !vww 622156 Vc%j(Fv;#e*$YZ]^cZdYeeklopvlwkx  yB  C  &)%?
 
!eT ++D%O
 

 AYF
s   F	c                 \    |j                  | j                        d   }|rt        |      S |S rb   )splitr=   +clean_text_and_extract_entities_with_bboxes)rO   rY   cleanup_and_extractcaptions       r   post_process_generationz(Kosmos2Processor.post_process_generation  s,    **T^^,R0>wGGr   c                 x     | j                   |fd|i|}|D cg c]  }| j                  |d       c}S c c}w )a  
        Post-process the output of the model to decode the text.

        Args:
            generated_outputs (`torch.Tensor` or `np.ndarray`):
                The output of the model `generate` function. The output is expected to be a tensor of shape `(batch_size, sequence_length)`
                or `(sequence_length,)`.
            skip_special_tokens (`bool`, *optional*, defaults to `True`):
                Whether or not to remove special tokens in the output. Argument passed to the tokenizer's `batch_decode` method.
            **kwargs:
                Additional arguments to be passed to the tokenizer's `batch_decode method`.

        Returns:
            `list[str]`: The decoded text.
        skip_special_tokensF)r   )batch_decoder   )rO   generated_outputsr   rR   generated_textsrY   s         r   post_process_image_text_to_textz0Kosmos2Processor.post_process_image_text_to_text  sH      ,$++,=qSfqjpqZijRV,,Tu,Mjjjs   7c                 l    | j                   j                  }| j                  j                  }||z   dgz   S )Nrh   )rQ   model_input_namesrP   )rO   tokenizer_input_namesimage_processor_input_namess      r   r   z"Kosmos2Processor.model_input_names  s9     $ @ @&*&:&:&L&L#$'BBFbEcccr   c                    |t        |      dk(  r|S t        t        j                  d|            }t        |      t        |      k7  r$t	        dt        |       dt        |       d      d}g }t        ||      D ]  \  }}|j                         \  }}	|j                  |||	        |	}|2t        |t              r|g}g }
t        d |D              st	        d      |D ],  }| j                  |      \  }}|
j                  | d	|        . t        |
      dk(  rd
j                  |
      }|j                  d| d        |t        |      k  r|j                  ||d         dj                  |      }|S )Nr   z<phrase>.+?</phrase>)stringzuThe number of elements in `bboxes` should be the same as the number of `<phrase> ... </phrase>` pairs in `text`. Got r   r   c              3   $   K   | ]  }|d u 
 y wr   r   )r   boxs     r   r   z>Kosmos2Processor._insert_patch_index_tokens.<locals>.<genexpr>  s     73s$7s   zTThe multiple bounding boxes for a single phrase should not contain any `None` value.r   z  </delimiter_of_multi_objects/> z	<object> z
 </object> )rw   rt   refinditerrk   r   spanrK   rr   r   r   #_convert_bbox_to_patch_index_tokensr   )rO   rY   r   matched_phrasescurr_posbuffermatchedr   r   endpatch_index_stringsr   patch_index_1patch_index_2position_strs                  r   r   z+Kosmos2Processor._insert_patch_index_tokens  s   >S[A-Kr{{+B4PQ3v;. H  IL  M\  I]  H^  ^d  eh  io  ep  dq  qz  {   &9 	@MGT\\^FAsMM$x,-H|$&v"$7$77 j   O/3/W/WX[/\,}#**m_Am_+MNO &'1,=BBCVWLMMIl^:>?/	@2 c$iMM$xy/*wwvr   r   c                    t        |      dk(  r|\  }}n7t        t        j                  | j                              }t        ||      \  }}dt        |      j                  d       d}dt        |      j                  d       d}||fS )Nr   r5   r6   r7   )rw   r   mathsqrtrG   coordinate_to_patch_indexrI   rJ   )rO   r   idx_1idx_2num_patches_per_sidetoken_1token_2s          r   r   z4Kosmos2Processor._convert_bbox_to_patch_index_tokens  s     t9>LE5 $'tyy1L1L'M#N 4T;OPLE5!#e*"2"21"5!6a8!#e*"2"21"5!6a8r   )i   )NN)NNr/   )T)r   r   r   rN   r   r   r   rt   r
   r$   r   r   r   r   	BboxInputr   rI   rq   r   r   propertyr   r   r   r   r   __classcell__)rW   s   @r   r2   r2   V   sa   /5b  %),0JT!J $y/)J /0	J
 
J JX!F %) ')@4	?*@ T!@ 	@
 *@ 
tCy@Dk& d d
+s +Ds<LtTYZ_T`Oa<a +fi +Z #s(OeE5%,F&GG 	sCx r   r2   r   r   rZ   c                 .   | \  }}}}||kD  r||kD  st        d      t        j                  ||z        }t        j                  ||z        }t        j                  ||z  dz
        }t        j                  ||z  dz
        }	||z  |z   }
|	|z  |z   }|
|fS )a  Convert a bounding box to a pair of patch indices.

    Args:
        bbox (`tuple[float, float, float, float]`):
            The 4 coordinates of the bounding box, with the format being (x1, y1, x2, y2) specifying the upper-left and
            lower-right corners of the box. It should have x2 > x1 and y2 > y1.
        num_patches_per_side (`int`): the number of patches along each side.

    Returns:
        `tuple[int, int]`: A pair of patch indices representing the upper-left patch and lower-right patch.
    zTThe coordinates in `bbox` should be `(x1, y1, x2, y2)` with `x2 > x1` and `y2 > y1`.r^   )rk   r   floorceil)r   r   x1y1x2y2ul_xul_ylr_xlr_yul_idxlr_idxs               r   r   r     s     RRGRopp::b//0D::b//0D99R..23D99R..23D((4/F((4/F6>r   r   r   c                 "   d|z  }| |z  }| |z  }||z  }||z  }| |k(  r||z  }||z  }	||z  |z   }
||z  |z   }nQ||k(  s||k(  r||z  }||z  }	||z  |z   }
||z  |z   }n,||z  |dz  z   }||z  |dz  z   }	||z  |dz  z   }
||z  |dz  z   }||	|
|fS )a  
    Given a grid of length `num_patches_per_side` and the indices of the upper-left and lower-right corners of a
    bounding box, returns the normalized coordinates of the bounding box, in the form (x1, y1, x2, y2).

    Args:
        ul_idx (`int`): the index of the grid cell that corresponds to the upper-left corner of the bounding box.
        lr_idx (`int`): the index of the grid cell that corresponds to the lower-right corner of the bounding box.
        num_patches_per_side (`int`): the number of patches along each side.

    Returns:
        `tuple[float]`: the normalized coordinates of the bounding box, in the form (x1, y1, x2, y2).
    g      ?r   r   )r   r   r   	cell_sizer   r   r   r   r   r   r   r   s               r   patch_index_to_coordinater     s    **I ((D))D((D))D III	)I	)	III	)I	)I	A-I	A-I	A-I	A-r2r>r   c           
      $   d}t        j                  ||       }g }|D ]o  }|j                  d      }|j                         \  }}}|s*d}|j                  d      d   |j                  d      d   f}|j	                  d      }	g }
|	D ]  }t        j
                  d|      }t        j
                  d|dd       }|s5|s8|rD|
j                  t        |j                  d            t        |j                  d            f       ~|
j                  t        |j                  d            t        |j                  d            f        |r|j                  |||
f       E|
D ]&  }d|d    d	|d    d
}|j                  |||gf       ( r |S )a  Extract entities contained in `text`. The bounding bboxes is given in the form of patch indices.

    This function is only intended to be used within `clean_text_and_extract_entities_with_bboxes` where further
    processing happens, including converting to normalized coordinates and whitespace character cleaning up.

    Examples:

    ```python
    >>> text = "<grounding> An image of<phrase> a snowman</phrase><object><patch_index_0044><patch_index_0863></object> warming himself by<phrase> a fire</phrase><object><patch_index_0005><patch_index_0911></object>."
    >>> entities = extract_entities_with_patch_indices(text)
    >>> entities
    [(' a snowman', (31, 41), [(44, 863)]), (' a fire', (130, 137), [(5, 911)])]
    ```z(?:(<phrase>([^<]+)</phrase>))?<object>((?:<patch_index_\d+><patch_index_\d+></delimiter_of_multi_objects/>)*<patch_index_\d+><patch_index_\d+>)</object>r   Nr   r4   z<patch_index_(\d+)>r^   r5   z><patch_index_r7   )	r   r   r   groupsr   searchrK   r   group)rY   patternmatchesentities_with_patch_indicesmatchr   
phrase_tagphrasematch_contentpatch_index_pairsentity_bboxespairrS   yr   entitys                   r   #extract_entities_with_patch_indicesr   /  s    kG kk'4(G #% Kzz!},1LLN)
FMFJJqM!$ejjmA&67D *//0PQ% 		MD		0$7A		0$qr(;AQ!((#aggaj/3qwwqz?)KL!((#aggaj/3qwwqz?)KL		M '..m/LM% K(a	QyJ+22FD4&3IJK7K@ '&r   c           	          | \  }\  }}t        t        j                  dd|d|             }t        t        j                  dd|d|             }|||ff}|S )zfAdjust the positions of the entities in `text` to be relative to the text with special fields removed.<.*?>r   N)rw   r   sub)r   rY   entity_namestartr   adjusted_startadjusted_endadjusted_entitys           r   adjust_entity_positionsr  i  s_     &K%T&5\:;Nrvvgr4:67L"^\$BCOr   c                    | j                         }t        |       t        | j                               z
  }g }|D ]  \  }\  }}}t        |      t        |j                               z
  }	t        |      t        |j                               z
  }
||z
  |	z   }||z
  |
z
  }|j                         }|j	                  |||f|f        ||fS )z9Remove the spaces around the text and the entities in it.)r   rw   r8   r9   rK   )rY   entitiesnew_textleading_spacesnew_entitiesr   r   r   r   entity_name_leading_spacesentity_name_trailing_spacess              r   _cleanup_spacesr  s  s    zz|HYT[[]!33NL-5 A)\eS6%(%5K<N<N<P8Q%Q"&)+&6[=O=O=Q9R&R#&)CCN"%@@!'')[5#,?@A \!!r   c           	         t        j                  dd|       }t        |       }g }|D ]M  }|dd |d   }}t        ||       }|D 	cg c]  }	t	        |	d   |	d   |       }
}	|j                  ||
fz          O t        ||      S c c}	w )a  Remove the tag tokens from `text`, extract entities in it with some cleaning up of white characters.

    Examples:

    ```python
    >>> text = "<grounding> An image of<phrase> a snowman</phrase><object><patch_index_0044><patch_index_0863></object> warming himself by<phrase> a fire</phrase><object><patch_index_0005><patch_index_0911></object>."
    >>> clean_text, entities = clean_text_and_extract_entities_with_bboxes(text)
    >>> clean_text
    'An image of a snowman warming himself by a fire.'

    >>> entities
    [('a snowman', (12, 21), [(0.390625, 0.046875, 0.984375, 0.828125)]), ('a fire', (41, 47), [(0.171875, 0.015625, 0.484375, 0.890625)])]
    ```r   r   r   r   r^   )r   r   r   r  r   rK   r  )rY   r   processed_textr   r  itemr   r   r  r   bboxes_in_coordss              r   r   r     s     VVGR.N"Ed"KH+ ?aDG1&$?jpqbf5d1gtAwH\]qq+;*==>? >844	 rs   B)    )&r   rv   r   r   image_processing_utilsr   image_utilsr   processing_utilsr   r   r   r	   r
   tokenization_pythonr   tokenization_utils_baser   r   utilsr   rt   r   r   r   r   r   r   r    r$   r2   r   r   r   r  r  r   __all__r   r   r   <module>r     s   $   	 2 % b b - ? # 	sCx
5ue+,-.
4c3h !" 4eUE)*+,- 
 %$,edlT%$,eVZlI[:[5\&\!]]^
%,e %"
% -U * I ~ I  I XE%u*D$E ]` ejknpskset >(c (3 (c (Z7't"*5: 
r   