
    qi4                         d dl Z d dlZddlmZmZ ddlmZ ddlm	Z	m
Z
mZ ddlmZ ddlmZmZ  e       rd dlZ ej$                  e      Z G d d	e	d
      Z G d de
      ZdgZy)    N   )
AudioInputmake_list_of_audio)BatchFeature)ProcessingKwargsProcessorMixinUnpack)	TextInput)is_torch_availableloggingc                   ,    e Zd Zddidddddddd	d
Zy)AudioFlamingo3ProcessorKwargspaddingTi>  g      >@
max_length)sampling_ratechunk_lengthreturn_attention_maskr   ptleft)return_tensorspadding_side)text_kwargsaudio_kwargscommon_kwargsN)__name__
__module____qualname__	_defaults     n/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/audioflamingo3/processing_audioflamingo3.pyr   r   "   s5     t
 # %)#	
 #"
Ir    r   F)totalc                        e Zd ZdZ	 	 	 	 d fd	ZddZ	 	 ddeee   z  dedz  d	e	dz  d
e
e   def
dZedee   fd       Z	 ddeee   z  ez  deee   z  dz  d
e
e   defdZdddZdedefdZ xZS )AudioFlamingo3Processora:  
    Constructs an AudioFlamingo3 processor which wraps an AudioFlamingo3 feature extractor and an AudioFlamingo3
    tokenizer into a single processor.

    [`AudioFlamingo3Processor`] offers all the functionalities of [`WhisperFeatureExtractor`] and
    [`Qwen2TokenizerFast`]. See the [`~AudioFlamingo3Processor.__call__`] for more information.

    Args:
            feature_extractor ([`WhisperFeatureExtractor`]):
                The feature extractor is a required input.
            tokenizer ([`Qwen2TokenizerFast`]):
                The tokenizer is a required input.
            chat_template (`Optional[str]`, *optional*):
                The Jinja template to use for formatting the conversation. If not provided, the tokenizer's default chat
                template will be used.
            audio_token (`Optional[str]`, *optional*, defaults to `"<sound>"`):
                Special token used to represent audio inputs in the chat template.
            default_transcription_prompt (`str`, *optional*, defaults to `"Transcribe the input speech."`):
                Default prompt to use for transcription tasks when applying transcription requests.
            max_audio_len (`int`, *optional*, defaults to 600):
                Maximum length of audio sequences in seconds. Audio longer than this will be truncated.
    Nc                     || _         |j                  |      | _        || _        || _        t
        |   |||       y )N)chat_template)audio_tokenconvert_tokens_to_idsaudio_token_iddefault_transcription_promptmax_audio_lensuper__init__)selffeature_extractor	tokenizerr&   r'   r*   r+   	__class__s          r!   r-   z AudioFlamingo3Processor.__init__L   sE     ''==kJ,H)**I]Sr    returnc                 2    |dz
  dz  dz   }|dz
  dz  dz   }|S )N      r   )r.   audio_lengthsconv_output_lengthsaudio_tokens_lengthss       r!   _get_audio_token_lengthz/AudioFlamingo3Processor._get_audio_token_length[   s2    ,q0Q6: 3a 7A=A##r    Ftextaudiooutput_labelskwargsc           
          | j                   t        fd| j                  j                  i|}|d   }|d   }|j	                  d      }|dk7  r"t        | j                  j                   d      t        |t              r|g}n3t        |t        t        f      rt        d |D              st        d      i }	|3t        |      }t        |      t        |      k7  r$t        d	t        |       d
t        |       d      t        |d   |d   z        }
t        | j                   |d   z        }g }g }|D ]  }t        |j"                  d         }t%        d||
z   dz
  |
z        }||kD  r<t&        j)                  d||d   z  dd| j                    d| j                    d       |}|j+                  |       t-        |||
z        }t/        |      D ]-  }||
z  }t-        |dz   |
z  |      }|j+                  |||        /   | j0                  |fi |}	|	j3                  d      }||	d<   t5        j6                  t5        j8                  |j;                  d      |      D cg c]  }|j;                          c}      }| j=                  |      }t?        |      D ]N  \  }}tA        jB                  tA        jD                  | jF                        | jF                  |z  ||         }|||<   P  | j                  |fi |}i ||	}|rF|d   jI                         }d||| jJ                  k(  <   d||| j                  jL                  k(  <   ||d<   tO        ||      S c c}w )a=  
        Main method to prepare one or several text sequence(s) and audio waveform(s) for the model. This
        method expands `<sound>` placeholders in the text based on the post-pool frame counts of the
        audio windows, then tokenizes the provided strings as-is, and extracts log-mel features
        with [`WhisperFeatureExtractor`]. If `audio` is `None`, no audio processing is performed and
        the text is tokenized as-is (LM-only behavior).

        Args:
            text (`str` or `list[str]`):
                Input sequence or batch of sequences.
            audio (`np.ndarray` or `list[np.ndarray]`):
                Input audio or batch of audios as NumPy arrays. If provided, there must be as many `text` inputs as
                `audio` inputs.
            output_labels (bool, *optional*, default=False):
                Whether to return labels for training.

        Returns:
            [`BatchFeature`]: A dictionary with tokenized text (`input_ids`, `attention_mask`) and
            audio features (`input_features`, `input_features_mask`).
        tokenizer_init_kwargsr   r   r   r   z% only supports `return_tensors='pt'`.c              3   <   K   | ]  }t        |t                y wN
isinstancestr).0ts     r!   	<genexpr>z3AudioFlamingo3Processor.__call__.<locals>.<genexpr>   s     9[QR*Q:L9[   zAInvalid input text. Please provide a string, or a list of stringszGot z
 text but z audios; they must match 1:1.r   r   r   r4   zAudio duration (z.1fzs) exceeds zs; truncating to first zs.attention_maskinput_features_mask	input_idsilabels)datatensor_type)(_merge_kwargsr   r0   init_kwargsget
ValueErrorr1   r   rC   rD   listtupleallr   lenintr+   shapemaxloggerwarningappendminranger/   poptorchstacksplitsumr9   	enumerateresubescaper'   cloner)   pad_token_idr   )r.   r:   r;   r<   r=   call_kwargsr   r   r   audio_inputswindow_sizemax_windowsper_sample_windowsflat_chunksaudio_el	n_samplesn_wintime_capistartendpadding_masksr6   r8   audio_lengthexpandedtext_inputsrN   rM   s                                 r!   __call__z AudioFlamingo3Processor.__call__`   s   : )d(()
"&.."<"<
 
 "-0">2$)9:T! 7 788]^__dC 6DTD%=1c9[VZ9[6[`aa&u-E4yCJ& 4D	{*SZLHe!fgg l?;l>>ZZ[Kd00L4PPQK,.,.K! <q 12	A	K 7! ;KL;&NN*9|O7T+TUX*YYdeiewewdx  yP  QU  Qc  Qc  Pd  df  g (E"))%0y%+*=>u <AOEq1u3X>C&&xc':;<<" 2411+NNL'++,<=L2>L./ "KK%++lFVFVWYFZ\n:o(pQ(pqM#'#?#?#N  $--A#B #<66"))D,<,<"=t?O?OR^?^`def`gh"Q#
 %dnnT9[9.+..+&,,.F48F6T0001<@F6T^^8889#DN>BB% )qs   $Mc                     | j                   j                  }| j                  j                  }t        t        j                  ||z   dgz               S )NrJ   )r0   model_input_namesr/   rT   dictfromkeys)r.   	tok_names	fea_namess      r!   r   z)AudioFlamingo3Processor.model_input_names   sD    NN44	**<<	DMM)i"7;P:Q"QRSSr    promptc           
         t        |t              r|g}nt        |t        t        f      r |rt	        d |D              rt        |      }nst        t        |            }t               rU|D cg c]J  }t        |t        j                        r,|j                         j                         j                         n|L }}t        |      }|dk(  rt        d      || j                  g|z  }nt        |t              r|g|z  }nt        |t        t        f      r}t        |      |k7  rt        dt        |       d| d      g }|D ]L  }||j                  | j                         !t        |t              r|j                  |       Ct!        d       nt!        d      t#        ||      D 	
cg c](  \  }	}
d	d
|	dt        |
t              rd|
dnd|
dgdg* }}	}
 | j$                  |fdddd|S c c}w c c}
}	w )a  
        Prepare inputs for automatic speech recognition without manually writing the default transcription prompt.

        Args:
            audio (`str`, `list[str]`, `np.ndarray`, `torch.Tensor`, `list[np.ndarray]`, `list[torch.Tensor]`):
                Audio to transcribe. Strings are interpreted as local paths or URLs and will be loaded automatically by
                the chat template loader; NumPy arrays and PyTorch tensors are forwarded directly.
            prompt (`str` or `list[str]`, *optional*):
                Custom prompt(s) to include in the user turn. A list must be the same length as the batch. When `None`,
                each sample uses `"Transcribe the input speech."`.
            **kwargs:
                Additional keyword arguments forwarded to [`~AudioFlamingo3Processor.apply_chat_template`] (for example
                `text_kwargs`, `audio_kwargs`, ...).

        Returns:
            [`BatchFeature`]: Processor outputs ready to be passed to [`AudioFlamingo3ForConditionalGeneration.generate`].

        c              3   <   K   | ]  }t        |t                y wrA   rB   )rE   els     r!   rG   zFAudioFlamingo3Processor.apply_transcription_request.<locals>.<genexpr>   s     ?dXZ
2s@S?drH   r   z)`audio` must contain at least one sample.z	Received z prompt(s) for z$ audio sample(s); counts must match.z'Each prompt must be a string or `None`.z<`prompt` must be a string, a sequence of strings, or `None`.userr:   )typer:   r;   )r   path)r   r;   )rolecontentT)tokenizeadd_generation_promptreturn_dict)rC   rD   rT   rU   rV   r   r   ra   TensordetachcpunumpyrW   rS   r*   r]   	TypeErrorzipapply_chat_template)r.   r;   r   r=   audio_itemsr   
batch_sizepromptsitemprompt_text
audio_itemconversationss               r!   apply_transcription_requestz3AudioFlamingo3Processor.apply_transcription_request   s   2 eS!38'Ke}-%C?d^c?d<du+K1%89K!#kvwegJr5<<<Xryy{0668^``ww%
?HII>889JFG$h+Gu.6{j( F}OJ<Gkl  G O<NN4#D#DEc*NN4(#$MNNO Z[[ ,/w+D
 (Z #!'=%j#6 ")*=&-
C	 

 
 (t''
"&	

 
 	
S x4
s   -AG70-G<)strip_prefixc                     | j                   j                  |i |}|r|D cg c]  }| j                  |       }}|S c c}w )ap  
        Forward arguments to [`~PreTrainedTokenizer.batch_decode`] and optionally remove the assistant framing the model
        was trained to produce.

        AF3 transcription requests respond with sentences such as `"The spoken content of the audio is "..."."`.
        Setting `strip_prefix=True` trims the fixed prefix for just the transcription text.
        )r0   batch_decode"_strip_assistant_prefix_and_quotes)r.   r   argsr=   decodedr:   s         r!   r   z$AudioFlamingo3Processor.batch_decode  sK     .$..--t>v>QXYt>>tDYGY Zs   ?c                 @   |j                         }dD ]1  }|j                  |      s|t        |      d j                         } n |j                  d      r|dd j                         }t        |      dk\  r%|d   |d   k(  r|d   dv r|dd j                         }|S )	zi
        Remove the assistant prefix and surrounding quotes from a decoded transcription string.
        )z"The spoken content of the audio isz!The transcription of the audio isN.rK   r5   r   >   "'r4   )strip
startswithrW   endswith)r.   r:   strippedprefixs       r!   r   z:AudioFlamingo3Processor._strip_assistant_prefix_and_quotes)  s    
 ::<
 	F ""6*#CKM288:	 S!}**,Hx=A(1+""=(1+Q[B["~++-Hr    )Nz<sound>zTranscribe the input speech.iX  )r6   torch.Tensorr2   r   )NFrA   )r   r   r   __doc__r-   r9   r
   rT   r   boolr	   r   r   r}   propertyrD   r   r   r   r   __classcell__)r1   s   @r!   r$   r$   4   s   6 %CT$ $(%*	cC$y/)cC D cC d{	cC
 67cC 
cCJ T49 T T *.O
T#Y+O
 d3i$&O
 67	O

 
O
b 05 s s r    r$   )rf   r   npaudio_utilsr   r   feature_extraction_utilsr   processing_utilsr   r   r	   tokenization_utils_baser
   utilsr   r   ra   
get_loggerr   r[   r   r$   __all__r   r    r!   <module>r      sf     
  9 4 H H 0 0  
		H	%$4E $Jn JZ %
%r    