
import torch
from torch import nn

from ...activations import ACT2FN
from ...cache_utils import Cache
from ...masking_utils import create_bidirectional_mask
from ...modeling_outputs import BaseModelOutputWithPooling, CausalLMOutputWithPast
from ...processing_utils import Unpack
from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging
from ...utils.generic import merge_with_config_defaults
from ...utils.output_capturing import capture_outputs
from ..qwen2_audio.modeling_qwen2_audio import Qwen2AudioEncoder, Qwen2AudioPreTrainedModel
from ..voxtral.modeling_voxtral import VoxtralForConditionalGeneration, VoxtralMultiModalProjector
from ..whisper.modeling_whisper import WhisperAttention, WhisperEncoderLayer
from .configuration_audioflamingo3 import AudioFlamingo3Config


logger = logging.get_logger(__name__)


class AudioFlamingo3Attention(WhisperAttention):
    pass


class AudioFlamingo3EncoderLayer(WhisperEncoderLayer):
    pass


class AudioFlamingo3PreTrainedModel(Qwen2AudioPreTrainedModel):
    pass


@auto_docstring(
    custom_intro="""
    The audio model from AudioFlamingo3 without any head or projection on top.
    """
)
class AudioFlamingo3Encoder(Qwen2AudioEncoder):
    """
    AudioFlamingo3 encoder: Whisper encoder, average pool (time/2), then LayerNorm.
    """

    _can_record_outputs = {"hidden_states": AudioFlamingo3EncoderLayer, "attentions": AudioFlamingo3Attention}

    @merge_with_config_defaults
    @capture_outputs
    def forward(
        self,
        input_features: torch.Tensor,
        input_features_mask: torch.Tensor | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple | BaseModelOutputWithPooling:
  dz  dz   }|j                  d      }|dz
  dz  dz   }t        j                  ||j                        |dddf   k  }t
        j                  j                  | j                  |            }t
        j                  j                  | j                  |            }|j                  ddd      }|| j                  j                  z   }t
        j                  j                  || j                  | j                        }t        | j                   ||      }| j"                  D ]A  }	| j                  xr" t        j$                  g       | j&                  k  }
|
r6 |	||      d   }C |j                  ddd      }| j)                  |      j                  ddd      }| j+                  |      }t-        |	      S )
ap  
        Args:
            input_features (`torch.FloatTensor` of shape `(batch_size, feature_size, sequence_length)`):
                Log-Mel features extracted from raw audio. Use the processor/feature extractor to compute and pad
                these features from waveform input.
            input_features_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding feature indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.
        r   r   deviceNr   )ptraining)configinputs_embedsattention_mask)last_hidden_state)shapesumtorcharanger4   r   
functionalgeluconv1conv2permuteembed_positionsweightdropoutr6   r   r7   layersrand	layerdrop
avg_pooler
layer_normr   )selfr.   r/   kwargsseq_leninput_features_lengthsr8   r,   r9   layerdrops              r"   forwardzAudioFlamingo3Encoder.forwardB   s   ( "''+a/A59!4!8!8!<"81"<!BQ!F#ll7>;P;PQTjklnrkrTss **4::n+EF**4::m+DE%--aA6 &(<(<(C(CC--mt||VZVcVc-d2;;'.
 [[ 	HE==DUZZ^dnn%DD %m^ DQ G	H &--aA66>>q!QG6)+
 	
r!   N)r   r   r   __doc__r%   r   _can_record_outputsr   r   r=   Tensortupler   rR   r    r!   r"   r+   r+   3   sg    
 4-
   483
3
 #\\D03

 
+	+3
   3
r!   r+   c                   (     e Zd ZdZdef fdZ xZS )!AudioFlamingo3MultiModalProjectorz
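

# Illustrative length math (assuming the standard 30 s Whisper mel window of 3000 frames):
# conv2 with stride 2 yields (3000 - 1) // 2 + 1 = 1500 positions, and the stride-2 average
# pool halves that again, so `last_hidden_state` carries 750 frames per 30 s window.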


class AudioFlamingo3MultiModalProjector(VoxtralMultiModalProjector):
    """
    Audio adaptor (small MLP) that projects AudioFlamingo3Encoder features
    to the LLM embedding space so they can replace `<sound>` tokens.
    """

    def __init__(self, config: AudioFlamingo3Config):
        super().__init__()
        self.linear_1 = nn.Linear(
            config.audio_config.hidden_size, config.text_config.hidden_size, bias=config.projector_bias
        )
        self.act = ACT2FN[config.projector_hidden_act]
        self.linear_2 = nn.Linear(
            config.text_config.hidden_size, config.text_config.hidden_size, bias=config.projector_bias
        )
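

# Shape sketch (illustrative): pooled encoder frames of width `audio_config.hidden_size` pass
# through linear_1 -> act -> linear_2 to width `text_config.hidden_size`, so each audio frame
# becomes one pseudo-token in the language model's embedding space.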


@auto_docstring(
    custom_intro="""
    The AudioFlamingo3 model which consists of a fine-tuned Whisper encoder, a multi-modal projector and a Qwen2 language model.
    """
)
class AudioFlamingo3ForConditionalGeneration(VoxtralForConditionalGeneration):
    _tp_plan = None
    _pp_plan = None
    _keep_in_fp32_modules_strict = None

    def __init__(self, config):
        super().__init__(config)

    @can_return_tuple
    @auto_docstring(
        custom_intro="This method is used to get the audio embeddings from input features (a log mel spectrogram), meaning inferring the audio encoder and the multi-modal projector."
    )
    def get_audio_features(
        self,
        input_features: torch.FloatTensor,
        input_features_mask: torch.Tensor,
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple | BaseModelOutputWithPooling:
        r"""
        input_features (`torch.FloatTensor`):
            Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be
            obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
            `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
            `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the mel features, padding
            and conversion into a tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`]
        input_features_mask (`torch.Tensor` of shape `(batch_size, feature_sequence_length)`):
            Mask to avoid performing attention on padded feature indices.
        """
        audio_output = self.audio_tower(
            input_features, input_features_mask=input_features_mask, return_dict=True, **kwargs
        )
        audio_embeds = self.multi_modal_projector(audio_output.last_hidden_state)

        # Keep only embeddings of non-padded frames (conv stride 2, then avg-pool stride 2)
        post_lengths = (input_features_mask.sum(-1) // 2 - 1) // 2 + 1
        valid_mask = (
            torch.arange(audio_embeds.shape[1], device=audio_embeds.device)[None, :] < post_lengths[:, None]
        )
        audio_embeds = audio_embeds[valid_mask.to(audio_embeds.device)]
        audio_output.pooler_output = audio_embeds
        return audio_output

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor | None = None,
        input_features: torch.FloatTensor | None = None,
        input_features_mask: torch.Tensor | None = None,
        attention_mask: torch.Tensor | None = None,
        position_ids: torch.LongTensor | None = None,
        past_key_values: Cache | None = None,
        inputs_embeds: torch.FloatTensor | None = None,
        labels: torch.LongTensor | None = None,
        use_cache: bool | None = None,
        cache_position: torch.LongTensor | None = None,
        logits_to_keep: int | torch.Tensor = 0,
        **kwargs: Unpack[TransformersKwargs],
    ) -> CausalLMOutputWithPast:
        r"""
        input_features_mask (`torch.Tensor` of shape `(batch_size, feature_sequence_length)`):
            Mask to avoid performing attention on padding feature indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AudioFlamingo3ForConditionalGeneration, AutoProcessor

        >>> model_id = "nvidia/audio-flamingo-3-hf"
        >>> processor = AutoProcessor.from_pretrained(model_id)
        >>> model = AudioFlamingo3ForConditionalGeneration.from_pretrained(model_id, device_map="auto")

        >>> conversations = [
        >>>     [
        >>>         {
        >>>             "role": "user",
        >>>             "content": [
        >>>                 {"type": "text", "text": "Transcribe the input speech."},
        >>>                 {
        >>>                     "type": "audio",
        >>>                     "path": "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/t_837b89f2-26aa-4ee2-bdf6-f73f0dd59b26.wav",
        >>>                 },
        >>>             ],
        >>>         }
        >>>     ],
        >>>     [
        >>>         {
        >>>             "role": "user",
        >>>             "content": [
        >>>                 {
        >>>                     "type": "text",
        >>>                     "text": "This track feels really peaceful and introspective. What elements make it feel so calming and meditative?",
        >>>                 },
        >>>                 {"type": "audio", "path": "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/FPSbCAANfbJLVSwD.mp3"},
        >>>             ],
        >>>         }
        >>>     ],
        >>> ]

        >>> inputs = processor.apply_chat_template(
        >>>     conversations,
        >>>     tokenize=True,
        >>>     add_generation_prompt=True,
        >>>     return_dict=True,
        >>> ).to(model.device)

        >>> outputs = model.generate(**inputs, max_new_tokens=500)

        >>> decoded_outputs = processor.batch_decode(
        >>>     outputs[:, inputs["input_ids"].shape[1]:], skip_special_tokens=True
        >>> )
        >>> print(decoded_outputs)
        ["The spoken content of the audio is...", "The track's calming and meditative feel can be attributed to..."]
        ```"""
        if inputs_embeds is None:
            inputs_embeds = self.get_input_embeddings()(input_ids)

        if input_features is not None:
            audio_embeds = self.get_audio_features(
                input_features, input_features_mask, return_dict=True
            ).pooler_output
            # Replace `<sound>` placeholder tokens with the projected audio embeddings
            audio_token_mask = (input_ids == self.config.audio_token_id).unsqueeze(-1)
            inputs_embeds = inputs_embeds.masked_scatter(
                audio_token_mask.to(inputs_embeds.device), audio_embeds.to(inputs_embeds.device)
            )

        outputs = self.language_model(
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            labels=labels,
            use_cache=use_cache,
            cache_position=cache_position,
            logits_to_keep=logits_to_keep,
            **kwargs,
        )

        return outputs
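
    # Generation-time note: audio features are only consumed on the prefill forward pass; later
    # decoding steps rely on the KV cache, which already reflects the merged audio embeddings,
    # so `prepare_inputs_for_generation` below drops them after the first step.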
    def prepare_inputs_for_generation(self, *args, **kwargs):
        # Overwritten -- pop the audio inputs so they are not forwarded on cached decoding steps
        input_features = kwargs.pop("input_features", None)
        input_features_mask = kwargs.pop("input_features_mask", None)
        cache_position = kwargs.get("cache_position")

        model_inputs = super().prepare_inputs_for_generation(*args, **kwargs)

        if cache_position is not None and cache_position[0] == 0:
            model_inputs["input_features"] = input_features
            model_inputs["input_features_mask"] = input_features_mask

        return model_inputs


__all__ = ["AudioFlamingo3ForConditionalGeneration", "AudioFlamingo3PreTrainedModel", "AudioFlamingo3Encoder"]