
    qig                        d dl Z d dlmZ d dlZd dlmZ ddlmZ ddlmZm	Z	 ddl
mZ ddlmZ dd	lmZ dd
lmZ ddlmZmZ ddlmZmZ ddlmZ ddlmZmZmZmZ ddlm Z  ddl!m"Z" ddl#m$Z$m%Z% ddl&m'Z'm(Z(  ejR                  e*      Z+	 	 d-dejX                  dejZ                  dejZ                  dejZ                  dejZ                  dz  de.dz  de.fdZ/ G d dejX                        Z0 G d d e      Z1e G d! d"e             Z2 ed#$       G d% d&e2             Z3 G d' d(ejX                        Z4 ed)$       G d* d+e2e             Z5g d,Z6y).    N)Callable)nn   )ACT2FN)CacheEncoderDecoderCache)GenerationMixin)create_bidirectional_mask)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutputWithPoolingCausalLMOutputWithPast)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuplelogging)merge_with_config_defaults)capture_outputs   )	AutoModelAutoModelForCausalLM   )AudioFlamingo3ConfigAudioFlamingo3EncoderConfigmodulequerykeyvalueattention_maskscalingdropoutc                    ||j                  d      dz  }t        j                  ||j                  dd            |z  }|||z   }t        j
                  j                  |d      }t        j
                  j                  ||| j                        }t        j                  ||      }	|	j                  dd      j                         }	|	|fS )N      r   r   )dimptrainingr   )
sizetorchmatmul	transposer   
functionalsoftmaxr$   r+   
contiguous)
r   r   r    r!   r"   r#   r$   kwargsattn_weightsattn_outputs
             l/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/audioflamingo3/modeling_audioflamingo3.pyeager_attention_forwardr7   /   s     **R.D(<<s}}Q':;gEL!#n4==((2(>L==((6??([L,,|U3K''1-88:K$$    c                   Z    e Zd ZdZ	 	 	 	 	 	 ddedededededed	edz  d
edz  f fdZ	 	 	 	 	 dde	j                  de	j                  dz  dedz  de	j                  dz  dede	j                  dz  dee   dee	j                  e	j                  dz  ee	j                     dz  f   fdZ xZS )AudioFlamingo3Attentionz=Multi-headed attention from 'Attention Is All You Need' paperN	embed_dim	num_headsr$   
is_decoderbias	is_causal	layer_idxconfigc	                 z   t         	|           || _        || _        || _        ||z  | _        || _        | j
                  |z  | j                  k7  rt        d| j                   d| d      | j
                  dz  | _        || _	        || _
        |/|r-t        j                  d| j                  j                   d       || _        t!        j"                  ||d      | _        t!        j"                  |||      | _        t!        j"                  |||      | _        t!        j"                  |||      | _        y )	Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).r'   zInstantiating a decoder z without passing `layer_idx` is not recommended and will to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` when creating this class.Fr>   )super__init__r;   r<   r$   head_dimrA   
ValueErrorr#   r=   r?   loggerwarning_once	__class____name__r@   r   Lineark_projv_projq_projout_proj)
selfr;   r<   r$   r=   r>   r?   r@   rA   rJ   s
            r6   rE   z AudioFlamingo3Attention.__init__L   s     	""!Y.MMI%$..8MdnnM]$YKr3  }}d*$"*4>>+B+B*C D, ,
 #ii	95Aii	94@ii	94@		)YTBr8   hidden_stateskey_value_statespast_key_valuesr"   output_attentionscache_positionr3   returnc                 6   |du}|j                   dd \  }	}
|	|
d| j                  f}| j                  |      | j                  z  } |j                  | }|j                  dd      j                         }|it        |t              rY|j                  j                  | j                        }|r&d|j                  | j                  <   |j                  }n|j                  }||n|}|rK|rIrG|j                  | j                     j                  }|j                  | j                     j                   }n| j#                  |      j	                  |	d| j$                  | j                        }| j'                  |      j	                  |	d| j$                  | j                        }|j                  dd      j                         }|j                  dd      j                         }|)|s|nd}|j)                  ||| j                  d|i      \  }}t+        j,                  | j.                  j0                  t2              } || ||||f| j4                  sdn| j6                  d|d	|\  }}|j9                  |	|
d      j                         }| j;                  |      }||fS )
z#Input shape: Batch x Time x ChannelNr&   r   r   TrV                 ?)r$   r#   rU   )shaperF   rO   r#   viewr/   r2   
isinstancer   
is_updatedgetr@   cross_attention_cacheself_attention_cachelayerskeysvaluesrM   r<   rN   updater   get_interfacerA   _attn_implementationr7   r+   r$   reshaperP   )rQ   rR   rS   rT   r"   rU   rV   r3   is_cross_attentionbsztgt_lenq_input_shapequery_statesr^   current_states
key_statesvalue_statesattention_interfacer5   r4   s                       r6   forwardzAudioFlamingo3Attention.forwardt   s     .T9 %**3B/Wgr4==9 {{=1DLL@(|((-8#--a3>>@ &:oGZ+[(3377GJ!=A**4>>:"1"G"G"1"F"F .>-I)}/j(//?DDJ*11$..AHHL^499#r4>>SWS`S`aJ;;~6;;CT^^UYUbUbcL#--a3>>@J'11!Q7BBDL*7It+:+A+Adnn?OQ_>`,(
L )@(M(MKK,,.E)
 %8
%
  $}}C$,,/
%
 
%
!\ "))#w;FFHmmK0L((r8   )rY   FTFNN)NNNFN)rK   
__module____qualname____doc__intfloatboolr   rE   r-   Tensorr   r   r   tuplerr   __classcell__rJ   s   @r6   r:   r:   I   s=   G   $.2&C&C &C 	&C
 &C &C &C :&C %t+&CV 15(,.2"'.2N)||N)  ,,-N) 	N)
 t+N)  N) t+N) -.N) 
u||U\\D0%2E2LL	MN)r8   r:   c            	       |     e Zd Zdef fdZ	 ddej                  dej                  dedej                  fdZ xZ	S )	AudioFlamingo3EncoderLayerrA   c                 h   t         |           |j                  | _        t	        | j                  |j
                  |j                  |      | _        t        j                  | j                        | _
        |j                  | _        t        |j                     | _        |j                  | _        t        j                   | j                  |j"                        | _        t        j                   |j"                  | j                        | _        t        j                  | j                        | _        y )N)r;   r<   r$   rA   )rD   rE   d_modelr;   r:   encoder_attention_headsattention_dropout	self_attnr   	LayerNormself_attn_layer_normr$   r   activation_functionactivation_fnactivation_dropoutrL   encoder_ffn_dimfc1fc2final_layer_normrQ   rA   rJ   s     r6   rE   z#AudioFlamingo3EncoderLayer.__init__   s    0nn44,,	
 %'LL$@!~~#F$>$>?"(";";99T^^V-C-CD99V33T^^D "T^^ <r8   rR   r"   rU   rW   c                    |}| j                  |      }| j                  |||      \  }}t        j                  j	                  || j                  | j
                        }||z   }|}| j                  |      }| j                  | j                  |            }t        j                  j	                  || j                  | j
                        }| j                  |      }t        j                  j	                  || j                  | j
                        }||z   }|j                  t        j                  k(  rEt        j                  |j                        j                  dz
  }t        j                   || |      }||fS )a  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        )rR   r"   rU   r)   i  )minmax)r   r   r   r0   r$   r+   r   r   r   r   r   dtyper-   float16finfor   clamp)rQ   rR   r"   rU   residualr4   clamp_values          r6   rr   z"AudioFlamingo3EncoderLayer.forward   sP    !11-@&*nn')/ '5 '
#|
 --mt||VZVcVc-d =0 --m<**488M+BC--mt?V?Vaeanan-o/--mt||VZVcVc-d =0%--/++m&9&9:>>EK!KKK<[YMl**r8   )F)
rK   rs   rt   r   rE   r-   ry   rx   rr   r{   r|   s   @r6   r~   r~      sL    =3 =, #(	%+||%+ %+  	%+
 
%+r8   r~   c                   6    e Zd ZU eed<   dZdZdZdgZdZ	dZ
dZy)AudioFlamingo3PreTrainedModelrA   model)audiotextTr:   rT   N)rK   rs   rt   r   __annotations__base_model_prefixinput_modalitiessupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa r8   r6   r   r      s4      (&*#23"3Nr8   r   zT
    The audio model from AudioFlamingo3 without any head or projection on top.
    custom_introc            
           e Zd ZU dZeed<   dZdZdgZe	e
dZdef fdZd Zd	ej                  fd
Zdej                  fdZee	 ddej*                  dej*                  dz  d	eez  fd              Zdej2                  fdZ xZS )AudioFlamingo3EncoderzY
    AudioFlamingo3 encoder: Whisper encoder, average pool (time/2), then LayerNorm.
    rA   input_featuresr   r~   )rR   
attentionsc                 b   t         |   |       |j                  | _        |j                  | _        |j
                  }|j                  | _        |j                  | _        |j                  rt        j                  |      nd| _        t        j                  | j                  |dd      | _        t        j                  ||ddd      | _        t        j                   | j                  |      | _        | j"                  j%                  d       t        j&                  t)        |j*                        D cg c]  }t-        |       c}      | _        t        j0                  |j
                        | _        t        j4                  dd      | _        d| _        | j;                          y c c}w )	NrZ   r   r   )kernel_sizepaddingr   )r   strider   F)r   )rD   rE   r$   encoder_layerdrop	layerdropr   num_mel_binsmax_source_positionsscale_embeddingmathsqrtembed_scaler   Conv1dconv1conv2	Embeddingembed_positionsrequires_grad_
ModuleListrangeencoder_layersr~   rb   r   
layer_norm	AvgPool1d
avg_poolergradient_checkpointing	post_init)rQ   rA   r;   _rJ   s       r6   rE   zAudioFlamingo3Encoder.__init__!  s7    ~~11NN	"//$*$?$?!393I3I499Y/sYYt00)TUV
YYy)1VWX
!||D,E,EyQ++E2mmQVW]WlWlQm$nA%?%G$no,,v~~6,,q3&+# %os   6F,c                 J    | j                         D ]	  }d|_         d| _        y )NF)
parametersrequires_grad_requires_grad)rQ   params     r6   _freeze_parametersz(AudioFlamingo3Encoder._freeze_parameters:  s(    __& 	(E"'E	(#r8   rW   c                     | j                   S Nr   rQ   s    r6   get_input_embeddingsz*AudioFlamingo3Encoder.get_input_embeddings?  s    zzr8   r!   c                     || _         y r   r   rQ   r!   s     r6   set_input_embeddingsz*AudioFlamingo3Encoder.set_input_embeddingsB  s	    
r8   Ninput_features_maskc                    |j                   d   dz
  dz  dz   }|j                  d      }|dz
  dz  dz   }t        j                  ||j                        |dddf   k  }t
        j                  j                  | j                  |            }t
        j                  j                  | j                  |            }|j                  ddd      }|| j                  j                  z   }t
        j                  j                  || j                  | j                        }t        | j                   ||      }| j"                  D ]A  }	| j                  xr" t        j$                  g       | j&                  k  }
|
r6 |	||      d   }C |j                  ddd      }| j)                  |      j                  ddd      }| j+                  |      }t-        |	      S )
ap  
        Args:
            input_features (`torch.FloatTensor` of shape `(batch_size, feature_size, sequence_length)`):
                Log-Mel features extracted from raw audio. Use the processor/feature extractor to compute and pad
                these features from waveform input.
            input_features_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding feature indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.
        r&   r   r   deviceNr   r)   )rA   inputs_embedsr"   )last_hidden_state)r[   sumr-   aranger   r   r0   gelur   r   permuter   weightr$   r+   r
   rA   rb   randr   r   r   r   )rQ   r   r   r3   seq_leninput_features_lengthsr   rR   r"   layerdrops              r6   rr   zAudioFlamingo3Encoder.forwardE  s   ( "''+a/A59!4!8!8!<"81"<!BQ!F#ll7>;P;PQTjklnrkrTss **4::n+EF**4::m+DE%--aA6 &(<(<(C(CC--mt||VZVcVc-d2;;'.
 [[ 	HE==DUZZ^dnn%DD %m^ DQ G	H &--aA66>>q!QG6)+
 	
r8   input_lengthsc                 6    |dz
  dz  dz   }|dz
  dz  dz   }||fS )zs
        Computes the output length of the convolutional layers and the output length of the audio encoder
        r   r   r   )rQ   r   output_lengthss      r6    _get_feat_extract_output_lengthsz6AudioFlamingo3Encoder._get_feat_extract_output_lengths}  s7     '*q014'!+1A5n,,r8   r   )rK   rs   rt   ru   r   r   main_input_namer   r   r~   r:   _can_record_outputsrE   r   r   Moduler   r   r   r   r-   ry   rz   r   rr   
LongTensorr   r{   r|   s   @r6   r   r     s    
 ('&O56 4-
: 2$
bii "))    483
3
 #\\D03

 
+	+3
   3
l-e>N>N -r8   r   c                   .     e Zd ZdZdef fdZd Z xZS )!AudioFlamingo3MultiModalProjectorz
    Audio adaptor (small MLP) that projects AudioFlamingo3Encoder features
    to the LLM embedding space so they can replace `<sound>` tokens.
    rA   c                    t         |           t        j                  |j                  j
                  |j                  j
                  |j                        | _        t        |j                     | _        t        j                  |j                  j
                  |j                  j
                  |j                        | _        y )NrC   )rD   rE   r   rL   audio_confighidden_sizetext_configprojector_biaslinear_1r   projector_hidden_actactlinear_2r   s     r6   rE   z*AudioFlamingo3MultiModalProjector.__init__  s    		++V-?-?-K-KRXRgRg
 &556		**F,>,>,J,JQWQfQf
r8   c                 l    | j                  |      }| j                  |      }| j                  |      }|S r   )r   r   r   )rQ   audio_featuresrR   s      r6   rr   z)AudioFlamingo3MultiModalProjector.forward  s2    n5/m4r8   )rK   rs   rt   ru   r   rE   rr   r{   r|   s   @r6   r   r     s    

3 
r8   r   z
    The AudioFlamingo3 model which consists of a fine-tuned Whisper encoder, a multi-modal projector and a Qwen2 language model.
    c                   *    e Zd ZdZdZdZ fdZd Zd Zd Z	d Z
d Zd Ze ed	
      dej                   dej"                  dee   deez  fd              Zee	 	 	 	 	 	 	 	 	 	 	 ddej.                  dz  dej                   dz  dej"                  dz  dej"                  dz  dej.                  dz  dedz  dej                   dz  dej.                  dz  dedz  dej.                  dz  deej"                  z  dee   defd              Z fdZ xZS )&AudioFlamingo3ForConditionalGenerationNc                 *   t         |   |       |j                  j                  | _        t	        j
                  |j                        | _        t        j
                  |j                        | _	        t        |      | _        | j                          y r   )rD   rE   r   
vocab_sizer   from_configr   audio_towerr   language_modelr   multi_modal_projectorr   r   s     r6   rE   z/AudioFlamingo3ForConditionalGeneration.__init__  sn      ,,77$001D1DE2>>v?Q?QR%Fv%N" 	r8   c                 6    | j                   j                         S r   )r   r   r   s    r6   r   z;AudioFlamingo3ForConditionalGeneration.get_input_embeddings  s    ""7799r8   c                 :    | j                   j                  |       y r   )r   r   r   s     r6   r   z;AudioFlamingo3ForConditionalGeneration.set_input_embeddings  s    007r8   c                 6    | j                   j                         S r   )r   get_output_embeddingsr   s    r6   r  z<AudioFlamingo3ForConditionalGeneration.get_output_embeddings  s    ""88::r8   c                 :    | j                   j                  |       y r   )r   set_output_embeddings)rQ   new_embeddingss     r6   r  z<AudioFlamingo3ForConditionalGeneration.set_output_embeddings  s    11.Ar8   c                 :    | j                   j                  |       y r   )r   set_decoder)rQ   decoders     r6   r	  z2AudioFlamingo3ForConditionalGeneration.set_decoder  s    ''0r8   c                 6    | j                   j                         S r   )r   get_decoderr   s    r6   r  z2AudioFlamingo3ForConditionalGeneration.get_decoder  s    ""..00r8   zThis method is used to get the audio embeddings from input features (a log mel spectrogram), meaning inferring the audio encoder and the multi-modal projector.r   r   r   r3   rW   c                 d    | j                   |f|dd|}| j                  |j                        }|j                  d      dz
  dz  dz   }t	        j
                  |j                  d   |j                        dddf   |dddf   k  }||j                  |j                           }||_	        |S )a
  
        input_features (`torch.FloatTensor`):
            Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be
            obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
            `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
            `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the mel features, padding
            and conversion into a tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`]
        input_features_mask (`torch.Tensor` of shape `(batch_size, feature_sequence_length)`):
            Mask to avoid performing attention on padded feature indices.
        T)r   return_dictr&   r   r   r   N)
r   r   r   r   r-   r   r[   r   topooler_output)rQ   r   r   r3   audio_outputaudio_embedspost_lengths
valid_masks           r6   get_audio_featuresz9AudioFlamingo3ForConditionalGeneration.get_audio_features  s    , (t''
0CQU
Y_
 11,2P2PQ ,//3a7A=A\\,"4"4Q"7@S@STUY[\U\]`lmnptmt`uu
#JMM,2E2E$FG%1"r8   	input_idsr"   position_idsrT   r   labels	use_cacherV   logits_to_keepc                    | | j                         |      }||| j                  ||d      j                  }|| j                  j                  k(  j                  d      }|j                  |j                  |j                        |j                  |j                              } | j                  d||||||	|
|d|}|S )a+  
        input_features_mask (`torch.Tensor` of shape `(batch_size, feature_sequence_length)`):
            Mask to avoid performing attention on padding feature indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AudioFlamingo3ForConditionalGeneration, AutoProcessor

        >>> model_id = "nvidia/audio-flamingo-3-hf"
        >>> processor = AutoProcessor.from_pretrained(model_id)
        >>> model = AudioFlamingo3ForConditionalGeneration.from_pretrained(model_id, device_map="auto")

        >>> conversations = [
        >>>     [
        >>>         {
        >>>             "role": "user",
        >>>             "content": [
        >>>                 {"type": "text", "text": "Transcribe the input speech."},
        >>>                 {
        >>>                     "type": "audio",
        >>>                     "path": "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/t_837b89f2-26aa-4ee2-bdf6-f73f0dd59b26.wav",
        >>>                 },
        >>>             ],
        >>>         }
        >>>     ],
        >>>     [
        >>>         {
        >>>             "role": "user",
        >>>             "content": [
        >>>                 {
        >>>                     "type": "text",
        >>>                     "text": "This track feels really peaceful and introspective. What elements make it feel so calming and meditative?",
        >>>                 },
        >>>                 {"type": "audio", "path": "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/FPSbCAANfbJLVSwD.mp3"},
        >>>             ],
        >>>         }
        >>>     ],
        >>> ]

        >>> inputs = processor.apply_chat_template(
        >>>     conversations,
        >>>     tokenize=True,
        >>>     add_generation_prompt=True,
        >>>     return_dict=True,
        >>> ).to(model.device)

        >>> outputs = model.generate(**inputs, max_new_tokens=500)

        >>> decoded_outputs = processor.batch_decode(
        >>>     outputs[:, inputs["input_ids"].shape[1]:], skip_special_tokens=True
        >>> )
        >>> print(decoded_outputs)
        ["The spoken content of the audio is...", "The track's calming and meditative feel can be attributed to..."]
        ```T)r  r&   )r   r"   r  rT   r  r  rV   r  r   )
r   r  r  rA   audio_token_id	unsqueezemasked_scatterr  r   r   )rQ   r  r   r   r"   r  rT   r   r  r  rV   r  r3   r  audio_token_maskoutputss                   r6   rr   z.AudioFlamingo3ForConditionalGeneration.forward  s    `  7D557	BM%)*?22>CVdh2iwwL !*T[[-G-G GRRSUV)88 ##M$8$89<??=K_K_;`M +>$*=*= 
+
')%+))
+
 
+
 r8   c                     |j                  dd       }|j                  dd       }|j                  d      }t        |   |i |}||d   d   dk(  r|||d<   |||d<   |S )Nr   r   rV   r   )popr_   rD   prepare_inputs_for_generation)rQ   argsr3   r   r   rV   model_inputsrJ   s          r6   r#  zDAudioFlamingo3ForConditionalGeneration.prepare_inputs_for_generationO  s      $4d;$jj)>E$45w<dMfM%,7G*H*Kq*P)1?-.".6I23r8   )NNNNNNNNNNr   )rK   rs   rt   _keep_in_fp32_modules_strict_tp_plan_pp_planrE   r   r   r  r  r	  r  r   r   r-   FloatTensorry   r   r   rz   r   r  r   r   rx   rv   r   rr   r#  r{   r|   s   @r6   r   r     s    $( HH:8;B11  w)) #\\ +,	
 
+	+ >  .23737.204(,26*.!%26-.e##d*e ))D0e #\\D0	e
 t+e &&-e e ((4/e   4'e $;e ((4/e ell*e +,e 
 e  eN r8   r   )r   r   r   )NrY   )7r   collections.abcr   r-   r   activationsr   cache_utilsr   r   
generationr	   masking_utilsr
   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   utils.genericr   utils.output_capturingr   autor   r   configuration_audioflamingo3r   r   
get_loggerrK   rH   r   ry   rw   r7   r:   r~   r   r   r   r   __all__r   r8   r6   <module>r;     sj  ,  $   ! 5 ) 6 B 9 R F & R R 7 5 2 [ 
		H	% !%II%<<% 
% <<	%
 LL4'% T\% %4y)bii y)x8+!; 8+v O   
r-9 r-
r-j		 . 
}-JO }
}@ or8   