
    qiI                     f   d dl mZ d dlZddlmZ ddlmZmZ ddl	m
Z
 ddlmZ ddlmZ dd	lmZmZ dd
lmZ ddlmZ ddlmZmZmZmZ ddlmZmZ ddlmZ ddl m!Z!m"Z"m#Z# ddl$m%Z%m&Z& ddl'm(Z( ddl)m*Z*m+Z+m,Z, ddl-m.Z.m/Z/  e       r
d dl0Z0d dl0m1Z1  ejd                  e3      Z4 G d de&      Z5 G d de%      Z6 G d de(      Z7d/dZ8 G d de*      Z9 G d  d!e1jt                        Z; G d" d#e      Z< G d$ d%e#      Z= G d& d'e=      Z> G d( d)e"      Z? ed*+       G d, d-e!             Z@g d.ZAy)0    )CallableN   )ACT2FN)
AudioInputmake_list_of_audio)Cache)BatchFeature)GradientCheckpointingLayer)BaseModelOutputWithPoolingCausalLMOutputWithPast)ALL_ATTENTION_FUNCTIONS)Unpack)TransformersKwargsauto_docstringis_torch_availablelogging)can_return_tuplemerge_with_config_defaults)capture_outputs   )&AudioFlamingo3ForConditionalGeneration!AudioFlamingo3MultiModalProjectorAudioFlamingo3PreTrainedModel)AudioFlamingo3ProcessorAudioFlamingo3ProcessorKwargs)GlmRotaryEmbedding)LlamaAttentioneager_attention_forwardrotate_half   )GlmAsrConfigGlmAsrEncoderConfig)nnc                       e Zd Zy)GlmAsrProcessorKwargsN__name__
__module____qualname__     [/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/glmasr/modular_glmasr.pyr%   r%   1       r+   r%   c            	       z     e Zd ZdZ	 	 	 	 d
 fd	ZddZ	 ddeee   z  ez  deee   z  dz  de	e
   defd	Z xZS )GlmAsrProcessora  
    Constructs an GlmAsr processor which wraps an GlmAsr feature extractor and an GlmAsr
    tokenizer into a single processor.

    [`GlmAsrProcessor`] offers all the functionalities of [`WhisperFeatureExtractor`] and
    [`Qwen2TokenizerFast`]. See the [`~GlmAsrProcessor.__call__`] for more information.

    Args:
            feature_extractor ([`WhisperFeatureExtractor`]):
                The feature extractor is a required input.
            tokenizer ([`Qwen2TokenizerFast`]):
                The tokenizer is a required input.
            chat_template (`Optional[str]`, *optional*):
                The Jinja template to use for formatting the conversation. If not provided, the tokenizer's default chat
                template will be used.
            audio_token (`Optional[str]`, *optional*, defaults to `"<|pad|>`"):
                Special token used to represent audio inputs in the chat template.
            default_transcription_prompt (`str`, *optional*, defaults to `"Please transcribe this audio into text"`):
                Default prompt to use for transcription tasks when applying transcription requests.
            max_audio_len (`int`, *optional*, defaults to 655):
                Maximum length of audio sequences in seconds. Audio longer than this will be truncated.
                655 gives approximately 8192 tokens, corresponding to the maximum sequence length of the text model.
    Nc                 0    t         |   ||||||       y )N)chat_templateaudio_tokendefault_transcription_promptmax_audio_len)super__init__)selffeature_extractor	tokenizerr1   r2   r3   r4   	__class__s          r,   r6   zGlmAsrProcessor.__init__M   s)     	'#)E' 	 	
r+   returnc                 d    d}dD ]  \  }}}|d|z  z   |dz
  z
  dz
  |z  dz   } ||z
  |z  dz   }|S )N   )r    r   r    )r    r   r   r   r    r*   )r7   audio_lengthsmerge_factorpaddingkernel_sizestride
num_tokenss          r,   _get_audio_token_lengthz'GlmAsrProcessor._get_audio_token_length_   sc    ,B 	`(G[&*Q[8K!OLqPU[[^__M	` $l2|CaG
r+   audiopromptkwargsc           	         t        |t              r|g}nt        |t        t        f      r |rt	        d |D              rt        |      }nst        t        |            }t               rU|D cg c]J  }t        |t        j                        r,|j                         j                         j                         n|L }}t        |      }|dk(  rt        d      || j                  g|z  }nt        |t              r|g|z  }nt        |t        t        f      r}t        |      |k7  rt        dt        |       d| d      g }|D ]L  }||j                  | j                         !t        |t              r|j                  |       Ct!        d       nt!        d      t#        ||      D 	
cg c](  \  }	}
d	t        |
t              rd
|
dnd
|
dd|	dgdg* }}	}
 | j$                  |fdddd|S c c}w c c}
}	w )a	  
        Prepare inputs for automatic speech recognition without manually writing the default transcription prompt.

        Args:
            audio (`str`, `list[str]`, `np.ndarray`, `torch.Tensor`, `list[np.ndarray]`, `list[torch.Tensor]`):
                Audio to transcribe. Strings are interpreted as local paths or URLs and will be loaded automatically by
                the chat template loader; NumPy arrays and PyTorch tensors are forwarded directly.
            prompt (`str` or `list[str]`, *optional*):
                Custom prompt(s) to include in the user turn. A list must be the same length as the batch. When `None`,
                each sample uses `"Transcribe the input speech."`.
            **kwargs:
                Additional keyword arguments forwarded to [`~GlmAsrProcessor.apply_chat_template`] (for example
                `text_kwargs`, `audio_kwargs`, ...).

        Returns:
            [`BatchFeature`]: Processor outputs ready to be passed to [`GlmAsrForConditionalGeneration.generate`].

        c              3   <   K   | ]  }t        |t                y wN)
isinstancestr).0els     r,   	<genexpr>z>GlmAsrProcessor.apply_transcription_request.<locals>.<genexpr>   s     ?dXZ
2s@S?ds   r   z)`audio` must contain at least one sample.z	Received z prompt(s) for z$ audio sample(s); counts must match.z'Each prompt must be a string or `None`.z<`prompt` must be a string, a sequence of strings, or `None`.userrF   )typepath)rR   rF   text)rR   rT   )rolecontentT)tokenizeadd_generation_promptreturn_dict)rL   rM   listtupleallr   r   torchTensordetachcpunumpylen
ValueErrorr3   append	TypeErrorzipapply_chat_template)r7   rF   rG   rH   audio_itemsrO   
batch_sizepromptsitemprompt_text
audio_itemconversationss               r,   apply_transcription_requestz+GlmAsrProcessor.apply_transcription_requestg   s   2 eS!38'Ke}-%C?d^c?d<du+K1%89K!#kvwegJr5<<<Xryy{0668^``ww%
?HII>889JFG$h+Gu.6{j( F}OJ<Gkl  G O<NN4#D#DEc*NN4(#$MNNO Z[[ ,/w+D
 (Z # &j#6 ")*=&-
C!'=	 

 
 (t''
"&	

 
 	
S x4
s   -AG70-G<)Nz<|pad|>z&Please transcribe this audio into texti  )r?   torch.Tensorr;   rp   rK   )r'   r(   r)   __doc__r6   rE   rM   rZ   r   r   r%   r	   ro   __classcell__r:   s   @r,   r/   r/   4   su    8 %M
$ *.O
T#Y+O
 d3i$&O
 ./	O

 
O
r+   r/   c                       e Zd Zy)GlmAsrRotaryEmbeddingNr&   r*   r+   r,   ru   ru      r-   r+   ru   c                 h   |j                  |      }|j                  |      }|j                  d   }| dd |f   | d|d f   }}|dd |f   |d|d f   }
}	||z  t        |      |z  z   }|	|z  t        |	      |z  z   }t        j                  ||gd      }t        j                  ||
gd      }||fS )N.)dim)	unsqueezeshaper   r]   cat)qkcossinposition_idsunsqueeze_dim
rotary_dimq_rotq_passk_rotk_passq_embedk_embeds                r,   apply_rotary_pos_embr      s    
--
&C
--
&C2Jc;J;&'3
+;)<6Ec;J;&'3
+;)<6E s{{51C78Gs{{51C78G ii&)r2Gii&)r2GGr+   c                        e Zd Zdedef fdZ	 d
dej                  deej                  ej                  f   dz  de	e
   deej                  ej                  f   fd	Z xZS )GlmAsrAttentionconfig	layer_idxc                 $   t         |   ||       d| _        t        j                  |j
                  |j                  | j                  z  d      | _        t        j                  |j
                  |j                  | j                  z  d      | _
        t        j                  |j
                  |j                  | j                  z  d      | _        t        j                  |j                  | j                  z  |j
                  d      | _        y )NFT)bias)r5   r6   	is_causalr#   Linearhidden_sizenum_attention_headshead_dimq_projnum_key_value_headsk_projv_projo_projr7   r   r   r:   s      r,   r6   zGlmAsrAttention.__init__   s    +ii 2 2F4N4NQUQ^Q^4^eijii 2 2F4N4NQUQ^Q^4^ejkii 2 2F4N4NQUQ^Q^4^eijii : :T]] JFL^L^eijr+   Nhidden_statesposition_embeddingsrH   r;   c                    |j                   d d }g |d| j                  }| j                  |      j                  |      j	                  dd      }| j                  |      j                  |      j	                  dd      }| j                  |      j                  |      j	                  dd      }|\  }	}
t        |||	|
      \  }}t        j                  | j                  j                  t              } || |||fd | j                  sdn| j                  | j                  d|\  }} |j                   g |d j#                         }| j%                  |      }||fS )Nrw   r    r   g        )attention_maskdropoutscaling)rz   r   r   view	transposer   r   r   r   get_interfacer   _attn_implementationr   trainingattention_dropoutr   reshape
contiguousr   )r7   r   r   rH   input_shapehidden_shapequery_states
key_statesvalue_statesr~   r   attention_interfaceattn_outputattn_weightss                 r,   forwardzGlmAsrAttention.forward   sk    $))#2.88b8$--8{{=166|DNNqRST[[/44\BLLQPQR
{{=166|DNNqRST&S#7jRUWZ#[ j(?(M(MKK,,.E)
 %8		%

  #}}C$2H2HLL	%
 	%
!\ *k));;;;FFHkk+.L((r+   rK   r'   r(   r)   r!   intr6   r]   r^   r[   r   r   r   rr   rs   s   @r,   r   r      s~    k| k k IM!)||!) #5<<#=>E!) +,	!)
 
u||U\\)	*!)r+   r   c                   >     e Zd Z fdZdej
                  fdZ xZS )	GlmAsrMLPc                    t         |           t        j                  |j                  |j
                        | _        t        j                  |j
                  |j                        | _        t        |j                     | _
        y rK   )r5   r6   r#   r   r   intermediate_sizefc1fc2r   
hidden_actact_fnr7   r   r:   s     r,   r6   zGlmAsrMLP.__init__   s\    99V//1I1IJ99V55v7I7IJV../r+   r   c                 l    | j                  |      }| j                  |      }| j                  |      }|S rK   )r   r   r   )r7   r   s     r,   r   zGlmAsrMLP.forward  s2    /M2/r+   )r'   r(   r)   r6   r]   r^   r   rr   rs   s   @r,   r   r      s    0U\\ r+   r   c            	            e Zd Zdedef fdZ	 d
dej                  deej                  ej                  f   dz  de	e
   dej                  fd	Z xZS )GlmAsrEncoderLayerr   r   c                    t         |           |j                  | _        t        ||      | _        t        |      | _        t        j                  |j                        | _	        t        j                  |j                        | _
        y )N)r   r   )r5   r6   r   r   	self_attnr   mlpr#   	LayerNorminput_layernormpost_attention_layernormr   s      r,   r6   zGlmAsrEncoderLayer.__init__
  sd    !--()LV$!||F,>,>?(*V5G5G(H%r+   Nr   r   rH   r;   c                     |}| j                  |      } | j                  d||d|\  }}||z   }|}| j                  |      }| j                  |      }||z   }|S )N)r   r   r*   )r   r   r   r   )r7   r   r   rH   residual_s         r,   r   zGlmAsrEncoderLayer.forward  s     !,,];)4>> 
' 3
 
q
 !=0 !55mD/ =0r+   rK   r   rs   s   @r,   r   r   	  sp    I| I I IM|| #5<<#=>E +,	
 
r+   r   c                       e Zd Zy)GlmAsrPreTrainedModelNr&   r*   r+   r,   r   r   ,  r-   r+   r   c                   x     e Zd ZU eed<   dZdZdgZee	dZ
def fdZeeedee   fd                     Z xZS )	GlmAsrEncoderr   input_featuresrF   r   )r   
attentionsc           	         t         |   |       t        j                  |j                  |j
                  dd      | _        t        j                  |j
                  |j
                  ddd      | _        t        j                  t        |j                        D cg c]  }t        ||       c}      | _        t        j                  |j
                        | _        t        |      | _        d| _        | j%                          y c c}w )Nr   r    )rB   rA   r   )rB   rC   rA   )r   F)r5   r6   r#   Conv1dnum_mel_binsr   conv1conv2
ModuleListrangenum_hidden_layersr   layersr   normru   
rotary_embgradient_checkpointing	post_initr   s      r,   r6   zGlmAsrEncoder.__init__:  s     YYv22F4F4FTU_`a
YYv1163E3EST]^hij
mmDI&JbJbDcdy	2d
 LL!3!34	/v>&+# es   DrH   c                    t         j                  j                  | j                  |            }t         j                  j                  | j	                  |            }|j                  dd      }|}| j                  |t        j                  |j                  d   |j                        d d d f         }| j                  D ]  } ||fd|i|} | j                  |      }t        |      S )Nr    r   device)r   r   )last_hidden_state)r#   
functionalgelur   r   r   r   r]   arangerz   r   r   r   r   )r7   r   rH   inputs_embedsr   r   encoder_layers          r,   r   zGlmAsrEncoder.forwardG  s     **4::n+EF**4::m+DE%//15%"oo]5H5H5KTaThTh(ijnpqjq(r . 
 "[[ 	lM)-kM`kdjkM	l 		-0)MJJr+   )r'   r(   r)   r"   __annotations__main_input_nameinput_modalities_no_split_modulesr   r   _can_record_outputsr6   r   r   r   r   r   r   rr   rs   s   @r,   r   r   0  sl    &O-.+%
2   K7I0J K    Kr+   r   c                   $     e Zd Zdef fdZ xZS )GlmAsrMultiModalProjectorr   c                 :   t         |           t        j                  |j                  j
                  |j                  j                  dz        | _        t        j                  |j                  j                  dz  |j                  j                        | _	        y )Nr   )
r5   r6   r#   r   audio_configr   text_configr   linear_1linear_2r   s     r,   r6   z"GlmAsrMultiModalProjector.__init__\  sm    		&"5"5"G"GI[I[IgIgjkIkl		&"4"4"@"@1"DfFXFXFdFder+   )r'   r(   r)   r!   r6   rr   rs   s   @r,   r   r   [  s    f| f fr+   r   z~
    The GlmAsr model which consists of a fine-tuned Whisper encoder, a multi-modal projector and a Llama language model.
    custom_introc                       e Zd Ze ed      dej                  dej                  dee	   de
ez  fd              Z	 	 	 	 	 	 	 	 	 	 	 dd	ej                  dz  dej                  dz  dej                  dz  d
ej                  dz  dej                  dz  dedz  dej                  dz  dej                  dz  dedz  dej                  dz  deej                  z  dee	   def fdZ xZS )GlmAsrForConditionalGenerationzgCompute audio embeddings from log-mel input features using the audio encoder and multi-modal projector.r   r   input_features_maskrH   r;   c                 *    | j                   |fddi|}|j                  }|j                  |j                  d   d| j                  j
                  j                        }| j                  |      }|j                  d      }dD ]  \  }}	}
|d|z  z   |	dz
  z
  dz
  |
z  dz   } d}||z
  |z  dz   }t        j                  |j                  d   |j                  	      d d d f   |d d d f   k  }||j                  |j                           |_        |S )
NrY   Tr   rw   r>   r   r    r=   r   )audio_towerr   r   rz   r   r   r   multi_modal_projectorsumr]   r   r   topooler_output)r7   r   r   rH   audio_outputsaudio_hidden_statesaudio_embedsr?   rA   rB   rC   r@   post_lengths
valid_masks                 r,   get_audio_featuresz1GlmAsrForConditionalGeneration.get_audio_featuresh  s<    )((TTTVT+==199  #R)A)A)S)S
 112EF+//3,B 	`(G[&*Q[8K!OLqPU[[^__M	`%4EI\\,"4"4Q"7@S@STUY[\U\]`lmnptmt`uu
&2:==ATAT3U&V#r+   N	input_idsr   r   past_key_valuesr   labels	use_cachecache_positionlogits_to_keepc                 6    t        |   d|||||||	|
|d	|S )a  
        input_features_mask (`torch.Tensor` of shape `(batch_size, feature_sequence_length)`):
            Mask to avoid performing attention on padding feature indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import GlmAsrForConditionalGeneration, AutoProcessor

        >>> model_id = "zai-org/GLM-ASR-Nano-2512"
        >>> processor = AutoProcessor.from_pretrained(model_id)
        >>> model = GlmAsrForConditionalGeneration.from_pretrained(model_id, dtype="auto", device_map="auto")
        >>> inputs = processor.apply_transcription_request("https://huggingface.co/datasets/hf-internal-testing/dummy-audio-samples/resolve/main/bcn_weather.mp3")

        >>> inputs = inputs.to(model.device, dtype=model.dtype)

        >>> outputs = model.generate(**inputs, do_sample=False, max_new_tokens=500)

        >>> decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1] :], skip_special_tokens=True)
        >>> print(decoded_outputs)
        ```)	r  r   r   r  r   r  r  r  r  r*   )r5   r   )r7   r  r   r   r   r   r  r   r  r  r  r  rH   r:   s                r,   r   z&GlmAsrForConditionalGeneration.forward  sA    V w 
)%+'))
 
 	
r+   )NNNNNNNNNNr   )r'   r(   r)   r   r   r]   FloatTensorr^   r   r   r[   r   r  
LongTensorr   boolr   r   r   rr   rs   s   @r,   r   r   b  s{    ~)) #\\ +,	
 
+	+ 4 .23737.204(,26*.!%26-.6
##d*6
 ))D06
 #\\D0	6

 t+6
 &&-6
 6
 ((4/6
   4'6
 $;6
 ((4/6
 ell*6
 +,6
 
 6
 6
r+   r   )r   r   r/   r   )Nr    )Bcollections.abcr   ra   npactivationsr   audio_utilsr   r   cache_utilsr   feature_extraction_utilsr	   modeling_layersr
   modeling_outputsr   r   modeling_utilsr   processing_utilsr   utilsr   r   r   r   utils.genericr   r   utils.output_capturingr   &audioflamingo3.modeling_audioflamingo3r   r   r   (audioflamingo3.processing_audioflamingo3r   r   glm.modeling_glmr   llama.modeling_llamar   r   r   configuration_glmasrr!   r"   r]   r#   
get_loggerr'   loggerr%   r/   ru   r   r   Moduler   r   r   r   r   r   __all__r*   r+   r,   <module>r"     s#   %  ! 9   4 9 R 5 & T T I 5 
 n 1 W W C  
		H	% @9 ?B
- B
J 5. 4$*)n *)Z		  3  F @9 ?(K) (KVf A f 
S
%K S

S
l jr+   