
    qiY                     2   d dl mZ d dlmZ ddlmZ ddlmZ ddlm	Z	 ddl
mZ ddlmZ dd	lmZmZ dd
lmZmZ ddlmZmZ ddlmZ ddlmZmZmZ ddlmZmZm Z  ddl!m"Z" ddl#m$Z$m%Z% ddl&m'Z'm(Z(  e       r
d dl)Z)d dl)m*Z*  G d de*jV                        Z,d Z-de)j\                  de/de)j\                  fdZ0	 d8de*jV                  de)j\                  de)j\                  d e)j\                  d!e)j\                  dz  d"e1d#e1d$ee   fd%Z2d9d&Z3 ee3       G d' d(e*jV                               Z4 G d) d*e*jV                        Z5 G d+ d,e      Z6e G d- d.e             Z7 G d/ d0e7      Z8 G d1 d2e*jV                        Z9 ed34       G d5 d6e7e	             Z:g d7Z;y):    )Callable)Optional   )ACT2FN)Cache)GenerationMixin)use_kernelized_func)GradientCheckpointingLayer)BaseModelOutputWithPoolingCausalLMOutputWithPast)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringis_torch_available)can_return_tuplemaybe_autocastmerge_with_config_defaults)capture_outputs   )	AutoModelAutoModelForCausalLM   )GlmAsrConfigGlmAsrEncoderConfigN)nnc                        e Zd ZU ej                  ed<   ddef fdZe	 	 	 ddedz  de	d   de
dz  ded	ef   fd
       Z ej                         ed               Z xZS )GlmAsrRotaryEmbeddinginv_freqNconfigc                    t         |           |j                  | _        |j                  | _        || _        | j
                  j                  d   | _        | j                  }| j                  dk7  rt        | j                     } || j
                  |      \  }| _
        | j                  d|d       | j                  d|j                         d       y )N	rope_typedefaultr"   F)
persistentoriginal_inv_freq)super__init__max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenr#   rope_parametersr%   compute_default_rope_parametersr   attention_scalingregister_bufferclone)selfr#   devicerope_init_fnr"   	__class__s        \/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/glmasr/modeling_glmasr.pyr*   zGlmAsrRotaryEmbedding.__init__0   s    "("@"@$*$B$B!44[A!%!E!E>>Y&.t~~>L+7V+L($(ZeD0(..2BuU    r4   ztorch.deviceseq_lenreturnztorch.Tensorc                 n   | j                   d   }| j                   j                  dd      }t        | dd      xs | j                  | j                  z  }t        ||z        }d}d|t        j                  d|dt        j                        j                  |t        j                  	      |z  z  z  }||fS )
a  
        Computes the inverse frequencies according to the original RoPE implementation
        Args:
            config ([`~transformers.PreTrainedConfig`]):
                The model configuration.
            device (`torch.device`):
                The device to use for initialization of the inverse frequencies.
            seq_len (`int`, *optional*):
                The current sequence length. Unused for this type of RoPE.
        Returns:
            Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
            post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
        
rope_thetapartial_rotary_factorg      ?head_dimNr   r   dtype)r4   r@   )r.   getgetattrhidden_sizenum_attention_headsinttorcharangeint64tofloat)	r#   r4   r9   baser=   r>   dimattention_factorr"   s	            r7   r/   z5GlmAsrRotaryEmbedding.compute_default_rope_parameters@   s    & %%l3 & 6 6 : :;RTW X6:t4h8J8JfNhNh8h(223 U\\!S!5;;?BB&X]XcXcBdgjjk
 )))r8   c                 N   | j                   d d d d f   j                         j                  |j                  d   dd      j	                  |j
                        }|d d d d d f   j                         }t        |j
                  j                  t              r/|j
                  j                  dk7  r|j
                  j                  nd}t        |d      5  |j                         |j                         z  j                  dd      }t        j                  ||fd	      }|j                         | j                  z  }|j                         | j                  z  }	d d d        j	                  |j                   
      	j	                  |j                   
      fS # 1 sw Y   AxY w)Nr   r   mpscpuF)device_typeenabledr   rL   r?   )r"   rJ   expandshaperI   r4   
isinstancetypestrr   	transposerF   catcosr0   sinr@   )
r3   xposition_idsinv_freq_expandedposition_ids_expandedrR   freqsembr\   r]   s
             r7   forwardzGlmAsrRotaryEmbedding.forward`   sR    !MM$4-8>>@GGHZHZ[\H]_acdehhijiqiqr ,QaZ 8 > > @'1!((--'E!((--[`J`ahhmmfkUC 	5&,,.1F1L1L1NNYYZ[]^_E))UEN3C'')d444C'')d444C		5 vvAGGv$cff177f&;;;	5 	5s   BFF$N)NNN)__name__
__module____qualname__rF   Tensor__annotations__r   r*   staticmethodr   rE   tuplerJ   r/   no_gradr   rd   __classcell__r6   s   @r7   r!   r!   -   s    llV| V  &*+/"*t#*(* t* 
~u$	%	* *> U]]_<  <r8   r!   c                     | dd| j                   d   dz  f   }| d| j                   d   dz  df   }t        j                  | |fd      S )z*Rotates half the hidden dims of the input..NrO   r   rT   )rV   rF   r[   )r^   x1x2s      r7   rotate_halfrs   p   sZ    	
3"!''"+"""	#B	
3q ""	#B99rc2YB''r8   hidden_statesn_repr:   c                     | j                   \  }}}}|dk(  r| S | dddddddddf   j                  |||||      } | j                  |||z  ||      S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r   N)rV   rU   reshape)rt   ru   batchnum_key_value_headsslenr>   s         r7   	repeat_kvr{   w   so    
 2?1D1D.Ehz!!Qa"23::5BUW\^bdlmM  (;e(CT8TTr8   modulequerykeyvalueattention_maskscalingdropoutkwargsc                    t        || j                        }t        || j                        }	t        j                  ||j	                  dd            |z  }
||
|z   }
t
        j                  j                  |
dt        j                        j                  |j                        }
t
        j                  j                  |
|| j                        }
t        j                  |
|	      }|j	                  dd      j                         }||
fS )Nr   r   rO   )rL   r@   )ptrainingr   )r{   num_key_value_groupsrF   matmulrZ   r   
functionalsoftmaxfloat32rI   r@   r   r   
contiguous)r|   r}   r~   r   r   r   r   r   
key_statesvalue_statesattn_weightsattn_outputs               r7   eager_attention_forwardr      s     3 ; ;<JUF$?$?@L<<z';';Aq'ABWLL!#n4==((2U]](SVVW\WbWbcL==((6??([L,,|\:K''1-88:K$$r8   c                 h   |j                  |      }|j                  |      }|j                  d   }| dd |f   | d|d f   }}|dd |f   |d|d f   }
}	||z  t        |      |z  z   }|	|z  t        |	      |z  z   }t        j                  ||gd      }t        j                  ||
gd      }||fS )NrO   .rT   )	unsqueezerV   rs   rF   r[   )qkr\   r]   r_   unsqueeze_dim
rotary_dimq_rotq_passk_rotk_passq_embedk_embeds                r7   apply_rotary_pos_embr      s    
--
&C
--
&C2Jc;J;&'3
+;)<6Ec;J;&'3
+;)<6E s{{51C78Gs{{51C78G ii&)r2Gii&)r2GGr8   c                        e Zd ZdZdedef fdZ	 ddej                  de	ej                  ej                  f   dz  de
e   d	e	ej                  ej                  f   fd
Z xZS )GlmAsrAttentionz=Multi-headed attention from 'Attention Is All You Need' paperr#   	layer_idxc                    t         |           || _        || _        t	        |d|j
                  |j                  z        | _        |j                  |j                  z  | _	        | j                  dz  | _
        |j                  | _        d| _        t        j                  |j
                  |j                  | j                  z  d      | _        t        j                  |j
                  |j                  | j                  z  d      | _        t        j                  |j
                  |j                  | j                  z  d      | _        t        j                  |j                  | j                  z  |j
                  d      | _        y )Nr>   g      FT)bias)r)   r*   r#   r   rB   rC   rD   r>   ry   r   r   attention_dropout	is_causalr   Linearq_projk_projv_projo_projr3   r#   r   r6   s      r7   r*   zGlmAsrAttention.__init__   s,   "
F4F4F&JdJd4de$*$>$>&B\B\$\!}}d*!'!9!9ii 2 2F4N4NQUQ^Q^4^eijii 2 2F4N4NQUQ^Q^4^ejkii 2 2F4N4NQUQ^Q^4^eijii : :T]] JFL^L^eijr8   Nrt   position_embeddingsr   r:   c                    |j                   d d }g |d| j                  }| j                  |      j                  |      j	                  dd      }| j                  |      j                  |      j	                  dd      }| j                  |      j                  |      j	                  dd      }|\  }	}
t        |||	|
      \  }}t        j                  | j                  j                  t              } || |||fd | j                  sdn| j                  | j                  d|\  }} |j                   g |d j#                         }| j%                  |      }||fS )NrO   r   r           )r   r   r   )rV   r>   r   viewrZ   r   r   r   r   get_interfacer#   _attn_implementationr   r   r   r   rw   r   r   )r3   rt   r   r   input_shapehidden_shapequery_statesr   r   r\   r]   attention_interfacer   r   s                 r7   rd   zGlmAsrAttention.forward   sk    $))#2.88b8$--8{{=166|DNNqRST[[/44\BLLQPQR
{{=166|DNNqRST&S#7jRUWZ#[ j(?(M(MKK,,.E)
 %8		%

  #}}C$2H2HLL	%
 	%
!\ *k));;;;FFHkk+.L((r8   re   )rf   rg   rh   __doc__r   rE   r*   rF   ri   rl   r   r   rd   rn   ro   s   @r7   r   r      s    Gk| k k" IM!)||!) #5<<#=>E!) +,	!)
 
u||U\\)	*!)r8   r   c                   >     e Zd Z fdZdej
                  fdZ xZS )	GlmAsrMLPc                    t         |           t        j                  |j                  |j
                        | _        t        j                  |j
                  |j                        | _        t        |j                     | _
        y re   )r)   r*   r   r   rC   intermediate_sizefc1fc2r   
hidden_actact_fnr3   r#   r6   s     r7   r*   zGlmAsrMLP.__init__   s\    99V//1I1IJ99V55v7I7IJV../r8   rt   c                 l    | j                  |      }| j                  |      }| j                  |      }|S re   )r   r   r   )r3   rt   s     r7   rd   zGlmAsrMLP.forward   s2    /M2/r8   )rf   rg   rh   r*   rF   ri   rd   rn   ro   s   @r7   r   r      s    0U\\ r8   r   c            	            e Zd Zdedef fdZ	 d
dej                  deej                  ej                  f   dz  de	e
   dej                  fd	Z xZS )GlmAsrEncoderLayerr#   r   c                    t         |           |j                  | _        t        ||      | _        t        |      | _        t        j                  |j                        | _	        t        j                  |j                        | _
        y )N)r#   r   )r)   r*   rC   r   	self_attnr   mlpr   	LayerNorminput_layernormpost_attention_layernormr   s      r7   r*   zGlmAsrEncoderLayer.__init__   sd    !--()LV$!||F,>,>?(*V5G5G(H%r8   Nrt   r   r   r:   c                     |}| j                  |      } | j                  d||d|\  }}||z   }|}| j                  |      }| j                  |      }||z   }|S )N)rt   r    )r   r   r   r   )r3   rt   r   r   residual_s         r7   rd   zGlmAsrEncoderLayer.forward   s     !,,];)4>> 
' 3
 
q
 !=0 !55mD/ =0r8   re   )rf   rg   rh   r   rE   r*   rF   ri   rl   r   r   rd   rn   ro   s   @r7   r   r      sp    I| I I IM|| #5<<#=>E +,	
 
r8   r   c                   6    e Zd ZU eed<   dZdZdZdgZdZ	dZ
dZy)GlmAsrPreTrainedModelr#   model)audiotextTr   past_key_valuesN)rf   rg   rh   r   rj   base_model_prefixinput_modalitiessupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpar   r8   r7   r   r     s4    (&*#*+"3Nr8   r   c                   x     e Zd ZU eed<   dZdZdgZee	dZ
def fdZeeedee   fd                     Z xZS )	GlmAsrEncoderr#   input_featuresr   r   )rt   
attentionsc           	         t         |   |       t        j                  |j                  |j
                  dd      | _        t        j                  |j
                  |j
                  ddd      | _        t        j                  t        |j                        D cg c]  }t        ||       c}      | _        t        j                  |j
                        | _        t        |      | _        d| _        | j%                          y c c}w )Nr   r   )kernel_sizepaddingr   )r   strider   )r#   F)r)   r*   r   Conv1dnum_mel_binsrC   conv1conv2
ModuleListrangenum_hidden_layersr   layersr   normr!   
rotary_embgradient_checkpointing	post_initr   s      r7   r*   zGlmAsrEncoder.__init__,  s     YYv22F4F4FTU_`a
YYv1163E3EST]^hij
mmDI&JbJbDcdy	2d
 LL!3!34	/v>&+# es   Dr   c                    t         j                  j                  | j                  |            }t         j                  j                  | j	                  |            }|j                  dd      }|}| j                  |t        j                  |j                  d   |j                        d d d f         }| j                  D ]  } ||fd|i|} | j                  |      }t        |      S )Nr   r   r4   )r_   r   )last_hidden_state)r   r   gelur   r   rZ   r   rF   rG   rV   r4   r   r   r   )r3   r   r   inputs_embedsrt   r   encoder_layers          r7   rd   zGlmAsrEncoder.forward9  s     **4::n+EF**4::m+DE%//15%"oo]5H5H5KTaThTh(ijnpqjq(r . 
 "[[ 	lM)-kM`kdjkM	l 		-0)MJJr8   )rf   rg   rh   r   rj   main_input_namer   r   r   r   _can_record_outputsr*   r   r   r   r   r   rd   rn   ro   s   @r7   r   r   "  sl    &O-.+%
2   K7I0J K    Kr8   r   c                   .     e Zd ZdZdef fdZd Z xZS )GlmAsrMultiModalProjectorz
    Audio adaptor (small MLP) that projects GlmAsrEncoder features
    to the LLM embedding space so they can replace `<sound>` tokens.
    r#   c                 j   t         |           t        j                  |j                  j
                  |j                  j                  dz        | _        t        |j                     | _        t        j                  |j                  j                  dz  |j                  j                        | _        y )Nr   )r)   r*   r   r   audio_configr   text_configrC   linear_1r   projector_hidden_actactlinear_2r   s     r7   r*   z"GlmAsrMultiModalProjector.__init__S  s    		&"5"5"G"GI[I[IgIgjkIkl&556		&"4"4"@"@1"DfFXFXFdFder8   c                 l    | j                  |      }| j                  |      }| j                  |      }|S re   )r   r   r   )r3   audio_featuresrt   s      r7   rd   z!GlmAsrMultiModalProjector.forwardY  s2    n5/m4r8   )rf   rg   rh   r   r   r*   rd   rn   ro   s   @r7   r   r   M  s    
f| fr8   r   z~
    The GlmAsr model which consists of a fine-tuned Whisper encoder, a multi-modal projector and a Llama language model.
    custom_introc                   *    e Zd ZdZdZdZ fdZd Zd Zd Z	d Z
d Zd Ze ed	
      dej                   dej"                  dee   deez  fd              Zee	 	 	 	 	 	 	 	 	 	 	 ddej.                  dz  dej                   dz  dej"                  dz  dej"                  dz  dej.                  dz  dedz  dej                   dz  dej.                  dz  dedz  dej.                  dz  deej"                  z  dee   defd              Z fdZ xZS )GlmAsrForConditionalGenerationNc                 *   t         |   |       |j                  j                  | _        t	        j
                  |j                        | _        t        j
                  |j                        | _	        t        |      | _        | j                          y re   )r)   r*   r   
vocab_sizer   from_configr   audio_towerr   language_modelr   multi_modal_projectorr   r   s     r7   r*   z'GlmAsrForConditionalGeneration.__init__j  sn      ,,77$001D1DE2>>v?Q?QR%>v%F" 	r8   c                 6    | j                   j                         S re   )r	  get_input_embeddingsr3   s    r7   r  z3GlmAsrForConditionalGeneration.get_input_embeddingst  s    ""7799r8   c                 :    | j                   j                  |       y re   )r	  set_input_embeddings)r3   r   s     r7   r  z3GlmAsrForConditionalGeneration.set_input_embeddingsw  s    007r8   c                 6    | j                   j                         S re   )r	  get_output_embeddingsr  s    r7   r  z4GlmAsrForConditionalGeneration.get_output_embeddingsz  s    ""88::r8   c                 :    | j                   j                  |       y re   )r	  set_output_embeddings)r3   new_embeddingss     r7   r  z4GlmAsrForConditionalGeneration.set_output_embeddings}  s    11.Ar8   c                 :    | j                   j                  |       y re   )r	  set_decoder)r3   decoders     r7   r  z*GlmAsrForConditionalGeneration.set_decoder  s    ''0r8   c                 6    | j                   j                         S re   )r	  get_decoderr  s    r7   r  z*GlmAsrForConditionalGeneration.get_decoder  s    ""..00r8   zgCompute audio embeddings from log-mel input features using the audio encoder and multi-modal projector.r  r   input_features_maskr   r:   c                 *    | j                   |fddi|}|j                  }|j                  |j                  d   d| j                  j
                  j                        }| j                  |      }|j                  d      }dD ]  \  }}	}
|d|z  z   |	dz
  z
  dz
  |
z  dz   } d}||z
  |z  dz   }t        j                  |j                  d   |j                  	      d
d
d
f   |d
d
d
f   k  }||j                  |j                           |_        |S )a
  
        input_features (`torch.FloatTensor`):
            Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be
            obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
            `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
            `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the mel features, padding
            and conversion into a tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`]
        input_features_mask (`torch.Tensor` of shape `(batch_size, feature_sequence_length)`):
            Mask to avoid performing attention on padded feature indices.
        return_dictTr   rO   ))r   r   r   )r   r   r   r   r      r   N)r  r   rw   rV   r#   r   r   r
  sumrF   rG   r4   rI   pooler_output)r3   r   r  r   audio_outputsaudio_hidden_statesaudio_embedsaudio_lengthsr   r   r   merge_factorpost_lengths
valid_masks                 r7   get_audio_featuresz1GlmAsrForConditionalGeneration.get_audio_features  s<   ( )((TTTVT+==199  #R)A)A)S)S
 112EF+//3,B 	`(G[&*Q[8K!OLqPU[[^__M	`%4EI\\,"4"4Q"7@S@STUY[\U\]`lmnptmt`uu
&2:==ATAT3U&V#r8   	input_idsr   r_   r   r   labels	use_cachecache_positionlogits_to_keepc                    | | j                         |      }||| j                  ||d      j                  }|| j                  j                  k(  j                  d      }|j                  |j                  |j                        |j                  |j                              } | j                  d||||||	|
|d|}|S )a  
        input_features_mask (`torch.Tensor` of shape `(batch_size, feature_sequence_length)`):
            Mask to avoid performing attention on padding feature indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import GlmAsrForConditionalGeneration, AutoProcessor

        >>> model_id = "zai-org/GLM-ASR-Nano-2512"
        >>> processor = AutoProcessor.from_pretrained(model_id)
        >>> model = GlmAsrForConditionalGeneration.from_pretrained(model_id, dtype="auto", device_map="auto")
        >>> inputs = processor.apply_transcription_request("https://huggingface.co/datasets/hf-internal-testing/dummy-audio-samples/resolve/main/bcn_weather.mp3")

        >>> inputs = inputs.to(model.device, dtype=model.dtype)

        >>> outputs = model.generate(**inputs, do_sample=False, max_new_tokens=500)

        >>> decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1] :], skip_special_tokens=True)
        >>> print(decoded_outputs)
        ```T)r  rO   )r   r   r_   r   r)  r*  r+  r,  r   )
r  r'  r  r#   audio_token_idr   masked_scatterrI   r4   r	  )r3   r(  r   r  r   r_   r   r   r)  r*  r+  r,  r   r"  audio_token_maskoutputss                   r7   rd   z&GlmAsrForConditionalGeneration.forward  s    \  7D557	BM%)*?22>CVdh2iwwL !*T[[-G-G GRRSUV)88 ##M$8$89<??=K_K_;`M +>$*=*= 
+
')%+))
+
 
+
 r8   c                     |j                  dd       }|j                  dd       }|j                  d      }t        |   |i |}||d   d   dk(  r|||d<   |||d<   |S )Nr   r  r+  r   )poprA   r)   prepare_inputs_for_generation)r3   argsr   r   r  r+  model_inputsr6   s          r7   r4  z<GlmAsrForConditionalGeneration.prepare_inputs_for_generation  s      $4d;$jj)>E$45w<dMfM%,7G*H*Kq*P)1?-.".6I23r8   )NNNNNNNNNNr   )rf   rg   rh   _keep_in_fp32_modules_strict_tp_plan_pp_planr*   r  r  r  r  r  r  r   r   rF   FloatTensorri   r   r   rl   r   r'  
LongTensorr   boolrE   r   rd   r4  rn   ro   s   @r7   r  r  `  s    $( HH:8;B11 ~ ))  #\\  +,	 
 
+	+   D  .23737.204(,26*.!%26-.C##d*C ))D0C #\\D0	C
 t+C &&-C C ((4/C   4'C $;C ((4/C ell*C +,C 
 C  CJ r8   r  )r   r  r   )r   )Nr   )<collections.abcr   typingr   activationsr   cache_utilsr   
generationr   integrationsr	   modeling_layersr
   modeling_outputsr   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   utils.genericr   r   r   utils.output_capturingr   autor   r   configuration_glmasrr   r   rF   r   Moduler!   rs   ri   rE   r{   rJ   r   r   r   r   r   r   r   r   r  __all__r   r8   r7   <module>rO     s  * %  !   ) / 9 R K F & K K Y Y 5 2 C @<BII @<F(	UU\\ 	U# 	U%,, 	U& %II%<<% 
% <<	%
 LL4'% % % '(%2$ )*2)bii 2) +2)j		  3  F O  (K) (KV		 & 
^%:O ^
^B Wr8   