
    qin8                         d dl Z d dlZddlmZmZ ddlmZ ddlm	Z	m
Z
mZ ddlmZ ddlmZmZ  e       rd dlZ ej$                  e      Z G d d	e	d
      Z G d de
      ZdgZy)    N   )
AudioInputmake_list_of_audio)BatchFeature)ProcessingKwargsProcessorMixinUnpack)	TextInput)is_torch_availableloggingc                   ,    e Zd Zddidddddddd	d
Zy)GlmAsrProcessorKwargspaddingTi>  g      >@
max_length)sampling_ratechunk_lengthreturn_attention_maskr   ptleft)return_tensorspadding_side)text_kwargsaudio_kwargscommon_kwargsN)__name__
__module____qualname__	_defaults     ^/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/glmasr/processing_glmasr.pyr   r   (   s5     t
 # %)#	
 #"
Ir    r   F)totalc                        e Zd ZdZ	 	 	 	 d fd	ZddZ	 	 ddeee   z  dedz  d	e	dz  d
e
e   def
dZedee   fd       Z	 ddeee   z  ez  deee   z  dz  d
e
e   defdZdddZdedefdZ xZS )GlmAsrProcessora  
    Constructs an GlmAsr processor which wraps an GlmAsr feature extractor and an GlmAsr
    tokenizer into a single processor.

    [`GlmAsrProcessor`] offers all the functionalities of [`WhisperFeatureExtractor`] and
    [`Qwen2TokenizerFast`]. See the [`~GlmAsrProcessor.__call__`] for more information.

    Args:
            feature_extractor ([`WhisperFeatureExtractor`]):
                The feature extractor is a required input.
            tokenizer ([`Qwen2TokenizerFast`]):
                The tokenizer is a required input.
            chat_template (`Optional[str]`, *optional*):
                The Jinja template to use for formatting the conversation. If not provided, the tokenizer's default chat
                template will be used.
            audio_token (`Optional[str]`, *optional*, defaults to `"<|pad|>`"):
                Special token used to represent audio inputs in the chat template.
            default_transcription_prompt (`str`, *optional*, defaults to `"Please transcribe this audio into text"`):
                Default prompt to use for transcription tasks when applying transcription requests.
            max_audio_len (`int`, *optional*, defaults to 655):
                Maximum length of audio sequences in seconds. Audio longer than this will be truncated.
                655 gives approximately 8192 tokens, corresponding to the maximum sequence length of the text model.
    Nc                     || _         |j                  |      | _        || _        || _        t
        |   |||       y )N)chat_template)audio_tokenconvert_tokens_to_idsaudio_token_iddefault_transcription_promptmax_audio_lensuper__init__)selffeature_extractor	tokenizerr&   r'   r*   r+   	__class__s          r!   r-   zGlmAsrProcessor.__init__S   sE     ''==kJ,H)**I]Sr    returnc                 d    d}dD ]  \  }}}|d|z  z   |dz
  z
  dz
  |z  dz   } ||z
  |z  dz   }|S )N   ))   r   r5   )r5   r      r6   r5   r   )r.   audio_lengthsmerge_factorr   kernel_sizestride
num_tokenss          r!   _get_audio_token_lengthz'GlmAsrProcessor._get_audio_token_lengthb   sc    ,B 	`(G[&*Q[8K!OLqPU[[^__M	` $l2|CaG
r    Ftextaudiooutput_labelskwargsc           
          | j                   t        fd| j                  j                  i|}|d   }|d   }|j	                  d      }|dk7  r"t        | j                  j                   d      t        |t              r|g}n3t        |t        t        f      rt        d |D              st        d      i }	|3t        |      }t        |      t        |      k7  r$t        d	t        |       d
t        |       d      t        |d   |d   z        }
t        | j                   |d   z        }g }g }|D ]  }t        |j"                  d         }t%        d||
z   dz
  |
z        }||kD  r<t&        j)                  d||d   z  dd| j                    d| j                    d       |}|j+                  |       t-        |||
z        }t/        |      D ]-  }||
z  }t-        |dz   |
z  |      }|j+                  |||        /   | j0                  |fi |}	|	j3                  d      }||	d<   t5        j6                  t5        j8                  |j;                  d      |      D cg c]  }|j;                          c}      }| j=                  |      }t?        |      D ]N  \  }}tA        jB                  tA        jD                  | jF                        | jF                  |z  ||         }|||<   P  | j                  |fi |}i ||	}|rF|d   jI                         }d||| jJ                  k(  <   d||| j                  jL                  k(  <   ||d<   tO        ||      S c c}w )a=  
        Main method to prepare one or several text sequence(s) and audio waveform(s) for the model. This
        method expands `<sound>` placeholders in the text based on the post-pool frame counts of the
        audio windows, then tokenizes the provided strings as-is, and extracts log-mel features
        with [`WhisperFeatureExtractor`]. If `audio` is `None`, no audio processing is performed and
        the text is tokenized as-is (LM-only behavior).

        Args:
            text (`str` or `list[str]`):
                Input sequence or batch of sequences.
            audio (`np.ndarray` or `list[np.ndarray]`):
                Input audio or batch of audios as NumPy arrays. If provided, there must be as many `text` inputs as
                `audio` inputs.
            output_labels (bool, *optional*, default=False):
                Whether to return labels for training.

        Returns:
            [`BatchFeature`]: A dictionary with tokenized text (`input_ids`, `attention_mask`) and
            audio features (`input_features`, `input_features_mask`).
        tokenizer_init_kwargsr   r   r   r   z% only supports `return_tensors='pt'`.c              3   <   K   | ]  }t        |t                y wN
isinstancestr).0ts     r!   	<genexpr>z+GlmAsrProcessor.__call__.<locals>.<genexpr>   s     9[QR*Q:L9[   zAInvalid input text. Please provide a string, or a list of stringszGot z
 text but z audios; they must match 1:1.r   r   r   r5   zAudio duration (z.1fzs) exceeds zs; truncating to first zs.attention_maskinput_features_mask	input_idsilabels)datatensor_type)(_merge_kwargsr   r0   init_kwargsget
ValueErrorr1   r   rF   rG   listtupleallr   lenintr+   shapemaxloggerwarningappendminranger/   poptorchstacksplitsumr<   	enumerateresubescaper'   cloner)   pad_token_idr   )r.   r=   r>   r?   r@   call_kwargsr   r   r   audio_inputswindow_sizemax_windowsper_sample_windowsflat_chunksaudio_el	n_samplesn_wintime_capistartendpadding_masksr7   audio_tokens_lengthsaudio_lengthexpandedtext_inputsrQ   rP   s                                 r!   __call__zGlmAsrProcessor.__call__j   s   : )d((!
"&.."<"<
 
 "-0">2$)9:T! 7 788]^__dC 6DTD%=1c9[VZ9[6[`aa&u-E4yCJ& 4D	{*SZLHe!fgg l?;l>>ZZ[Kd00L4PPQK,.,.K! <q 12	A	K 7! ;KL;&NN*9|O7T+TUX*YYdeiewewdx  yP  QU  Qc  Qc  Pd  df  g (E"))%0y%+*=>u <AOEq1u3X>C&&xc':;<<" 2411+NNL'++,<=L2>L./ "KK%++lFVFVWYFZ\n:o(pQ(pqM#'#?#?#N  $--A#B #<66"))D,<,<"=t?O?OR^?^`def`gh"Q#
 %dnnT9[9.+..+&,,.F48F6T0001<@F6T^^8889#DN>BB% )qs   $Mc                     | j                   j                  }| j                  j                  }t        t        j                  ||z   dgz               S )NrM   )r0   model_input_namesr/   rW   dictfromkeys)r.   	tok_names	fea_namess      r!   r   z!GlmAsrProcessor.model_input_names   sD    NN44	**<<	DMM)i"7;P:Q"QRSSr    promptc           	         t        |t              r|g}nt        |t        t        f      r |rt	        d |D              rt        |      }nst        t        |            }t               rU|D cg c]J  }t        |t        j                        r,|j                         j                         j                         n|L }}t        |      }|dk(  rt        d      || j                  g|z  }nt        |t              r|g|z  }nt        |t        t        f      r}t        |      |k7  rt        dt        |       d| d      g }|D ]L  }||j                  | j                         !t        |t              r|j                  |       Ct!        d       nt!        d      t#        ||      D 	
cg c](  \  }	}
d	t        |
t              rd
|
dnd
|
dd|	dgdg* }}	}
 | j$                  |fdddd|S c c}w c c}
}	w )a	  
        Prepare inputs for automatic speech recognition without manually writing the default transcription prompt.

        Args:
            audio (`str`, `list[str]`, `np.ndarray`, `torch.Tensor`, `list[np.ndarray]`, `list[torch.Tensor]`):
                Audio to transcribe. Strings are interpreted as local paths or URLs and will be loaded automatically by
                the chat template loader; NumPy arrays and PyTorch tensors are forwarded directly.
            prompt (`str` or `list[str]`, *optional*):
                Custom prompt(s) to include in the user turn. A list must be the same length as the batch. When `None`,
                each sample uses `"Transcribe the input speech."`.
            **kwargs:
                Additional keyword arguments forwarded to [`~GlmAsrProcessor.apply_chat_template`] (for example
                `text_kwargs`, `audio_kwargs`, ...).

        Returns:
            [`BatchFeature`]: Processor outputs ready to be passed to [`GlmAsrForConditionalGeneration.generate`].

        c              3   <   K   | ]  }t        |t                y wrD   rE   )rH   els     r!   rJ   z>GlmAsrProcessor.apply_transcription_request.<locals>.<genexpr>   s     ?dXZ
2s@S?drK   r   z)`audio` must contain at least one sample.z	Received z prompt(s) for z$ audio sample(s); counts must match.z'Each prompt must be a string or `None`.z<`prompt` must be a string, a sequence of strings, or `None`.userr>   )typepath)r   r>   r=   )r   r=   )rolecontentT)tokenizeadd_generation_promptreturn_dict)rF   rG   rW   rX   rY   r   r   rd   TensordetachcpunumpyrZ   rV   r*   r`   	TypeErrorzipapply_chat_template)r.   r>   r   r@   audio_itemsr   
batch_sizepromptsitemprompt_text
audio_itemconversationss               r!   apply_transcription_requestz+GlmAsrProcessor.apply_transcription_request   s   2 eS!38'Ke}-%C?d^c?d<du+K1%89K!#kvwegJr5<<<Xryy{0668^``ww%
?HII>889JFG$h+Gu.6{j( F}OJ<Gkl  G O<NN4#D#DEc*NN4(#$MNNO Z[[ ,/w+D
 (Z # &j#6 ")*=&-
C!'=	 

 
 (t''
"&	

 
 	
S x4
s   -AG70-G<)strip_prefixc                     | j                   j                  |i |}|r|D cg c]  }| j                  |       }}|S c c}w )ap  
        Forward arguments to [`~PreTrainedTokenizer.batch_decode`] and optionally remove the assistant framing the model
        was trained to produce.

        AF3 transcription requests respond with sentences such as `"The spoken content of the audio is "..."."`.
        Setting `strip_prefix=True` trims the fixed prefix for just the transcription text.
        )r0   batch_decode"_strip_assistant_prefix_and_quotes)r.   r   argsr@   decodedr=   s         r!   r   zGlmAsrProcessor.batch_decode&  sK     .$..--t>v>QXYt>>tDYGY Zs   ?c                 @   |j                         }dD ]1  }|j                  |      s|t        |      d j                         } n |j                  d      r|dd j                         }t        |      dk\  r%|d   |d   k(  r|d   dv r|dd j                         }|S )	zi
        Remove the assistant prefix and surrounding quotes from a decoded transcription string.
        )z"The spoken content of the audio isz!The transcription of the audio isN.rN   r6   r   >   "'r5   )strip
startswithrZ   endswith)r.   r=   strippedprefixs       r!   r   z2GlmAsrProcessor._strip_assistant_prefix_and_quotes3  s    
 ::<
 	F ""6*#CKM288:	 S!}**,Hx=A(1+""=(1+Q[B["~++-Hr    )Nz<|pad|>z&Please transcribe this audio into texti  )r7   torch.Tensorr2   r   )NFrD   )r   r   r   __doc__r-   r<   r
   rW   r   boolr	   r   r   r   propertyrG   r   r   r   r   __classcell__)r1   s   @r!   r$   r$   :   s   8 %MT $(%*	cC$y/)cC D cC d{	cC
 ./cC 
cCJ T49 T T *.O
T#Y+O
 d3i$&O
 ./	O

 
O
b 05 s s r    r$   )ri   r   npaudio_utilsr   r   feature_extraction_utilsr   processing_utilsr   r   r	   tokenization_utils_baser
   utilsr   r   rd   
get_loggerr   r^   r   r$   __all__r   r    r!   <module>r      sf   , 
  9 4 H H 0 0  
		H	%,E $Nn Nb 
r    