
    qi1                         d dl Zd dlZddlmZ ddlmZ ddlmZ ddl	m
Z
mZ ddlmZ  ej                  e      Zded	ed
edededej&                  fdZ ed       G d de             ZdgZy)    N   )hertz_to_mel)SequenceFeatureExtractor)BatchFeature)
TensorTypelogging)requiresnum_mel_binsnum_spectrogram_binssample_ratelower_edge_hertzupper_edge_hertzreturnc                    t         j                  }d}|dz  }t        j                  d|||      |d }	t        |	d      ddt         j                  f   }
t        j                  t        |d      t        |d      | dz   |      }|dd	 t         j                  ddf   |dd
 t         j                  ddf   |dd t         j                  ddf   }}}|
|z
  ||z
  z  }||
z
  ||z
  z  }t        j
                  dt        j                  ||            }t        j                  ||dgddgg      j                  |      S )z.NumPy-port of the JAX mel weight matrix logic.   g       @        )dtypeNkaldi)	mel_scale   r   )	npfloat64linspacer   newaxismaximumminimumpadastype)r
   r   r   r   r   r   internal_dtypebands_to_zeronyquist_hertzlinear_frequenciesspectrogram_bins_meledgeslower_edge_mel
center_melupper_edge_mellower_slopesupper_slopesmel_weights_matrixs                     b/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/lasr/feature_extraction_lasr.pylinear_to_mel_weight_matrixr.      sJ    ZZN M#%MS-9MUcderest'(:gNqRTR\R\}]KK%9%9q	E 	cr
2::q=!aBJJM"ab	"**a-  !/JN )>9j>>YZL"%99nz>YZLCL,)OP66$q'9Aq6&BCJJ5QQ    )torch)backendsc                       e Zd ZdZddgZ	 	 	 	 	 	 d fd	ZddZ	 	 	 	 	 	 	 	 	 	 ddej                  e	e
   z  e	ej                     z  e	e	e
      z  ded	edz  d
eez  dz  dedz  dedz  dedz  dedz  dedz  dedz  dedz  defdZ xZS )LasrFeatureExtractora7  
    Constructs a LASR feature extractor.

    This feature extractor inherits from [`~feature_extraction_sequence_utils.SequenceFeatureExtractor`] which contains
    most of the main methods. Users should refer to this superclass for more information regarding those methods.

    This class extracts mel-filter bank features from raw speech using a custom numpy implementation of the `Short Time
    Fourier Transform` which should match pytorch's `torch.stft` equivalent.

    Args:
            feature_size (`int`, *optional*, defaults to 128):
                The feature dimension of the extracted features.
            sampling_rate (`int`, *optional*, defaults to 16000):
                The sampling rate at which the audio files should be digitalized expressed in hertz (Hz).
            hop_length (`int`, *optional*, defaults to 160):
                Length of the overlapping windows for the STFT used to obtain the Mel Frequency coefficients.
            n_fft (`int`, *optional*, defaults to 512):
                Size of the Fourier transform.
            win_length (`int`, *optional*, defaults to 400):
                The window length for the STFT computation.
            padding_value (`float`, *optional*, defaults to 0.0):
                Padding value used to pad the audio. Should correspond to silences.
    input_featuresattention_maskc           
          t        |   d|||d| || _        || _        || _        t        j                  t        ||dz  dz   |ddt        j                              | _
        y )N)feature_sizesampling_ratepadding_valuer   r   g     @_@g     L@)r
   r   r   r   r   r    )super__init__
hop_lengthn_fft
win_lengthr0   
from_numpyr.   r   r   mel_filters)	selfr7   r8   r=   r>   r?   r9   kwargs	__class__s	           r-   r<   zLasrFeatureExtractor.__init__^   sn     	wl-_lwpvw$
$ ++')%*aZ!^)!&!'jj	
r/   c                    t        j                  | j                  d|t         j                        }|j	                  t         j                        }|j                  d| j                  | j                        }t         j                  j                  ||z  | j                        }t        j                  |      dz  }| j                  j	                  |      }t        j                  ||z  d      }t        j                  |      }|S )NF)periodicdevicer   r   )nr   gh㈵>)min)r0   hann_windowr?   r   tounfoldr=   fftrfftr>   absrA   clamplog)	rB   waveformrG   windowframesstft
power_specrA   mel_specs	            r-   _torch_extract_fbank_featuresz2LasrFeatureExtractor._torch_extract_fbank_featuresx   s    ""4??U6Y^YfYfg;;u}}-
 T__dooFyy~~fvo~<YYt_)
 &&))&1;;zK7TB99X&r/   N
raw_speech
truncationpad_to_multiple_ofreturn_tensorsreturn_attention_maskpadding
max_lengthr8   do_normalizerG   return_token_timestampsr   c                    |O|| j                   k7  rmt        d| j                  j                   d| j                    d| j                    d| d	      t        j                  d| j                  j                   d       t        |t        j                        rt        j                  |      }nqt        |t        t        f      r[t        |d	   t        t        j                  f      r#|D cg c]  }t        j                  |       }}nt        j                  |      }t        |t        j                        xr t        |j                        d
kD  }|rVt        |j                        dkD  r>t        j                  d| j                  j                   d       |j!                  d      }t        |t        t        f      }|r^|D ]Y  }t        |j                        d
kD  st        j                  d| j                  j                   d       |j!                  d      }[ |s|r4|D cg c](  }|dddf   j#                  t        j$                        * }}n'|dddf   j#                  t        j$                        g}t'        d|i      }| j)                  ||||||d      }|j*                  j-                  d      }| j/                  ||
      }d|j#                  t        j$                        i}|rO|j0                  dd| j2                  d
z
  d| j4                  f   }|j#                  t        j6                        |d<   t'        ||      S c c}w c c}w )a  
        Main method to featurize and prepare for the model one or several sequence(s). Implementation uses PyTorch for
        the STFT computation if available, otherwise a slower NumPy based one.

        Args:
            raw_speech (`np.ndarray`, `list[float]`, `list[np.ndarray]`, `list[list[float]]`):
                The sequence or batch of sequences to be padded. Each sequence can be a numpy array, a list of float
                values, a list of numpy arrays or a list of list of float values. Must be mono channel audio, not
                stereo, i.e. single float per timestep.
            truncation (`bool`, *optional*, default to `True`):
                Activates truncation to cut input sequences longer than *max_length* to *max_length*.
            pad_to_multiple_of (`int`, *optional*, defaults to None):
                If set will pad the sequence to a multiple of the provided value.

                This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
                `>= 7.5` (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128.
            return_attention_mask (`bool`, *optional*):
                Whether to return the attention mask. If left to the default, will return the attention mask according
                to the specific feature_extractor's default.

                [What are attention masks?](../glossary#attention-mask)

                <Tip>

                For Parakeet models, `attention_mask` should always be passed for batched inference, to avoid subtle
                bugs.

                </Tip>

            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors instead of list of python integers. Acceptable values are:

                - `'tf'`: Return TensorFlow `tf.constant` objects.
                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return Numpy `np.ndarray` objects.
            sampling_rate (`int`, *optional*):
                The sampling rate at which the `raw_speech` input was sampled. It is strongly recommended to pass
                `sampling_rate` at the forward call to prevent silent errors and allow automatic speech recognition
                pipeline.
            padding_value (`float`, *optional*, defaults to 0.0):
                The value that is used to fill the padding values / vectors.
            do_normalize (`bool`, *optional*, defaults to `False`):
                Whether or not to zero-mean unit-variance normalize the input. Normalizing can help to significantly
                improve the performance of the model.
            device (`str`, *optional*, defaults to `'cpu'`):
                Specifies the device for computation of the log-mel spectrogram of audio signals in the
                `_torch_extract_fbank_features` method. (e.g., "cpu", "cuda")
            return_token_timestamps (`bool`, *optional*, defaults to `None`):
                Deprecated. Use `return_attention_mask` instead from which the number of frames can be inferred.

                Whether or not to return the number of frames of the input raw_speech.
                These num_frames can be used by the model to compute word level timestamps.
        Nz3The model corresponding to this feature extractor: z& was trained using a sampling rate of zI. Please make sure that the provided `raw_speech` input was sampled with z	 and not .zDIt is strongly recommended to pass the `sampling_rate` argument to `zN()`. Failing to do so can result in silent errors that might be hard to debug.r   r   r   z2Only mono-channel audio is supported for input to z;. We will take the mean of the channels to convert to mono.r   r4   pt)r^   r_   rZ   r[   r]   r\   r5   )datatensor_type)r8   
ValueErrorrD   __name__loggerwarning
isinstancer   ndarrayr0   tensorlisttupleTensorlenshapemeanrK   float32r   r   r4   squeezerX   r5   r?   r=   bool)rB   rY   rZ   r[   r\   r]   r^   r_   r8   r`   rG   ra   rC   speechis_batched_torchis_batched_sequencebatched_speechpadded_inputsr4   re   r5   s                        r-   __call__zLasrFeatureExtractor.__call__   s!   H $ 2 22 I$..JaJaIb c))-););(< =))-););(<Im_TUW  NNVW[WeWeWnWnVo p\ \ j"**-j1J
T5M2*Q-$

);<AKLvell62L
L"\\*5
%j%,,?]C
HXHXDY\]D]J$4$4 5 9NNDT^^E\E\D] ^L L $,J(dE]C$ -v||$q(NNLT^^MdMdLe fT T $[[_F- 2JTU&D/,,U]];UJU$QW-00?@J%'7&DE!!1"7 ! 
 '55==bA;;NFSn//>
 !*99!T__q=P=cTXTcTc=c:cdN%3%6%6uzz%BD!">BB_ M. Vs   ,M*-M)   i>     i   i  r   )cpu)
FNNNlongestNNNr   N)rh   
__module____qualname____doc__model_input_namesr<   rX   r   rl   rn   floatrv   intstrr   r   r|   __classcell__)rD   s   @r-   r3   r3   B   s<   0 *+;< 
4, !)-26-1'!%$($("/3ECJJe,tBJJ/??$tE{BSSEC EC  $J	EC
 j(4/EC  $d{EC tEC $JEC TzEC TkEC d
EC "&EC 
ECr/   r3   )numpyr   r0   audio_utilsr   !feature_extraction_sequence_utilsr   feature_extraction_utilsr   utilsr   r   utils.import_utilsr	   
get_loggerrh   ri   r   r   rl   r.   r3   __all__r:   r/   r-   <module>r      s      ' I 4 ( * 
		H	%"R"R"R "R 	"R
 "R ZZ"RJ 
:MC3 MC MC` "
"r/   