
    qiG                        d Z ddlZddlmZmZ ddlmZ ddlmZm	Z	 ddl
ZddlmZ ddlmZmZ ddlmZmZmZ dd	lmZmZmZmZ dd
lmZ ddlmZmZ ddlm Z  ddl!m"Z" er	 ddl#m$Z$ 	 ddl)Z) G d de      Z* G d de      Z+dede	e,   fdZ-e G d de             Z.e G d de             Z/ G d de      Z0 G d  d!e0      Z1y# e%$ r7Z& ejN                  de&         ejN                  d        e(de&       dZ&[&ww xY w# e%$ r7Z& ejN                  de&         ejN                  d        e(de&       dZ&[&ww xY w)"zWhisper speech-to-text services with locally-downloaded models.

This module implements Whisper transcription using locally-downloaded models,
supporting both Faster Whisper and MLX Whisper backends for efficient inference.
    N)	dataclassfield)Enum)AsyncGeneratorOptional)logger)TYPE_CHECKINGoverride)
ErrorFrameFrameTranscriptionFrame)	NOT_GIVENSTTSettings	_NotGiven_warn_deprecated_param)SegmentedSTTService)Languageresolve_language)time_now_iso8601)
traced_sttWhisperModelException: GIn order to use Whisper, you need to `pip install pipecat-ai[whisper]`.zMissing module: zKIn order to use Whisper, you need to `pip install pipecat-ai[mlx-whisper]`.c                   0    e Zd ZdZdZdZdZdZdZdZ	dZ
d	Zy
)Modela   Whisper model selection options for Faster Whisper.

    Provides various model sizes and specializations for speech recognition,
    balancing quality and performance based on use case requirements.

    Parameters:
        TINY: Smallest multilingual model, fastest inference.
        BASE: Basic multilingual model, good speed/quality balance.
        SMALL: Small multilingual model, better speed/quality balance than BASE.
        MEDIUM: Medium-sized multilingual model, better quality.
        LARGE: Best quality multilingual model, slower inference.
        LARGE_V3_TURBO: Fast multilingual model, slightly lower quality than LARGE.
        DISTIL_LARGE_V2: Fast multilingual distilled model.
        DISTIL_MEDIUM_EN: Fast English-only distilled model.
    tinybasesmallmediumzlarge-v3z)deepdml/faster-whisper-large-v3-turbo-ct2z&Systran/faster-distil-whisper-large-v2z'Systran/faster-distil-whisper-medium.enN)__name__
__module____qualname____doc__TINYBASESMALLMEDIUMLARGELARGE_V3_TURBODISTIL_LARGE_V2DISTIL_MEDIUM_EN     N/opt/pipecat/venv/lib/python3.12/site-packages/pipecat/services/whisper/stt.pyr   r   -   s5    " DDEFE@N>O Ar.   r   c                   (    e Zd ZdZdZdZdZdZdZdZ	y)	MLXModelas  MLX Whisper model selection options for Apple Silicon.

    Provides various model sizes optimized for Apple Silicon hardware,
    including quantized variants for improved performance.

    Parameters:
        TINY: Smallest multilingual model for MLX.
        MEDIUM: Medium-sized multilingual model for MLX.
        LARGE_V3: Best quality multilingual model for MLX.
        LARGE_V3_TURBO: Finetuned, pruned Whisper large-v3, much faster with slightly lower quality.
        DISTIL_LARGE_V3: Fast multilingual distilled model for MLX.
        LARGE_V3_TURBO_Q4: LARGE_V3_TURBO quantized to Q4 for reduced memory usage.
    zmlx-community/whisper-tinyz mlx-community/whisper-medium-mlxz"mlx-community/whisper-large-v3-mlxz$mlx-community/whisper-large-v3-turboz%mlx-community/distil-whisper-large-v3z'mlx-community/whisper-large-v3-turbo-q4N)
r!   r"   r#   r$   r%   r(   LARGE_V3r*   DISTIL_LARGE_V3LARGE_V3_TURBO_Q4r-   r.   r/   r1   r1   K   s(     (D/F3H;N=OAr.   r1   languagereturnc                    i t         j                  dt         j                  dt         j                  dt         j                  dt         j
                  dt         j                  dt         j                  dt         j                  dt         j                  d	t         j                  d
t         j                  dt         j                  dt         j                  dt         j                  dt         j                  dt         j                   dt         j"                  dt         j$                  dt         j&                  dt         j(                  dt         j*                  dt         j,                  dt         j.                  dt         j0                  dt         j2                  dt         j4                  dt         j6                  dt         j8                  dt         j:                  dt         j<                  di}t?        | |d       S )!a@  Maps pipecat Language enum to Whisper language codes.

    Args:
        language: A Language enum value representing the input language.

    Returns:
        str or None: The corresponding Whisper language code, or None if not supported.

    Note:
        Only includes languages officially supported by Whisper.
    arbncsdadeelenesfafifrhihuiditjakonlplptrorusksvthtrukurvizhT)use_base_code) r   ARBNCSDADEELENESFAFIFRHIHUIDITJAKONLPLPTRORUSKSVTHTRUKURVIZHr   )r5   LANGUAGE_MAPs     r/   language_to_whisper_languagerv   c   s   =T= 	T	= 	T= 	T= 	T= 	T= 	T=  	T!=$ 	T%=( 	T)=, 	T-=0 	T1=4 	T5=8 	T9=< 	T==@ 	TA=D 	TE=H 	TTTTTTTTTTTTTy=L~ Hl$GGr.   c                   6    e Zd ZU dZ ed       Zeez  ed<   y)WhisperSTTSettingszSettings for WhisperSTTService.

    Parameters:
        no_speech_prob: Probability threshold for filtering non-speech segments.
    c                      t         S Nr   r-   r.   r/   <lambda>zWhisperSTTSettings.<lambda>       i r.   default_factoryno_speech_probN)	r!   r"   r#   r$   r   r   floatr   __annotations__r-   r.   r/   rx   rx      s     ).>O(PNEI%Pr.   rx   c                   ~    e Zd ZU dZ ed       Zeez  ed<    ed       Z	eez  ed<    ed       Z
eez  ed<   y	)
WhisperMLXSTTSettingszSettings for WhisperMLXSTTService.

    Parameters:
        no_speech_prob: Probability threshold for filtering non-speech segments.
        temperature: Sampling temperature (0.0-1.0).
        engine: Whisper engine identifier.
    c                      t         S rz   r{   r-   r.   r/   r|   zWhisperMLXSTTSettings.<lambda>   r}   r.   r~   r   c                      t         S rz   r{   r-   r.   r/   r|   zWhisperMLXSTTSettings.<lambda>   s    9 r.   temperaturec                      t         S rz   r{   r-   r.   r/   r|   zWhisperMLXSTTSettings.<lambda>   s    I r.   engineN)r!   r"   r#   r$   r   r   r   r   r   r   r   strr-   r.   r/   r   r      sG     ).>O(PNEI%P%*;L%MK"M#4EFFC)OFr.   r   c                        e Zd ZU dZeZeed<   ddddddddeee	z     ded	ed
ee
   dee   dee   f fdZdefdZdedee   fdZd Ze	 ddededee   fd       Zdedeedf   fdZ xZS )WhisperSTTServicezClass to transcribe audio with a locally-downloaded Whisper model.

    This service uses Faster Whisper to perform speech-to-text transcription on audio
    segments. It supports multiple languages and various model sizes.
    	_settingsNautodefault)modeldevicecompute_typer   r5   settingsr   r   r   r   r5   r   c                   t        t        j                  j                  t        j
                  d      }|4t        dt         d       t        |t              r|n|j                  |_	        |t        dt         d       ||_
        |t        dt         d       ||_        ||j                  |       t        	| 8  dd|i| || _        || _        d| _        | j%                          y)	a  Initialize the Whisper STT service.

        Args:
            model: The Whisper model to use for transcription. Can be a Model enum or string.

                .. deprecated:: 0.0.105
                    Use ``settings=WhisperSTTSettings(model=...)`` instead.

            device: The device to run inference on ('cpu', 'cuda', or 'auto').
                Defaults to ``"auto"``.
            compute_type: The compute type for inference ('default', 'int8',
                'int8_float16', etc.). Defaults to ``"default"``.
            no_speech_prob: Probability threshold for filtering out non-speech segments.

                .. deprecated:: 0.0.105
                    Use ``settings=WhisperSTTSettings(no_speech_prob=...)`` instead.

            language: The default language for transcription.

                .. deprecated:: 0.0.105
                    Use ``settings=WhisperSTTSettings(language=...)`` instead.

            settings: Runtime-updatable settings. When provided alongside deprecated
                parameters, ``settings`` values take precedence.
            **kwargs: Additional arguments passed to SegmentedSTTService.
        g?)r   r5   r   Nr   r   r5   r   r-   )rx   r   r,   valuer   r]   r   
isinstancer   r   r   r5   apply_updatesuper__init___device_compute_type_model_load)
selfr   r   r   r   r5   r   kwargsdefault_settings	__class__s
            r/   r   zWhisperSTTService.__init__   s    L .((..[[
 "7,>H.8.DU%++"%"#35GIYZ.<+":/A:N(0%
 ))(3 	
%	
	
 ).2

r.   r6   c                      y)zIndicates whether this service can generate metrics.

        Returns:
            bool: True, as this service supports metric generation.
        Tr-   r   s    r/   can_generate_metricsz&WhisperSTTService.can_generate_metrics  s     r.   c                     t        |      S )zConvert from pipecat Language to Whisper language code.

        Args:
            language: The Language enum value to convert.

        Returns:
            str or None: The corresponding Whisper language code, or None if not supported.
        )rv   )r   r5   s     r/   language_to_service_languagez.WhisperSTTService.language_to_service_language'  s     ,H55r.   c                 j   	 ddl m} t        j                  d        || j                  j
                  | j                  | j                        | _        t        j                  d       y# t        $ r>}t        j                  d|        t        j                  d       d| _        Y d}~yd}~ww xY w)	zLoads the Whisper model.

        Note:
            If this is the first time this model is being run,
            it will take time to download from the Hugging Face model hub.
        r   r   zLoading Whisper model...)r   r   zLoaded Whisper modelr   r   N)faster_whisperr   r   debugr   r   r   r   r   ModuleNotFoundErrorerror)r   r   es      r/   r   zWhisperSTTService._load2  s    	3LL34&$$T\\HZHZDK LL/0" 	LL;qc*+LLbcDKK	s   A(A+ +	B244B--B2
transcriptis_finalc                    K   ywz+Handle a transcription result with tracing.Nr-   r   r   r   r5   s       r/   _handle_transcriptionz'WhisperSTTService._handle_transcriptionF       
 	   audioc                V  K   | j                   st        d       y| j                          d{    t        j                  |t        j
                        j                  t        j                        dz  }t        j                  | j                   j                  || j                  j                         d{   \  }}d}|D ]8  }|j                  | j                  j                  k  s'||j                   dz  }: | j                          d{    |r| j!                  |d| j                  j                         d{    t#        j$                  d	| d
       t'        || j(                  t+               | j                  j                         yy7 o7 7 7 ]w)a  Transcribe audio data using Whisper.

        Args:
            audio: Raw audio bytes in 16-bit PCM format.

        Yields:
            Frame: Either a TranscriptionFrame containing the transcribed text
                  or an ErrorFrame if transcription fails.

        Note:
            The audio is expected to be 16-bit signed PCM data.
            The service will normalize it to float32 in the range [-1, 1].
        zWhisper model not availableNdtype      @)r5     TTranscription: [])r   r   start_processing_metricsnp
frombufferint16astypefloat32asyncio	to_thread
transcriber   r5   r   textstop_processing_metricsr   r   r   r   _user_idr   )r   r   audio_floatsegments_r   segments          r/   run_sttzWhisperSTTService.run_sttM  sh     {{:;;++--- mmE:AA"**MPWW#--KK""K$..:Q:Q
 
!  	+G%%(E(EE7<<.**	+ **,,,,,T49P9PQQQLL+D634$ "''	   	.

 	- RsM   /F)F BF)>F#?1F)1'F)F%0F)	F'
AF)#F)%F)'F)rz   )r!   r"   r#   r$   rx   Settingsr   r   r   r   r   r   r   boolr   r   r   r   r   bytesr   r   r   __classcell__)r   s   @r/   r   r      s    "H!!
 (,%*.'+15H e$H 	H
 H !H 8$H -.HTd 	6X 	6(3- 	6( NR)-9A(9K )5 )^E4K-H )r.   r   c                       e Zd ZU dZeZeed<   dddddddeee	z     dee
   dee   dee
   d	ee   f
d
Zed        Ze	 ddededee   fd       Zededeedf   fd       Zy)WhisperSTTServiceMLXzSubclass of `WhisperSTTService` with MLX Whisper model support.

    This service uses MLX Whisper to perform speech-to-text transcription on audio
    segments. It's optimized for Apple Silicon and supports multiple languages and quantizations.
    r   N)r   r   r5   r   r   r   r   r5   r   r   c                   t        t        j                  j                  t        j
                  ddd      }|4t        dt         d       t        |t              r|n|j                  |_	        |t        dt         d       ||_
        |t        dt         d       ||_        |t        d	t         d	       ||_        ||j                  |       t        j                  | fd
|i| y)ah  Initialize the MLX Whisper STT service.

        Args:
            model: The MLX Whisper model to use for transcription. Can be an MLXModel enum or string.

                .. deprecated:: 0.0.105
                    Use ``settings=WhisperMLXSTTSettings(model=...)`` instead.

            no_speech_prob: Probability threshold for filtering out non-speech segments.

                .. deprecated:: 0.0.105
                    Use ``settings=WhisperMLXSTTSettings(no_speech_prob=...)`` instead.

            language: The default language for transcription.

                .. deprecated:: 0.0.105
                    Use ``settings=WhisperMLXSTTSettings(language=...)`` instead.

            temperature: Temperature for sampling. Can be a float or tuple of floats.

                .. deprecated:: 0.0.105
                    Use ``settings=WhisperMLXSTTSettings(temperature=...)`` instead.

            settings: Runtime-updatable settings. When provided alongside deprecated
                parameters, ``settings`` values take precedence.
            **kwargs: Additional arguments passed to SegmentedSTTService.
        g333333?        mlx)r   r5   r   r   r   Nr   r   r5   r   r   )r   r1   r%   r   r   r]   r   r   r   r   r   r5   r   r   r   r   )r   r   r   r5   r   r   r   r   s           r/   r   zWhisperSTTServiceMLX.__init__  s    L 1--%%[[
 "7,A7K.8.DU%++"%"#35JL\].<+":/DjQ(0%""=2GW+6(
 ))(3 	$$	
%	
 	
r.   c                      y)z7MLX Whisper loads models on demand, so this is a no-op.Nr-   r   s    r/   r   zWhisperSTTServiceMLX._load  s     	r.   r   r   c                    K   ywr   r-   r   s       r/   r   z*WhisperSTTServiceMLX._handle_transcription  r   r   r   r6   c                V  K   	 ddl }| j                          d{    t        j                  |t        j                        j                  t        j                        dz  }t        j                  |j                  || j                  j                  | j                  j                  | j                  j                         d{   }d}|j                  dg       D ]Z  }|j                  dd      d	k(  r|j                  d
d      | j                  j                  k  sC||j                  dd       dz  }\ t!        |j#                               dk(  rd}| j%                          d{    |r| j'                  |d| j                  j                         d{    t)        j*                  d| d       t-        || j.                  t1               | j                  j                         yy7 7 27 7 ^# t2        $ r}t5        d|        Y d}~yd}~ww xY ww)a  Transcribe audio data using MLX Whisper.

        The audio is expected to be 16-bit signed PCM data.
        MLX Whisper will handle the conversion internally.

        Args:
            audio: Raw audio bytes in 16-bit PCM format.

        Yields:
            Frame: Either a TranscriptionFrame containing the transcribed text
                  or an ErrorFrame if transcription fails.
        r   Nr   r   )path_or_hf_repor   r5   r   r   compression_ratiogrq?r   r   r   r   Tr   r   zUnknown error occurred: )r   )mlx_whisperr   r   r   r   r   r   r   r   r   r   r   r   r5   getr   lenstripr   r   r   r   r   r   r   	Exceptionr   )r   r   r   r   chunkr   r   r   s           r/   r   zWhisperSTTServiceMLX.run_stt  s    (	C//111 --RXX>EEbjjQT[[K!++&& $ 4 4 NN6600 E D 99Z4 :;;2D9=OO;;/58U8UUw{{6267q99D: 4::< A%..00000tT^^=T=TUUU/vQ78(MM$&NN++	  5 2
& 1 V  	C%=aS#ABBB	Cs~   H)H G7B,H G:	AH $AH /G=00H  G?!AH 5H)7H :H =H ?H 	H&
H!H)!H&&H)rz   )r!   r"   r#   r$   r   r   r   r   r   r1   r   r   r   r
   r   r   r   r   r   r   r   r   r-   r.   r/   r   r   y  s     %H$$
 +/*.'+'+48G
 h'G
 !	G

 8$G
 e_G
 01G
V   NR)-9A(9K  5C5 5C^E4K-H 5C 5Cr.   r   )2r$   r   dataclassesr   r   enumr   typingr   r   numpyr   logurur   typing_extensionsr	   r
   pipecat.frames.framesr   r   r   pipecat.services.settingsr   r   r   r   pipecat.services.stt_servicer   pipecat.transcriptions.languager   r   pipecat.utils.timer   (pipecat.utils.tracing.service_decoratorsr   r   r   r   r   r   r   r   r   r1   r   rv   rx   r   r   r   r-   r.   r/   <module>r      su    (  +   5 G G _ _ < F / ?0/0AD A<Bt B0KH8 KH KH\ Q Q Q GK G Gk+ k\WC, WCs
  0{1#&'^_*1#.//0  0{1#&'bc*1#.//0s0   "C )D D
2C<<DE 	2D;;E 