
    qi(                        d Z ddlmZmZ ddlmZmZ ddlmZ ddl	m
Z
 ddlmZ ddlmZmZmZ ddlmZmZmZmZ dd	lmZ dd
lmZ ddlmZmZ ddlmZ ddlm Z  e G d de             Z!dedee"   fdZ# G d de      Z$y)zBase class for Whisper-based speech-to-text services.

This module provides common functionality for services implementing the Whisper API
interface, including language mapping, metrics generation, and error handling.
    )	dataclassfield)AsyncGeneratorOptional)loggerAsyncOpenAI)Transcription)
ErrorFrameFrameTranscriptionFrame)	NOT_GIVENSTTSettings	_NotGiven_warn_deprecated_param)WHISPER_TTFS_P99)SegmentedSTTService)Languageresolve_language)time_now_iso8601)
traced_sttc                   f    e Zd ZU dZ ed       Zedz  ez  ed<    ed       Z	e
dz  ez  ed<   y)BaseWhisperSTTSettingszSettings for BaseWhisperSTTService.

    Parameters:
        prompt: Optional text to guide the model's style or continue
            a previous segment.
        temperature: Sampling temperature between 0 and 1.
    c                      t         S Nr        S/opt/pipecat/venv/lib/python3.12/site-packages/pipecat/services/whisper/base_stt.py<lambda>zBaseWhisperSTTSettings.<lambda>'   s    9 r   )default_factoryNpromptc                      t         S r   r   r   r   r   r    zBaseWhisperSTTSettings.<lambda>(   s    ) r   temperature)__name__
__module____qualname____doc__r   r"   strr   __annotations__r$   floatr   r   r   r   r      s;     &+;L%MFC$J"M,1BS,TK	)Tr   r   languagereturnc                    i t         j                  dt         j                  dt         j                  dt         j                  dt         j
                  dt         j                  dt         j                  dt         j                  dt         j                  d	t         j                  d
t         j                  dt         j                  dt         j                  dt         j                  dt         j                  dt         j                   dt         j"                  di t         j$                  dt         j&                  dt         j(                  dt         j*                  dt         j,                  dt         j.                  dt         j0                  dt         j2                  dt         j4                  dt         j6                  dt         j8                  dt         j:                  dt         j<                  dt         j>                  dt         j@                  d t         jB                  d!t         jD                  d"i t         jF                  d#t         jH                  d$t         jJ                  d%t         jL                  d&t         jN                  d't         jP                  d(t         jR                  d)t         jT                  d*t         jV                  d+t         jX                  d,t         jZ                  d-t         j\                  d.t         j^                  d/t         j`                  d0t         jb                  d1t         jd                  d2t         jf                  d3t         jh                  d4t         jj                  d5t         jl                  d6t         jn                  d7t         jp                  d8t         jr                  d9i}tu        | |d:;      S )<at  Maps pipecat Language enum to Whisper API language codes.

    Language support for Whisper API.
    Docs: https://platform.openai.com/docs/guides/speech-to-text#supported-languages

    Args:
        language: A Language enum value representing the input language.

    Returns:
        str or None: The corresponding Whisper language code, or None if not supported.
    afarhyazbebsbgcazhhrcsdanlenetfifrgldeelhehihuisiditjaknkkkolvltmkmsmrminenofaplptrorusrskslesswsvtltathtrukurvicyT)use_base_code);r   AFARHYAZBEBSBGCAZHHRCSDANLENETFIFRGLDEELHEHIHUISIDITJAKNKKKOLVLTMKMSMRMINENOFAPLPTRORUSRSKSLESSWSVTLTATHTRUKURVICYr   )r,   LANGUAGE_MAPs     r   language_to_whisper_languager   +   s   :T:T: 	T: 	T	:
 	T: 	T: 	T: 	T: 	T: 	T: 	T: 	T: 	T: 	T: 	T:  	T!:" 	T#:$ 	T%:& 	T':( 	T):* 	T+:, 	T-:. 	T/:0 	T1:2 	T3:4 	T5:6 	T7:8 	T9:: 	T;:< 	T=:> 	T?:@ 	TA:B 	TC:D 	TE:F 	TG:H 	TI:J 	TK:L 	TM:N 	TO:P 	TQ:R 	TS:T 	TU:V 	TW:X 	TY:Z 	T[:\ 	T]:^ 	T_:` 	Ta:b 	Tc:d 	Te:f 	Tg:h 	TTTTTTs:Lx Hl$GGr   c                   4    e Zd ZU dZeZeed<   ddddddddded
dee	   dee	   dee	   d	ee
   d
ee	   dee   dededee   dee   f fdZdee	   dee	   fdZdefdZd	e
dee	   fdZe	 dde	ded	ee
   fd       Zdedeedf   fdZdedefdZ xZS )BaseWhisperSTTServicezBase class for Whisper-based speech-to-text services.

    Provides common functionality for services implementing the Whisper API interface,
    including metrics generation and error handling.
    	_settingsNF)
modelapi_keybase_urlr,   r"   r$   include_prob_metricspush_empty_transcriptssettingsttfs_p99_latencyr   r   r   r,   r"   r$   r   r   r   r   c       
            t        dddd      }|t        dt         d       ||_        |'t        dt         d       | j                  |      |_        |t        dt         d       ||_        |t        dt         d       ||_        |	|j                  |	       t        | $  d|
|d| | j                  ||      | _        || _        || _        y)	a  Initialize the Whisper STT service.

        Args:
            model: Name of the Whisper model to use.

                .. deprecated:: 0.0.105
                    Use ``settings=BaseWhisperSTTSettings(model=...)`` instead.

            api_key: Service API key. Defaults to None.
            base_url: Service API base URL. Defaults to None.
            language: Language of the audio input.

                .. deprecated:: 0.0.105
                    Use ``settings=BaseWhisperSTTSettings(language=...)`` instead.

            prompt: Optional text to guide the model's style or continue a previous segment.

                .. deprecated:: 0.0.105
                    Use ``settings=BaseWhisperSTTSettings(prompt=...)`` instead.

            temperature: Sampling temperature between 0 and 1.

                .. deprecated:: 0.0.105
                    Use ``settings=BaseWhisperSTTSettings(temperature=...)`` instead.

            include_prob_metrics: If True, enables probability metrics in API response.
                Each service implements this differently (see child classes).
                Defaults to False.
            push_empty_transcripts: - If true, allow empty `TranscriptionFrame` frames to be
                pushed downstream instead of discarding them. This is intended for situations
                where VAD fires even though the user did not speak. In these cases, it is
                useful to know that nothing was transcribed so that the agent can resume
                speaking, instead of waiting longer for a transcription.
                Defaults to False.
            settings: Runtime-updatable settings. When provided alongside deprecated
                parameters, ``settings`` values take precedence.
            ttfs_p99_latency: P99 latency from speech end to final transcript in seconds.
                Override for your deployment. See https://github.com/pipecat-ai/stt-benchmark
            **kwargs: Additional arguments passed to SegmentedSTTService.
        N)r   r,   r"   r$   r   r,   r"   r$   )r   r   r   )r   r   r   language_to_service_languager,   r"   r$   apply_updatesuper__init___create_client_client_include_prob_metrics_push_empty_transcripts)selfr   r   r   r,   r"   r$   r   r   r   r   kwargsdefault_settings	__class__s                r   r   zBaseWhisperSTTService.__init__   s    p 2	
 "7,BGL%*"":/EzR(,(I(I((S%"8-CXN&,#""=2H-X+6(
 ))(3 	
-%	
 	

 **7H=%9"'=$r   c                     t        ||      S )N)r   r   r   )r   r   r   s      r   r   z$BaseWhisperSTTService._create_client   s    7X>>r   r-   c                      y)zWhether this service can generate processing metrics.

        Returns:
            bool: True, as this service supports metric generation.
        Tr   )r   s    r   can_generate_metricsz*BaseWhisperSTTService.can_generate_metrics   s     r   c                     t        |      S )zConvert from pipecat Language to service language code.

        Args:
            language: The Language enum value to convert.

        Returns:
            str or None: The corresponding service language code, or None if not supported.
        )r   )r   r,   s     r   r   z2BaseWhisperSTTService.language_to_service_language   s     ,H55r   
transcriptis_finalc                    K   yw)z+Handle a transcription result with tracing.Nr   )r   r   r   r,   s       r   _handle_transcriptionz+BaseWhisperSTTService._handle_transcription   s     
 	s   audioc                X  K   	 | j                          d{    | j                  |       d{   }| j                          d{    |j                  j	                         }|st        j                  d       |s| j                  rm| j                  |d| j                  j                         d{    t        j                  d| d       t        || j                  t               |       yy7 7 7 7 I# t        $ r}t!        d|        Y d}~yd}~ww xY ww)	a  Transcribe audio data to text.

        Args:
            audio: Raw audio data to transcribe.

        Yields:
            Frame: Either a TranscriptionFrame containing the transcribed text
                  or an ErrorFrame if transcription fails.
        Nz%Received empty transcription from APITzTranscription: [])resultzUnknown error occurred: )error)start_processing_metrics_transcribestop_processing_metricstextstripr   warningr   r   r   r,   debugr   _user_idr   	Exceptionr   )r   r   responser   es        r   run_sttzBaseWhisperSTTService.run_stt   s
    	C//111!--e44H..000==&&(DFGt3300tT^^=T=TUUU/vQ78(MM$&#	  4 240 V  	C%=aS#ABBB	Cst   D*D C:D C<D C>	A-D 6D 7AD 8D*:D <D >D  D 	D'D"D*"D''D*c                    K   t         w)a  Transcribe audio data to text.

        Args:
            audio: Raw audio data in WAV format.

        Returns:
            Transcription: Object containing the transcribed text.

        Raises:
            NotImplementedError: Must be implemented by subclasses.
        )NotImplementedError)r   r   s     r   r   z!BaseWhisperSTTService._transcribe  s      "!s   	r   )r%   r&   r'   r(   r   Settingsr*   r   r   r)   r   r+   boolr   r   r   r   r   r   bytesr   r   r   r
   r   __classcell__)r   s   @r   r   r   v   sr    &H%%
  $!%"&'+ $'+%*',59,<Z> }Z> #	Z>
 3-Z> 8$Z> Z> e_Z> #Z> !%Z> 12Z> #5/Z>x?hsm ?x} ?d 	6X 	6(3- 	6 NR)-9A(9K !C5 !C^E4K-H !CF"u " "r   r   N)%r(   dataclassesr   r   typingr   r   logurur   openair	   openai.types.audior
   pipecat.frames.framesr   r   r   pipecat.services.settingsr   r   r   r   pipecat.services.stt_latencyr   pipecat.services.stt_servicer   pipecat.transcriptions.languager   r   pipecat.utils.timer   (pipecat.utils.tracing.service_decoratorsr   r   r)   r   r   r   r   r   <module>r      s    ) +   , G G _ _ 9 < F / ? 
U[ 
U 
UHH8 HH HHVr"/ r"r   