
    qi3@                        d Z ddlZddlZddlmZmZ ddlmZmZm	Z	 ddl
mZ ddlmZ ddlmZmZmZmZmZmZmZ ddlmZ dd	lmZmZmZmZ dd
lmZ ddlm Z  ddl!m"Z"m#Z# ddl$m%Z% ddl&m'Z' 	 ddl(m)Z* ddl+m,Z, dZ1de"de	e2   fdZ3e G d de             Z4 G d de       Z5y# e-$ r7Z. ej^                  de.         ej^                  d        e0de.       dZ.[.ww xY w)zGradium's speech-to-text service implementation.

This module provides integration with Gradium's real-time speech-to-text
WebSocket API for streaming audio transcription.
    N)	dataclassfield)AnyAsyncGeneratorOptional)logger)	BaseModel)CancelFrameEndFrameFrame
StartFrameTranscriptionFrameVADUserStartedSpeakingFrameVADUserStoppedSpeakingFrame)FrameDirection)	NOT_GIVENSTTSettings	_NotGiven_warn_deprecated_param)GRADIUM_TTFS_P99)WebsocketSTTService)Languageresolve_language)time_now_iso8601)
traced_stt)connect)StatezException: zIIn order to use Gradium, you need to `pip install "pipecat-ai[gradium]"`.zMissing module: i]  languagereturnc           
          t         j                  dt         j                  dt         j                  dt         j                  dt         j
                  di}t        | |d      S )zConvert a Language enum to Gradium's language code format.

    Args:
        language: The Language enum value to convert.

    Returns:
        The Gradium language code string or None if not supported.
    deenesfrptT)use_base_code)r   DEENESFRPTr   )r   LANGUAGE_MAPs     N/opt/pipecat/venv/lib/python3.12/site-packages/pipecat/services/gradium/stt.pylanguage_to_gradium_languager.   1   sI     	TTTTTL Hl$GG    c                   <    e Zd ZU dZ ed       Zee   ez  e	d<   y)GradiumSTTSettingsa^  Settings for GradiumSTTService.

    Parameters:
        delay_in_frames: Delay in audio frames (80ms each) before text is
            generated. Higher delays allow more context but increase latency.
            Allowed values: 7, 8, 10, 12, 14, 16, 20, 24, 36, 48.
            Default is 10 (800ms). Lower values like 7-8 give faster response.
    c                      t         S N)r    r/   r-   <lambda>zGradiumSTTSettings.<lambda>P   s    y r/   )default_factorydelay_in_framesN)
__name__
__module____qualname____doc__r   r7   r   intr   __annotations__r4   r/   r-   r1   r1   E   s"     27GX1YOXc]Y.Yr/   r1   c                   ~    e Zd ZU dZeZeed<    G d de      Zdddde	dde
d	e
d
ee   dee
   dee   dee   f fdZdefdZdedee
ef   f fdZdef fdZdef fdZdef fdZdedef fdZd Zdedeedf   fdZ e!de
dede"fd       Z# fd Z$d! Z% fd"Z&d# Z'd$ Z(d% Z)d& Z*d' Z+d( Z,d)e
fd*Z- xZ.S )+GradiumSTTServicezGradium real-time speech-to-text service.

    Provides real-time speech transcription using Gradium's WebSocket API.
    Supports both interim and final transcriptions with configurable parameters
    for audio processing and connection management.
    	_settingsc                   :    e Zd ZU dZdZee   ed<   dZee	   ed<   y)GradiumSTTService.InputParamsa  Configuration parameters for Gradium STT API.

        .. deprecated:: 0.0.105
            Use ``settings=GradiumSTTSettings(...)`` instead.

        Parameters:
            language: Expected language of the audio (e.g., "en", "es", "fr").
                This helps ground the model to a specific language and improve
                transcription quality.
            delay_in_frames: Delay in audio frames (80ms each) before text is
                generated. Higher delays allow more context but increase latency.
                Allowed values: 7, 8, 10, 12, 14, 16, 20, 24, 36, 48.
                Default is 10 (800ms). Lower values like 7-8 give faster response.
        Nr   r7   )
r8   r9   r:   r;   r   r   r   r=   r7   r<   r4   r/   r-   InputParamsrB   ^   s&    	 (,(8$+)-#-r/   rC   z&wss://eu.api.gradium.ai/api/speech/asrN)api_endpoint_base_urlparamsjson_configsettingsttfs_p99_latencyapi_keyrD   rE   rF   rG   rH   c                   |ddl }|j                  dt        d       t        ddd      }	|@t	        dt               |s.|j
                  |	_        |j                  |j                  |	_        ||	j                  |       t        
| $  d
t        ||	d| || _        || _        d| _        || _        d| _        t!               | _        d	| _        d| _        d| _        d| _        y)a  Initialize the Gradium STT service.

        Args:
            api_key: Gradium API key for authentication.
            api_endpoint_base_url: WebSocket endpoint URL. Defaults to Gradium's streaming endpoint.
            params: Configuration parameters for language and delay settings.

                .. deprecated:: 0.0.105
                    Use ``settings=GradiumSTTSettings(...)`` instead.

            json_config: Optional JSON configuration string for additional model settings.

                .. deprecated:: 0.0.101
                    Use `params` instead for type-safe configuration.

            settings: Runtime-updatable settings. When provided alongside deprecated
                parameters, ``settings`` values take precedence.
            ttfs_p99_latency: P99 latency from speech end to final transcript in seconds.
                Override for your deployment. See https://github.com/pipecat-ai/stt-benchmark
            **kwargs: Additional arguments passed to parent STTService class.
        Nr   zdParameter 'json_config' is deprecated and will be removed in a future version, use 'params' instead.   )
stacklevel)modelr   r7   rE   )sample_raterH   rG   P   r4   )warningswarnDeprecationWarningr1   r   r   r7   apply_updatesuper__init__SAMPLE_RATE_api_key_api_endpoint_base_url
_websocket_json_config_receive_task	bytearray_audio_buffer_chunk_size_ms_chunk_size_bytes_delay_in_frames_frame_size)selfrI   rD   rE   rF   rG   rH   kwargsrP   default_settings	__class__s             r-   rU   zGradiumSTTService.__init__q   s   @ "MMv"   . 
 "8-?@,2OO )))57=7M7M$4 ))(3 	
#-%	
 		
  &;#'!&[ !" !"r/   r   c                      y)zzCheck if the service can generate metrics.

        Returns:
            True if metrics generation is supported.
        Tr4   rb   s    r-   can_generate_metricsz&GradiumSTTService.can_generate_metrics   s     r/   deltac                    K   t         |   |       d{   }|s|S | j                  r0| j                          d{    | j	                          d{    |S 7 F7  7 
w)zApply a settings delta, sync params, and reconnect.

        Args:
            delta: A :class:`STTSettings` (or ``GradiumSTTSettings``) delta.

        Returns:
            Dict mapping changed field names to their previous values.
        N)rT   _update_settingsrY   _disconnect_connect)rb   ri   changedre   s      r-   rk   z"GradiumSTTService._update_settings   s_      077N??""$$$--/!! 8
 %!s1   A#A'A#AA#A!A#A#!A#framec                    K   t         |   |       d{    t        | j                  | j                  z  dz  dz        | _        | j                          d{    y7 J7 w)zmStart the speech-to-text service.

        Args:
            frame: Start frame to begin processing.
        NrK   i  )rT   startr<   r^   rN   r_   rm   rb   ro   re   s     r-   rq   zGradiumSTTService.start   s\      gmE"""!$T%8%84;K;K%Ka%ORV%V!Wmmo 	#s"   A%A!AA%A#A%#A%c                 t   K   t         |   |       d{    | j                          d{    y7 7 w)ziStop the speech-to-text service.

        Args:
            frame: End frame to stop processing.
        N)rT   stoprl   rr   s     r-   rt   zGradiumSTTService.stop   s6      gl5!!!    	"    848688c                 t   K   t         |   |       d{    | j                          d{    y7 7 w)zoCancel the speech-to-text service.

        Args:
            frame: Cancel frame to abort processing.
        N)rT   cancelrl   rr   s     r-   rw   zGradiumSTTService.cancel   s6      gnU###    	$ ru   	directionc                    K   t         |   ||       d{    t        |t              r| j	                          d{    yt        |t
              r| j                          d{    yy7 W7 17 
w)a  Process frames with VAD-specific handling.

        When VAD detects the user has stopped speaking, we flush the transcription
        by sending silence frames. This makes the system more reactive by getting
        the final transcription faster without closing the connection.

        Args:
            frame: The frame to process.
            direction: The direction of frame processing.
        N)rT   process_frame
isinstancer   start_processing_metricsr   _flush_transcription)rb   ro   rx   re   s      r-   rz   zGradiumSTTService.process_frame   sl      g#E9555e89//111:;++--- <	 	6 2-s2   A5A/'A5A1 (A5(A3)A51A53A5c                   K   | j                   r&| j                   j                  t        j                  ury| j                  dk  rt        j                  d       yt        | j                  dz        }t        j                  |      j                  d      }t        j                  d| j                   d       t        | j                        D ]>  }d|d	}	 | j                   j                  t        j                  |             d{    @ y7 # t         $ r#}t        j"                  d
|        Y d}~ yd}~ww xY ww)a  Flush the transcription by sending silence frames.

        When VAD detects the user stopped speaking, we send delay_in_frames
        chunks of silence (zeros) to flush the remaining audio from the model's
        buffer. This allows for faster turn-around without closing the connection.

        From Gradium docs: "feed in delay_in_frames chunks of silence (vectors
        of zeros). If those are fed in faster than realtime, the API also has
        a possibility to process them faster."
        Nr   z&No delay_in_frames set, skipping flushrK   utf-8zFlushing Gradium STT with z silence framesaudiotyper   zFailed to send silence frame: )rY   stater   OPENr`   r   debugbytesra   base64	b64encodedecoderangesendjsondumps	Exceptionwarning)rb   silence_bytessilence_b64_msges         r-   r}   z&GradiumSTTService._flush_transcription  s     $//"7"7uzz"I  A%LLAB d..23&&}5<<WE1$2G2G1HXYt,,- 	A"[9Coo**4::c?;;;	 < !?sCDsB   CE1D	D
DED	D?D:4E:D??Er   c                |  K   | j                   j                  |       t        | j                         | j                  k\  rt	        | j                   d| j                         }| j                   | j                  d | _         t        j                  |      j                  d      }d|d}| j                  r\| j                  j                  t        j                  u r6| j                  j                  t        j                  |             d{    t        | j                         | j                  k\  rd y7 -w)zProcess audio data for speech-to-text conversion.

        Args:
            audio: Raw audio bytes to process.

        Yields:
            None (processing handled via WebSocket messages).
        Nr   r   r   )r]   extendlenr_   r   r   r   r   rY   r   r   r   r   r   r   )rb   r   chunkr   s       r-   run_sttzGradiumSTTService.run_stt/  s      	!!%($$$%)?)??$,,-Et/E/EFGE!%!3!3D4J4J4L!MD$$U+227;E"U3C4??#8#8EJJ#Foo**4::c?;;; $$$%)?)?? 
 <s   D
D<D:&D<4D<
transcriptis_finalr   c                    K   yw)z'Record transcription event for tracing.Nr4   )rb   r   r   r   s       r-   _trace_transcriptionz&GradiumSTTService._trace_transcriptionD  s      	s   c                   K   t         |           d {    | j                          d {    | j                  r=| j                  s0| j                  | j                  | j                              | _        y y y 7 f7 Pwr3   )rT   rm   _connect_websocketrY   r[   create_task_receive_task_handler_report_errorrb   re   s    r-   rm   zGradiumSTTService._connectI  sm     g   %%'''??4#5#5!%!1!1$2L2LTM_M_2`!aD $6?	 	!'s    B A<B A>AB >B c                   K   	 | j                   r'| j                   j                  t        j                  u ry t	        j
                  d       | j                  }| j                  dd}t        ||       d {   | _         | j                  d       d {    ddd}i }| j                  rt        j                  | j                        }| j                  j                  r&t        | j                  j                        }|r||d	<   | j                  j                   r| j                  j                   |d
<   |r||d<   | j                   j#                  t        j$                  |             d {    | j                   j'                          d {   }t        j                  |      }|d   dk(  rt)        d|d          |d   dk7  rt)        d|d          |j+                  d
d      | _        |j+                  dd      | _        t	        j
                  d| j,                   d| j.                   d       y 7 7 7 7 # t(        $ r%}| j1                  d| |       d {  7    d }~ww xY ww)NzConnecting to Gradium STTpipecat)z	x-api-keyzx-api-source)additional_headerson_connectedsetuppcm)r   input_formatr   r7   rF   r   errorzreceived error messagereadyzunexpected first message type r   
frame_sizei  z*Connected to Gradium STT (delay_in_frames=z, frame_size=)Unknown error occurred: 	error_msg	exception)rY   r   r   r   r   r   rX   rW   websocket_connect_call_event_handlerrZ   r   loadsr@   r   r.   r7   r   r   recvr   getr`   ra   
push_error)rb   ws_urlheaders	setup_msgrF   gradium_language	ready_msgr   s           r-   r   z$GradiumSTTService._connect_websocketQ  sE    2	4??#8#8EJJ#FLL4500F!]] )G %6#*% DO **>::: %I
 K  "jj):):;~~&&#?@W@W#X #.>K
+~~--151O1O-.+6	-(//&&tzz)'<==="oo2244I

9-I G+/)I2F1G HII G+"@6AR@S TUU %.MM2CQ$GD!(}}\4@DLL<T=R=R<S T"../q2A ;" >4  	//.Fqc,JVW/XXX	s   I72I I7A I 7H<8I H?CI /I0!I IB)I ;I7<I ?I I I 	I4I/(I+)I//I44I7c                    K   t         |           d {    | j                  r*| j                  | j                         d {    d | _        | j	                          d {    y 7 S7 &7 	wr3   )rT   rl   r[   cancel_task_disconnect_websocketr   s    r-   rl   zGradiumSTTService._disconnect  sf     g!###""4#5#5666!%D((*** 	$ 7 	+s3   A/A).A/A+A/#A-$A/+A/-A/c                   K   	 | j                   r]| j                   j                  t        j                  u r7t	        j
                  d       | j                   j                          d {    d | _         | j                  d       d {    y 7 %# t        $ r)}| j                  d| |       d {  7   Y d }~Pd }~ww xY w7 <# d | _         | j                  d       d {  7   w xY ww)NzDisconnecting from Gradium STTr   r   on_disconnected)
rY   r   r   r   r   r   closer   r   r   )rb   r   s     r-   r   z'GradiumSTTService._disconnect_websocket  s     	>4??#8#8EJJ#F=>oo++--- #DO**+<=== . 	Y//.Fqc,JVW/XXX	Y > #DO**+<===sw   C-A$B (B)B -C-C	C-B 	CB=2B53B=8C =CC C-C*#C&$C**C-c                 H    | j                   r| j                   S t        d      )NzWebsocket not connected)rY   r   rg   s    r-   _get_websocketz GradiumSTTService._get_websocket  s    ????"122r/   c                 
  K   | j                         2 3 d {   }	 t        j                  |      }| j                  |       d {    97 47 # t        j                  $ r t        j                  d|        Y jw xY w6 y w)NzReceived non-JSON message: )r   r   r   _process_responseJSONDecodeErrorr   r   )rb   r   datas      r-   _process_messagesz#GradiumSTTService._process_messages  s     !002 	H 	H'Hzz'*,,T222	H 3'' H!<WIFGH	 3sT   BBABB)AAABBA+A>;B=A>>Bc                    K   	 | j                          d {    t        j                  |  d       | j                          d {    I7 57 w)Nz= Gradium connection was disconnected (timeout?), reconnecting)r   r   r   r   rg   s    r-   _receive_messagesz#GradiumSTTService._receive_messages  sM     ((***LLD6!^_`))+++ *+s!   AA/AAAAc                   K   |j                  dd      }|dk(  r| j                  |d          d {    y |dk(  r| j                          d {    y |dk(  r| j                  d|        d {    y y 7 G7 +7 
w)Nr    textend_of_streamr   zError: )r   )r   _handle_text_handle_end_of_streamr   )rb   r   type_s      r-   r   z#GradiumSTTService._process_response  s     #F?##CK000o%,,...g//gcUO/<<<  1.<s3   /A?A9A?A;"A?2A=3A?;A?=A?c                 6   K   t        j                  d       yw)zHandle termination message.z*Received end_of_stream message from serverN)r   r   rg   s    r-   r   z'GradiumSTTService._handle_end_of_stream  s     ABs   r   c                    K   | j                  t        || j                  t                            d{    | j	                  |dd       d{    | j                          d{    y7 97 7 	w)zHandle transcription results.NT)r   r   )
push_framer   _user_idr   r   stop_processing_metrics)rb   r   s     r-   r   zGradiumSTTService._handle_text  ss     oo "
 	
 	
 ''td'KKK**,,,	
 	L,s3   2A4A.A4A0A4(A2)A40A42A4)/r8   r9   r:   r;   r1   Settingsr=   r	   rC   r   strr   floatrU   boolrh   r   dictr   rk   r   rq   r   rt   r
   rw   r   r   rz   r}   r   r   r   r   r   r   rm   r   rl   r   r   r   r   r   r   r   __classcell__)re   s   @r-   r?   r?   S   sv    "H!!.i .. &N(,%)15,<S S  #	S
 %S c]S -.S #5/Sjd K DcN $ ! !!+ !. .> .$B5 ^E4K-H * S D T\  b3j+	>3
H,=C
-s 
-r/   r?   )6r;   r   r   dataclassesr   r   typingr   r   r   logurur   pydanticr	   pipecat.frames.framesr
   r   r   r   r   r   r   "pipecat.processors.frame_processorr   pipecat.services.settingsr   r   r   r   pipecat.services.stt_latencyr   pipecat.services.stt_servicer   pipecat.transcriptions.languager   r   pipecat.utils.timer   (pipecat.utils.tracing.service_decoratorsr   websockets.asyncio.clientr   r   websockets.protocolr   ModuleNotFoundErrorr   r   r   rV   r   r.   r1   r?   r4   r/   r-   <module>r      s      ( 0 0     > _ _ 9 < F / ?,F) H8 H H( 
Z 
Z 
Zq-+ q-U  ,FLL;qc"#FLL\]
&qc*
++,s   .B# #C(2CC