
    qi"                         d Z ddlmZ ddlmZmZmZmZ ddlZddl	m
Z
 ddlmZ ddlmZmZmZmZ ddlmZmZ dd	lmZ dd
lmZmZ ddlmZ dedee   fdZe G d de             Z G d de      Zy)zXTTS text-to-speech service implementation.

This module provides integration with Coqui XTTS streaming server for
text-to-speech synthesis using local Docker deployment.
    )	dataclass)AnyAsyncGeneratorDictOptionalN)logger)create_stream_resampler)
ErrorFrameFrame
StartFrameTTSAudioRawFrame)TTSSettings_warn_deprecated_param)
TTSService)Languageresolve_language)
traced_ttslanguagereturnc                 B   i t         j                  dt         j                  dt         j                  dt         j                  dt         j
                  dt         j                  dt         j                  dt         j                  dt         j                  d	t         j                  d
t         j                  dt         j                  dt         j                  dt         j                  dt         j                  dt         j                   d}t#        | |d      S )zConvert a Language enum to XTTS language code.

    Args:
        language: The Language enum value to convert.

    Returns:
        The corresponding XTTS language code, or None if not supported.
    csdeenesfrhihuitjakonlplptrutrzzh-cnT)use_base_code)r   CSDEENESFRHIHUITJAKONLPLPTRUTRZHr   )r   LANGUAGE_MAPs     K/opt/pipecat/venv/lib/python3.12/site-packages/pipecat/services/xtts/tts.pylanguage_to_xtts_languager9   '   s   TT 	T 	T	
 	T 	T 	T 	T 	T 	T 	T 	T 	T 	T 	T" 	W#L( Hl$GG    c                       e Zd ZdZy)XTTSTTSSettingszSettings for XTTSService.N)__name__
__module____qualname____doc__ r:   r8   r<   r<   G   s    #r:   r<   c                        e Zd ZU dZeZeed<   dej                  dddde	e
   de
dej                  ded	e	e   d
e	e   f fdZdefdZdede	e
   fdZdef fdZede
de
deedf   fd       Z xZS )XTTSServicezCoqui XTTS text-to-speech service.

    Provides text-to-speech synthesis using a locally running Coqui XTTS
    streaming server. Supports multiple languages and voice cloning through
    studio speakers configuration.
    	_settingsN)voice_idr   sample_ratesettingsrE   base_urlaiohttp_sessionr   rF   rG   c                
   t        dd| j                  |            }|t        dt         d       ||_        ||j	                  |       t        	|   d|dd|d| || _        d| _        || _	        t               | _        y)a  Initialize the XTTS service.

        Args:
            voice_id: ID of the voice/speaker to use for synthesis.

                .. deprecated:: 0.0.105
                    Use ``settings=XTTSTTSSettings(voice=...)`` instead.

            base_url: Base URL of the XTTS streaming server.
            aiohttp_session: HTTP session for making requests to the server.
            language: Language for synthesis. Defaults to English.
            sample_rate: Audio sample rate. If None, uses default.
            settings: Runtime-updatable settings. When provided alongside deprecated
                parameters, ``settings`` values take precedence.
            **kwargs: Additional arguments passed to parent TTSService.
        N)modelvoicer   rE   rL   T)rF   push_start_framepush_stop_framesrG   rA   )r<   language_to_service_languager   rL   apply_updatesuper__init__	_base_url_studio_speakers_aiohttp_sessionr	   
_resampler)
selfrE   rH   rI   r   rF   rG   kwargsdefault_settings	__class__s
            r8   rR   zXTTSService.__init__Y   s    8 +66x@
 ":H%-"
 ))(3 	
#!!%		

 	
 ":> /13r:   r   c                      y)zCheck if this service can generate processing metrics.

        Returns:
            True, as XTTS service supports metrics generation.
        TrA   )rW   s    r8   can_generate_metricsz XTTSService.can_generate_metrics   s     r:   c                     t        |      S )zConvert a Language enum to XTTS service language format.

        Args:
            language: The language to convert.

        Returns:
            The XTTS-specific language code, or None if not supported.
        )r9   )rW   r   s     r8   rO   z(XTTSService.language_to_service_language   s     )22r:   framec                    K   t         |   |       d{    | j                  ry| j                  j	                  | j
                  dz         4 d{   }|j                  dk7  rU|j                          d{   }| j                  d|j                   d| d       d{    	 ddd      d{    y|j                          d{   | _        ddd      d{    y7 7 7 s7 J7 ;7 $7 # 1 d{  7  sw Y   yxY ww)zStart the XTTS service and load studio speakers.

        Args:
            frame: The start frame containing initialization parameters.
        Nz/studio_speakers   z'Error getting studio speakers (status: 	, error: ))	error_msg)
rQ   startrT   rU   getrS   statustext
push_errorjson)rW   r^   rrg   rZ   s       r8   rd   zXTTSService.start   s     gmE"""  ((,,T^^>P-PQ 	3 	3UVxx3VVX~oo GzQZ[_Z``ab &    	3 	3 	3 +,&&(ND!	3 	3 	3 	#
	3%	3 %3	3 	3 	3 	3s   DC+=DC-D#C9;C/<*C9&C1'C9,D7C38D=C9C5	C9D%C7&D-D/C91C93D5C97D9D?D DDrg   
context_idc                  K   t        j                  |  d| d       | j                  st        j                  |  d       y| j                  | j                  j
                     }| j                  dz   }|j                  dd      j                  dd      | j                  j                  |d	   |d
   ddd}| j                  j                  ||      4 d{   }|j                  dk7  rJ|j                          d{   }t        d|j                   d| d       	 ddd      d{    y| j                  |       d{    | j                  }t!               }|j"                  j%                  |      2 3 d{   }	t'        |	      dkD  s| j)                          d{    |j+                  |	       t'        |      dk\  sP|dd }
|dd }| j,                  j/                  t1        |
      d| j2                         d{   }t5        || j2                  d|      }| t'        |      dk\  ro7 h7 D7 7 7 7 7 A6 t'        |      dkD  rW| j,                  j/                  t1        |      d| j2                         d{  7  }t5        || j2                  d|      }| ddd      d{  7   y# 1 d{  7  sw Y   yxY ww)a  Generate speech from text using XTTS streaming server.

        Args:
            text: The text to synthesize into speech.
            context_id: The context ID for tracking audio frames.

        Yields:
            Frame: Audio frames containing the synthesized speech.
        z: Generating TTS []z no studio speakers availableNz/tts_stream. *speaker_embeddinggpt_cond_latentF   )rg   r   rq   rr   add_wav_headerstream_chunk_size)ri   r`   zError getting audio (status: ra   rb   )errorr   i  i]     )rk   )r   debugrT   rv   rD   rL   rS   replacer   rU   postrf   rg   r
   start_tts_usage_metrics
chunk_size	bytearraycontentiter_chunkedlenstop_ttfb_metricsextendrV   resamplebytesrF   r   )rW   rg   rk   
embeddingsurlpayloadrj   
CHUNK_SIZEbufferchunkprocess_dataresampled_audior^   s                r8   run_ttszXTTSService.run_tts   s     	v/vQ78$$LLD6!>?@**4>>+?+?@
nn}, LLb)11#r://!+,?!@)*;<#!#
 ((--c-@ ,	 ,	Axx3VVX~ )FqxxjPYZ^Y__`'abb	,	 ,	 ,	 ..t444J[F yy55jA $ $eu:>00222MM%( Fu, (.fu~!' 150H0H!,/8H8H1 + !1+T-=-=qZ! $ Fu,',	%,	 5
$2+!  B4 6{Q(,(@(@&M5$*:*:) # # )#T%5%5qZ Y,	 ,	 ,	 ,	 ,	s   CKIK#K
 I%K
&K1I2K7K
I
5K
III	K
K
,I-#K
=K
I0K
 K
KK
K
K
IK
K
AK
J#K
8KKK
KKKK)r=   r>   r?   r@   r<   Settings__annotations__r   r)   r   straiohttpClientSessionintrR   boolr\   rO   r   rd   r   r   r   r   __classcell__)rZ   s   @r8   rC   rC   N   s     H
 #' &[[%).2;4 3-;4 	;4
 !..;4 ;4 c];4 ?+;4zd 	3X 	3(3- 	33 3( I# I3 I>%QU+;V I Ir:   rC   ) r@   dataclassesr   typingr   r   r   r   r   logurur   pipecat.audio.utilsr	   pipecat.frames.framesr
   r   r   r   pipecat.services.settingsr   r   pipecat.services.tts_servicer   pipecat.transcriptions.languager   r   (pipecat.utils.tracing.service_decoratorsr   r   r9   r<   rC   rA   r:   r8   <module>r      s{    " 6 6   7  J 3 F ?H HXc] H@ 	k 	 	y* yr:   