
    qi/3                         d dl mZmZmZ ddlmZ ddlmZ ddlm	Z	 ddl
mZmZ ddlmZ  e	       rd d	lZdd
lmZ ddlmZ dZ G d ded      Z G d de      Zy	)    )Any	TypedDictoverload   )
AudioInput)GenerationConfig)is_torch_available)ChatChatType   )PipelineN)%MODEL_FOR_TEXT_TO_SPECTROGRAM_MAPPING)SpeechT5HifiGanzmicrosoft/speecht5_hifiganc                   &    e Zd ZU dZeed<   eed<   y)AudioOutputz
    audio (`AudioInput`):
        The generated audio waveform.
    sampling_rate (`int`):
        The sampling rate of the generated audio waveform.
    audiosampling_rateN)__name__
__module____qualname____doc__r   __annotations__int     V/opt/pipecat/venv/lib/python3.12/site-packages/transformers/pipelines/text_to_audio.pyr   r   !   s     r   r   F)totalc                   
    e Zd ZdZdZdZdZdZdZ e	d      Z
ddd fd
Zd	 Zd
 Zedededefd       Zedee   dedee   fd       Zedededefd       Zedee   dedee   fd       Z fdZ	 	 	 ddZd Z xZS )TextToAudioPipelinea  
    Text-to-audio generation pipeline using any `AutoModelForTextToWaveform` or `AutoModelForTextToSpectrogram`. This
    pipeline generates an audio file from an input text and optional other conditional inputs.

    Unless the model you're using explicitly sets these generation parameters in its configuration files
    (`generation_config.json`), the following default values will be used:
    - max_new_tokens: 256

    Example:

    ```python
    >>> from transformers import pipeline

    >>> pipe = pipeline(model="suno/bark-small")
    >>> output = pipe("Hey it's HuggingFace on the phone!")

    >>> audio = output["audio"]
    >>> sampling_rate = output["sampling_rate"]
    ```

    Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)

    <Tip>

    You can specify parameters passed to the model by using [`TextToAudioPipeline.__call__.forward_params`] or
    [`TextToAudioPipeline.__call__.generate_kwargs`].

    Example:

    ```python
    >>> from transformers import pipeline

    >>> music_generator = pipeline(task="text-to-audio", model="facebook/musicgen-small")

    >>> # diversify the music generation by adding randomness with a high temperature and set a maximum music length
    >>> generate_kwargs = {
    ...     "do_sample": True,
    ...     "temperature": 0.7,
    ...     "max_new_tokens": 35,
    ... }

    >>> outputs = music_generator("Techno music with high melodic riffs", generate_kwargs=generate_kwargs)
    ```

    </Tip>

    This pipeline can currently be loaded from [`pipeline`] using the following task identifiers: `"text-to-speech"` or
    `"text-to-audio"`.

    See the list of available models on [huggingface.co/models](https://huggingface.co/models?filter=text-to-speech).
    TNF   )max_new_tokens)vocoderr   c                   t        |   |i | d | _        | j                  j                  t        j                         v rE|<t        j                  t              j                  | j                  j                        n|| _        | j                  j                  j                  dv rd | _        || _        | j                  %| j                  j                  j                  | _        | j                  | j                  j                  }| j                  j                   j#                  dd       }||j%                  |j'                                dD ]H  }t)        ||d       }||| _        t)        |dd       (t)        |j*                  |d       }|B|| _        J | j                  J| j                  =t-        | j                  d      r&| j                  j.                  j                  | _        y y y y )N)musicgenspeecht5generation_config)sample_rater   codec_configfeature_extractor)super__init__r"   model	__class__r   valuesr   from_pretrainedDEFAULT_VOCODER_IDtodeviceconfig
model_type	processorr   __dict__getupdateto_dictgetattrr(   hasattrr)   )	selfr"   r   argskwargsr3   
gen_configsampling_rate_namer-   s	           r   r+   zTextToAudioPipeline.__init__m   s   $)&)::#H#O#O#QQ ?  //0BCFFtzzGXGXY L ::''+CC!DN*<<#!%!4!4!B!BD% ZZ&&F,,001DdKJ%j0023&F ;" '0BD I ,)6D&V^T:F$+F,?,?ASUY$ZM$0-:*; %$..*DQUQ_Q_atIu!%!A!A!O!OD Jv*D%r   c                    t        |t              r|g}| j                  j                  j                  dk(  rSd}t        | j                  d      r!t        | j                  j                  dd      }|dddd}|j                  |       |}| j                  | j                  n| j                  }t        |t              r" |j                  |j                  fddd|}|S | j                  j                  j                  d	k(  r%|D cg c]  }|j                  d
      sd| n| }}| j                  j                  j                  dk(  r%|D cg c]  }|j                  d
      sd| n| }} ||fi |ddi}|S c c}w c c}w )Nbarkr    semantic_configmax_input_semantic_lengthFT)
max_lengthadd_special_tokensreturn_attention_maskreturn_token_type_ids)tokenizereturn_dictcsm[z[0]diaz[S1] return_tensorspt)
isinstancestrr,   r3   r4   r;   r&   r:   rC   r8   r5   	tokenizerr
   apply_chat_templatemessages
startswith)r<   textr>   rE   
new_kwargspreprocessoroutputts           r   
preprocesszTextToAudioPipeline.preprocess   s   dC 6D::''61 Jt--/@A$T%;%;%K%KMhjmn
(&+)-).	J f%F)-)Ct~~dD!5\55  	F  zz  ++u4KOPac):#aS	APPzz  ++u4MQR1<<+<%s!CRR!$F&FFF QRs   E?Fc                 z   | j                  || j                        }|d   }|d   }| j                  j                         r| j                  || j                        }d|vr| j                  |d<   |j                  |       |j                  ddi       | j                  j                  j                  dv r	d|vrd|d<    | j                  j                  di ||}n>t        |      rt        d	|j                                 | j                  di ||d
   }| j                  | j                  |      }|S )N)r2   forward_paramsgenerate_kwargsr&   return_dict_in_generateT)rK   output_audiozYou're using the `TextToAudioPipeline` with a forward-only model, but `generate_kwargs` is non empty. For forward-only TTA models, please use `forward_params` instead of `generate_kwargs`. For reference, the `generate_kwargs` used here are: r   r   )_ensure_tensor_on_devicer2   r,   can_generater&   r8   r3   r4   generatelen
ValueErrorkeysr"   )r<   model_inputsr>   r]   r^   rY   s         r   _forwardzTextToAudioPipeline._forward   sT   ..vdkk.J 01 !23::""$";;OTXT_T_;`O #/97;7M7M 34 !!/2 !!#<d"CDzz  ++w6 "759N>2(TZZ((J<J>JF?# KKZK_K_KaJbd 
  TZZA,A.A!DF<<#\\&)Fr   text_inputsr]   returnc                      y Nr   r<   ri   r]   s      r   __call__zTextToAudioPipeline.__call__   s    PSr   c                      y rl   r   rm   s      r   rn   zTextToAudioPipeline.__call__   s    \_r   c                      y rl   r   rm   s      r   rn   zTextToAudioPipeline.__call__   s    UXr   c                      y rl   r   rm   s      r   rn   zTextToAudioPipeline.__call__   s    adr   c                 $    t        |   |fi |S )aL  
        Generates speech/audio from the inputs. See the [`TextToAudioPipeline`] documentation for more information.

        Args:
            text_inputs (`str`, `list[str]`, `ChatType`, or `list[ChatType]`):
                One or several texts to generate. If strings or a list of string are passed, this pipeline will
                generate the corresponding text. Alternatively, a "chat", in the form of a list of dicts with "role"
                and "content" keys, can be passed, or a list of such chats. When chats are passed, the model's chat
                template will be used to format them before passing them to the model.
            forward_params (`dict`, *optional*):
                Parameters passed to the model generation/forward method. `forward_params` are always passed to the
                underlying model.
            generate_kwargs (`dict`, *optional*):
                The dictionary of ad-hoc parametrization of `generate_config` to be used for the generation call. For a
                complete overview of generate, check the [following
                guide](https://huggingface.co/docs/transformers/en/main_classes/text_generation). `generate_kwargs` are
                only passed to the underlying model if the latter is a generative model.

        Return:
            `AudioOutput` or a list of `AudioOutput`, which is a `TypedDict` with two keys:

            - **audio** (`np.ndarray` of shape `(nb_channels, audio_length)`) -- The generated audio waveform.
            - **sampling_rate** (`int`) -- The sampling rate of the generated audio waveform.
        )r*   rn   )r<   ri   r]   r-   s      r   rn   zTextToAudioPipeline.__call__   s    2 w>~>>r   c                     t        | dd       | j                  |d<   t        | dd       | j                  |d<   | j                  |d<   |r|ni |r|ni d}|i }i }|||fS )Nassistant_modelassistant_tokenizerrR   )r]   r^   )r:   rt   rR   ru   )r<   preprocess_paramsr]   r^   paramspostprocess_paramss         r   _sanitize_parametersz(TextToAudioPipeline._sanitize_parameters  s     4*D1=151E1EO-.4.5A+/>>OK(595M5MO12 1?nB2Ar

 $ " &*<<<r   c                 X   d}t        |t              rd|v r|d   }nd}|d   }nt        |t              r|d   }|r'| j                  | j                  j	                  |      }t        |t
              r`|D cg c]?  }|j                  dt        j                        j                         j                         A }}t        |      dkD  r|n|d   }n=|j                  dt        j                        j                         j                         }t        || j                  	      S c c}w )
NFr   T	sequencesr   cpu)r2   dtyper   )r   r   )rP   dicttupler5   decodelistr1   torchfloatnumpysqueezerd   r   r   )r<   r   needs_decodingels       r   postprocesszTextToAudioPipeline.postprocess$  s   eT"%g!%k*u%!HEdnn8NN))%0EeT"X]^RTRUU%u{{U;AACKKM^E^ Z!^EqEHHEH=CCEMMOE,,
 	
 _s   7AD')NNN)r   r   r   r   _pipeline_calls_generate_load_processor_load_image_processor_load_feature_extractor_load_tokenizerr   _default_generation_configr+   r[   rh   r   rQ   r   r   rn   r   r   ry   r   __classcell__)r-   s   @r   r   r   -   s    2h  $O!#O "2" '+$ &PP%N(T SCS3S;S S_DI__kIZ_ _XHXXX XdDNdcddS^N_d d?: 	=.
r   r   )typingr   r   r   audio_utilsr   
generationr   utilsr	   utils.chat_template_utilsr
   r   baser   r   models.auto.modeling_autor   !models.speecht5.modeling_speecht5r   r0   r   r   r   r   r   <module>r      sO    , + $ ) & 6  QC1 	)5 	N
( N
r   