
    qi                     F    d Z ddlmZ ddlmZ e G d de             ZdgZy)z$Speech processor class for SpeechT5.   )ProcessorMixin)auto_docstringc                   4     e Zd Z fdZed        Zd Z xZS )SpeechT5Processorc                 &    t         |   ||       y )N)super__init__)selffeature_extractor	tokenizer	__class__s      b/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/speecht5/processing_speecht5.pyr	   zSpeechT5Processor.__init__   s    *I6    c                 <   |j                  dd       }|j                  dd       }|j                  dd       }|j                  dd       }|j                  dd       }||t        d      ||t        d      ||||t        d      | | j                  |g|d|i|}n| | j                  |fi |}nd }| | j                  |||d	|}	|	d
   }
n| | j                  |fi |}	|	d   }
nd }	||	S |	
|d<   |	j	                  d      }|||d<   |S )Naudiotexttext_targetaudio_targetsampling_ratez\Cannot process both `audio` and `text` inputs. Did you mean `audio_target` or `text_target`?z\Cannot process both `audio_target` and `text_target` inputs. Did you mean `audio` or `text`?zaYou need to specify either an `audio`, `audio_target`, `text`, or `text_target` input to process.)r   r   input_values	input_idslabelsattention_maskdecoder_attention_mask)pop
ValueErrorr   r   get)r
   argskwargsr   r   r   r   r   inputstargetsr   r   s               r   __call__zSpeechT5Processor.__call__   s   

7D)zz&$'jj5zz.$7

?D9!1n  #(?n  =\1dl{GZs  +T++E`D``Y_`F#T^^D3F3FF#,d,,,]juntuG^,F$$dnn[;F;G[)FG>N%F8%,[[1A%B"%13I/0r   c                    |j                  dd      }|j                  dd      }|j                  dd      }||t        d      |||t        d      |! | j                  j                  |g|i |}n"| | j                  j                  |fi |}nd}|d|v st        |t              r*d|d   v r# | j                  j                  |fi |}|d   }nt| j                  j                  }| j                  j                  | j                  _         | j                  j                  |g|i |}|| j                  _        |d   }nd}||S |||d<   |j                  d      }	|	|	|d	<   |S )
au  
        Collates the audio and text inputs, as well as their targets, into a padded batch.

        Audio inputs are padded by SpeechT5FeatureExtractor's [`~SpeechT5FeatureExtractor.pad`]. Text inputs are padded
        by SpeechT5Tokenizer's [`~SpeechT5Tokenizer.pad`].

        Valid input combinations are:

        - `input_ids` only
        - `input_values` only
        - `labels` only, either log-mel spectrograms or text tokens
        - `input_ids` and log-mel spectrogram `labels`
        - `input_values` and text `labels`

        Please refer to the docstring of the above two methods for more information.
        r   Nr   r   z:Cannot process both `input_values` and `input_ids` inputs.zZYou need to specify either an `input_values`, `input_ids`, or `labels` input to be padded.    r   r   )
r   r   r   padr   
isinstancelistfeature_sizenum_mel_binsr   )
r
   r   r   r   r   r   r    r!   feature_size_hackr   s
             r   r%   zSpeechT5Processor.padJ   s   " zz.$7JJ{D1	Hd+#	(=YZZI$5&.l  #/T++//NtNvNF"'T^^''	<V<FFf$FD)AkU[\]U^F^,$..,,V>v> -$($:$:$G$G!6:6L6L6Y6Y&&34$0044VMdMfM6G&&3 0G>N%F8%,[[1A%B"%13I/0r   )__name__
__module____qualname__r	   r   r"   r%   __classcell__)r   s   @r   r   r      s"    7 . .`:r   r   N)__doc__processing_utilsr   utilsr   r   __all__ r   r   <module>r4      s9    + . # o o od 
r   