
    qiB+                         d dl Z d dlmZ d dlZd dlZddlmZmZm	Z	m
Z
mZ ddlmZmZ  e       rddlmZ  ej"                  e      Zded	ed
ej,                  fdZ e ed             G d de             Zy)    N)Any   )add_end_docstringsis_torch_availableis_torchaudio_availableis_torchcodec_availablelogging   )Pipelinebuild_pipeline_init_args),MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMESbpayloadsampling_ratereturnc                 z   | }d}d}dddd|d|d|d	d
ddg}	 t        j                  |t         j                  t         j                        }|j                  |       }|d   }t        j                  |t        j                        }	|	j                  d   dk(  rt	        d      |	S # t        $ r t	        d      w xY w)z?
    Helper function to read an audio file through ffmpeg.
    1f32leffmpegz-izpipe:0z-acz-arz-fz-hide_bannerz	-loglevelquietzpipe:1)stdinstdoutzFffmpeg was not found but is required to load audio files from filenamer   zMalformed soundfile)

subprocessPopenPIPEFileNotFoundError
ValueErrorcommunicatenp
frombufferfloat32shape)
r   r   aracformat_for_conversionffmpeg_commandffmpeg_processoutput_stream	out_bytesaudios
             ]/opt/pipecat/venv/lib/python3.12/site-packages/transformers/pipelines/audio_classification.pyffmpeg_readr+      s     ?B	B#

N c#)).
XbXgXgh #..x8Ma IMM)RZZ0E{{1~.//L  cabbcs   4B% %B:T)has_feature_extractorc            	            e Zd ZdZdZdZdZdZ fdZde	j                  ez  ez  ez  dedeeeef      f fdZdd	Zd
 Zd ZddZ xZS )AudioClassificationPipelinea  
    Audio classification pipeline using any `AutoModelForAudioClassification`. This pipeline predicts the class of a
    raw waveform or an audio file. In case of an audio file, ffmpeg should be installed to support multiple audio
    formats.

    Example:

    ```python
    >>> from transformers import pipeline

    >>> classifier = pipeline(model="superb/wav2vec2-base-superb-ks")
    >>> classifier("https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/1.flac")
    [{'score': 0.997, 'label': '_unknown_'}, {'score': 0.002, 'label': 'left'}, {'score': 0.0, 'label': 'yes'}, {'score': 0.0, 'label': 'down'}, {'score': 0.0, 'label': 'stop'}]
    ```

    Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)


    This pipeline can currently be loaded from [`pipeline`] using the following task identifier:
    `"audio-classification"`.

    See the list of available models on
    [huggingface.co/models](https://huggingface.co/models?filter=audio-classification).
    FTc                 ~    d|v r|d   d |d<   n	d|vrd|d<   t        |   |i | | j                  t               y )Ntop_k   )super__init__check_model_typer   )selfargskwargs	__class__s      r*   r3   z$AudioClassificationPipeline.__init__b   sO    f!8"F7OF"F7O$)&)JK    inputsr7   r   c                 $    t        |   |fi |S )a  
        Classify the sequence(s) given as inputs. See the [`AutomaticSpeechRecognitionPipeline`] documentation for more
        information.

        Args:
            inputs (`np.ndarray` or `bytes` or `str` or `dict`):
                The inputs is either :
                    - `str` that is the filename of the audio file, the file will be read at the correct sampling rate
                      to get the waveform using *ffmpeg*. This requires *ffmpeg* to be installed on the system.
                    - `bytes` it is supposed to be the content of an audio file and is interpreted by *ffmpeg* in the
                      same way.
                    - (`np.ndarray` of shape (n, ) of type `np.float32` or `np.float64`)
                        Raw audio at the correct sampling rate (no further check will be done)
                    - `dict` form can be used to pass raw audio sampled at arbitrary `sampling_rate` and let this
                      pipeline do the resampling. The dict must be either be in the format `{"sampling_rate": int,
                      "raw": np.array}`, or `{"sampling_rate": int, "array": np.array}`, where the key `"raw"` or
                      `"array"` is used to denote the raw audio waveform.
            top_k (`int`, *optional*, defaults to None):
                The number of top labels that will be returned by the pipeline. If the provided number is `None` or
                higher than the number of labels available in the model configuration, it will default to the number of
                labels.
            function_to_apply(`str`, *optional*, defaults to "softmax"):
                The function to apply to the model output. By default, the pipeline will apply the softmax function to
                the output of the model. Valid options: ["softmax", "sigmoid", "none"]. Note that passing Python's
                built-in `None` will default to "softmax", so you need to pass the string "none" to disable any
                post-processing.

        Return:
            A list of `dict` with the following keys:

            - **label** (`str`) -- The label predicted.
            - **score** (`float`) -- The corresponding probability.
        )r2   __call__)r5   r:   r7   r8   s      r*   r<   z$AudioClassificationPipeline.__call__l   s    D w1&11r9   c                 ,   i }|$| j                   j                  j                  |d<   nH|| j                   j                  j                  kD  r | j                   j                  j                  }||d<   ||dvrt        d| d      ||d<   nd|d<   i i |fS )Nr0   )softmaxsigmoidnonez'Invalid value for `function_to_apply`: z2. Valid options are ['softmax', 'sigmoid', 'none']function_to_applyr>   )modelconfig
num_labelsr   )r5   r0   rA   r7   postprocess_paramss        r*   _sanitize_parametersz0AudioClassificationPipeline._sanitize_parameters   s     =*.***;*;*F*Fw'tzz((333

))44*/w'( (FF =>O=P QG G  7H236?232)))r9   c                    t        |t              ri|j                  d      s|j                  d      r"t        j                  |d      j
                  }n%t        |d      5 }|j                         }d d d        t        |t              r t        || j                  j                        }t               r8dd l}t        ||j                        r|j                         j!                         }t#               rSdd l}dd l}t        ||j&                  j(                        r+|j+                         }|j,                  }||j.                  d}t        |t0              r|j3                         }d|v rd	|v sd
|v st5        d      |j7                  d	d       }|$|j7                  dd        |j7                  d
d       }|j7                  d      }|}|| j                  j                  k7  rdd l}t9               rddlm}	 nt?        d      |	jA                  t        |tB        jD                        r|jG                  |      n||| j                  j                        j!                         }t        |tB        jD                        stI        d      tK        |jL                        dk7  rt5        d      | j                  || j                  j                  d      }
| jN                  |
jQ                  | jN                        }
|
S # 1 sw Y   xY w)Nzhttp://zhttps://T)follow_redirectsrbr   )arrayr   r   rawrJ   zWhen passing a dictionary to AudioClassificationPipeline, the dict needs to contain a "raw" key containing the numpy array or torch tensor representing the audio and a "sampling_rate" key, containing the sampling_rate associated with that arraypath)
functionalztorchaudio is required to resample audio samples in AudioClassificationPipeline. The torchaudio package can be installed through: `pip install torchaudio`.z2We expect a numpy ndarray or torch tensor as inputr
   zFWe expect a single channel audio input for AudioClassificationPipelinept)r   return_tensors)dtype))
isinstancestr
startswithhttpxgetcontentopenreadbytesr+   feature_extractorr   r   torchTensorcpunumpyr   
torchcodecdecodersAudioDecoderget_all_samplesdatasample_ratedictcopyr   popr   
torchaudiorM   ImportErrorresampler   ndarray
from_numpy	TypeErrorlenr!   rP   to)r5   r:   fr[   r_   _audio_samples_array_inputsin_sampling_rateF	processeds              r*   
preprocessz&AudioClassificationPipeline.preprocess   s   fc"  +v/@/@/L 6DAII&$' &1VVXF& fe$ )?)?)M)MNF&%,,/++-"$&*"5"5"B"BC!'!7!7!9',,#)N<V<VWfd#[[]F $v-5F?gQWFW N  jj-G

64( **Wd3%zz/:F4#9#9#G#GG*,:%e 
 0:62::0NE$$V,TZ$**88 %'	  &"**-PQQv||!eff**$"8"8"F"FW[ + 
	 ::!!4::6IE& &s   !K??L	c                 *     | j                   di |}|S )N )rB   )r5   model_inputsmodel_outputss      r*   _forwardz$AudioClassificationPipeline._forward   s    "

2\2r9   c                    |dk(  r|j                   d   j                  d      }n2|dk(  r|j                   d   j                         }n|j                   d   }|j                  |      \  }}|j	                         }|j	                         }t        ||      D cg c]+  \  }}|| j                  j                  j                  |   d- }	}}|	S c c}}w )Nr>   r   r?   )scorelabel)	logitsr>   r?   topktolistziprB   rC   id2label)
r5   r{   r0   rA   probsscoresidsr   _idlabelss
             r*   postprocessz'AudioClassificationPipeline.postprocess   s    	)!((+33B7E)+!((+335E!((+Ejj'jjl_bcikn_opQ[QVX[EDJJ,=,=,F,Fs,KLpp qs   0C)NN)r1   r>   )__name__
__module____qualname____doc___load_processor_load_image_processor_load_feature_extractor_load_tokenizerr3   r   rk   rY   rR   re   r   listr<   rF   rw   r|   r   __classcell__)r8   s   @r*   r.   r.   B   s}    2 O!"OL"2rzzE1C7$> "2# "2RVW[\_ad\dWeRf "2H*,IVr9   r.   )r   typingr   rT   r^   r   utilsr   r   r   r   r	   baser   r   models.auto.modeling_autor   
get_loggerr   loggerrY   intrk   r+   r.   ry   r9   r*   <module>r      s        u u 4 X			H	%!% ! !

 !H ,4HI@( @ J@r9   