
    qif                         d Z ddlZddlmZ ddlZddlmZ ddlm	Z	m
Z
 dZ	 ddlZ G d
 d      Z G d de	      Zy# e$ r7Z ej                  de         ej                  d        ed	e       dZ[ww xY w)zSilero Voice Activity Detection (VAD) implementation for Pipecat.

This module provides a VAD analyzer based on the Silero VAD ONNX model,
which can detect voice activity in audio streams with high accuracy.
Supports 8kHz and 16kHz sample rates.
    N)Optional)logger)VADAnalyzer	VADParamsg      @zException: zAIn order to use Silero VAD, you need to `pip install pipecat-ai`.zMissing module(s): c                   8    e Zd ZdZddZdefdZd	dZdefdZy)
SileroOnnxModelzONNX runtime wrapper for the Silero VAD model.

    Provides voice activity detection using the pre-trained Silero VAD model
    with ONNX runtime for efficient inference. Handles model state management
    and input validation for audio processing.
    c                     t        j                         }d|_        d|_        |r5dt        j                         v rt        j
                  |dg|      | _        nt        j
                  ||      | _        | j                          ddg| _        y)zInitialize the Silero ONNX model.

        Args:
            path: Path to the ONNX model file.
            force_onnx_cpu: Whether to force CPU execution provider.
           CPUExecutionProvider)	providerssess_options)r   @  >  N)	onnxruntimeSessionOptionsinter_op_num_threadsintra_op_num_threadsget_available_providersInferenceSessionsessionreset_statessample_rates)selfpathforce_onnx_cpuoptss       J/opt/pipecat/venv/lib/python3.12/site-packages/pipecat/audio/vad/silero.py__init__zSileroOnnxModel.__init__*   s     ))+$%!$%!48[8[8]]&77!7 8tDL '774PDL!5M    src                 n   t        j                  |      dk(  rt        j                  |d      }t        j                  |      dkD  rt        d|j	                                || j
                  vrt        d| j
                   d      |t        j                  |      d   z  dkD  rt        d      ||fS )	z)Validate and preprocess input audio data.r
   r      z*Too many dimensions for input audio chunk zSupported sampling rates: z (or multiple of 16000)g     @?@zInput audio chunk is too short)npndimexpand_dims
ValueErrordimr   shape)r   xr    s      r   _validate_inputzSileroOnnxModel._validate_input?   s    771:?q!$A771:>I!%%'STTT&&&,T->->,??VW  A&=>>"ur   c                     t        j                  d|dfd      | _        t        j                  |dfd      | _        d| _        d| _        y)zReset the internal model states.

        Args:
            batch_size: Batch size for state initialization. Defaults to 1.
        r"      float32dtyper   N)r#   zeros_state_context_last_sr_last_batch_size)r   
batch_sizes     r   r   zSileroOnnxModel.reset_statesO   sB     hh:s39E*a	B !r   c                    | j                  ||      \  }}|dk(  rdnd}t        j                  |      d   |k7  r%t        dt        j                  |      d    d      t        j                  |      d   }|dk(  rdnd	}| j                  s| j                  |       | j                  r | j                  |k7  r| j                  |       | j                  r | j                  |k7  r| j                  |       t        j                  | j                        d
   st        j                  ||fd      | _        t        j                  | j                  |fd
      }|dv rN|| j                  t        j                  |d      d}| j                  j                  d|      }|\  }}	|	| _
        n
t               |d| df   | _        || _        || _        |S )z*Process audio input through the VAD model.r         zProvided number of samples is z< (Supported values: 256 for 8000 sample rate, 512 for 16000)r   @       r
   r-   r.   )axis)r   r   int64)inputstater    N.)r*   r#   r(   r&   r4   r   r3   r2   r0   concatenater1   arrayr   run)
r   r)   r    num_samplesr5   context_size
ort_inputsort_outsoutr?   s
             r   __call__zSileroOnnxModel.__call__Z   s   $$Q+25[cc88A;r?k)0!R0AA}~  XXa[^
5[rb$$j)MM 3j)!!(=(=(Kj)xx&q)HHj,%?yQDMNNDMM1-A6#$t{{"((2U\B]^J||''j9H!JCDK,#}~-. *
r   N)T)r
   )	__name__
__module____qualname____doc__r   intr*   r   rH    r   r   r   r   "   s)    **S  	"%c %r   r   c                   h     e Zd ZdZddddee   dee   f fdZdef fdZdefd	Z	de
fd
Z xZS )SileroVADAnalyzera  Voice Activity Detection analyzer using the Silero VAD model.

    Implements VAD analysis using the pre-trained Silero ONNX model for
    accurate voice activity detection. Supports 8kHz and 16kHz sample rates
    with automatic model state management and periodic resets.
    Nsample_rateparamsrR   rS   c                   t         |   ||       t        j                  d       d}d}	 ddl}t        |j                  |      j                  |            }t        d	      | _        d| _        t        j                  d
       y# t        $ rm ddl	m
} 	 |j                  ||      5 }|}ddd       n# 1 sw Y   nxY wn8# t        $ r, t        |j                  |      j                  |            }Y nw xY wY w xY w)zInitialize the Silero VAD analyzer.

        Args:
            sample_rate: Audio sample rate (8000 or 16000 Hz). If None, will be set later.
            params: VAD parameters for detection thresholds and timing.
        rQ   zLoading Silero VAD model...zsilero_vad.onnxzpipecat.audio.vad.datar   N)	resourcesT)r   zLoaded Silero VAD)superr   r   debugimportlib_resourcesstrfilesjoinpathBaseException	importlibrU   r   r   _model_last_reset_time)	r   rR   rS   
model_namepackage_pathimpresourcesmodel_file_pathf	__class__s	           r   r   zSileroVADAnalyzer.__init__   s     	[@23&
/	]6!,"4"4\"B"K"KJ"WXO &odK !()  	];]!&&|Z@ (A&'O( ( (  ]"%l&8&8&F&O&OPZ&["\]	]sM   -B	 	C?C+B7.	C7C 	<CC?2C96C?8C99C?>C?c                 V    |dk7  r|dk7  rt        d| d      t        | 	  |       y)zSet the sample rate for audio processing.

        Args:
            sample_rate: Audio sample rate (must be 8000 or 16000 Hz).

        Raises:
            ValueError: If sample rate is not 8000 or 16000 Hz.
        r   r   z?Silero VAD sample rate needs to be 16000 or 8000 (sample rate: )N)r&   rV   set_sample_rate)r   rR   re   s     r   rh   z!SileroVADAnalyzer.set_sample_rate   s?     %K4$7QR]Q^^_`  	,r   returnc                 (    | j                   dk(  rdS dS )zGet the number of audio frames required for VAD analysis.

        Returns:
            Number of frames required (512 for 16kHz, 256 for 8kHz).
        r   r7   r8   )rR   )r   s    r   num_frames_requiredz%SileroVADAnalyzer.num_frames_required   s     &&%/s8S8r   c                    	 t        j                  |t         j                        }t        j                  |t         j                        j                  t         j                        dz  }| j                  || j                        d   }t        j                         }|| j                  z
  }|t        k\  r!| j
                  j                          || _        |S # t        $ r"}t        j                  d|        Y d}~yd}~ww xY w)zCalculate voice activity confidence for the given audio buffer.

        Args:
            buffer: Audio buffer to analyze.

        Returns:
            Voice confidence score between 0.0 and 1.0.
        r.   g      @r   z'Error analyzing audio with Silero VAD: N)r#   
frombufferint16astyper-   r^   rR   timer_   _MODEL_RESET_STATES_TIMEr   	Exceptionr   error)r   bufferaudio_int16audio_float32new_confidence	curr_time	diff_timees           r   voice_confidencez"SileroVADAnalyzer.voice_confidence   s    	--9KMM+RXXFMMbjjY\ccM![[8H8HI!LN 		I!D$9$99I44((*(1%!! 	LLB1#FG	s   CC 	D"C??D)rI   rJ   rK   rL   r   rM   r   r   rh   rk   floatr{   __classcell__)re   s   @r   rP   rP      sQ     8<[_ *x} *XiEX *J-3 - 9S 9% r   rP   )rL   rp   typingr   numpyr#   logurur   pipecat.audio.vad.vad_analyzerr   r   rq   r   ModuleNotFoundErrorrz   rs   rr   r   rP   rN   r   r   <module>r      s        A  /] ]@` `M  /FLL;qc"#FLLTU
)!-
../s   < A82A33A8