
    qiI                         d Z ddlZddlmZmZ ddlmZmZ ddl	m
Z
mZmZmZmZ  ej                  e      Z G d de      Zy)	zZ
Sequence feature extraction class for common feature extractors to preprocess sequences.
    N   )is_valid_audio
load_audio)BatchFeatureFeatureExtractionMixin)PaddingStrategy
TensorTypeis_torch_tensorloggingto_numpyc                       e Zd ZdZdededef fdZ	 	 	 	 	 	 ddeee   z  e	e
ef   z  e	e
ee   f   z  ee	e
ef      z  dee
z  ez  d	edz  d
ededz  dedz  de
ez  dz  defdZdej                  ddfde	e
ej"                  f   ez  d	edz  dededz  dedz  de	fdZ	 	 	 dde	e
ej"                  f   ez  d	edz  dedz  d
edz  fdZddZde
ee
   z  eee
      z  fdZ xZS )SequenceFeatureExtractora  
    This is a general feature extraction class for speech recognition.

    Args:
        feature_size (`int`):
            The feature dimension of the extracted features.
        sampling_rate (`int`):
            The sampling rate at which the audio files should be digitalized expressed in hertz (Hz).
        padding_value (`float`):
            The value that is used to fill the padding values / vectors.
    feature_sizesampling_ratepadding_valuec                     || _         || _        || _        |j                  dd      | _        |j                  dd      | _        t        |   di | y )Npadding_siderightreturn_attention_maskT )r   r   r   popr   r   super__init__)selfr   r   r   kwargs	__class__s        `/opt/pipecat/venv/lib/python3.12/site-packages/transformers/feature_extraction_sequence_utils.pyr   z!SequenceFeatureExtractor.__init__)   sR    (**"JJ~w?%+ZZ0G%N""6"    Nprocessed_featurespadding
max_length
truncationpad_to_multiple_ofr   return_tensorsreturnc           
      x    t        |t        t        f      rMt        |d   t        t        f      r4|d   j                         D 	ci c]  }||D 	cg c]  }	|	|   	 c}	 }}}	 j                  d   |vr5t        d j                  d    dt        |j                                      | j                  d      }
||n j                  }t        |
      dk(  r	|rg |d<   |S |
d   }t        |t        t        f      r@d}t        |
|         dk(  r|dz  }t        |
|         dk(  r|t        |
      k  r|
|   d   }|[t        |      rd}nMt        |t        t        t        t        t        j                  f      rd}nt        d| d	t        |       d
      |j!                         D ]I  \  }}t        |d   t        t        f      rt#        |      ||<   .|D cg c]  }t#        |       c}||<   K  j%                  ||      }| j                  d      }
t        |
      t'        fd|j)                         D              st        d      g }t+              D ]N  }|j!                         D ci c]  \  }}|||    }}} j-                  ||||      }|j/                  |       P |t0        j2                  k(  r$t5         fd|D              }t0        j6                  }i }t+              D ]  } j9                  ||   ||||      }|j!                         D ]p  \  }}||vrg ||<   |j:                  t        j:                  t        j<                        u r|j?                  t        j@                        }||   j/                  |       r  t	        ||      S c c}	w c c}	}w c c}w c c}}w )a2  
        Pad input values / input vectors or a batch of input values / input vectors up to predefined length or to the
        max sequence length in the batch.

        Padding side (left/right) padding values are defined at the feature extractor level (with `self.padding_side`,
        `self.padding_value`)

        <Tip>

        If the `processed_features` passed are dictionary of numpy arrays or PyTorch tensors  the
        result will use the same type unless you provide a different tensor type with `return_tensors`. In the case of
        PyTorch tensors, you will lose the specific device of your tensors however.

        </Tip>

        Args:
            processed_features ([`BatchFeature`], list of [`BatchFeature`], `dict[str, list[float]]`, `dict[str, list[list[float]]` or `list[dict[str, list[float]]]`):
                Processed inputs. Can represent one input ([`BatchFeature`] or `dict[str, list[float]]`) or a batch of
                input values / vectors (list of [`BatchFeature`], *dict[str, list[list[float]]]* or *list[dict[str,
                list[float]]]*) so you can use this method during preprocessing as well as in a PyTorch Dataloader
                collate function.

                Instead of `list[float]` you can have tensors (numpy arrays or PyTorch tensors),
                see the note above for the return type.
            padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`):
                Select a strategy to pad the returned sequences (according to the model's padding side and padding
                index) among:

                - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
                  sequence if provided).
                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
                  acceptable input length for the model if that argument is not provided.
                - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
                  lengths).
            max_length (`int`, *optional*):
                Maximum length of the returned list and optionally padding length (see above).
            truncation (`bool`):
                Activates truncation to cut input sequences longer than `max_length` to `max_length`.
            pad_to_multiple_of (`int`, *optional*):
                If set will pad the sequence to a multiple of the provided value.

                This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
                `>= 7.5` (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128.
            return_attention_mask (`bool`, *optional*):
                Whether to return the attention mask. If left to the default, will return the attention mask according
                to the specific feature_extractor's default.

                [What are attention masks?](../glossary#attention-mask)
            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors instead of list of python integers. Acceptable values are:

                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return Numpy `np.ndarray` objects.
        r   zYou should supply an instance of `transformers.BatchFeature` or list of `transformers.BatchFeature` to this method that includes z, but you provided attention_maskr   ptnpztype of z
 unknown: z6. Should be one of a python, numpy, or pytorch object.)r    r!   c              3   :   K   | ]  }t        |      k(    y w)N)len).0v
batch_sizes     r   	<genexpr>z/SequenceFeatureExtractor.pad.<locals>.<genexpr>   s     MA3q6Z'Ms   zLSome items in the output dictionary have a different batch size than others.)r!   r#   r"   c              3   T   K   | ]  }t        |j                  d              ! yw)r   N)r+   model_input_names)r,   input_slicer   s     r   r/   z/SequenceFeatureExtractor.pad.<locals>.<genexpr>   s&     m[ST-C-CA-F!GHms   %()r!   padding_strategyr#   r   )tensor_type)!
isinstancelisttupledictr   keysr1   
ValueErrorr   r+   r
   intfloatr)   ndarraytypeitemsr   _get_padding_strategiesallvaluesrange	_truncateappendr   LONGESTmax
MAX_LENGTH_paddtypefloat64astypefloat32)r   r   r    r!   r"   r#   r   r$   keyexamplerequired_inputfirst_elementindexvaluer-   r3   truncated_inputsikinputsinputs_slicebatch_outputsoutputsr.   s   `                      @r   padzSequenceFeatureExtractor.pad3   s	   L (4-8ZHZ[\H]`dfr_s=t RddeQfQkQkQm"JM2DEwgclEE" "
 !!!$,>>1151G1G1J0K L+002346  ,D,B,B1,EF%:%F!DLfLf 	 ~!#$79"#34%% 'q)mdE]3EnU+,1
 nU+,1s>** .u 5a 8!}-!%MCeRZZ+PQ!% }oZ]8K7L MK K 
 -224 	GJC%(S%L1*25/"3'@E*F18A;*F"3'		G  77T^7_+D,B,B1,EF(
M1C1J1J1LMMkllz" 		2A*<*B*B*DE$!Qa1gEFE>>%#5%	 * L ##L1		2 666m\lmmJ.99z" 	1Aii #%!1#5&;   G &mmo 1
Um+)+M#&;;"((2::"66!LL4Ec"))%01	1" M~FF F"d +G Fs$   
N+N&N+N19N6&N+r3   c                    || j                   d      }|t        j                  k(  rt        |      }||||z  dk7  r||z  dz   |z  }|t        j                  k7  xr t        |      |k  }|r5d|vr1t        j                  t        |      t
        j                        |d<   |r)|t        |      z
  }| j                  dk(  rn|rt        j                  |d   d|f      |d<   | j                  dkD  rd|fdfnd|f}	t        j                  ||	d| j                        || j                   d   <   |S | j                  d	k(  rn|rt        j                  |d   |df      |d<   | j                  dkD  r|dfdfn|df}	t        j                  ||	d| j                        || j                   d   <   |S t        d
t        | j                        z         |S )a  
        Pad inputs (on left/right and up to predefined length or max length in the batch)

        Args:
            processed_features (`Union[dict[str, np.ndarray], BatchFeature]`):
                Dictionary of input values (`np.ndarray[float]`) / input vectors (`list[np.ndarray[float]]`) or batch
                of inputs values (`list[np.ndarray[int]]`) / input vectors (`list[np.ndarray[int]]`)
            max_length (`int`, *optional*):
                Maximum length of the returned list and optionally padding length (see below)
            padding_strategy (`PaddingStrategy`, *optional*, default to `PaddingStrategy.DO_NOT_PAD`):
                PaddingStrategy to use for padding.

                - PaddingStrategy.LONGEST Pad to the longest sequence in the batch
                - PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
                - PaddingStrategy.DO_NOT_PAD: Do not pad
                The feature_extractor padding sides are defined in self.padding_side:

                    - 'left': pads on the left of the sequences
                    - 'right': pads on the right of the sequences
            pad_to_multiple_of (`int`, *optional*):
                Integer if set will pad the sequence to a multiple of the provided value. This is especially useful to
                enable the use of Tensor Core on NVIDIA hardware with compute capability `>= 7.5` (Volta), or on TPUs
                which benefit from having sequence lengths be a multiple of 128.
            return_attention_mask (`bool`, *optional*):
                Set to False to avoid returning attention mask (default: set to model specifics)
        r   r   r'   )rJ   r   )r   r   constant)constant_valuesleftzInvalid padding strategy:)r1   r   rF   r+   
DO_NOT_PADr)   onesint32r   r[   r   r   r:   str)
r   r   r!   r3   r#   r   rP   needs_to_be_padded
differencepadding_shapes
             r   rI   zSequenceFeatureExtractor._pad   s   D ,D,B,B1,EF666^,J!&8&D*WiJimnJn%);;q@DVVJ-1K1KKpPSTbPcfpPp %5=O%O35773~;NVXV^V^3_/0#c.&99J  G+(;=66*+;<q*o<&'78 >B=N=NQR=R!Z& 9YZ\fXg@B"M:tOaOaA"4#9#9!#<= "! ""f,(;=66*+;<z1o<&'78 >B=N=NQR=R*a& 9YcefXg@B"M:tOaOaA"4#9#9!#<= "! !!<s4CTCT?U!UVV!!r   c                    |s|S |r|t        d      || j                  d      }||||z  dk7  r||z  dz   |z  }t        |      |kD  }|r4|| j                  d      d| || j                  d   <   d|v r|d   d| |d<   |S )a  
        Truncate inputs to predefined length or max length in the batch

        Args:
            processed_features(`Union[dict[str, np.ndarray], BatchFeature]`):
                Dictionary of input values (`np.ndarray[float]`) / input vectors (`list[np.ndarray[float]]`) or batch
                of inputs values (`list[np.ndarray[int]]`) / input vectors (`list[np.ndarray[int]]`)
            max_length (`int`, *optional*):
                maximum length of the returned list and optionally padding length (see below)
            pad_to_multiple_of (`int`, *optional*) :
                Integer if set will pad the sequence to a multiple of the provided value. This is especially useful to
                enable the use of Tensor Core on NVIDIA hardware with compute capability `>= 7.5` (Volta), or on TPUs
                which benefit from having sequence lengths be a multiple of 128.
            truncation (`bool`, *optional*):
                Activates truncation to cut input sequences longer than `max_length` to `max_length`.
        NzKWhen setting ``truncation=True``, make sure that ``max_length`` is defined.r   r   r'   )r:   r1   r+   )r   r   r!   r#   r"   rP   needs_to_be_truncateds          r   rD   z"SequenceFeatureExtractor._truncate%  s    . %%J.jkk+D,B,B1,EF !&8&D*WiJimnJn%);;q@DVVJ #N 3j @ <NtOeOefgOh<ijuku<vt55a89#557IJZ7[\g]g7h"#34!!r   c                 n   |durD|du rt         j                  }n?t        |t               st        |      }n#t        |t               r|}nt         j                  }|0t         j                  k(  rt        dt         j                   d      t         j                  k7  r| j                  t        d      |S )z3
        Find the correct padding strategy
        FTzWhen setting ``padding=z(``, make sure that max_length is definedzAsking to pad but the feature_extractor does not have a padding value. Please select a value to use as `padding_value`. For example: `feature_extractor.padding_value = 0.0`.)r   rF   r5   r`   rH   r:   r   )r   r    r!   r3   s       r   r@   z0SequenceFeatureExtractor._get_padding_strategiesP  s     %$#2#:#: 9#27#; G_5#* .99 ?#=#== -o.H.H-IIqr 
 999t?Q?Q?Y] 
  r   audio_url_or_urlsc                     t        |t              r|D cg c]  }| j                  |       c}S t        |t              rt	        |      S t        |      r|S t        dt        |             c c}w )z
        Convert a single or a list of urls into the corresponding `np.ndarray` objects.

        If a single url is passed, the return value will be a single object. If a list is passed a list of objects is
        returned.
        z=only a single or a list of entries is supported but got type=)r5   r6   fetch_audiorc   r   r   	TypeErrorr>   )r   rj   xs      r   rl   z$SequenceFeatureExtractor.fetch_audiop  sr     '.1BCAD$$Q'CC)3//00-.$$[\`ar\s[tuvv Ds   A/)TNFNNN)NNN)FN)__name__
__module____qualname____doc__r;   r<   r   r   r6   r8   rc   boolr   r	   r[   r`   r)   r=   rI   rD   r@   rl   __classcell__)r   s   @r   r   r      s   
#S # #U #" 15!% )--126hG(
|

sL 
!" sD&&
'( tC%&
'	(hG o-hG $JhG hG  $JhG  $d{hG j(4/hG 
hGZ "&,;,F,F)--1F" bjj1L@F" $JF" *	F"
  $JF"  $d{F" 
F"V "&)-"&)" bjj1L@)" $J)"  $J	)"
 4K)"V @wS49_tDI-N wr   r   )rr   numpyr)   audio_utilsr   r   feature_extraction_utilsr   r   utilsr   r	   r
   r   r   
get_loggerro   loggerr   r   r   r   <module>r{      s@     3 J R R 
		H	%bw5 bwr   