
    qi5                     
   d dl Z d dlmZ d dlmZ d dlZddlmZm	Z	m
Z
  e
       rd dlZ e	       rd dlZddlmZmZ ddlmZ ddlmZmZmZmZ dd	lmZmZ  G d
 ded      Z G d ded      Ze G d de             ZdgZy)    N)Path)Any   )auto_docstringis_soundfile_availableis_torch_available)
AudioInputmake_list_of_audio)BatchFeature)AudioKwargsProcessingKwargsProcessorMixinUnpack)PreTokenizedInput	TextInputc                   ,    e Zd ZU dZeeef   dz  ed<   y)CsmAudioKwargsa  
    encoded_length_kwargs (`dict[str, Any]`, *optional*):
        Dictionary of keyword arguments used to compute the encoded audio sequence length. This includes parameters
        such as `kernel_sizes`, `strides`, `dilations`, and `use_causal_conv` that define the convolutional layers
        used in audio encoding. The encoded length is used to determine how many audio tokens to generate for each
        audio input in the text sequence.
    Nencoded_length_kwargs)__name__
__module____qualname____doc__dictstrr   __annotations__     X/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/csm/processing_csm.pyr   r   $   s      S>D00r   r   F)totalc                   L    e Zd ZU eed<   ddddg dg dg ddd	d
dddidZy)CsmProcessorKwargsaudio_kwargsTleftF)paddingpadding_sideadd_special_tokens)   r         r   r(   
   r   r(      r   r(      r      )r(   r(   r(   r-   r(   r(      r(   r(      r(   r(   r)   r(      )r(   r(   r(   r(   r(   r(   r(   r(   r(   r(   r(   r(   r(   r(   r(   )kernel_sizesstrides	dilationsuse_causal_convi]  )r   sampling_ratereturn_tensorspt)text_kwargsr"   common_kwargsN)r   r   r   r   r   	_defaultsr   r   r   r!   r!   0   sG       ""'
 !QHJ#'	& #
 +D1Ir   r!   c                        e Zd Z	 d fd	Zedd       Zdedeez  e	eez     z  de
e   fdZe	 	 	 ddeez  e	e   z  e	e   z  dz  dedz  d	edz  d
edz  de
e   f
d       Zed        Z xZS )CsmProcessorNc                    t        |d      s(d| _        |j                  | j                        | _        n"|j                  | _        |j                  | _        t        |d      s(d| _        |j                  | j                        | _        n"|j                  | _        |j
                  | _        t        |   |||       y )Naudio_tokenz	<|AUDIO|>audio_eos_tokenz<|audio_eos|>)chat_template)hasattrr>   convert_tokens_to_idsaudio_token_idr?   audio_eos_token_idsuper__init__)selffeature_extractor	tokenizerr@   	__class__s       r   rF   zCsmProcessor.__init__G   s     y-0*D"+"A"A$BRBR"SD(44D"+":":Dy"34#2D &/&E&EdFZFZ&[D##,#<#<D &/&B&BD#*I]Sr   c                 :   | }|||||S t        |||      D ]~  \  }}}|dz
  |z  dz   }	||z
  }
|
dz  }|
|z
  }||	z
  |
z   |z  dz   }t        j                  |      dz
  }||z  |z   |
z
  }||z
  }|r|
}|}n||z   }||z   |z   }|||dz
  z  z
  dz
  |z  dz   } |S )a|  
        Compute the length of the encoded audio sequence.

        Args:
            audio_length (int): The length of the audio sequence.
            kernel_sizes (list[int]): The kernel sizes for the convolutional layers.
            strides (list[int]): The strides for the convolutional layers.
            use_causal_conv (bool): Whether to use causal convolutions.
        r(   r0   )zipmathceil)audio_lengthr1   r2   r3   r4   
cur_lengthkernel_sizestridedilationeffective_kernel_sizepadding_totalpadding_rightpadding_leftn_framesideal_lengthextra_paddings                   r   _get_encoded_lengthz CsmProcessor._get_encoded_length]   s    "
7?i6G?Kb-0w	-R 	W)K%01_$@1$D!'&0M)Q.M(=8L"%::]JfTWXXHyy*Q.H#f,{:]JL(:5M, - - =#l2]BJ$x;?'CCaGFRUVVJ%	W( r   audiosaving_pathkwargsc                 F   t               st        d      t        |      }t        |t        t
        f      r|g}n3t        |t        t        f      rt        d |D              st        d      t        |      t        |      k7  rt        d       | j                  t        fi |}|d   }|d   }t        ||      D ]b  \  }}t        |t        j                        r,|j!                         j#                         j%                         }t'        j(                  |||       d y )Nz/Please install `soundfile` to save audio files.c              3   H   K   | ]  }t        |t        t        f        y wN)
isinstancer   r   ).0ps     r   	<genexpr>z*CsmProcessor.save_audio.<locals>.<genexpr>   s     @q`aAPSUY{A[@qs    "zAInvalid input path. Please provide a string, or a list of stringsz5The number of audio and saving paths must be the samer"   r5   )r   ImportErrorr
   rb   r   r   listtupleall
ValueErrorlen_merge_kwargsr!   rL   torchTensorcpufloatnumpysfwrite)	rG   r\   r]   r^   output_kwargsr"   r5   audio_valuerd   s	            r   
save_audiozCsmProcessor.save_audio   s    &'OPP #5) kC;/&-K[4-8S@qep@q=q`aau:[))TUU***

 %^4$_5!%5 	4NK+u||4)oo/557==?HHQ]3	4r   textoutput_labelsdepth_decoder_labels_ratioc                 ^
    | j                   t        fd| j                  j                  i|}|d   }|d   }|j	                  dd      }	|	dk7  r"t        | j                  j                   d      t        |t              r|g}n3t        |t        t        f      rt        d |D              st        d	      |D 
cg c]  }
|
j                  | j                         }}
d
}|t        |      }t!        |      }t#        |      d
kD  r-|t#        |      k7  r|t        d      t        d| d| d      ||j%                  di       }|D cg c]"  } | j&                  |j(                  d   fi |$ }}|j+                         }g }|D ]  }g }| j                  |v r]|j%                  d
      }| j                  |z  }|j-                  |       |j/                  | j                  dd      }| j                  |v r]d|v r'|j/                  d|j%                  d
      d      }d|v r'|j-                  |        |} | j                  |fi |}i }|j1                  |       ||j%                  dd       g g }}d
}|D ]  }|d
k(  rJ|j-                  t3        j4                  d
             |j-                  t7        j8                  dg             S|j-                  t3        j:                  ||||z    D cg c]<  }t        |t6        j<                        r|j?                         jA                         n|> c}d             |j-                  t7        j8                  ||||z    D cg c]  }|j(                  d    c}      jC                  d             ||z  }!  | jD                  |fi |}|j%                  dd       |j1                  |       tG        d |D              }|D cg c]@  }t6        jH                  jJ                  jM                  |d
||j(                  d   z
  fd      B }}t7        jN                  |d
      |d<   |r|d   | jP                  k(  jS                         }|j(                  d
   } |dk  r-t7        jT                  |       dtW        | d|z
  z         }!||!   }"n|}"t7        jX                  |d   | jP                  k(  |d   | jZ                  k(  z  |d   d      }#d|#|"ddd
f   |"dddf   f<   |#|d<   t]        ||	      S c c}
w c c}w c c}w c c}w c c}w ) a  
        output_labels (bool, *optional*, default=False):
            Whether to return labels for training. Indices will be in `[config.audio_token_id, -100, -101]`.
            - `config.audio_token_id` indicates an audio frame (considering sequence length elements as frames)
            - `-100` will be ignored in the loss computation
            - `-101` indicates the audio frame will be used only for the backbone model (using the first codebook token as labels)
        depth_decoder_labels_ratio (float, *optional*, default=1.0):
            The ratio of audio frames to keep for the depth decoder labels.

        Returns:
            [`BatchFeature`]: A [`BatchFeature`] with the following fields:

            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
            - **input_values** -- List of audio values to be fed to a model. Returned when `audio` is not `None`.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
              `None`).
            - **labels** -- List of labels for the audio frames. Returned when `output_labels=True`.
        tokenizer_init_kwargsr8   r"   r6   Nr7   z% only supports `return_tensors='pt'`.c              3   <   K   | ]  }t        |t                y wra   )rb   r   )rc   ts     r   re   z(CsmProcessor.__call__.<locals>.<genexpr>   s     9[QR*Q:L9[s   zAInvalid input text. Please provide a string, or a list of stringsr   z@No audio were provided, but there are audio tokens in the promptz)The number of audio tokens in each text (z7) should be the same as the number of provided audios (z).r   z<placeholder>r(   return_attention_mask)axis)dimpadding_maskc              3   :   K   | ]  }|j                   d      yw)r~   N)shape)rc   cut_idxss     r   re   z(CsmProcessor.__call__.<locals>.<genexpr>  s     R(..,Rs   )valueinput_values_cutoffs	input_ids      ?iilabels)datatensor_type)/rl   r!   rI   init_kwargsgetrj   rJ   r   rb   r   rg   rh   ri   countr>   r
   rk   sumpopr[   r   copyappendreplaceupdatenpzerosrm   tensorconcatenatern   ro   rq   cumsumrH   maxnn
functionalpadstackrC   nonzerorandpermintwhererD   r   )$rG   rw   r\   rx   ry   r^   rt   r8   r"   r6   r}   n_audio_in_textn_audior   audio_arraynum_audio_tokens_listnum_audio_tokens_list_copyexpanded_textsamplereplace_strnum_audio_tokensexpanded_audio_tokenencodingr   concatenated_audior   offsetelaudio_inputsmax_lenr   audio_frame_idxsn_audio_frames	rand_idxsskip_frames_idxsr   s$                                       r   __call__zCsmProcessor.__call__   s   : +**
"&.."<"<
 
 $M2$^4$)94@T! 7 788]^__dC 6DTD%=1c9[VZ9[6[`aa>BC1774#3#34CC&u-E%jG!#33G(G} !cdd ??P Q229"> 
 $0$4$45Lb$Q!lq%]h((():):2)>XBWX%! % *?)C)C)E& M - &&&0'A'E'Ea'H$+/+;+;>N+N(&&';<#^^D,<,<oqQF &&&0 &/#^^O[__Q=OQRSF &/$$V,- !D!4>>$6+6H4d;792 4F* &a<&--bhhqk:(//bT0BC&-- +0'9I*J$& 5?r5<<4P 0VX X "$ )//U6FU\L\=]%^rbhhrl%^_ffkmfn g%F#&& 24112DUUL^T2KK% R=QRRG !5$ ##''1gr@R6R2S[]'^$  $ ,1;;7KQR+SD'( $[ 1T5H5H HQQS-33A6N)S0!NN>:;sSSTWqSqAr=st	#3I#> #3 [[k"d&9&99d;>OSWSjSj>jk[!F
 FJF#AqD)+;AqD+AAB#DN>BBG D$%L &_$s    <"T'T5AT 'T%,AT*c                     | j                   j                  }| j                  j                  }|D cg c]
  }|dk7  s	| }}t        ||z   dgz         S c c}w )Nr   r   )rI   model_input_namesrH   rg   )rG   tokenizer_input_namesfeature_extractor_input_namesnames       r   r   zCsmProcessor.model_input_names7  se     $ @ @(,(>(>(P(P% ;X(r$[_cq[q(r%(r),IIMcLddee )ss
   
AAra   )NNNN)NFr   )r   r   r   rF   staticmethodr[   r	   r   r   rg   r   r!   rv   r   r   r   boolrp   r   propertyr   __classcell__)rJ   s   @r   r<   r<   E   s    	T, # #J 4 4 4Z$sTz"22 4 +,	 4D  $(%*36OC++d9o=EV@WWZ^^OC D OC d{	OC
 %*DLOC +,OC OCb f fr   r<   )rM   pathlibr   typingr   rq   r   utilsr   r   r   rm   	soundfilerr   audio_utilsr	   r
   feature_extraction_utilsr   processing_utilsr   r   r   r   tokenization_utils_baser   r   r   r!   r<   __all__r   r   r   <module>r      s        O O  9 4 U U C	1[ 	1) * yf> yf yfx 
r   