
    qi+Q                         d Z ddlZddlmZ ddlmZmZ ddlmZ ddl	m
Z
mZmZmZ ddlmZmZmZ  e       rddlZ e       rddlZ G d	 d
e
d      Z G d ded      Ze G d de             ZdgZy)zProcessor class for Dia    N)Path   )
AudioInputmake_list_of_audio)BatchFeature)AudioKwargsProcessingKwargsProcessorMixinUnpack)auto_docstringis_soundfile_availableis_torch_availablec                   J    e Zd ZU dZeed<   eed<   eed<   ee   ed<   eed<   y)DiaAudioKwargsa  
    bos_token_id (`int`, *optional*, defaults to `1026`):
        The token ID used as the beginning-of-sequence token for audio codebooks. This token is prepended to each
        audio sequence during encoding.
    eos_token_id (`int`, *optional*, defaults to `1024`):
        The token ID used as the end-of-sequence token for audio codebooks. This token is appended to audio sequences
        during training (when `generation=False`) to mark the end of the audio.
    pad_token_id (`int`, *optional*, defaults to `1025`):
        The token ID used for padding audio codebook sequences. This token is used to fill positions in the delay
        pattern where no valid audio token exists.
    delay_pattern (`list[int]`, *optional*, defaults to `[0, 8, 9, 10, 11, 12, 13, 14, 15]`):
        A list of delay values (in frames) for each codebook channel. The delay pattern creates temporal offsets
        between different codebook channels, allowing the model to capture dependencies across channels. Each value
        represents the number of frames to delay that specific channel.
    generation (`bool`, *optional*, defaults to `True`):
        Whether the processor is being used for generation (text-to-speech) or training. When `True`, the processor
        prepares inputs for generation mode where audio is generated from text. When `False`, it prepares inputs for
        training where both text and audio are provided.
    bos_token_ideos_token_idpad_token_iddelay_pattern
generationN)__name__
__module____qualname____doc__int__annotations__listbool     X/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/dia/processing_dia.pyr   r       s+    ( 9r   r   F)totalc                   B    e Zd ZU eed<   dddddddg d	dd
dddidZy)DiaProcessorKwargsaudio_kwargsTrightF)paddingpadding_sideadd_special_tokensi   i  i  )	r      	   
                  iD  )r   r   r   r   r   sampling_ratereturn_tensorspt)text_kwargsr$   common_kwargsN)r   r   r   r   r   	_defaultsr   r   r    r#   r#   <   sD       #"'
 !  >"
 d
Ir   r#   c                   p    e Zd ZdZ fdZe	 	 ddeee   z  dedz  de	dz  de
e   fd       Z	 dd	d
dedz  de
e   ded
   fdZ	 dd	d
dedz  de
e   dd
fdZdd
de
e   defdZdedeez  eeez     z  de
e   fdZe	 d dedededee   de	ded   fd       Zedd
dededed   dd
f
d       Z xZS )!DiaProcessorDacModelc                 *    t         |   |||       y)z
        audio_tokenizer (`DacModel`):
            An instance of [`DacModel`] used to encode/decode audio into/from codebooks. It is a required input.
        )audio_tokenizerN)super__init__)selffeature_extractor	tokenizerr;   	__class__s       r    r=   zDiaProcessor.__init__V   s    
 	*IWr   Ntextaudiooutput_labelskwargsc           
      `   t               st        d      |t        d       | j                  t        fi |}|d   }|d   }|j	                  dd      }|dk7  r"t        | j
                  j                   d      i }	t        |t              r|g}n3t        |t        t        f      rt        d	 |D              st        d
       | j                  |fi |}
|	j                  |
       |j                  dd      }|j                  dd      }|j                  dd      }|j                  dd      }|j                  dd      }||||t        d      |r|rt        d| d| d      |	d   j                  d   }t!        |      }t#        |      }|Qt%        |      } | j&                  |fi |}t)        j*                  | j,                  j.                  j0                        }|d   d   j                  d   |z  }g }g }t3        |d   |d         D ]  \  }}| j&                  j4                  }t)        j6                  |j9                  d      |z        |z  }||z  }||z
  }t;        j<                         5  |ddd|f   j?                  | j,                  j@                        }| j,                  jC                  |      jD                  jG                  dd      }ddd       |s-t:        jH                  jJ                  jM                  dd|       }t:        jH                  jJ                  jM                  dd|dz   dddfd|       }|dz   |z   }||rdndz  }t;        jN                  dg|z  dg|z  z   t:        jP                  !      dddf   }|jS                  |       |jS                  |        t;        jT                  |d      }t;        jT                  |d      }na|rTt;        jV                  |d|f|t:        jP                  !      }t;        jX                  |d|z   ft:        jP                  "      }nt        d#      ||j                  d   k7  rt        d$| d%|j                  d    d&      |j                  d   } | |z
  }!| j[                  || ||d'(      }"t;        jV                  || |f|t:        j\                  )      }#||#ddd|!f<   | j_                  |#|||"*      }$|	j                  |$|d+       |r|	d,   ja                         ddddf   }%d-|%|%|k(  <   d-|%|%|k(  <   |%jG                  dd      jc                  ||z  d      je                         jQ                         |	d.<   |	d,   ddddf   |	d,<   |	d/   ddddf   |	d/<   tg        |	|0      S # 1 sw Y   xY w)1a
  
        output_labels (`bool`, *optional*, defaults to `False`):
            Whether to return labels for training. When `True`, the processor generates labels from the decoder input
            sequence by shifting it by one position. Labels use special values: `-100` for tokens to ignore in loss
            computation (padding and BOS tokens), and `-101` for audio frames used only for the backbone model (when
            `depth_decoder_labels_ratio < 1.0`). Cannot be used together with `generation=True`.
        zThe `DiaProcessor` relies on the `audio_tokenizer` which requires `torch` but we couldn't find it in your environment. You can install torch via `pip install torch`.Nz0You need to specify the `text` input to process.r4   r$   r2   r3   z% only supports `return_tensors='pt'`.c              3   <   K   | ]  }t        |t                y wN)
isinstancestr).0ts     r    	<genexpr>z(DiaProcessor.__call__.<locals>.<genexpr>   s     9[QR*Q:L9[s   zAInvalid input text. Please provide a string, or a list of stringsr   r   r   r   r   TzTo enable processing for Dia, we need the `bos_token_id`, `eos_token_id`, `pad_token_id`, and `delay_pattern`. You may have accidentally overwritten one of those.z9Labels with `generation` is incompatible, got generation=z, output_labels=.	input_idsr   padding_maskinput_valuesdim.      )r   r   r   rU   r   r   constant)padmodevaluedtype)sizer\   z;If you try to train, you should provide audio data as well.zNNeed the same amount of samples for both text and audio, but got text samples=z and audio samples = z	 instead.Fbszseq_lennum_channelsr   revert)
fill_valuer\   rC   r   r   precomputed_idx)decoder_input_idsdecoder_attention_maskrf   ilabelsrg   )datatensor_type)4r   
ValueError_merge_kwargsr#   getrA   r   rI   rJ   r   tupleallr@   updatepopshapelenmaxr   r?   mathprodr;   configdownsampling_ratioszip
hop_lengthceilsumtorchno_gradtodeviceencodeaudio_codes	transposenn
functionalrX   tensorlongappendcatfullonesbuild_indicesr   apply_audio_delayclonereshape
contiguousr   )&r>   rB   rC   rD   rE   output_kwargsr4   r$   r2   ri   	encodingsr   audio_bos_token_idaudio_eos_token_idaudio_pad_token_idr   
batch_sizera   	max_delayinput_audioscompression_ratemax_encoded_sequence_lenrf   rg   rP   base_pad_lencurrent_audio_lenencoded_sequence_lenpadding_lenrO   num_valid_inputsattention_maskmax_seq_lenmax_audio_lenre   prefilldelayed_decoder_input_idsrh   s&                                         r    __call__zDiaProcessor.__call__]   sa    "#^ 
 <OPP***


 $M2$^4$)94@T! 7 788]^__ dC 6DTD%=1c9[VZ9[6[`aa"DNN47;7	I %(($?)--ndC)--ndC)--ndC!%%lD9
&!)!)$k 
 -KJ<Wghugvvwx  +&,,Q/
=)&	 &u-E1411%H<HL#yy)=)=)D)D)X)XY'3N'CA'F'L'LR'PTd'd$ "%'" (+<+GVdIe'f >#e#55@@$(IIl.>.>2.>.F.U$VYe$e!'8<L'L$69MM ]]_ _!$-?.?-?"?@CCDDXDXD_D_`E $ 4 4 ; ;E B N N X XYZ\] ^I_ " % 3 3 7 7!'9
Rd !8 !I "HH//33Aq+/1a#C*\n 4 	 $8!#;i#G  A: !&qcK.?1#HXBX.X`e`j`j!klprsls!t!((3&--n=9>< !&		*; C%*YY/E1%M" %

J<+HJ\didndn o &+ZZj!i-5PX]XbXb%c"Z[[*0033`ak`l m##4#:#:1#=">iI  -2226#i/,,%' - 
 **l3)))

 &7>M>!"$($:$:+++	 %; %
! 	*C_uvw-.446q!"u=F37F6//037F6//0#--a3;;J<UWYZeegllnDN(,-@(A!SbS&(ID$%-12J-KAsPRsF-SD)*>BB]_ _s   =A#V##V-	rf   torch.Tensoraudio_prompt_lenreturnc                 Z    | j                   t        fi |}|d   }|j                  dd      }|j                  dd      }|j                  dd      }|||t        d      |Rt	        j
                  ||j                  t        j                        }|d   j                  |j                  d         }	n|dddddf   |k(  j                  d	
      }	|j                  d   |dddddf   |k(  j                  d	
      z
  dz
  }
|j                  \  }}}| j                  ||||d      }| j                  |d	d	|      j                  dd      }g }t	        j                         5  t        |	j                  d         D ]  }||dd|	|   |
|   f   d   }|j!                  | j"                  j                        }| j"                  j%                  |      j&                  j)                         j+                         }|j-                  |        	 ddd       |S # 1 sw Y   |S xY w)a  
        Decodes a batch of audio codebook sequences into their respective audio waveforms via the
        `audio_tokenizer`. See [`~DacModel.decode`] for more information.

        Args:
            decoder_input_ids (`torch.Tensor`): The complete output sequence of the decoder.
            audio_prompt_len (`int`): The audio prefix length (e.g. when using voice cloning).
        r$   r   Nr   r   zTo enable decoding for Dia, we need the `bos_token_id`, `pad_token_id`, and `delay_pattern`. You may have accidentally overwritten one of those.)r   r\   r   rQ   rS   rU   Tr^   rd   rV   )N.)r   )rl   r#   rq   rk   r}   r   r   r   expandrr   r|   r   r   r   r~   ranger   r;   decodeaudio_valuescpusqueezer   )r>   rf   r   rE   r   r$   r   r   r   start_of_generation_idxend_of_generation_idxr_   r`   ra   re   output_sequencesaudiosioutput_iaudio_is                       r    batch_decodezDiaProcessor.batch_decode
  sx    +**

 %^4$(($?)--ndC)--ndC%);)C}G\[  '$||,<EVE]E]ejeoeop&6t&<&C&CDUD[D[\]D^&_#'8Aq'AEW'W&\&\ac&\&d# ##A&*;Aq!G*DHZ*Z)_)_df)_)ggjkk 	
 &7%<%<"Wl,,%' - 
  11# + 2 
 )Aq/ 	 ]]_ 	'288;< '+Aq2I!2LOdefOg2g,ghirs#;;t';';'B'BC..55(5KXX\\^ffhg&	'	' 	' s   *B+H  H*c                     |j                   d   dk7  rt        d|j                   d    d       | j                  ||fi |d   S )z
        Decodes a single sequence of audio codebooks into the respective audio waveform via the
        `audio_tokenizer`. See [`~DacModel.decode`] and [`~DiaProcessor.batch_decode`] for more information.
        r   rU   z5Expecting a single output to be decoded but received z samples instead.)rr   rk   r   )r>   rf   r   rE   s       r    r   zDiaProcessor.decodeQ  sc     ""1%*GHYH_H_`aHbGcctu  !t  !24DOOPQRRr   rg   c                      | j                   t        fi |}|d   }|j                  dd      }|t        d      |j                  d   t        |      z
  S )z0Utility function to get the audio prompt length.r$   r   NzTo enable the utility of retrieving the prompt length for Dia, we need the `delay_pattern`. You may have accidentally overwritten this.rU   )rl   r#   rq   rk   rr   rt   )r>   rg   rE   r   r$   r   s         r    get_audio_prompt_lenz!DiaProcessor.get_audio_prompt_lenb  su     +**

 %^4$(($? O  &++A.]1CCCr   saving_pathc                 F   t               st        d      t        |      }t        |t        t
        f      r|g}n3t        |t        t        f      rt        d |D              st        d      t        |      t        |      k7  rt        d       | j                  t        fi |}|d   }|d   }t        ||      D ]b  \  }}t        |t        j                        r,|j!                         j#                         j%                         }t'        j(                  |||       d y )Nz/Please install `soundfile` to save audio files.c              3   H   K   | ]  }t        |t        t        f        y wrH   )rI   rJ   r   )rK   ps     r    rM   z*DiaProcessor.save_audio.<locals>.<genexpr>  s     @q`aAPSUY{A[@qs    "zAInvalid input path. Please provide a string, or a list of stringsz5The number of audio and saving paths must be the samer$   r1   )r   ImportErrorr   rI   rJ   r   r   rn   ro   rk   rs   rl   r#   ry   r}   Tensorr   floatnumpysfwrite)	r>   rC   r   rE   r   r$   r1   audio_valuer   s	            r    
save_audiozDiaProcessor.save_audiow  s    &'OPP #5) kC;/&-K[4-8S@qep@q=q`aau:[))TUU***

 %^4$_5!%5 	4NK+u||4)oo/557==?HHQ]3	4r   r_   r`   ra   r   rb   )r   r   c                    t        j                  |t         j                        }t        j                  |t         j                        dddf   j	                  | |      d   }|s||ddddf   z
  }n||ddddf   z   }t        j
                  |d|dz
        }t        j                  | t         j                        ddddf   j	                  | ||      }t        j                  |t         j                        ddddf   j	                  | ||      }	t        j                  |j                  d      |j                  d      |	j                  d      gd      j                         }
||
fS )a  
        Precompute (sequence_idx, all_idx) so that out[seq, channel] = in[seq - delay[channel], channel]
        or in[seq, channel] = out[seq + delay[channel], channel] if `revert`.
        Negative sequence_idx => BOS; sequence_idx >= seq_len => PAD.
        r[   N).Nr   rU   rQ   rS   )	r}   r   int32aranger   clampstackr   r   )r_   r`   ra   r   rb   delay_arraysequence_idxvalid_sequence_idx	batch_idxchannel_idxall_idxs              r    r   zDiaProcessor.build_indices  sI    ll=D ||G5;;?aHOOPSU\]^gh'+dD!m*DDL'+dD!m*DDL"[[q'A+FLLEKK8D$GNNsT[]ij	ll<u{{CD$PQMRYYZ]_fhtu++r"$6$>$>r$BKDWDWXZD[\
 $& 	
 W$$r   r   r   re   c           	      r   | j                   }|\  }}|j                  |      }|j                  |      }t        j                  |d      \  }}}	| |||	f   j	                  | j                               }
|dk  }|| j                  d   k\  }t        j                  ||t        j                  |||
            }|S )a  
        Applies or reverts the delay pattern to batched audio tokens using precomputed indices,
        inserting BOS where sequence_idx < 0 and PAD where sequence_idx >= seq_len.

        Args:
            audio: audio tokens of shape [bsz, seq_len, num_channels]
            pad_token_id: the PAD token
            bos_token_id: the BOS token
            precomputed_idx: from `build_indices`

        Returns:
            final_audio: delayed or reverted audio tokens of shape [bsz, seq_len, num_channels]
        rQ   rS   r   rU   )r   r   r}   unbindviewr]   rr   where)rC   r   r   re   r   r   r   r   r   r   gathered_audiomask_bosmask_padfinal_audios                 r    r   zDiaProcessor.apply_audio_delay  s    *  /g#v.**V$ 6;\\'r5R2	%{y*<kIJOOPUPZPZP\]  !#5;;q>1kk(L%++hP\^l:mnr   )NFrH   )F)r   r   r   audio_tokenizer_classr=   r   rJ   r   r   r   r   r#   r   r   r   r   r   r   r   staticmethodrn   r   r   __classcell__)rA   s   @r    r8   r8   R   s   &X  $(%*	jCDIojC D jC d{	jC
 +,jC jC^ (,E)E *E +,	E
 
n	ET (,S)S *S +,	S
 
S"D .D +,D 
	D* 4 4 4Z$sTz"22 4 +,	 4D   % % %  % Cy	 %
  % 
-	. %  %D """ " =>	"
 
" "r   r8   )r   ru   pathlibr   audio_utilsr   r   feature_extraction_utilsr   processing_utilsr   r	   r
   r   utilsr   r   r   r}   	soundfiler   r   r#   r8   __all__r   r   r    <module>r      s|       9 4 U U O O [ 8) , L> L L^ 
r   