
    qiS                         d dl mZ d dlmZmZ d dlZd dlmZ ddl	m
Z
mZmZmZmZ ddlmZ ddlmZ ddlmZmZmZmZ dd	lmZ dd
lmZ ddlmZ ddlmZ  ej@                  e!      Z" G d de      Z#y)    )Callable)AnyOptionalN   )(DiaClassifierFreeGuidanceLogitsProcessor"DiaEOSChannelFilterLogitsProcessor!DiaEOSDelayPatternLogitsProcessorLogitsProcessorListTemperatureLogitsWarper)StoppingCriteriaList)BaseStreamer)GenerateOutputGenerationConfigGenerationMixinGenerationMode)is_deepspeed_zero3_enabled)is_fsdp_managed_module)PreTrainedModel)loggingc                   .    e Zd ZdZ	 	 	 	 	 	 	 	 d$dededz  dej                  dz  deeej                  ge
e   f   dz  dedz  dedz  deeef   dz  d	ej                  dz  d
ej                  dz  def fdZdedz  dedeeef   f fdZ	 	 	 d%dej                  dz  dej                  dz  deeej                  f   dz  deej                  edz  eeej                  f   f   f fdZ	 d&dededeeej                  f   dej                  dej&                  dz  deej                  eeej                  f   f   fdZ	 	 d' fd	Zedej                  dedej                  dz  dej                  fd       Z	 	 	 	 	 	 	 	 	 	 	 d(dej                  dz  dedz  dedz  dedz  deeej                  ge
e   f   dz  dedz  ded   ded    d	ej                  dz  d
ej                  dz  d!edz  fd"Z ej8                         	 	 	 	 	 	 	 	 	 	 	 d(dej                  dz  dedz  dedz  dedz  deeej                  ge
e   f   dz  dedz  ded   ded    d	ej                  dz  d
ej                  dz  d!edz  deej                  z  fd#       Z xZS ))DiaGenerationMixinNgeneration_configinput_ids_seq_lengthencoder_input_idsprefix_allowed_tokens_fnlogits_processordevicemodel_kwargsnegative_prompt_idsnegative_prompt_attention_maskreturnc
                    |j                   }
|j                  }d |_         d |_        t               }||dk7  r|j                  t	        |             |j                  t        t        | j                  j                        | j                  j                  j                               t        | 1  |||d |||||		      }|
.|
dk7  r)t        |
|j                        }|j                  d|       |j                  t!        | j                  j                  | j                  j                  j                  |j"                  |             |
|_         ||_        |S )N      ?)num_channelseos_token_id	r   r   r   r   r   r   r   r   r       )guidance_scaleguidance_top_kr   )delay_patternr%   max_generation_lenr   )r(   temperaturer
   appendr   r   lenconfigr*   decoder_configr%   super_get_logits_processorr   top_kinsertr	   
max_length)selfr   r   r   r   r   r   r   r   r    original_guidance_scaleoriginal_temperaturecustom_processorsmerged_processorscfg_processor	__class__s                  X/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/dia/generation_dia.pyr2   z(DiaGenerationMixin._get_logits_processor,   s_    #4"B"B0<<+/((,% 01+0D0K$$%<=Q%RS  . !:!:;![[77DD	
 "G9/!5/%).% 3+I : 

 #.3Ja3OD6066M $$Q6  -"kk77![[77DD#4#?#?		
 ,C((<%      kwargsc                 h   t        |   |fi |\  }}|j                  9|j                  dk  r*t        j	                  d|j                   d       d|_        |xj
                  t        | j                  j                        z  c_        |j                  d uxr |j                  dk7  | _
        ||fS )Nr#   zAtemperature < 1.0 is not supported for Dia; clamping to 1.0 (got )r'   )r1   _prepare_generation_configr,   loggerwarning_oncer5   maxr/   r*   r(   	_uses_cfg)r6   r   r?   r   r<   s       r=   rB   z-DiaGenerationMixin._prepare_generation_configo   s     +0'*LM^*ibh*i'<((49J9V9VY\9\STeTqTqSrrst -0) 	$$DKK,E,E(FF$ +99EoJ[JjJjnoJo ,..r>   inputsbos_token_idc                    t         |   |||      \  }}}| j                  rXt        j                  |      }t        j
                  ||gd      }|j                  dd       |d   j                  dd      |d<   |||fS )N)rG   rH   r   r   dimattention_mask   r'   )r1   _prepare_model_inputsrF   torch
zeros_likecatgetrepeat)r6   rG   rH   r   
input_nameunconditioned_inputsr<   s         r=   rN   z(DiaGenerationMixin._prepare_model_inputs   s     ,17+H%% ,I ,
(
L >>#(#3#3F#; YY(<=1EF 0$7C1=>N1O1V1VWXZ[1\-.z<//r>   
batch_sizemodel_input_namedecoder_start_token_idc                 L   dx}}|d|v r|j                  d      }|d|v r|j                  d      }||t        j                  d|du d|du d       | j                  j                  j
                  }| j                  r|dz  n|}	|*t        j                  |	d|f|t        j                  |	      }t        j                  |	|j                  d   ft        j                  |
      }|j                         }
|j                  d   |dddddf   | j                  j                  j                  k(  j                  d      j                         z
  }|
ddd|f   j                  dd      j                         }|ddd|f   j                         }||d<   |
|d<   ||fS )zGPrepares `decoder_input_ids` for generation with encoder-decoder modelsNdecoder_input_idsdecoder_attention_maskz[In order to generate with Dia, we need the processed audio input: Got `decoder_input_ids`: z" and got `decoder_attention_mask`=z]. This can be achieved via the [`DiaProcessor`] but now defaulting to non-delayed generation.rM   r'   )dtyper   )sizer\   r   r   rJ   decoder_delay_mask)poprC   rD   r/   r0   r$   rF   rO   fulllongonesshapepad_token_idsumrE   	transpose)r6   rV   rW   r   rX   r   rZ   r[   r$   real_batch_size
delay_maskvalid_input_sizes               r=   )_prepare_decoder_input_ids_for_generationz<DiaGenerationMixin._prepare_decoder_input_ids_for_generation   s    6:92#(;|(K , 0 01D E#(@L(P%1%5%56N%O" $(>(F%T122TUkswUwTx yop  ;;55BBL15jAoZO ($)JJ$a68NV[V`V`io%! &+ZZ%'8'>'>q'AB%**]c&"
 '++-
##A& Aq)T[[-G-G-T-TTYY^`Yaeegh 	 'q*;+;*;';<FFq!LQQS!7;L<L;L8L!M!R!R!T 2H-.-7)* ,..r>   c                    | j                   r|d   j                  d   dz  n|d   j                  d   }|j                  || j                  j                  j
                  d      j                  dd      }t        	|    |fd|i|}| j                  || j                  j                  j                  |      |d<   |j                  dd      r'|d	   d   dkD  r|d   d d dd d f   d d d d d f   |d<   |d   j                         |d<   | j                   rRd
D ]M  }|j                  |d       t        dgdg||   j                  dz
  z  z         } ||   j                  | ||<   O |S )Nr   rM   r^   r'   encoder_outputsrZ   	use_cacheFcache_position)rZ   r[   decoder_position_ids)rF   rd   reshaper/   r0   r$   rg   r1   prepare_inputs_for_generationapply_delay_maskre   rR   
contiguoustuplendimrS   )
r6   	input_idsrm   r_   r?   rV   model_inputskeyrepeat_patternr<   s
            r=   rr   z0DiaGenerationMixin.prepare_inputs_for_generation   s    :>_Q'--a0A5_]^M_MeMefgMh
%%j$++2L2L2Y2Y[]^hhijlmn	 w<YrXgrkqr -1,A,At{{11>>@R-
()
 K/LAQ4RST4UXY4Y0<=P0QRSUWYZRZ0[\]_cef\f0gL,- -99L,M,X,X,Z() >>^ R##C.:%*A3!S8I8N8NQR8R1S+S%TN(@S(9(@(@.(QL%	R r>   rw   pad_idri   c                     || S t        | j                  d   |j                  d         }|d d d |d d f   }| d d d |d d f   }t        j                  ||k(  ||      | d d d |d d f<   | S )Nr'   )minrd   rO   where)rw   r{   ri   mask_len
valid_maskvalid_inputs         r=   rs   z#DiaGenerationMixin.apply_delay_mask   s    yq):+;+;A+>?9H9a0
9H9a0 &+[[v1E{T^%_	!YhY/"r>   stopping_criteriasynced_gpusassistant_modelr   streamerr   custom_generatec                    | j                  |||||      } | j                  |fi |\  }}|j                  |      }|t        j                  t        j
                  fvrt        d      | j                  |j                                | j                  |||       |0t               xs t        |       xr t        j                         dkD  }||n	t               }||n	t               }|j!                  dd       d u}| j#                  ||j$                  |      \  }}}|j&                  d   }|j(                  }| j+                  |||       d|vr| j-                  ||||      }| j/                  ||||j0                  |j(                        \  }}|j2                  r!| j5                  ||j!                  d            }||j7                  |j9                                |j&                  d	   }|j!                  d
      d u xr |j:                  d u}|j!                  d      d u xr |j<                  d u}| j?                  ||||||      }| jA                         r	d|vrd|d<   | jC                  |||       |j:                  dz
  }|j&                  d   |k7  r-|dk(  r(| jD                  jF                  s||j&                  d   z  }| jI                  |||||       | jK                  ||||||j(                  ||	|
	      }| jM                  |||j!                  d            }|jN                  |d<   |jQ                  d	|j&                  d	         }|jR                  dkD  rt        d       | jT                  |f|||d||S )NzGot incompatible mode for generation, should be one of greedy or sampling. Ensure that beam search is de-activated by setting `num_beams=1`.r'   rL   r   )r   rm   )rV   rW   r   rX   r   	tokenizerr^   r5   
min_length)r   has_default_max_lengthhas_default_min_lengthrW   inputs_tensorinput_ids_lengthlogits_to_keepinputs_embedsr&   )r   r   r   rn   z2`num_return_sequences>1` is incompatible with Dia.)r   r   r   )+_extract_generation_mode_kwargsrB   get_generation_moder   SAMPLEGREEDY_SEARCH
ValueError_validate_model_kwargscopy_validate_generation_moder   r   distget_world_sizer
   r   rR   rN   rH   rd   r   _prepare_special_tokens._prepare_encoder_decoder_kwargs_for_generationrk   _decoder_start_token_tensortoken_healingheal_tokensputcpur5   r   _prepare_generated_length_supports_logits_to_keep_validate_generated_lengthr/   is_encoder_decoder_prepare_cache_for_generationr2   _get_stopping_criteriarn   rq   num_return_sequences_sample)r6   rG   r   r   r   r   r   r   r   r   r    r   r?   generation_mode_kwargsr   generation_modekwargs_has_attention_maskr   rW   rV   r   rw   r   r   r   max_cache_lengthprepared_logits_processorprepared_stopping_criterias                               r=   _main_generate_loopz&DiaGenerationMixin._main_generate_loop   s+   " "&!E!E"
 +J$*I*IJ[*f_e*f'<+??P>#8#8.:V:V"WWT 
 	##L$5$5$78&&8IKab 57W;QRV;Wv]a]p]p]ruv]vK/?/K+QdQf1B1N-ThTj %1$4$45Et$LTX$X!8<8R8R%22L9
5' #((+
%%$$%68QZ`$a L0NN|-=?PL
 #'"P"P!-%#4#P#P '' #Q #
	< **((4J4N4N{4[\ILL) %??2.!'L!9T!A!nFWFbFbjnFn!'L!9T!A!nFWFbFbjnFn ::/#9#9-'- ; 
 ((*/?|/S-.L)*''(9;KMcd -77!;"&66 O3KK22 3 3A 66**|_jJZ	

 %)$>$>/!1+%=- ''% 3+I %? 
%
! &*%@%@//,00= &A &
" %6$?$?[! %%b)//"*=>	 11A5QRR t||
68/	

 %
 
 	
r>   c                    |j                  d      }||j                         } | j                  d|||||||||	|
|d|}t        |t        j
                         }|r|j                  }n|}| j                  j                  j                  }|j                  d   |z  }|j                  ||d      j                  dd      }| j                  || j                  j                  j                  |      }|r	||_        |S |}|S )NrZ   )rG   r   r   r   r   r   r   r   r   r    r   r   r^   r'   rM    )rR   cloner   
isinstancerO   Tensor	sequencesr/   r0   r$   rd   rq   rg   rs   re   )r6   rG   r   r   r   r   r   r   r   r   r    r   r?   ri   outputreturn_dict_in_generateoutput_sequencesr$   bszs                      r=   generatezDiaGenerationMixin.generate  s+   " ZZ 34
!#))+J))) 
/-/%=#+ 3+I+
 
 '1&F"F"%//% {{11>>$$Q'<7+33CrJTTUVXYZ  001A4;;C]C]CjCjlvw"/F  &Fr>   )NNNNNNNN)NNN)N)NN)NNNNNNNNNNN) __name__
__module____qualname__rF   r   intrO   
LongTensorr   r   listr
   strdictr   r2   ru   rB   rN   r   rk   rr   staticmethodrs   r   boolr   r   no_gradr   r   __classcell__)r<   s   @r=   r   r   (   sv   I
 ,059TX7;!.237>BA!+A! "DjA! !++d2	A!
 #+C+>S	+I"JT"QA! .4A! d
A! 38nt+A! #\\D0A! ).t(;A! 
A!F/!1D!8/DG/	%	&/* '+,07;	0t#0 llT)0 3,-4	0
 
u||S4Zc5<<.?)@@	A08 '+1/1/ 1/ 3,-	1/
 !&1/ t#1/ 
uc5<<&7!88	91/l 	#J ELL # 5<<Z^K^ chcoco   '+597;9=TX#'7;-137>B&*U
t#U
 ,d2U
 .4	U

 0$6U
 #+C+>S	+I"JT"QU
 D[U
 ""34U
 >*U
 #\\D0U
 ).t(;U
 tU
n U]]_ '+597;9=TX#'7;-137>B&*7t#7 ,d27 .4	7
 0$67 #+C+>S	+I"JT"Q7 D[7 ""347 >*7 #\\D07 ).t(;7 t7 
%**	*7 7r>   r   )$collections.abcr   typingr   r   rO   torch.distributeddistributedr   generation.logits_processr   r   r	   r
   r   generation.stopping_criteriar   generation.streamersr   generation.utilsr   r   r   r   integrations.deepspeedr   integrations.fsdpr   modeling_utilsr   utilsr   
get_loggerr   rC   r   r   r>   r=   <module>r      sZ    %       A 0 a a @ 7 -  
		H	%e er>   