
    qi                     V   d Z ddlZddlZddlZddlZddlmZ ddlm	Z	 ddl
mZmZmZ  e       r
ddlZddlmZ  ej                   e      Zd Z e       r e       rdd	lmZ ndd
lmZ  G d de      Z G d de      Zdad Zd Zd Zd Zd Zd Z ddZ!d Z"ddZ#ddZ$ddZ%d Z&y)z
Integration with Deepspeed
    N)partialmethod   )dep_version_check)is_accelerate_availableis_torch_availablelogging)nnc                      t         j                  j                  d      d u} | r!	 t         j                  j                  d      }yy # t         j                  j                  $ r Y yw xY w)N	deepspeedTF)	importlibutil	find_specmetadataPackageNotFoundError)package_exists_s     U/opt/pipecat/venv/lib/python3.12/site-packages/transformers/integrations/deepspeed.pyis_deepspeed_availabler   $   sc    ^^--k:$FN 	""++K8A  !!66 		s   A A&%A&)HfDeepSpeedConfig)objectc                   "     e Zd ZdZ fdZ xZS )r   aJ  
    This object contains a DeepSpeed configuration dictionary and can be quickly queried for things like zero stage.

    A `weakref` of this object is stored in the module's globals to be able to access the config from areas where
    things like the Trainer object is not available (e.g. `from_pretrained` and `_get_resized_embeddings`). Therefore
    it's important that this object remains alive while the program is still running.

    [`Trainer`] uses the `HfTrainerDeepSpeedConfig` subclass instead. That subclass has logic to sync the configuration
    with values of [`TrainingArguments`] by replacing special placeholder values: `"auto"`. Without this special logic
    the DeepSpeed configuration is not modified in any way.

    Args:
        config_file_or_dict (`Union[str, Dict]`): path to DeepSpeed config file or dict.

    c                 f    t        |        t        d       t        d       t        |   |       y )N
accelerater   )set_hf_deepspeed_configr   super__init__selfconfig_file_or_dict	__class__s     r   r   zHfDeepSpeedConfig.__init__J   s)    %,'+&,-    )__name__
__module____qualname____doc__r   __classcell__r    s   @r   r   r   9   s     . .r!   r   c                   X     e Zd ZdZ fdZd Zd Zd
dZ eed      Z	ddZ
d	 Z xZS )HfTrainerDeepSpeedConfigz
    The `HfTrainerDeepSpeedConfig` object is meant to be created during `TrainingArguments` object creation and has the
    same lifespan as the latter.
    c                 @    t         |   |       d | _        g | _        y N)r   r   _dtype
mismatchesr   s     r   r   z!HfTrainerDeepSpeedConfig.__init__X   s    ,-r!   c                 H    | j                   t        d      | j                   S )Nz8trainer_config_process() wasn't called yet to tell dtype)r,   
ValueError)r   s    r   dtypezHfTrainerDeepSpeedConfig.dtype]   s"    ;;WXX{{r!   c                 4    | j                  |      }|y|dk(  S )NFauto)	get_value)r   ds_key_longvals      r   is_autoz HfTrainerDeepSpeedConfig.is_autob   s"    nn[);&= r!   c           
          | j                  |      \  }}|y|j                  |      dk(  r|||<   y|sy|j                  |      }|.||k7  r(| j                  j                  d| d| d| d|        yyy)a  
        A utility method that massages the config file and can optionally verify that the values match.

        1. Replace "auto" values with `TrainingArguments` value.

        2. If it wasn't "auto" and `must_match` is true, then check that DS config matches Trainer
        config values and if mismatched add the entry to `self.mismatched` - will assert during
        `trainer_config_finalize` for one or more mismatches.

        Nr2   z- ds =z vs hf )find_config_nodegetr-   append)r   r4   hf_valhf_key
must_matchconfigds_keyds_vals           r   
fill_matchz#HfTrainerDeepSpeedConfig.fill_matchi   s     ..{;>::f'#F6NF#&F"2OO""U;-qxqQWPX#YZ #3r!   F)r>   c                    |j                   |j                  z  |j                  z  }| j                  d|j                  d|        | j                  d|j                  d       | j                  d|d|        | j                  d|j                  d       | j                  d|j
                  d	       | j                  d
|j                  |j                  gd       | j                  d|j                  d       | j                  d|j                  d       | j                  dd       | j                  d|j
                  d	       |j                  rE| j                  j                  di       | j                  d<   |j                  | j                  d   d<   | j                  d|j                  xs |j                  d       | j                  d|j                   xs |j"                  d       | j%                  d      rt&        j(                  | _        y| j%                  d      rt&        j,                  | _        yt&        j.                  | _        y)z
        Adjust the config with `TrainingArguments` values. This stage is run during `TrainingArguments` object
        creation.
        train_micro_batch_size_per_gpuper_device_train_batch_sizegradient_accumulation_stepstrain_batch_sizeztrain_batch_size (calculated)gradient_clippingmax_grad_normzoptimizer.params.lrlearning_ratezoptimizer.params.betaszadam_beta1+adam_beta2zoptimizer.params.epsadam_epsilonzoptimizer.params.weight_decayweight_decayzscheduler.params.warmup_min_lrr   zscheduler.params.warmup_max_lr
checkpointuse_node_local_storagezfp16.enabledzfp16|fp16_full_evalzbf16.enabledzbf16|bf16_full_evalN)
world_sizerE   rF   rB   rI   rJ   
adam_beta1
adam_beta2rK   rL   	fill_onlysave_on_each_noder?   r:   fp16fp16_full_evalbf16bf16_full_evalis_truetorchbfloat16r,   float16float32)r   argsauto_find_batch_sizerG   s       r   trainer_config_processz/HfTrainerDeepSpeedConfig.trainer_config_process   s     ??T-M-MMPTPpPpp,,,)$$		
 	),,)	

 	+$$		
 	+T-?-?Q-t/A/A?S$__doo.#	

 	.0A0A>R79J9JN[7;8$:L:Lo^ !!(,b(IDKK%BFBXBXDKK%&>? 	)Id6I6ILab)Id6I6ILab <<'..DK\\.)--DK--DKr!   c                    g d}|D cg c]  }| j                  |      s| }}t        |      dkD  rd}t        |d      rt        |j                  d      r|j                  j                  }nt        |j                  d      r t        |j                  j                        }nt        |j                  d      rAt        |j                  j                  d      r!|j                  j                  j                  }n_t        |j                  d      rIt        |j                  j                  d      r)t        |j                  j                  j                        }|t        d| d	      | j                  d
||z         | j                         r6| j                  dt        d|z  |z               | j                  dd|z         | j                  d|d       | j                  d|j                  |      d       t        | j                        dkD  r*dj                  | j                        }t        d| d      yc c}w )z
        This stage is run after we have the model and know num_training_steps.

        Now we can complete the configuration process.
        )$zero_optimization.reduce_bucket_size-zero_optimization.stage3_prefetch_bucket_size4zero_optimization.stage3_param_persistence_thresholdr   Nr?   hidden_sizehidden_sizestext_configzThe model's config file has neither `hidden_size` nor `hidden_sizes` entry, therefore it's not possible to automatically fill out the following `auto` entries in the DeepSpeed config file: zb. You can fix that by replacing `auto` values for these keys with an integer value of your choice.ra   rb   g?rc   
   z scheduler.params.total_num_stepsznum_training_steps (calculated)z!scheduler.params.warmup_num_stepswarmup_steps
z]Please correct the following DeepSpeed config values that mismatch TrainingArguments values:
zF
The easiest method is to set these DeepSpeed config values to 'auto'.)r6   lenhasattrr?   rd   maxre   rf   r/   rR   is_zero3intrB   get_warmup_stepsr-   join)	r   r]   modelnum_training_stepshidden_size_based_keysxhidden_size_auto_keysrd   r-   s	            r   trainer_config_finalizez0HfTrainerDeepSpeedConfig.trainer_config_finalize   s   "

 -C VqdllSTo V V$%)Kuh'5<<7"',,":":KU\\>:"%ell&?&?"@KU\\=9gellF^F^`m>n"',,":":"F"FKU\\=9gellF^F^`n>o"%ell&>&>&K&K"LK" 55J4K LYY  NNA;Q\C\]}}Ck)K78 J$ 	.-	

 	/!!"45	
 t!#4??3J'L(oq  $a !Ws
   II)NTF)r"   r#   r$   r%   r   r0   r6   rB   r   rR   r_   rv   r&   r'   s   @r   r)   r)   R   s7    


![4 jU;I8(tCr!   r)   c                 .    t        j                  |       ay r+   )weakrefref_hf_deepspeed_config_weak_ref)hf_deepspeed_config_objs    r   r   r   	  s    
 %,KK0G$H!r!   c                      d a y r+   )r{    r!   r   unset_hf_deepspeed_configr     s
     %)!r!   c                  T    t         "t               t               j                         S y)NF)r{   rm   r~   r!   r   is_deepspeed_zero3_enabledr     s&    $05R5T5`,.7799r!   c                  L    t         t               t               j                  S y r+   )r{   r?   r~   r!   r   deepspeed_configr     s#    $05R5T5`,.555r!   c                    ddl ddl}ddlm} ddlm | j                         fd |j                         5   |       5   | | j                         ddd       ddd       y# 1 sw Y   xY w# 1 sw Y   yxY w)aA  
    DeepSpeed ZeRO-3 variant of `PreTrainedModel.initialize_weights`. Mirrors the `smart_apply`
    dispatch logic but gathers each module's partitioned parameters before calling
    `_initialize_weights`, so initialization operates on full tensors instead of empty shards.
    Only rank 0 performs the actual init.
    r   Nr   )guard_torch_init_functions)PreTrainedModelc                    | j                         D ]+  }t        |      r ||j                         # ||       - t        | j	                  d            }|rMj
                  j                  |d      5  j                  j                         dk(  r	 ||        d d d        y  ||        y # 1 sw Y   y xY w)NF)recurser   modifier_rank)	children
isinstance_initialize_weightslist
parameterszeroGatheredParameterscommget_rank)model_or_modulefnchildparamsr   _apply_zero3r   is_remote_codes       r   r   z.initialize_weights_zero3.<locals>._apply_zero34  s    $--/ 	(E%1UE$=$=>UB'		( o000?@2262K 8>>**,178 8 /	8 8s   :'B44B=)	r   rY   initializationr   modeling_utilsr   r   no_gradr   )rq   rY   r   r   r   r   r   s      @@@@r   initialize_weights_zero3r   %  sv     ;0))+N0 
 ;') 	; 9 9:	;; ;	; 	;; ;s$   A;
A/A;/A8	4A;;Bc                   ! t               }||j                  di       j                  dd      }|j                  di       }t        |t              r,t	        ||j                  di       j                  dd            }|dkD  rt        d      dd	lm}m}m	!m
} t        |d
d      }	| j                  }
i }| j                         j                         D ]4  \  }}t        j                   |j"                  |j$                  d      ||<   6 |D cg c]  }t        ||      s| }}|D cg c]  }t        ||      s| }}t'        |      dk(  r>i }|j                         D ]  \  }} |||g |
|      \  }}||v s|||<     |	|	|_        |S |D ci c]  }|j*                  D ]  }||  }}}i }i }t-        |j/                         !fd      }|D ]  }|j1                  |      } |||||
|      \  }}||v s(|U||   } ||j*                  |j2                  |j4                        }|j7                  ||      }|j9                  ||||       |||<    |j                         D ]X  \  }}	 |j;                  || | j<                        }|j                         D ]!  \  }}t        |t>              r|d   n|}|||<   # Z |	|	|_        |S c c}w c c}w c c}}w # t@        $ r} tC        d| d|        | d} ~ ww xY w)z
    Apply weight conversions (renaming and merging/splitting operations) to a state dict.
    This is a simplified version that handles the conversion without loading into the model.
    Ntensor_parallelautotp_size   	inferencetp_sizezWeight conversions (e.g., MoE expert fusion) with DeepSpeed Tensor Parallelism are not yet implemented but support is coming soon. Please disable tensor_parallel in your DeepSpeed config or convert your checkpoint to the expected format first.r   )WeightConverterWeightRenamingdot_natural_keyrename_source_key	_metadatameta)r0   devicer   c                      |       S r+   r~   )kr   s    r   <lambda>z9_apply_weight_conversions_to_state_dict.<locals>.<lambda>  s    /!:L r!   )key)source_patternstarget_patterns
operations)rq   r?   z'Failed to apply weight conversion for 'zb'. This likely means the checkpoint format is incompatible with the current model version. Error: )"r   r:   r   dictrl   NotImplementedErrorcore_model_loadingr   r   r   r   getattrbase_model_prefix
state_dictitemsrY   emptyshaper0   rj   r   r   sortedkeyspopr   r   
setdefault
add_tensorconvertr?   r   	ExceptionRuntimeError)"rq   r   weight_mapping	ds_configr   inference_configr   r   r   r   prefixmodel_state_dictr   paramentry	renamings
convertersnew_state_dictoriginal_keytensorrenamed_keyr   	converterr   pattern_to_converterconversion_mappingsorted_keyssource_patternnew_convertermappingrealized_valuetarget_nameer   s"                                    @r   '_apply_weight_conversions_to_state_dictr   H  s    !"I-- 126::=!L$==b9&-'#3#7#78I2#N#R#RS\^_#`aGQ;%d  ih z;5H$$F &&(..0 [
U %EKKu{{SY Z[ %3X5j6WXIX%3ZEz%7Y%ZJZ :!$.$4$4$6 	5 L&.|YFTdeNK...4{+	5
 '/N$ ;EhYiNgNghAyLhAhh
 N*0LMK# 5-&7iQ[]ceu&v#^ **) 1@	 /$-$=$=$-$=$=(33!
 -77]S"";nfU /5{++50 !3 8 8 : W	$__|| - N
 '5&:&:&< 4"U$.ud$;a.3{+4$ #+ K YZ iT  	9+ G 		s7   K K*K<KK2AK!!	L*K<<Lc           	        	
 t        |dd      
|j                         }

|_        d}|t        |dd      }|"t        |      dkD  rt	        | ||      }|| _        g | j                         }t        |j                               t        | dd      }|j                         D ci c]%  \  }}|j                  | d|       | d| n||' }}}ddt        j                  f	
fd		 	| |d
       fS c c}}w )a  
    Loads state dict into a model specifically for Zero3, since DeepSpeed does not support the `transformers`
    tensor parallelism API.

    Nearly identical code to PyTorch's `_load_from_state_dict`

    Args:
        model_to_load: The model to load weights into
        state_dict: The state dict containing the weights
        load_config: Optional LoadStateDictConfig containing weight_mapping and other loading options
    r   Nr   r   r   .Fmodulec                 r   i nj                  |d d i       }||d<   |||dg g f}t               rdd l}t        | j	                  |d d d            }g }|D ]5  }	|	|v s||	   }
d|
_        |j                  |
       j                  |	       7 t        |      dkD  rV|j                  j                  |d      5  t        j                  j                         dk(  r | j                  |  d d d        | j                  j!                         D ]  \  }}|	 ||||z   dz   |        y # 1 sw Y   @xY w)	Nassign_to_params_buffersTr   F)r   r   r   r   )r:   r   r   r   named_parameters_is_hf_initializedr;   discardrj   r   r   rY   distributedr   _load_from_state_dict_modulesr   )r   r   r   r   local_metadatar]   r   r   params_to_gatherr   r   namer   
error_msgsloadr   missing_keyss                r   r   z/_load_state_dict_into_zero3_model.<locals>.load  sa   '/X\\&"+r5R5M12FND"b*M &'  $F$;$;6#2;X]$;$^_!% ,
?,Q/E/3E,$++E2 ((+, #$q( ^^667GWX6Y <((113q8444d;< "??002 	WKD% UJ(;=UV	W	< <s   =1D--D6)r   ) F)r   copyr   rj   r   _weight_conversionsr   setr   r   r:   r	   Module)model_to_loadr   load_configr   meta_model_state_dictprefix_modelr   vr   r   r   r   s           @@@@r   !_load_state_dict_into_zero3_modelr     s>    z;5H"J'
 N .>E !c.&9A&=<]JXfg
,:)J)446,1134L=*=tDL $$&Aq #8";";|nAaS<Q"R"^L>1#	dehi	iJ  WRYY  W  WD 	
UC|##Ws   +*Dc                 &    ddl m}m} |j                  }d}d|v r
 ||      }n:|j	                         rt
        j                  d        j                         }d|d<   d}	d	|v r ||      }	||	fS t        ||      r fd
}
 |||
      }	||	fS )zY
    A convenience wrapper that deals with optimizer and lr scheduler configuration.
    r   )
DummyOptimDummySchedulerN	optimizer)r   zDetected ZeRO Offload and non-DeepSpeed optimizers: This combination should work as long as the custom optimizer has both CPU and GPU implementation (except LAMB)Tzero_allow_untested_optimizer	schedulerc                 f    t        j                         }d |_        |j                  |       }|S )N)rr   r   )r   lr_schedulercreate_scheduler)r   trainer_copyr   rr   trainers      r   _lr_scheduler_callablez5deepspeed_optim_sched.<locals>._lr_scheduler_callable*  s=    #yy1 -1)+<<'9Y  =   $#r!   )lr_scheduler_callable)	accelerate.utilsr   r   r?   
is_offloadloggerinfocreate_optimizerr   )r  hf_deepspeed_configr]   rr   model_parametersr   r   r?   r   r   r  s   `  `       r   deepspeed_optim_schedr    s     < ''F If&67	))+KKV ,,.	26./Lf%i0" l"" i,	$ *)KabLl""r!   c                    ddl m} | j                  }| j                  }| j                  j
                  j                  j                  }|j                  |||       |j                  |j                                |rH|j                         st        d      |j                  d       |j                  d       d\  }}d}	||fS d| _        |j                  j!                  di       j!                  d	d
      }
|
d
kD  r1ddl}|j%                  ||
|j'                         |j                        }t)        t+        d |j-                                     }	t/        | ||||	      \  }}||fS )a  
    Init DeepSpeed, after updating the DeepSpeed configuration with any relevant Trainer's args.

    If `resume_from_checkpoint` was passed then an attempt to resume from a previously saved checkpoint will be made.

    Args:
        trainer: Trainer object
        num_training_steps: per single gpu
        resume_from_checkpoint: path to a checkpoint if to resume from after normal DeepSpeedEngine load
        inference: launch in inference mode (no optimizer and no lr scheduler)
        auto_find_batch_size: whether to ignore the `train_micro_batch_size_per_gpu` argument as it's being
            set automatically by the auto batch size finder

    Returns: optimizer, lr_scheduler

    We may use `deepspeed_init` more than once during the life of Trainer, when we do - it's a temp hack based on:
    https://github.com/deepspeedai/DeepSpeed/issues/1394#issuecomment-937405374 until Deepspeed fixes a bug where it
    can't resume from a checkpoint after it did some stepping https://github.com/deepspeedai/DeepSpeed/issues/1612

    r   )r  zMZeRO inference only makes sense with ZeRO Stage 3 - please adjust your configr   r   )NNNr   r   r   )rq   r   r0   r?   c                     | j                   S r+   )requires_grad)ps    r   r   z deepspeed_init.<locals>.<lambda>r  s
     r!   )deepspeed.utilsr  rq   r]   acceleratorstatedeepspeed_pluginhf_ds_configrv   setLevelget_process_log_levelrm   r/   del_config_sub_treer   r?   r:   r   tp_model_initr0   r   filterr   r  )r  rr   r   	ds_loggerrq   r]   r
  r   r   r  deepspeed_tp_sizer   s               r   deepspeed_initr  :  sq   * 4MME<<D!--33DDQQ //e=OP t1134"++-lmm 	//<//?",	<* l""' !/66::;LbQUUVcefgq ++))//1*11	 , E  '@%BRBRBT UV"7($0BDT#
	< l""r!   c                     dd l }t        |j                  | d            }t        |      dkD  rAt        j	                  d|        | j                  ||dd      \  }}|t        d|       y t        d|       )Nr   z/global_step*zAttempting to resume from T)load_module_strictload_optimizer_statesload_lr_scheduler_statesz-[deepspeed] failed to resume from checkpoint z!Can't find a valid checkpoint at )globr   rj   r  r  load_checkpointr/   )deepspeed_enginecheckpoint_pathr  r"  deepspeed_checkpoint_dirs	load_pathr   s          r   deepspeed_load_checkpointr(  }  s    
  &tyyO3DM1R'S T
$%)00ABC'771"&%)	 8 
	1 L_L]^__  <_<MNOOr!   c                     | j                   j                  }t        |j                  j                        |_        |j                  j                  |_        |j                  j                  ||       y)a  
    Sets values in the deepspeed plugin based on the TrainingArguments.

    Args:
        accelerator (`Accelerator`): The Accelerator object.
        args (`TrainingArguments`): The training arguments to propagate to DeepSpeed config.
        auto_find_batch_size (`bool`, *optional*, defaults to `False`):
            Whether batch size was auto-discovered by trying increasingly smaller sizes.
    N)r  r  r)   r  r?   r   r_   )r  r]   r^   	ds_plugins       r   propagate_args_to_deepspeedr+    sY     !!22I5i6L6L6S6STI!*!7!7!>!>I11$8LMr!   c                 >   d|vrd|v r|d   |d<    |d	i |}|j                   }| j                  d   j                         }|j                  }t        j
                  j                  j                  j                  ||      |d   dk7  j                  d      j                         }	t        j
                  j                  j                  j                  |	|      t        fdt        |      D              }
t              }|
t        |d      z  }|r||fS |S )
aq  
    Computes the loss under sequence parallelism with `sp_backend="deepspeed"` and `sp_size > 1`.

    Performs weighted loss aggregation across SP ranks, accounting for varying numbers of valid tokens per rank
    (e.g., when some ranks receive only padding or prompt tokens that are masked with -100).

    Args:
        accelerator (`Accelerator`): The accelerator instance with `torch_device_mesh` support.
        model (`torch.nn.Module`): The model to compute the loss for.
        inputs (`dict[str, torch.Tensor | Any]`): The input data for the model. Must include `"shift_labels"` key.
        return_outputs (`bool`): Whether to return the model outputs along with the loss.
        pc (`accelerate.parallelism_config.ParallelismConfig`): The parallelism configuration.

    Returns:
        The loss, or a tuple of `(loss, outputs)` if `return_outputs` is `True`.
    labelsshift_labelssp)groupir   c              3   D   K   | ]  }|   d kD  r|   |   z    yw)r   Nr~   ).0rankgood_tokens_per_ranklosses_per_ranks     r   	<genexpr>z,deepspeed_sp_compute_loss.<locals>.<genexpr>  s6      %) 	 4T ::s    r   r~   )losstorch_device_mesh	get_groupsp_sizerY   r   r	   
functional
all_gatherviewsumrangerl   )r  rq   inputsreturn_outputspcoutputsr7  sp_groupsp_world_sizegood_tokens
total_losstotal_good_tokensr4  r5  s               @@r   deepspeed_sp_compute_lossrI    s   , v.F":!.1xofoG<<D,,T2<<>HJJM''**55@@X@VO.)T177;??AK ,,//::EEkYaEb -( J
 01-q11D,D'?6$6r!   r+   rw   )T)'r%   r   importlib.metadatar   importlib.utilry   	functoolsr   dependency_versions_checkr   utilsr   r   r   rY   r	   
get_loggerr"   r  r   accelerate.utils.deepspeedr   DeepSpeedConfigbuiltinsr   r)   r{   r   r   r   r   r   r   r   r  r  r(  r+  rI  r~   r!   r   <module>rS     s        # 9 H H  
		H	%
 !7!9O 3. .2p0 ph !% I) ;FhVN$b3#l@#FP0N",7r!   