From 6ebe5b6f0ee1c3fb94341caacdb363fc68008955 Mon Sep 17 00:00:00 2001
From: DesmonDay <908660116@qq.com>
Date: Thu, 17 Oct 2024 19:39:44 +0800
Subject: [PATCH 1/3] bug fix

---
 paddlenlp/trainer/auto_trainer.py |  1 -
 paddlenlp/trainer/trainer.py      | 11 ++++++-----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/paddlenlp/trainer/auto_trainer.py b/paddlenlp/trainer/auto_trainer.py
index 81dec37b611e..be252791d3a2 100644
--- a/paddlenlp/trainer/auto_trainer.py
+++ b/paddlenlp/trainer/auto_trainer.py
@@ -692,7 +692,6 @@ def _save(
         output_dir: Optional[str] = None,
         state_dict=None,
         merge_tensor_parallel=False,
-        signal_dir: Optional[str] = None,
     ):
         output_dir = output_dir if output_dir is not None else self.args.output_dir
         os.makedirs(output_dir, exist_ok=True)
diff --git a/paddlenlp/trainer/trainer.py b/paddlenlp/trainer/trainer.py
index ddc872ad6173..aa69b9414255 100644
--- a/paddlenlp/trainer/trainer.py
+++ b/paddlenlp/trainer/trainer.py
@@ -2292,7 +2292,6 @@ def save_model(
         self,
         output_dir: Optional[str] = None,
         merge_tensor_parallel: Optional[bool] = False,
-        signal_dir: Optional[str] = None,
     ):
         """
         Will save the model, so you can reload it using `from_pretrained()`.
@@ -2303,7 +2302,9 @@ def save_model(
         if output_dir is None:
             output_dir = self.args.output_dir
 
-        if signal_dir is None:
+        if PREFIX_CHECKPOINT_DIR in output_dir:
+            signal_dir = os.path.join(self.args.output_signal_dir, os.path.split(output_dir)[-1])
+        else:
             signal_dir = self.args.output_signal_dir
 
         if ShardingOption.FULL_SHARD in self.args.sharding:
@@ -2370,11 +2371,11 @@ def _save_checkpoint(self, model, metrics=None):
         signal_dir = os.path.join(run_signal_dir, checkpoint_folder)
 
         if isinstance(self.model, LoRAModel) and (self.model.quantized or self.args.pipeline_parallel_degree > 1):
-            self.save_model(output_dir, False, signal_dir)
+            self.save_model(output_dir)
         elif isinstance(self.model, LoRAModel) or isinstance(self.model, PrefixModelForCausalLM):
-            self.save_model(output_dir, True, signal_dir)
+            self.save_model(output_dir, True)
         else:
-            self.save_model(output_dir, False, signal_dir)
+            self.save_model(output_dir)
 
         # only save model state dict, ignore optimizer and scheduler
         if not self.args.ignore_save_lr_and_optim:

From 2eafad30631b30b74ab58e733c4e066202e5ae99 Mon Sep 17 00:00:00 2001
From: DesmonDay <908660116@qq.com>
Date: Thu, 17 Oct 2024 21:32:35 +0800
Subject: [PATCH 2/3] bug fix

---
 paddlenlp/trainer/trainer.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/paddlenlp/trainer/trainer.py b/paddlenlp/trainer/trainer.py
index aa69b9414255..c7cfa72462b4 100644
--- a/paddlenlp/trainer/trainer.py
+++ b/paddlenlp/trainer/trainer.py
@@ -2311,7 +2311,7 @@ def save_model(
             self.model_wrapped.get_all_parameters(convert2cpu=True)
 
         if self.args.should_save_model_state:
-            self._save(output_dir=output_dir, merge_tensor_parallel=merge_tensor_parallel, signal_dir=signal_dir)
+            self._save(output_dir=output_dir, merge_tensor_parallel=merge_tensor_parallel)
         else:
             if self.args.unified_checkpoint and "async_save" in self.args.unified_checkpoint_config:
                 os.makedirs(signal_dir, exist_ok=True)
@@ -2592,15 +2592,16 @@ def _save(
         output_dir: Optional[str] = None,
         state_dict=None,
         merge_tensor_parallel=False,
-        signal_dir: Optional[str] = None,
     ):
         output_dir = output_dir if output_dir is not None else self.args.output_dir
         os.makedirs(output_dir, exist_ok=True)
         logger.info(f"Saving model checkpoint to {output_dir}")
 
         # signal_dir is used for asynchronous saving situations.
+        signal_dir = self.args.output_signal_dir
         if self.args.unified_checkpoint and "async_save" in self.args.unified_checkpoint_config:
-            signal_dir = signal_dir if signal_dir is not None else self.args.output_signal_dir
+            if PREFIX_CHECKPOINT_DIR in output_dir:
+                signal_dir = os.path.join(signal_dir, os.path.split(output_dir)[-1])
             os.makedirs(signal_dir, exist_ok=True)
             logger.info(f"Saving model checkpoint finish signal to {signal_dir}")
 

From 71285d02995b0201f25b89137f9994e6374033cd Mon Sep 17 00:00:00 2001
From: DesmonDay <908660116@qq.com>
Date: Tue, 29 Oct 2024 11:38:38 +0800
Subject: [PATCH 3/3] mv assert to warning

---
 paddlenlp/trainer/training_args.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/paddlenlp/trainer/training_args.py b/paddlenlp/trainer/training_args.py
index 569d188b2772..030c14cb9acb 100644
--- a/paddlenlp/trainer/training_args.py
+++ b/paddlenlp/trainer/training_args.py
@@ -1403,7 +1403,8 @@ def is_segment_parallel_supported():
                     )
 
                 if "split_param" in sharding_parallel_config:
-                    assert self.sharding == [ShardingOption.SHARD_OP], "Only sharding stage1 support split_param."
+                    if ShardingOption.SHARD_OP not in self.sharding:
+                        logger.warning("Only sharding stage1 support split_param.")
                     assert (
                         self.amp_master_grad
                     ), "If `split_param` in sharding_parallel_config, `amp_master_grad` must be True."
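
Note on patches 1 and 2: save_model() and _save() no longer take a signal_dir argument; the trainer now derives the async-save signal directory from output_dir itself. Below is a minimal standalone sketch of that derivation, not the PaddleNLP implementation: resolve_signal_dir and the example paths are made up for illustration, and it assumes PREFIX_CHECKPOINT_DIR is the "checkpoint" folder-name prefix used by the trainer.

    import os

    # Assumed value, standing in for the trainer's checkpoint folder prefix constant.
    PREFIX_CHECKPOINT_DIR = "checkpoint"

    def resolve_signal_dir(output_dir: str, output_signal_dir: str) -> str:
        """Pick the directory for async-save finish-signal files.

        When saving into a per-step checkpoint folder (e.g. "checkpoint-500"),
        signal files go into a matching subfolder of the signal root;
        otherwise the signal root itself is used.
        """
        if PREFIX_CHECKPOINT_DIR in output_dir:
            return os.path.join(output_signal_dir, os.path.split(output_dir)[-1])
        return output_signal_dir

    # Hypothetical usage:
    print(resolve_signal_dir("output/checkpoint-500", "output_signal"))  # output_signal/checkpoint-500
    print(resolve_signal_dir("output", "output_signal"))                 # output_signal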
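
Note on patch 3: the hard assert tying split_param to sharding stage1 is relaxed to a warning, while the amp_master_grad requirement remains an assert. A rough sketch of the resulting check under assumed names (sharding given as a list of stage strings rather than ShardingOption members, and a standard logging logger standing in for the PaddleNLP logger):

    import logging

    logger = logging.getLogger(__name__)

    def check_split_param(sharding, sharding_parallel_config, amp_master_grad):
        # Warn, rather than assert, when split_param is combined with a sharding
        # setup other than stage1; amp_master_grad is still mandatory.
        if "split_param" in sharding_parallel_config:
            if "stage1" not in sharding:  # stands in for ShardingOption.SHARD_OP
                logger.warning("Only sharding stage1 supports split_param.")
            assert amp_master_grad, (
                "If `split_param` in sharding_parallel_config, `amp_master_grad` must be True."
            )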