From 93c71400ec11b8fa105f16f241bd4df78120421b Mon Sep 17 00:00:00 2001 From: HydrogenSulfate <490868991@qq.com> Date: Tue, 26 Dec 2023 20:08:38 +0800 Subject: [PATCH] [Fix] Update for better logging information (#717) * update for better logging information * remove limit for auto_collation when num_workers=0 * keep workers alive when finished loading one epoch * disable persistent_workers=True for BUG in paddle framework --- ppsci/data/__init__.py | 9 ++++----- ppsci/solver/printer.py | 24 +++++++++++++----------- ppsci/solver/solver.py | 14 +++++--------- ppsci/solver/train.py | 4 ++-- ppsci/utils/save_load.py | 13 ++++++++++++- ppsci/visualize/plot.py | 8 ++++---- ppsci/visualize/vtu.py | 4 ++-- 7 files changed, 42 insertions(+), 34 deletions(-) diff --git a/ppsci/data/__init__.py b/ppsci/data/__init__.py index 96bab676a..40dc96bee 100644 --- a/ppsci/data/__init__.py +++ b/ppsci/data/__init__.py @@ -129,10 +129,6 @@ def build_dataloader(_dataset, cfg): ) else: if cfg.get("auto_collation", True) is False: - if cfg.get("num_workers", _DEFAULT_NUM_WORKERS) < 1: - raise ValueError( - "num_workers should be greater than 1 when 'auto_collation' is False." - ) # 1. wrap batch_sampler again into BatchSampler for disabling auto collation, # which can speed up the process of batch samples indexing from dataset. See # details at: https://discuss.pytorch.org/t/efficiency-of-dataloader-and-collate-for-large-array-like-datasets/59569/8 @@ -144,7 +140,7 @@ def build_dataloader(_dataset, cfg): ) # 2. disable auto collation by given identity collate_fn which return the first # (also the only) batch data in batch list, or there will be a redundant - # axis at the first dimension returned by dataloader. is step is necessary + # axis at the first dimension returned by dataloader. It is step is necessary # because paddle do not support 'sampler' as instantiation argument of 'io.DataLoader' collate_fn = lambda batch: batch[0] # noqa: E731 logger.info("Auto collation is disabled to speed up batch sampling") @@ -157,6 +153,9 @@ def build_dataloader(_dataset, cfg): num_workers=cfg.get("num_workers", _DEFAULT_NUM_WORKERS), use_shared_memory=cfg.get("use_shared_memory", False), worker_init_fn=init_fn, + # TODO: Do not enable persistent_workers' below for + # 'IndexError: pop from empty list ...' will be raised in certain cases + # persistent_workers=cfg.get("num_workers", _DEFAULT_NUM_WORKERS) > 0, ) if len(dataloader_) == 0: diff --git a/ppsci/solver/printer.py b/ppsci/solver/printer.py index f1598824b..f6631f62e 100644 --- a/ppsci/solver/printer.py +++ b/ppsci/solver/printer.py @@ -66,18 +66,18 @@ def log_train_info( [trainer.train_time_info[key].mean for key in trainer.train_time_info] ) - ips_msg = ( - f"ips: {batch_size / trainer.train_time_info['batch_cost'].avg:.5f} samples/s" - ) + ips_msg = f"ips: {batch_size / trainer.train_time_info['batch_cost'].avg:.2f}" eta_sec = ( (trainer.epochs - epoch_id + 1) * trainer.iters_per_epoch - iter_id ) * trainer.train_time_info["batch_cost"].avg - eta_msg = f"eta: {str(datetime.timedelta(seconds=int(eta_sec))):s}" + eta_msg = f"eta: {str(datetime.timedelta(seconds=int(eta_sec)))}" + epoch_width = len(str(trainer.epochs)) + iters_width = len(str(trainer.iters_per_epoch)) log_str = ( - f"[Train][Epoch {epoch_id}/{trainer.epochs}]" - f"[Iter: {iter_id}/{trainer.iters_per_epoch}] {lr_msg}, " + f"[Train][Epoch {epoch_id:>{epoch_width}}/{trainer.epochs}]" + f"[Iter {iter_id:>{iters_width}}/{trainer.iters_per_epoch}] {lr_msg}, " f"{metric_msg}, {time_msg}, {ips_msg}, {eta_msg}" ) if trainer.benchmark_flag: @@ -122,14 +122,16 @@ def log_eval_info( [trainer.eval_time_info[key].mean for key in trainer.eval_time_info] ) - ips_msg = ( - f"ips: {batch_size / trainer.eval_time_info['batch_cost'].avg:.5f}" f"samples/s" - ) + ips_msg = f"ips: {batch_size / trainer.eval_time_info['batch_cost'].avg:.2f}" eta_sec = (iters_per_epoch - iter_id) * trainer.eval_time_info["batch_cost"].avg - eta_msg = f"eta: {str(datetime.timedelta(seconds=int(eta_sec))):s}" + eta_msg = f"eta: {str(datetime.timedelta(seconds=int(eta_sec)))}" + + epoch_width = len(str(trainer.epochs)) + iters_width = len(str(iters_per_epoch)) logger.info( - f"[Eval][Epoch {epoch_id}][Iter: {iter_id}/{iters_per_epoch}] " + f"[Eval][Epoch {epoch_id:>{epoch_width}}/{trainer.epochs}]" + f"[Iter {iter_id:>{iters_width}}/{iters_per_epoch}] " f"{metric_msg}, {time_msg}, {ips_msg}, {eta_msg}" ) diff --git a/ppsci/solver/solver.py b/ppsci/solver/solver.py index 7d6eebc34..4b8d8643f 100644 --- a/ppsci/solver/solver.py +++ b/ppsci/solver/solver.py @@ -169,7 +169,7 @@ def __init__( self.start_eval_epoch = start_eval_epoch self.eval_freq = eval_freq - # initialize training log recorder for loss, time cost, metric, etc. + # initialize training log(training loss, time cost, etc.) recorder during one epoch self.train_output_info: Dict[str, misc.AverageMeter] = {} self.train_time_info = { "batch_cost": misc.AverageMeter("batch_cost", ".5f", postfix="s"), @@ -177,7 +177,7 @@ def __init__( } self.train_loss_info: Dict[str, misc.AverageMeter] = {} - # initialize evaluation log recorder for loss, time cost, metric, etc. + # initialize evaluation log(evaluation loss, metric, etc.) recorder. self.eval_output_info: Dict[str, misc.AverageMeter] = {} self.eval_time_info = { "batch_cost": misc.AverageMeter("batch_cost", ".5f", postfix="s"), @@ -391,15 +391,10 @@ def convert_expr( def train(self): """Training.""" self.global_step = self.best_metric["epoch"] * self.iters_per_epoch + start_epoch = self.best_metric["epoch"] + 1 - for epoch_id in range(self.best_metric["epoch"] + 1, self.epochs + 1): + for epoch_id in range(start_epoch, self.epochs + 1): self.train_epoch_func(self, epoch_id, self.log_freq) - - # log training summation at end of a epoch - metric_msg = ", ".join( - [self.train_output_info[key].avg_info for key in self.train_output_info] - ) - logger.info(f"[Train][Epoch {epoch_id}/{self.epochs}][Avg] {metric_msg}") self.train_output_info.clear() cur_metric = float("inf") @@ -463,6 +458,7 @@ def train(self): self.output_dir, "latest", self.equation, + print_log=(epoch_id == start_epoch), ) @misc.run_on_eval_mode diff --git a/ppsci/solver/train.py b/ppsci/solver/train.py index 737cc89bc..b15502584 100644 --- a/ppsci/solver/train.py +++ b/ppsci/solver/train.py @@ -130,7 +130,7 @@ def train_epoch_func(solver: "solver.Solver", epoch_id: int, log_freq: int): solver.train_time_info["reader_cost"].update(reader_cost) solver.train_time_info["batch_cost"].update(batch_cost) printer.update_train_loss(solver, loss_dict, total_batch_size) - if iter_id == 1 or iter_id % log_freq == 0: + if solver.global_step % log_freq == 0 or solver.global_step == 1: printer.log_train_info(solver, total_batch_size, epoch_id, iter_id) batch_tic = time.perf_counter() @@ -234,7 +234,7 @@ def closure(): solver.train_time_info["reader_cost"].update(reader_cost) solver.train_time_info["batch_cost"].update(batch_cost) printer.update_train_loss(solver, loss_dict, total_batch_size) - if iter_id == 1 or iter_id % log_freq == 0: + if solver.global_step % log_freq == 0 or solver.global_step == 1: printer.log_train_info(solver, total_batch_size, epoch_id, iter_id) batch_tic = time.perf_counter() diff --git a/ppsci/utils/save_load.py b/ppsci/utils/save_load.py index f349103c5..d1616725a 100644 --- a/ppsci/utils/save_load.py +++ b/ppsci/utils/save_load.py @@ -157,6 +157,7 @@ def save_checkpoint( output_dir: Optional[str] = None, prefix: str = "model", equation: Optional[Dict[str, equation.PDE]] = None, + print_log: bool = True, ): """Save checkpoint, including model params, optimizer params, metric information. @@ -168,6 +169,9 @@ def save_checkpoint( output_dir (Optional[str]): Directory for checkpoint storage. prefix (str, optional): Prefix for storage. Defaults to "model". equation (Optional[Dict[str, equation.PDE]]): Equations. Defaults to None. + print_log (bool, optional): Whether print saving log information, mainly for + keeping log tidy without duplicate 'Finish saving checkpoint ...' log strings. + Defaults to True. """ if paddle.distributed.get_rank() != 0: return @@ -195,4 +199,11 @@ def save_checkpoint( f"{ckpt_path}.pdeqn", ) - logger.message(f"Finish saving checkpoint to {ckpt_path}") + if print_log: + log_str = f"Finish saving checkpoint to: {ckpt_path}" + if prefix == "latest": + log_str += ( + "(latest checkpoint will be saved every epoch as expected, " + "but this log will be printed only once for tidy logging)" + ) + logger.message(log_str) diff --git a/ppsci/visualize/plot.py b/ppsci/visualize/plot.py index 19ebda4bf..ec70145d2 100644 --- a/ppsci/visualize/plot.py +++ b/ppsci/visualize/plot.py @@ -111,10 +111,10 @@ def _save_plot_from_1d_array(filename, coord, value, value_keys, num_timestamps= fig.savefig(f"{filename}_{t}", dpi=300) if num_timestamps == 1: - logger.message(f"1D result is saved to {filename}.png") + logger.message(f"1D result is saved to: {filename}.png") else: logger.message( - f"1D result is saved to {filename}_0.png" + f"1D result is saved to: {filename}_0.png" f" ~ {filename}_{num_timestamps - 1}.png" ) @@ -353,10 +353,10 @@ def _save_plot_from_3d_array( fig.savefig(f"{filename}_{t}", dpi=300) if num_timestamps == 1: - logger.message(f"3D result is saved to {filename}.png") + logger.message(f"3D result is saved to: {filename}.png") else: logger.message( - f"3D result is saved to {filename}_0.png" + f"3D result is saved to: {filename}_0.png" f" ~ {filename}_{num_timestamps - 1}.png" ) diff --git a/ppsci/visualize/vtu.py b/ppsci/visualize/vtu.py index f0d76e987..2cfde0b65 100644 --- a/ppsci/visualize/vtu.py +++ b/ppsci/visualize/vtu.py @@ -91,10 +91,10 @@ def _save_vtu_from_array(filename, coord, value, value_keys, num_timestamps=1): if num_timestamps > 1: logger.message( - f"Visualization results are saved to {filename}_t-0.vtu ~ {filename}_t-{num_timestamps - 1}.vtu" + f"Visualization results are saved to: {filename}_t-0.vtu ~ {filename}_t-{num_timestamps - 1}.vtu" ) else: - logger.message(f"Visualization result is saved to {filename}.vtu") + logger.message(f"Visualization result is saved to: {filename}.vtu") def save_vtu_from_dict(