From 31e43125a296160e4cb940d9be1426c776e5bb88 Mon Sep 17 00:00:00 2001 From: OutisLi Date: Mon, 28 Jul 2025 17:30:08 +0800 Subject: [PATCH 1/6] display accumulated loss for training rather than single batch loss --- deepmd/pt/train/training.py | 96 ++++++++++++++++++++++++++----------- 1 file changed, 69 insertions(+), 27 deletions(-) diff --git a/deepmd/pt/train/training.py b/deepmd/pt/train/training.py index 193dcd8cb9..d6510db86c 100644 --- a/deepmd/pt/train/training.py +++ b/deepmd/pt/train/training.py @@ -150,6 +150,12 @@ def __init__( ) self.lcurve_should_print_header = True + # Loss accumulation for averaging over display interval + self.loss_accumulator = {} + self.step_count_in_interval = 0 + self.last_display_step = 0 + self.step_count_per_task = {} + def get_opt_param(params): opt_type = params.get("opt_type", "Adam") opt_param = { @@ -808,22 +814,48 @@ def fake_model(): else: raise ValueError(f"Not supported optimizer type '{self.opt_type}'") + # Accumulate loss for averaging over display interval + self.step_count_in_interval += 1 + if not self.multi_task: + # Accumulate loss for single task + if not self.loss_accumulator: + # Initialize accumulator with current loss structure + for item in more_loss: + if "l2_" not in item: + self.loss_accumulator[item] = 0.0 + for item in more_loss: + if "l2_" not in item: + self.loss_accumulator[item] += more_loss[item] + else: + # Accumulate loss for multi-task + if task_key not in self.loss_accumulator: + self.loss_accumulator[task_key] = {} + if task_key not in self.step_count_per_task: + self.step_count_per_task[task_key] = 0 + self.step_count_per_task[task_key] += 1 + + for item in more_loss: + if "l2_" not in item: + if item not in self.loss_accumulator[task_key]: + self.loss_accumulator[task_key][item] = 0.0 + self.loss_accumulator[task_key][item] += more_loss[item] + # Log and persist display_step_id = _step_id + 1 - if self.display_in_training and ( - display_step_id % self.disp_freq == 0 or display_step_id 
== 1 - ): + if self.display_in_training and (display_step_id % self.disp_freq == 0 or display_step_id == 1): self.wrapper.eval() # Will set to train mode before fininshing validation def log_loss_train(_loss, _more_loss, _task_key="Default"): results = {} - rmse_val = { - item: _more_loss[item] - for item in _more_loss - if "l2_" not in item - } - for item in sorted(rmse_val.keys()): - results[item] = rmse_val[item] + if not self.multi_task: + # Use accumulated average loss for single task + for item in self.loss_accumulator: + results[item] = self.loss_accumulator[item] / self.step_count_in_interval + else: + # Use accumulated average loss for multi-task + if _task_key in self.loss_accumulator and _task_key in self.step_count_per_task: + for item in self.loss_accumulator[_task_key]: + results[item] = self.loss_accumulator[_task_key][item] / self.step_count_per_task[_task_key] return results def log_loss_valid(_task_key="Default"): @@ -882,24 +914,10 @@ def log_loss_valid(_task_key="Default"): else: train_results = {_key: {} for _key in self.model_keys} valid_results = {_key: {} for _key in self.model_keys} - train_results[task_key] = log_loss_train( - loss, more_loss, _task_key=task_key - ) + + # For multi-task, use accumulated average loss for all tasks for _key in self.model_keys: - if _key != task_key: - self.optimizer.zero_grad() - input_dict, label_dict, _ = self.get_data( - is_train=True, task_key=_key - ) - _, loss, more_loss = self.wrapper( - **input_dict, - cur_lr=pref_lr, - label=label_dict, - task_key=_key, - ) - train_results[_key] = log_loss_train( - loss, more_loss, _task_key=_key - ) + train_results[_key] = log_loss_train(loss, more_loss, _task_key=_key) valid_results[_key] = log_loss_valid(_task_key=_key) if self.rank == 0: log.info( @@ -921,6 +939,20 @@ def log_loss_valid(_task_key="Default"): ) self.wrapper.train() + # Reset loss accumulators after display + if not self.multi_task: + for item in self.loss_accumulator: + 
self.loss_accumulator[item] = 0.0 + else: + for task_key in self.model_keys: + if task_key in self.loss_accumulator: + for item in self.loss_accumulator[task_key]: + self.loss_accumulator[task_key][item] = 0.0 + if task_key in self.step_count_per_task: + self.step_count_per_task[task_key] = 0 + self.step_count_in_interval = 0 + self.last_display_step = display_step_id + current_time = time.time() train_time = current_time - self.t0 self.t0 = current_time @@ -993,6 +1025,16 @@ def log_loss_valid(_task_key="Default"): self.t0 = time.time() self.total_train_time = 0.0 self.timed_steps = 0 + + # Initialize loss accumulators + if not self.multi_task: + self.loss_accumulator = {} + else: + self.loss_accumulator = {key: {} for key in self.model_keys} + self.step_count_per_task = dict.fromkeys(self.model_keys, 0) + self.step_count_in_interval = 0 + self.last_display_step = 0 + for step_id in range(self.start_step, self.num_steps): step(step_id) if JIT: From 27d22ea8d930da1bcdce01876b3da89ff904f8ff Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 28 Jul 2025 10:06:00 +0000 Subject: [PATCH 2/6] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- deepmd/pt/train/training.py | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/deepmd/pt/train/training.py b/deepmd/pt/train/training.py index d6510db86c..60fec00f1f 100644 --- a/deepmd/pt/train/training.py +++ b/deepmd/pt/train/training.py @@ -842,7 +842,9 @@ def fake_model(): # Log and persist display_step_id = _step_id + 1 - if self.display_in_training and (display_step_id % self.disp_freq == 0 or display_step_id == 1): + if self.display_in_training and ( + display_step_id % self.disp_freq == 0 or display_step_id == 1 + ): self.wrapper.eval() # Will set to train mode before fininshing validation def log_loss_train(_loss, _more_loss, _task_key="Default"): @@ -850,12 
+852,21 @@ def log_loss_train(_loss, _more_loss, _task_key="Default"): if not self.multi_task: # Use accumulated average loss for single task for item in self.loss_accumulator: - results[item] = self.loss_accumulator[item] / self.step_count_in_interval + results[item] = ( + self.loss_accumulator[item] + / self.step_count_in_interval + ) else: # Use accumulated average loss for multi-task - if _task_key in self.loss_accumulator and _task_key in self.step_count_per_task: + if ( + _task_key in self.loss_accumulator + and _task_key in self.step_count_per_task + ): for item in self.loss_accumulator[_task_key]: - results[item] = self.loss_accumulator[_task_key][item] / self.step_count_per_task[_task_key] + results[item] = ( + self.loss_accumulator[_task_key][item] + / self.step_count_per_task[_task_key] + ) return results def log_loss_valid(_task_key="Default"): @@ -917,7 +928,9 @@ def log_loss_valid(_task_key="Default"): # For multi-task, use accumulated average loss for all tasks for _key in self.model_keys: - train_results[_key] = log_loss_train(loss, more_loss, _task_key=_key) + train_results[_key] = log_loss_train( + loss, more_loss, _task_key=_key + ) valid_results[_key] = log_loss_valid(_task_key=_key) if self.rank == 0: log.info( From 8a9a1614596e325bb0f27c523a3b70926380f267 Mon Sep 17 00:00:00 2001 From: OutisLi Date: Thu, 31 Jul 2025 17:18:32 +0800 Subject: [PATCH 3/6] fix:add a parameter disp_avg in input file to allow user to choose, the default value is false, which displays just like before --- deepmd/pt/train/training.py | 167 ++++++++++++++++++++---------------- deepmd/utils/argcheck.py | 4 + 2 files changed, 97 insertions(+), 74 deletions(-) diff --git a/deepmd/pt/train/training.py b/deepmd/pt/train/training.py index 60fec00f1f..151c7cd885 100644 --- a/deepmd/pt/train/training.py +++ b/deepmd/pt/train/training.py @@ -140,6 +140,7 @@ def __init__( self.num_steps = training_params["numb_steps"] self.disp_file = training_params.get("disp_file", 
"lcurve.out") self.disp_freq = training_params.get("disp_freq", 1000) + self.disp_avg = training_params.get("disp_avg", False) self.save_ckpt = training_params.get("save_ckpt", "model.ckpt") self.save_freq = training_params.get("save_freq", 1000) self.max_ckpt_keep = training_params.get("max_ckpt_keep", 5) @@ -150,12 +151,6 @@ def __init__( ) self.lcurve_should_print_header = True - # Loss accumulation for averaging over display interval - self.loss_accumulator = {} - self.step_count_in_interval = 0 - self.last_display_step = 0 - self.step_count_per_task = {} - def get_opt_param(params): opt_type = params.get("opt_type", "Adam") opt_param = { @@ -814,31 +809,32 @@ def fake_model(): else: raise ValueError(f"Not supported optimizer type '{self.opt_type}'") - # Accumulate loss for averaging over display interval - self.step_count_in_interval += 1 - if not self.multi_task: - # Accumulate loss for single task - if not self.loss_accumulator: - # Initialize accumulator with current loss structure + if self.disp_avg: + # Accumulate loss for averaging over display interval + self.step_count_in_interval += 1 + if not self.multi_task: + # Accumulate loss for single task + if not self.train_loss_accu: + # Initialize accumulator with current loss structure + for item in more_loss: + if "l2_" not in item: + self.train_loss_accu[item] = 0.0 for item in more_loss: if "l2_" not in item: - self.loss_accumulator[item] = 0.0 - for item in more_loss: - if "l2_" not in item: - self.loss_accumulator[item] += more_loss[item] - else: - # Accumulate loss for multi-task - if task_key not in self.loss_accumulator: - self.loss_accumulator[task_key] = {} - if task_key not in self.step_count_per_task: - self.step_count_per_task[task_key] = 0 - self.step_count_per_task[task_key] += 1 + self.train_loss_accu[item] += more_loss[item] + else: + # Accumulate loss for multi-task + if task_key not in self.train_loss_accu: + self.train_loss_accu[task_key] = {} + if task_key not in 
self.step_count_per_task: + self.step_count_per_task[task_key] = 0 + self.step_count_per_task[task_key] += 1 - for item in more_loss: - if "l2_" not in item: - if item not in self.loss_accumulator[task_key]: - self.loss_accumulator[task_key][item] = 0.0 - self.loss_accumulator[task_key][item] += more_loss[item] + for item in more_loss: + if "l2_" not in item: + if item not in self.train_loss_accu[task_key]: + self.train_loss_accu[task_key][item] = 0.0 + self.train_loss_accu[task_key][item] += more_loss[item] # Log and persist display_step_id = _step_id + 1 @@ -847,27 +843,35 @@ def fake_model(): ): self.wrapper.eval() # Will set to train mode before fininshing validation - def log_loss_train(_loss, _more_loss, _task_key="Default"): - results = {} - if not self.multi_task: - # Use accumulated average loss for single task - for item in self.loss_accumulator: - results[item] = ( - self.loss_accumulator[item] - / self.step_count_in_interval - ) - else: - # Use accumulated average loss for multi-task - if ( - _task_key in self.loss_accumulator - and _task_key in self.step_count_per_task - ): - for item in self.loss_accumulator[_task_key]: + if self.disp_avg: + def log_loss_train(_loss, _more_loss, _task_key="Default"): + results = {} + if not self.multi_task: + # Use accumulated average loss for single task + for item in self.train_loss_accu: results[item] = ( - self.loss_accumulator[_task_key][item] - / self.step_count_per_task[_task_key] + self.train_loss_accu[item] + / self.step_count_in_interval ) - return results + else: + # Use accumulated average loss for multi-task + if ( + _task_key in self.train_loss_accu + and _task_key in self.step_count_per_task + ): + for item in self.train_loss_accu[_task_key]: + results[item] = ( + self.train_loss_accu[_task_key][item] + / self.step_count_per_task[_task_key] + ) + return results + else: + def log_loss_train(_loss, _more_loss, _task_key="Default"): + results = {} + rmse_val = {item: _more_loss[item] for item in _more_loss 
if "l2_" not in item} + for item in sorted(rmse_val.keys()): + results[item] = rmse_val[item] + return results def log_loss_valid(_task_key="Default"): single_results = {} @@ -925,12 +929,25 @@ def log_loss_valid(_task_key="Default"): else: train_results = {_key: {} for _key in self.model_keys} valid_results = {_key: {} for _key in self.model_keys} - - # For multi-task, use accumulated average loss for all tasks - for _key in self.model_keys: - train_results[_key] = log_loss_train( - loss, more_loss, _task_key=_key - ) + if self.disp_avg: + # For multi-task, use accumulated average loss for all tasks + for _key in self.model_keys: + train_results[_key] = log_loss_train( + loss, more_loss, _task_key=_key + ) + else: + train_results[task_key] = log_loss_train(loss, more_loss, _task_key=task_key) + for _key in self.model_keys: + if _key != task_key: + self.optimizer.zero_grad() + input_dict, label_dict, _ = self.get_data(is_train=True, task_key=_key) + _, loss, more_loss = self.wrapper( + **input_dict, + cur_lr=pref_lr, + label=label_dict, + task_key=_key, + ) + train_results[_key] = log_loss_train(loss, more_loss, _task_key=_key) valid_results[_key] = log_loss_valid(_task_key=_key) if self.rank == 0: log.info( @@ -952,19 +969,20 @@ def log_loss_valid(_task_key="Default"): ) self.wrapper.train() - # Reset loss accumulators after display - if not self.multi_task: - for item in self.loss_accumulator: - self.loss_accumulator[item] = 0.0 - else: - for task_key in self.model_keys: - if task_key in self.loss_accumulator: - for item in self.loss_accumulator[task_key]: - self.loss_accumulator[task_key][item] = 0.0 - if task_key in self.step_count_per_task: - self.step_count_per_task[task_key] = 0 - self.step_count_in_interval = 0 - self.last_display_step = display_step_id + if self.disp_avg: + # Reset loss accumulators after display + if not self.multi_task: + for item in self.train_loss_accu: + self.train_loss_accu[item] = 0.0 + else: + for task_key in self.model_keys: + if 
task_key in self.train_loss_accu: + for item in self.train_loss_accu[task_key]: + self.train_loss_accu[task_key][item] = 0.0 + if task_key in self.step_count_per_task: + self.step_count_per_task[task_key] = 0 + self.step_count_in_interval = 0 + self.last_display_step = display_step_id current_time = time.time() train_time = current_time - self.t0 @@ -1039,14 +1057,15 @@ def log_loss_valid(_task_key="Default"): self.total_train_time = 0.0 self.timed_steps = 0 - # Initialize loss accumulators - if not self.multi_task: - self.loss_accumulator = {} - else: - self.loss_accumulator = {key: {} for key in self.model_keys} - self.step_count_per_task = dict.fromkeys(self.model_keys, 0) - self.step_count_in_interval = 0 - self.last_display_step = 0 + if self.disp_avg: + # Initialize loss accumulators + if not self.multi_task: + self.train_loss_accu = {} + else: + self.train_loss_accu = {key: {} for key in self.model_keys} + self.step_count_per_task = dict.fromkeys(self.model_keys, 0) + self.step_count_in_interval = 0 + self.last_display_step = 0 for step_id in range(self.start_step, self.num_steps): step(step_id) diff --git a/deepmd/utils/argcheck.py b/deepmd/utils/argcheck.py index fb911550dd..b9a1d1e22f 100644 --- a/deepmd/utils/argcheck.py +++ b/deepmd/utils/argcheck.py @@ -3137,6 +3137,7 @@ def training_args( ) doc_disp_training = "Displaying verbose information during training." doc_time_training = "Timing during training." + doc_disp_avg = "Display the average loss over the display interval for training sets." doc_profiling = "Export the profiling results to the Chrome JSON file for performance analysis, driven by the legacy TensorFlow profiling API or PyTorch Profiler. The output file will be saved to `profiling_file`." doc_profiling_file = "Output file for profiling." doc_enable_profiler = "Export the profiling results to the TensorBoard log for performance analysis, driven by TensorFlow Profiler (available in TensorFlow 2.3) or PyTorch Profiler. 
The log will be saved to `tensorboard_log_dir`." @@ -3213,6 +3214,9 @@ def training_args( Argument( "time_training", bool, optional=True, default=True, doc=doc_time_training ), + Argument( + "disp_avg", bool, optional=True, default=False, doc=doc_disp_avg + ), Argument( "profiling", bool, From 4797c001e3828a6aca7e416c2c3cef34e17f0cfb Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 31 Jul 2025 09:20:16 +0000 Subject: [PATCH 4/6] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- deepmd/pt/train/training.py | 20 ++++++++++++++++---- deepmd/utils/argcheck.py | 8 ++++---- 2 files changed, 20 insertions(+), 8 deletions(-) diff --git a/deepmd/pt/train/training.py b/deepmd/pt/train/training.py index 151c7cd885..de387641c6 100644 --- a/deepmd/pt/train/training.py +++ b/deepmd/pt/train/training.py @@ -844,6 +844,7 @@ def fake_model(): self.wrapper.eval() # Will set to train mode before fininshing validation if self.disp_avg: + def log_loss_train(_loss, _more_loss, _task_key="Default"): results = {} if not self.multi_task: @@ -866,9 +867,14 @@ def log_loss_train(_loss, _more_loss, _task_key="Default"): ) return results else: + def log_loss_train(_loss, _more_loss, _task_key="Default"): results = {} - rmse_val = {item: _more_loss[item] for item in _more_loss if "l2_" not in item} + rmse_val = { + item: _more_loss[item] + for item in _more_loss + if "l2_" not in item + } for item in sorted(rmse_val.keys()): results[item] = rmse_val[item] return results @@ -936,18 +942,24 @@ def log_loss_valid(_task_key="Default"): loss, more_loss, _task_key=_key ) else: - train_results[task_key] = log_loss_train(loss, more_loss, _task_key=task_key) + train_results[task_key] = log_loss_train( + loss, more_loss, _task_key=task_key + ) for _key in self.model_keys: if _key != task_key: self.optimizer.zero_grad() - input_dict, label_dict, _ = self.get_data(is_train=True, 
task_key=_key) + input_dict, label_dict, _ = self.get_data( + is_train=True, task_key=_key + ) _, loss, more_loss = self.wrapper( **input_dict, cur_lr=pref_lr, label=label_dict, task_key=_key, ) - train_results[_key] = log_loss_train(loss, more_loss, _task_key=_key) + train_results[_key] = log_loss_train( + loss, more_loss, _task_key=_key + ) valid_results[_key] = log_loss_valid(_task_key=_key) if self.rank == 0: log.info( diff --git a/deepmd/utils/argcheck.py b/deepmd/utils/argcheck.py index b9a1d1e22f..d6f1b374aa 100644 --- a/deepmd/utils/argcheck.py +++ b/deepmd/utils/argcheck.py @@ -3137,7 +3137,9 @@ def training_args( ) doc_disp_training = "Displaying verbose information during training." doc_time_training = "Timing during training." - doc_disp_avg = "Display the average loss over the display interval for training sets." + doc_disp_avg = ( + "Display the average loss over the display interval for training sets." + ) doc_profiling = "Export the profiling results to the Chrome JSON file for performance analysis, driven by the legacy TensorFlow profiling API or PyTorch Profiler. The output file will be saved to `profiling_file`." doc_profiling_file = "Output file for profiling." doc_enable_profiler = "Export the profiling results to the TensorBoard log for performance analysis, driven by TensorFlow Profiler (available in TensorFlow 2.3) or PyTorch Profiler. The log will be saved to `tensorboard_log_dir`." 
@@ -3214,9 +3216,7 @@ def training_args( Argument( "time_training", bool, optional=True, default=True, doc=doc_time_training ), - Argument( - "disp_avg", bool, optional=True, default=False, doc=doc_disp_avg - ), + Argument("disp_avg", bool, optional=True, default=False, doc=doc_disp_avg), Argument( "profiling", bool, From 45d62daefc5ca407a0635dfbbc580f58b65f9e71 Mon Sep 17 00:00:00 2001 From: LI TIANCHENG <137472077+OutisLi@users.noreply.github.com> Date: Fri, 1 Aug 2025 15:09:15 +0800 Subject: [PATCH 5/6] doc: specify pytorch backend Co-authored-by: Jinzhe Zeng Signed-off-by: LI TIANCHENG <137472077+OutisLi@users.noreply.github.com> --- deepmd/utils/argcheck.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deepmd/utils/argcheck.py b/deepmd/utils/argcheck.py index d6f1b374aa..26d107f7ea 100644 --- a/deepmd/utils/argcheck.py +++ b/deepmd/utils/argcheck.py @@ -3216,7 +3216,7 @@ def training_args( Argument( "time_training", bool, optional=True, default=True, doc=doc_time_training ), - Argument("disp_avg", bool, optional=True, default=False, doc=doc_disp_avg), + Argument("disp_avg", bool, optional=True, default=False, doc=doc_only_pt_supported + doc_disp_avg), Argument( "profiling", bool, From fa79d453c7fc95776e737cea89152bf371f5fd57 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 1 Aug 2025 07:10:51 +0000 Subject: [PATCH 6/6] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- deepmd/utils/argcheck.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/deepmd/utils/argcheck.py b/deepmd/utils/argcheck.py index 26d107f7ea..e4c15ebd21 100644 --- a/deepmd/utils/argcheck.py +++ b/deepmd/utils/argcheck.py @@ -3216,7 +3216,13 @@ def training_args( Argument( "time_training", bool, optional=True, default=True, doc=doc_time_training ), - Argument("disp_avg", bool, optional=True, default=False, 
doc=doc_only_pt_supported + doc_disp_avg), + Argument( + "disp_avg", + bool, + optional=True, + default=False, + doc=doc_only_pt_supported + doc_disp_avg, + ), Argument( "profiling", bool,