feat(pt): use num_epoch to set num_steps

OutisLi · OutisLi · commit 94149a947e3e · 2026-01-11T23:55:26.000+08:00
diff --git a/deepmd/pt/train/training.py b/deepmd/pt/train/training.py
@@ -139,9 +139,11 @@ def __init__(
             else 1
         )
         self.num_model = len(self.model_keys)
+        self.model_prob = None
 
         # Iteration config
-        self.num_steps = training_params["numb_steps"]
+        self.num_steps = training_params.get("numb_steps")
+        self.num_epoch = training_params.get("num_epoch")
         self.disp_file = training_params.get("disp_file", "lcurve.out")
         self.disp_freq = training_params.get("disp_freq", 1000)
         self.disp_avg = training_params.get("disp_avg", False)
@@ -247,6 +249,107 @@ def get_dataloader_and_iter(
                 valid_numb_batch,
             )
 
+        def compute_total_numb_batch(
+            numb_batches: Iterable[int],
+            sampler_weights: np.ndarray,
+        ) -> int:
+            """Estimate the number of sampled batches that make up one epoch.
+
+            A system drawn with probability ``p_i`` per step is expected to
+            need ``n_i / p_i`` steps to yield its ``n_i`` batches once; the
+            epoch length is the largest such value over all systems that can
+            be drawn at all.
+
+            Parameters
+            ----------
+            numb_batches
+                Number of batches in each system.
+            sampler_weights
+                Non-negative, not necessarily normalized sampling weight of
+                each system.
+
+            Returns
+            -------
+            int
+                ``ceil(max_i(n_i / p_i))`` over the systems with ``p_i > 0``.
+
+            Raises
+            ------
+            ValueError
+                If the weights are not a non-empty 1D array of non-negative
+                values with a positive sum, or if ``numb_batches`` does not
+                have the same shape as the weights.
+            """
+            weights = np.asarray(sampler_weights, dtype=np.float64)
+            if weights.ndim != 1:
+                raise ValueError("Sampler weights must be 1D.")
+            if weights.size == 0:
+                raise ValueError("Sampler weights are empty.")
+            # A negative weight can hide behind a positive sum and would skew
+            # every normalized probability, so reject it explicitly.
+            if np.any(weights < 0.0):
+                raise ValueError("Sampler weights must be non-negative.")
+            weight_sum = float(np.sum(weights))
+            if weight_sum <= 0.0:
+                raise ValueError("Sampler weights must sum to a positive value.")
+            probs = weights / weight_sum
+            nbatches = np.asarray(numb_batches, dtype=np.float64)
+            # Compare full shapes so that scalar or multi-dimensional input is
+            # reported here rather than failing in the masking below.
+            if nbatches.shape != probs.shape:
+                raise ValueError("Number of batches and sampler weights must match.")
+            valid = probs > 0.0
+            if not np.any(valid):
+                raise ValueError(
+                    "Sampler probabilities must contain at least one positive entry."
+                )
+            return int(np.ceil(np.max(nbatches[valid] / probs[valid])))
+
+        def resolve_model_prob(
+            model_keys: list[str],
+            model_prob_config: dict[str, Any] | None,
+            model_training_data: dict[str, DpLoaderSet],
+        ) -> np.ndarray:
+            """Return the normalized sampling probability of each task.
+
+            Parameters
+            ----------
+            model_keys
+                Task names; the result follows this order.
+            model_prob_config
+                Optional mapping from task name to unnormalized probability.
+                Tasks missing from the mapping get probability 0. When it is
+                None, each task is weighted by ``len()`` of its training data.
+            model_training_data
+                Training data of each task, keyed by task name.
+
+            Returns
+            -------
+            np.ndarray
+                Probabilities aligned with ``model_keys`` that sum to 1.
+
+            Raises
+            ------
+            ValueError
+                If any probability is negative or their sum is not positive.
+            """
+            model_prob = np.zeros(len(model_keys), dtype=np.float64)
+            if model_prob_config is not None:
+                for ii, model_key in enumerate(model_keys):
+                    if model_key in model_prob_config:
+                        model_prob[ii] = float(model_prob_config[model_key])
+            else:
+                for ii, model_key in enumerate(model_keys):
+                    model_prob[ii] = float(len(model_training_data[model_key]))
+            # Negative entries can hide behind a positive sum and would
+            # survive normalization as invalid (negative) probabilities.
+            if np.any(model_prob < 0.0):
+                raise ValueError("Model prob must be non-negative!")
+            sum_prob = float(np.sum(model_prob))
+            if sum_prob <= 0.0:
+                raise ValueError("Sum of model prob must be larger than 0!")
+            return model_prob / sum_prob
+
         def single_model_stat(
             _model: Any,
             _data_stat_nbatch: int,
@@ -430,6 +473,60 @@ def get_lr(lr_params: dict[str, Any]) -> BaseLR:
                         ),
                     )
 
+        # Resolve training steps
+        # An explicit numb_steps takes precedence, so the samplers are only
+        # inspected when num_epoch actually has to be converted into steps.
+        if self.num_steps is None:
+            if self.num_epoch is None:
+                raise ValueError(
+                    "Either training.numb_steps or training.num_epoch must be set."
+                )
+            if self.num_epoch <= 0:
+                raise ValueError("training.num_epoch must be positive.")
+            if not self.multi_task:
+                sampler_weights = to_numpy_array(
+                    self.training_dataloader.sampler.weights
+                )
+                total_numb_batch = compute_total_numb_batch(
+                    training_data.index,
+                    sampler_weights,
+                )
+            else:
+                per_task_total = [
+                    compute_total_numb_batch(
+                        training_data[model_key].index,
+                        to_numpy_array(
+                            self.training_dataloader[model_key].sampler.weights
+                        ),
+                    )
+                    for model_key in self.model_keys
+                ]
+                self.model_prob = resolve_model_prob(
+                    self.model_keys,
+                    training_params.get("model_prob"),
+                    training_data,
+                )
+                # One epoch of the task mixture is the model_prob-weighted sum
+                # of the per-task epoch lengths.
+                total_numb_batch = int(
+                    np.ceil(np.sum(np.asarray(per_task_total) * self.model_prob))
+                )
+            if total_numb_batch <= 0:
+                raise ValueError("Total number of training batches must be positive.")
+            self.num_steps = int(np.ceil(self.num_epoch * total_numb_batch))
+            log.info(
+                "Computed num_steps=%d from num_epoch=%s and total_numb_batch=%d.",
+                self.num_steps,
+                self.num_epoch,
+                total_numb_batch,
+            )
+        elif self.num_epoch is not None:
+            log.warning(
+                "Both training.numb_steps and training.num_epoch are set; "
+                "using numb_steps=%d.",
+                self.num_steps,
+            )
+
         # Learning rate
         self.warmup_steps = training_params.get("warmup_steps", 0)
         self.gradient_max_norm = training_params.get("gradient_max_norm", 0.0)
@@ -637,19 +730,12 @@ def single_model_finetune(
                 )
 
         # Get model prob for multi-task
-        if self.multi_task:
-            self.model_prob = np.array([0.0 for key in self.model_keys])
-            if training_params.get("model_prob", None) is not None:
-                model_prob = training_params["model_prob"]
-                for ii, model_key in enumerate(self.model_keys):
-                    if model_key in model_prob:
-                        self.model_prob[ii] += float(model_prob[model_key])
-            else:
-                for ii, model_key in enumerate(self.model_keys):
-                    self.model_prob[ii] += float(len(self.training_data[model_key]))
-            sum_prob = np.sum(self.model_prob)
-            assert sum_prob > 0.0, "Sum of model prob must be larger than 0!"
-            self.model_prob = self.model_prob / sum_prob
+        if self.multi_task and self.model_prob is None:
+            self.model_prob = resolve_model_prob(
+                self.model_keys,
+                training_params.get("model_prob"),
+                training_data,
+            )
 
         # Multi-task share params
         if shared_links is not None:
diff --git a/deepmd/utils/argcheck.py b/deepmd/utils/argcheck.py
@@ -3213,7 +3213,16 @@ def mixed_precision_args() -> list[Argument]:  # ! added by Denghui.
 def training_args(
     multi_task: bool = False,
 ) -> list[Argument]:  # ! modified by Ziyao: data configuration isolated.
-    doc_numb_steps = "Number of training batch. Each training uses one batch of data."
+    doc_numb_steps = "Number of training batches. Each training uses one batch of data. If set, this value takes precedence over num_epoch."
+    doc_num_epoch = (
+        "Number of training epochs. "
+        "When numb_steps is not set, the total steps are computed as "
+        "ceil(num_epoch * total_numb_batch). For each training dataset, "
+        "total_numb_batch is computed as ceil(max_i(n_bch_i / p_i)), where p_i "
+        "is the sampling probability of system i after sys_probs/auto_prob. "
+        "In multi-task mode, total_numb_batch is the model_prob-weighted sum "
+        "over tasks."
+    )
     doc_seed = "The random seed for getting frames from the training data set."
     doc_disp_file = "The file for printing learning curve."
     doc_disp_freq = "The frequency of printing learning curve."
@@ -3286,7 +3295,13 @@ def training_args(
     args += [
         mixed_precision_data,
         Argument(
-            "numb_steps", int, optional=False, doc=doc_numb_steps, alias=["stop_batch"]
+            "numb_steps", int, optional=True, doc=doc_numb_steps, alias=["stop_batch"]
+        ),
+        Argument(
+            "num_epoch",
+            [int, float],
+            optional=True,
+            doc=doc_only_pt_supported + doc_num_epoch,
         ),
         Argument("seed", [int, None], optional=True, doc=doc_seed),
         Argument(
diff --git a/source/tests/pt/test_sampler.py b/source/tests/pt/test_sampler.py