add num_epoch_dict for multitask training

OutisLi · OutisLi · commit 8cef98438576 · 2026-01-13T13:22:09.000+08:00
diff --git a/deepmd/pd/train/training.py b/deepmd/pd/train/training.py
@@ -134,6 +134,7 @@ def __init__(
         # Iteration config
         self.num_steps = training_params.get("numb_steps")
         self.num_epoch = training_params.get("num_epoch")
+        self.num_epoch_dict = training_params.get("num_epoch_dict")
         self.acc_freq: int = training_params.get(
             "acc_freq", 1
         )  # gradient accumulation steps
@@ -465,24 +466,63 @@ def get_lr(lr_params):
                 np.ceil(np.sum(np.asarray(per_task_total) * self.model_prob))
             )
         if self.num_steps is None:
-            if self.num_epoch is None:
+            # === Step 1. Check num_epoch_dict first (multi-task only) ===
+            if self.multi_task and self.num_epoch_dict:
+                missing = [k for k in self.model_keys if k not in self.num_epoch_dict]
+                if missing:
+                    raise ValueError(
+                        f"training.num_epoch_dict must specify all tasks; missing: {missing}"
+                    )
+                # Validate epoch values
+                for model_key in self.model_keys:
+                    epoch_value = self.num_epoch_dict[model_key]
+                    if epoch_value is not None and epoch_value <= 0:
+                        raise ValueError(
+                            f"training.num_epoch_dict['{model_key}'] must be positive, got {epoch_value}."
+                        )
+                # Compute steps needed for each task to complete its epochs
+                per_task_steps = []
+                for ii, model_key in enumerate(self.model_keys):
+                    epoch_value = self.num_epoch_dict[model_key]
+                    if epoch_value is not None:
+                        # steps_i = epoch_i * per_task_total[i] / model_prob[i]
+                        steps_i = epoch_value * per_task_total[ii] / self.model_prob[ii]
+                        per_task_steps.append(steps_i)
+                self.num_steps = int(np.ceil(np.max(per_task_steps)))
+                log.info(
+                    "Computed num_steps=%d from num_epoch_dict=%s with per-task steps: %s.",
+                    self.num_steps,
+                    self.num_epoch_dict,
+                    {
+                        k: int(np.ceil(v))
+                        for k, v in zip(self.model_keys, per_task_steps)
+                    },
+                )
+            # === Step 2. Fall back to num_epoch ===
+            elif self.num_epoch is None:
                 raise ValueError(
-                    "Either training.numb_steps or training.num_epoch must be set."
+                    "Either training.numb_steps, training.num_epoch, or "
+                    "training.num_epoch_dict (multi-task only) must be set."
                 )
-            if self.num_epoch <= 0:
-                raise ValueError("training.num_epoch must be positive.")
-            if total_numb_batch <= 0:
-                raise ValueError("Total number of training batches must be positive.")
-            self.num_steps = int(np.ceil(self.num_epoch * total_numb_batch))
-            log.info(
-                "Computed num_steps=%d from num_epoch=%s and total_numb_batch=%d.",
-                self.num_steps,
-                self.num_epoch,
-                total_numb_batch,
-            )
-        elif self.num_epoch is not None:
+            else:
+                if self.num_epoch <= 0:
+                    raise ValueError("training.num_epoch must be positive.")
+                if total_numb_batch <= 0:
+                    raise ValueError(
+                        "Total number of training batches must be positive."
+                    )
+                self.num_steps = int(np.ceil(self.num_epoch * total_numb_batch))
+                log.info(
+                    "Computed num_steps=%d from num_epoch=%s and total_numb_batch=%d.",
+                    self.num_steps,
+                    self.num_epoch,
+                    total_numb_batch,
+                )
+        elif self.num_epoch is not None or (
+            self.multi_task and self.num_epoch_dict is not None
+        ):
             log.warning(
-                "Both training.numb_steps and training.num_epoch are set; "
+                "Both training.numb_steps and training.num_epoch (or num_epoch_dict) are set; "
                 "using numb_steps=%d.",
                 self.num_steps,
             )
diff --git a/deepmd/pt/train/training.py b/deepmd/pt/train/training.py
@@ -144,6 +144,7 @@ def __init__(
         # Iteration config
         self.num_steps = training_params.get("numb_steps")
         self.num_epoch = training_params.get("num_epoch")
+        self.num_epoch_dict = training_params.get("num_epoch_dict")
         self.disp_file = training_params.get("disp_file", "lcurve.out")
         self.disp_freq = training_params.get("disp_freq", 1000)
         self.disp_avg = training_params.get("disp_avg", False)
@@ -519,24 +520,63 @@ def get_lr(lr_params: dict[str, Any]) -> BaseLR:
                 np.ceil(np.sum(np.asarray(per_task_total) * self.model_prob))
             )
         if self.num_steps is None:
-            if self.num_epoch is None:
+            # === Step 1. Check num_epoch_dict first (multi-task only) ===
+            if self.multi_task and self.num_epoch_dict:
+                missing = [k for k in self.model_keys if k not in self.num_epoch_dict]
+                if missing:
+                    raise ValueError(
+                        f"training.num_epoch_dict must specify all tasks; missing: {missing}"
+                    )
+                # Validate epoch values
+                for model_key in self.model_keys:
+                    epoch_value = self.num_epoch_dict[model_key]
+                    if epoch_value is not None and epoch_value <= 0:
+                        raise ValueError(
+                            f"training.num_epoch_dict['{model_key}'] must be positive, got {epoch_value}."
+                        )
+                # Compute steps needed for each task to complete its epochs
+                per_task_steps = []
+                for ii, model_key in enumerate(self.model_keys):
+                    epoch_value = self.num_epoch_dict[model_key]
+                    if epoch_value is not None:
+                        # steps_i = epoch_i * per_task_total[i] / model_prob[i]
+                        steps_i = epoch_value * per_task_total[ii] / self.model_prob[ii]
+                        per_task_steps.append(steps_i)
+                self.num_steps = int(np.ceil(np.max(per_task_steps)))
+                log.info(
+                    "Computed num_steps=%d from num_epoch_dict=%s with per-task steps: %s.",
+                    self.num_steps,
+                    self.num_epoch_dict,
+                    {
+                        k: int(np.ceil(v))
+                        for k, v in zip(self.model_keys, per_task_steps)
+                    },
+                )
+            # === Step 2. Fall back to num_epoch ===
+            elif self.num_epoch is None:
                 raise ValueError(
-                    "Either training.numb_steps or training.num_epoch must be set."
+                    "Either training.numb_steps, training.num_epoch, or "
+                    "training.num_epoch_dict (multi-task only) must be set."
                 )
-            if self.num_epoch <= 0:
-                raise ValueError("training.num_epoch must be positive.")
-            if total_numb_batch <= 0:
-                raise ValueError("Total number of training batches must be positive.")
-            self.num_steps = int(np.ceil(self.num_epoch * total_numb_batch))
-            log.info(
-                "Computed num_steps=%d from num_epoch=%s and total_numb_batch=%d.",
-                self.num_steps,
-                self.num_epoch,
-                total_numb_batch,
-            )
-        elif self.num_epoch is not None:
+            else:
+                if self.num_epoch <= 0:
+                    raise ValueError("training.num_epoch must be positive.")
+                if total_numb_batch <= 0:
+                    raise ValueError(
+                        "Total number of training batches must be positive."
+                    )
+                self.num_steps = int(np.ceil(self.num_epoch * total_numb_batch))
+                log.info(
+                    "Computed num_steps=%d from num_epoch=%s and total_numb_batch=%d.",
+                    self.num_steps,
+                    self.num_epoch,
+                    total_numb_batch,
+                )
+        elif self.num_epoch is not None or (
+            self.multi_task and self.num_epoch_dict is not None
+        ):
             log.warning(
-                "Both training.numb_steps and training.num_epoch are set; "
+                "Both training.numb_steps and training.num_epoch (or num_epoch_dict) are set; "
                 "using numb_steps=%d.",
                 self.num_steps,
             )
diff --git a/deepmd/utils/argcheck.py b/deepmd/utils/argcheck.py
@@ -3229,10 +3229,26 @@ def training_args(
         "as above, and the final total_numb_batch is their model_prob-weighted sum. "
         "Note that in multi-task mode, this defines an 'expected epoch' where each "
         "sample is visited once in expectation across all tasks, rather than a "
-        "full epoch for each individual task. For multi-task pretraining scenarios "
-        "where different tasks require different numbers of visits, using numb_steps "
-        "directly is recommended for more explicit control. At least one of numb_steps "
-        "or num_epoch must be set; otherwise a ValueError is raised."
+        "full epoch for each individual task. In multi-task mode, num_epoch_dict "
+        "takes precedence over num_epoch if both are set. For multi-task pretraining "
+        "scenarios where different tasks require different numbers of visits, using "
+        "numb_steps directly is recommended for more explicit control. At least one "
+        "of numb_steps or num_epoch (or num_epoch_dict in multi-task mode) must be "
+        "set; otherwise a ValueError is raised."
+    )
+    doc_num_epoch_dict = (
+        "Number of training epochs for each model branch in multi-task mode "
+        "(can be fractional). This is a dictionary mapping model keys to the "
+        "number of epochs to train that specific model. When set, the total "
+        "training steps are computed as max_i(num_epoch_dict[i] * per_task_total[i] / model_prob[i]), "
+        "ensuring each model completes at least its specified number of epochs. "
+        "The model requiring the most steps will complete approximately its target "
+        "epochs, while other models may complete more epochs. This is particularly "
+        "useful for multi-task fine-tuning scenarios where a data-rich pretrained model "
+        "is jointly trained with a data-scarce downstream task, and only the downstream "
+        "task's epoch count is of interest. In multi-task mode, this parameter takes "
+        "precedence over num_epoch if both are set. All model keys must be specified "
+        "in the dictionary."
     )
     doc_seed = "The random seed for getting frames from the training data set."
     doc_disp_file = "The file for printing learning curve."
@@ -3303,6 +3319,13 @@ def training_args(
         if not multi_task
         else [
             Argument("model_prob", dict, optional=True, default={}, doc=doc_model_prob),
+            Argument(
+                "num_epoch_dict",
+                dict,
+                optional=True,
+                default={},
+                doc=doc_num_epoch_dict,
+            ),
             Argument("data_dict", dict, data_args, repeat=True, doc=doc_data_dict),
         ]
     )
diff --git a/doc/train/multi-task-training.md b/doc/train/multi-task-training.md
@@ -81,6 +81,14 @@ Specifically, there are several parts that need to be modified:
   You can specify any positive real number weight for each task. The higher the weight, the higher the probability of being sampled in each training.
   This setting is optional, and if not set, tasks will be sampled with equal weights.
 
+- (Optional) {ref}`training/num_epoch_dict <training/num_epoch_dict>`: The number of training epochs for each model branch, specified as a dictionary mapping `model_key` to epoch values.
+  This allows different tasks to train for different numbers of epochs, which is particularly useful for multi-task fine-tuning scenarios
+  where a data-rich pretrained model is jointly trained with a data-scarce downstream task.
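+  When set, the total number of training steps is computed as `max_i(num_epoch_dict[i] * per_task_total[i] / model_prob[i])`, rounded up to an integer, where `per_task_total[i]` is the number of batches in one epoch of task `i`,
+  so that each model completes, in expectation, at least its specified number of epochs (tasks are sampled randomly according to `model_prob`, so the actual count for a given run may differ slightly).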
+  The model requiring the most steps will complete approximately its target epochs, while other models may complete more epochs.
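+  In multi-task mode, this parameter takes precedence over `num_epoch` if both are set, but it is ignored (with a warning) when `numb_steps` is also set. Every `model_key` must have an entry in the dictionary; otherwise a `ValueError` is raised.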
+
 An example input for multi-task training two models in water system is shown as following:
 
 ```{literalinclude} ../../examples/water_multi_task/pytorch_example/input_torch.json
diff --git a/source/tests/pt/test_sampler.py b/source/tests/pt/test_sampler.py
@@ -410,6 +410,113 @@ def test_sampling_stability_multi_task(self) -> None:
             )
         )
 
+    def test_num_epoch_dict(self) -> None:
+        """Test num_epoch_dict calculation logic for multi-task training."""
+        # === Step 1. Build Datasets ===
+        model_keys = ["model_1", "model_2"]
+        systems_1 = [
+            str(Path(__file__).parent / "water/data/data_0"),
+            str(Path(__file__).parent / "water/data/data_1"),
+        ]
+        systems_2 = [
+            str(Path(__file__).parent / "water/data/data_1"),
+            str(Path(__file__).parent / "water/data/single"),
+        ]
+        dataset_1 = pt_dataloader.DpLoaderSet(
+            systems_1,
+            self.batch_size,
+            self.type_map,
+            seed=10,
+            shuffle=False,
+        )
+        dataset_2 = pt_dataloader.DpLoaderSet(
+            systems_2,
+            self.batch_size,
+            self.type_map,
+            seed=10,
+            shuffle=False,
+        )
+        sampler_1 = pt_dataloader.get_sampler_from_params(
+            dataset_1, {"sys_probs": [0.7, 0.3], "auto_prob": "prob_sys_size"}
+        )
+        sampler_2 = pt_dataloader.get_sampler_from_params(
+            dataset_2, {"sys_probs": [0.4, 0.6], "auto_prob": "prob_sys_size"}
+        )
+        probs_1 = self._normalize_probs(np.asarray(sampler_1.weights))
+        probs_2 = self._normalize_probs(np.asarray(sampler_2.weights))
+
+        # === Step 2. Compute per-task total_numb_batch ===
+        per_task_total = np.array(
+            [
+                self._compute_total_numb_batch(
+                    np.asarray(dataset_1.index, dtype=np.float64), probs_1
+                ),
+                self._compute_total_numb_batch(
+                    np.asarray(dataset_2.index, dtype=np.float64), probs_2
+                ),
+            ],
+            dtype=np.float64,
+        )
+
+        # === Step 3. Test num_epoch_dict calculation ===
+        model_prob = np.asarray([0.4, 0.6], dtype=np.float64)
+        model_prob = model_prob / np.sum(model_prob)
+        num_epoch_dict = {model_keys[0]: 2.0, model_keys[1]: 5.0}
+
+        # Compute expected steps for each task
+        # steps_i = epoch_i * per_task_total[i] / model_prob[i]
+        per_task_steps = np.array(
+            [
+                num_epoch_dict[model_keys[0]] * per_task_total[0] / model_prob[0],
+                num_epoch_dict[model_keys[1]] * per_task_total[1] / model_prob[1],
+            ],
+            dtype=np.float64,
+        )
+
+        # Total steps should be max of per-task steps
+        expected_num_steps = int(np.ceil(np.max(per_task_steps)))
+
+        # Verify the calculation matches the expected formula
+        self.assertIsInstance(expected_num_steps, int)
+        self.assertGreater(expected_num_steps, 0)
+
+        # Verify that running expected_num_steps would give each task at least
+        # its target epochs (may be more for tasks needing fewer steps)
+        expected_model_0_counts = expected_num_steps * model_prob[0]
+        expected_model_1_counts = expected_num_steps * model_prob[1]
+
+        # Each task should complete at least its target epochs
+        expected_epochs_0 = expected_model_0_counts / per_task_total[0]
+        expected_epochs_1 = expected_model_1_counts / per_task_total[1]
+
+        self.assertGreaterEqual(
+            expected_epochs_0,
+            num_epoch_dict[model_keys[0]],
+            msg="Model 0 should complete at least 2 epochs",
+        )
+        self.assertGreaterEqual(
+            expected_epochs_1,
+            num_epoch_dict[model_keys[1]],
+            msg="Model 1 should complete at least 5 epochs",
+        )
+
+        # The task requiring the most steps should complete approximately its target
+        max_task_idx = int(np.argmax(per_task_steps))
+        if max_task_idx == 0:
+            self.assertAlmostEqual(
+                expected_epochs_0,
+                num_epoch_dict[model_keys[0]],
+                delta=0.1,
+                msg="Model 0 (max steps) should complete approximately 2 epochs",
+            )
+        else:
+            self.assertAlmostEqual(
+                expected_epochs_1,
+                num_epoch_dict[model_keys[1]],
+                delta=0.1,
+                msg="Model 1 (max steps) should complete approximately 5 epochs",
+            )
+
 
 if __name__ == "__main__":
     unittest.main()