
Commit c796862

feat(pd): support gradient accumulation (#4920)
Support gradient accumulation for the Paddle backend.

## Summary by CodeRabbit

- **New Features**
  - Configurable gradient accumulation (acc_freq) that batches optimizer updates, optional gradient clipping, and multi-GPU gradient sync to occur at the configured interval; acc_freq=1 preserves prior behavior.
- **Documentation**
  - Added argument docs and a Paddle backend notice describing acc_freq.
- **Tests**
  - Added tests exercising gradient accumulation and updated test cleanup.
1 parent 996d192 commit c796862

3 files changed

Lines changed: 49 additions & 17 deletions
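With acc_freq = N, every step still runs loss.backward(), but gradient clipping, the multi-GPU allreduce, and the optimizer update fire only on every N-th step, as the training.py diff below shows. A tiny illustration of the update condition (not part of the commit):

# Illustration only: which step ids trigger a parameter update for a given acc_freq.
# Mirrors the `(_step_id + 1) % self.acc_freq == 0` condition added in training.py.
acc_freq = 4
update_steps = [s for s in range(12) if (s + 1) % acc_freq == 0]
print(update_steps)  # [3, 7, 11] -> one optimizer step per 4 training steps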


deepmd/pd/train/training.py

Lines changed: 22 additions & 15 deletions
@@ -133,6 +133,9 @@ def __init__(

         # Iteration config
         self.num_steps = training_params["numb_steps"]
+        self.acc_freq: int = training_params.get(
+            "acc_freq", 1
+        )  # gradient accumulation steps
         self.disp_file = training_params.get("disp_file", "lcurve.out")
         self.disp_freq = training_params.get("disp_freq", 1000)
         self.save_ckpt = training_params.get("save_ckpt", "model.ckpt")
@@ -744,7 +747,6 @@ def step(_step_id, task_key="Default") -> None:
             _lr = self.lr_exp
             cur_lr = _lr.value(_step_id)
             pref_lr = cur_lr
-            self.optimizer.clear_grad(set_to_zero=False)

             with nvprof_context(enable_profiling, "Fetching data"):
                 input_dict, label_dict, log_dict = self.get_data(
@@ -780,22 +782,27 @@ def step(_step_id, task_key="Default") -> None:
                 with nvprof_context(enable_profiling, "Backward pass"):
                     loss.backward()

-                # fuse + allreduce manually before optimization if use DDP + no_sync
-                # details in https://github.com/PaddlePaddle/Paddle/issues/48898#issuecomment-1343838622
-                if self.world_size > 1:
-                    hpu.fused_allreduce_gradients(list(self.wrapper.parameters()), None)
-
-                if self.gradient_max_norm > 0.0:
-                    with nvprof_context(enable_profiling, "Gradient clip"):
-                        paddle.nn.utils.clip_grad_norm_(
-                            self.wrapper.parameters(),
-                            self.gradient_max_norm,
-                            error_if_nonfinite=True,
+                # gradient accumulation
+                if (_step_id + 1) % self.acc_freq == 0:
+                    # fuse + allreduce manually before optimization if use DDP + no_sync
+                    # details in https://github.com/PaddlePaddle/Paddle/issues/48898#issuecomment-1343838622
+                    if self.world_size > 1:
+                        hpu.fused_allreduce_gradients(
+                            list(self.wrapper.parameters()), None
                         )

-                with nvprof_context(enable_profiling, "Adam update"):
-                    self.optimizer.step()
-                self.scheduler.step()
+                    if self.gradient_max_norm > 0.0:
+                        with nvprof_context(enable_profiling, "Gradient clip"):
+                            paddle.nn.utils.clip_grad_norm_(
+                                self.wrapper.parameters(),
+                                self.gradient_max_norm,
+                                error_if_nonfinite=True,
+                            )
+
+                    with nvprof_context(enable_profiling, "Adam update"):
+                        self.optimizer.step()
+                        self.optimizer.clear_grad(set_to_zero=False)
+                    self.scheduler.step()

             else:
                 raise ValueError(f"Not supported optimizer type '{self.opt_type}'")
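For readers unfamiliar with the pattern, the sketch below reproduces the accumulation logic of this diff in a self-contained toy loop. It is a minimal sketch, not the DeepMD trainer: the model, data, and learning rate are placeholders, and the DDP allreduce and nvprof contexts are omitted. Note that the diff does not rescale the accumulated gradients, so gradients from acc_freq consecutive batches are summed before the single optimizer step.

# Minimal, self-contained sketch of the accumulation pattern added in this commit.
# Toy model/data and a plain Adam optimizer are stand-ins for the real trainer.
import paddle

model = paddle.nn.Linear(3, 1)
optimizer = paddle.optimizer.Adam(learning_rate=1e-3, parameters=model.parameters())
acc_freq = 4    # accumulate gradients over 4 steps before each update
max_norm = 1.0  # counterpart of gradient_max_norm; <= 0 would disable clipping

for step_id in range(16):
    x = paddle.randn([8, 3])
    y = paddle.randn([8, 1])
    loss = paddle.nn.functional.mse_loss(model(x), y)
    loss.backward()  # gradients from successive batches are summed in place

    if (step_id + 1) % acc_freq == 0:
        if max_norm > 0.0:
            paddle.nn.utils.clip_grad_norm_(
                model.parameters(), max_norm, error_if_nonfinite=True
            )
        optimizer.step()
        optimizer.clear_grad()  # reset gradient buffers only after the update

Because clear_grad now runs only after an update, the backward passes in between keep adding to the same gradient buffers, which is exactly what moving clear_grad out of the top of step() accomplishes in the diff.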

deepmd/utils/argcheck.py

Lines changed: 9 additions & 0 deletions
@@ -40,6 +40,7 @@

 doc_only_tf_supported = "(Supported Backend: TensorFlow) "
 doc_only_pt_supported = "(Supported Backend: PyTorch) "
+doc_only_pd_supported = "(Supported Backend: Paddle) "
 # descriptors
 doc_loc_frame = "Defines a local frame at each atom, and the compute the descriptor as local coordinates under this frame."
 doc_se_e2_a = "Used by the smooth edition of Deep Potential. The full relative coordinates are used to construct the descriptor."
@@ -3167,6 +3168,7 @@ def training_args(
     doc_kf_blocksize = "The blocksize for the Kalman filter."
     doc_model_prob = "The visiting probability of each model for each training step in the multi-task mode."
     doc_data_dict = "The multiple definition of the data, used in the multi-task mode."
+    doc_acc_freq = "Gradient accumulation steps (number of steps to accumulate gradients before performing an update)."

     arg_training_data = training_data_args()
     arg_validation_data = validation_data_args()
@@ -3269,6 +3271,13 @@ def training_args(
             optional=True,
             doc=doc_only_pt_supported + doc_gradient_max_norm,
         ),
+        Argument(
+            "acc_freq",
+            int,
+            optional=True,
+            default=1,
+            doc=doc_only_pd_supported + doc_acc_freq,
+        ),
     ]
     variants = [
         Variant(
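In practice, acc_freq is set in the "training" section of the input script; per the argument doc it is honored by the Paddle backend only, and the default of 1 reproduces the old behavior. A hypothetical fragment of such a configuration, shown as the Python dict it becomes once loaded (values other than acc_freq are placeholders, not from this commit):

# Hypothetical "training" section with gradient accumulation enabled.
training_config = {
    "training_data": {"systems": ["path/to/data_0"]},
    "numb_steps": 100000,
    "acc_freq": 4,  # Paddle backend only; default 1 keeps the previous behavior
}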

source/tests/pd/test_training.py

Lines changed: 18 additions & 2 deletions
@@ -150,9 +150,25 @@ def setUp(self) -> None:
         self.config["model"] = deepcopy(model_se_e2_a)
         self.config["training"]["numb_steps"] = 1
         self.config["training"]["save_freq"] = 1
-        # import paddle
         enable_prim(True)
-        # assert paddle.framework.core._is_eager_prim_enabled()
+
+    def tearDown(self) -> None:
+        DPTrainTest.tearDown(self)
+
+
+class TestEnergyModelGradientAccumulation(unittest.TestCase, DPTrainTest):
+    def setUp(self) -> None:
+        input_json = str(Path(__file__).parent / "water/se_atten.json")
+        with open(input_json) as f:
+            self.config = json.load(f)
+        data_file = [str(Path(__file__).parent / "water/data/data_0")]
+        self.config["training"]["training_data"]["systems"] = data_file
+        self.config["training"]["validation_data"]["systems"] = data_file
+        self.config["model"] = deepcopy(model_se_e2_a)
+        self.config["training"]["numb_steps"] = 1
+        self.config["training"]["save_freq"] = 1
+        self.config["training"]["acc_freq"] = 4
+        enable_prim(True)

     def tearDown(self) -> None:
         DPTrainTest.tearDown(self)
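To exercise only the new test class, the standard pytest node-id syntax can be used; a small sketch via pytest's Python entry point (assuming pytest is the runner for these unittest-style tests):

# Run just the new gradient-accumulation test; node id taken from the diff above.
import pytest

pytest.main(
    ["-v", "source/tests/pd/test_training.py::TestEnergyModelGradientAccumulation"]
)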
