Skip to content

Commit 432be58

Browse files
committed
Add `use_cache` as a modifier variable so that the user can choose whether to cache the modified data before training or to perform the modification on the fly.
1 parent 8fd8f65 commit 432be58

3 files changed

Lines changed: 40 additions & 20 deletions

File tree

deepmd/pt/modifier/base_modifier.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,12 +28,14 @@
2828

2929

3030
class BaseModifier(torch.nn.Module, make_base_modifier()):
31-
def __init__(self) -> None:
31+
def __init__(self, use_cache: bool = True) -> None:
3232
"""Construct a base modifier for data modification tasks."""
3333
torch.nn.Module.__init__(self)
3434
self.modifier_type = "base"
3535
self.jitable = True
3636

37+
self.use_cache = use_cache
38+
3739
def serialize(self) -> dict:
3840
"""Serialize the modifier.
3941

deepmd/utils/data.py

Lines changed: 9 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -140,11 +140,11 @@ def __init__(
140140
# The prefix sum stores the range of indices contained in each directory, which is needed by get_item method
141141
self.prefix_sum = np.cumsum(frames_list).tolist()
142142

143-
self.apply_modifier_at_load = True
143+
self.use_modifier_cache = True
144144
if self.modifier is not None:
145-
if hasattr(self.modifier, "apply_modifier_at_load"):
146-
self.apply_modifier_at_load = self.modifier.apply_modifier_at_load
147-
# Cache for modified frames when apply_modifier_at_load is True
145+
if hasattr(self.modifier, "use_cache"):
146+
self.use_modifier_cache = self.modifier.use_cache
147+
# Cache for modified frames when use_modifier_cache is True
148148
self._modified_frame_cache = {}
149149

150150
def add(
@@ -385,9 +385,9 @@ def get_natoms_vec(self, ntypes: int) -> np.ndarray:
385385

386386
def get_single_frame(self, index: int) -> dict:
387387
"""Orchestrates loading a single frame efficiently using memmap."""
388-
# Check if we have a cached modified frame and apply_modifier_at_load is True
388+
# Check if we have a cached modified frame and use_modifier_cache is True
389389
if (
390-
self.apply_modifier_at_load
390+
self.use_modifier_cache
391391
and self.modifier is not None
392392
and index in self._modified_frame_cache
393393
):
@@ -490,19 +490,18 @@ def get_single_frame(self, index: int) -> dict:
490490
if self.modifier is not None:
491491
# Apply modifier if it exists
492492
self.modifier.modify_data(frame_data, self)
493-
if self.apply_modifier_at_load:
493+
if self.use_modifier_cache:
494494
# Cache the modified frame to avoid recomputation
495495
self._modified_frame_cache[index] = copy.deepcopy(frame_data)
496-
497496
return frame_data
498497

499498
def preload_and_modify_all_data(self) -> None:
500499
"""Preload all frames and apply modifier to cache them.
501500
502-
This method is useful when apply_modifier_at_load is True and you want to
501+
This method is useful when use_modifier_cache is True and you want to
503502
avoid applying the modifier repeatedly during training.
504503
"""
505-
if not self.apply_modifier_at_load or self.modifier is None:
504+
if not self.use_modifier_cache or self.modifier is None:
506505
return
507506

508507
log.info("Preloading and modifying all data frames...")

source/tests/pt/test_data_modifier.py

Lines changed: 28 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -64,23 +64,30 @@
6464
@modifier_args_plugin.register("random_tester", doc=doc_random_tester)
6565
def modifier_random_tester() -> list:
6666
doc_seed = "Random seed used to initialize the random number generator for deterministic scaling factors."
67+
doc_use_cache = "Whether to cache modified frames to improve performance by avoiding recomputation."
6768
return [
6869
Argument("seed", int, optional=True, doc=doc_seed),
70+
Argument("use_cache", bool, optional=True, doc=doc_use_cache),
6971
]
7072

7173

7274
@modifier_args_plugin.register("zero_tester", doc=doc_zero_tester)
7375
def modifier_zero_tester() -> list:
74-
return []
76+
doc_use_cache = "Whether to cache modified frames to improve performance by avoiding recomputation."
77+
return [
78+
Argument("use_cache", bool, optional=True, doc=doc_use_cache),
79+
]
7580

7681

7782
@modifier_args_plugin.register("scaling_tester", doc=doc_scaling_tester)
7883
def modifier_scaling_tester() -> list[Argument]:
7984
doc_model_name = "The name of the frozen energy model file."
8085
doc_sfactor = "The scaling factor for correction."
86+
doc_use_cache = "Whether to cache modified frames to improve performance by avoiding recomputation."
8187
return [
8288
Argument("model_name", str, optional=False, doc=doc_model_name),
8389
Argument("sfactor", float, optional=False, doc=doc_sfactor),
90+
Argument("use_cache", bool, optional=True, doc=doc_use_cache),
8491
]
8592

8693

@@ -92,12 +99,14 @@ def __new__(cls, *args, **kwargs):
9299
def __init__(
93100
self,
94101
seed: int = 1,
102+
use_cache: bool = True,
95103
) -> None:
96104
"""Construct a random_tester modifier that scales data by deterministic random factors for testing."""
97-
super().__init__()
105+
super().__init__(use_cache)
98106
self.modifier_type = "random_tester"
99107
# Use a fixed seed for deterministic behavior
100108
self.rng = np.random.default_rng(seed)
109+
self.sfactor = self.rng.random()
101110

102111
def forward(
103112
self,
@@ -121,21 +130,24 @@ def modify_data(self, data: dict[str, Array | float], data_sys: DeepmdData) -> N
121130
return
122131

123132
if "find_energy" in data and data["find_energy"] == 1.0:
124-
data["energy"] = data["energy"] * self.rng.random()
133+
data["energy"] = data["energy"] * self.sfactor
125134
if "find_force" in data and data["find_force"] == 1.0:
126-
data["force"] = data["force"] * self.rng.random()
135+
data["force"] = data["force"] * self.sfactor
127136
if "find_virial" in data and data["find_virial"] == 1.0:
128-
data["virial"] = data["virial"] * self.rng.random()
137+
data["virial"] = data["virial"] * self.sfactor
129138

130139

131140
@BaseModifier.register("zero_tester")
132141
class ModifierZeroTester(BaseModifier):
133142
def __new__(cls, *args, **kwargs):
134143
return super().__new__(cls)
135144

136-
def __init__(self) -> None:
145+
def __init__(
146+
self,
147+
use_cache: bool = True,
148+
) -> None:
137149
"""Construct a modifier that zeros out data for testing."""
138-
super().__init__()
150+
super().__init__(use_cache)
139151
self.modifier_type = "zero_tester"
140152

141153
def forward(
@@ -176,9 +188,10 @@ def __init__(
176188
self,
177189
model_name: str,
178190
sfactor: float = 1.0,
191+
use_cache: bool = True,
179192
) -> None:
180193
"""Initialize a test modifier that applies scaled model predictions using a frozen model."""
181-
super().__init__()
194+
super().__init__(use_cache)
182195
self.modifier_type = "scaling_tester"
183196
self.model_name = model_name
184197
self.sfactor = sfactor
@@ -212,6 +225,7 @@ def forward(
212225
@parameterized(
213226
(1, 2), # training data batch_size
214227
(1, 2), # validation data batch_size
228+
(True, False), # use_cache
215229
)
216230
class TestDataModifier(unittest.TestCase):
217231
def setUp(self) -> None:
@@ -240,7 +254,10 @@ def test_init_modify_data(self):
240254
"""Ensure modify_data applied."""
241255
tmp_config = self.config.copy()
242256
# add tester data modifier
243-
tmp_config["model"]["modifier"] = {"type": "zero_tester"}
257+
tmp_config["model"]["modifier"] = {
258+
"type": "zero_tester",
259+
"use_cache": self.param[2],
260+
}
244261

245262
# data modification is finished in __init__
246263
trainer = get_trainer(tmp_config)
@@ -262,6 +279,7 @@ def test_full_modify_data(self):
262279
tmp_config["model"]["modifier"] = {
263280
"type": "random_tester",
264281
"seed": 1024,
282+
"use_cache": self.param[2],
265283
}
266284

267285
# data modification is finished in __init__
@@ -307,6 +325,7 @@ def test_inference(self):
307325
"type": "scaling_tester",
308326
"model_name": "frozen_model_dm.pth",
309327
"sfactor": sfactor,
328+
"use_cache": True,
310329
}
311330

312331
trainer = get_trainer(tmp_config)

0 commit comments

Comments
 (0)