From f261151fbb0e1a21599d69905b393e9ddfbe66d4 Mon Sep 17 00:00:00 2001 From: Anyang Peng <137014849+anyangml@users.noreply.github.com> Date: Mon, 23 Mar 2026 12:26:30 +0800 Subject: [PATCH 01/14] fix: Square atom_norm in non-Huber energy and virial loss calculations. --- deepmd/dpmodel/loss/ener.py | 4 ++-- deepmd/pd/loss/ener.py | 4 ++-- deepmd/pt/loss/ener.py | 4 ++-- deepmd/tf/loss/ener.py | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/deepmd/dpmodel/loss/ener.py b/deepmd/dpmodel/loss/ener.py index 201a77fcdd..8172302f5d 100644 --- a/deepmd/dpmodel/loss/ener.py +++ b/deepmd/dpmodel/loss/ener.py @@ -245,7 +245,7 @@ def call( if self.loss_func == "mse": l2_ener_loss = xp.mean(xp.square(energy - energy_hat)) if not self.use_huber: - loss += atom_norm_ener * (pref_e * l2_ener_loss) + loss += atom_norm_ener**2 * (pref_e * l2_ener_loss) else: l_huber_loss = custom_huber_loss( atom_norm_ener * energy, @@ -312,7 +312,7 @@ def call( xp.square(virial_hat_reshape - virial_reshape), ) if not self.use_huber: - loss += atom_norm * (pref_v * l2_virial_loss) + loss += atom_norm**2 * (pref_v * l2_virial_loss) else: l_huber_loss = custom_huber_loss( atom_norm * virial_reshape, diff --git a/deepmd/pd/loss/ener.py b/deepmd/pd/loss/ener.py index cf093b90d4..9694cd1236 100644 --- a/deepmd/pd/loss/ener.py +++ b/deepmd/pd/loss/ener.py @@ -233,7 +233,7 @@ def forward( l2_ener_loss.detach(), find_energy ) if not self.use_huber: - loss += atom_norm * (pref_e * l2_ener_loss) + loss += atom_norm**2 * (pref_e * l2_ener_loss) else: l_huber_loss = custom_huber_loss( atom_norm * energy_pred, @@ -404,7 +404,7 @@ def forward( l2_virial_loss.detach(), find_virial ) if not self.use_huber: - loss += atom_norm * (pref_v * l2_virial_loss) + loss += atom_norm**2 * (pref_v * l2_virial_loss) else: l_huber_loss = custom_huber_loss( atom_norm * model_pred["virial"].reshape([-1]), diff --git a/deepmd/pt/loss/ener.py b/deepmd/pt/loss/ener.py index 66d60aacec..1830a6e64b 100644 --- 
a/deepmd/pt/loss/ener.py +++ b/deepmd/pt/loss/ener.py @@ -239,7 +239,7 @@ def forward( l2_ener_loss.detach(), find_energy ) if not self.use_huber: - loss += atom_norm * (pref_e * l2_ener_loss) + loss += atom_norm**2 * (pref_e * l2_ener_loss) else: l_huber_loss = custom_huber_loss( atom_norm * energy_pred, @@ -421,7 +421,7 @@ def forward( l2_virial_loss.detach(), find_virial ) if not self.use_huber: - loss += atom_norm * (pref_v * l2_virial_loss) + loss += atom_norm**2 * (pref_v * l2_virial_loss) else: l_huber_loss = custom_huber_loss( atom_norm * model_pred["virial"].reshape(-1), diff --git a/deepmd/tf/loss/ener.py b/deepmd/tf/loss/ener.py index 91607245a2..726c903dba 100644 --- a/deepmd/tf/loss/ener.py +++ b/deepmd/tf/loss/ener.py @@ -346,7 +346,7 @@ def build( more_loss = {} if self.has_e: if not self.use_huber: - loss += atom_norm_ener * (pref_e * l2_ener_loss) + loss += atom_norm_ener**2 * (pref_e * l2_ener_loss) else: l_huber_loss = custom_huber_loss( atom_norm_ener * energy, @@ -370,7 +370,7 @@ def build( ) if self.has_v: if not self.use_huber: - loss += global_cvt_2_ener_float(atom_norm * (pref_v * l2_virial_loss)) + loss += global_cvt_2_ener_float(atom_norm**2 * (pref_v * l2_virial_loss)) else: l_huber_loss = custom_huber_loss( atom_norm * tf.reshape(virial, [-1]), From 247f05388fed9b64d49bc940a067780ed1a2b1b8 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 23 Mar 2026 04:28:19 +0000 Subject: [PATCH 02/14] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- deepmd/tf/loss/ener.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/deepmd/tf/loss/ener.py b/deepmd/tf/loss/ener.py index 726c903dba..08d89fdefd 100644 --- a/deepmd/tf/loss/ener.py +++ b/deepmd/tf/loss/ener.py @@ -370,7 +370,9 @@ def build( ) if self.has_v: if not self.use_huber: - loss += global_cvt_2_ener_float(atom_norm**2 * (pref_v * l2_virial_loss)) + 
loss += global_cvt_2_ener_float( + atom_norm**2 * (pref_v * l2_virial_loss) + ) else: l_huber_loss = custom_huber_loss( atom_norm * tf.reshape(virial, [-1]), From 11be90881ee62196fe02ace8f582279166794043 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 16 Apr 2026 07:51:28 +0000 Subject: [PATCH 03/14] feat(loss): add intensive parameter for backward-compatible RMSE normalization Agent-Logs-Url: https://github.com/anyangml/deepmd-kit/sessions/98719546-6f9c-433e-a3de-dc65d98c68fb Co-authored-by: anyangml <137014849+anyangml@users.noreply.github.com> --- deepmd/dpmodel/loss/ener.py | 24 ++++++++++++++++++++---- deepmd/dpmodel/loss/ener_spin.py | 24 ++++++++++++++++++++---- deepmd/pd/loss/ener.py | 15 +++++++++++++-- deepmd/pt/loss/ener.py | 15 +++++++++++++-- deepmd/pt/loss/ener_spin.py | 15 +++++++++++++-- deepmd/tf/loss/ener.py | 15 +++++++++++++-- deepmd/utils/argcheck.py | 28 ++++++++++++++++++++++++++++ 7 files changed, 120 insertions(+), 16 deletions(-) diff --git a/deepmd/dpmodel/loss/ener.py b/deepmd/dpmodel/loss/ener.py index 55c427c678..09f43751ca 100644 --- a/deepmd/dpmodel/loss/ener.py +++ b/deepmd/dpmodel/loss/ener.py @@ -90,6 +90,13 @@ class EnergyLoss(Loss): If true, use L2 norm of force vectors for loss calculation when loss_func='mae' or use_huber is True. Instead of computing loss on force components, computes loss on ||F_pred - F_label||_2. This treats the force vector as a whole rather than three independent components. + intensive : bool + If true (default), energy and virial losses are computed as intensive quantities, + normalized by the square of the number of atoms (1/N^2). This ensures the loss + value is independent of system size and consistent with per-atom RMSE reporting. + If false, uses the legacy normalization (1/N), which may cause the loss to scale + with system size. Set to false for backward compatibility with models trained + using deepmd-kit <= 3.0.1. 
**kwargs Other keyword arguments. """ @@ -116,6 +123,7 @@ def __init__( huber_delta: float | list[float] = 0.01, loss_func: str = "mse", f_use_norm: bool = False, + intensive: bool = True, **kwargs: Any, ) -> None: # Validate loss_func @@ -155,6 +163,7 @@ def __init__( self.use_huber = use_huber self.huber_delta = huber_delta self.f_use_norm = f_use_norm + self.intensive = intensive if self.f_use_norm and not (self.use_huber or self.loss_func == "mae"): raise RuntimeError( "f_use_norm can only be True when use_huber or loss_func='mae'." @@ -256,11 +265,13 @@ def call( loss = 0 more_loss = {} + # Normalization exponent: 2 for intensive (new), 1 for legacy behavior + norm_exp = 2 if self.intensive else 1 if self.has_e: if self.loss_func == "mse": l2_ener_loss = xp.mean(xp.square(energy - energy_hat)) if not self.use_huber: - loss += atom_norm_ener**2 * (pref_e * l2_ener_loss) + loss += atom_norm_ener**norm_exp * (pref_e * l2_ener_loss) else: l_huber_loss = custom_huber_loss( atom_norm_ener * energy, @@ -335,7 +346,7 @@ def call( xp.square(virial_hat_reshape - virial_reshape), ) if not self.use_huber: - loss += atom_norm**2 * (pref_v * l2_virial_loss) + loss += atom_norm**norm_exp * (pref_v * l2_virial_loss) else: l_huber_loss = custom_huber_loss( atom_norm * virial_reshape, @@ -525,7 +536,7 @@ def serialize(self) -> dict: """ return { "@class": "EnergyLoss", - "@version": 2, + "@version": 3, "starter_learning_rate": self.starter_learning_rate, "start_pref_e": self.start_pref_e, "limit_pref_e": self.limit_pref_e, @@ -546,6 +557,7 @@ def serialize(self) -> dict: "huber_delta": self.huber_delta, "loss_func": self.loss_func, "f_use_norm": self.f_use_norm, + "intensive": self.intensive, } @classmethod @@ -563,6 +575,10 @@ def deserialize(cls, data: dict) -> "Loss": The deserialized loss module """ data = data.copy() - check_version_compatibility(data.pop("@version"), 2, 1) + version = data.pop("@version") + check_version_compatibility(version, 3, 1) data.pop("@class") + # 
Backward compatibility: version 1-2 used legacy normalization + if version < 3: + data.setdefault("intensive", False) return cls(**data) diff --git a/deepmd/dpmodel/loss/ener_spin.py b/deepmd/dpmodel/loss/ener_spin.py index a13d626764..ef77a0bd0b 100644 --- a/deepmd/dpmodel/loss/ener_spin.py +++ b/deepmd/dpmodel/loss/ener_spin.py @@ -50,6 +50,13 @@ class EnergySpinLoss(Loss): if true, the energy will be computed as \sum_i c_i E_i loss_func : str Loss function type: 'mse' or 'mae'. + intensive : bool + If true (default), energy and virial losses are computed as intensive quantities, + normalized by the square of the number of atoms (1/N^2). This ensures the loss + value is independent of system size and consistent with per-atom RMSE reporting. + If false, uses the legacy normalization (1/N), which may cause the loss to scale + with system size. Set to false for backward compatibility with models trained + using deepmd-kit <= 3.0.1. **kwargs Other keyword arguments. """ @@ -69,6 +76,7 @@ def __init__( limit_pref_ae: float = 0.0, enable_atom_ener_coeff: bool = False, loss_func: str = "mse", + intensive: bool = True, **kwargs: Any, ) -> None: valid_loss_funcs = ["mse", "mae"] @@ -89,6 +97,7 @@ def __init__( self.start_pref_ae = start_pref_ae self.limit_pref_ae = limit_pref_ae self.enable_atom_ener_coeff = enable_atom_ener_coeff + self.intensive = intensive self.has_e = self.start_pref_e != 0.0 or self.limit_pref_e != 0.0 self.has_fr = self.start_pref_fr != 0.0 or self.limit_pref_fr != 0.0 self.has_fm = self.start_pref_fm != 0.0 or self.limit_pref_fm != 0.0 @@ -117,6 +126,8 @@ def call( loss = 0 more_loss = {} atom_norm = 1.0 / natoms + # Normalization exponent: 2 for intensive (new), 1 for legacy behavior + norm_exp = 2 if self.intensive else 1 if self.has_e: energy_pred = model_dict["energy"] @@ -130,7 +141,7 @@ def call( energy_pred = xp.sum(atom_ener_coeff * atom_ener_pred, axis=1) if self.loss_func == "mse": l2_ener_loss = xp.mean(xp.square(energy_pred - 
energy_label)) - loss += atom_norm * (pref_e * l2_ener_loss) + loss += atom_norm**norm_exp * (pref_e * l2_ener_loss) more_loss["rmse_e"] = self.display_if_exist( xp.sqrt(l2_ener_loss) * atom_norm, find_energy ) @@ -238,7 +249,7 @@ def call( diff_v = virial_label - virial_pred if self.loss_func == "mse": l2_virial_loss = xp.mean(xp.square(diff_v)) - loss += atom_norm * (pref_v * l2_virial_loss) + loss += atom_norm**norm_exp * (pref_v * l2_virial_loss) more_loss["rmse_v"] = self.display_if_exist( xp.sqrt(l2_virial_loss) * atom_norm, find_virial ) @@ -326,7 +337,7 @@ def serialize(self) -> dict: """Serialize the loss module.""" return { "@class": "EnergySpinLoss", - "@version": 1, + "@version": 2, "starter_learning_rate": self.starter_learning_rate, "start_pref_e": self.start_pref_e, "limit_pref_e": self.limit_pref_e, @@ -340,12 +351,17 @@ def serialize(self) -> dict: "limit_pref_ae": self.limit_pref_ae, "enable_atom_ener_coeff": self.enable_atom_ener_coeff, "loss_func": self.loss_func, + "intensive": self.intensive, } @classmethod def deserialize(cls, data: dict) -> "EnergySpinLoss": """Deserialize the loss module.""" data = data.copy() - check_version_compatibility(data.pop("@version"), 1, 1) + version = data.pop("@version") + check_version_compatibility(version, 2, 1) data.pop("@class") + # Backward compatibility: version 1 used legacy normalization + if version < 2: + data.setdefault("intensive", False) return cls(**data) diff --git a/deepmd/pd/loss/ener.py b/deepmd/pd/loss/ener.py index 08b16fd4cf..88522059b8 100644 --- a/deepmd/pd/loss/ener.py +++ b/deepmd/pd/loss/ener.py @@ -61,6 +61,7 @@ def __init__( use_huber: bool = False, huber_delta: float | list[float] = 0.01, f_use_norm: bool = False, + intensive: bool = True, **kwargs: Any, ) -> None: r"""Construct a layer to compute loss on energy, force and virial. @@ -119,6 +120,13 @@ def __init__( f_use_norm : bool If True, use L2 norm of force vectors for loss calculation. 
Not implemented in PD backend, only for serialization compatibility. + intensive : bool + If true (default), energy and virial losses are computed as intensive quantities, + normalized by the square of the number of atoms (1/N^2). This ensures the loss + value is independent of system size and consistent with per-atom RMSE reporting. + If false, uses the legacy normalization (1/N), which may cause the loss to scale + with system size. Set to false for backward compatibility with models trained + using deepmd-kit <= 3.0.1. **kwargs Other keyword arguments. """ @@ -161,6 +169,7 @@ def __init__( self.inference = inference self.use_huber = use_huber self.huber_delta = huber_delta + self.intensive = intensive ( self._huber_delta_energy, self._huber_delta_force, @@ -218,6 +227,8 @@ def forward( # more_loss['log_keys'] = [] # showed when validation on the fly # more_loss['test_keys'] = [] # showed when doing dp test atom_norm = 1.0 / natoms + # Normalization exponent: 2 for intensive (new), 1 for legacy behavior + norm_exp = 2 if self.intensive else 1 if self.has_e and "energy" in model_pred and "energy" in label: energy_pred = model_pred["energy"] energy_label = label["energy"] @@ -243,7 +254,7 @@ def forward( l2_ener_loss.detach(), find_energy ) if not self.use_huber: - loss += atom_norm**2 * (pref_e * l2_ener_loss) + loss += atom_norm**norm_exp * (pref_e * l2_ener_loss) else: l_huber_loss = custom_huber_loss( atom_norm * energy_pred, @@ -414,7 +425,7 @@ def forward( l2_virial_loss.detach(), find_virial ) if not self.use_huber: - loss += atom_norm**2 * (pref_v * l2_virial_loss) + loss += atom_norm**norm_exp * (pref_v * l2_virial_loss) else: l_huber_loss = custom_huber_loss( atom_norm * model_pred["virial"].reshape([-1]), diff --git a/deepmd/pt/loss/ener.py b/deepmd/pt/loss/ener.py index 1e614024cc..038715a6c7 100644 --- a/deepmd/pt/loss/ener.py +++ b/deepmd/pt/loss/ener.py @@ -61,6 +61,7 @@ def __init__( use_huber: bool = False, f_use_norm: bool = False, huber_delta: 
float | list[float] = 0.01, + intensive: bool = True, **kwargs: Any, ) -> None: r"""Construct a layer to compute loss on energy, force and virial. @@ -120,6 +121,13 @@ def __init__( The threshold delta (D) used for Huber loss, controlling transition between L2 and L1 loss. It can be either one float shared by all terms or a list of three values ordered as [energy, force, virial]. + intensive : bool + If true (default), energy and virial losses are computed as intensive quantities, + normalized by the square of the number of atoms (1/N^2). This ensures the loss + value is independent of system size and consistent with per-atom RMSE reporting. + If false, uses the legacy normalization (1/N), which may cause the loss to scale + with system size. Set to false for backward compatibility with models trained + using deepmd-kit <= 3.0.1. **kwargs Other keyword arguments. """ @@ -163,6 +171,7 @@ def __init__( self.inference = inference self.use_huber = use_huber self.f_use_norm = f_use_norm + self.intensive = intensive if self.f_use_norm and not (self.use_huber or self.loss_func == "mae"): raise RuntimeError( "f_use_norm can only be True when use_huber or loss_func='mae'." 
@@ -225,6 +234,8 @@ def forward( # more_loss['log_keys'] = [] # showed when validation on the fly # more_loss['test_keys'] = [] # showed when doing dp test atom_norm = 1.0 / natoms + # Normalization exponent: 2 for intensive (new), 1 for legacy behavior + norm_exp = 2 if self.intensive else 1 if self.has_e and "energy" in model_pred and "energy" in label: energy_pred = model_pred["energy"] energy_label = label["energy"] @@ -250,7 +261,7 @@ def forward( l2_ener_loss.detach(), find_energy ) if not self.use_huber: - loss += atom_norm**2 * (pref_e * l2_ener_loss) + loss += atom_norm**norm_exp * (pref_e * l2_ener_loss) else: l_huber_loss = custom_huber_loss( atom_norm * energy_pred, @@ -432,7 +443,7 @@ def forward( l2_virial_loss.detach(), find_virial ) if not self.use_huber: - loss += atom_norm**2 * (pref_v * l2_virial_loss) + loss += atom_norm**norm_exp * (pref_v * l2_virial_loss) else: l_huber_loss = custom_huber_loss( atom_norm * model_pred["virial"].reshape(-1), diff --git a/deepmd/pt/loss/ener_spin.py b/deepmd/pt/loss/ener_spin.py index df9885109d..f7f706ac53 100644 --- a/deepmd/pt/loss/ener_spin.py +++ b/deepmd/pt/loss/ener_spin.py @@ -40,6 +40,7 @@ def __init__( enable_atom_ener_coeff: bool = False, loss_func: str = "mse", inference: bool = False, + intensive: bool = True, **kwargs: Any, ) -> None: r"""Construct a layer to compute loss on energy, real force, magnetic force and virial. @@ -76,6 +77,13 @@ def __init__( MAE loss is less sensitive to outliers compared to MSE loss. inference : bool If true, it will output all losses found in output, ignoring the pre-factors. + intensive : bool + If true (default), energy and virial losses are computed as intensive quantities, + normalized by the square of the number of atoms (1/N^2). This ensures the loss + value is independent of system size and consistent with per-atom RMSE reporting. + If false, uses the legacy normalization (1/N), which may cause the loss to scale + with system size. 
Set to false for backward compatibility with models trained + using deepmd-kit <= 3.0.1. **kwargs Other keyword arguments. """ @@ -101,6 +109,7 @@ def __init__( self.limit_pref_ae = limit_pref_ae self.enable_atom_ener_coeff = enable_atom_ener_coeff self.inference = inference + self.intensive = intensive def forward( self, @@ -145,6 +154,8 @@ def forward( # more_loss['log_keys'] = [] # showed when validation on the fly # more_loss['test_keys'] = [] # showed when doing dp test atom_norm = 1.0 / natoms + # Normalization exponent: 2 for intensive (new), 1 for legacy behavior + norm_exp = 2 if self.intensive else 1 if self.has_e and "energy" in model_pred and "energy" in label: energy_pred = model_pred["energy"] energy_label = label["energy"] @@ -169,7 +180,7 @@ def forward( more_loss["l2_ener_loss"] = self.display_if_exist( l2_ener_loss.detach(), find_energy ) - loss += atom_norm * (pref_e * l2_ener_loss) + loss += atom_norm**norm_exp * (pref_e * l2_ener_loss) rmse_e = l2_ener_loss.sqrt() * atom_norm more_loss["rmse_e"] = self.display_if_exist( rmse_e.detach(), find_energy @@ -324,7 +335,7 @@ def forward( more_loss["l2_virial_loss"] = self.display_if_exist( l2_virial_loss.detach(), find_virial ) - loss += atom_norm * (pref_v * l2_virial_loss) + loss += atom_norm**norm_exp * (pref_v * l2_virial_loss) rmse_v = l2_virial_loss.sqrt() * atom_norm more_loss["rmse_v"] = self.display_if_exist( rmse_v.detach(), find_virial diff --git a/deepmd/tf/loss/ener.py b/deepmd/tf/loss/ener.py index d26d4ee075..5b3a4e7e66 100644 --- a/deepmd/tf/loss/ener.py +++ b/deepmd/tf/loss/ener.py @@ -100,6 +100,13 @@ class EnerStdLoss(Loss): f_use_norm : bool If True, use L2 norm of force vectors for loss calculation. Not implemented in TF backend, only for serialization compatibility. + intensive : bool + If true (default), energy and virial losses are computed as intensive quantities, + normalized by the square of the number of atoms (1/N^2). 
This ensures the loss + value is independent of system size and consistent with per-atom RMSE reporting. + If false, uses the legacy normalization (1/N), which may cause the loss to scale + with system size. Set to false for backward compatibility with models trained + using deepmd-kit <= 3.0.1. **kwargs Other keyword arguments. """ @@ -126,6 +133,7 @@ def __init__( huber_delta: float | list[float] = 0.01, loss_func: str = "mse", f_use_norm: bool = False, + intensive: bool = True, **kwargs: Any, ) -> None: if loss_func != "mse": @@ -167,6 +175,7 @@ def __init__( ) self.use_huber = use_huber self.huber_delta = huber_delta + self.intensive = intensive ( self._huber_delta_energy, self._huber_delta_force, @@ -354,9 +363,11 @@ def build( loss = 0 more_loss = {} + # Normalization exponent: 2 for intensive (new), 1 for legacy behavior + norm_exp = 2 if self.intensive else 1 if self.has_e: if not self.use_huber: - loss += atom_norm_ener**2 * (pref_e * l2_ener_loss) + loss += atom_norm_ener**norm_exp * (pref_e * l2_ener_loss) else: l_huber_loss = custom_huber_loss( atom_norm_ener * energy, @@ -381,7 +392,7 @@ def build( if self.has_v: if not self.use_huber: loss += global_cvt_2_ener_float( - atom_norm**2 * (pref_v * l2_virial_loss) + atom_norm**norm_exp * (pref_v * l2_virial_loss) ) else: l_huber_loss = custom_huber_loss( diff --git a/deepmd/utils/argcheck.py b/deepmd/utils/argcheck.py index 4ac49cf4d8..b4174df385 100644 --- a/deepmd/utils/argcheck.py +++ b/deepmd/utils/argcheck.py @@ -3234,6 +3234,13 @@ def loss_ener() -> list[Argument]: "This treats the force vector as a whole rather than three independent components. " "Only effective when loss_func='mae' or use_huber=True." ) + doc_intensive = ( + "If true (default), energy and virial losses are computed as intensive quantities, " + "normalized by the square of the number of atoms (1/N^2). " + "This ensures the loss value is independent of system size and consistent with per-atom RMSE reporting. 
" + "If false, uses the legacy normalization (1/N), which may cause the loss to scale with system size. " + "Set to false for backward compatibility with models trained using deepmd-kit <= 3.0.1." + ) return [ Argument( "start_pref_e", @@ -3376,6 +3383,13 @@ def loss_ener() -> list[Argument]: default=0.01, doc=doc_huber_delta, ), + Argument( + "intensive", + bool, + optional=True, + default=True, + doc=doc_intensive, + ), ] @@ -3401,6 +3415,13 @@ def loss_ener_spin() -> list[Argument]: "MAE loss is less sensitive to outliers compared to MSE loss. " "Future extensions may support additional loss types." ) + doc_intensive = ( + "If true (default), energy and virial losses are computed as intensive quantities, " + "normalized by the square of the number of atoms (1/N^2). " + "This ensures the loss value is independent of system size and consistent with per-atom RMSE reporting. " + "If false, uses the legacy normalization (1/N), which may cause the loss to scale with system size. " + "Set to false for backward compatibility with models trained using deepmd-kit <= 3.0.1." 
+ ) return [ Argument( "start_pref_e", @@ -3501,6 +3522,13 @@ def loss_ener_spin() -> list[Argument]: default="mse", doc=doc_loss_func, ), + Argument( + "intensive", + bool, + optional=True, + default=True, + doc=doc_intensive, + ), ] From e8331abddb1a7094ac337c2e89865e00f564159c Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 16 Apr 2026 07:54:37 +0000 Subject: [PATCH 04/14] docs(loss): improve comments explaining intensive normalization behavior Agent-Logs-Url: https://github.com/anyangml/deepmd-kit/sessions/98719546-6f9c-433e-a3de-dc65d98c68fb Co-authored-by: anyangml <137014849+anyangml@users.noreply.github.com> --- deepmd/dpmodel/loss/ener.py | 4 +++- deepmd/dpmodel/loss/ener_spin.py | 4 +++- deepmd/pd/loss/ener.py | 4 +++- deepmd/pt/loss/ener.py | 4 +++- deepmd/pt/loss/ener_spin.py | 4 +++- deepmd/tf/loss/ener.py | 4 +++- 6 files changed, 18 insertions(+), 6 deletions(-) diff --git a/deepmd/dpmodel/loss/ener.py b/deepmd/dpmodel/loss/ener.py index 09f43751ca..a652841464 100644 --- a/deepmd/dpmodel/loss/ener.py +++ b/deepmd/dpmodel/loss/ener.py @@ -265,7 +265,9 @@ def call( loss = 0 more_loss = {} - # Normalization exponent: 2 for intensive (new), 1 for legacy behavior + # Normalization exponent controls loss scaling with system size: + # - norm_exp=2 (intensive=True): loss uses 1/N² scaling, making it independent of system size + # - norm_exp=1 (intensive=False, legacy): loss uses 1/N scaling, which varies with system size norm_exp = 2 if self.intensive else 1 if self.has_e: if self.loss_func == "mse": diff --git a/deepmd/dpmodel/loss/ener_spin.py b/deepmd/dpmodel/loss/ener_spin.py index ef77a0bd0b..69e6c5e8e7 100644 --- a/deepmd/dpmodel/loss/ener_spin.py +++ b/deepmd/dpmodel/loss/ener_spin.py @@ -126,7 +126,9 @@ def call( loss = 0 more_loss = {} atom_norm = 1.0 / natoms - # Normalization exponent: 2 for intensive (new), 1 for legacy behavior + # Normalization exponent controls loss scaling with 
system size: + # - norm_exp=2 (intensive=True): loss uses 1/N² scaling, making it independent of system size + # - norm_exp=1 (intensive=False, legacy): loss uses 1/N scaling, which varies with system size norm_exp = 2 if self.intensive else 1 if self.has_e: diff --git a/deepmd/pd/loss/ener.py b/deepmd/pd/loss/ener.py index 88522059b8..191c4c8499 100644 --- a/deepmd/pd/loss/ener.py +++ b/deepmd/pd/loss/ener.py @@ -227,7 +227,9 @@ def forward( # more_loss['log_keys'] = [] # showed when validation on the fly # more_loss['test_keys'] = [] # showed when doing dp test atom_norm = 1.0 / natoms - # Normalization exponent: 2 for intensive (new), 1 for legacy behavior + # Normalization exponent controls loss scaling with system size: + # - norm_exp=2 (intensive=True): loss uses 1/N² scaling, making it independent of system size + # - norm_exp=1 (intensive=False, legacy): loss uses 1/N scaling, which varies with system size norm_exp = 2 if self.intensive else 1 if self.has_e and "energy" in model_pred and "energy" in label: energy_pred = model_pred["energy"] diff --git a/deepmd/pt/loss/ener.py b/deepmd/pt/loss/ener.py index 038715a6c7..cfdc5cc553 100644 --- a/deepmd/pt/loss/ener.py +++ b/deepmd/pt/loss/ener.py @@ -234,7 +234,9 @@ def forward( # more_loss['log_keys'] = [] # showed when validation on the fly # more_loss['test_keys'] = [] # showed when doing dp test atom_norm = 1.0 / natoms - # Normalization exponent: 2 for intensive (new), 1 for legacy behavior + # Normalization exponent controls loss scaling with system size: + # - norm_exp=2 (intensive=True): loss uses 1/N² scaling, making it independent of system size + # - norm_exp=1 (intensive=False, legacy): loss uses 1/N scaling, which varies with system size norm_exp = 2 if self.intensive else 1 if self.has_e and "energy" in model_pred and "energy" in label: energy_pred = model_pred["energy"] diff --git a/deepmd/pt/loss/ener_spin.py b/deepmd/pt/loss/ener_spin.py index f7f706ac53..bd1f7ec3e0 100644 --- 
a/deepmd/pt/loss/ener_spin.py +++ b/deepmd/pt/loss/ener_spin.py @@ -154,7 +154,9 @@ def forward( # more_loss['log_keys'] = [] # showed when validation on the fly # more_loss['test_keys'] = [] # showed when doing dp test atom_norm = 1.0 / natoms - # Normalization exponent: 2 for intensive (new), 1 for legacy behavior + # Normalization exponent controls loss scaling with system size: + # - norm_exp=2 (intensive=True): loss uses 1/N² scaling, making it independent of system size + # - norm_exp=1 (intensive=False, legacy): loss uses 1/N scaling, which varies with system size norm_exp = 2 if self.intensive else 1 if self.has_e and "energy" in model_pred and "energy" in label: energy_pred = model_pred["energy"] diff --git a/deepmd/tf/loss/ener.py b/deepmd/tf/loss/ener.py index 5b3a4e7e66..5f53d65d61 100644 --- a/deepmd/tf/loss/ener.py +++ b/deepmd/tf/loss/ener.py @@ -363,7 +363,9 @@ def build( loss = 0 more_loss = {} - # Normalization exponent: 2 for intensive (new), 1 for legacy behavior + # Normalization exponent controls loss scaling with system size: + # - norm_exp=2 (intensive=True): loss uses 1/N² scaling, making it independent of system size + # - norm_exp=1 (intensive=False, legacy): loss uses 1/N scaling, which varies with system size norm_exp = 2 if self.intensive else 1 if self.has_e: if not self.use_huber: From 601b2ef67e51e92472532e290526445fa8c83a16 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 16 Apr 2026 08:05:07 +0000 Subject: [PATCH 05/14] fix(loss): change intensive default to False for backward compatibility Agent-Logs-Url: https://github.com/anyangml/deepmd-kit/sessions/8cff4193-b092-4923-afce-df5de29d63d3 Co-authored-by: anyangml <137014849+anyangml@users.noreply.github.com> --- deepmd/dpmodel/loss/ener.py | 8 ++++---- deepmd/dpmodel/loss/ener_spin.py | 8 ++++---- deepmd/pd/loss/ener.py | 8 ++++---- deepmd/pt/loss/ener.py | 8 ++++---- deepmd/pt/loss/ener_spin.py | 8 ++++---- 
deepmd/tf/loss/ener.py | 8 ++++---- deepmd/utils/argcheck.py | 16 ++++++++-------- 7 files changed, 32 insertions(+), 32 deletions(-) diff --git a/deepmd/dpmodel/loss/ener.py b/deepmd/dpmodel/loss/ener.py index a652841464..c18db708d1 100644 --- a/deepmd/dpmodel/loss/ener.py +++ b/deepmd/dpmodel/loss/ener.py @@ -91,11 +91,11 @@ class EnergyLoss(Loss): Instead of computing loss on force components, computes loss on ||F_pred - F_label||_2. This treats the force vector as a whole rather than three independent components. intensive : bool - If true (default), energy and virial losses are computed as intensive quantities, + If true, energy and virial losses are computed as intensive quantities, normalized by the square of the number of atoms (1/N^2). This ensures the loss value is independent of system size and consistent with per-atom RMSE reporting. - If false, uses the legacy normalization (1/N), which may cause the loss to scale - with system size. Set to false for backward compatibility with models trained + If false (default), uses the legacy normalization (1/N), which may cause the loss to scale + with system size. The default is false for backward compatibility with models trained using deepmd-kit <= 3.0.1. **kwargs Other keyword arguments. @@ -123,7 +123,7 @@ def __init__( huber_delta: float | list[float] = 0.01, loss_func: str = "mse", f_use_norm: bool = False, - intensive: bool = True, + intensive: bool = False, **kwargs: Any, ) -> None: # Validate loss_func diff --git a/deepmd/dpmodel/loss/ener_spin.py b/deepmd/dpmodel/loss/ener_spin.py index 69e6c5e8e7..f93314511b 100644 --- a/deepmd/dpmodel/loss/ener_spin.py +++ b/deepmd/dpmodel/loss/ener_spin.py @@ -51,11 +51,11 @@ class EnergySpinLoss(Loss): loss_func : str Loss function type: 'mse' or 'mae'. 
intensive : bool - If true (default), energy and virial losses are computed as intensive quantities, + If true, energy and virial losses are computed as intensive quantities, normalized by the square of the number of atoms (1/N^2). This ensures the loss value is independent of system size and consistent with per-atom RMSE reporting. - If false, uses the legacy normalization (1/N), which may cause the loss to scale - with system size. Set to false for backward compatibility with models trained + If false (default), uses the legacy normalization (1/N), which may cause the loss to scale + with system size. The default is false for backward compatibility with models trained using deepmd-kit <= 3.0.1. **kwargs Other keyword arguments. @@ -76,7 +76,7 @@ def __init__( limit_pref_ae: float = 0.0, enable_atom_ener_coeff: bool = False, loss_func: str = "mse", - intensive: bool = True, + intensive: bool = False, **kwargs: Any, ) -> None: valid_loss_funcs = ["mse", "mae"] diff --git a/deepmd/pd/loss/ener.py b/deepmd/pd/loss/ener.py index 191c4c8499..7e4713ba29 100644 --- a/deepmd/pd/loss/ener.py +++ b/deepmd/pd/loss/ener.py @@ -61,7 +61,7 @@ def __init__( use_huber: bool = False, huber_delta: float | list[float] = 0.01, f_use_norm: bool = False, - intensive: bool = True, + intensive: bool = False, **kwargs: Any, ) -> None: r"""Construct a layer to compute loss on energy, force and virial. @@ -121,11 +121,11 @@ def __init__( If True, use L2 norm of force vectors for loss calculation. Not implemented in PD backend, only for serialization compatibility. intensive : bool - If true (default), energy and virial losses are computed as intensive quantities, + If true, energy and virial losses are computed as intensive quantities, normalized by the square of the number of atoms (1/N^2). This ensures the loss value is independent of system size and consistent with per-atom RMSE reporting. 
- If false, uses the legacy normalization (1/N), which may cause the loss to scale - with system size. Set to false for backward compatibility with models trained + If false (default), uses the legacy normalization (1/N), which may cause the loss to scale + with system size. The default is false for backward compatibility with models trained using deepmd-kit <= 3.0.1. **kwargs Other keyword arguments. diff --git a/deepmd/pt/loss/ener.py b/deepmd/pt/loss/ener.py index cfdc5cc553..78f3926306 100644 --- a/deepmd/pt/loss/ener.py +++ b/deepmd/pt/loss/ener.py @@ -61,7 +61,7 @@ def __init__( use_huber: bool = False, f_use_norm: bool = False, huber_delta: float | list[float] = 0.01, - intensive: bool = True, + intensive: bool = False, **kwargs: Any, ) -> None: r"""Construct a layer to compute loss on energy, force and virial. @@ -122,11 +122,11 @@ def __init__( L2 and L1 loss. It can be either one float shared by all terms or a list of three values ordered as [energy, force, virial]. intensive : bool - If true (default), energy and virial losses are computed as intensive quantities, + If true, energy and virial losses are computed as intensive quantities, normalized by the square of the number of atoms (1/N^2). This ensures the loss value is independent of system size and consistent with per-atom RMSE reporting. - If false, uses the legacy normalization (1/N), which may cause the loss to scale - with system size. Set to false for backward compatibility with models trained + If false (default), uses the legacy normalization (1/N), which may cause the loss to scale + with system size. The default is false for backward compatibility with models trained using deepmd-kit <= 3.0.1. **kwargs Other keyword arguments. 
diff --git a/deepmd/pt/loss/ener_spin.py b/deepmd/pt/loss/ener_spin.py index bd1f7ec3e0..0a1dba17d9 100644 --- a/deepmd/pt/loss/ener_spin.py +++ b/deepmd/pt/loss/ener_spin.py @@ -40,7 +40,7 @@ def __init__( enable_atom_ener_coeff: bool = False, loss_func: str = "mse", inference: bool = False, - intensive: bool = True, + intensive: bool = False, **kwargs: Any, ) -> None: r"""Construct a layer to compute loss on energy, real force, magnetic force and virial. @@ -78,11 +78,11 @@ def __init__( inference : bool If true, it will output all losses found in output, ignoring the pre-factors. intensive : bool - If true (default), energy and virial losses are computed as intensive quantities, + If true, energy and virial losses are computed as intensive quantities, normalized by the square of the number of atoms (1/N^2). This ensures the loss value is independent of system size and consistent with per-atom RMSE reporting. - If false, uses the legacy normalization (1/N), which may cause the loss to scale - with system size. Set to false for backward compatibility with models trained + If false (default), uses the legacy normalization (1/N), which may cause the loss to scale + with system size. The default is false for backward compatibility with models trained using deepmd-kit <= 3.0.1. **kwargs Other keyword arguments. diff --git a/deepmd/tf/loss/ener.py b/deepmd/tf/loss/ener.py index 5f53d65d61..4dbba23043 100644 --- a/deepmd/tf/loss/ener.py +++ b/deepmd/tf/loss/ener.py @@ -101,11 +101,11 @@ class EnerStdLoss(Loss): If True, use L2 norm of force vectors for loss calculation. Not implemented in TF backend, only for serialization compatibility. intensive : bool - If true (default), energy and virial losses are computed as intensive quantities, + If true, energy and virial losses are computed as intensive quantities, normalized by the square of the number of atoms (1/N^2). This ensures the loss value is independent of system size and consistent with per-atom RMSE reporting. 
- If false, uses the legacy normalization (1/N), which may cause the loss to scale - with system size. Set to false for backward compatibility with models trained + If false (default), uses the legacy normalization (1/N), which may cause the loss to scale + with system size. The default is false for backward compatibility with models trained using deepmd-kit <= 3.0.1. **kwargs Other keyword arguments. @@ -133,7 +133,7 @@ def __init__( huber_delta: float | list[float] = 0.01, loss_func: str = "mse", f_use_norm: bool = False, - intensive: bool = True, + intensive: bool = False, **kwargs: Any, ) -> None: if loss_func != "mse": diff --git a/deepmd/utils/argcheck.py b/deepmd/utils/argcheck.py index b4174df385..7b3ec0c849 100644 --- a/deepmd/utils/argcheck.py +++ b/deepmd/utils/argcheck.py @@ -3235,11 +3235,11 @@ def loss_ener() -> list[Argument]: "Only effective when loss_func='mae' or use_huber=True." ) doc_intensive = ( - "If true (default), energy and virial losses are computed as intensive quantities, " + "If true, energy and virial losses are computed as intensive quantities, " "normalized by the square of the number of atoms (1/N^2). " "This ensures the loss value is independent of system size and consistent with per-atom RMSE reporting. " - "If false, uses the legacy normalization (1/N), which may cause the loss to scale with system size. " - "Set to false for backward compatibility with models trained using deepmd-kit <= 3.0.1." + "If false (default), uses the legacy normalization (1/N), which may cause the loss to scale with system size. " + "The default is false for backward compatibility with models trained using deepmd-kit <= 3.0.1." ) return [ Argument( @@ -3387,7 +3387,7 @@ def loss_ener() -> list[Argument]: "intensive", bool, optional=True, - default=True, + default=False, doc=doc_intensive, ), ] @@ -3416,11 +3416,11 @@ def loss_ener_spin() -> list[Argument]: "Future extensions may support additional loss types." 
) doc_intensive = ( - "If true (default), energy and virial losses are computed as intensive quantities, " + "If true, energy and virial losses are computed as intensive quantities, " "normalized by the square of the number of atoms (1/N^2). " "This ensures the loss value is independent of system size and consistent with per-atom RMSE reporting. " - "If false, uses the legacy normalization (1/N), which may cause the loss to scale with system size. " - "Set to false for backward compatibility with models trained using deepmd-kit <= 3.0.1." + "If false (default), uses the legacy normalization (1/N), which may cause the loss to scale with system size. " + "The default is false for backward compatibility with models trained using deepmd-kit <= 3.0.1." ) return [ Argument( @@ -3526,7 +3526,7 @@ def loss_ener_spin() -> list[Argument]: "intensive", bool, optional=True, - default=True, + default=False, doc=doc_intensive, ), ] From e86952b6addad363b1a29e707b0bf2eaa1ea4289 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 16 Apr 2026 09:21:56 +0000 Subject: [PATCH 06/14] fix(loss): add intensive key to serialization for all backends Agent-Logs-Url: https://github.com/anyangml/deepmd-kit/sessions/0be1bd64-1461-465b-8cc2-8bc019babc04 Co-authored-by: anyangml <137014849+anyangml@users.noreply.github.com> --- deepmd/pd/loss/ener.py | 9 +++++++-- deepmd/pt/loss/ener.py | 9 +++++++-- deepmd/pt/loss/ener_spin.py | 9 +++++++-- deepmd/tf/loss/ener.py | 9 +++++++-- 4 files changed, 28 insertions(+), 8 deletions(-) diff --git a/deepmd/pd/loss/ener.py b/deepmd/pd/loss/ener.py index 7e4713ba29..2eae343717 100644 --- a/deepmd/pd/loss/ener.py +++ b/deepmd/pd/loss/ener.py @@ -577,7 +577,7 @@ def serialize(self) -> dict: """ return { "@class": "EnergyLoss", - "@version": 2, + "@version": 3, "starter_learning_rate": self.starter_learning_rate, "start_pref_e": self.start_pref_e, "limit_pref_e": self.limit_pref_e, @@ -598,6 +598,7 @@ 
def serialize(self) -> dict: "huber_delta": self.huber_delta, "loss_func": self.loss_func, "f_use_norm": self.f_use_norm, + "intensive": self.intensive, } @classmethod @@ -615,8 +616,12 @@ def deserialize(cls, data: dict) -> "TaskLoss": The deserialized loss module """ data = data.copy() - check_version_compatibility(data.pop("@version"), 2, 1) + version = data.pop("@version") + check_version_compatibility(version, 3, 1) data.pop("@class") + # Handle backward compatibility for older versions without intensive + if version < 3: + data.setdefault("intensive", False) return cls(**data) diff --git a/deepmd/pt/loss/ener.py b/deepmd/pt/loss/ener.py index 78f3926306..9749a2d1c9 100644 --- a/deepmd/pt/loss/ener.py +++ b/deepmd/pt/loss/ener.py @@ -612,7 +612,7 @@ def serialize(self) -> dict: """ return { "@class": "EnergyLoss", - "@version": 2, + "@version": 3, "starter_learning_rate": self.starter_learning_rate, "start_pref_e": self.start_pref_e, "limit_pref_e": self.limit_pref_e, @@ -633,6 +633,7 @@ def serialize(self) -> dict: "huber_delta": self.huber_delta, "loss_func": self.loss_func, "f_use_norm": self.f_use_norm, + "intensive": self.intensive, } @classmethod @@ -650,8 +651,12 @@ def deserialize(cls, data: dict) -> "TaskLoss": The deserialized loss module """ data = data.copy() - check_version_compatibility(data.pop("@version"), 2, 1) + version = data.pop("@version") + check_version_compatibility(version, 3, 1) data.pop("@class") + # Handle backward compatibility for older versions without intensive + if version < 3: + data.setdefault("intensive", False) return cls(**data) diff --git a/deepmd/pt/loss/ener_spin.py b/deepmd/pt/loss/ener_spin.py index 0a1dba17d9..cc31b9de32 100644 --- a/deepmd/pt/loss/ener_spin.py +++ b/deepmd/pt/loss/ener_spin.py @@ -426,7 +426,7 @@ def serialize(self) -> dict: """Serialize the loss module.""" return { "@class": "EnergySpinLoss", - "@version": 1, + "@version": 2, "starter_learning_rate": self.starter_learning_rate, "start_pref_e": 
self.start_pref_e, "limit_pref_e": self.limit_pref_e, @@ -440,12 +440,17 @@ def serialize(self) -> dict: "limit_pref_ae": self.limit_pref_ae, "enable_atom_ener_coeff": self.enable_atom_ener_coeff, "loss_func": self.loss_func, + "intensive": self.intensive, } @classmethod def deserialize(cls, data: dict) -> "EnergySpinLoss": """Deserialize the loss module.""" data = data.copy() - check_version_compatibility(data.pop("@version"), 1, 1) + version = data.pop("@version") + check_version_compatibility(version, 2, 1) data.pop("@class") + # Handle backward compatibility for older versions without intensive + if version < 2: + data.setdefault("intensive", False) return cls(**data) diff --git a/deepmd/tf/loss/ener.py b/deepmd/tf/loss/ener.py index 4dbba23043..c3b8bea070 100644 --- a/deepmd/tf/loss/ener.py +++ b/deepmd/tf/loss/ener.py @@ -556,7 +556,7 @@ def serialize(self, suffix: str = "") -> dict: """ return { "@class": "EnergyLoss", - "@version": 2, + "@version": 3, "starter_learning_rate": self.starter_learning_rate, "start_pref_e": self.start_pref_e, "limit_pref_e": self.limit_pref_e, @@ -577,6 +577,7 @@ def serialize(self, suffix: str = "") -> dict: "huber_delta": self.huber_delta, "loss_func": self.loss_func, "f_use_norm": self.f_use_norm, + "intensive": self.intensive, } @classmethod @@ -596,8 +597,12 @@ def deserialize(cls, data: dict, suffix: str = "") -> "Loss": The deserialized loss module """ data = data.copy() - check_version_compatibility(data.pop("@version"), 2, 1) + version = data.pop("@version") + check_version_compatibility(version, 3, 1) data.pop("@class") + # Handle backward compatibility for older versions without intensive + if version < 3: + data.setdefault("intensive", False) return cls(**data) From 38c97ffd8011b1b66494660e68813adb60fee8b9 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 17 Apr 2026 02:28:11 +0000 Subject: [PATCH 07/14] fix(loss): add intensive parameter to TF 
EnerSpinLoss class Agent-Logs-Url: https://github.com/anyangml/deepmd-kit/sessions/6f9609a4-4571-42ca-b057-d44f7614df9c Co-authored-by: anyangml <137014849+anyangml@users.noreply.github.com> --- deepmd/tf/loss/ener.py | 69 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 67 insertions(+), 2 deletions(-) diff --git a/deepmd/tf/loss/ener.py b/deepmd/tf/loss/ener.py index c3b8bea070..78fbad0aa1 100644 --- a/deepmd/tf/loss/ener.py +++ b/deepmd/tf/loss/ener.py @@ -626,6 +626,7 @@ def __init__( enable_atom_ener_coeff: bool = False, use_spin: list | None = None, loss_func: str = "mse", + intensive: bool = False, ) -> None: if loss_func != "mse": raise NotImplementedError( @@ -648,6 +649,7 @@ def __init__( self.relative_f = relative_f self.enable_atom_ener_coeff = enable_atom_ener_coeff self.use_spin = use_spin + self.intensive = intensive self.has_e = self.start_pref_e != 0.0 or self.limit_pref_e != 0.0 self.has_fr = self.start_pref_fr != 0.0 or self.limit_pref_fr != 0.0 self.has_fm = self.start_pref_fm != 0.0 or self.limit_pref_fm != 0.0 @@ -733,6 +735,10 @@ def build( atom_norm = 1.0 / global_cvt_2_tf_float(natoms[0]) atom_norm_ener = 1.0 / global_cvt_2_ener_float(natoms[0]) + # RMSE normalization exponent: + # - norm_exp=2 (intensive=True): loss uses 1/N² scaling, making it independent of system size + # - norm_exp=1 (intensive=False, legacy): loss uses 1/N scaling, which varies with system size + norm_exp = 2 if self.intensive else 1 pref_e = global_cvt_2_ener_float( find_energy * ( @@ -782,7 +788,7 @@ def build( l2_loss = 0 more_loss = {} if self.has_e: - l2_loss += atom_norm_ener * (pref_e * l2_ener_loss) + l2_loss += atom_norm_ener**norm_exp * (pref_e * l2_ener_loss) more_loss["l2_ener_loss"] = self.display_if_exist(l2_ener_loss, find_energy) if self.has_fr: l2_loss += global_cvt_2_ener_float(pref_fr * l2_force_r_loss) @@ -795,7 +801,9 @@ def build( l2_force_m_loss, find_force ) if self.has_v: - l2_loss += global_cvt_2_ener_float(atom_norm * (pref_v * 
l2_virial_loss)) + l2_loss += global_cvt_2_ener_float( + atom_norm**norm_exp * (pref_v * l2_virial_loss) + ) more_loss["l2_virial_loss"] = self.display_if_exist(l2_virial_loss, find_virial) if self.has_ae: l2_loss += global_cvt_2_ener_float(pref_ae * l2_atom_ener_loss) @@ -983,6 +991,63 @@ def label_requirement(self) -> list[DataRequirementItem]: ) return data_requirements + def serialize(self, suffix: str = "") -> dict: + """Serialize the loss module. + + Parameters + ---------- + suffix : str + The suffix of the loss module + + Returns + ------- + dict + The serialized loss module + """ + return { + "@class": "EnergySpinLoss", + "@version": 2, + "starter_learning_rate": self.starter_learning_rate, + "start_pref_e": self.start_pref_e, + "limit_pref_e": self.limit_pref_e, + "start_pref_fr": self.start_pref_fr, + "limit_pref_fr": self.limit_pref_fr, + "start_pref_fm": self.start_pref_fm, + "limit_pref_fm": self.limit_pref_fm, + "start_pref_v": self.start_pref_v, + "limit_pref_v": self.limit_pref_v, + "start_pref_ae": self.start_pref_ae, + "limit_pref_ae": self.limit_pref_ae, + "enable_atom_ener_coeff": self.enable_atom_ener_coeff, + "loss_func": self.loss_func, + "intensive": self.intensive, + } + + @classmethod + def deserialize(cls, data: dict, suffix: str = "") -> "EnerSpinLoss": + """Deserialize the loss module. 
+ + Parameters + ---------- + data : dict + The serialized loss module + suffix : str + The suffix of the loss module + + Returns + ------- + EnerSpinLoss + The deserialized loss module + """ + data = data.copy() + version = data.pop("@version") + check_version_compatibility(version, 2, 1) + data.pop("@class") + # Handle backward compatibility for older versions without intensive + if version < 2: + data.setdefault("intensive", False) + return cls(**data) + class EnerDipoleLoss(Loss): def __init__( From 736e7318706f8a86f4673479618dade15288ef3e Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 19 Apr 2026 10:52:01 +0000 Subject: [PATCH 08/14] fix(loss): update intensive parameter documentation and add test coverage Agent-Logs-Url: https://github.com/anyangml/deepmd-kit/sessions/5ebcf063-763f-49b3-a5cb-d4832afba064 Co-authored-by: anyangml <137014849+anyangml@users.noreply.github.com> --- deepmd/dpmodel/loss/ener.py | 13 +- deepmd/dpmodel/loss/ener_spin.py | 13 +- deepmd/pd/loss/ener.py | 12 +- deepmd/pt/loss/ener.py | 12 +- deepmd/pt/loss/ener_spin.py | 13 +- deepmd/tf/loss/ener.py | 15 +- deepmd/utils/argcheck.py | 18 +- source/tests/consistent/loss/test_ener.py | 208 +++++++++++++++++- .../tests/consistent/loss/test_ener_spin.py | 6 +- 9 files changed, 259 insertions(+), 51 deletions(-) diff --git a/deepmd/dpmodel/loss/ener.py b/deepmd/dpmodel/loss/ener.py index c18db708d1..09143a9d38 100644 --- a/deepmd/dpmodel/loss/ener.py +++ b/deepmd/dpmodel/loss/ener.py @@ -91,12 +91,13 @@ class EnergyLoss(Loss): Instead of computing loss on force components, computes loss on ||F_pred - F_label||_2. This treats the force vector as a whole rather than three independent components. intensive : bool - If true, energy and virial losses are computed as intensive quantities, - normalized by the square of the number of atoms (1/N^2). 
This ensures the loss - value is independent of system size and consistent with per-atom RMSE reporting. - If false (default), uses the legacy normalization (1/N), which may cause the loss to scale - with system size. The default is false for backward compatibility with models trained - using deepmd-kit <= 3.0.1. + If true, the non-Huber MSE energy and virial losses use intensive normalization, + i.e. a 1/N^2 factor instead of the legacy 1/N scaling. This matches per-atom + RMSE-style normalization for those terms. MAE and Huber modes use different + scaling and are not affected in the same way by this flag. + If false (default), the legacy normalization is used for the affected terms. + The default is false for backward compatibility with models trained using + deepmd-kit <= 3.0.1. **kwargs Other keyword arguments. """ diff --git a/deepmd/dpmodel/loss/ener_spin.py b/deepmd/dpmodel/loss/ener_spin.py index f93314511b..3837b1ab77 100644 --- a/deepmd/dpmodel/loss/ener_spin.py +++ b/deepmd/dpmodel/loss/ener_spin.py @@ -51,12 +51,13 @@ class EnergySpinLoss(Loss): loss_func : str Loss function type: 'mse' or 'mae'. intensive : bool - If true, energy and virial losses are computed as intensive quantities, - normalized by the square of the number of atoms (1/N^2). This ensures the loss - value is independent of system size and consistent with per-atom RMSE reporting. - If false (default), uses the legacy normalization (1/N), which may cause the loss to scale - with system size. The default is false for backward compatibility with models trained - using deepmd-kit <= 3.0.1. + If true, the MSE energy and virial terms use intensive normalization, + i.e. an additional normalization by the square of the number of atoms + (1/N^2) instead of the legacy (1/N) behavior. This keeps those MSE loss + terms consistent with per-atom RMSE reporting and less dependent on + system size. This option does not change the MAE formulation, which is + handled separately. 
The default is false for backward compatibility with + models trained using deepmd-kit <= 3.0.1. **kwargs Other keyword arguments. """ diff --git a/deepmd/pd/loss/ener.py b/deepmd/pd/loss/ener.py index 2eae343717..ef6967c4a4 100644 --- a/deepmd/pd/loss/ener.py +++ b/deepmd/pd/loss/ener.py @@ -121,12 +121,12 @@ def __init__( If True, use L2 norm of force vectors for loss calculation. Not implemented in PD backend, only for serialization compatibility. intensive : bool - If true, energy and virial losses are computed as intensive quantities, - normalized by the square of the number of atoms (1/N^2). This ensures the loss - value is independent of system size and consistent with per-atom RMSE reporting. - If false (default), uses the legacy normalization (1/N), which may cause the loss to scale - with system size. The default is false for backward compatibility with models trained - using deepmd-kit <= 3.0.1. + Controls size normalization for energy and virial loss terms. For the non-Huber + MSE path, setting this to true applies 1/N^2 scaling, while false uses the legacy + 1/N scaling. For MAE, the normalization remains 1/N. For Huber loss, residuals are + first normalized by 1/N before applying the Huber formula, so this option does not + provide a pure 1/N versus 1/N^2 toggle in that path. The default is false for + backward compatibility with models trained using deepmd-kit <= 3.0.1. **kwargs Other keyword arguments. """ diff --git a/deepmd/pt/loss/ener.py b/deepmd/pt/loss/ener.py index 9749a2d1c9..48edad7d73 100644 --- a/deepmd/pt/loss/ener.py +++ b/deepmd/pt/loss/ener.py @@ -122,12 +122,12 @@ def __init__( L2 and L1 loss. It can be either one float shared by all terms or a list of three values ordered as [energy, force, virial]. intensive : bool - If true, energy and virial losses are computed as intensive quantities, - normalized by the square of the number of atoms (1/N^2). 
This ensures the loss - value is independent of system size and consistent with per-atom RMSE reporting. - If false (default), uses the legacy normalization (1/N), which may cause the loss to scale - with system size. The default is false for backward compatibility with models trained - using deepmd-kit <= 3.0.1. + Controls size normalization for energy and virial loss terms. For the non-Huber + MSE path, setting this to true applies 1/N^2 scaling, while false uses the legacy + 1/N scaling. For MAE, the normalization remains 1/N. For Huber loss, residuals are + first normalized by 1/N before applying the Huber formula, so this option does not + provide a pure 1/N versus 1/N^2 toggle in that path. The default is false for + backward compatibility with models trained using deepmd-kit <= 3.0.1. **kwargs Other keyword arguments. """ diff --git a/deepmd/pt/loss/ener_spin.py b/deepmd/pt/loss/ener_spin.py index cc31b9de32..7e566277f3 100644 --- a/deepmd/pt/loss/ener_spin.py +++ b/deepmd/pt/loss/ener_spin.py @@ -78,12 +78,13 @@ def __init__( inference : bool If true, it will output all losses found in output, ignoring the pre-factors. intensive : bool - If true, energy and virial losses are computed as intensive quantities, - normalized by the square of the number of atoms (1/N^2). This ensures the loss - value is independent of system size and consistent with per-atom RMSE reporting. - If false (default), uses the legacy normalization (1/N), which may cause the loss to scale - with system size. The default is false for backward compatibility with models trained - using deepmd-kit <= 3.0.1. + Controls the normalization exponent used for the MSE energy and virial loss terms. + If true, those MSE terms use intensive normalization by the square of the number of + atoms (1/N^2), which is consistent with per-atom RMSE reporting. If false (default), + the legacy normalization (1/N) is used for those MSE terms. 
Note that this 1/N^2 + behavior does not apply to the MAE code paths: MAE energy/virial losses do not use + the `intensive` exponent in the same way. The default is false for backward + compatibility with models trained using deepmd-kit <= 3.0.1. **kwargs Other keyword arguments. """ diff --git a/deepmd/tf/loss/ener.py b/deepmd/tf/loss/ener.py index 78fbad0aa1..f14693331c 100644 --- a/deepmd/tf/loss/ener.py +++ b/deepmd/tf/loss/ener.py @@ -101,12 +101,13 @@ class EnerStdLoss(Loss): If True, use L2 norm of force vectors for loss calculation. Not implemented in TF backend, only for serialization compatibility. intensive : bool - If true, energy and virial losses are computed as intensive quantities, - normalized by the square of the number of atoms (1/N^2). This ensures the loss - value is independent of system size and consistent with per-atom RMSE reporting. - If false (default), uses the legacy normalization (1/N), which may cause the loss to scale - with system size. The default is false for backward compatibility with models trained - using deepmd-kit <= 3.0.1. + Controls the normalization used for energy and virial terms in the non-Huber + MSE branch of this TF loss. If true, that branch uses intensive normalization + by the square of the number of atoms (1/N^2); if false (default), it uses the + legacy normalization (1/N). When ``use_huber=True``, the residual is still + normalized by 1/N before applying the Huber loss, so ``intensive`` may not + change behavior in that path. The default is false for backward compatibility + with models trained using deepmd-kit <= 3.0.1. **kwargs Other keyword arguments. 
""" @@ -735,7 +736,7 @@ def build( atom_norm = 1.0 / global_cvt_2_tf_float(natoms[0]) atom_norm_ener = 1.0 / global_cvt_2_ener_float(natoms[0]) - # RMSE normalization exponent: + # loss normalization exponent: # - norm_exp=2 (intensive=True): loss uses 1/N² scaling, making it independent of system size # - norm_exp=1 (intensive=False, legacy): loss uses 1/N scaling, which varies with system size norm_exp = 2 if self.intensive else 1 diff --git a/deepmd/utils/argcheck.py b/deepmd/utils/argcheck.py index 7b3ec0c849..fe68106873 100644 --- a/deepmd/utils/argcheck.py +++ b/deepmd/utils/argcheck.py @@ -3235,10 +3235,11 @@ def loss_ener() -> list[Argument]: "Only effective when loss_func='mae' or use_huber=True." ) doc_intensive = ( - "If true, energy and virial losses are computed as intensive quantities, " - "normalized by the square of the number of atoms (1/N^2). " - "This ensures the loss value is independent of system size and consistent with per-atom RMSE reporting. " - "If false (default), uses the legacy normalization (1/N), which may cause the loss to scale with system size. " + "Controls intensive normalization for energy and virial loss terms in the current implementation. " + "For non-Huber MSE energy/virial terms, setting this to true uses 1/N^2 normalization instead of the legacy 1/N scaling. " + "This matches per-atom-style reporting more closely for those terms. " + "For MAE, the normalization remains 1/N. When `use_huber=True`, the residual is already scaled by 1/N before applying the Huber loss, " + "so this flag may have limited or no effect for those terms. " "The default is false for backward compatibility with models trained using deepmd-kit <= 3.0.1." ) return [ @@ -3416,10 +3417,11 @@ def loss_ener_spin() -> list[Argument]: "Future extensions may support additional loss types." ) doc_intensive = ( - "If true, energy and virial losses are computed as intensive quantities, " - "normalized by the square of the number of atoms (1/N^2). 
" - "This ensures the loss value is independent of system size and consistent with per-atom RMSE reporting. " - "If false (default), uses the legacy normalization (1/N), which may cause the loss to scale with system size. " + "Controls normalization of the energy and virial loss terms. " + "For `loss_func='mse'`, if true, energy and virial losses are computed as intensive quantities, " + "normalized by the square of the number of atoms (1/N^2); if false (default), the legacy normalization " + "(1/N) is used. " + "For `loss_func='mae'`, energy and virial losses remain normalized by the number of atoms (1/N). " "The default is false for backward compatibility with models trained using deepmd-kit <= 3.0.1." ) return [ diff --git a/source/tests/consistent/loss/test_ener.py b/source/tests/consistent/loss/test_ener.py index dcb3988173..27d16f630b 100644 --- a/source/tests/consistent/loss/test_ener.py +++ b/source/tests/consistent/loss/test_ener.py @@ -67,11 +67,19 @@ ("mse", "mae"), # loss_func (False, True), # f_use_norm (False, True), # mae (dp test extra MAE metrics) + (False, True), # intensive ) class TestEner(CommonTest, LossTest, unittest.TestCase): @property def data(self) -> dict: - (use_huber, enable_atom_ener_coeff, loss_func, f_use_norm, _mae) = self.param + ( + use_huber, + enable_atom_ener_coeff, + loss_func, + f_use_norm, + _mae, + intensive, + ) = self.param return { "start_pref_e": 0.02, "limit_pref_e": 1.0, @@ -87,17 +95,32 @@ def data(self) -> dict: "enable_atom_ener_coeff": enable_atom_ener_coeff, "loss_func": loss_func, "f_use_norm": f_use_norm, + "intensive": intensive, } @property def skip_tf(self) -> bool: - (_use_huber, _enable_atom_ener_coeff, loss_func, f_use_norm, _mae) = self.param + ( + _use_huber, + _enable_atom_ener_coeff, + loss_func, + f_use_norm, + _mae, + _intensive, + ) = self.param # Skip TF for MAE loss tests (not implemented in TF backend) return CommonTest.skip_tf or loss_func == "mae" or f_use_norm @property def skip_pd(self) -> 
bool: - (_use_huber, _enable_atom_ener_coeff, loss_func, f_use_norm, _mae) = self.param + ( + _use_huber, + _enable_atom_ener_coeff, + loss_func, + f_use_norm, + _mae, + _intensive, + ) = self.param # Skip Paddle for MAE loss tests (not implemented in Paddle backend) return not INSTALLED_PD or loss_func == "mae" or f_use_norm @@ -116,7 +139,14 @@ def skip_pd(self) -> bool: args = loss_ener() def setUp(self) -> None: - (use_huber, _enable_atom_ener_coeff, loss_func, f_use_norm, mae) = self.param + ( + use_huber, + _enable_atom_ener_coeff, + loss_func, + f_use_norm, + mae, + _intensive, + ) = self.param # Skip invalid combinations if f_use_norm and not (use_huber or loss_func == "mae"): self.skipTest("f_use_norm requires either use_huber or loss_func='mae'") @@ -557,3 +587,173 @@ def rtol(self) -> float: @property def atol(self) -> float: return 1e-10 + + +class TestIntensiveNatomsScaling(unittest.TestCase): + """Regression test for natoms-scaling behavior with intensive normalization. + + This test verifies that MSE energy/virial loss contributions scale with 1/N² when + intensive=True, ensuring the loss is independent of system size. This guards against + future refactors accidentally reverting to 1/N scaling. 
+ """ + + def test_intensive_energy_scaling(self) -> None: + """Test that energy MSE loss scales as 1/N² with intensive=True.""" + if not INSTALLED_PT: + self.skipTest("PyTorch not installed") + + # Create loss function with intensive=True + loss_func = EnerLossPT( + starter_learning_rate=1e-3, + start_pref_e=1.0, + limit_pref_e=1.0, + start_pref_f=0.0, + limit_pref_f=0.0, + start_pref_v=1.0, # Enable virial to test it too + limit_pref_v=1.0, + intensive=True, + ) + + rng = np.random.default_rng(20250419) + nframes = 1 + + # Run with different natoms values and check scaling + natoms_values = [4, 8, 16] + energy_losses: list[float] = [] + virial_losses: list[float] = [] + + for natoms in natoms_values: + predict = { + "energy": numpy_to_torch(rng.random((nframes,))), + "force": numpy_to_torch(rng.random((nframes, natoms, 3))), + "virial": numpy_to_torch(rng.random((nframes, 9))), + "atom_energy": numpy_to_torch(rng.random((nframes, natoms))), + } + label = { + "energy": numpy_to_torch(rng.random((nframes,))), + "force": numpy_to_torch(rng.random((nframes, natoms, 3))), + "virial": numpy_to_torch(rng.random((nframes, 9))), + "atom_ener": numpy_to_torch(rng.random((nframes, natoms))), + "find_energy": 1.0, + "find_force": 1.0, + "find_virial": 1.0, + "find_atom_ener": 0.0, # Disable to simplify test + } + + _, _loss, more_loss = loss_func( + {}, + lambda p=predict: p, + label, + natoms, + 1e-3, + ) + + # Get the raw L2 losses before prefactor weighting + energy_losses.append(float(torch_to_numpy(more_loss["l2_ener_loss"]))) + virial_losses.append(float(torch_to_numpy(more_loss["l2_virial_loss"]))) + + # For MSE loss with intensive=True, the raw l2 loss should be roughly + # size-independent (just the mean squared error). The scaling factor + # (1/N² vs 1/N) is applied when combining into the total loss. + # So we verify that l2_ener_loss and l2_virial_loss are size-independent. + # (They should be similar across different natoms since they're just MSE.) 
+ + # The raw MSE losses should not scale linearly with natoms + # (if they did, that would indicate bug in normalization) + for i in range(len(natoms_values) - 1): + ratio = natoms_values[i + 1] / natoms_values[i] + # Raw l2 loss should NOT scale with natoms (within reasonable variance) + energy_ratio = energy_losses[i + 1] / energy_losses[i] + virial_ratio = virial_losses[i + 1] / virial_losses[i] + + # These shouldn't scale linearly with N - allow some variance but not Nx + self.assertLess( + energy_ratio, + ratio * 0.8, + f"Energy loss appears to scale with natoms (ratio {energy_ratio:.2f})", + ) + self.assertLess( + virial_ratio, + ratio * 0.8, + f"Virial loss appears to scale with natoms (ratio {virial_ratio:.2f})", + ) + + def test_intensive_vs_legacy_scaling_difference(self) -> None: + """Test that intensive=True produces different loss than intensive=False for energy/virial.""" + if not INSTALLED_PT: + self.skipTest("PyTorch not installed") + + rng = np.random.default_rng(20250419) + nframes = 1 + natoms = 8 + + predict = { + "energy": numpy_to_torch(rng.random((nframes,))), + "force": numpy_to_torch(rng.random((nframes, natoms, 3))), + "virial": numpy_to_torch(rng.random((nframes, 9))), + "atom_energy": numpy_to_torch(rng.random((nframes, natoms))), + } + label = { + "energy": numpy_to_torch(rng.random((nframes,))), + "force": numpy_to_torch(rng.random((nframes, natoms, 3))), + "virial": numpy_to_torch(rng.random((nframes, 9))), + "atom_ener": numpy_to_torch(rng.random((nframes, natoms))), + "find_energy": 1.0, + "find_force": 1.0, + "find_virial": 1.0, + "find_atom_ener": 0.0, + } + + # Create loss functions with intensive=True and intensive=False + loss_intensive = EnerLossPT( + starter_learning_rate=1e-3, + start_pref_e=1.0, + limit_pref_e=1.0, + start_pref_f=0.0, + limit_pref_f=0.0, + start_pref_v=1.0, + limit_pref_v=1.0, + intensive=True, + ) + loss_legacy = EnerLossPT( + starter_learning_rate=1e-3, + start_pref_e=1.0, + limit_pref_e=1.0, + 
start_pref_f=0.0, + limit_pref_f=0.0, + start_pref_v=1.0, + limit_pref_v=1.0, + intensive=False, + ) + + _, loss_val_intensive, _ = loss_intensive( + {}, + lambda: predict, + label, + natoms, + 1e-3, + ) + _, loss_val_legacy, _ = loss_legacy( + {}, + lambda: predict, + label, + natoms, + 1e-3, + ) + + loss_intensive_val = float(torch_to_numpy(loss_val_intensive)) + loss_legacy_val = float(torch_to_numpy(loss_val_legacy)) + + # The losses should be different when intensive differs + # (unless by chance the values are the same, which is unlikely) + # The intensive version should have an extra 1/N factor + expected_ratio = 1.0 / natoms + actual_ratio = loss_intensive_val / loss_legacy_val + + # Allow some tolerance due to floating point + self.assertAlmostEqual( + actual_ratio, + expected_ratio, + places=5, + msg=f"Expected intensive/legacy ratio ~{expected_ratio:.6f}, got {actual_ratio:.6f}", + ) diff --git a/source/tests/consistent/loss/test_ener_spin.py b/source/tests/consistent/loss/test_ener_spin.py index 2e8734c109..462b4d38ad 100644 --- a/source/tests/consistent/loss/test_ener_spin.py +++ b/source/tests/consistent/loss/test_ener_spin.py @@ -47,11 +47,12 @@ @parameterized( ("mse", "mae"), # loss_func (False, True), # mae (dp test extra MAE metrics) + (False, True), # intensive ) class TestEnerSpin(CommonTest, LossTest, unittest.TestCase): @property def data(self) -> dict: - (loss_func, _mae) = self.param + (loss_func, _mae, intensive) = self.param return { "start_pref_e": 0.02, "limit_pref_e": 1.0, @@ -64,6 +65,7 @@ def data(self) -> dict: "start_pref_ae": 1.0, "limit_pref_ae": 1.0, "loss_func": loss_func, + "intensive": intensive, } skip_tf = True @@ -81,7 +83,7 @@ def data(self) -> dict: args = loss_ener_spin() def setUp(self) -> None: - (loss_func, mae) = self.param + (loss_func, mae, _intensive) = self.param if loss_func == "mae" and mae: self.skipTest("mae=True with loss_func='mae' is redundant") CommonTest.setUp(self) From 
2e9e600fa6080fe5ebf1bc336cfc8ddedbb18604 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 20 Apr 2026 03:06:05 +0000 Subject: [PATCH 09/14] fix: address PR review comments for intensive normalization Agent-Logs-Url: https://github.com/anyangml/deepmd-kit/sessions/c77bb90b-6600-4466-9c55-7d17ff7bf76a Co-authored-by: anyangml <137014849+anyangml@users.noreply.github.com> --- deepmd/tf/loss/ener.py | 4 + source/tests/consistent/loss/test_ener.py | 183 ++++++++----- .../tests/consistent/loss/test_ener_spin.py | 245 ++++++++++++++++++ 3 files changed, 368 insertions(+), 64 deletions(-) diff --git a/deepmd/tf/loss/ener.py b/deepmd/tf/loss/ener.py index f14693331c..f9935c1adf 100644 --- a/deepmd/tf/loss/ener.py +++ b/deepmd/tf/loss/ener.py @@ -1019,7 +1019,11 @@ def serialize(self, suffix: str = "") -> dict: "limit_pref_v": self.limit_pref_v, "start_pref_ae": self.start_pref_ae, "limit_pref_ae": self.limit_pref_ae, + "start_pref_pf": self.start_pref_pf, + "limit_pref_pf": self.limit_pref_pf, + "relative_f": self.relative_f, "enable_atom_ener_coeff": self.enable_atom_ener_coeff, + "use_spin": self.use_spin, "loss_func": self.loss_func, "intensive": self.intensive, } diff --git a/source/tests/consistent/loss/test_ener.py b/source/tests/consistent/loss/test_ener.py index 27d16f630b..283298bbde 100644 --- a/source/tests/consistent/loss/test_ener.py +++ b/source/tests/consistent/loss/test_ener.py @@ -597,86 +597,141 @@ class TestIntensiveNatomsScaling(unittest.TestCase): future refactors accidentally reverting to 1/N scaling. """ - def test_intensive_energy_scaling(self) -> None: - """Test that energy MSE loss scales as 1/N² with intensive=True.""" + def test_intensive_total_loss_scaling(self) -> None: + """Test that total loss scales correctly with 1/N² for intensive=True. 
+ + This test uses controlled energy/virial residuals to verify that the + total loss contribution scales with 1/N² (intensive) vs 1/N (legacy). + We use identical per-atom residuals across different system sizes to + ensure the raw MSE is the same, then verify the total loss scales as + expected based on the normalization factor. + """ if not INSTALLED_PT: self.skipTest("PyTorch not installed") - # Create loss function with intensive=True - loss_func = EnerLossPT( + nframes = 1 + + # Test with two different system sizes + natoms_small = 4 + natoms_large = 8 # 2x the small system + + # Use fixed energy residual so MSE is predictable + # Energy residual = 1.0, so l2_ener_loss = 1.0 + fixed_energy_diff = 1.0 + + def create_data_with_fixed_residual(natoms: int, energy_diff: float): + """Create predict/label with a fixed energy difference.""" + predict = { + "energy": numpy_to_torch(np.array([1.0])), + "force": numpy_to_torch(np.zeros((nframes, natoms, 3))), + "virial": numpy_to_torch(np.array([[1.0] * 9])), # Virial residual = 1 + "atom_energy": numpy_to_torch(np.ones((nframes, natoms)) / natoms), + } + label = { + "energy": numpy_to_torch(np.array([1.0 + energy_diff])), + "force": numpy_to_torch(np.zeros((nframes, natoms, 3))), + "virial": numpy_to_torch(np.array([[2.0] * 9])), # Virial residual = 1 + "atom_ener": numpy_to_torch( + np.ones((nframes, natoms)) * (1.0 + energy_diff) / natoms + ), + "find_energy": 1.0, + "find_force": 0.0, # Disable force to focus on energy/virial + "find_virial": 1.0, + "find_atom_ener": 0.0, + } + return predict, label + + # Create loss functions + loss_intensive = EnerLossPT( starter_learning_rate=1e-3, start_pref_e=1.0, limit_pref_e=1.0, start_pref_f=0.0, limit_pref_f=0.0, - start_pref_v=1.0, # Enable virial to test it too + start_pref_v=1.0, limit_pref_v=1.0, intensive=True, ) + loss_legacy = EnerLossPT( + starter_learning_rate=1e-3, + start_pref_e=1.0, + limit_pref_e=1.0, + start_pref_f=0.0, + limit_pref_f=0.0, + start_pref_v=1.0, 
+ limit_pref_v=1.0, + intensive=False, + ) - rng = np.random.default_rng(20250419) - nframes = 1 + # Compute losses for small system + predict_small, label_small = create_data_with_fixed_residual( + natoms_small, fixed_energy_diff + ) + _, loss_intensive_small, _ = loss_intensive( + {}, + lambda p=predict_small: p, + label_small, + natoms_small, + 1e-3, + ) + _, loss_legacy_small, _ = loss_legacy( + {}, + lambda p=predict_small: p, + label_small, + natoms_small, + 1e-3, + ) - # Run with different natoms values and check scaling - natoms_values = [4, 8, 16] - energy_losses: list[float] = [] - virial_losses: list[float] = [] + # Compute losses for large system + predict_large, label_large = create_data_with_fixed_residual( + natoms_large, fixed_energy_diff + ) + _, loss_intensive_large, _ = loss_intensive( + {}, + lambda p=predict_large: p, + label_large, + natoms_large, + 1e-3, + ) + _, loss_legacy_large, _ = loss_legacy( + {}, + lambda p=predict_large: p, + label_large, + natoms_large, + 1e-3, + ) - for natoms in natoms_values: - predict = { - "energy": numpy_to_torch(rng.random((nframes,))), - "force": numpy_to_torch(rng.random((nframes, natoms, 3))), - "virial": numpy_to_torch(rng.random((nframes, 9))), - "atom_energy": numpy_to_torch(rng.random((nframes, natoms))), - } - label = { - "energy": numpy_to_torch(rng.random((nframes,))), - "force": numpy_to_torch(rng.random((nframes, natoms, 3))), - "virial": numpy_to_torch(rng.random((nframes, 9))), - "atom_ener": numpy_to_torch(rng.random((nframes, natoms))), - "find_energy": 1.0, - "find_force": 1.0, - "find_virial": 1.0, - "find_atom_ener": 0.0, # Disable to simplify test - } + loss_int_small = float(torch_to_numpy(loss_intensive_small)) + loss_int_large = float(torch_to_numpy(loss_intensive_large)) + loss_leg_small = float(torch_to_numpy(loss_legacy_small)) + loss_leg_large = float(torch_to_numpy(loss_legacy_large)) - _, _loss, more_loss = loss_func( - {}, - lambda p=predict: p, - label, - natoms, - 1e-3, - ) + # 
With same residuals but different natoms: + # - intensive (1/N²): loss should scale as (N_small/N_large)² = (4/8)² = 0.25 + # - legacy (1/N): loss should scale as (N_small/N_large) = 4/8 = 0.5 - # Get the raw L2 losses before prefactor weighting - energy_losses.append(float(torch_to_numpy(more_loss["l2_ener_loss"]))) - virial_losses.append(float(torch_to_numpy(more_loss["l2_virial_loss"]))) - - # For MSE loss with intensive=True, the raw l2 loss should be roughly - # size-independent (just the mean squared error). The scaling factor - # (1/N² vs 1/N) is applied when combining into the total loss. - # So we verify that l2_ener_loss and l2_virial_loss are size-independent. - # (They should be similar across different natoms since they're just MSE.) - - # The raw MSE losses should not scale linearly with natoms - # (if they did, that would indicate bug in normalization) - for i in range(len(natoms_values) - 1): - ratio = natoms_values[i + 1] / natoms_values[i] - # Raw l2 loss should NOT scale with natoms (within reasonable variance) - energy_ratio = energy_losses[i + 1] / energy_losses[i] - virial_ratio = virial_losses[i + 1] / virial_losses[i] - - # These shouldn't scale linearly with N - allow some variance but not Nx - self.assertLess( - energy_ratio, - ratio * 0.8, - f"Energy loss appears to scale with natoms (ratio {energy_ratio:.2f})", - ) - self.assertLess( - virial_ratio, - ratio * 0.8, - f"Virial loss appears to scale with natoms (ratio {virial_ratio:.2f})", - ) + # Verify intensive scaling: loss_large / loss_small should be ~0.25 + natoms_ratio = natoms_small / natoms_large # 0.5 + expected_intensive_ratio = natoms_ratio**2 # 0.25 + expected_legacy_ratio = natoms_ratio # 0.5 + + actual_intensive_ratio = loss_int_large / loss_int_small + actual_legacy_ratio = loss_leg_large / loss_leg_small + + self.assertAlmostEqual( + actual_intensive_ratio, + expected_intensive_ratio, + places=5, + msg=f"Intensive loss scaling: expected {expected_intensive_ratio:.4f}, " + 
f"got {actual_intensive_ratio:.4f}", + ) + self.assertAlmostEqual( + actual_legacy_ratio, + expected_legacy_ratio, + places=5, + msg=f"Legacy loss scaling: expected {expected_legacy_ratio:.4f}, " + f"got {actual_legacy_ratio:.4f}", + ) def test_intensive_vs_legacy_scaling_difference(self) -> None: """Test that intensive=True produces different loss than intensive=False for energy/virial.""" diff --git a/source/tests/consistent/loss/test_ener_spin.py b/source/tests/consistent/loss/test_ener_spin.py index 462b4d38ad..4e61bb591c 100644 --- a/source/tests/consistent/loss/test_ener_spin.py +++ b/source/tests/consistent/loss/test_ener_spin.py @@ -210,3 +210,248 @@ def rtol(self) -> float: @property def atol(self) -> float: return 1e-10 + + +class TestEnerSpinIntensiveScaling(unittest.TestCase): + """Regression test for natoms-scaling behavior with intensive normalization. + + This test verifies that MSE energy/virial loss contributions scale with 1/N² when + intensive=True, ensuring the loss is independent of system size. This guards against + future refactors accidentally reverting to 1/N scaling. + """ + + def test_intensive_total_loss_scaling(self) -> None: + """Test that total loss scales correctly with 1/N² for intensive=True. + + This test uses controlled energy/virial residuals to verify that the + total loss contribution scales with 1/N² (intensive) vs 1/N (legacy). 
+ """ + if not INSTALLED_PT: + self.skipTest("PyTorch not installed") + + nframes = 1 + + # Test with two different system sizes + natoms_small = 4 + natoms_large = 8 # 2x the small system + # For spin systems, we have real atoms and virtual (magnetic) atoms + n_magnetic = 2 # Half of atoms have magnetic spins + + # Use fixed energy residual so MSE is predictable + fixed_energy_diff = 1.0 + + def create_data_with_fixed_residual( + natoms: int, n_mag: int, energy_diff: float + ): + """Create predict/label with a fixed energy difference.""" + mask_mag = np.zeros((nframes, natoms, 1), dtype=bool) + mask_mag[:, :n_mag, :] = True + + predict = { + "energy": numpy_to_torch(np.array([1.0])), + "force": numpy_to_torch(np.zeros((nframes, natoms, 3))), + "force_mag": numpy_to_torch(np.zeros((nframes, natoms, 3))), + "mask_mag": mask_mag, + "virial": numpy_to_torch(np.array([[1.0] * 9])), + "atom_energy": numpy_to_torch(np.ones((nframes, natoms)) / natoms), + } + label = { + "energy": numpy_to_torch(np.array([1.0 + energy_diff])), + "force": numpy_to_torch(np.zeros((nframes, natoms, 3))), + "force_mag": numpy_to_torch(np.zeros((nframes, natoms, 3))), + "virial": numpy_to_torch(np.array([[2.0] * 9])), + "atom_ener": numpy_to_torch( + np.ones((nframes, natoms)) * (1.0 + energy_diff) / natoms + ), + "find_energy": 1.0, + "find_force": 0.0, # Disable force to focus on energy/virial + "find_force_mag": 0.0, + "find_virial": 1.0, + "find_atom_ener": 0.0, + } + return predict, label + + # Create loss functions + loss_intensive = EnerSpinLossPT( + starter_learning_rate=1e-3, + start_pref_e=1.0, + limit_pref_e=1.0, + start_pref_fr=0.0, + limit_pref_fr=0.0, + start_pref_fm=0.0, + limit_pref_fm=0.0, + start_pref_v=1.0, + limit_pref_v=1.0, + intensive=True, + ) + loss_legacy = EnerSpinLossPT( + starter_learning_rate=1e-3, + start_pref_e=1.0, + limit_pref_e=1.0, + start_pref_fr=0.0, + limit_pref_fr=0.0, + start_pref_fm=0.0, + limit_pref_fm=0.0, + start_pref_v=1.0, + limit_pref_v=1.0, + 
intensive=False, + ) + + # Compute losses for small system + predict_small, label_small = create_data_with_fixed_residual( + natoms_small, n_magnetic, fixed_energy_diff + ) + _, loss_intensive_small, _ = loss_intensive( + {}, + lambda p=predict_small: p, + label_small, + natoms_small, + 1e-3, + ) + _, loss_legacy_small, _ = loss_legacy( + {}, + lambda p=predict_small: p, + label_small, + natoms_small, + 1e-3, + ) + + # Compute losses for large system (proportionally scale magnetic atoms) + predict_large, label_large = create_data_with_fixed_residual( + natoms_large, n_magnetic * 2, fixed_energy_diff + ) + _, loss_intensive_large, _ = loss_intensive( + {}, + lambda p=predict_large: p, + label_large, + natoms_large, + 1e-3, + ) + _, loss_legacy_large, _ = loss_legacy( + {}, + lambda p=predict_large: p, + label_large, + natoms_large, + 1e-3, + ) + + loss_int_small = float(torch_to_numpy(loss_intensive_small)) + loss_int_large = float(torch_to_numpy(loss_intensive_large)) + loss_leg_small = float(torch_to_numpy(loss_legacy_small)) + loss_leg_large = float(torch_to_numpy(loss_legacy_large)) + + # With same residuals but different natoms: + # - intensive (1/N²): loss should scale as (N_small/N_large)² = (4/8)² = 0.25 + # - legacy (1/N): loss should scale as (N_small/N_large) = 4/8 = 0.5 + + natoms_ratio = natoms_small / natoms_large # 0.5 + expected_intensive_ratio = natoms_ratio**2 # 0.25 + expected_legacy_ratio = natoms_ratio # 0.5 + + actual_intensive_ratio = loss_int_large / loss_int_small + actual_legacy_ratio = loss_leg_large / loss_leg_small + + self.assertAlmostEqual( + actual_intensive_ratio, + expected_intensive_ratio, + places=5, + msg=f"Intensive loss scaling: expected {expected_intensive_ratio:.4f}, " + f"got {actual_intensive_ratio:.4f}", + ) + self.assertAlmostEqual( + actual_legacy_ratio, + expected_legacy_ratio, + places=5, + msg=f"Legacy loss scaling: expected {expected_legacy_ratio:.4f}, " + f"got {actual_legacy_ratio:.4f}", + ) + + def 
test_intensive_vs_legacy_scaling_difference(self) -> None: + """Test that intensive=True produces different loss than intensive=False.""" + if not INSTALLED_PT: + self.skipTest("PyTorch not installed") + + rng = np.random.default_rng(20250419) + nframes = 1 + natoms = 8 + n_magnetic = 4 + + mask_mag = np.zeros((nframes, natoms, 1), dtype=bool) + mask_mag[:, :n_magnetic, :] = True + + predict = { + "energy": numpy_to_torch(rng.random((nframes,))), + "force": numpy_to_torch(rng.random((nframes, natoms, 3))), + "force_mag": numpy_to_torch(rng.random((nframes, natoms, 3))), + "mask_mag": mask_mag, + "virial": numpy_to_torch(rng.random((nframes, 9))), + "atom_energy": numpy_to_torch(rng.random((nframes, natoms))), + } + label = { + "energy": numpy_to_torch(rng.random((nframes,))), + "force": numpy_to_torch(rng.random((nframes, natoms, 3))), + "force_mag": numpy_to_torch(rng.random((nframes, natoms, 3))), + "virial": numpy_to_torch(rng.random((nframes, 9))), + "atom_ener": numpy_to_torch(rng.random((nframes, natoms))), + "find_energy": 1.0, + "find_force": 1.0, + "find_force_mag": 1.0, + "find_virial": 1.0, + "find_atom_ener": 0.0, + } + + # Create loss functions with intensive=True and intensive=False + loss_intensive = EnerSpinLossPT( + starter_learning_rate=1e-3, + start_pref_e=1.0, + limit_pref_e=1.0, + start_pref_fr=0.0, + limit_pref_fr=0.0, + start_pref_fm=0.0, + limit_pref_fm=0.0, + start_pref_v=1.0, + limit_pref_v=1.0, + intensive=True, + ) + loss_legacy = EnerSpinLossPT( + starter_learning_rate=1e-3, + start_pref_e=1.0, + limit_pref_e=1.0, + start_pref_fr=0.0, + limit_pref_fr=0.0, + start_pref_fm=0.0, + limit_pref_fm=0.0, + start_pref_v=1.0, + limit_pref_v=1.0, + intensive=False, + ) + + _, loss_val_intensive, _ = loss_intensive( + {}, + lambda: predict, + label, + natoms, + 1e-3, + ) + _, loss_val_legacy, _ = loss_legacy( + {}, + lambda: predict, + label, + natoms, + 1e-3, + ) + + loss_intensive_val = float(torch_to_numpy(loss_val_intensive)) + loss_legacy_val 
= float(torch_to_numpy(loss_val_legacy)) + + # The losses should be different when intensive differs + # The intensive version should have an extra 1/N factor + expected_ratio = 1.0 / natoms + actual_ratio = loss_intensive_val / loss_legacy_val + + self.assertAlmostEqual( + actual_ratio, + expected_ratio, + places=5, + msg=f"Expected intensive/legacy ratio ~{expected_ratio:.6f}, got {actual_ratio:.6f}", + ) From 2ce402935e32abcbcb5fd38186e526a61e66b302 Mon Sep 17 00:00:00 2001 From: Anyang Peng <137014849+anyangml@users.noreply.github.com> Date: Mon, 20 Apr 2026 14:36:03 +0800 Subject: [PATCH 10/14] Update deepmd/utils/argcheck.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> Signed-off-by: Anyang Peng <137014849+anyangml@users.noreply.github.com> --- deepmd/utils/argcheck.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/deepmd/utils/argcheck.py b/deepmd/utils/argcheck.py index fe68106873..897a5ebac3 100644 --- a/deepmd/utils/argcheck.py +++ b/deepmd/utils/argcheck.py @@ -3421,7 +3421,8 @@ def loss_ener_spin() -> list[Argument]: "For `loss_func='mse'`, if true, energy and virial losses are computed as intensive quantities, " "normalized by the square of the number of atoms (1/N^2); if false (default), the legacy normalization " "(1/N) is used. " - "For `loss_func='mae'`, energy and virial losses remain normalized by the number of atoms (1/N). " + "For `loss_func='mae'`, this option does not change the existing MAE formulations; in particular, " + "the MAE energy and virial terms are not both normalized by 1/N. " "The default is false for backward compatibility with models trained using deepmd-kit <= 3.0.1." 
) return [ From 0bdd46fc98f3d25d28361ef10cb7be82a2d0ef4a Mon Sep 17 00:00:00 2001 From: Anyang Peng <137014849+anyangml@users.noreply.github.com> Date: Mon, 20 Apr 2026 16:03:40 +0800 Subject: [PATCH 11/14] doc: add explaination --- doc/model/train-energy-spin.md | 24 ++++++++++++++++++++++-- doc/model/train-energy.md | 22 +++++++++++++++++++++- 2 files changed, 43 insertions(+), 3 deletions(-) diff --git a/doc/model/train-energy-spin.md b/doc/model/train-energy-spin.md index 41b57c2f29..fe83b5acc9 100644 --- a/doc/model/train-energy-spin.md +++ b/doc/model/train-energy-spin.md @@ -86,7 +86,22 @@ The spin loss function $L$ for training energy is given by $$L = p_e L_e + p_{fr} L_{fr} + p_{fm} L_{fm} + p_v L_v$$ -where $L_e$, $L_{fr}$, $L_{fm}$ and $L_v$ denote the loss in energy, atomic force, magnatic force and virial, respectively. $p_e$, $p_{fr}$, $p_{fm}$ and $p_v$ give the prefactors of the energy, atomic force, magnatic force and virial losses. +where $L_e$, $L_{fr}$, $L_{fm}$ and $L_v$ denote the loss in energy, atomic force, magnetic force and virial, respectively. $p_e$, $p_{fr}$, $p_{fm}$ and $p_v$ give the prefactors of the energy, atomic force, magnetic force and virial losses. + +By default, the energy and virial losses are normalized by the number of atoms $N$. When **intensive loss normalization** is enabled, these terms are instead normalized by $N^2$. For the energy loss, this converts it to the square of the per-atom energy error: + +```math + L_E^{\text{intensive}}(\boldsymbol{x};\boldsymbol{\theta})=\left(\frac{E(\boldsymbol{x};\boldsymbol{\theta})-E^*}{N}\right)^2 = \frac{1}{N^2}(E(\boldsymbol{x};\boldsymbol{\theta})-E^*)^2, +``` + +and similarly for the virial loss: + +```math + L_\Xi^{\text{intensive}}(\boldsymbol{x};\boldsymbol{\theta})=\frac{1}{9N^2}\sum_{\alpha,\beta=1}^{3}(\Xi_{\alpha\beta}(\boldsymbol{x};\boldsymbol{\theta})-\Xi_{\alpha\beta}^*)^2. 
+``` + +Intensive normalization ensures the loss scale remains consistent across systems with different numbers of atoms $N$, which is highly recommended for multi-task learning. + The prefectors may not be a constant, rather it changes linearly with the learning rate. Taking the atomic force prefactor for example, at training step $t$, it is given by @@ -111,8 +126,10 @@ The {ref}`loss ` section in the `input.json` is "limit_pref_fm": 10.0, "start_pref_v": 0, "limit_pref_v": 0, - "loss_func": "mse" + "loss_func": "mse", + "intensive": false }, + ``` The options {ref}`start_pref_e `, {ref}`limit_pref_e `, {ref}`start_pref_fr `, {ref}`limit_pref_fm `, {ref}`start_pref_v ` and {ref}`limit_pref_v ` determine the start and limit prefactors of energy, atomic force, magnatic force and virial, respectively. @@ -124,8 +141,11 @@ The {ref}`loss_func ` option specifies the type of lo When using `loss_func="mse"`, the training will output `rmse_e`, `rmse_fr`, `rmse_fm`, `rmse_v` metrics (root mean square errors). When using `loss_func="mae"`, the training will output `mae_e`, `mae_fr`, `mae_fm`, `mae_v` metrics (mean absolute errors). +The {ref}`intensive ` option (default is `false`) controls the normalization of the energy and virial loss terms when `loss_func="mse"`. If set to `true`, these terms are normalized by $1/N^2$ (intensive), ensuring the loss scale is independent of the system size $N$. If `false`, the legacy $1/N$ normalization is used. + If one does not want to train with virial, then he/she may set the virial prefactors {ref}`start_pref_v ` and {ref}`limit_pref_v ` to 0. 
+ ## Data format :::{note} diff --git a/doc/model/train-energy.md b/doc/model/train-energy.md index ecc6f234b4..7937d082d8 100644 --- a/doc/model/train-energy.md +++ b/doc/model/train-energy.md @@ -57,6 +57,21 @@ The properties $\eta$ of the energy loss function could be energy $E$, force $\b where $F_{k,\alpha}$ is the $\alpha$-th component of the force on atom $k$, and the superscript $\ast$ indicates the label of the property that should be provided in advance. Using $N$ ensures that each loss of fitting property is averaged over atomic contributions before they contribute to the total loss by weight. +By default, the energy and virial losses are normalized by the number of atoms $N$ as shown above. When **intensive loss normalization** is enabled, these terms are instead normalized by $N^2$. For the energy loss, this converts it to the square of the per-atom energy error: + +```math + L_E^{\text{intensive}}(\boldsymbol{x};\boldsymbol{\theta})=\left(\frac{E(\boldsymbol{x};\boldsymbol{\theta})-E^*}{N}\right)^2 = \frac{1}{N^2}(E(\boldsymbol{x};\boldsymbol{\theta})-E^*)^2, +``` + +and similarly for the virial loss: + +```math + L_\Xi^{\text{intensive}}(\boldsymbol{x};\boldsymbol{\theta})=\frac{1}{9N^2}\sum_{\alpha,\beta=1}^{3}(\Xi_{\alpha\beta}(\boldsymbol{x};\boldsymbol{\theta})-\Xi_{\alpha\beta}^*)^2. +``` + +Intensive normalization makes the loss magnitudes independent of the system size $N$ (assuming per-atom errors are consistent), which is crucial for multi-task training involving datasets with varying system sizes to prevent larger systems from dominating the training process. 
+ + If part of atoms is more important than others, for example, certain atoms play an essential role when calculating free energy profiles or kinetic isotope effects, the MSE of atomic forces with prefactors $q_{k}$ can also be used as the loss function: ```math @@ -117,8 +132,10 @@ The {ref}`loss ` section in the `input.json` is "limit_pref_f": 1, "start_pref_v": 0, "limit_pref_v": 0, - "loss_func": "mse" + "loss_func": "mse", + "intensive": false } + ``` The options {ref}`start_pref_e `, {ref}`limit_pref_e `, {ref}`start_pref_f `, {ref}`limit_pref_f `, {ref}`start_pref_v ` and {ref}`limit_pref_v ` determine the start and limit prefactors of energy, force and virial, respectively. @@ -130,6 +147,9 @@ The {ref}`loss_func ` option specifies the type of loss fu When using `loss_func="mse"`, the training will output `rmse_e`, `rmse_f`, `rmse_v` metrics (root mean square errors). When using `loss_func="mae"`, the training will output `mae_e`, `mae_f`, `mae_v` metrics (mean absolute errors). +The {ref}`intensive ` option (default is `false`) controls the normalization of the energy and virial loss terms when `loss_func="mse"`. If set to `true`, these terms are normalized by $1/N^2$ (making them "intensive"), ensuring the loss scale remains consistent across different system sizes $N$. If `false`, the legacy $1/N$ normalization is used. This option is highly recommended for multi-task learning. + If one does not want to train with virial, then he/she may set the virial prefactors {ref}`start_pref_v ` and {ref}`limit_pref_v ` to 0. + [^1]: This section is built upon Jinzhe Zeng, Duo Zhang, Denghui Lu, Pinghui Mo, Zeyu Li, Yixiao Chen, Marián Rynik, Li'ang Huang, Ziyao Li, Shaochen Shi, Yingze Wang, Haotian Ye, Ping Tuo, Jiabin Yang, Ye Ding, Yifan Li, Davide Tisi, Qiyu Zeng, Han Bao, Yu Xia, Jiameng Huang, Koki Muraoka, Yibo Wang, Junhan Chang, Fengbo Yuan, Sigbjørn Løland Bore, Chun Cai, Yinnian Lin, Bo Wang, Jiayan Xu, Jia-Xin Zhu, Chenxing Luo, Yuzhi Zhang, Rhys E. A. 
Goodall, Wenshuo Liang, Anurag Kumar Singh, Sikai Yao, Jingchao Zhang, Renata Wentzcovitch, Jiequn Han, Jie Liu, Weile Jia, Darrin M. York, Weinan E, Roberto Car, Linfeng Zhang, Han Wang, [J. Chem. Phys. 159, 054801 (2023)](https://doi.org/10.1063/5.0155600) licensed under a [Creative Commons Attribution (CC BY) license](http://creativecommons.org/licenses/by/4.0/). From c856d7277027106cc5b006ea321af3b9c8cbd208 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 20 Apr 2026 08:04:37 +0000 Subject: [PATCH 12/14] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- doc/model/train-energy-spin.md | 2 -- doc/model/train-energy.md | 2 -- 2 files changed, 4 deletions(-) diff --git a/doc/model/train-energy-spin.md b/doc/model/train-energy-spin.md index fe83b5acc9..69e2d69334 100644 --- a/doc/model/train-energy-spin.md +++ b/doc/model/train-energy-spin.md @@ -102,7 +102,6 @@ and similarly for the virial loss: Intensive normalization ensures the loss scale remains consistent across systems with different numbers of atoms $N$, which is highly recommended for multi-task learning. - The prefectors may not be a constant, rather it changes linearly with the learning rate. Taking the atomic force prefactor for example, at training step $t$, it is given by $$p_{fr}(t) = p_{fr}^0 \frac{ \alpha(t) }{ \alpha(0) } + p_{fr}^\infty ( 1 - \frac{ \alpha(t) }{ \alpha(0) })$$ @@ -145,7 +144,6 @@ The {ref}`intensive ` option (default is `false`) con If one does not want to train with virial, then he/she may set the virial prefactors {ref}`start_pref_v ` and {ref}`limit_pref_v ` to 0. 
- ## Data format :::{note} diff --git a/doc/model/train-energy.md b/doc/model/train-energy.md index 7937d082d8..beaed27250 100644 --- a/doc/model/train-energy.md +++ b/doc/model/train-energy.md @@ -71,7 +71,6 @@ and similarly for the virial loss: Intensive normalization makes the loss magnitudes independent of the system size $N$ (assuming per-atom errors are consistent), which is crucial for multi-task training involving datasets with varying system sizes to prevent larger systems from dominating the training process. - If part of atoms is more important than others, for example, certain atoms play an essential role when calculating free energy profiles or kinetic isotope effects, the MSE of atomic forces with prefactors $q_{k}$ can also be used as the loss function: ```math @@ -151,5 +150,4 @@ The {ref}`intensive ` option (default is `false`) controls If one does not want to train with virial, then he/she may set the virial prefactors {ref}`start_pref_v ` and {ref}`limit_pref_v ` to 0. - [^1]: This section is built upon Jinzhe Zeng, Duo Zhang, Denghui Lu, Pinghui Mo, Zeyu Li, Yixiao Chen, Marián Rynik, Li'ang Huang, Ziyao Li, Shaochen Shi, Yingze Wang, Haotian Ye, Ping Tuo, Jiabin Yang, Ye Ding, Yifan Li, Davide Tisi, Qiyu Zeng, Han Bao, Yu Xia, Jiameng Huang, Koki Muraoka, Yibo Wang, Junhan Chang, Fengbo Yuan, Sigbjørn Løland Bore, Chun Cai, Yinnian Lin, Bo Wang, Jiayan Xu, Jia-Xin Zhu, Chenxing Luo, Yuzhi Zhang, Rhys E. A. Goodall, Wenshuo Liang, Anurag Kumar Singh, Sikai Yao, Jingchao Zhang, Renata Wentzcovitch, Jiequn Han, Jie Liu, Weile Jia, Darrin M. York, Weinan E, Roberto Car, Linfeng Zhang, Han Wang, [J. Chem. Phys. 159, 054801 (2023)](https://doi.org/10.1063/5.0155600) licensed under a [Creative Commons Attribution (CC BY) license](http://creativecommons.org/licenses/by/4.0/). 
From 29a78335f7a1eb7301d68d3a11f9bb4bef8858da Mon Sep 17 00:00:00 2001 From: Anyang Peng <137014849+anyangml@users.noreply.github.com> Date: Tue, 21 Apr 2026 11:03:29 +0800 Subject: [PATCH 13/14] chore: typo fix --- deepmd/dpmodel/loss/ener_spin.py | 2 +- deepmd/pd/loss/ener.py | 2 +- deepmd/pt/loss/ener.py | 2 +- deepmd/pt/loss/ener_spin.py | 2 +- deepmd/tf/loss/ener.py | 2 +- deepmd/utils/argcheck.py | 7 +++---- 6 files changed, 8 insertions(+), 9 deletions(-) diff --git a/deepmd/dpmodel/loss/ener_spin.py b/deepmd/dpmodel/loss/ener_spin.py index 3837b1ab77..b03c973cad 100644 --- a/deepmd/dpmodel/loss/ener_spin.py +++ b/deepmd/dpmodel/loss/ener_spin.py @@ -57,7 +57,7 @@ class EnergySpinLoss(Loss): terms consistent with per-atom RMSE reporting and less dependent on system size. This option does not change the MAE formulation, which is handled separately. The default is false for backward compatibility with - models trained using deepmd-kit <= 3.0.1. + models trained using deepmd-kit <= 3.1.3. **kwargs Other keyword arguments. """ diff --git a/deepmd/pd/loss/ener.py b/deepmd/pd/loss/ener.py index ef6967c4a4..75c5956c86 100644 --- a/deepmd/pd/loss/ener.py +++ b/deepmd/pd/loss/ener.py @@ -126,7 +126,7 @@ def __init__( 1/N scaling. For MAE, the normalization remains 1/N. For Huber loss, residuals are first normalized by 1/N before applying the Huber formula, so this option does not provide a pure 1/N versus 1/N^2 toggle in that path. The default is false for - backward compatibility with models trained using deepmd-kit <= 3.0.1. + backward compatibility with models trained using deepmd-kit <= 3.1.3. **kwargs Other keyword arguments. """ diff --git a/deepmd/pt/loss/ener.py b/deepmd/pt/loss/ener.py index 48edad7d73..cfac71c62b 100644 --- a/deepmd/pt/loss/ener.py +++ b/deepmd/pt/loss/ener.py @@ -127,7 +127,7 @@ def __init__( 1/N scaling. For MAE, the normalization remains 1/N. 
For Huber loss, residuals are first normalized by 1/N before applying the Huber formula, so this option does not provide a pure 1/N versus 1/N^2 toggle in that path. The default is false for - backward compatibility with models trained using deepmd-kit <= 3.0.1. + backward compatibility with models trained using deepmd-kit <= 3.1.3. **kwargs Other keyword arguments. """ diff --git a/deepmd/pt/loss/ener_spin.py b/deepmd/pt/loss/ener_spin.py index 7e566277f3..aff5b58cb4 100644 --- a/deepmd/pt/loss/ener_spin.py +++ b/deepmd/pt/loss/ener_spin.py @@ -84,7 +84,7 @@ def __init__( the legacy normalization (1/N) is used for those MSE terms. Note that this 1/N^2 behavior does not apply to the MAE code paths: MAE energy/virial losses do not use the `intensive` exponent in the same way. The default is false for backward - compatibility with models trained using deepmd-kit <= 3.0.1. + compatibility with models trained using deepmd-kit <= 3.1.3. **kwargs Other keyword arguments. """ diff --git a/deepmd/tf/loss/ener.py b/deepmd/tf/loss/ener.py index f9935c1adf..0d1c6304f9 100644 --- a/deepmd/tf/loss/ener.py +++ b/deepmd/tf/loss/ener.py @@ -107,7 +107,7 @@ class EnerStdLoss(Loss): legacy normalization (1/N). When ``use_huber=True``, the residual is still normalized by 1/N before applying the Huber loss, so ``intensive`` may not change behavior in that path. The default is false for backward compatibility - with models trained using deepmd-kit <= 3.0.1. + with models trained using deepmd-kit <= 3.1.3. **kwargs Other keyword arguments. """ diff --git a/deepmd/utils/argcheck.py b/deepmd/utils/argcheck.py index 897a5ebac3..f42f1c0d34 100644 --- a/deepmd/utils/argcheck.py +++ b/deepmd/utils/argcheck.py @@ -3240,7 +3240,7 @@ def loss_ener() -> list[Argument]: "This matches per-atom-style reporting more closely for those terms. " "For MAE, the normalization remains 1/N. 
When `use_huber=True`, the residual is already scaled by 1/N before applying the Huber loss, " "so this flag may have limited or no effect for those terms. " - "The default is false for backward compatibility with models trained using deepmd-kit <= 3.0.1." + "The default is false for backward compatibility with models trained using deepmd-kit <= 3.1.3." ) return [ Argument( @@ -3421,9 +3421,8 @@ def loss_ener_spin() -> list[Argument]: "For `loss_func='mse'`, if true, energy and virial losses are computed as intensive quantities, " "normalized by the square of the number of atoms (1/N^2); if false (default), the legacy normalization " "(1/N) is used. " - "For `loss_func='mae'`, this option does not change the existing MAE formulations; in particular, " - "the MAE energy and virial terms are not both normalized by 1/N. " - "The default is false for backward compatibility with models trained using deepmd-kit <= 3.0.1." + "For `loss_func='mae'`, this option does not change the existing MAE formulations. " + "The default is false for backward compatibility with models trained using deepmd-kit <= 3.1.3." ) return [ Argument( From a22767feaaf2dd8a3a121634943020407139519e Mon Sep 17 00:00:00 2001 From: Anyang Peng <137014849+anyangml@users.noreply.github.com> Date: Thu, 23 Apr 2026 13:15:38 +0800 Subject: [PATCH 14/14] Update default version for backward compatibility Signed-off-by: Anyang Peng <137014849+anyangml@users.noreply.github.com> --- deepmd/dpmodel/loss/ener.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deepmd/dpmodel/loss/ener.py b/deepmd/dpmodel/loss/ener.py index 09143a9d38..02a58c5e5e 100644 --- a/deepmd/dpmodel/loss/ener.py +++ b/deepmd/dpmodel/loss/ener.py @@ -97,7 +97,7 @@ class EnergyLoss(Loss): scaling and are not affected in the same way by this flag. If false (default), the legacy normalization is used for the affected terms. The default is false for backward compatibility with models trained using - deepmd-kit <= 3.0.1. 
+ deepmd-kit <= 3.1.3. **kwargs Other keyword arguments. """