iProzd
diff --git a/.github/workflows/test_cc.yml b/.github/workflows/test_cc.yml
Lines changed: 1 addition & 1 deletion
diff --git a/.github/workflows/test_cuda.yml b/.github/workflows/test_cuda.yml
Lines changed: 1 addition & 1 deletion
diff --git a/.github/workflows/test_python.yml b/.github/workflows/test_python.yml
Lines changed: 1 addition & 1 deletion
diff --git a/deepmd/dpmodel/utils/learning_rate.py b/deepmd/dpmodel/utils/learning_rate.py
Lines changed: 11 additions & 9 deletions
diff --git a/deepmd/pt/model/descriptor/repflows.py b/deepmd/pt/model/descriptor/repflows.py
Lines changed: 2 additions & 2 deletions
diff --git a/deepmd/pt/model/descriptor/repformers.py b/deepmd/pt/model/descriptor/repformers.py
Lines changed: 1 addition & 1 deletion
diff --git a/deepmd/pt/train/ema.py b/deepmd/pt/train/ema.py
Lines changed: 200 additions & 0 deletions
@@ -44,7 +44,7 @@ jobs:
       - run: python -m pip install uv
       - name: Install Python dependencies
         run: |
-          source/install/uv_with_retry.sh pip install --system --group pin_tensorflow_cpu --group pin_pytorch_cpu --group pin_jax --torch-backend cpu
+          source/install/uv_with_retry.sh pip install --system --group pin_tensorflow_cpu --group pin_pytorch_cpu --group pin_jax_cpu --torch-backend cpu
           export TENSORFLOW_ROOT=$(python -c 'import importlib.util,pathlib;print(pathlib.Path(importlib.util.find_spec("tensorflow").origin).parent)')
           source/install/uv_with_retry.sh pip install --system -e .[cpu,test,lmp,jax] mpi4py mpich
       - name: Convert models
 
@@ -43,7 +43,7 @@ jobs:
           && sudo apt-get -y install cuda-12-3 libcudnn8=8.9.5.*-1+cuda12.3
         if: false # skip as we use nvidia image
       - run: python -m pip install -U uv
-      - run: source/install/uv_with_retry.sh pip install --system --group pin_tensorflow_gpu --group pin_pytorch_gpu --group pin_jax "jax[cuda12]"
+      - run: source/install/uv_with_retry.sh pip install --system --group pin_tensorflow_gpu --group pin_pytorch_gpu --group pin_jax_gpu
       - run: |
           export PYTORCH_ROOT=$(python -c 'import torch;print(torch.__path__[0])')
           export TENSORFLOW_ROOT=$(python -c 'import importlib.util,pathlib;print(pathlib.Path(importlib.util.find_spec("tensorflow").origin).parent)')
 
@@ -31,7 +31,7 @@ jobs:
           source/install/uv_with_retry.sh pip install --system openmpi --group pin_tensorflow_cpu --group pin_pytorch_cpu --torch-backend cpu
           export TENSORFLOW_ROOT=$(python -c 'import importlib.util,pathlib;print(pathlib.Path(importlib.util.find_spec("tensorflow").origin).parent)')
           export PYTORCH_ROOT=$(python -c 'import torch;print(torch.__path__[0])')
-          source/install/uv_with_retry.sh pip install --system -e .[test,jax] mpi4py --group pin_jax
+          source/install/uv_with_retry.sh pip install --system -e .[test,jax] mpi4py --group pin_jax_cpu
           source/install/uv_with_retry.sh pip install --system --find-links "https://www.paddlepaddle.org.cn/packages/nightly/cpu/paddlepaddle/" --index-url https://pypi.org/simple --trusted-host www.paddlepaddle.org.cn --trusted-host paddlepaddle.org.cn paddlepaddle==3.4.0.dev20260310
         env:
           # Please note that uv has some issues with finding
 
@@ -70,7 +70,11 @@ def __init__(
             The warmup learning rate starts from warmup_start_factor * start_lr.
             Default is 0.0.
         """
-        # === Step 1. Validate stop_lr and stop_lr_ratio (runtime check) ===
+        # === Step 1. Validate start_lr (runtime check) ===
+        if start_lr <= 0 or not np.isfinite(start_lr):
+            raise ValueError(f"start_lr ({start_lr}) must be positive and finite.")
+
+        # === Step 2. Validate stop_lr and stop_lr_ratio (runtime check) ===
         has_stop_lr = stop_lr is not None
         has_stop_lr_ratio = stop_lr_ratio is not None
 
@@ -85,13 +89,13 @@ def __init__(
                 "Got stop_lr=None, stop_lr_ratio=None"
             )
 
-        # === Step 2. Compute stop_lr from stop_lr_ratio if needed ===
+        # === Step 3. Compute stop_lr from stop_lr_ratio if needed ===
         if stop_lr_ratio is not None:
             self.stop_lr = start_lr * stop_lr_ratio
         else:
             self.stop_lr = stop_lr
 
-        # === Step 3. Validate warmup_steps and warmup_ratio (runtime check) ===
+        # === Step 4. Validate warmup_steps and warmup_ratio (runtime check) ===
         has_warmup_steps = warmup_steps != 0
         has_warmup_ratio = warmup_ratio is not None
 
@@ -101,13 +105,13 @@ def __init__(
                 f"Got warmup_steps={warmup_steps}, warmup_ratio={warmup_ratio}"
             )
 
-        # === Step 4. Compute warmup_steps from warmup_ratio if needed ===
+        # === Step 5. Compute warmup_steps from warmup_ratio if needed ===
         if warmup_ratio is not None:
             self.warmup_steps = int(warmup_ratio * num_steps)
         else:
             self.warmup_steps = warmup_steps
 
-        # === Step 5. Validate step ranges (runtime check) ===
+        # === Step 6. Validate step ranges (runtime check) ===
         if num_steps < 0:
             raise ValueError("num_steps must be non-negative")
         if self.warmup_steps < 0:
@@ -117,10 +121,10 @@ def __init__(
         if num_steps == 0 and self.warmup_steps != 0:
             raise ValueError("warmup_steps must be 0 when num_steps is 0")
 
-        # === Step 6. Compute warmup_start_lr ===
+        # === Step 7. Compute warmup_start_lr ===
         self.warmup_start_lr = warmup_start_factor * start_lr
 
-        # === Step 7. Store core parameters ===
+        # === Step 8. Store core parameters ===
         self._start_lr = start_lr
         self.num_steps = num_steps
         # Decay phase covers (num_steps - warmup_steps) steps
@@ -493,8 +497,6 @@ def __init__(
         )
 
         # === Validate WSD-specific invariants ===
-        if self._start_lr <= 0:
-            raise ValueError(f"start_lr ({self._start_lr}) must be positive.")
         if self.stop_lr <= 0:
             raise ValueError(f"stop_lr ({self.stop_lr}) must be positive.")
         if decay_phase_ratio <= 0 or decay_phase_ratio > 1:
 
@@ -496,8 +496,8 @@ def forward(
         a_sw = a_sw.masked_fill(~a_nlist_mask, 0.0)
         # set all padding positions to index of 0
         # if the a neighbor is real or not is indicated by nlist_mask
-        nlist[nlist == -1] = 0
-        a_nlist[a_nlist == -1] = 0
+        nlist = torch.where(nlist == -1, 0, nlist)
+        a_nlist = torch.where(a_nlist == -1, 0, a_nlist)
 
         # get node embedding
         # [nframes, nloc, tebd_dim]
 
@@ -457,7 +457,7 @@ def forward(
 
         # set all padding positions to index of 0
         # if the a neighbor is real or not is indicated by nlist_mask
-        nlist[nlist == -1] = 0
+        nlist = torch.where(nlist == -1, 0, nlist)
         # nb x nall x ng1
         if comm_dict is None:
             assert mapping is not None
 
@@ -0,0 +1,200 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: LGPL-3.0-or-later
+
+from __future__ import (
+    annotations,
+)
+
+import logging
+from contextlib import (
+    contextmanager,
+)
+from copy import (
+    deepcopy,
+)
+from pathlib import (
+    Path,
+)
+from typing import (
+    TYPE_CHECKING,
+    Any,
+)
+
+import torch
+
+if TYPE_CHECKING:
+    from collections.abc import (
+        Iterator,
+    )
+
+EMA_CHECKPOINT_KEY = "ema"
+EMA_DECAY_KEY = "decay"
+EMA_MODEL_STATE_KEY = "model"
+EMA_VALIDATION_STATE_KEY = "validation_state"
+
+log = logging.getLogger(__name__)
+
+
+def _append_suffix(path_like: str | Path, suffix: str) -> Path:
+    """Append a suffix before the final file suffix when present."""
+    path = Path(path_like)
+    if path.suffix:
+        return path.with_name(f"{path.stem}{suffix}{path.suffix}")
+    return path.with_name(f"{path.name}{suffix}")
+
+
+def get_ema_checkpoint_prefix(save_ckpt: str | Path) -> str:
+    """Derive the EMA checkpoint prefix from the regular checkpoint prefix."""
+    return str(_append_suffix(save_ckpt, "_ema"))
+
+
+def get_ema_validation_log_path(full_val_file: str | Path) -> Path:
+    """Derive the EMA validation log path from the regular validation log path."""
+    return _append_suffix(full_val_file, "_ema")
+
+
+class ModelEMA:
+    """Maintain an exponential moving average of model parameters.
+
+    This helper assumes DDP/ZeRO-1 style training where every rank owns the
+    same full, consistently ordered model parameters. It is not a sharded
+    parameter EMA implementation.
+    """
+
+    def __init__(
+        self,
+        model: torch.nn.Module | dict[str, torch.nn.Module],
+        decay: float,
+        state: dict[str, Any] | None = None,
+    ) -> None:
+        self.decay = float(decay)
+        self.shadow_params = self._clone_model_parameters(model)
+        self.validation_state: dict[str, Any] = {}
+        if state is not None:
+            self.load_state_dict(state)
+
+    @staticmethod
+    def _named_model_parameters(
+        model: torch.nn.Module | dict[str, torch.nn.Module],
+    ) -> list[tuple[str, torch.nn.Parameter]]:
+        """Collect all floating-point model parameters in a deterministic order."""
+        if isinstance(model, dict):
+            named_parameters = []
+            for model_key in sorted(model):
+                named_parameters.extend(
+                    [
+                        (f"{model_key}.{name}", param)
+                        for name, param in model[model_key].named_parameters()
+                        if torch.is_floating_point(param)
+                    ]
+                )
+            return named_parameters
+        return [
+            (name, param)
+            for name, param in model.named_parameters()
+            if torch.is_floating_point(param)
+        ]
+
+    def _clone_model_parameters(
+        self,
+        model: torch.nn.Module | dict[str, torch.nn.Module],
+    ) -> dict[str, torch.Tensor]:
+        """Clone model parameters to initialize the EMA shadow state."""
+        with torch.no_grad():
+            return {
+                name: param.detach().clone()
+                for name, param in self._named_model_parameters(model)
+            }
+
+    def update(self, model: torch.nn.Module | dict[str, torch.nn.Module]) -> None:
+        """Update EMA shadow parameters from the current model parameters."""
+        with torch.no_grad():
+            for name, param in self._named_model_parameters(model):
+                self.shadow_params[name].lerp_(param.detach(), weight=1.0 - self.decay)
+
+    def state_dict(self) -> dict[str, Any]:
+        """Serialize EMA state for restart."""
+        return {
+            EMA_DECAY_KEY: self.decay,
+            EMA_MODEL_STATE_KEY: {
+                name: tensor.detach().cpu().clone()
+                for name, tensor in self.shadow_params.items()
+            },
+            EMA_VALIDATION_STATE_KEY: deepcopy(self.validation_state),
+        }
+
+    def load_state_dict(self, state: dict[str, Any]) -> None:
+        """Restore EMA shadow parameters and validator state."""
+        if EMA_DECAY_KEY in state:
+            checkpoint_decay = float(state[EMA_DECAY_KEY])
+            if checkpoint_decay != self.decay:
+                log.warning(
+                    "Ignoring EMA checkpoint decay=%s because training.ema_decay=%s "
+                    "is configured.",
+                    checkpoint_decay,
+                    self.decay,
+                )
+        model_state = state.get(EMA_MODEL_STATE_KEY, {})
+        if not isinstance(model_state, dict):
+            raise TypeError("EMA checkpoint field `model` must be a dict.")
+
+        current_keys = set(self.shadow_params)
+        loaded_keys = set(model_state)
+        missing_keys = sorted(current_keys - loaded_keys)
+        unexpected_keys = sorted(loaded_keys - current_keys)
+        if missing_keys or unexpected_keys:
+            raise KeyError(
+                "EMA checkpoint parameter keys do not match the current model. "
+                f"Missing keys: {missing_keys[:5]}, unexpected keys: {unexpected_keys[:5]}."
+            )
+
+        with torch.no_grad():
+            for name, shadow_param in self.shadow_params.items():
+                loaded_param = model_state[name]
+                if not isinstance(loaded_param, torch.Tensor):
+                    raise TypeError(
+                        f"EMA checkpoint tensor for {name!r} must be a torch.Tensor."
+                    )
+                if loaded_param.shape != shadow_param.shape:
+                    raise ValueError(
+                        "EMA checkpoint parameter shape does not match the current "
+                        f"model for {name!r}: expected {tuple(shadow_param.shape)}, "
+                        f"got {tuple(loaded_param.shape)}."
+                    )
+                shadow_param.copy_(
+                    loaded_param.to(
+                        device=shadow_param.device,
+                        dtype=shadow_param.dtype,
+                    )
+                )
+
+        validation_state = state.get(EMA_VALIDATION_STATE_KEY, {})
+        if validation_state is None:
+            validation_state = {}
+        if not isinstance(validation_state, dict):
+            raise TypeError("EMA checkpoint field `validation_state` must be a dict.")
+        self.validation_state = deepcopy(validation_state)
+
+    @contextmanager
+    def apply_shadow(
+        self,
+        model: torch.nn.Module | dict[str, torch.nn.Module],
+    ) -> Iterator[None]:
+        """Temporarily replace model parameters with the EMA shadow state."""
+        backups: dict[str, torch.Tensor] = {}
+        try:
+            with torch.no_grad():
+                for name, param in self._named_model_parameters(model):
+                    backups[name] = param.detach().clone()
+                    param.copy_(
+                        self.shadow_params[name].to(
+                            device=param.device,
+                            dtype=param.dtype,
+                        )
+                    )
+            yield
+        finally:
+            with torch.no_grad():
+                for name, param in self._named_model_parameters(model):
+                    if name in backups:
+                        param.copy_(backups[name])