feat(pt_expt): support .pt training checkpoints in DeepEval

Han Wang · Han Wang · commit 98aee78a86fd · 2026-04-26T17:50:19.000+08:00
`dp --pt-expt test -m foo.pt` previously rejected `.pt` files (only
`.pt2` / `.pte` were supported), and `dp --pt test -m foo.pt` on a
pt_expt-trained checkpoint silently loaded random weights because the
state-dict layout (dpmodel `.w`/`.b` keys) doesn't match the legacy
pt backend's expectations.

- `Backend.detect_backend_by_model` sniffs `.pt` content so files with
  `.w`/`.b` keys (pt_expt) route to the pt_expt DeepEval and files with
  `.matrix`/`.bias` keys (pt) keep routing to pt.
- `pt_expt.DeepEval._load_pt` reconstructs the model from
  `_extra_state["model_params"]`, loads the state-dict via
  `ModelWrapper`, and exposes an eager `forward_common_lower` runner
  with the same signature as the AOTI/exported module so the existing
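  `eval()` path is unchanged. Both spin-aware and non-spin runners are
  provided; for a multi-task `.pt`, one head is selected (`Default` when
  none is given) and its keys are remapped to the `model.Default.*` prefix.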
- `pt_expt.get_model` now dispatches to a new `get_spin_model` (mirroring
  dpmodel) when the config has a `spin` section, so spin checkpoints can
  be reconstructed from `model_params`.
- Tests cover dispatch sniffing, single-task / multi-task / spin /
  spin-multi-task `.pt` parity vs eager forward, fparam / aparam, and
  `.pt` vs `.pte` cross-format consistency at 1e-10.
diff --git a/deepmd/backend/backend.py b/deepmd/backend/backend.py
@@ -101,10 +101,33 @@ def detect_backend_by_model(filename: str) -> type["Backend"]:
         filename : str
             The model file name
         """
-        filename = str(filename).lower()
+        filename_lower = str(filename).lower()
+        # `.pt` is shared between the pt and pt_expt backends. They use
+        # different parameter naming (pt: `.matrix`/`.bias`, pt_expt:
+        # `.w`/`.b`), so peek at the state-dict keys to disambiguate.
+        if filename_lower.endswith(".pt"):
+            try:
+                import torch
+
+                sd = torch.load(filename, map_location="cpu", weights_only=False)
+                if isinstance(sd, dict) and "model" in sd:
+                    sd = sd["model"]
+                keys = list(sd.keys()) if hasattr(sd, "keys") else []
+                has_pt_expt = any(k.endswith(".w") or k.endswith(".b") for k in keys)
+                has_pt = any(k.endswith(".matrix") or k.endswith(".bias") for k in keys)
+                if has_pt_expt and not has_pt:
+                    target_name = "pt-expt"
+                else:
+                    target_name = "pt"
+                for key, backend in Backend.get_backends().items():
+                    if key == target_name:
+                        return backend
+            except Exception:
+                # Fall through to suffix matching if sniffing fails.
+                pass
         for backend in Backend.get_backends().values():
             for suffix in backend.suffixes:
-                if filename.endswith(suffix):
+                if filename_lower.endswith(suffix):
                     return backend
         raise ValueError(f"Cannot detect the backend of the model file {filename}.")
 
diff --git a/deepmd/pt_expt/infer/deep_eval.py b/deepmd/pt_expt/infer/deep_eval.py
@@ -99,8 +99,16 @@ def __init__(
 
         if self._is_pt2:
             self._load_pt2(model_file)
-        else:
+        elif model_file.endswith(".pte"):
             self._load_pte(model_file)
+        elif model_file.endswith(".pt"):
+            self._load_pt(model_file, head=kwargs.get("head"))
+        else:
+            raise ValueError(
+                f"Unsupported model file '{model_file}' for the pt_expt "
+                "backend: expected `.pt2` / `.pte` (deployable archives) or "
+                "`.pt` (training checkpoint)."
+            )
 
         if isinstance(auto_batch_size, bool):
             if auto_batch_size:
@@ -206,6 +214,160 @@ def _load_pt2(self, model_file: str) -> None:
         self._pt2_runner = aoti_load_package(model_file)
         self.exported_module = None
 
+    def _load_pt(self, model_file: str, head: str | None = None) -> None:
+        """Load a `.pt` training checkpoint (eager mode, no torch.export)."""
+        from copy import (
+            deepcopy,
+        )
+
+        from deepmd.pt.utils.env import (
+            DEVICE,
+        )
+        from deepmd.pt_expt.model import (
+            get_model,
+        )
+
+        state_dict = torch.load(model_file, map_location=DEVICE, weights_only=False)
+        if "model" in state_dict:
+            state_dict = state_dict["model"]
+        model_params = deepcopy(state_dict["_extra_state"]["model_params"])
+
+        if "model_dict" in model_params:
+            # Multi-task: pick the requested head (defaults to "Default" if present).
+            heads = list(model_params["model_dict"].keys())
+            if head is None:
+                if "Default" in heads:
+                    head = "Default"
+                else:
+                    raise ValueError(
+                        f"Multi-task checkpoint '{model_file}' has heads "
+                        f"{heads}; pass --head to select one."
+                    )
+            if head not in heads:
+                raise ValueError(
+                    f"Head '{head}' not found in checkpoint '{model_file}'. "
+                    f"Available heads: {heads}."
+                )
+            head_params = model_params["model_dict"][head]
+            # Restrict state_dict to the chosen head and rename to "Default".
+            head_state = {"_extra_state": state_dict["_extra_state"]}
+            for key, value in state_dict.items():
+                prefix = f"model.{head}."
+                if key.startswith(prefix):
+                    head_state[key.replace(prefix, "model.Default.")] = (
+                        value.clone() if torch.is_tensor(value) else value
+                    )
+            state_dict = head_state
+            model_params = head_params
+
+        model = get_model(deepcopy(model_params)).to(DEVICE)
+
+        # Load weights into a {"Default": model} wrapper to match the
+        # `model.Default.*` key prefix used in the saved state_dict.
+        from deepmd.pt_expt.train.wrapper import (
+            ModelWrapper,
+        )
+
+        wrapper = ModelWrapper(model)
+        wrapper.load_state_dict(state_dict)
+        model = wrapper.model["Default"].eval()
+
+        self._dpmodel = model
+        self._is_spin = (
+            model_params.get("type") == "spin_ener" or "spin" in model_params
+        )
+        self.rcut = model.get_rcut()
+        self.type_map = model.get_type_map()
+        if self._is_spin:
+            self._model_output_def = ModelOutputDef(
+                FittingOutputDef(
+                    [
+                        OutputVariableDef(
+                            "energy",
+                            shape=[1],
+                            reducible=True,
+                            r_differentiable=True,
+                            c_differentiable=True,
+                            atomic=True,
+                            magnetic=True,
+                        )
+                    ]
+                )
+            )
+        else:
+            self._model_output_def = ModelOutputDef(model.atomic_output_def())
+        self._model_def_script = model_params
+        # Populate metadata so eval helpers (e.g. default_fparam fallback)
+        # behave the same as the .pt2/.pte path.  Mirrors the fields that
+        # `_collect_metadata` writes into metadata.json.
+        self.metadata = {
+            "type_map": model.get_type_map(),
+            "rcut": model.get_rcut(),
+            "sel": model.get_sel(),
+            "dim_fparam": model.get_dim_fparam(),
+            "dim_aparam": model.get_dim_aparam(),
+            "mixed_types": model.mixed_types(),
+            "has_default_fparam": model.has_default_fparam(),
+            "default_fparam": model.get_default_fparam(),
+            "is_spin": self._is_spin,
+        }
+        if self._is_spin:
+            self.metadata["ntypes_spin"] = model.spin.get_ntypes_spin()
+            self.metadata["use_spin"] = [bool(v) for v in model.spin.use_spin]
+
+        # Eager runner with the same signature as the .pt2/.pte exported module.
+        # Use forward_common_lower (not forward_lower) to match the export-time
+        # output keys ("energy", "energy_redu", "energy_derv_r", ...) that
+        # communicate_extended_output downstream consumes.
+        # Non-spin: (ext_coord, ext_atype, nlist, mapping, fparam, aparam)
+        # Spin:     (ext_coord, ext_atype, ext_spin, nlist, mapping, fparam, aparam)
+        if self._is_spin:
+
+            def _eager_runner_spin(
+                ext_coord: torch.Tensor,
+                ext_atype: torch.Tensor,
+                ext_spin: torch.Tensor,
+                nlist: torch.Tensor,
+                mapping: torch.Tensor | None,
+                fparam: torch.Tensor | None,
+                aparam: torch.Tensor | None,
+            ) -> dict[str, torch.Tensor]:
+                ext_coord = ext_coord.detach().requires_grad_(True)
+                return model.forward_common_lower(
+                    ext_coord,
+                    ext_atype,
+                    ext_spin,
+                    nlist,
+                    mapping,
+                    fparam=fparam,
+                    aparam=aparam,
+                    do_atomic_virial=True,
+                )
+
+            self.exported_module = _eager_runner_spin
+        else:
+
+            def _eager_runner(
+                ext_coord: torch.Tensor,
+                ext_atype: torch.Tensor,
+                nlist: torch.Tensor,
+                mapping: torch.Tensor | None,
+                fparam: torch.Tensor | None,
+                aparam: torch.Tensor | None,
+            ) -> dict[str, torch.Tensor]:
+                ext_coord = ext_coord.detach().requires_grad_(True)
+                return model.forward_common_lower(
+                    ext_coord,
+                    ext_atype,
+                    nlist,
+                    mapping,
+                    fparam=fparam,
+                    aparam=aparam,
+                    do_atomic_virial=True,
+                )
+
+            self.exported_module = _eager_runner
+
     def get_rcut(self) -> float:
         """Get the cutoff radius of this model."""
         return self.rcut
diff --git a/deepmd/pt_expt/model/get_model.py b/deepmd/pt_expt/model/get_model.py
@@ -37,6 +37,12 @@
 from deepmd.pt_expt.model.property_model import (
     PropertyModel,
 )
+from deepmd.pt_expt.model.spin_ener_model import (
+    SpinEnergyModel,
+)
+from deepmd.utils.spin import (
+    Spin,
+)
 
 
 def _get_standard_model_components(
@@ -162,6 +168,36 @@ def get_linear_model(model_params: dict) -> BaseModel:
     )
 
 
+def get_spin_model(data: dict) -> SpinEnergyModel:
+    """Build a pt_expt spin energy model from a config dictionary.
+
+    Mirrors :func:`deepmd.dpmodel.model.model.get_spin_model`: expands the
+    type map and descriptor sel for virtual spin atoms, then wraps the
+    backbone EnergyModel as a :class:`SpinEnergyModel`.
+    """
+    data = copy.deepcopy(data)
+    data["type_map"] += [item + "_spin" for item in data["type_map"]]
+    spin = Spin(
+        use_spin=data["spin"]["use_spin"],
+        virtual_scale=data["spin"]["virtual_scale"],
+    )
+    pair_exclude_types = spin.get_pair_exclude_types(
+        exclude_types=data.get("pair_exclude_types", None)
+    )
+    data["pair_exclude_types"] = pair_exclude_types
+    data["descriptor"]["exclude_types"] = pair_exclude_types
+    atom_exclude_types = spin.get_atom_exclude_types(
+        exclude_types=data.get("atom_exclude_types", None)
+    )
+    data["atom_exclude_types"] = atom_exclude_types
+    if "env_protection" not in data["descriptor"]:
+        data["descriptor"]["env_protection"] = 1e-6
+    if data["descriptor"]["type"] in ["se_e2_a"]:
+        data["descriptor"]["sel"] += data["descriptor"]["sel"]
+    backbone_model = get_standard_model(data)
+    return SpinEnergyModel(backbone_model=backbone_model, spin=spin)
+
+
 def get_model(data: dict) -> BaseModel:
     """Get a model from a config dictionary.
 
@@ -172,6 +208,8 @@ def get_model(data: dict) -> BaseModel:
     """
     model_type = data.get("type", "standard")
     if model_type == "standard":
+        if "spin" in data:
+            return get_spin_model(data)
         return get_standard_model(data)
     elif model_type == "linear_ener":
         return get_linear_model(data)
diff --git a/source/tests/pt_expt/infer/test_deep_eval_pt_checkpoint.py b/source/tests/pt_expt/infer/test_deep_eval_pt_checkpoint.py