feat(pt_expt): make model.json optional in .pt2/.pte loading

OutisLi · OutisLi · commit 02afe57a2459 · 2026-04-23T23:52:51.000+08:00
The pt_expt DeepEval's inference path runs through aoti_load_package /
the exported module; `self._dpmodel` is only used to resolve metadata
(rcut / type_map / atomic_output_def / dim_fparam / ...), which is
already available in extra/metadata.json (the contract the C++ reader
DeepPotPTExpt enforces). Drop the requirement that extra/model.json be
present:

* _load_pt2 / _load_pte: model.json is optional; metadata.json is now
  the minimum contract, and a missing metadata.json raises ValueError.
* _init_from_metadata: reconstructs ModelOutputDef from the serialised
  fitting_output_defs and hoists sel / mixed_types to plain attributes.
* get_dim_fparam / get_dim_aparam / get_sel_type / model_type /
  get_use_spin: fall back to metadata when _dpmodel is None.
* eval_descriptor / eval_typeebd / eval_fitting_last_layer: raise a
  descriptive NotImplementedError in metadata-only mode (they inspect
  the dpmodel instance directly).

Also fixes two metadata-completeness gaps so metadata-only load is
exact:
* _collect_metadata: add the `sel_type` field so get_sel_type works
  without a dpmodel round-trip (relevant for dipole / polar / wfc).
* _collect_metadata: force vdef.category to plain int for deterministic
  JSON serialisation across Python versions.

Archives produced by the existing pt_expt serialisation still contain
model.json and continue to use the dpmodel path unchanged. Regression
coverage: the 77 existing tests in test_deep_eval.py plus a dedicated
new suite (test_deep_eval_metadata_only.py) that strips
extra/model.json and asserts bitwise parity against the full archive.
diff --git a/deepmd/pt_expt/infer/deep_eval.py b/deepmd/pt_expt/infer/deep_eval.py
@@ -140,6 +140,12 @@ def _init_from_model_json(self, model_json_str: str) -> None:
 
         self.rcut = self._dpmodel.get_rcut()
         self.type_map = self._dpmodel.get_type_map()
+        # Hoist sel / mixed_types to plain attributes so the inference hot
+        # path (`_build_nlist_*`) does not need the dpmodel instance.  This
+        # matches what `_init_from_metadata` sets and keeps both code paths
+        # numerically identical.
+        self.sel = list(self._dpmodel.get_sel())
+        self.mixed_types = bool(self._dpmodel.mixed_types())
         if self._is_spin:
             self._model_output_def = ModelOutputDef(
                 FittingOutputDef(
@@ -159,23 +165,89 @@ def _init_from_model_json(self, model_json_str: str) -> None:
         else:
             self._model_output_def = ModelOutputDef(self._dpmodel.atomic_output_def())
 
+    def _init_from_metadata(self) -> None:
+        """Initialize DeepEval from ``extra/metadata.json`` alone.
+
+        Used when the ``.pt2`` / ``.pte`` archive ships no ``extra/model.json``
+        (e.g. for backends that do not travel through the dpmodel round-trip).
+        The metadata contract is the same one the C++ ``DeepPotPTExpt``
+        reader consumes, so anything that validates against the C++ side
+        automatically validates here.
+
+        ``self._dpmodel`` is left as ``None`` to signal the metadata-only
+        mode.  Inference does not need it: it runs through
+        ``aoti_load_package`` / the exported module and uses plain
+        attributes (``self.rcut``, ``self.sel``, ``self.mixed_types``,
+        ``self._model_output_def``) for all metadata-level queries.
+        """
+        self._dpmodel = None
+        self._is_spin = bool(self.metadata.get("is_spin", False))
+        self.rcut = float(self.metadata["rcut"])
+        self.type_map = list(self.metadata["type_map"])
+        self.sel = [int(s) for s in self.metadata["sel"]]
+        self.mixed_types = bool(self.metadata["mixed_types"])
+
+        fitting_defs = []
+        for vdef in self.metadata["fitting_output_defs"]:
+            fitting_defs.append(
+                OutputVariableDef(
+                    name=vdef["name"],
+                    shape=list(vdef["shape"]),
+                    reducible=vdef.get("reducible", False),
+                    r_differentiable=vdef.get("r_differentiable", False),
+                    c_differentiable=vdef.get("c_differentiable", False),
+                    atomic=vdef.get("atomic", True),
+                    category=int(
+                        vdef.get("category", OutputVariableCategory.OUT.value)
+                    ),
+                    r_hessian=vdef.get("r_hessian", False),
+                    magnetic=vdef.get("magnetic", False),
+                    intensive=vdef.get("intensive", False),
+                )
+            )
+        self._model_output_def = ModelOutputDef(FittingOutputDef(fitting_defs))
+
     def _load_pte(self, model_file: str) -> None:
-        """Load a .pte (torch.export) model file."""
+        """Load a .pte (torch.export) model file.
+
+        ``model.json`` is optional: when present it is used to reconstruct
+        the dpmodel instance (enabling dpmodel-level introspection such as
+        ``eval_descriptor``); when absent we fall back to pure metadata
+        mode via :meth:`_init_from_metadata`.  ``metadata.json`` is the
+        only contract the inference path actually requires.
+        """
         extra_files = {
             "model.json": "",
             "model_def_script.json": "",
             "metadata.json": "",
         }
         exported = torch.export.load(model_file, extra_files=extra_files)
         self.exported_module = exported.module()
-        self._init_from_model_json(extra_files["model.json"])
         mds = extra_files["model_def_script.json"]
         self._model_def_script = json.loads(mds) if mds else {}
         md = extra_files["metadata.json"]
-        self.metadata = json.loads(md) if md else {}
+        if not md:
+            raise ValueError(
+                f"Invalid .pte file '{model_file}': missing 'metadata.json'"
+            )
+        self.metadata = json.loads(md)
+
+        model_json_str = extra_files["model.json"]
+        if model_json_str:
+            self._init_from_model_json(model_json_str)
+        else:
+            self._init_from_metadata()
 
     def _load_pt2(self, model_file: str) -> None:
-        """Load a .pt2 (AOTInductor) model file."""
+        """Load a .pt2 (AOTInductor) model file.
+
+        ``extra/model.json`` is optional — it only enables the dpmodel
+        round-trip (used by ``eval_descriptor``, ``eval_typeebd``, etc.).
+        Pure AOTI inference (``DeepPot.eval`` / ``dp test`` / ASE
+        calculator) only needs ``extra/metadata.json``, matching the
+        contract the C++ ``DeepPotPTExpt`` reader enforces.  Backends that
+        cannot produce ``model.json`` can therefore still be loaded.
+        """
         import zipfile
 
         from torch._inductor import (
@@ -185,21 +257,24 @@ def _load_pt2(self, model_file: str) -> None:
         # Read metadata from the .pt2 ZIP archive
         with zipfile.ZipFile(model_file, "r") as zf:
             names = zf.namelist()
-            if "extra/model.json" not in names:
+            if "extra/metadata.json" not in names:
                 raise ValueError(
-                    f"Invalid .pt2 file '{model_file}': missing 'extra/model.json'"
+                    f"Invalid .pt2 file '{model_file}': missing 'extra/metadata.json'"
                 )
-            model_json_str = zf.read("extra/model.json").decode("utf-8")
+            md = zf.read("extra/metadata.json").decode("utf-8")
+            model_json_str = ""
+            if "extra/model.json" in names:
+                model_json_str = zf.read("extra/model.json").decode("utf-8")
             mds = ""
             if "extra/model_def_script.json" in names:
                 mds = zf.read("extra/model_def_script.json").decode("utf-8")
-            md = ""
-            if "extra/metadata.json" in names:
-                md = zf.read("extra/metadata.json").decode("utf-8")
 
-        self._init_from_model_json(model_json_str)
+        self.metadata = json.loads(md)
         self._model_def_script = json.loads(mds) if mds else {}
-        self.metadata = json.loads(md) if md else {}
+        if model_json_str:
+            self._init_from_model_json(model_json_str)
+        else:
+            self._init_from_metadata()
 
         # Load the AOTInductor model package (.pt2 ZIP archive).
         # Uses torch._inductor.aoti_load_package (private API, stable since PyTorch 2.6).
@@ -220,16 +295,29 @@ def get_type_map(self) -> list[str]:
 
     def get_dim_fparam(self) -> int:
         """Get the number (dimension) of frame parameters of this DP."""
-        return self._dpmodel.get_dim_fparam()
+        if self._dpmodel is not None:
+            return self._dpmodel.get_dim_fparam()
+        return int(self.metadata["dim_fparam"])
 
     def get_dim_aparam(self) -> int:
         """Get the number (dimension) of atomic parameters of this DP."""
-        return self._dpmodel.get_dim_aparam()
+        if self._dpmodel is not None:
+            return self._dpmodel.get_dim_aparam()
+        return int(self.metadata["dim_aparam"])
 
     @property
     def model_type(self) -> type["DeepEvalWrapper"]:
         """The the evaluator of the model type."""
-        model_output_type = self._dpmodel.model_output_type()
+        if self._dpmodel is not None:
+            model_output_type = self._dpmodel.model_output_type()
+        else:
+            # Metadata-only mode: derive the output-type set from the
+            # fitting_output_defs names.  `model_output_type()` on a
+            # dpmodel is the same set — just the base output names, not
+            # their derived `*_redu` / `*_derv_*` twins.
+            model_output_type = [
+                d.name for d in self._model_output_def.def_outp.get_data().values()
+            ]
         if "energy" in model_output_type:
             return DeepPot
         elif "dos" in model_output_type:
@@ -250,7 +338,12 @@ def get_sel_type(self) -> list[int]:
         to the result of the model.
         If returning an empty list, all atom types are selected.
         """
-        return self._dpmodel.get_sel_type()
+        if self._dpmodel is not None:
+            return self._dpmodel.get_sel_type()
+        # Metadata-only mode: read the `sel_type` field populated by
+        # `_collect_metadata`.  Missing field → `[]` (every type
+        # selected), matching the dpmodel default for energy models.
+        return [int(t) for t in self.metadata.get("sel_type", [])]
 
     def get_numb_dos(self) -> int:
         """Get the number of DOS."""
@@ -266,9 +359,11 @@ def get_has_spin(self) -> bool:
 
     def get_use_spin(self) -> list[bool]:
         """Get the per-type spin usage of this model."""
-        if getattr(self, "_is_spin", False):
+        if not getattr(self, "_is_spin", False):
+            return []
+        if self._dpmodel is not None:
             return self._dpmodel.spin.use_spin.tolist()
-        return []
+        return [bool(v) for v in self.metadata.get("use_spin", [])]
 
     def get_ntypes_spin(self) -> int:
         """Get the number of spin atom types of this model. Only used in old implement."""
@@ -423,8 +518,11 @@ def _build_nlist_native(
         nframes = coords.shape[0]
         natoms = coords.shape[1]
         rcut = self.rcut
-        sel = self._dpmodel.get_sel()
-        mixed_types = self._dpmodel.mixed_types()
+        # ``self.sel`` / ``self.mixed_types`` are populated in both
+        # :meth:`_init_from_model_json` and :meth:`_init_from_metadata`,
+        # so this works whether or not ``model.json`` was available.
+        sel = self.sel
+        mixed_types = self.mixed_types
 
         if cells is not None:
             box_input = cells.reshape(nframes, 3, 3)
@@ -535,8 +633,8 @@ def _build_nlist_ase_single(
         nlist : np.ndarray, shape (nloc, nsel)
         mapping : np.ndarray, shape (nall,)
         """
-        sel = self._dpmodel.get_sel()
-        mixed_types = self._dpmodel.mixed_types()
+        sel = self.sel
+        mixed_types = self.mixed_types
         nsel = sum(sel)
 
         natoms = positions.shape[0]
@@ -995,13 +1093,44 @@ def get_model(self) -> torch.nn.Module:
         return self.exported_module
 
     def _is_spin_model(self) -> bool:
-        """Check if the underlying dpmodel is a SpinModel."""
+        """Check if the underlying model is a SpinModel.
+
+        Primary path: the :attr:`_is_spin` attribute set by the loaders
+        — this works for both ``model.json`` and metadata-only archives
+        (a spin ``.pt2`` carries ``is_spin=true`` in its metadata).
+
+        Legacy path: ``isinstance(_dpmodel, SpinModel)`` — retained for
+        tests that construct a non-spin archive and then swap
+        :attr:`_dpmodel` to a :class:`SpinModel` instance after load.
+        """
+        if bool(getattr(self, "_is_spin", False)):
+            return True
+        if self._dpmodel is None:
+            return False
         from deepmd.dpmodel.model.spin_model import (
             SpinModel,
         )
 
         return isinstance(self._dpmodel, SpinModel)
 
+    def _require_dpmodel(self, feature: str) -> None:
+        """Guard for features that need a deserialised dpmodel instance.
+
+        ``eval_descriptor`` / ``eval_typeebd`` / ``eval_fitting_last_layer``
+        all introspect the dpmodel's internal sub-modules, which requires
+        ``extra/model.json`` to have been present at load time.  Archives
+        shipped without ``model.json`` (metadata-only mode) can still run
+        the main ``eval`` inference path but cannot expose these hooks.
+        """
+        if self._dpmodel is None:
+            raise NotImplementedError(
+                f"{feature} requires the dpmodel instance, which is only "
+                "available when the .pt2 / .pte archive contains "
+                "'extra/model.json'. The loaded archive is metadata-only; "
+                "re-export with the full dpmodel serialisation to enable "
+                "this feature."
+            )
+
     def eval_typeebd(self) -> np.ndarray:
         """Evaluate type embedding.
 
@@ -1014,7 +1143,11 @@ def eval_typeebd(self) -> np.ndarray:
         ------
         KeyError
             If the model has no type embedding networks.
+        NotImplementedError
+            If the archive was loaded in metadata-only mode.
         """
+        self._require_dpmodel("eval_typeebd")
+
         from deepmd.dpmodel.utils.type_embed import TypeEmbedNet as TypeEmbedNetDP
 
         model = self._dpmodel
@@ -1058,6 +1191,8 @@ def eval_descriptor(
         np.ndarray
             Descriptor output, shape ``(nframes, nloc, dim_descrpt)``.
         """
+        self._require_dpmodel("eval_descriptor")
+
         coords = np.array(coords)
         atom_types = np.array(atom_types, dtype=np.int32)
         if cells is not None:
@@ -1124,6 +1259,8 @@ def eval_fitting_last_layer(
         np.ndarray
             Middle-layer output, shape ``(nframes, nloc, neuron[-1])``.
         """
+        self._require_dpmodel("eval_fitting_last_layer")
+
         coords = np.array(coords)
         atom_types = np.array(atom_types, dtype=np.int32)
         if cells is not None:
diff --git a/deepmd/pt_expt/utils/serialization.py b/deepmd/pt_expt/utils/serialization.py
@@ -247,7 +247,9 @@ def _collect_metadata(model: torch.nn.Module, is_spin: bool = False) -> dict:
                 "r_differentiable": vdef.r_differentiable,
                 "c_differentiable": vdef.c_differentiable,
                 "atomic": vdef.atomic,
-                "category": vdef.category,
+                # OutputVariableCategory is an IntEnum; force plain int for
+                # deterministic JSON serialisation across Python versions.
+                "category": int(vdef.category),
                 "r_hessian": vdef.r_hessian,
                 "magnetic": vdef.magnetic,
                 "intensive": vdef.intensive,
@@ -263,6 +265,10 @@ def _collect_metadata(model: torch.nn.Module, is_spin: bool = False) -> dict:
         "has_default_fparam": model.has_default_fparam(),
         "default_fparam": model.get_default_fparam(),
         "fitting_output_defs": fitting_output_defs,
+        # sel_type enables `DeepEval.get_sel_type()` without a dpmodel
+        # round-trip; required for dipole/polar/wfc models in metadata-only
+        # inference (energy models return []).
+        "sel_type": [int(t) for t in model.get_sel_type()],
         "is_spin": is_spin,
     }
     if is_spin:
diff --git a/source/tests/pt_expt/infer/test_deep_eval_metadata_only.py b/source/tests/pt_expt/infer/test_deep_eval_metadata_only.py