From e4303faf8655076430321b5df70df2b0b93a00d9 Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Mon, 20 Apr 2026 20:15:10 +0800
Subject: [PATCH 1/3] feat(pt_expt): add LMDB dataset support

Mirror the pt backend's LMDB plumbing (#5283) for pt_expt so users can
train against .lmdb directories on the new torch.export-based stack.
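The frame-collation step is implemented once in dpmodel via
array_api_compat (so future jax/paddle backends inherit it), and pt's
existing _collate_lmdb_batch is refactored to delegate to it.

Side effect of the pt_expt refactor: validation data is now built through
the same _build_data_system helper as training data, so sys_probs and
auto_prob from the validation config are forwarded to DeepmdDataSystem
(the previous inline code did not pass them).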
---
 deepmd/dpmodel/utils/lmdb_data.py          |  35 +++
 deepmd/pt/utils/lmdb_dataset.py            |  49 ++--
 deepmd/pt_expt/entrypoints/main.py         |  96 ++++----
 deepmd/pt_expt/utils/lmdb_dataset.py       | 128 +++++++++++
 source/tests/pt_expt/test_lmdb_training.py | 249 +++++++++++++++++++++
 5 files changed, 486 insertions(+), 71 deletions(-)
 create mode 100644 deepmd/pt_expt/utils/lmdb_dataset.py
 create mode 100644 source/tests/pt_expt/test_lmdb_training.py

diff --git a/deepmd/dpmodel/utils/lmdb_data.py b/deepmd/dpmodel/utils/lmdb_data.py
index 243d4f525d..656532fd1e 100644
--- a/deepmd/dpmodel/utils/lmdb_data.py
+++ b/deepmd/dpmodel/utils/lmdb_data.py
@@ -608,6 +608,41 @@ def system_nframes(self) -> list[int]:
         return self._system_nframes
 
 
+def collate_lmdb_frames(frames: list[dict[str, Any]]) -> dict[str, Any]:
+    """Stack a list of per-frame dicts into a single batch dict.
+
+    Backend-agnostic via ``array_api_compat``: works for numpy, torch, jax,
+    etc. The array library is inferred from the first frame's ``coord``.
+
+    Conventions match :func:`deepmd.dpmodel.utils.batch.normalize_batch`:
+    ``find_*`` flags are taken from the first frame (constant within a
+    batch); ``fid`` is collected as a list; ``type`` is dropped (callers
+    should already use ``atype``); ``None`` values (per the first frame) stay
+    ``None``; other arrays are stacked along axis 0; a ``sid`` placeholder is added.
+    """
+    import array_api_compat
+
+    if not frames:
+        raise ValueError("collate_lmdb_frames requires at least one frame")
+
+    xp = array_api_compat.array_namespace(frames[0]["coord"])
+    dev = array_api_compat.device(frames[0]["coord"])
+    out: dict[str, Any] = {}
+    for key in frames[0]:
+        if key.startswith("find_"):
+            out[key] = frames[0][key]
+        elif key == "fid":
+            out[key] = [f[key] for f in frames]
+        elif key == "type":
+            continue
+        elif frames[0][key] is None:
+            out[key] = None
+        else:
+            out[key] = xp.stack([f[key] for f in frames])
+    out["sid"] = xp.asarray([0], dtype=xp.int64, device=dev)
+    return out
+
+
 def compute_block_targets(
     auto_prob_style: str,
     nsystems: int,
diff --git a/deepmd/pt/utils/lmdb_dataset.py b/deepmd/pt/utils/lmdb_dataset.py
index 44d67be242..ac7b1f0e3d 100644
--- a/deepmd/pt/utils/lmdb_dataset.py
+++ b/deepmd/pt/utils/lmdb_dataset.py
@@ -15,14 +15,12 @@
     Dataset,
     Sampler,
 )
-from torch.utils.data._utils.collate import (
-    collate_tensor_fn,
-)
 
 from deepmd.dpmodel.utils.lmdb_data import (
     LmdbDataReader,
     LmdbTestData,
     SameNlocBatchSampler,
+    collate_lmdb_frames,
     compute_block_targets,
     is_lmdb,
 )
@@ -42,13 +40,17 @@
 
 
 def _collate_lmdb_batch(batch: list[dict[str, Any]]) -> dict[str, Any]:
-    """Collate a list of frame dicts into a batch dict.
+    """Collate a list of frame dicts into a torch batch dict.
 
-    All frames in the batch must have the same nloc (enforced by
-    SameNlocBatchSampler when mixed_batch=False).
+    Pre-converts per-frame numpy arrays to CPU torch tensors (zero-copy when
+    dtype matches) and delegates stacking to the backend-agnostic
+    :func:`collate_lmdb_frames`. With torch tensors as input, the shared
+    collate yields a torch dict (``sid`` becomes a torch tensor automatically
+    via ``array_api_compat``).
 
-    For mixed_batch=True, this function would need padding + mask.
-    Currently raises NotImplementedError for that case.
+    All frames in the batch must have the same nloc (enforced by
+    SameNlocBatchSampler when mixed_batch=False). For mixed_batch=True,
+    raises NotImplementedError.
     """
     if len(batch) > 1:
         atypes = [d.get("atype") for d in batch if d.get("atype") is not None]
@@ -59,24 +61,19 @@ def _collate_lmdb_batch(batch: list[dict[str, Any]]) -> dict[str, Any]:
                 "Padding + mask in collate_fn needed."
             )
 
-    example = batch[0]
-    result: dict[str, Any] = {}
-    for key in example:
-        if "find_" in key:
-            result[key] = batch[0][key]
-        elif key == "fid":
-            result[key] = [d[key] for d in batch]
-        elif key == "type":
-            continue
-        elif batch[0][key] is None:
-            result[key] = None
-        else:
-            with torch.device("cpu"):
-                result[key] = collate_tensor_fn(
-                    [torch.as_tensor(d[key]) for d in batch]
-                )
-    result["sid"] = torch.tensor([0], dtype=torch.long, device="cpu")
-    return result
+    with torch.device("cpu"):
+        torch_frames: list[dict[str, Any]] = []
+        for f in batch:
+            tf: dict[str, Any] = {}
+            for key, val in f.items():
+                if key.startswith("find_") or key == "fid" or key == "type":
+                    tf[key] = val
+                elif val is None:
+                    tf[key] = None
+                else:
+                    tf[key] = torch.as_tensor(val)
+            torch_frames.append(tf)
+        return collate_lmdb_frames(torch_frames)
 
 
 class _SameNlocBatchSamplerTorch(Sampler):
diff --git a/deepmd/pt_expt/entrypoints/main.py b/deepmd/pt_expt/entrypoints/main.py
index 40302ee7b3..9f6d41f1fb 100644
--- a/deepmd/pt_expt/entrypoints/main.py
+++ b/deepmd/pt_expt/entrypoints/main.py
@@ -14,9 +14,15 @@
 
 import h5py
 
+from deepmd.dpmodel.utils.lmdb_data import (
+    is_lmdb,
+)
 from deepmd.pt_expt.train import (
     training,
 )
+from deepmd.pt_expt.utils.lmdb_dataset import (
+    LmdbDataSystem,
+)
 from deepmd.utils.argcheck import (
     normalize,
 )
@@ -35,6 +41,41 @@
 log = logging.getLogger(__name__)
 
 
+def _build_data_system(
+    dataset_params: dict[str, Any],
+    type_map: list[str],
+    seed: int | None = None,
+) -> DeepmdDataSystem | LmdbDataSystem:
+    """Build a data system from dataset config, routing LMDB paths to LmdbDataSystem.
+
+    A scalar ``systems`` value pointing at an LMDB directory triggers the
+    LMDB adapter; otherwise we fall through to the legacy
+    :class:`DeepmdDataSystem` path with system expansion.
+    """
+    systems_raw = dataset_params["systems"]
+    if isinstance(systems_raw, str) and is_lmdb(systems_raw):
+        return LmdbDataSystem(
+            lmdb_path=systems_raw,
+            type_map=type_map,
+            batch_size=dataset_params["batch_size"],
+            auto_prob_style=dataset_params.get("auto_prob"),
+            seed=seed,
+        )
+    systems = process_systems(
+        systems_raw,
+        patterns=dataset_params.get("rglob_patterns", None),
+    )
+    return DeepmdDataSystem(
+        systems=systems,
+        batch_size=dataset_params["batch_size"],
+        test_size=1,
+        type_map=type_map,
+        trn_all_set=True,
+        sys_probs=dataset_params.get("sys_probs", None),
+        auto_prob_style=dataset_params.get("auto_prob", "prob_sys_size"),
+    )
+
+
 def get_trainer(
     config: dict[str, Any],
     init_model: str | None = None,
@@ -48,39 +89,23 @@ def get_trainer(
     training_params = config["training"]
     multi_task = "model_dict" in model_params
 
+    data_seed = training_params.get("seed", None)
+
     if not multi_task:
         type_map = model_params["type_map"]
 
         # ----- training data ------------------------------------------------
         training_dataset_params = training_params["training_data"]
-        training_systems = process_systems(
-            training_dataset_params["systems"],
-            patterns=training_dataset_params.get("rglob_patterns", None),
-        )
-        train_data = DeepmdDataSystem(
-            systems=training_systems,
-            batch_size=training_dataset_params["batch_size"],
-            test_size=1,
-            type_map=type_map,
-            trn_all_set=True,
-            sys_probs=training_dataset_params.get("sys_probs", None),
-            auto_prob_style=training_dataset_params.get("auto_prob", "prob_sys_size"),
+        train_data = _build_data_system(
+            training_dataset_params, type_map, seed=data_seed
         )
 
         # ----- validation data ----------------------------------------------
         validation_data = None
         validation_dataset_params = training_params.get("validation_data", None)
         if validation_dataset_params is not None:
-            val_systems = process_systems(
-                validation_dataset_params["systems"],
-                patterns=validation_dataset_params.get("rglob_patterns", None),
-            )
-            validation_data = DeepmdDataSystem(
-                systems=val_systems,
-                batch_size=validation_dataset_params["batch_size"],
-                test_size=1,
-                type_map=type_map,
-                trn_all_set=True,
+            validation_data = _build_data_system(
+                validation_dataset_params, type_map, seed=data_seed
             )
 
         # ----- stat file path -----------------------------------------------
@@ -103,34 +128,15 @@ def get_trainer(
             data_params = training_params["data_dict"][model_key]
 
             # training data
-            td_params = data_params["training_data"]
-            training_systems = process_systems(
-                td_params["systems"],
-                patterns=td_params.get("rglob_patterns", None),
-            )
-            train_data[model_key] = DeepmdDataSystem(
-                systems=training_systems,
-                batch_size=td_params["batch_size"],
-                test_size=1,
-                type_map=type_map,
-                trn_all_set=True,
-                sys_probs=td_params.get("sys_probs", None),
-                auto_prob_style=td_params.get("auto_prob", "prob_sys_size"),
+            train_data[model_key] = _build_data_system(
+                data_params["training_data"], type_map, seed=data_seed
             )
 
             # validation data
             vd_params = data_params.get("validation_data", None)
             if vd_params is not None:
-                val_systems = process_systems(
-                    vd_params["systems"],
-                    patterns=vd_params.get("rglob_patterns", None),
-                )
-                validation_data[model_key] = DeepmdDataSystem(
-                    systems=val_systems,
-                    batch_size=vd_params["batch_size"],
-                    test_size=1,
-                    type_map=type_map,
-                    trn_all_set=True,
+                validation_data[model_key] = _build_data_system(
+                    vd_params, type_map, seed=data_seed
                 )
             else:
                 validation_data[model_key] = None
diff --git a/deepmd/pt_expt/utils/lmdb_dataset.py b/deepmd/pt_expt/utils/lmdb_dataset.py
new file mode 100644
index 0000000000..4e61714cc6
--- /dev/null
+++ b/deepmd/pt_expt/utils/lmdb_dataset.py
@@ -0,0 +1,128 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+"""LMDB data adapter for the pt_expt backend.
+
+pt_expt does not use ``torch.utils.data.DataLoader``; its trainer calls
+``data_sys.get_batch()`` directly and expects a numpy dict in the
+``DeepmdDataSystem`` shape (the shape consumed by
+``deepmd.dpmodel.utils.batch.normalize_batch``). This module provides a thin
+wrapper around the framework-agnostic :class:`LmdbDataReader` that satisfies
+that interface.
+"""
+
+import logging
+from typing import (
+    Any,
+)
+
+from deepmd.dpmodel.utils.lmdb_data import (
+    LmdbDataReader,
+    SameNlocBatchSampler,
+    collate_lmdb_frames,
+    compute_block_targets,
+)
+from deepmd.utils.data import (
+    DataRequirementItem,
+)
+
+log = logging.getLogger(__name__)
+
+__all__ = ["LmdbDataSystem"]
+
+
+class LmdbDataSystem:
+    """LMDB-backed data system for pt_expt.
+
+    Exposes the small surface that pt_expt's trainer touches:
+    ``get_batch(sys_idx=None)``, ``add_data_requirements(list)``, and
+    ``get_nsystems()``. Internally uses :class:`LmdbDataReader` for I/O and
+    :class:`SameNlocBatchSampler` to draw same-nloc batches.
+
+    Parameters
+    ----------
+    lmdb_path
+        Path to the LMDB directory.
+    type_map
+        Global type map from the model config.
+    batch_size
+        Batch size spec; ``int``, ``"auto"``, or ``"auto:N"``.
+    auto_prob_style
+        Optional ``auto_prob`` string (e.g. ``"prob_sys_size"``) for
+        per-system reweighting via :func:`compute_block_targets`.
+    seed
+        Optional seed for the shuffle in :class:`SameNlocBatchSampler`.
+    """
+
+    def __init__(
+        self,
+        lmdb_path: str,
+        type_map: list[str],
+        batch_size: int | str = "auto",
+        auto_prob_style: str | None = None,
+        seed: int | None = None,
+    ) -> None:
+        self._reader = LmdbDataReader(
+            lmdb_path, type_map, batch_size, mixed_batch=False
+        )
+
+        block_targets = None
+        if auto_prob_style is not None and self._reader.frame_system_ids is not None:
+            block_targets = compute_block_targets(
+                auto_prob_style,
+                self._reader.nsystems,
+                self._reader.system_nframes,
+            )
+
+        self._sampler = SameNlocBatchSampler(
+            self._reader,
+            shuffle=True,
+            seed=seed,
+            block_targets=block_targets,
+        )
+        self._iter = iter(self._sampler)
+
+    # ------------------------------------------------------------------
+    # pt_expt trainer surface
+    # ------------------------------------------------------------------
+
+    def get_batch(self, sys_idx: int | None = None) -> dict[str, Any]:
+        """Return one batch as a numpy dict.
+
+        ``sys_idx`` is accepted for API compatibility but ignored: per-system
+        sampling is baked into ``block_targets`` at sampler construction.
+        """
+        del sys_idx
+        try:
+            indices = next(self._iter)
+        except StopIteration:
+            self._iter = iter(self._sampler)
+            indices = next(self._iter)
+        frames = [self._reader[int(i)] for i in indices]
+        return collate_lmdb_frames(frames)
+
+    def add_data_requirements(
+        self, data_requirement: list[DataRequirementItem]
+    ) -> None:
+        self._reader.add_data_requirement(data_requirement)
+
+    def get_nsystems(self) -> int:
+        """Return 1: pt_expt's stat collection treats LMDB as a single system.
+
+        Per-system sampling within the LMDB is handled by
+        ``SameNlocBatchSampler`` + ``block_targets``.
+        """
+        return 1
+
+    # ------------------------------------------------------------------
+    # Misc forwarders
+    # ------------------------------------------------------------------
+
+    @property
+    def type_map(self) -> list[str]:
+        return self._reader._type_map
+
+    @property
+    def mixed_type(self) -> bool:
+        return True
+
+    def print_summary(self, name: str, prob: Any = None) -> None:
+        self._reader.print_summary(name, prob)
diff --git a/source/tests/pt_expt/test_lmdb_training.py b/source/tests/pt_expt/test_lmdb_training.py
new file mode 100644
index 0000000000..94673a8761
--- /dev/null
+++ b/source/tests/pt_expt/test_lmdb_training.py
@@ -0,0 +1,249 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+"""LMDB-format dataset support for the pt_expt backend.
+
+Covers:
+
+- :class:`LmdbDataSystem.get_batch` returns numpy arrays in the shape that
+  :func:`normalize_batch` consumes.
+- ``get_trainer()`` routes an LMDB ``systems`` path through
+  :class:`LmdbDataSystem` and runs a few training steps.
+"""
+
+import os
+import shutil
+import tempfile
+import unittest
+
+import lmdb
+import msgpack
+import numpy as np
+
+from deepmd.dpmodel.utils.batch import (
+    normalize_batch,
+    split_batch,
+)
+from deepmd.pt_expt.entrypoints.main import (
+    get_trainer,
+)
+from deepmd.pt_expt.utils.lmdb_dataset import (
+    LmdbDataSystem,
+)
+from deepmd.utils.argcheck import (
+    normalize,
+)
+from deepmd.utils.compat import (
+    update_deepmd_input,
+)
+
+
+def _encode_array(arr: np.ndarray) -> dict:
+    return {
+        "nd": None,
+        "type": str(arr.dtype),
+        "kind": "",
+        "shape": list(arr.shape),
+        "data": arr.tobytes(),
+    }
+
+
+def _make_frame(natoms: int, seed: int) -> dict:
+    """Synthetic LMDB frame matching the on-disk schema used by LmdbDataReader."""
+    rng = np.random.RandomState(seed)
+    half = natoms // 2
+    return {
+        "atom_numbs": [half, natoms - half],
+        "atom_names": ["O", "H"],
+        "atom_types": _encode_array(
+            np.array([0] * half + [1] * (natoms - half), dtype=np.int64)
+        ),
+        "orig": _encode_array(np.zeros(3, dtype=np.float64)),
+        "cells": _encode_array((np.eye(3) * 10.0).astype(np.float64)),
+        "coords": _encode_array((rng.rand(natoms, 3) * 10.0).astype(np.float64)),
+        "energies": _encode_array(np.array(rng.randn(), dtype=np.float64)),
+        "forces": _encode_array(rng.randn(natoms, 3).astype(np.float64)),
+    }
+
+
+def _create_test_lmdb(path: str, nframes: int, natoms: int) -> None:
+    """Write a minimal LMDB containing *nframes* frames of *natoms* atoms each."""
+    env = lmdb.open(path, map_size=10 * 1024 * 1024)
+    fmt = "012d"
+    metadata = {
+        "nframes": nframes,
+        "frame_idx_fmt": fmt,
+        "system_info": {
+            "formula": f"O{natoms // 2}H{natoms - natoms // 2}",
+            "natoms": [natoms // 2, natoms - natoms // 2],
+            "nframes": nframes,
+        },
+    }
+    with env.begin(write=True) as txn:
+        txn.put(b"__metadata__", msgpack.packb(metadata, use_bin_type=True))
+        for i in range(nframes):
+            key = format(i, fmt).encode()
+            txn.put(key, msgpack.packb(_make_frame(natoms, i), use_bin_type=True))
+    env.close()
+
+
+class TestLmdbDataSystemGetBatch(unittest.TestCase):
+    """LmdbDataSystem.get_batch produces a numpy dict that normalize_batch accepts."""
+
+    def setUp(self) -> None:
+        self.tmpdir = tempfile.mkdtemp()
+        self.lmdb_path = os.path.join(self.tmpdir, "test.lmdb")
+        _create_test_lmdb(self.lmdb_path, nframes=8, natoms=6)
+
+    def tearDown(self) -> None:
+        shutil.rmtree(self.tmpdir, ignore_errors=True)
+
+    def test_get_batch_shape_and_normalize(self) -> None:
+        ds = LmdbDataSystem(
+            lmdb_path=self.lmdb_path,
+            type_map=["O", "H"],
+            batch_size=2,
+            seed=0,
+        )
+        batch = ds.get_batch()
+        # Required structural keys.
+        for key in ("coord", "atype", "force", "energy", "natoms"):
+            self.assertIn(key, batch, f"missing {key}")
+        # NumPy arrays (not torch tensors) — pt_expt converts at the trainer
+        # boundary.
+        self.assertIsInstance(batch["coord"], np.ndarray)
+        self.assertIsInstance(batch["atype"], np.ndarray)
+        self.assertEqual(batch["coord"].shape, (2, 6, 3))
+        self.assertEqual(batch["atype"].shape, (2, 6))
+        self.assertEqual(batch["natoms"].shape, (2, 4))  # nloc, nloc, n_O, n_H
+
+        # normalize_batch must accept the dict and produce input/label splits
+        # without raising.
+        norm = normalize_batch(batch)
+        inputs, labels = split_batch(norm)
+        self.assertIn("coord", inputs)
+        self.assertIn("atype", inputs)
+        self.assertIn("force", labels)
+        self.assertIn("natoms", labels)
+
+    def test_get_batch_iterates_past_end(self) -> None:
+        """get_batch restarts the sampler iterator once an epoch is exhausted."""
+        ds = LmdbDataSystem(
+            lmdb_path=self.lmdb_path,
+            type_map=["O", "H"],
+            batch_size=2,
+            seed=0,
+        )
+        # 8 frames / batch 2 = 4 batches per epoch; pull more than that.
+        for _ in range(10):
+            batch = ds.get_batch()
+            self.assertEqual(batch["coord"].shape, (2, 6, 3))
+
+    def test_add_data_requirements_passthrough(self) -> None:
+        from deepmd.utils.data import (
+            DataRequirementItem,
+        )
+
+        ds = LmdbDataSystem(
+            lmdb_path=self.lmdb_path,
+            type_map=["O", "H"],
+            batch_size=1,
+            seed=0,
+        )
+        ds.add_data_requirements(
+            [
+                DataRequirementItem(
+                    "energy", ndof=1, atomic=False, must=False, high_prec=True
+                ),
+            ]
+        )
+        batch = ds.get_batch()
+        self.assertIn("energy", batch)
+        self.assertIn("find_energy", batch)
+
+
+class TestLmdbTrainingLoop(unittest.TestCase):
+    """End-to-end: get_trainer routes an LMDB path and runs training steps."""
+
+    @classmethod
+    def setUpClass(cls) -> None:
+        cls.tmpdir = tempfile.mkdtemp()
+        cls.lmdb_path = os.path.join(cls.tmpdir, "train.lmdb")
+        cls.val_lmdb_path = os.path.join(cls.tmpdir, "val.lmdb")
+        _create_test_lmdb(cls.lmdb_path, nframes=8, natoms=6)
+        _create_test_lmdb(cls.val_lmdb_path, nframes=4, natoms=6)
+
+    @classmethod
+    def tearDownClass(cls) -> None:
+        shutil.rmtree(cls.tmpdir, ignore_errors=True)
+
+    def _make_lmdb_config(self, numb_steps: int = 3) -> dict:
+        return {
+            "model": {
+                "type_map": ["O", "H"],
+                "descriptor": {
+                    "type": "se_e2_a",
+                    "sel": [6, 12],
+                    "rcut_smth": 0.50,
+                    "rcut": 3.00,
+                    "neuron": [8, 16],
+                    "resnet_dt": False,
+                    "axis_neuron": 4,
+                    "type_one_side": True,
+                    "seed": 1,
+                },
+                "fitting_net": {
+                    "neuron": [16, 16],
+                    "resnet_dt": True,
+                    "seed": 1,
+                },
+                "data_stat_nbatch": 1,
+            },
+            "learning_rate": {
+                "type": "exp",
+                "decay_steps": 500,
+                "start_lr": 0.001,
+                "stop_lr": 3.51e-8,
+            },
+            "loss": {
+                "type": "ener",
+                "start_pref_e": 0.02,
+                "limit_pref_e": 1,
+                "start_pref_f": 1000,
+                "limit_pref_f": 1,
+                "start_pref_v": 0,
+                "limit_pref_v": 0,
+            },
+            "training": {
+                "training_data": {
+                    "systems": self.lmdb_path,
+                    "batch_size": 1,
+                },
+                "validation_data": {
+                    "systems": self.val_lmdb_path,
+                    "batch_size": 1,
+                    "numb_btch": 1,
+                },
+                "numb_steps": numb_steps,
+                "seed": 10,
+                "disp_file": "lcurve.out",
+                "disp_freq": numb_steps,
+                "save_freq": numb_steps,
+            },
+        }
+
+    def test_get_trainer_routes_lmdb(self) -> None:
+        config = self._make_lmdb_config(numb_steps=3)
+        config = update_deepmd_input(config, warning=False)
+        config = normalize(config)
+
+        cwd = os.getcwd()
+        os.chdir(self.tmpdir)
+        try:
+            trainer = get_trainer(config)
+            self.assertIsInstance(trainer.training_data, LmdbDataSystem)
+            trainer.run()
+        finally:
+            os.chdir(cwd)
+
+
+if __name__ == "__main__":
+    unittest.main()

From 5bc00614b48a807e78278a966b647a7ef6773c1d Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Mon, 20 Apr 2026 22:19:59 +0800
Subject: [PATCH 2/3] fix(pt_expt): address PR #5408 reviewer feedback

- route LMDB paths through make_neighbor_stat_data in the CLI neighbor-stat
  step, so dp --pt_expt train no longer crashes before reaching get_trainer
  (reported by chatgpt-codex-connector)
- reject list-form 'systems' containing LMDB paths with a clear ValueError
  instead of a confusing DeepmdData failure
- expose LmdbDataReader.type_map as a public property and stop reaching
  into the private _type_map from the pt_expt adapter
- drop the unused logging global flagged by CodeQL
---
 deepmd/dpmodel/utils/lmdb_data.py    |  5 ++++
 deepmd/pt_expt/entrypoints/main.py   | 36 ++++++++++++++++++++++++----
 deepmd/pt_expt/utils/lmdb_dataset.py |  5 +---
 3 files changed, 37 insertions(+), 9 deletions(-)

diff --git a/deepmd/dpmodel/utils/lmdb_data.py b/deepmd/dpmodel/utils/lmdb_data.py
index 656532fd1e..c2ce362a50 100644
--- a/deepmd/dpmodel/utils/lmdb_data.py
+++ b/deepmd/dpmodel/utils/lmdb_data.py
@@ -577,6 +577,11 @@ def mixed_type(self) -> bool:
         """LMDB datasets are always mixed_type (frames may have different compositions)."""
         return True
 
+    @property
+    def type_map(self) -> list[str]:
+        """Model-side type map used when constructing the reader."""
+        return self._type_map
+
     @property
     def nloc_groups(self) -> dict[int, list[int]]:
         """Nloc → list of frame indices."""
diff --git a/deepmd/pt_expt/entrypoints/main.py b/deepmd/pt_expt/entrypoints/main.py
index 9f6d41f1fb..9068de6abe 100644
--- a/deepmd/pt_expt/entrypoints/main.py
+++ b/deepmd/pt_expt/entrypoints/main.py
@@ -41,6 +41,26 @@
 log = logging.getLogger(__name__)
 
 
+def _get_neighbor_stat_data(
+    dataset_params: dict[str, Any],
+    type_map: list[str] | None,
+) -> Any:
+    """Return a data proxy suitable for ``BaseModel.update_sel`` (neighbor stat).
+
+    Routes a scalar LMDB ``systems`` path through dpmodel's
+    ``make_neighbor_stat_data``; falls back to the legacy ``get_data`` for
+    npy/HDF5 directories.
+    """
+    systems_raw = dataset_params.get("systems")
+    if isinstance(systems_raw, str) and is_lmdb(systems_raw):
+        from deepmd.dpmodel.utils.lmdb_data import (
+            make_neighbor_stat_data,
+        )
+
+        return make_neighbor_stat_data(systems_raw, type_map)
+    return get_data(dataset_params, 0, type_map, None)
+
+
 def _build_data_system(
     dataset_params: dict[str, Any],
     type_map: list[str],
@@ -61,6 +81,14 @@ def _build_data_system(
             auto_prob_style=dataset_params.get("auto_prob"),
             seed=seed,
         )
+    if isinstance(systems_raw, list) and any(
+        isinstance(s, str) and is_lmdb(s) for s in systems_raw
+    ):
+        raise ValueError(
+            "LMDB datasets must be passed as a scalar 'systems' string "
+            "(e.g. 'systems': '/path/to/data.lmdb'); list-form systems "
+            "with LMDB paths are not supported."
+        )
     systems = process_systems(
         systems_raw,
         patterns=dataset_params.get("rglob_patterns", None),
@@ -267,8 +295,8 @@ def train(
 
         if not multi_task:
             type_map = config["model"].get("type_map")
-            train_data = get_data(
-                config["training"]["training_data"], 0, type_map, None
+            train_data = _get_neighbor_stat_data(
+                config["training"]["training_data"], type_map
             )
             config["model"], _ = BaseModel.update_sel(
                 train_data, type_map, config["model"]
@@ -276,11 +304,9 @@ def train(
         else:
             for model_key in config["model"]["model_dict"]:
                 type_map = config["model"]["model_dict"][model_key]["type_map"]
-                train_data = get_data(
+                train_data = _get_neighbor_stat_data(
                     config["training"]["data_dict"][model_key]["training_data"],
-                    0,
                     type_map,
-                    None,
                 )
                 config["model"]["model_dict"][model_key], _ = BaseModel.update_sel(
                     train_data,
diff --git a/deepmd/pt_expt/utils/lmdb_dataset.py b/deepmd/pt_expt/utils/lmdb_dataset.py
index 4e61714cc6..4b279880e1 100644
--- a/deepmd/pt_expt/utils/lmdb_dataset.py
+++ b/deepmd/pt_expt/utils/lmdb_dataset.py
@@ -9,7 +9,6 @@
 that interface.
 """
 
-import logging
 from typing import (
     Any,
 )
@@ -24,8 +23,6 @@
     DataRequirementItem,
 )
 
-log = logging.getLogger(__name__)
-
 __all__ = ["LmdbDataSystem"]
 
 
@@ -118,7 +115,7 @@ def get_nsystems(self) -> int:
 
     @property
     def type_map(self) -> list[str]:
-        return self._reader._type_map
+        return self._reader.type_map
 
     @property
     def mixed_type(self) -> bool:

From afa4054c01537ee515a41b0b14a00ebc74b050d9 Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Mon, 20 Apr 2026 23:01:45 +0800
Subject: [PATCH 3/3] fix(pt_expt): share LMDB list-rejection between
 neighbor-stat and trainer

The list-form 'systems' check added in 5bc00614 only lived in
_build_data_system, but the CLI train() flow hits _get_neighbor_stat_data
first. A list containing an LMDB path therefore fell through to get_data
with an opaque error before the nice ValueError could fire. Extract a
shared _detect_lmdb_path helper so both paths fail fast with the same
message.
---
 deepmd/pt_expt/entrypoints/main.py | 41 ++++++++++++++++++++----------
 1 file changed, 28 insertions(+), 13 deletions(-)

diff --git a/deepmd/pt_expt/entrypoints/main.py b/deepmd/pt_expt/entrypoints/main.py
index 9068de6abe..4dd6c81d58 100644
--- a/deepmd/pt_expt/entrypoints/main.py
+++ b/deepmd/pt_expt/entrypoints/main.py
@@ -41,6 +41,28 @@
 log = logging.getLogger(__name__)
 
 
+def _detect_lmdb_path(systems_raw: Any) -> str | None:
+    """Return the LMDB path when ``systems_raw`` is a scalar LMDB string.
+
+    Returns ``None`` for non-LMDB inputs. Raises ``ValueError`` if
+    ``systems_raw`` is a list containing any LMDB path, so both
+    ``_get_neighbor_stat_data`` and ``_build_data_system`` fail with the
+    same clear message instead of the opaque error from
+    :func:`process_systems` / :class:`DeepmdData`.
+    """
+    if isinstance(systems_raw, str) and is_lmdb(systems_raw):
+        return systems_raw
+    if isinstance(systems_raw, list) and any(
+        isinstance(s, str) and is_lmdb(s) for s in systems_raw
+    ):
+        raise ValueError(
+            "LMDB datasets must be passed as a scalar 'systems' string "
+            "(e.g. 'systems': '/path/to/data.lmdb'); list-form systems "
+            "with LMDB paths are not supported."
+        )
+    return None
+
+
 def _get_neighbor_stat_data(
     dataset_params: dict[str, Any],
     type_map: list[str] | None,
@@ -51,13 +73,13 @@ def _get_neighbor_stat_data(
     ``make_neighbor_stat_data``; falls back to the legacy ``get_data`` for
     npy/HDF5 directories.
     """
-    systems_raw = dataset_params.get("systems")
-    if isinstance(systems_raw, str) and is_lmdb(systems_raw):
+    lmdb_path = _detect_lmdb_path(dataset_params.get("systems"))
+    if lmdb_path is not None:
         from deepmd.dpmodel.utils.lmdb_data import (
             make_neighbor_stat_data,
         )
 
-        return make_neighbor_stat_data(systems_raw, type_map)
+        return make_neighbor_stat_data(lmdb_path, type_map)
     return get_data(dataset_params, 0, type_map, None)
 
 
@@ -73,22 +95,15 @@ def _build_data_system(
     :class:`DeepmdDataSystem` path with system expansion.
     """
     systems_raw = dataset_params["systems"]
-    if isinstance(systems_raw, str) and is_lmdb(systems_raw):
+    lmdb_path = _detect_lmdb_path(systems_raw)
+    if lmdb_path is not None:
         return LmdbDataSystem(
-            lmdb_path=systems_raw,
+            lmdb_path=lmdb_path,
             type_map=type_map,
             batch_size=dataset_params["batch_size"],
             auto_prob_style=dataset_params.get("auto_prob"),
             seed=seed,
         )
-    if isinstance(systems_raw, list) and any(
-        isinstance(s, str) and is_lmdb(s) for s in systems_raw
-    ):
-        raise ValueError(
-            "LMDB datasets must be passed as a scalar 'systems' string "
-            "(e.g. 'systems': '/path/to/data.lmdb'); list-form systems "
-            "with LMDB paths are not supported."
-        )
     systems = process_systems(
         systems_raw,
         patterns=dataset_params.get("rglob_patterns", None),