From e4303faf8655076430321b5df70df2b0b93a00d9 Mon Sep 17 00:00:00 2001 From: Han Wang Date: Mon, 20 Apr 2026 20:15:10 +0800 Subject: [PATCH 1/3] feat(pt_expt): add LMDB dataset support Mirror the pt backend's LMDB plumbing (#5283) for pt_expt so users can train against .lmdb directories on the new torch.export-based stack. The frame-collation step is implemented once in dpmodel via array_api_compat (so future jax/paddle backends inherit it), and pt's existing _collate_lmdb_batch is refactored to delegate to it. --- deepmd/dpmodel/utils/lmdb_data.py | 35 +++ deepmd/pt/utils/lmdb_dataset.py | 49 ++-- deepmd/pt_expt/entrypoints/main.py | 96 ++++---- deepmd/pt_expt/utils/lmdb_dataset.py | 128 +++++++++++ source/tests/pt_expt/test_lmdb_training.py | 249 +++++++++++++++++++++ 5 files changed, 486 insertions(+), 71 deletions(-) create mode 100644 deepmd/pt_expt/utils/lmdb_dataset.py create mode 100644 source/tests/pt_expt/test_lmdb_training.py diff --git a/deepmd/dpmodel/utils/lmdb_data.py b/deepmd/dpmodel/utils/lmdb_data.py index 243d4f525d..656532fd1e 100644 --- a/deepmd/dpmodel/utils/lmdb_data.py +++ b/deepmd/dpmodel/utils/lmdb_data.py @@ -608,6 +608,41 @@ def system_nframes(self) -> list[int]: return self._system_nframes +def collate_lmdb_frames(frames: list[dict[str, Any]]) -> dict[str, Any]: + """Stack a list of per-frame dicts into a single batch dict. + + Backend-agnostic via ``array_api_compat``: works for numpy, torch, jax, + etc. The array library is inferred from the first frame's ``coord``. + + Conventions match :func:`deepmd.dpmodel.utils.batch.normalize_batch`: + ``find_*`` flags are taken from the first frame (constant within a + batch); ``fid`` is collected as a list; ``type`` is dropped (callers + should already use ``atype``); other arrays are stacked along axis 0. + A ``sid`` placeholder is appended. + """ + import array_api_compat + + if not frames: + raise ValueError("collate_lmdb_frames requires at least one frame") + + xp = array_api_compat.array_namespace(frames[0]["coord"]) + dev = array_api_compat.device(frames[0]["coord"]) + out: dict[str, Any] = {} + for key in frames[0]: + if key.startswith("find_"): + out[key] = frames[0][key] + elif key == "fid": + out[key] = [f[key] for f in frames] + elif key == "type": + continue + elif frames[0][key] is None: + out[key] = None + else: + out[key] = xp.stack([f[key] for f in frames]) + out["sid"] = xp.asarray([0], dtype=xp.int64, device=dev) + return out + + def compute_block_targets( auto_prob_style: str, nsystems: int, diff --git a/deepmd/pt/utils/lmdb_dataset.py b/deepmd/pt/utils/lmdb_dataset.py index 44d67be242..ac7b1f0e3d 100644 --- a/deepmd/pt/utils/lmdb_dataset.py +++ b/deepmd/pt/utils/lmdb_dataset.py @@ -15,14 +15,12 @@ Dataset, Sampler, ) -from torch.utils.data._utils.collate import ( - collate_tensor_fn, -) from deepmd.dpmodel.utils.lmdb_data import ( LmdbDataReader, LmdbTestData, SameNlocBatchSampler, + collate_lmdb_frames, compute_block_targets, is_lmdb, ) @@ -42,13 +40,17 @@ def _collate_lmdb_batch(batch: list[dict[str, Any]]) -> dict[str, Any]: - """Collate a list of frame dicts into a batch dict. + """Collate a list of frame dicts into a torch batch dict. - All frames in the batch must have the same nloc (enforced by - SameNlocBatchSampler when mixed_batch=False). + Pre-converts per-frame numpy arrays to CPU torch tensors (zero-copy when + dtype matches) and delegates stacking to the backend-agnostic + :func:`collate_lmdb_frames`. With torch tensors as input, the shared + collate yields a torch dict (``sid`` becomes a torch tensor automatically + via ``array_api_compat``). - For mixed_batch=True, this function would need padding + mask. - Currently raises NotImplementedError for that case. + All frames in the batch must have the same nloc (enforced by + SameNlocBatchSampler when mixed_batch=False). For mixed_batch=True, + raises NotImplementedError. """ if len(batch) > 1: atypes = [d.get("atype") for d in batch if d.get("atype") is not None] @@ -59,24 +61,19 @@ def _collate_lmdb_batch(batch: list[dict[str, Any]]) -> dict[str, Any]: "Padding + mask in collate_fn needed." ) - example = batch[0] - result: dict[str, Any] = {} - for key in example: - if "find_" in key: - result[key] = batch[0][key] - elif key == "fid": - result[key] = [d[key] for d in batch] - elif key == "type": - continue - elif batch[0][key] is None: - result[key] = None - else: - with torch.device("cpu"): - result[key] = collate_tensor_fn( - [torch.as_tensor(d[key]) for d in batch] - ) - result["sid"] = torch.tensor([0], dtype=torch.long, device="cpu") - return result + with torch.device("cpu"): + torch_frames: list[dict[str, Any]] = [] + for f in batch: + tf: dict[str, Any] = {} + for key, val in f.items(): + if key.startswith("find_") or key == "fid" or key == "type": + tf[key] = val + elif val is None: + tf[key] = None + else: + tf[key] = torch.as_tensor(val) + torch_frames.append(tf) + return collate_lmdb_frames(torch_frames) class _SameNlocBatchSamplerTorch(Sampler): diff --git a/deepmd/pt_expt/entrypoints/main.py b/deepmd/pt_expt/entrypoints/main.py index 40302ee7b3..9f6d41f1fb 100644 --- a/deepmd/pt_expt/entrypoints/main.py +++ b/deepmd/pt_expt/entrypoints/main.py @@ -14,9 +14,15 @@ import h5py +from deepmd.dpmodel.utils.lmdb_data import ( + is_lmdb, +) from deepmd.pt_expt.train import ( training, ) +from deepmd.pt_expt.utils.lmdb_dataset import ( + LmdbDataSystem, +) from deepmd.utils.argcheck import ( normalize, ) @@ -35,6 +41,41 @@ log = logging.getLogger(__name__) +def _build_data_system( + dataset_params: dict[str, Any], + type_map: list[str], + seed: int | None = None, +) -> DeepmdDataSystem | LmdbDataSystem: + """Build a data system from dataset config, routing LMDB paths to LmdbDataSystem. + + A scalar ``systems`` value pointing at an LMDB directory triggers the + LMDB adapter; otherwise we fall through to the legacy + :class:`DeepmdDataSystem` path with system expansion. + """ + systems_raw = dataset_params["systems"] + if isinstance(systems_raw, str) and is_lmdb(systems_raw): + return LmdbDataSystem( + lmdb_path=systems_raw, + type_map=type_map, + batch_size=dataset_params["batch_size"], + auto_prob_style=dataset_params.get("auto_prob"), + seed=seed, + ) + systems = process_systems( + systems_raw, + patterns=dataset_params.get("rglob_patterns", None), + ) + return DeepmdDataSystem( + systems=systems, + batch_size=dataset_params["batch_size"], + test_size=1, + type_map=type_map, + trn_all_set=True, + sys_probs=dataset_params.get("sys_probs", None), + auto_prob_style=dataset_params.get("auto_prob", "prob_sys_size"), + ) + + def get_trainer( config: dict[str, Any], init_model: str | None = None, @@ -48,39 +89,23 @@ def get_trainer( training_params = config["training"] multi_task = "model_dict" in model_params + data_seed = training_params.get("seed", None) + if not multi_task: type_map = model_params["type_map"] # ----- training data ------------------------------------------------ training_dataset_params = training_params["training_data"] - training_systems = process_systems( - training_dataset_params["systems"], - patterns=training_dataset_params.get("rglob_patterns", None), - ) - train_data = DeepmdDataSystem( - systems=training_systems, - batch_size=training_dataset_params["batch_size"], - test_size=1, - type_map=type_map, - trn_all_set=True, - sys_probs=training_dataset_params.get("sys_probs", None), - auto_prob_style=training_dataset_params.get("auto_prob", "prob_sys_size"), + train_data = _build_data_system( + training_dataset_params, type_map, seed=data_seed ) # ----- validation data ---------------------------------------------- validation_data = None validation_dataset_params = training_params.get("validation_data", None) if validation_dataset_params is not None: - val_systems = process_systems( - validation_dataset_params["systems"], - patterns=validation_dataset_params.get("rglob_patterns", None), - ) - validation_data = DeepmdDataSystem( - systems=val_systems, - batch_size=validation_dataset_params["batch_size"], - test_size=1, - type_map=type_map, - trn_all_set=True, + validation_data = _build_data_system( + validation_dataset_params, type_map, seed=data_seed ) # ----- stat file path ----------------------------------------------- @@ -103,34 +128,15 @@ def get_trainer( data_params = training_params["data_dict"][model_key] # training data - td_params = data_params["training_data"] - training_systems = process_systems( - td_params["systems"], - patterns=td_params.get("rglob_patterns", None), - ) - train_data[model_key] = DeepmdDataSystem( - systems=training_systems, - batch_size=td_params["batch_size"], - test_size=1, - type_map=type_map, - trn_all_set=True, - sys_probs=td_params.get("sys_probs", None), - auto_prob_style=td_params.get("auto_prob", "prob_sys_size"), + train_data[model_key] = _build_data_system( + data_params["training_data"], type_map, seed=data_seed ) # validation data vd_params = data_params.get("validation_data", None) if vd_params is not None: - val_systems = process_systems( - vd_params["systems"], - patterns=vd_params.get("rglob_patterns", None), - ) - validation_data[model_key] = DeepmdDataSystem( - systems=val_systems, - batch_size=vd_params["batch_size"], - test_size=1, - type_map=type_map, - trn_all_set=True, + validation_data[model_key] = _build_data_system( + vd_params, type_map, seed=data_seed ) else: validation_data[model_key] = None diff --git a/deepmd/pt_expt/utils/lmdb_dataset.py b/deepmd/pt_expt/utils/lmdb_dataset.py new file mode 100644 index 0000000000..4e61714cc6 --- /dev/null +++ b/deepmd/pt_expt/utils/lmdb_dataset.py @@ -0,0 +1,128 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +"""LMDB data adapter for the pt_expt backend. + +pt_expt does not use ``torch.utils.data.DataLoader``; its trainer calls +``data_sys.get_batch()`` directly and expects a numpy dict in the +``DeepmdDataSystem`` shape (the shape consumed by +``deepmd.dpmodel.utils.batch.normalize_batch``). This module provides a thin +wrapper around the framework-agnostic :class:`LmdbDataReader` that satisfies +that interface. +""" + +import logging +from typing import ( + Any, +) + +from deepmd.dpmodel.utils.lmdb_data import ( + LmdbDataReader, + SameNlocBatchSampler, + collate_lmdb_frames, + compute_block_targets, +) +from deepmd.utils.data import ( + DataRequirementItem, +) + +log = logging.getLogger(__name__) + +__all__ = ["LmdbDataSystem"] + + +class LmdbDataSystem: + """LMDB-backed data system for pt_expt. + + Exposes the small surface that pt_expt's trainer touches: + ``get_batch(sys_idx=None)``, ``add_data_requirements(list)``, and + ``get_nsystems()``. Internally uses :class:`LmdbDataReader` for I/O and + :class:`SameNlocBatchSampler` to draw same-nloc batches. + + Parameters + ---------- + lmdb_path + Path to the LMDB directory. + type_map + Global type map from the model config. + batch_size + Batch size spec; ``int``, ``"auto"``, or ``"auto:N"``. + auto_prob_style + Optional ``auto_prob`` string (e.g. ``"prob_sys_size"``) for + per-system reweighting via :func:`compute_block_targets`. + seed + Optional seed for the shuffle in :class:`SameNlocBatchSampler`. + """ + + def __init__( + self, + lmdb_path: str, + type_map: list[str], + batch_size: int | str = "auto", + auto_prob_style: str | None = None, + seed: int | None = None, + ) -> None: + self._reader = LmdbDataReader( + lmdb_path, type_map, batch_size, mixed_batch=False + ) + + block_targets = None + if auto_prob_style is not None and self._reader.frame_system_ids is not None: + block_targets = compute_block_targets( + auto_prob_style, + self._reader.nsystems, + self._reader.system_nframes, + ) + + self._sampler = SameNlocBatchSampler( + self._reader, + shuffle=True, + seed=seed, + block_targets=block_targets, + ) + self._iter = iter(self._sampler) + + # ------------------------------------------------------------------ + # pt_expt trainer surface + # ------------------------------------------------------------------ + + def get_batch(self, sys_idx: int | None = None) -> dict[str, Any]: + """Return one batch as a numpy dict. + + ``sys_idx`` is accepted for API compatibility but ignored: per-system + sampling is baked into ``block_targets`` at sampler construction. + """ + del sys_idx + try: + indices = next(self._iter) + except StopIteration: + self._iter = iter(self._sampler) + indices = next(self._iter) + frames = [self._reader[int(i)] for i in indices] + return collate_lmdb_frames(frames) + + def add_data_requirements( + self, data_requirement: list[DataRequirementItem] + ) -> None: + self._reader.add_data_requirement(data_requirement) + + def get_nsystems(self) -> int: + """Return 1: pt_expt's stat collection treats LMDB as a single system. + + Per-system sampling within the LMDB is handled by + ``SameNlocBatchSampler`` + ``block_targets``. + """ + return 1 + + # ------------------------------------------------------------------ + # Misc forwarders + # ------------------------------------------------------------------ + + @property + def type_map(self) -> list[str]: + return self._reader._type_map + + @property + def mixed_type(self) -> bool: + return True + + def print_summary(self, name: str, prob: Any = None) -> None: + self._reader.print_summary(name, prob) diff --git a/source/tests/pt_expt/test_lmdb_training.py b/source/tests/pt_expt/test_lmdb_training.py new file mode 100644 index 0000000000..94673a8761 --- /dev/null +++ b/source/tests/pt_expt/test_lmdb_training.py @@ -0,0 +1,249 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +"""LMDB-format dataset support for the pt_expt backend. + +Covers: + +- :class:`LmdbDataSystem.get_batch` returns numpy arrays in the shape that + :func:`normalize_batch` consumes. +- ``get_trainer()`` routes an LMDB ``systems`` path through + :class:`LmdbDataSystem` and runs a few training steps. +""" + +import os +import shutil +import tempfile +import unittest + +import lmdb +import msgpack +import numpy as np + +from deepmd.dpmodel.utils.batch import ( + normalize_batch, + split_batch, +) +from deepmd.pt_expt.entrypoints.main import ( + get_trainer, +) +from deepmd.pt_expt.utils.lmdb_dataset import ( + LmdbDataSystem, +) +from deepmd.utils.argcheck import ( + normalize, +) +from deepmd.utils.compat import ( + update_deepmd_input, +) + + +def _encode_array(arr: np.ndarray) -> dict: + return { + "nd": None, + "type": str(arr.dtype), + "kind": "", + "shape": list(arr.shape), + "data": arr.tobytes(), + } + + +def _make_frame(natoms: int, seed: int) -> dict: + """Synthetic LMDB frame matching the on-disk schema used by LmdbDataReader.""" + rng = np.random.RandomState(seed) + half = natoms // 2 + return { + "atom_numbs": [half, natoms - half], + "atom_names": ["O", "H"], + "atom_types": _encode_array( + np.array([0] * half + [1] * (natoms - half), dtype=np.int64) + ), + "orig": _encode_array(np.zeros(3, dtype=np.float64)), + "cells": _encode_array((np.eye(3) * 10.0).astype(np.float64)), + "coords": _encode_array((rng.rand(natoms, 3) * 10.0).astype(np.float64)), + "energies": _encode_array(np.array(rng.randn(), dtype=np.float64)), + "forces": _encode_array(rng.randn(natoms, 3).astype(np.float64)), + } + + +def _create_test_lmdb(path: str, nframes: int, natoms: int) -> None: + """Write a minimal LMDB containing *nframes* frames of *natoms* atoms each.""" + env = lmdb.open(path, map_size=10 * 1024 * 1024) + fmt = "012d" + metadata = { + "nframes": nframes, + "frame_idx_fmt": fmt, + "system_info": { + "formula": f"O{natoms // 2}H{natoms - natoms // 2}", + "natoms": [natoms // 2, natoms - natoms // 2], + "nframes": nframes, + }, + } + with env.begin(write=True) as txn: + txn.put(b"__metadata__", msgpack.packb(metadata, use_bin_type=True)) + for i in range(nframes): + key = format(i, fmt).encode() + txn.put(key, msgpack.packb(_make_frame(natoms, i), use_bin_type=True)) + env.close() + + +class TestLmdbDataSystemGetBatch(unittest.TestCase): + """LmdbDataSystem.get_batch produces a numpy dict that normalize_batch accepts.""" + + def setUp(self) -> None: + self.tmpdir = tempfile.mkdtemp() + self.lmdb_path = os.path.join(self.tmpdir, "test.lmdb") + _create_test_lmdb(self.lmdb_path, nframes=8, natoms=6) + + def tearDown(self) -> None: + shutil.rmtree(self.tmpdir, ignore_errors=True) + + def test_get_batch_shape_and_normalize(self) -> None: + ds = LmdbDataSystem( + lmdb_path=self.lmdb_path, + type_map=["O", "H"], + batch_size=2, + seed=0, + ) + batch = ds.get_batch() + # Required structural keys. + for key in ("coord", "atype", "force", "energy", "natoms"): + self.assertIn(key, batch, f"missing {key}") + # NumPy arrays (not torch tensors) — pt_expt converts at the trainer + # boundary. + self.assertIsInstance(batch["coord"], np.ndarray) + self.assertIsInstance(batch["atype"], np.ndarray) + self.assertEqual(batch["coord"].shape, (2, 6, 3)) + self.assertEqual(batch["atype"].shape, (2, 6)) + self.assertEqual(batch["natoms"].shape, (2, 4)) # nloc, nloc, n_O, n_H + + # normalize_batch must accept the dict and produce input/label splits + # without raising. + norm = normalize_batch(batch) + inputs, labels = split_batch(norm) + self.assertIn("coord", inputs) + self.assertIn("atype", inputs) + self.assertIn("force", labels) + self.assertIn("natoms", labels) + + def test_get_batch_iterates_past_end(self) -> None: + """get_batch reseeds the sampler at the end of an epoch.""" + ds = LmdbDataSystem( + lmdb_path=self.lmdb_path, + type_map=["O", "H"], + batch_size=2, + seed=0, + ) + # 8 frames / batch 2 = 4 batches per epoch; pull more than that. + for _ in range(10): + batch = ds.get_batch() + self.assertEqual(batch["coord"].shape, (2, 6, 3)) + + def test_add_data_requirements_passthrough(self) -> None: + from deepmd.utils.data import ( + DataRequirementItem, + ) + + ds = LmdbDataSystem( + lmdb_path=self.lmdb_path, + type_map=["O", "H"], + batch_size=1, + seed=0, + ) + ds.add_data_requirements( + [ + DataRequirementItem( + "energy", ndof=1, atomic=False, must=False, high_prec=True + ), + ] + ) + batch = ds.get_batch() + self.assertIn("energy", batch) + self.assertIn("find_energy", batch) + + +class TestLmdbTrainingLoop(unittest.TestCase): + """End-to-end: get_trainer routes an LMDB path and runs training steps.""" + + @classmethod + def setUpClass(cls) -> None: + cls.tmpdir = tempfile.mkdtemp() + cls.lmdb_path = os.path.join(cls.tmpdir, "train.lmdb") + cls.val_lmdb_path = os.path.join(cls.tmpdir, "val.lmdb") + _create_test_lmdb(cls.lmdb_path, nframes=8, natoms=6) + _create_test_lmdb(cls.val_lmdb_path, nframes=4, natoms=6) + + @classmethod + def tearDownClass(cls) -> None: + shutil.rmtree(cls.tmpdir, ignore_errors=True) + + def _make_lmdb_config(self, numb_steps: int = 3) -> dict: + return { + "model": { + "type_map": ["O", "H"], + "descriptor": { + "type": "se_e2_a", + "sel": [6, 12], + "rcut_smth": 0.50, + "rcut": 3.00, + "neuron": [8, 16], + "resnet_dt": False, + "axis_neuron": 4, + "type_one_side": True, + "seed": 1, + }, + "fitting_net": { + "neuron": [16, 16], + "resnet_dt": True, + "seed": 1, + }, + "data_stat_nbatch": 1, + }, + "learning_rate": { + "type": "exp", + "decay_steps": 500, + "start_lr": 0.001, + "stop_lr": 3.51e-8, + }, + "loss": { + "type": "ener", + "start_pref_e": 0.02, + "limit_pref_e": 1, + "start_pref_f": 1000, + "limit_pref_f": 1, + "start_pref_v": 0, + "limit_pref_v": 0, + }, + "training": { + "training_data": { + "systems": self.lmdb_path, + "batch_size": 1, + }, + "validation_data": { + "systems": self.val_lmdb_path, + "batch_size": 1, + "numb_btch": 1, + }, + "numb_steps": numb_steps, + "seed": 10, + "disp_file": "lcurve.out", + "disp_freq": numb_steps, + "save_freq": numb_steps, + }, + } + + def test_get_trainer_routes_lmdb(self) -> None: + config = self._make_lmdb_config(numb_steps=3) + config = update_deepmd_input(config, warning=False) + config = normalize(config) + + cwd = os.getcwd() + os.chdir(self.tmpdir) + try: + trainer = get_trainer(config) + self.assertIsInstance(trainer.training_data, LmdbDataSystem) + trainer.run() + finally: + os.chdir(cwd) + + +if __name__ == "__main__": + unittest.main() From 5bc00614b48a807e78278a966b647a7ef6773c1d Mon Sep 17 00:00:00 2001 From: Han Wang Date: Mon, 20 Apr 2026 22:19:59 +0800 Subject: [PATCH 2/3] fix(pt_expt): address PR #5408 reviewer feedback - route LMDB paths through make_neighbor_stat_data in the CLI neighbor-stat step, so dp --pt_expt train no longer crashes before reaching get_trainer (reported by chatgpt-codex-connector) - reject list-form 'systems' containing LMDB paths with a clear ValueError instead of a confusing DeepmdData failure - expose LmdbDataReader.type_map as a public property and stop reaching into the private _type_map from the pt_expt adapter - drop the unused logging global flagged by CodeQL --- deepmd/dpmodel/utils/lmdb_data.py | 5 ++++ deepmd/pt_expt/entrypoints/main.py | 36 ++++++++++++++++++++++++---- deepmd/pt_expt/utils/lmdb_dataset.py | 5 +--- 3 files changed, 37 insertions(+), 9 deletions(-) diff --git a/deepmd/dpmodel/utils/lmdb_data.py b/deepmd/dpmodel/utils/lmdb_data.py index 656532fd1e..c2ce362a50 100644 --- a/deepmd/dpmodel/utils/lmdb_data.py +++ b/deepmd/dpmodel/utils/lmdb_data.py @@ -577,6 +577,11 @@ def mixed_type(self) -> bool: """LMDB datasets are always mixed_type (frames may have different compositions).""" return True + @property + def type_map(self) -> list[str]: + """Model-side type map used when constructing the reader.""" + return self._type_map + @property def nloc_groups(self) -> dict[int, list[int]]: """Nloc → list of frame indices.""" diff --git a/deepmd/pt_expt/entrypoints/main.py b/deepmd/pt_expt/entrypoints/main.py index 9f6d41f1fb..9068de6abe 100644 --- a/deepmd/pt_expt/entrypoints/main.py +++ b/deepmd/pt_expt/entrypoints/main.py @@ -41,6 +41,26 @@ log = logging.getLogger(__name__) +def _get_neighbor_stat_data( + dataset_params: dict[str, Any], + type_map: list[str] | None, +) -> Any: + """Return a data proxy suitable for ``BaseModel.update_sel`` (neighbor stat). + + Routes a scalar LMDB ``systems`` path through dpmodel's + ``make_neighbor_stat_data``; falls back to the legacy ``get_data`` for + npy/HDF5 directories. + """ + systems_raw = dataset_params.get("systems") + if isinstance(systems_raw, str) and is_lmdb(systems_raw): + from deepmd.dpmodel.utils.lmdb_data import ( + make_neighbor_stat_data, + ) + + return make_neighbor_stat_data(systems_raw, type_map) + return get_data(dataset_params, 0, type_map, None) + + def _build_data_system( dataset_params: dict[str, Any], type_map: list[str], @@ -61,6 +81,14 @@ def _build_data_system( auto_prob_style=dataset_params.get("auto_prob"), seed=seed, ) + if isinstance(systems_raw, list) and any( + isinstance(s, str) and is_lmdb(s) for s in systems_raw + ): + raise ValueError( + "LMDB datasets must be passed as a scalar 'systems' string " + "(e.g. 'systems': '/path/to/data.lmdb'); list-form systems " + "with LMDB paths are not supported." + ) systems = process_systems( systems_raw, patterns=dataset_params.get("rglob_patterns", None), @@ -267,8 +295,8 @@ def train( if not multi_task: type_map = config["model"].get("type_map") - train_data = get_data( - config["training"]["training_data"], 0, type_map, None + train_data = _get_neighbor_stat_data( + config["training"]["training_data"], type_map ) config["model"], _ = BaseModel.update_sel( train_data, type_map, config["model"] @@ -276,11 +304,9 @@ def train( else: for model_key in config["model"]["model_dict"]: type_map = config["model"]["model_dict"][model_key]["type_map"] - train_data = get_data( + train_data = _get_neighbor_stat_data( config["training"]["data_dict"][model_key]["training_data"], - 0, type_map, - None, ) config["model"]["model_dict"][model_key], _ = BaseModel.update_sel( train_data, diff --git a/deepmd/pt_expt/utils/lmdb_dataset.py b/deepmd/pt_expt/utils/lmdb_dataset.py index 4e61714cc6..4b279880e1 100644 --- a/deepmd/pt_expt/utils/lmdb_dataset.py +++ b/deepmd/pt_expt/utils/lmdb_dataset.py @@ -9,7 +9,6 @@ that interface. """ -import logging from typing import ( Any, ) @@ -24,8 +23,6 @@ DataRequirementItem, ) -log = logging.getLogger(__name__) - __all__ = ["LmdbDataSystem"] @@ -118,7 +115,7 @@ def get_nsystems(self) -> int: @property def type_map(self) -> list[str]: - return self._reader._type_map + return self._reader.type_map @property def mixed_type(self) -> bool: From afa4054c01537ee515a41b0b14a00ebc74b050d9 Mon Sep 17 00:00:00 2001 From: Han Wang Date: Mon, 20 Apr 2026 23:01:45 +0800 Subject: [PATCH 3/3] fix(pt_expt): share LMDB list-rejection between neighbor-stat and trainer The list-form 'systems' check added in 5bc00614 only lived in _build_data_system, but the CLI train() flow hits _get_neighbor_stat_data first. A list containing an LMDB path therefore fell through to get_data with an opaque error before the nice ValueError could fire. Extract a shared _detect_lmdb_path helper so both paths fail fast with the same message. --- deepmd/pt_expt/entrypoints/main.py | 41 ++++++++++++++++++++---------- 1 file changed, 28 insertions(+), 13 deletions(-) diff --git a/deepmd/pt_expt/entrypoints/main.py b/deepmd/pt_expt/entrypoints/main.py index 9068de6abe..4dd6c81d58 100644 --- a/deepmd/pt_expt/entrypoints/main.py +++ b/deepmd/pt_expt/entrypoints/main.py @@ -41,6 +41,28 @@ log = logging.getLogger(__name__) +def _detect_lmdb_path(systems_raw: Any) -> str | None: + """Return the LMDB path when ``systems_raw`` is a scalar LMDB string. + + Returns ``None`` for non-LMDB inputs. Raises ``ValueError`` if + ``systems_raw`` is a list containing any LMDB path, so both + ``_get_neighbor_stat_data`` and ``_build_data_system`` fail with the + same clear message instead of the opaque error from + :func:`process_systems` / :class:`DeepmdData`. + """ + if isinstance(systems_raw, str) and is_lmdb(systems_raw): + return systems_raw + if isinstance(systems_raw, list) and any( + isinstance(s, str) and is_lmdb(s) for s in systems_raw + ): + raise ValueError( + "LMDB datasets must be passed as a scalar 'systems' string " + "(e.g. 'systems': '/path/to/data.lmdb'); list-form systems " + "with LMDB paths are not supported." + ) + return None + + def _get_neighbor_stat_data( dataset_params: dict[str, Any], type_map: list[str] | None, @@ -51,13 +73,13 @@ def _get_neighbor_stat_data( ``make_neighbor_stat_data``; falls back to the legacy ``get_data`` for npy/HDF5 directories. """ - systems_raw = dataset_params.get("systems") - if isinstance(systems_raw, str) and is_lmdb(systems_raw): + lmdb_path = _detect_lmdb_path(dataset_params.get("systems")) + if lmdb_path is not None: from deepmd.dpmodel.utils.lmdb_data import ( make_neighbor_stat_data, ) - return make_neighbor_stat_data(systems_raw, type_map) + return make_neighbor_stat_data(lmdb_path, type_map) return get_data(dataset_params, 0, type_map, None) @@ -73,22 +95,15 @@ def _build_data_system( :class:`DeepmdDataSystem` path with system expansion. """ systems_raw = dataset_params["systems"] - if isinstance(systems_raw, str) and is_lmdb(systems_raw): + lmdb_path = _detect_lmdb_path(systems_raw) + if lmdb_path is not None: return LmdbDataSystem( - lmdb_path=systems_raw, + lmdb_path=lmdb_path, type_map=type_map, batch_size=dataset_params["batch_size"], auto_prob_style=dataset_params.get("auto_prob"), seed=seed, ) - if isinstance(systems_raw, list) and any( - isinstance(s, str) and is_lmdb(s) for s in systems_raw - ): - raise ValueError( - "LMDB datasets must be passed as a scalar 'systems' string " - "(e.g. 'systems': '/path/to/data.lmdb'); list-form systems " - "with LMDB paths are not supported." - ) systems = process_systems( systems_raw, patterns=dataset_params.get("rglob_patterns", None),