fixup lmdb: map filter:N dataset indices to original LMDB keys; enable readahead

OutisLi · OutisLi · commit 86df0ca7ecf0 · 2026-04-22T21:02:50.000+08:00
diff --git a/deepmd/dpmodel/utils/lmdb_data.py b/deepmd/dpmodel/utils/lmdb_data.py
@@ -66,7 +66,10 @@ def _open_lmdb(path: str) -> lmdb.Environment:
         env, refcount = entry
         _ENV_CACHE[resolved] = (env, refcount + 1)
         return env
-    env = lmdb.open(path, readonly=True, lock=False, readahead=False, meminit=False)
+    # ``readahead=True`` lets the kernel batch-prefetch B+tree pages; this is
+    # cheap on local SSDs and a major win on networked filesystems (vepfs /
+    # NFS / Lustre) where each uncoalesced 4 KB page fault costs a full RPC.
+    env = lmdb.open(path, readonly=True, lock=False, readahead=True, meminit=False)
     _ENV_CACHE[resolved] = (env, 1)
     return env
 
@@ -298,33 +301,34 @@ def __init__(
 
         # Scan per-frame nloc only when needed for same-nloc batching.
         # For mixed_batch=True, skip the scan entirely (future: padding handles it).
-        # We keep _frame_nlocs / _frame_system_ids indexable by the *original*
-        # LMDB frame index even after filter:N: entries for dropped frames
-        # simply never get referenced because _nloc_groups / _system_groups
-        # no longer reference them.
+        # ``orig_frame_nlocs`` / ``orig_frame_system_ids`` are indexed by the
+        # *original* LMDB frame index. After a potential ``filter:N`` drop we
+        # rebuild ``self._frame_nlocs`` / ``self._frame_system_ids`` so they
+        # are parallel arrays over the *dataset* index space ``[0, len(self))``;
+        # the dataset-to-original mapping lives in ``self._retained_keys``.
         if not mixed_batch:
             # Fast path: use pre-computed frame_nlocs from metadata if available.
             # Falls back to scanning each frame's atom_types shape (~10 us/frame).
             meta_nlocs = meta.get("frame_nlocs")
             if meta_nlocs is not None:
-                self._frame_nlocs = [int(n) for n in meta_nlocs]
+                orig_frame_nlocs = [int(n) for n in meta_nlocs]
             else:
-                self._frame_nlocs = _scan_frame_nlocs(
+                orig_frame_nlocs = _scan_frame_nlocs(
                     self._env, self.nframes, self._frame_fmt, self._natoms
                 )
         else:
-            self._frame_nlocs = []
+            orig_frame_nlocs = []
 
-        # Parse frame_system_ids for auto_prob support. _nsystems must stay at
-        # ``max(original_sid) + 1`` even after filter:N so that user-facing
+        # Parse frame_system_ids for auto_prob support. ``_nsystems`` must stay
+        # at ``max(original_sid) + 1`` even after filter:N so that user-facing
         # auto_prob block slicing (e.g. ``prob_sys_size;0:284:0.5;284:842:0.5``)
         # keeps its meaning across filter thresholds.
         meta_sys_ids = meta.get("frame_system_ids")
         if meta_sys_ids is not None:
-            self._frame_system_ids: list[int] | None = [int(s) for s in meta_sys_ids]
-            self._nsystems = max(self._frame_system_ids) + 1
+            orig_frame_system_ids: list[int] | None = [int(s) for s in meta_sys_ids]
+            self._nsystems = max(orig_frame_system_ids) + 1
         else:
-            self._frame_system_ids = None
+            orig_frame_system_ids = None
             self._nsystems = 1
 
         # Parse batch_size spec. ``auto_rule`` and ``max_rule`` are mutually
@@ -353,47 +357,66 @@ def __init__(
         # ``filter:N`` every frame is retained. ``mixed_batch=True`` has no
         # per-frame nloc info to filter against, so filter:N is a no-op there.
         if self._filter_rule is not None and not mixed_batch:
-            retained_indices = [
-                i for i, n in enumerate(self._frame_nlocs) if n <= self._filter_rule
+            retained_keys = [
+                i for i, n in enumerate(orig_frame_nlocs) if n <= self._filter_rule
             ]
-            n_dropped = self.nframes - len(retained_indices)
+            n_dropped = self.nframes - len(retained_keys)
             if n_dropped > 0:
                 log.info(
                     f"LMDB filter:{self._filter_rule} drops {n_dropped}/"
                     f"{self.nframes} frames with nloc > {self._filter_rule} "
                     f"({self.lmdb_path})."
                 )
         else:
-            retained_indices = list(range(self.nframes))
+            retained_keys = list(range(self.nframes))
+
+        # Dataset-index → original LMDB frame key. ``__getitem__`` looks up
+        # this table so that ``reader[i]`` is a valid LMDB read for every
+        # ``0 <= i < len(reader)``, no matter how many frames were filtered.
+        self._retained_keys: list[int] = retained_keys
+
+        # Re-key _frame_nlocs / _frame_system_ids into the dataset-index
+        # space so that every downstream consumer (nloc_groups, system_groups,
+        # SameNlocBatchSampler, _expand_indices_by_blocks) operates in a
+        # single, self-consistent indexing scheme.
+        if not mixed_batch:
+            self._frame_nlocs = [orig_frame_nlocs[k] for k in retained_keys]
+        else:
+            self._frame_nlocs = []
+
+        if orig_frame_system_ids is not None:
+            self._frame_system_ids: list[int] | None = [
+                orig_frame_system_ids[k] for k in retained_keys
+            ]
+        else:
+            self._frame_system_ids = None
 
-        # Group retained frames by nloc. _nloc_groups only contains nlocs
-        # that passed the filter; its values stay as *original* LMDB frame
-        # indices so __getitem__(index) keeps reading the right LMDB key.
+        # Group retained frames by nloc using dataset indices (0..len-1).
         if not mixed_batch:
             self._nloc_groups: dict[int, list[int]] = {}
-            for idx in retained_indices:
-                self._nloc_groups.setdefault(self._frame_nlocs[idx], []).append(idx)
+            for ds_idx, nloc in enumerate(self._frame_nlocs):
+                self._nloc_groups.setdefault(nloc, []).append(ds_idx)
         else:
             self._nloc_groups = {}
 
-        # Group retained frames by system id. _system_nframes is indexed by
-        # *original* sid and stays length _nsystems even if some systems are
-        # fully dropped — those entries are simply zero so auto_prob block
-        # slicing still parses predictably.
+        # Group retained frames by original system id; the sid numbering is
+        # preserved (no compression) so user-facing auto_prob slices stay
+        # meaningful across filter thresholds. Fully-dropped systems appear
+        # as zero-frame entries in ``_system_nframes``.
         if self._frame_system_ids is not None:
             self._system_groups: dict[int, list[int]] = {}
-            for idx in retained_indices:
-                sid = self._frame_system_ids[idx]
-                self._system_groups.setdefault(sid, []).append(idx)
+            for ds_idx, sid in enumerate(self._frame_system_ids):
+                self._system_groups.setdefault(sid, []).append(ds_idx)
             self._system_nframes: list[int] = [
                 len(self._system_groups.get(i, [])) for i in range(self._nsystems)
             ]
         else:
-            self._system_groups = {0: list(retained_indices)}
-            self._system_nframes = [len(retained_indices)]
+            self._system_groups = {0: list(range(len(retained_keys)))}
+            self._system_nframes = [len(retained_keys)]
 
-        # nframes now reflects retained frames; __len__ returns this.
-        self.nframes = len(retained_indices)
+        # nframes now reflects retained frames; __len__ returns this and the
+        # valid index domain for __getitem__ is [0, self.nframes).
+        self.nframes = len(retained_keys)
 
         # Default batch_size used only by the index/total_batch estimate. The
         # sampler always goes through get_batch_size_for_nloc for real batches.
@@ -474,11 +497,21 @@ def __len__(self) -> int:
         return self.nframes
 
     def __getitem__(self, index: int) -> dict[str, Any]:
-        """Read frame from LMDB, decode, remap keys, return dict of numpy arrays."""
-        key = format(index, self._frame_fmt).encode()
+        """Read frame from LMDB, decode, remap keys, return dict of numpy arrays.
+
+        ``index`` is a dataset-level index in ``[0, len(self))``. Under
+        ``filter:N`` the LMDB key space may have gaps (dropped frames), so
+        we translate through ``self._retained_keys`` before hitting LMDB.
+        """
+        if index < 0 or index >= self.nframes:
+            raise IndexError(f"dataset index {index} out of range [0, {self.nframes})")
+        original_key = self._retained_keys[index]
+        key = format(original_key, self._frame_fmt).encode()
         raw = self._txn.get(key)
         if raw is None:
-            raise IndexError(f"Frame {index} not found in LMDB")
+            raise IndexError(
+                f"Frame {original_key} not found in LMDB (dataset index {index})"
+            )
         frame = _decode_frame(raw)
         frame = _remap_keys(frame)
 
@@ -607,7 +640,7 @@ def __getitem__(self, index: int) -> dict[str, Any]:
                     np.float32(1.0) if extra_key in frame else np.float32(0.0)
                 )
 
-        frame["fid"] = index
+        frame["fid"] = original_key
 
         return frame
 
diff --git a/deepmd/utils/argcheck.py b/deepmd/utils/argcheck.py
@@ -3677,8 +3677,8 @@ def training_data_args() -> list[
 - string "auto": automatically determines the batch size so that the batch_size times the number of atoms in the system is no less than 32.\n\n\
 - string "auto:N": automatically determines the batch size so that the batch_size times the number of atoms in the system is no less than N.\n\n\
 - string "mixed:N": the batch data will be sampled from all systems and merged into a mixed system with the batch size N. Only support the se_atten descriptor for TensorFlow backend.\n\n\
-- string "max:N": automatically determines the batch size so that the batch_size times the number of atoms in the system is no more than N.\n\n\
-- string "filter:N": the same as `"max:N"` but removes the systems with the number of atoms larger than `N` from the data set.\n\n\
+- string "max:N": automatically determines the batch size so that `batch_size * natoms` is at most `N`. `natoms` is the per-system atom count for npy data and the per-frame nloc for LMDB data. When a single system/frame already has more than `N` atoms, the batch size clamps to 1 and that batch will exceed `N`.\n\n\
+- string "filter:N": the same as `"max:N"` but additionally drops data whose atom count exceeds `N`. For npy data this removes whole systems with natoms > `N`; for LMDB data this removes individual frames with nloc > `N`.\n\n\
 If MPI is used, the value should be considered as the batch size per task.'
     doc_auto_prob_style = 'Determine the probability of systems automatically. The method is assigned by this key and can be\n\n\
 - "prob_uniform"  : the probability all the systems are equal, namely 1.0/self.get_nsystems()\n\n\
@@ -3758,8 +3758,8 @@ def validation_data_args() -> list[
 - int: all {link_sys} use the same batch size.\n\n\
 - string "auto": automatically determines the batch size so that the batch_size times the number of atoms in the system is no less than 32.\n\n\
 - string "auto:N": automatically determines the batch size so that the batch_size times the number of atoms in the system is no less than N.\n\n\
-- string "max:N": automatically determines the batch size so that the batch_size times the number of atoms in the system is no more than N.\n\n\
-- string "filter:N": the same as `"max:N"` but removes the systems with the number of atoms larger than `N` from the data set.'
+- string "max:N": automatically determines the batch size so that `batch_size * natoms` is at most `N`. `natoms` is the per-system atom count for npy data and the per-frame nloc for LMDB data. When a single system/frame already has more than `N` atoms, the batch size clamps to 1 and that batch will exceed `N`.\n\n\
+- string "filter:N": the same as `"max:N"` but additionally drops data whose atom count exceeds `N`. For npy data this removes whole systems with natoms > `N`; for LMDB data this removes individual frames with nloc > `N`.'
     doc_auto_prob_style = 'Determine the probability of systems automatically. The method is assigned by this key and can be\n\n\
 - "prob_uniform"  : the probability all the systems are equal, namely 1.0/self.get_nsystems()\n\n\
 - "prob_sys_size" : the probability of a system is proportional to the number of batches in the system\n\n\
diff --git a/source/tests/common/dpmodel/test_lmdb_data.py b/source/tests/common/dpmodel/test_lmdb_data.py
@@ -821,6 +821,46 @@ def test_filter_preserves_system_id_numbering(self):
         # after re-normalisation → no expansion needed.
         self.assertEqual(block_targets, [])
 
+    def test_filter_dataset_index_is_contiguous_and_live(self):
+        """After filter:N, every i in range(len(reader)) is a live retrievable frame.
+
+        Regression for the earlier indexing bug where ``len(reader)`` shrank
+        to the retained count but ``__getitem__`` still indexed the original
+        LMDB key space. Under filter:10 the mixed-nloc LMDB drops the two
+        12-atom frames at original keys 8 & 9; we check here that:
+
+        * every dataset index ``0..len(reader)-1`` decodes without raising
+          and never returns a filtered-out frame, and
+        * ``fid`` reports the stable original LMDB key, not the dataset
+          index (so downstream logs survive the remap), and
+        * out-of-range indices still raise IndexError.
+        """
+        reader = LmdbDataReader(
+            self._mixed_path, self._type_map, batch_size="filter:10"
+        )
+        self.assertEqual(len(reader), 8)
+        self.assertEqual(len(reader._retained_keys), 8)
+        self.assertEqual(reader._retained_keys, [0, 1, 2, 3, 4, 5, 6, 7])
+
+        seen_fids = []
+        for i in range(len(reader)):
+            frame = reader[i]
+            self.assertLessEqual(frame["atype"].shape[0], 10)
+            self.assertEqual(
+                frame["fid"],
+                reader._retained_keys[i],
+                msg=f"fid should be the original LMDB key, not dataset index {i}",
+            )
+            seen_fids.append(frame["fid"])
+        # Dropped original keys (8, 9) must never appear as fids.
+        self.assertNotIn(8, seen_fids)
+        self.assertNotIn(9, seen_fids)
+
+        with self.assertRaises(IndexError):
+            reader[len(reader)]
+        with self.assertRaises(IndexError):
+            reader[-1]
+
     def test_sampler_with_filter(self):
         """SameNlocBatchSampler only emits retained, same-nloc frames."""
         reader = LmdbDataReader(
@@ -841,9 +881,11 @@ def test_sampler_with_filter(self):
         for batch in all_batches:
             nlocs = {reader.frame_nlocs[idx] for idx in batch}
             self.assertEqual(len(nlocs), 1)
-        # The 12-atom frames (indices 8, 9) are never reached.
-        for idx in (8, 9):
-            self.assertNotIn(idx, all_indices)
+        # The 12-atom frames were at original LMDB keys 8, 9; they must
+        # never be reachable via any emitted dataset index.
+        reached_original_keys = {reader._retained_keys[idx] for idx in all_indices}
+        for original_key in (8, 9):
+            self.assertNotIn(original_key, reached_original_keys)
 
     def test_auto_prob_with_filter_still_works(self):
         """compute_block_targets + sampler survive a fully-dropped block."""