test(pt_expt): shrink change-bias water dataset to 5 frames

Han Wang · Han Wang · commit dacf606872ea · 2026-05-27T16:15:25.000+08:00
``TestChangeBias`` was the dominant memory hog in the ``Test Python``
shard ``(10, 3.13)`` of the CI matrix — by itself it peaked at ~5 GB
RSS, leaving so little headroom under the 7 GB GitHub-hosted runner
ceiling that the shard sporadically lost communication with the
GitHub Actions server (intermittent ``runner lost communication``
flake observed across many recent PRs).

Profile finding: peak RSS scales **linearly at ~50 MB per frame**
during ``dp change-bias``'s in-process ``main(cmds)`` call.  The
forward pass run via ``compute_output_stats`` enumerates
``nbatches = min(data.get_nbatches()) = 80`` frames of the water
example, and each frame leaks ~50 MB into torch's caching allocator
(not autograd — the wrapper is already in ``torch.no_grad()``; the
leak is somewhere in ``forward_common_atomic`` and is a separate bug).

Constraint: we **must** keep ``nbatches == total dataset frames``
to preserve determinism for ``test_change_bias_pt2_pte_consistency``
which compares a .pte and a .pt2 invocation with ``atol=1e-10``.
``_load_batch_set`` shuffles the loaded set, so a value of
``nbatches < total_frames`` would sample a random subset and the
two calls (running in the same Python process with an advancing
``dp_random`` state) would see different frames and produce
different biases.  Full enumeration sees every frame and so the
aggregate bias is invariant under shuffle.

Solution: build a 5-frame subset of ``examples/water/data/data_0``
in ``TestChangeBias.setUpClass`` and point both the trainer config
and the change-bias ``-s`` argument at it.  ``nbatches`` then
resolves to 5 (= the new dataset size, = full enumeration), peak
RSS drops to ~1.7 GB for the whole class, and all 9 tests in the
class (including the strict atol=1e-10 consistency check) still
pass.  Class wall time also improves (~3:55 → less data-loop work
in each change-bias invocation).
diff --git a/source/tests/pt_expt/test_change_bias.py b/source/tests/pt_expt/test_change_bias.py
@@ -118,19 +118,70 @@ def _make_config(data_dir: str) -> dict:
     }
 
 
+def _make_subset_dataset(src_system: str, dst_system: str, n_frames: int) -> None:
+    """Write a truncated copy of ``src_system`` into ``dst_system``.
+
+    Copies ``type.raw`` / ``type_map.raw`` when present and, for every
+    ``.npy`` in ``set.000``, saves only ``arr[:n_frames]`` (at most the
+    first ``n_frames`` frames).  ``dst_system/set.000`` is created.
+
+    Used by ``TestChangeBias`` to shrink the water/data_0 example (80
+    frames) so that ``dp change-bias`` enumerates only ``n_frames``
+    frames.  The in-process ``main(cmds)`` path runs the model forward
+    over ``nbatches`` frames via ``compute_output_stats`` and each frame
+    leaks ~50 MB into torch's caching allocator; at the default 80
+    frames peak RSS reaches ~5 GB, which starves the 7 GB GitHub-hosted
+    CI runner of headroom.  ``n_frames=5`` keeps the peak at ~800 MB.
+    The subset is still enumerated in full (``nbatches == total frames``)
+    because ``test_change_bias_pt2_pte_consistency`` compares a .pte and
+    a .pt2 call in one process with ``atol=1e-10``: ``_load_batch_set``
+    shuffles, so only full enumeration makes the aggregate bias
+    invariant under the shuffle order.
+    """
+    import numpy as np
+
+    src_set = os.path.join(src_system, "set.000")
+    dst_set = os.path.join(dst_system, "set.000")
+    os.makedirs(dst_set, exist_ok=True)  # also creates dst_system itself
+    for raw in ("type.raw", "type_map.raw"):
+        src = os.path.join(src_system, raw)
+        if os.path.isfile(src):  # either raw file may be absent in the source
+            shutil.copyfile(src, os.path.join(dst_system, raw))
+    for fname in os.listdir(src_set):
+        if not fname.endswith(".npy"):  # only array files are truncated/copied
+            continue
+        arr = np.load(os.path.join(src_set, fname))
+        np.save(os.path.join(dst_set, fname), arr[:n_frames])  # slice axis 0
+
+
 class TestChangeBias(unittest.TestCase):
     """Test dp change-bias for the pt_expt backend."""
 
     @classmethod
     def setUpClass(cls) -> None:
-        data_dir = os.path.join(EXAMPLE_DIR, "data")
-        if not os.path.isdir(data_dir):
-            raise unittest.SkipTest(f"Example data not found: {data_dir}")
+        full_data_dir = os.path.join(EXAMPLE_DIR, "data")
+        if not os.path.isdir(full_data_dir):
+            raise unittest.SkipTest(f"Example data not found: {full_data_dir}")
+        cls.tmpdir = tempfile.mkdtemp()
+        cls.old_cwd = os.getcwd()
+
+        # Shrink the water example dataset (80 frames) to a 5-frame
+        # subset.  ``dp change-bias`` defaults to enumerating every
+        # frame (``nbatches = min(data.get_nbatches())``), and each
+        # frame's forward pass leaks ~50 MB into torch's allocator; at
+        # 80 frames peak RSS pushes the 7 GB CI runner into OOM.  See
+        # the docstring of ``_make_subset_dataset`` for why we keep
+        # full enumeration (determinism) but shrink the dataset.
+        data_dir = os.path.join(cls.tmpdir, "data")
+        os.makedirs(data_dir, exist_ok=True)
+        _make_subset_dataset(
+            src_system=os.path.join(full_data_dir, "data_0"),
+            dst_system=os.path.join(data_dir, "data_0"),
+            n_frames=5,
+        )
         cls.data_dir = data_dir
         cls.data_file = [os.path.join(data_dir, "data_0")]
 
-        cls.tmpdir = tempfile.mkdtemp()
-        cls.old_cwd = os.getcwd()
         os.chdir(cls.tmpdir)
 
         # Build & train 1-step model