From dacf606872eacacb62323d6568e1d0739eecf87c Mon Sep 17 00:00:00 2001 From: Han Wang Date: Wed, 27 May 2026 16:15:25 +0800 Subject: [PATCH 1/2] test(pt_expt): shrink change-bias water dataset to 5 frames MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ``TestChangeBias`` was the dominant memory hog in the ``Test Python`` shard ``(10, 3.13)`` of the CI matrix — by itself it peaked at ~5 GB RSS, leaving so little headroom under the 7 GB GitHub-hosted runner ceiling that the shard sporadically lost communication with the GitHub Actions server (intermittent ``runner lost communication`` flake observed across many recent PRs). Profile finding: peak RSS scales **linearly at ~50 MB per frame** during ``dp change-bias``'s in-process ``main(cmds)`` call. The forward over ``compute_output_stats`` enumerates ``nbatches = min(data.get_nbatches()) = 80`` frames of the water example, and each frame leaks ~50 MB into torch's caching allocator (not autograd — the wrapper is already in ``torch.no_grad()``; the leak is in ``forward_common_atomic`` somewhere and is a separate bug). Constraint: we **must** keep ``nbatches == total dataset frames`` to preserve determinism for ``test_change_bias_pt2_pte_consistency`` which compares two .pte and .pt2 invocations with ``atol=1e-10``. ``_load_batch_set`` shuffles the loaded set, so a value of ``nbatches < total_frames`` would sample a random subset and the two calls (running in the same Python process with an advancing ``dp_random`` state) would see different frames and produce different biases. Full enumeration sees every frame and so the aggregate bias is invariant under shuffle. Solution: build a 5-frame subset of ``examples/water/data/data_0`` in ``TestChangeBias.setUpClass`` and point both the trainer config and the change-bias ``-s`` argument at it. 
``nbatches`` then resolves to 5 (= the new dataset size, = full enumeration), peak RSS drops to ~1.7 GB for the whole class, and all 9 tests in the class (including the strict atol=1e-10 consistency check) still pass. Class wall time also improves (~3:55 → less data-loop work in each change-bias invocation). --- source/tests/pt_expt/test_change_bias.py | 61 ++++++++++++++++++++++-- 1 file changed, 56 insertions(+), 5 deletions(-) diff --git a/source/tests/pt_expt/test_change_bias.py b/source/tests/pt_expt/test_change_bias.py index e3749671aa..bce2d171f5 100644 --- a/source/tests/pt_expt/test_change_bias.py +++ b/source/tests/pt_expt/test_change_bias.py @@ -118,19 +118,70 @@ def _make_config(data_dir: str) -> dict: } +def _make_subset_dataset(src_system: str, dst_system: str, n_frames: int) -> None: + """Copy ``type{,_map}.raw`` and the first ``n_frames`` of every ``.npy`` + in ``set.000`` from ``src_system`` to ``dst_system``. + + Used by ``TestChangeBias`` to shrink the water/data_0 example (80 + frames) down to a tiny subset so that ``dp change-bias`` enumerates + over only ``n_frames`` frames. Why this matters: the in-process + ``main(cmds)`` path runs the model forward over ``nbatches`` frames + via ``compute_output_stats``, and each frame leaks ~50 MB into + torch's caching allocator. At ``n_frames=80`` (the default, + ``min(data.get_nbatches()) = 80``) peak RSS hits ~5 GB which OOMs + the 7 GB GitHub-hosted CI runner. Shrinking to ``n_frames=5`` keeps + peak at ~800 MB while preserving **determinism**: the test + ``test_change_bias_pt2_pte_consistency`` asserts ``atol=1e-10`` + between two .pte and .pt2 calls in the same process, which requires + every frame to be seen on each call regardless of the + shuffle-based ``_load_batch_set`` order. ``nbatches == total + frames`` makes the forward enumerate every frame and so the + aggregate bias is invariant under shuffle. 
+ """ + import numpy as np + + src_set = os.path.join(src_system, "set.000") + dst_set = os.path.join(dst_system, "set.000") + os.makedirs(dst_set, exist_ok=True) + for raw in ("type.raw", "type_map.raw"): + src = os.path.join(src_system, raw) + if os.path.isfile(src): + shutil.copyfile(src, os.path.join(dst_system, raw)) + for fname in os.listdir(src_set): + if not fname.endswith(".npy"): + continue + arr = np.load(os.path.join(src_set, fname)) + np.save(os.path.join(dst_set, fname), arr[:n_frames]) + + class TestChangeBias(unittest.TestCase): """Test dp change-bias for the pt_expt backend.""" @classmethod def setUpClass(cls) -> None: - data_dir = os.path.join(EXAMPLE_DIR, "data") - if not os.path.isdir(data_dir): - raise unittest.SkipTest(f"Example data not found: {data_dir}") + full_data_dir = os.path.join(EXAMPLE_DIR, "data") + if not os.path.isdir(full_data_dir): + raise unittest.SkipTest(f"Example data not found: {full_data_dir}") + cls.tmpdir = tempfile.mkdtemp() + cls.old_cwd = os.getcwd() + + # Shrink the water example dataset (80 frames) to a 5-frame + # subset. ``dp change-bias`` defaults to enumerating every + # frame (``nbatches = min(data.get_nbatches())``), and each + # frame's forward pass leaks ~50 MB into torch's allocator; at + # 80 frames peak RSS pushes the 7 GB CI runner into OOM. See + # the docstring of ``_make_subset_dataset`` for why we keep + # full enumeration (determinism) but shrink the dataset. 
+ data_dir = os.path.join(cls.tmpdir, "data") + os.makedirs(data_dir, exist_ok=True) + _make_subset_dataset( + src_system=os.path.join(full_data_dir, "data_0"), + dst_system=os.path.join(data_dir, "data_0"), + n_frames=5, + ) cls.data_dir = data_dir cls.data_file = [os.path.join(data_dir, "data_0")] - cls.tmpdir = tempfile.mkdtemp() - cls.old_cwd = os.getcwd() os.chdir(cls.tmpdir) # Build & train 1-step model From 9706edeb7e0143180d68e3fcdc674de426e7c9ce Mon Sep 17 00:00:00 2001 From: Han Wang Date: Wed, 27 May 2026 18:06:25 +0800 Subject: [PATCH 2/2] test(pt_expt): drop redundant in-function numpy import numpy is already imported at module level; the in-function import in _make_subset_dataset triggers ruff PLC0415. --- source/tests/pt_expt/test_change_bias.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/source/tests/pt_expt/test_change_bias.py b/source/tests/pt_expt/test_change_bias.py index bce2d171f5..2f441f12bd 100644 --- a/source/tests/pt_expt/test_change_bias.py +++ b/source/tests/pt_expt/test_change_bias.py @@ -138,8 +138,6 @@ def _make_subset_dataset(src_system: str, dst_system: str, n_frames: int) -> Non frames`` makes the forward enumerate every frame and so the aggregate bias is invariant under shuffle. """ - import numpy as np - src_set = os.path.join(src_system, "set.000") dst_set = os.path.join(dst_system, "set.000") os.makedirs(dst_set, exist_ok=True)