test: save/restore round-trip on a --no-sample-tables dataset

pei-li-hedgehog · pei-li-hedgehog · commit 88245f2daf84 · 2026-05-19T17:29:23.000+02:00
Empirically verifies that the integer-indexed loader's checkpoint /
resume path works on a dataset prepared with --no-sample-tables.
ShardInfosITarReader and SliceState never touch the SQLite samples
tables, so the load-bearing claim of the flag is that training-time
save/restore still produces the same sample sequence. This test
exercises the round-trip:

  1. Reference: an uninterrupted iteration of 20 batches (batch_size=2).
  2. Capture state mid-stream (after 10 batches) via save_state_rank().
  3. Continue iterating to capture the next 10 batches (post_save).
  4. Build a fresh loader, restore_state_rank(state), iterate 10 batches
     (post_restore).
  5. Assert first_half + post_save == reference (no divergence from
     the reference run) and post_restore == post_save (resumed
     iteration matches continued iteration).

Re-prepares the test fixture as CaptioningSample + --no-sample-tables
so get_train_dataset returns decodable samples.

Signed-off-by: Pei Li <pei.li@kaiko.ai>
diff --git a/tests/test_dataset.py b/tests/test_dataset.py
@@ -1882,6 +1882,72 @@ def test_prepare_dataset_no_sample_tables(self):
         finally:
             reader.close()
 
+    def test_prepare_dataset_no_sample_tables_save_restore(self):
+        """Resume after a mid-iteration checkpoint must reach the same samples in the
+        same order on a dataset prepared with ``--no-sample-tables``.
+
+        This is the load-bearing claim of the flag: the integer-indexed loader
+        (``ShardInfosITarReader``) and the savable state (``SliceState``) do not
+        touch the SQLite samples tables, so save/restore must work without them.
+        We re-prepare the fixture with ``--no-sample-tables`` plus a captioning
+        field-map (so ``get_train_dataset`` yields decodable samples), then compare
+        a save/restore round-trip against a reference run.
+        """
+
+        from megatron.energon import get_savable_loader, get_train_dataset
+
+        runner = CliRunner()
+        result = runner.invoke(
+            prepare_command,
+            [
+                str(self.dataset_path),
+                "--non-interactive",
+                "--force-overwrite",
+                "--split-ratio=1,0,0",
+                "--sample-type=CaptioningSample",
+                '--field-map={"image": "png", "caption": "txt"}',
+                "--no-sample-tables",
+            ],
+            catch_exceptions=False,
+        )
+        assert result.exit_code == 0, f"Prepare failed: {result.stdout}"
+
+        def loader_factory():
+            return get_savable_loader(
+                get_train_dataset(
+                    self.dataset_path,
+                    batch_size=2,
+                    worker_config=no_worker_config,
+                    shuffle_buffer_size=20,
+                    max_samples_per_sequence=10,
+                )
+            )
+
+        def keys_from(loader, n):
+            return [tuple(batch.__key__) for _, batch in zip(range(n), loader)]
+
+        # Reference: a single uninterrupted run, used as the ground truth.
+        reference = keys_from(loader_factory(), 20)
+
+        # Capture state mid-stream.
+        loader = loader_factory()
+        first_half = keys_from(loader, 10)
+        state = loader.save_state_rank()
+        post_save = keys_from(loader, 10)
+
+        # Restore into a fresh loader and continue. The resumed sequence must
+        # match what the original loader produced after `save_state_rank()`.
+        resumed = loader_factory()
+        resumed.restore_state_rank(state)
+        post_restore = keys_from(resumed, 10)
+
+        assert first_half + post_save == reference, (
+            f"Uninterrupted iteration diverges from reference: {first_half + post_save} != {reference}"
+        )
+        assert post_restore == post_save, (
+            f"Resume diverged from continued iteration: {post_restore} != {post_save}"
+        )
+
     def test_preview_captioning_dataset(self):
         runner = CliRunner()
         result = runner.invoke(