Skip to content

Commit dddfc6a

Browse files
ChenhanYuclaude
andcommitted
add: synthetic dataset, regression test, Qwen3-0.6B example
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> Signed-off-by: Chenhan Yu <chenhany@nvidia.com>
1 parent f7bb9ad commit dddfc6a

File tree

9 files changed

+1447
-104
lines changed

9 files changed

+1447
-104
lines changed

examples/dataset/README.md

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -219,3 +219,16 @@ python -m modelopt.torch.utils.plugins.megatron_preprocess_data \
219219
--workers 32 \
220220
--reasoning_content inline
221221
```
222+
223+
## Synthetic Test Dataset
224+
225+
`synthetic_conversations_1k.jsonl` is a 1,000-sample dataset in OpenAI messages format
226+
(900 single-turn + 100 two-turn conversations) covering writing, reasoning, math, coding,
227+
STEM, extraction, humanities, and roleplay categories.
228+
229+
This dataset was synthesized by Claude (Anthropic) and is licensed under Apache-2.0.
230+
It is intended for testing and CI regression checks — not for production training.
231+
232+
```json
233+
{"messages": [{"role": "user", "content": "..."}, {"role": "assistant", "content": "..."}]}
234+
```

examples/dataset/synthetic_conversations_1k.jsonl

Lines changed: 1000 additions & 0 deletions
Large diffs are not rendered by default.

tests/examples/speculative_decoding/conftest.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -46,9 +46,10 @@ def tiny_conversations_path(tmp_path_factory):
4646

4747
@pytest.fixture(scope="session", autouse=True)
4848
def tiny_daring_anteater_path():
49-
"""Return path to static test data (100 samples in OpenAI messages format).
49+
"""Return path to synthetic test data in OpenAI messages format.
5050
51-
test_data.jsonl contains 100 synthetic samples (80 single-turn, 20 two-turn)
52-
generated by Claude (Anthropic) for testing purposes. Licensed under Apache-2.0.
51+
Uses examples/dataset/synthetic_conversations_1k.jsonl (1000 samples,
52+
900 single-turn + 100 two-turn). Synthesized by Claude (Anthropic),
53+
Apache-2.0 licensed.
5354
"""
54-
return Path(__file__).parent / "test_data.jsonl"
55+
return Path(__file__).parents[3] / "examples" / "dataset" / "synthetic_conversations_1k.jsonl"

tests/examples/speculative_decoding/test_data.jsonl

Lines changed: 0 additions & 100 deletions
This file was deleted.
Lines changed: 177 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,177 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: Apache-2.0
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
16+
"""DFlash E2E regression tests.
17+
18+
Tests the full DFlash pipeline using Qwen3-0.6B and the synthetic dataset
19+
(examples/dataset/synthetic_conversations_1k.jsonl). Matches the configuration
20+
in tools/launcher/examples/Qwen/Qwen3-0.6B/hf_online_dflash.yaml.
21+
22+
Convergence baseline (from L40 run):
23+
Step 100 (epoch 0.2): loss=6.59 acc=0.079
24+
Step 500 (epoch 1.0): loss=1.78 acc=0.525
25+
Step 1500 (epoch 3.0): loss=1.11 acc=0.595
26+
"""
27+
28+
import json
29+
import os
30+
31+
import pytest
32+
from _test_utils.examples.run_command import MODELOPT_ROOT, run_example_command
33+
34+
DFLASH_YAML = str(
35+
MODELOPT_ROOT / "modelopt_recipes" / "general" / "speculative_decoding" / "dflash.yaml"
36+
)
37+
38+
CHAT_TEMPLATE = str(
39+
MODELOPT_ROOT
40+
/ "tools"
41+
/ "launcher"
42+
/ "examples"
43+
/ "Qwen"
44+
/ "Qwen3-0.6B"
45+
/ "chat_template_train.jinja"
46+
)
47+
48+
SYNTH_DATA = str(MODELOPT_ROOT / "examples" / "dataset" / "synthetic_conversations_1k.jsonl")
49+
50+
# Match tools/launcher/examples/Qwen/Qwen3-0.6B/hf_online_dflash.yaml
51+
_DFLASH_OVERRIDES = [
52+
f"data.data_path={SYNTH_DATA}",
53+
f"data.chat_template={CHAT_TEMPLATE}",
54+
"training.training_seq_len=512",
55+
"training.per_device_train_batch_size=2",
56+
"training.logging_steps=100",
57+
"training.answer_only_loss=true",
58+
"dflash.dflash_block_size=8",
59+
"dflash.dflash_mask_token_id=151669",
60+
"dflash.dflash_use_torch_compile=False",
61+
"dflash.dflash_architecture_config.num_hidden_layers=2",
62+
]
63+
64+
65+
@pytest.fixture(scope="session")
66+
def qwen3_model_name():
67+
"""Qwen3-0.6B model name (downloaded from HF on first use)."""
68+
return "Qwen/Qwen3-0.6B"
69+
70+
71+
@pytest.fixture(scope="session")
72+
def dflash_output_dir(tmp_path_factory):
73+
return tmp_path_factory.mktemp("dflash_output")
74+
75+
76+
def test_dflash_training(qwen3_model_name, dflash_output_dir):
77+
"""Train DFlash on Qwen3-0.6B and validate loss convergence."""
78+
output_dir = str(dflash_output_dir / "dflash-qwen3-0.6b")
79+
overrides = [
80+
f"model.model_name_or_path={qwen3_model_name}",
81+
f"training.output_dir={output_dir}",
82+
"training.num_train_epochs=3",
83+
"training.save_steps=500",
84+
*_DFLASH_OVERRIDES,
85+
]
86+
87+
run_example_command(
88+
["./launch_train.sh", "--config", DFLASH_YAML, *overrides],
89+
"speculative_decoding",
90+
)
91+
92+
# Verify checkpoint was saved
93+
assert os.path.exists(os.path.join(output_dir, "modelopt_state.pth")) or any(
94+
"checkpoint-" in d
95+
for d in os.listdir(output_dir)
96+
if os.path.isdir(os.path.join(output_dir, d))
97+
)
98+
99+
# Regression: verify loss decreased
100+
trainer_state = os.path.join(output_dir, "trainer_state.json")
101+
assert os.path.exists(trainer_state), "trainer_state.json not found"
102+
with open(trainer_state) as f:
103+
state = json.load(f)
104+
logs = [h for h in state.get("log_history", []) if "loss" in h]
105+
assert len(logs) >= 2, f"Expected at least 2 log entries, got {len(logs)}"
106+
107+
first_loss = float(logs[0]["loss"])
108+
final_loss = float(logs[-1]["loss"])
109+
assert final_loss < first_loss, f"Loss did not decrease: {first_loss:.3f} -> {final_loss:.3f}"
110+
# Sanity: final loss should be reasonable (baseline: ~1.1 on L40)
111+
assert final_loss < 3.0, f"Final loss {final_loss:.3f} too high (expected < 3.0)"
112+
113+
114+
def test_dflash_resume(qwen3_model_name, dflash_output_dir):
115+
"""Resume DFlash training from checkpoint."""
116+
output_dir = str(dflash_output_dir / "dflash-qwen3-0.6b")
117+
overrides = [
118+
f"model.model_name_or_path={qwen3_model_name}",
119+
f"training.output_dir={output_dir}",
120+
"training.num_train_epochs=4",
121+
"training.save_steps=5000",
122+
*_DFLASH_OVERRIDES,
123+
]
124+
125+
run_example_command(
126+
["./launch_train.sh", "--config", DFLASH_YAML, *overrides],
127+
"speculative_decoding",
128+
)
129+
130+
131+
def test_dflash_export(dflash_output_dir):
132+
"""Export DFlash checkpoint to deployment format."""
133+
output_dir = str(dflash_output_dir / "dflash-qwen3-0.6b")
134+
export_dir = str(dflash_output_dir / "dflash-export")
135+
136+
run_example_command(
137+
[
138+
"python",
139+
"./scripts/export_hf_checkpoint.py",
140+
"--model_path",
141+
output_dir,
142+
"--export_path",
143+
export_dir,
144+
],
145+
"speculative_decoding",
146+
)
147+
148+
assert os.path.exists(os.path.join(export_dir, "model.safetensors"))
149+
assert os.path.exists(os.path.join(export_dir, "config.json"))
150+
151+
with open(os.path.join(export_dir, "config.json")) as f:
152+
config = json.load(f)
153+
assert config["architectures"] == ["DFlashDraftModel"]
154+
assert config["model_type"] == "qwen3"
155+
assert "dflash_config" in config
156+
assert "block_size" in config
157+
158+
159+
def test_dflash_ar_validate(dflash_output_dir):
160+
"""AR validation on trained DFlash checkpoint."""
161+
output_dir = str(dflash_output_dir / "dflash-qwen3-0.6b")
162+
163+
run_example_command(
164+
[
165+
"python",
166+
"./scripts/ar_validate.py",
167+
"--model_path",
168+
output_dir,
169+
"--osl",
170+
"10",
171+
"--num_samples",
172+
"3",
173+
"--steps",
174+
"7",
175+
],
176+
"speculative_decoding",
177+
)
Lines changed: 162 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,162 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: Apache-2.0
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
16+
"""Regression check for training jobs.
17+
18+
Reads trainer_state.json from a HuggingFace Trainer checkpoint and validates
19+
that final metrics meet specified thresholds. Used by training scripts to
20+
catch regressions in CI.
21+
22+
Environment variables (all optional — no check if unset):
23+
MAX_FINAL_LOSS: Final loss must be below this value
24+
MIN_FINAL_ACC: Final accuracy must be above this value (any key containing 'acc')
25+
MAX_FINAL_PERPLEXITY: Final perplexity must be below this value
26+
27+
Usage:
28+
python check_regression.py /path/to/output_dir
29+
30+
Or from a shell script:
31+
python common/check_regression.py ${OUTPUT_DIR}
32+
33+
Exit codes:
34+
0 — all checks pass (or no thresholds set)
35+
1 — regression detected
36+
"""
37+
38+
import json
39+
import os
40+
import sys
41+
from glob import glob
42+
43+
44+
def find_trainer_state(output_dir):
45+
"""Find the latest trainer_state.json in the output directory."""
46+
# Check checkpoint subdirs first (sorted by step number)
47+
checkpoint_states = sorted(glob(os.path.join(output_dir, "checkpoint-*", "trainer_state.json")))
48+
if checkpoint_states:
49+
return checkpoint_states[-1]
50+
# Fall back to output_dir itself
51+
direct = os.path.join(output_dir, "trainer_state.json")
52+
if os.path.exists(direct):
53+
return direct
54+
return None
55+
56+
57+
def get_final_metrics(trainer_state_path):
58+
"""Extract final loss and accuracy from trainer_state.json."""
59+
with open(trainer_state_path) as f:
60+
state = json.load(f)
61+
62+
logs = [h for h in state.get("log_history", []) if "loss" in h]
63+
if not logs:
64+
return {}
65+
66+
last = logs[-1]
67+
metrics = {"loss": float(last["loss"])}
68+
69+
# Find any accuracy key (train_acc/parallel_0_step_0, eval_accuracy, etc.)
70+
for key, value in last.items():
71+
if "acc" in key.lower():
72+
metrics["accuracy"] = float(value)
73+
break
74+
75+
# Perplexity if available
76+
if "perplexity" in last:
77+
metrics["perplexity"] = float(last["perplexity"])
78+
79+
return metrics
80+
81+
82+
def check_regression(metrics):
83+
"""Check metrics against environment variable thresholds. Returns (passed, messages)."""
84+
checks = [
85+
(
86+
"MAX_FINAL_LOSS",
87+
"loss",
88+
lambda val, thresh: val <= thresh,
89+
"loss {val:.3f} > threshold {thresh}",
90+
),
91+
(
92+
"MIN_FINAL_ACC",
93+
"accuracy",
94+
lambda val, thresh: val >= thresh,
95+
"acc {val:.3f} < threshold {thresh}",
96+
),
97+
(
98+
"MAX_FINAL_PERPLEXITY",
99+
"perplexity",
100+
lambda val, thresh: val <= thresh,
101+
"perplexity {val:.3f} > threshold {thresh}",
102+
),
103+
]
104+
105+
passed = True
106+
messages = []
107+
108+
for env_var, metric_key, check_fn, fail_msg in checks:
109+
thresh_str = os.environ.get(env_var)
110+
if thresh_str is None:
111+
continue
112+
thresh = float(thresh_str)
113+
val = metrics.get(metric_key)
114+
if val is None:
115+
messages.append(f"WARNING: {env_var} set but '{metric_key}' not found in metrics")
116+
continue
117+
if check_fn(val, thresh):
118+
messages.append(f"PASS: {metric_key}={val:.3f} (threshold: {env_var}={thresh})")
119+
else:
120+
messages.append(f"REGRESSION: {fail_msg.format(val=val, thresh=thresh)}")
121+
passed = False
122+
123+
return passed, messages
124+
125+
126+
def main():
127+
"""Entry point for regression check CLI."""
128+
if len(sys.argv) < 2:
129+
print("Usage: python check_regression.py <output_dir>")
130+
sys.exit(0)
131+
132+
output_dir = sys.argv[1]
133+
134+
# Skip if no thresholds set
135+
if not any(
136+
os.environ.get(v) for v in ["MAX_FINAL_LOSS", "MIN_FINAL_ACC", "MAX_FINAL_PERPLEXITY"]
137+
):
138+
return
139+
140+
trainer_state = find_trainer_state(output_dir)
141+
if not trainer_state:
142+
print(f"WARNING: No trainer_state.json found in {output_dir}, skipping regression check")
143+
return
144+
145+
print(f"=== Regression Check ({trainer_state}) ===")
146+
metrics = get_final_metrics(trainer_state)
147+
if not metrics:
148+
print("No training logs found in trainer_state.json")
149+
return
150+
151+
print(f"Final metrics: {metrics}")
152+
passed, messages = check_regression(metrics)
153+
for msg in messages:
154+
print(f" {msg}")
155+
156+
if not passed:
157+
sys.exit(1)
158+
print("Regression check PASSED")
159+
160+
161+
if __name__ == "__main__":
162+
main()

tools/launcher/common/specdec/dflash_online_training.sh

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -142,4 +142,9 @@ for arg in sys.argv[1:]:
142142
echo "No new checkpoints to export in ${OUTPUT_DIR}"
143143
fi
144144
fi
145+
146+
# Regression check (uses env vars MAX_FINAL_LOSS, MIN_FINAL_ACC, etc.)
147+
if [ -n "$OUTPUT_DIR" ]; then
148+
python3 common/check_regression.py "${OUTPUT_DIR}" || true
149+
fi
145150
fi

0 commit comments

Comments
 (0)