Skip to content

Commit 80d2f02

Browse files
Fix spec dec example tests (#1183)
### What does this PR do? Type of change: Test fix <!-- Use one of the following: Bug fix, new feature, new example, new tests, documentation. --> - Fix `tests/examples/speculative_decoding` - previously silently skipped - Avoid pulling nemotron-post-training-dataset-v2 in tests to reduce chances of HF loading timeout in CICD - Make slow and redundant tests manual to speed up CICD ### Testing <!-- Mention how have you tested your change if applicable. --> - Tests passing ### Before your PR is "*Ready for review*" Make sure you read and follow [Contributor guidelines](https://github.com/NVIDIA/Model-Optimizer/blob/main/CONTRIBUTING.md) and your commits are signed (`git commit -s -S`). Make sure you read and follow the [Security Best Practices](https://github.com/NVIDIA/Model-Optimizer/blob/main/SECURITY.md#security-coding-practices-for-contributors) (e.g. avoiding hardcoded `trust_remote_code=True`, `torch.load(..., weights_only=False)`, `pickle`, etc.). - Is this change backward compatible?: ✅ <!--- If ❌, explain why. --> - If you copied code from any other sources or added a new PIP dependency, did you follow guidance in `CONTRIBUTING.md`: N/A <!--- Mandatory --> - Did you write any new necessary tests?: ✅ <!--- Mandatory for new features or examples. --> - Did you update [Changelog](https://github.com/NVIDIA/Model-Optimizer/blob/main/CHANGELOG.rst)?: N/A <!--- Only for new features, API changes, critical bug fixes or backward incompatible changes. 
--> <!-- This is an auto-generated comment: release notes by coderabbit.ai --> ## Summary by CodeRabbit * **Chores** * Removed git‑LFS install step from CI and deleted an automated branch‑cleanup workflow * Trimmed example environment dependencies and relaxed transformers compatibility; added an optional tokenization dependency * **Tests** * Switched tests to generate datasets dynamically and improved fixture handling * Standardized PTQ test parameters (explicit calibration dataset) and refined GPU/test selection * **Bug Fixes** * Improved device-awareness and numeric handling in speculative decoding attention paths <!-- end of auto-generated comment: release notes by coderabbit.ai --> --------- Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> Co-authored-by: h-guo18 <67671475+h-guo18@users.noreply.github.com>
1 parent 0d6fdd8 commit 80d2f02

12 files changed

Lines changed: 70 additions & 129 deletions

File tree

.github/workflows/_example_tests_runner.yml

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -47,10 +47,6 @@ jobs:
4747
echo "PATH=${PATH}:/usr/local/tensorrt/targets/x86_64-linux-gnu/bin" >> $GITHUB_ENV
4848
- name: Install dependencies
4949
run: |
50-
# Install git-lfs for Daring-Anteater dataset
51-
apt-get update && apt-get install -y git-lfs
52-
git lfs install --system
53-
5450
# use `python -m pip` instead of `pip` to avoid conflicts with system pip for nemo containers
5551
python -m pip install ".${{ inputs.pip_install_extras }}"
5652

.github/workflows/delete_outdated_pr_branches.yml

Lines changed: 0 additions & 47 deletions
This file was deleted.

examples/llm_eval/requirements.txt

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,5 +2,4 @@ fire>=0.5.0
22
lm_eval[api,ifeval]==0.4.8
33
peft>=0.5.0
44
rwkv>=0.7.3
5-
tiktoken
65
torchvision

examples/llm_ptq/requirements.txt

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,5 @@ compressed-tensors==0.12.0
22
fire
33
flash-attn>=2.6.0
44
rouge_score>=0.1.2
5-
tiktoken
65
transformers_stream_generator
76
zstandard
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
11
accelerate==1.12.0
2-
transformers==5.0.0rc1
2+
transformers<5.4

modelopt/torch/speculative/plugins/transformers.py

Lines changed: 23 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -75,11 +75,6 @@
7575
CACHED_SHARD_TTT_MASKS = {}
7676

7777

78-
def _get_empty_cache(config):
79-
"""Return an empty cache. Handle different versions of transformers for unit tests."""
80-
return DynamicCache(config=config)
81-
82-
8378
@MedusaDMRegistry.register({PreTrainedModel: "hf.PreTrainedModel"})
8479
class HFMedusaModel(MedusaModel):
8580
"""Medusa Model Class for huggingface models."""
@@ -287,9 +282,9 @@ def __init__(self, config, decoder_layer_cls, bias=False):
287282
num_layers=self.config.parallel_draft_heads_num_layers,
288283
)
289284

290-
def _maybe_init_rope(self):
285+
def _maybe_init_rope(self, device=None):
291286
if self.config.eagle_decoder_type == "llama" and not hasattr(self, "rotary_emb"):
292-
self.rotary_emb = LlamaRotaryEmbedding(config=self.config)
287+
self.rotary_emb = LlamaRotaryEmbedding(config=self.config, device=device)
293288

294289
def _expand_first_attn_in_dim(self, first_layer_attn):
295290
"""Modify qkv projection in first layer to accept 2h hidden size."""
@@ -565,12 +560,19 @@ def modify(
565560
elif self.eagle_decoder_type == "kimik2":
566561
decoder_cls = _setup_kimi_k2_decoder()
567562

568-
self.eagle_config = PretrainedConfig.from_dict(config.eagle_architecture_config)
563+
arch_config = config.eagle_architecture_config
564+
565+
# Populate base-model-dependent fields before constructing PretrainedConfig,
566+
# since transformers >=5.4 validates rope_scaling during __init__.
567+
arch_config["hidden_size"] = self._base_llm_config.hidden_size
568+
arch_config["vocab_size"] = self._base_llm_config.vocab_size
569+
arch_config["max_position_embeddings"] = self._base_llm_config.max_position_embeddings
570+
rope_scaling = arch_config.get("rope_scaling")
571+
if rope_scaling and "rope_theta" not in rope_scaling and "rope_theta" in arch_config:
572+
rope_scaling["rope_theta"] = arch_config["rope_theta"]
573+
574+
self.eagle_config = PretrainedConfig.from_dict(arch_config)
569575
self.eagle_config.eagle_decoder_type = self.eagle_decoder_type
570-
# Hidden size and vocab size must match base model
571-
self.eagle_config.hidden_size = self._base_llm_config.hidden_size
572-
self.eagle_config.vocab_size = self._base_llm_config.vocab_size
573-
self.eagle_config.max_position_embeddings = self._base_llm_config.max_position_embeddings
574576
self.eagle_config.draft_vocab_size = getattr(
575577
self.eagle_config, "draft_vocab_size", self.eagle_config.vocab_size
576578
)
@@ -751,7 +753,10 @@ def _compute_ttt_attention_mask(
751753
) -> BlockMask | torch.Tensor:
752754
"""Return TTT attention_mask tensor of type BlockMask or Tensor depends on eagle attn impl."""
753755
msk_func = get_ttt_msk_func(seq_length, ttt_step)
754-
dtypemin = torch.finfo(self._base_llm_config.dtype).min
756+
dtype = (
757+
self._base_llm_config.dtype or self.eagle_module.layers[0].input_layernorm.weight.dtype
758+
)
759+
dtypemin = torch.finfo(dtype).min
755760
q_len = seq_length
756761
kv_len = seq_length * (1 + ttt_step)
757762
if self.eagle_config._attn_implementation == "flex_attention":
@@ -767,7 +772,7 @@ def _compute_ttt_attention_mask(
767772
torch.arange(kv_len).view(1, 1, 1, kv_len),
768773
).to(self.device)
769774
tensor_mask = torch.full_like(
770-
tensor_mask, 0, dtype=self._base_llm_config.dtype, device=self.device
775+
tensor_mask, 0, dtype=dtype, device=self.device
771776
).masked_fill(~tensor_mask, dtypemin)
772777

773778
return tensor_mask
@@ -910,9 +915,9 @@ def forward(
910915
)
911916

912917
if not isinstance(past_key_values, Cache):
913-
past_key_values = _get_empty_cache(self._base_llm_config)
918+
past_key_values = DynamicCache(config=self._base_llm_config)
914919
if not isinstance(eagle_cache, Cache):
915-
eagle_cache = _get_empty_cache(self.eagle_module.config)
920+
eagle_cache = DynamicCache(config=self.eagle_module.config)
916921
past_key_values.eagle_cache = eagle_cache
917922

918923
# ====Prepare inputs for the first eagle forward pass====
@@ -937,7 +942,7 @@ def forward(
937942
base_outputs,
938943
)
939944

940-
self.eagle_module._maybe_init_rope()
945+
self.eagle_module._maybe_init_rope(device=eagle_input_hiddens.device)
941946

942947
# ====Run eagle forward with extra training-time-test steps====
943948
for ttt_step in range(self.eagle_ttt_steps):
@@ -1070,7 +1075,7 @@ def pseudo_speculative_generate(
10701075
else:
10711076
eagle_input_hidden_states = base_model_hidden_states
10721077

1073-
self.eagle_module._maybe_init_rope()
1078+
self.eagle_module._maybe_init_rope(device=eagle_input_hidden_states.device)
10741079
draft_tokens = []
10751080
for step in range(steps):
10761081
b, seq_length = eagle_ids.shape

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,7 @@ hf = [
8282
"peft>=0.17.0",
8383
"sentencepiece>=0.2.1", # Also implicitly used in test_unified_export_megatron, test_vllm_fakequant_megatron_export
8484
"transformers>=4.56,<5.0", # Should match modelopt/torch/__init__.py and tox.ini
85+
"tiktoken",
8586
"wonderwords",
8687
]
8788
dev-lint = [

tests/_test_utils/examples/llm_ptq_utils.py

Lines changed: 5 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -16,12 +16,10 @@
1616
import importlib.metadata as metadata
1717
import subprocess
1818
from dataclasses import asdict, dataclass
19-
from pathlib import Path
2019

2120
import pytest
2221
import torch
23-
24-
PTQ_EXAMPLE_DIR = Path(__file__).parents[3] / "examples" / "llm_ptq"
22+
from _test_utils.examples.run_command import run_llm_ptq_command
2523

2624

2725
@dataclass
@@ -32,6 +30,7 @@ class PTQCommand:
3230
sparsity: str | None = None
3331
kv_cache_quant: str | None = None
3432
trust_remote_code: bool = False
33+
calib_dataset: str = "cnn_dailymail"
3534
calib_batch_size: int | None = None
3635
auto_quantize_bits: float | None = None
3736
tp: int | None = None
@@ -47,37 +46,23 @@ def run(self, model_path: str):
4746
self.min_sm % 10,
4847
):
4948
pytest.skip(reason=f"Requires sm{self.min_sm} or higher")
50-
return
5149

5250
if self.max_sm and torch.cuda.get_device_capability() > (
5351
self.max_sm // 10,
5452
self.max_sm % 10,
5553
):
5654
pytest.skip(reason=f"Requires sm{self.max_sm} or lower")
57-
return
5855

5956
if self.min_gpu and torch.cuda.device_count() < self.min_gpu:
6057
pytest.skip(reason=f"Requires at least {self.min_gpu} GPUs")
61-
return
6258

6359
param_dict = asdict(self)
64-
6560
param_dict.pop("min_sm", None)
61+
param_dict.pop("max_sm", None)
6662
param_dict.pop("min_gpu", None)
6763

68-
trust_remote_code = param_dict.pop("trust_remote_code", False)
69-
70-
args = ["--model", model_path]
71-
for key, value in param_dict.items():
72-
if value is not None:
73-
args.append(f"--{key}")
74-
args.append(f"{value}")
75-
76-
if trust_remote_code:
77-
args.append("--trust_remote_code")
78-
79-
self.command = ["scripts/huggingface_example.sh", "--no-verbose", *args]
80-
subprocess.run(self.command, cwd=PTQ_EXAMPLE_DIR, check=True)
64+
quant = param_dict.pop("quant")
65+
run_llm_ptq_command(model=model_path, quant=quant, **param_dict)
8166

8267
def param_str(self):
8368
param_dict = asdict(self)

tests/examples/llm_ptq/test_llm_ptq.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@ class TestWhisper(WithRequirements):
7171
"command",
7272
[
7373
# Auto-batch-size computation seems to take >10mins for Whisper hence using a fixed batch size
74-
PTQCommand(quant="fp8", calib_batch_size=16, min_sm=89),
74+
PTQCommand(quant="fp8", calib_batch_size=16, calib_dataset="peoples_speech", min_sm=89),
7575
],
7676
ids=PTQCommand.param_str,
7777
)

tests/examples/speculative_decoding/conftest.py

Lines changed: 22 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -13,30 +13,31 @@
1313
# See the License for the specific language governing permissions and
1414
# limitations under the License.
1515

16-
import os
17-
1816
import pytest
19-
from _test_utils.examples.run_command import MODELOPT_ROOT, run_example_command
17+
import yaml
18+
from _test_utils.examples.run_command import run_example_command
2019

2120

2221
@pytest.fixture(scope="session", autouse=True)
2322
def tiny_daring_anteater_path(tmp_path_factory):
24-
dataset_path = (
25-
MODELOPT_ROOT / "examples/speculative_decoding/input_conversations/daring-anteater.jsonl"
23+
tmp_dir = tmp_path_factory.mktemp("daring_anteater")
24+
output_file = tmp_dir / "train.jsonl"
25+
26+
config = {
27+
"outputs": [
28+
{
29+
"filename": str(output_file),
30+
"global_limit": 100,
31+
"sources": [{"name": "daring-anteater", "splits": {"all": 100}}],
32+
}
33+
]
34+
}
35+
config_path = tmp_dir / "data_config.yaml"
36+
config_path.write_text(yaml.dump(config))
37+
38+
run_example_command(
39+
["python", "prepare_input_conversations/make_dataset.py", "-f", str(config_path), "--full"],
40+
"speculative_decoding",
2641
)
27-
if not os.path.exists(dataset_path):
28-
try:
29-
run_example_command(
30-
["python", "prepare_input_conversations/add_daring_anteater.py"],
31-
"speculative_decoding",
32-
)
33-
except Exception as e:
34-
# Ignore rate-limiting errors
35-
pytest.skip(f"Failed to prepare dataset: {e}")
36-
output_path = tmp_path_factory.mktemp("daring_anteater") / "train.jsonl"
37-
with open(dataset_path) as src, open(output_path, "w") as dst:
38-
for i, line in enumerate(src):
39-
if i >= 128:
40-
break
41-
dst.write(line)
42-
return output_path
42+
43+
return output_file

0 commit comments

Comments (0)