Skip to content

Commit 027ee36

Browse files
committed
fix
Signed-off-by: h-guo18 <67671475+h-guo18@users.noreply.github.com>
1 parent adb9927 commit 027ee36

File tree

5 files changed

+23
-20
lines changed

5 files changed

+23
-20
lines changed

examples/speculative_decoding/eagle_utils.py

Lines changed: 0 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -167,21 +167,6 @@ def make_eagle_supervised_data_module(
167167
}
168168

169169

170-
def load_vlm_or_llm_with_kwargs(model_name_or_path: str, **kwargs):
171-
"""Load a VLM or LLM with kwargs. Returns the model and model config."""
172-
model_config = transformers.AutoConfig.from_pretrained(
173-
model_name_or_path, trust_remote_code=True
174-
)
175-
if "vl" in model_config.model_type.lower():
176-
model_cls = transformers.AutoModelForVision2Seq
177-
else:
178-
model_cls = transformers.AutoModelForCausalLM
179-
180-
return model_config, model_cls.from_pretrained(
181-
model_name_or_path, trust_remote_code=True, **kwargs
182-
)
183-
184-
185170
class EagleTrainerWithAccLog(Trainer):
186171
"""Wrapper around Trainer that logs training accuracy."""
187172

examples/speculative_decoding/main.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,6 @@
4040
from eagle_utils import (
4141
EagleTrainerWithAccLog,
4242
EagleTrainingPlot,
43-
load_vlm_or_llm_with_kwargs,
4443
make_eagle_supervised_data_module,
4544
patch_ring_attention_for_ttt,
4645
)
@@ -49,6 +48,7 @@
4948

5049
import modelopt.torch.opt as mto
5150
import modelopt.torch.speculative as mtsp
51+
from modelopt.torch.speculative.utils import load_vlm_or_llm_with_kwargs
5252
from modelopt.torch.utils import print_rank_0
5353

5454
torch.manual_seed(0)

examples/speculative_decoding/scripts/export_hf_checkpoint.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,7 @@
2121

2222
import modelopt.torch.opt as mto
2323
from modelopt.torch.export import export_hf_checkpoint
24-
25-
from ..eagle_utils import load_vlm_or_llm_with_kwargs
24+
from modelopt.torch.speculative.utils import load_vlm_or_llm_with_kwargs
2625

2726

2827
def parse_args():

modelopt/torch/speculative/utils.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525

2626
import torch
2727
import torch.distributed
28+
import transformers
2829
from huggingface_hub import snapshot_download
2930
from torch import nn
3031
from torch.nn.attention import SDPBackend, sdpa_kernel
@@ -471,3 +472,16 @@ def enable_cp_ttt_patch():
471472
yield
472473
finally:
473474
modelopt.torch.speculative.plugins.transformers.ENABLE_CP_TTT_PATCH = False
475+
476+
477+
def load_vlm_or_llm_with_kwargs(model_name_or_path: str, **kwargs):
478+
"""Load a VLM or LLM with kwargs. Returns the model and model config."""
479+
model_config = transformers.AutoConfig.from_pretrained(
480+
model_name_or_path, trust_remote_code=True
481+
)
482+
if "vl" in model_config.model_type.lower():
483+
model_cls = transformers.AutoModelForVision2Seq
484+
else:
485+
model_cls = transformers.AutoModelForCausalLM
486+
487+
return model_config, model_cls.from_pretrained(model_name_or_path, **kwargs)

tests/examples/speculative_decoding/test_eagle.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
# limitations under the License.
1515

1616
import json
17+
import os
1718

1819
import pytest
1920
import safetensors.torch
@@ -54,6 +55,10 @@ def test_calibrate_draft_vocab(tiny_llama_path, tiny_daring_anteater_path, draft
5455
"speculative_decoding",
5556
)
5657

58+
model_name = os.path.basename(os.path.normpath(tiny_llama_path))
59+
d2t = torch.load(os.path.join(draft_vocab_cache_dir, model_name, "d2t.pt"))
60+
assert d2t.shape[0] == 100, f"Expected draft vocab size 100, got {d2t.shape[0]}"
61+
5762

5863
# fmt: off
5964
@pytest.mark.parametrize("cp_size", [1, 2])
@@ -102,8 +107,8 @@ def test_ar_validate(eagle_output_dir):
102107
[
103108
"python", "./scripts/ar_validate.py",
104109
"--model_path", eagle_output_dir / "eagle-tinyllama-cp1",
105-
"--osl", "20",
106-
"--num_samples", "10",
110+
"--osl", "10",
111+
"--num_samples", "5",
107112
"--steps", "3"
108113
],
109114
"speculative_decoding",

0 commit comments

Comments (0)