Skip to content

Commit 80d2f02

Browse files
Fix spec dec example tests (#1183)
### What does this PR do? Type of change: Test fix <!-- Use one of the following: Bug fix, new feature, new example, new tests, documentation. --> - Fix `tests/examples/speculative_decoding` - previously silently skipped - Avoid pulling nemotron-post-training-dataset-v2 in tests to reduce chances of HF loading timeout in CICD - Make slow and redundant tests manual to speed up CICD ### Testing <!-- Mention how have you tested your change if applicable. --> - Tests passing ### Before your PR is "*Ready for review*" Make sure you read and follow [Contributor guidelines](https://github.com/NVIDIA/Model-Optimizer/blob/main/CONTRIBUTING.md) and your commits are signed (`git commit -s -S`). Make sure you read and follow the [Security Best Practices](https://github.com/NVIDIA/Model-Optimizer/blob/main/SECURITY.md#security-coding-practices-for-contributors) (e.g. avoiding hardcoded `trust_remote_code=True`, `torch.load(..., weights_only=False)`, `pickle`, etc.). - Is this change backward compatible?: ✅ <!--- If ❌, explain why. --> - If you copied code from any other sources or added a new PIP dependency, did you follow guidance in `CONTRIBUTING.md`: N/A <!--- Mandatory --> - Did you write any new necessary tests?: ✅ <!--- Mandatory for new features or examples. --> - Did you update [Changelog](https://github.com/NVIDIA/Model-Optimizer/blob/main/CHANGELOG.rst)?: N/A <!--- Only for new features, API changes, critical bug fixes or backward incompatible changes. 
--> <!-- This is an auto-generated comment: release notes by coderabbit.ai --> ## Summary by CodeRabbit * **Chores** * Removed git‑LFS install step from CI and deleted an automated branch‑cleanup workflow * Trimmed example environment dependencies and relaxed transformers compatibility; added an optional tokenization dependency * **Tests** * Switched tests to generate datasets dynamically and improved fixture handling * Standardized PTQ test parameters (explicit calibration dataset) and refined GPU/test selection * **Bug Fixes** * Improved device-awareness and numeric handling in speculative decoding attention paths <!-- end of auto-generated comment: release notes by coderabbit.ai --> --------- Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> Co-authored-by: h-guo18 <67671475+h-guo18@users.noreply.github.com>
1 parent 0d6fdd8 commit 80d2f02

12 files changed

Lines changed: 70 additions & 129 deletions

File tree

.github/workflows/_example_tests_runner.yml

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -47,10 +47,6 @@ jobs:
4747
echo "PATH=${PATH}:/usr/local/tensorrt/targets/x86_64-linux-gnu/bin" >> $GITHUB_ENV
4848
- name: Install dependencies
4949
run: |
50-
# Install git-lfs for Daring-Anteater dataset
51-
apt-get update && apt-get install -y git-lfs
52-
git lfs install --system
53-
5450
# use `python -m pip` instead of `pip` to avoid conflicts with system pip for nemo containers
5551
python -m pip install ".${{ inputs.pip_install_extras }}"
5652

.github/workflows/delete_outdated_pr_branches.yml

Lines changed: 0 additions & 47 deletions
This file was deleted.

examples/llm_eval/requirements.txt

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,5 +2,4 @@ fire>=0.5.0
22
lm_eval[api,ifeval]==0.4.8
33
peft>=0.5.0
44
rwkv>=0.7.3
5-
tiktoken
65
torchvision

examples/llm_ptq/requirements.txt

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,5 @@ compressed-tensors==0.12.0
22
fire
33
flash-attn>=2.6.0
44
rouge_score>=0.1.2
5-
tiktoken
65
transformers_stream_generator
76
zstandard
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
11
accelerate==1.12.0
2-
transformers==5.0.0rc1
2+
transformers<5.4

modelopt/torch/speculative/plugins/transformers.py

Lines changed: 23 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -75,11 +75,6 @@
7575
CACHED_SHARD_TTT_MASKS = {}
7676

7777

78-
def _get_empty_cache(config):
79-
"""Return an empty cache. Handle different versions of transformers for unit tests."""
80-
return DynamicCache(config=config)
81-
82-
8378
@MedusaDMRegistry.register({PreTrainedModel: "hf.PreTrainedModel"})
8479
class HFMedusaModel(MedusaModel):
8580
"""Medusa Model Class for huggingface models."""
@@ -287,9 +282,9 @@ def __init__(self, config, decoder_layer_cls, bias=False):
287282
num_layers=self.config.parallel_draft_heads_num_layers,
288283
)
289284

290-
def _maybe_init_rope(self):
285+
def _maybe_init_rope(self, device=None):
291286
if self.config.eagle_decoder_type == "llama" and not hasattr(self, "rotary_emb"):
292-
self.rotary_emb = LlamaRotaryEmbedding(config=self.config)
287+
self.rotary_emb = LlamaRotaryEmbedding(config=self.config, device=device)
293288

294289
def _expand_first_attn_in_dim(self, first_layer_attn):
295290
"""Modify qkv projection in first layer to accept 2h hidden size."""
@@ -565,12 +560,19 @@ def modify(
565560
elif self.eagle_decoder_type == "kimik2":
566561
decoder_cls = _setup_kimi_k2_decoder()
567562

568-
self.eagle_config = PretrainedConfig.from_dict(config.eagle_architecture_config)
563+
arch_config = config.eagle_architecture_config
564+
565+
# Populate base-model-dependent fields before constructing PretrainedConfig,
566+
# since transformers >=5.4 validates rope_scaling during __init__.
567+
arch_config["hidden_size"] = self._base_llm_config.hidden_size
568+
arch_config["vocab_size"] = self._base_llm_config.vocab_size
569+
arch_config["max_position_embeddings"] = self._base_llm_config.max_position_embeddings
570+
rope_scaling = arch_config.get("rope_scaling")
571+
if rope_scaling and "rope_theta" not in rope_scaling and "rope_theta" in arch_config:
572+
rope_scaling["rope_theta"] = arch_config["rope_theta"]
573+
574+
self.eagle_config = PretrainedConfig.from_dict(arch_config)
569575
self.eagle_config.eagle_decoder_type = self.eagle_decoder_type
570-
# Hidden size and vocab size must match base model
571-
self.eagle_config.hidden_size = self._base_llm_config.hidden_size
572-
self.eagle_config.vocab_size = self._base_llm_config.vocab_size
573-
self.eagle_config.max_position_embeddings = self._base_llm_config.max_position_embeddings
574576
self.eagle_config.draft_vocab_size = getattr(
575577
self.eagle_config, "draft_vocab_size", self.eagle_config.vocab_size
576578
)
@@ -751,7 +753,10 @@ def _compute_ttt_attention_mask(
751753
) -> BlockMask | torch.Tensor:
752754
"""Return TTT attention_mask tensor of type BlockMask or Tensor depends on eagle attn impl."""
753755
msk_func = get_ttt_msk_func(seq_length, ttt_step)
754-
dtypemin = torch.finfo(self._base_llm_config.dtype).min
756+
dtype = (
757+
self._base_llm_config.dtype or self.eagle_module.layers[0].input_layernorm.weight.dtype
758+
)
759+
dtypemin = torch.finfo(dtype).min
755760
q_len = seq_length
756761
kv_len = seq_length * (1 + ttt_step)
757762
if self.eagle_config._attn_implementation == "flex_attention":
@@ -767,7 +772,7 @@ def _compute_ttt_attention_mask(
767772
torch.arange(kv_len).view(1, 1, 1, kv_len),
768773
).to(self.device)
769774
tensor_mask = torch.full_like(
770-
tensor_mask, 0, dtype=self._base_llm_config.dtype, device=self.device
775+
tensor_mask, 0, dtype=dtype, device=self.device
771776
).masked_fill(~tensor_mask, dtypemin)
772777

773778
return tensor_mask
@@ -910,9 +915,9 @@ def forward(
910915
)
911916

912917
if not isinstance(past_key_values, Cache):
913-
past_key_values = _get_empty_cache(self._base_llm_config)
918+
past_key_values = DynamicCache(config=self._base_llm_config)
914919
if not isinstance(eagle_cache, Cache):
915-
eagle_cache = _get_empty_cache(self.eagle_module.config)
920+
eagle_cache = DynamicCache(config=self.eagle_module.config)
916921
past_key_values.eagle_cache = eagle_cache
917922

918923
# ====Prepare inputs for the first eagle forward pass====
@@ -937,7 +942,7 @@ def forward(
937942
base_outputs,
938943
)
939944

940-
self.eagle_module._maybe_init_rope()
945+
self.eagle_module._maybe_init_rope(device=eagle_input_hiddens.device)
941946

942947
# ====Run eagle forward with extra training-time-test steps====
943948
for ttt_step in range(self.eagle_ttt_steps):
@@ -1070,7 +1075,7 @@ def pseudo_speculative_generate(
10701075
else:
10711076
eagle_input_hidden_states = base_model_hidden_states
10721077

1073-
self.eagle_module._maybe_init_rope()
1078+
self.eagle_module._maybe_init_rope(device=eagle_input_hidden_states.device)
10741079
draft_tokens = []
10751080
for step in range(steps):
10761081
b, seq_length = eagle_ids.shape

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,7 @@ hf = [
8282
"peft>=0.17.0",
8383
"sentencepiece>=0.2.1", # Also implicitly used in test_unified_export_megatron, test_vllm_fakequant_megatron_export
8484
"transformers>=4.56,<5.0", # Should match modelopt/torch/__init__.py and tox.ini
85+
"tiktoken",
8586
"wonderwords",
8687
]
8788
dev-lint = [

tests/_test_utils/examples/llm_ptq_utils.py

Lines changed: 5 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -16,12 +16,10 @@
1616
import importlib.metadata as metadata
1717
import subprocess
1818
from dataclasses import asdict, dataclass
19-
from pathlib import Path
2019

2120
import pytest
2221
import torch
23-
24-
PTQ_EXAMPLE_DIR = Path(__file__).parents[3] / "examples" / "llm_ptq"
22+
from _test_utils.examples.run_command import run_llm_ptq_command
2523

2624

2725
@dataclass
@@ -32,6 +30,7 @@ class PTQCommand:
3230
sparsity: str | None = None
3331
kv_cache_quant: str | None = None
3432
trust_remote_code: bool = False
33+
calib_dataset: str = "cnn_dailymail"
3534
calib_batch_size: int | None = None
3635
auto_quantize_bits: float | None = None
3736
tp: int | None = None
@@ -47,37 +46,23 @@ def run(self, model_path: str):
4746
self.min_sm % 10,
4847
):
4948
pytest.skip(reason=f"Requires sm{self.min_sm} or higher")
50-
return
5149

5250
if self.max_sm and torch.cuda.get_device_capability() > (
5351
self.max_sm // 10,
5452
self.max_sm % 10,
5553
):
5654
pytest.skip(reason=f"Requires sm{self.max_sm} or lower")
57-
return
5855

5956
if self.min_gpu and torch.cuda.device_count() < self.min_gpu:
6057
pytest.skip(reason=f"Requires at least {self.min_gpu} GPUs")
61-
return
6258

6359
param_dict = asdict(self)
64-
6560
param_dict.pop("min_sm", None)
61+
param_dict.pop("max_sm", None)
6662
param_dict.pop("min_gpu", None)
6763

68-
trust_remote_code = param_dict.pop("trust_remote_code", False)
69-
70-
args = ["--model", model_path]
71-
for key, value in param_dict.items():
72-
if value is not None:
73-
args.append(f"--{key}")
74-
args.append(f"{value}")
75-
76-
if trust_remote_code:
77-
args.append("--trust_remote_code")
78-
79-
self.command = ["scripts/huggingface_example.sh", "--no-verbose", *args]
80-
subprocess.run(self.command, cwd=PTQ_EXAMPLE_DIR, check=True)
64+
quant = param_dict.pop("quant")
65+
run_llm_ptq_command(model=model_path, quant=quant, **param_dict)
8166

8267
def param_str(self):
8368
param_dict = asdict(self)

tests/examples/llm_ptq/test_llm_ptq.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@ class TestWhisper(WithRequirements):
7171
"command",
7272
[
7373
# Auto-batch-size computation seems to take >10mins for Whisper hence using a fixed batch size
74-
PTQCommand(quant="fp8", calib_batch_size=16, min_sm=89),
74+
PTQCommand(quant="fp8", calib_batch_size=16, calib_dataset="peoples_speech", min_sm=89),
7575
],
7676
ids=PTQCommand.param_str,
7777
)

tests/examples/speculative_decoding/conftest.py

Lines changed: 22 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -13,30 +13,31 @@
1313
# See the License for the specific language governing permissions and
1414
# limitations under the License.
1515

16-
import os
17-
1816
import pytest
19-
from _test_utils.examples.run_command import MODELOPT_ROOT, run_example_command
17+
import yaml
18+
from _test_utils.examples.run_command import run_example_command
2019

2120

2221
@pytest.fixture(scope="session", autouse=True)
2322
def tiny_daring_anteater_path(tmp_path_factory):
24-
dataset_path = (
25-
MODELOPT_ROOT / "examples/speculative_decoding/input_conversations/daring-anteater.jsonl"
23+
tmp_dir = tmp_path_factory.mktemp("daring_anteater")
24+
output_file = tmp_dir / "train.jsonl"
25+
26+
config = {
27+
"outputs": [
28+
{
29+
"filename": str(output_file),
30+
"global_limit": 100,
31+
"sources": [{"name": "daring-anteater", "splits": {"all": 100}}],
32+
}
33+
]
34+
}
35+
config_path = tmp_dir / "data_config.yaml"
36+
config_path.write_text(yaml.dump(config))
37+
38+
run_example_command(
39+
["python", "prepare_input_conversations/make_dataset.py", "-f", str(config_path), "--full"],
40+
"speculative_decoding",
2641
)
27-
if not os.path.exists(dataset_path):
28-
try:
29-
run_example_command(
30-
["python", "prepare_input_conversations/add_daring_anteater.py"],
31-
"speculative_decoding",
32-
)
33-
except Exception as e:
34-
# Ignore rate-limiting errors
35-
pytest.skip(f"Failed to prepare dataset: {e}")
36-
output_path = tmp_path_factory.mktemp("daring_anteater") / "train.jsonl"
37-
with open(dataset_path) as src, open(output_path, "w") as dst:
38-
for i, line in enumerate(src):
39-
if i >= 128:
40-
break
41-
dst.write(line)
42-
return output_path
42+
43+
return output_file

0 commit comments

Comments (0)