Skip to content

Commit dd8314b

Browse files
yeyu-nvidia and claude authored
Fix EAGLE3 Nemotron-3-Nano-30B deploy test (NVBugs 6130106) (#1568)
## Summary

- **vLLM fix**: Add EAGLE3 speculative decoding code path in `_deploy_vllm_impl` — loads base model with `speculative_config` instead of treating the unquantized EAGLE3 draft model as a quantized model (which fails looking for nonexistent `quantization_config`)
- **SGLang fix**: Add Nemotron-specific settings (the `mamba_scheduler_strategy="extra_buffer"` kwarg and the `SGLANG_ENABLE_SPEC_V2=1` environment variable) for hybrid Mamba+attention architecture; remove SGLang from Nemotron EAGLE3 test backends since upstream SGLang does not support speculative decoding with NemotronH
- Verified vLLM fix on OCI-HSG cluster (job 3020228) — EAGLE3 speculative decoding generates correct output with TP=4

## Test plan

- [x] vLLM EAGLE3 deploy verified on cluster (4×GPU, Nemotron-3-Nano-30B-A3B)
- [x] SGLang confirmed as upstream limitation (NemotronH `extra_buffer` not supported for speculative decoding)
- [x] Pre-commit checks pass

🤖 Generated with [Claude Code](https://claude.com/claude-code)

<!-- This is an auto-generated comment: release notes by coderabbit.ai -->

## Summary by CodeRabbit

* **Tests**
  * Updated deployment configurations for improved Eagle3 speculative decoding support across vLLM and SGLang.
  * Enhanced quantization method selection logic for model deployments.
  * Added environment configuration for Nemotron model optimizations.
  * Refined backend compatibility settings in test deployment configurations.

<!-- review_stack_entry_start -->

[![Review Change Stack](https://storage.googleapis.com/coderabbit_public_assets/review-stack-in-coderabbit-ui.svg)](https://app.coderabbit.ai/change-stack/NVIDIA/Model-Optimizer/pull/1568?utm_source=github_walkthrough&utm_medium=github&utm_campaign=change_stack)

<!-- review_stack_entry_end -->

<!-- end of auto-generated comment: release notes by coderabbit.ai -->

Signed-off-by: Ye Yu <yeyu@nvidia.com>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
1 parent eb5ed2d commit dd8314b

2 files changed

Lines changed: 43 additions & 22 deletions

File tree

tests/_test_utils/deploy_utils.py

Lines changed: 40 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -309,15 +309,27 @@ def _deploy_vllm_impl(self):
309309
"""Run vLLM deploy (used by subprocess in run())."""
310310
from vllm import LLM, SamplingParams
311311

312-
quantization_method = "modelopt"
313-
if "fp4" in self.model_id.lower():
314-
quantization_method = "modelopt_fp4"
315-
llm = LLM(
316-
model=self.model_id,
317-
quantization=quantization_method,
318-
tensor_parallel_size=self.tensor_parallel_size,
319-
trust_remote_code=True,
320-
)
312+
if "eagle" in self.model_id.lower():
313+
llm = LLM(
314+
model=self.base_model,
315+
speculative_config={
316+
"method": "eagle3",
317+
"model": self.model_id,
318+
"num_speculative_tokens": 3,
319+
},
320+
tensor_parallel_size=self.tensor_parallel_size,
321+
trust_remote_code=True,
322+
)
323+
else:
324+
quantization_method = "modelopt"
325+
if "fp4" in self.model_id.lower():
326+
quantization_method = "modelopt_fp4"
327+
llm = LLM(
328+
model=self.model_id,
329+
quantization=quantization_method,
330+
tensor_parallel_size=self.tensor_parallel_size,
331+
trust_remote_code=True,
332+
)
321333
sampling_params = SamplingParams(temperature=0.8, top_p=0.9)
322334
outputs = llm.generate(COMMON_PROMPTS, sampling_params)
323335

@@ -347,18 +359,25 @@ def _deploy_sglang_impl(self):
347359
if "fp4" in self.model_id.lower():
348360
quantization_method = "modelopt_fp4"
349361
if "eagle" in self.model_id.lower():
350-
llm = sgl.Engine(
351-
model_path=self.base_model,
352-
speculative_algorithm="EAGLE3",
353-
speculative_num_steps=3,
354-
speculative_eagle_topk=1,
355-
speculative_num_draft_tokens=4,
356-
speculative_draft_model_path=self.model_id,
357-
tp_size=self.tensor_parallel_size,
358-
trust_remote_code=True,
359-
mem_fraction_static=0.7,
360-
context_length=1024,
361-
)
362+
eagle_kwargs = {
363+
"model_path": self.base_model,
364+
"speculative_algorithm": "EAGLE3",
365+
"speculative_num_steps": 3,
366+
"speculative_eagle_topk": 1,
367+
"speculative_num_draft_tokens": 4,
368+
"speculative_draft_model_path": self.model_id,
369+
"tp_size": self.tensor_parallel_size,
370+
"trust_remote_code": True,
371+
"mem_fraction_static": 0.7,
372+
"context_length": 1024,
373+
}
374+
# Nemotron hybrid (Mamba+attention) requires extra_buffer scheduler
375+
# strategy and SGLANG_ENABLE_SPEC_V2 for radix cache compatibility
376+
# with speculative decoding
377+
if "nemotron" in self.base_model.lower():
378+
eagle_kwargs["mamba_scheduler_strategy"] = "extra_buffer"
379+
os.environ["SGLANG_ENABLE_SPEC_V2"] = "1"
380+
llm = sgl.Engine(**eagle_kwargs)
362381
elif self.model_id in (
363382
"nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-FP8",
364383
"nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-NVFP4",

tests/examples/llm_ptq/test_deploy.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -640,7 +640,9 @@ def test_medusa(command):
640640
*ModelDeployerList(
641641
base_model="nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16",
642642
model_id="nvidia/EAGLE3-NVIDIA-Nemotron-3-Nano-30B-A3B-BF16",
643-
backend=("trtllm", "vllm", "sglang"),
643+
# SGLang excluded: Nemotron hybrid (Mamba+attention) doesn't support
644+
# speculative decoding in SGLang (NVBugs 6130106)
645+
backend=("trtllm", "vllm"),
644646
eagle3_one_model=False,
645647
tensor_parallel_size=8,
646648
mini_sm=89,

0 commit comments

Comments
 (0)