Fix EAGLE3 Nemotron-3-Nano-30B deploy test (NVBugs 6130106) (#1568)

yeyu-nvidia · claude · web-flow · commit dd8314b02cff · 2026-05-29T18:45:54.000Z
## Summary

- **vLLM fix**: Add EAGLE3 speculative decoding code path in `_deploy_vllm_impl` — loads the base model with `speculative_config` instead of treating the unquantized EAGLE3 draft model as a quantized model (which fails looking for a nonexistent `quantization_config`)
- **SGLang fix**: Add Nemotron-specific settings (engine kwarg `mamba_scheduler_strategy="extra_buffer"`, environment variable `SGLANG_ENABLE_SPEC_V2=1`) for the hybrid Mamba+attention architecture; remove SGLang from the Nemotron EAGLE3 test backends since upstream SGLang does not support speculative decoding with NemotronH
- Verified vLLM fix on OCI-HSG cluster (job 3020228) — EAGLE3 speculative decoding generates correct output with TP=4

## Test plan

- [x] vLLM EAGLE3 deploy verified on cluster (4×GPU, Nemotron-3-Nano-30B-A3B)
- [x] SGLang confirmed as upstream limitation (NemotronH `extra_buffer` not supported for speculative decoding)
- [x] Pre-commit checks pass

🤖 Generated with [Claude Code](https://claude.com/claude-code)

## Summary by CodeRabbit

* **Tests**
  * Updated deployment configurations for improved Eagle3 speculative decoding support across vLLM and SGLang.
  * Enhanced quantization method selection logic for model deployments.
  * Added environment configuration for Nemotron model optimizations.
  * Refined backend compatibility settings in test deployment configurations.

[![Review Change Stack](https://storage.googleapis.com/coderabbit_public_assets/review-stack-in-coderabbit-ui.svg)](https://app.coderabbit.ai/change-stack/NVIDIA/Model-Optimizer/pull/1568?utm_source=github_walkthrough&utm_medium=github&utm_campaign=change_stack)

Signed-off-by: Ye Yu <yeyu@nvidia.com>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
diff --git a/tests/_test_utils/deploy_utils.py b/tests/_test_utils/deploy_utils.py
@@ -309,15 +309,27 @@ def _deploy_vllm_impl(self):
         """Run vLLM deploy (used by subprocess in run())."""
         from vllm import LLM, SamplingParams
 
-        quantization_method = "modelopt"
-        if "fp4" in self.model_id.lower():
-            quantization_method = "modelopt_fp4"
-        llm = LLM(
-            model=self.model_id,
-            quantization=quantization_method,
-            tensor_parallel_size=self.tensor_parallel_size,
-            trust_remote_code=True,
-        )
+        if "eagle" in self.model_id.lower():
+            llm = LLM(
+                model=self.base_model,
+                speculative_config={
+                    "method": "eagle3",
+                    "model": self.model_id,
+                    "num_speculative_tokens": 3,
+                },
+                tensor_parallel_size=self.tensor_parallel_size,
+                trust_remote_code=True,
+            )
+        else:
+            quantization_method = "modelopt"
+            if "fp4" in self.model_id.lower():
+                quantization_method = "modelopt_fp4"
+            llm = LLM(
+                model=self.model_id,
+                quantization=quantization_method,
+                tensor_parallel_size=self.tensor_parallel_size,
+                trust_remote_code=True,
+            )
         sampling_params = SamplingParams(temperature=0.8, top_p=0.9)
         outputs = llm.generate(COMMON_PROMPTS, sampling_params)
 
@@ -347,18 +359,25 @@ def _deploy_sglang_impl(self):
         if "fp4" in self.model_id.lower():
             quantization_method = "modelopt_fp4"
         if "eagle" in self.model_id.lower():
-            llm = sgl.Engine(
-                model_path=self.base_model,
-                speculative_algorithm="EAGLE3",
-                speculative_num_steps=3,
-                speculative_eagle_topk=1,
-                speculative_num_draft_tokens=4,
-                speculative_draft_model_path=self.model_id,
-                tp_size=self.tensor_parallel_size,
-                trust_remote_code=True,
-                mem_fraction_static=0.7,
-                context_length=1024,
-            )
+            eagle_kwargs = {
+                "model_path": self.base_model,
+                "speculative_algorithm": "EAGLE3",
+                "speculative_num_steps": 3,
+                "speculative_eagle_topk": 1,
+                "speculative_num_draft_tokens": 4,
+                "speculative_draft_model_path": self.model_id,
+                "tp_size": self.tensor_parallel_size,
+                "trust_remote_code": True,
+                "mem_fraction_static": 0.7,
+                "context_length": 1024,
+            }
+            # Nemotron hybrid (Mamba+attention) requires extra_buffer scheduler
+            # strategy and SGLANG_ENABLE_SPEC_V2 for radix cache compatibility
+            # with speculative decoding
+            if "nemotron" in self.base_model.lower():
+                eagle_kwargs["mamba_scheduler_strategy"] = "extra_buffer"
+                os.environ["SGLANG_ENABLE_SPEC_V2"] = "1"
+            llm = sgl.Engine(**eagle_kwargs)
         elif self.model_id in (
             "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-FP8",
             "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-NVFP4",
diff --git a/tests/examples/llm_ptq/test_deploy.py b/tests/examples/llm_ptq/test_deploy.py
@@ -640,7 +640,9 @@ def test_medusa(command):
         *ModelDeployerList(
             base_model="nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16",
             model_id="nvidia/EAGLE3-NVIDIA-Nemotron-3-Nano-30B-A3B-BF16",
-            backend=("trtllm", "vllm", "sglang"),
+            # SGLang excluded: Nemotron hybrid (Mamba+attention) doesn't support
+            # speculative decoding in SGLang (NVBugs 6130106)
+            backend=("trtllm", "vllm"),
             eagle3_one_model=False,
             tensor_parallel_size=8,
             mini_sm=89,