[None][test] add DeepSeek V4 Flash AutoDeploy smoke

bmarimuthu-nv · bmarimuthu-nv · commit 8f71a83ff045 · 2026-06-09T21:19:59.000-07:00
Signed-off-by: Balamurugan Marimuthu <246387390+bmarimuthu-nv@users.noreply.github.com>
diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/fused_moe/mxfp4_moe.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/fused_moe/mxfp4_moe.py
@@ -49,6 +49,7 @@
 )
 _E8M0_EXPONENT_BIAS = 127
 _MXFP4_BLOCK_SIZE = 32
+_TORCH_MXFP4_ROUTED_MOE_TOKEN_CHUNK = 16  # token-slice step over x.shape[0] in _run_torch_mxfp4_from_routing_slots
 
 # Prepared (swizzled) triton_kernels tensors; typed as ``object`` so the module
 # imports without ``triton_kernels``.
@@ -319,18 +320,23 @@ def _run_torch_mxfp4_from_routing_slots(
 
     x_for_bmm = x.unsqueeze(-1)
     for route_idx in range(local_expert_idx.shape[1]):
-        expert_idx = local_expert_idx[:, route_idx]
-        gate_up = torch.bmm(gate_up_weight.index_select(0, expert_idx), x_for_bmm).squeeze(-1)
-        gate_up = gate_up + gate_up_bias.index_select(0, expert_idx).to(torch.float32)
-        inter = _apply_swiglu(gate_up, alpha, limit, gate_up_order, swiglu_mode)
-        expert_output = torch.bmm(
-            down_weight.index_select(0, expert_idx), inter.unsqueeze(-1)
-        ).squeeze(-1)
-        expert_output = expert_output + down_bias.index_select(0, expert_idx).to(torch.float32)
-        route_scale = routing_weights[:, route_idx, None] * valid_route[:, route_idx, None].to(
-            torch.float32
-        )
-        output = output + expert_output * route_scale
+        for start in range(0, x.shape[0], _TORCH_MXFP4_ROUTED_MOE_TOKEN_CHUNK):
+            end = min(start + _TORCH_MXFP4_ROUTED_MOE_TOKEN_CHUNK, x.shape[0])
+            token_slice = slice(start, end)
+            expert_idx = local_expert_idx[token_slice, route_idx]
+            gate_up = torch.bmm(
+                gate_up_weight.index_select(0, expert_idx), x_for_bmm[token_slice]
+            ).squeeze(-1)
+            gate_up = gate_up + gate_up_bias.index_select(0, expert_idx).to(torch.float32)
+            inter = _apply_swiglu(gate_up, alpha, limit, gate_up_order, swiglu_mode)
+            expert_output = torch.bmm(
+                down_weight.index_select(0, expert_idx), inter.unsqueeze(-1)
+            ).squeeze(-1)
+            expert_output = expert_output + down_bias.index_select(0, expert_idx).to(torch.float32)
+            route_scale = routing_weights[token_slice, route_idx, None] * valid_route[
+                token_slice, route_idx, None
+            ].to(torch.float32)
+            output[token_slice] = output[token_slice] + expert_output * route_scale
 
     return output.reshape(*leading_shape, hidden_size).to(output_dtype)
 
diff --git a/tests/integration/defs/accuracy/configs/deepseek_v4_flash_4gpu_smoke.yaml b/tests/integration/defs/accuracy/configs/deepseek_v4_flash_4gpu_smoke.yaml
@@ -0,0 +1,22 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+transforms:  # listed last in TestDeepSeekV4Flash.YAML_EXTRA (test_llm_api_autodeploy.py)
+  apply_sharding_hints:
+    dist_mapping:
+      tp: 4  # equals WORLD_SIZE = 4 in that test; presumably tensor-parallel degree - TODO confirm
+      moe_ep: 4  # presumably MoE expert-parallel degree - TODO confirm
+      moe_tp: 1  # presumably MoE tensor-parallel degree - TODO confirm
+      moe_cluster: 1  # NOTE(review): meaning not visible in this change - confirm
diff --git a/tests/integration/defs/accuracy/test_llm_api_autodeploy.py b/tests/integration/defs/accuracy/test_llm_api_autodeploy.py
@@ -24,6 +24,7 @@
 from test_common.llm_data import hf_id_to_local_model_dir
 
 from tensorrt_llm._torch.auto_deploy import LLM as AutoDeployLLM
+from tensorrt_llm.evaluate import GSM8K as GSM8KEvaluator
 from tensorrt_llm.llmapi import Eagle3DecodingConfig
 from tensorrt_llm.quantization import QuantAlgo
 from tensorrt_llm.sampling_params import SamplingParams
@@ -35,6 +36,7 @@
                    'model_registry' / 'configs')
 _AD_MODEL_REGISTRY_DIR = Path(
     get_llm_root()) / 'examples' / 'auto_deploy' / 'model_registry'
+_ACCURACY_CONFIGS_DIR = Path(__file__).resolve().parent / "configs"  # "configs" directory next to this test module
 
 
 def _load_ad_config(config_name):
@@ -1567,6 +1569,59 @@ def test_autodeploy_from_registry(self, model_name, config_overrides, tasks,
                         raise type(e)(f"[{task_cls.__name__}] {e}") from None
 
 
+class TestDeepSeekV4Flash(LlmapiAccuracyTestHarness):  # GSM8K smoke test for DeepSeek V4 Flash run through AutoDeployLLM
+    MODEL_NAME = "deepseek-ai/DeepSeek-V4-Flash"  # passed as both model= and tokenizer=
+    WORLD_SIZE = 4  # passed as world_size= and used as the GPU-count skip threshold
+    YAML_EXTRA = [  # config files handed to AutoDeployLLM via yaml_extra=, in this order
+        str(_AD_CONFIGS_DIR / "dashboard_default.yaml"),
+        str(_AD_CONFIGS_DIR / "world_size_4.yaml"),
+        str(_AD_CONFIGS_DIR / "deepseek_v4_flash.yaml"),
+        str(_ACCURACY_CONFIGS_DIR / "deepseek_v4_flash_4gpu_smoke.yaml"),
+    ]
+    GSM8K_NUM_SAMPLES = 15  # num_samples= for the GSM8K evaluator
+    GSM8K_NUM_FEWSHOT = 0  # applied to every task via set_config("num_fewshot", ...)
+    GSM8K_MAX_INPUT_LEN = 1024  # added to GSM8K_MAX_OUTPUT_LEN to form max_seq_len
+    GSM8K_MAX_OUTPUT_LEN = 128  # also used as max_tokens in the sampling params
+    GSM8K_MIN_ACCURACY = 40.0  # lower bound asserted on the score; presumably a percentage - TODO confirm
+
+    def get_default_sampling_params(self):  # one sequence, no beam search, output capped at GSM8K_MAX_OUTPUT_LEN
+        return SamplingParams(end_id=None,
+                              pad_id=None,
+                              max_tokens=self.GSM8K_MAX_OUTPUT_LEN,
+                              n=1,
+                              use_beam_search=False)
+
+    @pytest.mark.skip_less_device(4)  # NOTE(review): literal 4 duplicates WORLD_SIZE
+    @pytest.mark.skip_less_device_memory(80000)
+    def test_gsm8k_smoke(self):  # builds the LLM, evaluates GSM8K, asserts the minimum score
+        if get_device_count() < self.WORLD_SIZE:  # explicit guard in addition to the skip_less_device marker
+            pytest.skip(
+                f"DeepSeek V4 Flash smoke requires {self.WORLD_SIZE} GPUs")
+
+        with AutoDeployLLM(model=self.MODEL_NAME,
+                           tokenizer=self.MODEL_NAME,
+                           world_size=self.WORLD_SIZE,
+                           yaml_extra=self.YAML_EXTRA,
+                           max_seq_len=self.GSM8K_MAX_INPUT_LEN +
+                           self.GSM8K_MAX_OUTPUT_LEN,
+                           trust_remote_code=True) as llm:
+            task = GSM8KEvaluator(dataset_path=GSM8K.DATASET_DIR,
+                                  num_samples=self.GSM8K_NUM_SAMPLES,
+                                  random_seed=0)  # fixed seed; presumably keeps the sampled subset reproducible - TODO confirm
+            for task_obj in task.task_dict.values():  # override the few-shot count on each underlying task
+                task_obj.set_config("num_fewshot", self.GSM8K_NUM_FEWSHOT)
+            score = task.evaluate(
+                llm,
+                sampling_params=self.get_default_sampling_params(),
+                scores_filter="exact_match,flexible-extract",
+            )
+
+        assert score >= self.GSM8K_MIN_ACCURACY, (  # checked after the LLM context manager has exited
+            f"DeepSeek V4 Flash GSM8K smoke accuracy {score:.2f} is below "
+            f"{self.GSM8K_MIN_ACCURACY:.2f} on {self.GSM8K_NUM_SAMPLES} samples"
+        )
+
+
 # =============================================================================
 # IR Sharding Path Tests
 # =============================================================================
diff --git a/tests/integration/test_lists/test-db/l0_dgx_h100.yml b/tests/integration/test_lists/test-db/l0_dgx_h100.yml
@@ -378,6 +378,7 @@ l0_dgx_h100:
   - accuracy/test_llm_api_autodeploy.py::TestQwen3_5_397B_MoE::test_bf16_small[4]
   - accuracy/test_llm_api_autodeploy.py::TestGemma4MoE::test_bf16
   - accuracy/test_llm_api_autodeploy.py::TestMiniMaxM2::test_finegrained_fp8
+  - accuracy/test_llm_api_autodeploy.py::TestDeepSeekV4Flash::test_gsm8k_smoke
 # ------------- AutoDeploy Backend Stages L1 / Nightly only ---------------
 - condition:
     ranges:
diff --git a/tests/unittest/auto_deploy/singlegpu/custom_ops/moe/test_torch_mxfp4_moe.py b/tests/unittest/auto_deploy/singlegpu/custom_ops/moe/test_torch_mxfp4_moe.py
@@ -592,6 +592,61 @@ def test_torch_mxfp4_moe_from_routing_matches_deepseek_layout_reference() -> Non
     torch.testing.assert_close(actual, expected, rtol=1e-5, atol=1e-5)
 
 
+def test_torch_mxfp4_moe_from_routing_matches_reference_across_token_chunks() -> None:  # op output vs dense reference with more tokens than one chunk
+    num_experts = 3
+    hidden_size = 32
+    intermediate_size = 32
+    alpha = 1.0
+    limit = 0.75
+    num_tokens = 20  # larger than the 16-token chunk constant in mxfp4_moe.py; intended to exercise more than one chunk
+    x = torch.linspace(-0.3, 0.35, steps=num_tokens * hidden_size, dtype=torch.float32).reshape(  # deterministic input, no RNG
+        num_tokens, hidden_size
+    )
+    token_ids = torch.arange(num_tokens, dtype=torch.int64)
+    selected_experts = torch.stack(  # two routes per token: experts (i % 3) and ((i + 1) % 3)
+        (token_ids % num_experts, (token_ids + 1) % num_experts),
+        dim=1,
+    )
+    routing_weights = torch.stack(  # per-route weights vary linearly across tokens
+        (
+            torch.linspace(0.15, 0.45, steps=num_tokens),
+            torch.linspace(0.4, 0.1, steps=num_tokens),
+        ),
+        dim=1,
+    )
+    packed, w1_weight, w2_weight, w3_weight = _deepseek_packed_params_from_layout(num_experts)
+    gate_up_bias = torch.zeros((num_experts, 2 * intermediate_size), dtype=torch.float32)  # all-zero bias
+    down_bias = torch.zeros((num_experts, hidden_size), dtype=torch.float32)  # all-zero bias
+
+    actual = torch.ops.auto_deploy.torch_mxfp4_moe_from_routing(
+        x,
+        selected_experts,
+        routing_weights,
+        packed.gate_up_blocks,
+        gate_up_bias,
+        packed.gate_up_scales,
+        alpha,
+        limit,
+        packed.down_blocks,
+        down_bias,
+        packed.down_scales,
+        "up_gate",  # presumably the gate/up ordering argument - TODO confirm against the op signature
+        "deepseek",  # presumably the SwiGLU mode argument - TODO confirm against the op signature
+    )
+    expected = _dense_deepseek_routing_reference(  # reference built from the unpacked w1/w2/w3 weights
+        x,
+        selected_experts,
+        routing_weights,
+        w1_weight,
+        w2_weight,
+        w3_weight,
+        alpha=alpha,
+        limit=limit,
+    )
+
+    torch.testing.assert_close(actual, expected, rtol=1e-5, atol=1e-5)
+
+
 def test_torch_mxfp4_moe_from_routing_ep_partitions_deepseek_layout_experts() -> None:
     num_experts = 5
     ep_size = 3