Revert "[None][test] Add support for nemotron_3_ultra_550b_nvfp4 model in performance tests and configurations" (#15310)

tburt-nv · web-flow · commit b03b78f300ad · 2026-06-12T12:48:45.000-04:00
diff --git a/tests/integration/defs/perf/_model_paths.py b/tests/integration/defs/perf/_model_paths.py
@@ -123,9 +123,16 @@
     "glm_5_nvfp4": "GLM-5-NVFP4",
 }
 
-# Models loaded directly by HuggingFace repo id (downloaded at runtime, not synced locally).
+# Hugging Face Hub repo ids, keyed by the "<model>_hf" label.
 HF_MODEL_PATH = {
-    "nemotron_3_ultra_550b_nvfp4": "nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-NVFP4",
+    "llama_v3.1_8b_hf": "meta-llama/Llama-3.1-8B",
+    "llama_v3.1_8b_instruct_hf": "nvidia/Llama-3.1-8B-Instruct-FP8",
+    "llama_v3.1_nemotron_nano_8b_hf": "nvidia/Llama-3.1-Nemotron-Nano-8B-v1",
+    "llama_v3.1_nemotron_nano_8b_fp8_hf": "nvidia/Llama-3.1-Nemotron-Nano-8B-v1-FP8",
+    "llama_v3.3_nemotron_super_49b_hf": "nvidia/Llama-3_3-Nemotron-Super-49B-v1",
+    "llama_v3.3_nemotron_super_49b_fp8_hf": "nvidia/Llama-3_3-Nemotron-Super-49B-v1-FP8",
+    "llama_v3.1_nemotron_ultra_253b_fp8_hf": "nvidia/Llama-3_1-Nemotron-Ultra-253B-v1-FP8",
+    "phi_4_mini_instruct_hf": "microsoft/Phi-4-mini-instruct",
 }
 
 LORA_MODEL_PATH = {
diff --git a/tests/integration/defs/perf/pytorch_model_config.py b/tests/integration/defs/perf/pytorch_model_config.py
@@ -586,28 +586,6 @@ def get_model_yaml_config(model_label: str,
                 },
             }
         },
-        # Nemotron-3-Ultra-550B-NVFP4 throughput variant, aligned with curated yaml (served from HF).
-        {
-            'patterns': ['nemotron_3_ultra_550b_nvfp4-serve-pytorch-'],
-            'config': {
-                'enable_attention_dp': True,
-                'stream_interval': 10,
-                'num_postprocess_workers': 4,
-                'moe_config': {
-                    'backend': 'CUTEDSL',
-                },
-                'cuda_graph_config': {
-                    'enable_padding': True,
-                    'max_batch_size': 256,
-                },
-                'kv_cache_config': {
-                    'enable_block_reuse': False,
-                    'mamba_ssm_cache_dtype': 'float16',
-                    'mamba_ssm_stochastic_rounding': True,
-                    'mamba_ssm_philox_rounds': 5,
-                },
-            }
-        },
     ]
 
     # Apply pattern-based configurations on top of base config
diff --git a/tests/integration/defs/perf/test_perf.py b/tests/integration/defs/perf/test_perf.py
@@ -49,7 +49,6 @@
 NEMOTRON_SUPER_MODELS = {
     "nemotron_3_super_120b_nvfp4",
     "nemotron_3_super_120b_nvfp4_mtp",
-    "nemotron_3_ultra_550b_nvfp4",
     "nemotron_3_nano_omni_nvfp4",
     "nemotron_3_nano_omni_nvfp4_image",
 }
@@ -62,7 +61,6 @@
     "kimi_k2_nvfp4",
     "nemotron_3_super_120b_nvfp4",
     "nemotron_3_super_120b_nvfp4_mtp",
-    "nemotron_3_ultra_550b_nvfp4",
     "glm_5_fp8",
     "nemotron_3_nano_omni_nvfp4",
     "nemotron_3_nano_omni_nvfp4_image",
@@ -108,12 +106,13 @@
 
 
 def get_model_dir(model_name: str):
-    # HF models use the repo id verbatim (downloaded at runtime, no LLM_MODELS_ROOT prefix).
-    if model_name in HF_MODEL_PATH.keys():
-        return HF_MODEL_PATH[model_name]
+    model_dir = ""
     if model_name in MODEL_PATH_DICT.keys():
-        return os.path.join(llm_models_root(), MODEL_PATH_DICT[model_name])
-    return ""
+        model_dir = os.path.join(llm_models_root(), MODEL_PATH_DICT[model_name])
+    elif model_name in HF_MODEL_PATH.keys():
+        model_dir = os.path.join(llm_models_root(),
+                                 MODEL_PATH_DICT[model_name.split('_hf')[0]])
+    return model_dir
 
 
 def get_dataset_path():
@@ -1039,13 +1038,14 @@ def get_trtllm_bench_build_command(self, engine_dir) -> list:
         model_dir = self.get_trtllm_bench_model()
         if model_dir == "":
             pytest.skip("Model Name is not supported by trtllm-bench")
-        # Legacy "<name>_hf" label; weights load from --model_path.
         model_name = self._config.model_name
         if not model_name.endswith("_hf"):
             model_name = model_name + "_hf"
+        hf_model_name = HF_MODEL_PATH.get(model_name, "")
         build_cmd = [
-            self._build_script, "--log_level=info", f"--workspace={engine_dir}",
-            f"--model={model_name}", f"--model_path={model_dir}", "build",
+            self._build_script, f"--log_level=info",
+            f"--workspace={engine_dir}", f"--model={hf_model_name}",
+            f"--model_path={model_dir}", "build",
             f"--tp_size={self._config.tp_size}",
             f"--pp_size={self._config.pp_size}"
         ]
@@ -1170,11 +1170,11 @@ def get_trtllm_bench_command(self, engine_dir):
         model_name = self._config.model_name
         dataset_path = os.path.join(engine_dir, "synthetic_data.json")
         report_path = os.path.join(engine_dir, "report.json")
-        # Legacy "<name>_hf" label; weights load from --model_path.
         if not model_name.endswith("_hf"):
             model_name = model_name + "_hf"
+        hf_model_name = HF_MODEL_PATH.get(model_name, "")
         tp_pp_str = f"tp_{self._config.tp_size}_pp_{self._config.pp_size}"
-        engine_dir = os.path.join(engine_dir, tp_pp_str)
+        engine_dir = os.path.join(engine_dir, hf_model_name, tp_pp_str)
         benchmark_cmd = [
             self._benchmark_script,
             f"--model={model_name}",
diff --git a/tests/integration/test_lists/qa/llm_perf_core.yml b/tests/integration/test_lists/qa/llm_perf_core.yml
@@ -11,7 +11,7 @@ llm_perf_core:
 # 6: B200, GB200, B300, GB300 test cases
 # 7: B200, B300 test cases
 # 8: H100, H20, H200, B200, B300, RTX6000D, RTX6000-Server test cases
-# 9: H20, H200, B200, B300, RTX6000-Server test cases
+# 9: H20, H200, B200, B300, RTX6000D, RTX6000-Server test cases
 # 10: RTX-6000D, RTX-6000 Server test cases
 # ===============================================================================
 
@@ -52,8 +52,24 @@ llm_perf_core:
   tests:
   #nemotron_nano_12b_v2
   - perf/test_perf.py::test_perf[nemotron_nano_12b_v2-bench-pytorch-bfloat16-maxbs:1-input_output_len:128,128-reqs:10-con:1] #min_latency
+  - perf/test_perf.py::test_perf[nemotron_nano_12b_v2-bench-pytorch-bfloat16-input_output_len:500,2000-con:250] #max_throughput
+  - perf/test_perf.py::test_perf[nemotron_nano_12b_v2-bench-pytorch-bfloat16-input_output_len:128,128]  #qwen3.5_9b (dense BF16 19G, 1-GPU)
   #qwen3.5_27b (dense BF16 52G, 2-GPU)
+  - perf/test_perf.py::test_perf[qwen3.5_27b-bench-pytorch-bfloat16-input_output_len:128,128-tp:2-gpus:2]
+  - perf/test_perf.py::test_perf[qwen3.5_27b-bench-pytorch-bfloat16-input_output_len:500,2000-tp:2-gpus:2]
+  - perf/test_perf.py::test_perf[qwen3.5_27b-bench-pytorch-bfloat16-input_output_len:2000,500-tp:2-gpus:2]
+  - perf/test_perf.py::test_perf[qwen3.5_27b-bench-pytorch-bfloat16-input_output_len:1000,1000-tp:2-gpus:2]
+  - perf/test_perf.py::test_perf[qwen3.5_27b-bench-pytorch-bfloat16-input_output_len:1000,2000-tp:2-gpus:2]
   - perf/test_perf.py::test_perf[qwen3.5_27b-bench-pytorch-bfloat16-maxbs:1-input_output_len:1000,1000-reqs:10-con:1-tp:2-gpus:2] #min_latency
+  - perf/test_perf.py::test_perf[qwen3.5_27b-bench-pytorch-bfloat16-input_output_len:1000,1000-con:250-tp:2-gpus:2] #max_throughput
+  #llama_v3.3_nemotron_super_49b (nemotron-nas BF16 94G, 2-GPU)
+  - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-input_output_len:128,128-tp:2-gpus:2]
+  - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-input_output_len:500,2000-tp:2-gpus:2]
+  - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-input_output_len:2000,500-tp:2-gpus:2]
+  - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-input_output_len:1000,1000-tp:2-gpus:2]
+  - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-input_output_len:1000,2000-tp:2-gpus:2]
+  - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-maxbs:1-input_output_len:1000,1000-reqs:10-con:1-tp:2-gpus:2] #min_latency
+  - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-input_output_len:1000,1000-con:250-tp:2-gpus:2] #max_throughput
   - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct-bench-pytorch-streaming-bfloat16-input_output_len:128,128-gpus:4]
   - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct-bench-pytorch-bfloat16-input_output_len:128,128-gpus:4]
   - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-streaming-float8-input_output_len:2000,200-gpus:8]
@@ -83,23 +99,6 @@ llm_perf_core:
   - perf/test_perf.py::test_perf[gpt_oss_20b_fp4-bench-pytorch-float4-maxbs:512-maxnt:8192-input_output_len:2000,200-con:64]
   - perf/test_perf.py::test_perf[gpt_oss_20b_fp4-bench-pytorch-float4-maxbs:512-maxnt:8192-input_output_len:128,128]
   - perf/test_perf.py::test_perf[gpt_oss_20b_fp4-bench-pytorch-float4-maxbs:512-maxnt:8192-input_output_len:2000,200-con:256]
-  - perf/test_perf.py::test_perf[nemotron_nano_12b_v2-bench-pytorch-bfloat16-input_output_len:500,2000-con:250] #max_throughput
-  - perf/test_perf.py::test_perf[nemotron_nano_12b_v2-bench-pytorch-bfloat16-input_output_len:128,128]
-  #qwen3.5_27b (dense BF16 52G, 2-GPU)
-  - perf/test_perf.py::test_perf[qwen3.5_27b-bench-pytorch-bfloat16-input_output_len:128,128-tp:2-gpus:2]
-  - perf/test_perf.py::test_perf[qwen3.5_27b-bench-pytorch-bfloat16-input_output_len:500,2000-tp:2-gpus:2]
-  - perf/test_perf.py::test_perf[qwen3.5_27b-bench-pytorch-bfloat16-input_output_len:2000,500-tp:2-gpus:2]
-  - perf/test_perf.py::test_perf[qwen3.5_27b-bench-pytorch-bfloat16-input_output_len:1000,1000-tp:2-gpus:2]
-  - perf/test_perf.py::test_perf[qwen3.5_27b-bench-pytorch-bfloat16-input_output_len:1000,2000-tp:2-gpus:2]
-  - perf/test_perf.py::test_perf[qwen3.5_27b-bench-pytorch-bfloat16-input_output_len:1000,1000-con:250-tp:2-gpus:2] #max_throughput
-  #llama_v3.3_nemotron_super_49b (nemotron-nas BF16 94G, 2-GPU)
-  - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-input_output_len:128,128-tp:2-gpus:2]
-  - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-input_output_len:500,2000-tp:2-gpus:2]
-  - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-input_output_len:2000,500-tp:2-gpus:2]
-  - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-input_output_len:1000,1000-tp:2-gpus:2]
-  - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-input_output_len:1000,2000-tp:2-gpus:2]
-  - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-maxbs:1-input_output_len:1000,1000-reqs:10-con:1-tp:2-gpus:2] #min_latency
-  - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-input_output_len:1000,1000-con:250-tp:2-gpus:2] #max_throughput
 
 
 # 4: H100, H20, H200, GB200, B200, B300, GB300, RTX6000-D, RTX6000-Server test cases
@@ -143,6 +142,7 @@ llm_perf_core:
   - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b_fp8-bench-pytorch-float8-maxbs:1-input_output_len:1000,1000-reqs:10-con:1-tp:2-gpus:2] #min_latency
   - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b_fp8-bench-pytorch-float8-input_output_len:1000,1000-con:250-tp:2-gpus:2] #max_throughput
     #qwen3.5_122b_a10b (MoE BF16 234G, 4-GPU)
+  - perf/test_perf.py::test_perf[qwen3.5_122b_a10b-bench-pytorch-bfloat16-input_output_len:128,128-ep:4-tp:4-gpus:4]
   - perf/test_perf.py::test_perf[qwen3.5_122b_a10b-bench-pytorch-bfloat16-input_output_len:500,2000-ep:4-tp:4-gpus:4]
   - perf/test_perf.py::test_perf[qwen3.5_122b_a10b-bench-pytorch-bfloat16-input_output_len:2000,500-ep:4-tp:4-gpus:4]
   - perf/test_perf.py::test_perf[qwen3.5_122b_a10b-bench-pytorch-bfloat16-input_output_len:1000,1000-ep:4-tp:4-gpus:4]
@@ -214,15 +214,9 @@ llm_perf_core:
   - perf/test_perf.py::test_perf[qwen3.5_397b_a17b_fp4-bench-pytorch-float4-maxbs:1-input_output_len:1000,1000-reqs:10-con:1-ep:4-tp:4-gpus:4] #min_latency
   - perf/test_perf.py::test_perf[qwen3.5_397b_a17b_fp4-bench-pytorch-float4-maxbs:512-input_output_len:1000,1000-con:512-ep:4-tp:4-gpus:4] #max_throughput
   #nemotron_3_super_120b_nvfp4 (Hybrid MoE+SSM+Attn FP4 76G, 4-GPU ep=4 tp=4, throughput config)
-  #these test config come from docs/source/deployment-guide/deployment-guide-for-nemotron-3-on-trtllm.md
   - perf/test_perf.py::test_perf[nemotron_3_super_120b_nvfp4-serve-pytorch-float4-maxbs:512-maxnt:2048-kv_frac:0.8-input_output_len:1024,1024-reqs:5-con:1-ep:4-tp:4-gpus:4] #min_latency
   - perf/test_perf.py::test_perf[nemotron_3_super_120b_nvfp4-serve-pytorch-float4-maxbs:512-maxnt:2048-kv_frac:0.8-input_output_len:1024,1024-reqs:160-con:32-ep:4-tp:4-gpus:4]
   - perf/test_perf.py::test_perf[nemotron_3_super_120b_nvfp4-serve-pytorch-float4-maxbs:512-maxnt:2048-kv_frac:0.8-input_output_len:1024,1024-reqs:640-con:128-ep:4-tp:4-gpus:4] #max_throughput
-  #nemotron_3_ultra_550b_nvfp4 (Hybrid MoE FP4 ~275G, 4-GPU ep=4 tp=4, throughput config, HF download)
-  #these test config come from docs/source/deployment-guide/deployment-guide-for-nemotron-3-on-trtllm.md
-  - perf/test_perf.py::test_perf[nemotron_3_ultra_550b_nvfp4-serve-pytorch-float4-maxbs:256-maxnt:2048-kv_frac:0.8-input_output_len:1024,1024-reqs:5-con:1-ep:4-tp:4-gpus:4] #min_latency
-  - perf/test_perf.py::test_perf[nemotron_3_ultra_550b_nvfp4-serve-pytorch-float4-maxbs:256-maxnt:2048-kv_frac:0.8-input_output_len:1024,1024-reqs:160-con:32-ep:4-tp:4-gpus:4]
-  - perf/test_perf.py::test_perf[nemotron_3_ultra_550b_nvfp4-serve-pytorch-float4-maxbs:256-maxnt:2048-kv_frac:0.8-input_output_len:1024,1024-reqs:640-con:128-ep:4-tp:4-gpus:4] #max_throughput
 
 
 # 7: B200, B300 test cases
@@ -290,6 +284,15 @@ llm_perf_core:
   - perf/test_perf.py::test_perf[minimax_m2.5_fp8-bench-pytorch-float8-input_output_len:1000,2000-ep:8-gpus:8]
   - perf/test_perf.py::test_perf[minimax_m2.5_fp8-bench-pytorch-float8-maxbs:1-input_output_len:1000,1000-reqs:10-con:1-ep:8-gpus:8] #min_latency
   - perf/test_perf.py::test_perf[minimax_m2.5_fp8-bench-pytorch-float8-maxbs:512-input_output_len:1000,1000-con:512-ep:8-gpus:8] #max_throughput
+# 9: H20, H200, B200, B300 test cases
+  #llama_v3.1_nemotron_ultra_253b (nemotron-nas BF16 474G, 8-GPU)
+  - perf/test_perf.py::test_perf[llama_v3.1_nemotron_ultra_253b-bench-pytorch-bfloat16-input_output_len:128,128-tp:8-gpus:8]
+  - perf/test_perf.py::test_perf[llama_v3.1_nemotron_ultra_253b-bench-pytorch-bfloat16-input_output_len:500,2000-tp:8-gpus:8]
+  - perf/test_perf.py::test_perf[llama_v3.1_nemotron_ultra_253b-bench-pytorch-bfloat16-input_output_len:2000,500-tp:8-gpus:8]
+  - perf/test_perf.py::test_perf[llama_v3.1_nemotron_ultra_253b-bench-pytorch-bfloat16-input_output_len:1000,1000-tp:8-gpus:8]
+  - perf/test_perf.py::test_perf[llama_v3.1_nemotron_ultra_253b-bench-pytorch-bfloat16-input_output_len:1000,2000-tp:8-gpus:8]
+  - perf/test_perf.py::test_perf[llama_v3.1_nemotron_ultra_253b-bench-pytorch-bfloat16-maxbs:1-input_output_len:1000,1000-reqs:10-con:1-tp:8-gpus:8] #min_latency
+  - perf/test_perf.py::test_perf[llama_v3.1_nemotron_ultra_253b-bench-pytorch-bfloat16-input_output_len:1000,1000-con:250-tp:8-gpus:8] #max_throughput
   #llama_v3.1_nemotron_ultra_253b_fp8 (nemotron-nas FP8 241G, 8-GPU)
   - perf/test_perf.py::test_perf[llama_v3.1_nemotron_ultra_253b_fp8-bench-pytorch-float8-input_output_len:128,128-tp:8-gpus:8]
   - perf/test_perf.py::test_perf[llama_v3.1_nemotron_ultra_253b_fp8-bench-pytorch-float8-input_output_len:500,2000-tp:8-gpus:8]
@@ -306,10 +309,8 @@ llm_perf_core:
   - perf/test_perf.py::test_perf[qwen3.5_397b_a17b_fp8-bench-pytorch-float8-input_output_len:1000,2000-ep:8-tp:8-gpus:8]
   - perf/test_perf.py::test_perf[qwen3.5_397b_a17b_fp8-bench-pytorch-float8-maxbs:1-input_output_len:1000,1000-reqs:10-con:1-ep:8-tp:8-gpus:8] #min_latency
   - perf/test_perf.py::test_perf[qwen3.5_397b_a17b_fp8-bench-pytorch-float8-maxbs:512-input_output_len:1000,1000-con:512-ep:8-tp:8-gpus:8] #max_throughput
-  - perf/test_perf.py::test_perf[qwen3.5_122b_a10b-bench-pytorch-bfloat16-input_output_len:128,128-ep:4-tp:4-gpus:4]
-
 
-# 9: H20, H200, B200, B300, RTX6000-Server test cases
+# 9: H20, H200, B200, B300, RTX6000D, RTX6000-Server test cases
 - condition:
     ranges:
       system_gpu_count:
@@ -335,14 +336,6 @@ llm_perf_core:
   - perf/test_perf.py::test_perf[qwen3.5_397b_a17b_fp4-bench-pytorch-float4-input_output_len:1000,2000-ep:8-tp:8-gpus:8]
   - perf/test_perf.py::test_perf[qwen3.5_397b_a17b_fp4-bench-pytorch-float4-maxbs:1-input_output_len:1000,1000-reqs:10-con:1-ep:8-tp:8-gpus:8] #min_latency
   - perf/test_perf.py::test_perf[qwen3.5_397b_a17b_fp4-bench-pytorch-float4-maxbs:512-input_output_len:1000,1000-con:512-ep:8-tp:8-gpus:8] #max_throughput
-   #llama_v3.1_nemotron_ultra_253b (nemotron-nas BF16 474G, 8-GPU)
-  - perf/test_perf.py::test_perf[llama_v3.1_nemotron_ultra_253b-bench-pytorch-bfloat16-input_output_len:128,128-tp:8-gpus:8]
-  - perf/test_perf.py::test_perf[llama_v3.1_nemotron_ultra_253b-bench-pytorch-bfloat16-input_output_len:500,2000-tp:8-gpus:8]
-  - perf/test_perf.py::test_perf[llama_v3.1_nemotron_ultra_253b-bench-pytorch-bfloat16-input_output_len:2000,500-tp:8-gpus:8]
-  - perf/test_perf.py::test_perf[llama_v3.1_nemotron_ultra_253b-bench-pytorch-bfloat16-input_output_len:1000,1000-tp:8-gpus:8]
-  - perf/test_perf.py::test_perf[llama_v3.1_nemotron_ultra_253b-bench-pytorch-bfloat16-input_output_len:1000,2000-tp:8-gpus:8]
-  - perf/test_perf.py::test_perf[llama_v3.1_nemotron_ultra_253b-bench-pytorch-bfloat16-maxbs:1-input_output_len:1000,1000-reqs:10-con:1-tp:8-gpus:8] #min_latency
-  - perf/test_perf.py::test_perf[llama_v3.1_nemotron_ultra_253b-bench-pytorch-bfloat16-input_output_len:1000,1000-con:250-tp:8-gpus:8] #max_throughput
 
 # 10: RTX-6000D, RTX-6000 Server test cases
 - condition:
diff --git a/tests/integration/test_lists/test-db/l0_perf.yml b/tests/integration/test_lists/test-db/l0_perf.yml
@@ -1,5 +1,20 @@
 version: 0.0.1
 l0_perf:
+  - condition:
+      ranges:
+        system_gpu_count:
+          gte: 1
+          lte: 1
+      wildcards:
+        gpu:
+          - '*h100*'
+        linux_distribution_name: ubuntu*
+      terms:
+        stage: pre_merge
+        backend: tensorrt
+    tests:
+      - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-float16-input_output_len:128,128-reqs:8192]
+
   - condition:
       ranges:
         system_gpu_count: