Skip to content

Commit b03b78f

Browse files
authored
Revert "[None][test] Add support for nemotron_3_ultra_550b_nvfp4 model in performance tests and configurations" (#15310)
1 parent db7161b commit b03b78f

5 files changed

Lines changed: 64 additions & 71 deletions

File tree

tests/integration/defs/perf/_model_paths.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -123,9 +123,16 @@
123123
"glm_5_nvfp4": "GLM-5-NVFP4",
124124
}
125125

126-
# Models loaded directly by HuggingFace repo id (downloaded at runtime, not synced locally).
126+
# HuggingFace model paths (repo ids), keyed by model name.
127127
HF_MODEL_PATH = {
128-
"nemotron_3_ultra_550b_nvfp4": "nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-NVFP4",
128+
"llama_v3.1_8b_hf": "meta-llama/Llama-3.1-8B",
129+
"llama_v3.1_8b_instruct_hf": "nvidia/Llama-3.1-8B-Instruct-FP8",
130+
"llama_v3.1_nemotron_nano_8b_hf": "nvidia/Llama-3.1-Nemotron-Nano-8B-v1",
131+
"llama_v3.1_nemotron_nano_8b_fp8_hf": "nvidia/Llama-3.1-Nemotron-Nano-8B-v1-FP8",
132+
"llama_v3.3_nemotron_super_49b_hf": "nvidia/Llama-3_3-Nemotron-Super-49B-v1",
133+
"llama_v3.3_nemotron_super_49b_fp8_hf": "nvidia/Llama-3_3-Nemotron-Super-49B-v1-FP8",
134+
"llama_v3.1_nemotron_ultra_253b_fp8_hf": "nvidia/Llama-3_1-Nemotron-Ultra-253B-v1-FP8",
135+
"phi_4_mini_instruct_hf": "microsoft/Phi-4-mini-instruct",
129136
}
130137

131138
LORA_MODEL_PATH = {

tests/integration/defs/perf/pytorch_model_config.py

Lines changed: 0 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -586,28 +586,6 @@ def get_model_yaml_config(model_label: str,
586586
},
587587
}
588588
},
589-
# Nemotron-3-Ultra-550B-NVFP4 throughput variant, aligned with curated yaml (served from HF).
590-
{
591-
'patterns': ['nemotron_3_ultra_550b_nvfp4-serve-pytorch-'],
592-
'config': {
593-
'enable_attention_dp': True,
594-
'stream_interval': 10,
595-
'num_postprocess_workers': 4,
596-
'moe_config': {
597-
'backend': 'CUTEDSL',
598-
},
599-
'cuda_graph_config': {
600-
'enable_padding': True,
601-
'max_batch_size': 256,
602-
},
603-
'kv_cache_config': {
604-
'enable_block_reuse': False,
605-
'mamba_ssm_cache_dtype': 'float16',
606-
'mamba_ssm_stochastic_rounding': True,
607-
'mamba_ssm_philox_rounds': 5,
608-
},
609-
}
610-
},
611589
]
612590

613591
# Apply pattern-based configurations on top of base config

tests/integration/defs/perf/test_perf.py

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,6 @@
4949
NEMOTRON_SUPER_MODELS = {
5050
"nemotron_3_super_120b_nvfp4",
5151
"nemotron_3_super_120b_nvfp4_mtp",
52-
"nemotron_3_ultra_550b_nvfp4",
5352
"nemotron_3_nano_omni_nvfp4",
5453
"nemotron_3_nano_omni_nvfp4_image",
5554
}
@@ -62,7 +61,6 @@
6261
"kimi_k2_nvfp4",
6362
"nemotron_3_super_120b_nvfp4",
6463
"nemotron_3_super_120b_nvfp4_mtp",
65-
"nemotron_3_ultra_550b_nvfp4",
6664
"glm_5_fp8",
6765
"nemotron_3_nano_omni_nvfp4",
6866
"nemotron_3_nano_omni_nvfp4_image",
@@ -108,12 +106,13 @@
108106

109107

110108
def get_model_dir(model_name: str):
111-
# HF models use the repo id verbatim (downloaded at runtime, no LLM_MODELS_ROOT prefix).
112-
if model_name in HF_MODEL_PATH.keys():
113-
return HF_MODEL_PATH[model_name]
109+
model_dir = ""
114110
if model_name in MODEL_PATH_DICT.keys():
115-
return os.path.join(llm_models_root(), MODEL_PATH_DICT[model_name])
116-
return ""
111+
model_dir = os.path.join(llm_models_root(), MODEL_PATH_DICT[model_name])
112+
elif model_name in HF_MODEL_PATH.keys():
113+
model_dir = os.path.join(llm_models_root(),
114+
MODEL_PATH_DICT[model_name.split('_hf')[0]])
115+
return model_dir
117116

118117

119118
def get_dataset_path():
@@ -1039,13 +1038,14 @@ def get_trtllm_bench_build_command(self, engine_dir) -> list:
10391038
model_dir = self.get_trtllm_bench_model()
10401039
if model_dir == "":
10411040
pytest.skip("Model Name is not supported by trtllm-bench")
1042-
# Legacy "<name>_hf" label; weights load from --model_path.
10431041
model_name = self._config.model_name
10441042
if not model_name.endswith("_hf"):
10451043
model_name = model_name + "_hf"
1044+
hf_model_name = HF_MODEL_PATH.get(model_name, "")
10461045
build_cmd = [
1047-
self._build_script, "--log_level=info", f"--workspace={engine_dir}",
1048-
f"--model={model_name}", f"--model_path={model_dir}", "build",
1046+
self._build_script, f"--log_level=info",
1047+
f"--workspace={engine_dir}", f"--model={hf_model_name}",
1048+
f"--model_path={model_dir}", "build",
10491049
f"--tp_size={self._config.tp_size}",
10501050
f"--pp_size={self._config.pp_size}"
10511051
]
@@ -1170,11 +1170,11 @@ def get_trtllm_bench_command(self, engine_dir):
11701170
model_name = self._config.model_name
11711171
dataset_path = os.path.join(engine_dir, "synthetic_data.json")
11721172
report_path = os.path.join(engine_dir, "report.json")
1173-
# Legacy "<name>_hf" label; weights load from --model_path.
11741173
if not model_name.endswith("_hf"):
11751174
model_name = model_name + "_hf"
1175+
hf_model_name = HF_MODEL_PATH.get(model_name, "")
11761176
tp_pp_str = f"tp_{self._config.tp_size}_pp_{self._config.pp_size}"
1177-
engine_dir = os.path.join(engine_dir, tp_pp_str)
1177+
engine_dir = os.path.join(engine_dir, hf_model_name, tp_pp_str)
11781178
benchmark_cmd = [
11791179
self._benchmark_script,
11801180
f"--model={model_name}",

tests/integration/test_lists/qa/llm_perf_core.yml

Lines changed: 28 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ llm_perf_core:
1111
# 6: B200, GB200, B300, GB300 test cases
1212
# 7: B200, B300 test cases
1313
# 8: H100, H20, H200, B200, B300, RTX6000D, RTX6000-Server test cases
14-
# 9: H20, H200, B200, B300, RTX6000-Server test cases
14+
# 9: H20, H200, B200, B300, RTX6000D, RTX6000-Server test cases
1515
# 10: RTX-6000D, RTX-6000 Server test cases
1616
# ===============================================================================
1717

@@ -52,8 +52,24 @@ llm_perf_core:
5252
tests:
5353
#nemotron_nano_12b_v2
5454
- perf/test_perf.py::test_perf[nemotron_nano_12b_v2-bench-pytorch-bfloat16-maxbs:1-input_output_len:128,128-reqs:10-con:1] #min_latency
55+
- perf/test_perf.py::test_perf[nemotron_nano_12b_v2-bench-pytorch-bfloat16-input_output_len:500,2000-con:250] #max_throughput
56+
- perf/test_perf.py::test_perf[nemotron_nano_12b_v2-bench-pytorch-bfloat16-input_output_len:128,128] #qwen3.5_9b (dense BF16 19G, 1-GPU)
5557
#qwen3.5_27b (dense BF16 52G, 2-GPU)
58+
- perf/test_perf.py::test_perf[qwen3.5_27b-bench-pytorch-bfloat16-input_output_len:128,128-tp:2-gpus:2]
59+
- perf/test_perf.py::test_perf[qwen3.5_27b-bench-pytorch-bfloat16-input_output_len:500,2000-tp:2-gpus:2]
60+
- perf/test_perf.py::test_perf[qwen3.5_27b-bench-pytorch-bfloat16-input_output_len:2000,500-tp:2-gpus:2]
61+
- perf/test_perf.py::test_perf[qwen3.5_27b-bench-pytorch-bfloat16-input_output_len:1000,1000-tp:2-gpus:2]
62+
- perf/test_perf.py::test_perf[qwen3.5_27b-bench-pytorch-bfloat16-input_output_len:1000,2000-tp:2-gpus:2]
5663
- perf/test_perf.py::test_perf[qwen3.5_27b-bench-pytorch-bfloat16-maxbs:1-input_output_len:1000,1000-reqs:10-con:1-tp:2-gpus:2] #min_latency
64+
- perf/test_perf.py::test_perf[qwen3.5_27b-bench-pytorch-bfloat16-input_output_len:1000,1000-con:250-tp:2-gpus:2] #max_throughput
65+
#llama_v3.3_nemotron_super_49b (nemotron-nas BF16 94G, 2-GPU)
66+
- perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-input_output_len:128,128-tp:2-gpus:2]
67+
- perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-input_output_len:500,2000-tp:2-gpus:2]
68+
- perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-input_output_len:2000,500-tp:2-gpus:2]
69+
- perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-input_output_len:1000,1000-tp:2-gpus:2]
70+
- perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-input_output_len:1000,2000-tp:2-gpus:2]
71+
- perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-maxbs:1-input_output_len:1000,1000-reqs:10-con:1-tp:2-gpus:2] #min_latency
72+
- perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-input_output_len:1000,1000-con:250-tp:2-gpus:2] #max_throughput
5773
- perf/test_perf.py::test_perf[llama_v3.3_70b_instruct-bench-pytorch-streaming-bfloat16-input_output_len:128,128-gpus:4]
5874
- perf/test_perf.py::test_perf[llama_v3.3_70b_instruct-bench-pytorch-bfloat16-input_output_len:128,128-gpus:4]
5975
- perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-streaming-float8-input_output_len:2000,200-gpus:8]
@@ -83,23 +99,6 @@ llm_perf_core:
8399
- perf/test_perf.py::test_perf[gpt_oss_20b_fp4-bench-pytorch-float4-maxbs:512-maxnt:8192-input_output_len:2000,200-con:64]
84100
- perf/test_perf.py::test_perf[gpt_oss_20b_fp4-bench-pytorch-float4-maxbs:512-maxnt:8192-input_output_len:128,128]
85101
- perf/test_perf.py::test_perf[gpt_oss_20b_fp4-bench-pytorch-float4-maxbs:512-maxnt:8192-input_output_len:2000,200-con:256]
86-
- perf/test_perf.py::test_perf[nemotron_nano_12b_v2-bench-pytorch-bfloat16-input_output_len:500,2000-con:250] #max_throughput
87-
- perf/test_perf.py::test_perf[nemotron_nano_12b_v2-bench-pytorch-bfloat16-input_output_len:128,128]
88-
#qwen3.5_27b (dense BF16 52G, 2-GPU)
89-
- perf/test_perf.py::test_perf[qwen3.5_27b-bench-pytorch-bfloat16-input_output_len:128,128-tp:2-gpus:2]
90-
- perf/test_perf.py::test_perf[qwen3.5_27b-bench-pytorch-bfloat16-input_output_len:500,2000-tp:2-gpus:2]
91-
- perf/test_perf.py::test_perf[qwen3.5_27b-bench-pytorch-bfloat16-input_output_len:2000,500-tp:2-gpus:2]
92-
- perf/test_perf.py::test_perf[qwen3.5_27b-bench-pytorch-bfloat16-input_output_len:1000,1000-tp:2-gpus:2]
93-
- perf/test_perf.py::test_perf[qwen3.5_27b-bench-pytorch-bfloat16-input_output_len:1000,2000-tp:2-gpus:2]
94-
- perf/test_perf.py::test_perf[qwen3.5_27b-bench-pytorch-bfloat16-input_output_len:1000,1000-con:250-tp:2-gpus:2] #max_throughput
95-
#llama_v3.3_nemotron_super_49b (nemotron-nas BF16 94G, 2-GPU)
96-
- perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-input_output_len:128,128-tp:2-gpus:2]
97-
- perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-input_output_len:500,2000-tp:2-gpus:2]
98-
- perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-input_output_len:2000,500-tp:2-gpus:2]
99-
- perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-input_output_len:1000,1000-tp:2-gpus:2]
100-
- perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-input_output_len:1000,2000-tp:2-gpus:2]
101-
- perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-maxbs:1-input_output_len:1000,1000-reqs:10-con:1-tp:2-gpus:2] #min_latency
102-
- perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-input_output_len:1000,1000-con:250-tp:2-gpus:2] #max_throughput
103102

104103

105104
# 4: H100, H20, H200, GB200, B200, B300, GB300, RTX6000-D, RTX6000-Server test cases
@@ -143,6 +142,7 @@ llm_perf_core:
143142
- perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b_fp8-bench-pytorch-float8-maxbs:1-input_output_len:1000,1000-reqs:10-con:1-tp:2-gpus:2] #min_latency
144143
- perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b_fp8-bench-pytorch-float8-input_output_len:1000,1000-con:250-tp:2-gpus:2] #max_throughput
145144
#qwen3.5_122b_a10b (MoE BF16 234G, 4-GPU)
145+
- perf/test_perf.py::test_perf[qwen3.5_122b_a10b-bench-pytorch-bfloat16-input_output_len:128,128-ep:4-tp:4-gpus:4]
146146
- perf/test_perf.py::test_perf[qwen3.5_122b_a10b-bench-pytorch-bfloat16-input_output_len:500,2000-ep:4-tp:4-gpus:4]
147147
- perf/test_perf.py::test_perf[qwen3.5_122b_a10b-bench-pytorch-bfloat16-input_output_len:2000,500-ep:4-tp:4-gpus:4]
148148
- perf/test_perf.py::test_perf[qwen3.5_122b_a10b-bench-pytorch-bfloat16-input_output_len:1000,1000-ep:4-tp:4-gpus:4]
@@ -214,15 +214,9 @@ llm_perf_core:
214214
- perf/test_perf.py::test_perf[qwen3.5_397b_a17b_fp4-bench-pytorch-float4-maxbs:1-input_output_len:1000,1000-reqs:10-con:1-ep:4-tp:4-gpus:4] #min_latency
215215
- perf/test_perf.py::test_perf[qwen3.5_397b_a17b_fp4-bench-pytorch-float4-maxbs:512-input_output_len:1000,1000-con:512-ep:4-tp:4-gpus:4] #max_throughput
216216
#nemotron_3_super_120b_nvfp4 (Hybrid MoE+SSM+Attn FP4 76G, 4-GPU ep=4 tp=4, throughput config)
217-
#these test config come from docs/source/deployment-guide/deployment-guide-for-nemotron-3-on-trtllm.md
218217
- perf/test_perf.py::test_perf[nemotron_3_super_120b_nvfp4-serve-pytorch-float4-maxbs:512-maxnt:2048-kv_frac:0.8-input_output_len:1024,1024-reqs:5-con:1-ep:4-tp:4-gpus:4] #min_latency
219218
- perf/test_perf.py::test_perf[nemotron_3_super_120b_nvfp4-serve-pytorch-float4-maxbs:512-maxnt:2048-kv_frac:0.8-input_output_len:1024,1024-reqs:160-con:32-ep:4-tp:4-gpus:4]
220219
- perf/test_perf.py::test_perf[nemotron_3_super_120b_nvfp4-serve-pytorch-float4-maxbs:512-maxnt:2048-kv_frac:0.8-input_output_len:1024,1024-reqs:640-con:128-ep:4-tp:4-gpus:4] #max_throughput
221-
#nemotron_3_ultra_550b_nvfp4 (Hybrid MoE FP4 ~275G, 4-GPU ep=4 tp=4, throughput config, HF download)
222-
#these test config come from docs/source/deployment-guide/deployment-guide-for-nemotron-3-on-trtllm.md
223-
- perf/test_perf.py::test_perf[nemotron_3_ultra_550b_nvfp4-serve-pytorch-float4-maxbs:256-maxnt:2048-kv_frac:0.8-input_output_len:1024,1024-reqs:5-con:1-ep:4-tp:4-gpus:4] #min_latency
224-
- perf/test_perf.py::test_perf[nemotron_3_ultra_550b_nvfp4-serve-pytorch-float4-maxbs:256-maxnt:2048-kv_frac:0.8-input_output_len:1024,1024-reqs:160-con:32-ep:4-tp:4-gpus:4]
225-
- perf/test_perf.py::test_perf[nemotron_3_ultra_550b_nvfp4-serve-pytorch-float4-maxbs:256-maxnt:2048-kv_frac:0.8-input_output_len:1024,1024-reqs:640-con:128-ep:4-tp:4-gpus:4] #max_throughput
226220

227221

228222
# 7: B200, B300 test cases
@@ -290,6 +284,15 @@ llm_perf_core:
290284
- perf/test_perf.py::test_perf[minimax_m2.5_fp8-bench-pytorch-float8-input_output_len:1000,2000-ep:8-gpus:8]
291285
- perf/test_perf.py::test_perf[minimax_m2.5_fp8-bench-pytorch-float8-maxbs:1-input_output_len:1000,1000-reqs:10-con:1-ep:8-gpus:8] #min_latency
292286
- perf/test_perf.py::test_perf[minimax_m2.5_fp8-bench-pytorch-float8-maxbs:512-input_output_len:1000,1000-con:512-ep:8-gpus:8] #max_throughput
287+
# 9: H20, H200, B200, B300 test cases
288+
#llama_v3.1_nemotron_ultra_253b (nemotron-nas BF16 474G, 8-GPU)
289+
- perf/test_perf.py::test_perf[llama_v3.1_nemotron_ultra_253b-bench-pytorch-bfloat16-input_output_len:128,128-tp:8-gpus:8]
290+
- perf/test_perf.py::test_perf[llama_v3.1_nemotron_ultra_253b-bench-pytorch-bfloat16-input_output_len:500,2000-tp:8-gpus:8]
291+
- perf/test_perf.py::test_perf[llama_v3.1_nemotron_ultra_253b-bench-pytorch-bfloat16-input_output_len:2000,500-tp:8-gpus:8]
292+
- perf/test_perf.py::test_perf[llama_v3.1_nemotron_ultra_253b-bench-pytorch-bfloat16-input_output_len:1000,1000-tp:8-gpus:8]
293+
- perf/test_perf.py::test_perf[llama_v3.1_nemotron_ultra_253b-bench-pytorch-bfloat16-input_output_len:1000,2000-tp:8-gpus:8]
294+
- perf/test_perf.py::test_perf[llama_v3.1_nemotron_ultra_253b-bench-pytorch-bfloat16-maxbs:1-input_output_len:1000,1000-reqs:10-con:1-tp:8-gpus:8] #min_latency
295+
- perf/test_perf.py::test_perf[llama_v3.1_nemotron_ultra_253b-bench-pytorch-bfloat16-input_output_len:1000,1000-con:250-tp:8-gpus:8] #max_throughput
293296
#llama_v3.1_nemotron_ultra_253b_fp8 (nemotron-nas FP8 241G, 8-GPU)
294297
- perf/test_perf.py::test_perf[llama_v3.1_nemotron_ultra_253b_fp8-bench-pytorch-float8-input_output_len:128,128-tp:8-gpus:8]
295298
- perf/test_perf.py::test_perf[llama_v3.1_nemotron_ultra_253b_fp8-bench-pytorch-float8-input_output_len:500,2000-tp:8-gpus:8]
@@ -306,10 +309,8 @@ llm_perf_core:
306309
- perf/test_perf.py::test_perf[qwen3.5_397b_a17b_fp8-bench-pytorch-float8-input_output_len:1000,2000-ep:8-tp:8-gpus:8]
307310
- perf/test_perf.py::test_perf[qwen3.5_397b_a17b_fp8-bench-pytorch-float8-maxbs:1-input_output_len:1000,1000-reqs:10-con:1-ep:8-tp:8-gpus:8] #min_latency
308311
- perf/test_perf.py::test_perf[qwen3.5_397b_a17b_fp8-bench-pytorch-float8-maxbs:512-input_output_len:1000,1000-con:512-ep:8-tp:8-gpus:8] #max_throughput
309-
- perf/test_perf.py::test_perf[qwen3.5_122b_a10b-bench-pytorch-bfloat16-input_output_len:128,128-ep:4-tp:4-gpus:4]
310-
311312

312-
# 9: H20, H200, B200, B300, RTX6000-Server test cases
313+
# 9: H20, H200, B200, B300, RTX6000D, RTX6000-Server test cases
313314
- condition:
314315
ranges:
315316
system_gpu_count:
@@ -335,14 +336,6 @@ llm_perf_core:
335336
- perf/test_perf.py::test_perf[qwen3.5_397b_a17b_fp4-bench-pytorch-float4-input_output_len:1000,2000-ep:8-tp:8-gpus:8]
336337
- perf/test_perf.py::test_perf[qwen3.5_397b_a17b_fp4-bench-pytorch-float4-maxbs:1-input_output_len:1000,1000-reqs:10-con:1-ep:8-tp:8-gpus:8] #min_latency
337338
- perf/test_perf.py::test_perf[qwen3.5_397b_a17b_fp4-bench-pytorch-float4-maxbs:512-input_output_len:1000,1000-con:512-ep:8-tp:8-gpus:8] #max_throughput
338-
#llama_v3.1_nemotron_ultra_253b (nemotron-nas BF16 474G, 8-GPU)
339-
- perf/test_perf.py::test_perf[llama_v3.1_nemotron_ultra_253b-bench-pytorch-bfloat16-input_output_len:128,128-tp:8-gpus:8]
340-
- perf/test_perf.py::test_perf[llama_v3.1_nemotron_ultra_253b-bench-pytorch-bfloat16-input_output_len:500,2000-tp:8-gpus:8]
341-
- perf/test_perf.py::test_perf[llama_v3.1_nemotron_ultra_253b-bench-pytorch-bfloat16-input_output_len:2000,500-tp:8-gpus:8]
342-
- perf/test_perf.py::test_perf[llama_v3.1_nemotron_ultra_253b-bench-pytorch-bfloat16-input_output_len:1000,1000-tp:8-gpus:8]
343-
- perf/test_perf.py::test_perf[llama_v3.1_nemotron_ultra_253b-bench-pytorch-bfloat16-input_output_len:1000,2000-tp:8-gpus:8]
344-
- perf/test_perf.py::test_perf[llama_v3.1_nemotron_ultra_253b-bench-pytorch-bfloat16-maxbs:1-input_output_len:1000,1000-reqs:10-con:1-tp:8-gpus:8] #min_latency
345-
- perf/test_perf.py::test_perf[llama_v3.1_nemotron_ultra_253b-bench-pytorch-bfloat16-input_output_len:1000,1000-con:250-tp:8-gpus:8] #max_throughput
346339

347340
# 10: RTX-6000D, RTX-6000 Server test cases
348341
- condition:

tests/integration/test_lists/test-db/l0_perf.yml

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,20 @@
11
version: 0.0.1
22
l0_perf:
3+
- condition:
4+
ranges:
5+
system_gpu_count:
6+
gte: 1
7+
lte: 1
8+
wildcards:
9+
gpu:
10+
- '*h100*'
11+
linux_distribution_name: ubuntu*
12+
terms:
13+
stage: pre_merge
14+
backend: tensorrt
15+
tests:
16+
- perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-float16-input_output_len:128,128-reqs:8192]
17+
318
- condition:
419
ranges:
520
system_gpu_count:

0 commit comments

Comments
 (0)