{
"schema_version": "1.0",
"suite_id": "suite_C",
"implementation_id": "nvidia_sglang_c43a8309",
"chip": {
"name": "NVIDIA A100-SXM4-40GB",
"vendor": "NVIDIA",
"count": 1,
"memory_gb": 40.0,
"interconnect_intra_node": null,
"interconnect_inter_node": null
},
"software": {
"framework": "SGLang",
"framework_version": "0.5.6",
"driver_version": "565.57.01",
"runtime_version": "CUDA 12.8",
"os": "Ubuntu 22.04.4 LTS",
"python_version": "3.10.20"
},
"model": {
"model_id": "meta-llama/Llama-3.1-8B-Instruct",
"model_revision": "0e9e39f249a16976918f6564b8830bc894c89659",
"model_name": null,
"model_note": null,
"model_source": "local",
"architecture": "dense",
"parameter_count_b": 8.0,
"precision": "BF16",
"effective_dtype": "bfloat16",
"quantization_method": null,
"model_format": "HuggingFace original",
"_note": "suite model_id. Each precision level uses its own quantized checkpoint."
},
"task": {
"scenarios_run": [
"accuracy",
"offline",
"online",
"sustained"
],
"precision_levels_run": [
"BF16",
"FP8",
"W8A8",
"W8A16",
"W4A16"
],
"precision_levels_skipped": [
"FP16"
],
"parallelism": {
"tensor_parallel_size": 1,
"pipeline_parallel_size": 1,
"expert_parallel_size": 1,
"data_parallel_size": 1
},
"num_runs": 3,
"extra_config": null
},
"metrics": {
"quantization": {
"results_by_precision": [
{
"precision": "BF16",
"model_id": "meta-llama/Llama-3.1-8B-Instruct",
"best_throughput_tokens_per_sec": 3160.74,
"accuracy_score": 0.57,
"accuracy_baseline_delta": 0.01,
"accuracy_valid": true,
"quality_efficiency": 1801.6,
"speedup_vs_bf16": 1.0,
"results_by_concurrency": [
{
"client_concurrency": 1,
"throughput_tokens_per_sec": 3149.6,
"throughput_tokens_per_sec_per_chip": 3149.6,
"elapsed_seconds_median": 11.4,
"peak_memory_gb": null,
"power_watts_avg": null,
"power_watts_peak": null,
"oom": false,
"_throughput_note": "output_only",
"_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs."
},
{
"client_concurrency": 4,
"throughput_tokens_per_sec": 3160.74,
"throughput_tokens_per_sec_per_chip": 3160.74,
"elapsed_seconds_median": 11.3,
"peak_memory_gb": null,
"power_watts_avg": null,
"power_watts_peak": null,
"oom": false,
"_throughput_note": "output_only",
"_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs."
},
{
"client_concurrency": 16,
"throughput_tokens_per_sec": 3148.17,
"throughput_tokens_per_sec_per_chip": 3148.17,
"elapsed_seconds_median": 11.3,
"peak_memory_gb": null,
"power_watts_avg": null,
"power_watts_peak": null,
"oom": false,
"_throughput_note": "output_only",
"_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs."
},
{
"client_concurrency": 64,
"throughput_tokens_per_sec": 3156.58,
"throughput_tokens_per_sec_per_chip": 3156.58,
"elapsed_seconds_median": 11.3,
"peak_memory_gb": null,
"power_watts_avg": null,
"power_watts_peak": null,
"oom": false,
"_throughput_note": "output_only",
"_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs."
}
],
"result_dir": "bf16",
"effective_dtype": "bfloat16",
"quantization_method": null
},
{
"precision": "W8A16",
"model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a16",
"best_throughput_tokens_per_sec": 3396.91,
"accuracy_score": 0.58,
"accuracy_baseline_delta": -0.01,
"accuracy_valid": true,
"quality_efficiency": 1970.2,
"speedup_vs_bf16": 1.075,
"results_by_concurrency": [
{
"client_concurrency": 1,
"throughput_tokens_per_sec": 3396.91,
"throughput_tokens_per_sec_per_chip": 3396.91,
"elapsed_seconds_median": 10.6,
"peak_memory_gb": null,
"power_watts_avg": null,
"power_watts_peak": null,
"oom": false,
"_throughput_note": "output_only",
"_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs."
},
{
"client_concurrency": 4,
"throughput_tokens_per_sec": 3316.93,
"throughput_tokens_per_sec_per_chip": 3316.93,
"elapsed_seconds_median": 10.8,
"peak_memory_gb": null,
"power_watts_avg": null,
"power_watts_peak": null,
"oom": false,
"_throughput_note": "output_only",
"_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs."
},
{
"client_concurrency": 16,
"throughput_tokens_per_sec": 3387.33,
"throughput_tokens_per_sec_per_chip": 3387.33,
"elapsed_seconds_median": 10.6,
"peak_memory_gb": null,
"power_watts_avg": null,
"power_watts_peak": null,
"oom": false,
"_throughput_note": "output_only",
"_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs."
},
{
"client_concurrency": 64,
"throughput_tokens_per_sec": 3395.75,
"throughput_tokens_per_sec_per_chip": 3395.75,
"elapsed_seconds_median": 10.6,
"peak_memory_gb": null,
"power_watts_avg": null,
"power_watts_peak": null,
"oom": false,
"_throughput_note": "output_only",
"_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs."
}
],
"result_dir": "w8a16",
"effective_dtype": "auto",
"quantization_method": "compressed-tensors"
},
{
"precision": "W4A16",
"model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w4a16",
"best_throughput_tokens_per_sec": 1817.91,
"accuracy_score": 0.56,
"accuracy_baseline_delta": -0.01,
"accuracy_valid": true,
"quality_efficiency": 1018.0,
"speedup_vs_bf16": 0.575,
"results_by_concurrency": [
{
"client_concurrency": 1,
"throughput_tokens_per_sec": 1808.4,
"throughput_tokens_per_sec_per_chip": 1808.4,
"elapsed_seconds_median": 19.0,
"peak_memory_gb": null,
"power_watts_avg": null,
"power_watts_peak": null,
"oom": false,
"_throughput_note": "output_only",
"_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs."
},
{
"client_concurrency": 4,
"throughput_tokens_per_sec": 1810.14,
"throughput_tokens_per_sec_per_chip": 1810.14,
"elapsed_seconds_median": 19.0,
"peak_memory_gb": null,
"power_watts_avg": null,
"power_watts_peak": null,
"oom": false,
"_throughput_note": "output_only",
"_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs."
},
{
"client_concurrency": 16,
"throughput_tokens_per_sec": 1810.03,
"throughput_tokens_per_sec_per_chip": 1810.03,
"elapsed_seconds_median": 19.0,
"peak_memory_gb": null,
"power_watts_avg": null,
"power_watts_peak": null,
"oom": false,
"_throughput_note": "output_only",
"_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs."
},
{
"client_concurrency": 64,
"throughput_tokens_per_sec": 1817.91,
"throughput_tokens_per_sec_per_chip": 1817.91,
"elapsed_seconds_median": 19.0,
"peak_memory_gb": null,
"power_watts_avg": null,
"power_watts_peak": null,
"oom": false,
"_throughput_note": "output_only",
"_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs."
}
],
"result_dir": "w4a16",
"effective_dtype": "auto",
"quantization_method": "gptq"
}
]
},
"derived": {},
"quantization_online": {
"results_by_precision": [
{
"precision": "BF16",
"max_valid_qps": 50,
"results_by_qps": [
{
"target_qps": 5,
"achieved_qps": 5.0,
"ttft_ms_p50": 44.84,
"ttft_ms_p90": 63.85,
"ttft_ms_p99": 1627.45,
"tpot_ms_p50": 15.7,
"tpot_ms_p90": 17.63,
"tpot_ms_p99": 19.03,
"elapsed_seconds_median": 66.2,
"sla_met": false
},
{
"target_qps": 10,
"achieved_qps": 10.0,
"ttft_ms_p50": 47.0,
"ttft_ms_p90": 56.65,
"ttft_ms_p99": 65.26,
"tpot_ms_p50": 21.61,
"tpot_ms_p90": 23.39,
"tpot_ms_p99": 26.07,
"elapsed_seconds_median": 32.9,
"sla_met": true
},
{
"target_qps": 25,
"achieved_qps": 25.0,
"ttft_ms_p50": 52.95,
"ttft_ms_p90": 66.65,
"ttft_ms_p99": 78.1,
"tpot_ms_p50": 35.4,
"tpot_ms_p90": 40.44,
"tpot_ms_p99": 49.08,
"elapsed_seconds_median": 17.1,
"sla_met": true
},
{
"target_qps": 50,
"achieved_qps": 50.0,
"ttft_ms_p50": 51.84,
"ttft_ms_p90": 67.78,
"ttft_ms_p99": 87.9,
"tpot_ms_p50": 41.22,
"tpot_ms_p90": 49.79,
"tpot_ms_p99": 75.08,
"elapsed_seconds_median": 12.9,
"sla_met": true
}
]
},
{
"precision": "W8A16",
"max_valid_qps": 50,
"results_by_qps": [
{
"target_qps": 5,
"achieved_qps": 5.0,
"ttft_ms_p50": 36.98,
"ttft_ms_p90": 65.92,
"ttft_ms_p99": 1707.78,
"tpot_ms_p50": 9.64,
"tpot_ms_p90": 10.99,
"tpot_ms_p99": 14.19,
"elapsed_seconds_median": 64.8,
"sla_met": false
},
{
"target_qps": 10,
"achieved_qps": 10.0,
"ttft_ms_p50": 42.23,
"ttft_ms_p90": 54.06,
"ttft_ms_p99": 61.13,
"tpot_ms_p50": 15.34,
"tpot_ms_p90": 19.73,
"tpot_ms_p99": 21.49,
"elapsed_seconds_median": 31.8,
"sla_met": true
},
{
"target_qps": 25,
"achieved_qps": 25.0,
"ttft_ms_p50": 54.16,
"ttft_ms_p90": 70.81,
"ttft_ms_p99": 86.6,
"tpot_ms_p50": 38.87,
"tpot_ms_p90": 45.19,
"tpot_ms_p99": 56.31,
"elapsed_seconds_median": 17.6,
"sla_met": true
},
{
"target_qps": 50,
"achieved_qps": 50.0,
"ttft_ms_p50": 55.28,
"ttft_ms_p90": 74.41,
"ttft_ms_p99": 101.43,
"tpot_ms_p50": 47.2,
"tpot_ms_p90": 55.92,
"tpot_ms_p99": 82.54,
"elapsed_seconds_median": 14.1,
"sla_met": true
}
]
},
{
"precision": "W4A16",
"max_valid_qps": 50,
"results_by_qps": [
{
"target_qps": 5,
"achieved_qps": 5.0,
"ttft_ms_p50": 57.96,
"ttft_ms_p90": 100.93,
"ttft_ms_p99": 1674.78,
"tpot_ms_p50": 23.16,
"tpot_ms_p90": 36.67,
"tpot_ms_p99": 42.7,
"elapsed_seconds_median": 66.6,
"sla_met": false
},
{
"target_qps": 10,
"achieved_qps": 10.0,
"ttft_ms_p50": 65.68,
"ttft_ms_p90": 85.76,
"ttft_ms_p99": 92.43,
"tpot_ms_p50": 42.17,
"tpot_ms_p90": 43.43,
"tpot_ms_p99": 46.09,
"elapsed_seconds_median": 35.7,
"sla_met": true
},
{
"target_qps": 25,
"achieved_qps": 25.0,
"ttft_ms_p50": 64.12,
"ttft_ms_p90": 88.09,
"ttft_ms_p99": 113.73,
"tpot_ms_p50": 53.25,
"tpot_ms_p90": 59.64,
"tpot_ms_p99": 73.73,
"elapsed_seconds_median": 20.9,
"sla_met": true
},
{
"target_qps": 50,
"achieved_qps": 50.0,
"ttft_ms_p50": 57.15,
"ttft_ms_p90": 81.87,
"ttft_ms_p99": 103.31,
"tpot_ms_p50": 55.67,
"tpot_ms_p90": 67.41,
"tpot_ms_p99": 86.73,
"elapsed_seconds_median": 16.4,
"sla_met": true
}
]
}
]
},
"quantization_sustained": {
"results_by_precision": [
{
"precision": "BF16",
"sustained_throughput_tokens_per_sec": 558.6,
"throttle_ratio": 0.889,
"throttle_onset_minute": 1.0,
"ttft_p99_drift_ms": -2930.0,
"sustained_concurrency": 8,
"duration_minutes": 15,
"samples": [
{
"minute": 1.0,
"is_warmup": false,
"throughput_tokens_per_sec": 510.3,
"tokens_out": 30617,
"tokens_in": 0,
"requests_completed": 168,
"ttft_ms_p50": 47.0,
"ttft_ms_p99": 2980.6
},
{
"minute": 2.0,
"is_warmup": false,
"throughput_tokens_per_sec": 566.2,
"tokens_out": 33989,
"tokens_in": 0,
"requests_completed": 185,
"ttft_ms_p50": 43.1,
"ttft_ms_p99": 59.5
},
{
"minute": 3.0,
"is_warmup": false,
"throughput_tokens_per_sec": 555.8,
"tokens_out": 33345,
"tokens_in": 0,
"requests_completed": 183,
"ttft_ms_p50": 43.1,
"ttft_ms_p99": 50.9
},
{
"minute": 4.0,
"is_warmup": false,
"throughput_tokens_per_sec": 574.1,
"tokens_out": 34447,
"tokens_in": 0,
"requests_completed": 183,
"ttft_ms_p50": 42.7,
"ttft_ms_p99": 59.1
},
{
"minute": 5.0,
"is_warmup": false,
"throughput_tokens_per_sec": 564.4,
"tokens_out": 33852,
"tokens_in": 0,
"requests_completed": 182,
"ttft_ms_p50": 43.0,
"ttft_ms_p99": 45.9
},
{
"minute": 6.0,
"is_warmup": false,
"throughput_tokens_per_sec": 552.2,
"tokens_out": 33145,
"tokens_in": 0,
"requests_completed": 180,
"ttft_ms_p50": 43.3,
"ttft_ms_p99": 59.3
},
{
"minute": 7.0,
"is_warmup": false,
"throughput_tokens_per_sec": 562.1,
"tokens_out": 33715,
"tokens_in": 0,
"requests_completed": 184,
"ttft_ms_p50": 43.1,
"ttft_ms_p99": 59.1
},
{
"minute": 8.0,
"is_warmup": false,
"throughput_tokens_per_sec": 562.6,
"tokens_out": 33751,
"tokens_in": 0,
"requests_completed": 183,
"ttft_ms_p50": 43.0,
"ttft_ms_p99": 58.6
},
{
"minute": 9.0,
"is_warmup": false,
"throughput_tokens_per_sec": 565.5,
"tokens_out": 33923,
"tokens_in": 0,
"requests_completed": 183,
"ttft_ms_p50": 43.4,
"ttft_ms_p99": 46.6
},
{
"minute": 10.0,
"is_warmup": false,
"throughput_tokens_per_sec": 559.5,
"tokens_out": 33594,
"tokens_in": 0,
"requests_completed": 180,
"ttft_ms_p50": 43.5,
"ttft_ms_p99": 59.5
},
{
"minute": 11.0,
"is_warmup": false,
"throughput_tokens_per_sec": 555.9,
"tokens_out": 33329,
"tokens_in": 0,
"requests_completed": 181,
"ttft_ms_p50": 43.2,
"ttft_ms_p99": 58.6
},
{
"minute": 12.0,
"is_warmup": false,
"throughput_tokens_per_sec": 561.2,
"tokens_out": 33679,
"tokens_in": 0,
"requests_completed": 183,
"ttft_ms_p50": 43.2,
"ttft_ms_p99": 59.9
},
{
"minute": 13.0,
"is_warmup": false,
"throughput_tokens_per_sec": 568.0,
"tokens_out": 34091,
"tokens_in": 0,
"requests_completed": 186,
"ttft_ms_p50": 43.4,
"ttft_ms_p99": 57.6
},
{
"minute": 14.0,
"is_warmup": false,
"throughput_tokens_per_sec": 562.5,
"tokens_out": 33735,
"tokens_in": 0,
"requests_completed": 183,
"ttft_ms_p50": 43.4,
"ttft_ms_p99": 50.6
}
]
},
{
"precision": "W8A16",
"sustained_throughput_tokens_per_sec": 841.8,
"throttle_ratio": 0.902,
"throttle_onset_minute": null,
"ttft_p99_drift_ms": -3044.7,
"sustained_concurrency": 8,
"duration_minutes": 15,
"samples": [
{
"minute": 1.0,
"is_warmup": false,
"throughput_tokens_per_sec": 770.0,
"tokens_out": 46214,
"tokens_in": 0,
"requests_completed": 254,
"ttft_ms_p50": 35.2,
"ttft_ms_p99": 3097.4
},
{
"minute": 2.0,
"is_warmup": false,
"throughput_tokens_per_sec": 851.3,
"tokens_out": 51089,
"tokens_in": 0,
"requests_completed": 281,
"ttft_ms_p50": 34.7,
"ttft_ms_p99": 45.2
},
{
"minute": 3.0,
"is_warmup": false,
"throughput_tokens_per_sec": 851.8,
"tokens_out": 51090,
"tokens_in": 0,
"requests_completed": 275,
"ttft_ms_p50": 34.8,
"ttft_ms_p99": 52.2
},
{
"minute": 4.0,
"is_warmup": false,
"throughput_tokens_per_sec": 839.2,
"tokens_out": 50347,
"tokens_in": 0,
"requests_completed": 277,
"ttft_ms_p50": 34.8,
"ttft_ms_p99": 49.7
},
{
"minute": 5.0,
"is_warmup": false,
"throughput_tokens_per_sec": 849.4,
"tokens_out": 50977,
"tokens_in": 0,
"requests_completed": 278,
"ttft_ms_p50": 35.0,
"ttft_ms_p99": 53.0
},
{
"minute": 6.0,
"is_warmup": false,
"throughput_tokens_per_sec": 850.5,
"tokens_out": 51013,
"tokens_in": 0,
"requests_completed": 279,
"ttft_ms_p50": 34.8,
"ttft_ms_p99": 47.5
},
{
"minute": 7.0,
"is_warmup": false,
"throughput_tokens_per_sec": 850.2,
"tokens_out": 51029,
"tokens_in": 0,
"requests_completed": 275,
"ttft_ms_p50": 35.2,
"ttft_ms_p99": 52.9
},
{
"minute": 8.0,
"is_warmup": false,
"throughput_tokens_per_sec": 833.1,
"tokens_out": 49975,
"tokens_in": 0,
"requests_completed": 273,
"ttft_ms_p50": 35.1,
"ttft_ms_p99": 52.4
},
{
"minute": 9.0,
"is_warmup": false,
"throughput_tokens_per_sec": 853.8,
"tokens_out": 51245,
"tokens_in": 0,
"requests_completed": 281,
"ttft_ms_p50": 34.9,
"ttft_ms_p99": 47.7
},
{
"minute": 10.0,
"is_warmup": false,
"throughput_tokens_per_sec": 852.4,
"tokens_out": 51154,
"tokens_in": 0,
"requests_completed": 280,
"ttft_ms_p50": 35.0,
"ttft_ms_p99": 39.7
},
{
"minute": 11.0,
"is_warmup": false,
"throughput_tokens_per_sec": 849.4,
"tokens_out": 50955,
"tokens_in": 0,
"requests_completed": 277,
"ttft_ms_p50": 35.0,
"ttft_ms_p99": 51.7
},
{
"minute": 12.0,
"is_warmup": false,
"throughput_tokens_per_sec": 840.2,
"tokens_out": 50400,
"tokens_in": 0,
"requests_completed": 275,
"ttft_ms_p50": 34.9,
"ttft_ms_p99": 52.7
},
{
"minute": 13.0,
"is_warmup": false,
"throughput_tokens_per_sec": 850.7,
"tokens_out": 51044,
"tokens_in": 0,
"requests_completed": 281,
"ttft_ms_p50": 35.0,
"ttft_ms_p99": 47.5
},
{
"minute": 14.0,
"is_warmup": false,
"throughput_tokens_per_sec": 843.5,
"tokens_out": 50629,
"tokens_in": 0,
"requests_completed": 277,
"ttft_ms_p50": 35.2,
"ttft_ms_p99": 52.7
}
]
},
{
"precision": "W4A16",
"sustained_throughput_tokens_per_sec": 760.9,
"throttle_ratio": 0.887,
"throttle_onset_minute": 1.0,
"ttft_p99_drift_ms": -2750.7,
"sustained_concurrency": 8,
"duration_minutes": 15,
"samples": [
{
"minute": 1.0,
"is_warmup": false,
"throughput_tokens_per_sec": 687.5,
"tokens_out": 41259,
"tokens_in": 0,
"requests_completed": 236,
"ttft_ms_p50": 36.8,
"ttft_ms_p99": 2802.1
},
{
"minute": 2.0,
"is_warmup": false,
"throughput_tokens_per_sec": 770.2,
"tokens_out": 46209,
"tokens_in": 0,
"requests_completed": 256,
"ttft_ms_p50": 35.4,
"ttft_ms_p99": 51.3
},
{
"minute": 3.0,
"is_warmup": false,
"throughput_tokens_per_sec": 764.0,
"tokens_out": 45832,
"tokens_in": 0,
"requests_completed": 258,
"ttft_ms_p50": 35.3,
"ttft_ms_p99": 51.2
},
{
"minute": 4.0,
"is_warmup": false,
"throughput_tokens_per_sec": 768.7,
"tokens_out": 46151,
"tokens_in": 0,
"requests_completed": 257,
"ttft_ms_p50": 35.5,
"ttft_ms_p99": 51.9
},
{
"minute": 5.0,
"is_warmup": false,
"throughput_tokens_per_sec": 766.7,
"tokens_out": 45997,
"tokens_in": 0,
"requests_completed": 258,
"ttft_ms_p50": 35.3,
"ttft_ms_p99": 51.5
},
{
"minute": 6.0,
"is_warmup": false,
"throughput_tokens_per_sec": 768.2,
"tokens_out": 46086,
"tokens_in": 0,
"requests_completed": 257,
"ttft_ms_p50": 35.4,
"ttft_ms_p99": 47.7
},
{
"minute": 7.0,
"is_warmup": false,
"throughput_tokens_per_sec": 764.3,
"tokens_out": 45881,
"tokens_in": 0,
"requests_completed": 258,
"ttft_ms_p50": 35.4,
"ttft_ms_p99": 51.4
},
{
"minute": 8.0,
"is_warmup": false,
"throughput_tokens_per_sec": 768.5,
"tokens_out": 46105,
"tokens_in": 0,
"requests_completed": 260,
"ttft_ms_p50": 35.3,
"ttft_ms_p99": 51.9
},
{
"minute": 9.0,
"is_warmup": false,
"throughput_tokens_per_sec": 762.5,
"tokens_out": 45749,
"tokens_in": 0,
"requests_completed": 253,
"ttft_ms_p50": 35.2,
"ttft_ms_p99": 52.1
},
{
"minute": 10.0,
"is_warmup": false,
"throughput_tokens_per_sec": 773.0,
"tokens_out": 46367,
"tokens_in": 0,
"requests_completed": 260,
"ttft_ms_p50": 35.5,
"ttft_ms_p99": 51.4
},
{
"minute": 11.0,
"is_warmup": false,
"throughput_tokens_per_sec": 761.1,
"tokens_out": 45663,
"tokens_in": 0,
"requests_completed": 254,
"ttft_ms_p50": 35.4,
"ttft_ms_p99": 49.6
},
{
"minute": 12.0,
"is_warmup": false,
"throughput_tokens_per_sec": 760.9,
"tokens_out": 45671,
"tokens_in": 0,
"requests_completed": 256,
"ttft_ms_p50": 35.4,
"ttft_ms_p99": 45.8
},
{
"minute": 13.0,
"is_warmup": false,
"throughput_tokens_per_sec": 775.3,
"tokens_out": 46524,
"tokens_in": 0,
"requests_completed": 259,
"ttft_ms_p50": 35.4,
"ttft_ms_p99": 51.6
},
{
"minute": 14.0,
"is_warmup": false,
"throughput_tokens_per_sec": 762.3,
"tokens_out": 45727,
"tokens_in": 0,
"requests_completed": 257,
"ttft_ms_p50": 35.3,
"ttft_ms_p99": 51.4
}
]
}
]
}
},
"accuracy": null,
"meta": {
"submitted_by": "Gong-K",
"submission_type": "individual",
"date": "2026-04-30",
"time": "08:29:45",
"run_id": "651fefa6",
"run_name": "nvidia_a100_sxm4_40gbx1_suite_C_nvidia_sglang_c43a8309_651fefa6",
"flagged": null,
"reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py",
"env_info_file": "../../env_info.json",
"log_file": "run.log",
"samples_file": "samples.jsonl",
"notes": null,
"benchmark_start_time": "2026-04-30T08:26:37.946702+00:00",
"benchmark_end_time": "2026-04-30T08:29:45.379126+00:00",
"benchmark_elapsed_minutes": 76.2,
"model_load_seconds": 65.9,
"benchmark_elapsed_minutes_note": "Sum of per-precision benchmark_elapsed_minutes (excludes sleep gaps and orchestrator overhead).",
"scenario_dirs": {
"bf16/offline": "results/community/nvidia_a100_sxm4_40gbx1_suite_C_nvidia_sglang_c43a8309_651fefa6/bf16/offline",
"bf16/online": "results/community/nvidia_a100_sxm4_40gbx1_suite_C_nvidia_sglang_c43a8309_651fefa6/bf16/online",
"bf16/sustained": "results/community/nvidia_a100_sxm4_40gbx1_suite_C_nvidia_sglang_c43a8309_651fefa6/bf16/sustained",
"fp8/offline": "results/community/nvidia_a100_sxm4_40gbx1_suite_C_nvidia_sglang_c43a8309_651fefa6/fp8/offline",
"fp8/online": "results/community/nvidia_a100_sxm4_40gbx1_suite_C_nvidia_sglang_c43a8309_651fefa6/fp8/online",
"fp8/sustained": "results/community/nvidia_a100_sxm4_40gbx1_suite_C_nvidia_sglang_c43a8309_651fefa6/fp8/sustained",
"w8a8/offline": "results/community/nvidia_a100_sxm4_40gbx1_suite_C_nvidia_sglang_c43a8309_651fefa6/w8a8/offline",
"w8a8/online": "results/community/nvidia_a100_sxm4_40gbx1_suite_C_nvidia_sglang_c43a8309_651fefa6/w8a8/online",
"w8a8/sustained": "results/community/nvidia_a100_sxm4_40gbx1_suite_C_nvidia_sglang_c43a8309_651fefa6/w8a8/sustained",
"w8a16/offline": "results/community/nvidia_a100_sxm4_40gbx1_suite_C_nvidia_sglang_c43a8309_651fefa6/w8a16/offline",
"w8a16/online": "results/community/nvidia_a100_sxm4_40gbx1_suite_C_nvidia_sglang_c43a8309_651fefa6/w8a16/online",
"w8a16/sustained": "results/community/nvidia_a100_sxm4_40gbx1_suite_C_nvidia_sglang_c43a8309_651fefa6/w8a16/sustained",
"w4a16/offline": "results/community/nvidia_a100_sxm4_40gbx1_suite_C_nvidia_sglang_c43a8309_651fefa6/w4a16/offline",
"w4a16/online": "results/community/nvidia_a100_sxm4_40gbx1_suite_C_nvidia_sglang_c43a8309_651fefa6/w4a16/online",
"w4a16/sustained": "results/community/nvidia_a100_sxm4_40gbx1_suite_C_nvidia_sglang_c43a8309_651fefa6/w4a16/sustained"
},
"precision_dirs": {
"BF16": "bf16",
"FP8": "fp8",
"W8A8": "w8a8",
"W8A16": "w8a16",
"W4A16": "w4a16"
},
"precision_model_map": {
"BF16": {
"model_id": "meta-llama/Llama-3.1-8B-Instruct",
"model_revision": "0e9e39f249a16976918f6564b8830bc894c89659",
"dtype_override": "bfloat16"
},
"FP8": {
"model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8",
"model_revision": "12fd6884d2585dd4d020373e7f39f74507b31866",
"engine_kwargs": {
"quantization": "compressed-tensors"
},
"_note": "Static per-tensor FP8 (weights + activations). Requires Ampere+ (A100, A800, H20). Skipped automatically on FP16-only hardware."
},
"W8A8": {
"model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"model_revision": "e2bfb7d92784ad7d1b606c2f9644d3cefb2ec708",
"engine_kwargs": {
"quantization": "compressed-tensors"
},
"_note": "INT8 weights + INT8 activations via compressed-tensors. Exercises native int8 tensor cores."
},
"W8A16": {
"model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a16",
"model_revision": "38e03ba250017bf8ed3eeecd3a744e21f6b994a9",
"engine_kwargs": {
"quantization": "compressed-tensors"
},
"_note": "INT8 weights, FP16 activations. Weight-only quantization \u2014 reduces memory bandwidth, not compute dtype."
},
"W4A16": {
"model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w4a16",
"model_revision": "70371b1b0ea0d4eacfe1ee9056ee805629921c6e",
"engine_kwargs": {
"quantization": "gptq"
},
"_note": "INT4 weights, FP16 activations via GPTQ Marlin kernels. Weight-only quantization \u2014 larger memory saving than W8A16."
}
}
}
}
{ "schema_version": "1.0", "suite_id": "suite_C", "implementation_id": "nvidia_sglang_c43a8309", "chip": { "name": "NVIDIA A100-SXM4-40GB", "vendor": "NVIDIA", "count": 1, "memory_gb": 40.0, "interconnect_intra_node": null, "interconnect_inter_node": null }, "software": { "framework": "SGLang", "framework_version": "0.5.6", "driver_version": "565.57.01", "runtime_version": "CUDA 12.8", "os": "Ubuntu 22.04.4 LTS", "python_version": "3.10.20" }, "model": { "model_id": "meta-llama/Llama-3.1-8B-Instruct", "model_revision": "0e9e39f249a16976918f6564b8830bc894c89659", "model_name": null, "model_note": null, "model_source": "local", "architecture": "dense", "parameter_count_b": 8.0, "precision": "BF16", "effective_dtype": "bfloat16", "quantization_method": null, "model_format": "HuggingFace original", "_note": "suite model_id. Each precision level uses its own quantized checkpoint." }, "task": { "scenarios_run": [ "accuracy", "offline", "online", "sustained" ], "precision_levels_run": [ "BF16", "FP8", "W8A8", "W8A16", "W4A16" ], "precision_levels_skipped": [ "FP16" ], "parallelism": { "tensor_parallel_size": 1, "pipeline_parallel_size": 1, "expert_parallel_size": 1, "data_parallel_size": 1 }, "num_runs": 3, "extra_config": null }, "metrics": { "quantization": { "results_by_precision": [ { "precision": "BF16", "model_id": "meta-llama/Llama-3.1-8B-Instruct", "best_throughput_tokens_per_sec": 3160.74, "accuracy_score": 0.57, "accuracy_baseline_delta": 0.01, "accuracy_valid": true, "quality_efficiency": 1801.6, "speedup_vs_bf16": 1.0, "results_by_concurrency": [ { "client_concurrency": 1, "throughput_tokens_per_sec": 3149.6, "throughput_tokens_per_sec_per_chip": 3149.6, "elapsed_seconds_median": 11.4, "peak_memory_gb": null, "power_watts_avg": null, "power_watts_peak": null, "oom": false, "_throughput_note": "output_only", "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." }, { "client_concurrency": 4, "throughput_tokens_per_sec": 3160.74, "throughput_tokens_per_sec_per_chip": 3160.74, "elapsed_seconds_median": 11.3, "peak_memory_gb": null, "power_watts_avg": null, "power_watts_peak": null, "oom": false, "_throughput_note": "output_only", "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." }, { "client_concurrency": 16, "throughput_tokens_per_sec": 3148.17, "throughput_tokens_per_sec_per_chip": 3148.17, "elapsed_seconds_median": 11.3, "peak_memory_gb": null, "power_watts_avg": null, "power_watts_peak": null, "oom": false, "_throughput_note": "output_only", "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." }, { "client_concurrency": 64, "throughput_tokens_per_sec": 3156.58, "throughput_tokens_per_sec_per_chip": 3156.58, "elapsed_seconds_median": 11.3, "peak_memory_gb": null, "power_watts_avg": null, "power_watts_peak": null, "oom": false, "_throughput_note": "output_only", "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." } ], "result_dir": "bf16", "effective_dtype": "bfloat16", "quantization_method": null }, { "precision": "W8A16", "model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a16", "best_throughput_tokens_per_sec": 3396.91, "accuracy_score": 0.58, "accuracy_baseline_delta": -0.01, "accuracy_valid": true, "quality_efficiency": 1970.2, "speedup_vs_bf16": 1.075, "results_by_concurrency": [ { "client_concurrency": 1, "throughput_tokens_per_sec": 3396.91, "throughput_tokens_per_sec_per_chip": 3396.91, "elapsed_seconds_median": 10.6, "peak_memory_gb": null, "power_watts_avg": null, "power_watts_peak": null, "oom": false, "_throughput_note": "output_only", "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." }, { "client_concurrency": 4, "throughput_tokens_per_sec": 3316.93, "throughput_tokens_per_sec_per_chip": 3316.93, "elapsed_seconds_median": 10.8, "peak_memory_gb": null, "power_watts_avg": null, "power_watts_peak": null, "oom": false, "_throughput_note": "output_only", "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." }, { "client_concurrency": 16, "throughput_tokens_per_sec": 3387.33, "throughput_tokens_per_sec_per_chip": 3387.33, "elapsed_seconds_median": 10.6, "peak_memory_gb": null, "power_watts_avg": null, "power_watts_peak": null, "oom": false, "_throughput_note": "output_only", "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." }, { "client_concurrency": 64, "throughput_tokens_per_sec": 3395.75, "throughput_tokens_per_sec_per_chip": 3395.75, "elapsed_seconds_median": 10.6, "peak_memory_gb": null, "power_watts_avg": null, "power_watts_peak": null, "oom": false, "_throughput_note": "output_only", "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." } ], "result_dir": "w8a16", "effective_dtype": "auto", "quantization_method": "compressed-tensors" }, { "precision": "W4A16", "model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w4a16", "best_throughput_tokens_per_sec": 1817.91, "accuracy_score": 0.56, "accuracy_baseline_delta": -0.01, "accuracy_valid": true, "quality_efficiency": 1018.0, "speedup_vs_bf16": 0.575, "results_by_concurrency": [ { "client_concurrency": 1, "throughput_tokens_per_sec": 1808.4, "throughput_tokens_per_sec_per_chip": 1808.4, "elapsed_seconds_median": 19.0, "peak_memory_gb": null, "power_watts_avg": null, "power_watts_peak": null, "oom": false, "_throughput_note": "output_only", "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." }, { "client_concurrency": 4, "throughput_tokens_per_sec": 1810.14, "throughput_tokens_per_sec_per_chip": 1810.14, "elapsed_seconds_median": 19.0, "peak_memory_gb": null, "power_watts_avg": null, "power_watts_peak": null, "oom": false, "_throughput_note": "output_only", "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." }, { "client_concurrency": 16, "throughput_tokens_per_sec": 1810.03, "throughput_tokens_per_sec_per_chip": 1810.03, "elapsed_seconds_median": 19.0, "peak_memory_gb": null, "power_watts_avg": null, "power_watts_peak": null, "oom": false, "_throughput_note": "output_only", "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." }, { "client_concurrency": 64, "throughput_tokens_per_sec": 1817.91, "throughput_tokens_per_sec_per_chip": 1817.91, "elapsed_seconds_median": 19.0, "peak_memory_gb": null, "power_watts_avg": null, "power_watts_peak": null, "oom": false, "_throughput_note": "output_only", "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." } ], "result_dir": "w4a16", "effective_dtype": "auto", "quantization_method": "gptq" } ] }, "derived": {}, "quantization_online": { "results_by_precision": [ { "precision": "BF16", "max_valid_qps": 50, "results_by_qps": [ { "target_qps": 5, "achieved_qps": 5.0, "ttft_ms_p50": 44.84, "ttft_ms_p90": 63.85, "ttft_ms_p99": 1627.45, "tpot_ms_p50": 15.7, "tpot_ms_p90": 17.63, "tpot_ms_p99": 19.03, "elapsed_seconds_median": 66.2, "sla_met": false }, { "target_qps": 10, "achieved_qps": 10.0, "ttft_ms_p50": 47.0, "ttft_ms_p90": 56.65, "ttft_ms_p99": 65.26, "tpot_ms_p50": 21.61, "tpot_ms_p90": 23.39, "tpot_ms_p99": 26.07, "elapsed_seconds_median": 32.9, "sla_met": true }, { "target_qps": 25, "achieved_qps": 25.0, "ttft_ms_p50": 52.95, "ttft_ms_p90": 66.65, "ttft_ms_p99": 78.1, "tpot_ms_p50": 35.4, "tpot_ms_p90": 40.44, "tpot_ms_p99": 49.08, "elapsed_seconds_median": 17.1, "sla_met": true }, { "target_qps": 50, "achieved_qps": 50.0, "ttft_ms_p50": 51.84, "ttft_ms_p90": 67.78, "ttft_ms_p99": 87.9, "tpot_ms_p50": 41.22, "tpot_ms_p90": 49.79, "tpot_ms_p99": 75.08, "elapsed_seconds_median": 12.9, "sla_met": true } ] }, { "precision": "W8A16", "max_valid_qps": 50, "results_by_qps": [ { "target_qps": 5, "achieved_qps": 5.0, "ttft_ms_p50": 36.98, "ttft_ms_p90": 65.92, "ttft_ms_p99": 1707.78, "tpot_ms_p50": 9.64, "tpot_ms_p90": 10.99, "tpot_ms_p99": 14.19, "elapsed_seconds_median": 64.8, "sla_met": false }, { "target_qps": 10, "achieved_qps": 10.0, "ttft_ms_p50": 42.23, "ttft_ms_p90": 54.06, "ttft_ms_p99": 61.13, "tpot_ms_p50": 15.34, "tpot_ms_p90": 19.73, "tpot_ms_p99": 21.49, "elapsed_seconds_median": 31.8, "sla_met": true }, { "target_qps": 25, "achieved_qps": 25.0, "ttft_ms_p50": 54.16, "ttft_ms_p90": 70.81, "ttft_ms_p99": 86.6, "tpot_ms_p50": 38.87, "tpot_ms_p90": 45.19, "tpot_ms_p99": 56.31, "elapsed_seconds_median": 17.6, "sla_met": true }, { "target_qps": 50, "achieved_qps": 50.0, "ttft_ms_p50": 55.28, "ttft_ms_p90": 74.41, "ttft_ms_p99": 101.43, "tpot_ms_p50": 47.2, "tpot_ms_p90": 55.92, "tpot_ms_p99": 82.54, "elapsed_seconds_median": 14.1, "sla_met": true } ] }, { "precision": "W4A16", "max_valid_qps": 50, "results_by_qps": [ { "target_qps": 5, "achieved_qps": 5.0, "ttft_ms_p50": 57.96, "ttft_ms_p90": 100.93, "ttft_ms_p99": 1674.78, "tpot_ms_p50": 23.16, "tpot_ms_p90": 36.67, "tpot_ms_p99": 42.7, "elapsed_seconds_median": 66.6, "sla_met": false }, { "target_qps": 10, "achieved_qps": 10.0, "ttft_ms_p50": 65.68, "ttft_ms_p90": 85.76, "ttft_ms_p99": 92.43, "tpot_ms_p50": 42.17, "tpot_ms_p90": 43.43, "tpot_ms_p99": 46.09, "elapsed_seconds_median": 35.7, "sla_met": true }, { "target_qps": 25, "achieved_qps": 25.0, "ttft_ms_p50": 64.12, "ttft_ms_p90": 88.09, "ttft_ms_p99": 113.73, "tpot_ms_p50": 53.25, "tpot_ms_p90": 59.64, "tpot_ms_p99": 73.73, "elapsed_seconds_median": 20.9, "sla_met": true }, { "target_qps": 50, "achieved_qps": 50.0, "ttft_ms_p50": 57.15, "ttft_ms_p90": 81.87, "ttft_ms_p99": 103.31, "tpot_ms_p50": 55.67, "tpot_ms_p90": 67.41, "tpot_ms_p99": 86.73, "elapsed_seconds_median": 16.4, "sla_met": true } ] } ] }, "quantization_sustained": { "results_by_precision": [ { "precision": "BF16", "sustained_throughput_tokens_per_sec": 558.6, "throttle_ratio": 0.889, "throttle_onset_minute": 1.0, "ttft_p99_drift_ms": -2930.0, "sustained_concurrency": 8, "duration_minutes": 15, "samples": [ { "minute": 1.0, "is_warmup": false, "throughput_tokens_per_sec": 510.3, "tokens_out": 30617, "tokens_in": 0, "requests_completed": 168, "ttft_ms_p50": 47.0, "ttft_ms_p99": 2980.6 }, { "minute": 2.0, "is_warmup": false, "throughput_tokens_per_sec": 566.2, "tokens_out": 33989, "tokens_in": 0, "requests_completed": 185, "ttft_ms_p50": 43.1, "ttft_ms_p99": 59.5 }, { "minute": 3.0, "is_warmup": false, "throughput_tokens_per_sec": 555.8, "tokens_out": 33345, "tokens_in": 0, "requests_completed": 183, "ttft_ms_p50": 43.1, "ttft_ms_p99": 50.9 }, { "minute": 4.0, "is_warmup": false, "throughput_tokens_per_sec": 574.1, "tokens_out": 34447, "tokens_in": 0, "requests_completed": 183, "ttft_ms_p50": 42.7, "ttft_ms_p99": 59.1 }, { "minute": 5.0, "is_warmup": false, "throughput_tokens_per_sec": 564.4, "tokens_out": 33852, "tokens_in": 0, "requests_completed": 182, "ttft_ms_p50": 43.0, "ttft_ms_p99": 45.9 }, { "minute": 6.0, "is_warmup": false, "throughput_tokens_per_sec": 552.2, "tokens_out": 33145, "tokens_in": 0, "requests_completed": 180, "ttft_ms_p50": 43.3, "ttft_ms_p99": 59.3 }, { "minute": 7.0, "is_warmup": false, "throughput_tokens_per_sec": 562.1, "tokens_out": 33715, "tokens_in": 0, "requests_completed": 184, "ttft_ms_p50": 43.1, "ttft_ms_p99": 59.1 }, { "minute": 8.0, "is_warmup": false, "throughput_tokens_per_sec": 562.6, "tokens_out": 33751, "tokens_in": 0, "requests_completed": 183, "ttft_ms_p50": 43.0, "ttft_ms_p99": 58.6 }, { "minute": 9.0, "is_warmup": false, "throughput_tokens_per_sec": 565.5, "tokens_out": 33923, "tokens_in": 0, "requests_completed": 183, "ttft_ms_p50": 43.4, "ttft_ms_p99": 46.6 }, { "minute": 10.0, "is_warmup": false, "throughput_tokens_per_sec": 559.5, "tokens_out": 33594, "tokens_in": 0, "requests_completed": 180, "ttft_ms_p50": 43.5, "ttft_ms_p99": 59.5 }, { "minute": 11.0, "is_warmup": false, "throughput_tokens_per_sec": 555.9, "tokens_out": 33329, "tokens_in": 0, "requests_completed": 181, "ttft_ms_p50": 43.2, "ttft_ms_p99": 58.6 }, { "minute": 12.0, "is_warmup": false, "throughput_tokens_per_sec": 561.2, "tokens_out": 33679, "tokens_in": 0, "requests_completed": 183, "ttft_ms_p50": 43.2, "ttft_ms_p99": 59.9 }, { "minute": 13.0, "is_warmup": false, "throughput_tokens_per_sec": 568.0, "tokens_out": 34091, "tokens_in": 0, "requests_completed": 186, "ttft_ms_p50": 43.4, "ttft_ms_p99": 57.6 }, { "minute": 14.0, "is_warmup": false, "throughput_tokens_per_sec": 562.5, "tokens_out": 33735, "tokens_in": 0, "requests_completed": 183, "ttft_ms_p50": 43.4, "ttft_ms_p99": 50.6 } ] }, { "precision": "W8A16", "sustained_throughput_tokens_per_sec": 841.8, "throttle_ratio": 0.902, "throttle_onset_minute": null, "ttft_p99_drift_ms": -3044.7, "sustained_concurrency": 8, "duration_minutes": 15, "samples": [ { "minute": 1.0, "is_warmup": false, "throughput_tokens_per_sec": 770.0, "tokens_out": 46214, "tokens_in": 0, "requests_completed": 254, "ttft_ms_p50": 35.2, "ttft_ms_p99": 3097.4 }, { "minute": 2.0, "is_warmup": false, "throughput_tokens_per_sec": 851.3, "tokens_out": 51089, "tokens_in": 0, "requests_completed": 281, "ttft_ms_p50": 34.7, "ttft_ms_p99": 45.2 }, { "minute": 3.0, "is_warmup": false, "throughput_tokens_per_sec": 851.8, "tokens_out": 51090, "tokens_in": 0, "requests_completed": 275, "ttft_ms_p50": 34.8, "ttft_ms_p99": 52.2 }, { "minute": 4.0, "is_warmup": false, "throughput_tokens_per_sec": 839.2, "tokens_out": 50347, "tokens_in": 0, "requests_completed": 277, "ttft_ms_p50": 34.8, "ttft_ms_p99": 49.7 }, { "minute": 5.0, "is_warmup": false, "throughput_tokens_per_sec": 849.4, "tokens_out": 50977, "tokens_in": 0, "requests_completed": 278, "ttft_ms_p50": 35.0, "ttft_ms_p99": 53.0 }, { "minute": 6.0, "is_warmup": false, "throughput_tokens_per_sec": 850.5, "tokens_out": 51013, "tokens_in": 0, "requests_completed": 279, "ttft_ms_p50": 34.8, "ttft_ms_p99": 47.5 }, { "minute": 7.0, "is_warmup": false, "throughput_tokens_per_sec": 850.2, "tokens_out": 51029, "tokens_in": 0, "requests_completed": 275, "ttft_ms_p50": 35.2, "ttft_ms_p99": 52.9 }, { "minute": 8.0, "is_warmup": false, "throughput_tokens_per_sec": 833.1, "tokens_out": 49975, "tokens_in": 0, "requests_completed": 273, "ttft_ms_p50": 35.1, "ttft_ms_p99": 52.4 }, { "minute": 9.0, "is_warmup": false, "throughput_tokens_per_sec": 853.8, "tokens_out": 51245, "tokens_in": 0, "requests_completed": 281, "ttft_ms_p50": 34.9, "ttft_ms_p99": 47.7 }, { "minute": 10.0, "is_warmup": false, "throughput_tokens_per_sec": 852.4, "tokens_out": 51154, "tokens_in": 0, "requests_completed": 280, "ttft_ms_p50": 35.0, "ttft_ms_p99": 39.7 }, { "minute": 11.0, "is_warmup": false, "throughput_tokens_per_sec": 849.4, "tokens_out": 50955, "tokens_in": 0, "requests_completed": 277, "ttft_ms_p50": 35.0, "ttft_ms_p99": 51.7 }, { "minute": 12.0, "is_warmup": false, "throughput_tokens_per_sec": 840.2, "tokens_out": 50400, "tokens_in": 0, "requests_completed": 275, "ttft_ms_p50": 34.9, "ttft_ms_p99": 52.7 }, { "minute": 13.0, "is_warmup": false, "throughput_tokens_per_sec": 850.7, "tokens_out": 51044, "tokens_in": 0, "requests_completed": 281, "ttft_ms_p50": 35.0, "ttft_ms_p99": 47.5 }, { "minute": 14.0, "is_warmup": false, "throughput_tokens_per_sec": 843.5, "tokens_out": 50629, "tokens_in": 0, "requests_completed": 277, "ttft_ms_p50": 35.2, "ttft_ms_p99": 52.7 } ] }, { "precision": "W4A16", "sustained_throughput_tokens_per_sec": 760.9, "throttle_ratio": 0.887, "throttle_onset_minute": 1.0, "ttft_p99_drift_ms": -2750.7, "sustained_concurrency": 8, "duration_minutes": 15, "samples": [ { "minute": 1.0, "is_warmup": false, "throughput_tokens_per_sec": 687.5, "tokens_out": 41259, "tokens_in": 0, "requests_completed": 236, "ttft_ms_p50": 36.8, "ttft_ms_p99": 2802.1 }, { "minute": 2.0, "is_warmup": false, "throughput_tokens_per_sec": 770.2, "tokens_out": 46209, "tokens_in": 0, "requests_completed": 256, "ttft_ms_p50": 35.4, "ttft_ms_p99": 51.3 }, { "minute": 3.0, "is_warmup": false, "throughput_tokens_per_sec": 764.0, "tokens_out": 45832, "tokens_in": 0, "requests_completed": 258, "ttft_ms_p50": 35.3, "ttft_ms_p99": 51.2 }, { "minute": 4.0, "is_warmup": false, "throughput_tokens_per_sec": 768.7, "tokens_out": 46151, "tokens_in": 0, "requests_completed": 257, "ttft_ms_p50": 35.5, "ttft_ms_p99": 51.9 }, { "minute": 5.0, "is_warmup": false, "throughput_tokens_per_sec": 766.7, "tokens_out": 45997, "tokens_in": 0, "requests_completed": 258, "ttft_ms_p50": 35.3, "ttft_ms_p99": 51.5 }, { "minute": 6.0, "is_warmup": false, "throughput_tokens_per_sec": 768.2, "tokens_out": 46086, "tokens_in": 0, "requests_completed": 257, "ttft_ms_p50": 35.4, "ttft_ms_p99": 47.7 }, { "minute": 7.0, "is_warmup": false, "throughput_tokens_per_sec": 764.3, "tokens_out": 45881, "tokens_in": 0, "requests_completed": 258, "ttft_ms_p50": 35.4, "ttft_ms_p99": 51.4 }, { "minute": 8.0, "is_warmup": false, "throughput_tokens_per_sec": 768.5, "tokens_out": 46105, "tokens_in": 0, "requests_completed": 260, "ttft_ms_p50": 35.3, "ttft_ms_p99": 51.9 }, { "minute": 9.0, "is_warmup": false, "throughput_tokens_per_sec": 762.5, "tokens_out": 45749, "tokens_in": 0, "requests_completed": 253, "ttft_ms_p50": 35.2, "ttft_ms_p99": 52.1 }, { "minute": 10.0, "is_warmup": false, "throughput_tokens_per_sec": 773.0, "tokens_out": 46367, "tokens_in": 0, "requests_completed": 260, "ttft_ms_p50": 35.5, "ttft_ms_p99": 51.4 }, { "minute": 11.0, "is_warmup": false, "throughput_tokens_per_sec": 761.1, "tokens_out": 45663, "tokens_in": 0, "requests_completed": 254, "ttft_ms_p50": 35.4, "ttft_ms_p99": 49.6 }, { "minute": 12.0, "is_warmup": false, "throughput_tokens_per_sec": 760.9, "tokens_out": 45671, "tokens_in": 0, "requests_completed": 256, "ttft_ms_p50": 35.4, "ttft_ms_p99": 45.8 }, { "minute": 13.0, "is_warmup": false, "throughput_tokens_per_sec": 775.3, "tokens_out": 46524, "tokens_in": 0, "requests_completed": 259, "ttft_ms_p50": 35.4, "ttft_ms_p99": 51.6 }, { "minute": 14.0, "is_warmup": false, "throughput_tokens_per_sec": 762.3, "tokens_out": 45727, "tokens_in": 0, "requests_completed": 257, "ttft_ms_p50": 35.3, "ttft_ms_p99": 51.4 } ] } ] } }, "accuracy": null, "meta": { "submitted_by": "Gong-K", "submission_type": "individual", "date": "2026-04-30", "time": "08:29:45", "run_id": "651fefa6", "run_name": "nvidia_a100_sxm4_40gbx1_suite_C_nvidia_sglang_c43a8309_651fefa6", "flagged": null, "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", "env_info_file": "../../env_info.json", "log_file": "run.log", "samples_file": "samples.jsonl", "notes": null, "benchmark_start_time": "2026-04-30T08:26:37.946702+00:00", "benchmark_end_time": "2026-04-30T08:29:45.379126+00:00", "benchmark_elapsed_minutes": 76.2, "model_load_seconds": 65.9, "benchmark_elapsed_minutes_note": "Sum of per-precision benchmark_elapsed_minutes (excludes sleep gaps and orchestrator overhead).", "scenario_dirs": { "bf16/offline": "results/community/nvidia_a100_sxm4_40gbx1_suite_C_nvidia_sglang_c43a8309_651fefa6/bf16/offline", "bf16/online": "results/community/nvidia_a100_sxm4_40gbx1_suite_C_nvidia_sglang_c43a8309_651fefa6/bf16/online", "bf16/sustained": "results/community/nvidia_a100_sxm4_40gbx1_suite_C_nvidia_sglang_c43a8309_651fefa6/bf16/sustained", "fp8/offline": "results/community/nvidia_a100_sxm4_40gbx1_suite_C_nvidia_sglang_c43a8309_651fefa6/fp8/offline", "fp8/online": "results/community/nvidia_a100_sxm4_40gbx1_suite_C_nvidia_sglang_c43a8309_651fefa6/fp8/online", "fp8/sustained": "results/community/nvidia_a100_sxm4_40gbx1_suite_C_nvidia_sglang_c43a8309_651fefa6/fp8/sustained", "w8a8/offline": "results/community/nvidia_a100_sxm4_40gbx1_suite_C_nvidia_sglang_c43a8309_651fefa6/w8a8/offline", "w8a8/online": "results/community/nvidia_a100_sxm4_40gbx1_suite_C_nvidia_sglang_c43a8309_651fefa6/w8a8/online", "w8a8/sustained": "results/community/nvidia_a100_sxm4_40gbx1_suite_C_nvidia_sglang_c43a8309_651fefa6/w8a8/sustained", "w8a16/offline": "results/community/nvidia_a100_sxm4_40gbx1_suite_C_nvidia_sglang_c43a8309_651fefa6/w8a16/offline", "w8a16/online": "results/community/nvidia_a100_sxm4_40gbx1_suite_C_nvidia_sglang_c43a8309_651fefa6/w8a16/online", "w8a16/sustained": "results/community/nvidia_a100_sxm4_40gbx1_suite_C_nvidia_sglang_c43a8309_651fefa6/w8a16/sustained", "w4a16/offline": "results/community/nvidia_a100_sxm4_40gbx1_suite_C_nvidia_sglang_c43a8309_651fefa6/w4a16/offline", "w4a16/online": "results/community/nvidia_a100_sxm4_40gbx1_suite_C_nvidia_sglang_c43a8309_651fefa6/w4a16/online", "w4a16/sustained": "results/community/nvidia_a100_sxm4_40gbx1_suite_C_nvidia_sglang_c43a8309_651fefa6/w4a16/sustained" }, "precision_dirs": { "BF16": "bf16", "FP8": "fp8", "W8A8": "w8a8", "W8A16": "w8a16", "W4A16": "w4a16" }, "precision_model_map": { "BF16": { "model_id": "meta-llama/Llama-3.1-8B-Instruct", "model_revision": "0e9e39f249a16976918f6564b8830bc894c89659", "dtype_override": "bfloat16" }, "FP8": { "model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8", "model_revision": "12fd6884d2585dd4d020373e7f39f74507b31866", "engine_kwargs": { "quantization": "compressed-tensors" }, "_note": "Static per-tensor FP8 (weights + activations). Requires Ampere+ (A100, A800, H20). Skipped automatically on FP16-only hardware." }, "W8A8": { "model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", "model_revision": "e2bfb7d92784ad7d1b606c2f9644d3cefb2ec708", "engine_kwargs": { "quantization": "compressed-tensors" }, "_note": "INT8 weights + INT8 activations via compressed-tensors. Exercises native int8 tensor cores." }, "W8A16": { "model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a16", "model_revision": "38e03ba250017bf8ed3eeecd3a744e21f6b994a9", "engine_kwargs": { "quantization": "compressed-tensors" }, "_note": "INT8 weights, FP16 activations. Weight-only quantization \u2014 reduces memory bandwidth, not compute dtype." }, "W4A16": { "model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w4a16", "model_revision": "70371b1b0ea0d4eacfe1ee9056ee805629921c6e", "engine_kwargs": { "quantization": "gptq" }, "_note": "INT4 weights, FP16 activations via GPTQ Marlin kernels. Weight-only quantization \u2014 larger memory saving than W8A16." } } } }