Skip to content

[Submission] nvidia_a100_sxm4_40gbx1_suite_C_nvidia_sglang_c43a8309_651fefa6 #39

@Gong-K

Description

@Gong-K
{
  "schema_version": "1.0",
  "suite_id": "suite_C",
  "implementation_id": "nvidia_sglang_c43a8309",
  "chip": {
    "name": "NVIDIA A100-SXM4-40GB",
    "vendor": "NVIDIA",
    "count": 1,
    "memory_gb": 40.0,
    "interconnect_intra_node": null,
    "interconnect_inter_node": null
  },
  "software": {
    "framework": "SGLang",
    "framework_version": "0.5.6",
    "driver_version": "565.57.01",
    "runtime_version": "CUDA 12.8",
    "os": "Ubuntu 22.04.4 LTS",
    "python_version": "3.10.20"
  },
  "model": {
    "model_id": "meta-llama/Llama-3.1-8B-Instruct",
    "model_revision": "0e9e39f249a16976918f6564b8830bc894c89659",
    "model_name": null,
    "model_note": null,
    "model_source": "local",
    "architecture": "dense",
    "parameter_count_b": 8.0,
    "precision": "BF16",
    "effective_dtype": "bfloat16",
    "quantization_method": null,
    "model_format": "HuggingFace original",
    "_note": "This model_id is the suite-level model_id. Each precision level uses its own quantized checkpoint."
  },
  "task": {
    "scenarios_run": [
      "accuracy",
      "offline",
      "online",
      "sustained"
    ],
    "precision_levels_run": [
      "BF16",
      "FP8",
      "W8A8",
      "W8A16",
      "W4A16"
    ],
    "precision_levels_skipped": [
      "FP16"
    ],
    "parallelism": {
      "tensor_parallel_size": 1,
      "pipeline_parallel_size": 1,
      "expert_parallel_size": 1,
      "data_parallel_size": 1
    },
    "num_runs": 3,
    "extra_config": null
  },
  "metrics": {
    "quantization": {
      "results_by_precision": [
        {
          "precision": "BF16",
          "model_id": "meta-llama/Llama-3.1-8B-Instruct",
          "best_throughput_tokens_per_sec": 3160.74,
          "accuracy_score": 0.57,
          "accuracy_baseline_delta": 0.01,
          "accuracy_valid": true,
          "quality_efficiency": 1801.6,
          "speedup_vs_bf16": 1.0,
          "results_by_concurrency": [
            {
              "client_concurrency": 1,
              "throughput_tokens_per_sec": 3149.6,
              "throughput_tokens_per_sec_per_chip": 3149.6,
              "elapsed_seconds_median": 11.4,
              "peak_memory_gb": null,
              "power_watts_avg": null,
              "power_watts_peak": null,
              "oom": false,
              "_throughput_note": "output_only",
              "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs."
            },
            {
              "client_concurrency": 4,
              "throughput_tokens_per_sec": 3160.74,
              "throughput_tokens_per_sec_per_chip": 3160.74,
              "elapsed_seconds_median": 11.3,
              "peak_memory_gb": null,
              "power_watts_avg": null,
              "power_watts_peak": null,
              "oom": false,
              "_throughput_note": "output_only",
              "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs."
            },
            {
              "client_concurrency": 16,
              "throughput_tokens_per_sec": 3148.17,
              "throughput_tokens_per_sec_per_chip": 3148.17,
              "elapsed_seconds_median": 11.3,
              "peak_memory_gb": null,
              "power_watts_avg": null,
              "power_watts_peak": null,
              "oom": false,
              "_throughput_note": "output_only",
              "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs."
            },
            {
              "client_concurrency": 64,
              "throughput_tokens_per_sec": 3156.58,
              "throughput_tokens_per_sec_per_chip": 3156.58,
              "elapsed_seconds_median": 11.3,
              "peak_memory_gb": null,
              "power_watts_avg": null,
              "power_watts_peak": null,
              "oom": false,
              "_throughput_note": "output_only",
              "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs."
            }
          ],
          "result_dir": "bf16",
          "effective_dtype": "bfloat16",
          "quantization_method": null
        },
        {
          "precision": "W8A16",
          "model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a16",
          "best_throughput_tokens_per_sec": 3396.91,
          "accuracy_score": 0.58,
          "accuracy_baseline_delta": -0.01,
          "accuracy_valid": true,
          "quality_efficiency": 1970.2,
          "speedup_vs_bf16": 1.075,
          "results_by_concurrency": [
            {
              "client_concurrency": 1,
              "throughput_tokens_per_sec": 3396.91,
              "throughput_tokens_per_sec_per_chip": 3396.91,
              "elapsed_seconds_median": 10.6,
              "peak_memory_gb": null,
              "power_watts_avg": null,
              "power_watts_peak": null,
              "oom": false,
              "_throughput_note": "output_only",
              "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs."
            },
            {
              "client_concurrency": 4,
              "throughput_tokens_per_sec": 3316.93,
              "throughput_tokens_per_sec_per_chip": 3316.93,
              "elapsed_seconds_median": 10.8,
              "peak_memory_gb": null,
              "power_watts_avg": null,
              "power_watts_peak": null,
              "oom": false,
              "_throughput_note": "output_only",
              "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs."
            },
            {
              "client_concurrency": 16,
              "throughput_tokens_per_sec": 3387.33,
              "throughput_tokens_per_sec_per_chip": 3387.33,
              "elapsed_seconds_median": 10.6,
              "peak_memory_gb": null,
              "power_watts_avg": null,
              "power_watts_peak": null,
              "oom": false,
              "_throughput_note": "output_only",
              "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs."
            },
            {
              "client_concurrency": 64,
              "throughput_tokens_per_sec": 3395.75,
              "throughput_tokens_per_sec_per_chip": 3395.75,
              "elapsed_seconds_median": 10.6,
              "peak_memory_gb": null,
              "power_watts_avg": null,
              "power_watts_peak": null,
              "oom": false,
              "_throughput_note": "output_only",
              "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs."
            }
          ],
          "result_dir": "w8a16",
          "effective_dtype": "auto",
          "quantization_method": "compressed-tensors"
        },
        {
          "precision": "W4A16",
          "model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w4a16",
          "best_throughput_tokens_per_sec": 1817.91,
          "accuracy_score": 0.56,
          "accuracy_baseline_delta": -0.01,
          "accuracy_valid": true,
          "quality_efficiency": 1018.0,
          "speedup_vs_bf16": 0.575,
          "results_by_concurrency": [
            {
              "client_concurrency": 1,
              "throughput_tokens_per_sec": 1808.4,
              "throughput_tokens_per_sec_per_chip": 1808.4,
              "elapsed_seconds_median": 19.0,
              "peak_memory_gb": null,
              "power_watts_avg": null,
              "power_watts_peak": null,
              "oom": false,
              "_throughput_note": "output_only",
              "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs."
            },
            {
              "client_concurrency": 4,
              "throughput_tokens_per_sec": 1810.14,
              "throughput_tokens_per_sec_per_chip": 1810.14,
              "elapsed_seconds_median": 19.0,
              "peak_memory_gb": null,
              "power_watts_avg": null,
              "power_watts_peak": null,
              "oom": false,
              "_throughput_note": "output_only",
              "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs."
            },
            {
              "client_concurrency": 16,
              "throughput_tokens_per_sec": 1810.03,
              "throughput_tokens_per_sec_per_chip": 1810.03,
              "elapsed_seconds_median": 19.0,
              "peak_memory_gb": null,
              "power_watts_avg": null,
              "power_watts_peak": null,
              "oom": false,
              "_throughput_note": "output_only",
              "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs."
            },
            {
              "client_concurrency": 64,
              "throughput_tokens_per_sec": 1817.91,
              "throughput_tokens_per_sec_per_chip": 1817.91,
              "elapsed_seconds_median": 19.0,
              "peak_memory_gb": null,
              "power_watts_avg": null,
              "power_watts_peak": null,
              "oom": false,
              "_throughput_note": "output_only",
              "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs."
            }
          ],
          "result_dir": "w4a16",
          "effective_dtype": "auto",
          "quantization_method": "gptq"
        }
      ]
    },
    "derived": {},
    "quantization_online": {
      "results_by_precision": [
        {
          "precision": "BF16",
          "max_valid_qps": 50,
          "results_by_qps": [
            {
              "target_qps": 5,
              "achieved_qps": 5.0,
              "ttft_ms_p50": 44.84,
              "ttft_ms_p90": 63.85,
              "ttft_ms_p99": 1627.45,
              "tpot_ms_p50": 15.7,
              "tpot_ms_p90": 17.63,
              "tpot_ms_p99": 19.03,
              "elapsed_seconds_median": 66.2,
              "sla_met": false
            },
            {
              "target_qps": 10,
              "achieved_qps": 10.0,
              "ttft_ms_p50": 47.0,
              "ttft_ms_p90": 56.65,
              "ttft_ms_p99": 65.26,
              "tpot_ms_p50": 21.61,
              "tpot_ms_p90": 23.39,
              "tpot_ms_p99": 26.07,
              "elapsed_seconds_median": 32.9,
              "sla_met": true
            },
            {
              "target_qps": 25,
              "achieved_qps": 25.0,
              "ttft_ms_p50": 52.95,
              "ttft_ms_p90": 66.65,
              "ttft_ms_p99": 78.1,
              "tpot_ms_p50": 35.4,
              "tpot_ms_p90": 40.44,
              "tpot_ms_p99": 49.08,
              "elapsed_seconds_median": 17.1,
              "sla_met": true
            },
            {
              "target_qps": 50,
              "achieved_qps": 50.0,
              "ttft_ms_p50": 51.84,
              "ttft_ms_p90": 67.78,
              "ttft_ms_p99": 87.9,
              "tpot_ms_p50": 41.22,
              "tpot_ms_p90": 49.79,
              "tpot_ms_p99": 75.08,
              "elapsed_seconds_median": 12.9,
              "sla_met": true
            }
          ]
        },
        {
          "precision": "W8A16",
          "max_valid_qps": 50,
          "results_by_qps": [
            {
              "target_qps": 5,
              "achieved_qps": 5.0,
              "ttft_ms_p50": 36.98,
              "ttft_ms_p90": 65.92,
              "ttft_ms_p99": 1707.78,
              "tpot_ms_p50": 9.64,
              "tpot_ms_p90": 10.99,
              "tpot_ms_p99": 14.19,
              "elapsed_seconds_median": 64.8,
              "sla_met": false
            },
            {
              "target_qps": 10,
              "achieved_qps": 10.0,
              "ttft_ms_p50": 42.23,
              "ttft_ms_p90": 54.06,
              "ttft_ms_p99": 61.13,
              "tpot_ms_p50": 15.34,
              "tpot_ms_p90": 19.73,
              "tpot_ms_p99": 21.49,
              "elapsed_seconds_median": 31.8,
              "sla_met": true
            },
            {
              "target_qps": 25,
              "achieved_qps": 25.0,
              "ttft_ms_p50": 54.16,
              "ttft_ms_p90": 70.81,
              "ttft_ms_p99": 86.6,
              "tpot_ms_p50": 38.87,
              "tpot_ms_p90": 45.19,
              "tpot_ms_p99": 56.31,
              "elapsed_seconds_median": 17.6,
              "sla_met": true
            },
            {
              "target_qps": 50,
              "achieved_qps": 50.0,
              "ttft_ms_p50": 55.28,
              "ttft_ms_p90": 74.41,
              "ttft_ms_p99": 101.43,
              "tpot_ms_p50": 47.2,
              "tpot_ms_p90": 55.92,
              "tpot_ms_p99": 82.54,
              "elapsed_seconds_median": 14.1,
              "sla_met": true
            }
          ]
        },
        {
          "precision": "W4A16",
          "max_valid_qps": 50,
          "results_by_qps": [
            {
              "target_qps": 5,
              "achieved_qps": 5.0,
              "ttft_ms_p50": 57.96,
              "ttft_ms_p90": 100.93,
              "ttft_ms_p99": 1674.78,
              "tpot_ms_p50": 23.16,
              "tpot_ms_p90": 36.67,
              "tpot_ms_p99": 42.7,
              "elapsed_seconds_median": 66.6,
              "sla_met": false
            },
            {
              "target_qps": 10,
              "achieved_qps": 10.0,
              "ttft_ms_p50": 65.68,
              "ttft_ms_p90": 85.76,
              "ttft_ms_p99": 92.43,
              "tpot_ms_p50": 42.17,
              "tpot_ms_p90": 43.43,
              "tpot_ms_p99": 46.09,
              "elapsed_seconds_median": 35.7,
              "sla_met": true
            },
            {
              "target_qps": 25,
              "achieved_qps": 25.0,
              "ttft_ms_p50": 64.12,
              "ttft_ms_p90": 88.09,
              "ttft_ms_p99": 113.73,
              "tpot_ms_p50": 53.25,
              "tpot_ms_p90": 59.64,
              "tpot_ms_p99": 73.73,
              "elapsed_seconds_median": 20.9,
              "sla_met": true
            },
            {
              "target_qps": 50,
              "achieved_qps": 50.0,
              "ttft_ms_p50": 57.15,
              "ttft_ms_p90": 81.87,
              "ttft_ms_p99": 103.31,
              "tpot_ms_p50": 55.67,
              "tpot_ms_p90": 67.41,
              "tpot_ms_p99": 86.73,
              "elapsed_seconds_median": 16.4,
              "sla_met": true
            }
          ]
        }
      ]
    },
    "quantization_sustained": {
      "results_by_precision": [
        {
          "precision": "BF16",
          "sustained_throughput_tokens_per_sec": 558.6,
          "throttle_ratio": 0.889,
          "throttle_onset_minute": 1.0,
          "ttft_p99_drift_ms": -2930.0,
          "sustained_concurrency": 8,
          "duration_minutes": 15,
          "samples": [
            {
              "minute": 1.0,
              "is_warmup": false,
              "throughput_tokens_per_sec": 510.3,
              "tokens_out": 30617,
              "tokens_in": 0,
              "requests_completed": 168,
              "ttft_ms_p50": 47.0,
              "ttft_ms_p99": 2980.6
            },
            {
              "minute": 2.0,
              "is_warmup": false,
              "throughput_tokens_per_sec": 566.2,
              "tokens_out": 33989,
              "tokens_in": 0,
              "requests_completed": 185,
              "ttft_ms_p50": 43.1,
              "ttft_ms_p99": 59.5
            },
            {
              "minute": 3.0,
              "is_warmup": false,
              "throughput_tokens_per_sec": 555.8,
              "tokens_out": 33345,
              "tokens_in": 0,
              "requests_completed": 183,
              "ttft_ms_p50": 43.1,
              "ttft_ms_p99": 50.9
            },
            {
              "minute": 4.0,
              "is_warmup": false,
              "throughput_tokens_per_sec": 574.1,
              "tokens_out": 34447,
              "tokens_in": 0,
              "requests_completed": 183,
              "ttft_ms_p50": 42.7,
              "ttft_ms_p99": 59.1
            },
            {
              "minute": 5.0,
              "is_warmup": false,
              "throughput_tokens_per_sec": 564.4,
              "tokens_out": 33852,
              "tokens_in": 0,
              "requests_completed": 182,
              "ttft_ms_p50": 43.0,
              "ttft_ms_p99": 45.9
            },
            {
              "minute": 6.0,
              "is_warmup": false,
              "throughput_tokens_per_sec": 552.2,
              "tokens_out": 33145,
              "tokens_in": 0,
              "requests_completed": 180,
              "ttft_ms_p50": 43.3,
              "ttft_ms_p99": 59.3
            },
            {
              "minute": 7.0,
              "is_warmup": false,
              "throughput_tokens_per_sec": 562.1,
              "tokens_out": 33715,
              "tokens_in": 0,
              "requests_completed": 184,
              "ttft_ms_p50": 43.1,
              "ttft_ms_p99": 59.1
            },
            {
              "minute": 8.0,
              "is_warmup": false,
              "throughput_tokens_per_sec": 562.6,
              "tokens_out": 33751,
              "tokens_in": 0,
              "requests_completed": 183,
              "ttft_ms_p50": 43.0,
              "ttft_ms_p99": 58.6
            },
            {
              "minute": 9.0,
              "is_warmup": false,
              "throughput_tokens_per_sec": 565.5,
              "tokens_out": 33923,
              "tokens_in": 0,
              "requests_completed": 183,
              "ttft_ms_p50": 43.4,
              "ttft_ms_p99": 46.6
            },
            {
              "minute": 10.0,
              "is_warmup": false,
              "throughput_tokens_per_sec": 559.5,
              "tokens_out": 33594,
              "tokens_in": 0,
              "requests_completed": 180,
              "ttft_ms_p50": 43.5,
              "ttft_ms_p99": 59.5
            },
            {
              "minute": 11.0,
              "is_warmup": false,
              "throughput_tokens_per_sec": 555.9,
              "tokens_out": 33329,
              "tokens_in": 0,
              "requests_completed": 181,
              "ttft_ms_p50": 43.2,
              "ttft_ms_p99": 58.6
            },
            {
              "minute": 12.0,
              "is_warmup": false,
              "throughput_tokens_per_sec": 561.2,
              "tokens_out": 33679,
              "tokens_in": 0,
              "requests_completed": 183,
              "ttft_ms_p50": 43.2,
              "ttft_ms_p99": 59.9
            },
            {
              "minute": 13.0,
              "is_warmup": false,
              "throughput_tokens_per_sec": 568.0,
              "tokens_out": 34091,
              "tokens_in": 0,
              "requests_completed": 186,
              "ttft_ms_p50": 43.4,
              "ttft_ms_p99": 57.6
            },
            {
              "minute": 14.0,
              "is_warmup": false,
              "throughput_tokens_per_sec": 562.5,
              "tokens_out": 33735,
              "tokens_in": 0,
              "requests_completed": 183,
              "ttft_ms_p50": 43.4,
              "ttft_ms_p99": 50.6
            }
          ]
        },
        {
          "precision": "W8A16",
          "sustained_throughput_tokens_per_sec": 841.8,
          "throttle_ratio": 0.902,
          "throttle_onset_minute": null,
          "ttft_p99_drift_ms": -3044.7,
          "sustained_concurrency": 8,
          "duration_minutes": 15,
          "samples": [
            {
              "minute": 1.0,
              "is_warmup": false,
              "throughput_tokens_per_sec": 770.0,
              "tokens_out": 46214,
              "tokens_in": 0,
              "requests_completed": 254,
              "ttft_ms_p50": 35.2,
              "ttft_ms_p99": 3097.4
            },
            {
              "minute": 2.0,
              "is_warmup": false,
              "throughput_tokens_per_sec": 851.3,
              "tokens_out": 51089,
              "tokens_in": 0,
              "requests_completed": 281,
              "ttft_ms_p50": 34.7,
              "ttft_ms_p99": 45.2
            },
            {
              "minute": 3.0,
              "is_warmup": false,
              "throughput_tokens_per_sec": 851.8,
              "tokens_out": 51090,
              "tokens_in": 0,
              "requests_completed": 275,
              "ttft_ms_p50": 34.8,
              "ttft_ms_p99": 52.2
            },
            {
              "minute": 4.0,
              "is_warmup": false,
              "throughput_tokens_per_sec": 839.2,
              "tokens_out": 50347,
              "tokens_in": 0,
              "requests_completed": 277,
              "ttft_ms_p50": 34.8,
              "ttft_ms_p99": 49.7
            },
            {
              "minute": 5.0,
              "is_warmup": false,
              "throughput_tokens_per_sec": 849.4,
              "tokens_out": 50977,
              "tokens_in": 0,
              "requests_completed": 278,
              "ttft_ms_p50": 35.0,
              "ttft_ms_p99": 53.0
            },
            {
              "minute": 6.0,
              "is_warmup": false,
              "throughput_tokens_per_sec": 850.5,
              "tokens_out": 51013,
              "tokens_in": 0,
              "requests_completed": 279,
              "ttft_ms_p50": 34.8,
              "ttft_ms_p99": 47.5
            },
            {
              "minute": 7.0,
              "is_warmup": false,
              "throughput_tokens_per_sec": 850.2,
              "tokens_out": 51029,
              "tokens_in": 0,
              "requests_completed": 275,
              "ttft_ms_p50": 35.2,
              "ttft_ms_p99": 52.9
            },
            {
              "minute": 8.0,
              "is_warmup": false,
              "throughput_tokens_per_sec": 833.1,
              "tokens_out": 49975,
              "tokens_in": 0,
              "requests_completed": 273,
              "ttft_ms_p50": 35.1,
              "ttft_ms_p99": 52.4
            },
            {
              "minute": 9.0,
              "is_warmup": false,
              "throughput_tokens_per_sec": 853.8,
              "tokens_out": 51245,
              "tokens_in": 0,
              "requests_completed": 281,
              "ttft_ms_p50": 34.9,
              "ttft_ms_p99": 47.7
            },
            {
              "minute": 10.0,
              "is_warmup": false,
              "throughput_tokens_per_sec": 852.4,
              "tokens_out": 51154,
              "tokens_in": 0,
              "requests_completed": 280,
              "ttft_ms_p50": 35.0,
              "ttft_ms_p99": 39.7
            },
            {
              "minute": 11.0,
              "is_warmup": false,
              "throughput_tokens_per_sec": 849.4,
              "tokens_out": 50955,
              "tokens_in": 0,
              "requests_completed": 277,
              "ttft_ms_p50": 35.0,
              "ttft_ms_p99": 51.7
            },
            {
              "minute": 12.0,
              "is_warmup": false,
              "throughput_tokens_per_sec": 840.2,
              "tokens_out": 50400,
              "tokens_in": 0,
              "requests_completed": 275,
              "ttft_ms_p50": 34.9,
              "ttft_ms_p99": 52.7
            },
            {
              "minute": 13.0,
              "is_warmup": false,
              "throughput_tokens_per_sec": 850.7,
              "tokens_out": 51044,
              "tokens_in": 0,
              "requests_completed": 281,
              "ttft_ms_p50": 35.0,
              "ttft_ms_p99": 47.5
            },
            {
              "minute": 14.0,
              "is_warmup": false,
              "throughput_tokens_per_sec": 843.5,
              "tokens_out": 50629,
              "tokens_in": 0,
              "requests_completed": 277,
              "ttft_ms_p50": 35.2,
              "ttft_ms_p99": 52.7
            }
          ]
        },
        {
          "precision": "W4A16",
          "sustained_throughput_tokens_per_sec": 760.9,
          "throttle_ratio": 0.887,
          "throttle_onset_minute": 1.0,
          "ttft_p99_drift_ms": -2750.7,
          "sustained_concurrency": 8,
          "duration_minutes": 15,
          "samples": [
            {
              "minute": 1.0,
              "is_warmup": false,
              "throughput_tokens_per_sec": 687.5,
              "tokens_out": 41259,
              "tokens_in": 0,
              "requests_completed": 236,
              "ttft_ms_p50": 36.8,
              "ttft_ms_p99": 2802.1
            },
            {
              "minute": 2.0,
              "is_warmup": false,
              "throughput_tokens_per_sec": 770.2,
              "tokens_out": 46209,
              "tokens_in": 0,
              "requests_completed": 256,
              "ttft_ms_p50": 35.4,
              "ttft_ms_p99": 51.3
            },
            {
              "minute": 3.0,
              "is_warmup": false,
              "throughput_tokens_per_sec": 764.0,
              "tokens_out": 45832,
              "tokens_in": 0,
              "requests_completed": 258,
              "ttft_ms_p50": 35.3,
              "ttft_ms_p99": 51.2
            },
            {
              "minute": 4.0,
              "is_warmup": false,
              "throughput_tokens_per_sec": 768.7,
              "tokens_out": 46151,
              "tokens_in": 0,
              "requests_completed": 257,
              "ttft_ms_p50": 35.5,
              "ttft_ms_p99": 51.9
            },
            {
              "minute": 5.0,
              "is_warmup": false,
              "throughput_tokens_per_sec": 766.7,
              "tokens_out": 45997,
              "tokens_in": 0,
              "requests_completed": 258,
              "ttft_ms_p50": 35.3,
              "ttft_ms_p99": 51.5
            },
            {
              "minute": 6.0,
              "is_warmup": false,
              "throughput_tokens_per_sec": 768.2,
              "tokens_out": 46086,
              "tokens_in": 0,
              "requests_completed": 257,
              "ttft_ms_p50": 35.4,
              "ttft_ms_p99": 47.7
            },
            {
              "minute": 7.0,
              "is_warmup": false,
              "throughput_tokens_per_sec": 764.3,
              "tokens_out": 45881,
              "tokens_in": 0,
              "requests_completed": 258,
              "ttft_ms_p50": 35.4,
              "ttft_ms_p99": 51.4
            },
            {
              "minute": 8.0,
              "is_warmup": false,
              "throughput_tokens_per_sec": 768.5,
              "tokens_out": 46105,
              "tokens_in": 0,
              "requests_completed": 260,
              "ttft_ms_p50": 35.3,
              "ttft_ms_p99": 51.9
            },
            {
              "minute": 9.0,
              "is_warmup": false,
              "throughput_tokens_per_sec": 762.5,
              "tokens_out": 45749,
              "tokens_in": 0,
              "requests_completed": 253,
              "ttft_ms_p50": 35.2,
              "ttft_ms_p99": 52.1
            },
            {
              "minute": 10.0,
              "is_warmup": false,
              "throughput_tokens_per_sec": 773.0,
              "tokens_out": 46367,
              "tokens_in": 0,
              "requests_completed": 260,
              "ttft_ms_p50": 35.5,
              "ttft_ms_p99": 51.4
            },
            {
              "minute": 11.0,
              "is_warmup": false,
              "throughput_tokens_per_sec": 761.1,
              "tokens_out": 45663,
              "tokens_in": 0,
              "requests_completed": 254,
              "ttft_ms_p50": 35.4,
              "ttft_ms_p99": 49.6
            },
            {
              "minute": 12.0,
              "is_warmup": false,
              "throughput_tokens_per_sec": 760.9,
              "tokens_out": 45671,
              "tokens_in": 0,
              "requests_completed": 256,
              "ttft_ms_p50": 35.4,
              "ttft_ms_p99": 45.8
            },
            {
              "minute": 13.0,
              "is_warmup": false,
              "throughput_tokens_per_sec": 775.3,
              "tokens_out": 46524,
              "tokens_in": 0,
              "requests_completed": 259,
              "ttft_ms_p50": 35.4,
              "ttft_ms_p99": 51.6
            },
            {
              "minute": 14.0,
              "is_warmup": false,
              "throughput_tokens_per_sec": 762.3,
              "tokens_out": 45727,
              "tokens_in": 0,
              "requests_completed": 257,
              "ttft_ms_p50": 35.3,
              "ttft_ms_p99": 51.4
            }
          ]
        }
      ]
    }
  },
  "accuracy": null,
  "meta": {
    "submitted_by": "Gong-K",
    "submission_type": "individual",
    "date": "2026-04-30",
    "time": "08:29:45",
    "run_id": "651fefa6",
    "run_name": "nvidia_a100_sxm4_40gbx1_suite_C_nvidia_sglang_c43a8309_651fefa6",
    "flagged": null,
    "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py",
    "env_info_file": "../../env_info.json",
    "log_file": "run.log",
    "samples_file": "samples.jsonl",
    "notes": null,
    "benchmark_start_time": "2026-04-30T08:26:37.946702+00:00",
    "benchmark_end_time": "2026-04-30T08:29:45.379126+00:00",
    "benchmark_elapsed_minutes": 76.2,
    "model_load_seconds": 65.9,
    "benchmark_elapsed_minutes_note": "Sum of per-precision benchmark_elapsed_minutes (excludes sleep gaps and orchestrator overhead).",
    "scenario_dirs": {
      "bf16/offline": "results/community/nvidia_a100_sxm4_40gbx1_suite_C_nvidia_sglang_c43a8309_651fefa6/bf16/offline",
      "bf16/online": "results/community/nvidia_a100_sxm4_40gbx1_suite_C_nvidia_sglang_c43a8309_651fefa6/bf16/online",
      "bf16/sustained": "results/community/nvidia_a100_sxm4_40gbx1_suite_C_nvidia_sglang_c43a8309_651fefa6/bf16/sustained",
      "fp8/offline": "results/community/nvidia_a100_sxm4_40gbx1_suite_C_nvidia_sglang_c43a8309_651fefa6/fp8/offline",
      "fp8/online": "results/community/nvidia_a100_sxm4_40gbx1_suite_C_nvidia_sglang_c43a8309_651fefa6/fp8/online",
      "fp8/sustained": "results/community/nvidia_a100_sxm4_40gbx1_suite_C_nvidia_sglang_c43a8309_651fefa6/fp8/sustained",
      "w8a8/offline": "results/community/nvidia_a100_sxm4_40gbx1_suite_C_nvidia_sglang_c43a8309_651fefa6/w8a8/offline",
      "w8a8/online": "results/community/nvidia_a100_sxm4_40gbx1_suite_C_nvidia_sglang_c43a8309_651fefa6/w8a8/online",
      "w8a8/sustained": "results/community/nvidia_a100_sxm4_40gbx1_suite_C_nvidia_sglang_c43a8309_651fefa6/w8a8/sustained",
      "w8a16/offline": "results/community/nvidia_a100_sxm4_40gbx1_suite_C_nvidia_sglang_c43a8309_651fefa6/w8a16/offline",
      "w8a16/online": "results/community/nvidia_a100_sxm4_40gbx1_suite_C_nvidia_sglang_c43a8309_651fefa6/w8a16/online",
      "w8a16/sustained": "results/community/nvidia_a100_sxm4_40gbx1_suite_C_nvidia_sglang_c43a8309_651fefa6/w8a16/sustained",
      "w4a16/offline": "results/community/nvidia_a100_sxm4_40gbx1_suite_C_nvidia_sglang_c43a8309_651fefa6/w4a16/offline",
      "w4a16/online": "results/community/nvidia_a100_sxm4_40gbx1_suite_C_nvidia_sglang_c43a8309_651fefa6/w4a16/online",
      "w4a16/sustained": "results/community/nvidia_a100_sxm4_40gbx1_suite_C_nvidia_sglang_c43a8309_651fefa6/w4a16/sustained"
    },
    "precision_dirs": {
      "BF16": "bf16",
      "FP8": "fp8",
      "W8A8": "w8a8",
      "W8A16": "w8a16",
      "W4A16": "w4a16"
    },
    "precision_model_map": {
      "BF16": {
        "model_id": "meta-llama/Llama-3.1-8B-Instruct",
        "model_revision": "0e9e39f249a16976918f6564b8830bc894c89659",
        "dtype_override": "bfloat16"
      },
      "FP8": {
        "model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8",
        "model_revision": "12fd6884d2585dd4d020373e7f39f74507b31866",
        "engine_kwargs": {
          "quantization": "compressed-tensors"
        },
        "_note": "Static per-tensor FP8 (weights + activations). Requires Ampere+ (A100, A800, H20). Skipped automatically on FP16-only hardware."
      },
      "W8A8": {
        "model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
        "model_revision": "e2bfb7d92784ad7d1b606c2f9644d3cefb2ec708",
        "engine_kwargs": {
          "quantization": "compressed-tensors"
        },
        "_note": "INT8 weights + INT8 activations via compressed-tensors. Exercises native int8 tensor cores."
      },
      "W8A16": {
        "model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a16",
        "model_revision": "38e03ba250017bf8ed3eeecd3a744e21f6b994a9",
        "engine_kwargs": {
          "quantization": "compressed-tensors"
        },
        "_note": "INT8 weights, FP16 activations. Weight-only quantization \u2014 reduces memory bandwidth, not compute dtype."
      },
      "W4A16": {
        "model_id": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w4a16",
        "model_revision": "70371b1b0ea0d4eacfe1ee9056ee805629921c6e",
        "engine_kwargs": {
          "quantization": "gptq"
        },
        "_note": "INT4 weights, FP16 activations via GPTQ Marlin kernels. Weight-only quantization \u2014 larger memory saving than W8A16."
      }
    }
  }
}

Metadata

Assignees

No one assigned

    Labels

    community-submission — Result submitted via OpenClaw AccelMark Skill

    Type

    No type
    No fields configured for issues without a type.

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions