diff --git a/llmdbenchmark/analysis/benchmark_report/README.md b/llmdbenchmark/analysis/benchmark_report/README.md index f8ae6ad98..26b59b079 100644 --- a/llmdbenchmark/analysis/benchmark_report/README.md +++ b/llmdbenchmark/analysis/benchmark_report/README.md @@ -2,7 +2,7 @@ A benchmarking report is a standard data format describing the cluster configuration, workload, and results of a benchmark run. The report acts as a common API for different benchmarking experiments. Each supported harness in llm-d-benchmark creates a benchmark report upon completion of a run, in addition to saving results in its native format. -There are two versions of the benchmark report, `0.1` and `0.2`. Both reports are generated by the `llm-d-benchmark` harness pod, but new applications consuming benchmark data should use version `0.2` reports. +There are three versions of the benchmark report, `0.1`, `0.2`, and `0.2.1`. All reports are generated by the `llm-d-benchmark` harness pod, but new applications consuming benchmark data should use version `0.2.1` reports. Version `0.2.1` is an additive superset of `0.2`: every valid `0.2` report is also a valid `0.2.1` report. ## v0.2 Format Description @@ -56,6 +56,22 @@ Session-level statistics for multi-turn workloads. Populated from `*_session_lif A `session_performance` report is generated alongside the standard `request_performance` report for each stage that has a corresponding session lifecycle file. The `scenario.load.standardized.stage` field identifies which stage the report covers, and `scenario.load.standardized.multi_turn.enabled` is set to `true`. +## v0.2.1 Format Description + +Version `0.2.1` is an additive minor revision of `0.2` that adds optional multi-modal payload statistics for image, video, and audio workloads. Every field introduced is optional, so any `0.2` report validates unchanged under `0.2.1` (enforced by `tests/test_benchmark_report_v0_2_1_compat.py`). 
+ +See [`br_v0_2_1_example.yaml`](br_v0_2_1_example.yaml) for a dummy example report, and [`br_v0_2_1_json_schema.json`](br_v0_2_1_json_schema.json) for its [JSON Schema](https://json-schema.org/draft/2020-12). All other fields and sections are identical to `0.2`. + +The additions, all derived from what the client can determine from the payloads it sent, are: + +- **`results.request_performance.aggregate.requests.request_size`** (`Statistics`): total encoded request size in bytes, capturing the large payloads typical of multi-modal requests. +- **`results.request_performance.aggregate.requests.multimodal`**: a per-modality block (`image`, `video`, `audio`), each a distribution set over the media instances sent. `image`/`video` carry `count`, `bytes`, `pixels`, and `aspect_ratio`; `video` adds `frames`; `audio` carries `count`, `bytes`, and `seconds`. +- **`results.request_performance.aggregate.throughput.{image,video,audio}_rate`** (`Statistics`): per-modality delivery rates (`images/s`, `videos/s`, `audios/s`). + +New units back these fields: `pixels` (added to the existing quantity category), `ratio` (a new category for aspect ratio, distinct from a 0..1 portion), `bytes` (added to the existing memory category), and a new media-throughput category (`images/s`, `videos/s`, `audios/s`) kept separate from the request-rate category so the existing `request_rate` guardrail is unaffected. + +Server-side multi-modal metrics (vision token counts, encoding time, multimodal cache hit rates) are out of scope for this revision. + ## v0.1 Format Description A benchmark report describes the inference service configuration, workload, and aggregate results. Individual traces from single inference executions are not captured, rather statistics from multiple traces of identical scenarios are combined to create a report. @@ -199,7 +215,7 @@ python3 -m llmdbenchmark.analysis.benchmark_report.cli \ #### Parameters Reference * `-w, --workload-generator`: Specifies the harness generator. 
Must be one of: `'guidellm'`, `'inferencemax'`, `'inference-perf'`, `'vllm-benchmark'`, `'nop'`. -* `-b, --br-version`: Target benchmark report version (defaults to `0.1`; use `0.2` for the standard version). +* `-b, --br-version`: Target benchmark report version (defaults to `0.1`; use `0.2` for the standard version, or `0.2.1` to additionally capture the multimodal payload statistics that `inference-perf` emits). * `-f, --force`: Overwrites the output file if it already exists. * `results_file` *(Positional)*: Path to the raw native results file to convert. (e.g. For `inference-perf`, this must contain `"stage_"` in its filename, e.g., `stage_0_lifecycle_metrics.json`). * `output_file` *(Positional, Optional)*: Destination for the converted report. If omitted, the YAML output is printed directly to `stdout`. diff --git a/llmdbenchmark/analysis/benchmark_report/__init__.py b/llmdbenchmark/analysis/benchmark_report/__init__.py index bf6f43d22..6b67df440 100644 --- a/llmdbenchmark/analysis/benchmark_report/__init__.py +++ b/llmdbenchmark/analysis/benchmark_report/__init__.py @@ -14,11 +14,13 @@ ) from .schema_v0_1 import BenchmarkReportV01 from .schema_v0_2 import BenchmarkReportV02 +from .schema_v0_2_1 import BenchmarkReportV021 __all__ = [ "BenchmarkReport", "BenchmarkReportV01", "BenchmarkReportV02", + "BenchmarkReportV021", "get_nested", "import_benchmark_report", "import_yaml", diff --git a/llmdbenchmark/analysis/benchmark_report/base.py b/llmdbenchmark/analysis/benchmark_report/base.py index 0a0b5bd96..dc00831ff 100644 --- a/llmdbenchmark/analysis/benchmark_report/base.py +++ b/llmdbenchmark/analysis/benchmark_report/base.py @@ -92,13 +92,17 @@ class Units(StrEnum): # Quantity COUNT = auto() + PIXELS = auto() # Portion PERCENT = auto() FRACTION = auto() + # Ratio (unbounded; unlike a portion, may exceed 1, e.g. 
aspect ratio) + RATIO = auto() # Time MS = auto() S = auto() # Memory + BYTES = "bytes" MB = "MB" GB = "GB" TB = "TB" @@ -121,15 +125,28 @@ class Units(StrEnum): TOKEN_PER_S = "tokens/s" # Request throughput QUERY_PER_S = "queries/s" + # Media throughput (per-modality payload rates) + IMAGE_PER_S = "images/s" + VIDEO_PER_S = "videos/s" + AUDIO_PER_S = "audios/s" # Power WATTS = "Watts" # Lists of compatible units for a particular application -UNITS_QUANTITY = [Units.COUNT] +UNITS_QUANTITY = [Units.COUNT, Units.PIXELS] UNITS_PORTION = [Units.PERCENT, Units.FRACTION] +UNITS_RATIO = [Units.RATIO] UNITS_TIME = [Units.MS, Units.S] -UNITS_MEMORY = [Units.MB, Units.GB, Units.TB, Units.MIB, Units.GIB, Units.TIB] +UNITS_MEMORY = [ + Units.BYTES, + Units.MB, + Units.GB, + Units.TB, + Units.MIB, + Units.GIB, + Units.TIB, +] UNITS_BANDWIDTH = [ Units.MBIT_PER_S, Units.GBIT_PER_S, @@ -141,6 +158,7 @@ class Units(StrEnum): UNITS_GEN_LATENCY = [Units.MS_PER_TOKEN, Units.S_PER_TOKEN] UNITS_GEN_THROUGHPUT = [Units.TOKEN_PER_S] UNITS_REQUEST_THROUGHPUT = [Units.QUERY_PER_S] +UNITS_MEDIA_THROUGHPUT = [Units.IMAGE_PER_S, Units.VIDEO_PER_S, Units.AUDIO_PER_S] UNITS_POWER = [Units.WATTS] ############################################################################### diff --git a/llmdbenchmark/analysis/benchmark_report/br_v0_2_1_example.yaml b/llmdbenchmark/analysis/benchmark_report/br_v0_2_1_example.yaml new file mode 100644 index 000000000..a3935258b --- /dev/null +++ b/llmdbenchmark/analysis/benchmark_report/br_v0_2_1_example.yaml @@ -0,0 +1,979 @@ +results: + component_health: + - component_label: vllm-svc-0 + failed_replicas: 1 + replica_health: + - healthy: true + logs: https://logs.example.com/vllm-svc-0-pod-1 + replica_id: vllm-svc-0-pod-1 + restarts: 1 + - healthy: false + logs: /path/to/logs/vllm.log + replica_id: vllm-svc-0-pod-2 + restarts: 2 + - healthy: true + logs: s3://path/to/logs/vllm.log + replica_id: vllm-svc-0-pod-3 + restarts: 0 + total_restarts: 3 + observability: 
+ components: + - aggregate: + gpu_cache_usage: + max: 84.0 + mean: 38.2 + min: 10.1 + p50: 37.5 + p90: 58.7 + p99: 78.3 + units: percent + gpu_memory_usage: + max: 73.5 + mean: 62.4 + min: 58.1 + p50: 62.0 + p90: 66.8 + p99: 71.2 + units: GiB + gpu_utilization: + max: 96.8 + mean: 76.0 + min: 45.2 + p50: 78.9 + p90: 89.3 + p99: 94.1 + units: percent + kv_cache_usage: + max: 92.1 + mean: 42.5 + min: 12.3 + p50: 41.8 + p75: 52.1 + p90: 63.4 + p95: 71.2 + p99: 85.6 + units: percent + running_requests: + max: 62.0 + mean: 24.6 + min: 1.0 + p50: 23.0 + p90: 42.0 + p99: 56.0 + units: count + waiting_requests: + max: 25.0 + mean: 3.2 + min: 0.0 + p50: 2.0 + p90: 8.0 + p99: 18.0 + units: count + component_label: vllm-svc-0 + time_series: + gpu_utilization: + series: + - mean: 73.2 + p90: 85.4 + ts: '2025-11-05T18:05:00Z' + - mean: 78.9 + p90: 91.2 + ts: '2025-11-05T18:10:00Z' + units: percent + kv_cache_usage: + series: + - mean: 35.2 + p90: 48.1 + p99: 62.5 + ts: '2025-11-05T18:05:00Z' + - mean: 52.8 + p90: 71.3 + p99: 85.6 + ts: '2025-11-05T18:10:00Z' + units: percent + - aggregate: + cpu_memory_usage: + max: 5.8 + mean: 2.8 + min: 1.2 + p50: 2.6 + p90: 4.1 + p99: 5.3 + units: GiB + cpu_utilization: + max: 42.0 + mean: 18.4 + min: 5.2 + p50: 17.1 + p90: 28.9 + p99: 35.6 + units: percent + running_requests: + max: 40.0 + mean: 12.3 + min: 0.0 + p50: 11.0 + p90: 22.0 + p99: 34.0 + units: count + waiting_requests: + max: 12.0 + mean: 1.5 + min: 0.0 + p50: 1.0 + p90: 4.0 + p99: 9.0 + units: count + component_label: epp-0 + drop_rate: + max: 0.18 + mean: 0.02 + min: 0.0 + p50: 0.0 + p90: 0.05 + p99: 0.12 + units: percent + epp_dispatch_latency: + statistics: + graph_path: metrics/graphs/epp_dispatch_latency.png + mean: 0.0012 + p50: 0.001 + p99: 0.0035 + stddev: 0.0008 + units: seconds + epp_endpoint_scores: + components: + - component_id: qwen3-0p6b-decode-pod-1 + statistics: + graph_path: metrics/graphs/epp_endpoint_scores.png + mean: 0.82 + p50: 0.85 + p99: 0.95 + stddev: 
0.08 + units: score + epp_pool_avg_kv_cache_utilization: + components: + - component_id: epp-0 + pod: qwen3-0p6b-epp-pod-1 + role: epp + statistics: + graph_path: metrics/graphs/epp_pool_avg_kv_cache_utilization.png + mean: 40.3 + p50: 39.8 + p99: 82.1 + stddev: 14.8 + units: percent + epp_pool_avg_queue_size: + components: + - component_id: epp-0 + pod: qwen3-0p6b-epp-pod-1 + role: epp + statistics: + graph_path: metrics/graphs/epp_pool_avg_queue_size.png + mean: 4.7 + p50: 3.0 + p99: 22.0 + stddev: 5.8 + units: count + epp_pool_avg_running_requests: + components: + - component_id: epp-0 + pod: qwen3-0p6b-epp-pod-1 + role: epp + statistics: + graph_path: metrics/graphs/epp_pool_avg_running_requests.png + mean: 36.9 + p50: 35.0 + p99: 68.0 + stddev: 16.2 + units: count + epp_pool_ready_pods: + components: + - component_id: epp-0 + pod: qwen3-0p6b-epp-pod-1 + role: epp + statistics: + graph_path: metrics/graphs/epp_pool_ready_pods.png + mean: 2.4 + p50: 2.0 + p99: 3.0 + stddev: 0.5 + units: count + epp_request_distribution: + components: + - component_id: qwen3-0p6b-decode-pod-1 + statistics: + count: 245 + graph_path: metrics/graphs/epp_request_distribution.png + units: count + - component_id: qwen3-0p6b-decode-pod-2 + statistics: + count: 255 + graph_path: metrics/graphs/epp_request_distribution.png + units: count + pod_startup_times: + aggregate: + max: 95.0 + mean: 84.0 + min: 72.0 + p50: 85.0 + p90: 93.0 + p99: 94.8 + units: s + collected_at: '2025-11-05T18:17:00Z' + graph_path: metrics/graphs/pod_startup_times.png + pods: + - creation_timestamp: '2025-11-05T17:55:00Z' + model: Qwen/Qwen3-0.6B + name: qwen3-0p6b-decode-pod-1 + node: gpu-node-01 + ready_timestamp: '2025-11-05T17:56:12Z' + role: decode + startup_seconds: 72.0 + - creation_timestamp: '2025-11-05T17:55:00Z' + model: Qwen/Qwen3-0.6B + name: qwen3-0p6b-decode-pod-2 + node: gpu-node-02 + ready_timestamp: '2025-11-05T17:56:25Z' + role: decode + startup_seconds: 85.0 + - creation_timestamp: 
'2025-11-05T18:08:00Z' + model: Qwen/Qwen3-0.6B + name: qwen3-0p6b-decode-pod-3 + node: gpu-node-03 + ready_timestamp: '2025-11-05T18:09:35Z' + role: decode + startup_seconds: 95.0 + replica_status: + aggregate_ready_replicas: + max: 3 + mean: 2.4 + min: 2 + p50: 2 + p90: 3 + p99: 3 + units: count + controllers: + - available_replicas: 3 + desired_replicas: 3 + kind: Deployment + model: Qwen/Qwen3-0.6B + name: qwen3-0p6b-decode + ready_replicas: 3 + role: decode + updated_replicas: 3 + - available_replicas: 0 + desired_replicas: 0 + kind: Deployment + model: Qwen/Qwen3-0.6B + name: qwen3-0p6b-prefill + ready_replicas: 0 + role: prefill + updated_replicas: 0 + graph_path: metrics/graphs/replica_status.png + namespace: benchmark-ns + time_series: + - controllers: + - available_replicas: 2 + desired_replicas: 2 + kind: Deployment + model: Qwen/Qwen3-0.6B + name: qwen3-0p6b-decode + ready_replicas: 2 + role: decode + updated_replicas: 2 + namespace: benchmark-ns + timestamp: '2025-11-05T18:00:40Z' + - controllers: + - available_replicas: 2 + desired_replicas: 3 + kind: Deployment + model: Qwen/Qwen3-0.6B + name: qwen3-0p6b-decode + ready_replicas: 2 + role: decode + updated_replicas: 2 + namespace: benchmark-ns + timestamp: '2025-11-05T18:08:10Z' + - controllers: + - available_replicas: 3 + desired_replicas: 3 + kind: Deployment + model: Qwen/Qwen3-0.6B + name: qwen3-0p6b-decode + ready_replicas: 3 + role: decode + updated_replicas: 3 + namespace: benchmark-ns + timestamp: '2025-11-05T18:09:40Z' + timestamp: '2025-11-05T18:17:00Z' + vllm_external_prefix_cache_hit_rate: + components: + - component_id: decode-engine + pod: qwen3-0p6b-decode-pod-1 + role: decode + statistics: + graph_path: metrics/graphs/vllm_external_prefix_cache_hit_rate.png + mean: 42.9 + p50: 44.2 + p99: 68.5 + stddev: 14.3 + units: percent + vllm_external_prefix_cache_hits_total: + components: + - component_id: decode-engine + pod: qwen3-0p6b-decode-pod-1 + role: decode + statistics: + graph_path: 
metrics/graphs/vllm_external_prefix_cache_hits_total.png + mean: 8200.0 + p50: 8100.0 + p99: 9500.0 + stddev: 620.0 + units: tokens + vllm_external_prefix_cache_queries_total: + components: + - component_id: decode-engine + pod: qwen3-0p6b-decode-pod-1 + role: decode + statistics: + graph_path: metrics/graphs/vllm_external_prefix_cache_queries_total.png + mean: 19100.0 + p50: 19000.0 + p99: 21500.0 + stddev: 1200.0 + units: tokens + vllm_kv_cache_usage_perc: + aggregated: + mean: 40.3 + p50: 39.8 + p99: 82.1 + stddev: 14.8 + units: percent + components: + - component_id: decode-engine + pod: qwen3-0p6b-decode-pod-1 + role: decode + statistics: + graph_path: metrics/graphs/vllm_kv_cache_usage_perc.png + mean: 42.5 + p50: 41.8 + p99: 85.6 + stddev: 15.2 + units: percent + vllm_nixl_bytes_transferred_count: + components: + - component_id: decode-engine + pod: qwen3-0p6b-decode-pod-1 + role: decode + statistics: + graph_path: metrics/graphs/vllm_nixl_bytes_transferred_count.png + mean: 450.0 + p50: 440.0 + p99: 520.0 + stddev: 35.0 + units: count + vllm_nixl_bytes_transferred_sum: + components: + - component_id: decode-engine + pod: qwen3-0p6b-decode-pod-1 + role: decode + statistics: + graph_path: metrics/graphs/vllm_nixl_bytes_transferred_sum.png + mean: 1250000000.0 + p50: 1220000000.0 + p99: 1480000000.0 + stddev: 85000000.0 + units: bytes + vllm_nixl_xfer_time_seconds_count: + components: + - component_id: decode-engine + pod: qwen3-0p6b-decode-pod-1 + role: decode + statistics: + graph_path: metrics/graphs/vllm_nixl_xfer_time_seconds_count.png + mean: 450.0 + p50: 440.0 + p99: 520.0 + stddev: 35.0 + units: count + vllm_nixl_xfer_time_seconds_sum: + components: + - component_id: decode-engine + pod: qwen3-0p6b-decode-pod-1 + role: decode + statistics: + graph_path: metrics/graphs/vllm_nixl_xfer_time_seconds_sum.png + mean: 0.0045 + p50: 0.0042 + p99: 0.0098 + stddev: 0.0015 + units: seconds + vllm_num_preemptions_total: + components: + - component_id: 
decode-engine + pod: qwen3-0p6b-decode-pod-1 + role: decode + statistics: + graph_path: metrics/graphs/vllm_num_preemptions_total.png + mean: 0.5 + p50: 0.0 + p99: 3.0 + stddev: 0.8 + units: count + vllm_num_requests_running: + aggregated: + mean: 23.8 + p50: 22.0 + p99: 54.0 + stddev: 11.5 + units: count + components: + - component_id: decode-engine + pod: qwen3-0p6b-decode-pod-1 + role: decode + statistics: + graph_path: metrics/graphs/vllm_num_requests_running.png + mean: 24.6 + p50: 23.0 + p99: 56.0 + stddev: 12.1 + units: count + vllm_num_requests_waiting: + components: + - component_id: decode-engine + pod: qwen3-0p6b-decode-pod-1 + role: decode + statistics: + graph_path: metrics/graphs/vllm_num_requests_waiting.png + mean: 3.2 + p50: 2.0 + p99: 18.0 + stddev: 4.5 + units: count + vllm_prefix_cache_hit_rate: + components: + - component_id: decode-engine + pod: qwen3-0p6b-decode-pod-1 + role: decode + statistics: + graph_path: metrics/graphs/vllm_prefix_cache_hit_rate.png + mean: 65.3 + p50: 68.1 + p99: 92.4 + stddev: 18.7 + units: percent + vllm_prefix_cache_hits_total: + components: + - component_id: decode-engine + pod: qwen3-0p6b-decode-pod-1 + role: decode + statistics: + graph_path: metrics/graphs/vllm_prefix_cache_hits_total.png + mean: 12500.0 + p50: 12400.0 + p99: 14200.0 + stddev: 850.0 + units: tokens + vllm_prefix_cache_queries_total: + components: + - component_id: decode-engine + pod: qwen3-0p6b-decode-pod-1 + role: decode + statistics: + graph_path: metrics/graphs/vllm_prefix_cache_queries_total.png + mean: 19100.0 + p50: 19000.0 + p99: 21500.0 + stddev: 1200.0 + units: tokens + profiling: + anything_goes: 5 + request_performance: + aggregate: + latency: + inter_token_latency: + mean: 0.0368578234128654 + units: s/token + normalized_time_per_output_token: + mean: 0.0368578234128654 + units: s/token + request_latency: + mean: 0.0368578234128654 + units: s + time_per_output_token: + mean: 0.0368578234128654 + units: s/token + time_to_first_token: 
+ max: 0.0574655020609498 + mean: 0.0368578234128654 + min: 0.027276142966002226 + p0p1: 0.027322400792501866 + p1: 0.02785794825293124 + p10: 0.031235878029838203 + p25: 0.033357357955537736 + p5: 0.030159439309500158 + p50: 0.03622930939309299 + p75: 0.03993474820163101 + p90: 0.04262634576298297 + p95: 0.045246614259667695 + p99: 0.05029489721637218 + p99p9: 0.057014757215045425 + units: s + requests: + failures: 0 + input_length: + max: 2328.0 + mean: 2262.448 + min: 2223.0 + p0p1: 2223.998 + p1: 2227.99 + p10: 2240.0 + p25: 2252.0 + p5: 2234.95 + p50: 2262.0 + p75: 2270.0 + p90: 2286.0 + p95: 2294.0 + p99: 2322.0 + p99p9: 2326.503 + units: count + multimodal: + audio: + count: + mean: 1.0 + units: count + seconds: + mean: 10.0 + units: s + image: + aspect_ratio: + mean: 1.78 + units: ratio + bytes: + mean: 40000.0 + units: bytes + count: + max: 4.0 + mean: 2.0 + units: count + pixels: + mean: 2073600.0 + units: pixels + video: + bytes: + mean: 800000.0 + units: bytes + count: + mean: 1.0 + units: count + frames: + mean: 16.0 + units: count + output_length: + max: 2328.0 + mean: 2262.448 + min: 2223.0 + p0p1: 2223.998 + p1: 2227.99 + p10: 2240.0 + p25: 2252.0 + p5: 2234.95 + p50: 2262.0 + p75: 2270.0 + p90: 2286.0 + p95: 2294.0 + p99: 2322.0 + p99p9: 2326.503 + units: count + request_size: + max: 120000.0 + mean: 51234.0 + p50: 48000.0 + units: bytes + total: 500 + throughput: + audio_rate: + mean: 2.0 + units: audios/s + image_rate: + mean: 24.0 + units: images/s + input_token_rate: + max: 2328.0 + mean: 2262.448 + min: 2223.0 + p0p1: 2223.998 + p1: 2227.99 + p10: 2240.0 + p25: 2252.0 + p5: 2234.95 + p50: 2262.0 + p75: 2270.0 + p90: 2286.0 + p95: 2294.0 + p99: 2322.0 + p99p9: 2326.503 + units: tokens/s + output_token_rate: + max: 2328.0 + mean: 2262.448 + min: 2223.0 + p0p1: 2223.998 + p1: 2227.99 + p10: 2240.0 + p25: 2252.0 + p5: 2234.95 + p50: 2262.0 + p75: 2270.0 + p90: 2286.0 + p95: 2294.0 + p99: 2322.0 + p99p9: 2326.503 + units: tokens/s + request_rate: + 
max: 2328.0 + mean: 2262.448 + min: 2223.0 + p0p1: 2223.998 + p1: 2227.99 + p10: 2240.0 + p25: 2252.0 + p5: 2234.95 + p50: 2262.0 + p75: 2270.0 + p90: 2286.0 + p95: 2294.0 + p99: 2322.0 + p99p9: 2326.503 + units: queries/s + total_token_rate: + max: 2328.0 + mean: 2262.448 + min: 2223.0 + p0p1: 2223.998 + p1: 2227.99 + p10: 2240.0 + p25: 2252.0 + p5: 2234.95 + p50: 2262.0 + p75: 2270.0 + p90: 2286.0 + p95: 2294.0 + p99: 2322.0 + p99p9: 2326.503 + units: tokens/s + video_rate: + mean: 3.0 + units: videos/s + time_series: + latency: + time_to_first_token: + series: + - mean: 0.0368578234128654 + p90: 0.04262634576298297 + p95: 0.045246614259667695 + ts: '2025-11-05T18:01:00Z' + - mean: 0.0368578234128654 + p90: 0.04262634576298297 + p95: 0.045246614259667695 + ts: '2025-11-05T18:02:00Z' + units: s + throughput: + input_token_rate: + series: + - p90: 0.04262634576298297 + p95: 0.045246614259667695 + ts: '2025-11-05T18:01:00Z' + value: 204.0394 + - p90: 0.04262634576298297 + p95: 0.045246614259667695 + ts: '2025-11-05T18:02:00Z' + value: 204.0394 + units: tokens/s + output_token_rate: + series: + - p90: 0.04262634576298297 + p95: 0.045246614259667695 + ts: '2025-11-05T18:01:00Z' + value: 204.0394 + - p90: 0.04262634576298297 + p95: 0.045246614259667695 + ts: '2025-11-05T18:02:00Z' + value: 204.0394 + units: tokens/s + request_rate: + series: + - p90: 0.04262634576298297 + p95: 0.045246614259667695 + ts: '2025-11-05T18:01:00Z' + value: 204.0394 + - p90: 0.04262634576298297 + p95: 0.045246614259667695 + ts: '2025-11-05T18:02:00Z' + value: 204.0394 + units: queries/s + total_token_rate: + series: + - p90: 0.04262634576298297 + p95: 0.045246614259667695 + ts: '2025-11-05T18:01:00Z' + value: 204.0394 + - p90: 0.04262634576298297 + p95: 0.045246614259667695 + ts: '2025-11-05T18:02:00Z' + value: 204.0394 + units: tokens/s + units: tokens/s + session_performance: + sessions: + events_cancelled_per_session: + max: 3.0 + mean: 0.18 + min: 0.0 + p50: 0.0 + p99: 2.0 + units: count 
+ events_per_session: + max: 20.0 + mean: 11.96 + min: 1.0 + p50: 12.0 + p99: 20.0 + units: count + failed: 2 + input_tokens_per_session: + max: 46900.0 + mean: 25612.4 + min: 2148.0 + p50: 25400.0 + p99: 43020.0 + units: count + output_tokens_per_session: + max: 22000.0 + mean: 11800.2 + min: 980.0 + p50: 11700.0 + p99: 20100.0 + units: count + session_duration: + max: 80.1 + mean: 48.3 + min: 12.1 + p50: 47.9 + p90: 61.2 + p99: 74.5 + units: s + session_rate: + mean: 2.24 + units: queries/s + succeeded: 110 + total: 112 + total_events: 1340 + total_events_cancelled: 20 + total_events_completed: 1320 +run: + cid: 09825952f60004aa59bb5b2a2eefa6d1 + description: Baseline latency test for Qwen3-0.6B on inference-scheduling stack + eid: d9c5a5ddce6c2f885a9f8e6a6a6db0fb + keywords: + - baseline + - latency + - qwen3 + pid: 052d5f654ea08edf5caa2c0293fa7fb3 + time: + duration: PT49.97206788184121S + end: '2025-11-05T18:17:03Z' + start: '2025-11-05T18:00:42Z' + uid: 38b1f169ca178b756f7483523b17de61 + user: namasluk +scenario: + load: + metadata: + cfg_id: a4e18f265cc33786a42b8a3f7ac2edcb + description: Optional description of workload + schema_version: 0.0.1 + native: + config: + api: + headers: null + streaming: true + type: completion + data: + input_distribution: null + output_distribution: null + path: null + shared_prefix: + num_groups: 32 + num_prompts_per_group: 8 + output_len: 1000 + question_len: 100 + system_prompt_len: 2048 + type: shared_prefix + load: + interval: 1.0 + num_workers: 112 + stages: + - duration: 50 + rate: 2.0 + - duration: 50 + rate: 5.0 + - duration: 50 + rate: 8.0 + - duration: 50 + rate: 10.0 + - duration: 50 + rate: 12.0 + - duration: 50 + rate: 15.0 + - duration: 50 + rate: 20.0 + sweep: null + type: constant + worker_max_concurrency: 100 + worker_max_tcp_connections: 2500 + metrics: null + report: + prometheus: + per_stage: false + summary: true + request_lifecycle: + per_request: true + per_stage: true + summary: true + server: + 
api_key: null + base_url: http://infra-nam-release-inference-gateway.namasluk.svc.cluster.local:80/qwen-qwen3-0-6b + ignore_eos: true + model_name: Qwen/Qwen3-0.6B + type: vllm + storage: + google_cloud_storage: null + local_storage: + path: /requests/inference-perf_1759339259-cache_tracking-run_100_1000_llm-d-0p6b-base + report_file_prefix: null + simple_storage_service: null + tokenizer: + pretrained_model_name_or_path: Qwen/Qwen3-0.6B + token: null + trust_remote_code: null + standardized: + concurrency: .inf + input_seq_len: + distribution: fixed + value: 2148 + multi_turn: + enabled: true + max_turns: + distribution: fixed + value: 20 + output_seq_len: + distribution: gaussian + std_dev: 100.0 + value: 1000 + parallelism: 1 + prefix: + num_groups: 32 + num_prefixes: 8 + num_users_per_group: 10 + prefix_len: + distribution: fixed + value: 1000 + rate_qps: 10.0 + source: sampled + stage: 3 + tool: inference-perf + tool_version: 0.3.0 + stack: + - metadata: + cfg_id: cc73fc6b51a1d3b8128f312d70476d7c + description: Optional description of this component + label: vllm-svc-0 + schema_version: 0.0.1 + native: + args: + --block-size: 1024 + --disable-uvicorn-access-log: null + --kv-transfer-config: + kv_connector: NixlConnector + kv_role: kv_both + --max-model-len: 1024 + --no-enable-log-requests: null + --tensor-parallel-size: 8 + envars: + NCCL_IB_HCA: mlx5_1 + UCX_NET_DEVICES: mlx5_1:1 + UCX_SOCKADDR_TLS_PRIORITY: tcp + UCX_TLS: rc,sm,cuda_ipc,cuda_copy,tcp + VLLM_ALLOW_LONG_MAX_MODEL_LEN: '1' + VLLM_LOGGING_LEVEL: DEBUG + VLLM_NIXL_SIDE_CHANNEL_HOST: 10.39.39.3 + VLLM_NIXL_SIDE_CHANNEL_PORT: '5557' + standardized: + accelerator: + count: 8 + model: NVIDIA-H100-80GB-HBM3 + parallelism: + dp: 1 + dp_local: 1 + ep: 1 + pp: 1 + tp: 8 + workers: 1 + kind: inference_engine + model: + name: Qwen/Qwen3-0.6B + replicas: 2 + role: decode + tool: llm-d + tool_version: ghcr.io/llm-d/llm-d-cuda:0.3.1 + - metadata: + cfg_id: 47000e70c655e88198e0dc4e57d41d5f + description: This 
is a router, but no standardized component for this exists + label: epp-0 + schema_version: 0.0.1 + native: + config: + apiVersion: inference.networking.x-k8s.io/v1alpha1 + kind: EndpointPickerConfig + plugins: + - type: single-profile-handler + - type: decode-filter + - parameters: + blockSize: 16 + indexerConfig: + kvBlockIndexConfig: + enableMetrics: true + metricsLoggingInterval: 60000000000 + tokenProcessorConfig: + blockSize: 64 + hashSeed: '42' + lruCapacityPerServer: 31250 + maxPrefixBlocksToMatch: 256 + mode: cache_tracking + type: prefix-cache-scorer + - type: kv-cache-scorer + - type: queue-scorer + - type: max-score-picker + schedulingProfiles: + - name: default + plugins: + - pluginRef: decode-filter + - pluginRef: prefix-cache-scorer + weight: 2.0 + - pluginRef: kv-cache-scorer + weight: 1.0 + - pluginRef: queue-scorer + weight: 1.0 + - pluginRef: max-score-picker + standardized: + another_thing: + - a: 5 + - a: 1 + - b: 3 + kind: generic + kind_draft: router + some_config_param: 93 + tool: llm-d-inference-scheduler + tool_version: ghcr.io/llm-d/llm-d-inference-scheduler:0.3.2 +version: 0.2.1 diff --git a/llmdbenchmark/analysis/benchmark_report/br_v0_2_1_json_schema.json b/llmdbenchmark/analysis/benchmark_report/br_v0_2_1_json_schema.json new file mode 100644 index 000000000..128c386f1 --- /dev/null +++ b/llmdbenchmark/analysis/benchmark_report/br_v0_2_1_json_schema.json @@ -0,0 +1,3380 @@ +{ + "$defs": { + "AggregateLatency": { + "additionalProperties": false, + "description": "Aggregate response latency performance metrics.", + "properties": { + "time_to_first_token": { + "anyOf": [ + { + "$ref": "#/$defs/Statistics" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Time to generate the first token (TTFT)." 
+ }, + "normalized_time_per_output_token": { + "anyOf": [ + { + "$ref": "#/$defs/Statistics" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Typical time to generate an output token, including first (NTPOT)." + }, + "time_per_output_token": { + "anyOf": [ + { + "$ref": "#/$defs/Statistics" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Time to generate an output token, excluding first (TPOT, may differ from ITL depending on tool)." + }, + "inter_token_latency": { + "anyOf": [ + { + "$ref": "#/$defs/Statistics" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Latency between generated tokens, excluding first (ITL, may differ from TPOT depending on tool)." + }, + "request_latency": { + "anyOf": [ + { + "$ref": "#/$defs/Statistics" + }, + { + "type": "null" + } + ], + "default": null, + "description": "End-to-end request latency." + } + }, + "title": "AggregateLatency", + "type": "object" + }, + "AggregateRequestPerformance": { + "additionalProperties": false, + "description": "Aggregate performance metrics (v0.2.1 aggregates).", + "properties": { + "requests": { + "anyOf": [ + { + "$ref": "#/$defs/AggregateRequests" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Aggregate request details." + }, + "latency": { + "anyOf": [ + { + "$ref": "#/$defs/AggregateLatency" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Aggregate response latency performance metrics." + }, + "throughput": { + "anyOf": [ + { + "$ref": "#/$defs/AggregateThroughput" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Aggregate response throughput performance metrics." 
+ } + }, + "title": "AggregateRequestPerformance", + "type": "object" + }, + "AggregateRequests": { + "additionalProperties": false, + "description": "v0.2 request statistics, plus multi-modal payload details.", + "properties": { + "total": { + "description": "Total number of requests sent.", + "minimum": 0, + "title": "Total", + "type": "integer" + }, + "failures": { + "anyOf": [ + { + "minimum": 0, + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Number of requests which responded with an error.", + "title": "Failures" + }, + "incomplete": { + "anyOf": [ + { + "minimum": 0, + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Number of requests which were not completed.", + "title": "Incomplete" + }, + "input_length": { + "anyOf": [ + { + "$ref": "#/$defs/Statistics" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Input sequence length." + }, + "output_length": { + "anyOf": [ + { + "$ref": "#/$defs/Statistics" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Output sequence length." + }, + "request_size": { + "anyOf": [ + { + "$ref": "#/$defs/Statistics" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Total encoded request size, including all media payloads." + }, + "multimodal": { + "anyOf": [ + { + "$ref": "#/$defs/MultiModalRequests" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Per-modality payload statistics." + } + }, + "required": [ + "total" + ], + "title": "AggregateRequests", + "type": "object" + }, + "AggregateThroughput": { + "additionalProperties": false, + "description": "v0.2 throughput metrics, plus per-modality payload rates.", + "properties": { + "input_token_rate": { + "anyOf": [ + { + "$ref": "#/$defs/Statistics" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Input token rate." 
+ }, + "output_token_rate": { + "anyOf": [ + { + "$ref": "#/$defs/Statistics" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Output token rate." + }, + "total_token_rate": { + "anyOf": [ + { + "$ref": "#/$defs/Statistics" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Total token rate (input + output)." + }, + "request_rate": { + "anyOf": [ + { + "$ref": "#/$defs/Statistics" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Request (query) processing rate." + }, + "image_rate": { + "anyOf": [ + { + "$ref": "#/$defs/Statistics" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Image delivery rate." + }, + "video_rate": { + "anyOf": [ + { + "$ref": "#/$defs/Statistics" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Video delivery rate." + }, + "audio_rate": { + "anyOf": [ + { + "$ref": "#/$defs/Statistics" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Audio delivery rate." + } + }, + "title": "AggregateThroughput", + "type": "object" + }, + "AudioPayloadStats": { + "additionalProperties": false, + "description": "Audio payload statistics.", + "properties": { + "count": { + "anyOf": [ + { + "$ref": "#/$defs/Statistics" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Number of media instances of this modality per request." + }, + "bytes": { + "anyOf": [ + { + "$ref": "#/$defs/Statistics" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Encoded size per media instance." + }, + "seconds": { + "anyOf": [ + { + "$ref": "#/$defs/Statistics" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Duration per audio instance." 
+ } + }, + "title": "AudioPayloadStats", + "type": "object" + }, + "Component": { + "additionalProperties": false, + "description": "Component details.", + "properties": { + "metadata": { + "$ref": "#/$defs/ComponentMetadata" + }, + "standardized": { + "description": "Component configuration details in standardized format.", + "discriminator": { + "mapping": { + "generic": "#/$defs/Generic", + "inference_engine": "#/$defs/InferenceEngine" + }, + "propertyName": "kind" + }, + "oneOf": [ + { + "$ref": "#/$defs/Generic" + }, + { + "$ref": "#/$defs/InferenceEngine" + } + ], + "title": "Standardized" + }, + "native": { + "$ref": "#/$defs/ComponentNative" + } + }, + "required": [ + "metadata", + "standardized", + "native" + ], + "title": "Component", + "type": "object" + }, + "ComponentHealth": { + "additionalProperties": false, + "description": "Health and reliability metrics for a component during the benchmark.", + "properties": { + "component_label": { + "description": "References the component's label from scenario.stack[].metadata.label", + "title": "Component Label", + "type": "string" + }, + "total_restarts": { + "anyOf": [ + { + "minimum": 0, + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Total restarts across all replicas during benchmark.", + "title": "Total Restarts" + }, + "failed_replicas": { + "anyOf": [ + { + "minimum": 0, + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Number of replicas that had one or more failures during benchmark.", + "title": "Failed Replicas" + }, + "replica_health": { + "anyOf": [ + { + "items": { + "$ref": "#/$defs/ReplicaHealth" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Per-replica health details.", + "title": "Replica Health" + } + }, + "required": [ + "component_label" + ], + "title": "ComponentHealth", + "type": "object" + }, + "ComponentMetadata": { + "additionalProperties": false, + 
"description": "Component metadata.", + "properties": { + "schema_version": { + "default": "0.0.1", + "description": "Schema version for the component.", + "title": "Schema Version", + "type": "string" + }, + "label": { + "description": "Unique name for this particular component.", + "title": "Label", + "type": "string" + }, + "cfg_id": { + "description": "Configuration ID, a hash of this component's configuration.", + "title": "Cfg Id", + "type": "string" + }, + "description": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Description of this component.", + "title": "Description" + } + }, + "required": [ + "label", + "cfg_id" + ], + "title": "ComponentMetadata", + "type": "object" + }, + "ComponentNative": { + "additionalProperties": false, + "description": "Component configuration in native format.", + "properties": { + "args": { + "anyOf": [ + { + "additionalProperties": true, + "type": "object" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Command line arguments.", + "title": "Args" + }, + "envars": { + "anyOf": [ + { + "additionalProperties": true, + "type": "object" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Environment variables.", + "title": "Envars" + }, + "config": { + "anyOf": [ + {}, + { + "type": "null" + } + ], + "default": null, + "description": "Configuration file details.", + "title": "Config" + } + }, + "title": "ComponentNative", + "type": "object" + }, + "ComponentObservability": { + "additionalProperties": false, + "description": "Observability metrics for a specific component.", + "properties": { + "component_label": { + "description": "References the component's label from scenario.stack[].metadata.label", + "title": "Component Label", + "type": "string" + }, + "replica_id": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Specific replica/pod identifier (optional, 
for per-replica metrics).", + "title": "Replica Id" + }, + "aggregate": { + "anyOf": [ + { + "$ref": "#/$defs/ResourceMetrics" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Aggregate resource metrics." + }, + "time_series": { + "anyOf": [ + { + "$ref": "#/$defs/TimeSeriesResourceMetrics" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Time series resource metrics." + }, + "raw_data_path": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Path to raw metrics data files.", + "title": "Raw Data Path" + }, + "graph_path": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Path to visualization/graph of metrics.", + "title": "Graph Path" + } + }, + "required": [ + "component_label" + ], + "title": "ComponentObservability", + "type": "object" + }, + "ControllerReplicaStatus": { + "additionalProperties": false, + "description": "Replica status for a single controller (Deployment or StatefulSet).", + "properties": { + "kind": { + "description": "Controller kind (e.g., Deployment, StatefulSet).", + "title": "Kind", + "type": "string" + }, + "name": { + "description": "Controller name.", + "title": "Name", + "type": "string" + }, + "model": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Model identifier.", + "title": "Model" + }, + "role": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Role (e.g., prefill, decode).", + "title": "Role" + }, + "desired_replicas": { + "description": "Number of desired replicas.", + "minimum": 0, + "title": "Desired Replicas", + "type": "integer" + }, + "available_replicas": { + "description": "Number of available replicas.", + "minimum": 0, + "title": "Available Replicas", + "type": "integer" + }, + "ready_replicas": { + "description": "Number of ready 
replicas.", + "minimum": 0, + "title": "Ready Replicas", + "type": "integer" + }, + "updated_replicas": { + "anyOf": [ + { + "minimum": 0, + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Number of updated replicas.", + "title": "Updated Replicas" + } + }, + "required": [ + "kind", + "name", + "desired_replicas", + "available_replicas", + "ready_replicas" + ], + "title": "ControllerReplicaStatus", + "type": "object" + }, + "Distribution": { + "description": "Distribution type.\n\nAttributes\n FIXED: str\n Length is a fixed value.\n GAUSSIAN: str\n Gaussian distribution, with a mean and standard deviation.\n UNIFORM: str\n Uniform distribution between a minimum and maximum value.\n OTHER: str\n An otherwise undefined distribution.", + "enum": [ + "fixed", + "gaussian", + "uniform", + "other" + ], + "title": "Distribution", + "type": "string" + }, + "Generic": { + "additionalProperties": true, + "description": "Component configuration for a generic component.\n\nThis class allows for extra attributes to be added without validation.\nUse this for development of new component classes, or when a class for your\ncomponent does not exist but you don't want to write your own class.", + "properties": { + "kind": { + "const": "generic", + "description": "The type of component.", + "title": "Kind", + "type": "string" + }, + "tool": { + "description": "Particular tool used for this component.", + "title": "Tool", + "type": "string" + }, + "tool_version": { + "description": "Version of tool.", + "title": "Tool Version", + "type": "string" + } + }, + "required": [ + "kind", + "tool", + "tool_version" + ], + "title": "Generic", + "type": "object" + }, + "HostType": { + "description": "Enumeration of inference service host types\n\nAttributes\n REPLICA: str\n Standard instance of an inference service\n PREFILL: str\n Prefill instance of an inference service\n DECODE: str\n Decode instance of an inference service", + "enum": [ + 
"replica", + "prefill", + "decode" + ], + "title": "HostType", + "type": "string" + }, + "ImagePayloadStats": { + "additionalProperties": false, + "description": "Image payload statistics.", + "properties": { + "count": { + "anyOf": [ + { + "$ref": "#/$defs/Statistics" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Number of media instances of this modality per request." + }, + "bytes": { + "anyOf": [ + { + "$ref": "#/$defs/Statistics" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Encoded size per media instance." + }, + "pixels": { + "anyOf": [ + { + "$ref": "#/$defs/Statistics" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Pixel count per media instance (height x width, summed over frames)." + }, + "aspect_ratio": { + "anyOf": [ + { + "$ref": "#/$defs/Statistics" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Aspect ratio (width / height) per media instance." + } + }, + "title": "ImagePayloadStats", + "type": "object" + }, + "InferenceEngine": { + "additionalProperties": false, + "description": "Component configuration for an inference engine.", + "properties": { + "kind": { + "const": "inference_engine", + "description": "The type of component.", + "title": "Kind", + "type": "string" + }, + "tool": { + "description": "Particular tool used for this component.", + "title": "Tool", + "type": "string" + }, + "tool_version": { + "description": "Version of tool.", + "title": "Tool Version", + "type": "string" + }, + "role": { + "$ref": "#/$defs/HostType", + "description": "Type of model serving host." 
+ }, + "replicas": { + "description": "Number of replicas.", + "minimum": 1, + "title": "Replicas", + "type": "integer" + }, + "model": { + "$ref": "#/$defs/InferenceEngineModel" + }, + "accelerator": { + "$ref": "#/$defs/InferenceEngineAccelerator" + } + }, + "required": [ + "kind", + "tool", + "tool_version", + "role", + "replicas", + "model", + "accelerator" + ], + "title": "InferenceEngine", + "type": "object" + }, + "InferenceEngineAccelerator": { + "additionalProperties": false, + "description": "Accelerator hardware details.", + "properties": { + "model": { + "description": "Hardware model name.", + "title": "Model", + "type": "string" + }, + "count": { + "description": "Total utilized accelerator count.", + "minimum": 0, + "title": "Count", + "type": "integer" + }, + "parallelism": { + "$ref": "#/$defs/InferenceEngineParallelism", + "description": "Parallelism utilized." + } + }, + "required": [ + "model", + "count", + "parallelism" + ], + "title": "InferenceEngineAccelerator", + "type": "object" + }, + "InferenceEngineModel": { + "additionalProperties": false, + "description": "Hosted model details.", + "properties": { + "name": { + "description": "Model name.", + "title": "Name", + "type": "string" + } + }, + "required": [ + "name" + ], + "title": "InferenceEngineModel", + "type": "object" + }, + "InferenceEngineParallelism": { + "additionalProperties": false, + "description": "Parallelism details.", + "properties": { + "tp": { + "default": 1, + "description": "Tensor parallelism.", + "minimum": 0, + "title": "Tp", + "type": "integer" + }, + "dp": { + "default": 1, + "description": "Data parallelism.", + "minimum": 0, + "title": "Dp", + "type": "integer" + }, + "dp_local": { + "default": 1, + "description": "Local data parallelism for this engine instance.", + "minimum": 0, + "title": "Dp Local", + "type": "integer" + }, + "workers": { + "default": 1, + "description": "Number of workers.", + "minimum": 0, + "title": "Workers", + "type": "integer" + }, + 
"ep": { + "default": 1, + "description": "Expert parallelism.", + "minimum": 1, + "title": "Ep", + "type": "integer" + }, + "pp": { + "default": 1, + "description": "Pipeline parallelism.", + "minimum": 1, + "title": "Pp", + "type": "integer" + } + }, + "title": "InferenceEngineParallelism", + "type": "object" + }, + "Load": { + "additionalProperties": false, + "description": "Experimental workload details.", + "properties": { + "metadata": { + "$ref": "#/$defs/LoadMetadata" + }, + "standardized": { + "$ref": "#/$defs/LoadStandardized" + }, + "native": { + "$ref": "#/$defs/LoadNative" + } + }, + "required": [ + "metadata", + "standardized", + "native" + ], + "title": "Load", + "type": "object" + }, + "LoadMetadata": { + "additionalProperties": false, + "description": "Workload metadata.", + "properties": { + "schema_version": { + "default": "0.0.1", + "description": "Version of workload description schema.", + "title": "Schema Version", + "type": "string" + }, + "cfg_id": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Configuration ID, a hash of the workload configuration.", + "title": "Cfg Id" + }, + "description": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Description of workload.", + "title": "Description" + } + }, + "title": "LoadMetadata", + "type": "object" + }, + "LoadNative": { + "additionalProperties": false, + "description": "Workload generator configuration in native format.", + "properties": { + "args": { + "anyOf": [ + { + "additionalProperties": true, + "type": "object" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Command line arguments.", + "title": "Args" + }, + "envars": { + "anyOf": [ + { + "additionalProperties": true, + "type": "object" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Environment variables.", + "title": "Envars" + }, + "config": { + "anyOf": [ + {}, + { 
+ "type": "null" + } + ], + "default": null, + "description": "Configuration file details.", + "title": "Config" + } + }, + "title": "LoadNative", + "type": "object" + }, + "LoadPrefix": { + "additionalProperties": false, + "description": "Input sequence prefix details.", + "properties": { + "prefix_len": { + "$ref": "#/$defs/SequenceLength", + "description": "Length of common prefix." + }, + "num_groups": { + "description": "Number of groups of \"users\" that share common prefixes.", + "minimum": 1, + "title": "Num Groups", + "type": "integer" + }, + "num_users_per_group": { + "description": "Number of users per group.", + "minimum": 1, + "title": "Num Users Per Group", + "type": "integer" + }, + "num_prefixes": { + "description": "Number of common prefixes within a group.", + "minimum": 1, + "title": "Num Prefixes", + "type": "integer" + } + }, + "required": [ + "prefix_len", + "num_groups", + "num_users_per_group", + "num_prefixes" + ], + "title": "LoadPrefix", + "type": "object" + }, + "LoadSource": { + "description": "How input tokens are generated.\n\nAttributes\n RANDOM: str\n Tokens are randomly generated from vocabulary.\n SAMPLED: str\n Tokens are sampled from some data.\n UNKNOWN: str\n The source of tokens used is unknown.", + "enum": [ + "random", + "sampled", + "unknown" + ], + "title": "LoadSource", + "type": "string" + }, + "LoadStandardized": { + "additionalProperties": false, + "description": "Workload generator configuration details in standardized format.", + "properties": { + "tool": { + "description": "Particular tool used for this component.", + "title": "Tool", + "type": "string" + }, + "tool_version": { + "description": "Version of tool.", + "title": "Tool Version", + "type": "string" + }, + "parallelism": { + "default": 1, + "description": "Number of parallel workload generators.", + "minimum": 1, + "title": "Parallelism", + "type": "integer" + }, + "source": { + "$ref": "#/$defs/LoadSource", + "description": "How input tokens are 
generated." + }, + "stage": { + "default": 0, + "description": "Workload stage number (if multi-stage).", + "minimum": 0, + "title": "Stage", + "type": "integer" + }, + "input_seq_len": { + "$ref": "#/$defs/SequenceLength", + "description": "Input sequence length." + }, + "output_seq_len": { + "anyOf": [ + { + "$ref": "#/$defs/SequenceLength" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Output sequence length (if enforced)." + }, + "prefix": { + "anyOf": [ + { + "$ref": "#/$defs/LoadPrefix" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Input sequence prefix details." + }, + "multi_turn": { + "anyOf": [ + { + "$ref": "#/$defs/MultiTurn" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Multi-turn request configuration." + }, + "rate_qps": { + "anyOf": [ + { + "exclusiveMinimum": 0, + "type": "number" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Request rate, in queries per second.", + "title": "Rate Qps" + }, + "concurrency": { + "anyOf": [ + { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "number" + } + ], + "ge": 1 + }, + { + "type": "null" + } + ], + "default": null, + "description": "Request concurrency.", + "title": "Concurrency" + } + }, + "required": [ + "tool", + "tool_version", + "source", + "input_seq_len" + ], + "title": "LoadStandardized", + "type": "object" + }, + "MultiModalRequests": { + "additionalProperties": false, + "description": "Per-modality request payload statistics for multi-modal workloads.", + "properties": { + "image": { + "anyOf": [ + { + "$ref": "#/$defs/ImagePayloadStats" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Image payload statistics." + }, + "video": { + "anyOf": [ + { + "$ref": "#/$defs/VideoPayloadStats" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Video payload statistics." 
+ }, + "audio": { + "anyOf": [ + { + "$ref": "#/$defs/AudioPayloadStats" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Audio payload statistics." + } + }, + "title": "MultiModalRequests", + "type": "object" + }, + "MultiTurn": { + "additionalProperties": false, + "description": "Multi-turn request configuration.", + "properties": { + "enabled": { + "default": true, + "description": "Multi-turn requests are enabled.", + "title": "Enabled", + "type": "boolean" + }, + "max_turns": { + "anyOf": [ + { + "$ref": "#/$defs/SequenceLength" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Maximum number of requests per session." + } + }, + "title": "MultiTurn", + "type": "object" + }, + "Observability": { + "additionalProperties": true, + "description": "Observability metrics.", + "properties": { + "components": { + "anyOf": [ + { + "items": { + "$ref": "#/$defs/ComponentObservability" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Per-component observability metrics.", + "title": "Components" + }, + "drop_rate": { + "anyOf": [ + { + "$ref": "#/$defs/Statistics" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Request drop rate." + }, + "pod_startup_times": { + "anyOf": [ + { + "$ref": "#/$defs/PodStartupTimes" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Pod startup times collected during or before the benchmark." + }, + "replica_status": { + "anyOf": [ + { + "$ref": "#/$defs/ReplicaStatus" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Replica status across controllers at a point in time." 
+ } + }, + "title": "Observability", + "type": "object" + }, + "PodStartupInfo": { + "additionalProperties": false, + "description": "Startup timing information for a single pod.", + "properties": { + "name": { + "description": "Pod name.", + "title": "Name", + "type": "string" + }, + "model": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Model identifier.", + "title": "Model" + }, + "role": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Pod role (e.g., prefill, decode, aggregate).", + "title": "Role" + }, + "node": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Node the pod was scheduled on.", + "title": "Node" + }, + "creation_timestamp": { + "anyOf": [ + { + "format": "date-time", + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Timestamp when the pod was created.", + "title": "Creation Timestamp" + }, + "ready_timestamp": { + "anyOf": [ + { + "format": "date-time", + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Timestamp when the pod became ready.", + "title": "Ready Timestamp" + }, + "startup_seconds": { + "anyOf": [ + { + "minimum": 0, + "type": "number" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Time in seconds from creation to ready.", + "title": "Startup Seconds" + } + }, + "required": [ + "name" + ], + "title": "PodStartupInfo", + "type": "object" + }, + "PodStartupTimes": { + "additionalProperties": false, + "description": "Pod startup times collected during or before the benchmark.", + "properties": { + "collected_at": { + "anyOf": [ + { + "format": "date-time", + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Timestamp when startup times were collected.", + "title": "Collected At" + }, + "pods": { + "anyOf": [ + 
{ + "items": { + "$ref": "#/$defs/PodStartupInfo" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Per-pod startup information.", + "title": "Pods" + }, + "aggregate": { + "anyOf": [ + { + "$ref": "#/$defs/Statistics" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Aggregate statistics (mean, p50, p99, etc.) across all pod startup times." + }, + "graph_path": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Path to pod startup times visualization.", + "title": "Graph Path" + } + }, + "title": "PodStartupTimes", + "type": "object" + }, + "ReplicaHealth": { + "additionalProperties": false, + "description": "Health information for a specific replica.", + "properties": { + "replica_id": { + "description": "Unique identifier for this replica (e.g., pod name).", + "title": "Replica Id", + "type": "string" + }, + "restarts": { + "anyOf": [ + { + "minimum": 0, + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Number of times this replica restarted during the benchmark.", + "title": "Restarts" + }, + "healthy": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Healthy status at completion of benchmark.", + "title": "Healthy" + }, + "logs": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Reference to logs for this specific replica.", + "title": "Logs" + } + }, + "required": [ + "replica_id" + ], + "title": "ReplicaHealth", + "type": "object" + }, + "ReplicaStatus": { + "additionalProperties": false, + "description": "Replica status across controllers, with optional time series and aggregate.", + "properties": { + "namespace": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Kubernetes namespace.", + "title": "Namespace" + }, 
+ "timestamp": { + "anyOf": [ + { + "format": "date-time", + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Timestamp of the latest snapshot.", + "title": "Timestamp" + }, + "controllers": { + "anyOf": [ + { + "items": { + "$ref": "#/$defs/ControllerReplicaStatus" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Per-controller replica status (latest snapshot).", + "title": "Controllers" + }, + "time_series": { + "anyOf": [ + { + "items": { + "$ref": "#/$defs/ReplicaStatusSnapshot" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Time series of replica status snapshots collected during the benchmark.", + "title": "Time Series" + }, + "aggregate_ready_replicas": { + "anyOf": [ + { + "$ref": "#/$defs/Statistics" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Aggregate statistics (min, max, mean, etc.) of total ready replicas over time." 
+ }, + "graph_path": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Path to replica status visualization.", + "title": "Graph Path" + } + }, + "title": "ReplicaStatus", + "type": "object" + }, + "ReplicaStatusSnapshot": { + "additionalProperties": false, + "description": "A single point-in-time replica status snapshot.", + "properties": { + "timestamp": { + "anyOf": [ + { + "format": "date-time", + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Timestamp when this snapshot was taken.", + "title": "Timestamp" + }, + "namespace": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Kubernetes namespace.", + "title": "Namespace" + }, + "controllers": { + "anyOf": [ + { + "items": { + "$ref": "#/$defs/ControllerReplicaStatus" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Per-controller replica status at this point in time.", + "title": "Controllers" + } + }, + "title": "ReplicaStatusSnapshot", + "type": "object" + }, + "RequestPerformance": { + "additionalProperties": false, + "description": "Request-level performance metrics (v0.2.1 aggregates).", + "properties": { + "aggregate": { + "anyOf": [ + { + "$ref": "#/$defs/AggregateRequestPerformance" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Aggregate performance metrics." + }, + "time_series": { + "anyOf": [ + { + "$ref": "#/$defs/TimeSeriesRequestPerformance" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Time series metrics." 
+ } + }, + "title": "RequestPerformance", + "type": "object" + }, + "ResourceMetrics": { + "additionalProperties": false, + "description": "Resource utilization metrics for a component.", + "properties": { + "kv_cache_usage": { + "anyOf": [ + { + "$ref": "#/$defs/Statistics" + }, + { + "type": "null" + } + ], + "default": null, + "description": "KV cache usage percentage." + }, + "cache_hit_rate": { + "anyOf": [ + { + "$ref": "#/$defs/Statistics" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Prefix cache hit rate percentage." + }, + "gpu_cache_usage": { + "anyOf": [ + { + "$ref": "#/$defs/Statistics" + }, + { + "type": "null" + } + ], + "default": null, + "description": "GPU cache usage percentage." + }, + "cpu_cache_usage": { + "anyOf": [ + { + "$ref": "#/$defs/Statistics" + }, + { + "type": "null" + } + ], + "default": null, + "description": "CPU cache usage percentage." + }, + "gpu_memory_usage": { + "anyOf": [ + { + "$ref": "#/$defs/Statistics" + }, + { + "type": "null" + } + ], + "default": null, + "description": "GPU memory usage." + }, + "cpu_memory_usage": { + "anyOf": [ + { + "$ref": "#/$defs/Statistics" + }, + { + "type": "null" + } + ], + "default": null, + "description": "CPU/RAM memory usage." + }, + "storage_usage": { + "anyOf": [ + { + "$ref": "#/$defs/Statistics" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Storage usage." + }, + "gpu_utilization": { + "anyOf": [ + { + "$ref": "#/$defs/Statistics" + }, + { + "type": "null" + } + ], + "default": null, + "description": "GPU compute utilization percentage." + }, + "cpu_utilization": { + "anyOf": [ + { + "$ref": "#/$defs/Statistics" + }, + { + "type": "null" + } + ], + "default": null, + "description": "CPU utilization percentage." + }, + "power_consumption": { + "anyOf": [ + { + "$ref": "#/$defs/Statistics" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Power consumption." 
+ }, + "running_requests": { + "anyOf": [ + { + "$ref": "#/$defs/Statistics" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Number of currently running requests." + }, + "waiting_requests": { + "anyOf": [ + { + "$ref": "#/$defs/Statistics" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Number of requests waiting in queue." + }, + "swapped_requests": { + "anyOf": [ + { + "$ref": "#/$defs/Statistics" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Number of swapped out requests." + }, + "preemptions": { + "anyOf": [ + { + "$ref": "#/$defs/Statistics" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Number of request preemptions due to memory pressure." + } + }, + "title": "ResourceMetrics", + "type": "object" + }, + "Results": { + "additionalProperties": false, + "description": "Benchmark results (v0.2.1 request performance).", + "properties": { + "request_performance": { + "anyOf": [ + { + "$ref": "#/$defs/RequestPerformance" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Request-level performance metrics." + }, + "session_performance": { + "anyOf": [ + { + "$ref": "#/$defs/SessionPerformance" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Session-level performance metrics." + }, + "observability": { + "anyOf": [ + { + "$ref": "#/$defs/Observability" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Observability metrics." 
+ }, + "profiling": { + "anyOf": [ + {}, + { + "type": "null" + } + ], + "default": null, + "description": "Profiling results.", + "title": "Profiling" + }, + "component_health": { + "anyOf": [ + { + "items": { + "$ref": "#/$defs/ComponentHealth" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Component health and reliability metrics during benchmark.", + "title": "Component Health" + } + }, + "title": "Results", + "type": "object" + }, + "Run": { + "additionalProperties": false, + "description": "Benchmark run details.", + "properties": { + "uid": { + "description": "Unique ID for this specific benchmark report.", + "title": "Uid", + "type": "string" + }, + "eid": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Experiment ID, common across benchmark reports from a particular experiment.", + "title": "Eid" + }, + "cid": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Cluster ID, unique to a particular cluster.", + "title": "Cid" + }, + "pid": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Pod ID, unique to a workload generating and/or data collecting pod.", + "title": "Pid" + }, + "time": { + "anyOf": [ + { + "$ref": "#/$defs/RunTime" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Time details of experiment." 
+ }, + "user": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Username that executed experiment.", + "title": "User" + }, + "description": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "User-provided description of the experiment.", + "title": "Description" + }, + "keywords": { + "anyOf": [ + { + "items": { + "type": "string" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "default": null, + "description": "User-provided keywords/tags for the experiment.", + "title": "Keywords" + } + }, + "required": [ + "uid" + ], + "title": "Run", + "type": "object" + }, + "RunTime": { + "additionalProperties": false, + "description": "Time details of experiment.", + "properties": { + "start": { + "anyOf": [ + { + "format": "date-time", + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "ISO-8601 timestamp for experiment start.", + "title": "Start" + }, + "end": { + "anyOf": [ + { + "format": "date-time", + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "ISO-8601 timestamp for experiment end.", + "title": "End" + }, + "duration": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "ISO-8601 duration for experiment.", + "title": "Duration" + } + }, + "title": "RunTime", + "type": "object" + }, + "Scenario": { + "additionalProperties": false, + "description": "Benchmark run details.", + "properties": { + "stack": { + "anyOf": [ + { + "items": { + "$ref": "#/$defs/Component" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "default": null, + "description": "List of components used to build the stack.", + "title": "Stack" + }, + "load": { + "anyOf": [ + { + "$ref": "#/$defs/Load" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Experimental workload details." 
+ } + }, + "title": "Scenario", + "type": "object" + }, + "SequenceLength": { + "additionalProperties": false, + "description": "Sequence length.", + "properties": { + "distribution": { + "$ref": "#/$defs/Distribution", + "description": "Sequence length distribution type." + }, + "value": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "number" + } + ], + "description": "Primary value.", + "ge": 0, + "title": "Value" + }, + "std_dev": { + "anyOf": [ + { + "minimum": 0, + "type": "number" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Standard deviation (if Gaussian).", + "title": "Std Dev" + }, + "min": { + "anyOf": [ + { + "minimum": 0, + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Minimum value.", + "title": "Min" + }, + "max": { + "anyOf": [ + { + "minimum": 1, + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Maximum value.", + "title": "Max" + } + }, + "required": [ + "distribution", + "value" + ], + "title": "SequenceLength", + "type": "object" + }, + "SessionPerformance": { + "additionalProperties": false, + "description": "Session-level performance metrics.", + "properties": { + "sessions": { + "anyOf": [ + { + "$ref": "#/$defs/SessionRequests" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Session counts and per-session distributions." 
+ } + }, + "title": "SessionPerformance", + "type": "object" + }, + "SessionRequests": { + "additionalProperties": false, + "description": "Session-level request statistics.", + "properties": { + "total": { + "description": "Total number of sessions.", + "minimum": 0, + "title": "Total", + "type": "integer" + }, + "succeeded": { + "anyOf": [ + { + "minimum": 0, + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Number of sessions that completed successfully.", + "title": "Succeeded" + }, + "failed": { + "anyOf": [ + { + "minimum": 0, + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Number of sessions that failed.", + "title": "Failed" + }, + "total_events": { + "anyOf": [ + { + "minimum": 0, + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Total number of events (requests) across all sessions.", + "title": "Total Events" + }, + "total_events_completed": { + "anyOf": [ + { + "minimum": 0, + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Total number of events that completed successfully.", + "title": "Total Events Completed" + }, + "total_events_cancelled": { + "anyOf": [ + { + "minimum": 0, + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Total number of events that were cancelled.", + "title": "Total Events Cancelled" + }, + "session_rate": { + "anyOf": [ + { + "$ref": "#/$defs/Statistics" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Rate of session completions per second." + }, + "session_duration": { + "anyOf": [ + { + "$ref": "#/$defs/Statistics" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Distribution of session durations in seconds." 
+ }, + "events_per_session": { + "anyOf": [ + { + "$ref": "#/$defs/Statistics" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Distribution of event (request) counts per session." + }, + "events_cancelled_per_session": { + "anyOf": [ + { + "$ref": "#/$defs/Statistics" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Distribution of cancelled event counts per session." + }, + "input_tokens_per_session": { + "anyOf": [ + { + "$ref": "#/$defs/Statistics" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Distribution of total input tokens consumed per session." + }, + "output_tokens_per_session": { + "anyOf": [ + { + "$ref": "#/$defs/Statistics" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Distribution of total output tokens produced per session." + } + }, + "required": [ + "total" + ], + "title": "SessionRequests", + "type": "object" + }, + "Statistics": { + "description": "Statistical information about a property.", + "properties": { + "units": { + "$ref": "#/$defs/Units" + }, + "mean": { + "title": "Mean", + "type": "number" + }, + "mode": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Mode" + }, + "stddev": { + "anyOf": [ + { + "minimum": 0, + "type": "number" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Stddev" + }, + "min": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Min" + }, + "p0p1": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "title": "P0P1" + }, + "p1": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "title": "P1" + }, + "p5": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "integer" + }, + { + "type": "null" + 
} + ], + "default": null, + "title": "P5" + }, + "p10": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "title": "P10" + }, + "p25": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "title": "P25" + }, + "p50": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "title": "P50" + }, + "p75": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "title": "P75" + }, + "p90": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "title": "P90" + }, + "p95": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "title": "P95" + }, + "p99": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "title": "P99" + }, + "p99p9": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "title": "P99P9" + }, + "max": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Max" + } + }, + "required": [ + "units", + "mean" + ], + "title": "Statistics", + "type": "object" + }, + "TimeSeriesData": { + "additionalProperties": false, + "description": "Time series data.", + "properties": { + "units": { + "$ref": "#/$defs/Units", + "description": "Units for time series." 
+ }, + "series": { + "description": "Time series data points.", + "items": { + "$ref": "#/$defs/TimeSeriesPoint" + }, + "title": "Series", + "type": "array" + } + }, + "required": [ + "units", + "series" + ], + "title": "TimeSeriesData", + "type": "object" + }, + "TimeSeriesLatency": { + "additionalProperties": false, + "description": "Time series latency metrics.", + "properties": { + "time_to_first_token": { + "anyOf": [ + { + "$ref": "#/$defs/TimeSeriesData" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Time to generate the first token (TTFT)." + }, + "normalized_time_per_output_token": { + "anyOf": [ + { + "$ref": "#/$defs/TimeSeriesData" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Typical time to generate an output token, including first (NTPOT)." + }, + "time_per_output_token": { + "anyOf": [ + { + "$ref": "#/$defs/TimeSeriesData" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Time to generate an output token, excluding first (TPOT, may differ from ITL depending on tool)." + }, + "inter_token_latency": { + "anyOf": [ + { + "$ref": "#/$defs/TimeSeriesData" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Latency between generated tokens, excluding first (ITL, may differ from TPOT depending on tool)." + }, + "request_latency": { + "anyOf": [ + { + "$ref": "#/$defs/TimeSeriesData" + }, + { + "type": "null" + } + ], + "default": null, + "description": "End-to-end request latency." 
+ } + }, + "title": "TimeSeriesLatency", + "type": "object" + }, + "TimeSeriesPoint": { + "additionalProperties": false, + "description": "Time series data point.", + "properties": { + "ts": { + "description": "ISO-8601 timestamp.", + "format": "date-time", + "title": "Ts", + "type": "string" + }, + "value": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "number" + }, + { + "type": "integer" + }, + { + "type": "boolean" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Value for datapoint.", + "title": "Value" + }, + "mean": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Mean" + }, + "mode": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Mode" + }, + "stddev": { + "anyOf": [ + { + "minimum": 0, + "type": "number" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Stddev" + }, + "min": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Min" + }, + "p0p1": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "title": "P0P1" + }, + "p1": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "title": "P1" + }, + "p5": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "title": "P5" + }, + "p10": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "title": "P10" + }, + "p25": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "title": "P25" + }, + "p50": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "integer" + }, + { + "type": "null" + } + ], + 
"default": null, + "title": "P50" + }, + "p75": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "title": "P75" + }, + "p90": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "title": "P90" + }, + "p95": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "title": "P95" + }, + "p99": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "title": "P99" + }, + "p99p9": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "title": "P99P9" + }, + "max": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Max" + } + }, + "required": [ + "ts" + ], + "title": "TimeSeriesPoint", + "type": "object" + }, + "TimeSeriesRequestPerformance": { + "additionalProperties": false, + "description": "Time series performance metrics.", + "properties": { + "latency": { + "anyOf": [ + { + "$ref": "#/$defs/TimeSeriesLatency" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Time series latency metrics." + }, + "throughput": { + "anyOf": [ + { + "$ref": "#/$defs/TimeSeriesThroughput" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Time series throughput metrics." + } + }, + "title": "TimeSeriesRequestPerformance", + "type": "object" + }, + "TimeSeriesResourceMetrics": { + "additionalProperties": false, + "description": "Time series resource utilization metrics.", + "properties": { + "kv_cache_usage": { + "anyOf": [ + { + "$ref": "#/$defs/TimeSeriesData" + }, + { + "type": "null" + } + ], + "default": null, + "description": "KV cache usage percentage over time." 
+ }, + "gpu_cache_usage": { + "anyOf": [ + { + "$ref": "#/$defs/TimeSeriesData" + }, + { + "type": "null" + } + ], + "default": null, + "description": "GPU cache usage percentage over time." + }, + "cpu_cache_usage": { + "anyOf": [ + { + "$ref": "#/$defs/TimeSeriesData" + }, + { + "type": "null" + } + ], + "default": null, + "description": "CPU cache usage percentage over time." + }, + "gpu_memory_usage": { + "anyOf": [ + { + "$ref": "#/$defs/TimeSeriesData" + }, + { + "type": "null" + } + ], + "default": null, + "description": "GPU memory usage over time." + }, + "cpu_memory_usage": { + "anyOf": [ + { + "$ref": "#/$defs/TimeSeriesData" + }, + { + "type": "null" + } + ], + "default": null, + "description": "CPU/RAM memory usage over time." + }, + "storage_usage": { + "anyOf": [ + { + "$ref": "#/$defs/TimeSeriesData" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Storage usage over time." + }, + "gpu_utilization": { + "anyOf": [ + { + "$ref": "#/$defs/TimeSeriesData" + }, + { + "type": "null" + } + ], + "default": null, + "description": "GPU compute utilization percentage over time." + }, + "cpu_utilization": { + "anyOf": [ + { + "$ref": "#/$defs/TimeSeriesData" + }, + { + "type": "null" + } + ], + "default": null, + "description": "CPU utilization percentage over time." + }, + "power_consumption": { + "anyOf": [ + { + "$ref": "#/$defs/TimeSeriesData" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Power consumption over time." + } + }, + "title": "TimeSeriesResourceMetrics", + "type": "object" + }, + "TimeSeriesThroughput": { + "additionalProperties": false, + "description": "Time series throughput metrics.", + "properties": { + "units": { + "$ref": "#/$defs/Units", + "default": "tokens/s" + }, + "input_token_rate": { + "anyOf": [ + { + "$ref": "#/$defs/TimeSeriesData" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Input token rate." 
+ }, + "output_token_rate": { + "anyOf": [ + { + "$ref": "#/$defs/TimeSeriesData" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Output token rate." + }, + "total_token_rate": { + "anyOf": [ + { + "$ref": "#/$defs/TimeSeriesData" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Total token rate (input + output)." + }, + "request_rate": { + "anyOf": [ + { + "$ref": "#/$defs/TimeSeriesData" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Request (query) processing rate." + } + }, + "title": "TimeSeriesThroughput", + "type": "object" + }, + "Units": { + "description": "Enumeration of units\n\nAttributes\n COUNT: str\n Count\n MS: str\n Milliseconds\n S: str\n Seconds\n MB: str\n Megabytes\n GB: str\n Gigabytes\n TB: str\n Terabytes\n MIB: str\n Mebibytes\n GIB: str\n Gibibytes\n TIB: str\n Tebibytes\n MBIT_PER_S: str\n Megabits per second\n GBIT_PER_S: str\n Gigabits per second\n TBIT_PER_S: str\n Terabits per second\n MB_PER_S: str\n Megabytes per second\n GB_PER_S: str\n Gigabytes per second\n TB_PER_S: str\n Terabytes per second\n GIB_PER_S: str\n GiB per second\n MS_PER_TOKEN: str\n Milliseconds per token\n S_PER_TOKEN: str\n Seconds per token\n TOKEN_PER_S: str\n Tokens per second\n WATTS: str\n Watts", + "enum": [ + "count", + "pixels", + "percent", + "fraction", + "ratio", + "ms", + "s", + "bytes", + "MB", + "GB", + "TB", + "MiB", + "GiB", + "TiB", + "Mbit/s", + "Gbit/s", + "Tbit/s", + "GiB/s", + "MB/s", + "GB/s", + "TB/s", + "ms/token", + "s/token", + "tokens/s", + "queries/s", + "images/s", + "videos/s", + "audios/s", + "Watts" + ], + "title": "Units", + "type": "string" + }, + "VideoPayloadStats": { + "additionalProperties": false, + "description": "Video payload statistics.", + "properties": { + "count": { + "anyOf": [ + { + "$ref": "#/$defs/Statistics" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Number of media instances of this modality per request." 
+ }, + "bytes": { + "anyOf": [ + { + "$ref": "#/$defs/Statistics" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Encoded size per media instance." + }, + "pixels": { + "anyOf": [ + { + "$ref": "#/$defs/Statistics" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Pixel count per media instance (height x width, summed over frames)." + }, + "aspect_ratio": { + "anyOf": [ + { + "$ref": "#/$defs/Statistics" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Aspect ratio (width / height) per media instance." + }, + "frames": { + "anyOf": [ + { + "$ref": "#/$defs/Statistics" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Number of frames per video instance." + } + }, + "title": "VideoPayloadStats", + "type": "object" + } + }, + "additionalProperties": false, + "description": "Benchmark report v0.2.1.", + "properties": { + "version": { + "default": "0.2.1", + "description": "Version of the schema.", + "title": "Version", + "type": "string" + }, + "run": { + "$ref": "#/$defs/Run" + }, + "scenario": { + "anyOf": [ + { + "$ref": "#/$defs/Scenario" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Stack configuration and workload details of experiment." + }, + "results": { + "$ref": "#/$defs/Results", + "description": "Experiment results." 
+ } + }, + "required": [ + "run", + "results" + ], + "title": "Benchmark Report v0.2.1", + "type": "object" +} \ No newline at end of file diff --git a/llmdbenchmark/analysis/benchmark_report/cli.py b/llmdbenchmark/analysis/benchmark_report/cli.py index afcc83960..c5e2f847c 100755 --- a/llmdbenchmark/analysis/benchmark_report/cli.py +++ b/llmdbenchmark/analysis/benchmark_report/cli.py @@ -98,6 +98,15 @@ def main() -> None: import_guidellm, import_guidellm_all, ) + elif args.br_version == "0.2.1": + from .native_to_br0_2_1 import ( + import_inference_max, + import_vllm_benchmark, + import_inference_perf, + import_inference_perf_session, + import_guidellm, + import_guidellm_all, + ) else: sys.stderr.write(f"Invalid benchmark report version: {args.br_version}\n") sys.exit(1) diff --git a/llmdbenchmark/analysis/benchmark_report/core.py b/llmdbenchmark/analysis/benchmark_report/core.py index cbd21de27..52d446204 100755 --- a/llmdbenchmark/analysis/benchmark_report/core.py +++ b/llmdbenchmark/analysis/benchmark_report/core.py @@ -13,6 +13,7 @@ from .base import BenchmarkReport from .schema_v0_1 import BenchmarkReportV01 from .schema_v0_2 import BenchmarkReportV02 +from .schema_v0_2_1 import BenchmarkReportV021 def check_file(file_path: str) -> None: @@ -145,6 +146,8 @@ def load_benchmark_report(data: dict[str, Any]) -> BenchmarkReport: return BenchmarkReportV01(**data) if version == "0.2": return BenchmarkReportV02(**data) + if version == "0.2.1": + return BenchmarkReportV021(**data) raise ValueError(f"Unsupported schema version: {version}") @@ -187,4 +190,6 @@ def make_json_schema(version: str = "0.2") -> str: return json.dumps(BenchmarkReportV01.model_json_schema(), indent=2) if version == "0.2": return json.dumps(BenchmarkReportV02.model_json_schema(), indent=2) + if version == "0.2.1": + return json.dumps(BenchmarkReportV021.model_json_schema(), indent=2) raise ValueError(f"Unsupported schema version: {version}") diff --git 
a/llmdbenchmark/analysis/benchmark_report/native_to_br0_2_1.py b/llmdbenchmark/analysis/benchmark_report/native_to_br0_2_1.py new file mode 100644 index 000000000..4104965bc --- /dev/null +++ b/llmdbenchmark/analysis/benchmark_report/native_to_br0_2_1.py @@ -0,0 +1,172 @@ +""" +Convert application native output formats into a Benchmark Report v0.2.1. + +v0.2.1 is an additive superset of v0.2: it adds per-request payload-size and +per-modality multimodal statistics that inference-perf emits (PR #450 and +follow-ups). Everything else is identical to v0.2, so this module reuses the +v0.2 converters wholesale and only overrides :func:`import_inference_perf` to +fold in the multimodal block. + +The only producer of multimodal fields today is inference-perf; the other +harness importers are re-exported unchanged. A v0.2 report they emit is, by +construction, a valid v0.2.1 report. +""" + +from .base import Units +from .core import import_yaml, load_benchmark_report, update_dict +from .schema_v0_2_1 import BenchmarkReportV021 + +# Re-export the v0.2 converters that v0.2.1 does not change, so the CLI can +# import the full set of importers from a single module per report version. +from .native_to_br0_2 import ( # noqa: F401 + import_inference_max, + import_vllm_benchmark, + import_inference_perf_session, + import_guidellm, + import_guidellm_all, +) +from .native_to_br0_2 import import_inference_perf as _import_inference_perf_v0_2 + + +# Native (inference-perf) field name -> (schema field name, units) for each +# modality. The native report nests these under successes.{image,video,audio}; +# see inference_perf/payloads/{image,video,audio}/metrics.py and +# tests/required/reportgen/test_lifecycle_report_shape.py upstream. 
+_MODALITY_FIELDS = { + "image": [ + ("count", "count", Units.COUNT), + ("pixels", "pixels", Units.PIXELS), + ("bytes", "bytes", Units.BYTES), + ("aspect_ratio", "aspect_ratio", Units.RATIO), + ], + "video": [ + ("count", "count", Units.COUNT), + ("frames", "frames", Units.COUNT), + ("pixels", "pixels", Units.PIXELS), + ("bytes", "bytes", Units.BYTES), + ("aspect_ratio", "aspect_ratio", Units.RATIO), + ], + "audio": [ + ("count", "count", Units.COUNT), + ("seconds", "seconds", Units.S), + ("bytes", "bytes", Units.BYTES), + ], +} + +# Native throughput scalar -> (schema field name, units). +_MEDIA_RATE_FIELDS = [ + ("images_per_sec", "image_rate", Units.IMAGE_PER_S), + ("videos_per_sec", "video_rate", Units.VIDEO_PER_S), + ("audios_per_sec", "audio_rate", Units.AUDIO_PER_S), +] + + +def _stats(raw: dict | None, units: Units) -> dict | None: + """Map an inference-perf summary dict to a schema Statistics dict. + + inference-perf reports percentiles as ``median``/``p0.1``/``p99.9``; the + schema names them ``p50``/``p0p1``/``p99p9``. Returns None when the source + summary is absent so the (Optional) schema field is simply omitted. + """ + if not isinstance(raw, dict): + return None + return { + "units": units, + "mean": raw.get("mean"), + "min": raw.get("min"), + "p0p1": raw.get("p0.1"), + "p1": raw.get("p1"), + "p5": raw.get("p5"), + "p10": raw.get("p10"), + "p25": raw.get("p25"), + "p50": raw.get("median"), + "p75": raw.get("p75"), + "p90": raw.get("p90"), + "p95": raw.get("p95"), + "p99": raw.get("p99"), + "p99p9": raw.get("p99.9"), + "max": raw.get("max"), + } + + +def _rate(value: float | None, units: Units) -> dict | None: + """Wrap a scalar per-second rate as a schema Statistics dict, or None.""" + if value is None: + return None + return {"units": units, "mean": value} + + +def _build_multimodal(successes: dict) -> dict: + """Build the multimodal block from the successes section of the results. 
+ + Only modalities actually present in the run are included, and within each + only the sub-fields the harness reported. + """ + multimodal = {} + for modality, fields in _MODALITY_FIELDS.items(): + native = successes.get(modality) + if not isinstance(native, dict): + continue + stats = { + schema_name: _stats(native.get(native_name), units) + for native_name, schema_name, units in fields + if isinstance(native.get(native_name), dict) + } + if stats: + multimodal[modality] = stats + return multimodal + + +def import_inference_perf(results_file: str) -> BenchmarkReportV021: + """Import data from an Inference Perf run as a BenchmarkReportV021. + + Delegates the v0.2 portion of the report to the v0.2 converter, then folds + in the additive v0.2.1 fields (request_size, the per-modality multimodal + block, and per-modality delivery rates) read from the same results file. + + Args: + results_file (str): Results file to import. + + Returns: + BenchmarkReportV021: Imported data. + """ + # Reuse all the v0.2 logic (scenario, latency, token throughput, ...). + br_dict = _import_inference_perf_v0_2(results_file).dump() + br_dict["version"] = "0.2.1" + + results = import_yaml(results_file) + successes = results.get("successes") + + # Multimodal stats live under successes; when every request failed the v0.2 + # converter omits the successes-derived aggregate entirely, and so do we. 
+ if isinstance(successes, dict): + requests_add = {} + + request_size = _stats(successes.get("request_size_bytes"), Units.BYTES) + if request_size: + requests_add["request_size"] = request_size + + multimodal = _build_multimodal(successes) + if multimodal: + requests_add["multimodal"] = multimodal + + throughput = successes.get("throughput", {}) + rates = { + schema_name: _rate(throughput.get(native_name), units) + for native_name, schema_name, units in _MEDIA_RATE_FIELDS + if throughput.get(native_name) is not None + } + + aggregate = {} + if requests_add: + aggregate["requests"] = requests_add + if rates: + aggregate["throughput"] = rates + + if aggregate: + update_dict( + br_dict, + {"results": {"request_performance": {"aggregate": aggregate}}}, + ) + + return load_benchmark_report(br_dict) diff --git a/llmdbenchmark/analysis/benchmark_report/schema_v0_2_1.py b/llmdbenchmark/analysis/benchmark_report/schema_v0_2_1.py new file mode 100644 index 000000000..4141c71e6 --- /dev/null +++ b/llmdbenchmark/analysis/benchmark_report/schema_v0_2_1.py @@ -0,0 +1,279 @@ +""" +Benchmark report v0.2.1 + +Additive minor revision of v0.2 that adds optional multi-modal payload +statistics (image / video / audio) to the request aggregates. + +Every field introduced here is Optional, so any document valid under v0.2 is +also valid under v0.2.1. v0.2 is imported and extended in place rather than +copied, so the unchanged majority of the schema keeps a single definition and +this file contains only the multi-modal delta plus the containment shims needed +to thread the extended aggregates up to a new report root. + +Scope note: this revision covers the results side only (the per-modality stats +the client can derive from the payloads it sent, mirroring the fields emitted by +inference-perf's lifecycle report). A standardized load-side `multimodal` +descriptor on LoadStandardized is deliberately left out of this revision; see +the PR description. 
+""" + +from pydantic import BaseModel, model_validator + +from .base import ( + UNITS_MEDIA_THROUGHPUT, + UNITS_MEMORY, + UNITS_QUANTITY, + UNITS_RATIO, + UNITS_TIME, +) +from .schema_v0_2 import ( + MODEL_CONFIG, + VERSION as VERSION_V02, + AggregateRequestPerformance as AggregateRequestPerformanceV02, + AggregateRequests as AggregateRequestsV02, + AggregateThroughput as AggregateThroughputV02, + BenchmarkReportV02, + RequestPerformance as RequestPerformanceV02, + Results as ResultsV02, + Run, + Scenario, + Statistics, +) + +# BenchmarkReport schema version +VERSION = "0.2.1" + +# v0.2.1 is a strict additive superset of v0.2; this guards against a future +# v0.2 bump silently drifting out from under the version we extend. +assert VERSION_V02 == "0.2", ( + f"schema_v0_2_1 expects to extend v0.2, found {VERSION_V02}" +) + + +############################################################################### +# Per-modality payload statistics +# +# Single-inheritance hierarchy so that fields shared across modalities are +# declared exactly once: +# +# MediaPayloadStats count, bytes (all modalities) +# └─ VisualPayloadStats + pixels, aspect_ratio (image, video) +# ├─ ImagePayloadStats +# └─ VideoPayloadStats + frames +# └─ AudioPayloadStats + seconds +# +# Adding a modality is a new leaf class plus one field on MultiModalRequests. +############################################################################### + + +class MediaPayloadStats(BaseModel): + """Payload statistics shared by every media modality. + + All fields are distributions over the individual media instances the client + sent, derived purely from the request payload. 
+ """ + + model_config = MODEL_CONFIG.copy() + + count: Statistics | None = None + """Number of media instances of this modality per request.""" + bytes: Statistics | None = None + """Encoded size per media instance.""" + + @model_validator(mode="after") + def check_media_units(self): + if self.count and self.count.units not in UNITS_QUANTITY: + raise ValueError( + f'Invalid units "{self.count.units}", must be one of:' + f" {' '.join(UNITS_QUANTITY)}" + ) + if self.bytes and self.bytes.units not in UNITS_MEMORY: + raise ValueError( + f'Invalid units "{self.bytes.units}", must be one of:' + f" {' '.join(UNITS_MEMORY)}" + ) + return self + + +class VisualPayloadStats(MediaPayloadStats): + """Payload statistics common to pixel-based modalities (image and video).""" + + model_config = MODEL_CONFIG.copy() + + pixels: Statistics | None = None + """Pixel count per media instance (height x width, summed over frames).""" + aspect_ratio: Statistics | None = None + """Aspect ratio (width / height) per media instance.""" + + @model_validator(mode="after") + def check_visual_units(self): + if self.pixels and self.pixels.units not in UNITS_QUANTITY: + raise ValueError( + f'Invalid units "{self.pixels.units}", must be one of:' + f" {' '.join(UNITS_QUANTITY)}" + ) + if self.aspect_ratio and self.aspect_ratio.units not in UNITS_RATIO: + raise ValueError( + f'Invalid units "{self.aspect_ratio.units}", must be one of:' + f" {' '.join(UNITS_RATIO)}" + ) + return self + + +class ImagePayloadStats(VisualPayloadStats): + """Image payload statistics.""" + + model_config = MODEL_CONFIG.copy() + + +class VideoPayloadStats(VisualPayloadStats): + """Video payload statistics.""" + + model_config = MODEL_CONFIG.copy() + + frames: Statistics | None = None + """Number of frames per video instance.""" + + @model_validator(mode="after") + def check_video_units(self): + if self.frames and self.frames.units not in UNITS_QUANTITY: + raise ValueError( + f'Invalid units "{self.frames.units}", must be 
one of:' + f" {' '.join(UNITS_QUANTITY)}" + ) + return self + + +class AudioPayloadStats(MediaPayloadStats): + """Audio payload statistics.""" + + model_config = MODEL_CONFIG.copy() + + seconds: Statistics | None = None + """Duration per audio instance.""" + + @model_validator(mode="after") + def check_audio_units(self): + if self.seconds and self.seconds.units not in UNITS_TIME: + raise ValueError( + f'Invalid units "{self.seconds.units}", must be one of:' + f" {' '.join(UNITS_TIME)}" + ) + return self + + +class MultiModalRequests(BaseModel): + """Per-modality request payload statistics for multi-modal workloads.""" + + model_config = MODEL_CONFIG.copy() + + image: ImagePayloadStats | None = None + """Image payload statistics.""" + video: VideoPayloadStats | None = None + """Video payload statistics.""" + audio: AudioPayloadStats | None = None + """Audio payload statistics.""" + + +############################################################################### +# Extended request aggregates +############################################################################### + + +class AggregateRequests(AggregateRequestsV02): + """v0.2 request statistics, plus multi-modal payload details.""" + + model_config = MODEL_CONFIG.copy() + + request_size: Statistics | None = None + """Total encoded request size, including all media payloads.""" + multimodal: MultiModalRequests | None = None + """Per-modality payload statistics.""" + + @model_validator(mode="after") + def check_request_size_units(self): + if self.request_size and self.request_size.units not in UNITS_MEMORY: + raise ValueError( + f'Invalid units "{self.request_size.units}", must be one of:' + f" {' '.join(UNITS_MEMORY)}" + ) + return self + + +class AggregateThroughput(AggregateThroughputV02): + """v0.2 throughput metrics, plus per-modality payload rates.""" + + model_config = MODEL_CONFIG.copy() + + image_rate: Statistics | None = None + """Image delivery rate.""" + video_rate: Statistics | None = None + 
"""Video delivery rate.""" + audio_rate: Statistics | None = None + """Audio delivery rate.""" + + @model_validator(mode="after") + def check_media_rate_units(self): + for name, stat in ( + ("image_rate", self.image_rate), + ("video_rate", self.video_rate), + ("audio_rate", self.audio_rate), + ): + if stat and stat.units not in UNITS_MEDIA_THROUGHPUT: + raise ValueError( + f'Invalid units "{stat.units}" for {name}, must be one of:' + f" {' '.join(UNITS_MEDIA_THROUGHPUT)}" + ) + return self + + +############################################################################### +# Containment shims: re-thread the extended aggregates up to a new report root. +# Each class redeclares only the field whose type changed; all other fields are +# inherited from the v0.2 definition. +############################################################################### + + +class AggregateRequestPerformance(AggregateRequestPerformanceV02): + """Aggregate performance metrics (v0.2.1 aggregates).""" + + model_config = MODEL_CONFIG.copy() + + requests: AggregateRequests | None = None + """Aggregate request details.""" + throughput: AggregateThroughput | None = None + """Aggregate response throughput performance metrics.""" + + +class RequestPerformance(RequestPerformanceV02): + """Request-level performance metrics (v0.2.1 aggregates).""" + + model_config = MODEL_CONFIG.copy() + + aggregate: AggregateRequestPerformance | None = None + """Aggregate performance metrics.""" + + +class Results(ResultsV02): + """Benchmark results (v0.2.1 request performance).""" + + model_config = MODEL_CONFIG.copy() + + request_performance: RequestPerformance | None = None + """Request-level performance metrics.""" + + +class BenchmarkReportV021(BenchmarkReportV02): + """Benchmark report v0.2.1.""" + + model_config = MODEL_CONFIG.copy() + model_config["title"] = "Benchmark Report v0.2.1" + + version: str = VERSION + """Version of the schema.""" + run: Run + """Benchmark run details.""" + scenario: Scenario | 
None = None + """Stack configuration and workload details of experiment.""" + results: Results + """Experiment results.""" diff --git a/tests/fixtures/inference_perf_lifecycle.yaml b/tests/fixtures/inference_perf_lifecycle.yaml new file mode 100644 index 000000000..906727a1e --- /dev/null +++ b/tests/fixtures/inference_perf_lifecycle.yaml @@ -0,0 +1,351 @@ +# Genuine inference-perf lifecycle report (not hand-authored). +# +# Produced by inference-perf's own reportgen.summarize_requests over the +# representative multimodal request mix from its upstream shape test +# (tests/required/reportgen/test_lifecycle_report_shape.py), captured against +# inference-perf main after PR #450/#477. Serves as the golden input for the +# v0.2.1 inference-perf -> benchmark-report converter test. +benchmark_time_seconds: 2.48 +failures: + count: 0 + prompt_len: null + request_latency: null +load_summary: + achieved_rate: 1.5 + count: 3 + requested_rate: 1.0 + schedule_delay: + max: 0.0010000000000000009 + mean: 0.00033333333333337035 + median: 0.001 + min: -0.0009999999999998899 + p0.1: -0.00099599999999989 + p1: -0.000959999999999892 + p10: -0.0005999999999999118 + p25: 5.5077470362263625e-17 + p5: -0.0007999999999999008 + p75: 0.0010000000000000005 + p90: 0.0010000000000000007 + p95: 0.0010000000000000009 + p99: 0.0010000000000000009 + p99.9: 0.0010000000000000009 + send_duration: 2.0 +successes: + audio: + bytes: + max: 90000.0 + mean: 75000.0 + median: 75000.0 + min: 60000.0 + p0.1: 60030.0 + p1: 60300.0 + p10: 63000.0 + p25: 67500.0 + p5: 61500.0 + p75: 82500.0 + p90: 87000.0 + p95: 88500.0 + p99: 89700.0 + p99.9: 89970.0 + count: + max: 1.0 + mean: 0.6666666666666666 + median: 1.0 + min: 0.0 + p0.1: 0.002 + p1: 0.02 + p10: 0.2 + p25: 0.5 + p5: 0.1 + p75: 1.0 + p90: 1.0 + p95: 1.0 + p99: 1.0 + p99.9: 1.0 + seconds: + max: 25.0 + mean: 20.0 + median: 20.0 + min: 15.0 + p0.1: 15.01 + p1: 15.1 + p10: 16.0 + p25: 17.5 + p5: 15.5 + p75: 22.5 + p90: 24.0 + p95: 24.5 + p99: 24.9 + 
p99.9: 24.990000000000002 + count: 3 + image: + aspect_ratio: + max: 1.7777777777777777 + mean: 1.5740740740740742 + median: 1.7777777777777777 + min: 1.0 + p0.1: 1.0016666666666667 + p1: 1.0166666666666666 + p10: 1.1666666666666665 + p25: 1.4444444444444444 + p5: 1.0833333333333333 + p75: 1.7777777777777777 + p90: 1.7777777777777777 + p95: 1.7777777777777777 + p99: 1.7777777777777777 + p99.9: 1.7777777777777777 + bytes: + max: 110000.0 + mean: 52000.0 + median: 45000.0 + min: 22000.0 + p0.1: 22040.0 + p1: 22400.0 + p10: 26000.0 + p25: 32500.0 + p5: 24000.0 + p75: 57500.0 + p90: 85000.0 + p95: 97500.0 + p99: 107500.00000000001 + p99.9: 109750.00000000004 + count: + max: 3.0 + mean: 2.0 + median: 2.0 + min: 1.0 + p0.1: 1.002 + p1: 1.02 + p10: 1.2 + p25: 1.5 + p5: 1.1 + p75: 2.5 + p90: 2.8 + p95: 2.9 + p99: 2.98 + p99.9: 2.998 + pixels: + max: 8294400.0 + mean: 2481962.6666666665 + median: 1561088.0 + min: 480000.0 + p0.1: 482208.0 + p1: 502080.0 + p10: 700800.0 + p25: 953344.0 + p5: 590400.0 + p75: 2073600.0 + p90: 5184000.0 + p95: 6739200.0 + p99: 7983360.000000001 + p99.9: 8263296.0000000065 + latency: + inter_token_latency: + max: 0.2 + mean: 0.1522222222222222 + median: 0.16999999999999993 + min: 0.08000000000000007 + p0.1: 0.08008000000000007 + p1: 0.08080000000000007 + p10: 0.08800000000000008 + p25: 0.09999999999999998 + p5: 0.08400000000000007 + p75: 0.19999999999999973 + p90: 0.19999999999999996 + p95: 0.19999999999999998 + p99: 0.2 + p99.9: 0.2 + normalized_time_per_output_token: + max: 0.013333333333333332 + mean: 0.009774305555555555 + median: 0.008333333333333333 + min: 0.00765625 + p0.1: 0.007657604166666667 + p1: 0.007669791666666667 + p10: 0.007791666666666666 + p25: 0.007994791666666667 + p5: 0.0077239583333333335 + p75: 0.010833333333333334 + p90: 0.012333333333333333 + p95: 0.012833333333333332 + p99: 0.013233333333333333 + p99.9: 0.013323333333333333 + request_latency: + max: 0.5 + mean: 0.49 + median: 0.49 + min: 0.48 + p0.1: 0.48002 + p1: 
0.48019999999999996 + p10: 0.482 + p25: 0.485 + p5: 0.481 + p75: 0.495 + p90: 0.498 + p95: 0.499 + p99: 0.4998 + p99.9: 0.49998 + time_per_output_token: + max: 0.01257142857142857 + mean: 0.009279705855977042 + median: 0.007966101694915254 + min: 0.007301587301587301 + p0.1: 0.007302916330373957 + p1: 0.00731487758945386 + p10: 0.007434490180252892 + p25: 0.007633844498251278 + p5: 0.0073680387409200965 + p75: 0.010268765133171912 + p90: 0.011650363196125908 + p95: 0.012110895883777239 + p99: 0.012479322033898304 + p99.9: 0.012562217917675545 + time_to_first_token: + max: 0.040000000000000036 + mean: 0.033333333333333354 + median: 0.030000000000000027 + min: 0.03 + p0.1: 0.03 + p1: 0.03 + p10: 0.030000000000000006 + p25: 0.030000000000000013 + p5: 0.030000000000000002 + p75: 0.03500000000000003 + p90: 0.038000000000000034 + p95: 0.039000000000000035 + p99: 0.03980000000000004 + p99.9: 0.039980000000000036 + output_len: + max: 64.0 + mean: 53.333333333333336 + median: 60.0 + min: 36.0 + p0.1: 36.048 + p1: 36.48 + p10: 40.8 + p25: 48.0 + p5: 38.4 + p75: 62.0 + p90: 63.2 + p95: 63.6 + p99: 63.92 + p99.9: 63.992000000000004 + prompt_len: + max: 205.0 + mean: 200.33333333333334 + median: 200.0 + min: 196.0 + p0.1: 196.008 + p1: 196.08 + p10: 196.8 + p25: 198.0 + p5: 196.4 + p75: 202.5 + p90: 204.0 + p95: 204.5 + p99: 204.9 + p99.9: 204.99 + prompt_tokens: + cached: 0.0 + total: 601.0 + uncached: 601.0 + request_size_bytes: + max: 1800000.0 + mean: 676666.6666666666 + median: 180000.0 + min: 50000.0 + p0.1: 50260.0 + p1: 52600.0 + p10: 76000.0 + p25: 115000.0 + p5: 63000.0 + p75: 990000.0 + p90: 1476000.0 + p95: 1637999.9999999998 + p99: 1767600.0 + p99.9: 1796760.0000000005 + throughput: + audios_per_sec: 0.8064516129032259 + images_per_sec: 2.4193548387096775 + input_tokens_per_sec: 242.33870967741936 + output_tokens_per_sec: 64.51612903225806 + requests_per_sec: 1.2096774193548387 + total_tokens_per_sec: 306.85483870967744 + videos_per_sec: 1.2096774193548387 + 
token_count_mismatches: 0 + video: + aspect_ratio: + max: 1.7777777777777777 + mean: 1.6296296296296295 + median: 1.7777777777777777 + min: 1.3333333333333333 + p0.1: 1.3342222222222222 + p1: 1.3422222222222222 + p10: 1.422222222222222 + p25: 1.5555555555555554 + p5: 1.3777777777777778 + p75: 1.7777777777777777 + p90: 1.7777777777777777 + p95: 1.7777777777777777 + p99: 1.7777777777777777 + p99.9: 1.7777777777777777 + bytes: + max: 3000000.0 + mean: 1600000.0 + median: 1300000.0 + min: 500000.0 + p0.1: 501600.0 + p1: 516000.0 + p10: 660000.0 + p25: 900000.0 + p5: 580000.0 + p75: 2150000.0 + p90: 2660000.0 + p95: 2830000.0 + p99: 2966000.0 + p99.9: 2996600.0000000005 + count: + max: 2.0 + mean: 1.0 + median: 1.0 + min: 0.0 + p0.1: 0.002 + p1: 0.02 + p10: 0.2 + p25: 0.5 + p5: 0.1 + p75: 1.5 + p90: 1.8 + p95: 1.9 + p99: 1.98 + p99.9: 1.9980000000000002 + frames: + max: 64.0 + mean: 37.333333333333336 + median: 32.0 + min: 16.0 + p0.1: 16.032 + p1: 16.32 + p10: 19.2 + p25: 24.0 + p5: 17.6 + p75: 48.0 + p90: 57.6 + p95: 60.8 + p99: 63.36 + p99.9: 63.93600000000001 + pixels: + max: 8294400.0 + mean: 3686400.0 + median: 2073600.0 + min: 691200.0 + p0.1: 693964.8 + p1: 718848.0 + p10: 967680.0 + p25: 1382400.0 + p5: 829440.0 + p75: 5184000.0 + p90: 7050240.0 + p95: 7672319.999999999 + p99: 8169984.0 + p99.9: 8281958.400000001 diff --git a/tests/test_benchmark_report_v0_2_1_compat.py b/tests/test_benchmark_report_v0_2_1_compat.py new file mode 100644 index 000000000..56b9a9e27 --- /dev/null +++ b/tests/test_benchmark_report_v0_2_1_compat.py @@ -0,0 +1,95 @@ +"""Backward-compatibility guarantee for benchmark report schema v0.2.1. + +v0.2.1 is a strict additive superset of v0.2: every document that is valid +under v0.2 MUST also be valid under v0.2.1. This is a hard requirement, so it +is enforced here rather than relying on manual inspection. + +The proof has three independent angles: + 1. Concrete: the committed v0.2 example validates unchanged under v0.2.1. + 2. 
Semantic: v0.2 data dumps identically whether parsed as v0.2 or v0.2.1 + (v0.2.1 neither drops, renames, nor injects fields for v0.2 input). + 3. Structural: every field v0.2.1 adds is Optional, so no previously valid + document can become invalid for want of a newly required field. +""" + +from __future__ import annotations + +from pathlib import Path + +import pytest +import yaml + +from llmdbenchmark.analysis.benchmark_report.schema_v0_2 import ( + AggregateRequests as AggregateRequestsV02, + AggregateThroughput as AggregateThroughputV02, + BenchmarkReportV02, +) +from llmdbenchmark.analysis.benchmark_report.schema_v0_2_1 import ( + AggregateRequests as AggregateRequestsV021, + AggregateThroughput as AggregateThroughputV021, + BenchmarkReportV021, +) + +PROJECT_ROOT = Path(__file__).resolve().parent.parent +BR_DIR = PROJECT_ROOT / "llmdbenchmark" / "analysis" / "benchmark_report" +V02_EXAMPLE = BR_DIR / "br_v0_2_example.yaml" + +# Smallest document that satisfies the v0.2 required fields. +MINIMAL_V02 = {"version": "0.2", "run": {"uid": "u"}, "results": {}} + + +def _load(path: Path) -> dict: + with open(path) as f: + return yaml.safe_load(f) + + +# --- 1. Concrete: real v0.2 documents validate under v0.2.1 ------------------- + + +@pytest.mark.parametrize( + "data", [_load(V02_EXAMPLE), MINIMAL_V02], ids=["example", "minimal"] +) +def test_v0_2_document_validates_under_both_versions(data): + # Sanity: it is genuinely a valid v0.2 document... + BenchmarkReportV02(**data) + # ...and therefore must also be a valid v0.2.1 document. + BenchmarkReportV021(**data) + + +# --- 2. Semantic: v0.2.1 is a no-op for v0.2 data ----------------------------- + + +def test_v0_2_data_dumps_identically_under_v0_2_1(): + data = _load(V02_EXAMPLE) + assert BenchmarkReportV021(**data).dump() == BenchmarkReportV02(**data).dump() + + +# --- 3. 
Structural: every field v0.2.1 adds is Optional ------------------------ + + +@pytest.mark.parametrize( + "v021_model, v02_model", + [ + (AggregateRequestsV021, AggregateRequestsV02), + (AggregateThroughputV021, AggregateThroughputV02), + (BenchmarkReportV021, BenchmarkReportV02), + ], +) +def test_added_fields_are_optional(v021_model, v02_model): + added = set(v021_model.model_fields) - set(v02_model.model_fields) + required = [n for n in added if v021_model.model_fields[n].is_required()] + assert not required, f"{v021_model.__name__} adds required field(s): {required}" + + +def test_no_v0_2_field_becomes_required_in_v0_2_1(): + # Any field shared with v0.2 must not have gained a required constraint. + for v021_model, v02_model in [ + (AggregateRequestsV021, AggregateRequestsV02), + (AggregateThroughputV021, AggregateThroughputV02), + (BenchmarkReportV021, BenchmarkReportV02), + ]: + for name, field in v02_model.model_fields.items(): + if not field.is_required(): + assert not v021_model.model_fields[name].is_required(), ( + f"{v021_model.__name__}.{name} became required in v0.2.1" + ) diff --git a/tests/test_benchmark_report_v0_2_1_multimodal.py b/tests/test_benchmark_report_v0_2_1_multimodal.py new file mode 100644 index 000000000..cc5cb0636 --- /dev/null +++ b/tests/test_benchmark_report_v0_2_1_multimodal.py @@ -0,0 +1,114 @@ +"""Feature and unit-guardrail tests for benchmark report schema v0.2.1. + +Complements the back-compat test: this exercises what v0.2.1 *adds*. A fully +populated multi-modal report validates and round-trips, every per-modality unit +is accepted with its correct category, and mismatched units are rejected, +including proof that the inherited v0.2 `request_rate` guardrail is not loosened +by the new media-throughput category. 
+""" + +from __future__ import annotations + +import pytest +from pydantic import ValidationError + +from llmdbenchmark.analysis.benchmark_report.schema_v0_2_1 import ( + AggregateRequests, + AggregateThroughput, + AudioPayloadStats, + BenchmarkReportV021, + ImagePayloadStats, + VideoPayloadStats, +) + +# Fields that are required independent of the one under test. +REQUIRED_FIELDS = {AggregateRequests: {"total": 0}} + + +def _stat(units: str, mean: float = 1.0) -> dict: + return {"units": units, "mean": mean} + + +def _build(model, field: str, units: str): + kwargs = dict(REQUIRED_FIELDS.get(model, {})) + kwargs[field] = _stat(units) + return model(**kwargs) + + +def test_full_multimodal_report_validates_and_roundtrips(): + report = BenchmarkReportV021( + version="0.2.1", + run={"uid": "u"}, + results={ + "request_performance": { + "aggregate": { + "requests": { + "total": 100, + "request_size": _stat("bytes", 51234.0), + "multimodal": { + "image": { + "count": _stat("count", 2.0), + "pixels": _stat("pixels", 2073600.0), + "aspect_ratio": _stat("ratio", 1.78), + "bytes": _stat("bytes", 40000.0), + }, + "video": { + "frames": _stat("count", 16.0), + }, + "audio": {"seconds": _stat("s", 10.0)}, + }, + }, + "throughput": { + "request_rate": _stat("queries/s", 12.0), + "image_rate": _stat("images/s", 24.0), + "video_rate": _stat("videos/s", 3.0), + "audio_rate": _stat("audios/s", 2.0), + }, + } + } + }, + ) + mm = report.results.request_performance.aggregate.requests.multimodal + assert mm.image.pixels.mean == 2073600.0 + assert mm.video.frames.mean == 16.0 + assert mm.audio.seconds.mean == 10.0 + # Survives a dump -> reload cycle. 
+ assert BenchmarkReportV021(**report.dump()).version == "0.2.1" + + +@pytest.mark.parametrize( + "model, field, units", + [ + (AggregateThroughput, "image_rate", "images/s"), + (AggregateThroughput, "video_rate", "videos/s"), + (AggregateThroughput, "audio_rate", "audios/s"), + (ImagePayloadStats, "count", "count"), + (ImagePayloadStats, "pixels", "pixels"), + (ImagePayloadStats, "aspect_ratio", "ratio"), + (ImagePayloadStats, "bytes", "bytes"), + (VideoPayloadStats, "frames", "count"), + (AudioPayloadStats, "seconds", "s"), + (AggregateRequests, "request_size", "bytes"), + ], +) +def test_correct_units_accepted(model, field, units): + _build(model, field, units) + + +@pytest.mark.parametrize( + "model, field, units", + [ + # A media rate is not a request rate. + (AggregateThroughput, "image_rate", "queries/s"), + # The inherited v0.2 request_rate guardrail must stay intact. + (AggregateThroughput, "request_rate", "images/s"), + # An aspect ratio is a ratio, not a portion. + (ImagePayloadStats, "aspect_ratio", "fraction"), + (ImagePayloadStats, "pixels", "s"), + (AudioPayloadStats, "seconds", "bytes"), + (AggregateRequests, "request_size", "queries/s"), + ], +) +def test_mismatched_units_rejected(model, field, units): + with pytest.raises(ValidationError): + _build(model, field, units) diff --git a/tests/test_native_to_br0_2_1.py b/tests/test_native_to_br0_2_1.py new file mode 100644 index 000000000..2bb772faf --- /dev/null +++ b/tests/test_native_to_br0_2_1.py @@ -0,0 +1,116 @@ +"""Converter test: inference-perf native report -> benchmark report v0.2.1. + +The input fixture (tests/fixtures/inference_perf_lifecycle.yaml) is genuine +inference-perf output, captured from inference-perf's own summarize_requests +(see the fixture header). 
This test pins that the converter maps every +multimodal field to the right v0.2.1 location and units, performs the +native->schema renames (median->p50, p0.1->p0p1, p99.9->p99p9, *_per_sec-> +*_rate, request_size_bytes->request_size), and preserves the v0.2 content it +inherits from the reused v0.2 converter. +""" + +from __future__ import annotations + +from pathlib import Path + +import pytest +import yaml + +from llmdbenchmark.analysis.benchmark_report.base import Units +from llmdbenchmark.analysis.benchmark_report.native_to_br0_2_1 import ( + import_inference_perf, +) +from llmdbenchmark.analysis.benchmark_report.schema_v0_2_1 import BenchmarkReportV021 + +FIXTURE = Path(__file__).parent / "fixtures" / "inference_perf_lifecycle.yaml" + + +@pytest.fixture(scope="module") +def native() -> dict: + with open(FIXTURE) as f: + return yaml.safe_load(f) + + +@pytest.fixture(scope="module") +def report() -> BenchmarkReportV021: + return import_inference_perf(str(FIXTURE)) + + +def _assert_maps(stat, raw: dict, units: Units): + """A converted Statistics must carry the right units and the native values, + with inference-perf's median/p0.1/p99.9 renamed to p50/p0p1/p99p9.""" + assert stat.units == units + assert stat.mean == raw["mean"] + assert stat.min == raw["min"] + assert stat.max == raw["max"] + assert stat.p50 == raw["median"] + assert stat.p0p1 == raw["p0.1"] + assert stat.p99p9 == raw["p99.9"] + + +def test_returns_v0_2_1_report(report): + assert isinstance(report, BenchmarkReportV021) + assert report.version == "0.2.1" + + +def test_request_size_mapped(report, native): + rs = report.results.request_performance.aggregate.requests.request_size + _assert_maps(rs, native["successes"]["request_size_bytes"], Units.BYTES) + + +def test_image_stats_mapped(report, native): + img = report.results.request_performance.aggregate.requests.multimodal.image + raw = native["successes"]["image"] + _assert_maps(img.count, raw["count"], Units.COUNT) + _assert_maps(img.pixels, 
raw["pixels"], Units.PIXELS) + _assert_maps(img.bytes, raw["bytes"], Units.BYTES) + _assert_maps(img.aspect_ratio, raw["aspect_ratio"], Units.RATIO) + + +def test_video_stats_mapped(report, native): + vid = report.results.request_performance.aggregate.requests.multimodal.video + raw = native["successes"]["video"] + _assert_maps(vid.count, raw["count"], Units.COUNT) + _assert_maps(vid.frames, raw["frames"], Units.COUNT) + _assert_maps(vid.pixels, raw["pixels"], Units.PIXELS) + _assert_maps(vid.bytes, raw["bytes"], Units.BYTES) + _assert_maps(vid.aspect_ratio, raw["aspect_ratio"], Units.RATIO) + # inference-perf does not emit a video duration, so the schema must not + # invent one. + assert not hasattr(vid, "seconds") + + +def test_audio_stats_mapped(report, native): + aud = report.results.request_performance.aggregate.requests.multimodal.audio + raw = native["successes"]["audio"] + _assert_maps(aud.count, raw["count"], Units.COUNT) + _assert_maps(aud.seconds, raw["seconds"], Units.S) + _assert_maps(aud.bytes, raw["bytes"], Units.BYTES) + + +def test_media_rates_mapped(report, native): + tp = report.results.request_performance.aggregate.throughput + raw = native["successes"]["throughput"] + assert tp.image_rate.units == Units.IMAGE_PER_S + assert tp.image_rate.mean == raw["images_per_sec"] + assert tp.video_rate.units == Units.VIDEO_PER_S + assert tp.video_rate.mean == raw["videos_per_sec"] + assert tp.audio_rate.units == Units.AUDIO_PER_S + assert tp.audio_rate.mean == raw["audios_per_sec"] + + +def test_inherited_v0_2_content_preserved(report, native): + """Reusing the v0.2 converter must not drop the v0.2 fields.""" + agg = report.results.request_performance.aggregate + # Request counts and the v0.2 request_rate / token rates survive. 
+ assert agg.requests.total == native["successes"]["count"] + assert agg.throughput.request_rate.mean == native["successes"]["throughput"]["requests_per_sec"] + assert agg.throughput.output_token_rate.mean == ( + native["successes"]["throughput"]["output_tokens_per_sec"] + ) + # A v0.2 latency block is still populated. + assert agg.latency.request_latency.mean is not None + + +def test_roundtrips(report): + assert BenchmarkReportV021(**report.dump()).version == "0.2.1"