diff --git a/examples/02_ServerBenchmarking/offline_llama3_8b_cnn.yaml b/examples/02_ServerBenchmarking/offline_llama3_8b_cnn.yaml index fab814e0..bc5b92f1 100644 --- a/examples/02_ServerBenchmarking/offline_llama3_8b_cnn.yaml +++ b/examples/02_ServerBenchmarking/offline_llama3_8b_cnn.yaml @@ -30,13 +30,6 @@ settings: client: num_workers: 4 -metrics: - collect: - - "throughput" - - "latency" - - "ttft" - - "tpot" - endpoint_config: endpoints: - "http://localhost:8000" diff --git a/examples/02_ServerBenchmarking/online_llama2_70b_cnn.yaml b/examples/02_ServerBenchmarking/online_llama2_70b_cnn.yaml index bda7f884..d1603544 100644 --- a/examples/02_ServerBenchmarking/online_llama2_70b_cnn.yaml +++ b/examples/02_ServerBenchmarking/online_llama2_70b_cnn.yaml @@ -31,13 +31,6 @@ settings: client: num_workers: 4 -metrics: - collect: - - "throughput" - - "latency" - - "ttft" - - "tpot" - endpoint_config: endpoints: - "http://localhost:8000" diff --git a/examples/04_GPTOSS120B_Example/gptoss_120b_example.yaml b/examples/04_GPTOSS120B_Example/gptoss_120b_example.yaml index 8e0a7820..fb5edd33 100644 --- a/examples/04_GPTOSS120B_Example/gptoss_120b_example.yaml +++ b/examples/04_GPTOSS120B_Example/gptoss_120b_example.yaml @@ -30,13 +30,6 @@ settings: num_workers: 4 record_worker_events: false -metrics: - collect: - - "throughput" - - "latency" - - "ttft" - - "tpot" - endpoint_config: endpoints: - "http://localhost:3000" diff --git a/examples/04_GPTOSS120B_Example/sglang_gptoss_120b_example.yaml b/examples/04_GPTOSS120B_Example/sglang_gptoss_120b_example.yaml index 5a6ad852..761d5c3b 100644 --- a/examples/04_GPTOSS120B_Example/sglang_gptoss_120b_example.yaml +++ b/examples/04_GPTOSS120B_Example/sglang_gptoss_120b_example.yaml @@ -52,13 +52,6 @@ settings: num_workers: 8 record_worker_events: false -metrics: - collect: - - "throughput" - - "latency" - - "ttft" - - "tpot" - endpoint_config: endpoints: - "http://localhost:30000" diff --git a/examples/04_GPTOSS120B_Example/vllm_gptoss_120b_example.yaml b/examples/04_GPTOSS120B_Example/vllm_gptoss_120b_example.yaml index fe6e3e52..7cf215f9 100644 --- a/examples/04_GPTOSS120B_Example/vllm_gptoss_120b_example.yaml +++ b/examples/04_GPTOSS120B_Example/vllm_gptoss_120b_example.yaml @@ -55,13 +55,6 @@ settings: num_workers: 8 record_worker_events: false -metrics: - collect: - - "throughput" - - "latency" - - "ttft" - - "tpot" - endpoint_config: endpoints: - "http://localhost:8000" diff --git a/examples/05_Llama3.1-8B_Example/offline_llama3_8b_cnn.yaml b/examples/05_Llama3.1-8B_Example/offline_llama3_8b_cnn.yaml index 2f85210d..57e105c7 100644 --- a/examples/05_Llama3.1-8B_Example/offline_llama3_8b_cnn.yaml +++ b/examples/05_Llama3.1-8B_Example/offline_llama3_8b_cnn.yaml @@ -40,13 +40,6 @@ settings: client: num_workers: 4 # Number of client workers -metrics: - collect: - - "throughput" - - "latency" - - "ttft" - - "tpot" - endpoint_config: endpoints: - "http://localhost:8000" diff --git a/examples/05_Llama3.1-8B_Example/online_llama3_8b_cnn.yaml b/examples/05_Llama3.1-8B_Example/online_llama3_8b_cnn.yaml index 0190d620..66861f2f 100644 --- a/examples/05_Llama3.1-8B_Example/online_llama3_8b_cnn.yaml +++ b/examples/05_Llama3.1-8B_Example/online_llama3_8b_cnn.yaml @@ -41,13 +41,6 @@ settings: client: num_workers: 4 # Number of client workers -metrics: - collect: - - "throughput" - - "latency" - - "ttft" - - "tpot" - endpoint_config: endpoints: - "http://localhost:8000" diff --git a/examples/06_Llama2-70B_Example/online_llama2_70b_orca.yaml b/examples/06_Llama2-70B_Example/online_llama2_70b_orca.yaml index f54d6dc2..5a7f6ce5 100644 --- a/examples/06_Llama2-70B_Example/online_llama2_70b_orca.yaml +++ b/examples/06_Llama2-70B_Example/online_llama2_70b_orca.yaml @@ -34,13 +34,6 @@ settings: client: num_workers: 4 -metrics: - collect: - - "throughput" - - "latency" - - "ttft" - - "tpot" - endpoint_config: endpoints: - "http://localhost:8000" diff --git a/examples/08_Qwen3-VL-235B-A22B_Example/offline_qwen3_vl_235b_a22b_shopify.yaml b/examples/08_Qwen3-VL-235B-A22B_Example/offline_qwen3_vl_235b_a22b_shopify.yaml index 394209be..95445781 100644 --- a/examples/08_Qwen3-VL-235B-A22B_Example/offline_qwen3_vl_235b_a22b_shopify.yaml +++ b/examples/08_Qwen3-VL-235B-A22B_Example/offline_qwen3_vl_235b_a22b_shopify.yaml @@ -42,11 +42,6 @@ settings: # Increase timeout for slow worker startup (spawn, imports). Default 40s may be too short. worker_initialization_timeout: 120 -metrics: - collect: - - "throughput" - - "latency" - endpoint_config: endpoints: - "http://localhost:8000" diff --git a/examples/08_Qwen3-VL-235B-A22B_Example/online_qwen3_vl_235b_a22b_shopify.yaml b/examples/08_Qwen3-VL-235B-A22B_Example/online_qwen3_vl_235b_a22b_shopify.yaml index 44ddb2b9..db23f163 100644 --- a/examples/08_Qwen3-VL-235B-A22B_Example/online_qwen3_vl_235b_a22b_shopify.yaml +++ b/examples/08_Qwen3-VL-235B-A22B_Example/online_qwen3_vl_235b_a22b_shopify.yaml @@ -36,11 +36,6 @@ settings: # Increase timeout for slow worker startup (spawn, imports). Default 40s may be too short. worker_initialization_timeout: 120 -metrics: - collect: - - "latency" - - "ttft" - endpoint_config: endpoints: - "http://localhost:8000" diff --git a/src/inference_endpoint/config/schema.py b/src/inference_endpoint/config/schema.py index a8fb87ac..e7bfe19e 100644 --- a/src/inference_endpoint/config/schema.py +++ b/src/inference_endpoint/config/schema.py @@ -417,51 +417,6 @@ class OnlineSettings(Settings): pass -def _default_metrics() -> list[str]: - """ - TODO: PoC only, subject to change! - Default metrics to collect.""" - return ["throughput", "latency", "ttft", "tpot"] - - -class Metrics(BaseModel): - """Metrics collection configuration. - - Note: Currently uses string-based metric names for YAML simplicity. - Use get_metric_types() to convert to actual Metric type classes. - """ - - model_config = ConfigDict(extra="forbid", frozen=True) - - collect: list[str] = Field(default_factory=_default_metrics) - - def get_metric_types(self) -> list[type[metrics.Metric]]: - """Convert string metric names to Metric type classes. - - Returns: - List of Metric type classes corresponding to collect list - - Raises: - ValueError: If metric name is not recognized - """ - metric_map = { - "throughput": metrics.Throughput, - "latency": metrics.QueryLatency, - "ttft": metrics.TTFT, - "tpot": metrics.TPOT, - } - - result = [] - for name in self.collect: - if name not in metric_map: - raise ValueError( - f"Unknown metric name: {name}. Available: {list(metric_map.keys())}" - ) - result.append(metric_map[name]) - - return result - - class EndpointConfig(BaseModel): """Endpoint connection configuration. @@ -516,9 +471,6 @@ class BenchmarkConfig(WithUpdatesMixin, BaseModel): default_factory=list, description="Dataset configs" ) settings: Settings = Field(default_factory=Settings) - metrics: Annotated[Metrics, cyclopts.Parameter(show=False)] = Field( - default_factory=Metrics - ) endpoint_config: EndpointConfig report_dir: Annotated[ Path | None, diff --git a/src/inference_endpoint/config/templates/concurrency_template_full.yaml b/src/inference_endpoint/config/templates/concurrency_template_full.yaml index 1e18b3bf..48f1b34f 100644 --- a/src/inference_endpoint/config/templates/concurrency_template_full.yaml +++ b/src/inference_endpoint/config/templates/concurrency_template_full.yaml @@ -69,12 +69,6 @@ settings: max_idle_time: 4.0 # Discard connections idle longer than this (seconds) min_required_connections: -1 # Min connections to initialize (-1=auto, 0=disabled) worker_gc_mode: relaxed # Worker GC strategy | options: disabled, relaxed, system -metrics: - collect: - - throughput - - latency - - ttft - - tpot endpoint_config: endpoints: # Endpoint URL(s) - '' diff --git a/src/inference_endpoint/config/templates/eval_template.yaml b/src/inference_endpoint/config/templates/eval_template.yaml index 947c3447..3213efd1 100644 --- a/src/inference_endpoint/config/templates/eval_template.yaml +++ b/src/inference_endpoint/config/templates/eval_template.yaml @@ -26,10 +26,6 @@ settings: client: num_workers: 4 -metrics: - collect: - - "accuracy" - endpoint_config: endpoints: - "http://localhost:8000" diff --git a/src/inference_endpoint/config/templates/offline_template_full.yaml b/src/inference_endpoint/config/templates/offline_template_full.yaml index 29a661ed..7c5f43c6 100644 --- a/src/inference_endpoint/config/templates/offline_template_full.yaml +++ b/src/inference_endpoint/config/templates/offline_template_full.yaml @@ -69,12 +69,6 @@ settings: max_idle_time: 4.0 # Discard connections idle longer than this (seconds) min_required_connections: -1 # Min connections to initialize (-1=auto, 0=disabled) worker_gc_mode: relaxed # Worker GC strategy | options: disabled, relaxed, system -metrics: - collect: - - throughput - - latency - - ttft - - tpot endpoint_config: endpoints: # Endpoint URL(s) - '' diff --git a/src/inference_endpoint/config/templates/online_template_full.yaml b/src/inference_endpoint/config/templates/online_template_full.yaml index ad1a2423..6e274f8e 100644 --- a/src/inference_endpoint/config/templates/online_template_full.yaml +++ b/src/inference_endpoint/config/templates/online_template_full.yaml @@ -69,12 +69,6 @@ settings: max_idle_time: 4.0 # Discard connections idle longer than this (seconds) min_required_connections: -1 # Min connections to initialize (-1=auto, 0=disabled) worker_gc_mode: relaxed # Worker GC strategy | options: disabled, relaxed, system -metrics: - collect: - - throughput - - latency - - ttft - - tpot endpoint_config: endpoints: # Endpoint URL(s) - '' diff --git a/src/inference_endpoint/config/templates/submission_template.yaml b/src/inference_endpoint/config/templates/submission_template.yaml index df760a5f..793c047d 100644 --- a/src/inference_endpoint/config/templates/submission_template.yaml +++ b/src/inference_endpoint/config/templates/submission_template.yaml @@ -58,14 +58,6 @@ settings: client: num_workers: 4 -metrics: - collect: - - "throughput" - - "latency" - - "ttft" - - "tpot" - - "accuracy" - endpoint_config: endpoints: - "http://localhost:8000" diff --git a/tests/unit/config/test_schema.py b/tests/unit/config/test_schema.py index f6f33afa..1792e52c 100644 --- a/tests/unit/config/test_schema.py +++ b/tests/unit/config/test_schema.py @@ -22,7 +22,6 @@ Dataset, DatasetType, EvalMethod, - Metrics, ModelParams, OSLDistribution, OSLDistributionType, @@ -109,20 +108,6 @@ def test_auto_derive_name(self): assert ds.name == "my_data" -class TestMetrics: - @pytest.mark.unit - def test_get_metric_types(self): - m = Metrics(collect=["throughput", "latency", "ttft", "tpot"]) - types = m.get_metric_types() - assert len(types) == 4 - - @pytest.mark.unit - def test_unknown_metric_raises(self): - m = Metrics(collect=["nonexistent"]) - with pytest.raises(ValueError, match="Unknown metric"): - m.get_metric_types() - - class TestBenchmarkConfig: @pytest.mark.unit def test_minimal_offline(self): diff --git a/tests/unit/config/test_yaml_loader.py b/tests/unit/config/test_yaml_loader.py index 37190931..716055dd 100644 --- a/tests/unit/config/test_yaml_loader.py +++ b/tests/unit/config/test_yaml_loader.py @@ -57,10 +57,6 @@ def test_load_valid_yaml(self, tmp_path): recv_buffer_size: 16777216 send_buffer_size: 8388608 -metrics: - collect: - - "throughput" - endpoint_config: endpoints: - "http://localhost:8000"