mlcommons
diff --git a/examples/02_ServerBenchmarking/offline_llama3_8b_cnn.yaml b/examples/02_ServerBenchmarking/offline_llama3_8b_cnn.yaml
Lines changed: 0 additions & 1 deletion
diff --git a/examples/02_ServerBenchmarking/online_llama2_70b_cnn.yaml b/examples/02_ServerBenchmarking/online_llama2_70b_cnn.yaml
Lines changed: 0 additions & 1 deletion
diff --git a/src/inference_endpoint/cli.py b/src/inference_endpoint/cli.py
Lines changed: 1 addition & 1 deletion
diff --git a/src/inference_endpoint/commands/benchmark.py b/src/inference_endpoint/commands/benchmark.py
Lines changed: 37 additions & 24 deletions
diff --git a/src/inference_endpoint/config/schema.py b/src/inference_endpoint/config/schema.py
Lines changed: 7 additions & 26 deletions
diff --git a/src/inference_endpoint/config/templates/concurrency_template.yaml b/src/inference_endpoint/config/templates/concurrency_template.yaml
Lines changed: 0 additions & 1 deletion
diff --git a/src/inference_endpoint/config/templates/eval_template.yaml b/src/inference_endpoint/config/templates/eval_template.yaml
Lines changed: 0 additions & 1 deletion
diff --git a/src/inference_endpoint/config/templates/offline_template.yaml b/src/inference_endpoint/config/templates/offline_template.yaml
Lines changed: 0 additions & 1 deletion
diff --git a/src/inference_endpoint/config/templates/online_template.yaml b/src/inference_endpoint/config/templates/online_template.yaml
Lines changed: 0 additions & 1 deletion
diff --git a/src/inference_endpoint/config/templates/submission_template.yaml b/src/inference_endpoint/config/templates/submission_template.yaml
Lines changed: 0 additions & 1 deletion
@@ -29,7 +29,6 @@ settings:
 
   client:
     workers: 4
-    max_concurrency: -1 # -1 = unlimited
 
 metrics:
   collect:
 
@@ -30,7 +30,6 @@ settings:
 
   client:
     workers: 4
-    max_concurrency: -1 # -1 = unlimited
 
 metrics:
   collect:
 
@@ -213,7 +213,7 @@ def _add_shared_benchmark_args(parser):
     parser.add_argument("--min-output-tokens", type=int, help="Min output tokens")
     parser.add_argument("--max-output-tokens", type=int, help="Max output tokens")
     parser.add_argument(
-        "--report-path", type=Path, help="Path to save detailed benchmark report"
+        "--report-dir", type=Path, help="Path to save detailed benchmark report"
     )
 
 
 
@@ -235,7 +235,7 @@ async def run_benchmark_command(args: argparse.Namespace) -> None:
     collect_responses = test_mode in [TestMode.ACC, TestMode.BOTH]
 
     # Run benchmark
-    _run_benchmark(args, effective_config, collect_responses, test_mode, benchmark_mode)
+    _run_benchmark(effective_config, collect_responses, test_mode, benchmark_mode)
 
 
 def _build_config_from_cli(
@@ -264,7 +264,10 @@ def _build_config_from_cli(
                 load_pattern_type = LoadPatternType.CONCURRENCY
             case "online":
                 load_pattern_type = LoadPatternType.POISSON
-
+    report_dir = getattr(args, "report_dir", None)
+    timeout = getattr(args, "timeout", None)
+    verbose = getattr(args, "verbose", False)
+    output = getattr(args, "output", None)
     # Build BenchmarkConfig from CLI params
     return BenchmarkConfig(
         name=f"cli_{benchmark_mode}",
@@ -315,6 +318,10 @@ def _build_config_from_cli(
         endpoint_config=EndpointConfig(endpoint=args.endpoint, api_key=args.api_key),
         metrics=Metrics(),
         baseline=None,  # CLI mode doesn't use baseline
+        report_dir=report_dir,
+        output=output,
+        timeout=timeout,
+        verbose=verbose,
     )
 
 
@@ -391,7 +398,6 @@ def _get_dataset_format(config: BenchmarkConfig, dataset_path: Path) -> str:
 
 
 def _run_benchmark(
-    args: argparse.Namespace,
     config: BenchmarkConfig,
     collect_responses: bool,
     test_mode: TestMode,
@@ -440,12 +446,19 @@ def _run_benchmark(
     # Load tokenizer if model name is provided
     # Priority: CLI args (offline/online modes) > config submission_ref (from-config mode)
     tokenizer = None
-    model_name = getattr(args, "model", None)
+    model_name = config.model_params.name
     if not model_name and config.submission_ref:
         model_name = config.submission_ref.model
     if not model_name and config.model_params.name:
         model_name = config.model_params.name
 
+    if config.report_dir:
+        report_dir = Path(config.report_dir)
+        report_dir.mkdir(parents=True, exist_ok=True)
+        config.to_yaml_file(report_dir / "config.yaml")
+
+    max_tokens = config.model_params.max_new_tokens
+
     if model_name:
         try:
             logger.info(f"Loading tokenizer for model: {model_name}")
@@ -460,18 +473,14 @@ def _run_benchmark(
         # Throw exception if no model name is provided
         raise InputValidationError("No model name provided")
 
-    # Get report path if specified
-    report_path = getattr(args, "report_path", None)
-    if report_path:
-        logger.info(f"Report will be saved to: {report_path}")
-
     # Get dataset - from CLI or from config
     # TODO: Dataset Logic is not yet fully implemented
-    dataset_path = _get_dataset_path(args, config)
+    # dataset_path = _get_dataset_path(args, config)
+    dataset_path = config.datasets[0].path
 
     # Load dataset using factory
     dataset_format = _get_dataset_format(config, dataset_path)
-    logger.info(f"Loading: {dataset_path.name} (format: {dataset_format})")
+    logger.info(f"Loading: {dataset_path} (format: {dataset_format})")
 
     # Determine if streaming should be enabled based on config
     streaming_mode = config.model_params.streaming
@@ -500,10 +509,17 @@ def _run_benchmark(
             dataset_path,
             format=dataset_format,
             key_maps=key_maps,
-            metadata={"model": model_name, "stream": enable_streaming},
+            metadata={
+                "model": model_name,
+                "stream": enable_streaming,
+                "max_completion_tokens": max_tokens,
+            },
         )
         dataloader.load()
         logger.info(f"Loaded {dataloader.num_samples()} samples")
+    except FileNotFoundError as e:
+        logger.error(f"Dataset file not found: {dataset_path}")
+        raise InputValidationError(f"Dataset file not found: {dataset_path}") from e
     except NotImplementedError as e:
         logger.error(f"Dataset format not supported: {dataset_format}")
         raise SetupError(str(e)) from e
@@ -550,20 +566,17 @@ def _run_benchmark(
     # Create endpoint client
     endpoint = config.endpoint_config.endpoint
     num_workers = config.settings.client.workers
-    max_concurrency = config.settings.client.max_concurrency
 
     logger.info(f"Connecting: {endpoint}")
-    logger.info(
-        f"Client config: workers={num_workers}, max_concurrency={max_concurrency if max_concurrency > 0 else 'unlimited'}"
-    )
+    logger.info(f"Client config: workers={num_workers}")
 
     tmp_dir = tempfile.mkdtemp(prefix="inference_endpoint_")
 
     try:
         http_config = HTTPClientConfig(
             endpoint_url=urljoin(endpoint, "/v1/chat/completions"),
             num_workers=num_workers,
-            max_concurrency=max_concurrency,
+            max_concurrency=-1,  # unlimited
         )
         aiohttp_config = AioHttpConfig()
         zmq_config = ZMQConfig(
@@ -595,9 +608,9 @@ def _run_benchmark(
             scheduler,
             name="cli_benchmark",
             stop_sample_issuer_on_test_end=False,
-            report_path=report_path,
+            report_dir=config.report_dir,
             tokenizer_override=tokenizer,
-            max_shutdown_timeout_s=args.timeout if args.timeout else None,
+            max_shutdown_timeout_s=config.timeout if config.timeout else None,
         )
 
         # Wait for test end with ability to interrupt
@@ -629,14 +642,14 @@ def signal_handler(signum, frame):
 
         if response_collector.errors:
             logger.warning(f"Errors: {len(response_collector.errors)}")
-            if args.verbose:
+            if config.verbose:
                 for error in response_collector.errors[:3]:
                     logger.warning(f"  {error}")
                 if len(response_collector.errors) > 3:
                     logger.warning(f"  ... +{len(response_collector.errors) - 3} more")
 
         # Save results if requested
-        if hasattr(args, "output") and args.output:
+        if config.output:
             try:
                 results = {
                     "config": {
@@ -660,9 +673,9 @@ def signal_handler(signum, frame):
                 if response_collector.errors:
                     results["errors"] = response_collector.errors
 
-                with open(args.output, "w") as f:
+                with open(config.output, "w") as f:
                     json.dump(results, f, indent=2)
-                logger.info(f"Saved: {args.output}")
+                logger.info(f"Saved: {config.output}")
             except Exception as e:
                 logger.error(f"Save failed: {e}")
 
@@ -685,5 +698,5 @@ def signal_handler(signum, frame):
             http_client.shutdown()
             shutil.rmtree(tmp_dir, ignore_errors=True)
         except Exception as e:
-            if args.verbose:
+            if config.verbose:
                 logger.warning(f"Cleanup error: {e}")
@@ -230,14 +230,12 @@ class LoadPattern(BaseModel):
 class ClientSettings(BaseModel):
     """HTTP client configuration.
 
-    Only workers and max_concurrency are required to configure the client.
+    Only workers are required to configure the client.
     Timeout is handled by the HTTP client internally.
 
-    Note: max_concurrency = -1 means unlimited (no semaphore limit).
     """
 
     workers: int = 4
-    max_concurrency: int = -1  # -1 = unlimited (default for CLI and YAML)
 
 
 class Settings(BaseModel):
@@ -321,6 +319,10 @@ class BenchmarkConfig(BaseModel):
     settings: Settings = Field(default_factory=Settings)
     metrics: Metrics = Field(default_factory=Metrics)
     endpoint_config: EndpointConfig = Field(default_factory=EndpointConfig)
+    output: Path | None = None
+    report_dir: Path | None = None
+    timeout: int | None = None
+    verbose: bool = False
 
     @classmethod
     def from_yaml_file(cls, path: Path) -> BenchmarkConfig:
@@ -470,27 +472,6 @@ def validate_client_settings(self) -> None:
                 f"workers must be >= 1, got {self.settings.client.workers}"
             )
 
-        # max_concurrency: -1 means unlimited, otherwise must be >= 1
-        if (
-            self.settings.client.max_concurrency < -1
-            or self.settings.client.max_concurrency == 0
-        ):
-            raise ValueError(
-                f"max_concurrency must be -1 (unlimited) or >= 1, got {self.settings.client.max_concurrency}"
-            )
-
-        # Ensure max_concurrency can handle target_concurrency if set
-        target_concurrency = self.settings.load_pattern.target_concurrency
-        max_concurrency = self.settings.client.max_concurrency
-
-        if (
-            target_concurrency is not None and max_concurrency > 0
-        ):  # Skip check if unlimited (-1)
-            if max_concurrency < target_concurrency:
-                raise ValueError(
-                    f"max_concurrency ({max_concurrency}) must be >= target_concurrency ({target_concurrency})"
-                )
-
     def validate_runtime_settings(self) -> None:
         """Validate runtime settings are reasonable.
 
@@ -579,7 +560,7 @@ def create_default_config(cls, test_type: TestType) -> BenchmarkConfig:
                         scheduler_random_seed=42,
                         dataloader_random_seed=42,
                     ),
-                    client=ClientSettings(workers=4, max_concurrency=-1),
+                    client=ClientSettings(workers=4),
                 ),
                 model_params=ModelParams(temperature=0.7, max_new_tokens=1024),
                 metrics=Metrics(),
@@ -601,7 +582,7 @@ def create_default_config(cls, test_type: TestType) -> BenchmarkConfig:
                         scheduler_random_seed=42,
                         dataloader_random_seed=42,
                     ),
-                    client=ClientSettings(workers=4, max_concurrency=-1),
+                    client=ClientSettings(workers=4),
                 ),
                 model_params=ModelParams(temperature=0.7, max_new_tokens=1024),
                 metrics=Metrics(),
 
@@ -31,7 +31,6 @@ settings:
 
   client:
     workers: 4
-    max_concurrency: -1 # -1 = unlimited # Should exceed/match target_concurrency
 
 metrics:
   collect:
 
@@ -25,7 +25,6 @@ settings:
 
   client:
     workers: 4
-    max_concurrency: -1 # -1 = unlimited
 
 metrics:
   collect:
 
@@ -30,7 +30,6 @@ settings:
 
   client:
     workers: 4
-    max_concurrency: -1 # -1 = unlimited
 
 metrics:
   collect:
 
@@ -30,7 +30,6 @@ settings:
 
   client:
     workers: 4
-    max_concurrency: -1 # -1 = unlimited
 
 metrics:
   collect:
 
@@ -56,7 +56,6 @@ settings:
 
   client:
     workers: 4
-    max_concurrency: -1 # -1 = unlimited
 
 metrics:
   collect:
Original file line number	Diff line number	Diff line change
`@@ -213,7 +213,7 @@ def _add_shared_benchmark_args(parser):`
`213`	`213`	`parser.add_argument("--min-output-tokens", type=int, help="Min output tokens")`
`214`	`214`	`parser.add_argument("--max-output-tokens", type=int, help="Max output tokens")`
`215`	`215`	`parser.add_argument(`
`216`		`- "--report-path", type=Path, help="Path to save detailed benchmark report"`
	`216`	`+ "--report-dir", type=Path, help="Path to save detailed benchmark report"`
`217`	`217`	`)`
`218`	`218`
`219`	`219`