
Commit 26daafe

Authored by viraatc, nvzhihanj, and Copilot
feat: add concurrency based scheduler (#17)
* rename: loadgen.py -> http_sample_issuer.py
* add concurrency scheduler
* add poisson gen test
* fix
* use scipy
* cleanup
* address comments
* update LOCAL_TESTING.md
* cleanup comments
* fix CI
* address comments
* address comments
* address comments
* update tests
* address comments, update concurrency test
* address comments, update docs
* Update src/inference_endpoint/endpoint_client/http_sample_issuer.py

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

---------

Co-authored-by: Zhihan Jiang <68881590+nvzhihanj@users.noreply.github.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
1 parent: b004cc5 · commit: 26daafe

19 files changed: 413 additions, 64 deletions


README.md

Lines changed: 2 additions & 1 deletion
```diff
@@ -46,11 +46,12 @@ inference-endpoint benchmark offline \
   --model Qwen/Qwen3-8B \
   --dataset tests/datasets/dummy_1k.pkl
 
-# Run online benchmark (sustained QPS - requires --target-qps)
+# Run online benchmark (sustained QPS - requires --target-qps, --load-pattern)
 inference-endpoint benchmark online \
   --endpoint http://your-endpoint:8000 \
   --model Qwen/Qwen3-8B \
   --dataset tests/datasets/dummy_1k.pkl \
+  --load-pattern poisson \
   --target-qps 100
 
 # With explicit sample count
```
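The `--load-pattern poisson` option added above schedules requests with exponentially distributed inter-arrival gaps, which is what makes the arrival stream a Poisson process at the target rate (the commit pins scipy, presumably for its generator tests). A minimal stdlib-only sketch of the idea; the function and parameter names here are illustrative, not the project's API:

```python
import random

def poisson_arrival_times(target_qps: float, num_samples: int, seed: int = 0) -> list[float]:
    """Cumulative issue times (seconds) for a Poisson arrival process.

    Inter-arrival gaps are drawn from an exponential distribution with
    mean 1 / target_qps, so the long-run request rate approaches target_qps.
    """
    rng = random.Random(seed)
    t = 0.0
    times = []
    for _ in range(num_samples):
        t += rng.expovariate(target_qps)  # mean gap = 1 / target_qps seconds
        times.append(t)
    return times

times = poisson_arrival_times(target_qps=100.0, num_samples=10_000)
print(f"average rate: {len(times) / times[-1]:.1f} req/s over {times[-1]:.1f}s")
```

With 10,000 samples at 100 QPS the empirical rate lands within a few percent of the target, which is the property a scheduler test would check.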

docs/CLI_QUICK_REFERENCE.md

Lines changed: 12 additions & 7 deletions
```diff
@@ -11,11 +11,12 @@ inference-endpoint benchmark offline \
   --model Qwen/Qwen3-8B \
   --dataset tests/datasets/dummy_1k.pkl
 
-# Online (sustained QPS - CLI mode - requires --target-qps)
+# Online (sustained QPS - CLI mode - requires --target-qps, --load-pattern)
 inference-endpoint benchmark online \
   --endpoint URL \
   --model Qwen/Qwen3-8B \
   --dataset tests/datasets/dummy_1k.pkl \
+  --load-pattern poisson \
   --target-qps 100
 
 # With detailed report generation
@@ -75,7 +76,7 @@ inference-endpoint info
 ## Benchmark Options (CLI Mode Only)
 
 - `--api-key KEY` - API authentication
-- `--target-qps N` - Target queries per second (required for online mode with poisson pattern)
+- `--target-qps N` - Target queries per second (required when --load-pattern=poisson)
 - `--duration SEC` - Test duration in seconds (default: 0 - run until dataset exhausted)
 - `--num-samples N` - Number of samples to issue (overrides dataset size and duration calculation)
 - `--streaming MODE` - Streaming control: `auto` (default), `on`, or `off`. Streaming will enable token streaming in response.
@@ -86,8 +87,8 @@ inference-endpoint info
 
 ## Online-Specific Options
 
-- `--load-pattern TYPE` - Load pattern: `poisson` (default), `concurrency`
-- `--concurrency N` - Max concurrent requests (required when using concurrency load pattern, default: -1 unlimited for other patterns)
+- `--load-pattern TYPE` - Load pattern (required): `poisson`, `concurrency`
+- `--concurrency N` - Max concurrent requests (required when --load-pattern=concurrency)
 
 ## Dataset Formats
 
@@ -134,11 +135,11 @@ inference-endpoint info
 - Sustains target QPS
 - Use with `benchmark online --target-qps N`
 
-**concurrency** - Online mode (fixed concurrency) - NOT YET IMPLEMENTED
+**concurrency** - Online mode (fixed concurrency)
 
 - Maintains N concurrent requests
 - QPS emerges from concurrency/latency
-- Will be available in future release
+- Use with `benchmark online --load-pattern concurrency --concurrency N`
 
 ## Examples
 
@@ -159,6 +160,7 @@ inference-endpoint benchmark online \
   --endpoint https://api.production.com \
   --model Qwen/Qwen3-8B \
   --dataset prod_queries.pkl \
+  --load-pattern poisson \
   --target-qps 100 \
   --num-samples 10000 \
   --workers 16 \
@@ -171,6 +173,7 @@ inference-endpoint benchmark online \
   --endpoint https://api.production.com \
   --model Qwen/Qwen3-8B \
   --dataset prod_queries.pkl \
+  --load-pattern poisson \
   --target-qps 100 \
   --duration 300 \
   --workers 16 \
@@ -274,7 +277,9 @@ endpoint_config:
 
 **Mode Requirements:**
 
-- Online mode requires `--target-qps` (poisson) or `--concurrency` (concurrency pattern)
+- Online mode requires `--load-pattern` (poisson or concurrency)
+- `--load-pattern poisson` requires `--target-qps`
+- `--load-pattern concurrency` requires `--concurrency`
 - Use `--mode both` for combined perf + accuracy runs
 - Streaming: auto (default) enables streaming responses for online, disables for offline
```
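The "QPS emerges from concurrency/latency" behavior documented for the concurrency pattern can be sketched with a fixed pool of workers, each keeping one request in flight. This is an illustrative model only, not the project's scheduler; `fake_request` stands in for a real HTTP call:

```python
import asyncio
import time

async def fake_request(latency_s: float) -> None:
    """Stand-in for an HTTP call with a fixed service time."""
    await asyncio.sleep(latency_s)

async def run_fixed_concurrency(concurrency: int, total: int, latency_s: float) -> float:
    """Issue `total` requests while keeping up to `concurrency` in flight.

    No target rate is set anywhere: the achieved QPS converges to
    concurrency / latency, which is the concurrency-pattern behavior.
    """
    queue: asyncio.Queue[int] = asyncio.Queue()
    for i in range(total):
        queue.put_nowait(i)

    async def worker() -> None:
        while True:
            try:
                queue.get_nowait()
            except asyncio.QueueEmpty:
                return  # no work left; this worker exits
            await fake_request(latency_s)

    start = time.perf_counter()
    await asyncio.gather(*(worker() for _ in range(concurrency)))
    return total / (time.perf_counter() - start)

qps = asyncio.run(run_fixed_concurrency(concurrency=8, total=80, latency_s=0.05))
print(f"achieved ~{qps:.0f} QPS (theoretical cap: 8 / 0.05 = 160)")
```

Raising either the concurrency or lowering the per-request latency raises the achieved QPS proportionally, with concurrency / latency as the ceiling.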

docs/LOCAL_TESTING.md

Lines changed: 13 additions & 1 deletion
````diff
@@ -113,6 +113,7 @@ inference-endpoint -v benchmark online \
   --endpoint http://localhost:8765 \
   --model Qwen/Qwen3-8B \
   --dataset tests/datasets/dummy_1k.pkl \
+  --load-pattern poisson \
   --target-qps 100 \
   --report-path online_benchmark_report
 ```
@@ -286,6 +287,7 @@ inference-endpoint benchmark online \
   --endpoint http://localhost:8765 \
   --model Qwen/Qwen3-8B \
   --dataset tests/datasets/dummy_1k.pkl \
+  --load-pattern poisson \
   --target-qps 500 \
   --report-path online_report
 
@@ -302,14 +304,24 @@ inference-endpoint benchmark offline \
   --model Qwen/Qwen3-8B \
   --dataset tests/datasets/dummy_1k.pkl \
   --streaming on
+
+# Concurrency mode (fixed concurrent requests)
+inference-endpoint benchmark online \
+  --endpoint http://localhost:8765 \
+  --model Qwen/Qwen3-8B \
+  --dataset tests/datasets/dummy_1k.pkl \
+  --load-pattern concurrency \
+  --concurrency 32
 ```
 
 ## Tips
 
 **Key Requirements:**
 
 - Model name is **required** for all benchmark and probe commands
-- Online mode requires `--target-qps` (poisson) or `--concurrency` (concurrency pattern)
+- Online mode requires `--load-pattern` to specify the scheduler type (poisson or concurrency)
+- `--load-pattern poisson` requires `--target-qps`
+- `--load-pattern concurrency` requires `--concurrency`
 - Set `HF_TOKEN` environment variable for non-public models (public models like Qwen/Qwen3-8B don't need it)
 
 **Sample Count Control:**
````

requirements/test.txt

Lines changed: 1 addition & 0 deletions
```diff
@@ -14,6 +14,7 @@ pytest-benchmark>=4.0.0
 coverage>=7.0.0
 line-profiler==5.0.0
 Pympler==1.1
+scipy==1.16.3
 
 # HTTP server and client for mock server fixture
 aiohttp>=3.8.0
```

src/inference_endpoint/cli.py

Lines changed: 19 additions & 13 deletions
```diff
@@ -97,7 +97,7 @@ def create_parser() -> argparse.ArgumentParser:
     online_parser = benchmark_subparsers.add_parser(
         "online",
         help="Online benchmark (sustained QPS)",
-        description="Online mode: Issues queries at target QPS using Poisson distribution.",
+        description="Online mode: Issues queries using specified load pattern (--load-pattern required).",
     )
     _add_shared_benchmark_args(online_parser)
     _add_online_specific_args(online_parser)
@@ -189,17 +189,7 @@ def _add_shared_benchmark_args(parser):
         "--dataset", "-d", type=Path, required=True, help="Dataset file"
     )
     parser.add_argument("--api-key", type=str, help="API key")
-    parser.add_argument(
-        "--target-qps",
-        type=float,
-        help="Target queries per second (required for online mode with poisson pattern)",
-    )
     parser.add_argument("--workers", type=int, help="HTTP workers (default: 4)")
-    parser.add_argument(
-        "--concurrency",
-        type=int,
-        help="Max concurrent requests (required when using concurrency load pattern, default: -1 unlimited for other patterns)",
-    )
     parser.add_argument(
         "--duration",
         type=int,
@@ -230,8 +220,13 @@ def _add_shared_benchmark_args(parser):
 def _add_online_specific_args(parser):
     """Add online-specific arguments.
 
+    These arguments are only available for online mode and will be rejected
+    by argparse if used with offline mode.
+
     Currently adds:
-    - load-pattern: Scheduler type (poisson, etc.)
+    - load-pattern: Scheduler type (poisson, concurrency, etc.) - REQUIRED
+    - target-qps: Target QPS for poisson pattern
+    - concurrency: Max concurrent requests for concurrency pattern
 
     Load pattern choices are dynamically derived from registered Scheduler
     implementations to maintain a single source of truth.
@@ -244,7 +239,18 @@ def _add_online_specific_args(parser):
     parser.add_argument(
         "--load-pattern",
         choices=available_patterns,
-        help=f"Load pattern (default: poisson, available: {', '.join(available_patterns)})",
+        required=True,
+        help=f"Load pattern (required, available: {', '.join(available_patterns)})",
+    )
+    parser.add_argument(
+        "--target-qps",
+        type=float,
+        help="Target queries per second (required when --load-pattern=poisson)",
+    )
+    parser.add_argument(
+        "--concurrency",
+        type=int,
+        help="Max concurrent requests (required when --load-pattern=concurrency)",
     )
 
```
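The docstring above notes that `--load-pattern` choices are derived from registered Scheduler implementations rather than a hard-coded list. A sketch of that single-source-of-truth registry pattern; the registry and class names here are hypothetical, not the project's actual symbols:

```python
import argparse

# Hypothetical registry: each scheduler registers under its pattern name,
# and argparse choices are derived from the registry's keys.
SCHEDULER_REGISTRY: dict[str, type] = {}

def register_scheduler(name: str):
    def deco(cls: type) -> type:
        SCHEDULER_REGISTRY[name] = cls
        return cls
    return deco

@register_scheduler("poisson")
class PoissonScheduler: ...

@register_scheduler("concurrency")
class ConcurrencyScheduler: ...

def add_online_args(parser: argparse.ArgumentParser) -> None:
    patterns = sorted(SCHEDULER_REGISTRY)  # choices come from the registry
    parser.add_argument(
        "--load-pattern",
        choices=patterns,
        required=True,
        help=f"Load pattern (required, available: {', '.join(patterns)})",
    )
    parser.add_argument("--target-qps", type=float)
    parser.add_argument("--concurrency", type=int)

parser = argparse.ArgumentParser()
add_online_args(parser)
args = parser.parse_args(["--load-pattern", "poisson", "--target-qps", "100"])
print(args.load_pattern, args.target_qps)
```

Registering a new scheduler class automatically extends the CLI's valid `--load-pattern` values and its help text, so the two can never drift apart.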

src/inference_endpoint/commands/benchmark.py

Lines changed: 15 additions & 10 deletions
```diff
@@ -58,7 +58,7 @@
     ZMQConfig,
 )
 from inference_endpoint.endpoint_client.http_client import HTTPEndpointClient
-from inference_endpoint.endpoint_client.loadgen import HttpClientSampleIssuer
+from inference_endpoint.endpoint_client.http_sample_issuer import HttpClientSampleIssuer
 from inference_endpoint.exceptions import (
     ExecutionError,
     InputValidationError,
@@ -248,15 +248,16 @@ def _build_config_from_cli(
         InputValidationError: If required params missing
     """
     # Determine load pattern (CLI override or mode default)
-    load_pattern_arg = getattr(args, "load_pattern", None)
-    if load_pattern_arg:
+    if load_pattern_arg := getattr(args, "load_pattern", None):
         load_pattern_type = LoadPatternType(load_pattern_arg)
     else:
-        load_pattern_type = (
-            LoadPatternType.MAX_THROUGHPUT
-            if benchmark_mode == "offline"
-            else LoadPatternType.POISSON
-        )
+        match benchmark_mode:
+            case "offline":
+                load_pattern_type = LoadPatternType.MAX_THROUGHPUT
+            case "online" if getattr(args, "concurrency", None):
+                load_pattern_type = LoadPatternType.CONCURRENCY
+            case "online":
+                load_pattern_type = LoadPatternType.POISSON
 
     # Build BenchmarkConfig from CLI params
     return BenchmarkConfig(
@@ -274,7 +275,8 @@ def _build_config_from_cli(
         settings=Settings(
             load_pattern=LoadPattern(
                 type=load_pattern_type,
-                target_qps=args.target_qps if args.target_qps else None,
+                target_qps=getattr(args, "target_qps", None),
+                target_concurrency=getattr(args, "concurrency", None),
             ),
             runtime=RuntimeConfig(
                 min_duration_ms=args.duration * 1000
@@ -289,7 +291,7 @@ def _build_config_from_cli(
             ),
             client=ClientSettings(
                 workers=args.workers if args.workers else 4,
-                max_concurrency=args.concurrency if args.concurrency else -1,
+                max_concurrency=-1,  # client uses unlimited concurrency by default
             ),
         ),
         model_params=ModelParams(
@@ -540,6 +542,9 @@ def _run_benchmark(
     max_concurrency = config.settings.client.max_concurrency
 
     logger.info(f"Connecting: {endpoint}")
+    logger.info(
+        f"Client config: workers={num_workers}, max_concurrency={max_concurrency if max_concurrency > 0 else 'unlimited'}"
+    )
 
     tmp_dir = tempfile.mkdtemp(prefix="inference_endpoint_")
```

src/inference_endpoint/config/rulesets/mlcommons/rules.py

Lines changed: 1 addition & 0 deletions
```diff
@@ -207,6 +207,7 @@ def apply_user_config(
             min_sample_count=min_sample_count,
             rng_sched=random.Random(self.scheduler_rng_seed),
             rng_sample_index=random.Random(self.sample_index_rng_seed),
+            load_pattern=None,  # not part of user config
             optimization_priority=opt_prio,
             model=model,
             rules=ruleset,
```

src/inference_endpoint/config/runtime_settings.py

Lines changed: 5 additions & 1 deletion
```diff
@@ -37,7 +37,7 @@
 
 if TYPE_CHECKING:
     from .ruleset_base import BenchmarkSuiteRuleset
-    from .schema import BenchmarkConfig
+    from .schema import BenchmarkConfig, LoadPattern
 
 
 @dataclass(frozen=True, slots=True)
@@ -81,6 +81,9 @@ class RuntimeSettings:
     rng_sample_index: random.Random
     """Random number generator for sample indexing"""
 
+    load_pattern: LoadPattern | None
+    """Load pattern configuration"""
+
     @classmethod
     def from_config(
         cls,
@@ -155,6 +158,7 @@ def _from_config_default(
         "min_sample_count": 1,
         "rng_sched": random.Random(runtime_cfg.scheduler_random_seed),
         "rng_sample_index": random.Random(runtime_cfg.dataloader_random_seed),
+        "load_pattern": load_pattern_cfg,
     }
 
     # Apply overrides
```

src/inference_endpoint/config/schema.py

Lines changed: 10 additions & 9 deletions
```diff
@@ -35,7 +35,7 @@ class LoadPatternType(str, Enum):
 
     MAX_THROUGHPUT = "max_throughput"  # Offline: all queries at t=0
     POISSON = "poisson"  # Online: fixed QPS with Poisson distribution
-    CONCURRENCY = "concurrency"  # Online: fixed concurrent requests (TODO)
+    CONCURRENCY = "concurrency"  # Online: fixed concurrent requests
     BURST = "burst"  # Burst pattern (TODO)
     STEP = "step"  # Step pattern (TODO)
 
@@ -217,14 +217,14 @@ class LoadPattern(BaseModel):
     Different patterns use target_qps differently:
     - max_throughput: target_qps used for calculating total queries (offline, optional with default)
     - poisson: target_qps sets scheduler rate (online, required - validated)
-    - concurrency: target_qps not used, concurrency limit dominates (TODO)
+    - concurrency: issue at fixed target_concurrency (online, required - validated)
     """
 
     type: LoadPatternType = LoadPatternType.MAX_THROUGHPUT
     target_qps: float | None = (
         None  # Target QPS - required for poisson pattern, optional otherwise
     )
-    target_concurrency: int | None = None  # For concurrency mode (TODO)
+    target_concurrency: int | None = None  # For concurrency mode, ignored otherwise
 
 
 class ClientSettings(BaseModel):
@@ -314,7 +314,8 @@ class BenchmarkConfig(BaseModel):
     version: str = "1.0"
     type: TestType
     submission_ref: SubmissionReference | None = None  # For SUBMISSION type configs
-    benchmark_mode: TestType | None = None  # For SUBMISSION: specify offline or online
+    # For SUBMISSION: specify offline or online
+    benchmark_mode: TestType | None = None
     model_params: ModelParams = Field(default_factory=ModelParams)
     datasets: list[Dataset]
     settings: Settings = Field(default_factory=Settings)
@@ -433,7 +434,7 @@ def validate_load_pattern(self, benchmark_mode: TestType) -> None:
         """
         load_pattern_type = self.settings.load_pattern.type
         target_qps = self.settings.load_pattern.target_qps
-        max_concurrency = self.settings.client.max_concurrency
+        target_concurrency = self.settings.load_pattern.target_concurrency
 
         if benchmark_mode == TestType.OFFLINE:
             if load_pattern_type != LoadPatternType.MAX_THROUGHPUT:
@@ -451,11 +452,11 @@ def validate_load_pattern(self, benchmark_mode: TestType) -> None:
                     "Specify target queries per second (e.g., target_qps: 100 in YAML or --target-qps 100 in CLI)"
                 )
             elif load_pattern_type == LoadPatternType.CONCURRENCY:
-                # Concurrency pattern requires max_concurrency > 0
-                if not max_concurrency or max_concurrency <= 0:
+                # Concurrency pattern requires target_concurrency > 0
+                if not target_concurrency or target_concurrency <= 0:
                     raise ValueError(
-                        "Concurrency load pattern requires max_concurrency > 0. "
-                        "Specify number of concurrent requests (e.g., max_concurrency: 10 in YAML or --concurrency 10 in CLI)"
+                        "Concurrency load pattern requires target_concurrency > 0. "
+                        "Specify number of concurrent requests (e.g., target_concurrency: 10 under load_pattern in YAML or --concurrency 10 in CLI)"
                     )
 
     def validate_client_settings(self) -> None:
```
