
Commit 4c785b2

nv-alicheng, claude, and gemini-code-assist[bot] authored
Disable warmup temporarily, bugfixes. (#288)
* chore: address PR review comments — fix tmpfs cleanup, use metric enums, fix templates
  - Fix premature tmpfs deletion in _write_scoring_artifacts (cleanup now owned solely by run_benchmark's finally block)
  - Replace hardcoded metric key strings in _setup_kv_reader with MetricCounterKey/MetricSeriesKey enum iteration
  - Fix config templates: replace placeholder URLs with http://localhost:8000, remove nonexistent record_worker_events field, fix YAML indentation
  - Replace internal hostname in gptoss_test.yaml with localhost

  Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

* Disable warmup by default until rules are defined

* Remove stray config file

* Add TPOT_NS to _STREAMING_ONLY metrics set

  Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>

* chore: address second round of PR #282 review comments
  - docs(report_design): add "reports reproducible from event log" principle
  - refactor(metrics_table): rename subtract_field -> delta_start_fieldname
  - docs(metrics_table): reword ISL token_ids/text comments so SGLang/OpenAI are examples, not defining conditions
  - test(kv_store): pin empty SeriesStats min/max sentinels; add snapshot isolation regression test
  - test(aggregator): add explanatory messages to tracking-window asserts
  - test(report_builder): pin max/std_dev on empty compute_summary
  - test(sample_order): parametrize over [3, 100, 10_000] dataset sizes
  - test(zmq_pool_transport): collapse two pool transport classes into one parametrized over (num_workers, create_publisher)

  Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

* docs: clarify _check_tokenizer_exists is a probe, consumers are examples

  Addresses PR #282 thread T33: reframe the docstring so MetricsAggregator and Harmony read as examples rather than a closed list of consumers, and state explicitly that this function never loads or downloads the tokenizer.

  Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
1 parent 3c07790 commit 4c785b2

17 files changed

Lines changed: 191 additions & 202 deletions

configs/gptoss_test.yaml

Lines changed: 0 additions & 66 deletions
This file was deleted.

docs/metrics/report_design.md

Lines changed: 13 additions & 0 deletions
```diff
@@ -38,6 +38,19 @@ this complexity is needed when the input is a `list[float]` from the KVStore.
 The entire rollup is a single function: `compute_summary(values) → dict`.
 It calls numpy for percentiles and histograms. No classes, no state.
 
+**Reports are reproducible from the event log.**
+
+The KVStore is lossy aggregation — it stores per-metric series, not per-sample
+provenance. The authoritative record of what happened during a run is the event
+log written by the `EventLoggerService`. Every number in a `Report` can be
+recomputed by replaying the event log through the same aggregator logic: if a
+production report shows a TTFT spike, the event log is the ground truth a user
+can mine to attribute the spike to specific samples or time windows.
+
+New metrics must preserve this property: the aggregator may only derive values
+from event fields, never from out-of-band state. If a metric cannot be rebuilt
+from the event log alone, it does not belong in the KVStore.
+
 ## Components
 
 ### `compute_summary(values, percentiles, n_histogram_buckets) → dict`
```
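The design doc describes `compute_summary` as a single stateless function that calls numpy for percentiles and histograms. A minimal sketch of such a rollup follows; the exact field names, defaults, and empty-series sentinels here are assumptions for illustration, not the project's actual implementation:

```python
import numpy as np

def compute_summary(values, percentiles=(50, 90, 99), n_histogram_buckets=10):
    """Single stateless rollup: a series of values in, a plain dict of stats out."""
    arr = np.asarray(list(values), dtype=float)
    if arr.size == 0:
        # Pin sentinels for the empty series so report code never sees None.
        return {"count": 0, "min": 0.0, "max": 0.0, "mean": 0.0, "std_dev": 0.0,
                "percentiles": {p: 0.0 for p in percentiles}, "histogram": None}
    counts, edges = np.histogram(arr, bins=n_histogram_buckets)
    return {
        "count": int(arr.size),
        "min": float(arr.min()),
        "max": float(arr.max()),
        "mean": float(arr.mean()),
        "std_dev": float(arr.std()),
        "percentiles": {p: float(np.percentile(arr, p)) for p in percentiles},
        "histogram": {"counts": counts.tolist(), "bin_edges": edges.tolist()},
    }
```

Pinning explicit sentinels on the empty input matches the commit's test work ("pin max/std_dev on empty compute_summary"), though the sentinel values chosen here are a guess.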

src/inference_endpoint/async_utils/services/metrics_aggregator/metrics_table.py

Lines changed: 17 additions & 12 deletions
```diff
@@ -150,18 +150,20 @@ def fire(
 
 
 class TimeDeltaTrigger(EmitTrigger):
-    """Sync trigger: emits ev_rec.timestamp_ns - pre_change[required_field].
+    """Sync trigger: emits ev_rec.timestamp_ns - pre_change[delta_start_fieldname].
 
-    Subclass only needs to set metric_name and the required field name.
-    Skips silently if the required field is None (event hasn't occurred yet).
+    The emitted metric is a time delta: the firing event marks the end of the
+    delta, and ``delta_start_fieldname`` names the SampleField whose timestamp
+    marks the start. Skips silently if the start field is None (the delta has
+    not yet opened for this sample).
     """
 
-    def __init__(self, metric_name: str, kv_store: KVStore, subtract_field: str):
-        super().__init__(metric_name, kv_store, requires=(subtract_field,))
-        self._subtract_field = subtract_field
+    def __init__(self, metric_name: str, kv_store: KVStore, delta_start_fieldname: str):
+        super().__init__(metric_name, kv_store, requires=(delta_start_fieldname,))
+        self._delta_start_fieldname = delta_start_fieldname
 
     def fire(self, ev_rec, row, pre_change):
-        baseline = pre_change.get(self._subtract_field)
+        baseline = pre_change.get(self._delta_start_fieldname)
         if baseline is not None:
             self.kv_store.update(self.metric_name, ev_rec.timestamp_ns - baseline)
         return None
@@ -235,7 +237,9 @@ class TtftTrigger(TimeDeltaTrigger):
 
     def __init__(self, kv_store: KVStore):
         super().__init__(
-            MetricSeriesKey.TTFT_NS, kv_store, subtract_field=SampleField.ISSUED_NS
+            MetricSeriesKey.TTFT_NS,
+            kv_store,
+            delta_start_fieldname=SampleField.ISSUED_NS,
         )
 
 
@@ -249,7 +253,7 @@ def __init__(self, kv_store: KVStore):
         super().__init__(
             MetricSeriesKey.CHUNK_DELTA_NS,
             kv_store,
-            subtract_field=SampleField.LAST_RECV_NS,
+            delta_start_fieldname=SampleField.LAST_RECV_NS,
         )
 
 
@@ -260,7 +264,7 @@ def __init__(self, kv_store: KVStore):
         super().__init__(
             MetricSeriesKey.SAMPLE_LATENCY_NS,
             kv_store,
-            subtract_field=SampleField.ISSUED_NS,
+            delta_start_fieldname=SampleField.ISSUED_NS,
        )
 
 
@@ -281,11 +285,12 @@ def __init__(
         super().__init__(MetricSeriesKey.ISL, kv_store, tokenize_pool, loop)
 
     def fire(self, ev_rec, row, pre_change):
-        # Sync fast path: pre-tokenized IDs (SGLang)
+        # Sync fast path: any backend that pre-populates token_ids (e.g. SGLang).
         if isinstance(ev_rec.data, PromptData) and ev_rec.data.token_ids is not None:
             self.kv_store.update(self.metric_name, len(ev_rec.data.token_ids))
             return None
-        # Async path: tokenize raw text (OpenAI) — handled by base class
+        # Async path: tokenize raw text — used when token_ids are unavailable
+        # (e.g. OpenAI-compatible endpoints). Handled by the base class.
         return super().fire(ev_rec, row, pre_change)
 
     def _extract_text(self, ev_rec, row, pre_change):
```
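The renamed `delta_start_fieldname` contract can be shown with a stripped-down, standalone sketch. The trigger logic mirrors the diff; the `KVStore` and event record here are toy stand-ins (the real `EmitTrigger` base class and `SampleField` enum are omitted):

```python
from dataclasses import dataclass, field

@dataclass
class KVStore:
    """Toy stand-in: collects updated values per metric name."""
    series: dict = field(default_factory=dict)

    def update(self, metric_name, value):
        self.series.setdefault(metric_name, []).append(value)

@dataclass
class EvRec:
    """Toy event record: only the timestamp matters to this trigger."""
    timestamp_ns: int

class TimeDeltaTrigger:
    """Emits ev_rec.timestamp_ns - pre_change[delta_start_fieldname].

    The firing event marks the end of the delta; delta_start_fieldname names
    the field whose timestamp marks the start. Skips silently if the start
    field is None (the delta has not yet opened for this sample).
    """

    def __init__(self, metric_name, kv_store, delta_start_fieldname):
        self.metric_name = metric_name
        self.kv_store = kv_store
        self._delta_start_fieldname = delta_start_fieldname

    def fire(self, ev_rec, row, pre_change):
        baseline = pre_change.get(self._delta_start_fieldname)
        if baseline is not None:
            self.kv_store.update(self.metric_name, ev_rec.timestamp_ns - baseline)
        return None

kv = KVStore()
trigger = TimeDeltaTrigger("ttft_ns", kv, "issued_ns")
trigger.fire(EvRec(timestamp_ns=5_000), row=None, pre_change={"issued_ns": 2_000})  # records 3000
trigger.fire(EvRec(timestamp_ns=6_000), row=None, pre_change={"issued_ns": None})   # skipped
```

The rename makes the direction of the subtraction explicit at every subclass call site, which is exactly what the `TtftTrigger`/`ChunkDeltaTrigger`/`SampleLatencyTrigger` changes above read like.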

src/inference_endpoint/commands/benchmark/execute.py

Lines changed: 26 additions & 24 deletions
```diff
@@ -48,9 +48,15 @@
     ServiceConfig,
     ServiceLauncher,
 )
+from inference_endpoint.async_utils.services.metrics_aggregator.aggregator import (
+    MetricCounterKey,
+)
 from inference_endpoint.async_utils.services.metrics_aggregator.kv_store import (
     BasicKVStoreReader,
 )
+from inference_endpoint.async_utils.services.metrics_aggregator.metrics_table import (
+    MetricSeriesKey,
+)
 from inference_endpoint.async_utils.transport.zmq.context import ManagedZMQContext
 from inference_endpoint.config.runtime_settings import RuntimeSettings
 from inference_endpoint.config.schema import (
@@ -176,8 +182,11 @@ def _check_tokenizer_exists(model_name: str) -> bool:
     """Check if a HuggingFace tokenizer exists for the model (API only, no download).
 
     Returns True if the model repo exists and has tokenizer files, False otherwise.
-    The actual tokenizer is loaded later by the MetricsAggregator subprocess and
-    by Harmony transforms (each loads their own instance as needed).
+    This function is a probe — it never loads or downloads the tokenizer itself.
+    Downstream consumers that need tokenization (e.g. the MetricsAggregator
+    subprocess for ISL/OSL/TPOT, Harmony transforms for prompt preprocessing,
+    and any future plugin with its own tokenization need) each load their own
+    instance as required.
     """
     try:
         info = model_info(model_name)
```
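The probe contract (check repo metadata for tokenizer files, never download anything) can be sketched by factoring the filename check out as a pure function. `model_info` and the `siblings`/`rfilename` fields are the real `huggingface_hub` API; the helper names and the set of tokenizer filenames below are illustrative assumptions:

```python
TOKENIZER_FILES = {
    # Common HF tokenizer artifacts; an illustrative set, not an exhaustive one.
    "tokenizer.json",
    "tokenizer_config.json",
    "tokenizer.model",
    "vocab.json",
}

def has_tokenizer_files(repo_filenames):
    """Pure check behind the probe: does the repo listing contain a tokenizer file?"""
    return any(name in TOKENIZER_FILES for name in repo_filenames)

def check_tokenizer_exists(model_name):
    """Metadata-only probe (sketch): lists repo files via the HF API, downloads nothing."""
    try:
        from huggingface_hub import model_info  # network call: repo metadata only
        info = model_info(model_name)
        return has_tokenizer_files(s.rfilename for s in info.siblings)
    except Exception:
        # Missing repo, gated model, or network failure all read as "no tokenizer".
        return False
```

Keeping the filename check separate from the network call makes the probe testable without hitting the Hub, which fits the docstring's promise that the function itself never loads a tokenizer.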
```diff
@@ -374,24 +383,19 @@ def _setup_kv_reader(
 ) -> BasicKVStoreReader:
     """Create a KVStoreReader pre-registered with all metric keys."""
     reader = BasicKVStoreReader(metrics_dir)
-    # Counter keys (from MetricCounterKey enum)
-    for key in [
-        "total_samples_issued",
-        "total_samples_completed",
-        "total_samples_failed",
-        "tracked_samples_issued",
-        "tracked_samples_completed",
-        "tracked_duration_ns",
-        "total_duration_ns",
-    ]:
-        reader.register_key(key, "counter")
-    # Series keys (from MetricSeriesKey enum)
-    for key in ["isl", "osl", "sample_latency_ns"]:
-        reader.register_key(key, "series")
-    reader.register_key("tpot_ns", "series", dtype=float)
-    if streaming:
-        for key in ["ttft_ns", "chunk_delta_ns"]:
-            reader.register_key(key, "series")
+    for counter_key in MetricCounterKey:
+        reader.register_key(counter_key.value, "counter")
+    _STREAMING_ONLY = {
+        MetricSeriesKey.TTFT_NS,
+        MetricSeriesKey.CHUNK_DELTA_NS,
+        MetricSeriesKey.TPOT_NS,
+    }
+    _FLOAT_SERIES = {MetricSeriesKey.TPOT_NS}
+    for series_key in MetricSeriesKey:
+        if series_key in _STREAMING_ONLY and not streaming:
+            continue
+        dtype = float if series_key in _FLOAT_SERIES else int
+        reader.register_key(series_key.value, "series", dtype=dtype)
     return reader
 
 
```
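The enum-iteration refactor above replaces hardcoded key strings with a single source of truth. A self-contained sketch of the same pattern, with a toy enum and a stand-in reader (the real `MetricSeriesKey` and `BasicKVStoreReader` live in the modules imported by the diff):

```python
from enum import Enum

class MetricSeriesKey(str, Enum):
    """Toy subset standing in for the real enum; member names follow the diff."""
    ISL = "isl"
    OSL = "osl"
    SAMPLE_LATENCY_NS = "sample_latency_ns"
    TTFT_NS = "ttft_ns"
    CHUNK_DELTA_NS = "chunk_delta_ns"
    TPOT_NS = "tpot_ns"

class FakeReader:
    """Stand-in for BasicKVStoreReader: just records (kind, dtype) per key."""
    def __init__(self):
        self.registered = {}

    def register_key(self, key, kind, dtype=int):
        self.registered[key] = (kind, dtype)

# Mirrors the diff: streaming-only and float-valued series are named sets, so a
# new enum member is registered everywhere automatically instead of being
# re-listed in string form at each call site.
_STREAMING_ONLY = {MetricSeriesKey.TTFT_NS, MetricSeriesKey.CHUNK_DELTA_NS,
                   MetricSeriesKey.TPOT_NS}
_FLOAT_SERIES = {MetricSeriesKey.TPOT_NS}

def register_series_keys(reader, streaming):
    for series_key in MetricSeriesKey:
        if series_key in _STREAMING_ONLY and not streaming:
            continue  # TTFT/chunk-delta/TPOT are undefined for non-streaming runs
        dtype = float if series_key in _FLOAT_SERIES else int
        reader.register_key(series_key.value, "series", dtype=dtype)
    return reader
```

Adding `TPOT_NS` to `_STREAMING_ONLY` (one of the commit's bullet points) is then a one-line change that both the registration loop and any gating logic pick up together.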
```diff
@@ -591,12 +595,10 @@ def _write_scoring_artifacts(
         f.write(msgspec.json.format(msgspec.json.encode(sample_idx_map), indent=2))
     logger.debug(f"Wrote {map_path}")
 
-    # Copy events.jsonl from tmpfs to report_dir
+    # Copy events.jsonl from tmpfs to report_dir.
+    # Tmpfs cleanup is handled by run_benchmark()'s finally block.
     _salvage_tmpfs(ctx.report_dir, tmpfs_dir)
 
-    # Clean up tmpfs
-    shutil.rmtree(tmpfs_dir, ignore_errors=True)
-
 
 def _salvage_tmpfs(report_dir: Path, tmpfs_dir: Path) -> None:
     """Copy all salvageable artifacts from tmpfs to report_dir.
```
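The bug fixed here is double ownership: a helper deleted tmpfs that the caller's `finally` block also owned, so artifacts could vanish before salvage on the error path. A minimal sketch of the single-owner pattern, with simplified function bodies standing in for the real `run_benchmark`/`_write_scoring_artifacts`:

```python
import shutil
import tempfile
from pathlib import Path

def _write_scoring_artifacts(report_dir: Path, tmpfs_dir: Path) -> None:
    """Copies artifacts out of tmpfs; deliberately does NOT delete tmpfs."""
    report_dir.mkdir(parents=True, exist_ok=True)
    for artifact in tmpfs_dir.glob("*.jsonl"):
        shutil.copy2(artifact, report_dir / artifact.name)

def run_benchmark(report_dir: Path) -> Path:
    """Sole owner of tmpfs teardown: cleanup lives in this finally block only."""
    tmpfs_dir = Path(tempfile.mkdtemp(prefix="bench_tmpfs_"))
    try:
        # Stand-in for the real run: the event logger writes into tmpfs.
        (tmpfs_dir / "events.jsonl").write_text('{"event": "sample_issued"}\n')
        _write_scoring_artifacts(report_dir, tmpfs_dir)
    finally:
        # Runs even if the benchmark raised, so tmpfs never leaks and is
        # never deleted out from under a helper mid-run.
        shutil.rmtree(tmpfs_dir, ignore_errors=True)
    return tmpfs_dir  # returned only so callers can observe that it is gone
```

With one owner, helpers stay idempotent copy operations and the lifetime of the scratch directory is readable from a single function.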

src/inference_endpoint/config/templates/concurrency_template.yaml

Lines changed: 1 addition & 1 deletion
```diff
@@ -18,4 +18,4 @@ settings:
   target_concurrency: 32  # Concurrent requests
 endpoint_config:
   endpoints:  # Endpoint URL(s)
-    - '<ENDPOINT_URL eg: http://localhost:8000>'
+    - 'http://localhost:8000'
```

src/inference_endpoint/config/templates/concurrency_template_full.yaml

Lines changed: 1 addition & 2 deletions
```diff
@@ -49,7 +49,6 @@ settings:
   target_concurrency: 32  # Concurrent requests
   client:
     num_workers: -1  # Worker processes (-1=auto)
-    record_worker_events: false  # Record per-worker events
     log_level: INFO  # Worker log level
     warmup_connections: -1  # Pre-establish TCP connections (-1=auto, 0=disabled)
     max_connections: -1  # Max TCP connections (-1=unlimited)
@@ -77,7 +76,7 @@ metrics:
   - tpot
 endpoint_config:
   endpoints:  # Endpoint URL(s)
-    - '<ENDPOINT_URL eg: http://localhost:8000>'
+    - 'http://localhost:8000'
   api_key: null  # API key
   api_type: openai  # API type: openai or sglang | options: openai, sglang
 report_dir: null  # Report output directory
```

src/inference_endpoint/config/templates/offline_template.yaml

Lines changed: 1 addition & 1 deletion
```diff
@@ -15,4 +15,4 @@ settings:
   n_samples_to_issue: null  # Sample count override
 endpoint_config:
   endpoints:  # Endpoint URL(s)
-    - '<ENDPOINT_URL eg: http://localhost:8000>'
+    - 'http://localhost:8000'
```

src/inference_endpoint/config/templates/offline_template_full.yaml

Lines changed: 1 addition & 2 deletions
```diff
@@ -49,7 +49,6 @@ settings:
   target_concurrency: null  # Concurrent requests
   client:
     num_workers: -1  # Worker processes (-1=auto)
-    record_worker_events: false  # Record per-worker events
     log_level: INFO  # Worker log level
     warmup_connections: -1  # Pre-establish TCP connections (-1=auto, 0=disabled)
     max_connections: -1  # Max TCP connections (-1=unlimited)
@@ -77,7 +76,7 @@ metrics:
   - tpot
 endpoint_config:
   endpoints:  # Endpoint URL(s)
-    - '<ENDPOINT_URL eg: http://localhost:8000>'
+    - 'http://localhost:8000'
   api_key: null  # API key
   api_type: openai  # API type: openai or sglang | options: openai, sglang
 report_dir: null  # Report output directory
```

src/inference_endpoint/config/templates/online_template.yaml

Lines changed: 1 addition & 1 deletion
```diff
@@ -18,4 +18,4 @@ settings:
   target_qps: 10.0  # Target QPS
 endpoint_config:
   endpoints:  # Endpoint URL(s)
-    - '<ENDPOINT_URL eg: http://localhost:8000>'
+    - 'http://localhost:8000'
```

src/inference_endpoint/config/templates/online_template_full.yaml

Lines changed: 1 addition & 2 deletions
```diff
@@ -49,7 +49,6 @@ settings:
   target_concurrency: null  # Concurrent requests
   client:
     num_workers: -1  # Worker processes (-1=auto)
-    record_worker_events: false  # Record per-worker events
     log_level: INFO  # Worker log level
     warmup_connections: -1  # Pre-establish TCP connections (-1=auto, 0=disabled)
     max_connections: -1  # Max TCP connections (-1=unlimited)
@@ -77,7 +76,7 @@ metrics:
   - tpot
 endpoint_config:
   endpoints:  # Endpoint URL(s)
-    - '<ENDPOINT_URL eg: http://localhost:8000>'
+    - 'http://localhost:8000'
   api_key: null  # API key
   api_type: openai  # API type: openai or sglang | options: openai, sglang
 report_dir: null  # Report output directory
```

0 commit comments