Adds several QoL features for user convenience (#33)

nv-alicheng · web-flow · commit ebfae71be02c · 2025-11-24T15:00:33.000-08:00
* Add timestamp to outputs.jsonl, add a flag to optionally dump the events database to a CSV file, and pad the sample count up to a multiple of the dataset size

* Make dataset alignment optional, update tests

* Use approximate datetime strings in outputs.jsonl for human-readability

* Apply minor fixes suggested by Gemini and add condition guards
diff --git a/examples/01_LocalBenchmark/run_tinyllm.py b/examples/01_LocalBenchmark/run_tinyllm.py
@@ -184,6 +184,11 @@ def issue(self, sample):
         action="store_true",
         help="Enable streaming mode for TTFT metrics",
     )
+    parser.add_argument(
+        "--dump-events-csv",
+        action="store_true",
+        help="Dump the events to a CSV file",
+    )
     args = parser.parse_args()
 
     # Set up progress bar hook to monitor sample completion
@@ -238,6 +243,7 @@ def issue(self, sample):
             stop_sample_issuer_on_test_end=False,
             report_path="tinyllm_benchmark_report",
             tokenizer_override=model_runner.tokenizer,
+            dump_events_csv=args.dump_events_csv,
         )
         sess.wait_for_test_end()
 
diff --git a/src/inference_endpoint/config/runtime_settings.py b/src/inference_endpoint/config/runtime_settings.py
@@ -166,7 +166,9 @@ def _from_config_default(
 
         return cls(**kwargs)
 
-    def total_samples_to_issue(self, padding_factor: float = 1.1) -> int:
+    def total_samples_to_issue(
+        self, padding_factor: float = 1.1, align_to_dataset_size: bool = True
+    ) -> int:
         """Calculate the total number of samples to issue to the SUT throughout the course of the test run.
 
         Priority:
@@ -177,6 +179,8 @@ def total_samples_to_issue(self, padding_factor: float = 1.1) -> int:
         Args:
             padding_factor (float): Factor to multiply the expected number of samples by to account for variance.
                                     Use 1.0 for no padding. (Default: 1.1)
+            align_to_dataset_size (bool): Whether to pad the total number of samples up to the nearest multiple of
+                                          dataset size. (Default: True)
 
         Returns:
             int: The total number of samples to issue to the SUT throughout the course of the test run.
@@ -213,4 +217,13 @@ def total_samples_to_issue(self, padding_factor: float = 1.1) -> int:
         logger.debug(
             f"Sample count: {result} (calculated from duration={self.min_duration_ms}ms × target_qps={self.metric_target.target} × padding={padding_factor})"
         )
+
+        # Pad to multiples of dataset size
+        if (
+            align_to_dataset_size
+            and self.n_samples_from_dataset > 0
+            and (rem := result % self.n_samples_from_dataset) != 0
+        ):
+            result += self.n_samples_from_dataset - rem
+            logger.debug(f"Padded sample count: {result}")
         return result
diff --git a/src/inference_endpoint/load_generator/session.py b/src/inference_endpoint/load_generator/session.py
@@ -65,6 +65,7 @@ def _run_test(
         max_shutdown_timeout_s: float = 300.0,
         report_dir: os.PathLike | None = None,
         tokenizer_override: AutoTokenizer | None = None,
+        dump_events_csv: bool = False,
     ):
         with self.event_recorder:
             try:
@@ -157,6 +158,9 @@ def _run_test(
                     with (Path(report_dir) / "runtime_settings.json").open("w") as f:
                         f.write(orjson.dumps(rt_settings_data).decode("utf-8"))
 
+                    if dump_events_csv:
+                        reporter.dump_to_csv(Path(report_path) / "events.csv")
+
                 # Print summary
                 report.display()
 
@@ -186,6 +190,7 @@ def start(
         max_shutdown_timeout_s: float = 300.0,
         report_dir: os.PathLike | None = None,
         tokenizer_override: AutoTokenizer | None = None,
+        dump_events_csv: bool = False,
     ) -> BenchmarkSession:
         """Start a new BenchmarkSession in a thread.
 
@@ -201,6 +206,8 @@ def start(
             report_dir: The path to save the report to. If None, no report will be saved.
             tokenizer_override: The tokenizer to use for the session. If None, a tokenizer will be automatically selected
                                 based on the model name in the runtime settings.
+            dump_events_csv: Whether to dump the events to a CSV file. Only use for debugging
+                             purposes, as the events database can get quite large.
 
         Returns:
             The new BenchmarkSession.
@@ -215,6 +222,7 @@ def start(
                 max_shutdown_timeout_s,
                 report_dir,
                 tokenizer_override,
+                dump_events_csv,
             ),
         )
         session.thread.start()
diff --git a/src/inference_endpoint/metrics/recorder.py b/src/inference_endpoint/metrics/recorder.py
@@ -26,6 +26,7 @@
 import threading
 import time
 import uuid
+from datetime import datetime
 from functools import partial
 from pathlib import Path
 from typing import ClassVar
@@ -39,6 +40,29 @@
 logger = logging.getLogger(__name__)
 
 
+_G_MONOTIME_DELTA = time.time_ns() - time.monotonic_ns()
+"""Approximate delta between monotonic and wall-clock time in nanoseconds. See
+monotime_to_datetime() for more details.
+"""
+
+
+def monotime_to_datetime(monotime_ns: int) -> datetime:
+    """Monotonic clock has an undefined starting point. To convert to human readable timestamp,
+    we can add a constant delta to any monotonic timestamp to get an approximate equivalent wall-clock
+    timestamp. Note that the result will not be completely accurate, but it will be a consistent
+    offset from the real time, as long as this function is called in the same process. Any durations
+    and deltas calculated from resulting datetimes will be accurate, but absolute times will not be.
+
+    Args:
+        monotime_ns: The monotonic timestamp in nanoseconds.
+
+    Returns:
+        The datetime object corresponding to the approximate wall-clock timestamp.
+    """
+    wall_time = (monotime_ns + _G_MONOTIME_DELTA) / 1e9
+    return datetime.fromtimestamp(wall_time)
+
+
 @contextlib.contextmanager
 def sqlite3_cursor(path: Path):
     """Context manager for SQLite cursor that properly handles connection lifecycle.
@@ -272,7 +296,11 @@ def commit_buffer():
                         if item[1] == SampleEvent.FIRST_CHUNK.value:
                             # In post-processing, we use this to validate that the first chunk is the response output is the same as the data in the FIRST_CHUNK_RECEIVED event
                             output_buffer.append(
-                                {"s_uuid": item[0], "first_chunk": item[-1]}
+                                {
+                                    "timestamp": str(monotime_to_datetime(item[2])),
+                                    "s_uuid": item[0],
+                                    "first_chunk": item[-1],
+                                }
                             )
                         elif item[1] == SampleEvent.COMPLETE.value:
                             output_data = item[-1]
@@ -282,13 +310,15 @@ def commit_buffer():
                                 )
                             output_buffer.append(
                                 {
+                                    "timestamp": str(monotime_to_datetime(item[2])),
                                     "s_uuid": item[0],
                                     "output": output_data,
                                 }
                             )
                         elif item[1] == SessionEvent.ERROR.value:
                             output_buffer.append(
                                 {
+                                    "timestamp": str(monotime_to_datetime(item[2])),
                                     "s_uuid": item[0],
                                     "error_type": item[1],
                                     "error_message": item[-1],
diff --git a/src/inference_endpoint/metrics/reporter.py b/src/inference_endpoint/metrics/reporter.py
@@ -15,6 +15,7 @@
 
 from __future__ import annotations
 
+import csv
 import dataclasses
 import functools
 import importlib
@@ -832,6 +833,44 @@ def close(self):
             self.cur_.close()
         self.conn.close()
 
+    def dump_to_csv(self, csv_path: Path):
+        output_values = defaultdict(dict)
+
+        if self.outputs_path.exists():
+            with self.outputs_path.open("r") as outputs:
+                for line in outputs:
+                    if line.strip() == "":
+                        continue
+
+                    data = orjson.loads(line)
+                    if "s_uuid" not in data:
+                        continue
+
+                    if "first_chunk" in data:
+                        output_values[data["s_uuid"]]["first_chunk"] = data[
+                            "first_chunk"
+                        ]
+                    elif "output" in data:
+                        output_values[data["s_uuid"]]["output"] = data["output"]
+                    elif "error_message" in data:
+                        output_values[data["s_uuid"]]["error_message"] = data[
+                            "error_message"
+                        ]
+
+        with csv_path.open("w") as f:
+            writer = csv.writer(f)
+            writer.writerow(["sample_uuid", "event_type", "timestamp_ns", "value"])
+
+            for row in self.cur_.execute("SELECT * FROM events"):
+                value = ""
+                if row[1] == SampleEvent.FIRST_CHUNK.value:
+                    value = output_values[row[0]].get("first_chunk", "<NOT_FOUND>")
+                elif row[1] == SampleEvent.COMPLETE.value:
+                    value = output_values[row[0]].get("output", "<NOT_FOUND>")
+                elif row[1] == SessionEvent.ERROR.value:
+                    value = output_values[row[0]].get("error_message", "<NOT_FOUND>")
+                writer.writerow([row[0], row[1], row[2], value])
+
     def __enter__(self):
         if self.is_closed:
             self.init_connection()
diff --git a/tests/unit/config/rulesets/mlcommons/test_rules.py b/tests/unit/config/rulesets/mlcommons/test_rules.py
@@ -60,7 +60,16 @@ def test_apply_user_config():
     )
 
     # Metric type should be throughput
-    expected_sample_count = 1234.5 * (rt_settings.min_duration_ms / 1000)
+    expected_sample_count = int(1234.5 * 10 * 60)
+    assert (
+        rt_settings.total_samples_to_issue(
+            padding_factor=1.0, align_to_dataset_size=False
+        )
+        == expected_sample_count
+    )
+
+    if (rem := expected_sample_count % rt_settings.n_samples_from_dataset) != 0:
+        expected_sample_count += rt_settings.n_samples_from_dataset - rem
     assert (
         rt_settings.total_samples_to_issue(padding_factor=1.0) == expected_sample_count
     )
@@ -73,7 +82,15 @@ def test_apply_user_config_insufficient_qps():
         user_config=user_config,
         opt_prio=OptimizationPriority.LOW_LATENCY_INTERACTIVE,
     )
-    assert rt_settings.total_samples_to_issue(padding_factor=1.0) == 270336
+
+    # Expected is 270336 padded up to multiple of dataset size, which is 13368
+    assert rt_settings.total_samples_to_issue(padding_factor=1.0) == 280728
+    assert (
+        rt_settings.total_samples_to_issue(
+            padding_factor=1.0, align_to_dataset_size=False
+        )
+        == 270336
+    )
 
 
 def test_apply_user_config_min_sample_count_override():
@@ -83,4 +100,10 @@ def test_apply_user_config_min_sample_count_override():
         user_config=user_config,
         opt_prio=OptimizationPriority.LOW_LATENCY_INTERACTIVE,
     )
-    assert rt_settings.total_samples_to_issue(padding_factor=1.0) == 2 * 10 * 60
+    assert rt_settings.total_samples_to_issue(padding_factor=1.0) == 13368
+    assert (
+        rt_settings.total_samples_to_issue(
+            padding_factor=1.0, align_to_dataset_size=False
+        )
+        == 2 * 10 * 60
+    )