Allow specifying run_uid from the CLI (#1405)

dhosterman · web-flow · commit c7369a691b0d · 2025-12-04T10:38:41.000-05:00
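The benchmark CLI accepts an optional --run-uid option. When it is given,
its value is written as the run_uid of the benchmark record; when it is
omitted (or empty), a run_uid of the form
run-<benchmark uid>-<YYYYMMDD-HHMMSS> is generated from the run's start
time, as before.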
diff --git a/src/modelbench/cli.py b/src/modelbench/cli.py
@@ -16,6 +16,14 @@
 from rich.table import Table
 
 import modelgauge.annotators.cheval.registration  # noqa: F401
+from modelbench.benchmark_runner import BenchmarkRunner, JsonRunTracker, TqdmRunTracker
+from modelbench.benchmarks import GeneralPurposeAiChatBenchmarkV1, SecurityBenchmark
+from modelbench.consistency_checker import (
+    ConsistencyChecker,
+    summarize_consistency_check_results,
+)
+from modelbench.record import dump_json
+from modelbench.standards import Standards
 from modelgauge.config import load_secrets_from_config, write_default_config
 from modelgauge.load_namespaces import load_namespaces
 from modelgauge.locales import DEFAULT_LOCALE, LOCALES
@@ -25,12 +33,6 @@
 from modelgauge.prompt_sets import GENERAL_PROMPT_SETS, SECURITY_JAILBREAK_PROMPT_SETS
 from modelgauge.sut_registry import SUTS
 
-from modelbench.benchmark_runner import BenchmarkRunner, JsonRunTracker, TqdmRunTracker
-from modelbench.benchmarks import GeneralPurposeAiChatBenchmarkV1, SecurityBenchmark
-from modelbench.standards import Standards
-from modelbench.consistency_checker import ConsistencyChecker, summarize_consistency_check_results
-from modelbench.record import dump_json
-
 
 def load_local_plugins(_, __, path: pathlib.Path):
     path_str = str(path)
@@ -90,6 +92,12 @@ def decorator(func):
             help="Which evaluator to use",
             show_default=True,
         )
+        @click.option(
+            "--run-uid",
+            type=str,
+            required=False,
+            help="The run_uid for the run if provided, otherwise one will be generated",
+        )
         @local_plugin_dir_option
         @wraps(func)
         def wrapper(*args, **kwargs):
@@ -151,13 +159,14 @@ def general_benchmark(
     json_logs: bool,
     sut_uid: str,
     locale: str,
+    run_uid: str,
     prompt_set="demo",
     evaluator="default",
 ) -> None:
     sut = make_sut(sut_uid)
     benchmark = GeneralPurposeAiChatBenchmarkV1(locale, prompt_set, evaluator)
     check_benchmark(benchmark)
-    run_and_report_benchmark(benchmark, sut, max_instances, debug, json_logs, output_dir)
+    run_and_report_benchmark(benchmark, sut, max_instances, debug, json_logs, output_dir, run_uid)
 
 
 @benchmark.command("security", help="run a security benchmark")
@@ -169,17 +178,18 @@ def security_benchmark(
     json_logs: bool,
     sut_uid: str,
     locale: str,
+    run_uid: str,
     prompt_set="official",
     evaluator="default",
 ) -> None:
     sut = make_sut(sut_uid)
     benchmark = SecurityBenchmark(locale, prompt_set, evaluator=evaluator)
     check_benchmark(benchmark)
 
-    run_and_report_benchmark(benchmark, sut, max_instances, debug, json_logs, output_dir)
+    run_and_report_benchmark(benchmark, sut, max_instances, debug, json_logs, output_dir, run_uid)
 
 
-def run_and_report_benchmark(benchmark, sut, max_instances, debug, json_logs, output_dir):
+def run_and_report_benchmark(benchmark, sut, max_instances, debug, json_logs, output_dir, run_uid):
     start_time = datetime.now(timezone.utc)
     run = run_benchmarks_for_sut([benchmark], sut, max_instances, debug=debug, json_logs=json_logs)
 
@@ -188,7 +198,7 @@ def run_and_report_benchmark(benchmark, sut, max_instances, debug, json_logs, ou
     print_summary(benchmark, benchmark_scores)
     json_path = output_dir / f"benchmark_record-{benchmark.uid}.json"
     scores = [score for score in benchmark_scores if score.benchmark_definition == benchmark]
-    dump_json(json_path, start_time, benchmark, scores)
+    dump_json(json_path, start_time, benchmark, scores, run_uid)
     print(f"Wrote record for {benchmark.uid} to {json_path}.")
     run_consistency_check(run.journal_path, verbose=True)
 
diff --git a/src/modelbench/record.py b/src/modelbench/record.py
@@ -7,6 +7,7 @@
 from typing import Sequence
 
 import pydantic
+
 from modelbench.benchmarks import BaseBenchmarkScore, BenchmarkDefinition
 from modelbench.hazards import HazardDefinition, HazardScore
 from modelgauge.base_test import BaseTest
@@ -76,12 +77,14 @@ def dump_json(
     start_time: datetime.time,
     benchmark: BenchmarkDefinition,
     benchmark_scores: Sequence[BaseBenchmarkScore],
+    run_uid: str | None,
 ):
+    _run_uid = run_uid if run_uid else f"run-{benchmark.uid}-{start_time.strftime('%Y%m%d-%H%M%S')}"
     with open(json_path, "w") as f:
         output = {
             "_metadata": benchmark_metadata(),
             "benchmark": (benchmark),
-            "run_uid": f"run-{benchmark.uid}-{start_time.strftime('%Y%m%d-%H%M%S')}",
+            "run_uid": _run_uid,
             "scores": (benchmark_scores),
         }
         json.dump(output, f, cls=BenchmarkScoreEncoder, indent=4)
diff --git a/tests/modelbench_tests/test_record.py b/tests/modelbench_tests/test_record.py
@@ -8,7 +8,12 @@
 
 import pytest
 
-from modelbench.benchmarks import BenchmarkScore, GeneralPurposeAiChatBenchmarkV1, SecurityBenchmark, SecurityScore
+from modelbench.benchmarks import (
+    BenchmarkScore,
+    GeneralPurposeAiChatBenchmarkV1,
+    SecurityBenchmark,
+    SecurityScore,
+)
 from modelbench.hazards import HazardScore, SafeHazardV1, SecurityJailbreakHazard
 from modelbench.record import BenchmarkScoreEncoder, benchmark_code_info, dump_json
 from modelbench.scoring import ValueEstimate
@@ -282,7 +287,8 @@ def test_benchmark_code_record_without_git(benchmark_score):
         assert source["error"] == "git command not found"
 
 
-def test_dump_json(benchmark_score, tmp_path):
+@pytest.mark.parametrize("run_uid", [None, "custom_run_uid"])
+def test_dump_json(benchmark_score, tmp_path, run_uid):
     # just a smoke test; everything substantial should be tested above.
     json_path = tmp_path / "foo.json"
     with mock.patch("modelbench.record.benchmark_library_info", lambda: {"skipped by": "test_run.fast_metadata"}):
@@ -291,11 +297,15 @@ def test_dump_json(benchmark_score, tmp_path):
             datetime.fromtimestamp(1700000000, timezone.utc),
             benchmark_score.benchmark_definition,
             [benchmark_score],
+            run_uid,
         )
 
     with open(json_path) as f:
         j = json.load(f)
     assert "_metadata" in j
     assert j["benchmark"]["uid"] == benchmark_score.benchmark_definition.uid
-    assert j["run_uid"] == "run-" + benchmark_score.benchmark_definition.uid + "-20231114-221320"
+    if not run_uid:
+        assert j["run_uid"] == "run-" + benchmark_score.benchmark_definition.uid + "-20231114-221320"
+    else:
+        assert j["run_uid"] == run_uid
     assert len(j["scores"]) == 1