1616from rich .table import Table
1717
1818import modelgauge .annotators .cheval .registration # noqa: F401
19+ from modelbench .benchmark_runner import BenchmarkRunner , JsonRunTracker , TqdmRunTracker
20+ from modelbench .benchmarks import GeneralPurposeAiChatBenchmarkV1 , SecurityBenchmark
21+ from modelbench .consistency_checker import (
22+ ConsistencyChecker ,
23+ summarize_consistency_check_results ,
24+ )
25+ from modelbench .record import dump_json
26+ from modelbench .standards import Standards
1927from modelgauge .config import load_secrets_from_config , write_default_config
2028from modelgauge .load_namespaces import load_namespaces
2129from modelgauge .locales import DEFAULT_LOCALE , LOCALES
2533from modelgauge .prompt_sets import GENERAL_PROMPT_SETS , SECURITY_JAILBREAK_PROMPT_SETS
2634from modelgauge .sut_registry import SUTS
2735
28- from modelbench .benchmark_runner import BenchmarkRunner , JsonRunTracker , TqdmRunTracker
29- from modelbench .benchmarks import GeneralPurposeAiChatBenchmarkV1 , SecurityBenchmark
30- from modelbench .standards import Standards
31- from modelbench .consistency_checker import ConsistencyChecker , summarize_consistency_check_results
32- from modelbench .record import dump_json
33-
3436
3537def load_local_plugins (_ , __ , path : pathlib .Path ):
3638 path_str = str (path )
@@ -90,6 +92,12 @@ def decorator(func):
9092 help = "Which evaluator to use" ,
9193 show_default = True ,
9294 )
95+ @click .option (
96+ "--run-uid" ,
97+ type = str ,
98+ required = False ,
99+ help = "The run_uid for the run if provided, otherwise one will be generated" ,
100+ )
93101 @local_plugin_dir_option
94102 @wraps (func )
95103 def wrapper (* args , ** kwargs ):
@@ -151,13 +159,14 @@ def general_benchmark(
151159 json_logs : bool ,
152160 sut_uid : str ,
153161 locale : str ,
162+ run_uid : str ,
154163 prompt_set = "demo" ,
155164 evaluator = "default" ,
156165) -> None :
157166 sut = make_sut (sut_uid )
158167 benchmark = GeneralPurposeAiChatBenchmarkV1 (locale , prompt_set , evaluator )
159168 check_benchmark (benchmark )
160- run_and_report_benchmark (benchmark , sut , max_instances , debug , json_logs , output_dir )
169+ run_and_report_benchmark (benchmark , sut , max_instances , debug , json_logs , output_dir , run_uid )
161170
162171
163172@benchmark .command ("security" , help = "run a security benchmark" )
@@ -169,17 +178,18 @@ def security_benchmark(
169178 json_logs : bool ,
170179 sut_uid : str ,
171180 locale : str ,
181+ run_uid : str ,
172182 prompt_set = "official" ,
173183 evaluator = "default" ,
174184) -> None :
175185 sut = make_sut (sut_uid )
176186 benchmark = SecurityBenchmark (locale , prompt_set , evaluator = evaluator )
177187 check_benchmark (benchmark )
178188
179- run_and_report_benchmark (benchmark , sut , max_instances , debug , json_logs , output_dir )
189+ run_and_report_benchmark (benchmark , sut , max_instances , debug , json_logs , output_dir , run_uid )
180190
181191
182- def run_and_report_benchmark (benchmark , sut , max_instances , debug , json_logs , output_dir ):
192+ def run_and_report_benchmark (benchmark , sut , max_instances , debug , json_logs , output_dir , run_uid ):
183193 start_time = datetime .now (timezone .utc )
184194 run = run_benchmarks_for_sut ([benchmark ], sut , max_instances , debug = debug , json_logs = json_logs )
185195
@@ -188,7 +198,7 @@ def run_and_report_benchmark(benchmark, sut, max_instances, debug, json_logs, ou
188198 print_summary (benchmark , benchmark_scores )
189199 json_path = output_dir / f"benchmark_record-{ benchmark .uid } .json"
190200 scores = [score for score in benchmark_scores if score .benchmark_definition == benchmark ]
191- dump_json (json_path , start_time , benchmark , scores )
201+ dump_json (json_path , start_time , benchmark , scores , run_uid )
192202 print (f"Wrote record for { benchmark .uid } to { json_path } ." )
193203 run_consistency_check (run .journal_path , verbose = True )
194204
0 commit comments