2424from modelgauge .locales import DEFAULT_LOCALE , LOCALES , PUBLISHED_LOCALES
2525from modelgauge .monitoring import PROMETHEUS
2626from modelgauge .preflight import check_secrets , make_sut
27- from modelgauge .prompt_sets import PROMPT_SETS
27+ from modelgauge .prompt_sets import GENERAL_PROMPT_SETS , SECURITY_PROMPT_SETS
2828from modelgauge .sut import get_sut_and_options
2929from modelgauge .sut_registry import SUTS
3030
@@ -52,30 +52,55 @@ def load_local_plugins(_, __, path: pathlib.Path):
5252)
5353
5454
def benchmark_options(prompt_sets: dict, default_prompt_set: str):
    """Build a decorator that attaches the CLI options shared by benchmark commands.

    Args:
        prompt_sets: Mapping whose keys are the values accepted by ``--prompt-set``.
        default_prompt_set: Value used for ``--prompt-set`` when the flag is omitted.

    Returns:
        A decorator. Applied to a command function, it returns a thin wrapper
        around that function carrying the ``--output-dir``, ``--max-instances``,
        ``--debug``, ``--json-logs``, ``--sut``, ``--locale``, ``--prompt-set``,
        ``--evaluator`` options plus ``local_plugin_dir_option``.
    """

    def decorator(func):
        # Listed top-to-bottom, i.e. in the order they would appear if they
        # were stacked as ``@`` decorators above the wrapper.
        shared_options = (
            click.option(
                "--output-dir",
                "-o",
                default="./run/records",
                type=click.Path(file_okay=False, dir_okay=True, path_type=pathlib.Path),
            ),
            click.option("--max-instances", "-m", type=int, default=100),
            click.option("--debug", default=False, is_flag=True),
            click.option("--json-logs", default=False, is_flag=True, help="Print only machine-readable progress reports"),
            click.option(
                "sut_uid",
                "--sut",
                "-s",
                multiple=False,
                help="SUT UID to run",
                required=True,
            ),
            click.option(
                "--locale",
                "-l",
                type=click.Choice(LOCALES, case_sensitive=False),
                default=DEFAULT_LOCALE,
                help=f"Locale for v1.0 benchmark (Default: {DEFAULT_LOCALE})",
                multiple=False,
            ),
            click.option(
                "--prompt-set",
                type=click.Choice(list(prompt_sets.keys())),
                default=default_prompt_set,
                help="Which prompt set to use",
                show_default=True,
            ),
            click.option(
                "--evaluator",
                type=click.Choice(["default", "ensemble"]),
                default="default",
                help="Which evaluator to use",
                show_default=True,
            ),
            local_plugin_dir_option,
        )

        # The options are attached to this pass-through wrapper rather than
        # to ``func`` itself.
        @wraps(func)
        def wrapper(*args, **kwargs):
            return func(*args, **kwargs)

        # Stacked decorators are applied bottom-up, so walk the tuple in
        # reverse to register the options in the same order as a decorator stack.
        decorated = wrapper
        for attach in reversed(shared_options):
            decorated = attach(decorated)
        return decorated

    return decorator
79104
80105
81106@click .group ()
@@ -113,7 +138,6 @@ def list_suts():
113138
114139
115140@benchmark .command ("general" , help = "run a general purpose AI chat benchmark" )
116- @benchmark_options
117141@click .option (
118142 "--version" ,
119143 "-v" ,
@@ -122,35 +146,14 @@ def list_suts():
122146 help = "Benchmark version to run (Default: 1.0)" ,
123147 multiple = False ,
124148)
125- @click .option (
126- "--locale" ,
127- "-l" ,
128- type = click .Choice (LOCALES , case_sensitive = False ),
129- default = DEFAULT_LOCALE ,
130- help = f"Locale for v1.0 benchmark (Default: { DEFAULT_LOCALE } )" ,
131- multiple = False ,
132- )
133- @click .option (
134- "--prompt-set" ,
135- type = click .Choice (list (PROMPT_SETS .keys ())),
136- default = "demo" ,
137- help = "Which prompt set to use" ,
138- show_default = True ,
139- )
140- @click .option (
141- "--evaluator" ,
142- type = click .Choice (["default" , "ensemble" ]),
143- default = "default" ,
144- help = "Which evaluator to use" ,
145- show_default = True ,
146- )
149+ @benchmark_options (GENERAL_PROMPT_SETS , "demo" )
147150def general_benchmark (
151+ version : str ,
148152 output_dir : pathlib .Path ,
149153 max_instances : int ,
150154 debug : bool ,
151155 json_logs : bool ,
152156 sut_uid : str ,
153- version : str ,
154157 locale : str ,
155158 prompt_set = "demo" ,
156159 evaluator = "default" ,
@@ -169,20 +172,15 @@ def general_benchmark(
169172
170173
171174@benchmark .command ("security" , help = "run a security benchmark" )
172- @benchmark_options
173- @click .option (
174- "--evaluator" ,
175- type = click .Choice (["default" , "ensemble" ]),
176- default = "default" ,
177- help = "Which evaluator to use" ,
178- show_default = True ,
179- )
175+ @benchmark_options (SECURITY_PROMPT_SETS , "practice" )
180176def security_benchmark (
181177 output_dir : pathlib .Path ,
182178 max_instances : int ,
183179 debug : bool ,
184180 json_logs : bool ,
185181 sut_uid : str ,
182+ locale : str ,
183+ prompt_set = "practice" ,
186184 evaluator = "default" ,
187185) -> None :
188186 # TODO: move this check inside the benchmark class?
@@ -193,7 +191,7 @@ def security_benchmark(
193191
194192 sut_uid , _ = get_sut_and_options (sut_uid )
195193 sut = make_sut (sut_uid )
196- benchmark = SecurityBenchmark (evaluator = evaluator )
194+ benchmark = SecurityBenchmark (locale , prompt_set , evaluator = evaluator )
197195 check_benchmark (benchmark )
198196
199197 run_and_report_benchmark (benchmark , sut , max_instances , debug , json_logs , output_dir )
@@ -368,7 +366,7 @@ def update_standards_to(standards_file):
368366 benchmarks = []
369367 # General purpose benchmarks
370368 for locale in PUBLISHED_LOCALES :
371- for prompt_set in PROMPT_SETS .keys ():
369+ for prompt_set in GENERAL_PROMPT_SETS .keys ():
372370 # we do not want to make demo standards. Instead we want to use the practice standards
373371 if not prompt_set == "demo" :
374372 benchmarks .append (GeneralPurposeAiChatBenchmarkV1 (locale , prompt_set , "ensemble" ))
0 commit comments