Skip to content

Commit 1546793

Browse files
committed
Add benchmark version to result json.
1 parent c7369a6 commit 1546793

3 files changed

Lines changed: 23 additions & 2 deletions

File tree

src/modelbench/record.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,8 @@ def default(self, o):
103103
del result["_scoring_log"]
104104
return result
105105
elif isinstance(o, BenchmarkDefinition):
106-
return {"uid": o.uid, "hazards": o.hazards()}
106+
benchmark_version = o.get_uid_part("version")
107+
return {"uid": o.uid, "hazards": o.hazards(), "version": benchmark_version}
107108
elif isinstance(o, HazardDefinition):
108109
result = {"uid": o.uid, "reference_standard": o.reference_standard()}
109110
if o._tests:

src/modelbench/uid.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,5 +78,16 @@ def uid(self):
7878

7979
return HasUid._render_uid(self, self.__class__._uid_definition)
8080

81+
def get_uid_part(self, part_name: str) -> str:
    """Render one named component of this object's UID as a string.

    The component is looked up by ``part_name`` in the class-level
    ``_uid_definition`` and rendered on its own via ``HasUid._render_uid``.

    Args:
        part_name: Key of the UID component to render.

    Returns:
        Whatever ``HasUid._render_uid`` produces for a definition holding
        only the requested entry.

    Raises:
        AttributeError: If the class does not define ``_uid_definition``.
        KeyError: If ``part_name`` is not present in ``_uid_definition``.
    """
    cls = self.__class__
    if not hasattr(cls, "_uid_definition"):
        raise AttributeError("classes with HasUid must define _uid_definition")

    definition = cls._uid_definition
    if part_name in definition:
        # Hand the renderer a one-entry definition so only this part is rendered.
        single_part = {part_name: definition[part_name]}
        return HasUid._render_uid(self, single_part)

    raise KeyError(f"part name {part_name} not found in _uid_definition")
91+
8192
def __str__(self):
8293
return f"{self.__class__.__name__}({self.uid})"

tests/modelbench_tests/test_record.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -186,12 +186,21 @@ def test_v1_hazard_definition_with_tests_loaded(secrets):
186186
assert j["reference_standard"] == hazard.reference_standard()
187187

188188

189-
def test_benchmark_definition():
189+
def test_general_benchmark_definition():
    """Encoded general-purpose benchmark exposes its uid, version, and hazard uids."""
    benchmark = GeneralPurposeAiChatBenchmarkV1(locale=EN_US, prompt_set="practice")
    encoded = encode_and_parse(benchmark)

    assert encoded["uid"] == "general_purpose_ai_chat_benchmark-1.1-en_us-practice-default"
    assert encoded["version"] == "1.1"

    hazard_uids = [hazard["uid"] for hazard in encoded["hazards"]]
    assert "safe_hazard-1.1-cse-en_us-practice" in hazard_uids
193194

194195

196+
def test_security_benchmark_definition():
    """Encoded security benchmark exposes its uid, version, and hazard uids."""
    benchmark = SecurityBenchmark(locale=EN_US, prompt_set="official")
    encoded = encode_and_parse(benchmark)

    assert encoded["uid"] == "security_benchmark-0.5-en_us-official-default"
    assert encoded["version"] == "0.5"

    # Collect every hazard uid before checking membership, as the encoder emits a list.
    assert "security_jailbreak_hazard-0.5-en_us-official" in [
        hazard["uid"] for hazard in encoded["hazards"]
    ]
202+
203+
195204
def test_hazard_score():
196205
hazard = SafeHazardV1("cse", EN_US, "practice")
197206
hazard.set_standard(GeneralPurposeAiChatBenchmarkV1(locale=EN_US, prompt_set="practice").standards)

0 commit comments

Comments
 (0)