Skip to content

Commit f577d6e

Browse files
committed
Make BenchmarkScoreEncoder fall back to the TypeError message when a value is not JSON serializable, and add a test for a SUT with an unserializable initialization param.
1 parent cbe9191 commit f577d6e

2 files changed

Lines changed: 41 additions & 4 deletions

File tree

src/modelbench/record.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -116,4 +116,7 @@ def default(self, o):
116116
elif isinstance(o, datetime):
117117
return str(o)
118118
else:
119-
return super().default(o)
119+
try:
120+
return super().default(o)
121+
except TypeError as e:
122+
return str(e)

tests/modelbench_tests/test_record.py

Lines changed: 37 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,11 +10,15 @@
1010

1111
from modelbench.benchmarks import BenchmarkScore, GeneralPurposeAiChatBenchmarkV1, SecurityBenchmark, SecurityScore
1212
from modelbench.hazards import HazardScore, SafeHazardV1, SecurityJailbreakHazard
13-
from modelbench.record import benchmark_code_info, BenchmarkScoreEncoder, dump_json
13+
from modelbench.record import BenchmarkScoreEncoder, benchmark_code_info, dump_json
1414
from modelbench.scoring import ValueEstimate
1515
from modelbench.standards import Standards
16+
from modelgauge.auth.openai_compatible_secrets import OpenAIApiKey
1617
from modelgauge.locales import EN_US
1718
from modelgauge.record_init import InitializationRecord
19+
from modelgauge.sut import PromptResponseSUT
20+
from modelgauge.sut_decorator import modelgauge_sut
21+
from modelgauge.suts.openai_client import OpenAIChat
1822

1923

2024
def benchmark_run_record(benchmark_score):
@@ -30,8 +34,7 @@ def secrets():
3034
return {"together": {"api_key": "fake"}, "modellab_files": {"token": "fake"}}
3135

3236

33-
@pytest.fixture()
34-
def benchmark_score(end_time, sut):
37+
def benchmark_score_gen(end_time, sut):
3538
bd = GeneralPurposeAiChatBenchmarkV1(EN_US, "practice")
3639
low_est = ValueEstimate.make(0.5, 10)
3740
high_est = ValueEstimate.make(0.8, 20)
@@ -61,6 +64,32 @@ def benchmark_score(end_time, sut):
6164
return bs
6265

6366

67+
@pytest.fixture()
68+
def benchmark_score(end_time, sut):
69+
return benchmark_score_gen(end_time, sut)
70+
71+
72+
@pytest.fixture()
73+
def unserializable_sut():
74+
@modelgauge_sut(capabilities=[])
75+
class UnserializableSUT(PromptResponseSUT):
76+
def __init__(self, uid, unserializable_init_param):
77+
super().__init__(uid)
78+
79+
def evaluate(self, request):
80+
pass
81+
82+
def translate_response(self, request, response):
83+
pass
84+
85+
return UnserializableSUT("unserializable_sut", unserializable_init_param=lambda x: x)
86+
87+
88+
@pytest.fixture()
89+
def benchmark_score_with_unserializable_sut(end_time, unserializable_sut):
90+
return benchmark_score_gen(end_time, unserializable_sut)
91+
92+
6493
@pytest.fixture()
6594
def security_score(monkeypatch, tmp_path, end_time, sut):
6695
# Patch the standards in case the benchmark is not yet calibrated.
@@ -121,6 +150,11 @@ def test_sut(sut):
121150
assert "initialization" in encoded
122151

123152

153+
def test_unserializable_sut_within_benchmark_score(benchmark_score_with_unserializable_sut):
154+
encoded = encode(benchmark_score_with_unserializable_sut)
155+
assert '"unserializable_init_param": "Object of type function is not JSON serializable"' in encoded
156+
157+
124158
def test_value_estimate():
125159
ve = ValueEstimate.make(0.5, 1000)
126160
j = encode_and_parse(ve)

0 commit comments

Comments
 (0)