diff --git a/src/modelbench/record.py b/src/modelbench/record.py index 7005f7fb3..64e809537 100644 --- a/src/modelbench/record.py +++ b/src/modelbench/record.py @@ -116,4 +116,7 @@ def default(self, o): elif isinstance(o, datetime): return str(o) else: - return super().default(o) + try: + return super().default(o) + except TypeError as e: + return str(e) diff --git a/tests/modelbench_tests/test_record.py b/tests/modelbench_tests/test_record.py index 21e43733c..aeb4fbba8 100644 --- a/tests/modelbench_tests/test_record.py +++ b/tests/modelbench_tests/test_record.py @@ -10,11 +10,15 @@ from modelbench.benchmarks import BenchmarkScore, GeneralPurposeAiChatBenchmarkV1, SecurityBenchmark, SecurityScore from modelbench.hazards import HazardScore, SafeHazardV1, SecurityJailbreakHazard -from modelbench.record import benchmark_code_info, BenchmarkScoreEncoder, dump_json +from modelbench.record import BenchmarkScoreEncoder, benchmark_code_info, dump_json from modelbench.scoring import ValueEstimate from modelbench.standards import Standards +from modelgauge.auth.openai_compatible_secrets import OpenAIApiKey from modelgauge.locales import EN_US from modelgauge.record_init import InitializationRecord +from modelgauge.sut import PromptResponseSUT +from modelgauge.sut_decorator import modelgauge_sut +from modelgauge.suts.openai_client import OpenAIChat def benchmark_run_record(benchmark_score): @@ -30,8 +34,7 @@ def secrets(): return {"together": {"api_key": "fake"}, "modellab_files": {"token": "fake"}} -@pytest.fixture() -def benchmark_score(end_time, sut): +def benchmark_score_gen(end_time, sut): bd = GeneralPurposeAiChatBenchmarkV1(EN_US, "practice") low_est = ValueEstimate.make(0.5, 10) high_est = ValueEstimate.make(0.8, 20) @@ -61,6 +64,32 @@ def benchmark_score(end_time, sut): return bs +@pytest.fixture() +def benchmark_score(end_time, sut): + return benchmark_score_gen(end_time, sut) + + +@pytest.fixture() +def unserializable_sut(): + @modelgauge_sut(capabilities=[]) + class UnserializableSUT(PromptResponseSUT): + def __init__(self, uid, unserializable_init_param): + super().__init__(uid) + + def evaluate(self, request): + pass + + def translate_response(self, request, response): + pass + + return UnserializableSUT("unserializable_sut", unserializable_init_param=lambda x: x) + + +@pytest.fixture() +def benchmark_score_with_unserializable_sut(end_time, unserializable_sut): + return benchmark_score_gen(end_time, unserializable_sut) + + @pytest.fixture() def security_score(monkeypatch, tmp_path, end_time, sut): # Patch the standards in case the benchmark is not yet calibrated. @@ -121,6 +150,11 @@ def test_sut(sut): assert "initialization" in encoded +def test_unserializable_sut_within_benchmark_score(benchmark_score_with_unserializable_sut): + encoded = encode(benchmark_score_with_unserializable_sut) + assert '"unserializable_init_param": "Object of type function is not JSON serializable"' in encoded + + def test_value_estimate(): ve = ValueEstimate.make(0.5, 1000) j = encode_and_parse(ve)