1010
1111from modelbench .benchmarks import BenchmarkScore , GeneralPurposeAiChatBenchmarkV1 , SecurityBenchmark , SecurityScore
1212from modelbench .hazards import HazardScore , SafeHazardV1 , SecurityJailbreakHazard
13- from modelbench .record import benchmark_code_info , BenchmarkScoreEncoder , dump_json
13+ from modelbench .record import BenchmarkScoreEncoder , benchmark_code_info , dump_json
1414from modelbench .scoring import ValueEstimate
1515from modelbench .standards import Standards
16+ from modelgauge .auth .openai_compatible_secrets import OpenAIApiKey
1617from modelgauge .locales import EN_US
1718from modelgauge .record_init import InitializationRecord
19+ from modelgauge .sut import PromptResponseSUT
20+ from modelgauge .sut_decorator import modelgauge_sut
21+ from modelgauge .suts .openai_client import OpenAIChat
1822
1923
2024def benchmark_run_record (benchmark_score ):
@@ -30,8 +34,7 @@ def secrets():
3034 return {"together" : {"api_key" : "fake" }, "modellab_files" : {"token" : "fake" }}
3135
3236
33- @pytest .fixture ()
34- def benchmark_score (end_time , sut ):
37+ def benchmark_score_gen (end_time , sut ):
3538 bd = GeneralPurposeAiChatBenchmarkV1 (EN_US , "practice" )
3639 low_est = ValueEstimate .make (0.5 , 10 )
3740 high_est = ValueEstimate .make (0.8 , 20 )
@@ -61,6 +64,32 @@ def benchmark_score(end_time, sut):
6164 return bs
6265
6366
@pytest.fixture()
def benchmark_score(end_time, sut):
    """Provide the benchmark score built by ``benchmark_score_gen``.

    The construction logic lives in the plain ``benchmark_score_gen`` helper
    so that other fixtures can build a score around a different SUT; this
    fixture simply forwards the ``end_time`` and ``sut`` fixtures to it.
    """
    score = benchmark_score_gen(end_time, sut)
    return score
70+
71+
@pytest.fixture()
def unserializable_sut():
    """Provide a SUT instance constructed with a lambda as an init argument.

    A lambda is a function object, which the standard ``json`` encoder cannot
    serialize. NOTE(review): presumably ``modelgauge_sut`` records the
    constructor arguments for later serialization -- confirm against the
    decorator's implementation.
    """

    @modelgauge_sut(capabilities=[])
    class UnserializableSUT(PromptResponseSUT):
        """Stub SUT; only its constructor arguments matter here."""

        def __init__(self, uid, unserializable_init_param):
            # The extra parameter is accepted but intentionally not stored.
            super().__init__(uid)

        def evaluate(self, request):
            """No-op stub; always returns None."""
            return None

        def translate_response(self, request, response):
            """No-op stub; always returns None."""
            return None

    instance = UnserializableSUT(
        "unserializable_sut",
        unserializable_init_param=lambda x: x,
    )
    return instance
86+
87+
@pytest.fixture()
def benchmark_score_with_unserializable_sut(end_time, unserializable_sut):
    """Provide a benchmark score built around the ``unserializable_sut`` fixture.

    Uses the same ``benchmark_score_gen`` helper as the regular score fixture,
    swapping in the SUT whose init argument is a lambda.
    """
    score = benchmark_score_gen(end_time, unserializable_sut)
    return score
91+
92+
6493@pytest .fixture ()
6594def security_score (monkeypatch , tmp_path , end_time , sut ):
6695 # Patch the standards in case the benchmark is not yet calibrated.
@@ -121,6 +150,11 @@ def test_sut(sut):
121150 assert "initialization" in encoded
122151
123152
def test_unserializable_sut_within_benchmark_score(benchmark_score_with_unserializable_sut):
    """Encoding a score whose SUT has an unserializable init arg must not fail.

    The encoded output is expected to carry the serialization error message
    in place of the offending ``unserializable_init_param`` value.
    """
    expected_fragment = '"unserializable_init_param": "Object of type function is not JSON serializable"'
    serialized = encode(benchmark_score_with_unserializable_sut)
    assert expected_fragment in serialized
156+
157+
124158def test_value_estimate ():
125159 ve = ValueEstimate .make (0.5 , 1000 )
126160 j = encode_and_parse (ve )
0 commit comments