Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion src/modelbench/record.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,4 +116,7 @@ def default(self, o):
elif isinstance(o, datetime):
return str(o)
else:
return super().default(o)
try:
return super().default(o)
except TypeError as e:
return str(e)
40 changes: 37 additions & 3 deletions tests/modelbench_tests/test_record.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,15 @@

from modelbench.benchmarks import BenchmarkScore, GeneralPurposeAiChatBenchmarkV1, SecurityBenchmark, SecurityScore
from modelbench.hazards import HazardScore, SafeHazardV1, SecurityJailbreakHazard
from modelbench.record import benchmark_code_info, BenchmarkScoreEncoder, dump_json
from modelbench.record import BenchmarkScoreEncoder, benchmark_code_info, dump_json
from modelbench.scoring import ValueEstimate
from modelbench.standards import Standards
from modelgauge.auth.openai_compatible_secrets import OpenAIApiKey
from modelgauge.locales import EN_US
from modelgauge.record_init import InitializationRecord
from modelgauge.sut import PromptResponseSUT
from modelgauge.sut_decorator import modelgauge_sut
from modelgauge.suts.openai_client import OpenAIChat


def benchmark_run_record(benchmark_score):
Expand All @@ -30,8 +34,7 @@ def secrets():
return {"together": {"api_key": "fake"}, "modellab_files": {"token": "fake"}}


@pytest.fixture()
def benchmark_score(end_time, sut):
def benchmark_score_gen(end_time, sut):
bd = GeneralPurposeAiChatBenchmarkV1(EN_US, "practice")
low_est = ValueEstimate.make(0.5, 10)
high_est = ValueEstimate.make(0.8, 20)
Expand Down Expand Up @@ -61,6 +64,32 @@ def benchmark_score(end_time, sut):
return bs


@pytest.fixture()
def benchmark_score(end_time, sut):
    """Provide the benchmark score built by ``benchmark_score_gen`` for the ``sut`` fixture."""
    score = benchmark_score_gen(end_time, sut)
    return score


@pytest.fixture()
def unserializable_sut():
    """Provide a SUT instance constructed with an argument that JSON cannot encode.

    The extra constructor argument is a lambda. The class below never stores it.
    NOTE(review): presumably the ``modelgauge_sut`` decorator captures constructor
    arguments for the SUT's initialization record -- confirm against modelgauge.
    """

    @modelgauge_sut(capabilities=[])
    class UnserializableSUT(PromptResponseSUT):
        def __init__(self, uid, unserializable_init_param):
            # Only the uid is forwarded to the base class; the extra parameter
            # is not kept on the instance.
            super().__init__(uid)

        def evaluate(self, request):
            # Stub: no evaluation is performed.
            return None

        def translate_response(self, request, response):
            # Stub: no translation is performed.
            return None

    not_json_encodable = lambda x: x  # noqa: E731 - a function object is the point of this fixture
    sut_instance = UnserializableSUT("unserializable_sut", unserializable_init_param=not_json_encodable)
    return sut_instance


@pytest.fixture()
def benchmark_score_with_unserializable_sut(end_time, unserializable_sut):
    """Provide the benchmark score built by ``benchmark_score_gen`` for the ``unserializable_sut`` fixture."""
    score = benchmark_score_gen(end_time, unserializable_sut)
    return score


@pytest.fixture()
def security_score(monkeypatch, tmp_path, end_time, sut):
# Patch the standards in case the benchmark is not yet calibrated.
Expand Down Expand Up @@ -121,6 +150,11 @@ def test_sut(sut):
assert "initialization" in encoded


def test_unserializable_sut_within_benchmark_score(benchmark_score_with_unserializable_sut):
    """Check that the encoded score carries the serialization error text for the lambda init param."""
    expected_fragment = '"unserializable_init_param": "Object of type function is not JSON serializable"'
    encoded = encode(benchmark_score_with_unserializable_sut)
    assert expected_fragment in encoded


def test_value_estimate():
ve = ValueEstimate.make(0.5, 1000)
j = encode_and_parse(ve)
Expand Down