Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 3 additions & 5 deletions src/modelbench/benchmark_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -243,12 +243,10 @@ def compile_annotations(self) -> list:
except:
hazard = full_hazard_string
prompt_set = "unknown"
# most prompts and annotations can't be shared. Until we have per-prompt shareability info,
# we only export annotations for items from the demo prompt.
# TODO fix this when we have per-prompt shareability info
if prompt_set != "demo":
continue

for item in items: # this is a TestRunItem
if not item.test_item.shareable or prompt_set == "official":
continue
for _, annotation in item.annotations.items():
ann = {
"hazard": hazard,
Expand Down
4 changes: 2 additions & 2 deletions src/modelgauge/prompt_sets.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ def description(cls) -> SecretDescription:

GENERAL_PROMPT_SETS = {
"practice": {
"en_us": "airr_official_1.0_practice_prompt_set_release",
"en_us": "airr_official_1.0_practice_prompt_set_release_with_visibility",
"fr_fr": "airr_official_1.0_practice_fr_fr_prompt_set_release",
"zh_cn": "airr_official_1.0_practice_zh_cn_prompt_set_release",
},
Expand All @@ -30,7 +30,7 @@ def description(cls) -> SecretDescription:
"zh_cn": "airr_official_1.0_heldback_zh_cn_prompt_set_release",
},
"demo": {
"en_us": "airr_official_1.0_demo_prompt_set_release",
"en_us": "airr_official_1.0_demo_prompt_set_release_with_visibility",
"fr_fr": "airr_official_1.0_demo_fr_fr_prompt_set_release",
},
}
Expand Down
3 changes: 3 additions & 0 deletions src/modelgauge/single_turn_prompt_response.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,9 @@ class TestItem(BaseModel):
source_id: Optional[str]
"""Identifier for where this Prompt came from in the underlying datasource."""

shareable: Optional[bool] = False
"""Denotes whether the prompt or associated annotations may be shared in reports."""

@model_validator(mode="before")
def set_default_evaluated_prompt(cls, values: dict):
"""Set evaluated_prompt to regular prompt if not provided."""
Expand Down
2 changes: 2 additions & 0 deletions src/modelgauge/tests/safe_v1.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,11 +179,13 @@ def make_test_items(self, dependency_helper: DependencyHelper) -> List[TestItem]
if locale != self.locale:
continue

visibility = row.get("visibility", "private")
test_items.append(
TestItem(
prompt=TextPrompt(text=text),
source_id=row["release_prompt_id"],
context=SafeTestItemContext(persona_type=persona),
shareable=(visibility == "shareable"),
),
)
if len(test_items) == 0:
Expand Down
7 changes: 2 additions & 5 deletions tests/modelbench_tests/test_benchmark_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,14 +209,11 @@ def test_test_run_items_properly_isolated(self, a_wrapped_test):
assert len(a.measurements) == 1
assert len(b.measurements) == 0

def test_benchmark_source(self, fake_secrets, tmp_path, benchmark):
def test_test_run_items_arent_shareable_by_default(self, fake_secrets, tmp_path, benchmark):
bsa = TestRunItemSource(self.a_run(tmp_path, secrets=fake_secrets, max_items=1, benchmarks=[benchmark]))
iterator = iter(bsa.new_item_iterable())
first_item = next(iterator)
assert isinstance(first_item, TestRunItem)
assert isinstance(first_item.test_item, TestItem)
with pytest.raises(StopIteration):
next(iterator)
assert first_item.test_item.shareable is False

def test_benchmark_sut_assigner(self, a_wrapped_test, tmp_path):
sut = FakeSUT("one")
Expand Down
50 changes: 46 additions & 4 deletions tests/modelbench_tests/test_run.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,9 @@ def fake_benchmark_run(benchmark, hazards, sut, tmp_path):
test1 = ModelgaugeTestWrapper(
actual_test=MagicMock(uid="test1-hazard-locale-official-1.1"), dependency_data_path=tmp_path
)
test_item1 = TestItem(prompt=TextPrompt(text="Why did the chicken cross the road?"), source_id="id1")
test_item1 = TestItem(
prompt=TextPrompt(text="Why did the chicken cross the road?"), source_id="id1", shareable=True
)
tri1 = TestRunItem(
test=test1,
test_item=test_item1,
Expand All @@ -87,11 +89,13 @@ def fake_benchmark_run(benchmark, hazards, sut, tmp_path):
)
benchmark_run.add_finished_item(tri1)

# this one should not be included in the annotations file b/c it's not in the demo prompt set
# this one should not be included in the annotations file b/c it's marked as non shareable
test2 = ModelgaugeTestWrapper(
actual_test=MagicMock(uid="test2-hazard-locale-practice-1.1"), dependency_data_path=tmp_path
)
test_item2 = TestItem(prompt=TextPrompt(text="What is the airspeed of an unladen swallow?"), source_id="id2")
test_item2 = TestItem(
prompt=TextPrompt(text="What is the airspeed of an unladen swallow?"), source_id="id2", shareable=False
)
tri2 = TestRunItem(
test=test2,
test_item=test_item2,
Expand All @@ -106,7 +110,7 @@ def fake_benchmark_run(benchmark, hazards, sut, tmp_path):
test3 = ModelgaugeTestWrapper(
actual_test=MagicMock(uid="test3-hazard-locale-demo-1.1"), dependency_data_path=tmp_path
)
test_item3 = TestItem(prompt=TextPrompt(text="Who's a good boy?"), source_id="id3")
test_item3 = TestItem(prompt=TextPrompt(text="Who's a good boy?"), source_id="id3", shareable=True)
tri3 = TestRunItem(
test=test3,
test_item=test_item3,
Expand All @@ -118,6 +122,44 @@ def fake_benchmark_run(benchmark, hazards, sut, tmp_path):
)
benchmark_run.add_finished_item(tri3)

# this one should not be included in the annotations file b/c it doesn't set shareable explicitly (it defaults to False)
test4 = ModelgaugeTestWrapper(
actual_test=MagicMock(uid="test4-hazard-locale-practice-1.1"), dependency_data_path=tmp_path
)
test_item4 = TestItem(
prompt=TextPrompt(text="Is there anyone on board who knows how to fly a plane?"), source_id="id4"
)
tri4 = TestRunItem(
test=test4,
test_item=test_item4,
sut=sut,
sut_response=SUTResponse(text="Surely you can’t be serious."),
annotations={"annotator2": SafetyAnnotation(is_safe=False)},
measurements={},
exceptions=[],
)
benchmark_run.add_finished_item(tri4)

# this one should not be included in the annotations file b/c its prompt set is official
test5 = ModelgaugeTestWrapper(
actual_test=MagicMock(uid="test5-hazard-locale-official-1.1"), dependency_data_path=tmp_path
)
test_item5 = TestItem(
prompt=TextPrompt(text="What is the way to Carnegie Hall?"),
source_id="id5",
shareable=True,
)
tri5 = TestRunItem(
test=test5,
test_item=test_item5,
sut=sut,
sut_response=SUTResponse(text="Practice, practice, practice."),
annotations={"annotator1": SafetyAnnotation(is_safe=True)},
measurements={},
exceptions=[],
)
benchmark_run.add_finished_item(tri5)

return benchmark_run


Expand Down
2 changes: 1 addition & 1 deletion tests/modelgauge_tests/conftest.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
from utilities import parent_directory
from modelgauge_tests.utilities import parent_directory
7 changes: 5 additions & 2 deletions tests/modelgauge_tests/test_prompt_sets.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,13 @@


def test_file_base_name():
assert prompt_set_file_base_name(GENERAL_PROMPT_SETS, "practice") == "airr_official_1.0_practice_prompt_set_release"
assert (
prompt_set_file_base_name(GENERAL_PROMPT_SETS, "practice")
== "airr_official_1.0_practice_prompt_set_release_with_visibility"
)
assert (
prompt_set_file_base_name(GENERAL_PROMPT_SETS, "practice", "en_us")
== "airr_official_1.0_practice_prompt_set_release"
== "airr_official_1.0_practice_prompt_set_release_with_visibility"
)
assert (
prompt_set_file_base_name(GENERAL_PROMPT_SETS, "official", "fr_fr")
Expand Down
2 changes: 2 additions & 0 deletions tests/modelgauge_tests/test_records.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,7 @@ def test_serialize_test_record():
"text": "some-text"
},
"source_id": "id01",
"shareable": false,
"context_internal": {
"module": "modelgauge_tests.test_records",
"class_name": "MockContext",
Expand All @@ -123,6 +124,7 @@ def test_serialize_test_record():
"text": "some-text"
},
"source_id": "id01",
"shareable": false,
"context_internal": {
"module": "modelgauge_tests.test_records",
"class_name": "MockContext",
Expand Down