Skip to content

Commit a489154

Browse files
committed
add support for optional shareability field in prompt sets
1 parent 70e8168 commit a489154

7 files changed

Lines changed: 24 additions & 8 deletions

File tree

src/modelbench/benchmark_runner.py

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -249,6 +249,8 @@ def compile_annotations(self) -> list:
249249
if prompt_set != "demo":
250250
continue
251251
for item in items: # this is a TestRunItem
252+
if not item.test_item.shareable:
253+
continue
252254
for _, annotation in item.annotations.items():
253255
ann = {
254256
"hazard": hazard,

src/modelgauge/prompt_sets.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -20,7 +20,7 @@ def description(cls) -> SecretDescription:
2020

2121
GENERAL_PROMPT_SETS = {
2222
"practice": {
23-
"en_us": "airr_official_1.0_practice_prompt_set_release",
23+
"en_us": "airr_official_1.0_practice_prompt_set_release_with_visibility",
2424
"fr_fr": "airr_official_1.0_practice_fr_fr_prompt_set_release",
2525
"zh_cn": "airr_official_1.0_practice_zh_cn_prompt_set_release",
2626
},
@@ -30,7 +30,7 @@ def description(cls) -> SecretDescription:
3030
"zh_cn": "airr_official_1.0_heldback_zh_cn_prompt_set_release",
3131
},
3232
"demo": {
33-
"en_us": "airr_official_1.0_demo_prompt_set_release",
33+
"en_us": "airr_official_1.0_demo_prompt_set_release_with_visibility",
3434
"fr_fr": "airr_official_1.0_demo_fr_fr_prompt_set_release",
3535
},
3636
}

src/modelgauge/single_turn_prompt_response.py

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -28,6 +28,9 @@ class TestItem(BaseModel):
2828
source_id: Optional[str]
2929
"""Identifier for where this Prompt came from in the underlying datasource."""
3030

31+
shareable: Optional[bool] = False
32+
"""Denotes whether the prompt or associated annotations may be shared in reports."""
33+
3134
@model_validator(mode="before")
3235
def set_default_evaluated_prompt(cls, values: dict):
3336
"""Set evaluated_prompt to regular prompt if not provided."""

src/modelgauge/tests/safe_v1.py

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -179,11 +179,13 @@ def make_test_items(self, dependency_helper: DependencyHelper) -> List[TestItem]
179179
if locale != self.locale:
180180
continue
181181

182+
visibility = row.get("visibility", "private")
182183
test_items.append(
183184
TestItem(
184185
prompt=TextPrompt(text=text),
185186
source_id=row["release_prompt_id"],
186187
context=SafeTestItemContext(persona_type=persona),
188+
shareable=(visibility == "shareable"),
187189
),
188190
)
189191
if len(test_items) == 0:

tests/modelbench_tests/test_run.py

Lines changed: 8 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -75,7 +75,9 @@ def fake_benchmark_run(benchmark, hazards, sut, tmp_path):
7575
test1 = ModelgaugeTestWrapper(
7676
actual_test=MagicMock(uid="test1-hazard-locale-official-1.1"), dependency_data_path=tmp_path
7777
)
78-
test_item1 = TestItem(prompt=TextPrompt(text="Why did the chicken cross the road?"), source_id="id1")
78+
test_item1 = TestItem(
79+
prompt=TextPrompt(text="Why did the chicken cross the road?"), source_id="id1", shareable=True
80+
)
7981
tri1 = TestRunItem(
8082
test=test1,
8183
test_item=test_item1,
@@ -87,11 +89,13 @@ def fake_benchmark_run(benchmark, hazards, sut, tmp_path):
8789
)
8890
benchmark_run.add_finished_item(tri1)
8991

90-
# this one should not be included in the annotations file b/c it's not in the demo prompt set
92+
# this one should not be included in the annotations file b/c it's marked as non shareable
9193
test2 = ModelgaugeTestWrapper(
9294
actual_test=MagicMock(uid="test2-hazard-locale-practice-1.1"), dependency_data_path=tmp_path
9395
)
94-
test_item2 = TestItem(prompt=TextPrompt(text="What is the airspeed of an unladen swallow?"), source_id="id2")
96+
test_item2 = TestItem(
97+
prompt=TextPrompt(text="What is the airspeed of an unladen swallow?"), source_id="id2", shareable=False
98+
)
9599
tri2 = TestRunItem(
96100
test=test2,
97101
test_item=test_item2,
@@ -106,7 +110,7 @@ def fake_benchmark_run(benchmark, hazards, sut, tmp_path):
106110
test3 = ModelgaugeTestWrapper(
107111
actual_test=MagicMock(uid="test3-hazard-locale-demo-1.1"), dependency_data_path=tmp_path
108112
)
109-
test_item3 = TestItem(prompt=TextPrompt(text="Who's a good boy?"), source_id="id3")
113+
test_item3 = TestItem(prompt=TextPrompt(text="Who's a good boy?"), source_id="id3", shareable=True)
110114
tri3 = TestRunItem(
111115
test=test3,
112116
test_item=test_item3,

tests/modelgauge_tests/test_prompt_sets.py

Lines changed: 5 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -9,10 +9,13 @@
99

1010

1111
def test_file_base_name():
12-
assert prompt_set_file_base_name(GENERAL_PROMPT_SETS, "practice") == "airr_official_1.0_practice_prompt_set_release"
12+
assert (
13+
prompt_set_file_base_name(GENERAL_PROMPT_SETS, "practice")
14+
== "airr_official_1.0_practice_prompt_set_release_with_visibility"
15+
)
1316
assert (
1417
prompt_set_file_base_name(GENERAL_PROMPT_SETS, "practice", "en_us")
15-
== "airr_official_1.0_practice_prompt_set_release"
18+
== "airr_official_1.0_practice_prompt_set_release_with_visibility"
1619
)
1720
assert (
1821
prompt_set_file_base_name(GENERAL_PROMPT_SETS, "official", "fr_fr")

tests/modelgauge_tests/test_records.py

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -106,6 +106,7 @@ def test_serialize_test_record():
106106
"text": "some-text"
107107
},
108108
"source_id": "id01",
109+
"shareable": false,
109110
"context_internal": {
110111
"module": "modelgauge_tests.test_records",
111112
"class_name": "MockContext",
@@ -123,6 +124,7 @@ def test_serialize_test_record():
123124
"text": "some-text"
124125
},
125126
"source_id": "id01",
127+
"shareable": false,
126128
"context_internal": {
127129
"module": "modelgauge_tests.test_records",
128130
"class_name": "MockContext",

0 commit comments

Comments
 (0)