From 70e816855f65df559a3bda2e2a0cf8ee27d8e9ee Mon Sep 17 00:00:00 2001 From: rogthefrog Date: Mon, 15 Dec 2025 19:49:16 -0500 Subject: [PATCH 1/4] fix path, ironically enough --- tests/modelgauge_tests/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/modelgauge_tests/conftest.py b/tests/modelgauge_tests/conftest.py index 66be554d5..9f465232a 100644 --- a/tests/modelgauge_tests/conftest.py +++ b/tests/modelgauge_tests/conftest.py @@ -1 +1 @@ -from utilities import parent_directory +from modelgauge_tests.utilities import parent_directory From a4891547a342db15be0545262d0144cd91bc4887 Mon Sep 17 00:00:00 2001 From: rogthefrog Date: Mon, 15 Dec 2025 19:49:34 -0500 Subject: [PATCH 2/4] add support for optional shareability field in prompt sets --- src/modelbench/benchmark_runner.py | 2 ++ src/modelgauge/prompt_sets.py | 4 ++-- src/modelgauge/single_turn_prompt_response.py | 3 +++ src/modelgauge/tests/safe_v1.py | 2 ++ tests/modelbench_tests/test_run.py | 12 ++++++++---- tests/modelgauge_tests/test_prompt_sets.py | 7 +++++-- tests/modelgauge_tests/test_records.py | 2 ++ 7 files changed, 24 insertions(+), 8 deletions(-) diff --git a/src/modelbench/benchmark_runner.py b/src/modelbench/benchmark_runner.py index c75107ce3..622031d64 100644 --- a/src/modelbench/benchmark_runner.py +++ b/src/modelbench/benchmark_runner.py @@ -249,6 +249,8 @@ def compile_annotations(self) -> list: if prompt_set != "demo": continue for item in items: # this is a TestRunItem + if not item.test_item.shareable: + continue for _, annotation in item.annotations.items(): ann = { "hazard": hazard, diff --git a/src/modelgauge/prompt_sets.py b/src/modelgauge/prompt_sets.py index dde506d7d..55ca93c25 100644 --- a/src/modelgauge/prompt_sets.py +++ b/src/modelgauge/prompt_sets.py @@ -20,7 +20,7 @@ def description(cls) -> SecretDescription: GENERAL_PROMPT_SETS = { "practice": { - "en_us": "airr_official_1.0_practice_prompt_set_release", + "en_us": "airr_official_1.0_practice_prompt_set_release_with_visibility", "fr_fr": "airr_official_1.0_practice_fr_fr_prompt_set_release", "zh_cn": "airr_official_1.0_practice_zh_cn_prompt_set_release", }, @@ -30,7 +30,7 @@ def description(cls) -> SecretDescription: "zh_cn": "airr_official_1.0_heldback_zh_cn_prompt_set_release", }, "demo": { - "en_us": "airr_official_1.0_demo_prompt_set_release", + "en_us": "airr_official_1.0_demo_prompt_set_release_with_visibility", "fr_fr": "airr_official_1.0_demo_fr_fr_prompt_set_release", }, } diff --git a/src/modelgauge/single_turn_prompt_response.py b/src/modelgauge/single_turn_prompt_response.py index 9cf98e830..3a4d335de 100644 --- a/src/modelgauge/single_turn_prompt_response.py +++ b/src/modelgauge/single_turn_prompt_response.py @@ -28,6 +28,9 @@ class TestItem(BaseModel): source_id: Optional[str] """Identifier for where this Prompt came from in the underlying datasource.""" + shareable: Optional[bool] = False + """Denotes whether the prompt or associated annotations may be shared in reports.""" + @model_validator(mode="before") def set_default_evaluated_prompt(cls, values: dict): """Set evaluated_prompt to regular prompt if not provided.""" diff --git a/src/modelgauge/tests/safe_v1.py b/src/modelgauge/tests/safe_v1.py index 0b49e334a..de4227dfb 100644 --- a/src/modelgauge/tests/safe_v1.py +++ b/src/modelgauge/tests/safe_v1.py @@ -179,11 +179,13 @@ def make_test_items(self, dependency_helper: DependencyHelper) -> List[TestItem] if locale != self.locale: continue + visibility = row.get("visibility", "private") test_items.append( TestItem( prompt=TextPrompt(text=text), source_id=row["release_prompt_id"], context=SafeTestItemContext(persona_type=persona), + shareable=(visibility == "shareable"), ), ) if len(test_items) == 0: diff --git a/tests/modelbench_tests/test_run.py b/tests/modelbench_tests/test_run.py index 4d3c77e33..6bea1c986 100644 --- a/tests/modelbench_tests/test_run.py +++ b/tests/modelbench_tests/test_run.py @@ -75,7 +75,9 @@ def fake_benchmark_run(benchmark, hazards, sut, tmp_path): test1 = ModelgaugeTestWrapper( actual_test=MagicMock(uid="test1-hazard-locale-official-1.1"), dependency_data_path=tmp_path ) - test_item1 = TestItem(prompt=TextPrompt(text="Why did the chicken cross the road?"), source_id="id1") + test_item1 = TestItem( + prompt=TextPrompt(text="Why did the chicken cross the road?"), source_id="id1", shareable=True + ) tri1 = TestRunItem( test=test1, test_item=test_item1, @@ -87,11 +89,13 @@ def fake_benchmark_run(benchmark, hazards, sut, tmp_path): ) benchmark_run.add_finished_item(tri1) - # this one should not be included in the annotations file b/c it's not in the demo prompt set + # this one should not be included in the annotations file b/c it's marked as non shareable test2 = ModelgaugeTestWrapper( actual_test=MagicMock(uid="test2-hazard-locale-practice-1.1"), dependency_data_path=tmp_path ) - test_item2 = TestItem(prompt=TextPrompt(text="What is the airspeed of an unladen swallow?"), source_id="id2") + test_item2 = TestItem( + prompt=TextPrompt(text="What is the airspeed of an unladen swallow?"), source_id="id2", shareable=False + ) tri2 = TestRunItem( test=test2, test_item=test_item2, @@ -106,7 +110,7 @@ def fake_benchmark_run(benchmark, hazards, sut, tmp_path): test3 = ModelgaugeTestWrapper( actual_test=MagicMock(uid="test3-hazard-locale-demo-1.1"), dependency_data_path=tmp_path ) - test_item3 = TestItem(prompt=TextPrompt(text="Who's a good boy?"), source_id="id3") + test_item3 = TestItem(prompt=TextPrompt(text="Who's a good boy?"), source_id="id3", shareable=True) tri3 = TestRunItem( test=test3, test_item=test_item3, diff --git a/tests/modelgauge_tests/test_prompt_sets.py b/tests/modelgauge_tests/test_prompt_sets.py index 5ca2d6de5..952172b1f 100644 --- a/tests/modelgauge_tests/test_prompt_sets.py +++ b/tests/modelgauge_tests/test_prompt_sets.py @@ -9,10 +9,13 @@ def test_file_base_name(): - assert prompt_set_file_base_name(GENERAL_PROMPT_SETS, "practice") == "airr_official_1.0_practice_prompt_set_release" + assert ( + prompt_set_file_base_name(GENERAL_PROMPT_SETS, "practice") + == "airr_official_1.0_practice_prompt_set_release_with_visibility" + ) assert ( prompt_set_file_base_name(GENERAL_PROMPT_SETS, "practice", "en_us") - == "airr_official_1.0_practice_prompt_set_release" + == "airr_official_1.0_practice_prompt_set_release_with_visibility" ) assert ( prompt_set_file_base_name(GENERAL_PROMPT_SETS, "official", "fr_fr") diff --git a/tests/modelgauge_tests/test_records.py b/tests/modelgauge_tests/test_records.py index ed2401836..61dce3d36 100644 --- a/tests/modelgauge_tests/test_records.py +++ b/tests/modelgauge_tests/test_records.py @@ -106,6 +106,7 @@ def test_serialize_test_record(): "text": "some-text" }, "source_id": "id01", + "shareable": false, "context_internal": { "module": "modelgauge_tests.test_records", "class_name": "MockContext", @@ -123,6 +124,7 @@ def test_serialize_test_record(): "text": "some-text" }, "source_id": "id01", + "shareable": false, "context_internal": { "module": "modelgauge_tests.test_records", "class_name": "MockContext", From 5bcdbd501a593cb4f994c887c87cb8788727724e Mon Sep 17 00:00:00 2001 From: rogthefrog Date: Tue, 16 Dec 2025 13:03:39 -0500 Subject: [PATCH 3/4] add test item without explicit shareability flag --- tests/modelbench_tests/test_run.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/tests/modelbench_tests/test_run.py b/tests/modelbench_tests/test_run.py index 6bea1c986..bb4a49cb0 100644 --- a/tests/modelbench_tests/test_run.py +++ b/tests/modelbench_tests/test_run.py @@ -122,6 +122,24 @@ def fake_benchmark_run(benchmark, hazards, sut, tmp_path): ) benchmark_run.add_finished_item(tri3) + # this one should not be included in the annotations file b/c it doesn't have availability set explicitly + test4 = ModelgaugeTestWrapper( + actual_test=MagicMock(uid="test2-hazard-locale-practice-1.1"), dependency_data_path=tmp_path + ) + test_item4 = TestItem( + prompt=TextPrompt(text="Is there anyone on board who knows how to fly a plane?"), source_id="id4" + ) + tri4 = TestRunItem( + test=test4, + test_item=test_item4, + sut=sut, + sut_response=SUTResponse(text="Surely you can’t be serious."), + annotations={"annotator2": SafetyAnnotation(is_safe=False)}, + measurements={}, + exceptions=[], + ) + benchmark_run.add_finished_item(tri4) + return benchmark_run From 2650811222fde4da90445bdd7a745f2c1f359f5c Mon Sep 17 00:00:00 2001 From: rogthefrog Date: Tue, 16 Dec 2025 14:16:01 -0500 Subject: [PATCH 4/4] more item shareability tests --- src/modelbench/benchmark_runner.py | 8 ++----- .../modelbench_tests/test_benchmark_runner.py | 7 ++---- tests/modelbench_tests/test_run.py | 22 ++++++++++++++++++- 3 files changed, 25 insertions(+), 12 deletions(-) diff --git a/src/modelbench/benchmark_runner.py b/src/modelbench/benchmark_runner.py index 622031d64..29bd46575 100644 --- a/src/modelbench/benchmark_runner.py +++ b/src/modelbench/benchmark_runner.py @@ -243,13 +243,9 @@ def compile_annotations(self) -> list: except: hazard = full_hazard_string prompt_set = "unknown" - # most prompts and annotations can't be shared. Until we have per-prompt shareability info, - # we only export annotations for items from the demo prompt. - # TODO fix this when we have per-prompt shareability info - if prompt_set != "demo": - continue + for item in items: # this is a TestRunItem - if not item.test_item.shareable: + if not item.test_item.shareable or prompt_set == "official": continue for _, annotation in item.annotations.items(): ann = { diff --git a/tests/modelbench_tests/test_benchmark_runner.py b/tests/modelbench_tests/test_benchmark_runner.py index ee4fec500..3b6f53049 100644 --- a/tests/modelbench_tests/test_benchmark_runner.py +++ b/tests/modelbench_tests/test_benchmark_runner.py @@ -209,14 +209,11 @@ def test_test_run_items_properly_isolated(self, a_wrapped_test): assert len(a.measurements) == 1 assert len(b.measurements) == 0 - def test_benchmark_source(self, fake_secrets, tmp_path, benchmark): + def test_test_run_items_arent_shareable_by_default(self, fake_secrets, tmp_path, benchmark): bsa = TestRunItemSource(self.a_run(tmp_path, secrets=fake_secrets, max_items=1, benchmarks=[benchmark])) iterator = iter(bsa.new_item_iterable()) first_item = next(iterator) - assert isinstance(first_item, TestRunItem) - assert isinstance(first_item.test_item, TestItem) - with pytest.raises(StopIteration): - next(iterator) + assert first_item.test_item.shareable is False def test_benchmark_sut_assigner(self, a_wrapped_test, tmp_path): sut = FakeSUT("one") diff --git a/tests/modelbench_tests/test_run.py b/tests/modelbench_tests/test_run.py index bb4a49cb0..5d5dce957 100644 --- a/tests/modelbench_tests/test_run.py +++ b/tests/modelbench_tests/test_run.py @@ -124,7 +124,7 @@ def fake_benchmark_run(benchmark, hazards, sut, tmp_path): # this one should not be included in the annotations file b/c it doesn't have availability set explicitly test4 = ModelgaugeTestWrapper( - actual_test=MagicMock(uid="test2-hazard-locale-practice-1.1"), dependency_data_path=tmp_path + actual_test=MagicMock(uid="test4-hazard-locale-practice-1.1"), dependency_data_path=tmp_path ) test_item4 = TestItem( prompt=TextPrompt(text="Is there anyone on board who knows how to fly a plane?"), source_id="id4" @@ -140,6 +140,26 @@ def fake_benchmark_run(benchmark, hazards, sut, tmp_path): ) benchmark_run.add_finished_item(tri4) + # this one should not be included in the annotations file b/c its prompt set is official + test5 = ModelgaugeTestWrapper( + actual_test=MagicMock(uid="test5-hazard-locale-official-1.1"), dependency_data_path=tmp_path + ) + test_item5 = TestItem( + prompt=TextPrompt(text="What is the way to Carnegie Hall?"), + source_id="id5", + shareable=True, + ) + tri5 = TestRunItem( + test=test5, + test_item=test_item5, + sut=sut, + sut_response=SUTResponse(text="Practice, practice, practice."), + annotations={"annotator1": SafetyAnnotation(is_safe=True)}, + measurements={}, + exceptions=[], + ) + benchmark_run.add_finished_item(tri5) + return benchmark_run