From 70e816855f65df559a3bda2e2a0cf8ee27d8e9ee Mon Sep 17 00:00:00 2001
From: rogthefrog <roger@mlcommons.org>
Date: Mon, 15 Dec 2025 19:49:16 -0500
Subject: [PATCH 1/4] fix path, ironically enough

---
 tests/modelgauge_tests/conftest.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/modelgauge_tests/conftest.py b/tests/modelgauge_tests/conftest.py
index 66be554d5..9f465232a 100644
--- a/tests/modelgauge_tests/conftest.py
+++ b/tests/modelgauge_tests/conftest.py
@@ -1 +1 @@
-from utilities import parent_directory
+from modelgauge_tests.utilities import parent_directory

From a4891547a342db15be0545262d0144cd91bc4887 Mon Sep 17 00:00:00 2001
From: rogthefrog <roger@mlcommons.org>
Date: Mon, 15 Dec 2025 19:49:34 -0500
Subject: [PATCH 2/4] add support for optional shareability field in prompt
 sets

---
 src/modelbench/benchmark_runner.py            |  2 ++
 src/modelgauge/prompt_sets.py                 |  4 ++--
 src/modelgauge/single_turn_prompt_response.py |  3 +++
 src/modelgauge/tests/safe_v1.py               |  2 ++
 tests/modelbench_tests/test_run.py            | 12 ++++++++----
 tests/modelgauge_tests/test_prompt_sets.py    |  7 +++++--
 tests/modelgauge_tests/test_records.py        |  2 ++
 7 files changed, 24 insertions(+), 8 deletions(-)

diff --git a/src/modelbench/benchmark_runner.py b/src/modelbench/benchmark_runner.py
index c75107ce3..622031d64 100644
--- a/src/modelbench/benchmark_runner.py
+++ b/src/modelbench/benchmark_runner.py
@@ -249,6 +249,8 @@ def compile_annotations(self) -> list:
                 if prompt_set != "demo":
                     continue
                 for item in items:  # this is a TestRunItem
+                    if not item.test_item.shareable:
+                        continue
                     for _, annotation in item.annotations.items():
                         ann = {
                             "hazard": hazard,
diff --git a/src/modelgauge/prompt_sets.py b/src/modelgauge/prompt_sets.py
index dde506d7d..55ca93c25 100644
--- a/src/modelgauge/prompt_sets.py
+++ b/src/modelgauge/prompt_sets.py
@@ -20,7 +20,7 @@ def description(cls) -> SecretDescription:
 
 GENERAL_PROMPT_SETS = {
     "practice": {
-        "en_us": "airr_official_1.0_practice_prompt_set_release",
+        "en_us": "airr_official_1.0_practice_prompt_set_release_with_visibility",
         "fr_fr": "airr_official_1.0_practice_fr_fr_prompt_set_release",
         "zh_cn": "airr_official_1.0_practice_zh_cn_prompt_set_release",
     },
@@ -30,7 +30,7 @@ def description(cls) -> SecretDescription:
         "zh_cn": "airr_official_1.0_heldback_zh_cn_prompt_set_release",
     },
     "demo": {
-        "en_us": "airr_official_1.0_demo_prompt_set_release",
+        "en_us": "airr_official_1.0_demo_prompt_set_release_with_visibility",
         "fr_fr": "airr_official_1.0_demo_fr_fr_prompt_set_release",
     },
 }
diff --git a/src/modelgauge/single_turn_prompt_response.py b/src/modelgauge/single_turn_prompt_response.py
index 9cf98e830..3a4d335de 100644
--- a/src/modelgauge/single_turn_prompt_response.py
+++ b/src/modelgauge/single_turn_prompt_response.py
@@ -28,6 +28,9 @@ class TestItem(BaseModel):
     source_id: Optional[str]
     """Identifier for where this Prompt came from in the underlying datasource."""
 
+    shareable: Optional[bool] = False
+    """Denotes whether the prompt or associated annotations may be shared in reports."""
+
     @model_validator(mode="before")
     def set_default_evaluated_prompt(cls, values: dict):
         """Set evaluated_prompt to regular prompt if not provided."""
diff --git a/src/modelgauge/tests/safe_v1.py b/src/modelgauge/tests/safe_v1.py
index 0b49e334a..de4227dfb 100644
--- a/src/modelgauge/tests/safe_v1.py
+++ b/src/modelgauge/tests/safe_v1.py
@@ -179,11 +179,13 @@ def make_test_items(self, dependency_helper: DependencyHelper) -> List[TestItem]
             if locale != self.locale:
                 continue
 
+            visibility = row.get("visibility", "private")
             test_items.append(
                 TestItem(
                     prompt=TextPrompt(text=text),
                     source_id=row["release_prompt_id"],
                     context=SafeTestItemContext(persona_type=persona),
+                    shareable=(visibility == "shareable"),
                 ),
             )
         if len(test_items) == 0:
diff --git a/tests/modelbench_tests/test_run.py b/tests/modelbench_tests/test_run.py
index 4d3c77e33..6bea1c986 100644
--- a/tests/modelbench_tests/test_run.py
+++ b/tests/modelbench_tests/test_run.py
@@ -75,7 +75,9 @@ def fake_benchmark_run(benchmark, hazards, sut, tmp_path):
     test1 = ModelgaugeTestWrapper(
         actual_test=MagicMock(uid="test1-hazard-locale-official-1.1"), dependency_data_path=tmp_path
     )
-    test_item1 = TestItem(prompt=TextPrompt(text="Why did the chicken cross the road?"), source_id="id1")
+    test_item1 = TestItem(
+        prompt=TextPrompt(text="Why did the chicken cross the road?"), source_id="id1", shareable=True
+    )
     tri1 = TestRunItem(
         test=test1,
         test_item=test_item1,
@@ -87,11 +89,13 @@ def fake_benchmark_run(benchmark, hazards, sut, tmp_path):
     )
     benchmark_run.add_finished_item(tri1)
 
-    # this one should not be included in the annotations file b/c it's not in the demo prompt set
+    # this one should not be included in the annotations file b/c it's marked as non-shareable
     test2 = ModelgaugeTestWrapper(
         actual_test=MagicMock(uid="test2-hazard-locale-practice-1.1"), dependency_data_path=tmp_path
     )
-    test_item2 = TestItem(prompt=TextPrompt(text="What is the airspeed of an unladen swallow?"), source_id="id2")
+    test_item2 = TestItem(
+        prompt=TextPrompt(text="What is the airspeed of an unladen swallow?"), source_id="id2", shareable=False
+    )
     tri2 = TestRunItem(
         test=test2,
         test_item=test_item2,
@@ -106,7 +110,7 @@ def fake_benchmark_run(benchmark, hazards, sut, tmp_path):
     test3 = ModelgaugeTestWrapper(
         actual_test=MagicMock(uid="test3-hazard-locale-demo-1.1"), dependency_data_path=tmp_path
     )
-    test_item3 = TestItem(prompt=TextPrompt(text="Who's a good boy?"), source_id="id3")
+    test_item3 = TestItem(prompt=TextPrompt(text="Who's a good boy?"), source_id="id3", shareable=True)
     tri3 = TestRunItem(
         test=test3,
         test_item=test_item3,
diff --git a/tests/modelgauge_tests/test_prompt_sets.py b/tests/modelgauge_tests/test_prompt_sets.py
index 5ca2d6de5..952172b1f 100644
--- a/tests/modelgauge_tests/test_prompt_sets.py
+++ b/tests/modelgauge_tests/test_prompt_sets.py
@@ -9,10 +9,13 @@
 
 
 def test_file_base_name():
-    assert prompt_set_file_base_name(GENERAL_PROMPT_SETS, "practice") == "airr_official_1.0_practice_prompt_set_release"
+    assert (
+        prompt_set_file_base_name(GENERAL_PROMPT_SETS, "practice")
+        == "airr_official_1.0_practice_prompt_set_release_with_visibility"
+    )
     assert (
         prompt_set_file_base_name(GENERAL_PROMPT_SETS, "practice", "en_us")
-        == "airr_official_1.0_practice_prompt_set_release"
+        == "airr_official_1.0_practice_prompt_set_release_with_visibility"
     )
     assert (
         prompt_set_file_base_name(GENERAL_PROMPT_SETS, "official", "fr_fr")
diff --git a/tests/modelgauge_tests/test_records.py b/tests/modelgauge_tests/test_records.py
index ed2401836..61dce3d36 100644
--- a/tests/modelgauge_tests/test_records.py
+++ b/tests/modelgauge_tests/test_records.py
@@ -106,6 +106,7 @@ def test_serialize_test_record():
           "text": "some-text"
         },
         "source_id": "id01",
+        "shareable": false,
         "context_internal": {
           "module": "modelgauge_tests.test_records",
           "class_name": "MockContext",
@@ -123,6 +124,7 @@ def test_serialize_test_record():
             "text": "some-text"
           },
           "source_id": "id01",
+          "shareable": false,
           "context_internal": {
             "module": "modelgauge_tests.test_records",
             "class_name": "MockContext",

From 5bcdbd501a593cb4f994c887c87cb8788727724e Mon Sep 17 00:00:00 2001
From: rogthefrog <roger@mlcommons.org>
Date: Tue, 16 Dec 2025 13:03:39 -0500
Subject: [PATCH 3/4] add test item without explicit shareability flag

---
 tests/modelbench_tests/test_run.py | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/tests/modelbench_tests/test_run.py b/tests/modelbench_tests/test_run.py
index 6bea1c986..bb4a49cb0 100644
--- a/tests/modelbench_tests/test_run.py
+++ b/tests/modelbench_tests/test_run.py
@@ -122,6 +122,24 @@ def fake_benchmark_run(benchmark, hazards, sut, tmp_path):
     )
     benchmark_run.add_finished_item(tri3)
 
+    # this one should not be included in the annotations file b/c it doesn't have availability set explicitly
+    test4 = ModelgaugeTestWrapper(
+        actual_test=MagicMock(uid="test2-hazard-locale-practice-1.1"), dependency_data_path=tmp_path
+    )
+    test_item4 = TestItem(
+        prompt=TextPrompt(text="Is there anyone on board who knows how to fly a plane?"), source_id="id4"
+    )
+    tri4 = TestRunItem(
+        test=test4,
+        test_item=test_item4,
+        sut=sut,
+        sut_response=SUTResponse(text="Surely you can’t be serious."),
+        annotations={"annotator2": SafetyAnnotation(is_safe=False)},
+        measurements={},
+        exceptions=[],
+    )
+    benchmark_run.add_finished_item(tri4)
+
     return benchmark_run
 
 

From 2650811222fde4da90445bdd7a745f2c1f359f5c Mon Sep 17 00:00:00 2001
From: rogthefrog <roger@mlcommons.org>
Date: Tue, 16 Dec 2025 14:16:01 -0500
Subject: [PATCH 4/4] drop demo-only annotation filter, add shareability tests

---
 src/modelbench/benchmark_runner.py            |  8 ++-----
 .../modelbench_tests/test_benchmark_runner.py |  7 ++----
 tests/modelbench_tests/test_run.py            | 22 ++++++++++++++++++-
 3 files changed, 25 insertions(+), 12 deletions(-)

diff --git a/src/modelbench/benchmark_runner.py b/src/modelbench/benchmark_runner.py
index 622031d64..29bd46575 100644
--- a/src/modelbench/benchmark_runner.py
+++ b/src/modelbench/benchmark_runner.py
@@ -243,13 +243,9 @@ def compile_annotations(self) -> list:
                 except:
                     hazard = full_hazard_string
                     prompt_set = "unknown"
-                # most prompts and annotations can't be shared. Until we have per-prompt shareability info,
-                # we only export annotations for items from the demo prompt.
-                # TODO fix this when we have per-prompt shareability info
-                if prompt_set != "demo":
-                    continue
+
                 for item in items:  # this is a TestRunItem
-                    if not item.test_item.shareable:
+                    if not item.test_item.shareable or prompt_set == "official":
                         continue
                     for _, annotation in item.annotations.items():
                         ann = {
diff --git a/tests/modelbench_tests/test_benchmark_runner.py b/tests/modelbench_tests/test_benchmark_runner.py
index ee4fec500..3b6f53049 100644
--- a/tests/modelbench_tests/test_benchmark_runner.py
+++ b/tests/modelbench_tests/test_benchmark_runner.py
@@ -209,14 +209,11 @@ def test_test_run_items_properly_isolated(self, a_wrapped_test):
         assert len(a.measurements) == 1
         assert len(b.measurements) == 0
 
-    def test_benchmark_source(self, fake_secrets, tmp_path, benchmark):
+    def test_test_run_items_arent_shareable_by_default(self, fake_secrets, tmp_path, benchmark):
         bsa = TestRunItemSource(self.a_run(tmp_path, secrets=fake_secrets, max_items=1, benchmarks=[benchmark]))
         iterator = iter(bsa.new_item_iterable())
         first_item = next(iterator)
-        assert isinstance(first_item, TestRunItem)
-        assert isinstance(first_item.test_item, TestItem)
-        with pytest.raises(StopIteration):
-            next(iterator)
+        assert first_item.test_item.shareable is False
 
     def test_benchmark_sut_assigner(self, a_wrapped_test, tmp_path):
         sut = FakeSUT("one")
diff --git a/tests/modelbench_tests/test_run.py b/tests/modelbench_tests/test_run.py
index bb4a49cb0..5d5dce957 100644
--- a/tests/modelbench_tests/test_run.py
+++ b/tests/modelbench_tests/test_run.py
@@ -124,7 +124,7 @@ def fake_benchmark_run(benchmark, hazards, sut, tmp_path):
 
     # this one should not be included in the annotations file b/c it doesn't have availability set explicitly
     test4 = ModelgaugeTestWrapper(
-        actual_test=MagicMock(uid="test2-hazard-locale-practice-1.1"), dependency_data_path=tmp_path
+        actual_test=MagicMock(uid="test4-hazard-locale-practice-1.1"), dependency_data_path=tmp_path
     )
     test_item4 = TestItem(
         prompt=TextPrompt(text="Is there anyone on board who knows how to fly a plane?"), source_id="id4"
@@ -140,6 +140,26 @@ def fake_benchmark_run(benchmark, hazards, sut, tmp_path):
     )
     benchmark_run.add_finished_item(tri4)
 
+    # this one should not be included in the annotations file b/c its prompt set is official
+    test5 = ModelgaugeTestWrapper(
+        actual_test=MagicMock(uid="test5-hazard-locale-official-1.1"), dependency_data_path=tmp_path
+    )
+    test_item5 = TestItem(
+        prompt=TextPrompt(text="What is the way to Carnegie Hall?"),
+        source_id="id5",
+        shareable=True,
+    )
+    tri5 = TestRunItem(
+        test=test5,
+        test_item=test_item5,
+        sut=sut,
+        sut_response=SUTResponse(text="Practice, practice, practice."),
+        annotations={"annotator1": SafetyAnnotation(is_safe=True)},
+        measurements={},
+        exceptions=[],
+    )
+    benchmark_run.add_finished_item(tri5)
+
     return benchmark_run