mlcommons · rogthefrog · Dec 16, 2025 · Dec 16, 2025 · Dec 16, 2025 · Dec 16, 2025
@@ -243,12 +243,10 @@ def compile_annotations(self) -> list:
                 except:
                     hazard = full_hazard_string
                     prompt_set = "unknown"
-                # most prompts and annotations can't be shared. Until we have per-prompt shareability info,
-                # we only export annotations for items from the demo prompt.
-                # TODO fix this when we have per-prompt shareability info
-                if prompt_set != "demo":
-                    continue
+
                 for item in items:  # this is a TestRunItem
+                    if not item.test_item.shareable or prompt_set == "official":
+                        continue
                     for _, annotation in item.annotations.items():
                         ann = {
                             "hazard": hazard,

@@ -20,7 +20,7 @@ def description(cls) -> SecretDescription:
 
 GENERAL_PROMPT_SETS = {
     "practice": {
-        "en_us": "airr_official_1.0_practice_prompt_set_release",
+        "en_us": "airr_official_1.0_practice_prompt_set_release_with_visibility",
         "fr_fr": "airr_official_1.0_practice_fr_fr_prompt_set_release",
         "zh_cn": "airr_official_1.0_practice_zh_cn_prompt_set_release",
     },
@@ -30,7 +30,7 @@ def description(cls) -> SecretDescription:
         "zh_cn": "airr_official_1.0_heldback_zh_cn_prompt_set_release",
     },
     "demo": {
-        "en_us": "airr_official_1.0_demo_prompt_set_release",
+        "en_us": "airr_official_1.0_demo_prompt_set_release_with_visibility",
         "fr_fr": "airr_official_1.0_demo_fr_fr_prompt_set_release",
     },
 }

@@ -28,6 +28,9 @@ class TestItem(BaseModel):
     source_id: Optional[str]
     """Identifier for where this Prompt came from in the underlying datasource."""
 
+    shareable: Optional[bool] = False
+    """Denotes whether the prompt or associated annotations may be shared in reports."""
+
     @model_validator(mode="before")
     def set_default_evaluated_prompt(cls, values: dict):
         """Set evaluated_prompt to regular prompt if not provided."""

@@ -179,11 +179,13 @@ def make_test_items(self, dependency_helper: DependencyHelper) -> List[TestItem]
             if locale != self.locale:
                 continue
 
+            visibility = row.get("visibility", "private")
             test_items.append(
                 TestItem(
                     prompt=TextPrompt(text=text),
                     source_id=row["release_prompt_id"],
                     context=SafeTestItemContext(persona_type=persona),
+                    shareable=(visibility == "shareable"),
                 ),
             )
         if len(test_items) == 0:

@@ -209,14 +209,11 @@ def test_test_run_items_properly_isolated(self, a_wrapped_test):
         assert len(a.measurements) == 1
         assert len(b.measurements) == 0
 
-    def test_benchmark_source(self, fake_secrets, tmp_path, benchmark):
+    def test_test_run_items_arent_shareable_by_default(self, fake_secrets, tmp_path, benchmark):
         bsa = TestRunItemSource(self.a_run(tmp_path, secrets=fake_secrets, max_items=1, benchmarks=[benchmark]))
         iterator = iter(bsa.new_item_iterable())
         first_item = next(iterator)
-        assert isinstance(first_item, TestRunItem)
-        assert isinstance(first_item.test_item, TestItem)
-        with pytest.raises(StopIteration):
-            next(iterator)
+        assert first_item.test_item.shareable is False
 
     def test_benchmark_sut_assigner(self, a_wrapped_test, tmp_path):
         sut = FakeSUT("one")

@@ -75,7 +75,9 @@ def fake_benchmark_run(benchmark, hazards, sut, tmp_path):
     test1 = ModelgaugeTestWrapper(
         actual_test=MagicMock(uid="test1-hazard-locale-official-1.1"), dependency_data_path=tmp_path
     )
-    test_item1 = TestItem(prompt=TextPrompt(text="Why did the chicken cross the road?"), source_id="id1")
+    test_item1 = TestItem(
+        prompt=TextPrompt(text="Why did the chicken cross the road?"), source_id="id1", shareable=True
+    )
     tri1 = TestRunItem(
         test=test1,
         test_item=test_item1,
@@ -87,11 +89,13 @@ def fake_benchmark_run(benchmark, hazards, sut, tmp_path):
     )
     benchmark_run.add_finished_item(tri1)
 
-    # this one should not be included in the annotations file b/c it's not in the demo prompt set
+    # this one should not be included in the annotations file b/c it's marked as non-shareable
     test2 = ModelgaugeTestWrapper(
         actual_test=MagicMock(uid="test2-hazard-locale-practice-1.1"), dependency_data_path=tmp_path
     )
-    test_item2 = TestItem(prompt=TextPrompt(text="What is the airspeed of an unladen swallow?"), source_id="id2")
+    test_item2 = TestItem(
+        prompt=TextPrompt(text="What is the airspeed of an unladen swallow?"), source_id="id2", shareable=False
+    )
     tri2 = TestRunItem(
         test=test2,
         test_item=test_item2,
@@ -106,7 +110,7 @@ def fake_benchmark_run(benchmark, hazards, sut, tmp_path):
     test3 = ModelgaugeTestWrapper(
         actual_test=MagicMock(uid="test3-hazard-locale-demo-1.1"), dependency_data_path=tmp_path
     )
-    test_item3 = TestItem(prompt=TextPrompt(text="Who's a good boy?"), source_id="id3")
+    test_item3 = TestItem(prompt=TextPrompt(text="Who's a good boy?"), source_id="id3", shareable=True)
     tri3 = TestRunItem(
         test=test3,
         test_item=test_item3,
@@ -118,6 +122,44 @@ def fake_benchmark_run(benchmark, hazards, sut, tmp_path):
     )
     benchmark_run.add_finished_item(tri3)
 
+    # this one should not be included in the annotations file b/c `shareable` isn't set explicitly (it defaults to False)
+    test4 = ModelgaugeTestWrapper(
+        actual_test=MagicMock(uid="test4-hazard-locale-practice-1.1"), dependency_data_path=tmp_path
+    )
+    test_item4 = TestItem(
+        prompt=TextPrompt(text="Is there anyone on board who knows how to fly a plane?"), source_id="id4"
+    )
+    tri4 = TestRunItem(
+        test=test4,
+        test_item=test_item4,
+        sut=sut,
+        sut_response=SUTResponse(text="Surely you can’t be serious."),
+        annotations={"annotator2": SafetyAnnotation(is_safe=False)},
+        measurements={},
+        exceptions=[],
+    )
+    benchmark_run.add_finished_item(tri4)
+
+    # this one should not be included in the annotations file b/c its prompt set is official
+    test5 = ModelgaugeTestWrapper(
+        actual_test=MagicMock(uid="test5-hazard-locale-official-1.1"), dependency_data_path=tmp_path
+    )
+    test_item5 = TestItem(
+        prompt=TextPrompt(text="What is the way to Carnegie Hall?"),
+        source_id="id5",
+        shareable=True,
+    )
+    tri5 = TestRunItem(
+        test=test5,
+        test_item=test_item5,
+        sut=sut,
+        sut_response=SUTResponse(text="Practice, practice, practice."),
+        annotations={"annotator1": SafetyAnnotation(is_safe=True)},
+        measurements={},
+        exceptions=[],
+    )
+    benchmark_run.add_finished_item(tri5)
+
     return benchmark_run
 
 

@@ -1 +1 @@
-from utilities import parent_directory
+from modelgauge_tests.utilities import parent_directory
@@ -9,10 +9,13 @@
 
 
 def test_file_base_name():
-    assert prompt_set_file_base_name(GENERAL_PROMPT_SETS, "practice") == "airr_official_1.0_practice_prompt_set_release"
+    assert (
+        prompt_set_file_base_name(GENERAL_PROMPT_SETS, "practice")
+        == "airr_official_1.0_practice_prompt_set_release_with_visibility"
+    )
     assert (
         prompt_set_file_base_name(GENERAL_PROMPT_SETS, "practice", "en_us")
-        == "airr_official_1.0_practice_prompt_set_release"
+        == "airr_official_1.0_practice_prompt_set_release_with_visibility"
     )
     assert (
         prompt_set_file_base_name(GENERAL_PROMPT_SETS, "official", "fr_fr")

@@ -106,6 +106,7 @@ def test_serialize_test_record():
           "text": "some-text"
         },
         "source_id": "id01",
+        "shareable": false,
         "context_internal": {
           "module": "modelgauge_tests.test_records",
           "class_name": "MockContext",
@@ -123,6 +124,7 @@ def test_serialize_test_record():
             "text": "some-text"
           },
           "source_id": "id01",
+          "shareable": false,
           "context_internal": {
             "module": "modelgauge_tests.test_records",
             "class_name": "MockContext",
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		from utilities import parent_directory
		from modelgauge_tests.utilities import parent_directory