add support for optional shareability field in prompt sets

rogthefrog · rogthefrog · commit a4891547a342 · 2025-12-15T19:52:19.000-05:00
diff --git a/src/modelbench/benchmark_runner.py b/src/modelbench/benchmark_runner.py
@@ -249,6 +249,8 @@ def compile_annotations(self) -> list:
                 if prompt_set != "demo":
                     continue
                 for item in items:  # this is a TestRunItem
+                    if not item.test_item.shareable:
+                        continue
                     for _, annotation in item.annotations.items():
                         ann = {
                             "hazard": hazard,
diff --git a/src/modelgauge/prompt_sets.py b/src/modelgauge/prompt_sets.py
@@ -20,7 +20,7 @@ def description(cls) -> SecretDescription:
 
 GENERAL_PROMPT_SETS = {
     "practice": {
-        "en_us": "airr_official_1.0_practice_prompt_set_release",
+        "en_us": "airr_official_1.0_practice_prompt_set_release_with_visibility",
         "fr_fr": "airr_official_1.0_practice_fr_fr_prompt_set_release",
         "zh_cn": "airr_official_1.0_practice_zh_cn_prompt_set_release",
     },
@@ -30,7 +30,7 @@ def description(cls) -> SecretDescription:
         "zh_cn": "airr_official_1.0_heldback_zh_cn_prompt_set_release",
     },
     "demo": {
-        "en_us": "airr_official_1.0_demo_prompt_set_release",
+        "en_us": "airr_official_1.0_demo_prompt_set_release_with_visibility",
         "fr_fr": "airr_official_1.0_demo_fr_fr_prompt_set_release",
     },
 }
diff --git a/src/modelgauge/single_turn_prompt_response.py b/src/modelgauge/single_turn_prompt_response.py
@@ -28,6 +28,9 @@ class TestItem(BaseModel):
     source_id: Optional[str]
     """Identifier for where this Prompt came from in the underlying datasource."""
 
+    shareable: Optional[bool] = False
+    """Denotes whether the prompt or associated annotations may be shared in reports."""
+
     @model_validator(mode="before")
     def set_default_evaluated_prompt(cls, values: dict):
         """Set evaluated_prompt to regular prompt if not provided."""
diff --git a/src/modelgauge/tests/safe_v1.py b/src/modelgauge/tests/safe_v1.py
@@ -179,11 +179,13 @@ def make_test_items(self, dependency_helper: DependencyHelper) -> List[TestItem]
             if locale != self.locale:
                 continue
 
+            visibility = row.get("visibility", "private")
             test_items.append(
                 TestItem(
                     prompt=TextPrompt(text=text),
                     source_id=row["release_prompt_id"],
                     context=SafeTestItemContext(persona_type=persona),
+                    shareable=(visibility == "shareable"),
                 ),
             )
         if len(test_items) == 0:
diff --git a/tests/modelbench_tests/test_run.py b/tests/modelbench_tests/test_run.py
@@ -75,7 +75,9 @@ def fake_benchmark_run(benchmark, hazards, sut, tmp_path):
     test1 = ModelgaugeTestWrapper(
         actual_test=MagicMock(uid="test1-hazard-locale-official-1.1"), dependency_data_path=tmp_path
     )
-    test_item1 = TestItem(prompt=TextPrompt(text="Why did the chicken cross the road?"), source_id="id1")
+    test_item1 = TestItem(
+        prompt=TextPrompt(text="Why did the chicken cross the road?"), source_id="id1", shareable=True
+    )
     tri1 = TestRunItem(
         test=test1,
         test_item=test_item1,
@@ -87,11 +89,13 @@ def fake_benchmark_run(benchmark, hazards, sut, tmp_path):
     )
     benchmark_run.add_finished_item(tri1)
 
-    # this one should not be included in the annotations file b/c it's not in the demo prompt set
+    # this one should not be included in the annotations file b/c it's marked as non-shareable
     test2 = ModelgaugeTestWrapper(
         actual_test=MagicMock(uid="test2-hazard-locale-practice-1.1"), dependency_data_path=tmp_path
     )
-    test_item2 = TestItem(prompt=TextPrompt(text="What is the airspeed of an unladen swallow?"), source_id="id2")
+    test_item2 = TestItem(
+        prompt=TextPrompt(text="What is the airspeed of an unladen swallow?"), source_id="id2", shareable=False
+    )
     tri2 = TestRunItem(
         test=test2,
         test_item=test_item2,
@@ -106,7 +110,7 @@ def fake_benchmark_run(benchmark, hazards, sut, tmp_path):
     test3 = ModelgaugeTestWrapper(
         actual_test=MagicMock(uid="test3-hazard-locale-demo-1.1"), dependency_data_path=tmp_path
     )
-    test_item3 = TestItem(prompt=TextPrompt(text="Who's a good boy?"), source_id="id3")
+    test_item3 = TestItem(prompt=TextPrompt(text="Who's a good boy?"), source_id="id3", shareable=True)
     tri3 = TestRunItem(
         test=test3,
         test_item=test_item3,
diff --git a/tests/modelgauge_tests/test_prompt_sets.py b/tests/modelgauge_tests/test_prompt_sets.py
@@ -9,10 +9,13 @@
 
 
 def test_file_base_name():
-    assert prompt_set_file_base_name(GENERAL_PROMPT_SETS, "practice") == "airr_official_1.0_practice_prompt_set_release"
+    assert (
+        prompt_set_file_base_name(GENERAL_PROMPT_SETS, "practice")
+        == "airr_official_1.0_practice_prompt_set_release_with_visibility"
+    )
     assert (
         prompt_set_file_base_name(GENERAL_PROMPT_SETS, "practice", "en_us")
-        == "airr_official_1.0_practice_prompt_set_release"
+        == "airr_official_1.0_practice_prompt_set_release_with_visibility"
     )
     assert (
         prompt_set_file_base_name(GENERAL_PROMPT_SETS, "official", "fr_fr")
diff --git a/tests/modelgauge_tests/test_records.py b/tests/modelgauge_tests/test_records.py
@@ -106,6 +106,7 @@ def test_serialize_test_record():
           "text": "some-text"
         },
         "source_id": "id01",
+        "shareable": false,
         "context_internal": {
           "module": "modelgauge_tests.test_records",
           "class_name": "MockContext",
@@ -123,6 +124,7 @@ def test_serialize_test_record():
             "text": "some-text"
           },
           "source_id": "id01",
+          "shareable": false,
           "context_internal": {
             "module": "modelgauge_tests.test_records",
             "class_name": "MockContext",