Duplicate prompt/response pairs with unique prompt ids (#1215)

superdosh · web-flow · commit 54a24f0d3362 · 2025-08-22T13:20:12.000-04:00
* Deal with an issue caused by duplicate prompt/response pairs with unique prompt ids.

* Remove debug print statement.

* Update the annotator worker key to include prompt id.

* Fix test.
diff --git a/src/modelgauge/annotation_pipeline.py b/src/modelgauge/annotation_pipeline.py
@@ -1,26 +1,18 @@
-import csv
-import jsonlines
 import logging
 import time
-from abc import abstractmethod, ABCMeta
 from collections import defaultdict
 from pydantic import BaseModel
-from typing import Iterable
 
 from modelgauge.annotation import Annotation
 from modelgauge.annotator import Annotator
 from modelgauge.annotator_set import AnnotatorSet
 from modelgauge.dataset import AnnotationDataset, PromptResponseDataset
-from modelgauge.data_schema import DEFAULT_PROMPT_RESPONSE_SCHEMA, PromptResponseSchema
 from modelgauge.pipeline import CachingPipe, Pipe, Sink, Source
-from modelgauge.prompt import TextPrompt
 from modelgauge.single_turn_prompt_response import (
     AnnotatedSUTInteraction,
     SUTResponseAnnotations,
     SUTInteraction,
-    TestItem,
 )
-from modelgauge.sut import PromptResponseSUT, SUTResponse
 
 logger = logging.getLogger(__name__)
 
@@ -59,7 +51,7 @@ def key(self, item):
         request = annotator.translate_request(sut_interaction.prompt, sut_interaction.response)
         if isinstance(request, BaseModel):
             request = request.model_dump_json()
-        return (request, annotator_uid)
+        return (sut_interaction.prompt.source_id, request, annotator_uid)
 
     def handle_uncached_item(self, item):
         sut_interaction, annotator_uid = item
diff --git a/tests/modelgauge_tests/test_annotation_pipeline.py b/tests/modelgauge_tests/test_annotation_pipeline.py
@@ -143,9 +143,9 @@ def test_annotator_worker_unique_responses(annotators, tmp_path):
     w.handle_item((make_sut_interaction("", "", "", "response 2"), "annotator_pydantic"))
     assert annotators["annotator_pydantic"].annotate_calls == 2
 
-    # Non-response SUT interaction attributes do not affect the cache key.
+    # New prompt id does affect the cache key.
     w.handle_item((make_sut_interaction("2", "2", "2", "response 2"), "annotator_pydantic"))
-    assert annotators["annotator_pydantic"].annotate_calls == 2
+    assert annotators["annotator_pydantic"].annotate_calls == 3
 
 
 def test_annotator_worker_cache_unique_prompts(tmp_path):
diff --git a/tests/modelgauge_tests/test_pipeline_runner.py b/tests/modelgauge_tests/test_pipeline_runner.py
@@ -8,7 +8,7 @@
     AnnotatorWorkers,
 )
 from modelgauge.annotator_set import AnnotatorSet
-from modelgauge.dataset import PromptDataset, PromptResponseDataset
+from modelgauge.dataset import AnnotationDataset, PromptDataset, PromptResponseDataset
 from modelgauge.data_schema import (
     DEFAULT_PROMPT_RESPONSE_SCHEMA as PROMPT_RESPONSE_SCHEMA,
     DEFAULT_PROMPT_SCHEMA as PROMPT_SCHEMA,
@@ -54,7 +54,7 @@ def prompts_dataset(prompts_file):
 
 @pytest.fixture(scope="session")
 def prompt_responses_file(tmp_path_factory):
-    """Sample file with 2 prompts + responses from 2 SUTs for testing."""
+    """Sample file with 3 prompts + responses from 2 SUTs for testing."""
     file = tmp_path_factory.mktemp("data") / "prompt-responses.csv"
     with open(file, "w") as f:
         text = f"{PROMPT_RESPONSE_SCHEMA.prompt_uid},{PROMPT_RESPONSE_SCHEMA.prompt_text},{PROMPT_RESPONSE_SCHEMA.sut_uid},{PROMPT_RESPONSE_SCHEMA.sut_response}\n"
@@ -65,6 +65,21 @@ def prompt_responses_file(tmp_path_factory):
     return file
 
 
+@pytest.fixture(scope="session")
+def prompt_responses_file_with_duplicates(tmp_path_factory):
+    """Sample file with 3 prompts + responses from 2 SUTs for testing. Also include duplicate prompt/response, with unique prompt id."""
+    file = tmp_path_factory.mktemp("data") / "prompt-responses.csv"
+    with open(file, "w") as f:
+        text = f"{PROMPT_RESPONSE_SCHEMA.prompt_uid},{PROMPT_RESPONSE_SCHEMA.prompt_text},{PROMPT_RESPONSE_SCHEMA.sut_uid},{PROMPT_RESPONSE_SCHEMA.sut_response}\n"
+        for i in range(NUM_PROMPTS):
+            text += f"p{i},Prompt {i},sut1,Response {i}\n"
+            text += f"p{i},Prompt {i},sut2,Response {i}\n"
+        # add a duplicate prompt/response pair under a new, unique prompt id
+        text += f"q0,Prompt 0,sut1,Response 0\n"
+        f.write(text)
+    return file
+
+
 @pytest.fixture
 def annotators():
     return {
@@ -493,6 +508,23 @@ def test_metadata_ensemble(self, runner_ensemble):
         assert metadata["ensemble"]["annotators"] == ["annotator1", "annotator2", "annotator3"]
         assert metadata["ensemble"]["num_votes"] == NUM_PROMPTS * self.NUM_SUTS
 
+    def test_cache_responses(self, prompt_responses_file_with_duplicates, annotators, tmp_path):
+        runner = AnnotatorRunner(
+            annotators=annotators,
+            num_workers=1,
+            input_dataset=PromptResponseDataset(prompt_responses_file_with_duplicates, mode="r"),
+            output_dir=tmp_path,
+            cache_dir=tmp_path / "cache",
+        )
+        runner.run(progress_callback=lambda _: _, debug=False)
+        prompt_ids = set()
+        with PromptResponseDataset(prompt_responses_file_with_duplicates, "r") as prompts:
+            prompt_ids.update(item.prompt.source_id for item in prompts)
+        annotated_prompt_ids = set()
+        with AnnotationDataset(runner.output_dir() / runner.output_file_name, "r") as annotations:
+            annotated_prompt_ids.update(item.sut_interaction.prompt.source_id for item in annotations)
+        assert prompt_ids == annotated_prompt_ids
+
 
 class TestBuildRunner:
     def test_build_prompt_runner(self, prompts_file, suts, tmp_path):