fix(rag): use doc_id dedup key for any provider, not only dify (#35759)

ki3nd · asukaminato0721 · autofix-ci[bot] · web-flow · commit 1f29565673b5 · 2026-05-05T06:42:18.000Z
Co-authored-by: Asuka Minato &lt;i@asukaminato.eu.org&gt;
Co-authored-by: autofix-ci[bot] &lt;114827586+autofix-ci[bot]@users.noreply.github.com&gt;
diff --git a/api/core/rag/datasource/retrieval_service.py b/api/core/rag/datasource/retrieval_service.py
@@ -217,10 +217,11 @@ def _deduplicate_documents(cls, documents: list[Document]) -> list[Document]:
         """Deduplicate documents in O(n) while preserving first-seen order.
 
         Rules:
-        - For provider == "dify" and metadata["doc_id"] exists: keep the doc with the highest
-          metadata["score"] among duplicates; if a later duplicate has no score, ignore it.
-        - For non-dify documents (or dify without doc_id): deduplicate by content key
-          (provider, page_content), keeping the first occurrence.
+        - If metadata["doc_id"] exists (any provider): deduplicate by (provider, doc_id) key;
+          keep the doc with the highest metadata["score"] among duplicates. If a later duplicate
+          has no score, ignore it.
+        - If metadata["doc_id"] is absent: deduplicate by content key (provider, page_content),
+          keeping the first occurrence.
         """
         if not documents:
             return documents
@@ -231,11 +232,10 @@ def _deduplicate_documents(cls, documents: list[Document]) -> list[Document]:
         order: list[tuple] = []
 
         for doc in documents:
-            is_dify = doc.provider == "dify"
-            doc_id = (doc.metadata or {}).get("doc_id") if is_dify else None
+            doc_id = (doc.metadata or {}).get("doc_id")
 
-            if is_dify and doc_id:
-                key = ("dify", doc_id)
+            if doc_id:
+                key = (doc.provider or "dify", doc_id)
                 if key not in chosen:
                     chosen[key] = doc
                     order.append(key)
diff --git a/api/tests/unit_tests/core/rag/retrieval/test_dataset_retrieval.py b/api/tests/unit_tests/core/rag/retrieval/test_dataset_retrieval.py
@@ -1106,11 +1106,11 @@ def test_deduplicate_documents_empty_list(self):
 
     def test_deduplicate_documents_non_dify_provider(self):
         """
-        Test deduplication with non-dify provider documents.
+        Test deduplication with non-dify provider documents that have no doc_id.
 
         Verifies:
-        - External provider documents use content-based deduplication
-        - Different providers are handled correctly
+        - External provider documents without doc_id use content-based deduplication
+        - Identical content from the same provider is collapsed to one result
         """
         # Arrange
         doc1 = Document(
@@ -1131,7 +1131,96 @@ def test_deduplicate_documents_non_dify_provider(self):
 
         # Assert
         # External documents without doc_id should use content-based dedup
-        assert len(result) >= 1
+        assert len(result) == 1
+
+    def test_deduplicate_documents_non_dify_provider_with_doc_id_different_sources(self):
+        """
+        Regression test for issue #35707.
+
+        Two chunks from different source documents share identical text content but carry
+        different doc_ids. Before the fix, non-dify providers were forced into content-based
+        deduplication and the second chunk was silently dropped. After the fix, doc_id is used
+        as the dedup key for any provider that exposes it, so both chunks must be retained.
+
+        Verifies:
+        - Non-dify provider documents with different doc_ids are NOT deduplicated even when
+          their page_content is identical.
+        """
+        # Arrange — same content, different doc_ids, non-dify provider (e.g. Weaviate / Qdrant)
+        doc_a = Document(
+            page_content="Shared identical content",
+            metadata={"doc_id": "doc-from-file-a", "score": 0.85},
+            provider="weaviate",
+        )
+        doc_b = Document(
+            page_content="Shared identical content",
+            metadata={"doc_id": "doc-from-file-b", "score": 0.82},
+            provider="weaviate",
+        )
+
+        # Act
+        result = RetrievalService._deduplicate_documents([doc_a, doc_b])
+
+        # Assert — both documents must be kept; losing either silently drops a source citation
+        assert len(result) == 2
+        doc_ids = {doc.metadata["doc_id"] for doc in result}
+        assert doc_ids == {"doc-from-file-a", "doc-from-file-b"}
+
+    def test_deduplicate_documents_non_dify_provider_with_same_doc_id(self):
+        """
+        Test that non-dify provider documents sharing the same doc_id are deduplicated by
+        doc_id key (not by content), and the higher-scored duplicate is retained.
+
+        Verifies:
+        - doc_id-based deduplication now applies to any provider, not only "dify"
+        - The document with the highest score wins when doc_ids collide
+        """
+        # Arrange
+        doc_low = Document(
+            page_content="Content A",
+            metadata={"doc_id": "chunk-1", "score": 0.5},
+            provider="qdrant",
+        )
+        doc_high = Document(
+            page_content="Content A",
+            metadata={"doc_id": "chunk-1", "score": 0.9},
+            provider="qdrant",
+        )
+
+        # Act
+        result = RetrievalService._deduplicate_documents([doc_low, doc_high])
+
+        # Assert
+        assert len(result) == 1
+        assert result[0].metadata["score"] == 0.9
+
+    def test_deduplicate_documents_dify_provider_without_doc_id_falls_back_to_content(self):
+        """
+        Test that a dify provider document without doc_id still falls back to content-based
+        deduplication (no regression from original behaviour).
+
+        Verifies:
+        - Absence of doc_id triggers content-based dedup regardless of provider
+        - First occurrence is kept when content is identical
+        """
+        # Arrange — dify docs with no doc_id, same content
+        doc1 = Document(
+            page_content="Same content",
+            metadata={"score": 0.8},
+            provider="dify",
+        )
+        doc2 = Document(
+            page_content="Same content",
+            metadata={"score": 0.9},
+            provider="dify",
+        )
+
+        # Act
+        result = RetrievalService._deduplicate_documents([doc1, doc2])
+
+        # Assert — collapsed to one; first-seen wins (no score comparison in content branch)
+        assert len(result) == 1
+        assert result[0].metadata["score"] == 0.8
 
     # ==================== Metadata Filtering Tests ====================