Skip to content

Commit 1f29565

Browse files
ki3nd, asukaminato0721, and autofix-ci[bot]
authored
fix(rag): use doc_id dedup key for any provider, not only dify (#35759)
Co-authored-by: Asuka Minato <i@asukaminato.eu.org>
Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
1 parent 90fe54c commit 1f29565

2 files changed

Lines changed: 101 additions & 12 deletions

File tree

api/core/rag/datasource/retrieval_service.py

Lines changed: 8 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -217,10 +217,11 @@ def _deduplicate_documents(cls, documents: list[Document]) -> list[Document]:
217217
"""Deduplicate documents in O(n) while preserving first-seen order.
218218
219219
Rules:
220-
- For provider == "dify" and metadata["doc_id"] exists: keep the doc with the highest
221-
metadata["score"] among duplicates; if a later duplicate has no score, ignore it.
222-
- For non-dify documents (or dify without doc_id): deduplicate by content key
223-
(provider, page_content), keeping the first occurrence.
220+
- If metadata["doc_id"] exists (any provider): deduplicate by (provider, doc_id) key;
221+
keep the doc with the highest metadata["score"] among duplicates. If a later duplicate
222+
has no score, ignore it.
223+
- If metadata["doc_id"] is absent: deduplicate by content key (provider, page_content),
224+
keeping the first occurrence.
224225
"""
225226
if not documents:
226227
return documents
@@ -231,11 +232,10 @@ def _deduplicate_documents(cls, documents: list[Document]) -> list[Document]:
231232
order: list[tuple] = []
232233

233234
for doc in documents:
234-
is_dify = doc.provider == "dify"
235-
doc_id = (doc.metadata or {}).get("doc_id") if is_dify else None
235+
doc_id = (doc.metadata or {}).get("doc_id")
236236

237-
if is_dify and doc_id:
238-
key = ("dify", doc_id)
237+
if doc_id:
238+
key = (doc.provider or "dify", doc_id)
239239
if key not in chosen:
240240
chosen[key] = doc
241241
order.append(key)

api/tests/unit_tests/core/rag/retrieval/test_dataset_retrieval.py

Lines changed: 93 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -1106,11 +1106,11 @@ def test_deduplicate_documents_empty_list(self):
11061106

11071107
def test_deduplicate_documents_non_dify_provider(self):
11081108
"""
1109-
Test deduplication with non-dify provider documents.
1109+
Test deduplication with non-dify provider documents that have no doc_id.
11101110
11111111
Verifies:
1112-
- External provider documents use content-based deduplication
1113-
- Different providers are handled correctly
1112+
- External provider documents without doc_id use content-based deduplication
1113+
- Identical content from the same provider is collapsed to one result
11141114
"""
11151115
# Arrange
11161116
doc1 = Document(
@@ -1131,7 +1131,96 @@ def test_deduplicate_documents_non_dify_provider(self):
11311131

11321132
# Assert
11331133
# External documents without doc_id should use content-based dedup
1134-
assert len(result) >= 1
1134+
assert len(result) == 1
1135+
1136+
def test_deduplicate_documents_non_dify_provider_with_doc_id_different_sources(self):
1137+
"""
1138+
Regression test for issue #35707.
1139+
1140+
Two chunks from different source documents share identical text content but carry
1141+
different doc_ids. Before the fix, non-dify providers were forced into content-based
1142+
deduplication and the second chunk was silently dropped. After the fix, doc_id is used
1143+
as the dedup key for any provider that exposes it, so both chunks must be retained.
1144+
1145+
Verifies:
1146+
- Non-dify provider documents with different doc_ids are NOT deduplicated even when
1147+
their page_content is identical.
1148+
"""
1149+
# Arrange — same content, different doc_ids, non-dify provider (e.g. Weaviate / Qdrant)
1150+
doc_a = Document(
1151+
page_content="Shared identical content",
1152+
metadata={"doc_id": "doc-from-file-a", "score": 0.85},
1153+
provider="weaviate",
1154+
)
1155+
doc_b = Document(
1156+
page_content="Shared identical content",
1157+
metadata={"doc_id": "doc-from-file-b", "score": 0.82},
1158+
provider="weaviate",
1159+
)
1160+
1161+
# Act
1162+
result = RetrievalService._deduplicate_documents([doc_a, doc_b])
1163+
1164+
# Assert — both documents must be kept; losing either silently drops a source citation
1165+
assert len(result) == 2
1166+
doc_ids = {doc.metadata["doc_id"] for doc in result}
1167+
assert doc_ids == {"doc-from-file-a", "doc-from-file-b"}
1168+
1169+
def test_deduplicate_documents_non_dify_provider_with_same_doc_id(self):
1170+
"""
1171+
Test that non-dify provider documents sharing the same doc_id are deduplicated by
1172+
doc_id key (not by content), and the higher-scored duplicate is retained.
1173+
1174+
Verifies:
1175+
- doc_id-based deduplication now applies to any provider, not only "dify"
1176+
- The document with the highest score wins when doc_ids collide
1177+
"""
1178+
# Arrange
1179+
doc_low = Document(
1180+
page_content="Content A",
1181+
metadata={"doc_id": "chunk-1", "score": 0.5},
1182+
provider="qdrant",
1183+
)
1184+
doc_high = Document(
1185+
page_content="Content A",
1186+
metadata={"doc_id": "chunk-1", "score": 0.9},
1187+
provider="qdrant",
1188+
)
1189+
1190+
# Act
1191+
result = RetrievalService._deduplicate_documents([doc_low, doc_high])
1192+
1193+
# Assert
1194+
assert len(result) == 1
1195+
assert result[0].metadata["score"] == 0.9
1196+
1197+
def test_deduplicate_documents_dify_provider_without_doc_id_falls_back_to_content(self):
1198+
"""
1199+
Test that a dify provider document without doc_id still falls back to content-based
1200+
deduplication (no regression from original behaviour).
1201+
1202+
Verifies:
1203+
- Absence of doc_id triggers content-based dedup regardless of provider
1204+
- First occurrence is kept when content is identical
1205+
"""
1206+
# Arrange — dify docs with no doc_id, same content
1207+
doc1 = Document(
1208+
page_content="Same content",
1209+
metadata={"score": 0.8},
1210+
provider="dify",
1211+
)
1212+
doc2 = Document(
1213+
page_content="Same content",
1214+
metadata={"score": 0.9},
1215+
provider="dify",
1216+
)
1217+
1218+
# Act
1219+
result = RetrievalService._deduplicate_documents([doc1, doc2])
1220+
1221+
# Assert — collapsed to one; first-seen wins (no score comparison in content branch)
1222+
assert len(result) == 1
1223+
assert result[0].metadata["score"] == 0.8
11351224

11361225
# ==================== Metadata Filtering Tests ====================
11371226

0 commit comments

Comments
 (0)