@@ -1106,11 +1106,11 @@ def test_deduplicate_documents_empty_list(self):
11061106
11071107 def test_deduplicate_documents_non_dify_provider (self ):
11081108 """
1109- Test deduplication with non-dify provider documents.
1109+ Test deduplication with non-dify provider documents that have no doc_id .
11101110
11111111 Verifies:
1112- - External provider documents use content-based deduplication
1113- - Different providers are handled correctly
1112+ - External provider documents without doc_id use content-based deduplication
1113+ - Identical content from the same provider is collapsed to one result
11141114 """
11151115 # Arrange
11161116 doc1 = Document (
@@ -1131,7 +1131,96 @@ def test_deduplicate_documents_non_dify_provider(self):
11311131
11321132 # Assert
11331133 # External documents without doc_id should use content-based dedup
1134- assert len (result ) >= 1
1134+ assert len (result ) == 1
1135+
def test_deduplicate_documents_non_dify_provider_with_doc_id_different_sources(self):
    """
    Regression test for issue #35707.

    Feeds the deduplicator two non-dify ("weaviate") chunks whose text is
    byte-identical but whose doc_ids differ, and expects both to survive.
    Collapsing them would silently drop one of the two source citations.

    Verifies:
    - Identical page_content alone does not cause deduplication when the
      documents carry different doc_ids
    - Both doc_ids are present in the output
    """
    # Arrange — one text, two distinct source documents
    shared_text = "Shared identical content"
    chunk_from_a = Document(
        page_content=shared_text,
        metadata={"doc_id": "doc-from-file-a", "score": 0.85},
        provider="weaviate",
    )
    chunk_from_b = Document(
        page_content=shared_text,
        metadata={"doc_id": "doc-from-file-b", "score": 0.82},
        provider="weaviate",
    )

    # Act
    deduplicated = RetrievalService._deduplicate_documents([chunk_from_a, chunk_from_b])

    # Assert — neither chunk may be dropped
    assert len(deduplicated) == 2
    surviving_ids = {item.metadata["doc_id"] for item in deduplicated}
    assert surviving_ids == {"doc-from-file-a", "doc-from-file-b"}
1168+
def test_deduplicate_documents_non_dify_provider_with_same_doc_id(self):
    """
    Test doc_id collisions for a non-dify ("qdrant") provider.

    Two documents share the doc_id "chunk-1" and differ only in score; the
    lower-scored one is passed first.

    Verifies:
    - The pair is collapsed to a single result
    - The surviving document is the one with the higher score (0.9), even
      though it appears second in the input
    """
    # Arrange — same doc_id and content, ascending scores
    colliding_docs = [
        Document(
            page_content="Content A",
            metadata={"doc_id": "chunk-1", "score": score},
            provider="qdrant",
        )
        for score in (0.5, 0.9)
    ]

    # Act
    deduplicated = RetrievalService._deduplicate_documents(colliding_docs)

    # Assert
    assert len(deduplicated) == 1
    winner = deduplicated[0]
    assert winner.metadata["score"] == 0.9
1196+
def test_deduplicate_documents_dify_provider_without_doc_id_falls_back_to_content(self):
    """
    Test dify-provider documents that carry no doc_id in their metadata.

    Both inputs have identical page_content and only a score in metadata;
    the lower-scored document is passed first.

    Verifies:
    - The two documents are collapsed to one
    - The first occurrence (score 0.8) is the one kept, i.e. the higher
      score of the later duplicate does not replace it
    """
    # Arrange — no doc_id anywhere, identical text
    first_seen = Document(
        page_content="Same content",
        metadata={"score": 0.8},
        provider="dify",
    )
    later_duplicate = Document(
        page_content="Same content",
        metadata={"score": 0.9},
        provider="dify",
    )

    # Act
    deduplicated = RetrievalService._deduplicate_documents([first_seen, later_duplicate])

    # Assert — one survivor, and it is the first-seen document
    assert len(deduplicated) == 1
    survivor = deduplicated[0]
    assert survivor.metadata["score"] == 0.8
11351224
11361225 # ==================== Metadata Filtering Tests ====================
11371226
0 commit comments