Skip to content

Commit 6ca713c

Browse files
committed
fix e2e
Signed-off-by: Anxhela Coba <acoba@redhat.com>
1 parent 6bf140a commit 6ca713c

1 file changed

Lines changed: 15 additions & 3 deletions

File tree

src/utils/vector_search.py

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -49,16 +49,20 @@ def _filter_documents_for_chunks(
4949
attrs.get("reference_url") or attrs.get("doc_url") or attrs.get("docs_url")
5050
)
5151
doc_id = attrs.get("document_id") or attrs.get("doc_id")
52-
dedup_key = doc_id or doc_url or chunk.source or ""
52+
# Use same precedence as _process_byok_rag_chunks_for_documents:
53+
# reference_url first, then doc_id
54+
dedup_key = doc_url or doc_id or chunk.source or ""
5355
if dedup_key:
5456
final_chunk_identifiers.add(dedup_key)
5557

5658
# Filter documents that match final chunk identifiers
5759
filtered_documents = []
5860
seen = set()
5961
for doc in all_documents:
60-
# Build same dedup key for document
62+
# Build same dedup key for document using same logic as extraction
6163
doc_url_str = str(doc.doc_url) if doc.doc_url else None
64+
# Mirror the chunk-side key built above: doc_url first, then fall back to
65+
# doc.source (note: doc_id is not consulted here, unlike the chunk-side key)
6266
dedup_key = doc_url_str or doc.source or ""
6367

6468
if dedup_key in final_chunk_identifiers and dedup_key not in seen:
@@ -312,8 +316,16 @@ def _process_byok_rag_chunks_for_documents(
312316
or metadata.get("docs_url")
313317
)
314318

319+
# If no standard document identifiers are available, create a fallback
320+
# using the source (vector store ID) to ensure referenced documents
321+
# are still created for e2e tests where metadata may be minimal
315322
if not doc_id and not reference_url:
316-
continue
323+
# Use source as fallback document identifier
324+
fallback_doc_id = result.get("source", "unknown")
325+
if fallback_doc_id and fallback_doc_id != "unknown":
326+
doc_id = fallback_doc_id
327+
else:
328+
continue
317329

318330
# Use doc_id or reference_url as deduplication key
319331
dedup_key = reference_url or doc_id

0 commit comments

Comments
 (0)