@@ -49,16 +49,20 @@ def _filter_documents_for_chunks(
4949 attrs .get ("reference_url" ) or attrs .get ("doc_url" ) or attrs .get ("docs_url" )
5050 )
5151 doc_id = attrs .get ("document_id" ) or attrs .get ("doc_id" )
52- dedup_key = doc_id or doc_url or chunk .source or ""
52+ # Use same precedence as _process_byok_rag_chunks_for_documents:
53+ # reference_url first, then doc_id
54+ dedup_key = doc_url or doc_id or chunk .source or ""
5355 if dedup_key :
5456 final_chunk_identifiers .add (dedup_key )
5557
5658 # Filter documents that match final chunk identifiers
5759 filtered_documents = []
5860 seen = set ()
5961 for doc in all_documents :
60- # Build same dedup key for document
62+ # Build same dedup key for document using same logic as extraction
6163 doc_url_str = str (doc .doc_url ) if doc .doc_url else None
64+ # Use the same dedup key logic as _process_byok_rag_chunks_for_documents
65+ # which uses reference_url or doc_id as the key
6266 dedup_key = doc_url_str or doc .source or ""
6367
6468 if dedup_key in final_chunk_identifiers and dedup_key not in seen :
@@ -312,8 +316,16 @@ def _process_byok_rag_chunks_for_documents(
312316 or metadata .get ("docs_url" )
313317 )
314318
319+ # If no standard document identifiers are available, create a fallback
320+ # using the source (vector store ID) to ensure referenced documents
321+ # are still created for e2e tests where metadata may be minimal
315322 if not doc_id and not reference_url :
316- continue
323+ # Use source as fallback document identifier
324+ fallback_doc_id = result .get ("source" , "unknown" )
325+ if fallback_doc_id and fallback_doc_id != "unknown" :
326+ doc_id = fallback_doc_id
327+ else :
328+ continue
317329
318330 # Use doc_id or reference_url as deduplication key
319331 dedup_key = reference_url or doc_id
0 commit comments