@@ -59,7 +59,7 @@ def chunk_side_effect(dl_doc: Any) -> list[SimpleNamespace]:
5959
6060 assert "contextualized-chunk-1-of-dl-doc-for-file-a.pdf" in contents
6161 assert "contextualized-chunk-2-of-dl-doc-for-file-a.pdf" in contents
62- assert { "chunk_id" : "chunk-1-of-dl-doc-for-file-a.pdf" } in metas
62+ assert any ( m . get ( "chunk_id" ) == "chunk-1-of-dl-doc-for-file-a.pdf" for m in metas )
6363
6464 # Ensure our collaborators were actually exercised.
6565 assert converter_mock .convert .call_count == len (paths )
@@ -543,3 +543,76 @@ def test_run_without_sources_or_paths_raises_value_error() -> None:
543543 converter = DoclingConverter (converter = MagicMock (), meta_extractor = MagicMock ())
544544 with pytest .raises (ValueError , match = r"Either 'sources' or the deprecated 'paths' parameter must be provided." ):
545545 converter .run ()
546+
547+
def test_run_doc_chunks_split_id_and_split_idx_start() -> None:
    """Check chunk numbering and start offsets for one source exported as DOC_CHUNKS.

    The chunker mock yields two chunks for the single source. The resulting documents
    are expected to carry ``split_id`` 0 and 1, and ``split_idx_start`` 0 followed by
    the length of the first chunk's text (not of its ``ctx:``-prefixed contextualized form).
    """
    first_text = "hello world"
    second_text = "foo bar baz"

    fake_converter = MagicMock()
    fake_converter.convert.return_value = SimpleNamespace(document="dl-doc")

    fake_chunker = MagicMock()
    fake_chunker.chunk.return_value = [
        SimpleNamespace(text=first_text),
        SimpleNamespace(text=second_text),
    ]
    fake_chunker.contextualize.side_effect = lambda chunk: f"ctx:{chunk.text}"

    fake_meta_extractor = MagicMock()
    fake_meta_extractor.extract_chunk_meta.return_value = {}

    documents = DoclingConverter(
        converter=fake_converter,
        export_type=ExportType.DOC_CHUNKS,
        chunker=fake_chunker,
        meta_extractor=fake_meta_extractor,
    ).run(sources=["doc.pdf"])["documents"]

    assert len(documents) == 2
    first_doc, second_doc = documents

    # First chunk: index 0, starting at offset 0.
    assert first_doc.meta["split_id"] == 0
    assert first_doc.meta["split_idx_start"] == 0

    # Second chunk: index 1, starting right after the first chunk's text.
    assert second_doc.meta["split_id"] == 1
    assert second_doc.meta["split_idx_start"] == len(first_text)
579+
def test_run_doc_chunks_split_id_resets_per_document() -> None:
    """Check that ``split_id`` and ``split_idx_start`` restart for every source document.

    Two sources are converted and the chunker mock yields two chunks per converted
    document, so four documents are expected: the first two belong to ``a.pdf`` and the
    last two to ``b.pdf``. Each pair must be numbered 0, 1 with offsets 0 and the length
    of that pair's first chunk text.
    """
    converter_mock = MagicMock()
    chunker_mock = MagicMock()
    meta_extractor_mock = MagicMock()

    # One conversion result per source, consumed in call order.
    converter_mock.convert.side_effect = [
        SimpleNamespace(document="dl-doc-a"),
        SimpleNamespace(document="dl-doc-b"),
    ]
    # Chunk texts embed the converted document so each source gets distinct chunks.
    chunker_mock.chunk.side_effect = lambda dl_doc: [
        SimpleNamespace(text=f"chunk-1-of-{dl_doc}"),
        SimpleNamespace(text=f"chunk-2-of-{dl_doc}"),
    ]
    chunker_mock.contextualize.side_effect = lambda chunk: chunk.text
    meta_extractor_mock.extract_chunk_meta.return_value = {}

    converter = DoclingConverter(
        converter=converter_mock,
        export_type=ExportType.DOC_CHUNKS,
        chunker=chunker_mock,
        meta_extractor=meta_extractor_mock,
    )

    result = converter.run(sources=["a.pdf", "b.pdf"])
    documents = result["documents"]

    # Two sources x two chunks each. Pinning the total keeps the slices below from
    # silently tolerating surplus documents.
    assert len(documents) == 4

    # split_id and split_idx_start reset for each source document
    doc_a_chunks = documents[:2]
    doc_b_chunks = documents[2:]

    assert doc_a_chunks[0].meta["split_id"] == 0
    assert doc_a_chunks[0].meta["split_idx_start"] == 0
    assert doc_a_chunks[1].meta["split_id"] == 1
    assert doc_a_chunks[1].meta["split_idx_start"] == len("chunk-1-of-dl-doc-a")

    assert doc_b_chunks[0].meta["split_id"] == 0
    assert doc_b_chunks[0].meta["split_idx_start"] == 0
    assert doc_b_chunks[1].meta["split_id"] == 1
    assert doc_b_chunks[1].meta["split_idx_start"] == len("chunk-1-of-dl-doc-b")
0 commit comments