From 667a168882f3f169f22d50ad26215339f3e3a12b Mon Sep 17 00:00:00 2001 From: Krishna Chaitanya Balusu Date: Tue, 24 Mar 2026 22:27:01 -0400 Subject: [PATCH 1/7] feat(cli): add --page-break-placeholder option for Markdown and Text exports Expose the existing `page_break_placeholder` parameter from the Python API (`save_as_markdown`) as a CLI option. When set, the specified string is inserted between pages in Markdown and Text outputs. Closes #3175 --- docling/cli/main.py | 14 +++++++++++++- tests/test_cli.py | 20 ++++++++++++++++++++ 2 files changed, 33 insertions(+), 1 deletion(-) diff --git a/docling/cli/main.py b/docling/cli/main.py index 05f7214685..1937bf599d 100644 --- a/docling/cli/main.py +++ b/docling/cli/main.py @@ -216,6 +216,7 @@ def export_documents( print_timings: bool, export_timings: bool, image_export_mode: ImageRefMode, + page_break_placeholder: str | None = None, ): success_count = 0 failure_count = 0 @@ -283,6 +284,7 @@ def export_documents( filename=fname, strict_text=True, image_mode=ImageRefMode.PLACEHOLDER, + page_break_placeholder=page_break_placeholder, ) # Export Markdown format: @@ -290,7 +292,9 @@ def export_documents( fname = output_dir / f"{doc_filename}.md" _log.info(f"writing Markdown output to {fname}") conv_res.document.save_as_markdown( - filename=fname, image_mode=image_export_mode + filename=fname, + image_mode=image_export_mode, + page_break_placeholder=page_break_placeholder, ) # Export Document Tags format: @@ -426,6 +430,13 @@ def convert( # noqa: C901 help="Image export mode for image-capable document outputs (JSON, YAML, HTML, HTML split-page, and Markdown). Text, DocTags, and WebVTT outputs do not export images. With `placeholder`, only the position of the image is marked in the output. In `embedded` mode, the image is embedded as base64 encoded string. 
In `referenced` mode, the image is exported in PNG format and referenced from the main exported document.", ), ] = ImageRefMode.EMBEDDED, + page_break_placeholder: Annotated[ + str | None, + typer.Option( + ..., + help="Specify a custom page break placeholder string for Markdown and Text exports. When set, this string is inserted between pages in the output. Examples: '---', ''.", + ), + ] = None, pipeline: Annotated[ ProcessingPipeline, typer.Option(..., help="Choose the pipeline to process PDF or image files."), @@ -968,6 +979,7 @@ def convert( # noqa: C901 print_timings=profiling, export_timings=save_profiling, image_export_mode=image_export_mode, + page_break_placeholder=page_break_placeholder, ) end_time = time.time() - start_time diff --git a/tests/test_cli.py b/tests/test_cli.py index 64506fd220..16dfc5b521 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -30,6 +30,26 @@ def test_cli_convert(tmp_path): assert converted.exists() +def test_cli_page_break_placeholder(tmp_path): + source = "./tests/data/pdf/2305.03393v1-pg9.pdf" + output = tmp_path / "out" + output.mkdir() + placeholder = "" + result = runner.invoke( + app, + [ + source, + "--output", + str(output), + "--page-break-placeholder", + placeholder, + ], + ) + assert result.exit_code == 0 + converted = output / f"{Path(source).stem}.md" + assert converted.exists() + + @pytest.mark.parametrize( ("image_export_mode", "to_formats", "expected"), [ From d45e4cd8449d092693c3686f31c129ae8a441d86 Mon Sep 17 00:00:00 2001 From: Krishna Chaitanya Balusu Date: Mon, 30 Mar 2026 09:30:59 -0400 Subject: [PATCH 2/7] DCO Remediation Commit for Krishna Chaitanya Balusu I, Krishna Chaitanya Balusu , hereby add my Signed-off-by to this commit: 667a168882f3f169f22d50ad26215339f3e3a12b Signed-off-by: Krishna Chaitanya Balusu From 9bf673d1601bda3a44fbf4235bbcd16cb35e9b4a Mon Sep 17 00:00:00 2001 From: Krishna Chaitanya Balusu Date: Wed, 1 Apr 2026 15:44:07 -0700 Subject: [PATCH 3/7] feat(cli): support dynamic page 
numbers in --page-break-placeholder Add {prev_page} and {next_page} format variables to the --page-break-placeholder option. When these variables are present, each page break in the output is replaced with the placeholder formatted with the actual page numbers for that specific break. Example usage: docling --to md --page-break-placeholder '--- Page {next_page} ---' input.pdf Which produces: ... content from page 1 ... --- Page 2 --- ... content from page 2 ... --- Page 3 --- ... content from page 3 ... Uses a sentinel-based approach: a unique sentinel is passed to docling-core during serialization, then post-processed to replace each sentinel occurrence with the formatted placeholder using sequential page numbers. Static placeholders (without format variables) continue to work unchanged. --- docling/cli/main.py | 51 ++++++++++++++++++++-- tests/test_cli.py | 102 +++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 149 insertions(+), 4 deletions(-) diff --git a/docling/cli/main.py b/docling/cli/main.py index 1937bf599d..273cf95fe0 100644 --- a/docling/cli/main.py +++ b/docling/cli/main.py @@ -201,6 +201,35 @@ def print_external_plugins(factory: BaseFactory, factory_name: str): raise typer.Exit() +_PAGE_BREAK_SENTINEL = "\x00_DOCLING_PAGE_BREAK_\x00" + + +def _has_dynamic_page_vars(placeholder: str) -> bool: + """Check if the placeholder string contains {prev_page} or {next_page} variables.""" + return "{prev_page}" in placeholder or "{next_page}" in placeholder + + +def _apply_dynamic_page_breaks(file_path: Path, placeholder: str) -> None: + """Post-process a file to replace page break sentinels with formatted placeholders. + + When the placeholder contains {prev_page} and/or {next_page} format variables, + each page break sentinel is replaced with the placeholder formatted with + sequential page numbers. 
+ """ + text = file_path.read_text(encoding="utf-8") + parts = text.split(_PAGE_BREAK_SENTINEL) + if len(parts) <= 1: + return + result = [parts[0]] + for i, part in enumerate(parts[1:], start=1): + formatted = placeholder.replace("{prev_page}", str(i)).replace( + "{next_page}", str(i + 1) + ) + result.append(formatted) + result.append(part) + file_path.write_text("".join(result), encoding="utf-8") + + def export_documents( conv_results: Iterable[ConversionResult], output_dir: Path, @@ -280,22 +309,38 @@ def export_documents( if export_txt: fname = output_dir / f"{doc_filename}.txt" _log.info(f"writing TXT output to {fname}") + use_dynamic = ( + page_break_placeholder is not None + and _has_dynamic_page_vars(page_break_placeholder) + ) conv_res.document.save_as_markdown( filename=fname, strict_text=True, image_mode=ImageRefMode.PLACEHOLDER, - page_break_placeholder=page_break_placeholder, + page_break_placeholder=( + _PAGE_BREAK_SENTINEL if use_dynamic else page_break_placeholder + ), ) + if use_dynamic: + _apply_dynamic_page_breaks(fname, page_break_placeholder) # Export Markdown format: if export_md: fname = output_dir / f"{doc_filename}.md" _log.info(f"writing Markdown output to {fname}") + use_dynamic = ( + page_break_placeholder is not None + and _has_dynamic_page_vars(page_break_placeholder) + ) conv_res.document.save_as_markdown( filename=fname, image_mode=image_export_mode, - page_break_placeholder=page_break_placeholder, + page_break_placeholder=( + _PAGE_BREAK_SENTINEL if use_dynamic else page_break_placeholder + ), ) + if use_dynamic: + _apply_dynamic_page_breaks(fname, page_break_placeholder) # Export Document Tags format: if export_doctags: @@ -434,7 +479,7 @@ def convert( # noqa: C901 str | None, typer.Option( ..., - help="Specify a custom page break placeholder string for Markdown and Text exports. When set, this string is inserted between pages in the output. 
Examples: '---', ''.", + help="Specify a custom page break placeholder string for Markdown and Text exports. When set, this string is inserted between pages in the output. Supports {prev_page} and {next_page} format variables for dynamic page numbers. Examples: '---', '', '---\\n*[Page {next_page}]*\\n---'.", ), ] = None, pipeline: Annotated[ diff --git a/tests/test_cli.py b/tests/test_cli.py index 16dfc5b521..faa7de8583 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -4,7 +4,13 @@ from docling_core.types.doc import ImageRefMode from typer.testing import CliRunner -from docling.cli.main import _should_generate_export_images, app +from docling.cli.main import ( + _PAGE_BREAK_SENTINEL, + _apply_dynamic_page_breaks, + _has_dynamic_page_vars, + _should_generate_export_images, + app, +) from docling.datamodel.base_models import OutputFormat runner = CliRunner() @@ -142,3 +148,97 @@ def test_cli_audio_extensions_coverage(): assert ext in audio_extensions, ( f"Audio extension {ext} not found in FormatToExtensions[InputFormat.AUDIO]" ) + + +@pytest.mark.parametrize( + ("placeholder", "expected"), + [ + ("---", False), + ("", False), + ("Page {next_page}", True), + ("End of page {prev_page}", True), + ("---\n*[Page {next_page}]*\n---", True), + ("{prev_page} -> {next_page}", True), + ], +) +def test_has_dynamic_page_vars(placeholder, expected): + assert _has_dynamic_page_vars(placeholder) is expected + + +def test_apply_dynamic_page_breaks(tmp_path): + content = ( + "Content from page 1\n\n" + f"{_PAGE_BREAK_SENTINEL}\n\n" + "Content from page 2\n\n" + f"{_PAGE_BREAK_SENTINEL}\n\n" + "Content from page 3" + ) + file_path = tmp_path / "test.md" + file_path.write_text(content, encoding="utf-8") + + _apply_dynamic_page_breaks(file_path, "---\n*[Page {next_page}]*\n---") + + result = file_path.read_text(encoding="utf-8") + assert _PAGE_BREAK_SENTINEL not in result + assert "---\n*[Page 2]*\n---" in result + assert "---\n*[Page 3]*\n---" in result + + +def 
test_apply_dynamic_page_breaks_prev_page(tmp_path): + content = f"Content from page 1\n\n{_PAGE_BREAK_SENTINEL}\n\nContent from page 2" + file_path = tmp_path / "test.md" + file_path.write_text(content, encoding="utf-8") + + _apply_dynamic_page_breaks(file_path, "[End of page {prev_page}]") + + result = file_path.read_text(encoding="utf-8") + assert _PAGE_BREAK_SENTINEL not in result + assert "[End of page 1]" in result + + +def test_apply_dynamic_page_breaks_both_vars(tmp_path): + content = f"Page 1\n{_PAGE_BREAK_SENTINEL}\nPage 2" + file_path = tmp_path / "test.md" + file_path.write_text(content, encoding="utf-8") + + _apply_dynamic_page_breaks(file_path, "({prev_page} -> {next_page})") + + result = file_path.read_text(encoding="utf-8") + assert "(1 -> 2)" in result + + +def test_apply_dynamic_page_breaks_no_sentinel(tmp_path): + content = "Content with no page breaks" + file_path = tmp_path / "test.md" + file_path.write_text(content, encoding="utf-8") + + _apply_dynamic_page_breaks(file_path, "---\n*[Page {next_page}]*\n---") + + result = file_path.read_text(encoding="utf-8") + assert result == content + + +def test_cli_dynamic_page_break_placeholder(tmp_path): + source = "./tests/data/pdf/normal_4pages.pdf" + output = tmp_path / "out" + output.mkdir() + placeholder = "--- Page {next_page} ---" + result = runner.invoke( + app, + [ + source, + "--output", + str(output), + "--page-break-placeholder", + placeholder, + ], + ) + assert result.exit_code == 0 + converted = output / f"{Path(source).stem}.md" + assert converted.exists() + content = converted.read_text(encoding="utf-8") + # Should not contain raw page break markers or sentinels + assert "#_#_DOCLING_DOC_PAGE_BREAK" not in content + assert _PAGE_BREAK_SENTINEL not in content + # Multi-page PDF should have page break placeholders with actual page numbers + assert "--- Page 2 ---" in content From 0914f4d00eb779b4481529763195ab41f75673c6 Mon Sep 17 00:00:00 2001 From: Krishna Chaitanya Balusu Date: Wed, 1 Apr 
2026 17:21:43 -0700 Subject: [PATCH 4/7] fix(cli): use actual page numbers for dynamic page break placeholders MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use document item provenance to determine real page numbers instead of sequential counting. This fixes incorrect numbering when documents contain blank pages — e.g. a 5-page doc with a blank page 4 now correctly produces page numbers 1, 2, 3, 5 instead of 1, 2, 3, 4. Signed-off-by: Krishna Chaitanya Balusu --- docling/cli/main.py | 55 +++++++++++++++++++++++++++++++++++++++------ tests/test_cli.py | 53 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 101 insertions(+), 7 deletions(-) diff --git a/docling/cli/main.py b/docling/cli/main.py index 273cf95fe0..02885e9cfa 100644 --- a/docling/cli/main.py +++ b/docling/cli/main.py @@ -209,21 +209,51 @@ def _has_dynamic_page_vars(placeholder: str) -> bool: return "{prev_page}" in placeholder or "{next_page}" in placeholder -def _apply_dynamic_page_breaks(file_path: Path, placeholder: str) -> None: +def _get_content_page_numbers(doc) -> list[int]: + """Extract sorted unique page numbers from document items with provenance. + + This returns the actual page numbers (from doc item provenance) rather than + sequential indices, so that blank pages are correctly skipped in numbering. + """ + seen: set[int] = set() + for item, _ in doc.iterate_items(): + if hasattr(item, "prov") and item.prov: + seen.add(item.prov[0].page_no) + return sorted(seen) + + +def _apply_dynamic_page_breaks( + file_path: Path, + placeholder: str, + content_page_numbers: list[int] | None = None, +) -> None: """Post-process a file to replace page break sentinels with formatted placeholders. When the placeholder contains {prev_page} and/or {next_page} format variables, each page break sentinel is replaced with the placeholder formatted with - sequential page numbers. + the actual page numbers from the document. 
+ + Args: + file_path: Path to the file to process. + placeholder: The placeholder string with optional {prev_page}/{next_page} vars. + content_page_numbers: Sorted list of actual page numbers that have content. + When provided, uses these instead of sequential counting to handle + documents with blank pages correctly. """ text = file_path.read_text(encoding="utf-8") parts = text.split(_PAGE_BREAK_SENTINEL) if len(parts) <= 1: return result = [parts[0]] - for i, part in enumerate(parts[1:], start=1): - formatted = placeholder.replace("{prev_page}", str(i)).replace( - "{next_page}", str(i + 1) + for i, part in enumerate(parts[1:]): + if content_page_numbers is not None and i + 1 < len(content_page_numbers): + prev_page = content_page_numbers[i] + next_page = content_page_numbers[i + 1] + else: + prev_page = i + 1 + next_page = i + 2 + formatted = placeholder.replace("{prev_page}", str(prev_page)).replace( + "{next_page}", str(next_page) ) result.append(formatted) result.append(part) @@ -255,6 +285,13 @@ def export_documents( success_count += 1 doc_filename = conv_res.input.file.stem + # Pre-compute content page numbers for dynamic page breaks + content_page_numbers = None + if page_break_placeholder is not None and _has_dynamic_page_vars( + page_break_placeholder + ): + content_page_numbers = _get_content_page_numbers(conv_res.document) + # Export JSON format: if export_json: fname = output_dir / f"{doc_filename}.json" @@ -322,7 +359,9 @@ def export_documents( ), ) if use_dynamic: - _apply_dynamic_page_breaks(fname, page_break_placeholder) + _apply_dynamic_page_breaks( + fname, page_break_placeholder, content_page_numbers + ) # Export Markdown format: if export_md: @@ -340,7 +379,9 @@ def export_documents( ), ) if use_dynamic: - _apply_dynamic_page_breaks(fname, page_break_placeholder) + _apply_dynamic_page_breaks( + fname, page_break_placeholder, content_page_numbers + ) # Export Document Tags format: if export_doctags: diff --git a/tests/test_cli.py 
b/tests/test_cli.py index faa7de8583..4113f378b3 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -7,6 +7,7 @@ from docling.cli.main import ( _PAGE_BREAK_SENTINEL, _apply_dynamic_page_breaks, + _get_content_page_numbers, _has_dynamic_page_vars, _should_generate_export_images, app, @@ -242,3 +243,55 @@ def test_cli_dynamic_page_break_placeholder(tmp_path): assert _PAGE_BREAK_SENTINEL not in content # Multi-page PDF should have page break placeholders with actual page numbers assert "--- Page 2 ---" in content + + +def test_apply_dynamic_page_breaks_with_blank_pages(tmp_path): + """Test that page numbering uses actual page numbers when blank pages exist.""" + content = ( + "Content from page 1\n\n" + f"{_PAGE_BREAK_SENTINEL}\n\n" + "Content from page 2\n\n" + f"{_PAGE_BREAK_SENTINEL}\n\n" + "Content from page 3\n\n" + f"{_PAGE_BREAK_SENTINEL}\n\n" + "Content from page 5" + ) + file_path = tmp_path / "test.md" + file_path.write_text(content, encoding="utf-8") + + # Page 4 is blank, so content pages are [1, 2, 3, 5] + _apply_dynamic_page_breaks( + file_path, + "---\n*[Page {next_page}]*\n---", + content_page_numbers=[1, 2, 3, 5], + ) + + result = file_path.read_text(encoding="utf-8") + assert _PAGE_BREAK_SENTINEL not in result + assert "---\n*[Page 2]*\n---" in result + assert "---\n*[Page 3]*\n---" in result + assert "---\n*[Page 5]*\n---" in result + assert "---\n*[Page 4]*\n---" not in result + + +def test_apply_dynamic_page_breaks_prev_page_with_blank_pages(tmp_path): + """Test {prev_page} with blank pages uses actual page numbers.""" + content = ( + f"Page 1\n{_PAGE_BREAK_SENTINEL}\n" + f"Page 3\n{_PAGE_BREAK_SENTINEL}\n" + "Page 5" + ) + file_path = tmp_path / "test.md" + file_path.write_text(content, encoding="utf-8") + + _apply_dynamic_page_breaks( + file_path, + "({prev_page} -> {next_page})", + content_page_numbers=[1, 3, 5], + ) + + result = file_path.read_text(encoding="utf-8") + assert "(1 -> 3)" in result + assert "(3 -> 5)" in result + assert 
"(1 -> 2)" not in result + assert "(2 -> 3)" not in result From 924ca4428395fa820bb3efc231b7fdaa026b29c8 Mon Sep 17 00:00:00 2001 From: Krishna Chaitanya Balusu Date: Wed, 8 Apr 2026 16:06:23 -0700 Subject: [PATCH 5/7] fix(cli): resolve mypy type errors and ruff formatting Add assert for page_break_placeholder narrowing before calls to _apply_dynamic_page_breaks, and fix ruff-format string in test. Signed-off-by: Krishna Chaitanya Balusu --- docling/cli/main.py | 2 ++ tests/test_cli.py | 6 +----- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/docling/cli/main.py b/docling/cli/main.py index 02885e9cfa..d2f582fa8a 100644 --- a/docling/cli/main.py +++ b/docling/cli/main.py @@ -359,6 +359,7 @@ def export_documents( ), ) if use_dynamic: + assert page_break_placeholder is not None _apply_dynamic_page_breaks( fname, page_break_placeholder, content_page_numbers ) @@ -379,6 +380,7 @@ def export_documents( ), ) if use_dynamic: + assert page_break_placeholder is not None _apply_dynamic_page_breaks( fname, page_break_placeholder, content_page_numbers ) diff --git a/tests/test_cli.py b/tests/test_cli.py index 4113f378b3..551f9175a2 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -276,11 +276,7 @@ def test_apply_dynamic_page_breaks_with_blank_pages(tmp_path): def test_apply_dynamic_page_breaks_prev_page_with_blank_pages(tmp_path): """Test {prev_page} with blank pages uses actual page numbers.""" - content = ( - f"Page 1\n{_PAGE_BREAK_SENTINEL}\n" - f"Page 3\n{_PAGE_BREAK_SENTINEL}\n" - "Page 5" - ) + content = f"Page 1\n{_PAGE_BREAK_SENTINEL}\nPage 3\n{_PAGE_BREAK_SENTINEL}\nPage 5" file_path = tmp_path / "test.md" file_path.write_text(content, encoding="utf-8") From 04e5c9e5dae331cdb53a657c8d8297f04038c49a Mon Sep 17 00:00:00 2001 From: Krishna Chaitanya Balusu Date: Wed, 8 Apr 2026 16:06:28 -0700 Subject: [PATCH 6/7] DCO Remediation Commit for Krishna Chaitanya Balusu I, Krishna Chaitanya Balusu , hereby add my Signed-off-by to this commit: 
0914f4d00eb779b4481529763195ab41f75673c6 Signed-off-by: Krishna Chaitanya Balusu From f88741e842b56e6cded00968000b1293fcecc8ec Mon Sep 17 00:00:00 2001 From: Krishna Chaitanya Balusu Date: Wed, 8 Apr 2026 16:13:32 -0700 Subject: [PATCH 7/7] DCO Remediation Commit for Krishna Chaitanya Balusu I, Krishna Chaitanya Balusu, hereby add my Signed-off-by to this commit: 9bf673d1601bda3a44fbf4235bbcd16cb35e9b4a Signed-off-by: Krishna Chaitanya Balusu