Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions packages/markitdown/src/markitdown/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -206,11 +206,11 @@ def _handle_output(args, result: DocumentConverterResult):
with open(args.output, "w", encoding="utf-8") as f:
f.write(result.markdown)
else:
# Handle stdout encoding errors more gracefully
# Handle stdout encoding errors more gracefully, with fallback for
# cases where sys.stdout.encoding is None (e.g. redirected streams).
encoding = sys.stdout.encoding or "utf-8"
print(
result.markdown.encode(sys.stdout.encoding, errors="replace").decode(
sys.stdout.encoding
)
result.markdown.encode(encoding, errors="replace").decode(encoding)
)


Expand Down
109 changes: 104 additions & 5 deletions packages/markitdown/src/markitdown/converters/_pdf_converter.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import sys
import io
import re
from collections import Counter
from typing import BinaryIO, Any

from .._base_converter import DocumentConverter, DocumentConverterResult
Expand Down Expand Up @@ -57,6 +58,104 @@ def _merge_partial_numbering_lines(text: str) -> str:
return "\n".join(result_lines)


def _extract_text_with_headings(pdf_bytes: io.BytesIO) -> str:
    """
    Extract text from a PDF and mark large-font text boxes as Markdown headings.

    The document is walked with pdfminer's layout analysis. Every text box is
    assigned the largest character size it contains, rounded to the nearest
    half point. The body size is the rounded size that covers the most
    characters across the document; any size at least 15% larger is treated
    as a heading size. Heading sizes are ranked largest-first and mapped to
    ``#`` .. ``######`` (ranks beyond six are clamped to ``######``).

    Args:
        pdf_bytes: Seekable binary stream holding the PDF. It is read from its
            current position for layout analysis and rewound to 0 before any
            plain-text fallback.

    Returns:
        The text boxes in extraction order, joined by blank lines, with
        heading boxes prefixed by their Markdown marker. If layout analysis
        is unavailable, raises, or yields no font sizes, the result of
        ``pdfminer.high_level.extract_text`` is returned instead.
    """
    try:
        from pdfminer.high_level import extract_pages
        from pdfminer.layout import LAParams, LTTextBox, LTTextLine, LTChar, LTAnno
    except ImportError:
        # NOTE(review): this fallback uses the module-level ``pdfminer`` name;
        # presumably callers verify the dependency before getting here - confirm.
        pdf_bytes.seek(0)
        return pdfminer.high_level.extract_text(pdf_bytes)

    # One entry per text box, in extraction order:
    # (raw text, rounded max font size, number of sized characters).
    blocks: list[tuple[str, float, int]] = []

    try:
        for page_layout in extract_pages(pdf_bytes, laparams=LAParams()):
            for element in page_layout:
                if not isinstance(element, LTTextBox):
                    continue
                line_texts: list[str] = []
                char_sizes: list[float] = []
                for text_line in element:
                    if not isinstance(text_line, LTTextLine):
                        continue
                    pieces: list[str] = []
                    line_sizes: list[float] = []
                    for item in text_line:
                        if isinstance(item, LTChar):
                            line_sizes.append(item.size)
                            pieces.append(item.get_text())
                        elif isinstance(item, LTAnno):
                            # Annotations carry no size, only text.
                            pieces.append(item.get_text())
                    line_text = "".join(pieces)
                    # Whitespace-only lines contribute neither text nor sizes.
                    if line_text.strip():
                        line_texts.append(line_text)
                        char_sizes.extend(line_sizes)
                if not line_texts:
                    continue
                largest = max(char_sizes) if char_sizes else 0.0
                # Round to the nearest half point so near-identical sizes
                # fall into the same bucket.
                blocks.append(
                    ("".join(line_texts), round(largest * 2) / 2, len(char_sizes))
                )
    except Exception:
        # Best effort: any layout-analysis failure degrades to plain text.
        pdf_bytes.seek(0)
        return pdfminer.high_level.extract_text(pdf_bytes)

    # Weight every size by the characters it covers. Counting boxes instead
    # would let many short headings outvote a few long paragraphs and be
    # mistaken for the body size.
    size_weights: Counter = Counter()
    for _, size, char_count in blocks:
        if size > 0:
            size_weights[size] += char_count

    if not size_weights:
        pdf_bytes.seek(0)
        return pdfminer.high_level.extract_text(pdf_bytes)

    # Body text = size covering the most characters.
    body_size = size_weights.most_common(1)[0][0]

    # Heading threshold: at least 15% larger than body text.
    heading_threshold = body_size * 1.15

    # Distinct heading sizes, largest first, mapped to levels H1 .. H6.
    heading_sizes = sorted(
        (s for s in size_weights if s >= heading_threshold), reverse=True
    )
    size_to_level: dict[float, int] = {
        s: min(rank, 6) for rank, s in enumerate(heading_sizes, start=1)
    }

    result_parts: list[str] = []
    for raw_text, size, _ in blocks:
        text = raw_text.strip()
        if not text:
            continue
        level = size_to_level.get(size)
        if level is None:
            result_parts.append(text)
        else:
            # A Markdown heading is a single line; fold wrapped heading
            # lines together so the marker applies to the whole heading.
            result_parts.append(f"{'#' * level} {' '.join(text.split())}")

    return "\n\n".join(result_parts)


# Load dependencies
_dependency_exc_info = None
try:
Expand Down Expand Up @@ -565,23 +664,23 @@ def convert(

page.close() # Free cached page data immediately

# If no pages had form-style content, use pdfminer for
# the whole document (better text spacing for prose).
# If no pages had form-style content, use heading-aware extraction
# for the whole document (better text spacing and heading detection).
if form_page_count == 0:
pdf_bytes.seek(0)
markdown = pdfminer.high_level.extract_text(pdf_bytes)
markdown = _extract_text_with_headings(pdf_bytes)
else:
markdown = "\n\n".join(markdown_chunks).strip()

except Exception:
# Fallback if pdfplumber fails
pdf_bytes.seek(0)
markdown = pdfminer.high_level.extract_text(pdf_bytes)
markdown = _extract_text_with_headings(pdf_bytes)

# Fallback if still empty
if not markdown:
pdf_bytes.seek(0)
markdown = pdfminer.high_level.extract_text(pdf_bytes)
markdown = _extract_text_with_headings(pdf_bytes)

# Post-process to merge MasterFormat-style partial numbering with following text
markdown = _merge_partial_numbering_lines(markdown)
Expand Down
101 changes: 101 additions & 0 deletions packages/markitdown/tests/test_pdf_headings.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
#!/usr/bin/env python3 -m pytest
"""Tests for PDF heading detection via font-size analysis."""

import io
import pytest

from markitdown.converters._pdf_converter import _extract_text_with_headings


def _make_pdf_with_headings():
    """Build an in-memory PDF with a 24pt heading, an 18pt heading and 12pt body text.

    Returns a ``BytesIO`` rewound to position 0. The calling test is skipped
    when reportlab cannot be imported.
    """
    try:
        from reportlab.lib.pagesizes import letter
        from reportlab.pdfgen import canvas
    except ImportError:
        pytest.skip("reportlab not available")

    left_margin = 72

    # Each entry: (font name, point size, ((baseline y, text), ...)).
    # The font is set once per entry, then every line in it is drawn.
    sections = (
        # H1 heading (24pt)
        ("Helvetica-Bold", 24, ((720, "Chapter One"),)),
        # Body text (12pt), repeated so 12pt is the most common size
        (
            "Helvetica",
            12,
            (
                (690, "This is body text describing the chapter."),
                (675, "More body text that continues the description."),
                (660, "Yet more body content for good measure."),
            ),
        ),
        # H2 heading (18pt)
        ("Helvetica-Bold", 18, ((630, "Section Overview"),)),
        # Body text (12pt)
        (
            "Helvetica",
            12,
            (
                (605, "Section body text goes here with details."),
                (590, "Additional section content follows."),
            ),
        ),
    )

    stream = io.BytesIO()
    pdf = canvas.Canvas(stream, pagesize=letter)
    for font_name, point_size, lines in sections:
        pdf.setFont(font_name, point_size)
        for baseline_y, content in lines:
            pdf.drawString(left_margin, baseline_y, content)

    pdf.save()
    stream.seek(0)
    return stream


class TestPdfHeadingDetection:
    """Tests for heading detection via font-size analysis."""

    def test_headings_detected_in_pdf_with_varied_font_sizes(self):
        """Headings with larger font sizes should become Markdown headings."""
        buf = _make_pdf_with_headings()
        result = _extract_text_with_headings(buf)

        # Compare whole lines rather than substrings: "# Chapter One" is a
        # substring of "## Chapter One", so a substring check would not
        # actually verify the heading level.
        lines = result.splitlines()

        # Largest font (24pt) should be H1
        assert "# Chapter One" in lines, (
            "Expected '# Chapter One' in output, got:\n" + result
        )
        # Second-largest font (18pt) should be H2
        assert "## Section Overview" in lines, (
            "Expected '## Section Overview' in output, got:\n" + result
        )

    def test_body_text_not_converted_to_heading(self):
        """Body text (most common font size) should not receive heading markers."""
        buf = _make_pdf_with_headings()
        result = _extract_text_with_headings(buf)

        # Body text lines must not start with '#'
        for line in result.splitlines():
            if "body text" in line.lower():
                assert not line.startswith("#"), (
                    f"Body text line should not be a heading: {line!r}"
                )

    def test_uniform_font_size_pdf_produces_no_headings(self):
        """A PDF where all text has the same font size should have no headings."""
        try:
            from reportlab.lib.pagesizes import letter
            from reportlab.pdfgen import canvas
        except ImportError:
            pytest.skip("reportlab not available")

        buf = io.BytesIO()
        c = canvas.Canvas(buf, pagesize=letter)
        c.setFont("Helvetica", 12)
        for y, text in (
            (720, "All text is the same size here."),
            (705, "No headings should be detected."),
            (690, "Every line is twelve points."),
        ):
            c.drawString(72, y, text)
        c.save()
        buf.seek(0)

        result = _extract_text_with_headings(buf)
        # None of the fixture strings contain '#', so any '#' is a heading marker.
        assert "#" not in result, (
            "Uniform-font PDF should produce no headings, got:\n" + result
        )

    def test_extract_text_with_headings_returns_string(self):
        """_extract_text_with_headings should always return a string."""
        buf = _make_pdf_with_headings()
        result = _extract_text_with_headings(buf)
        assert isinstance(result, str)
        assert len(result) > 0