diff --git a/packages/markitdown/src/markitdown/__main__.py b/packages/markitdown/src/markitdown/__main__.py
index 6085ad6bb..54ac54452 100644
--- a/packages/markitdown/src/markitdown/__main__.py
+++ b/packages/markitdown/src/markitdown/__main__.py
@@ -206,11 +206,11 @@ def _handle_output(args, result: DocumentConverterResult):
         with open(args.output, "w", encoding="utf-8") as f:
             f.write(result.markdown)
     else:
-        # Handle stdout encoding errors more gracefully
+        # Handle stdout encoding errors more gracefully, with fallback for
+        # cases where sys.stdout.encoding is None (e.g. redirected streams).
+        encoding = sys.stdout.encoding or "utf-8"
         print(
-            result.markdown.encode(sys.stdout.encoding, errors="replace").decode(
-                sys.stdout.encoding
-            )
+            result.markdown.encode(encoding, errors="replace").decode(encoding)
         )
 
 
diff --git a/packages/markitdown/src/markitdown/converters/_pdf_converter.py b/packages/markitdown/src/markitdown/converters/_pdf_converter.py
index ffbcbd990..8e5212389 100644
--- a/packages/markitdown/src/markitdown/converters/_pdf_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_pdf_converter.py
@@ -1,6 +1,7 @@
 import sys
 import io
 import re
+from collections import Counter
 from typing import BinaryIO, Any
 
 from .._base_converter import DocumentConverter, DocumentConverterResult
@@ -57,6 +58,113 @@ def _merge_partial_numbering_lines(text: str) -> str:
     return "\n".join(result_lines)
 
 
+def _extract_plain_text(pdf_bytes: io.BytesIO) -> str:
+    """Rewind ``pdf_bytes`` and return pdfminer's plain, heading-free text."""
+    # Imported locally so the fallback does not depend on a module-level
+    # ``pdfminer`` name, which is unbound if the optional import did not succeed.
+    from pdfminer.high_level import extract_text
+
+    pdf_bytes.seek(0)
+    return extract_text(pdf_bytes)
+
+
+def _extract_text_with_headings(pdf_bytes: io.BytesIO) -> str:
+    """
+    Extract text from a PDF using pdfminer's layout analysis, detecting headings
+    by comparing font sizes across the document.
+
+    Every text block gets a dominant font size: its most common glyph size,
+    rounded to the nearest 0.5pt. The most common block size in the document is
+    treated as body text, and blocks at least 15% larger are converted to
+    Markdown headings (# for largest, ## for next, etc., up to ######). Heading
+    text is collapsed onto a single line so the marker covers the whole heading.
+
+    Falls back to plain text extraction if pdfminer's layout API cannot be
+    imported, if layout analysis fails, or if it produces no size information.
+
+    Args:
+        pdf_bytes: Seekable binary stream holding the PDF data.
+
+    Returns:
+        The text blocks joined by blank lines, headings prefixed with '#' marks.
+    """
+    try:
+        from pdfminer.high_level import extract_pages
+        from pdfminer.layout import LAParams, LTAnno, LTChar, LTTextBox, LTTextLine
+    except ImportError:
+        return _extract_plain_text(pdf_bytes)
+
+    # (text, dominant font size) of every non-empty text block, in document order
+    blocks: list[tuple[str, float]] = []
+
+    try:
+        for page_layout in extract_pages(pdf_bytes, laparams=LAParams()):
+            for element in page_layout:
+                if not isinstance(element, LTTextBox):
+                    continue
+                line_texts: list[str] = []
+                glyph_sizes: list[float] = []
+                for text_line in element:
+                    if not isinstance(text_line, LTTextLine):
+                        continue
+                    line_text = "".join(
+                        obj.get_text()
+                        for obj in text_line
+                        if isinstance(obj, (LTChar, LTAnno))
+                    )
+                    if not line_text.strip():
+                        continue
+                    line_texts.append(line_text)
+                    # Round to 0.5pt so float noise does not split one size in two
+                    glyph_sizes.extend(
+                        round(obj.size * 2) / 2
+                        for obj in text_line
+                        if isinstance(obj, LTChar)
+                    )
+                if not line_texts:
+                    continue
+                # Use the mode, not the max: a single oversized glyph (e.g. a
+                # drop cap) must not promote a whole paragraph to a heading.
+                block_size = (
+                    Counter(glyph_sizes).most_common(1)[0][0] if glyph_sizes else 0.0
+                )
+                blocks.append(("".join(line_texts).strip(), block_size))
+    except Exception:
+        # Deliberately broad: heading detection is best-effort, so any layout
+        # failure degrades to plain text instead of failing the conversion.
+        return _extract_plain_text(pdf_bytes)
+
+    # Defensive: a block with no sized glyphs carries 0.0 and is left out here
+    known_sizes = [size for _, size in blocks if size > 0]
+    if not known_sizes:
+        return _extract_plain_text(pdf_bytes)
+
+    # Body text = most common (mode) block size
+    body_size = Counter(known_sizes).most_common(1)[0][0]
+
+    # Heading threshold: at least 15% larger than body text
+    heading_threshold = body_size * 1.15
+
+    # Distinct heading sizes, largest first, mapped to levels H1 … H6
+    heading_sizes = sorted(
+        {size for size in known_sizes if size >= heading_threshold}, reverse=True
+    )
+    size_to_level: dict[float, int] = {
+        size: min(rank, 6) for rank, size in enumerate(heading_sizes, start=1)
+    }
+
+    result_parts: list[str] = []
+    for text, block_size in blocks:
+        level = size_to_level.get(block_size)
+        if level is None:
+            result_parts.append(text)
+        else:
+            # A Markdown heading ends at the first newline, so join wrapped lines
+            result_parts.append(f"{'#' * level} {' '.join(text.split())}")
+
+    return "\n\n".join(result_parts)
+
+
 # Load dependencies
 _dependency_exc_info = None
 try:
@@ -565,23 +673,23 @@ def convert(
 
                     page.close()  # Free cached page data immediately
 
-                # If no pages had form-style content, use pdfminer for
-                # the whole document (better text spacing for prose).
+                # If no pages had form-style content, use heading-aware extraction
+                # for the whole document (better text spacing and heading detection).
                 if form_page_count == 0:
                     pdf_bytes.seek(0)
-                    markdown = pdfminer.high_level.extract_text(pdf_bytes)
+                    markdown = _extract_text_with_headings(pdf_bytes)
                 else:
                     markdown = "\n\n".join(markdown_chunks).strip()
 
         except Exception:
             # Fallback if pdfplumber fails
             pdf_bytes.seek(0)
-            markdown = pdfminer.high_level.extract_text(pdf_bytes)
+            markdown = _extract_text_with_headings(pdf_bytes)
 
         # Fallback if still empty
         if not markdown:
             pdf_bytes.seek(0)
-            markdown = pdfminer.high_level.extract_text(pdf_bytes)
+            markdown = _extract_text_with_headings(pdf_bytes)
 
         # Post-process to merge MasterFormat-style partial numbering with following text
         markdown = _merge_partial_numbering_lines(markdown)
diff --git a/packages/markitdown/tests/test_pdf_headings.py b/packages/markitdown/tests/test_pdf_headings.py
new file mode 100644
index 000000000..0e25a3db3
--- /dev/null
+++ b/packages/markitdown/tests/test_pdf_headings.py
@@ -0,0 +1,101 @@
+#!/usr/bin/env python3 -m pytest
+"""Tests for PDF heading detection via font-size analysis."""
+
+import io
+import pytest
+
+from markitdown.converters._pdf_converter import _extract_text_with_headings
+
+
+def _make_pdf_with_headings():
+    """Create a simple PDF with H1 (24pt), H2 (18pt), and body text (12pt)."""
+    try:
+        from reportlab.lib.pagesizes import letter
+        from reportlab.pdfgen import canvas
+    except ImportError:
+        pytest.skip("reportlab not available")
+
+    buf = io.BytesIO()
+    c = canvas.Canvas(buf, pagesize=letter)
+
+    # H1 heading (24pt)
+    c.setFont("Helvetica-Bold", 24)
+    c.drawString(72, 720, "Chapter One")
+
+    # Body text (12pt) — repeated to become the "mode" font size
+    c.setFont("Helvetica", 12)
+    c.drawString(72, 690, "This is body text describing the chapter.")
+    c.drawString(72, 675, "More body text that continues the description.")
+    c.drawString(72, 660, "Yet more body content for good measure.")
+
+    # H2 heading (18pt)
+    c.setFont("Helvetica-Bold", 18)
+    c.drawString(72, 630, "Section Overview")
+
+    # Body text (12pt)
+    c.setFont("Helvetica", 12)
+    c.drawString(72, 605, "Section body text goes here with details.")
+    c.drawString(72, 590, "Additional section content follows.")
+
+    c.save()
+    buf.seek(0)
+    return buf
+
+
+class TestPdfHeadingDetection:
+    """Tests for heading detection via font-size analysis."""
+
+    def test_headings_detected_in_pdf_with_varied_font_sizes(self):
+        """Headings with larger font sizes should become Markdown headings."""
+        buf = _make_pdf_with_headings()
+        result = _extract_text_with_headings(buf)
+
+        # Largest font (24pt) should be H1
+        assert "# Chapter One" in result, (
+            "Expected '# Chapter One' in output, got:\n" + result
+        )
+        # Second-largest font (18pt) should be H2
+        assert "## Section Overview" in result, (
+            "Expected '## Section Overview' in output, got:\n" + result
+        )
+
+    def test_body_text_not_converted_to_heading(self):
+        """Body text (most common font size) should not receive heading markers."""
+        buf = _make_pdf_with_headings()
+        result = _extract_text_with_headings(buf)
+
+        # Body text lines must not start with '#'
+        for line in result.splitlines():
+            if "body text" in line.lower():
+                assert not line.startswith("#"), (
+                    f"Body text line should not be a heading: {line!r}"
+                )
+
+    def test_uniform_font_size_pdf_produces_no_headings(self):
+        """A PDF where all text has the same font size should have no headings."""
+        try:
+            from reportlab.lib.pagesizes import letter
+            from reportlab.pdfgen import canvas
+        except ImportError:
+            pytest.skip("reportlab not available")
+
+        buf = io.BytesIO()
+        c = canvas.Canvas(buf, pagesize=letter)
+        c.setFont("Helvetica", 12)
+        c.drawString(72, 720, "All text is the same size here.")
+        c.drawString(72, 705, "No headings should be detected.")
+        c.drawString(72, 690, "Every line is twelve points.")
+        c.save()
+        buf.seek(0)
+
+        result = _extract_text_with_headings(buf)
+        assert "#" not in result, (
+            "Uniform-font PDF should produce no headings, got:\n" + result
+        )
+
+    def test_extract_text_with_headings_returns_string(self):
+        """_extract_text_with_headings should always return a string."""
+        buf = _make_pdf_with_headings()
+        result = _extract_text_with_headings(buf)
+        assert isinstance(result, str)
+        assert len(result) > 0