Skip to content

Commit 58c6dab

Browse files
committed
feat: Add automatic OCR support for scanned PDFs
Changes:
- Add is_scanned_page() function to detect scanned vs. native PDF pages
- Add get_text_smart() function with automatic OCR detection
- Add to_markdown() methods for the Page and Document classes
- Add document_to_markdown() for full-document conversion

New API:
- page.is_scanned() - Detect whether a page is image-based
- page.get_text_smart() - Automatically apply OCR when needed
- page.to_markdown() - Convert a page to Markdown, with OCR if required
- doc.to_markdown() - Convert an entire document to Markdown

These enhancements let PyMuPDF work seamlessly with scanned PDFs by automatically detecting image-based pages and applying OCR where plain text extraction would otherwise return empty results.

Fixes: #[issue_number if any]
1 parent f5a3487 commit 58c6dab

2 files changed

Lines changed: 250 additions & 0 deletions

File tree

src/__init__.py

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6764,6 +6764,27 @@ def search_page_for(
67646764
textpage=textpage,
67656765
)
67666766

6767+
def to_markdown(
    self,
    auto_ocr: bool = True,
    ocr_language: str = "eng",
    ocr_dpi: int = 300,
    page_separator: str = "\n\n---\n\n",
) -> str:
    """Render the whole document as one Markdown string.

    All work is delegated to utils.document_to_markdown(); this method only
    rejects closed documents and forwards the options.

    Args:
        auto_ocr: let scanned pages be OCRed automatically (default True).
        ocr_language: language passed to the OCR step (default "eng").
        ocr_dpi: rendering resolution used for OCR (default 300).
        page_separator: text placed between consecutive pages
            (default "\\n\\n---\\n\\n").

    Returns:
        The Markdown text of all pages, joined by page_separator.

    Raises:
        ValueError: if the document has already been closed.

    Example:
        >>> markdown = doc.to_markdown()  # doc is an open document
        >>> print(markdown)
    """
    if self.is_closed:
        raise ValueError("document closed")
    conversion_options = {
        "auto_ocr": auto_ocr,
        "ocr_language": ocr_language,
        "ocr_dpi": ocr_dpi,
        "page_separator": page_separator,
    }
    return utils.document_to_markdown(self, **conversion_options)
6787+
67676788
def select(self, pyliste):
67686789
"""Build sub-pdf with page numbers in the list."""
67696790
if self.is_closed or self.is_encrypted:
@@ -12178,6 +12199,48 @@ def get_text_words(self, *args, **kwargs):
1217812199
def get_textpage_ocr(self, *args, **kwargs):
    """Forward to utils.get_textpage_ocr() with this page as first argument."""
    ocr_textpage = utils.get_textpage_ocr(self, *args, **kwargs)
    return ocr_textpage
1218012201

12202+
def is_scanned(self, threshold: int = 50) -> bool:
    """Tell whether this page looks scanned (image-based) rather than native.

    Thin wrapper around utils.is_scanned_page().

    Args:
        threshold: number of extracted characters at or above which the
            page counts as having real text (default 50).

    Returns:
        True if the page appears to be scanned, False otherwise.
    """
    looks_scanned = utils.is_scanned_page(self, threshold=threshold)
    return looks_scanned
12212+
12213+
def get_text_smart(self, *args, **kwargs):
    """Extract text from this page, applying OCR automatically if it is scanned.

    Works like get_text(), but forwards to utils.get_text_smart(), which
    builds an OCR text page first when the page is detected as scanned.

    Args:
        option: output format (text, html, dict, etc.).
        auto_ocr: enable the automatic OCR step (default True).
        ocr_language: language used for OCR (default "eng").
        ocr_dpi: rendering resolution used for OCR (default 300).
        ... remaining arguments are the same as for get_text().

    Returns:
        The extraction result in the requested format.
    """
    extracted = utils.get_text_smart(self, *args, **kwargs)
    return extracted
12230+
12231+
def to_markdown(
    self,
    auto_ocr: bool = True,
    ocr_language: str = "eng",
    ocr_dpi: int = 300,
) -> str:
    """Render this page as Markdown, with automatic OCR for scanned pages.

    Thin wrapper around utils.to_markdown().

    Args:
        auto_ocr: let a scanned page be OCRed automatically (default True).
        ocr_language: language passed to the OCR step (default "eng").
        ocr_dpi: rendering resolution used for OCR (default 300).

    Returns:
        The Markdown text of the page.
    """
    ocr_options = {
        "auto_ocr": auto_ocr,
        "ocr_language": ocr_language,
        "ocr_dpi": ocr_dpi,
    }
    return utils.to_markdown(self, **ocr_options)
12243+
1218112244
def get_textpage(self, clip: rect_like = None, flags: int = 0, matrix=None) -> "TextPage":
1218212245
CheckParent(self)
1218312246
if matrix is None:

src/utils.py

Lines changed: 187 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -314,6 +314,33 @@ def get_text_selection(
314314
return rc
315315

316316

317+
def is_scanned_page(
    page: pymupdf.Page,
    threshold: int = 50,
) -> bool:
    """Heuristically decide whether a page is scanned (image-based).

    A page counts as scanned when its extractable text, stripped of
    surrounding whitespace, is shorter than ``threshold`` characters AND the
    page references at least one image.

    Args:
        page: the page to inspect.
        threshold: minimum number of characters for the page to be treated
            as having embedded text (default 50).

    Returns:
        True if the page appears to be scanned, False if it has enough
        embedded text or contains no images.
    """
    pymupdf.CheckParent(page)

    # Enough embedded text means a native page; no need to look at images.
    embedded_text = page.get_textpage(flags=0).extractText()
    if len(embedded_text.strip()) >= threshold:
        return False

    # Little or no text: scanned only if there is image content to OCR.
    return len(page.get_images()) > 0
342+
343+
317344
def get_textpage_ocr(
318345
page: pymupdf.Page,
319346
flags: int = 0,
@@ -398,6 +425,67 @@ def full_ocr(page, dpi, language, flags):
398425
return tpage
399426

400427

428+
def get_text_smart(
    page: pymupdf.Page,
    option: str = "text",
    *,
    clip: rect_like = None,
    flags: OptInt = None,
    textpage: pymupdf.TextPage = None,
    sort: bool = False,
    delimiters=None,
    tolerance=3,
    auto_ocr: bool = True,
    ocr_language: str = "eng",
    ocr_dpi: int = 300,
):
    """Run get_text() on a page, OCRing it first if it looks scanned.

    When ``auto_ocr`` is true, no ``textpage`` was supplied and
    is_scanned_page() reports the page as scanned, a full-page OCR text page
    is created via get_textpage_ocr() and handed to get_text(). In every
    other case this behaves exactly like get_text().

    Args:
        page: the page to extract text from.
        option: (str) text, words, blocks, html, dict, json, rawdict, xhtml or xml.
        clip: (rect-like) restrict output to this area.
        flags: bit switches to e.g. exclude images or decompose ligatures.
        textpage: reuse this pymupdf.TextPage and make no new one; supplying
            one also disables the automatic OCR step.
        sort: whether to sort the output.
        delimiters: word delimiters for the "words" option.
        tolerance: tolerance for text sorting.
        auto_ocr: automatically use OCR for scanned pages (default True).
        ocr_language: language for OCR (default "eng").
        ocr_dpi: DPI for OCR rendering (default 300).

    Returns:
        Whatever get_text() returns for the requested option.
    """
    pymupdf.CheckParent(page)

    # Short-circuit order matters: the scan detection only runs when OCR is
    # enabled and the caller did not bring a text page of their own.
    needs_ocr = auto_ocr and textpage is None and is_scanned_page(page)
    if needs_ocr:
        ocr_flags = flags if flags else 0
        textpage = get_textpage_ocr(
            page,
            flags=ocr_flags,
            language=ocr_language,
            dpi=ocr_dpi,
            full=True,
        )

    # Regular extraction, on the OCR text page if one was just created.
    extraction_args = {
        "option": option,
        "clip": clip,
        "flags": flags,
        "textpage": textpage,
        "sort": sort,
        "delimiters": delimiters,
        "tolerance": tolerance,
    }
    return get_text(page, **extraction_args)
487+
488+
401489
def get_text(
402490
page: pymupdf.Page,
403491
option: str = "text",
@@ -412,6 +500,8 @@ def get_text(
412500
"""Extract text from a page or an annotation.
413501
414502
This is a unifying wrapper for various methods of the pymupdf.TextPage class.
503+
504+
Note: For automatic OCR support on scanned pages, use get_text_smart() instead.
415505
416506
Args:
417507
option: (str) text, words, blocks, html, dict, json, rawdict, xhtml or xml.
@@ -1167,3 +1257,100 @@ def recover_char_quad(line_dir: tuple, span: dict, char: dict) -> pymupdf.Quad:
11671257
raise ValueError("bad span argument")
11681258

11691259
return recover_bbox_quad(line_dir, span, bbox)
1260+
1261+
1262+
def _markdown_span(span: dict) -> str:
    """Return a span's text with Markdown emphasis applied per its font flags."""
    raw = span.get("text", "")
    core = raw.strip()
    if not core:
        # Whitespace-only span: nothing to emphasize, keep the spacing as is.
        return raw

    flags = span.get("flags", 0)
    if flags & 16:  # bold bit of the span flags
        core = f"**{core}**"
    if flags & 2:  # italic bit of the span flags
        core = f"*{core}*"

    # Emphasis markers must hug the text: "**word **" is not rendered as
    # bold, so surrounding whitespace is kept outside the markers.
    leading = raw[: len(raw) - len(raw.lstrip())]
    trailing = raw[len(raw.rstrip()):]
    return f"{leading}{core}{trailing}"


def _markdown_heading_prefix(font_size: float) -> str:
    """Map a font size to the Markdown heading prefix used for it."""
    if font_size > 20:
        return "# "
    if font_size > 16:
        return "## "
    if font_size > 14:
        return "### "
    return ""


def to_markdown(
    page: pymupdf.Page,
    auto_ocr: bool = True,
    ocr_language: str = "eng",
    ocr_dpi: int = 300,
) -> str:
    """Convert page to Markdown format with automatic OCR for scanned pages.

    Text is read via get_text_smart(option="dict"). Each text line becomes
    one output line; bold and italic spans are wrapped in emphasis markers,
    and lines with a large font become headings (> 20: "#", > 16: "##",
    > 14: "###"). An empty line is emitted after every text block as a
    paragraph break. Non-text (image) blocks are ignored.

    Args:
        page: The page to convert
        auto_ocr: Automatically use OCR for scanned pages (default True)
        ocr_language: Language for OCR (default "eng")
        ocr_dpi: DPI for OCR rendering (default 300)

    Returns:
        Markdown-formatted text
    """
    pymupdf.CheckParent(page)

    # Get text with smart OCR detection
    text_dict = get_text_smart(
        page,
        option="dict",
        auto_ocr=auto_ocr,
        ocr_language=ocr_language,
        ocr_dpi=ocr_dpi,
    )

    markdown_lines = []

    for block in text_dict.get("blocks", []):
        if block.get("type") != 0:  # only text blocks carry lines/spans
            continue

        for line in block.get("lines", []):
            spans = line.get("spans", [])
            line_text = "".join(_markdown_span(span) for span in spans)
            if not line_text.strip():
                # Skip empty lines: they would otherwise produce stray
                # paragraph breaks or empty headings.
                continue

            # Heading level comes from the largest visible span of THIS
            # line, never from a value left over from a previous line.
            line_size = max(
                span.get("size", 12)
                for span in spans
                if span.get("text", "").strip()
            )
            markdown_lines.append(
                _markdown_heading_prefix(line_size) + line_text
            )

        markdown_lines.append("")  # Paragraph break

    return "\n".join(markdown_lines)
1323+
1324+
1325+
def document_to_markdown(
    doc: pymupdf.Document,
    auto_ocr: bool = True,
    ocr_language: str = "eng",
    ocr_dpi: int = 300,
    page_separator: str = "\n\n---\n\n",
) -> str:
    """Render every page of a document as Markdown and join the results.

    Pages are processed in order with to_markdown(). Each page's output is
    prefixed with an HTML comment carrying its 1-based page number.

    Args:
        doc: the document to convert.
        auto_ocr: automatically use OCR for scanned pages (default True).
        ocr_language: language for OCR (default "eng").
        ocr_dpi: DPI for OCR rendering (default 300).
        page_separator: string inserted between pages
            (default "\\n\\n---\\n\\n").

    Returns:
        The Markdown text of the whole document.
    """

    def render_page(index: int) -> str:
        # Convert one page and tag it with its human-readable page number.
        body = to_markdown(
            doc[index],
            auto_ocr=auto_ocr,
            ocr_language=ocr_language,
            ocr_dpi=ocr_dpi,
        )
        return f"<!-- Page {index + 1} -->\n\n{body}"

    rendered = [render_page(index) for index in range(len(doc))]
    return page_separator.join(rendered)

0 commit comments

Comments
 (0)