@@ -314,6 +314,33 @@ def get_text_selection(
314314 return rc
315315
316316
def is_scanned_page(
    page: pymupdf.Page,
    threshold: int = 50,
) -> bool:
    """Tell whether a page looks like a scan rather than a native text page.

    The page counts as scanned when its extractable text, stripped of
    surrounding whitespace, is shorter than ``threshold`` characters and the
    page lists at least one image.

    Args:
        page: The page to inspect.
        threshold: Character count below which the page is treated as having
            no meaningful embedded text (default 50).

    Returns:
        True if the page appears to be scanned, False otherwise.
    """
    pymupdf.CheckParent(page)

    extracted = page.get_textpage(flags=0).extractText()
    if len(extracted.strip()) >= threshold:
        # Enough embedded text: native page, no need to look at images.
        return False

    # Little or no text: only call it a scan if the page has image content.
    return len(page.get_images()) > 0
342+
343+
317344def get_textpage_ocr (
318345 page : pymupdf .Page ,
319346 flags : int = 0 ,
@@ -398,6 +425,67 @@ def full_ocr(page, dpi, language, flags):
398425 return tpage
399426
400427
def get_text_smart(
    page: pymupdf.Page,
    option: str = "text",
    *,
    clip: rect_like = None,
    flags: OptInt = None,
    textpage: pymupdf.TextPage = None,
    sort: bool = False,
    delimiters=None,
    tolerance=3,
    auto_ocr: bool = True,
    ocr_language: str = "eng",
    ocr_dpi: int = 300,
):
    """Extract page text, running OCR first when the page looks scanned.

    Behaves like get_text(), except that when ``auto_ocr`` is on, no
    ``textpage`` was supplied and is_scanned_page() reports the page as
    scanned, a full-page OCR text page is built and handed to get_text().

    Args:
        page: The page to extract text from.
        option: (str) text, words, blocks, html, dict, json, rawdict, xhtml or xml.
        clip: (rect-like) restrict output to this area.
        flags: bit switches to e.g. exclude images or decompose ligatures.
        textpage: reuse this pymupdf.TextPage and make no new one; supplying
            one also disables the scanned-page check.
        sort: whether to sort the output.
        delimiters: word delimiters for the "words" option.
        tolerance: tolerance for text sorting.
        auto_ocr: automatically use OCR for scanned pages (default True).
        ocr_language: language passed to the OCR step (default "eng").
        ocr_dpi: rendering resolution passed to the OCR step (default 300).

    Returns:
        Whatever get_text() returns for the given option.
    """
    pymupdf.CheckParent(page)

    # Evaluation order matters: the scanned-page probe only runs when OCR is
    # enabled and the caller did not bring a text page of their own.
    needs_ocr = auto_ocr and textpage is None and is_scanned_page(page)
    if needs_ocr:
        ocr_flags = flags if flags else 0
        textpage = get_textpage_ocr(
            page,
            flags=ocr_flags,
            language=ocr_language,
            dpi=ocr_dpi,
            full=True,
        )

    # Regular extraction; uses the OCR text page when one was created above.
    extraction_args = dict(
        option=option,
        clip=clip,
        flags=flags,
        textpage=textpage,
        sort=sort,
        delimiters=delimiters,
        tolerance=tolerance,
    )
    return get_text(page, **extraction_args)
487+
488+
401489def get_text (
402490 page : pymupdf .Page ,
403491 option : str = "text" ,
@@ -412,6 +500,8 @@ def get_text(
412500 """Extract text from a page or an annotation.
413501
414502 This is a unifying wrapper for various methods of the pymupdf.TextPage class.
503+
504+ Note: For automatic OCR support on scanned pages, use get_text_smart() instead.
415505
416506 Args:
417507 option: (str) text, words, blocks, html, dict, json, rawdict, xhtml or xml.
@@ -1167,3 +1257,100 @@ def recover_char_quad(line_dir: tuple, span: dict, char: dict) -> pymupdf.Quad:
11671257 raise ValueError ("bad span argument" )
11681258
11691259 return recover_bbox_quad (line_dir , span , bbox )
1260+
1261+
def _markdown_emphasis(text: str, flags: int) -> str:
    """Wrap the non-blank core of ``text`` in bold/italic markers per ``flags``."""
    left_trimmed = text.lstrip()
    core = left_trimmed.rstrip()
    if not core:
        # Nothing visible to emphasize; markers around blanks render literally.
        return text

    leading = text[: len(text) - len(left_trimmed)]
    trailing = left_trimmed[len(core):]

    if flags & 16:  # Bold flag
        core = f"**{core}**"
    if flags & 2:  # Italic flag
        core = f"*{core}*"

    # Keep surrounding whitespace outside the markers: Markdown does not
    # treat "**word **" as emphasis.
    return f"{leading}{core}{trailing}"


def to_markdown(
    page: pymupdf.Page,
    auto_ocr: bool = True,
    ocr_language: str = "eng",
    ocr_dpi: int = 300,
) -> str:
    """Convert page to Markdown format with automatic OCR for scanned pages.

    Text is taken from get_text_smart() with the "dict" option. Every line of
    every text block becomes one output line; bold and italic spans are
    wrapped in Markdown emphasis markers, and a line is turned into a heading
    based on the font size of its last span (> 20: "#", > 16: "##",
    > 14: "###"). An empty line is appended after each text block.

    Args:
        page: The page to convert
        auto_ocr: Automatically use OCR for scanned pages (default True)
        ocr_language: Language for OCR (default "eng")
        ocr_dpi: DPI for OCR rendering (default 300)

    Returns:
        Markdown-formatted text
    """
    pymupdf.CheckParent(page)

    # Get text with smart OCR detection
    text_dict = get_text_smart(
        page,
        option="dict",
        auto_ocr=auto_ocr,
        ocr_language=ocr_language,
        ocr_dpi=ocr_dpi,
    )

    # (minimum font size exclusive, heading prefix), largest first.
    heading_levels = ((20, "# "), (16, "## "), (14, "### "))

    markdown_lines = []

    for block in text_dict.get("blocks", []):
        if block.get("type") != 0:  # Only text blocks are rendered
            continue

        for line in block.get("lines", []):
            spans = line.get("spans", [])
            line_text = "".join(
                _markdown_emphasis(span.get("text", ""), span.get("flags", 0))
                for span in spans
            )

            # Size of the last span decides the heading level. Computed per
            # line so a line without spans neither fails nor inherits the
            # size of the previous line.
            font_size = spans[-1].get("size", 12) if spans else 12

            prefix = ""
            if line_text.strip():  # never emit an empty heading
                for min_size, marker in heading_levels:
                    if font_size > min_size:
                        prefix = marker
                        break

            markdown_lines.append(f"{prefix}{line_text}")

        markdown_lines.append("")  # Paragraph break

    return "\n".join(markdown_lines)
1323+
1324+
def document_to_markdown(
    doc: pymupdf.Document,
    auto_ocr: bool = True,
    ocr_language: str = "eng",
    ocr_dpi: int = 300,
    page_separator: str = "\n\n---\n\n",
) -> str:
    """Convert entire document to Markdown with automatic OCR.

    Each page is converted with to_markdown() and prefixed with an HTML
    comment carrying its 1-based page number; the per-page results are then
    joined with ``page_separator``.

    Args:
        doc: The document to convert
        auto_ocr: Automatically use OCR for scanned pages (default True)
        ocr_language: Language for OCR (default "eng")
        ocr_dpi: DPI for OCR rendering (default 300)
        page_separator: String to separate pages (default "\\n\\n---\\n\\n")

    Returns:
        Markdown-formatted text for entire document
    """
    pages = []

    # Iterate the document directly; numbering starts at 1 for the markers.
    for page_number, page in enumerate(doc, start=1):
        md = to_markdown(
            page,
            auto_ocr=auto_ocr,
            ocr_language=ocr_language,
            ocr_dpi=ocr_dpi,
        )
        pages.append(f"<!-- Page {page_number} -->\n\n{md}")

    return page_separator.join(pages)
0 commit comments