@@ -322,80 +322,103 @@ def get_textpage_ocr(
322322 full : bool = False ,
323323 tessdata : str = None ,
324324) -> pymupdf .TextPage :
325- """Create a Textpage from combined results of normal and OCR text parsing.
325+ """Create a Textpage from the OCR version of the page.
326+
327+ OCR can be executed for the full page image, or (the default) only
328+ for areas that are not covered by readable digital text.
326329
327330 Args:
328331 flags: (int) control content becoming part of the result.
329332 language: (str) specify expected language(s). Default is "eng" (English).
330333 dpi: (int) resolution in dpi, default 72.
331334 full: (bool) whether to OCR the full page image, or only the areas not covered by readable digital text (default)
335+ tessdata: (str) path to Tesseract language data files. If None, the
336+ built-in function is used to find the path.
332337 """
333338 pymupdf .CheckParent (page )
334- tessdata = pymupdf .get_tessdata (tessdata )
339+ if tessdata is None :
340+ tessdata = pymupdf .get_tessdata (tessdata )
335341
def full_ocr(page, dpi, language, flags):
    """OCR the complete rendered image of the page.

    The page is rendered at the requested resolution, the rendering is
    converted to a one-page PDF carrying an OCR text layer, and a textpage
    is extracted from that PDF in the coordinate system of ``page``.

    Args:
        page: the page to be OCR-ed.
        dpi: resolution used for rendering the page image.
        language: language(s) handed to the OCR engine.
        flags: text extraction flags for the resulting textpage.

    Returns:
        The textpage made from the OCR result, with ``page`` as its parent.

    ``tessdata`` is taken from the enclosing function's scope.
    """
    rendered = page.get_pixmap(dpi=dpi)

    # Build a 1-page PDF with an OCR text layer from the rendered image.
    ocr_bytes = rendered.pdfocr_tobytes(
        compress=False,
        language=language,
        tessdata=tessdata,
    )
    ocr_doc = pymupdf.Document(stream=ocr_bytes)
    ocr_page = ocr_doc.load_page(0)

    # Scale the OCR page back to the width of the original page and apply
    # the original page's derotation matrix.
    scale = page.rect.width / ocr_page.rect.width
    to_page = pymupdf.Matrix(scale, scale) * page.derotation_matrix
    textpage = ocr_page.get_textpage(flags=flags, matrix=to_page)

    # Associate the textpage with the original page (weak reference only).
    textpage.parent = weakref.proxy(page)
    return textpage
356361
def partial_ocr(page, dpi, language, flags):
    """Perform OCR for the part of the page without readable text.

    A temporary one-page PDF copy of the page is created so that text can
    be redacted freely. All readable text spans are removed from that copy,
    the remainder is rendered and OCR-ed, and the OCR result is appended to
    the regular textpage of the original page.

    Args:
        page: the page to process; its owning document is read via
            ``page.parent``.
        dpi: resolution used for rendering the redacted page image.
        language: language(s) handed to the OCR engine.
        flags: text extraction flags for the regular (non-OCR) textpage.

    Returns:
        The textpage of ``page``, extended with the OCR-ed text.

    ``tessdata`` is taken from the enclosing function's scope.
    """
    doc = page.parent

    # Make a temporary PDF containing only the passed-in page.
    temp_pdf = pymupdf.open()
    temp_pdf.insert_pdf(doc, from_page=page.number, to_page=page.number)
    temp_page = temp_pdf.load_page(0)
    temp_page.remove_rotation()  # avoid OCR problems with rotated pages

    # Collect the bboxes of all readable text spans of the original page.
    # Spans containing U+FFFD (replacement character) count as unreadable;
    # they are not redacted and therefore remain visible to the OCR step.
    tp = page.get_textpage(flags=flags)
    blocks = tp.extractDICT()["blocks"]
    span_bboxes = [
        span["bbox"]
        for block in blocks
        if block["type"] == 0  # text blocks only
        for line in block["lines"]
        for span in line["spans"]
        if chr(0xFFFD) not in span["text"]
    ]

    # Remove all readable digital text by redacting the span bboxes.
    for bbox in span_bboxes:
        temp_page.add_redact_annot(bbox)

    # Only remove text; leave images and vector graphics untouched.
    temp_page.apply_redactions(
        images=pymupdf.PDF_REDACT_IMAGE_NONE,
        graphics=pymupdf.PDF_REDACT_LINE_ART_NONE,
        text=pymupdf.PDF_REDACT_TEXT_REMOVE,
    )
    pix = temp_page.get_pixmap(dpi=dpi)

    # NOTE(review): this matrix (pixmap rectangle -> page rectangle) is
    # computed but never passed on, e.g. to extend_textpage() below.
    # Confirm whether it was meant to be used as its 'matrix' argument.
    matrix = pymupdf.Rect(pix.irect).torect(page.rect)

    # OCR the redacted page: 1-page PDF with an OCR text layer.
    ocr_pdf = pymupdf.open(
        stream=pix.pdfocr_tobytes(
            compress=False,
            language=language,
            tessdata=tessdata,
        ),
    )
    ocr_page = ocr_pdf[0]

    # Extend the original textpage with the OCR-ed text.
    ocr_page.extend_textpage(tp, flags=pymupdf.TEXT_ACCURATE_BBOXES)
    return tp
414+
357415 # if OCR for the full page, OCR its pixmap @ desired dpi
358416 if full :
359417 return full_ocr (page , dpi , language , flags )
360418
361419 # For partial OCR, make a normal textpage, then extend it with text that
362- # is OCRed from each image.
363- # Because of this, we need the images flag bit set ON.
364- tpage = page .get_textpage (flags = flags )
365- for block in page .get_text ("dict" , flags = pymupdf .TEXT_PRESERVE_IMAGES )["blocks" ]:
366- if block ["type" ] != 1 : # only look at images
367- continue
368- bbox = pymupdf .Rect (block ["bbox" ])
369- if bbox .width <= 3 or bbox .height <= 3 : # ignore tiny stuff
370- continue
371- try :
372- pix = pymupdf .Pixmap (block ["image" ]) # get image pixmap
373- if pix .n - pix .alpha != 3 : # we need to convert this to RGB!
374- pix = pymupdf .Pixmap (pymupdf .csRGB , pix )
375- if pix .alpha : # must remove alpha channel
376- pix = pymupdf .Pixmap (pix , 0 )
377- imgdoc = pymupdf .Document (
378- "pdf" ,
379- pix .pdfocr_tobytes (language = language , tessdata = tessdata ),
380- ) # pdf with OCRed page
381- imgpage = imgdoc .load_page (0 ) # read image as a page
382- pix = None
383- # compute matrix to transform coordinates back to that of 'page'
384- imgrect = imgpage .rect # page size of image PDF
385- shrink = pymupdf .Matrix (1 / imgrect .width , 1 / imgrect .height )
386- mat = shrink * block ["transform" ]
387- imgpage .extend_textpage (tpage , flags = 0 , matrix = mat )
388- imgdoc .close ()
389- except (RuntimeError , mupdf .FzErrorBase ):
390- if 0 and g_exceptions_verbose :
391- # Don't show exception info here because it can happen in
392- # normal operation (see test_3842b).
393- pymupdf .exception_info ()
394- tpage = None
395- pymupdf .message ("Falling back to full page OCR" )
396- return full_ocr (page , dpi , language , flags )
397-
398- return tpage
420+ # is OCRed from the rest of the page.
421+ return partial_ocr (page , dpi , language , flags )
399422
400423
401424def get_text (
0 commit comments