@@ -322,80 +322,143 @@ def get_textpage_ocr(
322322 full : bool = False ,
323323 tessdata : str = None ,
324324) -> pymupdf .TextPage :
325- """Create a Textpage from combined results of normal and OCR text parsing.
325+ """Create a Textpage from the OCR version of the page.
326+
327+ OCR can be executed for the full page image, or (the default) only
328+ for areas that are not covered by readable digital text.
326329
327330 Args:
328331 flags: (int) control content becoming part of the result.
329332 language: (str) specify expected language(s). Default is "eng" (English).
330333 dpi: (int) resolution in dpi, default 72.
331- full: (bool) whether to OCR the full page image, or only its images (default)
334+ full: (bool) whether to OCR the full page, or to keep legible text
335+ tessdata: (str) path to Tesseract language data files. If None, the
336+ built-in function is used to find the path.
332337 """
333338 pymupdf .CheckParent (page )
334- tessdata = pymupdf .get_tessdata (tessdata )
339+ if tessdata is None :
340+ tessdata = pymupdf .get_tessdata (tessdata )
341+
342+ # Ensure 0xFFFD is not suppressed
343+ flags = (
344+ flags
345+ & ~ pymupdf .TEXT_USE_CID_FOR_UNKNOWN_UNICODE # pylint: disable=no-member
346+ & ~ pymupdf .TEXT_USE_GID_FOR_UNKNOWN_UNICODE # pylint: disable=no-member
347+ )
335348
def full_ocr(page, dpi, language, flags):
    """Build a TextPage by running OCR over the rendered image of the page.

    Args:
        page: the page to be OCR-ed.
        dpi: (int) resolution used when rendering the page image.
        language: (str) language(s) handed to the OCR engine.
        flags: (int) extraction flags used for the resulting TextPage.

    Returns:
        A TextPage extracted from the OCR result, with its ``parent`` set
        to a weak proxy of ``page``.

    ``tessdata`` is not a parameter; it is read from the enclosing scope.
    """
    # Render the page, then have the pixmap produce a 1-page PDF (as bytes)
    # that carries the OCR text layer.
    rendered = page.get_pixmap(dpi=dpi)
    ocr_bytes = rendered.pdfocr_tobytes(
        compress=False,
        language=language,
        tessdata=tessdata,
    )
    ocr_doc = pymupdf.Document(stream=ocr_bytes)
    first_page = ocr_doc.load_page(0)

    # Scale by the width ratio between the source page and the OCR page,
    # combined with the source page's derotation matrix.
    scale = page.rect.width / first_page.rect.width
    to_source = pymupdf.Matrix(scale, scale) * page.derotation_matrix
    result = first_page.get_textpage(flags=flags, matrix=to_source)

    # The textpage must point at the original page, not the OCR page.
    result.parent = weakref.proxy(page)
    return result
356368
def partial_ocr(page, dpi, language, flags):
    """Build a TextPage from legible digital text plus OCR of everything else.

    All redactions are applied to copies of the page inside a temporary
    PDF, never to ``page`` itself.

    Steps:
      1. Extract the text of a derotated copy of the page.
      2. If any span contains U+FFFD (treated as illegible text), redact
         those spans, extract the text again so that the TextPage holds
         legible text only, and continue with a second, fresh copy of the
         page.
      3. Redact the legible spans from the working copy, render it and OCR
         the rendered image.
      4. Extend the TextPage from step 1/2 with the OCR-ed text.

    Args:
        page: the page to process.
        dpi: (int) resolution used when rendering the redacted page.
        language: (str) language(s) handed to the OCR engine.
        flags: (int) extraction flags used for the digital-text TextPage.

    Returns:
        The extended TextPage, with its ``parent`` set to a weak proxy of
        ``page``.

    ``tessdata`` is not a parameter; it is read from the enclosing scope.
    """
    illegible_marker = chr(0xFFFD)  # Unicode REPLACEMENT CHARACTER
    doc = page.parent

    def add_page_copy(target_pdf):
        """Append a copy of ``page`` to target_pdf and return it derotated."""
        # NOTE(review): insert_pdf presumably requires a PDF source -
        # confirm the behavior for non-PDF documents.
        target_pdf.insert_pdf(doc, from_page=page.number, to_page=page.number)
        copied = target_pdf.load_page(-1)
        copied.remove_rotation()  # avoid OCR problems with rotated pages
        return copied

    def text_spans(textpage):
        """Return all spans of the text blocks (type 0) of textpage."""
        return [
            span
            for block in textpage.extractDICT()["blocks"]
            if block["type"] == 0
            for line in block["lines"]
            for span in line["spans"]
        ]

    def remove_text(target_page, bboxes):
        """Redact the given bboxes: only remove text, no images, no vectors."""
        for bbox in bboxes:
            target_page.add_redact_annot(bbox)
        target_page.apply_redactions(
            images=pymupdf.PDF_REDACT_IMAGE_NONE,  # pylint: disable=no-member
            graphics=pymupdf.PDF_REDACT_LINE_ART_NONE,  # pylint: disable=no-member
            text=pymupdf.PDF_REDACT_TEXT_REMOVE,  # pylint: disable=no-member
        )

    # make temporary PDF with the passed-in page
    temp_pdf = pymupdf.open()
    temp_page = add_page_copy(temp_pdf)

    # extract text spans from the page
    tp = temp_page.get_textpage(flags=flags)
    spans = text_spans(tp)

    # For partial OCR we need a TextPage that contains legible text only.
    # Illegible text must be passed to the OCR engine instead.
    illegible_bboxes = [s["bbox"] for s in spans if illegible_marker in s["text"]]
    if illegible_bboxes:
        remove_text(temp_page, illegible_bboxes)
        # Extract text again, now without the unreadable spans.
        tp = temp_page.get_textpage(flags=flags)
        spans = text_spans(tp)
        # The page just redacted has lost its illegible text, so the OCR
        # input must come from a fresh copy of the original page.
        temp_page = add_page_copy(temp_pdf)

    # Remove the legible digital text by redacting its span bboxes,
    # then OCR the remainder of the page.
    legible_bboxes = [
        s["bbox"] for s in spans if illegible_marker not in s["text"]
    ]
    remove_text(temp_page, legible_bboxes)
    pix = temp_page.get_pixmap(dpi=dpi)

    # OCR the redacted page
    ocr_pdf = pymupdf.open(
        stream=pix.pdfocr_tobytes(
            compress=False,
            language=language,
            tessdata=tessdata,
        ),
    )
    ocr_page = ocr_pdf[0]

    # Extend the digital-text textpage with the OCR-ed text.
    ocr_page.extend_textpage(tp, flags=pymupdf.TEXT_ACCURATE_BBOXES)

    # associate the textpage with the original page
    tp.parent = weakref.proxy(page)
    return tp
454+
357455 # if OCR for the full page, OCR its pixmap @ desired dpi
358456 if full :
359457 return full_ocr (page , dpi , language , flags )
360458
361459 # For partial OCR, make a normal textpage, then extend it with text that
362- # is OCRed from each image.
363- # Because of this, we need the images flag bit set ON.
364- tpage = page .get_textpage (flags = flags )
365- for block in page .get_text ("dict" , flags = pymupdf .TEXT_PRESERVE_IMAGES )["blocks" ]:
366- if block ["type" ] != 1 : # only look at images
367- continue
368- bbox = pymupdf .Rect (block ["bbox" ])
369- if bbox .width <= 3 or bbox .height <= 3 : # ignore tiny stuff
370- continue
371- try :
372- pix = pymupdf .Pixmap (block ["image" ]) # get image pixmap
373- if pix .n - pix .alpha != 3 : # we need to convert this to RGB!
374- pix = pymupdf .Pixmap (pymupdf .csRGB , pix )
375- if pix .alpha : # must remove alpha channel
376- pix = pymupdf .Pixmap (pix , 0 )
377- imgdoc = pymupdf .Document (
378- "pdf" ,
379- pix .pdfocr_tobytes (language = language , tessdata = tessdata ),
380- ) # pdf with OCRed page
381- imgpage = imgdoc .load_page (0 ) # read image as a page
382- pix = None
383- # compute matrix to transform coordinates back to that of 'page'
384- imgrect = imgpage .rect # page size of image PDF
385- shrink = pymupdf .Matrix (1 / imgrect .width , 1 / imgrect .height )
386- mat = shrink * block ["transform" ]
387- imgpage .extend_textpage (tpage , flags = 0 , matrix = mat )
388- imgdoc .close ()
389- except (RuntimeError , mupdf .FzErrorBase ):
390- if 0 and g_exceptions_verbose :
391- # Don't show exception info here because it can happen in
392- # normal operation (see test_3842b).
393- pymupdf .exception_info ()
394- tpage = None
395- pymupdf .message ("Falling back to full page OCR" )
396- return full_ocr (page , dpi , language , flags )
397-
398- return tpage
460+ # is OCRed from the rest of the page.
461+ return partial_ocr (page , dpi , language , flags )
399462
400463
401464def get_text (
0 commit comments