Skip to content

Commit 26285a8

Browse files
committed
Rework textpage_ocr
For partial OCR, we previously added text content obtained by OCR-ing the images on the page. We now redact the legible text and let the OCR engine recognize the remaining page content - which includes images as before, but also vector graphics that simulate text.
1 parent 1d272d7 commit 26285a8

1 file changed

Lines changed: 115 additions & 56 deletions

File tree

src/utils.py

Lines changed: 115 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -14,10 +14,6 @@
1414
from . import pymupdf
1515
except Exception:
1616
import pymupdf
17-
try:
18-
from . import mupdf
19-
except Exception:
20-
import mupdf
2117

2218
_format_g = pymupdf.format_g
2319

@@ -322,80 +318,143 @@ def get_textpage_ocr(
322318
full: bool = False,
323319
tessdata: str = None,
324320
) -> pymupdf.TextPage:
325-
"""Create a Textpage from combined results of normal and OCR text parsing.
321+
"""Create a Textpage from the OCR version of the page.
322+
323+
OCR can be executed for the full page image, or (the default) only
324+
for areas that are not covered by readable digital text.
326325
327326
Args:
328327
flags: (int) control content becoming part of the result.
329328
language: (str) specify expected language(s). Default is "eng" (English).
330329
dpi: (int) resolution in dpi, default 72.
331-
full: (bool) whether to OCR the full page image, or only its images (default)
330+
full: (bool) whether to OCR the full page, or (the default) to keep legible text and OCR only the remaining page content.
331+
tessdata: (str) path to Tesseract language data files. If None, the
332+
built-in function is used to find the path.
332333
"""
333334
pymupdf.CheckParent(page)
334-
tessdata = pymupdf.get_tessdata(tessdata)
335+
if tessdata is None:
336+
tessdata = pymupdf.get_tessdata(tessdata)
337+
338+
# Ensure unknown characters stay 0xFFFD (not replaced by CID / GID values), so illegible spans can be detected
339+
flags = (
340+
flags
341+
& ~pymupdf.TEXT_USE_CID_FOR_UNKNOWN_UNICODE # pylint: disable=no-member
342+
& ~pymupdf.TEXT_USE_GID_FOR_UNKNOWN_UNICODE # pylint: disable=no-member
343+
)
335344

336345
def full_ocr(page, dpi, language, flags):
337-
zoom = dpi / 72
338-
mat = pymupdf.Matrix(zoom, zoom)
339-
pix = page.get_pixmap(matrix=mat)
346+
"""Perform OCR for the full page image."""
347+
pix = page.get_pixmap(dpi=dpi)
348+
# create a 1-page PDF with an OCR text layer.
340349
ocr_pdf = pymupdf.Document(
341-
"pdf",
342-
pix.pdfocr_tobytes(
343-
compress=False,
344-
language=language,
345-
tessdata=tessdata,
346-
),
347-
)
350+
stream=pix.pdfocr_tobytes(
351+
compress=False,
352+
language=language,
353+
tessdata=tessdata,
354+
),
355+
)
348356
ocr_page = ocr_pdf.load_page(0)
349357
unzoom = page.rect.width / ocr_page.rect.width
350358
ctm = pymupdf.Matrix(unzoom, unzoom) * page.derotation_matrix
351359
tpage = ocr_page.get_textpage(flags=flags, matrix=ctm)
352-
ocr_pdf.close()
353-
pix = None
360+
361+
# associate the textpage with the original page
354362
tpage.parent = weakref.proxy(page)
355363
return tpage
356364

365+
def partial_ocr(page, dpi, language, flags):
366+
"""Perform OCR for parts of the page without legible text.
367+
368+
We create a temporary PDF for which we can freely redact text.
369+
"""
370+
doc = page.parent
371+
372+
# make temporary PDF with the passed-in page
373+
temp_pdf = pymupdf.open()
374+
temp_pdf.insert_pdf(doc, from_page=page.number, to_page=page.number)
375+
temp_page = temp_pdf.load_page(0)
376+
temp_page.remove_rotation() # avoid OCR problems with rotated pages
377+
378+
# extract text bboxes from the page
379+
tp = temp_page.get_textpage(flags=flags)
380+
blocks = tp.extractDICT()["blocks"]
381+
382+
"""
383+
For partial OCR we need a TextPage that contains legible text only.
384+
Illegible text must be passed to the OCR engine.
385+
"""
386+
# Select spans with illegible text. If present, remove them first.
387+
fffd_spans = [
388+
s["bbox"]
389+
for b in blocks
390+
if b["type"] == 0
391+
for l in b["lines"]
392+
for s in l["spans"]
393+
if chr(0xFFFD) in s["text"]
394+
]
395+
if fffd_spans:
396+
for bbox in fffd_spans:
397+
temp_page.add_redact_annot(bbox)
398+
temp_page.apply_redactions(
399+
images=pymupdf.PDF_REDACT_IMAGE_NONE, # pylint: disable=no-member
400+
graphics=pymupdf.PDF_REDACT_LINE_ART_NONE, # pylint: disable=no-member
401+
text=pymupdf.PDF_REDACT_TEXT_REMOVE, # pylint: disable=no-member
402+
)
403+
# Extract text again, now without the unreadable spans.
404+
tp = temp_page.get_textpage(flags=flags)
405+
blocks = tp.extractDICT()["blocks"]
406+
# We also need a fresh copy of the original page.
407+
temp_pdf.insert_pdf(doc, from_page=page.number, to_page=page.number)
408+
temp_page = temp_pdf.load_page(-1)
409+
temp_page.remove_rotation() # avoid OCR problems with rotated pages
410+
411+
span_bboxes = [
412+
s["bbox"]
413+
for b in blocks
414+
if b["type"] == 0
415+
for l in b["lines"]
416+
for s in l["spans"]
417+
if not chr(0xFFFD) in s["text"]
418+
]
419+
420+
# Remove digital text by redacting the span bboxes.
421+
# Then OCR the remainder of the page.
422+
for bbox in span_bboxes:
423+
temp_page.add_redact_annot(bbox)
424+
425+
# only remove text, no images, no vectors
426+
temp_page.apply_redactions(
427+
images=pymupdf.PDF_REDACT_IMAGE_NONE, # pylint: disable=no-member
428+
graphics=pymupdf.PDF_REDACT_LINE_ART_NONE, # pylint: disable=no-member
429+
text=pymupdf.PDF_REDACT_TEXT_REMOVE, # pylint: disable=no-member
430+
)
431+
pix = temp_page.get_pixmap(dpi=dpi)
432+
# matrix = pymupdf.Rect(pix.irect).torect(page.rect)
433+
434+
# OCR the redacted page
435+
ocr_pdf = pymupdf.open(
436+
stream=pix.pdfocr_tobytes(
437+
compress=False,
438+
language=language,
439+
tessdata=tessdata,
440+
),
441+
)
442+
ocr_page = ocr_pdf[0]
443+
444+
# Extend the original textpage with OCR-ed text.
445+
ocr_page.extend_textpage(tp, flags=pymupdf.TEXT_ACCURATE_BBOXES)
446+
447+
# associate the textpage with the original page
448+
tp.parent = weakref.proxy(page)
449+
return tp
450+
357451
# if OCR for the full page, OCR its pixmap @ desired dpi
358452
if full:
359453
return full_ocr(page, dpi, language, flags)
360454

361455
# For partial OCR, make a normal textpage, then extend it with text that
362-
# is OCRed from each image.
363-
# Because of this, we need the images flag bit set ON.
364-
tpage = page.get_textpage(flags=flags)
365-
for block in page.get_text("dict", flags=pymupdf.TEXT_PRESERVE_IMAGES)["blocks"]:
366-
if block["type"] != 1: # only look at images
367-
continue
368-
bbox = pymupdf.Rect(block["bbox"])
369-
if bbox.width <= 3 or bbox.height <= 3: # ignore tiny stuff
370-
continue
371-
try:
372-
pix = pymupdf.Pixmap(block["image"]) # get image pixmap
373-
if pix.n - pix.alpha != 3: # we need to convert this to RGB!
374-
pix = pymupdf.Pixmap(pymupdf.csRGB, pix)
375-
if pix.alpha: # must remove alpha channel
376-
pix = pymupdf.Pixmap(pix, 0)
377-
imgdoc = pymupdf.Document(
378-
"pdf",
379-
pix.pdfocr_tobytes(language=language, tessdata=tessdata),
380-
) # pdf with OCRed page
381-
imgpage = imgdoc.load_page(0) # read image as a page
382-
pix = None
383-
# compute matrix to transform coordinates back to that of 'page'
384-
imgrect = imgpage.rect # page size of image PDF
385-
shrink = pymupdf.Matrix(1 / imgrect.width, 1 / imgrect.height)
386-
mat = shrink * block["transform"]
387-
imgpage.extend_textpage(tpage, flags=0, matrix=mat)
388-
imgdoc.close()
389-
except (RuntimeError, mupdf.FzErrorBase):
390-
if 0 and g_exceptions_verbose:
391-
# Don't show exception info here because it can happen in
392-
# normal operation (see test_3842b).
393-
pymupdf.exception_info()
394-
tpage = None
395-
pymupdf.message("Falling back to full page OCR")
396-
return full_ocr(page, dpi, language, flags)
397-
398-
return tpage
456+
# is OCRed from the rest of the page.
457+
return partial_ocr(page, dpi, language, flags)
399458

400459

401460
def get_text(

0 commit comments

Comments
 (0)