Skip to content

Commit 8a91387

Browse files
committed
Rework textpage_ocr
For partial OCR, we previously added text content from OCR'd images on the page. We now redact legible text and let the OCR engine recognize the remaining page content - which includes images as before but also vectors simulating text.
1 parent 1d272d7 commit 8a91387

1 file changed

Lines changed: 115 additions & 52 deletions

File tree

src/utils.py

Lines changed: 115 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -322,80 +322,143 @@ def get_textpage_ocr(
322322
full: bool = False,
323323
tessdata: str = None,
324324
) -> pymupdf.TextPage:
325-
"""Create a Textpage from combined results of normal and OCR text parsing.
325+
"""Create a Textpage from the OCR version of the page.
326+
327+
OCR can be executed for the full page image, or (the default) only
328+
for areas that are not covered by readable digital text.
326329
327330
Args:
328331
flags: (int) control content becoming part of the result.
329332
language: (str) specify expected language(s). Default is "eng" (English).
330333
dpi: (int) resolution in dpi, default 72.
331-
full: (bool) whether to OCR the full page image, or only its images (default)
334+
full: (bool) whether to OCR the full page, or to keep legible text and OCR only the remaining areas (default)
335+
tessdata: (str) path to Tesseract language data files. If None, the
336+
built-in function is used to find the path.
332337
"""
333338
pymupdf.CheckParent(page)
334-
tessdata = pymupdf.get_tessdata(tessdata)
339+
if tessdata is None:
340+
tessdata = pymupdf.get_tessdata(tessdata)
341+
342+
# Ensure 0xFFFD is not suppressed
343+
flags = (
344+
flags
345+
& ~pymupdf.TEXT_USE_CID_FOR_UNKNOWN_UNICODE # pylint: disable=no-member
346+
& ~pymupdf.TEXT_USE_GID_FOR_UNKNOWN_UNICODE # pylint: disable=no-member
347+
)
335348

336349
def full_ocr(page, dpi, language, flags):
337-
zoom = dpi / 72
338-
mat = pymupdf.Matrix(zoom, zoom)
339-
pix = page.get_pixmap(matrix=mat)
350+
"""Perform OCR for the full page image."""
351+
pix = page.get_pixmap(dpi=dpi)
352+
# create a 1-page PDF with an OCR text layer.
340353
ocr_pdf = pymupdf.Document(
341-
"pdf",
342-
pix.pdfocr_tobytes(
343-
compress=False,
344-
language=language,
345-
tessdata=tessdata,
346-
),
347-
)
354+
stream=pix.pdfocr_tobytes(
355+
compress=False,
356+
language=language,
357+
tessdata=tessdata,
358+
),
359+
)
348360
ocr_page = ocr_pdf.load_page(0)
349361
unzoom = page.rect.width / ocr_page.rect.width
350362
ctm = pymupdf.Matrix(unzoom, unzoom) * page.derotation_matrix
351363
tpage = ocr_page.get_textpage(flags=flags, matrix=ctm)
352-
ocr_pdf.close()
353-
pix = None
364+
365+
# associate the textpage with the original page
354366
tpage.parent = weakref.proxy(page)
355367
return tpage
356368

369+
def partial_ocr(page, dpi, language, flags):
    """Perform OCR for the parts of the page without legible text.

    The page is copied into a temporary PDF so that text can be redacted
    freely without modifying the caller's document. Legible digital text
    is kept in the resulting TextPage; everything else that is rendered
    (images, vector graphics, text containing U+FFFD) is passed to the
    OCR engine and the recognized text is appended.

    Args:
        page: the page to process. Its parent document is the source of
            the temporary copy.
        dpi: (int) resolution of the pixmap handed to the OCR engine.
        language: (str) language(s) handed to the OCR engine.
        flags: (int) text extraction flags for the digital text.

    Returns:
        The TextPage of the legible digital text, extended with the
        OCR-ed text. Its parent is set to a weak proxy of 'page'.
    """
    illegible = chr(0xFFFD)  # replacement character marks unreadable text
    doc = page.parent

    def add_page_copy(pdf):
        """Append a copy of 'page' to 'pdf' and return it de-rotated."""
        pdf.insert_pdf(doc, from_page=page.number, to_page=page.number)
        copied = pdf.load_page(-1)
        copied.remove_rotation()  # avoid OCR problems with rotated pages
        return copied

    def text_spans(textpage):
        """Return all spans of the text blocks of 'textpage'."""
        return [
            span
            for block in textpage.extractDICT()["blocks"]
            if block["type"] == 0
            for line in block["lines"]
            for span in line["spans"]
        ]

    def remove_text(target, bboxes):
        """Redact 'bboxes' on 'target': only text, no images, no vectors."""
        for bbox in bboxes:
            target.add_redact_annot(bbox)
        target.apply_redactions(
            images=pymupdf.PDF_REDACT_IMAGE_NONE,  # pylint: disable=no-member
            graphics=pymupdf.PDF_REDACT_LINE_ART_NONE,  # pylint: disable=no-member
            text=pymupdf.PDF_REDACT_TEXT_REMOVE,  # pylint: disable=no-member
        )

    # make temporary PDF with the passed-in page
    temp_pdf = pymupdf.open()
    temp_page = add_page_copy(temp_pdf)

    # extract the digital text of the page
    tp = temp_page.get_textpage(flags=flags)
    spans = text_spans(tp)

    # For partial OCR we need a TextPage that contains legible text only.
    # Illegible text must be passed to the OCR engine.
    # Select spans with illegible text. If present, remove them first.
    fffd_bboxes = [s["bbox"] for s in spans if illegible in s["text"]]
    if fffd_bboxes:
        remove_text(temp_page, fffd_bboxes)
        # Extract text again, now without the unreadable spans.
        tp = temp_page.get_textpage(flags=flags)
        spans = text_spans(tp)
        # We also need a fresh copy of the original page, because the
        # unreadable spans must still be rendered for the OCR engine.
        temp_page = add_page_copy(temp_pdf)

    # Remove digital text by redacting the bboxes of the legible spans.
    # Then OCR the remainder of the page.
    legible_bboxes = [s["bbox"] for s in spans if illegible not in s["text"]]
    remove_text(temp_page, legible_bboxes)
    pix = temp_page.get_pixmap(dpi=dpi)

    # OCR the redacted page: make a 1-page PDF with an OCR text layer
    ocr_pdf = pymupdf.open(
        stream=pix.pdfocr_tobytes(
            compress=False,
            language=language,
            tessdata=tessdata,
        ),
    )
    ocr_page = ocr_pdf[0]

    # Extend the textpage of the legible text with the OCR-ed text.
    ocr_page.extend_textpage(tp, flags=pymupdf.TEXT_ACCURATE_BBOXES)

    # associate the textpage with the original page
    tp.parent = weakref.proxy(page)
    return tp
454+
357455
# if OCR for the full page, OCR its pixmap @ desired dpi
358456
if full:
359457
return full_ocr(page, dpi, language, flags)
360458

361459
# For partial OCR, make a normal textpage, then extend it with text that
362-
# is OCRed from each image.
363-
# Because of this, we need the images flag bit set ON.
364-
tpage = page.get_textpage(flags=flags)
365-
for block in page.get_text("dict", flags=pymupdf.TEXT_PRESERVE_IMAGES)["blocks"]:
366-
if block["type"] != 1: # only look at images
367-
continue
368-
bbox = pymupdf.Rect(block["bbox"])
369-
if bbox.width <= 3 or bbox.height <= 3: # ignore tiny stuff
370-
continue
371-
try:
372-
pix = pymupdf.Pixmap(block["image"]) # get image pixmap
373-
if pix.n - pix.alpha != 3: # we need to convert this to RGB!
374-
pix = pymupdf.Pixmap(pymupdf.csRGB, pix)
375-
if pix.alpha: # must remove alpha channel
376-
pix = pymupdf.Pixmap(pix, 0)
377-
imgdoc = pymupdf.Document(
378-
"pdf",
379-
pix.pdfocr_tobytes(language=language, tessdata=tessdata),
380-
) # pdf with OCRed page
381-
imgpage = imgdoc.load_page(0) # read image as a page
382-
pix = None
383-
# compute matrix to transform coordinates back to that of 'page'
384-
imgrect = imgpage.rect # page size of image PDF
385-
shrink = pymupdf.Matrix(1 / imgrect.width, 1 / imgrect.height)
386-
mat = shrink * block["transform"]
387-
imgpage.extend_textpage(tpage, flags=0, matrix=mat)
388-
imgdoc.close()
389-
except (RuntimeError, mupdf.FzErrorBase):
390-
if 0 and g_exceptions_verbose:
391-
# Don't show exception info here because it can happen in
392-
# normal operation (see test_3842b).
393-
pymupdf.exception_info()
394-
tpage = None
395-
pymupdf.message("Falling back to full page OCR")
396-
return full_ocr(page, dpi, language, flags)
397-
398-
return tpage
460+
# is OCRed from the rest of the page.
461+
return partial_ocr(page, dpi, language, flags)
399462

400463

401464
def get_text(

0 commit comments

Comments
 (0)