Skip to content

Commit ffb5294

Browse files
committed
Rework textpage_ocr
1 parent 1d272d7 commit ffb5294

1 file changed

Lines changed: 74 additions & 51 deletions

File tree

src/utils.py

Lines changed: 74 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -322,80 +322,103 @@ def get_textpage_ocr(
322322
full: bool = False,
323323
tessdata: str = None,
324324
) -> pymupdf.TextPage:
325-
"""Create a Textpage from combined results of normal and OCR text parsing.
325+
"""Create a Textpage from the OCR version of the page.
326+
327+
OCR can be executed for the full page image, or (the default) only
328+
for areas that are not covered by readable digital text.
326329
327330
Args:
328331
flags: (int) control content becoming part of the result.
329332
language: (str) specify expected language(s). Default is "eng" (English).
330333
dpi: (int) resolution in dpi, default 72.
331334
full: (bool) whether to OCR the full page image, or only the areas not covered by readable digital text (default).
335+
tessdata: (str) path to Tesseract language data files. If None, the
336+
built-in function is used to find the path.
332337
"""
333338
pymupdf.CheckParent(page)
334-
tessdata = pymupdf.get_tessdata(tessdata)
339+
if tessdata is None:
340+
tessdata = pymupdf.get_tessdata(tessdata)
335341

336342
def full_ocr(page, dpi, language, flags):
337-
zoom = dpi / 72
338-
mat = pymupdf.Matrix(zoom, zoom)
339-
pix = page.get_pixmap(matrix=mat)
343+
"""Perform OCR for the full page image."""
344+
pix = page.get_pixmap(dpi=dpi)
345+
# create a 1-page PDF with an OCR text layer.
340346
ocr_pdf = pymupdf.Document(
341-
"pdf",
342-
pix.pdfocr_tobytes(
343-
compress=False,
344-
language=language,
345-
tessdata=tessdata,
346-
),
347-
)
347+
stream=pix.pdfocr_tobytes(
348+
compress=False,
349+
language=language,
350+
tessdata=tessdata,
351+
),
352+
)
348353
ocr_page = ocr_pdf.load_page(0)
349354
unzoom = page.rect.width / ocr_page.rect.width
350355
ctm = pymupdf.Matrix(unzoom, unzoom) * page.derotation_matrix
351356
tpage = ocr_page.get_textpage(flags=flags, matrix=ctm)
352-
ocr_pdf.close()
353-
pix = None
357+
358+
# associate the textpage with the original page
354359
tpage.parent = weakref.proxy(page)
355360
return tpage
356361

362+
def partial_ocr(page, dpi, language, flags):
    """Perform OCR for the part of the page without readable text.

    The page is copied into a temporary one-page PDF in which all readable
    digital text is removed by redaction. What remains is rendered to a
    pixmap, OCR-ed, and the recognized text is appended to the normal
    textpage of the page.

    Args:
        page: the page to process.
        dpi: (int) resolution of the pixmap handed to the OCR engine.
        language: (str) expected language(s) for OCR.
        flags: (int) extraction flags used for the normal textpage.

    Returns:
        The textpage of 'page', extended with the OCR-ed text.
    """
    doc = page.parent

    # make temporary PDF with the passed-in page
    temp_pdf = pymupdf.open()
    try:
        temp_pdf.insert_pdf(doc, from_page=page.number, to_page=page.number)
        temp_page = temp_pdf.load_page(0)
        temp_page.remove_rotation()  # avoid OCR problems with rotated pages

        # extract readable text span bboxes from the page
        tp = page.get_textpage(flags=flags)
        unreadable = chr(0xFFFD)  # Unicode replacement character
        span_bboxes = [
            span["bbox"]
            for block in tp.extractDICT()["blocks"]
            if block["type"] == 0  # text blocks only
            for line in block["lines"]
            for span in line["spans"]
            if unreadable not in span["text"]  # omit unreadable spans
        ]

        # Remove all readable digital text by redacting the span bboxes.
        # Then OCR the remainder of the page.
        for bbox in span_bboxes:
            temp_page.add_redact_annot(bbox)

        # only remove text, no images, no vectors
        temp_page.apply_redactions(
            images=pymupdf.PDF_REDACT_IMAGE_NONE,
            graphics=pymupdf.PDF_REDACT_LINE_ART_NONE,
            text=pymupdf.PDF_REDACT_TEXT_REMOVE,
        )
        pix = temp_page.get_pixmap(dpi=dpi)
    finally:
        # the pixmap is all we need from the temporary PDF
        temp_pdf.close()

    # OCR the redacted page
    ocr_pdf = pymupdf.open(
        stream=pix.pdfocr_tobytes(
            compress=False,
            language=language,
            tessdata=tessdata,
        ),
    )
    try:
        ocr_page = ocr_pdf[0]

        # Map OCR page coordinates onto the original page rectangle, same
        # idea as the 'unzoom' step in full_ocr. Previously a matrix was
        # computed here but never passed on.
        # NOTE(review): for rotated pages, confirm that page.rect is the
        # coordinate system of 'tp'.
        matrix = ocr_page.rect.torect(page.rect)

        # Extend the original textpage with OCR-ed text.
        ocr_page.extend_textpage(
            tp,
            flags=pymupdf.TEXT_ACCURATE_BBOXES,
            matrix=matrix,
        )
    finally:
        ocr_pdf.close()
    return tp
414+
357415
# if OCR for the full page, OCR its pixmap @ desired dpi
358416
if full:
359417
return full_ocr(page, dpi, language, flags)
360418

361419
# For partial OCR, make a normal textpage, then extend it with text that
362-
# is OCRed from each image.
363-
# Because of this, we need the images flag bit set ON.
364-
tpage = page.get_textpage(flags=flags)
365-
for block in page.get_text("dict", flags=pymupdf.TEXT_PRESERVE_IMAGES)["blocks"]:
366-
if block["type"] != 1: # only look at images
367-
continue
368-
bbox = pymupdf.Rect(block["bbox"])
369-
if bbox.width <= 3 or bbox.height <= 3: # ignore tiny stuff
370-
continue
371-
try:
372-
pix = pymupdf.Pixmap(block["image"]) # get image pixmap
373-
if pix.n - pix.alpha != 3: # we need to convert this to RGB!
374-
pix = pymupdf.Pixmap(pymupdf.csRGB, pix)
375-
if pix.alpha: # must remove alpha channel
376-
pix = pymupdf.Pixmap(pix, 0)
377-
imgdoc = pymupdf.Document(
378-
"pdf",
379-
pix.pdfocr_tobytes(language=language, tessdata=tessdata),
380-
) # pdf with OCRed page
381-
imgpage = imgdoc.load_page(0) # read image as a page
382-
pix = None
383-
# compute matrix to transform coordinates back to that of 'page'
384-
imgrect = imgpage.rect # page size of image PDF
385-
shrink = pymupdf.Matrix(1 / imgrect.width, 1 / imgrect.height)
386-
mat = shrink * block["transform"]
387-
imgpage.extend_textpage(tpage, flags=0, matrix=mat)
388-
imgdoc.close()
389-
except (RuntimeError, mupdf.FzErrorBase):
390-
if 0 and g_exceptions_verbose:
391-
# Don't show exception info here because it can happen in
392-
# normal operation (see test_3842b).
393-
pymupdf.exception_info()
394-
tpage = None
395-
pymupdf.message("Falling back to full page OCR")
396-
return full_ocr(page, dpi, language, flags)
397-
398-
return tpage
420+
# is OCRed from the rest of the page.
421+
return partial_ocr(page, dpi, language, flags)
399422

400423

401424
def get_text(

0 commit comments

Comments
 (0)