|
14 | 14 | from . import pymupdf |
15 | 15 | except Exception: |
16 | 16 | import pymupdf |
17 | | -try: |
18 | | - from . import mupdf |
19 | | -except Exception: |
20 | | - import mupdf |
21 | 17 |
|
22 | 18 | _format_g = pymupdf.format_g |
23 | 19 |
|
@@ -322,80 +318,142 @@ def get_textpage_ocr( |
322 | 318 | full: bool = False, |
323 | 319 | tessdata: str = None, |
324 | 320 | ) -> pymupdf.TextPage: |
325 | | - """Create a Textpage from combined results of normal and OCR text parsing. |
| 321 | + """Create a Textpage from the OCR version of the page. |
| 322 | +
|
| 323 | + OCR can be executed for the full page image, or (the default) only |
| 324 | + for areas that are not covered by readable digital text. |
326 | 325 |
|
327 | 326 | Args: |
328 | 327 | flags: (int) control content becoming part of the result. |
329 | 328 | language: (str) specify expected language(s). Default is "eng" (English). |
330 | 329 | dpi: (int) resolution in dpi, default 72. |
331 | | - full: (bool) whether to OCR the full page image, or only its images (default) |
| 330 | + full: (bool) whether to OCR the full page, or (default) to keep legible text and OCR only the remaining areas. |
| 331 | + tessdata: (str) path to Tesseract language data files. If None, the |
| 332 | + built-in function is used to find the path. |
332 | 333 | """ |
333 | 334 | pymupdf.CheckParent(page) |
334 | 335 | tessdata = pymupdf.get_tessdata(tessdata) |
335 | 336 |
|
| 337 | + # Ensure 0xFFFD is not suppressed |
| 338 | + flags = ( |
| 339 | + flags |
| 340 | + & ~pymupdf.TEXT_USE_CID_FOR_UNKNOWN_UNICODE # pylint: disable=no-member |
| 341 | + & ~pymupdf.TEXT_USE_GID_FOR_UNKNOWN_UNICODE # pylint: disable=no-member |
| 342 | + ) |
| 343 | + |
336 | 344 | def full_ocr(page, dpi, language, flags): |
337 | | - zoom = dpi / 72 |
338 | | - mat = pymupdf.Matrix(zoom, zoom) |
339 | | - pix = page.get_pixmap(matrix=mat) |
| 345 | + """Perform OCR for the full page image.""" |
| 346 | + pix = page.get_pixmap(dpi=dpi) |
| 347 | + # create a 1-page PDF with an OCR text layer. |
340 | 348 | ocr_pdf = pymupdf.Document( |
341 | | - "pdf", |
342 | | - pix.pdfocr_tobytes( |
343 | | - compress=False, |
344 | | - language=language, |
345 | | - tessdata=tessdata, |
346 | | - ), |
347 | | - ) |
| 349 | + stream=pix.pdfocr_tobytes( |
| 350 | + compress=False, |
| 351 | + language=language, |
| 352 | + tessdata=tessdata, |
| 353 | + ), |
| 354 | + ) |
348 | 355 | ocr_page = ocr_pdf.load_page(0) |
349 | 356 | unzoom = page.rect.width / ocr_page.rect.width |
350 | 357 | ctm = pymupdf.Matrix(unzoom, unzoom) * page.derotation_matrix |
351 | 358 | tpage = ocr_page.get_textpage(flags=flags, matrix=ctm) |
352 | | - ocr_pdf.close() |
353 | | - pix = None |
| 359 | + |
| 360 | + # associate the textpage with the original page |
354 | 361 | tpage.parent = weakref.proxy(page) |
355 | 362 | return tpage |
356 | 363 |
|
| 364 | + def partial_ocr(page, dpi, language, flags): |
| 365 | + """Perform OCR for parts of the page without legible text. |
| 366 | +
|
| 367 | + We create a temporary PDF for which we can freely redact text. |
| 368 | + """ |
| 369 | + doc = page.parent |
| 370 | + |
| 371 | + # make temporary PDF with the passed-in page |
| 372 | + temp_pdf = pymupdf.open() |
| 373 | + temp_pdf.insert_pdf(doc, from_page=page.number, to_page=page.number) |
| 374 | + temp_page = temp_pdf.load_page(0) |
| 375 | + temp_page.remove_rotation() # avoid OCR problems with rotated pages |
| 376 | + |
| 377 | + # extract text bboxes from the page |
| 378 | + tp = temp_page.get_textpage(flags=flags) |
| 379 | + blocks = tp.extractDICT()["blocks"] |
| 380 | + |
| 381 | + """ |
| 382 | + For partial OCR we need a TextPage that contains legible text only. |
| 383 | + Illegible text must be passed to the OCR engine. |
| 384 | + """ |
| 385 | + # Select spans with illegible text. If present, remove them first. |
| 386 | + fffd_spans = [ |
| 387 | + s["bbox"] |
| 388 | + for b in blocks |
| 389 | + if b["type"] == 0 |
| 390 | + for l in b["lines"] |
| 391 | + for s in l["spans"] |
| 392 | + if chr(0xFFFD) in s["text"] |
| 393 | + ] |
| 394 | + if fffd_spans: |
| 395 | + for bbox in fffd_spans: |
| 396 | + temp_page.add_redact_annot(bbox) |
| 397 | + temp_page.apply_redactions( |
| 398 | + images=pymupdf.PDF_REDACT_IMAGE_NONE, # pylint: disable=no-member |
| 399 | + graphics=pymupdf.PDF_REDACT_LINE_ART_NONE, # pylint: disable=no-member |
| 400 | + text=pymupdf.PDF_REDACT_TEXT_REMOVE, # pylint: disable=no-member |
| 401 | + ) |
| 402 | + # Extract text again, now without the unreadable spans. |
| 403 | + tp = temp_page.get_textpage(flags=flags) |
| 404 | + blocks = tp.extractDICT()["blocks"] |
| 405 | + # We also need a fresh copy of the original page. |
| 406 | + temp_pdf.insert_pdf(doc, from_page=page.number, to_page=page.number) |
| 407 | + temp_page = temp_pdf.load_page(-1) |
| 408 | + temp_page.remove_rotation() # avoid OCR problems with rotated pages |
| 409 | + |
| 410 | + span_bboxes = [ |
| 411 | + s["bbox"] |
| 412 | + for b in blocks |
| 413 | + if b["type"] == 0 |
| 414 | + for l in b["lines"] |
| 415 | + for s in l["spans"] |
| 416 | + if not chr(0xFFFD) in s["text"] |
| 417 | + ] |
| 418 | + |
| 419 | + # Remove digital text by redacting the span bboxes. |
| 420 | + # Then OCR the remainder of the page. |
| 421 | + for bbox in span_bboxes: |
| 422 | + temp_page.add_redact_annot(bbox) |
| 423 | + |
| 424 | + # only remove text, no images, no vectors |
| 425 | + temp_page.apply_redactions( |
| 426 | + images=pymupdf.PDF_REDACT_IMAGE_NONE, # pylint: disable=no-member |
| 427 | + graphics=pymupdf.PDF_REDACT_LINE_ART_NONE, # pylint: disable=no-member |
| 428 | + text=pymupdf.PDF_REDACT_TEXT_REMOVE, # pylint: disable=no-member |
| 429 | + ) |
| 430 | + pix = temp_page.get_pixmap(dpi=dpi) |
| 431 | + # matrix = pymupdf.Rect(pix.irect).torect(page.rect) |
| 432 | + |
| 433 | + # OCR the redacted page |
| 434 | + ocr_pdf = pymupdf.open( |
| 435 | + stream=pix.pdfocr_tobytes( |
| 436 | + compress=False, |
| 437 | + language=language, |
| 438 | + tessdata=tessdata, |
| 439 | + ), |
| 440 | + ) |
| 441 | + ocr_page = ocr_pdf[0] |
| 442 | + |
| 443 | + # Extend the original textpage with OCR-ed text. |
| 444 | + ocr_page.extend_textpage(tp, flags=pymupdf.TEXT_ACCURATE_BBOXES) |
| 445 | + |
| 446 | + # associate the textpage with the original page |
| 447 | + tp.parent = weakref.proxy(page) |
| 448 | + return tp |
| 449 | + |
357 | 450 | # if OCR for the full page, OCR its pixmap @ desired dpi |
358 | 451 | if full: |
359 | 452 | return full_ocr(page, dpi, language, flags) |
360 | 453 |
|
361 | 454 | # For partial OCR, make a normal textpage, then extend it with text that |
362 | | - # is OCRed from each image. |
363 | | - # Because of this, we need the images flag bit set ON. |
364 | | - tpage = page.get_textpage(flags=flags) |
365 | | - for block in page.get_text("dict", flags=pymupdf.TEXT_PRESERVE_IMAGES)["blocks"]: |
366 | | - if block["type"] != 1: # only look at images |
367 | | - continue |
368 | | - bbox = pymupdf.Rect(block["bbox"]) |
369 | | - if bbox.width <= 3 or bbox.height <= 3: # ignore tiny stuff |
370 | | - continue |
371 | | - try: |
372 | | - pix = pymupdf.Pixmap(block["image"]) # get image pixmap |
373 | | - if pix.n - pix.alpha != 3: # we need to convert this to RGB! |
374 | | - pix = pymupdf.Pixmap(pymupdf.csRGB, pix) |
375 | | - if pix.alpha: # must remove alpha channel |
376 | | - pix = pymupdf.Pixmap(pix, 0) |
377 | | - imgdoc = pymupdf.Document( |
378 | | - "pdf", |
379 | | - pix.pdfocr_tobytes(language=language, tessdata=tessdata), |
380 | | - ) # pdf with OCRed page |
381 | | - imgpage = imgdoc.load_page(0) # read image as a page |
382 | | - pix = None |
383 | | - # compute matrix to transform coordinates back to that of 'page' |
384 | | - imgrect = imgpage.rect # page size of image PDF |
385 | | - shrink = pymupdf.Matrix(1 / imgrect.width, 1 / imgrect.height) |
386 | | - mat = shrink * block["transform"] |
387 | | - imgpage.extend_textpage(tpage, flags=0, matrix=mat) |
388 | | - imgdoc.close() |
389 | | - except (RuntimeError, mupdf.FzErrorBase): |
390 | | - if 0 and g_exceptions_verbose: |
391 | | - # Don't show exception info here because it can happen in |
392 | | - # normal operation (see test_3842b). |
393 | | - pymupdf.exception_info() |
394 | | - tpage = None |
395 | | - pymupdf.message("Falling back to full page OCR") |
396 | | - return full_ocr(page, dpi, language, flags) |
397 | | - |
398 | | - return tpage |
| 455 | + # is OCRed from the rest of the page. |
| 456 | + return partial_ocr(page, dpi, language, flags) |
399 | 457 |
|
400 | 458 |
|
401 | 459 | def get_text( |
|
0 commit comments