33import logging
44from typing import List , BinaryIO
55
6+ # File mime-type detection
67import magic
78from magic .compat import FileMagic
89
10+ # PDF manipulation
911from pikepdf import Pdf
1012
13+ # PDF text extraction
1114from pdfminer .high_level import extract_text
1215from pdfminer .pdfparser import PDFSyntaxError
1316
17+ # Image text extraction
18+ # OCR only works on images; it cannot be run directly on a PDF
1419import pytesseract
1520from pytesseract .pytesseract import TesseractError
1621
22+ # Convert non-searchable PDFs to images before performing OCR
1723from pdf2image import convert_from_path
1824
1925from fastapi import UploadFile
2026
27+ # Perform image preprocessing to improve image quality, crispness and readability
2128from image_preprocessing import preprocess_image
2229
30+ from text_analysis import is_meaningful_content
31+
2332
2433logger = logging .getLogger (__name__ )
2534
@@ -90,7 +99,7 @@ def save_file(file: BinaryIO, path: str):
9099 logger .info (f"Saved file to { path } " )
91100
92101
93- def extract_pdf_text (file : BinaryIO ):
102+ def extract_pdf_text_searchable (file : BinaryIO ):
94103 """
95104 :param: A file like object, opened in binary mode.
96105 Extracts text from a PDF containing embedded text using pdfminer.six library.
@@ -105,31 +114,20 @@ def extract_pdf_text(file: BinaryIO):
105114 return False , "An invalid or corrupted PDF"
106115
107116
108- def extract_pdf_text_all (file_path : str ):
117+ def extract_pdf_text_non_searchable (file_path : str ):
109118 """
110- Attempts extraction for both searchable and non-searchable PDFs .
111-
112- 1. For searchable_pdfs, delegate to extract_pdf_text which uses pdfminer.six
113- 2. For non-searchable PDFs, convert to an image and then extract text
119+ :param: A PDF file path .
120+ Extracts text from non searchable PDFs i.e scanned PDFs that don't have embedded text.
121+ Converts a PDF to an image and then extracts text from it. Delegates to extract_image_text which
122+ performs OCR using Pytesseract.
114123 """
115- f = open (file_path , "rb" )
116- is_success , content = extract_pdf_text (f )
117- f .close ()
118- if is_success is False :
119- # It's not even a PDF probably
120- return False , content
121- if len (content ) > 10 :
122- return True , content
123- # Probably it's a non-searchable PDF, that's why we were able to get less than 10 characters.
124- # Convert it to an image first
125124 output_folder = "/media/pdf-to-image" # Directory name -> /media/pdf-to-image
126125 basename = os .path .basename (file_path ) # File name -> sample.pdf
127126 if '.pdf' in basename :
128127 basename = basename .replace ('.pdf' , '' )
129128 convert_from_path (file_path , output_folder = output_folder , fmt = "png" , output_file = basename )
130129 # The converted images have been saved now.
131130 converted_images_paths = sorted (glob .glob (f"{ output_folder } /{ basename } *.png" ))
132- # We will extend it for all images later.
133131 is_successes = []
134132 contents = []
135133 for converted_image_path in converted_images_paths :
@@ -143,6 +141,25 @@ def extract_pdf_text_all(file_path: str):
143141 return any (is_successes ), "\n " .join (contents )
144142
145143
def extract_pdf_text_all(file_path: str):
    """
    :param: A PDF file path.
    Attempts extraction for both searchable and non-searchable PDFs.

    1. For searchable PDFs, delegate to extract_pdf_text_searchable which uses pdfminer.six
    2. If the embedded text is not judged meaningful by is_meaningful_content, delegate to
       extract_pdf_text_non_searchable, which converts the PDF to images and extracts text from them.

    :return: A tuple (is_success, content). When the searchable extraction fails, its
    (False, message) result is returned unchanged and the image-based fallback is not attempted.
    """
    # Use a context manager so the handle is closed even if the extraction raises.
    with open(file_path, "rb") as f:
        is_success, content = extract_pdf_text_searchable(f)
    if is_success is False:
        # It's not even a PDF probably
        return False, content
    if is_meaningful_content(content):
        return True, content
    # Embedded text was missing or not meaningful: fall back to the image-based extraction.
    is_success, content = extract_pdf_text_non_searchable(file_path)
    return is_success, content
161+
162+
146163def get_file_size (file ):
147164 file .seek (0 , 2 ) # Move to the end of file
148165 size = file .tell ()
@@ -156,11 +173,7 @@ def extract_image_text(file_path: str):
156173 A TesseractError would happen, and will be handled, if the file is non-image.
157174 """
158175 try :
159- # Raw image OCR
160- raw_image_text = pytesseract .image_to_string (file_path )
161- # Preprocessed image OCR
162- processed_image_path = preprocess_image (file_path )
163- pytesseract .image_to_string (processed_image_path )
164- return True , raw_image_text
176+ text = pytesseract .image_to_string (file_path )
177+ return True , text
165178 except TesseractError :
166179 return False , "An invalid or corrupted image"
0 commit comments