|
1 | 1 | import os |
2 | 2 | import glob |
3 | 3 | import logging |
4 | | -import nltk |
5 | 4 | from typing import List, BinaryIO |
6 | 5 |
|
7 | 6 | import magic |
|
19 | 18 |
|
20 | 19 | from fastapi import UploadFile |
21 | 20 |
|
22 | | - |
23 | | -logger = logging.getLogger(__name__) |
| 21 | +from image_preprocessing import preprocess_image |
24 | 22 |
|
25 | 23 |
|
26 | | -# It's an idempotent operatation |
27 | | -nltk.download('stopwords') |
28 | | -nltk.download('punkt_tab') |
29 | | -nltk.download('words') |
| 24 | +logger = logging.getLogger(__name__) |
30 | 25 |
|
31 | 26 |
|
32 | 27 | def identify_file_type(file_object_or_stream: BinaryIO) -> FileMagic: |
@@ -161,28 +156,11 @@ def extract_image_text(file_path: str): |
161 | 156 | A TesseractError would happen, and will be handled, if the file is non-image. |
162 | 157 | """ |
163 | 158 | try: |
164 | | - text = pytesseract.image_to_string(file_path) |
165 | | - return True, text |
| 159 | + # Raw image OCR |
| 160 | + raw_image_text = pytesseract.image_to_string(file_path) |
| 161 | + # Preprocessed image OCR |
| 162 | + processed_image_path = preprocess_image(file_path) |
| 163 | + pytesseract.image_to_string(processed_image_path) |
| 164 | + return True, raw_image_text |
166 | 165 | except TesseractError: |
167 | 166 | return False, "An invalid or corrupted image" |
168 | | - |
169 | | - |
170 | | -def text_analysis(text: str): |
171 | | - """ |
172 | | - Performs analysis on text using nltk. |
173 | | - Currently does the following: |
174 | | - - Length of the text |
175 | | - - Most common words |
176 | | - - Unique words |
177 | | - - Collocations |
178 | | - """ |
179 | | - words = nltk.word_tokenize(text) |
180 | | - # Remove stopwords |
181 | | - words = [word for word in words if word not in nltk.corpus.stopwords.words("english")] |
182 | | - uniques = set(words) |
183 | | - text = nltk.Text(words) |
184 | | - freq_dist = nltk.FreqDist(text) |
185 | | - length = len(text) |
186 | | - most_common = freq_dist.most_common(10) |
187 | | - collocations = text.collocations() |
188 | | - return {"length": length, "most_common": most_common, "uniques": uniques, "collocations": collocations} |
0 commit comments