Skip to content

Commit 3d7eea5

Browse files
committed
Refactoring
1 parent 087db4f commit 3d7eea5

3 files changed

Lines changed: 37 additions & 32 deletions

File tree

main.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
from fastapi.middleware.cors import CORSMiddleware
1010

1111
from services import identify_file_type, merge_pdfs, save_file, extract_pdf_text, get_file_size, extract_image_text, extract_pdf_text_all
12-
from services import text_analysis
12+
from text_analysis import analyze
1313
from tasks import enqueue_extraction
1414
from textract import detect_text
1515
from db import get_value
@@ -88,7 +88,7 @@ def extract_text(attachment: UploadFile):
8888
is_success, content = extract_pdf_text(attachment.file)
8989
if is_success is False:
9090
raise HTTPException(status_code=400, detail=content)
91-
analysis_result = text_analysis(content)
91+
analysis_result = analyze(content)
9292
return {"content": content, "analysis_result": analysis_result}
9393

9494

services.py

Lines changed: 8 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
import os
22
import glob
33
import logging
4-
import nltk
54
from typing import List, BinaryIO
65

76
import magic
@@ -19,14 +18,10 @@
1918

2019
from fastapi import UploadFile
2120

22-
23-
logger = logging.getLogger(__name__)
21+
from image_preprocessing import preprocess_image
2422

2523

26-
# It's an idempotent operation
27-
nltk.download('stopwords')
28-
nltk.download('punkt_tab')
29-
nltk.download('words')
24+
logger = logging.getLogger(__name__)
3025

3126

3227
def identify_file_type(file_object_or_stream: BinaryIO) -> FileMagic:
@@ -161,28 +156,11 @@ def extract_image_text(file_path: str):
161156
A TesseractError would happen, and will be handled, if the file is non-image.
162157
"""
163158
try:
164-
text = pytesseract.image_to_string(file_path)
165-
return True, text
159+
# Raw image OCR
160+
raw_image_text = pytesseract.image_to_string(file_path)
161+
# Preprocessed image OCR
162+
processed_image_path = preprocess_image(file_path)
163+
pytesseract.image_to_string(processed_image_path)
164+
return True, raw_image_text
166165
except TesseractError:
167166
return False, "An invalid or corrupted image"
168-
169-
170-
def text_analysis(text: str):
171-
"""
172-
Performs analysis on text using nltk.
173-
Currently does the following:
174-
- Length of the text
175-
- Most common words
176-
- Unique words
177-
- Collocations
178-
"""
179-
words = nltk.word_tokenize(text)
180-
# Remove stopwords
181-
words = [word for word in words if word not in nltk.corpus.stopwords.words("english")]
182-
uniques = set(words)
183-
text = nltk.Text(words)
184-
freq_dist = nltk.FreqDist(text)
185-
length = len(text)
186-
most_common = freq_dist.most_common(10)
187-
collocations = text.collocations()
188-
return {"length": length, "most_common": most_common, "uniques": uniques, "collocations": collocations}

text_analysis.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
import nltk
2+
3+
# It's an idempotent operation
4+
nltk.download('stopwords')
5+
nltk.download('punkt_tab')
6+
nltk.download('words')
7+
8+
9+
def analyze(text: str):
10+
"""
11+
Performs analysis on text using nltk.
12+
Currently does the following:
13+
- Length of the text
14+
- Most common words
15+
- Unique words
16+
- Collocations
17+
"""
18+
words = nltk.word_tokenize(text)
19+
# Remove stopwords
20+
words = [word for word in words if word not in nltk.corpus.stopwords.words("english")]
21+
uniques = set(words)
22+
text = nltk.Text(words)
23+
freq_dist = nltk.FreqDist(text)
24+
length = len(text)
25+
most_common = freq_dist.most_common(10)
26+
collocations = text.collocations()
27+
return {"length": length, "most_common": most_common, "uniques": uniques, "collocations": collocations}

0 commit comments

Comments
 (0)