Refactoring

raaj-akshar · raaj-akshar · commit 3d7eea5d3c1c · 2025-05-31T13:58:42.000+05:30
diff --git a/main.py b/main.py
@@ -9,7 +9,7 @@
 from fastapi.middleware.cors import CORSMiddleware
 
 from services import identify_file_type, merge_pdfs, save_file, extract_pdf_text, get_file_size, extract_image_text, extract_pdf_text_all
-from services import text_analysis
+from text_analysis import analyze
 from tasks import enqueue_extraction
 from textract import detect_text
 from db import get_value
@@ -88,7 +88,7 @@ def extract_text(attachment: UploadFile):
     is_success, content = extract_pdf_text(attachment.file)
     if is_success is False:
         raise HTTPException(status_code=400, detail=content)
-    analysis_result = text_analysis(content)
+    analysis_result = analyze(content)
     return {"content": content, "analysis_result": analysis_result}
 
 
diff --git a/services.py b/services.py
@@ -1,7 +1,6 @@
 import os
 import glob
 import logging
-import nltk
 from typing import List, BinaryIO
 
 import magic
@@ -19,14 +18,10 @@
 
 from fastapi import UploadFile
 
-
-logger = logging.getLogger(__name__)
+from image_preprocessing import preprocess_image
 
 
-# It's an idempotent operatation
-nltk.download('stopwords')
-nltk.download('punkt_tab')
-nltk.download('words')
+logger = logging.getLogger(__name__)
 
 
 def identify_file_type(file_object_or_stream: BinaryIO) -> FileMagic:
@@ -161,28 +156,11 @@ def extract_image_text(file_path: str):
     A TesseractError would happen, and will be handled, if the file is non-image.
     """
     try:
-        text = pytesseract.image_to_string(file_path)
-        return True, text
+        # Raw image OCR
+        raw_image_text = pytesseract.image_to_string(file_path)
+        # Preprocessed image OCR
+        processed_image_path = preprocess_image(file_path)
+        pytesseract.image_to_string(processed_image_path)
+        return True, raw_image_text
     except TesseractError:
         return False, "An invalid or corrupted image"
-
-
-def text_analysis(text: str):
-    """
-    Performs analysis on text using nltk.
-    Currently does the following:
-    - Length of the text
-    - Most common words
-    - Unique words
-    - Collocations
-    """
-    words = nltk.word_tokenize(text)
-    # Remove stopwords
-    words = [word for word in words if word not in nltk.corpus.stopwords.words("english")]
-    uniques = set(words)
-    text = nltk.Text(words)
-    freq_dist = nltk.FreqDist(text)
-    length = len(text)
-    most_common = freq_dist.most_common(10)
-    collocations = text.collocations()
-    return {"length": length, "most_common": most_common, "uniques": uniques, "collocations": collocations}
diff --git a/text_analysis.py b/text_analysis.py
@@ -0,0 +1,27 @@
+import nltk
+
+# Fetch the NLTK resources at import time. It's an idempotent operation.
+_NLTK_RESOURCES = ('stopwords', 'punkt_tab', 'words')
+for _resource in _NLTK_RESOURCES:
+    nltk.download(_resource)
+
+
+def analyze(text: str):
+    """
+    Performs analysis on text using nltk and returns a dict with:
+    - length: number of tokens left after stopword removal
+    - most_common: the 10 most frequent tokens with their counts
+    - uniques: set of distinct tokens
+    - collocations: list of collocation word pairs
+    """
+    # Build the stopword set once instead of calling words() for every token.
+    stop_words = set(nltk.corpus.stopwords.words("english"))
+    words = [word for word in nltk.word_tokenize(text) if word not in stop_words]
+    tokens = nltk.Text(words)
+    return {
+        "length": len(tokens),
+        "most_common": nltk.FreqDist(tokens).most_common(10),
+        "uniques": set(words),
+        # Text.collocations() only prints and returns None; collocation_list() returns the pairs.
+        "collocations": tokens.collocation_list(),
+    }