Add OpenCV for computer vision and image preprocessing. It provides better algorithms than Pillow for image smoothing, denoising and binarisation, and can later come in handy for dealing with skewed or rotated images.

raaj-akshar · raaj-akshar · commit 7d6bcceb86cd · 2025-06-02T01:51:14.000+05:30
diff --git a/Dockerfile b/Dockerfile
@@ -3,10 +3,12 @@ FROM python:3.11-bullseye
 # Install system dependencies
 # tesseract-ocr needed for pytesseract, to extract text from scanned images
 # poppler-utils needed for pdftotext
+# libgl1 needed for opencv
 RUN apt-get update && apt-get install -y \
     tesseract-ocr \
     libtesseract-dev \
     poppler-utils \
+    libgl1 \
     && rm -rf /var/lib/apt/lists/*
 
 WORKDIR /app
diff --git a/image_preprocessing.py b/image_preprocessing.py
@@ -13,13 +13,18 @@
 TODO:
 - DPI Normalization: to make the image crispier and easy to read
 - Contour detection
+- Border Removal
+- Erosion and Dilation
+- Edge detection
+- Rotation and Alignment
 
 Currently we are using Pillow, which is basic. We can move to opencv which has better denoising and binarization support. Also it supports contour detection and DPI normalization.
 """
 
 import os
 import logging
 from PIL import Image, ImageFilter
+import cv2 as cv
 
 logger = logging.getLogger(__name__)
 
@@ -38,3 +43,58 @@ def preprocess_image(file_path: str):
     except Exception as exc:
         logger.error(f"Exception {exc} ocurred during image preprocessing.")
         return file_path
+
+
+def preprocess_image_opencv(file_path: str, options: dict = None, source: str = "image"):
+    """
+    source could be one of 'image' or 'pdf'. We will apply fastNlMeansDenoising to PDF pages converted to images,
+    while applying bilateralFilter to camera images.
+
+    Currently performs:
+    - Color space conversion from BGR to Grayscale, to make the image easier to read
+    - Denoising, Smoothing and Blurring to remove specks/grains, using fastNlMeansDenoising or bilateralFilter.
+    - Binarisation, to have black text on a white background
+
+    TODO:
+    - Cropping the area of interest
+    - Rotation and Alignment: Using Canny, HoughLines.
+    """
+    default_options = {
+        "gray": True,
+        "denoise": True,
+        "binarize": True,
+    }
+    if options is None:
+        options = {}
+    default_options.update(options)
+    logger.info(f"file_path: {file_path}, options: {default_options}")
+    base, ext = os.path.splitext(file_path)
+    ext = ext.lstrip(".")
+    img = cv.imread(file_path)
+    if default_options['denoise'] is True and default_options['gray'] is False:
+        # Force grayscale, as denoising is done in grayscale
+        logger.info("Forcing grayscale, as denoising is done in grayscale")
+        default_options['gray'] = True
+    if default_options['gray'] is True:
+        img = cv.cvtColor(img, cv.COLOR_BGR2GRAY)
+    if default_options['denoise'] is True:
+        if source == "pdf":
+            logger.info("Applying fastNlMeansDenoising")
+            img = cv.fastNlMeansDenoising(img, h=30)
+        else:
+            logger.info("Applying bilateralFilter")
+            # It smooths the image without losing edges
+            img = cv.bilateralFilter(img, d=9, sigmaColor=75, sigmaSpace=75)
+    if default_options['binarize'] is True:
+        img = cv.adaptiveThreshold(
+            img,
+            maxValue=255,
+            adaptiveMethod=cv.ADAPTIVE_THRESH_GAUSSIAN_C,  # or MEAN_C
+            thresholdType=cv.THRESH_BINARY,
+            blockSize=11,  # size of the neighborhood (must be odd)
+            C=2            # constant subtracted from the mean
+        )
+    # Check if dilation and erosion needed
+    # Find the margins and crop
+    output_path = f"{base}-cv-processed.{ext}"
+    cv.imwrite(output_path, img)
diff --git a/requirements.txt b/requirements.txt
@@ -9,3 +9,4 @@ nltk                # natural language toolkit. Used for text analysis
 boto3               # Need to communicate with AWS Textract service
 rq                  # Redis-queue for task/worker setup
 Pillow              # To perform image preprocessing before OCR. Performs grayscale conversion, denoising and binarization
+opencv-contrib-python # To perform image preprocessing
diff --git a/tasks.py b/tasks.py
@@ -20,4 +20,5 @@ def report_success(job, connection, result, *args, **kwargs):
 def enqueue_extraction(extraction_function, file_path):
     connection = get_connection()
     q = Queue(connection=connection)
+    # Extraction should be performed on the raw image as well as the processed image.
     q.enqueue(extraction_function, file_path, on_success=report_success)