Skip to content

Commit 7d6bcce

Browse files
committed
Add opencv for Computer Vision and performing image preprocessing. Provides better algorithms than Pillow for doing image smoothing, denoising and binarisation. Can later come in handy to deal with skewed or rotated images.
1 parent bc5adcf commit 7d6bcce

4 files changed

Lines changed: 64 additions & 0 deletions

File tree

Dockerfile

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,12 @@ FROM python:3.11-bullseye
33
# Install system dependencies
44
# tesseract-ocr needed for pytesseract, to extract text from scanned images
55
# poppler-utils needed for pdftotext
6+
# libgl1 needed for opencv
67
RUN apt-get update && apt-get install -y \
78
tesseract-ocr \
89
libtesseract-dev \
910
poppler-utils \
11+
libgl1 \
1012
&& rm -rf /var/lib/apt/lists/*
1113

1214
WORKDIR /app

image_preprocessing.py

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,13 +13,18 @@
1313
TODO:
1414
- DPI Normalization: to make the image crisper and easier to read
1515
- Contour detection
16+
- Border Removal
17+
- Erosion and Dilation
18+
- Edge detection
19+
- Rotation and Alignment
1620
1721
Currently we are using Pillow, which is basic. We can move to opencv which has better denoising and binarization support. Also it supports contour detection and DPI normalization.
1822
"""
1923

2024
import os
2125
import logging
2226
from PIL import Image, ImageFilter
27+
import cv2 as cv
2328

2429
logger = logging.getLogger(__name__)
2530

@@ -38,3 +43,58 @@ def preprocess_image(file_path: str):
3843
except Exception as exc:
3944
logger.error(f"Exception {exc} ocurred during image preprocessing.")
4045
return file_path
46+
47+
48+
def preprocess_image_opencv(file_path: str, options: dict = None, source: str = "image"):
    """
    Preprocess an image with OpenCV and write the result next to the input file.

    source could be one of 'image' or 'pdf'. fastNlMeansDenoising is applied to PDF pages
    converted to images, while bilateralFilter is applied to any other source (camera images).

    Currently performs:
    - Color space conversion from BGR (the channel order cv.imread produces) to Grayscale
    - Denoising, Smoothing and Blurring to remove specks/grains
    - Binarisation (adaptive Gaussian threshold), to have black text on a white background

    Args:
        file_path: Path of the image to process.
        options: Optional overrides for the steps to run. Recognised keys are
            "gray", "denoise" and "binarize"; each defaults to True.
        source: "pdf" selects fastNlMeansDenoising, anything else bilateralFilter.

    Returns:
        The path of the processed image ("<base>-cv-processed.<ext>"), or
        file_path unchanged when the image could not be read or written.

    TODO:
    - Cropping the area of interest
    - Rotation and Alignment: Using Canny, HoughLines.
    """
    settings = {
        "gray": True,
        "denoise": True,
        "binarize": True,
    }
    settings.update(options or {})
    logger.info("file_path: %s, options: %s", file_path, settings)

    # cv.imread signals failure by returning None instead of raising.
    img = cv.imread(file_path)
    if img is None:
        logger.error("Could not read image %s, skipping OpenCV preprocessing.", file_path)
        return file_path

    # adaptiveThreshold only accepts a single-channel 8-bit image, and denoising
    # is done in grayscale, so either step requires the grayscale conversion.
    needs_gray = settings["denoise"] or settings["binarize"]
    if needs_gray and not settings["gray"]:
        logger.info("Forcing grayscale, as denoising and binarisation need a single-channel image")
        settings["gray"] = True

    if settings["gray"]:
        img = cv.cvtColor(img, cv.COLOR_BGR2GRAY)

    if settings["denoise"]:
        if source == "pdf":
            logger.info("Applying fastNlMeansDenoising")
            img = cv.fastNlMeansDenoising(img, h=30)
        else:
            logger.info("Applying bilateralFilter")
            # It smoothes the image without losing edges
            img = cv.bilateralFilter(img, d=9, sigmaColor=75, sigmaSpace=75)

    if settings["binarize"]:
        img = cv.adaptiveThreshold(
            img,
            maxValue=255,
            adaptiveMethod=cv.ADAPTIVE_THRESH_GAUSSIAN_C,  # or MEAN_C
            thresholdType=cv.THRESH_BINARY,
            blockSize=11,  # size of the neighborhood (must be odd)
            C=2,  # constant subtracted from the mean
        )

    # TODO: Check if dilation and erosion needed
    # TODO: Find the margins and crop

    base, ext = os.path.splitext(file_path)
    # cv.imwrite picks the encoder from the extension, so an extension-less
    # input needs a concrete format to be written at all.
    ext = ext.lstrip(".") or "png"
    output_path = f"{base}-cv-processed.{ext}"
    if not cv.imwrite(output_path, img):
        logger.error("Could not write processed image to %s.", output_path)
        return file_path
    return output_path

requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,3 +9,4 @@ nltk # natural language toolkit. Used for text analysis
99
boto3 # Need to communicate with AWS Textract service
1010
rq # Redis-queue for task/worker setup
1111
Pillow # To perform image preprocessing before OCR. Performs grayscale conversion, denoising and binarization
12+
opencv-contrib-python # To perform image preprocessing

tasks.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,4 +20,5 @@ def report_success(job, connection, result, *args, **kwargs):
2020
def enqueue_extraction(extraction_function, file_path):
    """
    Schedule extraction_function(file_path) as a background job.

    The job is placed on a Queue bound to the connection returned by
    get_connection(), with report_success registered as its on-success callback.
    """
    queue = Queue(connection=get_connection())
    # Extraction should be performed on the raw image as well as processed image.
    queue.enqueue(extraction_function, file_path, on_success=report_success)

0 commit comments

Comments
 (0)