Skip to content

Commit 4f30826

Browse files
committed
Added ability to infer structured data from PAN and Passport using heuristics and rule based extraction.
1 parent 4d06197 commit 4f30826

1 file changed

Lines changed: 173 additions & 19 deletions

File tree

text_analysis.py

Lines changed: 173 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -11,13 +11,19 @@
1111
TODO:
1212
- Summarization
1313
"""
14+
import re
1415
import nltk
16+
import string
17+
import logging
18+
import Levenshtein
1519

1620
# It's an idempotent operation
1721
nltk.download('stopwords')
1822
nltk.download('punkt_tab')
1923
nltk.download('words')
2024

25+
logger = logging.getLogger(__name__)
26+
2127

2228
def analyze(text: str):
2329
"""
@@ -71,26 +77,11 @@ def classify(text: str):
7177
It currently does a simple string matching. We will move to NLP NaiveBayes Classification soon.
7278
And gradually to more advanced classification models.
7379
"""
74-
lowered_text = text.lower()
75-
passport_needed_words = ['india', 'indian', 'surname', 'nationality', 'given', 'name', 'passport', 'date of birth', 'place of birth', 'place of issue', 'date of issue', 'passport no']
76-
passport_found_words = 0
77-
for passport_word in passport_needed_words:
78-
if passport_word in lowered_text:
79-
passport_found_words += 1
80-
if passport_found_words >= 6:
81-
# Also ensure Passport number REGEX is found
80+
if classify_passport(text):
8281
return "passport"
83-
pan_needed_words = ['income', 'tax', 'department', 'govt', 'india']
84-
pan_found_words = 0
85-
for pan_word in pan_needed_words:
86-
# Enhance it to make it lenient. For example, 'indome' could be found instead of 'income'
87-
# OCR makes such kind of mistakes and hence accommodation for such must be made.
88-
if pan_word in lowered_text:
89-
pan_found_words += 1
90-
if pan_found_words >= 3:
91-
# TODO: Also make sure that a Regex of form 'AZMPR1111L' is found.
92-
# This text is dark and bold and OCR would have definitely picked it up.
82+
if classify_pan(text):
9383
return "pan"
84+
lowered_text = text.lower()
9485
# In Aadhaar Card, Government of India is shaded, hence binarization causes it not to be read properly.
9586
aadhaar_needed_words = ['issue', 'date']
9687
aadhaar_found_words = 0
@@ -102,4 +93,167 @@ def classify(text: str):
10293
# aadhaar has a regex of form 1234 1234 1234
10394
# This finding is a must because it is dark and bold and OCR would have definitely picked it up.
10495
return "aadhaar"
105-
return None
96+
return None
97+
98+
99+
def fuzzy_substring_match(text, phrase, max_distance=2):
    """
    Look for an approximate occurrence of ``phrase`` inside ``text``.

    Both strings are lower-cased. Every window of ``text`` that has the same
    length as ``phrase`` is then compared with ``phrase`` using
    ``Levenshtein.distance``, scanning left to right, and the scan stops at the
    first window whose distance is at most ``max_distance``.

    Args:
        text: The text to search in (for example OCR output).
        phrase: The phrase to look for.
        max_distance: Largest edit distance still accepted as a match.

    Returns:
        A tuple ``(found, window, distance)``. On success ``window`` is the
        lower-cased slice of ``text`` that matched and ``distance`` its edit
        distance to the lower-cased phrase. When nothing matches (including
        when ``text`` is shorter than ``phrase``) it is ``(False, None, None)``.
    """
    text = text.lower()
    phrase = phrase.lower()
    # Measure the phrase only after lower-casing: str.lower() can change the
    # length of a string for some Unicode characters, and the window must be
    # as long as the string it is actually compared against.
    phrase_len = len(phrase)

    for start in range(len(text) - phrase_len + 1):
        window = text[start:start + phrase_len]
        # Per-window tracing is kept at DEBUG with lazy %-formatting so that
        # the message is only built when debug logging is enabled.
        logger.debug("Comparing %s with %s", window, phrase)
        distance = Levenshtein.distance(window, phrase)
        if distance <= max_distance:
            return True, window, distance

    return False, None, None
112+
113+
114+
def classify_passport(text: str):
    """
    Does this look like a passport?

    Fuzzy-searches ``text`` for a fixed list of phrases using
    ``fuzzy_substring_match`` with its default tolerance, and logs each phrase
    that is found.

    Args:
        text: The text to classify (for example OCR output).

    Returns:
        True when at least 60% of the expected phrases are found
        (5 of the 7 phrases below), otherwise False.
    """
    text = text.lower()
    expected_phrases = (
        "republic of india",
        "nationality",
        "passport no",
        "date of birth",
        "place of birth",
        "given name",
        "surname",
    )

    found_count = 0
    for phrase in expected_phrases:
        match_found, match_str, distance = fuzzy_substring_match(text, phrase)
        if match_found:
            logger.info("Matched %s with %s with distance %s", match_str, phrase, distance)
            found_count += 1

    # Require 60% of the expected phrases to be present.
    return found_count / len(expected_phrases) >= 0.6
167+
168+
169+
def classify_pan(text: str):
    """
    Does this look like a PAN card?

    Fuzzy-searches ``text`` for a fixed list of phrases using
    ``fuzzy_substring_match``, each with its own edit-distance tolerance, and
    logs each phrase that is found.

    Args:
        text: The text to classify (for example OCR output).

    Returns:
        True when at least 60% of the expected phrases are found
        (2 of the 3 phrases below), otherwise False.
    """
    text = text.lower()
    # (phrase, max edit distance accepted for that phrase)
    expected_phrases = (
        ('income tax department', 5),
        ('govt of india', 4),
        ('permanent account number', 4),
    )

    found_count = 0
    for phrase, max_distance in expected_phrases:
        match_found, match_str, distance = fuzzy_substring_match(text, phrase, max_distance=max_distance)
        if match_found:
            logger.info("Matched %s with %s with distance %s", match_str, phrase, distance)
            found_count += 1

    # Require 60% of the expected phrases to be present.
    return found_count / len(expected_phrases) >= 0.6
195+
196+
197+
def analyze_passport(text: str):
    """
    Extract structured data from passport text.

    Currently only the passport number is extracted: the first token in
    ``text`` that is one upper-case ASCII letter followed by exactly 7 digits,
    with a word boundary on both sides.

    Args:
        text: The passport text (for example OCR output).

    Returns:
        A dict with the key ``'passport_number'`` when such a token is found,
        otherwise an empty dict.
    """
    # Only the first occurrence is needed, so stop scanning at the first hit
    # instead of collecting every match.
    match = re.search(r'\b[A-Z]\d{7}\b', text)
    data = {}
    if match is not None:
        data['passport_number'] = match.group(0)
    return data
209+
210+
211+
def analyze_pan(text: str):
    """
    Extract structured data from PAN card text using positional heuristics.

    Blank lines are dropped first. The PAN number is the first token made of
    5 upper-case letters, 4 digits and 1 upper-case letter. The remaining
    fields are taken by position: the three lines following the line on which
    "india" is fuzzily matched are read as name, father's name and date of
    birth, in that order.

    Args:
        text: The PAN card text (for example OCR output).

    Returns:
        A dict containing whichever of the keys ``'PAN No.'``, ``'Name'``,
        ``"Father's Name"`` and ``'Date of Birth'`` could be extracted.
    """
    # Remove blank lines
    non_blank_lines = [line for line in text.splitlines() if line.strip()]
    text = '\n'.join(non_blank_lines)
    lowered_text = text.lower()

    data = {}

    # Word boundary on both sides.
    # 5 upper case letters followed by exactly 4 digits, followed by a letter
    pan_match = re.search(r'\b[A-Z]{5}\d{4}[A-Z]{1}\b', text)
    if pan_match is not None:
        data['PAN No.'] = pan_match.group(0)

    # Find where "India" occurs
    match_found, match_str, _ = fuzzy_substring_match(lowered_text, "india")
    if not match_found:
        return data

    # NOTE(review): the index is computed on the lower-cased text and then used
    # on the original text; this assumes lower-casing did not change the
    # length of the text.
    index = lowered_text.index(match_str)
    # Find first new line after this index
    new_line_index = text.find('\n', index)
    if new_line_index == -1:
        # Nothing follows the "India" line. Without this guard the slice below
        # would start at 0 and the first line of the card would be reported
        # as the person's name.
        return data

    # Name of person is on the line after this new line
    following_lines = text[new_line_index + 1:].split('\n')
    strip_punctuation = str.maketrans('', '', string.punctuation)

    data['Name'] = following_lines[0].translate(strip_punctuation).strip()
    if len(following_lines) > 1:
        # Father name is just after name, on the next line
        data["Father's Name"] = following_lines[1].translate(strip_punctuation).strip()
    if len(following_lines) > 2:
        # Punctuation is kept here because dates contain separators.
        data['Date of Birth'] = following_lines[2].strip()
    return data

0 commit comments

Comments
 (0)