1111TODO:
1212- Summarization
1313"""
14+ import re
1415import nltk
16+ import string
17+ import logging
18+ import Levenshtein
1519
# Fetching an NLTK resource is an idempotent operation, so it is safe to
# request the same corpora on every import of this module.
for resource in ('stopwords', 'punkt_tab', 'words'):
    nltk.download(resource)
2024
25+ logger = logging .getLogger (__name__ )
26+
2127
2228def analyze (text : str ):
2329 """
@@ -71,26 +77,11 @@ def classify(text: str):
7177 It currently does a simple string matching. We will move to NLP NaiveBayes Classification soon.
7278 And gradually to more advanced classification models.
7379 """
74- lowered_text = text .lower ()
75- passport_needed_words = ['india' , 'indian' , 'surname' , 'nationality' , 'given' , 'name' , 'passport' , 'date of birth' , 'place of birth' , 'place of issue' , 'date of issue' , 'passport no' ]
76- passport_found_words = 0
77- for passport_word in passport_needed_words :
78- if passport_word in lowered_text :
79- passport_found_words += 1
80- if passport_found_words >= 6 :
81- # Also ensure Passport number REGEX is found
80+ if classify_passport (text ):
8281 return "passport"
83- pan_needed_words = ['income' , 'tax' , 'department' , 'govt' , 'india' ]
84- pan_found_words = 0
85- for pan_word in pan_needed_words :
86- # Enhance it to make it lenient. For example, 'indome' could be found instead of 'income'
87- # OCR makes such kind of mistakes and hence accommodation for such must be made.
88- if pan_word in lowered_text :
89- pan_found_words += 1
90- if pan_found_words >= 3 :
91- # TODO: Also make sure that a Regex of form 'AZMPR1111L' is found.
92- # This text is dark and bold and OCR would have definitely picked it up.
82+ if classify_pan (text ):
9383 return "pan"
84+ lowered_text = text .lower ()
9485 # In Aadhaar Card, Government of India is shaded, hence binarization causes it not to be read properly.
9586 aadhaar_needed_words = ['issue' , 'date' ]
9687 aadhaar_found_words = 0
@@ -102,4 +93,167 @@ def classify(text: str):
10293 # aadhaar has a regex of form 1234 1234 1234
10394 # This finding is a must because it is dark and bold and OCR would have definitely picked it up.
10495 return "aadhaar"
105- return None
96+ return None
97+
98+
def fuzzy_substring_match(text, phrase, max_distance=2):
    """
    Look for an approximate occurrence of ``phrase`` inside ``text``.

    A window exactly as long as the phrase is slid over the text one character
    at a time; the first window whose Levenshtein distance to the phrase is at
    most ``max_distance`` wins. Both inputs are lower-cased first, so the
    comparison is case-insensitive and the returned window is lower-case.

    Because only phrase-length windows are examined, a text shorter than the
    phrase never matches.

    :param text: Text to search (typically OCR output).
    :param phrase: Phrase to look for.
    :param max_distance: Largest edit distance still counted as a match.
    :return: ``(True, window, distance)`` for the first matching window,
        otherwise ``(False, None, None)``.
    """
    text = text.lower()
    phrase = phrase.lower()
    # Measure the phrase only after lower-casing: lower() can change the
    # length of some Unicode strings, and the window has to be as long as the
    # string it is actually compared against.
    phrase_len = len(phrase)

    for start in range(len(text) - phrase_len + 1):
        window = text[start:start + phrase_len]
        # One record per window is tracing output, so it is emitted at DEBUG
        # with lazy %-formatting; the string is only built when DEBUG is on.
        logger.debug("Comparing %s with %s", window, phrase)
        distance = Levenshtein.distance(window, phrase)
        if distance <= max_distance:
            return True, window, distance

    return False, None, None
112+
113+
def classify_passport(text: str):
    """
    Does this look like a passport?

    Each phrase that is printed on a passport is searched for with
    ``fuzzy_substring_match`` (default tolerance), so small OCR mistakes are
    forgiven. The text is classified as a passport when at least 60% of the
    phrases are found, i.e. 5 of the 7 phrases listed below.

    :param text: OCR text of the document.
    :return: ``True`` if enough passport phrases were found, else ``False``.
    """
    text = text.lower()
    expected_phrases = (
        "republic of india",
        "nationality",
        "passport no",
        "date of birth",
        "place of birth",
        "given name",
        "surname",
    )

    found_count = 0
    # Every phrase is checked (no early exit) so that the log always shows
    # the full picture of what was and was not recognised.
    for phrase in expected_phrases:
        match_found, match_str, distance = fuzzy_substring_match(text, phrase)
        if match_found:
            logger.info("Matched %s with %s with distance %s", match_str, phrase, distance)
            found_count += 1

    # Consider 60% of the text as a threshold.
    return found_count / len(expected_phrases) >= 0.6
167+
168+
def classify_pan(text: str):
    """
    Does this look like a PAN card?

    Three phrases are searched for with ``fuzzy_substring_match``, each with
    its own edit-distance tolerance. The text is classified as a PAN card when
    at least 60% of them are found, i.e. 2 of the 3 phrases listed below.

    :param text: OCR text of the document.
    :return: ``True`` if enough PAN phrases were found, else ``False``.
    """
    text = text.lower()
    # (phrase, max edit distance tolerated for that phrase)
    expected_phrases = (
        ('income tax department', 5),
        ('govt of india', 4),
        ('permanent account number', 4),
    )

    found_count = 0
    # Every phrase is checked (no early exit) so that the log always shows
    # the full picture of what was and was not recognised.
    for phrase, tolerance in expected_phrases:
        match_found, match_str, distance = fuzzy_substring_match(text, phrase, max_distance=tolerance)
        if match_found:
            logger.info("Matched %s with %s with distance %s", match_str, phrase, distance)
            found_count += 1

    # Consider 60% of the text as a threshold.
    return found_count / len(expected_phrases) >= 0.6
195+
196+
def analyze_passport(text: str):
    """
    Extract the passport number from passport text.

    The number is the first token made of one upper-case letter followed by
    exactly seven digits, delimited by word boundaries on both sides.

    :param text: OCR text of the passport.
    :return: ``{'passport_number': <number>}`` when a number is found,
        otherwise an empty dict.
    """
    found = re.search(r'\b[A-Z]\d{7}\b', text)
    if found is None:
        return {}
    return {'passport_number': found.group()}
209+
210+
def analyze_pan(text: str):
    """
    Extract the PAN number, name, father's name and date of birth from
    PAN-card text.

    Blank lines are dropped first. The PAN number is the first token of five
    upper-case letters, four digits and one upper-case letter. The remaining
    fields are located by position: the line containing a fuzzy match of
    "india" is used as an anchor, and the next three non-blank lines are taken
    as name, father's name and date of birth, in that order.

    :param text: OCR text of the PAN card.
    :return: Dict with any of the keys ``'PAN No.'``, ``'Name'``,
        ``"Father's Name"`` and ``'Date of Birth'``; a key is omitted when the
        corresponding value could not be located.
    """
    # Remove blank lines
    non_blank_lines = [line for line in text.splitlines() if line.strip()]
    text = '\n'.join(non_blank_lines)
    lowered_text = text.lower()

    data = {}

    # Word boundary on both sides.
    # 5 upper case letters followed by exactly 4 digits, followed by a letter
    pan_match = re.search(r'\b[A-Z]{5}\d{4}[A-Z]{1}\b', text)
    if pan_match is not None:
        data['PAN No.'] = pan_match.group()

    # Find where "India" occurs; the personal details follow that line.
    match_found, match_str, _ = fuzzy_substring_match(lowered_text, "india")
    if not match_found:
        return data

    # Work in whole lines instead of character offsets. Counting the newlines
    # before the match gives the anchor line, and lower() never adds or
    # removes a newline, so the line number is valid for the original text.
    anchor_line = lowered_text.count('\n', 0, lowered_text.index(match_str))
    following = non_blank_lines[anchor_line + 1:]
    if not following:
        # "India" is on the last line: nothing follows it, so no name,
        # father's name or date of birth can be read.
        return data

    strip_punctuation = str.maketrans('', '', string.punctuation)
    # Name of the person is on the line right after the anchor.
    data['Name'] = following[0].translate(strip_punctuation).strip()
    if len(following) > 1:
        # Father name is just after name, on the next line
        data["Father's Name"] = following[1].translate(strip_punctuation).strip()
    if len(following) > 2:
        # The date of birth line is kept exactly as read.
        data['Date of Birth'] = following[2]
    return data
0 commit comments