Added ability to infer structured data from PAN and Passport using heuristics and rule based extraction.

raaj-akshar · raaj-akshar · commit 4f3082601075 · 2025-06-03T23:11:59.000+05:30
diff --git a/text_analysis.py b/text_analysis.py
@@ -11,13 +11,19 @@
 TODO:
 - Summarization
 """
+import re
 import nltk
+import string
+import logging
+import Levenshtein
 
 # It's an idempotent operatation
 nltk.download('stopwords')
 nltk.download('punkt_tab')
 nltk.download('words')
 
+logger = logging.getLogger(__name__)
+
 
 def analyze(text: str):
     """
@@ -71,26 +77,11 @@ def classify(text: str):
     It currently does a simple string matching. We will move to NLP NaiveBayes Classification soon.
     And gradually to more advanced classification models.
     """
-    lowered_text = text.lower()
-    passport_needed_words = ['india', 'indian', 'surname', 'nationality', 'given', 'name', 'passport', 'date of birth', 'place of birth', 'place of issue', 'date of issue', 'passport no']
-    passport_found_words = 0
-    for passport_word in passport_needed_words:
-        if passport_word in lowered_text:
-            passport_found_words += 1
-    if passport_found_words >= 6:
-        # Also ensure Passport number REGEX is found
+    if classify_passport(text):
         return "passport"
-    pan_needed_words = ['income', 'tax', 'department', 'govt', 'india']
-    pan_found_words = 0
-    for pan_word in pan_needed_words:
-        # Enhance it to make it lenient. For example, 'indome' could be found instead of 'income'
-        # OCR makes such kind of mistakes and hence accommodation for such must be made.
-        if pan_word in lowered_text:
-            pan_found_words += 1
-    if pan_found_words >= 3:
-        # TODO: Also make sure that a Regex of form 'AZMPR1111L' is found.
-        # This text is dark and bold and OCR would have definitely picked it up.
+    if classify_pan(text):
         return "pan"
+    lowered_text = text.lower()
     # In Aadhaar Card, Government of India is shaded, hence binarization causes it not to be read properly.
     aadhaar_needed_words = ['issue', 'date']
     aadhaar_found_words = 0
@@ -102,4 +93,167 @@ def classify(text: str):
         # aadhaar has a regex of form 1234 1234 1234
         # This finding is a must because it is dark and bold and OCR would have definitely picked it up.
         return "aadhaar"
-    return None
+    return None
+
+
+def fuzzy_substring_match(text, phrase, max_distance=2):
+    phrase_len = len(phrase)
+    text = text.lower()
+    phrase = phrase.lower()
+
+    for i in range(len(text) - phrase_len + 1):
+        window = text[i:i + phrase_len]
+        logger.info(f"Comparing {window} with {phrase}")
+        distance = Levenshtein.distance(window, phrase)
+        if distance <= max_distance:
+            return True, window, distance
+
+    return False, None, None
+
+
+def classify_passport(text: str):
+    """
+    Does this look like a passport?
+    """
+    text = text.lower()
+    REPUBLIC_OF_INDIA = "republic of india"
+    republic_of_india_found = False
+    match_found, match_str, distance = fuzzy_substring_match(text, REPUBLIC_OF_INDIA)
+    if match_found:
+        logger.info(f"Matched {match_str} with {REPUBLIC_OF_INDIA} with distance {distance}")
+        republic_of_india_found = True
+    NATIONALITY = "nationality"
+    nationality_found = False
+    match_found, match_str, distance = fuzzy_substring_match(text, NATIONALITY)
+    if match_found:
+        logger.info(f"Matched {match_str} with {NATIONALITY} with distance {distance}")
+        nationality_found = True
+    PASSPORT_NUMBER = "passport no"
+    passport_number_found = False
+    match_found, match_str, distance = fuzzy_substring_match(text, PASSPORT_NUMBER)
+    if match_found:
+        logger.info(f"Matched {match_str} with {PASSPORT_NUMBER} with distance {distance}")
+        passport_number_found = True
+    DATE_OF_BIRTH = "date of birth"
+    date_of_birth_found = False
+    match_found, match_str, distance = fuzzy_substring_match(text, DATE_OF_BIRTH)
+    if match_found:
+        logger.info(f"Matched {match_str} with {DATE_OF_BIRTH} with distance {distance}")
+        date_of_birth_found = True
+    PLACE_OF_BIRTH = "place of birth"
+    place_of_birth_found = False
+    match_found, match_str, distance = fuzzy_substring_match(text, PLACE_OF_BIRTH)
+    if match_found:
+        logger.info(f"Matched {match_str} with {PLACE_OF_BIRTH} with distance {distance}")
+        place_of_birth_found = True
+    GIVEN_NAME = "given name"
+    given_name_found = False
+    match_found, match_str, distance = fuzzy_substring_match(text, GIVEN_NAME)
+    if match_found:
+        logger.info(f"Matched {match_str} with {GIVEN_NAME} with distance {distance}")
+        given_name_found = True
+    SURNAME = "surname"
+    surname_found = False
+    match_found, match_str, distance = fuzzy_substring_match(text, SURNAME)
+    if match_found:
+        logger.info(f"Matched {match_str} with {SURNAME} with distance {distance}")
+        surname_found = True
+    found_array = [republic_of_india_found, nationality_found, passport_number_found, date_of_birth_found, place_of_birth_found, given_name_found, surname_found]
+    found_true = [x for x in found_array if x]
+    # Consider 60% of the text as a threshold.
+    if len(found_true)/len(found_array) >= 0.6:
+        return True
+    return False
+
+
+def classify_pan(text: str):
+    text = text.lower()
+    INCOME_TAX_DEPARTMENT = 'income tax department'
+    GOVT_OF_INDIA = 'govt of india'
+    PERMANENT_ACCOUNT_NUMBER = 'permanent account number'
+    income_tax_department_found = False
+    govt_of_india_found = False
+    permanent_account_number_found = False
+    match_found, match_str, distance = fuzzy_substring_match(text, INCOME_TAX_DEPARTMENT, max_distance=5)
+    if match_found:
+        logger.info(f"Matched {match_str} with {INCOME_TAX_DEPARTMENT} with distance {distance}")
+        income_tax_department_found = True
+    match_found, match_str, distance = fuzzy_substring_match(text, GOVT_OF_INDIA, max_distance=4)
+    if match_found:
+        logger.info(f"Matched {match_str} with {GOVT_OF_INDIA} with distance {distance}")
+        govt_of_india_found = True
+    match_found, match_str, distance = fuzzy_substring_match(text, PERMANENT_ACCOUNT_NUMBER, max_distance=4)
+    if match_found:
+        logger.info(f"Matched {match_str} with {PERMANENT_ACCOUNT_NUMBER} with distance {distance}")
+        permanent_account_number_found = True
+    found_array = [income_tax_department_found, govt_of_india_found, permanent_account_number_found]
+    found_true = [x for x in found_array if x]
+    # Consider 60% of the text as a threshold.
+    if len(found_true)/len(found_array) >= 0.6:
+        return True
+    return False
+
+
+def analyze_passport(text: str):
+    # Word boundary on both sides.
+    # An upper case letter followed by exactly 7 digits
+    matches = re.findall(r'\b[A-Z]\d{7}\b', text)
+    passport_number = None
+    if len(matches) > 0:
+        passport_number = matches[0]
+    data = {
+    }
+    if passport_number is not None:
+        data['passport_number'] = passport_number
+    return data
+
+
+def analyze_pan(text: str):
+    # Remove blank lines
+    lines = text.splitlines()
+    non_blank_lines = [line for line in lines if line.strip() != '']
+    text = '\n'.join(non_blank_lines)
+    lowered_text = text.lower()
+    # Word boundary on both sides.
+    # 5 upper case letters followed by exactly 4 digits, followed by a letter
+    matches = re.findall(r'\b[A-Z]{5}\d{4}[A-Z]{1}\b', text)
+    pan_number = None
+    name = None
+    father_name = None
+    dob = None
+    if len(matches) > 0:
+        pan_number = matches[0]
+    # Extract name
+    # Find where "India" occurs
+    match_found, match_str, distance = fuzzy_substring_match(lowered_text, "india")
+    if match_found:
+        # Find index of "India"
+        index = lowered_text.index(match_str)
+        # Find first new line after this index
+        new_line_index = text.find('\n', index)
+        # Name of person is after this new line
+        content_after_new_line = text[new_line_index+1:]
+        name_and_others = content_after_new_line.split('\n')
+        # Get the name of the person
+        if len(name_and_others) > 0:
+            name = name_and_others[0]
+            # Remove punctuation from name
+            name = name.translate(str.maketrans('', '', string.punctuation))
+            name = name.strip()
+    if name is not None and len(name_and_others) > 1:
+        # Father name is just after name, on the next line
+        father_name = name_and_others[1]
+        father_name = father_name.translate(str.maketrans('', '', string.punctuation))
+        father_name = father_name.strip()
+        if len(name_and_others) > 2:
+            dob = name_and_others[2]
+    data = {}
+    if pan_number is not None:
+        data['PAN No.'] = pan_number
+    if name is not None:
+        data['Name'] = name
+    if father_name is not None:
+        data["Father's Name"] = father_name
+    if dob is not None:
+        data['Date of Birth'] = dob
+    return data