@@ -59,3 +59,47 @@ def is_meaningful_content(text: str):
5959 if text .count ("\x0c " ) / len (text ) > PAGE_MARGER_PERCENTAGE_THRESHOLD :
6060 return False
6161 return True
62+
63+
def _count_keywords(haystack, keywords):
    """Return how many of *keywords* occur as substrings of *haystack*."""
    return sum(1 for keyword in keywords if keyword in haystack)


def classify(text: str):
    """
    Classify OCR'd document text into one of the supported categories.

    Currently supported categories (and the labels returned for them) are:
    - Passport    -> "passport"
    - PAN Card    -> "pan"
    - Aadhaar Card -> "aadhaar"

    The checks run in that order and the first one that succeeds wins.
    Matching is a simple case-insensitive substring search. We will move to
    NLP NaiveBayes Classification soon, and gradually to more advanced
    classification models.

    :param text: Raw text of the document.
    :return: "passport", "pan" or "aadhaar", or None when no category matches.
    """
    # Function-scope import keeps this block self-contained.
    import re

    lowered_text = text.lower()

    # NOTE: matching is by substring, so overlapping entries such as
    # 'india'/'indian' or 'passport'/'passport no' are each counted when the
    # longer one is present.
    passport_needed_words = (
        'india', 'indian', 'surname', 'nationality', 'given', 'name',
        'passport', 'date of birth', 'place of birth', 'place of issue',
        'date of issue', 'passport no',
    )
    if _count_keywords(lowered_text, passport_needed_words) >= 6:
        # TODO: Also ensure the passport number regex is found.
        return "passport"

    # TODO: Make the matching lenient. For example, 'indome' could be found
    # instead of 'income'; OCR makes such mistakes and they should be
    # accommodated.
    pan_needed_words = ('income', 'tax', 'department', 'govt', 'india')
    if _count_keywords(lowered_text, pan_needed_words) >= 3:
        # TODO: Also make sure that a regex of form 'AZMPR1111L' is found.
        # This text is dark and bold and OCR would have definitely picked it up.
        return "pan"

    # In Aadhaar Card, "Government of India" is shaded, hence binarization
    # causes it not to be read properly; only these weak keywords are usable.
    # TODO: Make the matching lenient ('isdue' instead of 'issue', etc.).
    aadhaar_needed_words = ('issue', 'date')
    if _count_keywords(lowered_text, aadhaar_needed_words) >= 1:
        # The keywords alone are far too common ("date" appears in almost any
        # document), so the Aadhaar number of form '1234 1234 1234' is
        # mandatory. It is dark and bold, so OCR would have picked it up.
        # The lookarounds stop the match from starting or ending inside a
        # longer run of digits.
        aadhaar_number = r"(?<!\d)\d{4}[ \t]+\d{4}[ \t]+\d{4}(?!\d)"
        if re.search(aadhaar_number, lowered_text):
            return "aadhaar"

    return None
0 commit comments