-
Notifications
You must be signed in to change notification settings - Fork 2.1k
feat(ingestion): add India PII patterns for Aadhaar, PAN, UPI #27237
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 1 commit
5849292
c3b60a5
9f6c619
82cec35
57f6053
0c8018b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,77 @@ | ||
| """ | ||
| India-specific PII patterns for DPDP Act 2023 compliance. | ||
| Extends OpenMetadata auto PII tagging with locale-aware detection. | ||
| """ | ||
|
|
||
| import re | ||
|
|
||
| # Patterns for column name matching | ||
# Compiled, case-insensitive patterns keyed by PII kind. Every alternative
# starts with ".*" so the patterns work with ``Pattern.match`` (anchored at
# the start of the column name) as well as ``Pattern.search``.
INDIA_COLUMN_PATTERNS = {
    # These tokens are distinctive enough that plain substring matching is safe.
    "aadhaar": re.compile(r".*aadhaar.*|.*aadhar.*|.*uidai.*", re.IGNORECASE),
    # "pan" is a frequent substring of unrelated names (company, span, panel,
    # expansion), so it must not be embedded in a longer alphabetic word.
    # Underscores and digits count as separators; a few common glued suffixes
    # (pancard, pannumber, pan_no, ...) are accepted explicitly.
    "pan": re.compile(
        r".*(?<![a-z])pan(?:_?(?:no|num|number|card|id))?(?![a-z]).*"
        r"|.*permanent_account.*",
        re.IGNORECASE,
    ),
    # Same word-boundary treatment for "upi"/"vpa" (occupied, pupil, cupid
    # previously matched).
    "upi": re.compile(
        r".*(?<![a-z])(?:upi|vpa)(?:_?(?:id|handle|address))?(?![a-z]).*",
        re.IGNORECASE,
    ),
}
|
|
||
| # Verhoeff algorithm for Aadhaar validation | ||
| _VERHOEFF_D = [ | ||
| [0,1,2,3,4,5,6,7,8,9], | ||
| [1,2,3,4,0,6,7,8,9,5], | ||
| [2,3,4,0,1,7,8,9,5,6], | ||
| [3,4,0,1,2,8,9,5,6,7], | ||
|
Comment on lines
+22
to
+26
|
||
| [4,0,1,2,3,9,5,6,7,8], | ||
| [5,9,8,7,6,0,4,3,2,1], | ||
| [6,5,9,8,7,1,0,4,3,2], | ||
| [7,6,5,9,8,2,1,0,4,3], | ||
| [8,7,6,5,9,3,2,1,0,4], | ||
| [9,8,7,6,5,4,3,2,1,0] | ||
| ] | ||
|
|
||
| _VERHOEFF_P = [ | ||
| [0,1,2,3,4,5,6,7,8,9], | ||
| [1,5,7,6,2,8,3,0,9,4], | ||
| [5,8,0,3,7,9,6,1,4,2], | ||
| [8,9,1,6,0,4,3,5,2,7], | ||
| [9,4,5,3,1,2,6,8,7,0], | ||
| [4,2,8,6,5,7,3,9,0,1], | ||
| [2,7,9,3,8,0,6,4,1,5], | ||
| [7,0,4,6,9,1,3,2,5,8] | ||
| ] | ||
|
|
||
def validate_aadhaar(number: str) -> bool:
    """
    Validate an Aadhaar number using the Verhoeff checksum.

    Returns True only when ``number`` is exactly 12 decimal digits and the
    checksum over all 12 digits is zero. This prevents tagging random
    12-digit order IDs as Aadhaar. Empty, ``None``, wrong-length and
    non-numeric input returns False rather than raising.
    """
    # str.isdecimal() (not isdigit()) is deliberate: isdigit() also accepts
    # characters such as superscript "²" that int() cannot convert, which
    # would raise ValueError in the loop below.
    if not number or len(number) != 12 or not number.isdecimal():
        return False

    # NOTE(review): an all-zero string passes the checksum; consider also
    # rejecting prefixes that are never issued — confirm against UIDAI rules.
    checksum = 0
    # Walk the digits right-to-left; the permutation row cycles every 8 places.
    for position, char in enumerate(reversed(number)):
        permuted = _VERHOEFF_P[position % 8][int(char)]
        checksum = _VERHOEFF_D[checksum][permuted]
    return checksum == 0
|
gitar-bot[bot] marked this conversation as resolved.
|
||
|
|
||
def validate_pan(number: str) -> bool:
    """
    Validate PAN format: 5 uppercase letters, 4 digits, 1 uppercase letter.

    Example: ABCDE1234F

    The whole string must match; surrounding whitespace, a trailing newline
    or lowercase letters make the value invalid. Empty or ``None`` input
    returns False.
    """
    if not number:
        return False
    # fullmatch instead of match + "$": "$" also matches just before a
    # trailing newline, so "ABCDE1234F\n" used to be accepted.
    return re.fullmatch(r"[A-Z]{5}[0-9]{4}[A-Z]", number) is not None
|
gitar-bot[bot] marked this conversation as resolved.
Outdated
|
||
|
|
||
| def is_india_pii_column(column_name: str) -> str | None: | ||
|
||
| """ | ||
| Check if column name matches India PII patterns. | ||
| Returns the PII type or None. | ||
| """ | ||
| name_lower = column_name.lower() | ||
|
|
||
| if INDIA_COLUMN_PATTERNS["aadhaar"].match(name_lower): | ||
| return "Aadhaar" | ||
| if INDIA_COLUMN_PATTERNS["pan"].match(name_lower): | ||
| return "PAN" | ||
| if INDIA_COLUMN_PATTERNS["upi"].match(name_lower): | ||
| return "UPI" | ||
|
|
||
| return None | ||
|
gitar-bot[bot] marked this conversation as resolved.
|
||
| Original file line number | Diff line number | Diff line change | ||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| @@ -0,0 +1,28 @@ | ||||||||||||||
| import pytest | ||||||||||||||
|
||||||||||||||
| import pytest |
Copilot
AI
Apr 17, 2026
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This test file doesn’t appear black-formatted (e.g., inline comments need two spaces before #). Please run the standard ingestion formatter (make py_format / black) so formatting checks pass.
| assert validate_pan("abcdE1234f") is False # must be uppercase | |
| assert validate_pan("ABCD1234F") is False # too short | |
| assert validate_pan("ABCDE12345") is False # wrong format | |
| assert validate_pan("abcdE1234f") is False # must be uppercase | |
| assert validate_pan("ABCD1234F") is False # too short | |
| assert validate_pan("ABCDE12345") is False # wrong format |
Uh oh!
There was an error while loading. Please reload this page.