Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
77 changes: 77 additions & 0 deletions ingestion/src/metadata/pii/india_patterns.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
"""
India specific PII patterns for DPDP Act 2023 compliance.
Extends OpenMetadata auto PII tagging with locale aware detection.
"""

import re
from typing import Optional

# Patterns for column name matching
INDIA_COLUMN_PATTERNS = {
    # "aadhaar", its common misspelling "aadhar" and "uidai" are distinctive
    # enough to be matched as plain substrings.
    "aadhaar": re.compile(r".*aadhaar.*|.*aadhar.*|.*uidai.*", re.IGNORECASE),
    # "pan" is a substring of many unrelated names ("company", "expand",
    # "span"), so it must not be glued to other letters, apart from a few
    # common suffixes ("pancard", "panno", ...). Letter boundaries cannot see
    # camelCase, so a name like "customerPan" is not matched.
    "pan": re.compile(
        r".*(?<![a-z])pan(?:no|num|number|card|id)?(?![a-z]).*"
        r"|.*permanent_account.*",
        re.IGNORECASE,
    ),
    # Same boundary rule for "upi" / "vpa" ("occupied", "pupil" must not hit).
    "upi": re.compile(
        r".*(?<![a-z])(?:upi|vpa)(?:id|handle|address)?(?![a-z]).*",
        re.IGNORECASE,
    ),
}

# Verhoeff algorithm for Aadhaar validation
_VERHOEFF_D = [
[0,1,2,3,4,5,6,7,8,9],
[1,2,3,4,0,6,7,8,9,5],
[2,3,4,0,1,7,8,9,5,6],
[3,4,0,1,2,8,9,5,6,7],
Comment on lines +22 to +26
Copy link

Copilot AI Apr 17, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This module appears to need black formatting (e.g., the Verhoeff tables aren’t spaced/indented per project formatting), which will likely cause CI formatting checks to fail. Please run the ingestion formatter (make py_format / black) on this file before merging.

Copilot uses AI. Check for mistakes.
[4,0,1,2,3,9,5,6,7,8],
[5,9,8,7,6,0,4,3,2,1],
[6,5,9,8,7,1,0,4,3,2],
[7,6,5,9,8,2,1,0,4,3],
[8,7,6,5,9,3,2,1,0,4],
[9,8,7,6,5,4,3,2,1,0]
]

_VERHOEFF_P = [
[0,1,2,3,4,5,6,7,8,9],
[1,5,7,6,2,8,3,0,9,4],
[5,8,0,3,7,9,6,1,4,2],
[8,9,1,6,0,4,3,5,2,7],
[9,4,5,3,1,2,6,8,7,0],
[4,2,8,6,5,7,3,9,0,1],
[2,7,9,3,8,0,6,4,1,5],
[7,0,4,6,9,1,3,2,5,8]
]

def validate_aadhaar(number: str) -> bool:
    """
    Validate an Aadhaar number using the Verhoeff checksum.

    Returns True only when ``number`` is a string of exactly 12 ASCII digits
    that passes the Verhoeff algorithm. This prevents tagging random 12-digit
    order IDs as Aadhaar. Separators such as spaces or hyphens are not
    stripped; any other input (including None or non-string values) returns
    False.
    """
    if not isinstance(number, str) or len(number) != 12:
        return False
    # str.isdigit() alone also accepts characters such as superscript digits,
    # which int() cannot convert. Restricting to ASCII makes such input return
    # False instead of raising ValueError below.
    if not (number.isascii() and number.isdigit()):
        return False

    checksum = 0
    # Walk the digits right to left; the permutation row repeats every 8
    # positions.
    for position, digit in enumerate(reversed(number)):
        permuted = _VERHOEFF_P[position % 8][int(digit)]
        checksum = _VERHOEFF_D[checksum][permuted]
    return checksum == 0
Comment thread
gitar-bot[bot] marked this conversation as resolved.

def validate_pan(number: str) -> bool:
    """
    Validate PAN format: 5 uppercase letters, 4 digits, 1 uppercase letter.
    Example: ABCDE1234F

    Only the layout is checked. The whole string must match, so surrounding
    whitespace or a trailing newline makes the value invalid. None, empty and
    non-string values return False.
    """
    if not isinstance(number, str):
        return False
    # fullmatch instead of match + "$": "$" also matches just before a
    # trailing newline, which would accept "ABCDE1234F\n".
    return re.fullmatch(r"[A-Z]{5}[0-9]{4}[A-Z]", number) is not None
Comment thread
gitar-bot[bot] marked this conversation as resolved.
Outdated

def is_india_pii_column(column_name: str) -> Optional[str]:
    """
    Check if column name matches India PII patterns.

    Returns the PII type ("Aadhaar", "PAN" or "UPI") for the first pattern
    that matches, or None when nothing matches or the name is empty/None.
    """
    # Optional[str] rather than "str | None": the PEP 604 form is evaluated
    # when the function is defined and fails on Python 3.9.
    if not column_name:
        return None

    name_lower = column_name.lower()

    # Checked in this order; the first matching pattern decides the PII type.
    checks = (
        ("aadhaar", "Aadhaar"),
        ("pan", "PAN"),
        ("upi", "UPI"),
    )
    for pattern_key, pii_type in checks:
        if INDIA_COLUMN_PATTERNS[pattern_key].match(name_lower):
            return pii_type

    return None
Comment thread
gitar-bot[bot] marked this conversation as resolved.
28 changes: 28 additions & 0 deletions ingestion/tests/unit/pii/test_india_patterns.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
import pytest
Copy link

Copilot AI Apr 17, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

pytest is imported but never used in this test module (all assertions are plain assert). This will be flagged by cleanup/lint tooling (e.g., pycln). Remove the unused import.

Suggested change
import pytest

Copilot uses AI. Check for mistakes.
from metadata.pii.india_patterns import (
validate_aadhaar,
validate_pan,
is_india_pii_column,
)

def test_aadhaar_validation():
    """Aadhaar validation accepts a checksum-valid number and rejects the rest."""
    # Valid test Aadhaar (passes Verhoeff)
    assert validate_aadhaar("999999990019") is True

    rejected = (
        "123456789012",  # fails checksum
        "12345",  # wrong length
        "abcdefghijkl",  # not digits
    )
    for candidate in rejected:
        assert validate_aadhaar(candidate) is False

def test_pan_validation():
    """PAN validation accepts only 5 uppercase letters, 4 digits, 1 uppercase letter."""
    assert validate_pan("ABCDE1234F") is True

    rejected = (
        "abcdE1234f",  # must be uppercase
        "ABCD1234F",  # too short
        "ABCDE12345",  # wrong format
    )
    for candidate in rejected:
        assert validate_pan(candidate) is False
Comment on lines +20 to +22
Copy link

Copilot AI Apr 17, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This test file doesn’t appear black-formatted (e.g., inline comments need two spaces before #). Please run the standard ingestion formatter (make py_format / black) so formatting checks pass.

Suggested change
assert validate_pan("abcdE1234f") is False # must be uppercase
assert validate_pan("ABCD1234F") is False # too short
assert validate_pan("ABCDE12345") is False # wrong format
assert validate_pan("abcdE1234f") is False  # must be uppercase
assert validate_pan("ABCD1234F") is False  # too short
assert validate_pan("ABCDE12345") is False  # wrong format

Copilot uses AI. Check for mistakes.

def test_column_name_matching():
    """Column names map to the expected India PII type, or to None."""
    expected_types = {
        "aadhaar_number": "Aadhaar",
        "customer_pan": "PAN",
        "upi_id": "UPI",
    }
    for column, pii_type in expected_types.items():
        assert is_india_pii_column(column) == pii_type

    # A column with no India PII keyword is left untagged.
    assert is_india_pii_column("order_id") is None
Loading