Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
92 changes: 92 additions & 0 deletions ingestion/src/metadata/pii/india_patterns.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
"""
India specific PII patterns for DPDP Act 2023 compliance.
Extends OpenMetadata auto PII tagging with locale aware detection.
"""

import re
from typing import Optional

# Patterns for column name matching
# Compiled column-name patterns, keyed by PII kind.
#
# Token boundaries use alphanumeric lookarounds rather than ``\b``: ``\b``
# treats "_" as a word character, so it never fires inside snake_case names
# such as ``customer_pan_number`` or ``customer_upi_id``.
# Every alternative keeps its leading/trailing ``.*`` so the patterns work with
# both ``Pattern.match`` and ``Pattern.search``.
INDIA_COLUMN_PATTERNS = {
    "aadhaar": re.compile(r".*(?:aadhaar|aadhar|uidai).*", re.IGNORECASE),
    "pan": re.compile(
        # A stand-alone "pan" token, optionally followed by card/number/no/num
        # (so "customer_pan", "pan_no" and "pancard" match, while "company",
        # "panel_id" and "japan_sales" do not), or the spelled-out name.
        r".*(?<![a-z0-9])pan(?:_?(?:card|number|no|num))?(?![a-z0-9]).*"
        r"|.*permanent_account.*",
        re.IGNORECASE,
    ),
    "upi": re.compile(
        # "upi" must be qualified (id/address/vpa); "vpa" may stand alone.
        r".*(?<![a-z0-9])upi_?(?:id|address|vpa)(?![a-z0-9]).*"
        r"|.*(?<![a-z0-9])vpa(?![a-z0-9]).*",
        re.IGNORECASE,
    ),
}

# Verhoeff algorithm for Aadhaar validation
_VERHOEFF_D = [
[0,1,2,3,4,5,6,7,8,9],
[1,2,3,4,0,6,7,8,9,5],
[2,3,4,0,1,7,8,9,5,6],
[3,4,0,1,2,8,9,5,6,7],
Comment on lines +22 to +26
Copy link

Copilot AI Apr 17, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This module appears to need black formatting (e.g., the Verhoeff tables aren’t spaced/indented per project formatting), which will likely cause CI formatting checks to fail. Please run the ingestion formatter (make py_format / black) on this file before merging.

Copilot uses AI. Check for mistakes.
[4,0,1,2,3,9,5,6,7,8],
[5,9,8,7,6,0,4,3,2,1],
[6,5,9,8,7,1,0,4,3,2],
[7,6,5,9,8,2,1,0,4,3],
[8,7,6,5,9,3,2,1,0,4],
[9,8,7,6,5,4,3,2,1,0]
]

_VERHOEFF_P = [
[0,1,2,3,4,5,6,7,8,9],
[1,5,7,6,2,8,3,0,9,4],
[5,8,0,3,7,9,6,1,4,2],
[8,9,1,6,0,4,3,5,2,7],
[9,4,5,3,1,2,6,8,7,0],
[4,2,8,6,5,7,3,9,0,1],
[2,7,9,3,8,0,6,4,1,5],
[7,0,4,6,9,1,3,2,5,8]
]

def validate_aadhaar(number: str) -> bool:
    """
    Validate an Aadhaar number using the Verhoeff checksum.

    Requiring the checksum (not just the length) prevents tagging random
    12-digit values such as order IDs as Aadhaar.

    Args:
        number: Candidate value. Must be exactly 12 ASCII digits with no
            separators or surrounding whitespace.

    Returns:
        True only if the value is 12 ASCII digits, does not start with 0 or 1,
        and passes the Verhoeff checksum. Any other input, including None and
        non-string values, returns False instead of raising.
    """
    if not isinstance(number, str) or len(number) != 12:
        return False

    # str.isdigit() alone also accepts characters such as "²" that int()
    # rejects, which would raise ValueError in the checksum loop below.
    if not (number.isascii() and number.isdigit()):
        return False

    # UIDAI rule: Aadhaar cannot start with 0 or 1
    if number[0] in "01":
        return False

    checksum = 0
    # Positions are counted from the right-most digit (the check digit).
    for position, digit in enumerate(reversed(number)):
        permuted = _VERHOEFF_P[position % 8][int(digit)]
        checksum = _VERHOEFF_D[checksum][permuted]
    return checksum == 0
Comment thread
gitar-bot[bot] marked this conversation as resolved.

# PAN layout: 3 letters, 1 holder-type letter, 1 letter, 4 digits, 1 letter.
# The holder-type letter (4th character) is restricted to A,B,C,F,G,H,J,L,P,T.
_PAN_PATTERN = re.compile(r"[A-Z]{3}[ABCFGHJLPT][A-Z][0-9]{4}[A-Z]")


def validate_pan(number: str) -> bool:
    """
    Validate PAN format: 5 uppercase letters, 4 digits, 1 uppercase letter.

    The 4th character must be one of: A,B,C,F,G,H,J,L,P,T.

    Args:
        number: Candidate value. Must be the bare 10-character PAN in
            uppercase, with no surrounding whitespace.

    Returns:
        True if the whole string is a well-formed PAN, False otherwise.
        None and non-string values return False instead of raising.
    """
    if not isinstance(number, str):
        return False
    # fullmatch, not match + "$": "$" also matches just before a trailing
    # newline, which would let "ABCPE1234F\n" through.
    return _PAN_PATTERN.fullmatch(number) is not None

def is_india_pii_column(column_name: str) -> Optional[str]:
    """
    Check if a column name matches one of the India PII patterns.

    The return annotation uses ``Optional[str]`` rather than ``str | None``:
    the PEP 604 form is evaluated when the function is defined and raises
    TypeError on Python 3.9.

    Args:
        column_name: Column name to inspect. Matching is case-insensitive
            because the patterns are compiled with ``re.IGNORECASE``.

    Returns:
        "Aadhaar", "PAN" or "UPI" for the first pattern that matches, checked
        in that order; None if nothing matches or if ``column_name`` is empty
        or not a string.
    """
    if not isinstance(column_name, str) or not column_name:
        return None

    # Order matters: the first matching kind wins.
    checks = (("aadhaar", "Aadhaar"), ("pan", "PAN"), ("upi", "UPI"))
    for pattern_key, label in checks:
        if INDIA_COLUMN_PATTERNS[pattern_key].search(column_name):
            return label

    return None
Comment thread
gitar-bot[bot] marked this conversation as resolved.
26 changes: 26 additions & 0 deletions ingestion/src/metadata/pii/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,11 @@
from metadata.pii.constants import PII
from metadata.utils import fqn
from metadata.utils.logger import profiler_logger
from metadata.pii.india_patterns import (
is_india_pii_column,
validate_aadhaar,
validate_pan,
)

logger = profiler_logger()

Expand Down Expand Up @@ -105,6 +110,27 @@ def create_column_tag_labels(
if PII in tag.tagFQN.root:
return []

# Check India PII patterns
india_pii = is_india_pii_column(column.name.root)
# India PII validation - check majority of samples like existing classifier
sample_values = [str(v) for v in sample_data if v] if sample_data else []
Comment on lines +113 to +116
Copy link

Copilot AI Apr 17, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The new India PII detection is added to PIIProcessor, but PIIProcessor is marked deprecated and the workflow defaults to tag-pii-processor (TagProcessor) in processor_factory. As written, most users won’t hit this new logic unless they explicitly opt into the legacy processor type. To meet the stated goal (Column Name Scanner / default auto-tagging), this should be integrated into the TagProcessor path (e.g., via TagAnalyzer/ColumnNameScanner or a custom recognizer) or the factory default needs to change.

Copilot uses AI. Check for mistakes.

if india_pii == "Aadhaar" and sample_values:
valid_count = sum(1 for v in sample_values if validate_aadhaar(v))
if valid_count > len(sample_values) * 0.5:
return [self.build_tag_label(PIISensitivityTag.SENSITIVE, "India Aadhaar detected")]

if india_pii == "PAN" and sample_values:
valid_count = sum(1 for v in sample_values if validate_pan(v))
if valid_count > len(sample_values) * 0.5:
return [self.build_tag_label(PIISensitivityTag.SENSITIVE, "India PAN detected")]
Comment on lines +121 to +126
Copy link

Copilot AI Apr 17, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The new tag reasons (e.g., "India Aadhaar detected") diverge from the existing PIIProcessor reason format produced by explain_recognition_results(...). There are unit tests (ingestion/tests/unit/pii/test_processor.py) asserting the reason matches "Detected by ...Recognizer" for all produced tags; if this India path triggers it will break those expectations (and potentially any downstream consumers relying on the format). Consider reusing the existing reason builder / include consistent metadata, and add/adjust unit tests accordingly.

Copilot uses AI. Check for mistakes.

if india_pii == "UPI" and sample_values:
# UPI validation is simpler - check for @ symbol in majority
valid_count = sum(1 for v in sample_values if "@" in v and len(v) > 5)
if valid_count > len(sample_values) * 0.5:
return [self.build_tag_label(PIISensitivityTag.SENSITIVE, "India UPI detected")]
Comment on lines +121 to +132
Copy link

Copilot AI Apr 17, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These newly added return statements are not black-formatted (the build_tag_label(..., "India Aadhaar detected") line exceeds typical line length and black will reflow it). Please run make py_format/black on this file to avoid CI formatting failures.

Suggested change
return [self.build_tag_label(PIISensitivityTag.SENSITIVE, "India Aadhaar detected")]
if india_pii == "PAN" and sample_values:
valid_count = sum(1 for v in sample_values if validate_pan(v))
if valid_count > len(sample_values) * 0.5:
return [self.build_tag_label(PIISensitivityTag.SENSITIVE, "India PAN detected")]
if india_pii == "UPI" and sample_values:
# UPI validation is simpler - check for @ symbol in majority
valid_count = sum(1 for v in sample_values if "@" in v and len(v) > 5)
if valid_count > len(sample_values) * 0.5:
return [self.build_tag_label(PIISensitivityTag.SENSITIVE, "India UPI detected")]
return [
self.build_tag_label(
PIISensitivityTag.SENSITIVE, "India Aadhaar detected"
)
]
if india_pii == "PAN" and sample_values:
valid_count = sum(1 for v in sample_values if validate_pan(v))
if valid_count > len(sample_values) * 0.5:
return [
self.build_tag_label(
PIISensitivityTag.SENSITIVE, "India PAN detected"
)
]
if india_pii == "UPI" and sample_values:
# UPI validation is simpler - check for @ symbol in majority
valid_count = sum(1 for v in sample_values if "@" in v and len(v) > 5)
if valid_count > len(sample_values) * 0.5:
return [
self.build_tag_label(
PIISensitivityTag.SENSITIVE, "India UPI detected"
)
]

Copilot uses AI. Check for mistakes.

# Build classifier with the results capturing patcher
result_capturer = ResultCapturingPatcher()
classifier: ColumnClassifier[PIISensitivityTag] = PIISensitiveClassifier(
Expand Down
28 changes: 28 additions & 0 deletions ingestion/tests/unit/pii/test_india_patterns.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
import pytest
Copy link

Copilot AI Apr 17, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

pytest is imported but never used in this test module (all assertions are plain assert). This will be flagged by cleanup/lint tooling (e.g., pycln). Remove the unused import.

Suggested change
import pytest

Copilot uses AI. Check for mistakes.
from metadata.pii.india_patterns import (
validate_aadhaar,
validate_pan,
is_india_pii_column,
)

def test_aadhaar_validation():
    """Exercise each rejection path of validate_aadhaar plus one valid number."""
    # Valid test Aadhaar (passes Verhoeff)
    assert validate_aadhaar("999999990019") is True
    # Invalid - fails checksum (same number with the check digit altered)
    assert validate_aadhaar("999999990018") is False
    # Invalid - starts with 1, rejected by the leading-digit rule
    assert validate_aadhaar("123456789012") is False
    # Invalid - wrong length
    assert validate_aadhaar("12345") is False
    # Invalid - not digits
    assert validate_aadhaar("abcdefghijkl") is False

def test_pan_validation():
    """Check PAN structure and the holder-type rule on the 4th character."""
    # 4th character "P" is a valid holder type
    assert validate_pan("ABCPE1234F") is True
    # 4th character "D" is not a valid holder type
    assert validate_pan("ABCDE1234F") is False
    assert validate_pan("abcpE1234f") is False  # must be uppercase
    assert validate_pan("ABCP1234F") is False  # too short
    assert validate_pan("ABCPE12345") is False  # wrong format
Comment on lines +20 to +22
Copy link

Copilot AI Apr 17, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This test file doesn’t appear black-formatted (e.g., inline comments need two spaces before #). Please run the standard ingestion formatter (make py_format / black) so formatting checks pass.

Suggested change
assert validate_pan("abcdE1234f") is False # must be uppercase
assert validate_pan("ABCD1234F") is False # too short
assert validate_pan("ABCDE12345") is False # wrong format
assert validate_pan("abcdE1234f") is False # must be uppercase
assert validate_pan("ABCD1234F") is False # too short
assert validate_pan("ABCDE12345") is False # wrong format

Copilot uses AI. Check for mistakes.

def test_column_name_matching():
    """Column names resolve to the expected PII kind, or None when unrelated."""
    expected_kind = {
        "aadhaar_number": "Aadhaar",
        "customer_pan": "PAN",
        "upi_id": "UPI",
    }
    for column, kind in expected_kind.items():
        assert is_india_pii_column(column) == kind
    assert is_india_pii_column("order_id") is None
Loading