Skip to content

Commit 31c1be8

Browse files
authored
Merge pull request #20 from bk86a/feature/14-mt-regex-country-fallback
feat: relax MT regex and add country-level fallback (#14)
2 parents 7223267 + c1a5e84 commit 31c1be8

4 files changed

Lines changed: 70 additions & 6 deletions

File tree

CHANGELOG.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,16 @@ All notable changes to this project will be documented in this file.
44

55
The format is based on [Keep a Changelog](https://keepachangelog.com/).
66

7+
## [0.12.0] - 2026-02-23
8+
9+
### Fixed
10+
11+
- **MT regex** (#14): the separator between the alpha prefix and the digits is now optional (`MST1000` is accepted alongside `MST 1000` and `MST-1000`). Previously, codes without a space failed regex extraction and fell back to approximate matching with lower confidence.
12+
13+
### Added
14+
15+
- **Country-level majority-vote fallback**: new Tier 4 in the lookup chain for countries where all postal codes map to the same NUTS1/NUTS2 but NUTS3 has a dominant winner. Returns `match_type: "approximate"` with NUTS1/NUTS2 confidence 1.0 and NUTS3 confidence based on the agreement ratio (capped at 0.80). This naturally covers MT (MT0/MT00/MT001, with ~77% agreement on NUTS3). Digit-only MT codes like `1043`, which previously returned 404, now get a valid approximate result.
16+
717
## [0.11.0] - 2026-02-23
818

919
### Added

app/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.11.0"
1+
__version__ = "0.12.0"

app/data_loader.py

Lines changed: 57 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,10 @@
3535
# Countries with a single NUTS3 region: country_code -> nuts3 code
3636
_single_nuts3: dict[str, str] = {}
3737

38+
# Country-level majority-vote fallback for countries where NUTS1/NUTS2
39+
# are unanimous but NUTS3 has a dominant winner (e.g. MT → MT0/MT00/MT001)
40+
_country_fallback: dict[str, dict] = {}
41+
3842
# NUTS region names: nuts_id -> name_latn
3943
_nuts_names: dict[str, str] = {}
4044

@@ -619,6 +623,41 @@ def _build_prefix_index() -> None:
619623
if _single_nuts3:
620624
logger.info("Single-NUTS3 countries: %s", ", ".join(sorted(_single_nuts3)))
621625

626+
# Country-level majority-vote fallback for countries NOT in _single_nuts3
627+
# where NUTS1 and NUTS2 are unanimous but NUTS3 has a dominant winner
628+
_country_fallback.clear()
629+
caps = settings.approximate_confidence_caps
630+
for cc, nuts3_set in country_nuts3.items():
631+
if cc in _single_nuts3:
632+
continue
633+
nuts1_set = {n[:3] for n in nuts3_set}
634+
nuts2_set = {n[:4] for n in nuts3_set}
635+
if len(nuts1_set) != 1 or len(nuts2_set) != 1:
636+
continue
637+
# Count postal codes per NUTS3 to find dominant region
638+
nuts3_counts: Counter[str] = Counter()
639+
for (c, _), n3 in _lookup.items():
640+
if c == cc:
641+
nuts3_counts[n3] += 1
642+
total = sum(nuts3_counts.values())
643+
if total == 0:
644+
continue
645+
winner, winner_count = nuts3_counts.most_common(1)[0]
646+
ratio = winner_count / total
647+
_country_fallback[cc] = {
648+
"nuts1": next(iter(nuts1_set)),
649+
"nuts1_confidence": 1.0,
650+
"nuts2": next(iter(nuts2_set)),
651+
"nuts2_confidence": 1.0,
652+
"nuts3": winner,
653+
"nuts3_confidence": round(min(ratio, caps["nuts3"]), 2),
654+
}
655+
if _country_fallback:
656+
logger.info(
657+
"Country-level fallback: %s",
658+
", ".join(f"{cc}{v['nuts3']}" for cc, v in sorted(_country_fallback.items())),
659+
)
660+
622661

623662
def _estimate_by_prefix(cc: str, postal_code: str) -> dict | None:
624663
"""Runtime estimation via longest prefix match + majority vote.
@@ -910,11 +949,12 @@ def load_data() -> None:
910949
def lookup(country_code: str, postal_code: str) -> dict | None:
911950
"""Look up NUTS codes for a given country + postal code.
912951
913-
Four-tier fall-through:
952+
Five-tier fall-through:
914953
1. Exact TERCET match → confidence 1.0
915954
2. Pre-computed estimate → stored confidence per level
916955
3. Runtime prefix-based estimation → calculated confidence
917-
4. Single-NUTS3 country fallback → confidence 1.0 (e.g. LI, CY, LU)
956+
4. Country-level majority vote → unanimous NUTS1/2, dominant NUTS3 (e.g. MT)
957+
5. Single-NUTS3 country fallback → confidence 1.0 (e.g. LI, CY, LU)
918958
919959
Returns a dict with nuts1/2/3, match_type, and per-level confidence, or None.
920960
"""
@@ -962,7 +1002,21 @@ def lookup(country_code: str, postal_code: str) -> dict | None:
9621002
approx.update(_resolve_names(approx["nuts1"], approx["nuts2"], approx["nuts3"]))
9631003
return approx
9641004

965-
# Tier 4: Single-NUTS3 country fallback (e.g. LI → LI000)
1005+
# Tier 4: Country-level majority vote (unanimous NUTS1/2, dominant NUTS3)
1006+
fallback = _country_fallback.get(cc)
1007+
if fallback is not None:
1008+
return {
1009+
"match_type": "approximate",
1010+
"nuts1": fallback["nuts1"],
1011+
"nuts1_confidence": fallback["nuts1_confidence"],
1012+
"nuts2": fallback["nuts2"],
1013+
"nuts2_confidence": fallback["nuts2_confidence"],
1014+
"nuts3": fallback["nuts3"],
1015+
"nuts3_confidence": fallback["nuts3_confidence"],
1016+
**_resolve_names(fallback["nuts1"], fallback["nuts2"], fallback["nuts3"]),
1017+
}
1018+
1019+
# Tier 5: Single-NUTS3 country fallback (e.g. LI → LI000)
9661020
nuts3 = _single_nuts3.get(cc)
9671021
if nuts3 is not None:
9681022
return {

app/postal_patterns.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -116,8 +116,8 @@
116116
"expected_digits": 4
117117
},
118118
"MT": {
119-
"regex": "^([A-Z]{2,3}\\s\\d{2,4})$",
120-
"example": "VLT 1010, FNT 1010, MSK 1234",
119+
"regex": "^([A-Z]{2,3}[\\s\\-]?\\d{2,4})$",
120+
"example": "VLT 1010, MST1000, FNT-1010",
121121
"tercet_map": "keep_alpha"
122122
},
123123
"NL": {

0 commit comments

Comments
 (0)