Skip to content

Commit 0605094

Browse files
committed
Add a "current OSM survivor" sanity check to changed points.
1 parent 174a61b commit 0605094

5 files changed

Lines changed: 417 additions & 3 deletions

File tree

.claude/TODO.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ Short running list of in-progress / upcoming work. Edit freely; trim older compl
66

77
## Upcoming
88

9+
- [ ] **Per-region calibration knob for the change-detection penalty.** Added 2026-05-19. Today `conflation.change_detection.default_delta` is a single global scalar (with per-`shared_label` overrides from the fitted turnover model). The model was fit on national OSM-history data, so the per-group δ values are a national average of OSM editor reliability. That assumption breaks in regions where OSM is sparse or stale — e.g., a "Restaurant deletion" in a rural county where OSM has low edit traffic may not be an actual closure, just an unmaintained entry. We should add a release-valve: allow `default_delta` (and ideally the per-`shared_label` deltas) to be overridden per state (or per Census place/county). Cleanest landing spot is a new optional CSV at `directories.model_output.regional_overrides` keyed by `(state_fips, shared_label) → delta_override`, and `change_detection.load_delta_lookup` would merge it in after the national values. Until we have a vetted set from a non-Seattle region we don't have data to calibrate this, but the hook should be in place. Tracking against the asymmetric-blindness problem documented in the May 2026 plan at `~/.claude/plans/our-current-deduplication-strategy-wild-graham.md`.
910
- [ ] **Auto-capture the three per-version README fields** so the publish step doesn't need `publish.version_metadata` overrides. Added 2026-04-24. Today `build_version_readme` in [src/openpois/publish/build_readme.py](../src/openpois/publish/build_readme.py) falls back to config overrides or best-effort guesses; aim is for the pipeline to write authoritative values alongside the data it produces, and the publish step to just read them.
1011
- *OSM snapshot date* — `scripts/osm_snapshot/download.py` should write a `~/data/openpois/snapshots/osm/<version>/download_metadata.json` containing `{"downloaded_at": "<ISO date>", "pbf_url": "..."}` after the PBF download completes. `_resolve_osm_snapshot_date` then reads that file before falling back to the version string.
1112
- *Overture release* — `scripts/overture/download.py` already resolves a concrete release (pinned or auto-detected) inside `download_overture_snapshot`; currently only the `.parts/<release>/` directory records it and `.parts/` is deleted on success. Surface the resolved release by writing `~/data/openpois/snapshots/overture/<version>/download_metadata.json` with `{"release": "2026-04-15.0", ...}` before the cleanup step. `_resolve_overture_release` reads that file ahead of the `.parts/` heuristic.

config.yaml

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -248,6 +248,31 @@ conflation:
248248
# model's per-group params. Equals sigmoid(logit_delta_0) for the
249249
# current 20260422_by_shared_label fit (logit_delta_0 = -2.72).
250250
default_delta: 0.062
251+
# Hard gate on Overture-name vs ghost-prior-name token_set_ratio
252+
# (0-100), applied *before* the composite-score-based shadow
253+
# matcher. The default 0 keeps the loose matcher: any spatial +
254+
# type + composite match above ``min_shadow_match_score`` will
255+
# fire, even when Overture's name doesn't lexically match the
256+
# OSM ghost. This is intentional. A higher value would only
257+
# fire when Overture is showing the *same name* OSM closed,
258+
# which we explored in May 2026 (decision rule A) and rejected
259+
# because it loses the bulk of real closures where Overture has
260+
# already updated to a different current name at a churned
261+
# address (Sleep Train → Roosevelt Square etc.). Knob retained
262+
# for future data-quality-only modes; leave at 0 for production
263+
# change detection.
264+
min_prior_name_match_score: 0
265+
suppress_if_current_survivor:
266+
# Belt-and-suspenders post-filter: drop the penalty if a
267+
# *current* OSM POI within radius_m has name token_set_ratio
268+
# >= threshold against the Overture name. Catches cases where
269+
# the POI is still in OSM under different geometry (e.g.,
270+
# node remapped to a building way) and the primary matcher
271+
# missed it. Kept enabled because it's cheap and orthogonal
272+
# to min_prior_name_match_score.
273+
enabled: true
274+
radius_m: 50
275+
name_similarity_threshold: 70
251276

252277
# Settings for publishing snapshots to Source Cooperative
253278
# (https://source.coop/henryspatialanalysis/openpois). Source Coop is

scripts/conflation/apply_change_detection.py

Lines changed: 48 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,26 @@ def main() -> None:
7272
"Use when the baseline was produced with --test."
7373
),
7474
)
75+
parser.add_argument(
76+
"--min-prior-name-score",
77+
type = float,
78+
default = None,
79+
help = (
80+
"Override config's min_prior_name_match_score. Higher "
81+
"values require a stricter Overture-name vs ghost-prior-"
82+
"name token_set_ratio match before a penalty fires. "
83+
"Default config value implements decision rule A "
84+
"(name-match required)."
85+
),
86+
)
87+
parser.add_argument(
88+
"--no-survivor-filter",
89+
action = "store_true",
90+
help = (
91+
"Disable the current-OSM-survivor post-filter for this "
92+
"run. Used for ablation against the vetted set."
93+
),
94+
)
7595
args = parser.parse_args()
7696

7797
config = Config("~/repos/openpois/config.yaml")
@@ -93,6 +113,17 @@ def main() -> None:
93113
cd_cfg = config.get("conflation", "change_detection")
94114
min_match_score = float(cd_cfg["min_shadow_match_score"])
95115
default_delta = float(cd_cfg["default_delta"])
116+
min_prior_name_match_score = float(
117+
cd_cfg.get("min_prior_name_match_score", 0)
118+
)
119+
if args.min_prior_name_score is not None:
120+
min_prior_name_match_score = float(args.min_prior_name_score)
121+
122+
survivor_filter = cd_cfg.get("suppress_if_current_survivor") or {}
123+
if args.no_survivor_filter:
124+
survivor_filter = dict(survivor_filter)
125+
survivor_filter["enabled"] = False
126+
print("Current-OSM-survivor filter disabled for this run.")
96127

97128
max_radius_m = float(config.get("conflation", "max_radius_m"))
98129
default_radius_m = float(
@@ -105,6 +136,12 @@ def main() -> None:
105136
config.get("conflation", "identifier_weight")
106137
)
107138

139+
# R1 needs the rated snapshot; no other auxiliary inputs are
140+
# needed by the simplified pipeline.
141+
rated_snapshot_path = config.get_file_path(
142+
"snapshot_osm", "rated_snapshot",
143+
)
144+
108145
test_bbox = (
109146
config.get("conflation", "test_bbox") if args.test else None
110147
)
@@ -113,10 +150,12 @@ def main() -> None:
113150
print(f"Ghosts: {ghosts_path}")
114151
print(f"Fitted params: {fitted_params_path}")
115152
print(f"Output: {output_path}")
153+
print(f"Rated snapshot (survivor filter): {rated_snapshot_path}")
116154
print(
117155
f"min_match_score={min_match_score} "
118156
f"max_radius_m={max_radius_m} "
119-
f"default_delta={default_delta}"
157+
f"default_delta={default_delta} "
158+
f"min_prior_name_match_score={min_prior_name_match_score}"
120159
)
121160
if args.test:
122161
print(f"Test bbox: {test_bbox}")
@@ -136,6 +175,9 @@ def main() -> None:
136175
identifier_weight = identifier_weight,
137176
default_delta = default_delta,
138177
test_bbox = test_bbox,
178+
rated_snapshot_path = rated_snapshot_path,
179+
survivor_filter = survivor_filter,
180+
min_prior_name_match_score = min_prior_name_match_score,
139181
)
140182
elapsed = time.time() - t0
141183

@@ -147,9 +189,13 @@ def main() -> None:
147189
)
148190
print(f" Ghosts considered: {summary['n_ghosts']:,}")
149191
print(
150-
f" Shadow matches: "
192+
f" Shadow matches (final): "
151193
f"{summary['n_shadow_matches']:,}"
152194
)
195+
print(
196+
f" Dropped by survivor filter: "
197+
f"{summary['n_survivor_dropped']}"
198+
)
153199
print(
154200
f" Mean penalty factor (Δ/old): "
155201
f"{summary['mean_penalty_factor']:.4f}"

src/openpois/conflation/change_detection.py

Lines changed: 190 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,10 +24,12 @@
2424
import gc
2525
from pathlib import Path
2626

27+
import duckdb
2728
import geopandas as gpd
2829
import numpy as np
2930
import pandas as pd
3031
import pyarrow.parquet as pq
32+
from rapidfuzz import fuzz
3133

3234
from openpois.conflation.ghost_osm import _is_token_subset_or_superset
3335
from openpois.conflation.match import (
@@ -86,6 +88,7 @@ def find_shadow_matches(
8688
name_weight: float,
8789
type_weight: float,
8890
identifier_weight: float,
91+
min_prior_name_match_score: float = 0.0,
8992
) -> pd.DataFrame:
9093
"""Run a single-pass match between Overture rows and ghost rows.
9194
@@ -98,6 +101,14 @@ def find_shadow_matches(
98101
bit arrays) so type_score is binary on exact ``shared_label``
99102
equality — the change-detection penalty is conservative and
100103
should only fire when taxonomy genuinely matches.
104+
105+
``min_prior_name_match_score`` is an additional hard gate on the
106+
Overture-name vs ghost-prior-name token_set_ratio (0–100). When
107+
> 0, candidate pairs below that threshold are dropped *before*
108+
the composite-score-based selection runs. Subset/superset pairs
109+
pass regardless. Set this to require a strong direct name match
110+
(e.g. 70) and you'll trade most of the recall for much higher
111+
precision. Default 0 disables the gate.
101112
"""
102113
if len(unmatched_overture) == 0 or len(ghosts) == 0:
103114
return pd.DataFrame(
@@ -143,6 +154,38 @@ def find_shadow_matches(
143154

144155
ov_labels = _to_str_array(unmatched_overture["shared_label"])
145156

157+
# Optional pre-gate: drop candidate pairs whose Overture-name vs
158+
# ghost-prior-name token_set_ratio is below the configured floor.
159+
# Subset/superset pairs pass regardless (a short subset like
160+
# "CVS" vs "CVS Pharmacy" can dip below threshold on token-set
161+
# ratio but is obviously the same business). This is the "tighten
162+
# matcher" alternative — when set high (e.g. 70) it trades most
163+
# recall for high precision and removes the need for downstream
164+
# suppression rules.
165+
if min_prior_name_match_score > 0 and not candidates.empty:
166+
cand_osm_idx = candidates["osm_idx"].to_numpy()
167+
cand_ov_idx = candidates["overture_idx"].to_numpy()
168+
keep = np.zeros(len(candidates), dtype = bool)
169+
for i in range(len(candidates)):
170+
gname = ghost_names[cand_osm_idx[i]]
171+
oname = ov_names[cand_ov_idx[i]]
172+
if not gname or not oname:
173+
continue
174+
if _is_token_subset_or_superset(gname, oname):
175+
keep[i] = True
176+
continue
177+
sim = fuzz.token_set_ratio(gname, oname)
178+
if sim >= min_prior_name_match_score:
179+
keep[i] = True
180+
candidates = candidates.loc[keep].reset_index(drop = True)
181+
if candidates.empty:
182+
return pd.DataFrame(
183+
columns = [
184+
"osm_idx", "overture_idx",
185+
"composite_score", "distance_m",
186+
]
187+
)
188+
146189
# All-zero L0 bits → only exact shared_label match scores 1.0
147190
# (broad-group bitmask overlap collapses to 0 because all bits
148191
# are 0). Keeps the secondary pass conservative.
@@ -197,6 +240,117 @@ def find_shadow_matches(
197240
].reset_index(drop = True)
198241

199242

243+
def apply_current_survivor_filter(
    matches: pd.DataFrame,
    unmatched_overture: gpd.GeoDataFrame,
    *,
    rated_snapshot_path: Path,
    radius_m: float,
    name_similarity_threshold: float,
    test_bbox: dict | None = None,
    duckdb_memory_limit: str = "6GB",
    verbose: bool = True,
) -> tuple[pd.DataFrame, int]:
    """Drop shadow matches where the POI is still present in the live
    OSM snapshot under a different geometry / spelling.

    For each match, joins the Overture POI's point against the rated
    OSM snapshot for any *named* feature whose centroid lies within
    ``radius_m``. If any such feature's ``name`` has a
    ``token_set_ratio`` against the Overture name that is
    ≥ ``name_similarity_threshold``, the match is dropped — the POI
    isn't gone, the primary matcher just missed it.

    Parameters:
        matches: Shadow-match table; ``overture_idx`` holds positional
            indices into ``unmatched_overture``.
        unmatched_overture: Overture rows with point geometry and a
            ``name`` column.
        rated_snapshot_path: Parquet file of the current OSM snapshot;
            must expose ``name`` and ``geometry`` columns.
        radius_m: Search radius around each Overture point.
        name_similarity_threshold: Minimum ``token_set_ratio`` (0-100)
            that counts as "same POI still present".
        test_bbox: Optional ``xmin``/``ymin``/``xmax``/``ymax`` dict
            bounding the snapshot rows considered. When omitted, the
            extent of the matched Overture points padded by 0.01
            degrees is used.
        duckdb_memory_limit: Value passed to DuckDB's ``memory_limit``.
        verbose: Print progress lines when true.

    Returns:
        ``(kept_matches, n_dropped)`` where ``kept_matches`` has a fresh
        ``RangeIndex`` whenever at least one row was dropped.
    """
    if matches.empty:
        return matches.copy(), 0

    ov_idx_arr = matches["overture_idx"].to_numpy().astype(int)

    if verbose:
        print(
            f" R1 (current-OSM-survivor): radius="
            f"{radius_m}m, name>={name_similarity_threshold}"
        )

    # Only the matched Overture rows take part in the join.
    match_lons = unmatched_overture.geometry.x.to_numpy()[ov_idx_arr]
    match_lats = unmatched_overture.geometry.y.to_numpy()[ov_idx_arr]
    bbox = test_bbox or {
        "xmin": float(np.min(match_lons)) - 0.01,
        "ymin": float(np.min(match_lats)) - 0.01,
        "xmax": float(np.max(match_lons)) + 0.01,
        "ymax": float(np.max(match_lats)) + 0.01,
    }

    ov_subset = pd.DataFrame({
        "match_idx": np.arange(len(matches)),
        "ov_name": _to_str_array(
            unmatched_overture["name"]
        )[ov_idx_arr],
        "ov_lon": match_lons,
        "ov_lat": match_lats,
    })

    # Escape single quotes so an unusual path cannot break the SQL
    # string literal.
    snapshot_literal = str(rated_snapshot_path).replace("'", "''")

    con = duckdb.connect()
    try:
        con.execute(f"SET memory_limit = '{duckdb_memory_limit}'")
        con.execute("INSTALL spatial; LOAD spatial;")
        # Register the frame as an in-memory view instead of writing a
        # fixed-name scratch file: no leftover file, and concurrent
        # runs cannot clobber each other's input.
        con.register("cd_r1_ov", ov_subset)
        # NOTE(review): DuckDB's docs describe ST_Distance_Sphere as
        # expecting [latitude, longitude] axis order, while the points
        # here are built as (lon, lat) — confirm the returned
        # distances are what radius_m assumes.
        nearby = con.execute(f"""
            SELECT ov.match_idx, ov.ov_name,
                   s.name AS osm_name,
                   ST_Distance_Sphere(
                       ST_Point(ov.ov_lon, ov.ov_lat),
                       ST_Centroid(s.geometry)
                   ) AS dist_m
            FROM cd_r1_ov ov
            JOIN read_parquet('{snapshot_literal}') s
              ON ST_Distance_Sphere(
                     ST_Point(ov.ov_lon, ov.ov_lat),
                     ST_Centroid(s.geometry)
                 ) <= {float(radius_m)}
             AND ST_X(ST_Centroid(s.geometry))
                 BETWEEN {float(bbox['xmin'])} AND {float(bbox['xmax'])}
             AND ST_Y(ST_Centroid(s.geometry))
                 BETWEEN {float(bbox['ymin'])} AND {float(bbox['ymax'])}
            WHERE s.name IS NOT NULL
        """).fetch_df()
    finally:
        # Release the connection even when the query fails.
        con.close()

    if verbose:
        print(
            f" {len(nearby):,} nearby-OSM candidate rows; "
            f"computing token_set_ratio ..."
        )

    if nearby.empty:
        return matches.copy(), 0

    # Unnamed OSM features were excluded in SQL, so no "None"/"nan"
    # placeholder strings reach the fuzzy comparison.
    similarity = np.fromiter(
        (
            fuzz.token_set_ratio(ov_name, osm_name)
            for ov_name, osm_name in zip(
                nearby["ov_name"].astype(str),
                nearby["osm_name"].astype(str),
            )
        ),
        dtype = float,
        count = len(nearby),
    )
    suppress_idx = (
        nearby.loc[similarity >= name_similarity_threshold, "match_idx"]
        .unique()
    )

    if verbose:
        print(f" R1 suppressed: {len(suppress_idx)}")

    if not len(suppress_idx):
        return matches.copy(), 0

    # match_idx is positional within ``matches`` (np.arange above).
    keep_mask = np.ones(len(matches), dtype = bool)
    keep_mask[suppress_idx.astype(int)] = False
    kept = matches.loc[keep_mask].reset_index(drop = True)
    return kept, int(len(suppress_idx))
352+
353+
200354
def apply_shadow_match(
201355
conflated_path: Path,
202356
ghosts_path: Path,
@@ -212,6 +366,9 @@ def apply_shadow_match(
212366
identifier_weight: float,
213367
default_delta: float,
214368
test_bbox: dict | None = None,
369+
rated_snapshot_path: Path | None = None,
370+
survivor_filter: dict | None = None,
371+
min_prior_name_match_score: float = 0.0,
215372
verbose: bool = True,
216373
) -> dict:
217374
"""Post-process a conflated dataset with the change-detection penalty.
@@ -316,9 +473,40 @@ def apply_shadow_match(
316473
name_weight = name_weight,
317474
type_weight = type_weight,
318475
identifier_weight = identifier_weight,
476+
min_prior_name_match_score = min_prior_name_match_score,
477+
)
478+
if verbose:
479+
print(
480+
f" Shadow matches (pre-survivor-filter): "
481+
f"{len(matches):,}"
482+
)
483+
484+
# -- Current-OSM-survivor filter ----------------------------------
485+
n_survivor_dropped = 0
486+
if (
487+
survivor_filter
488+
and bool(survivor_filter.get("enabled", False))
489+
and rated_snapshot_path is not None
490+
and len(matches) > 0
491+
):
492+
if verbose:
493+
print("Applying current-OSM-survivor filter ...")
494+
matches, n_survivor_dropped = apply_current_survivor_filter(
495+
matches = matches,
496+
unmatched_overture = unmatched_ov,
497+
rated_snapshot_path = rated_snapshot_path,
498+
radius_m = float(survivor_filter.get("radius_m", 50)),
499+
name_similarity_threshold = float(
500+
survivor_filter.get("name_similarity_threshold", 70)
501+
),
502+
test_bbox = test_bbox,
503+
verbose = verbose,
319504
)
320505
if verbose:
321-
print(f" Shadow matches: {len(matches):,}")
506+
print(
507+
f" Shadow matches (post-survivor-filter): "
508+
f"{len(matches):,} (dropped {n_survivor_dropped})"
509+
)
322510

323511
# -- Build audit columns -------------------------------------------
324512
n = len(conflated)
@@ -409,6 +597,7 @@ def apply_shadow_match(
409597
"n_unmatched_overture": int(len(ov_global_idx)),
410598
"n_ghosts": int(len(ghosts)),
411599
"n_shadow_matches": int(len(matches)),
600+
"n_survivor_dropped": int(n_survivor_dropped),
412601
"mean_penalty_factor": (
413602
float(
414603
(new_conf_mean[shadow_matched]

0 commit comments

Comments
 (0)