Skip to content

Commit 174a61b

Browse files
committed
Add new dimension to conflation: change detection for deleted points.
1 parent 5313b2f commit 174a61b

12 files changed

Lines changed: 2241 additions & 17 deletions

File tree

config.yaml

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,15 @@
11
# Versioned directories (used with config.get_dir_path())
22
versions:
3-
osm_data: "20260416"
3+
osm_data: "20260515"
44
model_output: "20260422_by_shared_label"
55
snapshot_osm: "20260417"
66
snapshot_overture: "20260423"
77
conflation: "20260423"
88
source_coop: "2026-04-23-v0" # Source Cooperative upload folder (YYYY-MM-DD-v<IDX>); bump v<IDX> only for same-day re-uploads
9+
# Ghost POI dataset reconstructed from OSM history (one row per
10+
# detected previous-state event). Pinned to the same value as
11+
# ``osm_data`` since it is derived from the same history parquets.
12+
ghost_osm: "20260515"
913

1014
# Settings for downloading data
1115
download:
@@ -188,6 +192,11 @@ directories:
188192
partitioned: conflated_partitioned
189193
pmtiles: conflated.pmtiles
190194
summary_by_label: summary_by_label.csv
195+
ghost_osm:
196+
versioned: true
197+
path: ~/data/openpois/ghost_osm
198+
files:
199+
ghosts: ghosts.parquet
191200
testing:
192201
versioned: false
193202
path: ~/data/openpois/testing
@@ -222,6 +231,23 @@ conflation:
222231
ymin: 47.50
223232
xmax: -122.25
224233
ymax: 47.70
234+
# Change-detection feature: use OSM history to penalize Overture POIs
235+
# that co-locate with a "ghost" — a previous state of an OSM element
236+
# (primary-tag deletion, lifecycle-prefix addition, or substantial
237+
# rename). Disabled by default for clean A/B testing.
238+
change_detection:
239+
enabled: false
240+
# Minimum composite score for an Overture × ghost shadow match.
241+
# Same scale as the main matcher's min_match_score.
242+
min_shadow_match_score: 0.50
243+
# rapidfuzz.fuzz.token_set_ratio threshold below which an OSM name
244+
# change is considered a "substantial rename" rather than a typo
245+
# fix. Range 0-100. Lower = stricter (fewer events emitted).
246+
name_change_similarity_threshold: 50
247+
# Fallback delta for ghosts whose shared_label isn't in the fitted
248+
# model's per-group params. Equals sigmoid(logit_delta_0) for the
249+
# current 20260422_by_shared_label fit (logit_delta_0 = -2.72).
250+
default_delta: 0.062
225251

226252
# Settings for publishing snapshots to Source Cooperative
227253
# (https://source.coop/henryspatialanalysis/openpois). Source Coop is
Lines changed: 161 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,161 @@
1+
#!/usr/bin/env python
2+
"""
3+
Apply the change-detection penalty to a baseline conflated dataset.
4+
5+
Reads:
6+
- the baseline ``conflated.parquet`` (no change detection)
7+
- ``ghosts.parquet`` (from ``scripts/conflation/build_ghosts.py``)
8+
- ``fitted_params.csv`` for the active ``model_output`` version
9+
10+
Writes a new conflated parquet (suffix ``_cd`` by default) whose
11+
unmatched-Overture rows have had ``conf_mean`` re-weighted by
12+
``δ_group`` for any spatial+name+taxonomy match against a ghost. Audit
13+
columns are appended (``shadow_*`` + ``original_conf_mean``) so the
14+
demoted rows can be inspected by hand.
15+
16+
Usage:
17+
python scripts/conflation/apply_change_detection.py \
18+
--baseline-suffix=baseline --output-suffix=cd [--test]
19+
20+
Both ``--baseline-suffix`` and ``--output-suffix`` are inserted into
21+
the conflated filename before ``.parquet`` (e.g. ``conflated_cd.parquet``).
22+
"""
23+
from __future__ import annotations
24+
25+
import argparse
26+
import time
27+
from pathlib import Path
28+
29+
from config_versioned import Config
30+
31+
from openpois.conflation.change_detection import apply_shadow_match
32+
33+
34+
def _suffixed_path(base_path: Path, suffix: str | None) -> Path:
35+
"""Insert ``suffix`` before the parquet extension."""
36+
if not suffix:
37+
return base_path
38+
return base_path.with_name(
39+
f"{base_path.stem}_{suffix}{base_path.suffix}"
40+
)
41+
42+
43+
def main() -> None:
    """Run the change-detection pass over a baseline conflated parquet.

    Steps:
      1. Parse ``--baseline-suffix``, ``--output-suffix`` and ``--test``.
      2. Resolve the baseline, ghost, fitted-params and output paths and
         the matcher settings from ``~/repos/openpois/config.yaml``.
      3. Call ``apply_shadow_match`` and print the summary it returns.

    Exits through ``parser.error`` (status 2) when both suffixes resolve
    to the same file, so the baseline parquet is never overwritten in
    place by its own penalized output.
    """
    parser = argparse.ArgumentParser(
        description = (
            "Apply change-detection penalty to a baseline conflated "
            "dataset using OSM-history-derived ghost POIs."
        )
    )
    parser.add_argument(
        "--baseline-suffix",
        default = "baseline",
        help = (
            "Suffix inserted into the input parquet filename "
            "(default: 'baseline' → conflated_baseline.parquet). "
            "Pass an empty string to read conflated.parquet directly."
        ),
    )
    parser.add_argument(
        "--output-suffix",
        default = "cd",
        help = (
            "Suffix inserted into the output parquet filename "
            "(default: 'cd' → conflated_cd.parquet)."
        ),
    )
    parser.add_argument(
        "--test",
        action = "store_true",
        help = (
            "Restrict ghosts to the configured conflation.test_bbox. "
            "Use when the baseline was produced with --test."
        ),
    )
    args = parser.parse_args()

    config = Config("~/repos/openpois/config.yaml")

    # Input and output share one configured base filename and differ
    # only by the suffix inserted before the extension.
    conflated_base = config.get_file_path("conflation", "conflated")
    baseline_path = _suffixed_path(
        conflated_base, args.baseline_suffix,
    )
    output_path = _suffixed_path(
        conflated_base, args.output_suffix,
    )
    if output_path == baseline_path:
        # Identical suffixes (e.g. both empty) would make the output
        # clobber the file it was read from.
        parser.error(
            "--baseline-suffix and --output-suffix both resolve to "
            f"{baseline_path}; choose different suffixes so the "
            "baseline is not overwritten."
        )
    ghosts_path = config.get_file_path("ghost_osm", "ghosts")

    model_dir = Path(config.get_dir_path("model_output"))
    fitted_params_path = model_dir / config.get(
        "directories", "model_output", "files", "fitted_params",
    )

    # Change-detection specific settings.
    cd_cfg = config.get("conflation", "change_detection")
    min_match_score = float(cd_cfg["min_shadow_match_score"])
    default_delta = float(cd_cfg["default_delta"])

    # Radius and score weights are read from the top-level
    # ``conflation`` section.
    max_radius_m = float(config.get("conflation", "max_radius_m"))
    default_radius_m = float(
        config.get("conflation", "default_radius_m")
    )
    distance_weight = float(config.get("conflation", "distance_weight"))
    name_weight = float(config.get("conflation", "name_weight"))
    type_weight = float(config.get("conflation", "type_weight"))
    identifier_weight = float(
        config.get("conflation", "identifier_weight")
    )

    # Only pass a bounding box when --test was requested.
    test_bbox = (
        config.get("conflation", "test_bbox") if args.test else None
    )

    print(f"Baseline: {baseline_path}")
    print(f"Ghosts: {ghosts_path}")
    print(f"Fitted params: {fitted_params_path}")
    print(f"Output: {output_path}")
    print(
        f"min_match_score={min_match_score} "
        f"max_radius_m={max_radius_m} "
        f"default_delta={default_delta}"
    )
    if args.test:
        print(f"Test bbox: {test_bbox}")

    # perf_counter is monotonic, so the reported duration cannot be
    # skewed by wall-clock adjustments during a long run.
    t0 = time.perf_counter()
    summary = apply_shadow_match(
        conflated_path = baseline_path,
        ghosts_path = ghosts_path,
        fitted_params_path = fitted_params_path,
        output_path = output_path,
        min_match_score = min_match_score,
        max_radius_m = max_radius_m,
        default_radius_m = default_radius_m,
        distance_weight = distance_weight,
        name_weight = name_weight,
        type_weight = type_weight,
        identifier_weight = identifier_weight,
        default_delta = default_delta,
        test_bbox = test_bbox,
    )
    elapsed = time.perf_counter() - t0

    print(f"\nApplied change-detection in {elapsed:.0f}s")
    print(f" Total conflated rows: {summary['n_total']:,}")
    print(
        f" Unmatched Overture rows: "
        f"{summary['n_unmatched_overture']:,}"
    )
    print(f" Ghosts considered: {summary['n_ghosts']:,}")
    print(
        f" Shadow matches: "
        f"{summary['n_shadow_matches']:,}"
    )
    print(
        f" Mean penalty factor (Δ/old): "
        f"{summary['mean_penalty_factor']:.4f}"
    )
    print(f" Output: {output_path}")
158+
159+
160+
if __name__ == "__main__":
161+
main()

scripts/conflation/build_ghosts.py

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
#!/usr/bin/env python
2+
"""
3+
Build the ghost-OSM POI dataset from OSM history.
4+
5+
A ghost is a previous state of an OSM node that we believe no longer
6+
reflects ground truth (primary tag deleted, lifecycle prefix added, or
7+
substantial rename). The output Parquet feeds the change-detection
8+
pass in ``scripts/conflation/apply_change_detection.py``.
9+
10+
Config keys used (config.yaml):
11+
versions.osm_data, versions.ghost_osm — pinned together
12+
directories.osm_data.osm_versions
13+
directories.osm_data.osm_changes
14+
directories.ghost_osm.ghosts
15+
download.osm.filter_keys — POI tag keys
16+
conflation.change_detection.name_change_similarity_threshold
17+
18+
Usage:
19+
python scripts/conflation/build_ghosts.py
20+
"""
21+
from __future__ import annotations
22+
23+
import time
24+
25+
from config_versioned import Config
26+
27+
from openpois.conflation.ghost_osm import build_ghosts
28+
29+
30+
def main() -> None:
    """Build the ghost dataset from OSM history and write it to Parquet.

    Reads the input/output paths, the POI tag keys and the name-change
    similarity threshold from ``~/repos/openpois/config.yaml``, calls
    ``build_ghosts``, prints a short summary of the result, writes it
    with zstd compression, and finally calls
    ``config.write_self("ghost_osm")``.
    """
    config = Config("~/repos/openpois/config.yaml")

    # Resolve inputs, output and parameters from the config.
    versions_path = config.get_file_path("osm_data", "osm_versions")
    changes_path = config.get_file_path("osm_data", "osm_changes")
    output_path = config.get_file_path("ghost_osm", "ghosts")
    filter_keys = config.get("download", "osm", "filter_keys")
    raw_threshold = config.get(
        "conflation", "change_detection",
        "name_change_similarity_threshold",
    )
    name_threshold = float(raw_threshold)

    print(f"Versions path: {versions_path}")
    print(f"Changes path: {changes_path}")
    print(f"Output path: {output_path}")
    print(f"POI keys: {filter_keys}")
    print(f"Name similarity threshold: {name_threshold}")

    started = time.time()
    ghosts = build_ghosts(
        versions_path = versions_path,
        changes_path = changes_path,
        poi_keys = filter_keys,
        name_change_similarity_threshold = name_threshold,
    )
    elapsed = time.time() - started
    n_ghosts = len(ghosts)
    print(f"\nBuilt {n_ghosts:,} ghosts in {elapsed:.0f}s")

    if n_ghosts:
        # Most frequent event types first; a stable sort keeps the
        # original order among equal counts.
        counts_by_event = ghosts["event_type"].value_counts().to_dict()
        ranked = sorted(
            counts_by_event.items(),
            key = lambda kv: kv[1],
            reverse = True,
        )
        print("Event-type breakdown:")
        for et, n in ranked:
            print(f" {et}: {n:,}")

    # NOTE(review): placed outside the ``if`` because of the
    # ``max(..., 1)`` division guard — confirm against the original
    # indentation.
    sl_total = int((ghosts["shared_label"] != "").sum())
    sl_pct = 100 * sl_total / max(n_ghosts, 1)
    print(
        f"shared_label assigned: {sl_total:,}/{n_ghosts:,} "
        f"({sl_pct:.1f}%)"
    )

    output_path.parent.mkdir(parents = True, exist_ok = True)
    ghosts.to_parquet(output_path, compression = "zstd")
    print(f"\nWrote {output_path}")
    config.write_self("ghost_osm")
79+
80+
81+
if __name__ == "__main__":
82+
main()

scripts/conflation/conflate.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -274,7 +274,23 @@ def _load_gdf(
274274
"a debug/baseline option."
275275
),
276276
)
277+
parser.add_argument(
278+
"--output-suffix",
279+
default = "",
280+
help = (
281+
"If set, inserts ``_<suffix>`` before .parquet in the "
282+
"output filename (e.g. --output-suffix=baseline writes "
283+
"conflated_baseline.parquet). Used by the change-detection "
284+
"A/B testing workflow."
285+
),
286+
)
277287
args = parser.parse_args()
288+
289+
if args.output_suffix:
290+
OUTPUT_PATH = OUTPUT_PATH.with_name(
291+
f"{OUTPUT_PATH.stem}_{args.output_suffix}"
292+
f"{OUTPUT_PATH.suffix}"
293+
)
278294
t0 = time.time()
279295

280296
test_bbox = TEST_BBOX if args.test else None

0 commit comments

Comments
 (0)