-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathconfig.yaml
More file actions
442 lines (436 loc) · 20 KB
/
config.yaml
File metadata and controls
442 lines (436 loc) · 20 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
# Versioned directories (used with config.get_dir_path())
# Every key except source_coop shares its name with an entry under
# `directories` that is flagged `versioned: true`. Values are quoted so the
# all-digit date stamps stay strings instead of being parsed as integers.
versions:
osm_data: "20260521"
# NOTE(review): model_output carries a different tag from the 20260521 data
# stamps — presumably the model is refit less often than the data is
# refreshed; confirm before bumping.
model_output: "20260422_by_shared_label"
snapshot_osm: "20260521"
snapshot_overture: "20260521"
conflation: "20260521"
source_coop: "2026-05-21-v0" # Source Cooperative upload folder (YYYY-MM-DD-v<IDX>); bump v<IDX> only for same-day re-uploads
# Ghost POI dataset reconstructed from OSM history (one row per
# detected previous-state event). Pinned to the same value as
# ``osm_data`` since it is derived from the same history parquets,
# and regenerated together with the monthly snapshot refresh.
ghost_osm: "20260521"
# Settings for downloading data
download:
general:
# NOTE(review): no unit is stated for this timeout — presumably seconds;
# confirm against the download code. `1_000` relies on YAML 1.1 digit
# separators (PyYAML reads it as the integer 1000); a strict YAML 1.2
# core-schema loader would return the string "1_000" instead.
timeout: 1_000
# Census 1:5M cartographic state boundary file; includes 50 states + DC
# + 5 inhabited US territories (PR, USVI, GU, MP, AS). The 1:20M variant
# used previously only covered states + DC + PR. Used by all three
# snapshot downloads to restrict POIs to the US footprint. The coastline
# buffer expands the dissolved polygon outward by N metres so near-shore
# POIs are retained; internal state borders disappear on dissolve so the
# buffer only affects the coast.
boundary:
# Zipped shapefile from the Census GENZ2023 cartographic-boundary directory.
source_url: "https://www2.census.gov/geo/tiger/GENZ2023/shp/cb_2023_us_state_5m.zip"
# Outward buffer distance in metres (the "N metres" described above).
coastline_buffer_m: 100
# OpenStreetMap download settings: snapshot and full-history PBF sources plus
# the tag keys used to filter and extract POIs.
osm:
# Unquoted ISO dates: YAML 1.1 loaders such as PyYAML turn these into date
# objects rather than strings. NOTE(review): presumably the observation
# window applied when time-filtering the history PBFs — confirm.
start_date: 2016-01-01
end_date: 2025-12-31
# Snapshot extracts. Geofabrik publishes a dedicated PBF for PR and USVI
# under north-america/us/, but the western-Pacific inhabited territories
# (Guam, Northern Mariana Islands, American Samoa) have no per-territory
# files. They are bundled into a single `american-oceania` extract that
# also includes the uninhabited US Pacific possessions (Wake, Midway,
# Howland, Baker, Jarvis, Palmyra, Kingman) — those contribute near-zero
# POIs and are accepted as bonus coverage.
pbf_url: "https://download.geofabrik.de/north-america/us-latest.osm.pbf"
pr_pbf_url: "https://download.geofabrik.de/north-america/us/puerto-rico-latest.osm.pbf"
usvi_pbf_url: "https://download.geofabrik.de/north-america/us/us-virgin-islands-latest.osm.pbf"
american_oceania_pbf_url: "https://download.geofabrik.de/australia-oceania/american-oceania-latest.osm.pbf"
# Full-history PBFs live on Geofabrik's OAuth-protected internal server.
# Any OSM account grants access; generate a Netscape-format cookie jar by
# logging in at https://osm-internal.download.geofabrik.de/ and exporting
# cookies, or by running Geofabrik's oauth_cookie_client.py. The
# `usvi_*` and `american_oceania_*` history URLs follow the same path
# convention as the snapshot URLs; if any are missing on the server, the
# history loader logs a warning and continues without that territory's
# history (the rater then falls back to the global-mean delta).
history_pbf_url: "https://osm-internal.download.geofabrik.de/north-america/us-internal.osh.pbf"
pr_history_pbf_url: "https://osm-internal.download.geofabrik.de/north-america/us/puerto-rico-internal.osh.pbf"
usvi_history_pbf_url: "https://osm-internal.download.geofabrik.de/north-america/us/us-virgin-islands-internal.osh.pbf"
american_oceania_history_pbf_url: "https://osm-internal.download.geofabrik.de/australia-oceania/american-oceania-internal.osh.pbf"
# Cookie jar used for the OAuth-protected history downloads described above.
history_cookie_file: "~/data/openpois/.creds/geofabrik_cookies.txt"
# NOTE(review): presumably each flag forces the matching pipeline stage
# (download / filter / parse) to rerun even when its output already exists —
# confirm against the download scripts.
overwrite_download: true
overwrite_filter: true
overwrite_parse: true
source_label: "osm"
keep_all_keys: false
# `100_000` / `1_000` use YAML 1.1 digit separators (integers under PyYAML).
chunk_size: 100_000
max_area_nodes: 1_000
verbose: true
# Every key in filter_keys also appears in extract_keys below.
filter_keys: ['shop', 'healthcare', 'leisure', 'amenity', 'tourism', 'office', 'craft', 'historic']
# NOTE(review): 'geometry', 'osm_id' and 'osm_type' are listed alongside OSM
# tag keys — presumably output columns rather than tags; confirm.
extract_keys: [
'addr:city','addr:country','addr:housename','addr:housenumber','addr:postcode',
'addr:state','addr:street','addr:unit','access','amenity','atm','bar','bicycle',
'bicycle_parking','brand','brand:wikidata','building','check_date',
'check_date:opening_hours','craft','cuisine','description','education','email',
'emergency','fountain','geometry','healthcare','healthcare:speciality','historic',
'image','landuse','leisure','name','nursery','office','official_name','old_name',
'opening_date','opening_hours','osm_id','osm_type','phone','playground','preschool',
'recycling_type','religion','self_service','service','shelter','shop','short_name',
'social_facility','social_facility:for','source','sport','tourism','type','url',
'website','wikidata','wikipedia'
]
# Overture Maps download settings: pinned release, S3 location, DuckDB
# resource caps and the category allowlist.
overture:
release_date: "2026-05-20.0" # pin for determinism; null = auto-detect latest
s3_bucket: "overturemaps-us-west-2"
s3_region: "us-west-2"
# DuckDB resource caps for the per-part S3 scans and the final polygon
# filter. Peak host RAM ~= workers * memory_limit, peak CPU ~= workers *
# threads. Scale per-worker values down if raising workers above 1.
# With the values below that formula gives roughly 2 x 4GB = 8GB of RAM and
# 2 x 2 = 4 threads at peak.
duckdb:
memory_limit: "4GB"
threads: 2
workers: 2
# (L0, L1) allowlist. L1 = null means "all of this L0".
# Entries intentionally exclude office/B2B-style L1s (corporate offices,
# media services, etc.), transit/parking/airports (covered elsewhere), and
# private lodging (Airbnb-style — duplicates residential addresses).
# Each entry is a two-element flow list: [L0, L1].
taxonomy_allowlist:
- [food_and_drink, null]
- [shopping, null]
- [arts_and_entertainment, null]
- [sports_and_recreation, null]
- [health_care, null]
- [lodging, null]
- [cultural_and_historic, null]
- [education, null]
- [lifestyle_services, personal_or_beauty_service]
- [lifestyle_services, wellness_service]
- [lifestyle_services, animal_or_pet_service]
- [lifestyle_services, beauty_service]
- [lifestyle_services, food_service]
- [services_and_business, financial_service]
- [services_and_business, legal_service]
- [services_and_business, professional_service]
- [services_and_business, real_estate_service]
- [services_and_business, home_service]
- [services_and_business, family_service]
- [community_and_government, social_or_community_service]
- [community_and_government, government_office]
- [community_and_government, civic_organization]
- [community_and_government, public_facility]
- [community_and_government, public_safety_service]
- [travel_and_transportation, fueling_station]
- [travel_and_transportation, vehicle_service]
# Census reference areas for the msa_code + urban_rural indicators. The
# cartographic-boundary (cb_*_500k) line is used because Census publishes
# single national "us" files there for both CBSA and Place. Place population
# comes from the 2020 Decennial (covers CDPs, which PEP omits) via the Census
# API; it requires a free key (CENSUS_API_KEY in the env or ~/.Renviron) and
# is cached to a static CSV on first run.
census_areas:
cbsa_url: "https://www2.census.gov/geo/tiger/GENZ2023/shp/cb_2023_us_cbsa_500k.zip"
place_url: "https://www2.census.gov/geo/tiger/GENZ2023/shp/cb_2023_us_place_500k.zip"
# Shapefile names inside the two zips above; they match the file names
# listed under directories.census_areas.files.
cbsa_shp_name: "cb_2023_us_cbsa_500k.shp"
place_shp_name: "cb_2023_us_place_500k.shp"
# Requests NAME and P1_001N for every place in every state. The API key is
# not part of this URL. NOTE(review): presumably appended at request time —
# confirm.
population_api_url: "https://api.census.gov/data/2020/dec/pl?get=NAME,P1_001N&for=place:*&in=state:*"
# Settings for OSM exploratory data analysis
osm_data:
tag_key: name
top_n_types: 10
# Block sequence of column names. NOTE(review): presumably the columns parsed
# as timestamps when loading observations — confirm against the EDA scripts.
timestamp_cols:
- obs_timestamp
- last_obs_timestamp
- last_tag_timestamp
apply_model:
# Quoted so the all-digit stub stays a string. It equals the date prefix of
# versions.model_output ("20260422_by_shared_label").
model_stub: '20260422'
# Settings for scripts/models/osm_turnover.py (JAX turnover model)
osm_turnover_model:
# Overridable at the CLI via --model-type
# {constant,constant_breakpoint,random_by_type,random_effects}.
default_model_type: constant
# NOTE(review): presumably a (loc, scale) pair like the other priors in this
# section — confirm which parameter it applies to.
var_prior: [-1.0, 5.0]
# Time-varying λ — only used by --model-type constant_breakpoint. Log-normal
# prior on the breakpoint age t_B (years since the tag was established):
# (loc, scale) on log t_B; loc = 0 → median 1 year.
t_breakpoint_prior: [0.0, 1.0]
# Tight hyperprior on log_tau (random-effect scale for per-group logit_delta
# in RandomByTypeModel). Tau median ≈ exp(-2) ≈ 0.135 on the logit scale —
# shrinks per-group δ toward the global intercept.
logit_delta_var_prior: [-2.0, 0.5]
# Column in osm_observations.parquet for grouping random effects.
# "shared_label" = shared taxonomy category
group_key: shared_label
# NOTE(review): presumably null means "no restriction on group values" and
# min_value_count is the minimum observations a group needs — confirm.
group_values: null
min_value_count: 5
# NUTS warmup (window adaptation) and retained-sample counts. Warmup should
# generally be >= n_samples for hierarchical models.
# The values below are equal, so they satisfy that guideline.
n_warmup: 500
n_samples: 500
# Number of independent chains (vmapped in parallel). n_chains > 1 enables
# R-hat and bulk ESS diagnostics at roughly linear wall-time cost on CPU.
n_chains: 4
save_full_model: true
# ---- random_effects model (multi-term additive random intercepts) --------
# Used only when --model-type random_effects. Every hyperparameter the model
# uses is wired from this block — no magic numbers in the script or model
# body. Each term is included iff enabled: true, so any combination composes.
# var_prior / prior are (loc, scale) on the relevant log_sigma or coefficient.
random_effects:
# Minimum DISTINCT POIs in an (amenity, MSA) cell to warrant its own
# interaction level; smaller cells fall back on the main effects.
interaction_min_count: 100
# λ terms. All four are currently enabled.
terms:
amenity:
enabled: true
column: shared_label
var_prior: [-1.0, 5.0] # matches the legacy random_by_type prior
msa:
enabled: true
column: msa_code
var_prior: [-1.0, 1.0]
# The interaction term takes `columns` (a list) where the others take `column`.
amenity_msa:
enabled: true
columns: [shared_label, msa_code]
var_prior: [-2.0, 0.5] # tight: shrink interactions toward zero
# urbanicity uses `prior` (on the coefficients) rather than `var_prior`.
urbanicity:
enabled: true
column: urban_rural # urban = reference; suburban/rural coeffs
prior: [0.0, 1.0]
# δ (zero-inflation) random intercepts — composable and toggled
# independently of the λ terms above. Each enabled term adds a random
# intercept on logit δ (amenity grouped by shared_label, msa by msa_code),
# with its own log_tau scale (var_prior). No terms enabled → single global
# δ. Unseen levels back off to the global intercept, same as the λ terms.
delta_terms:
amenity:
enabled: true
column: shared_label
var_prior: [-2.0, 0.5] # tight, matches the legacy per-group δ
# Currently the only disabled term in this block.
msa:
enabled: false
column: msa_code
var_prior: [-2.0, 0.5]
# Directory definitions (used with config.get_dir_path())
# Each entry gives a base `path`, a `versioned` flag and, for most entries, a
# `files` map of logical name → file name. NOTE(review): presumably a
# versioned entry resolves to a subfolder named by the matching key under
# `versions` — confirm in config.get_dir_path().
directories:
osm_data:
versioned: true
path: ~/data/openpois/osm_data
files:
# US + territories full-history pipeline (PBF-based)
osm_changes: osm_changes.parquet
osm_versions: osm_versions.parquet
# raw_* names equal the basenames of the download.osm *_history_pbf_url
# values; filtered_* and time_filtered_* are the derived PBFs per region.
raw_history_pbf: us-internal.osh.pbf
filtered_history_pbf: us-pois.osh.pbf
time_filtered_history_pbf: us-pois-timefilt.osh.pbf
raw_pr_history_pbf: puerto-rico-internal.osh.pbf
filtered_pr_history_pbf: puerto-rico-pois.osh.pbf
time_filtered_pr_history_pbf: puerto-rico-pois-timefilt.osh.pbf
raw_usvi_history_pbf: us-virgin-islands-internal.osh.pbf
filtered_usvi_history_pbf: us-virgin-islands-pois.osh.pbf
time_filtered_usvi_history_pbf: us-virgin-islands-pois-timefilt.osh.pbf
raw_american_oceania_history_pbf: american-oceania-internal.osh.pbf
filtered_american_oceania_history_pbf: american-oceania-pois.osh.pbf
time_filtered_american_oceania_history_pbf: american-oceania-pois-timefilt.osh.pbf
# Per-region versions/changes tables (us, pr, usvi, american_oceania).
us_versions: us_osm_versions.parquet
us_changes: us_osm_changes.parquet
pr_versions: pr_osm_versions.parquet
pr_changes: pr_osm_changes.parquet
usvi_versions: usvi_osm_versions.parquet
usvi_changes: usvi_osm_changes.parquet
american_oceania_versions: american_oceania_osm_versions.parquet
american_oceania_changes: american_oceania_osm_changes.parquet
# Modelling-ready observations (one row per POI version × shared_label)
osm_observations: osm_observations.parquet
# Shared k-fold holdout assignment (id → fold), built once by
# scripts/models/make_holdout_folds.py and reused by every OOS run so
# all model specifications are scored on identical folds.
holdout_folds: holdout_folds.parquet
# Outputs of the OSM turnover model; versioned via versions.model_output.
model_output:
versioned: true
path: ~/data/openpois/osm_turnover_model
files:
fitted_params: fitted_params.csv
param_draws: param_draws.parquet
predictions: predictions.csv
diagnostics: diagnostics.csv
inference_data: inference_data.nc
metrics_summary: metrics_summary.csv
metrics_subgroup: metrics_subgroup.csv
# Long-form factor → level-name map (random_effects), used by the
# apply step to reconstruct per-cell curves from the fitted params.
factor_lookups: factor_lookups.csv
# Out-of-sample cross-validation outputs (scripts/models/osm_turnover_cv.py)
oos_metrics_per_fold: oos_metrics_per_fold.csv
oos_metrics_aggregate: oos_metrics_aggregate.csv
oos_metrics_subgroup: oos_metrics_subgroup.csv
# Current-state OSM snapshot files; versioned via versions.snapshot_osm.
snapshot_osm:
versioned: true
path: ~/data/openpois/snapshots/osm
files:
# raw_* names equal the basenames of the download.osm snapshot *_pbf_url
# values; each has a filtered_* POI-only counterpart.
raw_pbf: us-latest.osm.pbf
filtered_pbf: us-pois.osm.pbf
raw_pr_pbf: puerto-rico-latest.osm.pbf
filtered_pr_pbf: puerto-rico-pois.osm.pbf
raw_usvi_pbf: us-virgin-islands-latest.osm.pbf
filtered_usvi_pbf: us-virgin-islands-pois.osm.pbf
raw_american_oceania_pbf: american-oceania-latest.osm.pbf
filtered_american_oceania_pbf: american-oceania-pois.osm.pbf
snapshot: osm_snapshot.parquet
rated_snapshot: osm_snapshot_rated.parquet
# No file extension. NOTE(review): presumably a directory of partitioned
# files rather than a single file — confirm.
partitioned: osm_snapshot_partitioned
pmtiles: osm_snapshot.pmtiles
# Unversioned reference data. `boundary` declares no `files` map.
boundary:
versioned: false
path: ~/data/openpois/boundary
census_areas:
versioned: false
path: ~/data/openpois/census_areas
files:
# Shapefile names match download.census_areas.cbsa_shp_name / place_shp_name.
cbsa_shapefile: cb_2023_us_cbsa_500k.shp
place_shapefile: cb_2023_us_place_500k.shp
place_population: place_population_2020.csv
# Versioned outputs; each has a same-named key under `versions`.
snapshot_overture:
versioned: true
path: ~/data/openpois/snapshots/overture
files:
snapshot: overture_snapshot.parquet
conflation:
versioned: true
path: ~/data/openpois/conflation
files:
conflated: conflated.parquet
match_diagnostics: match_diagnostics.parquet
# No file extension. NOTE(review): presumably a directory — confirm.
partitioned: conflated_partitioned
pmtiles: conflated.pmtiles
summary_by_label: summary_by_label.csv
ghost_osm:
versioned: true
path: ~/data/openpois/ghost_osm
files:
ghosts: ghosts.parquet
# Unversioned test fixtures.
testing:
versioned: false
path: ~/data/openpois/testing
files:
osm_snippet: osm_snippet.csv
overture_snippet: overture_snippet.csv
# Small national-scale modelling fixture (~10k obs, 5 MSAs x 10 amenity
# types) built by scripts/exploratory/build_test_dataset.py.
test_observations: test_observations.parquet
# Settings for POI conflation
conflation:
overture_confidence_weight: 0.7
min_match_score: 0.50
# Radii are in metres (per the `_m` suffix).
max_radius_m: 200
default_radius_m: 100
# The four weights below sum to 1.0 (0.0 + 0.50 + 0.30 + 0.20); distance
# currently contributes nothing. NOTE(review): presumably these are the
# component weights of the composite match score that is compared against
# min_match_score — confirm against the matcher.
distance_weight: 0.0
name_weight: 0.50
type_weight: 0.30
identifier_weight: 0.20
# `500_000` / `200_000` use YAML 1.1 digit separators (integers under PyYAML).
chunk_size: 500_000
chunk_target_pois: 200_000
# Overture-internal deduplication runs before OSM × Overture
# matching. Self-matches Overture POIs, groups them into clusters,
# and drops non-winners so they never reach the cross-source stage.
# See ``openpois.conflation.dedup_overture.mark_no_conflate``.
overture_internal_dedup:
enabled: true
# Stricter than the cross-source settings above: score 0.75 vs 0.50 and
# radius 100 m vs 200 m.
min_match_score: 0.75
max_radius_m: 100
chunk_target_pois: 200_000
duckdb_memory_limit: "4GB"
# Bounding box with x from -122.45 to -122.25 and y from 47.50 to 47.70.
# NOTE(review): presumably lon/lat degrees (this extent covers central
# Seattle, WA) used for small test runs — confirm.
test_bbox:
xmin: -122.45
ymin: 47.50
xmax: -122.25
ymax: 47.70
# Change-detection feature: use OSM history to penalize Overture POIs
# that co-locate with a "ghost" — a previous state of an OSM element
# (primary-tag deletion, lifecycle-prefix addition, or substantial
# rename). Disabled by default for clean A/B testing.
change_detection:
enabled: false
# Minimum composite score for an Overture × ghost shadow match.
# Same scale as the main matcher's min_match_score.
min_shadow_match_score: 0.50
# rapidfuzz.fuzz.token_set_ratio threshold below which an OSM name
# change is considered a "substantial rename" rather than a typo
# fix. Range 0-100. Lower = stricter (fewer events emitted).
name_change_similarity_threshold: 50
# Fallback delta for ghosts whose shared_label isn't in the fitted
# model's per-group params. Equals sigmoid(logit_delta_0) for the
# current 20260422_by_shared_label fit (logit_delta_0 = -2.72).
# Check: 1 / (1 + e^2.72) ≈ 0.062. Recompute if the model is refit.
default_delta: 0.062
# Hard gate on Overture-name vs ghost-prior-name token_set_ratio
# (0-100), applied *before* the composite-score-based shadow
# matcher. The default 0 keeps the loose matcher: any spatial +
# type + composite match above ``min_shadow_match_score`` will
# fire, even when Overture's name doesn't lexically match the
# OSM ghost. This is intentional. A higher value would only
# fire when Overture is showing the *same name* OSM closed,
# which we explored in May 2026 (decision rule A) and rejected
# because it loses the bulk of real closures where Overture has
# already updated to a different current name at a churned
# address (Sleep Train → Roosevelt Square etc.). Knob retained
# for future data-quality-only modes; leave at 0 for production
# change detection.
min_prior_name_match_score: 0
suppress_if_current_survivor:
# Belt-and-suspenders post-filter: drop the penalty if a
# *current* OSM POI within radius_m has name token_set_ratio
# >= threshold against the Overture name. Catches cases where
# the POI is still in OSM under different geometry (e.g.,
# node remapped to a building way) and the primary matcher
# missed it. Kept enabled because it's cheap and orthogonal
# to min_prior_name_match_score.
enabled: true
# radius_m is in metres; name_similarity_threshold is on the same 0-100
# token_set_ratio scale as the thresholds above.
radius_m: 50
name_similarity_threshold: 70
# Settings for publishing snapshots to Source Cooperative
# (https://source.coop/henryspatialanalysis/openpois). Source Coop is
# S3-compatible: uploads go to the literal bucket name below with keys
# prefixed by ``{repo_prefix}/``; public reads are served at
# ``{public_base_url}/``.
publish:
bucket: "us-west-2.opendata.source.coop"
repo_prefix: "henryspatialanalysis/openpois"
public_base_url: "https://data.source.coop/henryspatialanalysis/openpois"
# NOTE(review): this path sits inside the repo checkout — confirm the file is
# git-ignored so credentials are never committed.
credentials_file: "~/repos/openpois/.env.json"
geohash_precision_sort: 6 # ~0.6 km x 1.2 km; within-partition sort key for spatial row-group pruning
# Values surfaced in the per-version README. Set each round until the
# pipeline captures them automatically — see .claude/TODO.md.
# Keep these in step with versions.snapshot_osm and
# download.overture.release_date whenever those are bumped; they were left at
# the previous round's values (2026-04-17 / 2026-04-15.0) and are corrected
# here to match the 20260521 snapshot and the pinned 2026-05-20.0 release.
version_metadata:
osm_snapshot_date: "2026-05-21" # YYYY-MM-DD Geofabrik download date
overture_release: "2026-05-20.0" # Overture Maps release ID, https://docs.overturemaps.org/release-calendar/
model_commit: null # null → use current git HEAD; set a short SHA to pin
# PMTiles generation — multi-zoom archive z10–z14 for both OSM and conflated.
# Site's View.minZoom is 10 (full-metro view). z10–z14 are served directly
# from the PMTiles; z15+ render as lossless OL over-zooms of the z14 tile.
# drop-densest-as-needed silently drops features at lower zooms to keep
# each tile under ~500 KB; per-zoom point radius scales down on the site.
pmtiles:
min_zoom: 10
max_zoom: 14
drop_strategy: "drop-densest-as-needed"
osm_layer_name: "osm_pois"
conflated_layer_name: "conflated_pois"
# Feature properties written into the OSM layer. The tag columns listed are
# the eight download.osm.filter_keys plus `landuse`, which is in extract_keys
# only.
osm_properties:
- osm_id
- osm_type # node|way|relation — drives popup OSM link path
- source
- name
- conf_mean # drives confidence-based point coloring
- amenity
- shop
- leisure
- healthcare
- craft
- historic
- landuse
- office
- tourism
# Feature properties written into the conflated layer.
conflated_properties:
- unified_id
- source
- osm_id
- osm_type # node|way|relation — drives popup OSM link path
- shared_label
- conf_mean
- name
- brand
- match_score
- match_distance_m