|
2 | 2 | Download the current US+PR Overture Maps Places snapshot as a GeoParquet file. |
3 | 3 |
|
4 | 4 | Queries Overture Maps GeoParquet files on public S3 using DuckDB's httpfs and |
5 | | -spatial extensions, filtering with a two-stage spatial filter (coarse bbox |
6 | | -prefilter in the DuckDB query, then exact within-polygon filter in Python |
7 | | -against the US+PR Census boundary) and by L0 taxonomy category. No |
8 | | -authentication required — Overture Maps data is publicly accessible. |
| 5 | +spatial extensions. Iterates the release's ``part-*.parquet`` files, writing a |
| 6 | +bounded-memory DuckDB COPY per part into a ``.parts/<release>/`` directory. |
| 7 | +Once every part is present, a single DuckDB COPY applies the exact US+PR |
| 8 | +polygon filter and writes the final GeoParquet without materializing rows in |
| 9 | +Python. Interrupted runs resume by skipping parts whose intermediates already |
| 10 | +exist. No authentication required — Overture Maps data is publicly accessible. |
9 | 11 |
|
10 | 12 | Auto-detects the latest available Overture release from S3 unless a specific |
11 | 13 | release_date is pinned in config.yaml. |
|
15 | 17 | download.overture.s3_bucket — Overture Maps S3 bucket name |
16 | 18 | download.overture.s3_region — AWS region of the Overture bucket |
17 | 19 | download.overture.taxonomy_allowlist — list of [L0, L1] pairs; L1 null = any |
| 20 | + download.overture.duckdb.memory_limit — per-connection DuckDB memory cap |
| 21 | + download.overture.duckdb.threads — per-connection DuckDB thread count |
| 22 | + download.overture.duckdb.workers — parallel part downloads (must be 1) |
18 | 23 | download.general.boundary.source_url — Census state-boundary zip URL |
19 | 24 | download.general.boundary.coastline_buffer_m — outward coastline buffer (m) |
20 | 25 | directories.boundary — cache directory for boundary file |
|
25 | 30 | Columns: overture_id, overture_name, taxonomy_l0, taxonomy_l1, |
26 | 31 | taxonomy_l2, brand_name, confidence, geometry, source |
27 | 32 | """ |
| 33 | +import pyarrow.parquet as pq |
28 | 34 | from config_versioned import Config |
29 | 35 | from openpois.io.boundary import get_us_pr_boundary |
30 | 36 | from openpois.io.overture import download_overture_snapshot |
|
40 | 46 | S3_BUCKET = config.get("download", "overture", "s3_bucket") |
41 | 47 | S3_REGION = config.get("download", "overture", "s3_region") |
42 | 48 | TAXONOMY_ALLOWLIST = config.get("download", "overture", "taxonomy_allowlist") |
| 49 | +DUCKDB_MEMORY_LIMIT = config.get( |
| 50 | + "download", "overture", "duckdb", "memory_limit", fail_if_none=False |
| 51 | +) or "4GB" |
| 52 | +DUCKDB_THREADS = config.get( |
| 53 | + "download", "overture", "duckdb", "threads", fail_if_none=False |
| 54 | +) or 2 |
| 55 | +DUCKDB_WORKERS = config.get( |
| 56 | + "download", "overture", "duckdb", "workers", fail_if_none=False |
| 57 | +) or 1
43 | 58 | BOUNDARY_URL = config.get("download", "general", "boundary", "source_url") |
44 | 59 | COASTLINE_BUFFER_M = config.get( |
45 | 60 | "download", "general", "boundary", "coastline_buffer_m" |
|
62 | 77 | cache_dir = BOUNDARY_DIR, |
63 | 78 | coastline_buffer_m = COASTLINE_BUFFER_M, |
64 | 79 | ) |
65 | | - gdf = download_overture_snapshot( |
66 | | - output_path=OUTPUT_PATH, |
67 | | - taxonomy_allowlist=TAXONOMY_ALLOWLIST, |
68 | | - boundary_gdf=boundary_gdf, |
69 | | - coarse_bboxes=coarse_bboxes, |
70 | | - bucket=S3_BUCKET, |
71 | | - s3_region=S3_REGION, |
72 | | - release_date=RELEASE_DATE, |
| 80 | + output_path = download_overture_snapshot( |
| 81 | + output_path = OUTPUT_PATH, |
| 82 | + taxonomy_allowlist = TAXONOMY_ALLOWLIST, |
| 83 | + boundary_gdf = boundary_gdf, |
| 84 | + coarse_bboxes = coarse_bboxes, |
| 85 | + bucket = S3_BUCKET, |
| 86 | + s3_region = S3_REGION, |
| 87 | + release_date = RELEASE_DATE, |
| 88 | + duckdb_memory_limit = DUCKDB_MEMORY_LIMIT, |
| 89 | + duckdb_threads = DUCKDB_THREADS, |
| 90 | + workers = DUCKDB_WORKERS, |
73 | 91 | ) |
74 | | - print(f"Saved {len(gdf):,} Overture POIs to {OUTPUT_PATH}") |
| 92 | + n_rows = pq.read_metadata(output_path).num_rows |
| 93 | + print(f"Saved {n_rows:,} Overture POIs to {output_path}") |
0 commit comments