Skip to content

Commit 5dde199

Browse files
committed
Upload conflated dataset to AWS S3.
1 parent fb25679 commit 5dde199

4 files changed

Lines changed: 150 additions & 3 deletions

File tree

config.yaml

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -165,8 +165,10 @@ conflation:
165165
upload:
166166
s3_region: "us-west-2"
167167
s3_bucket: "openpois-public"
168-
s3_prefix: "snapshots/osm"
169-
full_url: "https://openpois-public.s3.us-west-2.amazonaws.com/snapshots/osm/20260313/osm_snapshot_partitioned/"
168+
s3_prefix_osm: "snapshots/osm"
169+
s3_prefix_conflation: "snapshots/conflated"
170+
latest_url_osm: "https://openpois-public.s3.us-west-2.amazonaws.com/snapshots/osm/20260313/osm_snapshot_partitioned/"
171+
latest_url_conflation: "https://openpois-public.s3.us-west-2.amazonaws.com/snapshots/conflated/20260313/conflated_partitioned/"
170172
geohash_precision_partition: 4 # ~39 km x 20 km cells; ~1,000–3,000 cells over CONUS
171173
geohash_precision_sort: 6 # ~1.2 km x 0.6 km cells; fine-grained sort within each partition
172174
pmtiles:
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
"""
2+
Spatially partition the conflated POI dataset for optimized web map viewport queries.
3+
4+
Reads conflated.parquet, adds a geohash-4 partition column computed from each POI's
5+
centroid, sorts rows within partitions by geohash-6 for spatial locality, and writes
6+
a Hive-style partitioned dataset:
7+
8+
conflated_partitioned/
9+
geohash_prefix=9q8y/
10+
part-0.parquet
11+
geohash_prefix=dr5r/
12+
part-0.parquet
13+
...
14+
15+
Clients can fetch only the geohash cells covering their map viewport, avoiding a full
16+
dataset scan.
17+
"""
18+
import geopandas as gpd
19+
from config_versioned import Config
20+
21+
from openpois.io.geohash_partition import add_geohash_columns, write_partitioned_dataset
22+
23+
# -----------------------------------------------------------------------------
24+
# Configuration
25+
# -----------------------------------------------------------------------------
26+
27+
config = Config("~/repos/openpois/config.yaml")
28+
29+
INPUT_PATH = config.get_file_path("conflation", "conflated")
30+
OUTPUT_DIR = config.get_file_path("conflation", "partitioned")
31+
OVERWRITE = True
32+
33+
PRECISION_PARTITION = config.get("upload", "geohash_precision_partition")
34+
PRECISION_SORT = config.get("upload", "geohash_precision_sort")
35+
36+
# -----------------------------------------------------------------------------
37+
# Main workflow
38+
# -----------------------------------------------------------------------------
39+
40+
if __name__ == "__main__":
41+
print(f"Reading conflated dataset from {INPUT_PATH} ...")
42+
gdf = gpd.read_parquet(INPUT_PATH)
43+
print(f"Loaded {len(gdf):,} POIs")
44+
45+
print("Computing geohash columns from centroids ...")
46+
gdf = add_geohash_columns(
47+
gdf,
48+
precision_partition = PRECISION_PARTITION,
49+
precision_sort = PRECISION_SORT,
50+
)
51+
52+
write_partitioned_dataset(gdf, output_dir = OUTPUT_DIR, overwrite = OVERWRITE)
53+
54+
n_partitions = sum(1 for _ in OUTPUT_DIR.iterdir() if _.is_dir())
55+
print(f"Done. Wrote {len(gdf):,} rows across {n_partitions} geohash partitions.")
56+
print(f"Output: {OUTPUT_DIR}")
Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
"""
2+
Upload the geohash-partitioned conflated POI dataset to a public S3 bucket.
3+
4+
Reads the partitioned dataset directory from config and uploads all parquet files to S3
5+
with public-read ACL, preserving the Hive partition layout under a versioned S3 prefix:
6+
7+
s3://<bucket>/<s3_prefix_conflation>/<aws_version>/conflated_partitioned/
8+
geohash_prefix=9q8y/part-0.parquet
9+
geohash_prefix=dr5r/part-0.parquet
10+
...
11+
12+
Prerequisites — AWS setup (manual steps, not automated here):
13+
14+
1. Create S3 bucket in the AWS Console:
15+
- Choose a globally unique name (e.g. "openpois-public")
16+
- Uncheck "Block all public access" and acknowledge the warning
17+
- Leave other settings as defaults
18+
19+
2. Add a bucket policy for public GetObject access
20+
(S3 → your bucket → Permissions → Bucket policy):
21+
{
22+
"Version": "2012-10-17",
23+
"Statement": [{
24+
"Sid": "PublicReadGetObject",
25+
"Effect": "Allow",
26+
"Principal": "*",
27+
"Action": "s3:GetObject",
28+
"Resource": "arn:aws:s3:::<bucket-name>/*"
29+
}]
30+
}
31+
32+
3. Create an IAM user with upload permissions
33+
(IAM → Users → Create user → attach inline policy):
34+
{
35+
"Effect": "Allow",
36+
"Action": ["s3:PutObject"],
37+
"Resource": "arn:aws:s3:::<bucket-name>/*"
38+
}
39+
Then generate access keys under Security credentials → Create access key
40+
(select "CLI" as use case).
41+
42+
4. Configure AWS credentials locally — pick one option:
43+
Option A (env vars):
44+
export AWS_ACCESS_KEY_ID=<your-key-id>
45+
export AWS_SECRET_ACCESS_KEY=<your-secret>
46+
Option B (AWS CLI):
47+
aws configure (writes to ~/.aws/credentials)
48+
"""
49+
from config_versioned import Config
50+
51+
from openpois.io.s3 import upload_partitioned_dataset
52+
53+
# -----------------------------------------------------------------------------
54+
# Configuration constants
55+
# -----------------------------------------------------------------------------
56+
57+
config = Config("~/repos/openpois/config.yaml")
58+
59+
PARTITIONED_DIR = config.get_file_path("conflation", "partitioned")
60+
AWS_VERSION = config.get("versions", "aws")
61+
S3_BUCKET = config.get("upload", "s3_bucket")
62+
S3_PREFIX = config.get("upload", "s3_prefix_conflation")
63+
S3_REGION = config.get("upload", "s3_region")
64+
65+
66+
# -----------------------------------------------------------------------------
67+
# Main workflow
68+
# -----------------------------------------------------------------------------
69+
70+
if __name__ == "__main__":
71+
if not list(PARTITIONED_DIR.rglob("*.parquet")):
72+
raise FileNotFoundError(
73+
f"No parquet files found under {PARTITIONED_DIR}. "
74+
"Run format_for_upload.py first."
75+
)
76+
77+
n = upload_partitioned_dataset(
78+
local_dir = PARTITIONED_DIR,
79+
bucket = S3_BUCKET,
80+
s3_prefix = S3_PREFIX,
81+
aws_version = AWS_VERSION,
82+
s3_region = S3_REGION,
83+
)
84+
base_url = (
85+
f"https://{S3_BUCKET}.s3.{S3_REGION}.amazonaws.com"
86+
f"/{S3_PREFIX}/{AWS_VERSION}/{PARTITIONED_DIR.name}/"
87+
)
88+
print(f"Uploaded {n:,} files.")
89+
print(f"Public base URL: {base_url}")

exploratory/osm_snapshot/upload_to_s3.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@
6060
PARTITIONED_DIR = config.get_file_path("snapshot_osm", "partitioned")
6161
AWS_VERSION = config.get("versions", "aws")
6262
S3_BUCKET = config.get("upload", "s3_bucket")
63-
S3_PREFIX = config.get("upload", "s3_prefix")
63+
S3_PREFIX = config.get("upload", "s3_prefix_osm")
6464
S3_REGION = config.get("upload", "s3_region")
6565

6666

0 commit comments

Comments
 (0)