|
| 1 | +""" |
| 2 | +Upload the geohash-partitioned conflated POI dataset to a public S3 bucket. |
| 3 | +
|
| 4 | +Reads the partitioned dataset directory from config and uploads all parquet files to S3 |
| 5 | +with public-read ACL, preserving the Hive partition layout under a versioned S3 prefix: |
| 6 | +
|
| 7 | + s3://<bucket>/<s3_prefix_conflation>/<aws_version>/conflated_partitioned/ |
| 8 | + geohash_prefix=9q/part-0.parquet |
| 9 | + geohash_prefix=dr/part-0.parquet |
| 10 | + ... |
| 11 | +
|
| 12 | +Prerequisites — AWS setup (manual steps, not automated here): |
| 13 | +
|
| 14 | +1. Create an S3 bucket in the AWS Console: |
| 15 | + - Choose a globally unique name (e.g. "openpois-public") |
| 16 | + - Uncheck "Block all public access" and acknowledge the warning |
| 17 | + - Leave other settings as defaults |
| 18 | +
|
| 19 | +2. Add a bucket policy for public GetObject access |
| 20 | + (S3 → your bucket → Permissions → Bucket policy): |
| 21 | + { |
| 22 | + "Version": "2012-10-17", |
| 23 | + "Statement": [{ |
| 24 | + "Sid": "PublicReadGetObject", |
| 25 | + "Effect": "Allow", |
| 26 | + "Principal": "*", |
| 27 | + "Action": "s3:GetObject", |
| 28 | + "Resource": "arn:aws:s3:::<bucket-name>/*" |
| 29 | + }] |
| 30 | + } |
| 31 | +
|
| 32 | +3. Create an IAM user with upload permissions |
| 33 | + (IAM → Users → Create user → attach inline policy): |
| 34 | + { |
| 35 | + "Effect": "Allow", |
| 36 | + "Action": ["s3:PutObject", "s3:PutObjectAcl"], |
| 37 | + "Resource": "arn:aws:s3:::<bucket-name>/*" |
| 38 | + } |
| 39 | + Then generate access keys under Security credentials → Create access key |
| 40 | + (select "CLI" as use case). |
| 41 | +
|
| 42 | +4. Configure AWS credentials locally — pick one option: |
| 43 | + Option A (env vars): |
| 44 | + export AWS_ACCESS_KEY_ID=<your-key-id> |
| 45 | + export AWS_SECRET_ACCESS_KEY=<your-secret> |
| 46 | + Option B (AWS CLI): |
| 47 | + aws configure (writes to ~/.aws/credentials) |
| 48 | +""" |
| 49 | +from config_versioned import Config |
| 50 | + |
| 51 | +from openpois.io.s3 import upload_partitioned_dataset |
| 52 | + |
# -----------------------------------------------------------------------------
# Configuration constants
# -----------------------------------------------------------------------------

config = Config("~/repos/openpois/config.yaml")

# Local directory holding the partitioned parquet dataset to upload.
PARTITIONED_DIR = config.get_file_path("conflation", "partitioned")

# Version label read from the "versions" section of the config.
AWS_VERSION = config.get("versions", "aws")

# Destination settings, all read from the "upload" section of the config.
# The generator is consumed in order, so the lookups happen in the same
# sequence as three separate assignments would.
S3_BUCKET, S3_PREFIX, S3_REGION = (
    config.get("upload", key)
    for key in ("s3_bucket", "s3_prefix_conflation", "s3_region")
)
| 64 | + |
| 65 | + |
# -----------------------------------------------------------------------------
# Main workflow
# -----------------------------------------------------------------------------


def main() -> None:
    """Upload the partitioned dataset to S3 and print its public base URL.

    Reads the module-level configuration constants (``PARTITIONED_DIR``,
    ``S3_BUCKET``, ``S3_PREFIX``, ``AWS_VERSION``, ``S3_REGION``), hands them
    to ``upload_partitioned_dataset`` and reports the result on stdout.

    Raises:
        FileNotFoundError: If no ``*.parquet`` file exists anywhere under
            ``PARTITIONED_DIR``.
    """
    # Fail fast before touching S3. ``any`` stops at the first match, so the
    # directory tree is not fully walked and materialised just for this check.
    if not any(PARTITIONED_DIR.rglob("*.parquet")):
        raise FileNotFoundError(
            f"No parquet files found under {PARTITIONED_DIR}. "
            "Run format_for_upload.py first."
        )

    # NOTE(review): the return value is formatted as a file count below;
    # confirm against openpois.io.s3 that it is an integer.
    n_uploaded = upload_partitioned_dataset(
        local_dir=PARTITIONED_DIR,
        bucket=S3_BUCKET,
        s3_prefix=S3_PREFIX,
        aws_version=AWS_VERSION,
        s3_region=S3_REGION,
    )

    # NOTE(review): this assumes the objects land under
    # <s3_prefix>/<aws_version>/<local directory name>/ -- confirm against
    # upload_partitioned_dataset so the printed URL matches the real keys.
    base_url = (
        f"https://{S3_BUCKET}.s3.{S3_REGION}.amazonaws.com"
        f"/{S3_PREFIX}/{AWS_VERSION}/{PARTITIONED_DIR.name}/"
    )
    print(f"Uploaded {n_uploaded:,} files.")
    print(f"Public base URL: {base_url}")


if __name__ == "__main__":
    main()
0 commit comments