import logging
import time
from typing import Iterator, List, Tuple

from google.cloud import bigquery
from google.cloud.bigquery import LoadJobConfig, CopyJobConfig, SourceFormat

# 500 below the documented maximum of 10,000 URIs per load job, leaving headroom so we never hit the limit.
MAX_URIS_PER_JOB = 9500


def chunked(seq: List[str], size: int) -> Iterator[List[str]]:
    """Yield successive slices of `seq`, each at most `size` items long."""
    for i in range(0, len(seq), size):
        yield seq[i : i + size]
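
# Example: list(chunked(["a", "b", "c", "d", "e"], 2)) -> [["a", "b"], ["c", "d"], ["e"]]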


def collect_blobs_and_uris(
    storage_client, bucket_name: str, prefix: str
) -> Tuple[List[object], List[str]]:
    """List blobs in GCS under the given prefix and construct their gs:// URIs."""
    try:
        blobs = list(storage_client.list_blobs(bucket_name, prefix=prefix))
        uris = [f"gs://{bucket_name}/{b.name}" for b in blobs]
        logging.info("Found %s files to load to BigQuery.", len(uris))
        return blobs, uris
    except Exception as e:
        logging.error("Failed to list blobs or construct URIs: %s", e)
        raise


def make_staging_table_ref(
    target_table_ref: bigquery.TableReference,
) -> bigquery.TableReference:
    """Construct a staging table reference in the same dataset with a unique name."""
    try:
        staging_table_id = f"{target_table_ref.table_id}__staging_{int(time.time())}"
        dataset_ref = bigquery.DatasetReference(
            target_table_ref.project, target_table_ref.dataset_id
        )
        return dataset_ref.table(staging_table_id)
    except Exception as e:
        logging.error("Failed to construct staging table reference: %s", e)
        raise


def ensure_staging_table_like_target(
    bigquery_client,
    target_table_ref: bigquery.TableReference,
    staging_table_ref: bigquery.TableReference,
) -> None:
    """Create the staging table with the same schema (and partitioning/clustering) as the target."""
    try:
        target_tbl = bigquery_client.get_table(target_table_ref)

        staging_tbl = bigquery.Table(staging_table_ref, schema=target_tbl.schema)
        staging_tbl.time_partitioning = getattr(target_tbl, "time_partitioning", None)
        staging_tbl.range_partitioning = getattr(target_tbl, "range_partitioning", None)
        staging_tbl.clustering_fields = getattr(target_tbl, "clustering_fields", None)

        bigquery_client.create_table(staging_tbl, exists_ok=True)
        logging.info(
            "Staging table ready: %s.%s.%s",
            staging_table_ref.project,
            staging_table_ref.dataset_id,
            staging_table_ref.table_id,
        )
    except Exception as e:
        logging.error("Failed to create staging table like target: %s", e)
        raise
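
# Assumption worth noting: mirroring partitioning/clustering above matters
# because the publish step uses a copy job, and BigQuery copy jobs require the
# destination table's partitioning spec to be compatible with the source's.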


def load_uris_into_staging(
    bigquery_client,
    staging_table_ref: bigquery.TableReference,
    source_uris: List[str],
) -> None:
    """Load NDJSON files into staging in batches of at most MAX_URIS_PER_JOB URIs per job."""
    try:
        for batch_idx, uri_batch in enumerate(
            chunked(source_uris, MAX_URIS_PER_JOB), start=1
        ):
            logging.info(
                "Loading batch %s into staging (%s files)...", batch_idx, len(uri_batch)
            )
            job_cfg = LoadJobConfig(
                source_format=SourceFormat.NEWLINE_DELIMITED_JSON,
                # Truncate on the first batch so reruns start from a clean
                # staging table; append for every batch after that.
                write_disposition=(
                    bigquery.WriteDisposition.WRITE_TRUNCATE
                    if batch_idx == 1
                    else bigquery.WriteDisposition.WRITE_APPEND
                ),
            )
            job = bigquery_client.load_table_from_uri(
                uri_batch, staging_table_ref, job_config=job_cfg
            )
            job.result()  # block on each batch so we fail fast

        staging_loaded = bigquery_client.get_table(staging_table_ref)
        logging.info(
            "All batches loaded into staging. Reported rows: %s",
            staging_loaded.num_rows,
        )
    except Exception as e:
        logging.error("Failed to load URIs into staging: %s", e)
        raise
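
# Note: the publish step below relies on the copy job replacing the target in
# one atomic operation, so readers see either the old table contents or the
# new ones, never a half-loaded table.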


def publish_staging_to_target(
    bigquery_client,
    staging_table_ref: bigquery.TableReference,
    target_table_ref: bigquery.TableReference,
) -> None:
    """Replace the target table's contents with the staging contents (the publish moment)."""
    try:
        copy_cfg = CopyJobConfig(
            write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE
        )

        logging.info("Publishing: copying staging to target with WRITE_TRUNCATE...")
        copy_job = bigquery_client.copy_table(
            sources=staging_table_ref,
            destination=target_table_ref,
            job_config=copy_cfg,
        )
        copy_job.result()
        logging.info("Publish complete: target replaced successfully.")
    except Exception as e:
        logging.error("Failed to publish staging to target: %s", e)
        raise


def cleanup_success(
    bigquery_client, staging_table_ref: bigquery.TableReference, blobs: List[object]
) -> None:
    """Delete the staging table and the source blobs after a successful publish."""
    try:
        bigquery_client.delete_table(staging_table_ref, not_found_ok=True)
        logging.info("Deleted staging table.")

        for b in blobs:
            b.delete()
        logging.info("Deleted %s blobs.", len(blobs))
    except Exception as e:
        logging.error("Failed during cleanup after success: %s", e)
        raise


def cleanup_failure(
    bigquery_client, staging_table_ref: bigquery.TableReference
) -> None:
    """Best-effort deletion of the staging table after a failure.

    If the delete itself fails, log a warning and continue, leaving the table
    in place for inspection rather than raising.
    """
    try:
        bigquery_client.delete_table(staging_table_ref, not_found_ok=True)
        logging.info("Deleted staging table after failure.")
    except Exception as e:
        logging.warning(
            "Failed to delete staging table after failure; leaving it for inspection. Exception: %s",
            e,
        )
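

# A minimal orchestration sketch showing how the helpers above compose into a
# load-and-publish pipeline. The client construction and the bucket, prefix,
# and table names below are illustrative assumptions, not part of this module.
def run_pipeline_example() -> None:
    from google.cloud import storage  # assumed available alongside google-cloud-bigquery

    storage_client = storage.Client()
    bigquery_client = bigquery.Client()

    # Hypothetical source and destination; substitute real values.
    target_table_ref = bigquery.TableReference.from_string("my-project.my_dataset.events")
    blobs, uris = collect_blobs_and_uris(storage_client, "my-bucket", "exports/")

    staging_table_ref = make_staging_table_ref(target_table_ref)
    try:
        ensure_staging_table_like_target(
            bigquery_client, target_table_ref, staging_table_ref
        )
        load_uris_into_staging(bigquery_client, staging_table_ref, uris)
        publish_staging_to_target(bigquery_client, staging_table_ref, target_table_ref)
        cleanup_success(bigquery_client, staging_table_ref, blobs)
    except Exception:
        cleanup_failure(bigquery_client, staging_table_ref)
        raise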