2121import random
2222import uuid
2323import zipfile
24+ import shutil
2425from dataclasses import dataclass
2526from datetime import datetime
2627from typing import Optional , List
3132from sqlalchemy import func
3233from sqlalchemy .orm import Session
3334
35+ from shared .common .gcp_memory_utils import limit_gcp_memory
3436from shared .common .gcp_utils import create_refresh_materialized_view_task
3537from shared .database .database import with_db_session
3638from shared .database_gen .sqlacodegen_models import Gtfsdataset , Gtfsfile , Gtfsfeed
4547
4648init_logger ()
4749
50+ # Limit the memory available to this process so that, if an out-of-memory (OOM) error occurs, our code can handle it properly.
51+ limit_gcp_memory ()
52+
4853
4954@dataclass
5055class DatasetFile :
@@ -268,40 +273,101 @@ def upload_dataset(self, feed_id, public=True) -> DatasetFile or None:
268273
269274 @with_db_session
270275 def process_from_bucket (self , db_session , public = True ) -> Optional [DatasetFile ]:
276+ """Process an existing dataset from the GCP bucket and update related DB entities.
277+
278+ To reduce local disk usage, we no longer unzip all files at once. Instead, we:
279+ - Download the dataset ZIP to a temporary local file.
280+ - Iterate over each member of the ZIP.
281+ - Extract a single file to a temporary path under WORKING_DIR.
282+ - Upload that file immediately to GCS and record it as a Gtfsfile.
283+ - Delete the local temporary extracted file before moving to the next one.
271284 """
272- Process an existing dataset from the GCP bucket updates the related database entities
273- :return: The DatasetFile object created
274- """
275- temp_file_path = None
285+ temp_zip_path = None
276286 try :
277- temp_file_path = self .generate_temp_filename ()
287+ temp_zip_path = self .generate_temp_filename ()
278288 blob_file_path = f"{ self .feed_stable_id } /{ self .dataset_stable_id } /{ self .dataset_stable_id } .zip"
279- self .logger .info (f "Processing dataset from bucket: { blob_file_path } " )
289+ self .logger .info ("Processing dataset from bucket: %s" , blob_file_path )
280290 download_from_gcs (
281- os .getenv ("DATASETS_BUCKET_NAME" ), blob_file_path , temp_file_path
291+ os .getenv ("DATASETS_BUCKET_NAME" ), blob_file_path , temp_zip_path
282292 )
283293
284- extracted_files_path = self .unzip_files (temp_file_path )
294+ # Stream files from ZIP to GCS one by one to minimize disk usage
295+ bucket = storage .Client ().get_bucket (self .bucket_name )
296+ extracted_files : List [Gtfsfile ] = []
297+ working_dir = os .getenv ("WORKING_DIR" , "/in-memory" )
285298
286- _ , extracted_files = self .upload_files_to_storage (
287- temp_file_path ,
288- self .dataset_stable_id ,
289- extracted_files_path ,
290- public = public ,
291- skip_dataset_upload = True , # Skip the upload of the dataset file
292- )
299+ if not zipfile .is_zipfile (temp_zip_path ):
300+ self .logger .error (
301+ "The downloaded file %s is not a valid ZIP file." , temp_zip_path
302+ )
303+ raise ValueError ("Downloaded dataset is not a valid ZIP file." )
304+
305+ with zipfile .ZipFile (temp_zip_path , "r" ) as zf :
306+ for member in zf .infolist ():
307+ # Skip directories
308+ if member .is_dir ():
309+ continue
310+
311+ # Extract a single file to a temporary path
312+ temp_extracted_path = os .path .join (
313+ working_dir ,
314+ f"{ self .feed_stable_id } -{ self .dataset_stable_id } -{ member .filename .replace ('/' , '_' )} " ,
315+ )
316+
317+ self .logger .info (
318+ "Extracting %s to %s" , member .filename , temp_extracted_path
319+ )
320+ with zf .open (member , "r" ) as src , open (
321+ temp_extracted_path , "wb"
322+ ) as dst :
323+ shutil .copyfileobj (src , dst )
324+
325+ # Upload this single file to GCS under extracted/
326+ if os .path .isfile (temp_extracted_path ):
327+ target_path = f"{ self .feed_stable_id } /{ self .dataset_stable_id } /extracted/{ member .filename } "
328+ file_blob = bucket .blob (target_path )
329+ file_blob .upload_from_filename (temp_extracted_path )
330+ if public :
331+ file_blob .make_public ()
332+ self .logger .info (
333+ "Uploaded extracted file %s to %s" ,
334+ member .filename ,
335+ file_blob .public_url ,
336+ )
337+
338+ extracted_files .append (
339+ Gtfsfile (
340+ id = str (uuid .uuid4 ()),
341+ file_name = member .filename ,
342+ file_size_bytes = os .path .getsize (temp_extracted_path ),
343+ hosted_url = file_blob .public_url if public else None ,
344+ hash = get_hash_from_file (temp_extracted_path ),
345+ )
346+ )
347+
348+ # Remove the local temporary extracted file to free disk space
349+ try :
350+ if os .path .exists (temp_extracted_path ):
351+ os .remove (temp_extracted_path )
352+ except Exception as cleanup_err :
353+ self .logger .warning (
354+ "Failed to remove temporary file %s: %s" ,
355+ temp_extracted_path ,
356+ cleanup_err ,
357+ )
293358
294359 dataset_file = DatasetFile (
295360 stable_id = self .dataset_stable_id ,
296361 file_sha256_hash = self .latest_hash ,
297362 hosted_url = f"{ self .public_hosted_datasets_url } /{ blob_file_path } " ,
298363 extracted_files = extracted_files ,
299364 zipped_size = (
300- os .path .getsize (temp_file_path )
301- if os .path .exists (temp_file_path )
365+ os .path .getsize (temp_zip_path )
366+ if os .path .exists (temp_zip_path )
302367 else None
303368 ),
304369 )
370+
305371 dataset , latest = self .create_dataset_entities (
306372 dataset_file , skip_dataset_creation = True , db_session = db_session
307373 )
@@ -319,8 +385,8 @@ def process_from_bucket(self, db_session, public=True) -> Optional[DatasetFile]:
319385 raise ValueError ("Dataset update failed, dataset is None." )
320386 return dataset_file
321387 finally :
322- if temp_file_path and os .path .exists (temp_file_path ):
323- os .remove (temp_file_path )
388+ if temp_zip_path and os .path .exists (temp_zip_path ):
389+ os .remove (temp_zip_path )
324390
325391 def unzip_files (self , temp_file_path ):
326392 extracted_files_path = os .path .join (temp_file_path .split ("." )[0 ], "extracted" )
0 commit comments