First draft

jcpitre · jcpitre · commit c2ada2dd4742 · 2026-01-14T17:17:14.000-05:00
diff --git a/api/src/shared/common/gcp_memory_utils.py b/api/src/shared/common/gcp_memory_utils.py
@@ -0,0 +1,137 @@
+import os
+import resource
+import shutil
+import logging
+
+MB_MULTIPLIER = 1024**2  # Bytes per MiB (2**20); used to convert byte counts for log output
+
+
+def find_tmpfs_mounts():
+    """List the tmpfs mount points in /proc/mounts whose path contains 'in-memory'.
+
+    A read error is logged and the mounts collected up to that point are returned.
+    """
+    mount_points = []
+    try:
+        with open("/proc/mounts", "r") as mounts_file:
+            for fields in map(str.split, mounts_file):
+                # Fields are "<device> <mount point> <fs type> ..."; short lines never match.
+                if fields[2:3] == ["tmpfs"] and "in-memory" in fields[1]:
+                    mount_points.append(fields[1])
+    except Exception as err:
+        logging.error(f"Error reading /proc/mounts: {err}")
+    return mount_points
+
+
+def get_memory_limit_cgroup_bytes():
+    """
+    Returns the cgroup v1/v2 memory limit in bytes, or None if missing or unlimited.
+    """
+    for path in ("/sys/fs/cgroup/memory/memory.limit_in_bytes", "/sys/fs/cgroup/memory.max"):
+        try:
+            with open(path, "r") as f:
+                limit_bytes = int(f.read())  # v2 stores the literal "max" when unlimited
+        except (OSError, ValueError):
+            continue  # missing/unreadable file or non-numeric content: try the next layout
+        if limit_bytes < (2**60):  # v1 reports a huge value (around 2**63) when unlimited
+            return limit_bytes
+    return None
+
+
+def get_total_tmpfs_size_bytes():
+    """Sum the capacities, in bytes, of the 'in-memory' tmpfs mounts.
+
+    Mounts that are missing, cannot be measured, or report 1 PiB or more (treated
+    as unlimited) are left out. Returns None when no mount contributes to the sum.
+    """
+    unlimited_threshold = 1 << 50  # 1 PiB; a size this large is treated as "no cap"
+    bounded_sizes = []
+    for mount_point in find_tmpfs_mounts():
+        if not os.path.exists(mount_point):
+            continue
+        try:
+            capacity = shutil.disk_usage(mount_point).total
+        except Exception as err:
+            logging.error(f"Error getting disk usage for {mount_point}: {err}")
+            continue
+        if capacity < unlimited_threshold:
+            bounded_sizes.append(capacity)
+    if not bounded_sizes:
+        return None
+    return sum(bounded_sizes)
+
+
+def get_available_process_memory_bytes():
+    """Estimate the memory, in bytes, left for the process itself.
+
+    Computed as the cgroup memory limit minus the combined size of the
+    'in-memory' tmpfs mounts. Returns None (after logging a warning) when
+    either figure is missing or unlimited.
+    """
+    cgroup_limit = get_memory_limit_cgroup_bytes()
+    tmpfs_total = get_total_tmpfs_size_bytes()
+    if None in (cgroup_limit, tmpfs_total):
+        # Without both figures there is nothing meaningful to subtract.
+        logging.warning("Could not determine available process memory (limit or tmpfs size missing/unlimited).")
+        return None
+
+    remaining = cgroup_limit - tmpfs_total
+    logging.info(
+        "Process memory limit: %.2f MiB, total tmpfs size: %.2f MiB, available: %.2f MiB",
+        *(amount / MB_MULTIPLIER for amount in (cgroup_limit, tmpfs_total, remaining)),
+    )
+    return remaining
+
+
+def limit_gcp_memory():
+    """Cap the process address space (RLIMIT_AS) below the available process memory.
+
+    The cap is the available process memory minus a safety margin taken from the
+    MEMORY_MARGIN_MB environment variable (megabytes, default 200; negative values
+    count as 0). Nothing is changed when the available memory is unknown, when the
+    computed cap is not positive, or when the OS rejects the new limit.
+    """
+    # Only the variable we need is read: dumping os.environ would leak secrets to the logs.
+    memory_margin_str_mb = os.getenv("MEMORY_MARGIN_MB", "200")
+
+    available_memory_bytes = get_available_process_memory_bytes()
+    if not available_memory_bytes or available_memory_bytes <= 0:
+        logging.info("Could not find the total memory of the process.")
+        return
+
+    memory_margin_mb = 200
+    if memory_margin_str_mb:
+        try:
+            memory_margin_mb = int(memory_margin_str_mb)
+        except ValueError as err:
+            logging.error(
+                "Invalid MEMORY_MARGIN_MB value: %s. Using default of 200MB. Error: %s", memory_margin_str_mb, err
+            )
+
+    memory_margin_bytes = max(memory_margin_mb, 0) * MB_MULTIPLIER
+    logging.info(
+        "Available memory: %.2f MiB, memory margin: %.2f MiB",
+        available_memory_bytes / MB_MULTIPLIER,
+        memory_margin_bytes / MB_MULTIPLIER,
+    )
+    mem_limit = available_memory_bytes - memory_margin_bytes
+    if mem_limit <= 0:
+        logging.warning("Computed RLIMIT_AS <= 0 (%.2f MiB). Skipping setrlimit.", mem_limit / MB_MULTIPLIER)
+        return
+
+    try:
+        resource.setrlimit(resource.RLIMIT_AS, (mem_limit, mem_limit))  # soft and hard limit, in bytes
+    except (ValueError, OSError) as err:
+        # Callers may invoke this at import time; a rejected limit must not crash startup.
+        logging.error("Could not set RLIMIT_AS to %d bytes: %s", mem_limit, err)
+    else:
+        logging.info("RLIMIT_AS set to %.2f MiB (raw: %d bytes)", mem_limit / MB_MULTIPLIER, mem_limit)
+
+
+if __name__ == "__main__":  # Ad-hoc diagnostic when the module is run as a script
+    logging.basicConfig(level=logging.INFO)
+    available_bytes = get_available_process_memory_bytes()
+    if available_bytes is None:
+        print("Could not determine available process memory.")
+    else:
+        print(f"Available process memory: {available_bytes / MB_MULTIPLIER:.2f} MiB")
diff --git a/functions-python/batch_process_dataset/src/main.py b/functions-python/batch_process_dataset/src/main.py
@@ -21,6 +21,7 @@
 import random
 import uuid
 import zipfile
+import shutil
 from dataclasses import dataclass
 from datetime import datetime
 from typing import Optional, List
@@ -31,6 +32,7 @@
 from sqlalchemy import func
 from sqlalchemy.orm import Session
 
+from shared.common.gcp_memory_utils import limit_gcp_memory
 from shared.common.gcp_utils import create_refresh_materialized_view_task
 from shared.database.database import with_db_session
 from shared.database_gen.sqlacodegen_models import Gtfsdataset, Gtfsfile, Gtfsfeed
@@ -45,6 +47,9 @@
 
 init_logger()
 
+# Cap the process memory so that exhausting it raises a MemoryError our code can handle, rather than an OOM kill
+limit_gcp_memory()
+
 
 @dataclass
 class DatasetFile:
@@ -268,40 +273,101 @@ def upload_dataset(self, feed_id, public=True) -> DatasetFile or None:
 
     @with_db_session
     def process_from_bucket(self, db_session, public=True) -> Optional[DatasetFile]:
+        """Process an existing dataset from the GCP bucket and update related DB entities.
+
+        To reduce local disk usage, we no longer unzip all files at once. Instead, we:
+        - Download the dataset ZIP to a temporary local file.
+        - Iterate over each member of the ZIP.
+        - Extract a single file to a temporary path under WORKING_DIR.
+        - Upload that file immediately to GCS and record it as a Gtfsfile.
+        - Delete the local temporary extracted file before moving to the next one.
         """
-        Process an existing dataset from the GCP bucket updates the related database entities
-        :return: The DatasetFile object created
-        """
-        temp_file_path = None
+        temp_zip_path = None
         try:
-            temp_file_path = self.generate_temp_filename()
+            temp_zip_path = self.generate_temp_filename()
             blob_file_path = f"{self.feed_stable_id}/{self.dataset_stable_id}/{self.dataset_stable_id}.zip"
-            self.logger.info(f"Processing dataset from bucket: {blob_file_path}")
+            self.logger.info("Processing dataset from bucket: %s", blob_file_path)
             download_from_gcs(
-                os.getenv("DATASETS_BUCKET_NAME"), blob_file_path, temp_file_path
+                os.getenv("DATASETS_BUCKET_NAME"), blob_file_path, temp_zip_path
             )
 
-            extracted_files_path = self.unzip_files(temp_file_path)
+            # Stream files from ZIP to GCS one by one to minimize disk usage
+            bucket = storage.Client().get_bucket(self.bucket_name)
+            extracted_files: List[Gtfsfile] = []
+            working_dir = os.getenv("WORKING_DIR", "/in-memory")
 
-            _, extracted_files = self.upload_files_to_storage(
-                temp_file_path,
-                self.dataset_stable_id,
-                extracted_files_path,
-                public=public,
-                skip_dataset_upload=True,  # Skip the upload of the dataset file
-            )
+            if not zipfile.is_zipfile(temp_zip_path):
+                self.logger.error(
+                    "The downloaded file %s is not a valid ZIP file.", temp_zip_path
+                )
+                raise ValueError("Downloaded dataset is not a valid ZIP file.")
+
+            with zipfile.ZipFile(temp_zip_path, "r") as zf:
+                for member in zf.infolist():
+                    # Skip directories
+                    if member.is_dir():
+                        continue
+
+                    # Extract a single file to a temporary path
+                    temp_extracted_path = os.path.join(
+                        working_dir,
+                        f"{self.feed_stable_id}-{self.dataset_stable_id}-{member.filename.replace('/', '_')}",
+                    )
+
+                    self.logger.info(
+                        "Extracting %s to %s", member.filename, temp_extracted_path
+                    )
+                    with zf.open(member, "r") as src, open(
+                        temp_extracted_path, "wb"
+                    ) as dst:
+                        shutil.copyfileobj(src, dst)
+
+                    # Upload this single file to GCS under extracted/
+                    if os.path.isfile(temp_extracted_path):
+                        target_path = f"{self.feed_stable_id}/{self.dataset_stable_id}/extracted/{member.filename}"
+                        file_blob = bucket.blob(target_path)
+                        file_blob.upload_from_filename(temp_extracted_path)
+                        if public:
+                            file_blob.make_public()
+                        self.logger.info(
+                            "Uploaded extracted file %s to %s",
+                            member.filename,
+                            file_blob.public_url,
+                        )
+
+                        extracted_files.append(
+                            Gtfsfile(
+                                id=str(uuid.uuid4()),
+                                file_name=member.filename,
+                                file_size_bytes=os.path.getsize(temp_extracted_path),
+                                hosted_url=file_blob.public_url if public else None,
+                                hash=get_hash_from_file(temp_extracted_path),
+                            )
+                        )
+
+                    # Remove the local temporary extracted file to free disk space
+                    try:
+                        if os.path.exists(temp_extracted_path):
+                            os.remove(temp_extracted_path)
+                    except Exception as cleanup_err:
+                        self.logger.warning(
+                            "Failed to remove temporary file %s: %s",
+                            temp_extracted_path,
+                            cleanup_err,
+                        )
 
             dataset_file = DatasetFile(
                 stable_id=self.dataset_stable_id,
                 file_sha256_hash=self.latest_hash,
                 hosted_url=f"{self.public_hosted_datasets_url}/{blob_file_path}",
                 extracted_files=extracted_files,
                 zipped_size=(
-                    os.path.getsize(temp_file_path)
-                    if os.path.exists(temp_file_path)
+                    os.path.getsize(temp_zip_path)
+                    if os.path.exists(temp_zip_path)
                     else None
                 ),
             )
+
             dataset, latest = self.create_dataset_entities(
                 dataset_file, skip_dataset_creation=True, db_session=db_session
             )
@@ -319,8 +385,8 @@ def process_from_bucket(self, db_session, public=True) -> Optional[DatasetFile]:
                 raise ValueError("Dataset update failed, dataset is None.")
             return dataset_file
         finally:
-            if temp_file_path and os.path.exists(temp_file_path):
-                os.remove(temp_file_path)
+            if temp_zip_path and os.path.exists(temp_zip_path):
+                os.remove(temp_zip_path)
 
     def unzip_files(self, temp_file_path):
         extracted_files_path = os.path.join(temp_file_path.split(".")[0], "extracted")