Commit 13b12d5

dist with GCS support
1 parent bdc1faa commit 13b12d5

3 files changed

Lines changed: 147 additions & 98 deletions

conf/default/distributed.conf.default

Lines changed: 3 additions & 0 deletions
@@ -42,3 +42,6 @@ token = ""
 autodiscovery = 600
 # Instances should start with following name pattern
 instance_name = cape-server
+
+[gcs]
+enabled = no
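
This flag only gates the fetch-side upload in utils/dist.py; the uploader's own settings live in the [gcs] section of reporting.conf. As a hedged sketch (not part of this commit, all values hypothetical), a matching reporting.conf section using the option names GCSUploader actually reads might look like:

[gcs]
enabled = yes
# Destination bucket (hypothetical name).
bucket_name = cape-reports
# "vm" uses the instance's attached service account; any other value requires
# the key file below.
auth_by = vm
# Service-account key path, relative to CUCKOO_ROOT; only used when auth_by != "vm".
credentials_path = conf/gcs-service-account.json
# Comma-separated names skipped while walking the analysis directory.
exclude_dirs = memory
exclude_files = binary
# "zip" uploads a single archive per analysis; anything else uploads per-file.
mode = file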

modules/reporting/gcs.py

Lines changed: 122 additions & 98 deletions
@@ -4,6 +4,7 @@
 import zipfile
 from lib.cuckoo.common.constants import CUCKOO_ROOT
 from lib.cuckoo.common.abstracts import Report
+from lib.cuckoo.common.config import Config
 from lib.cuckoo.common.exceptions import CuckooReportError

 # Set up a logger for this module
@@ -19,6 +20,112 @@
 HAVE_GCS = False


+class GCSUploader:
+    """Helper class to upload files to GCS."""
+
+    def __init__(self, bucket_name=None, auth_by=None, credentials_path=None, exclude_dirs=None, exclude_files=None, mode=None):
+        if not HAVE_GCS:
+            raise ImportError("google-cloud-storage library is missing")
+
+        # Load from reporting.conf if parameters are missing
+        if not bucket_name:
+            cfg = Config("reporting")
+            if not cfg.gcs.enabled:
+                # Explicit parameters may still be passed when the config section
+                # is disabled; if both are missing, the bucket_name check below raises.
+                pass
+
+            bucket_name = cfg.gcs.bucket_name
+            auth_by = cfg.gcs.auth_by
+            credentials_path_str = cfg.gcs.credentials_path
+
+            if credentials_path_str:
+                credentials_path = os.path.join(CUCKOO_ROOT, credentials_path_str)
+
+            exclude_dirs_str = cfg.gcs.get("exclude_dirs", "")
+            exclude_files_str = cfg.gcs.get("exclude_files", "")
+            mode = cfg.gcs.get("mode", "file")
+
+            # Parse exclusion sets
+            self.exclude_dirs = {item.strip() for item in exclude_dirs_str.split(",") if item.strip()}
+            self.exclude_files = {item.strip() for item in exclude_files_str.split(",") if item.strip()}
+        else:
+            self.exclude_dirs = exclude_dirs if exclude_dirs else set()
+            self.exclude_files = exclude_files if exclude_files else set()
+
+        self.mode = mode
+
+        if not bucket_name:
+            raise ValueError("GCS bucket_name is not configured.")
+
+        if auth_by == "vm":
+            self.storage_client = storage.Client()
+        else:
+            if not credentials_path or not os.path.exists(credentials_path):
+                raise ValueError(f"Invalid credentials path: {credentials_path}")
+            credentials = service_account.Credentials.from_service_account_file(credentials_path)
+            self.storage_client = storage.Client(credentials=credentials)
+
+        self.bucket = self.storage_client.bucket(bucket_name)
+        # Bucket existence is deliberately not validated here: bucket.exists()
+        # requires extra permissions and a network round-trip, and dist.py
+        # should not crash at startup over a flaky connection.
+        # if not self.bucket.exists():
+        #     raise ValueError(f"GCS Bucket '{bucket_name}' does not exist or is inaccessible.")
+
+    def _iter_files_to_upload(self, source_directory):
+        """Generator that yields files to be uploaded, skipping excluded ones."""
+        for root, dirs, files in os.walk(source_directory):
+            # Exclude specified directories
+            dirs[:] = [d for d in dirs if d not in self.exclude_dirs]
+            for filename in files:
+                # Exclude specified files
+                if filename in self.exclude_files:
+                    continue
+
+                local_path = os.path.join(root, filename)
+                if not os.path.exists(local_path):
+                    continue
+                relative_path = os.path.relpath(local_path, source_directory)
+                yield local_path, relative_path
+
+    def upload(self, source_directory, analysis_id, tlp=None):
+        if self.mode == "zip":
+            self.upload_zip_archive(analysis_id, source_directory, tlp=tlp)
+        else:
+            self.upload_files_individually(analysis_id, source_directory, tlp=tlp)
+
+    def upload_zip_archive(self, analysis_id, source_directory, tlp=None):
+        log.debug("Compressing and uploading files for analysis ID %s to GCS", analysis_id)
+        blob_name = f"{analysis_id}_tlp_{tlp}.zip" if tlp else f"{analysis_id}.zip"
+
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".zip") as tmp_zip_file:
+            tmp_zip_file_name = tmp_zip_file.name
+            with zipfile.ZipFile(tmp_zip_file, "w", zipfile.ZIP_DEFLATED) as archive:
+                for local_path, relative_path in self._iter_files_to_upload(source_directory):
+                    archive.write(local_path, relative_path)
+        try:
+            log.debug("Uploading '%s' to '%s'", tmp_zip_file_name, blob_name)
+            blob = self.bucket.blob(blob_name)
+            blob.upload_from_filename(tmp_zip_file_name)
+        finally:
+            os.unlink(tmp_zip_file_name)
+        log.info("Successfully uploaded archive for analysis %s to GCS.", analysis_id)
+
+    def upload_files_individually(self, analysis_id, source_directory, tlp=None):
+        log.debug("Uploading files for analysis ID %s to GCS", analysis_id)
+        folder_name = f"{analysis_id}_tlp_{tlp}" if tlp else str(analysis_id)
+
+        for local_path, relative_path in self._iter_files_to_upload(source_directory):
+            blob_name = f"{folder_name}/{relative_path}"
+            # log.debug("Uploading '%s' to '%s'", local_path, blob_name)
+            blob = self.bucket.blob(blob_name)
+            blob.upload_from_filename(local_path)
+
+        log.info("Successfully uploaded files for analysis %s to GCS.", analysis_id)
+
+
 class GCS(Report):
     """
     Uploads all analysis files to a Google Cloud Storage (GCS) bucket.
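
For orientation, a minimal usage sketch of the new helper, not part of the commit (the path, analysis ID, and bucket values are hypothetical). Called with no arguments, GCSUploader loads everything from the [gcs] section of reporting.conf, which is exactly how utils/dist.py initializes it below; explicit arguments bypass the config:

from modules.reporting.gcs import GCSUploader

# Config-driven: settings come from reporting.conf -> [gcs].
uploader = GCSUploader()  # raises ImportError/ValueError when misconfigured
uploader.upload("/opt/CAPEv2/storage/analyses/123", 123, tlp="amber")

# Explicit: arguments bypass reporting.conf entirely (values are hypothetical).
uploader = GCSUploader(
    bucket_name="cape-reports",
    auth_by="vm",                 # use the VM's attached service account
    exclude_dirs={"memory"},
    exclude_files={"binary"},
    mode="zip",                   # one archive instead of one object per file
)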
@@ -43,120 +150,37 @@ def run(self, results):
             return

         tlp = results.get("info", {}).get("tlp")
+        analysis_id = results.get("info", {}).get("id")

-        # Read configuration options from gcs.conf and validate them
-        bucket_name = self.options.get("bucket_name")
-        if not bucket_name:
-            raise CuckooReportError("GCS bucket_name is not configured in reporting.conf -> gcs")
-        auth_by = self.options.get("auth_by")
-        if auth_by == "vm":
-            storage_client = storage.Client()
-        else:
-            credentials_path_str = self.options.get("credentials_path")
-            if not credentials_path_str:
-                raise CuckooReportError("GCS credentials_path is not configured in reporting.conf -> gcs")
-
-            credentials_path = os.path.join(CUCKOO_ROOT, credentials_path_str)
-            if not os.path.isfile(credentials_path):
-                raise CuckooReportError(
-                    "GCS credentials_path '%s' is invalid or file does not exist in reporting.conf -> gcs", credentials_path
-                )
+        # Delegate to GCSUploader, passing options from self.options explicitly
+        # so per-module overrides still take precedence over the defaults read
+        # from reporting.conf.

-            credentials = service_account.Credentials.from_service_account_file(credentials_path)
-            storage_client = storage.Client(credentials=credentials)
-
-        # Read the exclusion lists, defaulting to empty strings
+        # Parse exclusion lists from self.options to respect local module config
         exclude_dirs_str = self.options.get("exclude_dirs", "")
         exclude_files_str = self.options.get("exclude_files", "")
-
-        # Parse the exclusion strings into sets for efficient lookups
         exclude_dirs = {item.strip() for item in exclude_dirs_str.split(",") if item.strip()}
         exclude_files = {item.strip() for item in exclude_files_str.split(",") if item.strip()}

-        if exclude_dirs:
-            log.debug("GCS reporting will exclude directories: %s", exclude_dirs)
-        if exclude_files:
-            log.debug("GCS reporting will exclude files: %s", exclude_files)
-
-        # Get the upload mode, defaulting to 'file' for backward compatibility
+        # Construct the uploader manually so self.options wins over reporting.conf
+        bucket_name = self.options.get("bucket_name")
+        auth_by = self.options.get("auth_by")
+        credentials_path_str = self.options.get("credentials_path")
+        credentials_path = None
+        if credentials_path_str:
+            credentials_path = os.path.join(CUCKOO_ROOT, credentials_path_str)
         mode = self.options.get("mode", "file")

         try:
-            # --- Authentication ---
-            log.debug("Authenticating with Google Cloud Storage...")
-            bucket = storage_client.bucket(bucket_name)
+            uploader = GCSUploader(bucket_name, auth_by, credentials_path, exclude_dirs, exclude_files, mode)

-            # Check if the bucket exists and is accessible
-            if not bucket.exists():
-                raise CuckooReportError(
-                    "The specified GCS bucket '%s' does not exist or you don't have permission to access it.", bucket_name
-                )
-
-            analysis_id = results.get("info", {}).get("id")
             if not analysis_id:
                 raise CuckooReportError("Could not get analysis ID from results.")

             source_directory = self.analysis_path

-            if mode == "zip":
-                self.upload_zip_archive(bucket, analysis_id, source_directory, exclude_dirs, exclude_files, tlp=tlp)
-            elif mode == "file":
-                self.upload_files_individually(bucket, analysis_id, source_directory, exclude_dirs, exclude_files, tlp=tlp)
-            else:
-                raise CuckooReportError("Invalid GCS upload mode specified: %s. Must be 'file' or 'zip'.", mode)
+            uploader.upload(source_directory, analysis_id, tlp)

         except Exception as e:
             raise CuckooReportError(f"Failed to upload report to GCS: {e}") from e
-
-    def _iter_files_to_upload(self, source_directory, exclude_dirs, exclude_files):
-        """Generator that yields files to be uploaded, skipping excluded ones."""
-        for root, dirs, files in os.walk(source_directory):
-            # Exclude specified directories
-            dirs[:] = [d for d in dirs if d not in exclude_dirs]
-            for filename in files:
-                # Exclude specified files
-                if filename in exclude_files:
-                    log.debug("Skipping excluded file: %s", os.path.join(root, filename))
-                    continue
-
-                local_path = os.path.join(root, filename)
-                if not os.path.exists(local_path):
-                    continue
-                relative_path = os.path.relpath(local_path, source_directory)
-                yield local_path, relative_path
-
-    def upload_zip_archive(self, bucket, analysis_id, source_directory, exclude_dirs, exclude_files, tlp=None):
-        """Compresses and uploads the analysis directory as a single zip file."""
-        log.debug("Compressing and uploading files for analysis ID %d to GCS bucket '%s'", analysis_id, bucket.name)
-        if tlp:
-            zip_name = "%s_tlp_%s.zip" % analysis_id, tlp
-        else:
-            zip_name = "%s.zip" % analysis_id
-        blob_name = zip_name
-
-        with tempfile.NamedTemporaryFile(delete=False, suffix=".zip") as tmp_zip_file:
-            tmp_zip_file_name = tmp_zip_file.name
-            with zipfile.ZipFile(tmp_zip_file, "w", zipfile.ZIP_DEFLATED) as archive:
-                for local_path, relative_path in self._iter_files_to_upload(source_directory, exclude_dirs, exclude_files):
-                    archive.write(local_path, relative_path)
-        try:
-            log.debug("Uploading '%s' to '%s'", tmp_zip_file_name, blob_name)
-            blob = bucket.blob(blob_name)
-            blob.upload_from_filename(tmp_zip_file_name)
-        finally:
-            os.unlink(tmp_zip_file_name)
-        log.info("Successfully uploaded archive for analysis %d to GCS.", analysis_id)
-
-    def upload_files_individually(self, bucket, analysis_id, source_directory, exclude_dirs, exclude_files, tlp=None):
-        """Uploads analysis files individually to the GCS bucket."""
-        log.debug("Uploading files for analysis ID %d to GCS bucket '%s'", analysis_id, bucket.name)
-        folder_name = analysis_id
-        if tlp:
-            folder_name = "%s_tlp_%s" % analysis_id, tlp
-        for local_path, relative_path in self._iter_files_to_upload(source_directory, exclude_dirs, exclude_files):
-            blob_name = f"{folder_name}/{relative_path}"
-            log.debug("Uploading '%s' to '%s'", local_path, blob_name)
-            blob = bucket.blob(blob_name)
-            blob.upload_from_filename(local_path)
-
-        log.info("Successfully uploaded files for analysis %d to GCS.", analysis_id)

utils/dist.py

Lines changed: 22 additions & 0 deletions
@@ -95,6 +95,18 @@
 NFS_FETCH = dist_conf.distributed.get("nfs")
 RESTAPI_FETCH = dist_conf.distributed.get("restapi")

+# GCS Configuration
+GCS_ENABLED = dist_conf.gcs.enabled
+
+if GCS_ENABLED:
+    from modules.reporting.gcs import GCSUploader
+    try:
+        # Initialize without args to load settings from reporting.conf
+        gcs_uploader = GCSUploader()
+    except Exception as e:
+        log.error("Failed to initialize GCS Uploader: %s", e)
+        GCS_ENABLED = False
+
 INTERVAL = 10

 # controller of dead nodes
@@ -994,6 +1006,16 @@ def fetch_latest_reports_nfs(self):
             except Exception as e:
                 log.exception("Failed to save iocs for parent sample: %s", str(e))

+            if GCS_ENABLED:
+                try:
+                    # report_path is the analysis folder root; the TLP marking
+                    # is read from the task object so the upload gets the same
+                    # _tlp_ suffix as reporting-side uploads.
+                    tlp = t.tlp
+                    gcs_uploader.upload(report_path, t.main_task_id, tlp=tlp)
+                except Exception as e:
+                    log.error("Failed to upload report to GCS for task %d: %s", t.main_task_id, e)
+
             t.retrieved = True
             t.finished = True
             db.commit()
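
To verify what the dist worker pushed, a short sketch using the google-cloud-storage client, not part of the commit (the bucket name and task ID are hypothetical):

from google.cloud import storage

client = storage.Client()  # Application Default Credentials, i.e. the "vm" path
task_id = 123              # hypothetical main_task_id

# Both upload modes prefix object names with the analysis/task ID, so a
# prefix listing shows either the single zip or the per-file objects.
for blob in client.list_blobs("cape-reports", prefix=str(task_id)):
    print(blob.name, blob.size)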
