Skip to content

Commit d975e51

Browse files
authored
gcs mode (kevoreilly#2718)
1 parent 7ed9de0 commit d975e51

File tree

2 files changed

+63
-31
lines changed

2 files changed

+63
-31
lines changed

conf/default/reporting.conf.default

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -231,6 +231,10 @@ exclude_dirs = logs, shots
231231
# Good examples are large report formats you don't need in GCS.
232232
exclude_files =
233233

234+
# Mode: zip - submits all files and folders as a single zip archive. Useful to avoid spamming pub/sub notifications on every file creation.
235+
# Mode: file - submits files one by one.
236+
mode = zip
237+
234238
# Can be vm or json
235239
auth_by = vm
236240
# only if auth_by = json. The absolute path to your Google Cloud service account JSON key file.

modules/reporting/gcs.py

Lines changed: 59 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
import os
22
import logging
3+
import tempfile
4+
import zipfile
35
from lib.cuckoo.common.constants import CUCKOO_ROOT
46
from lib.cuckoo.common.abstracts import Report
57
from lib.cuckoo.common.exceptions import CuckooReportError
@@ -40,7 +42,6 @@ def run(self, results):
4042
)
4143
return
4244

43-
# Read configuration options from gcs.conf
4445
# Read configuration options from gcs.conf and validate them
4546
bucket_name = self.options.get("bucket_name")
4647
if not bucket_name:
@@ -66,8 +67,7 @@ def run(self, results):
6667
exclude_dirs_str = self.options.get("exclude_dirs", "")
6768
exclude_files_str = self.options.get("exclude_files", "")
6869

69-
# --- NEW: Parse the exclusion strings into sets for efficient lookups ---
70-
# The `if item.strip()` ensures we don't have empty strings from trailing commas
70+
# Parse the exclusion strings into sets for efficient lookups
7171
exclude_dirs = {item.strip() for item in exclude_dirs_str.split(",") if item.strip()}
7272
exclude_files = {item.strip() for item in exclude_files_str.split(",") if item.strip()}
7373

@@ -76,6 +76,9 @@ def run(self, results):
7676
if exclude_files:
7777
log.debug("GCS reporting will exclude files: %s", exclude_files)
7878

79+
# Get the upload mode, defaulting to 'file' for backward compatibility
80+
mode = self.options.get("mode", "file")
81+
7982
try:
8083
# --- Authentication ---
8184
log.debug("Authenticating with Google Cloud Storage...")
@@ -87,39 +90,64 @@ def run(self, results):
8790
"The specified GCS bucket '%s' does not exist or you don't have permission to access it.", bucket_name
8891
)
8992

90-
# --- File Upload ---
91-
# Use the analysis ID as a "folder" in the bucket
9293
analysis_id = results.get("info", {}).get("id")
9394
if not analysis_id:
9495
raise CuckooReportError("Could not get analysis ID from results.")
9596

96-
log.debug("Uploading files for analysis ID %d to GCS bucket '%s'", analysis_id, bucket_name)
97-
98-
# self.analysis_path is the path to the analysis results directory
99-
# e.g., /opt/cape/storage/analyses/123/
10097
source_directory = self.analysis_path
10198

102-
for root, dirs, files in os.walk(source_directory):
103-
# We modify 'dirs' in-place to prevent os.walk from descending into them.
104-
# This is the most efficient way to skip entire directory trees.
105-
dirs[:] = [d for d in dirs if d not in exclude_dirs]
106-
107-
for filename in files:
108-
# --- NEW: File Exclusion Logic ---
109-
if filename in exclude_files:
110-
log.debug("Skipping excluded file: %s", os.path.join(root, filename))
111-
continue # Skip to the next file
112-
113-
local_path = os.path.join(root, filename)
114-
relative_path = os.path.relpath(local_path, source_directory)
115-
blob_name = f"{analysis_id}/{relative_path}"
116-
117-
log.debug("Uploading '%s' to '%s'", local_path, blob_name)
118-
119-
blob = bucket.blob(blob_name)
120-
blob.upload_from_filename(local_path)
121-
122-
log.info("Successfully uploaded files for analysis %d to GCS.", analysis_id)
99+
if mode == "zip":
100+
self.upload_zip_archive(bucket, analysis_id, source_directory, exclude_dirs, exclude_files)
101+
elif mode == "file":
102+
self.upload_files_individually(bucket, analysis_id, source_directory, exclude_dirs, exclude_files)
103+
else:
104+
raise CuckooReportError("Invalid GCS upload mode specified: %s. Must be 'file' or 'zip'.", mode)
123105

124106
except Exception as e:
125-
raise CuckooReportError("Failed to upload report to GCS: %s", str(e))
107+
raise CuckooReportError(f"Failed to upload report to GCS: {e}") from e
108+
109+
def _iter_files_to_upload(self, source_directory, exclude_dirs, exclude_files):
110+
"""Generator that yields files to be uploaded, skipping excluded ones."""
111+
for root, dirs, files in os.walk(source_directory):
112+
# Exclude specified directories
113+
dirs[:] = [d for d in dirs if d not in exclude_dirs]
114+
for filename in files:
115+
# Exclude specified files
116+
if filename in exclude_files:
117+
log.debug("Skipping excluded file: %s", os.path.join(root, filename))
118+
continue
119+
120+
local_path = os.path.join(root, filename)
121+
relative_path = os.path.relpath(local_path, source_directory)
122+
yield local_path, relative_path
123+
124+
def upload_zip_archive(self, bucket, analysis_id, source_directory, exclude_dirs, exclude_files):
    """Compress the analysis directory into a single zip and upload it.

    The archive is stored in the bucket as '<analysis_id>.zip'. Uploading
    one object instead of many avoids a pub/sub notification per file.

    Args:
        bucket: google-cloud-storage Bucket to upload into.
        analysis_id: numeric analysis ID, used as the archive name.
        source_directory: local analysis results directory to compress.
        exclude_dirs: set of directory names to skip entirely.
        exclude_files: set of file names to skip.
    """
    log.debug("Compressing and uploading files for analysis ID %d to GCS bucket '%s'", analysis_id, bucket.name)
    blob_name = f"{analysis_id}.zip"

    # Create the temp file only to reserve a unique name; close the handle
    # immediately so the zip can be written and later re-read by path.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".zip") as tmp_zip_file:
        tmp_zip_file_name = tmp_zip_file.name

    try:
        # BUG FIX: archive creation is now inside the try/finally, so the
        # temp file no longer leaks when zipping itself raises (previously
        # only the upload step was covered by the cleanup).
        with zipfile.ZipFile(tmp_zip_file_name, "w", zipfile.ZIP_DEFLATED) as archive:
            for local_path, relative_path in self._iter_files_to_upload(source_directory, exclude_dirs, exclude_files):
                archive.write(local_path, relative_path)

        log.debug("Uploading '%s' to '%s'", tmp_zip_file_name, blob_name)
        blob = bucket.blob(blob_name)
        blob.upload_from_filename(tmp_zip_file_name)
    finally:
        os.unlink(tmp_zip_file_name)
    log.info("Successfully uploaded archive for analysis %d to GCS.", analysis_id)
144+
def upload_files_individually(self, bucket, analysis_id, source_directory, exclude_dirs, exclude_files):
    """Upload each non-excluded analysis file as its own GCS object.

    Objects are placed under the '<analysis_id>/' prefix, mirroring the
    relative layout of the local analysis directory.
    """
    log.debug("Uploading files for analysis ID %d to GCS bucket '%s'", analysis_id, bucket.name)

    candidates = self._iter_files_to_upload(source_directory, exclude_dirs, exclude_files)
    for src_path, rel_path in candidates:
        destination = f"{analysis_id}/{rel_path}"
        log.debug("Uploading '%s' to '%s'", src_path, destination)
        bucket.blob(destination).upload_from_filename(src_path)

    log.info("Successfully uploaded files for analysis %d to GCS.", analysis_id)

0 commit comments

Comments
 (0)