diff --git a/lib/cuckoo/common/network_utils.py b/lib/cuckoo/common/network_utils.py index 9a14d77ec72..03e1d1f45c8 100644 --- a/lib/cuckoo/common/network_utils.py +++ b/lib/cuckoo/common/network_utils.py @@ -548,10 +548,11 @@ def winhttp_finalize_sessions(state): sessions_by_domain_keys[dom].add(key) if sessions_by_domain: + sessions_list = [{"host": dom, "events": evts} for dom, evts in sessions_by_domain.items()] out.append({ "process_id": p.get("process_id"), "process_name": p.get("process_name", ""), - "sessions": sessions_by_domain, + "sessions": sessions_list, }) return out diff --git a/modules/processing/CAPE.py b/modules/processing/CAPE.py index 5a1455f791e..5a13f2e244e 100644 --- a/modules/processing/CAPE.py +++ b/modules/processing/CAPE.py @@ -256,6 +256,14 @@ def process_file(self, file_path, append_file, metadata: dict, *, category: str, file_info["options_hash"] = options_hash + # GravityRAT is infector so it will produce a lot of files. we don't need them + if category == "dropped" and any("GravityRAT" in i.get("name", "") for i in file_info.get("cape_yara", [])): + # delete file and continue + log.info("GravityRAT detected, removing file: %s", file_path) + with suppress(OSError): + os.remove(file_path) + return + if category in ("static", "file"): file_info["name"] = Path(self.task["target"]).name @@ -440,6 +448,28 @@ def run(self): self.process_file( self.file_path, False, meta.get(self.file_path, {}), category=self.task["category"], duplicated=duplicated ) + if "target" not in self.results: + target_restored = False + try: + db_analysis = mongo_find_one("analysis", {"info.id": int(self.task["id"])}, {"target": 1, "_id": 0}) + if db_analysis and "target" in db_analysis: + self.results["target"] = db_analysis["target"] + target_restored = True + log.info("Restored missing target info from MongoDB analysis collection") + except Exception as e: + log.error("Failed to restore target info from MongoDB: %s", e) + + if not target_restored: + json_path = os.environ.get("CAPE_REPORT") or os.path.join(self.reports_path, "report.json") + if path_exists(json_path): + try: + with open(json_path, "r", encoding="utf-8") as f: + report_data = json.load(f) + if "target" in report_data: + self.results["target"] = report_data["target"] + log.info("Restored missing target info from existing report.json") + except Exception as e: + log.error("Failed to restore target info from existing report: %s", e) for folder in ("CAPE_path", "procdump_path", "dropped_path", "package_files"): category = folder.replace("_path", "").replace("_files", "") diff --git a/modules/processing/behavior.py b/modules/processing/behavior.py index d306293bda5..e04558c5d0b 100644 --- a/modules/processing/behavior.py +++ b/modules/processing/behavior.py @@ -1350,14 +1350,15 @@ def run(self): # BSON/JSON keys must be strings. # Let's convert tuple keys to string representation "ip:port" - endpoint_map_str = {} - for (ip, port), entries in self.endpoint_map.items(): - endpoint_map_str[f"{ip}:{port}"] = entries + endpoint_map_list = [{"ip_port": f"{ip}:{port}", "pinfo": entries} for (ip, port), entries in self.endpoint_map.items()] + + http_host_map_list = [{"host": k, "pinfo": v} for k, v in self.http_host_map.items()] + dns_intents_list = [{"domain": k, "intents": v} for k, v in self.dns_intents.items()] return { - "endpoint_map": endpoint_map_str, - "http_host_map": self.http_host_map, - "dns_intents": self.dns_intents, + "endpoint_map": endpoint_map_list, + "http_host_map": http_host_map_list, + "dns_intents": dns_intents_list, "http_requests": self.http_requests, "winhttp_sessions": winhttp_finalize_sessions(self._winhttp_state), } @@ -1468,6 +1469,9 @@ def run(self): instance.event_apicall(call, process) except Exception: log.exception('Failure in partial behavior "%s"', instance.key) + # Reset the iterator so reporting modules can read the calls again + with suppress(AttributeError): + process["calls"].reset() for instance in instances: try: diff --git a/modules/processing/network.py b/modules/processing/network.py index 891b98749ed..bb0b814a32a 100644 --- a/modules/processing/network.py +++ b/modules/processing/network.py @@ -46,8 +46,6 @@ log = logging.getLogger(__name__) - - try: import re2 as re except ImportError: @@ -1116,21 +1114,55 @@ def _import_ja3_fprints(self): def _load_network_map(self) -> Dict: with suppress(Exception): - return self.results.get("behavior", {}).get("network_map") or {} + behavior_net_map = self.results.get("behavior", {}).get("network_map") or {} + if not behavior_net_map: + return {} + + # Create a separate dictionary to avoid modifying self.results in place + net_map = behavior_net_map.copy() + + raw_http_host_map = net_map.get("http_host_map", {}) + if isinstance(raw_http_host_map, list): + net_map["http_host_map"] = {item["host"]: item["pinfo"] for item in raw_http_host_map} + + raw_dns_intents = net_map.get("dns_intents", {}) + if isinstance(raw_dns_intents, list): + net_map["dns_intents"] = {item["domain"]: item["intents"] for item in raw_dns_intents} + + # We need to deep copy winhttp_sessions if we are modifying its internal dicts + raw_winhttp = net_map.get("winhttp_sessions", []) + new_winhttp = [] + for p in raw_winhttp: + new_p = dict(p) + raw_sessions = p.get("sessions", {}) + if isinstance(raw_sessions, list): + new_p["sessions"] = {item["host"]: item["events"] for item in raw_sessions} + new_winhttp.append(new_p) + net_map["winhttp_sessions"] = new_winhttp + + return net_map return {} - def _reconstruct_endpoint_map(self, raw_map: Dict[str, List[Dict]]) -> Dict[tuple, List[Dict]]: + def _reconstruct_endpoint_map(self, raw_map) -> Dict[tuple, List[Dict]]: """ Convert JSON-friendly "ip:port" keys back to (ip, int(port)) tuples. """ endpoint_map = {} - for key, val in raw_map.items(): - try: - ip, port_str = key.rsplit(":", 1) - port = int(port_str) - endpoint_map[(ip, port)] = val - except (ValueError, IndexError): - continue + if isinstance(raw_map, list): + for item in raw_map: + try: + ip, port_str = item["ip_port"].rsplit(":", 1) + endpoint_map[(ip, int(port_str))] = item["pinfo"] + except (ValueError, IndexError, KeyError): + continue + elif isinstance(raw_map, dict): + for key, val in raw_map.items(): + try: + ip, port_str = key.rsplit(":", 1) + port = int(port_str) + endpoint_map[(ip, port)] = val + except (ValueError, IndexError): + continue return endpoint_map def _pick_best(self, candidates: List[Dict]) -> Optional[Dict]: diff --git a/modules/reporting/mongodb.py b/modules/reporting/mongodb.py index dfe39c1a299..b85f522d843 100644 --- a/modules/reporting/mongodb.py +++ b/modules/reporting/mongodb.py @@ -4,10 +4,12 @@ import gc import logging - +from contextlib import suppress +from lib.cuckoo.common.iocs import dump_iocs from lib.cuckoo.common.abstracts import Report from lib.cuckoo.common.exceptions import CuckooDependencyError, CuckooReportError from modules.reporting.report_doc import ensure_valid_utf8, get_json_document, insert_calls +from lib.cuckoo.common.config import Config try: from pymongo.errors import InvalidDocument, OperationFailure @@ -22,6 +24,7 @@ MEGABYTE = 0x100000 log = logging.getLogger(__name__) +reporting_conf = Config("reporting") class MongoDB(Report): @@ -76,14 +79,18 @@ def loop_saver(self, report): if "_id" in keys: keys.remove("_id") + # We insert the info section first to get an _id obj_id = mongo_insert_one("analysis", {"info": report["info"]}).inserted_id keys.remove("info") for key in keys: try: - mongo_update_one("analysis", {"_id": obj_id}, {"$set": {key: report[key]}}, bypass_document_validation=True) + # We include info here so that mongo hooks (like normalize_files) can get the task_id + mongo_update_one("analysis", {"_id": obj_id}, {"$set": {key: report[key], "info": report["info"]}}, bypass_document_validation=True) except InvalidDocument: log.warning("Investigate your key: %s", key) + except Exception as e: + log.error("Failed to update key %s in loop_saver: %s", key, e) def run(self, results): """Writes report. @@ -108,28 +115,49 @@ def run(self, results): # the original dictionary and possibly compromise the following # reporting modules. report = get_json_document(results, self.analysis_path) + if not report or "info" not in report: + log.error("Failed to get JSON document or 'info' key is missing for Task") + return - mongo_delete_data(int(report["info"]["id"])) - log.debug("Deleted previous MongoDB data for Task %s", report["info"]["id"]) + local_task_id = int(report["info"].get("id", 0)) + if not local_task_id: + log.error("Task ID is missing in report['info']") + return # trick for distributed api - if results.get("info", {}).get("options", {}).get("main_task_id", ""): - report["info"]["id"] = int(results["info"]["options"]["main_task_id"]) + main_task_id = results.get("info", {}).get("options", {}).get("main_task_id") + if main_task_id: + with suppress(ValueError, TypeError): + report["info"]["id"] = int(main_task_id) if "network" not in report: report["network"] = {} + if "behavior" not in report or not isinstance(report["behavior"], dict): + report["behavior"] = {"processes": [], "processtree": [], "summary": {}} + + # Delete old data just before inserting new one to avoid "missing report" window + # or data loss if insertion fails during preparation (e.g. OOM) + ids_to_delete = {local_task_id, int(report["info"]["id"])} + log.debug("Deleting previous MongoDB data for Task IDs: %s", ids_to_delete) + mongo_delete_data(list(ids_to_delete)) + new_processes = insert_calls(report, mongodb=True) # Store the results in the report. - report["behavior"] = dict(report["behavior"]) report["behavior"]["processes"] = new_processes + # Store iocs as file + if reporting_conf.mongodb.dump_iocs: + dump_iocs(report, local_task_id) + ensure_valid_utf8(report) gc.collect() # Store the report and retrieve its object id. try: + log.debug("Inserting new MongoDB report for Task %s", report["info"]["id"]) mongo_insert_one("analysis", report) + except OperationFailure as e: # Check for error codes indicating the BSON object was too large # (10334 BSONObjectTooLarge) or the maximum nested object depth was @@ -145,8 +173,6 @@ def run(self, results): log.error("Deleting behavior process tree parent from results: %s", str(e)) del report["behavior"]["processtree"][0] mongo_insert_one("analysis", report) - else: - raise CuckooReportError("Failed inserting report in Mongo") from e except InvalidDocument as e: if str(e).startswith("cannot encode object") or "must not contain" in str(e): self.loop_saver(report) @@ -193,3 +219,8 @@ def run(self, results): except Exception as e: log.error("Failed to delete child key: %s", e) error_saved = False + + if error_saved: + log.error("Failed to insert report into MongoDB even after attempting to fix large documents for Task %s", report["info"]["id"]) + except Exception as e: + log.exception("Failed to store report in MongoDB for Task %s: %s", report["info"]["id"], e)