Add per-node memory usage tracking for cached modalities

christinadionysio · christinadionysio · commit 215f032cd8ad · 2026-03-12T13:31:52.000+01:00
diff --git a/src/main/python/systemds/scuro/drsearch/node_executor.py b/src/main/python/systemds/scuro/drsearch/node_executor.py
@@ -51,12 +51,17 @@ class RefCountResultCache:
     def __init__(self):
         self.cache = {}
         self.ref_count = {}
+        self.memory_usage_per_node = {}  # node_id -> bytes from the cached result's calculate_memory_usage()
 
     def get(self, node_id: str) -> Any:
         return self.cache[node_id]
 
     def add_result(self, node_id: str, result: Any):
         self.cache[node_id] = result
+        self.memory_usage_per_node[node_id] = result.calculate_memory_usage()  # measured once on insert; result must provide this method
+        print(
+            f"Node {node_id} has a CPU memory usage of {self.memory_usage_per_node[node_id]/1024**3:.5f} GB"
+        )
 
     def inc_ref(self, node_id: str):
         if node_id not in self.ref_count:
@@ -68,6 +73,7 @@ def dec_ref(self, node_id: str):
         if self.ref_count[node_id] == 0:
             del self.cache[node_id]
             del self.ref_count[node_id]
+            del self.memory_usage_per_node[node_id]
 
     def clear(self, node_id: str):
         del self.cache[node_id]
@@ -76,6 +82,9 @@ def clear(self, node_id: str):
     def __len__(self):
         return len(self.cache)
 
+    def get_memory_total_memory_usage(self):  # ignores sizes of nodes no longer in self.cache
+        return sum(b for n, b in self.memory_usage_per_node.items() if n in self.cache)
+
 
 def _execute_node_worker(
     node: RepresentationNode,
@@ -84,6 +93,9 @@ def _execute_node_worker(
     rep_cache: Optional[Dict[str, Any]],
     gpu_id: Optional[int],
 ):
+    proc = psutil.Process(os.getpid())
+    before = proc.memory_info().rss  # bytes
+
     if gpu_id is not None:
         device = torch.device(f"cuda:{gpu_id}")
         torch.cuda.set_device(device)
@@ -92,9 +104,9 @@ def _execute_node_worker(
     result = None
     node_operation = node.operation(params=node.parameters)
     operation_name = node_operation.name
-    print(
-        f"Executing node {node.node_id} inputs: {input_mods[0].modality_id}, gpu: {gpu_id}, operation: {operation_name}"
-    )
+    # print(
+    #     f"Executing node {node.node_id} inputs: {input_mods[0].modality_id}, gpu: {gpu_id}, operation: {operation_name}"
+    # )
     if gpu_id is not None and hasattr(node_operation, "gpu_id"):
         node_operation.gpu_id = gpu_id
 
@@ -120,13 +132,14 @@ def _execute_node_worker(
             )
         else:
             result = input_mods[0].combine(input_mods[1:], fusion_op)
-    peak_kb = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
+    delta_bytes = proc.memory_info().rss - before
     gpu_peak_bytes = (
         torch.cuda.max_memory_allocated(device) if gpu_id is not None else 0
     )
+    # print(f"Node {node.node_id}: {operation_name} changed the process RSS by {delta_bytes/1024**3:.2f} GB (end minus start, not a peak), and has a GPU peak memory usage of {gpu_peak_bytes/1024**3:.2f} GB")
     return {
         "result": result,
-        "peak_bytes": peak_kb * 1024,
+        "peak_bytes": delta_bytes,
         "gpu_peak_bytes": gpu_peak_bytes,
         "operation_name": operation_name,
     }
@@ -135,7 +148,10 @@ def _execute_node_worker(
 def _execute_task_worker(
     task_node_id: str, task: Any, data: Any, gpu_id: Optional[int]
 ) -> Dict[str, Any]:
-    print(f"Executing task {task_node_id} on GPU {gpu_id}")
+    proc = psutil.Process(os.getpid())
+    before = proc.memory_info().rss  # bytes
+
+    # print(f"Executing task {task_node_id} on GPU {gpu_id}")
     if gpu_id is not None:
         device = torch.device(f"cuda:{gpu_id}")
         torch.cuda.set_device(device)
@@ -146,14 +162,15 @@ def _execute_task_worker(
     start = time.perf_counter()
     scores = task.run(data)
     end = time.perf_counter()
-    peak_kb = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
+    delta_bytes = proc.memory_info().rss - before
     gpu_peak_bytes = (
         torch.cuda.max_memory_allocated(device) if gpu_id is not None else 0
     )
+    # print(f"Task {task_node_id} changed the process RSS by {delta_bytes/1024**3:.2f} GB (end minus start, not a peak), and has a GPU peak memory usage of {gpu_peak_bytes/1024**3:.2f} GB")
     return {
         "scores": scores,
         "task_time": end - start,
-        "peak_bytes": peak_kb * 1024,
+        "peak_bytes": delta_bytes,
         "gpu_peak_bytes": gpu_peak_bytes,
     }
 
@@ -290,6 +307,8 @@ def submit_new_ready_nodes():
                             "task",
                             memory_usage_data,
                         )
+                        self.scheduler.complete_node(node_id)
+
                     else:
                         transformed_modality = result["result"]
                         self._checkpoint_memory_usage(
@@ -299,9 +318,9 @@ def submit_new_ready_nodes():
                             result["operation_name"],
                             memory_usage_data,
                         )
-                        before_bytes = self._result_cache_size_bytes()
+                        before_bytes = self.result_cache.get_memory_total_memory_usage()
                         self._manage_result_cache(node_id, transformed_modality)
-                        after_bytes = self._result_cache_size_bytes()
+                        after_bytes = self.result_cache.get_memory_total_memory_usage()
                         self.scheduler.update_cpu_memory_in_use(
                             after_bytes - before_bytes
                         )
@@ -327,6 +346,9 @@ def _checkpoint_memory_usage(
             "estimated_cpu_bytes": self.scheduler.node_resources[node_id][0],
             "estimated_gpu_bytes": self.scheduler.node_resources[node_id][1],
         }
+        print(
+            f"Node {node_id}: {operation_name} has a CPU peak memory usage of {peak_bytes/1024**3:.2f}/{self.scheduler.node_resources[node_id][0]/1024**3:.2f} GB estimated, and a GPU peak memory usage of {gpu_peak_bytes/1024**3:.2f}/{self.scheduler.node_resources[node_id][1]/1024**3:.2f} GB estimated "
+        )
         self.memory_usage_checkpoint.checkpoint_if_due(data)
 
     def _result_cache_size_bytes(self) -> int:
@@ -357,10 +379,10 @@ def _manage_result_cache(self, node_id: str, result: Any):
             self.result_cache.add_result(node_id, result)
 
         if (
-            node_id in self.result_cache.ref_count
-            and self.result_cache.ref_count[node_id] == 0
+            parent_node_id in self.result_cache.ref_count
+            and self.result_cache.ref_count[parent_node_id] == 0
         ):
-            self.result_cache.clear(node_id)
+            self.result_cache.clear(parent_node_id)
 
     def _get_nodes_by_ids(self, nodes_ids: List[str]) -> List[RepresentationNode]:
         return [self.scheduler.mapping[node_id] for node_id in nodes_ids]
diff --git a/src/main/python/systemds/scuro/drsearch/task.py b/src/main/python/systemds/scuro/drsearch/task.py
@@ -24,7 +24,7 @@
 from systemds.scuro.models.model import Model
 import numpy as np
 from sklearn.model_selection import train_test_split
-
+import sys
 from systemds.scuro.representations.representation import RepresentationStats
 
 
@@ -104,15 +104,53 @@ def get_output_stats(self, input_stats):
         return RepresentationStats(0, (0,))
 
     def estimate_peak_memory_bytes(self, input_stats):
+        # Heuristic CPU footprint of the data the task itself holds. The
+        # multipliers (x3 for labels and input, x2 for index lists) are
+        # presumably headroom for temporary copies -- TODO confirm.
+
+        def index_bytes(indices):
+            # Sum of the per-element sys.getsizeof, doubled.
+            return sum(sys.getsizeof(index) for index in indices) * 2
+
+        flat_indices_bytes = sum(
+            index_bytes(indices)
+            for indices in (
+                self.train_indices,
+                self.test_indices,
+                self.fusion_train_indices,
+            )
+        )
+        cv_indices_bytes = sum(
+            index_bytes(fold)
+            for folds in (self.cv_train_indices, self.cv_val_indices)
+            for fold in folds
+        )
+        label_bytes = self.labels.nbytes * 3  # should be a np array
+
+        # 4 bytes per value (presumably float32 -- TODO confirm); counted once
+        # here and three more times below, i.e. four copies of the input.
+        input_stats_bytes = input_stats.num_instances * input_stats.output_shape[0] * 4
+        input_data = input_stats_bytes * 3
+
+        # CPU bytes attributable to the task regardless of the model.
+        total_bytes = input_data + label_bytes + flat_indices_bytes + cv_indices_bytes
+        task_cpu = total_bytes + input_stats_bytes
         if hasattr(self.model, "estimate_peak_memory_bytes"):
-            # TODO: Investigate the influence of cv on the memory footprint of the task
-            return self.model.estimate_peak_memory_bytes(
-                input_stats.output_shape[0], len(self.train_indices)
+            # Accept a (cpu, gpu) pair or a dict shaped like this method's result.
+            model_estimate = self.model.estimate_peak_memory_bytes(
+                input_stats.output_shape[0],
+                len(self.train_indices),
             )
+            if isinstance(model_estimate, dict):
+                model_cpu = model_estimate["cpu_peak_bytes"]
+                model_gpu = model_estimate["gpu_peak_bytes"]
+            else:
+                model_cpu, model_gpu = model_estimate
+            return {"cpu_peak_bytes": model_cpu + task_cpu, "gpu_peak_bytes": model_gpu}
         else:
             # TODO: Implement a default estimate of the peak memory bytes for the task
             return {
-                "cpu_peak_bytes": 0,
+                "cpu_peak_bytes": task_cpu,
                 "gpu_peak_bytes": 0,
             }
 
diff --git a/src/main/python/systemds/scuro/modality/modality.py b/src/main/python/systemds/scuro/modality/modality.py
@@ -46,8 +46,6 @@ def __init__(
         self.metadata = metadata
         self.data = []
         self.data_type = data_type
-        self.cost = None
-        self.shape = None
         self.modality_id = modality_id
         self.transform_time = transform_time if transform_time else 0
         self.stats = RepresentationStats(0, ())
diff --git a/src/main/python/systemds/scuro/modality/transformed.py b/src/main/python/systemds/scuro/modality/transformed.py
@@ -25,7 +25,7 @@
 from systemds.scuro.modality.modality import Modality
 from systemds.scuro.representations.window_aggregation import WindowAggregation
 import time
-import copy
+import sys
 
 
 class TransformedModality(Modality):
@@ -56,7 +56,7 @@ def __init__(
         )
         if set_data:
             self.data = modality.data
-        self.transformation = None
+
         self.self_contained = (
             self_contained and transformation.self_contained
             if isinstance(transformation, TransformedModality)
@@ -84,8 +84,28 @@ def __init__(
     #         )
     #     self.transformation.append(transformation)
 
-    def copy_from_instance(self):
-        return type(self)(self, self.transformation)
+    def calculate_memory_usage(self):
+        """Estimate the CPU memory held by this modality in bytes.
+
+        Data instances and metadata keys/values are measured through
+        ``_estimate_data_bytes``; the other attributes add ``sys.getsizeof``.
+        """
+        data_bytes = sum(self._estimate_data_bytes(inst) for inst in self.data)
+        # Treat unset (None) metadata as empty instead of failing on .items().
+        md_bytes = sum(
+            self._estimate_data_bytes(key) + self._estimate_data_bytes(value)
+            for key, value in (self.metadata or {}).items()
+        )
+        attributes = (
+            self.data_type,
+            self.modality_id,
+            self.schema,
+            self.stats,
+            self.self_contained,
+            self.transform_time,
+            self.modality_type,
+        )
+        return data_bytes + md_bytes + sum(sys.getsizeof(a) for a in attributes)
 
     def join(self, right, join_condition):
         chunked_execution = False
@@ -225,3 +245,13 @@ def _padded_dimensionality_reduction(self, dimensionality_reduction_operator):
             all_outputs.append(out)
             start = end
         return np.concatenate(all_outputs, axis=0)
+
+    def _estimate_data_bytes(self, instance):
+        """Recursively estimate the payload size of ``instance`` in bytes."""
+        if isinstance(instance, np.ndarray):
+            return instance.nbytes
+        if isinstance(instance, dict):
+            instance = list(instance.values())  # only the values are counted
+        if isinstance(instance, (list, tuple)):  # tuples recurse like lists
+            return sum(self._estimate_data_bytes(item) for item in instance)
+        return sys.getsizeof(instance)  # shallow size for everything else