add memory estimation for cache

christinadionysio · christinadionysio · commit 796b431c415e · 2026-03-10T11:02:58.000+01:00
diff --git a/src/main/python/systemds/scuro/drsearch/node_executor.py b/src/main/python/systemds/scuro/drsearch/node_executor.py
@@ -1,5 +1,6 @@
 from concurrent.futures import FIRST_COMPLETED, ProcessPoolExecutor, as_completed, wait
 from dataclasses import dataclass
+import os
 from systemds.scuro import Modality
 from systemds.scuro.drsearch.node_scheduler import MemoryAwareNodeScheduler
 from systemds.scuro.drsearch.representation_dag import (
@@ -20,6 +21,8 @@
     AggregatedRepresentation,
 )
 from systemds.scuro.representations.unimodal import UnimodalRepresentation
+from systemds.scuro.utils.checkpointing import CheckpointManager
+from pympler import asizeof
 
 
 class RefCountResultCache:
@@ -95,19 +98,28 @@ def _execute_task_worker(task: Any, data: Any, gpu_id: Optional[int]) -> Dict[st
     return {"scores": scores, "task_time": end - start}
 
 
+# TODO: add a checkpoint manager only to the node executor; maybe get the name from outside to distinguish between unimodal and multimodal checkpoint managers.
+# We can then exclude all DAG nodes that are loaded through an existing checkpoint and thereby speed up the remaining execution.
 class NodeExecutor:
     def __init__(
         self,
         dags: List[RepresentationDag],
         modalities: List[Modality],
         tasks: List[Any],
+        checkpoint_manager: Optional[CheckpointManager] = None,
         max_num_workers: int = -1,
     ):
         available_total_cpu = float(psutil.virtual_memory().available)
         self.dags = dags
         self.scheduler = MemoryAwareNodeScheduler(
             dags, modalities, tasks, available_total_cpu
         )
+        self.checkpoint_manager = CheckpointManager(
+            checkpoint_dir=os.getcwd(),
+            prefix="node_executor_checkpoint_",
+            checkpoint_every=1,
+            resume=False,
+        )
         self.max_num_workers = (
             min(mp.cpu_count(), max_num_workers)
             if max_num_workers != -1
@@ -185,8 +197,12 @@ def submit_new_ready_nodes():
                         self.scheduler.add_failed_node(node_id)
                         continue
 
-                    self.scheduler.complete_node(node_id)
+                    before_bytes = self._result_cache_size_bytes()
                     self._manage_result_cache(node_id, result)
+                    after_bytes = self._result_cache_size_bytes()
+                    self.scheduler.update_cpu_memory_in_use(after_bytes - before_bytes)
+                    self.scheduler.complete_node(node_id)
+
                     node = self.scheduler.mapping[node_id]
                     if self._is_task_node(node):
                         task_results[node_id].task_time = result["task_time"]
@@ -199,10 +215,17 @@ def submit_new_ready_nodes():
                         task_results[node_id].test_score = result["scores"][
                             2
                         ].average_scores
+                        self.checkpoint_manager.increment(node_id)
+                        self.checkpoint_manager.checkpoint_if_due(task_results)
                     submit_new_ready_nodes()
 
         return list(task_results.values())
 
+    def _result_cache_size_bytes(self) -> int:  # sum of pympler asizeof over result_cache.cache and result_cache.ref_count
+        return asizeof.asizeof(self.result_cache.cache) + asizeof.asizeof(
+            self.result_cache.ref_count
+        )
+
     def _manage_result_cache(self, node_id: str, result: Any):
         parent_node_id = self.scheduler.get_valid_parent(node_id)
         if parent_node_id is not None:
diff --git a/src/main/python/systemds/scuro/drsearch/node_scheduler.py b/src/main/python/systemds/scuro/drsearch/node_scheduler.py
@@ -53,13 +53,16 @@ def __init__(
             torch.cuda.device_count() if torch and torch.cuda.is_available() else 0
         )
         self.memory_stats = {
-            "cpu_in_use": 0.0,
+            "cpu_in_use": sum([self.node_resources[node][0] for node in self.leaves]),
             "gpu_in_use": {
                 info["index"]: int(info["total_b"] - info["free_b"])
                 for info in self.gpu_memory_info
             },
         }
 
+    def update_cpu_memory_in_use(self, delta_bytes: int):  # add a byte delta to the tracked "cpu_in_use"; the caller passes after - before, so it may be negative
+        self.memory_stats["cpu_in_use"] += delta_bytes
+
     def get_runnable(self) -> List[RepresentationNode]:
         runnable_nodes = self._get_runnable_nodes()
 
@@ -173,7 +176,14 @@ def _get_pending_nodes(self) -> List[str]:
 
     def _reserve_memory(self, node_id: str, gpu_id: int) -> bool:
         cpu_mem, gpu_mem = self.node_resources[node_id]
-
+        print(
+            f"Reserving memory for node {node_id}: CPU {cpu_mem} , GPU {gpu_mem} - Total CPU {self.memory_stats['cpu_in_use']}"
+            + (
+                f" , Total GPU {self.memory_stats['gpu_in_use'][gpu_id]}"
+                if gpu_id is not None
+                else ""
+            )
+        )
         self.memory_stats["cpu_in_use"] += cpu_mem
         if gpu_id is not None:
             self.memory_stats["gpu_in_use"][gpu_id] += gpu_mem
diff --git a/src/main/python/systemds/scuro/drsearch/unimodal_optimizer.py b/src/main/python/systemds/scuro/drsearch/unimodal_optimizer.py
@@ -220,7 +220,7 @@ def optimize_parallel(self, n_workers=None):
                             modality.modality_id, new_count
                         )
                         self._checkpoint_manager.checkpoint_if_due(
-                            self.operator_performance.results, "eval_count_by_modality"
+                            self.operator_performance.results,
                         )
                     except Exception as e:
                         print(f"Error processing modality {modality.modality_id}: {e}")
@@ -229,7 +229,6 @@ def optimize_parallel(self, n_workers=None):
                         traceback.print_exc()
                         self._checkpoint_manager.save_checkpoint(
                             self.operator_performance.results,
-                            "eval_count_by_modality",
                             {},
                         )
                         continue
@@ -259,7 +258,7 @@ def optimize(self):
                 new_count = self._count_results(local_result.results)
                 self._checkpoint_manager.increment(modality.modality_id, new_count)
                 self._checkpoint_manager.checkpoint_if_due(
-                    self.operator_performance.results, "eval_count_by_modality"
+                    self.operator_performance.results
                 )
                 if self.save_all_results:
                     self.store_results(f"{modality.modality_id}_unimodal_results.pkl")
@@ -269,7 +268,7 @@ def optimize(self):
 
                 traceback.print_exc()
                 self._checkpoint_manager.save_checkpoint(
-                    self.operator_performance.results, "eval_count_by_modality", {}
+                    self.operator_performance.results, {}
                 )
                 raise
 
@@ -336,7 +335,11 @@ def _process_modality(self, modality, skip_remaining: int = 0, scheduler=None):
         expanded_dags = self._expand_dags_with_task_roots(dags)
 
         node_executor = NodeExecutor(
-            expanded_dags, [modality], self.tasks, self.max_num_workers
+            expanded_dags,
+            [modality],
+            self.tasks,
+            self._checkpoint_manager,
+            self.max_num_workers,
         )
         task_results = node_executor.run()