dynamicVRAM + --cache-ram 2 (CORE-117) (#13603)

rattus128 · web-flow · commit fce0398470fe · 2026-04-28T19:15:02.000-04:00
* pinned_memory: remove JIT RAM pressure release

This doesn't work, as freeing intermediates for pins needs to be
higher-priority than freeing pins-for-pins, if and when you are going
to do that. So this is too late, as pins-for-pins happens at model load
time and we don't have JIT pins-for-pins.

* caching: Add a filter to only free intermediates from inactive workflows

This is to get the freeing priorities straight with respect to pins.

* mm: free inactive-ram from RAM cache first

Stuff from inactive workflows should be freed before anything else.

* caching: purge old ModelPatchers first

Don't try to score them; just dump them at the first sign of trouble
if they aren't part of the workflow.
diff --git a/comfy/model_management.py b/comfy/model_management.py
@@ -663,6 +663,7 @@ def minimum_inference_memory():
 
 def free_memory(memory_required, device, keep_loaded=[], for_dynamic=False, pins_required=0, ram_required=0):
     cleanup_models_gc()
+    comfy.memory_management.extra_ram_release(max(pins_required, ram_required))
     unloaded_model = []
     can_unload = []
     unloaded_models = []
diff --git a/comfy/pinned_memory.py b/comfy/pinned_memory.py
@@ -2,7 +2,6 @@
 import comfy.memory_management
 import comfy_aimdo.host_buffer
 import comfy_aimdo.torch
-import psutil
 
 from comfy.cli_args import args
 
@@ -12,11 +11,6 @@ def get_pin(module):
 def pin_memory(module):
     if module.pin_failed or args.disable_pinned_memory or get_pin(module) is not None:
         return
-    #FIXME: This is a RAM cache trigger event
-    ram_headroom = comfy.memory_management.RAM_CACHE_HEADROOM
-    #we split the difference and assume half the RAM cache headroom is for us
-    if ram_headroom > 0 and psutil.virtual_memory().available < (ram_headroom * 0.5):
-        comfy.memory_management.extra_ram_release(ram_headroom)
 
     size = comfy.memory_management.vram_aligned_size([ module.weight, module.bias ])
 
diff --git a/comfy_execution/caching.py b/comfy_execution/caching.py
@@ -5,6 +5,7 @@
 import time
 import torch
 from typing import Sequence, Mapping, Dict
+from comfy.model_patcher import ModelPatcher
 from comfy_execution.graph import DynamicPrompt
 from abc import ABC, abstractmethod
 
@@ -523,13 +524,15 @@ def set_local(self, node_id, value):
         self.timestamps[self.cache_key_set.get_data_key(node_id)] = time.time()
         super().set_local(node_id, value)
 
-    def ram_release(self, target):
+    def ram_release(self, target, free_active=False):
         if psutil.virtual_memory().available >= target:
             return
 
         clean_list = []
 
         for key, cache_entry in self.cache.items():
+            if not free_active and self.used_generation[key] == self.generation:
+                continue
             oom_score =  RAM_CACHE_OLD_WORKFLOW_OOM_MULTIPLIER ** (self.generation - self.used_generation[key])
 
             ram_usage = RAM_CACHE_DEFAULT_RAM_USAGE
@@ -542,6 +545,9 @@ def scan_list_for_ram_usage(outputs):
                         scan_list_for_ram_usage(output)
                     elif isinstance(output, torch.Tensor) and output.device.type == 'cpu':
                         ram_usage += output.numel() * output.element_size()
+                    elif isinstance(output, ModelPatcher) and self.used_generation[key] != self.generation:
+                        #old ModelPatchers are the first to go
+                        ram_usage = 1e30
             scan_list_for_ram_usage(cache_entry.outputs)
 
             oom_score *= ram_usage
diff --git a/execution.py b/execution.py
@@ -779,7 +779,7 @@ async def execute_async(self, prompt, prompt_id, extra_data={}, execute_outputs=
 
                     if self.cache_type == CacheType.RAM_PRESSURE:
                         comfy.model_management.free_memory(0, None, pins_required=ram_headroom, ram_required=ram_headroom)
-                        comfy.memory_management.extra_ram_release(ram_headroom)
+                        ram_release_callback(ram_headroom, free_active=True)
                 else:
                     # Only execute when the while-loop ends without break
                     # Send cached UI for intermediate output nodes that weren't executed