Merge pull request #20 from silveroxides/fix/dynamic-vram-speed

silveroxides · web-flow · commit 86bb28bef8ac · 2026-03-23T15:11:39.000+01:00
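Fix dynamic VRAM speed: default `disable_dynamic` to False in the loader nodes and route quantized weights through `cast_bias_weight`/`uncast_bias_weight` in `forward_comfy_cast_weights`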
diff --git a/nodes/loader_nodes.py b/nodes/loader_nodes.py
@@ -39,7 +39,7 @@ def INPUT_TYPES(cls):
                 "ckpt_name": (folder_paths.get_filename_list("checkpoints"),),
                 "quant_format": (["auto", "int8", "int8_tensorwise", "float8_e4m3fn", "float8_e4m3fn_blockwise", "float8_e4m3fn_rowwise", "mxfp8", "hybrid_mxfp8", "nvfp4"],),
                 "kernel_backend": (["pytorch", "triton"],),
-                "disable_dynamic": ("BOOLEAN", {"default": True}),
+                "disable_dynamic": ("BOOLEAN", {"default": False}),
             },
         }
 
@@ -160,7 +160,7 @@ def INPUT_TYPES(cls):
                 "unet_name": (folder_paths.get_filename_list("diffusion_models"),),
                 "quant_format": (["auto", "int8", "int8_tensorwise", "float8_e4m3fn", "float8_e4m3fn_blockwise", "float8_e4m3fn_rowwise", "mxfp8", "hybrid_mxfp8", "nvfp4"],),
                 "kernel_backend": (["pytorch", "triton"],),
-                "disable_dynamic": ("BOOLEAN", {"default": True}),
+                "disable_dynamic": ("BOOLEAN", {"default": False}),
             },
         }
 
@@ -265,7 +265,7 @@ def INPUT_TYPES(cls):
                 "type": (cls.CLIP_TYPES,),
                 "quant_format": (["auto", "int8", "int8_tensorwise", "float8_e4m3fn", "float8_e4m3fn_blockwise", "float8_e4m3fn_rowwise", "mxfp8", "hybrid_mxfp8", "nvfp4"],),
                 "kernel_backend": (["pytorch", "triton"],),
-                "disable_dynamic": ("BOOLEAN", {"default": True}),
+                "disable_dynamic": ("BOOLEAN", {"default": False}),
             },
         }
 
@@ -383,7 +383,7 @@ def INPUT_TYPES(cls):
                 "type": (cls.CLIP_TYPES,),
                 "quant_format": (["auto", "int8", "int8_tensorwise", "float8_e4m3fn", "float8_e4m3fn_blockwise", "float8_e4m3fn_rowwise", "mxfp8", "hybrid_mxfp8", "nvfp4"],),
                 "kernel_backend": (["pytorch", "triton"],),
-                "disable_dynamic": ("BOOLEAN", {"default": True}),
+                "disable_dynamic": ("BOOLEAN", {"default": False}),
             },
         }
 
@@ -789,7 +789,7 @@ def INPUT_TYPES(cls):
         return {
             "required": {
                 "ckpt_name": (folder_paths.get_filename_list("checkpoints"),),
-                "disable_dynamic": ("BOOLEAN", {"default": True}),
+                "disable_dynamic": ("BOOLEAN", {"default": False}),
             },
         }
     RETURN_TYPES = ("MODEL", "CLIP", "VAE")
@@ -808,7 +808,7 @@ def INPUT_TYPES(cls):
         return {
             "required": {
                 "unet_name": (folder_paths.get_filename_list("diffusion_models"),),
-                "disable_dynamic": ("BOOLEAN", {"default": True}),
+                "disable_dynamic": ("BOOLEAN", {"default": False}),
             },
         }
     RETURN_TYPES = ("MODEL",)
@@ -828,7 +828,7 @@ def INPUT_TYPES(cls):
             "required": {
                 "clip_name": (folder_paths.get_filename_list("text_encoders"),),
                 "type": (QuantizedCLIPLoader.CLIP_TYPES,),
-                "disable_dynamic": ("BOOLEAN", {"default": True}),
+                "disable_dynamic": ("BOOLEAN", {"default": False}),
             },
         }
     RETURN_TYPES = ("CLIP",)
@@ -851,7 +851,7 @@ def INPUT_TYPES(cls):
                 "text_encoder1": (te_list,),
                 "text_encoder2": (te_and_ckpt_list,),
                 "type": (QuantizedDualCLIPLoader.CLIP_TYPES,),
-                "disable_dynamic": ("BOOLEAN", {"default": True}),
+                "disable_dynamic": ("BOOLEAN", {"default": False}),
             },
         }
     RETURN_TYPES = ("CLIP",)
diff --git a/unified_ops.py b/unified_ops.py
@@ -288,17 +288,22 @@ def forward_comfy_cast_weights(self, input):
 
             input_dtype = input.dtype
 
-            if isinstance(weight, QuantizedTensor):
-                if weight.device != input.device:
-                    weight = weight.to(device=input.device)
+            is_quantized_fast_path = isinstance(weight, QuantizedTensor)
+            cast_dtype = weight.dtype if is_quantized_fast_path else None
+            cast_bias_dtype = input_dtype if is_quantized_fast_path else None
+            
+            weight, bias, offload_stream = cast_bias_weight(
+                self,
+                input,
+                dtype=cast_dtype,
+                bias_dtype=cast_bias_dtype,
+                offloadable=True,
+            )
 
+            if isinstance(weight, QuantizedTensor):
                 if hasattr(weight, "_params"):
                     object.__setattr__(weight._params, "orig_dtype", input_dtype)
 
-                bias = self.bias
-                if bias is not None:
-                    bias = bias.to(device=input.device, dtype=input_dtype)
-
                 if self.layout_type == "TensorCoreMXFP8Layout":
                     input_shape = input.shape
                     tensor_3d = input.ndim == 3
@@ -314,16 +319,15 @@ def forward_comfy_cast_weights(self, input):
                             q_input = input
                             
                         q_input = QuantizedTensor.from_float(q_input, "TensorCoreMXFP8Layout")
-                        output = torch.nn.functional.linear(q_input, weight, bias)
+                        out = torch.nn.functional.linear(q_input, weight, bias)
                         if tensor_3d:
-                            output = output.reshape(input_shape[0], input_shape[1], -1)
+                            out = out.reshape(input_shape[0], input_shape[1], -1)
                         if input.dtype == torch.float32:
-                            return output.to(torch.float32)
-                        return output
+                            out = out.to(torch.float32)
                     else:
-                        return torch.nn.functional.linear(input.reshape(input_shape), weight.dequantize(), bias)
+                        out = torch.nn.functional.linear(input.reshape(input_shape), weight.dequantize(), bias)
 
-                if self.layout_type == "TensorCoreNVFP4Layout":
+                elif self.layout_type == "TensorCoreNVFP4Layout":
                     input_shape = input.shape
                     tensor_3d = input.ndim == 3
                     
@@ -338,16 +342,15 @@ def forward_comfy_cast_weights(self, input):
                             q_input = input
 
                         q_input = QuantizedTensor.from_float(q_input, "TensorCoreNVFP4Layout")
-                        output = torch.nn.functional.linear(q_input, weight, bias)
+                        out = torch.nn.functional.linear(q_input, weight, bias)
                         if tensor_3d:
-                            output = output.reshape(input_shape[0], input_shape[1], -1)
+                            out = out.reshape(input_shape[0], input_shape[1], -1)
                         if input.dtype == torch.float32:
-                            return output.to(torch.float32)
-                        return output
+                            out = out.to(torch.float32)
                     else:
-                        return torch.nn.functional.linear(input.reshape(input_shape), weight.dequantize(), bias)
+                        out = torch.nn.functional.linear(input.reshape(input_shape), weight.dequantize(), bias)
 
-                if self.layout_type in ["TensorCoreFP8Layout", "TensorCoreFP8E4M3Layout", "TensorCoreFP8E5M2Layout"]:
+                elif self.layout_type in ["TensorCoreFP8Layout", "TensorCoreFP8E4M3Layout", "TensorCoreFP8E5M2Layout"]:
                     input_shape = input.shape
                     tensor_3d = input.ndim == 3
                     
@@ -362,30 +365,21 @@ def forward_comfy_cast_weights(self, input):
                             q_input = input
 
                         q_input = QuantizedTensor.from_float(q_input, self.layout_type, scale=getattr(self, 'input_scale', None))
-                        output = torch.nn.functional.linear(q_input, weight, bias)
+                        out = torch.nn.functional.linear(q_input, weight, bias)
                         if tensor_3d:
-                            output = output.reshape(input_shape[0], input_shape[1], -1)
+                            out = out.reshape(input_shape[0], input_shape[1], -1)
                         if input.dtype == torch.float32:
-                            return output.to(torch.float32)
-                        return output
+                            out = out.to(torch.float32)
                     else:
-                        return torch.nn.functional.linear(input.reshape(input_shape), weight.dequantize(), bias)
+                        out = torch.nn.functional.linear(input.reshape(input_shape), weight.dequantize(), bias)
 
-                # Default trigger for QuantizedTensor dispatch -> layout-specific handler
-                return torch.nn.functional.linear(input, weight, bias)
+                else:
+                    # Default trigger for QuantizedTensor dispatch -> layout-specific handler
+                    out = torch.nn.functional.linear(input, weight, bias)
+
+            else:
+                out = torch.nn.functional.linear(input, weight, bias)
 
-            # Fallback path if it's not wrapped in QuantizedTensor
-            if self.is_quantized:
-                weight = weight.to(device=input.device)
-                
-                # We strictly avoid dequantizing the full weight here unless we have to,
-                # but since we create QuantizedTensors for everything during load,
-                # this path should barely ever be hit unless the user passes a raw quant tensor.
-                # Just fallback to comfy manual cast.
-                pass
-
-            weight, bias, offload_stream = cast_bias_weight(self, input, offloadable=True)
-            out = torch.nn.functional.linear(input, weight, bias)
             uncast_bias_weight(self, weight, bias, offload_stream)
             return out