Merge pull request #341 from AInVFX/main

adrientoupet · web-flow · commit f5b902b8b020 · 2025-11-30T09:04:04.000-05:00
v2.5.13: Fix triton import error, OOM on long video float32 conversion, macOS CLI watermark
diff --git a/README.md b/README.md
@@ -36,6 +36,12 @@ We're actively working on improvements and new features. To stay informed:
 
 ## 🚀 Updates
 
+**2025.11.30 - Version 2.5.13**
+
+- **🔧 Fix: PyTorch 2.7+ triton import error** - Resolved installation crash caused by triton.ops import chain on newer triton versions
+- **💾 Fix: OOM on float32 conversion for long videos** - Graceful fallback to the native dtype when there is insufficient memory for float32 conversion
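+- **🍎 Fix: CLI watermark error on macOS** - Resolved MPS allocator watermark-ratio crash on Apple Silicon by setting `PYTORCH_MPS_HIGH_WATERMARK_RATIO` and `PYTORCH_MPS_LOW_WATERMARK_RATIO` before PyTorch is imported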
+
 **2025.11.28 - Version 2.5.12**
 
 - **🐛 Fix: Color artifacts regression** - Reverted in-place tensor operations in video transform pipeline that caused color artifacts on some images
diff --git a/inference_cli.py b/inference_cli.py
@@ -64,8 +64,14 @@
 if mp.get_start_method(allow_none=True) != 'spawn':
     mp.set_start_method('spawn', force=True)
 
-# Configure VRAM management and validate CUDA devices before heavy imports
-if platform.system() != "Darwin":
+# Configure platform-specific memory management before heavy imports
+# Must be set BEFORE import torch
+if platform.system() == "Darwin":
+    # MPS allocator requires: low_watermark <= high_watermark
+    # Setting both to 0.0 disables PyTorch memory limits, letting macOS manage memory
+    os.environ.setdefault("PYTORCH_MPS_HIGH_WATERMARK_RATIO", "0.0")
+    os.environ.setdefault("PYTORCH_MPS_LOW_WATERMARK_RATIO", "0.0")
+else:
     os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "backend:cudaMallocAsync")
 
     # Pre-parse CUDA device argument for validation and environment setup
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,7 +1,7 @@
 [project]
 name = "seedvr2_videoupscaler"
 description = "SeedVR2 official ComfyUI integration: ByteDance-Seed's one-step diffusion-based video/image upscaling with memory-efficient inference"
-version = "2.5.12"
+version = "2.5.13"
 authors = [
     {name = "numz"},
     {name = "adrientoupet"}
diff --git a/src/interfaces/video_upscaler.py b/src/interfaces/video_upscaler.py
@@ -509,15 +509,21 @@ def cleanup(dit_cache: bool = False, vae_cache: bool = False) -> None:
             )
 
             sample = ctx['final_video']
-            
+            debug.log("", category="none", force=True)
+
             # Ensure CPU tensor in float32 for maximum ComfyUI compatibility
             if torch.is_tensor(sample):
                 if sample.is_cuda or sample.is_mps:
                     sample = sample.cpu()
                 if sample.dtype != torch.float32:
-                    sample = sample.to(torch.float32)
+                    src_dtype = sample.dtype
+                    try:
+                        sample = sample.to(torch.float32)
+                        debug.log(f"Converted output from {src_dtype} to float32", category="precision")
+                    except Exception as e:
+                        debug.log(f"Could not convert to float32: {e}. Output is {src_dtype}, compatibility with other nodes not guaranteed", 
+                                  level="WARNING", category="precision", force=True)
 
-            debug.log("", category="none", force=True)
             debug.log("Upscaling completed successfully!", category="success", force=True)
             debug.end_timer("generation", "Video generation")
 
diff --git a/src/models/video_vae_v3/modules/attn_video_vae.py b/src/models/video_vae_v3/modules/attn_video_vae.py
@@ -17,7 +17,6 @@
 import torch.nn as nn
 import torch.nn.functional as F
 from diffusers.models.attention_processor import Attention, SpatialNorm
-from diffusers.models.autoencoders.vae import DecoderOutput, DiagonalGaussianDistribution
 from diffusers.models.downsampling import Downsample2D
 from diffusers.models.lora import LoRACompatibleConv
 from diffusers.models.modeling_outputs import AutoencoderKLOutput
@@ -46,6 +45,8 @@
     CausalAutoencoderOutput,
     CausalDecoderOutput,
     CausalEncoderOutput,
+    DecoderOutput,
+    DiagonalGaussianDistribution,
     MemoryState,
     _inflation_mode_t,
     _memory_device_t,
diff --git a/src/models/video_vae_v3/modules/types.py b/src/models/video_vae_v3/modules/types.py
@@ -74,3 +74,75 @@ class CausalEncoderOutput(NamedTuple):
 
 class CausalDecoderOutput(NamedTuple):
     sample: torch.Tensor
+
+
+class DecoderOutput:
+    """Container for decoder results, standing in for diffusers' ``DecoderOutput``.
+
+    Holds ``sample`` (the decoded tensor) and an optional ``commit_loss``
+    (``None`` by default). Only attribute access is provided.
+    """
+    def __init__(self, sample: torch.Tensor, commit_loss: Optional[torch.Tensor] = None):
+        self.sample = sample
+        self.commit_loss = commit_loss
+
+
+class DiagonalGaussianDistribution:
+    """Diagonal Gaussian parameterised by a tensor of stacked means and log-variances.
+
+    Local port of ``diffusers.models.autoencoders.vae.DiagonalGaussianDistribution``
+    so this package does not need to import it from diffusers.
+
+    Args:
+        parameters: Tensor split into two equal chunks along dim 1; the first
+            chunk is the mean, the second the log-variance.
+        deterministic: When True, ``std`` and ``var`` are zero tensors and
+            ``sample()`` returns the mean.
+    """
+    def __init__(self, parameters: torch.Tensor, deterministic: bool = False):
+        self.parameters = parameters
+        self.mean, self.logvar = torch.chunk(parameters, 2, dim=1)
+        # Bound the log-variance so the exp() calls below stay finite.
+        self.logvar = torch.clamp(self.logvar, -30.0, 20.0)
+        self.deterministic = deterministic
+        self.std = torch.exp(0.5 * self.logvar)
+        self.var = torch.exp(self.logvar)
+        if self.deterministic:
+            self.var = self.std = torch.zeros_like(
+                self.mean, device=self.parameters.device, dtype=self.parameters.dtype
+            )
+
+    def sample(self, generator: Optional[torch.Generator] = None) -> torch.Tensor:
+        """Draw ``mean + std * eps`` with ``eps`` ~ N(0, 1); return the mean if deterministic."""
+        if self.deterministic:
+            return self.mode()
+        sample = torch.randn(
+            self.mean.shape,
+            generator=generator,
+            device=self.parameters.device,
+            dtype=self.parameters.dtype,
+        )
+        return self.mean + self.std * sample
+
+    def mode(self) -> torch.Tensor:
+        """Return the mean, which is the mode of a Gaussian."""
+        return self.mean
+
+    def kl(self, other: Optional["DiagonalGaussianDistribution"] = None) -> torch.Tensor:
+        """Return the KL divergence to ``other`` (standard normal when None), summed over dims 1-3.
+
+        A deterministic distribution has zero variance, so the closed-form
+        expression is not meaningful; return a zero tensor as diffusers does.
+        """
+        if self.deterministic:
+            return torch.Tensor([0.0])
+        if other is None:
+            return 0.5 * torch.sum(
+                self.mean.pow(2) + self.var - 1.0 - self.logvar,
+                dim=[1, 2, 3],
+            )
+        return 0.5 * torch.sum(
+            (self.mean - other.mean).pow(2) / other.var
+            + self.var / other.var - 1.0 - self.logvar + other.logvar,
+            dim=[1, 2, 3],
+        )
diff --git a/src/models/video_vae_v3/modules/video_vae.py b/src/models/video_vae_v3/modules/video_vae.py
@@ -15,7 +15,6 @@
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from diffusers.models.autoencoders.vae import DiagonalGaussianDistribution
 from einops import rearrange
 from ....common.half_precision_fixes import safe_pad_operation
 
@@ -36,6 +35,7 @@
     CausalAutoencoderOutput,
     CausalDecoderOutput,
     CausalEncoderOutput,
+    DiagonalGaussianDistribution,
     MemoryState,
     _inflation_mode_t,
     _memory_device_t,
diff --git a/src/utils/constants.py b/src/utils/constants.py
@@ -4,7 +4,7 @@
 """
 
 # Version information
-__version__ = "2.5.12"
+__version__ = "2.5.13"
 
 import os
 import warnings