@@ -4,8 +4,7 @@
 Transformer layers for Wan2.1 DiT.
 
 Norms, attention, blocks, and output head. Uses bidirectional (non-causal)
-attention with setattr-based block registration for weight remapping
-compatibility.
+attention with fused norm+modulate via mx.fast.layer_norm.
 """
 
 import math
@@ -18,26 +17,11 @@
 from .rope import rope_apply
 
 
-@partial(mx.compile, shapeless=True)
-def _modulate(x, scale, shift):
-    return x * (1 + scale) + shift
-
-
 @partial(mx.compile, shapeless=True)
 def _residual_gate(x, y, gate):
     return x + y * gate
 
 
-_gelu = mx.compile(nn.gelu_approx)
-
-
-@partial(mx.compile, shapeless=True)
-def _layer_norm(x, eps):
-    mean = x.mean(axis=-1, keepdims=True)
-    var = x.var(axis=-1, keepdims=True)
-    return (x - mean) / mx.sqrt(var + eps)
-
-
 class WanRMSNorm(nn.Module):
     def __init__(self, dim: int, eps: float = 1e-5):
         super().__init__()
@@ -48,22 +32,6 @@ def __call__(self, x: mx.array) -> mx.array:
         return mx.fast.rms_norm(x, self.weight, self.eps)
 
 
-class WanLayerNorm(nn.Module):
-    def __init__(self, dim: int, eps: float = 1e-6, elementwise_affine: bool = False):
-        super().__init__()
-        self.eps = eps
-        self.elementwise_affine = elementwise_affine
-        if elementwise_affine:
-            self.weight = mx.ones((dim,))
-            self.bias = mx.zeros((dim,))
-
-    def __call__(self, x: mx.array) -> mx.array:
-        if self.elementwise_affine:
-            return mx.fast.layer_norm(x, self.weight, self.bias, self.eps)
-        else:
-            return _layer_norm(x, self.eps)
-
-
 class WanSelfAttention(nn.Module):
     def __init__(
         self,
@@ -213,8 +181,9 @@ class WanAttentionBlock(nn.Module):
     """
     Transformer block with self-attn, cross-attn, and FFN.
 
-    Uses ffn_linear1/ffn_linear2 naming (not nn.Sequential) for weight
-    remapping compatibility and selective quantization.
+    Uses fused norm+modulate via mx.fast.layer_norm where the modulation
+    scale/shift are passed as weight/bias. Requires sanitize to bake 1+
+    into modulation scale positions.
     """
 
     def __init__(
@@ -228,19 +197,21 @@ def __init__(
     ):
         super().__init__()
         self.dim = dim
+        self.eps = eps
 
-        self.norm1 = WanLayerNorm(dim, eps)
-        self.norm2 = WanLayerNorm(dim, eps)
         if cross_attn_norm:
-            self.norm3 = WanLayerNorm(dim, eps, elementwise_affine=True)
+            self.norm3 = nn.LayerNorm(dim, eps=eps)
         else:
             self.norm3 = None
 
         self.self_attn = WanSelfAttention(dim, num_heads, eps)
         self.cross_attn = _cross_attn_classes[cross_attn_type](dim, num_heads, eps)
 
-        self.ffn_linear1 = nn.Linear(dim, ffn_dim)
-        self.ffn_linear2 = nn.Linear(ffn_dim, dim)
+        self.ffn = nn.Sequential(
+            nn.Linear(dim, ffn_dim),
+            nn.GELU(approx="tanh"),
+            nn.Linear(ffn_dim, dim),
+        )
 
         self.modulation = mx.zeros((1, 6, dim))
 
@@ -255,10 +226,9 @@ def __call__(
     ) -> mx.array:
         e = self.modulation + e
 
-        # Self-attention with modulation
-        x_norm = self.norm1(x)
+        # Self-attention: fused norm + modulate
         y = self.self_attn(
-            _modulate(x_norm, e[:, 1], e[:, 0]),
+            mx.fast.layer_norm(x, e[0, 1], e[0, 0], self.eps),
             grid_sizes,
             freqs,
         )
@@ -271,18 +241,15 @@ def __call__(
             x_normed = x
         x = x + self.cross_attn(x_normed, context, context_lens)
 
-        # FFN with modulation
-        x_norm = self.norm2(x)
-        y = self.ffn_linear2(
-            _gelu(self.ffn_linear1(_modulate(x_norm, e[:, 4], e[:, 3])))
-        )
+        # FFN: fused norm + modulate
+        y = self.ffn(mx.fast.layer_norm(x, e[0, 4], e[0, 3], self.eps))
         x = _residual_gate(x, y, e[:, 5])
 
         return x
 
 
 class Head(nn.Module):
-    """Output head with modulation. Uses raw weight arrays for remapping compat."""
+    """Output head with fused norm+modulate and nn.Linear."""
 
     def __init__(
         self,
@@ -293,23 +260,12 @@ def __init__(
     ):
         super().__init__()
         self.dim = dim
+        self.eps = eps
         out_features = math.prod(patch_size) * out_dim
-        self.norm = WanLayerNorm(dim, eps)
-        scale = 1.0 / dim**0.5
-        self.head_weight = mx.random.uniform(
-            low=-scale, high=scale, shape=(out_features, dim)
-        )
-        self.head_bias = mx.zeros((out_features,))
+        self.linear = nn.Linear(dim, out_features)
         self.modulation = mx.zeros((1, 2, dim))
 
     def __call__(self, x: mx.array, e: mx.array) -> mx.array:
         e = self.modulation + e[:, None, :]
-        x_norm = self.norm(x)
-        x = (
-            mx.matmul(
-                _modulate(x_norm, e[:, 1], e[:, 0]),
-                self.head_weight.T,
-            )
-            + self.head_bias
-        )
-        return x
+        x = mx.fast.layer_norm(x, e[0, 1], e[0, 0], self.eps)
+        return self.linear(x)
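
As a quick sanity check (not part of the diff): a minimal sketch of why the fused call matches the old `_modulate(layer_norm(x), scale, shift)` path once sanitize bakes `1 +` into the modulation scale rows. `mx.fast.layer_norm` applies its weight/bias as an elementwise scale and shift after normalization, so passing `1 + scale` as the weight and `shift` as the bias reproduces `x_norm * (1 + scale) + shift`. Shapes and values below are illustrative only.

```python
import mlx.core as mx

# Illustrative sizes; dim/eps are placeholders, not the model's values.
dim, eps = 16, 1e-6
x = mx.random.normal((1, 4, dim))
scale = mx.random.normal((dim,))  # a modulation "scale" row, pre-sanitize
shift = mx.random.normal((dim,))  # the matching "shift" row

# Old path: affine-free layer norm, then x_norm * (1 + scale) + shift.
mean = x.mean(axis=-1, keepdims=True)
var = x.var(axis=-1, keepdims=True)
x_norm = (x - mean) / mx.sqrt(var + eps)
old = x_norm * (1 + scale) + shift

# New path: one fused kernel, with the 1+ already baked into the scale
# (what sanitize does to the modulation scale positions at load time).
new = mx.fast.layer_norm(x, 1 + scale, shift, eps)

print(mx.allclose(old, new, atol=1e-5))  # expected: True (up to numerics)
```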