Add head-dimension surgery support (head_dim=256, num_heads=20) to WAN 2.2

eltsai · eltsai · commit f1e8eea431b3 · 2026-05-04T23:48:22.000Z
diff --git a/src/maxdiffusion/configs/base_wan_27b.yml b/src/maxdiffusion/configs/base_wan_27b.yml
@@ -389,3 +389,9 @@ enable_ssim: False
 enable_ml_diagnostics: False
 profiler_gcs_path: ""
 enable_ondemand_xprof: False
+
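+# Model surgery parameters: when override_model_dims is True, target_head_dim and target_num_heads override the model config's head dim and head count.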
+override_model_dims: True
+target_head_dim: 256
+target_num_heads: 20
+
diff --git a/src/maxdiffusion/models/embeddings_flax.py b/src/maxdiffusion/models/embeddings_flax.py
@@ -226,6 +226,7 @@ def get_1d_rotary_pos_embed(
     ntk_factor=1.0,
     freqs_dtype=jnp.float32,
     use_real: bool = True,
+    original_dim: Optional[int] = None,
 ):
   """
   Precompute the frequency tensor for complex exponentials (cis) with given dimensions.
@@ -236,7 +237,8 @@ def get_1d_rotary_pos_embed(
     pos = jnp.arange(pos)
 
   theta = theta * ntk_factor
-  freqs = 1.0 / (theta ** (jnp.arange(0, dim, 2, dtype=freqs_dtype)[: (dim // 2)] / dim)) / linear_factor
+  scale_dim = original_dim if original_dim is not None else dim
+  freqs = 1.0 / (theta ** (jnp.arange(0, dim, 2, dtype=freqs_dtype)[: (dim // 2)] / scale_dim)) / linear_factor
   freqs = jnp.outer(pos, freqs)
   if use_real:
     # Flux
diff --git a/src/maxdiffusion/models/wan/transformers/transformer_wan.py b/src/maxdiffusion/models/wan/transformers/transformer_wan.py
@@ -40,12 +40,20 @@
 BlockSizes = common_types.BlockSizes
 
 
-def get_frequencies(max_seq_len: int, theta: int, attention_head_dim: int):
+def get_frequencies(max_seq_len: int, theta: int, attention_head_dim: int, original_attention_head_dim: int):
   h_dim = w_dim = 2 * (attention_head_dim // 6)
   t_dim = attention_head_dim - h_dim - w_dim
+  current_dims = [t_dim, h_dim, w_dim]
+
+  h_dim_old = w_dim_old = 2 * (original_attention_head_dim // 6)
+  t_dim_old = original_attention_head_dim - h_dim_old - w_dim_old
+  old_dims = [t_dim_old, h_dim_old, w_dim_old]
+
   freqs = []
-  for dim in [t_dim, h_dim, w_dim]:
-    freq = get_1d_rotary_pos_embed(dim, max_seq_len, theta, freqs_dtype=jnp.float32, use_real=False)
+  for dim, old_dim in zip(current_dims, old_dims):
+    freq = get_1d_rotary_pos_embed(
+        dim=dim, pos=max_seq_len, theta=theta, freqs_dtype=jnp.float32, use_real=False, original_dim=old_dim
+    )
     freqs.append(freq)
   freqs = jnp.concatenate(freqs, axis=1)
   t_size = attention_head_dim // 2 - 2 * (attention_head_dim // 6)
@@ -62,8 +70,16 @@ def get_frequencies(max_seq_len: int, theta: int, attention_head_dim: int):
 
 class WanRotaryPosEmbed(nnx.Module):
 
-  def __init__(self, attention_head_dim: int, patch_size: Tuple[int, int, int], max_seq_len: int, theta: float = 10000.0):
+  def __init__(
+      self,
+      attention_head_dim: int,
+      original_attention_head_dim: int,
+      patch_size: Tuple[int, int, int],
+      max_seq_len: int,
+      theta: float = 10000.0,
+  ):
     self.attention_head_dim = attention_head_dim
+    self.original_attention_head_dim = original_attention_head_dim
     self.patch_size = patch_size
     self.max_seq_len = max_seq_len
     self.theta = theta
@@ -73,7 +89,7 @@ def __call__(self, hidden_states: jax.Array) -> jax.Array:
     p_t, p_h, p_w = self.patch_size
     ppf, pph, ppw = num_frames // p_t, height // p_h, width // p_w
 
-    freqs_split = get_frequencies(self.max_seq_len, self.theta, self.attention_head_dim)
+    freqs_split = get_frequencies(self.max_seq_len, self.theta, self.attention_head_dim, self.original_attention_head_dim)
 
     freqs_f = jnp.expand_dims(jnp.expand_dims(freqs_split[0][:ppf], axis=1), axis=1)
     freqs_f = jnp.broadcast_to(freqs_f, (ppf, pph, ppw, freqs_split[0].shape[-1]))
@@ -494,15 +510,16 @@ def __init__(
       enable_jax_named_scopes: bool = False,
       use_base2_exp: bool = False,
       use_experimental_scheduler: bool = False,
+      target_head_dim: int = 128,
   ):
-    inner_dim = num_attention_heads * attention_head_dim
+    inner_dim = num_attention_heads * target_head_dim
     out_channels = out_channels or in_channels
     self.num_layers = num_layers
     self.scan_layers = scan_layers
     self.enable_jax_named_scopes = enable_jax_named_scopes
 
     # 1. Patch & position embedding
-    self.rope = WanRotaryPosEmbed(attention_head_dim, patch_size, rope_max_seq_len)
+    self.rope = WanRotaryPosEmbed(target_head_dim, attention_head_dim, patch_size, rope_max_seq_len)
     self.patch_embedding = nnx.Conv(
         in_channels,
         inner_dim,
diff --git a/src/maxdiffusion/pipelines/wan/wan_pipeline.py b/src/maxdiffusion/pipelines/wan/wan_pipeline.py
@@ -99,6 +99,38 @@ def _add_sharding_rule(vs: nnx.VariableState, logical_axis_rules) -> nnx.Variabl
   return vs
 
 
+from flax.traverse_util import flatten_dict, unflatten_dict
+
+
+def perform_wan_scaling_surgery(params, target_head_dim, source_head_dim):
+  """
+  Scale attention query/key kernels by (target_head_dim / source_head_dim) ** 0.25.
+  Returns `params` untouched when both head dims are equal; otherwise returns a
+  new nested dict. Intended to preserve attention entropy when changing head dims.
+  Matches flattened "/"-joined paths containing "attn", "kernel" and "query" or "key".
+  """
+  if target_head_dim == source_head_dim:
+    print("Target and Source head dims are identical. Skipping surgery.")
+    return params
+
+  ratio = target_head_dim / source_head_dim
+  correction_factor = ratio**0.25  # NOTE(review): presumably offsets the 1/sqrt(head_dim) softmax scale - confirm
+
+  flat_params = flatten_dict(params, sep="/")  # keys become "/"-joined path strings
+  new_flat_params = {}
+  modified_count = 0
+
+  for key, tensor in flat_params.items():
+    if ("query" in key or "key" in key) and "kernel" in key and "attn" in key:  # substring match on the path
+      new_flat_params[key] = tensor * correction_factor
+      modified_count += 1
+    else:
+      new_flat_params[key] = tensor  # every other leaf is carried over unchanged
+
+  print(f"Surgery complete. Scaled {modified_count} tensors by {correction_factor:.4f}")
+  return unflatten_dict(new_flat_params, sep="/")
+
+
 # For some reason, jitting this function increases the memory significantly, so instead manually move weights to device.
 def create_sharded_logical_transformer(
     devices_array: np.array,
@@ -141,6 +173,11 @@ def create_model(rngs: nnx.Rngs, wan_config: dict):
   wan_config["use_base2_exp"] = config.use_base2_exp
   wan_config["use_experimental_scheduler"] = config.use_experimental_scheduler
 
+  wan_config["target_head_dim"] = wan_config["attention_head_dim"]
+  if getattr(config, "override_model_dims", False):
+    wan_config["target_head_dim"] = config.target_head_dim
+    wan_config["num_attention_heads"] = config.target_num_heads
+
   # 2. eval_shape - will not use flops or create weights on device
   # thus not using HBM memory.
   p_model_factory = partial(create_model, wan_config=wan_config)
@@ -171,6 +208,8 @@ def create_model(rngs: nnx.Rngs, wan_config: dict):
         scan_layers=config.scan_layers,
         subfolder=subfolder,
     )
+    if getattr(config, "override_model_dims", False):
+      params = perform_wan_scaling_surgery(params, config.target_head_dim, wan_config["attention_head_dim"])
 
   params = jax.tree_util.tree_map_with_path(
       lambda path, x: cast_with_exclusion(path, x, dtype_to_cast=config.weights_dtype), params