
Commit 0c10d44

entrpn and susanbao authored
Fixes bad quality in 720p videos. (#269)

Moves to mixed precision by excluding norm, conditioning, and AdaLN layers from being cast to bfloat16 when weights_dtype and activations_dtype are set to bfloat16 (the default). Moves the VAE to full fp32. Inputs are cast to fp32. Scheduler samples are cast to fp32.

---------

Co-authored-by: susanbao <susanbaonju@gmail.com>
Co-authored-by: Sanbao Su <sanbao@google.com>
1 parent 8ac20ca commit 0c10d44

8 files changed

Lines changed: 155 additions & 92 deletions


README.md

Lines changed: 98 additions & 69 deletions
Large diffs are not rendered by default.

src/maxdiffusion/configs/base_wan_14b.yml

Lines changed: 1 addition & 1 deletion
@@ -323,4 +323,4 @@ eval_data_dir: ""
 enable_generate_video_for_eval: False # This will increase the used TPU memory.
 eval_max_number_of_samples_in_bucket: 60 # The number of samples per bucket for evaluation. This is calculated by num_eval_samples / len(considered_timesteps_list).
 
-enable_ssim: True
+enable_ssim: False

src/maxdiffusion/generate_wan.py

Lines changed: 1 addition & 1 deletion
@@ -162,7 +162,7 @@ def run(config, pipeline=None, filename_prefix=""):
 
 def main(argv: Sequence[str]) -> None:
   pyconfig.initialize(argv)
-  flax.config.update('flax_always_shard_variable', False)
+  flax.config.update("flax_always_shard_variable", False)
   run(pyconfig.config)
 
 

src/maxdiffusion/models/embeddings_flax.py

Lines changed: 4 additions & 4 deletions
@@ -89,7 +89,7 @@ def __init__(
         in_features=in_channels,
         out_features=time_embed_dim,
         use_bias=sample_proj_bias,
-        dtype=dtype,
+        dtype=jnp.float32,
         param_dtype=weights_dtype,
         precision=precision,
         kernel_init=nnx.with_partitioning(
@@ -121,7 +121,7 @@ def __init__(
         in_features=time_embed_dim,
         out_features=time_embed_dim_out,
         use_bias=sample_proj_bias,
-        dtype=dtype,
+        dtype=jnp.float32,
         param_dtype=weights_dtype,
         precision=precision,
         kernel_init=nnx.with_partitioning(
@@ -269,7 +269,7 @@ def __init__(
         in_features=in_features,
         out_features=hidden_size,
         use_bias=True,
-        dtype=dtype,
+        dtype=jnp.float32,
         param_dtype=weights_dtype,
         precision=precision,
         kernel_init=nnx.with_partitioning(
@@ -288,7 +288,7 @@ def __init__(
         in_features=hidden_size,
         out_features=out_features,
         use_bias=True,
-        dtype=dtype,
+        dtype=jnp.float32,
         param_dtype=weights_dtype,
         precision=precision,
         kernel_init=nnx.with_partitioning(

src/maxdiffusion/models/wan/transformers/transformer_wan.py

Lines changed: 14 additions & 8 deletions
@@ -116,7 +116,7 @@ def __init__(
        rngs=rngs,
        in_features=dim,
        out_features=time_proj_dim,
-        dtype=dtype,
+        dtype=jnp.float32,
        param_dtype=weights_dtype,
        precision=precision,
        kernel_init=nnx.with_partitioning(
@@ -332,33 +332,39 @@ def __call__(
      rngs: nnx.Rngs = None,
  ):
    shift_msa, scale_msa, gate_msa, c_shift_msa, c_scale_msa, c_gate_msa = jnp.split(
-        (self.adaln_scale_shift_table + temb), 6, axis=1
+        (self.adaln_scale_shift_table + temb.astype(jnp.float32)), 6, axis=1
    )
    hidden_states = jax.lax.with_sharding_constraint(hidden_states, PartitionSpec("data", "fsdp", "tensor"))
    encoder_hidden_states = jax.lax.with_sharding_constraint(encoder_hidden_states, PartitionSpec("data", "fsdp", None))
 
    # 1. Self-attention
-    norm_hidden_states = (self.norm1(hidden_states) * (1 + scale_msa) + shift_msa).astype(hidden_states.dtype)
+    norm_hidden_states = (self.norm1(hidden_states.astype(jnp.float32)) * (1 + scale_msa) + shift_msa).astype(
+        hidden_states.dtype
+    )
    attn_output = self.attn1(
        hidden_states=norm_hidden_states,
        encoder_hidden_states=norm_hidden_states,
        rotary_emb=rotary_emb,
        deterministic=deterministic,
        rngs=rngs,
    )
-    hidden_states = (hidden_states + attn_output * gate_msa).astype(hidden_states.dtype)
+    hidden_states = (hidden_states.astype(jnp.float32) + attn_output * gate_msa).astype(hidden_states.dtype)
 
    # 2. Cross-attention
-    norm_hidden_states = self.norm2(hidden_states)
+    norm_hidden_states = self.norm2(hidden_states.astype(jnp.float32)).astype(hidden_states.dtype)
    attn_output = self.attn2(
        hidden_states=norm_hidden_states, encoder_hidden_states=encoder_hidden_states, deterministic=deterministic, rngs=rngs
    )
    hidden_states = hidden_states + attn_output
 
    # 3. Feed-forward
-    norm_hidden_states = (self.norm3(hidden_states) * (1 + c_scale_msa) + c_shift_msa).astype(hidden_states.dtype)
+    norm_hidden_states = (self.norm3(hidden_states.astype(jnp.float32)) * (1 + c_scale_msa) + c_shift_msa).astype(
+        hidden_states.dtype
+    )
    ff_output = self.ffn(norm_hidden_states, deterministic=deterministic, rngs=rngs)
-    hidden_states = (hidden_states + ff_output * c_gate_msa).astype(hidden_states.dtype)
+    hidden_states = (hidden_states.astype(jnp.float32) + ff_output.astype(jnp.float32) * c_gate_msa).astype(
+        hidden_states.dtype
+    )
    return hidden_states
 
 
@@ -563,7 +569,7 @@ def layer_forward(hidden_states):
 
    shift, scale = jnp.split(self.scale_shift_table + jnp.expand_dims(temb, axis=1), 2, axis=1)
 
-    hidden_states = (self.norm_out(hidden_states) * (1 + scale) + shift).astype(hidden_states.dtype)
+    hidden_states = (self.norm_out(hidden_states.astype(jnp.float32)) * (1 + scale) + shift).astype(hidden_states.dtype)
    hidden_states = self.proj_out(hidden_states)
 
    hidden_states = hidden_states.reshape(
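
The recurring pattern in these hunks is to run the LayerNorm and the AdaLN modulation in float32 and cast the result back to the activation dtype, so attention and the feed-forward blocks still run in bfloat16. A minimal standalone sketch of that pattern (hypothetical norm_fn, not this file's code):

import jax.numpy as jnp

def modulate_fp32(norm_fn, hidden_states, scale, shift):
  # Normalization statistics and the (1 + scale) * x + shift modulation are computed
  # in float32; the result is cast back to the activation dtype (e.g. bfloat16).
  out = norm_fn(hidden_states.astype(jnp.float32)) * (1 + scale) + shift
  return out.astype(hidden_states.dtype)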

src/maxdiffusion/pipelines/wan/wan_pipeline.py

Lines changed: 32 additions & 7 deletions
@@ -40,6 +40,28 @@
 import qwix
 
 
+def cast_with_exclusion(path, x, dtype_to_cast):
+  """
+  Casts arrays to dtype_to_cast, but keeps params from any 'norm' layer in float32.
+  """
+
+  exclusion_keywords = [
+      "norm",  # For all LayerNorm/GroupNorm layers
+      "condition_embedder",  # The entire time/text conditioning module
+      "scale_shift_table",  # Catches both the final and the AdaLN tables
+  ]
+
+  path_str = ".".join(str(k.key) if isinstance(k, jax.tree_util.DictKey) else str(k) for k in path)
+
+  if any(keyword in path_str.lower() for keyword in exclusion_keywords):
+    print("is_norm_path: ", path)
+    # Keep LayerNorm/GroupNorm weights and biases in full precision
+    return x.astype(jnp.float32)
+  else:
+    # Cast everything else to dtype_to_cast
+    return x.astype(dtype_to_cast)
+
+
 def basic_clean(text):
   if is_ftfy_available():
     import ftfy
@@ -118,7 +140,10 @@ def create_model(rngs: nnx.Rngs, wan_config: dict):
        num_layers=wan_config["num_layers"],
        scan_layers=config.scan_layers,
    )
-    params = jax.tree_util.tree_map(lambda x: x.astype(config.weights_dtype), params)
+
+    params = jax.tree_util.tree_map_with_path(
+        lambda path, x: cast_with_exclusion(path, x, dtype_to_cast=config.weights_dtype), params
+    )
    for path, val in flax.traverse_util.flatten_dict(params).items():
      if restored_checkpoint:
        path = path[:-1]
@@ -214,8 +239,8 @@ def create_model(rngs: nnx.Rngs, config: HyperParameters):
        subfolder="vae",
        rngs=rngs,
        mesh=mesh,
-        dtype=config.activations_dtype,
-        weights_dtype=config.weights_dtype,
+        dtype=jnp.float32,
+        weights_dtype=jnp.float32,
    )
    return wan_vae
 
@@ -474,7 +499,7 @@ def encode_prompt(
        num_videos_per_prompt=num_videos_per_prompt,
        max_sequence_length=max_sequence_length,
    )
-    prompt_embeds = jnp.array(prompt_embeds.detach().numpy(), dtype=self.config.weights_dtype)
+    prompt_embeds = jnp.array(prompt_embeds.detach().numpy(), dtype=jnp.float32)
 
    if negative_prompt_embeds is None:
      negative_prompt = negative_prompt or ""
@@ -484,7 +509,7 @@
        num_videos_per_prompt=num_videos_per_prompt,
        max_sequence_length=max_sequence_length,
      )
-      negative_prompt_embeds = jnp.array(negative_prompt_embeds.detach().numpy(), dtype=self.config.weights_dtype)
+      negative_prompt_embeds = jnp.array(negative_prompt_embeds.detach().numpy(), dtype=jnp.float32)
 
    return prompt_embeds, negative_prompt_embeds
 
@@ -507,7 +532,7 @@ def prepare_latents(
        int(height) // vae_scale_factor_spatial,
        int(width) // vae_scale_factor_spatial,
    )
-    latents = jax.random.normal(rng, shape=shape, dtype=self.config.weights_dtype)
+    latents = jax.random.normal(rng, shape=shape, dtype=jnp.float32)
 
    return latents
 
@@ -597,7 +622,7 @@ def __call__(
    latents_mean = jnp.array(self.vae.latents_mean).reshape(1, self.vae.z_dim, 1, 1, 1)
    latents_std = 1.0 / jnp.array(self.vae.latents_std).reshape(1, self.vae.z_dim, 1, 1, 1)
    latents = latents / latents_std + latents_mean
-    latents = latents.astype(self.config.weights_dtype)
+    latents = latents.astype(jnp.float32)
 
    with self.mesh, nn_partitioning.axis_rules(self.config.logical_axis_rules):
      video = self.vae.decode(latents, self.vae_cache)[0]
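
The cast_with_exclusion helper added in the first hunk is applied through jax.tree_util.tree_map_with_path in the second hunk. A toy sketch of its effect, assuming that helper is in scope and using made-up parameter names (the real WAN parameter tree differs):

import jax
import jax.numpy as jnp

params = {
    "blocks_0": {"attn1": {"kernel": jnp.ones((4, 4))}, "norm1": {"scale": jnp.ones((4,))}},
    "condition_embedder": {"linear_1": {"kernel": jnp.ones((4, 4))}},
}
mixed = jax.tree_util.tree_map_with_path(
    lambda path, x: cast_with_exclusion(path, x, dtype_to_cast=jnp.bfloat16), params
)
# mixed["blocks_0"]["attn1"]["kernel"].dtype               -> bfloat16
# mixed["blocks_0"]["norm1"]["scale"].dtype                -> float32 (path matches "norm")
# mixed["condition_embedder"]["linear_1"]["kernel"].dtype  -> float32 (path matches "condition_embedder")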

src/maxdiffusion/schedulers/scheduling_unipc_multistep_flax.py

Lines changed: 3 additions & 0 deletions
@@ -674,6 +674,9 @@ def step(
    Predict the sample from the previous timestep by reversing the SDE. This function propagates the sample with
    the multistep UniPC.
    """
+
+    sample = sample.astype(jnp.float32)
+
    if state.timesteps is None:
      raise ValueError("Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler")
 
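
The rationale for casting the sample at the top of step: per-step UniPC updates can be small relative to the latent magnitudes, and bfloat16, with roughly 8 bits of mantissa, rounds such updates away. A tiny illustration (not scheduler code):

import jax.numpy as jnp

latent, update = 1.0, 1e-3
print(jnp.float32(latent) + jnp.float32(update))    # ~1.001, the update survives
print(jnp.bfloat16(latent) + jnp.bfloat16(update))  # 1.0, the update rounds away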

tests/schedulers/test_scheduler_unipc.py

Lines changed: 2 additions & 2 deletions
@@ -518,8 +518,8 @@ def test_fp16_support(self):
      step_output = scheduler.step(state, residual, t, sample)
      sample = step_output.prev_sample
      state = step_output.state
-
-    self.assertEqual(sample.dtype, jnp.bfloat16)
+    # sample is casted to fp32 inside step and output should be fp32.
+    self.assertEqual(sample.dtype, jnp.float32)
 
  def test_full_loop_with_noise(self):
    scheduler_class = self.scheduler_classes[0]
