AI-Hypercomputer
diff --git a/src/maxdiffusion/configs/base_wan_14b.yml b/src/maxdiffusion/configs/base_wan_14b.yml
Lines changed: 15 additions & 1 deletion
diff --git a/src/maxdiffusion/configs/base_wan_1_3b.yml b/src/maxdiffusion/configs/base_wan_1_3b.yml
Lines changed: 15 additions & 0 deletions
diff --git a/src/maxdiffusion/configs/base_wan_27b.yml b/src/maxdiffusion/configs/base_wan_27b.yml
Lines changed: 15 additions & 1 deletion
diff --git a/src/maxdiffusion/configs/base_wan_animate.yml b/src/maxdiffusion/configs/base_wan_animate.yml
Lines changed: 14 additions & 0 deletions
diff --git a/src/maxdiffusion/configs/base_wan_i2v_14b.yml b/src/maxdiffusion/configs/base_wan_i2v_14b.yml
Lines changed: 15 additions & 1 deletion
diff --git a/src/maxdiffusion/configs/base_wan_i2v_27b.yml b/src/maxdiffusion/configs/base_wan_i2v_27b.yml
Lines changed: 15 additions & 1 deletion
diff --git a/src/maxdiffusion/generate_wan.py b/src/maxdiffusion/generate_wan.py
Lines changed: 7 additions & 1 deletion
diff --git a/src/maxdiffusion/models/attention_flax.py b/src/maxdiffusion/models/attention_flax.py
Lines changed: 4 additions & 2 deletions
@@ -47,9 +47,23 @@ text_encoder_dtype: 'float32'
 # Whether to compile the text_encoder with torch.compile
 compile_text_encoder: False
 
+# Maximum sequence length for the text encoder
+max_sequence_length: 512
+
+vae_weights_dtype: 'float32'
+vae_dtype: 'float32'
+scheduler_dtype: 'float32'
+
 # Replicates vae across devices instead of using the model's sharding annotations for sharding.
 replicate_vae: False
-vae_spatial: -1 # default to total_device * 2 // (dp)
+
+# Chunk size for VAE decode scan. Increase to improve decode time at the cost of memory.
+vae_decode_chunk: 1
+
+# Chunk size for VAE encode scan. (num_input_frames - 1) must be divisible by this value.
+# Increase to improve encode time at the cost of memory.
+vae_encode_chunk: 4
+vae_spatial: -1 # default to total_device * 2 // (dp) (TODO confirm this is still accurate)
 
 # matmul and conv precision from https://jax.readthedocs.io/en/latest/jax.lax.html#jax.lax.Precision
 # Options are "DEFAULT", "HIGH", "HIGHEST"
 
@@ -47,9 +47,24 @@ text_encoder_dtype: 'float32'
 # Whether to compile the text_encoder with torch.compile
 compile_text_encoder: False
 
+# Maximum sequence length for the text encoder
+max_sequence_length: 512
+
+vae_weights_dtype: 'float32'
+vae_dtype: 'float32'
+scheduler_dtype: 'float32'
+
 # Replicates vae across devices instead of using the model's sharding annotations for sharding.
 replicate_vae: False
 
+# Chunk size for VAE decode scan. Increase to improve decode time at the cost of memory.
+vae_decode_chunk: 1
+
+# Chunk size for VAE encode scan. (num_input_frames - 1) must be divisible by this value.
+# Increase to improve encode time at the cost of memory.
+vae_encode_chunk: 4
+vae_spatial: -1 # -1 selects the default number of devices for VAE spatial sharding (TODO confirm exact default)
+
 # matmul and conv precision from https://jax.readthedocs.io/en/latest/jax.lax.html#jax.lax.Precision
 # Options are "DEFAULT", "HIGH", "HIGHEST"
 # fp32 activations and fp32 weights with HIGHEST will provide the best precision
 
@@ -47,9 +47,23 @@ text_encoder_dtype: 'float32'
 # Whether to compile the text_encoder with torch.compile
 compile_text_encoder: False
 
+# Maximum sequence length for the text encoder
+max_sequence_length: 512
+
+vae_weights_dtype: 'float32'
+vae_dtype: 'float32'
+scheduler_dtype: 'float32'
+
 # Replicates vae across devices instead of using the model's sharding annotations for sharding.
 replicate_vae: False
-vae_spatial: -1 # default to total_device * 2 // (dp)
+
+# Chunk size for VAE decode scan. Increase to improve decode time at the cost of memory.
+vae_decode_chunk: 1
+
+# Chunk size for VAE encode scan. (num_input_frames - 1) must be divisible by this value.
+# Increase to improve encode time at the cost of memory.
+vae_encode_chunk: 4
+vae_spatial: -1 # default to total_device * 2 // (dp) (TODO confirm this is still accurate)
 
 # matmul and conv precision from https://jax.readthedocs.io/en/latest/jax.lax.html#jax.lax.Precision
 # Options are "DEFAULT", "HIGH", "HIGHEST"
 
@@ -47,8 +47,22 @@ text_encoder_dtype: 'float32'
 # Whether to compile the text_encoder with torch.compile
 compile_text_encoder: False
 
+# Maximum sequence length for the text encoder
+max_sequence_length: 512
+
+vae_weights_dtype: 'float32'
+vae_dtype: 'float32'
+scheduler_dtype: 'float32'
+
 # Replicates vae across devices instead of using the model's sharding annotations for sharding.
 replicate_vae: False
+
+# Chunk size for VAE decode scan. Increase to improve decode time at the cost of memory.
+vae_decode_chunk: 1
+
+# Chunk size for VAE encode scan. (num_input_frames - 1) must be divisible by this value.
+# Increase to improve encode time at the cost of memory.
+vae_encode_chunk: 4
 # Number of devices to shard VAE spatial activations across. -1 uses all devices.
 vae_spatial: -1
 
 
@@ -47,9 +47,23 @@ text_encoder_dtype: 'float32'
 # Whether to compile the text_encoder with torch.compile
 compile_text_encoder: False
 
+# Maximum sequence length for the text encoder
+max_sequence_length: 512
+
+vae_weights_dtype: 'float32'
+vae_dtype: 'float32'
+scheduler_dtype: 'float32'
+
 # Replicates vae across devices instead of using the model's sharding annotations for sharding.
 replicate_vae: False
-vae_spatial: -1 # default to total_device * 2 // (dp)
+
+# Chunk size for VAE decode scan. Increase to improve decode time at the cost of memory.
+vae_decode_chunk: 1
+
+# Chunk size for VAE encode scan. (num_input_frames - 1) must be divisible by this value.
+# Increase to improve encode time at the cost of memory.
+vae_encode_chunk: 4
+vae_spatial: -1 # default to total_device * 2 // (dp) (TODO confirm this is still accurate)
 
 # matmul and conv precision from https://jax.readthedocs.io/en/latest/jax.lax.html#jax.lax.Precision
 # Options are "DEFAULT", "HIGH", "HIGHEST"
 
@@ -47,9 +47,23 @@ text_encoder_dtype: 'float32'
 # Whether to compile the text_encoder with torch.compile
 compile_text_encoder: False
 
+# Maximum sequence length for the text encoder
+max_sequence_length: 512
+
+vae_weights_dtype: 'float32'
+vae_dtype: 'float32'
+scheduler_dtype: 'float32'
+
 # Replicates vae across devices instead of using the model's sharding annotations for sharding.
 replicate_vae: False
-vae_spatial: -1 # default to total_device * 2 // (dp)
+
+# Chunk size for VAE decode scan. Increase to improve decode time at the cost of memory.
+vae_decode_chunk: 1
+
+# Chunk size for VAE encode scan. (num_input_frames - 1) must be divisible by this value.
+# Increase to improve encode time at the cost of memory.
+vae_encode_chunk: 4
+vae_spatial: -1 # default to total_device * 2 // (dp) (TODO confirm this is still accurate)
 
 # matmul and conv precision from https://jax.readthedocs.io/en/latest/jax.lax.html#jax.lax.Precision
 # Options are "DEFAULT", "HIGH", "HIGHEST"
 
@@ -315,11 +315,17 @@ def run(config, pipeline=None, filename_prefix="", commit_hash=None):
       f"  Inference:           {generation_time:>7.1f}s",
   ]
   if trace:
+    vae_decode_total = trace.get("vae_decode", 0.0)
+    vae_decode_tpu = trace.get("vae_decode_tpu", 0.0)
+    vae_decode_post = vae_decode_total - vae_decode_tpu
     summary.extend([
         f"  {'─' * 40}",
         f"  Conditioning:        {trace.get('conditioning', 0.0):>7.1f}s",
+        f"    - VAE Encode:      {trace.get('vae_encode', 0.0):>7.1f}s",
         f"  Denoise Total:       {trace.get('denoise_total', 0.0):>7.1f}s",
-        f"  VAE Decode:          {trace.get('vae_decode', 0.0):>7.1f}s",
+        f"  VAE Decode:          {vae_decode_total:>7.1f}s",
+        f"    - TPU Compute:     {vae_decode_tpu:>7.1f}s",
+        f"    - Host Formatting: {vae_decode_post:>7.1f}s",
     ])
   summary.append(f"{'=' * 50}")
   max_logging.log("\n".join(summary))
 
@@ -325,7 +325,7 @@ def _tpu_flash_attention(
 ) -> jax.Array:
   """TPU Flash Attention"""
 
-  num_context_shards = mesh.shape["context"]
+  num_context_shards = mesh.shape["context"] if "context" in mesh.shape else 1
   query, orig_q_seq_len = _reshape_data_for_flash(query, heads, num_context_shards)
   key, _ = _reshape_data_for_flash(key, heads, num_context_shards)
   value, _ = _reshape_data_for_flash(value, heads, num_context_shards)
@@ -491,7 +491,9 @@ def ring_scan_body(carry, _):
         raise ValueError("ring attention requires context > 1")
     return attention_output[:, :, :query_seq_len, :kv_size].astype(query.dtype)
 
-  devices_in_batch_sharding = mesh.shape["data"] * (mesh.shape["fsdp"] if "fsdp" in mesh.shape else 1)
+  data_dim = mesh.shape["data"] if "data" in mesh.shape else 1
+  fsdp_dim = mesh.shape["fsdp"] if "fsdp" in mesh.shape else 1
+  devices_in_batch_sharding = data_dim * fsdp_dim
   # This warning might show up when doing model eval for example, when calculating model flops
   # and that is expected.
   if not (query.shape[0] / devices_in_batch_sharding).is_integer():