Skip to content

Commit 88b1c2a

Browse files
committed
feat: add optional batched text encoder and diffusion loop
1 parent bb3b0c6 commit 88b1c2a

10 files changed

Lines changed: 252 additions & 15 deletions

src/maxdiffusion/configs/base_wan_14b.yml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -348,6 +348,13 @@ flow_shift: 3.0
348348
# Skips the unconditional forward pass on ~35% of steps via residual compensation.
349349
# See: FasterCache (Lv et al. 2024), WAN 2.1 paper §4.4.2
350350
use_cfg_cache: False
351+
352+
# Batch positive and negative prompts in the text encoder to save compute.
353+
use_batched_text_encoder: False
354+
355+
# Use jax.lax.scan for the diffusion loop (non-cache path only).
356+
# Note: Enabling this will disable per-step profiling.
357+
scan_diffusion_loop: False
351358
use_magcache: False
352359
magcache_thresh: 0.12
353360
magcache_K: 2

src/maxdiffusion/configs/base_wan_1_3b.yml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -302,6 +302,13 @@ flow_shift: 3.0
302302
# Diffusion CFG cache (FasterCache-style, WAN 2.1 T2V only)
303303
use_cfg_cache: False
304304

305+
# Batch positive and negative prompts in the text encoder to save compute.
306+
use_batched_text_encoder: False
307+
308+
# Use jax.lax.scan for the diffusion loop (non-cache path only).
309+
# Note: Enabling this will disable per-step profiling.
310+
scan_diffusion_loop: False
311+
305312
# Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
306313
guidance_rescale: 0.0
307314
num_inference_steps: 30

src/maxdiffusion/configs/base_wan_27b.yml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -323,6 +323,13 @@ boundary_ratio: 0.875
323323

324324
# Diffusion CFG cache (FasterCache-style)
325325
use_cfg_cache: False
326+
327+
# Batch positive and negative prompts in the text encoder to save compute.
328+
use_batched_text_encoder: False
329+
330+
# Use jax.lax.scan for the diffusion loop (non-cache path only).
331+
# Note: Enabling this will disable per-step profiling.
332+
scan_diffusion_loop: False
326333
# SenCache: Sensitivity-Aware Caching (arXiv:2602.24208) — skip forward pass
327334
# when predicted output change (based on accumulated latent/timestep drift) is small
328335
use_sen_cache: False

src/maxdiffusion/configs/base_wan_i2v_14b.yml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -306,6 +306,13 @@ flow_shift: 5.0
306306

307307
# Diffusion CFG cache (FasterCache-style)
308308
use_cfg_cache: False
309+
310+
# Batch positive and negative prompts in the text encoder to save compute.
311+
use_batched_text_encoder: False
312+
313+
# Use jax.lax.scan for the diffusion loop (non-cache path only).
314+
# Note: Enabling this will disable per-step profiling.
315+
scan_diffusion_loop: False
309316
# SenCache: Sensitivity-Aware Caching (arXiv:2602.24208)
310317
use_sen_cache: False
311318
use_magcache: False

src/maxdiffusion/configs/base_wan_i2v_27b.yml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -318,6 +318,13 @@ boundary_ratio: 0.875
318318

319319
# Diffusion CFG cache (FasterCache-style)
320320
use_cfg_cache: False
321+
322+
# Batch positive and negative prompts in the text encoder to save compute.
323+
use_batched_text_encoder: False
324+
325+
# Use jax.lax.scan for the diffusion loop (non-cache path only).
326+
# Note: Enabling this will disable per-step profiling.
327+
scan_diffusion_loop: False
321328
# SenCache: Sensitivity-Aware Caching (arXiv:2602.24208)
322329
use_sen_cache: False
323330

src/maxdiffusion/pipelines/wan/wan_pipeline.py

Lines changed: 36 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -501,24 +501,45 @@ def encode_prompt(
501501
negative_prompt_embeds: jax.Array = None,
502502
):
503503
prompt = [prompt] if isinstance(prompt, str) else prompt
504-
if prompt_embeds is None:
505-
prompt_embeds = self._get_t5_prompt_embeds(
506-
prompt=prompt,
507-
num_videos_per_prompt=num_videos_per_prompt,
508-
max_sequence_length=max_sequence_length,
509-
)
510-
prompt_embeds = jnp.array(prompt_embeds.detach().float().numpy(), dtype=jnp.float32)
511-
512-
if negative_prompt_embeds is None:
513-
batch_size = len(prompt_embeds)
514-
negative_prompt = negative_prompt or ""
515-
negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt
516-
negative_prompt_embeds = self._get_t5_prompt_embeds(
517-
prompt=negative_prompt,
504+
batch_size = len(prompt)
505+
506+
if negative_prompt is None:
507+
negative_prompt = [""] * batch_size
508+
elif isinstance(negative_prompt, str):
509+
negative_prompt = [negative_prompt] * batch_size
510+
511+
use_batched_text_encoder = getattr(self.config, "use_batched_text_encoder", False)
512+
if use_batched_text_encoder and prompt_embeds is None and negative_prompt_embeds is None:
513+
# Batch both together
514+
combined_prompts = prompt + negative_prompt
515+
combined_embeds = self._get_t5_prompt_embeds(
516+
prompt=combined_prompts,
518517
num_videos_per_prompt=num_videos_per_prompt,
519518
max_sequence_length=max_sequence_length,
520519
)
521-
negative_prompt_embeds = jnp.array(negative_prompt_embeds.detach().float().numpy(), dtype=jnp.float32)
520+
combined_embeds = jnp.array(combined_embeds.detach().float().numpy(), dtype=jnp.float32)
521+
522+
# Split back
523+
prompt_embeds = combined_embeds[: batch_size * num_videos_per_prompt]
524+
negative_prompt_embeds = combined_embeds[batch_size * num_videos_per_prompt :]
525+
526+
else:
527+
# Fallback to separate encoding if one of them is already provided
528+
if prompt_embeds is None:
529+
prompt_embeds = self._get_t5_prompt_embeds(
530+
prompt=prompt,
531+
num_videos_per_prompt=num_videos_per_prompt,
532+
max_sequence_length=max_sequence_length,
533+
)
534+
prompt_embeds = jnp.array(prompt_embeds.detach().float().numpy(), dtype=jnp.float32)
535+
536+
if negative_prompt_embeds is None:
537+
negative_prompt_embeds = self._get_t5_prompt_embeds(
538+
prompt=negative_prompt,
539+
num_videos_per_prompt=num_videos_per_prompt,
540+
max_sequence_length=max_sequence_length,
541+
)
542+
negative_prompt_embeds = jnp.array(negative_prompt_embeds.detach().float().numpy(), dtype=jnp.float32)
522543

523544
return prompt_embeds, negative_prompt_embeds
524545

src/maxdiffusion/pipelines/wan/wan_pipeline_2_1.py

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -261,6 +261,52 @@ def run_inference_2_1(
261261
profiler_steps = config.profiler_steps if config else 0
262262
last_profiling_step = np.clip(first_profiling_step + profiler_steps - 1, first_profiling_step, num_inference_steps - 1)
263263

264+
scan_diffusion_loop = getattr(config, "scan_diffusion_loop", False) if config else False
265+
266+
if scan_diffusion_loop and not use_magcache and not use_cfg_cache:
267+
timesteps = jnp.array(scheduler_state.timesteps, dtype=jnp.int32)
268+
269+
scheduler_state = scheduler_state.replace(last_sample=jnp.zeros_like(latents), step_index=jnp.array(0, dtype=jnp.int32))
270+
271+
def scan_body(carry, t):
272+
current_latents, current_scheduler_state = carry
273+
274+
if do_cfg:
275+
latents_doubled = jnp.concatenate([current_latents] * 2)
276+
timestep = jnp.broadcast_to(t, bsz * 2)
277+
noise_pred, _, _ = transformer_forward_pass_full_cfg(
278+
graphdef,
279+
sharded_state,
280+
rest_of_state,
281+
latents_doubled,
282+
timestep,
283+
prompt_embeds_combined,
284+
guidance_scale=guidance_scale,
285+
)
286+
else:
287+
timestep = jnp.broadcast_to(t, bsz)
288+
noise_pred, _ = transformer_forward_pass(
289+
graphdef,
290+
sharded_state,
291+
rest_of_state,
292+
current_latents,
293+
timestep,
294+
prompt_cond_embeds,
295+
do_classifier_free_guidance=False,
296+
guidance_scale=guidance_scale,
297+
)
298+
299+
new_latents, new_scheduler_state = scheduler.step(current_scheduler_state, noise_pred, t, current_latents).to_tuple()
300+
301+
return (new_latents, new_scheduler_state), None
302+
303+
initial_carry = (latents, scheduler_state)
304+
305+
final_carry, _ = jax.lax.scan(scan_body, initial_carry, timesteps)
306+
307+
final_latents, _ = final_carry
308+
return final_latents
309+
264310
profiler = None
265311
for step in range(num_inference_steps):
266312
if config and max_utils.profiler_enabled(config) and step == first_profiling_step:

src/maxdiffusion/pipelines/wan/wan_pipeline_2_2.py

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -471,6 +471,61 @@ def run_inference_2_2(
471471
profiler_steps = config.profiler_steps if config else 0
472472
last_profiling_step = np.clip(first_profiling_step + profiler_steps - 1, first_profiling_step, num_inference_steps - 1)
473473

474+
scan_diffusion_loop = getattr(config, "scan_diffusion_loop", False) if config else False
475+
476+
if scan_diffusion_loop:
477+
timesteps = jnp.array(scheduler_state.timesteps, dtype=jnp.int32)
478+
479+
scheduler_state = scheduler_state.replace(last_sample=jnp.zeros_like(latents), step_index=jnp.array(0, dtype=jnp.int32))
480+
481+
def scan_body(carry, t):
482+
current_latents, current_scheduler_state = carry
483+
484+
if do_classifier_free_guidance:
485+
model_latents = jnp.concatenate([current_latents] * 2)
486+
else:
487+
model_latents = current_latents
488+
489+
timestep = jnp.broadcast_to(t, model_latents.shape[0])
490+
use_high_noise = jnp.greater_equal(t, boundary)
491+
492+
def high_branch(_):
493+
return transformer_forward_pass(
494+
high_noise_graphdef,
495+
high_noise_state,
496+
high_noise_rest,
497+
model_latents,
498+
timestep,
499+
prompt_embeds_combined,
500+
do_classifier_free_guidance,
501+
guidance_scale_high,
502+
)
503+
504+
def low_branch(_):
505+
return transformer_forward_pass(
506+
low_noise_graphdef,
507+
low_noise_state,
508+
low_noise_rest,
509+
model_latents,
510+
timestep,
511+
prompt_embeds_combined,
512+
do_classifier_free_guidance,
513+
guidance_scale_low,
514+
)
515+
516+
noise_pred, latents_out = jax.lax.cond(use_high_noise, high_branch, low_branch, operand=None)
517+
518+
new_latents, new_scheduler_state = scheduler.step(current_scheduler_state, noise_pred, t, latents_out).to_tuple()
519+
520+
return (new_latents, new_scheduler_state), None
521+
522+
initial_carry = (latents, scheduler_state)
523+
524+
final_carry, _ = jax.lax.scan(scan_body, initial_carry, timesteps)
525+
526+
final_latents, _ = final_carry
527+
return final_latents
528+
474529
profiler = None
475530
for step in range(num_inference_steps):
476531
if config and max_utils.profiler_enabled(config) and step == first_profiling_step:

src/maxdiffusion/pipelines/wan/wan_pipeline_i2v_2p1.py

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -317,6 +317,54 @@ def run_inference_2_1_i2v(
317317
profiler_steps = config.profiler_steps if config else 0
318318
last_profiling_step = np.clip(first_profiling_step + profiler_steps - 1, first_profiling_step, num_inference_steps - 1)
319319

320+
scan_diffusion_loop = getattr(config, "scan_diffusion_loop", False) if config else False
321+
322+
if scan_diffusion_loop and not use_magcache:
323+
timesteps = jnp.array(scheduler_state.timesteps, dtype=jnp.int32)
324+
325+
scheduler_state = scheduler_state.replace(last_sample=jnp.zeros_like(latents), step_index=jnp.array(0, dtype=jnp.int32))
326+
327+
def scan_body(carry, t):
328+
current_latents, current_scheduler_state = carry
329+
330+
latents_input = current_latents
331+
if do_cfg:
332+
latents_input = jnp.concatenate([current_latents, current_latents], axis=0)
333+
334+
latent_model_input = jnp.concatenate([latents_input, condition_combined], axis=-1)
335+
timestep = jnp.broadcast_to(t, latents_input.shape[0])
336+
latent_model_input = jnp.transpose(latent_model_input, (0, 4, 1, 2, 3))
337+
338+
outputs = transformer_forward_pass(
339+
graphdef,
340+
sharded_state,
341+
rest_of_state,
342+
latent_model_input,
343+
timestep,
344+
prompt_embeds_combined,
345+
do_classifier_free_guidance=do_cfg,
346+
guidance_scale=guidance_scale,
347+
encoder_hidden_states_image=image_embeds_combined,
348+
skip_blocks=None,
349+
cached_residual=None,
350+
return_residual=False,
351+
)
352+
noise_pred, _ = outputs
353+
354+
noise_pred = jnp.transpose(noise_pred, (0, 2, 3, 4, 1))
355+
new_latents, new_scheduler_state = scheduler.step(
356+
current_scheduler_state, noise_pred, t, current_latents, return_dict=False
357+
)
358+
359+
return (new_latents, new_scheduler_state), None
360+
361+
initial_carry = (latents, scheduler_state)
362+
363+
final_carry, _ = jax.lax.scan(scan_body, initial_carry, timesteps)
364+
365+
final_latents, _ = final_carry
366+
return final_latents
367+
320368
profiler = None
321369
for step in range(num_inference_steps):
322370
if config and max_utils.profiler_enabled(config) and step == first_profiling_step:

src/maxdiffusion/pipelines/wan/wan_pipeline_i2v_2p2.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -609,6 +609,38 @@ def low_noise_branch(operands):
609609
profiler_steps = config.profiler_steps if config else 0
610610
last_profiling_step = np.clip(first_profiling_step + profiler_steps - 1, first_profiling_step, num_inference_steps - 1)
611611

612+
scan_diffusion_loop = getattr(config, "scan_diffusion_loop", False) if config else False
613+
614+
if scan_diffusion_loop:
615+
timesteps = jnp.array(scheduler_state.timesteps, dtype=jnp.int32)
616+
617+
scheduler_state = scheduler_state.replace(last_sample=jnp.zeros_like(latents), step_index=jnp.array(0, dtype=jnp.int32))
618+
619+
def scan_body(carry, t):
620+
current_latents, current_scheduler_state = carry
621+
622+
latents_input = current_latents
623+
if do_classifier_free_guidance:
624+
latents_input = jnp.concatenate([current_latents, current_latents], axis=0)
625+
latent_model_input = jnp.concatenate([latents_input, condition], axis=-1)
626+
timestep = jnp.broadcast_to(t, latents_input.shape[0])
627+
628+
use_high_noise = jnp.greater_equal(t, boundary)
629+
noise_pred, _ = jax.lax.cond(
630+
use_high_noise, high_noise_branch, low_noise_branch, (latent_model_input, timestep, prompt_embeds, image_embeds)
631+
)
632+
noise_pred = jnp.transpose(noise_pred, (0, 2, 3, 4, 1))
633+
new_latents, new_scheduler_state = scheduler.step(current_scheduler_state, noise_pred, t, current_latents).to_tuple()
634+
635+
return (new_latents, new_scheduler_state), None
636+
637+
initial_carry = (latents, scheduler_state)
638+
639+
final_carry, _ = jax.lax.scan(scan_body, initial_carry, timesteps)
640+
641+
final_latents, _ = final_carry
642+
return final_latents
643+
612644
profiler = None
613645
for step in range(num_inference_steps):
614646
if config and max_utils.profiler_enabled(config) and step == first_profiling_step:

0 commit comments

Comments
 (0)