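debug: drop trace prints from LTX2 attention/transformer; log text-encoder output and first-step raw noise_pred stats in the pipeline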

prishajain1 · prishajain1 · commit 18f5d63f1e13 · 2026-05-06T10:20:48.000+05:30
diff --git a/src/maxdiffusion/models/ltx2/attention_ltx2.py b/src/maxdiffusion/models/ltx2/attention_ltx2.py
@@ -521,13 +521,11 @@ def __call__(
       attn_output = self.attention_op.apply_attention(query=query, key=key, value=value, attention_mask=attention_mask)
 
       if perturbation_mask is not None:
-        print("DEBUG: Applying perturbation mask")
         # value is [B, S, InnerDim]
         # attn_output is [B, S, InnerDim]
         attn_output = value + perturbation_mask * (attn_output - value)
 
       if getattr(self, "to_gate_logits", None) is not None:
-        print("DEBUG: Applying gated attention")
         gate_logits = self.to_gate_logits(hidden_states)
         b, s, _ = attn_output.shape
         attn_output = attn_output.reshape(b, s, self.heads, self.dim_head)
diff --git a/src/maxdiffusion/models/ltx2/transformer_ltx2.py b/src/maxdiffusion/models/ltx2/transformer_ltx2.py
@@ -746,7 +746,6 @@ def __init__(
 
     # 2. Prompt embeddings
     if self.use_prompt_embeddings:
-      print("DEBUG: Initializing caption projection (LTX-2.0 path)")
       self.caption_projection = NNXPixArtAlphaTextProjection(
           rngs=rngs,
           in_features=self.caption_channels,
@@ -766,7 +765,6 @@ def __init__(
       self.audio_caption_projection = None
 
     if self.cross_attn_mod:
-      print("DEBUG: Initializing prompt_adaln (LTX-2.3 path)")
       self.prompt_adaln = LTX2AdaLayerNormSingle(
           rngs=rngs,
           embedding_dim=inner_dim,
@@ -1105,7 +1103,6 @@ def __call__(
       audio_embedded_timestep = audio_embedded_timestep.reshape(batch_size, -1, audio_embedded_timestep.shape[-1])
 
       if self.cross_attn_mod and sigma is not None:
-        print("DEBUG: Executing prompt_adaln (LTX-2.3 path)")
         audio_sigma = audio_sigma if audio_sigma is not None else sigma
         temb_prompt, _ = self.prompt_adaln(
             sigma.flatten(),
@@ -1122,7 +1119,6 @@ def __call__(
         temb_prompt_audio = None
 
       if use_cross_timestep:
-        print("DEBUG: Using cross timestep (LTX-2.3 path)")
         assert sigma is not None and audio_sigma is not None, "sigma and audio_sigma must be provided when use_cross_timestep is True"
         video_ca_timestep = audio_sigma.flatten()
         audio_ca_timestep = sigma.flatten()
diff --git a/src/maxdiffusion/pipelines/ltx2/ltx2_pipeline.py b/src/maxdiffusion/pipelines/ltx2/ltx2_pipeline.py
@@ -1398,6 +1398,14 @@ def __call__(
       prompt_attention_mask_jax = jnp.concatenate([negative_prompt_attention_mask_jax, prompt_attention_mask_jax], axis=0)
       latents_jax = jnp.concatenate([latents_jax] * 2, axis=0)
       audio_latents_jax = jnp.concatenate([audio_latents_jax] * 2, axis=0)
+    def _print_stats_gemma(name, tensor):
+      print(
+          f"DEBUG {name} shape: {tensor.shape}, mean: {jnp.round(jnp.mean(tensor), 6)}, min: {jnp.round(jnp.min(tensor), 4)}, max: {jnp.round(jnp.max(tensor), 4)}, std: {jnp.round(jnp.std(tensor), 4)}"
+      )
+    if do_cfg and do_stg:
+      _print_stats_gemma("text_encoder_output_flattened", prompt_embeds_jax[:2])
+    else:
+      _print_stats_gemma("text_encoder_output_flattened", prompt_embeds_jax)
 
     if hasattr(self, "mesh") and self.mesh is not None:
       data_sharding_3d = NamedSharding(self.mesh, P())
@@ -1444,19 +1452,31 @@ def __call__(
         audio_embeds_sharded = jax.device_put(audio_embeds, spec)
       def _print_stats(name, tensor):
         print(
-            f"DEBUG {name} shape: {tensor.shape}, mean: {jnp.round(jnp.mean(tensor), 4)}, min: {jnp.round(jnp.min(tensor), 4)}, max: {jnp.round(jnp.max(tensor), 4)}, std: {jnp.round(jnp.std(tensor), 4)}"
+            f"DEBUG {name} shape: {tensor.shape}, mean: {jnp.round(jnp.mean(tensor), 6)}, min: {jnp.round(jnp.min(tensor), 4)}, max: {jnp.round(jnp.max(tensor), 4)}, std: {jnp.round(jnp.std(tensor), 4)}"
         )
       print(f"WEIGHT DEBUG: block 0 to_q kernel mean: {float(self.transformer.transformer_blocks.attn1.to_q.kernel.value[0].mean()):.6f}")
-      _print_stats("video_embeds", video_embeds)
-      _print_stats("audio_embeds", audio_embeds)
+      if do_cfg and do_stg:
+        _print_stats("video_text_embedding", video_embeds[:2])
+        _print_stats("audio_text_embedding", audio_embeds[:2])
+      else:
+        _print_stats("video_text_embedding", video_embeds)
+        _print_stats("audio_text_embedding", audio_embeds)
 
       timesteps_jax = jnp.array(timesteps, dtype=jnp.float32)
 
       diffusion_loop_start = time.time()
       scan_diffusion_loop = getattr(self.config, "scan_diffusion_loop", True)
 
-      _print_stats("latents_jax_before_loop", latents_jax)
-      _print_stats("audio_latents_jax_before_loop", audio_latents_jax)
+      if do_cfg and do_stg:
+        _print_stats("latents_jax_before_loop", latents_jax[:batch_size])
+        _print_stats("audio_latents_jax_before_loop", audio_latents_jax[:batch_size])
+      elif do_cfg:
+        _print_stats("latents_jax_before_loop", latents_jax[:batch_size])
+        _print_stats("audio_latents_jax_before_loop", audio_latents_jax[:batch_size])
+      else:
+        _print_stats("latents_jax_before_loop", latents_jax)
+        _print_stats("audio_latents_jax_before_loop", audio_latents_jax)
+
       if scan_diffusion_loop:
         latents_jax, audio_latents_jax = run_diffusion_loop(
             graphdef,
@@ -1948,6 +1968,21 @@ def scan_body(carry, inputs):
 
       is_first_step = (t == timesteps_jax[0])
 
+      def print_raw_stats():
+        print_stats_jit("noise_pred_video_raw", noise_pred)
+        print_stats_jit("noise_pred_audio_raw", noise_pred_audio)
+        if do_cfg:
+          uncond_v = noise_pred[:batch_size]
+          cond_v = noise_pred[batch_size : 2 * batch_size]
+          uncond_a = noise_pred_audio[:batch_size]
+          cond_a = noise_pred_audio[batch_size : 2 * batch_size]
+          print_stats_jit("noise_pred_video_raw_uncond", uncond_v)
+          print_stats_jit("noise_pred_video_raw_cond", cond_v)
+          print_stats_jit("noise_pred_audio_raw_uncond", uncond_a)
+          print_stats_jit("noise_pred_audio_raw_cond", cond_a)
+
+      jax.lax.cond(is_first_step, print_raw_stats, lambda: None)
+
 
       # Extract latents_step based on stacking strategy
       if do_cfg and do_stg:
@@ -1981,7 +2016,6 @@ def scan_body(carry, inputs):
           x0_combined = rescale_noise_cfg(x0_combined, x0_text, guidance_rescale=guidance_rescale)
           
         noise_pred = convert_to_vel(latents_step, x0_combined, sigma_t)
-        jax.lax.cond(is_first_step, lambda: print_stats_jit("noise_pred_video_after_guidance", noise_pred), lambda: None)
 
         # Audio guidance
         noise_pred_audio_uncond, noise_pred_audio_text, noise_pred_audio_perturb, noise_pred_audio_isolated = jnp.split(noise_pred_audio, 4, axis=0)
@@ -2001,7 +2035,6 @@ def scan_body(carry, inputs):
           x0_audio_combined = rescale_noise_cfg(x0_audio_combined, x0_audio_text, guidance_rescale=audio_guidance_rescale)
 
         noise_pred_audio = convert_to_vel(audio_latents_step, x0_audio_combined, sigma_t)
-        jax.lax.cond(is_first_step, lambda: print_stats_jit("noise_pred_audio_after_guidance", noise_pred_audio), lambda: None)
 
       # ... (Standard CFG paths can be added here, but for brevity and since LTX2.3 runs with STG this handles the core logic)
       elif do_cfg: