AI-Hypercomputer
diff --git a/src/maxdiffusion/checkpointing/ltx2_checkpointer.py b/src/maxdiffusion/checkpointing/ltx2_checkpointer.py
Lines changed: 3 additions & 3 deletions
diff --git a/src/maxdiffusion/configs/ltx2_video.yml b/src/maxdiffusion/configs/ltx2_video.yml
Lines changed: 13 additions & 0 deletions
diff --git a/src/maxdiffusion/generate_ltx2.py b/src/maxdiffusion/generate_ltx2.py
Lines changed: 8 additions & 3 deletions
diff --git a/src/maxdiffusion/models/ltx2/attention_ltx2.py b/src/maxdiffusion/models/ltx2/attention_ltx2.py
Lines changed: 42 additions & 36 deletions
@@ -79,19 +79,19 @@ def load_ltx2_configs_from_orbax(self, step: Optional[int]) -> Tuple[Optional[di
     return restored_checkpoint, step
 
   def load_checkpoint(
-      self, step=None, vae_only=False, load_transformer=True
+      self, step=None, vae_only=False, load_transformer=True, load_upsampler=False
   ) -> Tuple[LTX2Pipeline, Optional[dict], Optional[int]]:
     restored_checkpoint, step = self.load_ltx2_configs_from_orbax(step)
     opt_state = None
 
     if restored_checkpoint:
       max_logging.log("Loading LTX2 pipeline from checkpoint")
-      pipeline = LTX2Pipeline.from_checkpoint(self.config, restored_checkpoint, vae_only, load_transformer)
+      pipeline = LTX2Pipeline.from_checkpoint(self.config, restored_checkpoint, vae_only, load_transformer, load_upsampler)
       if "opt_state" in restored_checkpoint.ltx2_state.keys():
         opt_state = restored_checkpoint.ltx2_state["opt_state"]
     else:
       max_logging.log("No checkpoint found, loading pipeline from pretrained hub")
-      pipeline = LTX2Pipeline.from_pretrained(self.config, vae_only, load_transformer)
+      pipeline = LTX2Pipeline.from_pretrained(self.config, vae_only, load_transformer, load_upsampler)
 
     return pipeline, opt_state, step
 
 
@@ -2,6 +2,8 @@
 hardware: 'tpu'
 skip_jax_distributed_system: False
 attention: 'flash'
+a2v_attention_kernel: 'flash'
+v2a_attention_kernel: 'dot_product'
 attention_sharding_uniform: True 
 precision: 'bf16'
 scan_layers: True
@@ -68,6 +70,7 @@ flash_block_sizes: {
   block_kv_dkv_compute: 2048,
   use_fused_bwd_kernel: True,
 }
+flash_min_seq_length: 4096
 dcn_context_parallelism: 1
 dcn_tensor_parallelism: 1
 ici_data_parallelism: 1
@@ -102,3 +105,13 @@ jit_initializers: True
 enable_single_replica_ckpt_restoring: False
 seed: 0
 audio_format: "s16"
+
+# LTX-2 Latent Upsampler
+run_latent_upsampler: False
+upsampler_model_path: "Lightricks/LTX-2"
+upsampler_spatial_patch_size: 1
+upsampler_temporal_patch_size: 1
+upsampler_adain_factor: 0.0
+upsampler_tone_map_compression_ratio: 0.0
+upsampler_rational_spatial_scale: 2.0
+upsampler_output_type: "pil"
@@ -81,7 +81,6 @@ def get_git_commit_hash():
 
 
 def call_pipeline(config, pipeline, prompt, negative_prompt):
-  # Set default generation arguments
   generator = jax.random.key(config.seed) if hasattr(config, "seed") else jax.random.key(0)
   guidance_scale = config.guidance_scale if hasattr(config, "guidance_scale") else 3.0
 
@@ -99,6 +98,7 @@ def call_pipeline(config, pipeline, prompt, negative_prompt):
       decode_noise_scale=getattr(config, "decode_noise_scale", None),
       max_sequence_length=getattr(config, "max_sequence_length", 1024),
       dtype=jnp.bfloat16 if getattr(config, "activations_dtype", "bfloat16") == "bfloat16" else jnp.float32,
+      output_type=getattr(config, "upsampler_output_type", "pil"),
   )
   return out
 
@@ -114,9 +114,11 @@ def run(config, pipeline=None, filename_prefix="", commit_hash=None):
     else:
       max_logging.log("Could not retrieve Git commit hash.")
 
+  checkpoint_loader = LTX2Checkpointer(config=config)
   if pipeline is None:
-    checkpoint_loader = LTX2Checkpointer(config=config)
-    pipeline, _, _ = checkpoint_loader.load_checkpoint()
+    # Use the config flag to determine if the upsampler should be loaded
+    run_latent_upsampler = getattr(config, "run_latent_upsampler", False)
+    pipeline, _, _ = checkpoint_loader.load_checkpoint(load_upsampler=run_latent_upsampler)
 
   pipeline.enable_vae_slicing()
   pipeline.enable_vae_tiling()
@@ -135,6 +137,7 @@ def run(config, pipeline=None, filename_prefix="", commit_hash=None):
   )
 
   out = call_pipeline(config, pipeline, prompt, negative_prompt)
+
   # out should have .frames and .audio
   videos = out.frames if hasattr(out, "frames") else out[0]
   audios = out.audio if hasattr(out, "audio") else None
@@ -143,6 +146,8 @@ def run(config, pipeline=None, filename_prefix="", commit_hash=None):
   max_logging.log(f"model name: {getattr(config, 'model_name', 'ltx-video')}")
   max_logging.log(f"model path: {config.pretrained_model_name_or_path}")
   max_logging.log(f"model type: {getattr(config, 'model_type', 'T2V')}")
+  if getattr(config, "run_latent_upsampler", False):
+    max_logging.log(f"upsampler model path: {config.upsampler_model_path}")
   max_logging.log(f"hardware: {jax.devices()[0].platform}")
   max_logging.log(f"number of devices: {jax.device_count()}")
   max_logging.log(f"per_device_batch_size: {config.per_device_batch_size}")
 
@@ -16,6 +16,7 @@
 
 from typing import Optional, Tuple
 from flax import nnx
+import jax
 import jax.numpy as jnp
 from ... import common_types
 from ..attention_flax import NNXAttentionOp
@@ -347,6 +348,7 @@ def __init__(
       attention_kernel: str = "flash",
       rope_type: str = "interleaved",
       flash_block_sizes: BlockSizes = None,
+      flash_min_seq_length: int = 4096,
   ):
     self.heads = heads
     self.rope_type = rope_type
@@ -434,6 +436,7 @@ def __init__(
         axis_names_q=(common_types.BATCH, common_types.SELF_ATTN_HEAD, common_types.SELF_ATTN_Q_LENGTH, common_types.D_KV),
         axis_names_kv=(common_types.BATCH, common_types.SELF_ATTN_HEAD, common_types.SELF_ATTN_KV_LENGTH, common_types.D_KV),
         flash_block_sizes=flash_block_sizes,
+        flash_min_seq_length=flash_min_seq_length,
     )
 
   def __call__(
@@ -447,46 +450,49 @@ def __call__(
     # Determine context (Self or Cross)
     context = encoder_hidden_states if encoder_hidden_states is not None else hidden_states
 
-    # 1. Project
-    query = self.to_q(hidden_states)
-    key = self.to_k(context)
-    value = self.to_v(context)
+    # 1. Project and Norm
+    with jax.named_scope("QKV Projection"):
+      query = self.to_q(hidden_states)
+      key = self.to_k(context)
+      value = self.to_v(context)
 
-    # 2. Norm (Full Inner Dimension)
-    query = self.norm_q(query)
-    key = self.norm_k(key)
+    with jax.named_scope("QKV Norm"):
+      query = self.norm_q(query)
+      key = self.norm_k(key)
 
     # 3. Apply RoPE to tensors of shape [B, S, InnerDim]
     # Frequencies are shape [B, S, InnerDim]
     # 3. Apply RoPE
-    if rotary_emb is not None:
-      if hasattr(self, "rope_type") and self.rope_type == "split":
-        # Split RoPE: passing full freqs [B, H, S, D//2]
-        # apply_split_rotary_emb handles reshaping query/key
-
-        query = apply_split_rotary_emb(query, rotary_emb)
-
-        if k_rotary_emb is not None:
-          key = apply_split_rotary_emb(key, k_rotary_emb)
-        elif encoder_hidden_states is None:
-          key = apply_split_rotary_emb(key, rotary_emb)
-
-      else:
-        # Interleaved (Default)
-        query = apply_rotary_emb(query, rotary_emb)
-        if k_rotary_emb is not None:
-          key = apply_rotary_emb(key, k_rotary_emb)
-        elif encoder_hidden_states is None:
-          key = apply_rotary_emb(key, rotary_emb)
-
-    # 4. Attention
-    # NNXAttentionOp expects flattened input [B, S, InnerDim] for flash kernel
-    attn_output = self.attention_op.apply_attention(query=query, key=key, value=value, attention_mask=attention_mask)
-
-    # 7. Output Projection
-    hidden_states = self.to_out(attn_output)
-
-    if self.dropout_layer is not None:
-      hidden_states = self.dropout_layer(hidden_states)
+    with jax.named_scope("Apply RoPE"):
+      if rotary_emb is not None:
+        if hasattr(self, "rope_type") and self.rope_type == "split":
+          # Split RoPE: passing full freqs [B, H, S, D//2]
+          # apply_split_rotary_emb handles reshaping query/key
+
+          query = apply_split_rotary_emb(query, rotary_emb)
+
+          if k_rotary_emb is not None:
+            key = apply_split_rotary_emb(key, k_rotary_emb)
+          elif encoder_hidden_states is None:
+            key = apply_split_rotary_emb(key, rotary_emb)
+
+        else:
+          # Interleaved (Default)
+          query = apply_rotary_emb(query, rotary_emb)
+          if k_rotary_emb is not None:
+            key = apply_rotary_emb(key, k_rotary_emb)
+          elif encoder_hidden_states is None:
+            key = apply_rotary_emb(key, rotary_emb)
+
+    with jax.named_scope("Attention and Output Project"):
+      # 4. Attention
+      # NNXAttentionOp expects flattened input [B, S, InnerDim] for flash kernel
+      attn_output = self.attention_op.apply_attention(query=query, key=key, value=value, attention_mask=attention_mask)
+
+      # 7. Output Projection
+      hidden_states = self.to_out(attn_output)
+
+      if self.dropout_layer is not None:
+        hidden_states = self.dropout_layer(hidden_states)
 
     return hidden_states