Offload LTX-2 text encoder to TorchAX and resolve lint issues

mbohlool · mbohlool · commit 7707c3df6da3 · 2026-05-04T20:42:03.000Z
diff --git a/dependencies/requirements/base_requirements/requirements.txt b/dependencies/requirements/base_requirements/requirements.txt
@@ -35,6 +35,7 @@ tensorflow-datasets
 tensorflow
 tokamax
 tokenizers
+torchax>=0.0.11
 transformers<5.0.0
 
 # pinning torch and torchvision to specific versions to avoid
diff --git a/dependencies/requirements/generated_requirements/requirements.txt b/dependencies/requirements/generated_requirements/requirements.txt
@@ -179,6 +179,7 @@ toml>=0.10.2
 tomlkit>=0.14.0
 toolz>=1.1.0
 torch @ https://download.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp312-cp312-manylinux_2_28_x86_64.whl
+torchax>=0.0.11
 torchvision @ https://download.pytorch.org/whl/cpu/torchvision-0.25.0%2Bcpu-cp312-cp312-manylinux_2_28_x86_64.whl
 tqdm>=4.67.3
 transformers>=4.57.6
diff --git a/src/maxdiffusion/models/ltx2/text_encoders/torchax_text_encoder.py b/src/maxdiffusion/models/ltx2/text_encoders/torchax_text_encoder.py
@@ -0,0 +1,75 @@
+"""
+Copyright 2026 Google LLC
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    https://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+from typing import Tuple
+
+import torch
+import jax
+from torchax import interop, default_env
+
+# --- Monkeypatch transformers masking_utils to avoid torchax integer tracing bug ---
+import transformers.masking_utils
+
+_orig_sliding_window_overlay = transformers.masking_utils.sliding_window_overlay
+
+
+def _patched_sliding_window_overlay(sliding_window: int):
+  # pylint: disable=unused-argument
+
+  def inner_mask(batch_idx: int, head_idx: int, q_idx: int, kv_idx: int) -> bool:
+    # Since sequence length < sliding window (e.g. 256 < 4096), this mask is always True.
+    # We return a standard boolean tensor using new_ones to guarantee Torchax compatibility
+    # and prevent any implicit tracing crashes.
+    return q_idx.new_ones((), dtype=torch.bool)
+
+  return inner_mask
+
+
+transformers.masking_utils.sliding_window_overlay = _patched_sliding_window_overlay
+# -----------------------------------------------------------------------------------
+
+
+class TorchaxGemma3TextEncoder(interop.JittableModule):
+  """
+  A jittable Torchax module for wrapping the HuggingFace PyTorch
+  Gemma3ForConditionalGeneration text encoder.
+  """
+
+  def __init__(self, text_encoder):
+    super().__init__(text_encoder, extra_jit_args={"static_argnames": ["output_hidden_states"]})
+
+  def __call__(
+      self, input_ids: jax.Array, attention_mask: jax.Array, output_hidden_states: bool = True
+  ) -> Tuple[jax.Array, ...]:
+    with default_env():
+      input_ids = interop.torch_view(input_ids)
+      attention_mask = interop.torch_view(attention_mask)
+
+      output = self.functional_call(
+          self._forward_inner,
+          params=self.params,
+          buffers=self.buffers,
+          input_ids=input_ids,
+          attention_mask=attention_mask,
+          output_hidden_states=output_hidden_states,
+      )
+    return interop.jax_view(output)
+
+  @staticmethod
+  def _forward_inner(model, input_ids, attention_mask, output_hidden_states=True):
+    # We only return hidden states as a tuple of tensors.
+    # That allows interop.jax_view to convert them into a tuple of jax Arrays
+    return model(input_ids=input_ids, attention_mask=attention_mask, output_hidden_states=output_hidden_states).hidden_states
diff --git a/src/maxdiffusion/pipelines/ltx2/ltx2_pipeline.py b/src/maxdiffusion/pipelines/ltx2/ltx2_pipeline.py
@@ -22,16 +22,20 @@
 import jax
 import jax.numpy as jnp
 from jax.sharding import Mesh, NamedSharding, PartitionSpec as P
+from torchax import default_env
+from maxdiffusion.models.ltx2.text_encoders.torchax_text_encoder import TorchaxGemma3TextEncoder
+from maxdiffusion.tpu_utils import get_tpu_type, TpuType
+from maxdiffusion.maxdiffusion_utils import get_dummy_ltx2_inputs
+import contextlib
 import flax
 import flax.linen as nn
 import flax.traverse_util
 from flax import nnx
 from flax.linen import partitioning as nn_partitioning
 from transformers import AutoTokenizer, GemmaTokenizer, GemmaTokenizerFast, Gemma3ForConditionalGeneration
-from maxdiffusion.tpu_utils import get_tpu_type, TpuType
 import qwix
 from ...utils import logging
-from ...schedulers import FlaxFlowMatchScheduler
+from ...schedulers import FlaxFlowMatchScheduler  # pylint: disable=no-name-in-module
 from ...models.ltx2.autoencoder_kl_ltx2 import LTX2VideoAutoencoderKL
 from ...models.ltx2.autoencoder_kl_ltx2_audio import FlaxAutoencoderKLLTX2Audio
 from ...models.ltx2.vocoder_ltx2 import LTX2Vocoder
@@ -53,7 +57,6 @@
 from ... import max_logging
 from ... import max_utils
 from ...max_utils import get_precision, device_put_replicated, get_flash_block_sizes
-from maxdiffusion.maxdiffusion_utils import get_dummy_ltx2_inputs
 
 
 @flax.struct.dataclass
@@ -65,7 +68,8 @@ class LTX2PipelineOutput:
 def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
   """
   Rescales `noise_cfg` tensor based on `guidance_rescale` to improve image quality and fix overexposure.
-  Based on Section 3.4 from [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://huggingface.co/papers/2305.08891).
+  Based on Section 3.4 from [Common Diffusion Noise Schedules and Sample Steps are Flawed]
+  (https://huggingface.co/papers/2305.08891).
   """
   std_text = jnp.std(noise_pred_text, axis=list(range(1, noise_pred_text.ndim)), keepdims=True)
   std_cfg = jnp.std(noise_cfg, axis=list(range(1, noise_cfg.ndim)), keepdims=True)
@@ -110,6 +114,9 @@ def create_sharded_logical_transformer(
     restored_checkpoint=None,
     subfolder: str = "",
 ):
+  """Creates a sharded logical transformer."""
+
+  # pylint: disable=too-many-positional-arguments,unused-argument
   def create_model(rngs: nnx.Rngs, ltx2_config: dict):
     transformer = LTX2VideoTransformer3DModel(**ltx2_config, rngs=rngs)
     return transformer
@@ -186,6 +193,8 @@ def calculate_shift(
     base_shift: float = 0.5,
     max_shift: float = 1.15,
 ):
+  """Calculates the shift for the timestep schedule."""
+  # pylint: disable=too-many-positional-arguments
   m = (max_shift - base_shift) / (max_seq_len - base_seq_len)
   b = base_shift - m * base_seq_len
   mu = image_seq_len * m + b
@@ -200,6 +209,8 @@ def retrieve_timesteps(
     sigmas: Optional[List[float]] = None,
     **kwargs,
 ):
+  """Retrieves timesteps for the scheduler."""
+  # pylint: disable=too-many-positional-arguments
   if timesteps is not None and sigmas is not None:
     raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
 
@@ -222,6 +233,8 @@ class LTX2Pipeline:
   Pipeline for LTX-2.
   """
 
+  # pylint: disable=missing-function-docstring,too-many-positional-arguments,unused-argument
+
   def __init__(
       self,
       scheduler: FlaxFlowMatchScheduler,
@@ -245,6 +258,8 @@ def __init__(
     self.transformer = transformer
     self.latent_upsampler = latent_upsampler
     self.latent_upsampler_params = latent_upsampler_params
+    self.mesh = None
+    self.config = None
 
     # VAE compression ratios
     self.vae_spatial_compression_ratio = getattr(self.vae, "spatial_compression_ratio", 32)
@@ -316,6 +331,11 @@ def load_text_encoder(cls, config: HyperParameters):
         torch_dtype=torch.bfloat16,
     )
     text_encoder.eval()
+
+    with default_env():
+      text_encoder = text_encoder.to("jax")
+      text_encoder = TorchaxGemma3TextEncoder(text_encoder)
+
     return text_encoder
 
   @classmethod
@@ -396,7 +416,7 @@ def create_model(rngs: nnx.Rngs, config: HyperParameters):
         sharding = sharding.get_value()
         try:
           replicate_vae = config.replicate_vae
-        except ValueError:
+        except Exception:  # pylint: disable=broad-exception-caught
           replicate_vae = False
         if replicate_vae:
           sharding = NamedSharding(mesh, P())
@@ -444,7 +464,7 @@ def create_model(rngs: nnx.Rngs, config: HyperParameters):
         sharding = sharding.get_value()
         try:
           replicate_vae = config.replicate_vae
-        except ValueError:
+        except Exception:  # pylint: disable=broad-exception-caught
           replicate_vae = False
         if replicate_vae:
           sharding = NamedSharding(mesh, P())
@@ -750,39 +770,48 @@ def _get_gemma_prompt_embeds(
     prompt = [p.strip() for p in prompt]
 
     if self.text_encoder is not None:
-      # PyTorch Text Encoder
+      # Torchax Text Encoder
       text_inputs = self.tokenizer(
           prompt,
           padding="max_length",
           max_length=max_sequence_length,
           truncation=True,
           add_special_tokens=True,
-          return_tensors="pt",
+          return_tensors="np",
+      )
+      text_input_ids = jnp.array(text_inputs.input_ids)
+      prompt_attention_mask = jnp.array(text_inputs.attention_mask)
+
+      # Distribute the batch dimension across available TPUs to prevent Softmax OOM
+      # (reduces 512MB allocation down to 64MB per TPU for batch size 16)
+      devices = np.array(jax.devices())
+      num_shards = 1
+      for i in range(len(devices), 0, -1):
+        if text_input_ids.shape[0] % i == 0:
+          num_shards = i
+          break
+
+      if num_shards > 1:
+        mesh = Mesh(devices[:num_shards], axis_names=("batch",))
+        sharding = NamedSharding(mesh, P("batch"))
+        text_input_ids = jax.device_put(text_input_ids, sharding)
+        prompt_attention_mask = jax.device_put(prompt_attention_mask, sharding)
+
+      # Torchax wrapper returns tuple of hidden states natively
+      text_encoder_hidden_states = self.text_encoder(
+          input_ids=text_input_ids, attention_mask=prompt_attention_mask, output_hidden_states=True
       )
-      text_input_ids = text_inputs.input_ids
-      prompt_attention_mask = text_inputs.attention_mask
-
-      text_input_ids = text_input_ids.to(self.text_encoder.device)
-      prompt_attention_mask = prompt_attention_mask.to(self.text_encoder.device)
-
-      with torch.no_grad():
-        text_encoder_outputs = self.text_encoder(
-            input_ids=text_input_ids, attention_mask=prompt_attention_mask, output_hidden_states=True
-        )
-
-      text_encoder_hidden_states = text_encoder_outputs.hidden_states
-      del text_encoder_outputs  # Free memory
 
       prompt_embeds_list = []
       # Iterate instead of stacking eagerly to avoid 5.7+ GB HBM allocations outside JIT
       for state in text_encoder_hidden_states:
-        state_np = state.cpu().to(torch.float32).numpy()
-        prompt_embeds_list.append(jnp.array(state_np, dtype=jnp.bfloat16))
+        state = jax.device_put(state, jax.devices()[0])
+        prompt_embeds_list.append(state.astype(jnp.bfloat16))
 
       prompt_embeds = prompt_embeds_list
-      del text_encoder_hidden_states  # Free PyTorch tensor memory
+      del text_encoder_hidden_states  # Free memory
 
-      prompt_attention_mask = jnp.array(prompt_attention_mask.cpu().to(torch.float32).numpy(), dtype=jnp.bool_)
+      prompt_attention_mask = prompt_attention_mask.astype(jnp.bool_)
     else:
       raise ValueError("`text_encoder` is required to encode prompts.")
 
@@ -939,7 +968,7 @@ def check_inputs(
 
   @staticmethod
   def _pack_latents(latents: jax.Array, patch_size: int = 1, patch_size_t: int = 1) -> jax.Array:
-    batch_size, num_channels, num_frames, height, width = latents.shape
+    batch_size, _, num_frames, height, width = latents.shape
     post_patch_num_frames = num_frames // patch_size_t
     post_patch_height = height // patch_size
     post_patch_width = width // patch_size
@@ -1028,7 +1057,7 @@ def _pack_audio_latents(
       latents: jax.Array, patch_size: Optional[int] = None, patch_size_t: Optional[int] = None
   ) -> jax.Array:
     if patch_size is not None and patch_size_t is not None:
-      batch_size, num_channels, latent_length, latent_mel_bins = latents.shape
+      batch_size, _, latent_length, latent_mel_bins = latents.shape
       post_patch_latent_length = latent_length // patch_size_t
       post_patch_mel_bins = latent_mel_bins // patch_size
       latents = latents.reshape(batch_size, -1, post_patch_latent_length, patch_size_t, post_patch_mel_bins, patch_size)
@@ -1327,7 +1356,6 @@ def __call__(
     graphdef, state = nnx.split(self.transformer)
 
     # 7. Denoising Loop
-    import contextlib
 
     context_manager = self.mesh if hasattr(self, "mesh") and self.mesh is not None else contextlib.nullcontext()
     axis_rules_context = (
@@ -1380,9 +1408,7 @@ def __call__(
         )
       else:
         # Old Python loop path
-        for i in range(len(timesteps_jax)):
-          t = timesteps_jax[i]
-
+        for _, t in enumerate(timesteps_jax):
           # Isolate input sharding to scan_layers=False to avoid affecting the standard path
           latents_jax_sharded = latents_jax
           audio_latents_jax_sharded = audio_latents_jax
@@ -1503,20 +1529,45 @@ def __call__(
     if output_type == "latent":
       return LTX2PipelineOutput(frames=latents, audio=audio_latents)
 
-    # Force latents and VAE weights to be fully replicated using with_sharding_constraint, this speeds up single video latency ~3x
-    try:
-      mesh = latents.sharding.mesh
-      replicated_sharding = NamedSharding(mesh, P())
-      latents = jax.lax.with_sharding_constraint(latents, replicated_sharding)
-
-      # Replicate VAE weights
-      graphdef, state = nnx.split(self.vae)
-      state = jax.tree_util.tree_map(
-          lambda x: jax.lax.with_sharding_constraint(x, replicated_sharding) if isinstance(x, jax.Array) else x, state
+    # Force latents and VAE weights to be fully replicated using with_sharding_constraint,
+    # this speeds up single video latency ~3x
+    if batch_size <= 2:
+      try:
+        mesh = latents.sharding.mesh
+        replicated_sharding = NamedSharding(mesh, P())
+        latents = jax.lax.with_sharding_constraint(latents, replicated_sharding)
+
+        # Replicate VAE weights
+        graphdef, state = nnx.split(self.vae)
+        state = jax.tree_util.tree_map(
+            lambda x: jax.lax.with_sharding_constraint(x, replicated_sharding) if isinstance(x, jax.Array) else x, state
+        )
+        self.vae = nnx.merge(graphdef, state)
+      except Exception:  # pylint: disable=broad-exception-caught
+        max_logging.log("[Tuning] Failed to apply sharding constraint")
+    else:
+      max_logging.log(
+          f"[Tuning] Skipping VAE replication and disabling slicing to prevent HBM OOM for batch_size {batch_size} > 2"
       )
-      self.vae = nnx.merge(graphdef, state)
-    except Exception as e:
-      max_logging.log(f"[Tuning] Failed to apply sharding constraint: {e}")
+      try:
+        # Disable sequential slicing to avoid JAX concatenating 17GB arrays on the TPU
+        self.vae.use_slicing = False
+
+        # Distribute the batch dimension across the existing mesh to ensure topological compatibility
+        mesh = latents.sharding.mesh
+        active_axes = []
+        current_shards = 1
+
+        for axis_name, size in mesh.shape.items():
+          if size > 1 and batch_size % (current_shards * size) == 0:
+            active_axes.append(axis_name)
+            current_shards *= size
+
+        if active_axes:
+          batch_sharding = jax.sharding.NamedSharding(mesh, jax.sharding.PartitionSpec(tuple(active_axes)))
+          latents = jax.lax.with_sharding_constraint(latents, batch_sharding)
+      except Exception:  # pylint: disable=broad-exception-caught
+        max_logging.log("[Tuning] Failed to apply batch sharding constraint to VAE")
 
     if getattr(self.vae.config, "timestep_conditioning", False):
       noise = jax.random.normal(generator, latents.shape, dtype=latents.dtype)
@@ -1587,6 +1638,8 @@ def transformer_forward_pass(
     audio_num_frames,
     fps,
 ):
+  """Forward pass for the transformer."""
+  # pylint: disable=too-many-positional-arguments,unused-argument
   transformer = nnx.merge(graphdef, state)
 
   # Expand timestep to batch size
@@ -1647,6 +1700,8 @@ def run_diffusion_loop(
     scheduler_step,
     logical_axis_rules,
 ):
+  """Runs the diffusion loop."""
+  # pylint: disable=too-many-positional-arguments
   latents_jax = latents_jax.astype(jnp.float32)
   audio_latents_jax = audio_latents_jax.astype(jnp.float32)
   transformer = nnx.merge(graphdef, state)