Offload LTX-2 text encoder to TorchAX and resolve lint issues

mbohlool · mbohlool · commit 7b2888537405 · 2026-05-27T22:01:40.000Z
diff --git a/dependencies/requirements/base_requirements/requirements.txt b/dependencies/requirements/base_requirements/requirements.txt
@@ -35,6 +35,7 @@ tensorflow-datasets
 tensorflow
 tokamax
 tokenizers
+torchax>=0.0.11
 transformers<5.0.0
 
 # pinning torch and torchvision to specific versions to avoid
diff --git a/dependencies/requirements/generated_requirements/requirements.txt b/dependencies/requirements/generated_requirements/requirements.txt
@@ -179,6 +179,7 @@ toml>=0.10.2
 tomlkit>=0.14.0
 toolz>=1.1.0
 torch @ https://download.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp312-cp312-manylinux_2_28_x86_64.whl
+torchax>=0.0.11
 torchvision @ https://download.pytorch.org/whl/cpu/torchvision-0.25.0%2Bcpu-cp312-cp312-manylinux_2_28_x86_64.whl
 tqdm>=4.67.3
 transformers>=4.57.6
diff --git a/src/maxdiffusion/configs/ltx2_video.yml b/src/maxdiffusion/configs/ltx2_video.yml
@@ -103,9 +103,11 @@ skip_first_n_steps_for_profiler: 0
 profiler_steps: 5
 
 replicate_vae: False
-
 use_bwe: False
 
+run_text_encoder_on_tpu: True
+# Dynamically disables VAE slicing and distributes the batch dimension to avoid HBM OOM for larger batch sizes.
+enable_dynamic_vae_sharding: True
 allow_split_physical_axes: False
 learning_rate_schedule_steps: -1
 max_train_steps: 500
diff --git a/src/maxdiffusion/models/ltx2/text_encoders/torchax_text_encoder.py b/src/maxdiffusion/models/ltx2/text_encoders/torchax_text_encoder.py
@@ -0,0 +1,83 @@
+"""
+Copyright 2026 Google LLC
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    https://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+from typing import Tuple
+
+import torch
+import jax
+from torchax import interop, default_env
+
+# --- Original saved here; patched per call in TorchaxGemma3TextEncoder.__call__ to avoid torchax integer tracing bug ---
+import transformers.masking_utils
+
+_orig_sliding_window_overlay = transformers.masking_utils.sliding_window_overlay
+
+
+def _patched_sliding_window_overlay(sliding_window: int):
+  # pylint: disable=unused-argument
+
+  def inner_mask(batch_idx: int, head_idx: int, q_idx: int, kv_idx: int) -> bool:
+    # Explicit sequence-length assumption:
+    # The text-prompt sequence length (typically <= 1024) does not exceed the sliding window size of
+    # Gemma-3 (typically 4096; NOTE(review): confirm against the config of the checkpoint actually loaded).
+    # Under this assumption, the sliding window causal constraint `kv_idx > q_idx - sliding_window`
+    # is always True for all valid query/key indices (0 <= q_idx, kv_idx < seq_len), so it is dropped.
+    #
+    # A 0-dim boolean tensor, `q_idx.new_ones((), dtype=torch.bool)`, is returned instead of evaluating the
+    # comparison, to stay Torchax-compatible during tracing. `sliding_window` is intentionally unused here.
+    # If a future model uses a sequence length exceeding the sliding window, this assumption must be re-evaluated.
+    return q_idx.new_ones((), dtype=torch.bool)
+
+  return inner_mask
+
+
+class TorchaxGemma3TextEncoder(interop.JittableModule):
+  """
+  Torchax `interop.JittableModule` wrapper around a HuggingFace PyTorch Gemma3ForConditionalGeneration
+  text encoder; calling it takes JAX arrays and returns the encoder's hidden states as JAX arrays.
+  """
+
+  def __init__(self, text_encoder):
+    super().__init__(text_encoder, extra_jit_args={"static_argnames": ["output_hidden_states"]})
+
+  def __call__(
+      self, input_ids: jax.Array, attention_mask: jax.Array, output_hidden_states: bool = True
+  ) -> Tuple[jax.Array, ...]:
+    # Swap in the patched sliding_window_overlay for this call only (module-global assignment, no locking)
+    transformers.masking_utils.sliding_window_overlay = _patched_sliding_window_overlay
+    try:
+      with default_env():
+        input_ids = interop.torch_view(input_ids)
+        attention_mask = interop.torch_view(attention_mask)
+
+        output = self.functional_call(
+            self._forward_inner,
+            params=self.params,
+            buffers=self.buffers,
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            output_hidden_states=output_hidden_states,
+        )
+      return interop.jax_view(output)
+    finally:
+      # Always restore the import-time original so other models in the same process see stock masking behavior
+      transformers.masking_utils.sliding_window_overlay = _orig_sliding_window_overlay
+
+  @staticmethod
+  def _forward_inner(model, input_ids, attention_mask, output_hidden_states=True):
+    # Return only `.hidden_states` (a tuple of tensors) rather than the full model output object,
+    # so that interop.jax_view in __call__ can convert the result into a tuple of jax Arrays.
+    return model(input_ids=input_ids, attention_mask=attention_mask, output_hidden_states=output_hidden_states).hidden_states
diff --git a/src/maxdiffusion/pipelines/ltx2/ltx2_pipeline.py b/src/maxdiffusion/pipelines/ltx2/ltx2_pipeline.py
diff --git a/src/maxdiffusion/tests/generate_ltx2_smoke_test.py b/src/maxdiffusion/tests/generate_ltx2_smoke_test.py