Skip to content

Commit 973d8e1

Browse files
Merge pull request #3214 from AI-Hypercomputer:qwen-deepstack
PiperOrigin-RevId: 874794710
2 parents f1fc688 + 7da6a17 commit 973d8e1

6 files changed

Lines changed: 134 additions & 5 deletions

File tree

src/maxtext/configs/models/qwen3-omni-30b-a3b.yml

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -39,6 +39,7 @@ max_position_embeddings: 65536
3939

4040
# General Model Settings
4141
enable_dropout: False
42+
scan_layers: False # deepstack does not support scan_layers
4243

4344
# Vision Encoder Configuration
4445
# Based on https://github.com/huggingface/transformers/blob/main/src/transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py

src/maxtext/configs/types.py

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1928,6 +1928,13 @@ def set_derived_and_validate_values(self) -> "MaxTextConfig":
19281928
if self.steps == -1:
19291929
self.steps = self.learning_rate_schedule_steps
19301930

1931+
# Validate deepstack + scan_layers incompatibility
1932+
if self.deepstack_visual_indexes_for_vit and self.scan_layers:
1933+
raise ValueError(
1934+
"Deepstack visual embedding injection requires scan_layers=False. "
1935+
"Set scan_layers=False in your config to use deepstack features."
1936+
)
1937+
19311938
# Validate WSD learning rate schedule fractions
19321939
if self.lr_schedule_type == LearningRateScheduleType.WSD:
19331940
total_fraction = self.warmup_steps_fraction + self.wsd_decay_steps_fraction

src/maxtext/layers/decoders.py

Lines changed: 32 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -265,6 +265,31 @@ def __call__(
265265
return inputs
266266

267267

268+
def deepstack_process(hidden_states, bidirectional_mask, visual_embeds):
269+
"""Process deepstack visual embeddings by adding them to hidden states at visual token positions.
270+
271+
Args:
272+
hidden_states: [batch, seq_len, hidden_dim] decoder hidden states
273+
bidirectional_mask: [batch, seq_len] boolean mask marking visual token positions
274+
visual_embeds: [batch, num_visual_tokens, hidden_dim] visual features from encoder layer
275+
276+
Returns:
277+
Updated hidden_states with visual features added at visual positions
278+
"""
279+
# Expand mask to [batch, seq_len, 1] for broadcasting
280+
mask_expanded = bidirectional_mask[:, :, jnp.newaxis]
281+
# Use cumsum to map each True position in mask to its index in visual_embeds
282+
visual_token_idx = jnp.cumsum(bidirectional_mask, axis=1) - 1 # [batch, seq_len], 0-indexed
283+
284+
# Gather visual tokens: for each position, get the corresponding visual token
285+
batch_idx = jnp.arange(hidden_states.shape[0])[:, jnp.newaxis] # [batch, 1]
286+
visual_embeds_scattered = visual_embeds[batch_idx, visual_token_idx, :] # [batch, seq_len, hidden]
287+
288+
# Only add where mask is True: hidden_states += visual_embeds * mask
289+
hidden_states = hidden_states + visual_embeds_scattered * mask_expanded
290+
return hidden_states
291+
292+
268293
class Decoder(nn.Module):
269294
"""A stack of decoder layers as a part of an encoder-decoder architecture."""
270295

@@ -722,6 +747,7 @@ def __call__(
722747
attention_metadata=None,
723748
audio_embeddings: None | jnp.ndarray = None,
724749
audio_masks: None | jnp.ndarray = None,
750+
deepstack_visual_embeds: None | list[jnp.ndarray] = None,
725751
):
726752
cfg = self.config
727753
mesh = self.mesh
@@ -939,6 +965,12 @@ def __call__(
939965
if kv_caches is not None and kv_cache is not None:
940966
kv_caches[lyr] = kv_cache
941967

968+
if deepstack_visual_embeds is not None and lyr < len(deepstack_visual_embeds):
969+
visual_embeds = deepstack_visual_embeds[lyr]
970+
# Use bidirectional_mask to identify visual token positions
971+
if bidirectional_mask is not None and visual_embeds is not None:
972+
y = deepstack_process(y, bidirectional_mask, visual_embeds)
973+
942974
assert isinstance(y, jax.Array)
943975

944976
# After the final transformer layer, `y` holds the raw, un-normalized hidden state.

src/maxtext/layers/encoders.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -65,7 +65,6 @@ def __call__(self, input_images, deterministic=False):
6565
# vision encoder output, frozen params in many cases
6666
encoder = getattr(self, self.encoder_name)
6767
encoder_output = encoder(input_images, deterministic=deterministic)
68-
6968
deep_feats = None
7069
if isinstance(encoder_output, tuple):
7170
embeddings = encoder_output[0]
@@ -75,6 +74,8 @@ def __call__(self, input_images, deterministic=False):
7574

7675
if self.config.freeze_vision_encoder_params:
7776
embeddings = jax.lax.stop_gradient(embeddings)
77+
if deep_feats is not None:
78+
deep_feats = [jax.lax.stop_gradient(feat) for feat in deep_feats]
7879

7980
# vision embedder / projection layer, not frozen in most cases, trained / finetuned together with main model
8081
projector = getattr(self, self.projector_name)

src/maxtext/models/models.py

Lines changed: 10 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -151,10 +151,12 @@ def __call__(
151151
bidirectional_mask = None
152152
image_embeddings = None
153153
audio_embeddings = None
154+
deepstack_visual_embeds = None
154155

155156
if self.config.use_multimodal and encoder_images is not None:
156-
# qwen3-omni-30b-a3b returns deep features from the vision encoder.
157-
image_embeddings, _ = self.vision_encoder(input_images=encoder_images, deterministic=not enable_dropout)
157+
image_embeddings, deepstack_visual_embeds = self.vision_encoder(
158+
input_images=encoder_images, deterministic=not enable_dropout
159+
)
158160
bidirectional_mask = mm_processor.get_bidirectional_mask_vision(self.config, decoder_input_tokens)
159161

160162
if self.config.use_multimodal and encoder_audios is not None and self.audio_encoder is not None:
@@ -182,6 +184,7 @@ def __call__(
182184
audio_masks=audio_masks,
183185
kv_caches=kv_caches,
184186
attention_metadata=attention_metadata,
187+
deepstack_visual_embeds=deepstack_visual_embeds,
185188
)
186189

187190
# If we are initializing the model AND MTP is enabled, we must create
@@ -458,8 +461,11 @@ def __call__(
458461

459462
bidirectional_mask = None
460463
image_embeddings = None
464+
deepstack_visual_embeds = None
461465
if self.config.use_multimodal and encoder_images is not None:
462-
image_embeddings, _ = self.vision_encoder(input_images=encoder_images, deterministic=not enable_dropout)
466+
image_embeddings, deepstack_visual_embeds = self.vision_encoder(
467+
input_images=encoder_images, deterministic=not enable_dropout
468+
)
463469
bidirectional_mask = mm_processor.get_bidirectional_mask_vision(self.config, decoder_input_tokens)
464470

465471
audio_embeddings = None
@@ -488,6 +494,7 @@ def __call__(
488494
audio_masks=audio_masks,
489495
kv_caches=kv_caches,
490496
attention_metadata=attention_metadata,
497+
deepstack_visual_embeds=deepstack_visual_embeds,
491498
)
492499

493500
# Materialize hidden state when vocab tiling is enabled

tests/unit/qwen3_omni_layers_test.py

Lines changed: 82 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -26,15 +26,16 @@
2626
import jax.numpy as jnp
2727
from jax.sharding import Mesh
2828
from MaxText import common_types
29-
from MaxText import maxengine
3029
from MaxText import pyconfig
3130
from MaxText.globals import MAXTEXT_REPO_ROOT
31+
from maxtext.inference.maxengine import maxengine
3232
from maxtext.layers.attentions import Attention
3333
from maxtext.layers.embeddings import (
3434
PositionalEmbedding,
3535
Qwen3OmniMoeVisionPosEmbedInterpolate as JaxQwen3OmniMoeVisionPosEmbedInterpolate,
3636
Qwen3OmniMoeVisionRotaryEmbedding as JaxQwen3OmniMoeVisionRotaryEmbedding,
3737
)
38+
from maxtext.layers.decoders import deepstack_process
3839
from maxtext.layers.encoders import AudioEncoder
3940
from maxtext.models.qwen3 import (
4041
Qwen3OmniAudioEncoder,
@@ -579,6 +580,86 @@ def test_vision_encoder_single_image(self):
579580
)
580581

581582

583+
class TestDeepstackProcess(unittest.TestCase):
584+
"""Tests for deepstack_process.
585+
586+
Adds deepstack visual embeddings into decoder hidden states at the
587+
positions indicated by the bidirectional mask (visual token positions).
588+
"""
589+
590+
def test_adds_only_at_visual_positions(self):
591+
"""Visual embeddings should be added at True mask positions and nowhere else."""
592+
batch, seq_len, hidden_dim = 2, 8, 4
593+
hidden_states = jnp.zeros((batch, seq_len, hidden_dim))
594+
# positions 1, 3, 5 are visual for both batch items (3 visual tokens each)
595+
mask = jnp.array(
596+
[
597+
[False, True, False, True, False, True, False, False],
598+
[False, True, False, True, False, True, False, False],
599+
]
600+
)
601+
visual_embeds = jnp.ones((batch, 3, hidden_dim))
602+
603+
result = deepstack_process(hidden_states, mask, visual_embeds)
604+
605+
for b in range(batch):
606+
for pos in [1, 3, 5]:
607+
np.testing.assert_allclose(np.array(result[b, pos]), np.ones(hidden_dim), err_msg=f"batch={b} pos={pos}")
608+
for pos in [0, 2, 4, 6, 7]:
609+
np.testing.assert_allclose(np.array(result[b, pos]), np.zeros(hidden_dim), err_msg=f"batch={b} pos={pos}")
610+
611+
def test_visual_tokens_mapped_in_order(self):
612+
"""Each visual embed should be added to the corresponding visual position in cumsum order."""
613+
batch, seq_len, hidden_dim = 1, 6, 2
614+
hidden_states = jnp.zeros((batch, seq_len, hidden_dim))
615+
mask = jnp.array([[False, True, False, True, False, False]])
616+
# two distinct visual tokens, a third token that won't be used
617+
visual_embeds = jnp.array([[[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]]])
618+
619+
result = deepstack_process(hidden_states, mask, visual_embeds)
620+
621+
# 1st visual position → visual_embeds[0, 0]
622+
np.testing.assert_allclose(np.array(result[0, 1]), [1.0, 2.0])
623+
# 2nd visual position → visual_embeds[0, 1]
624+
np.testing.assert_allclose(np.array(result[0, 3]), [3.0, 4.0])
625+
# non-visual positions untouched
626+
for pos in [0, 2, 4, 5]:
627+
np.testing.assert_allclose(np.array(result[0, pos]), [0.0, 0.0])
628+
629+
def test_matches_reference_scatter(self):
630+
"""Output must match a reference numpy loop that scatters visual embeds by position."""
631+
batch, seq_len, hidden_dim, num_visual = 2, 10, 8, 4
632+
np.random.seed(0)
633+
634+
hidden_np = np.random.randn(batch, seq_len, hidden_dim).astype(np.float32)
635+
mask_np = np.zeros((batch, seq_len), dtype=bool)
636+
mask_np[:, [1, 3, 5, 7]] = True # 4 visual tokens per batch item
637+
visual_np = np.random.randn(batch, num_visual, hidden_dim).astype(np.float32)
638+
639+
# Reference: per-batch scatter
640+
expected = hidden_np.copy()
641+
for b in range(batch):
642+
vi = 0
643+
for s in range(seq_len):
644+
if mask_np[b, s]:
645+
expected[b, s] += visual_np[b, vi]
646+
vi += 1
647+
648+
result = deepstack_process(jnp.array(hidden_np), jnp.array(mask_np), jnp.array(visual_np))
649+
np.testing.assert_allclose(np.array(result), expected, rtol=1e-5, atol=1e-5)
650+
651+
def test_hidden_states_unchanged_without_visual_tokens(self):
652+
"""When mask is all-False, hidden states should be returned unchanged."""
653+
batch, seq_len, hidden_dim = 2, 6, 4
654+
np.random.seed(1)
655+
hidden_np = np.random.randn(batch, seq_len, hidden_dim).astype(np.float32)
656+
mask = jnp.zeros((batch, seq_len), dtype=bool)
657+
visual_embeds = jnp.ones((batch, 1, hidden_dim))
658+
659+
result = deepstack_process(jnp.array(hidden_np), mask, visual_embeds)
660+
np.testing.assert_allclose(np.array(result), hidden_np, rtol=1e-6, atol=1e-6)
661+
662+
582663
class TestQwen3OmniPreprocessing(unittest.TestCase):
583664
"""Test MaxText Qwen3 Omni preprocessor against HuggingFace reference."""
584665

0 commit comments

Comments (0)