Merge pull request #3995 from AI-Hypercomputer:hengtaoguo-va

Google-ML-Automation · Google-ML-Automation · commit 8c4dced3875d · 2026-05-29T14:39:00.000-07:00
PiperOrigin-RevId: 923605138
diff --git a/src/maxtext/common/common_types.py b/src/maxtext/common/common_types.py
@@ -80,9 +80,12 @@ class MultimodalInput:
 
   image_embeddings: Array | None = None
   image_masks: Array | None = None
+  video_embeddings: Array | None = None
+  video_masks: Array | None = None
   audio_embeddings: Array | None = None
   audio_masks: Array | None = None
   bidirectional_mask: Array | None = None
+  bidirectional_mask_video: Array | None = None
 
 
 class DecoderBlockType(enum.Enum):
diff --git a/src/maxtext/inference/decode.py b/src/maxtext/inference/decode.py
@@ -184,6 +184,7 @@ def main(argv: Sequence[str]) -> None:
           mrope_deltas=mrope_position_deltas,
           images=processor_outputs.pixel_values if config.use_multimodal else None,
           image_masks=processor_outputs.pixel_mask if config.use_multimodal and "llama4" in config.model_name else None,
+          videos=getattr(processor_outputs, "video_values", None) if config.use_multimodal else None,
           audio_values=processor_outputs.audio_values if config.use_audio else None,
           audio_masks=processor_outputs.audio_mask if config.use_audio else None,
           true_length=true_length,
diff --git a/src/maxtext/inference/maxengine/maxengine.py b/src/maxtext/inference/maxengine/maxengine.py
@@ -408,6 +408,8 @@ def _prefill_jit(
       mrope_deltas: jax.Array | None = None,
       images: jax.Array | None = None,
       image_masks: jax.Array | None = None,
+      videos: jax.Array | None = None,
+      video_masks: jax.Array | None = None,
       audio_values: jax.Array | None = None,
       audio_masks: jax.Array | None = None,
       true_length: int,
@@ -504,6 +506,8 @@ def _prefill_jit(
           positions,
           encoder_images=images,
           encoder_image_masks=image_masks,
+          encoder_videos=videos,
+          encoder_video_masks=video_masks,
           encoder_audios=audio_values,
           decoder_segment_ids=sequence_indicator,
           enable_dropout=False,
@@ -586,6 +590,8 @@ def prefill(
       mrope_deltas: jax.Array | None = None,
       images: jax.Array | None = None,
       image_masks: jax.Array | None = None,
+      videos: jax.Array | None = None,
+      video_masks: jax.Array | None = None,
       audio_values: jax.Array | None = None,
       audio_masks: jax.Array | None = None,
       true_length: int,
@@ -617,6 +623,8 @@ def prefill(
         mrope_deltas=mrope_deltas,
         images=images,
         image_masks=image_masks,
+        videos=videos,
+        video_masks=video_masks,
         audio_values=audio_values,
         audio_masks=audio_masks,
         sampler=sampler,
diff --git a/src/maxtext/layers/decoders.py b/src/maxtext/layers/decoders.py
@@ -642,6 +642,9 @@ def _apply_embedding(
       image_embeddings = multimodal_input.image_embeddings
       bidirectional_mask = multimodal_input.bidirectional_mask
       image_masks = multimodal_input.image_masks
+      video_embeddings = getattr(multimodal_input, "video_embeddings", None)
+      video_masks = getattr(multimodal_input, "video_masks", None)
+      bidirectional_mask_video = getattr(multimodal_input, "bidirectional_mask_video", None)
       audio_embeddings = multimodal_input.audio_embeddings
       audio_masks = multimodal_input.audio_masks
 
@@ -669,6 +672,17 @@ def _apply_embedding(
         else:
           raise ValueError(f"Unsupported model_name for multimodal: {cfg.model_name}")
 
+      if video_embeddings is not None and cfg.use_multimodal:
+        if cfg.model_name in ["qwen3-omni-30b-a3b", "qwen3.5-397b-a17b"]:
+          y = mm_utils.merge_mm_embeddings(
+              text_embeddings=y,
+              multimodal_embeddings=video_embeddings,
+              mask=bidirectional_mask_video,
+              token_masks=video_masks,
+          )
+        else:
+          raise ValueError(f"Unsupported model_name for video: {cfg.model_name}")
+
       if audio_embeddings is not None and cfg.use_audio:
         if cfg.model_name in ["qwen3-omni-30b-a3b"]:
           y = mm_utils.merge_mm_embeddings(
diff --git a/src/maxtext/models/models.py b/src/maxtext/models/models.py
@@ -127,6 +127,8 @@ def __call__(
       decoder_segment_ids=None,
       encoder_images: None | jnp.ndarray = None,
       encoder_image_masks: None | jnp.ndarray = None,
+      encoder_videos: None | jnp.ndarray = None,
+      encoder_video_masks: None | jnp.ndarray = None,
       encoder_audios: None | jnp.ndarray = None,
       enable_dropout=True,
       model_mode=MODEL_MODE_TRAIN,
@@ -153,17 +155,28 @@ def __call__(
           f" which is always {DECODING_ACTIVE_SEQUENCE_INDICATOR}."
       )
 
-    bidirectional_mask = None
+    bidirectional_mask_image = None
+    bidirectional_mask_video = None
     image_embeddings = None
+    video_embeddings = None
     audio_embeddings = None
     deepstack_visual_embeds = None
 
     if self.config.use_multimodal and encoder_images is not None:
       image_embeddings, deepstack_visual_embeds = self.vision_encoder(
           input_images=encoder_images, deterministic=not enable_dropout
       )
+      bidirectional_mask_image = mm_processor.get_bidirectional_mask_vision(
+          self.config, decoder_input_tokens, is_video=False
+      )
 
-      bidirectional_mask = mm_processor.get_bidirectional_mask_vision(self.config, decoder_input_tokens)
+    if self.config.use_multimodal and encoder_videos is not None:
+      video_embeddings, deepstack_visual_embeds = self.vision_encoder(
+          input_images=encoder_videos, deterministic=not enable_dropout
+      )
+      bidirectional_mask_video = mm_processor.get_bidirectional_mask_vision(
+          self.config, decoder_input_tokens, is_video=True
+      )
 
     if self.config.use_multimodal and encoder_audios is not None and self.audio_encoder is not None:
       audio_embeddings = self.audio_encoder(input_audio=encoder_audios, deterministic=not enable_dropout)
@@ -174,13 +187,16 @@ def __call__(
       audio_masks = mm_processor.get_bidirectional_mask_audio(self.config, decoder_input_tokens)
 
     multimodal_input = None
-    if image_embeddings is not None or audio_embeddings is not None:
+    if image_embeddings is not None or video_embeddings is not None or audio_embeddings is not None:
       multimodal_input = MultimodalInput(
           image_embeddings=image_embeddings,
           image_masks=encoder_image_masks,
+          video_embeddings=video_embeddings,
+          video_masks=encoder_video_masks,
           audio_embeddings=audio_embeddings,
           audio_masks=audio_masks,
-          bidirectional_mask=bidirectional_mask,
+          bidirectional_mask=bidirectional_mask_image,
+          bidirectional_mask_video=bidirectional_mask_video,
       )
 
     logits, hidden_state, kv_caches = self.decoder(
@@ -425,6 +441,8 @@ def __call__(
       cache=None,
       encoder_images: jax.Array | None = None,
       encoder_image_masks: jax.Array | None = None,
+      encoder_videos: jax.Array | None = None,
+      encoder_video_masks: jax.Array | None = None,
       encoder_audios: jax.Array | None = None,
       enable_dropout=True,
       model_mode=MODEL_MODE_TRAIN,
@@ -466,16 +484,28 @@ def __call__(
           f" which is always {DECODING_ACTIVE_SEQUENCE_INDICATOR}."
       )
 
-    bidirectional_mask = None
+    bidirectional_mask_image = None
+    bidirectional_mask_video = None
     image_embeddings = None
+    video_embeddings = None
+    audio_embeddings = None
     deepstack_visual_embeds = None
     if self.config.use_multimodal and encoder_images is not None:
       image_embeddings, deepstack_visual_embeds = self.vision_encoder(
           input_images=encoder_images, deterministic=not enable_dropout
       )
-      bidirectional_mask = mm_processor.get_bidirectional_mask_vision(self.config, decoder_input_tokens)
+      bidirectional_mask_image = mm_processor.get_bidirectional_mask_vision(
+          self.config, decoder_input_tokens, is_video=False
+      )
+
+    if self.config.use_multimodal and encoder_videos is not None:
+      video_embeddings, deepstack_visual_embeds = self.vision_encoder(
+          input_images=encoder_videos, deterministic=not enable_dropout
+      )
+      bidirectional_mask_video = mm_processor.get_bidirectional_mask_vision(
+          self.config, decoder_input_tokens, is_video=True
+      )
 
-    audio_embeddings = None
     if self.config.use_multimodal and encoder_audios is not None and self.audio_encoder is not None:
       audio_embeddings = self.audio_encoder(input_audio=encoder_audios, deterministic=not enable_dropout)
 
@@ -485,13 +515,16 @@ def __call__(
       audio_masks = mm_processor.get_bidirectional_mask_audio(self.config, decoder_input_tokens)
 
     multimodal_input = None
-    if image_embeddings is not None or audio_embeddings is not None:
+    if image_embeddings is not None or video_embeddings is not None or audio_embeddings is not None:
       multimodal_input = MultimodalInput(
           image_embeddings=image_embeddings,
           image_masks=encoder_image_masks,
+          video_embeddings=video_embeddings,
+          video_masks=encoder_video_masks,
           audio_embeddings=audio_embeddings,
           audio_masks=audio_masks,
-          bidirectional_mask=bidirectional_mask,
+          bidirectional_mask=bidirectional_mask_image,
+          bidirectional_mask_video=bidirectional_mask_video,
       )
 
     mutable_collections = []
diff --git a/src/maxtext/multimodal/processor.py b/src/maxtext/multimodal/processor.py
@@ -207,7 +207,7 @@ def get_dummy_audio_shape_for_init(config):
   return audio_shape
 
 
-def get_bidirectional_mask_vision(config, decoder_input_tokens):
+def get_bidirectional_mask_vision(config, decoder_input_tokens, is_video: bool = False):
   """Get the bidirectional mask for specific models."""
   bidirectional_mask_vision = None
   if config.model_name in ["gemma3-4b", "gemma3-12b", "gemma3-27b"]:
@@ -225,11 +225,10 @@ def get_bidirectional_mask_vision(config, decoder_input_tokens):
   elif config.model_name in ["qwen3-omni-30b-a3b", "qwen3.5-397b-a17b"]:
     from maxtext.multimodal.processor_qwen3_omni import QWEN3_OMNI_IMAGE_TOKEN, QWEN3_OMNI_VIDEO_TOKEN  # pylint: disable=import-outside-toplevel
 
-    # Create bidirectional_mask for vision/video token merging
-    bidirectional_mask_vision = (decoder_input_tokens == QWEN3_OMNI_IMAGE_TOKEN) | (
-        decoder_input_tokens == QWEN3_OMNI_VIDEO_TOKEN
-    )
-    # Create image/video mask for deepstack visual embedding injection
+    if is_video:
+      bidirectional_mask_vision = decoder_input_tokens == QWEN3_OMNI_VIDEO_TOKEN
+    else:
+      bidirectional_mask_vision = decoder_input_tokens == QWEN3_OMNI_IMAGE_TOKEN
   return bidirectional_mask_vision
 
 
diff --git a/src/maxtext/multimodal/processor_qwen3_omni.py b/src/maxtext/multimodal/processor_qwen3_omni.py
@@ -472,6 +472,12 @@ def _np_extract_fbank_features(waveform_batch: np.ndarray) -> np.ndarray:
 
 def pre_process_audio_qwen3_omni(audio_array):
   """Preprocess audio for Qwen3-Omni model."""
+  chunk_samples = 16000  # hop_length (160) * chunk_size (100)
+  remainder = len(audio_array) % chunk_samples
+  if remainder > 0:
+    padding_size = chunk_samples - remainder
+    audio_array = np.pad(audio_array, (0, padding_size), mode="constant")
+
   audio_features = np.expand_dims(audio_array, axis=0)  # Add batch dimension
   audio_features = _np_extract_fbank_features(audio_features)
   audio_features_mask = np.ones((audio_features.shape[0], audio_features.shape[2]), dtype=np.int32)
@@ -532,7 +538,17 @@ def preprocess_mm_data_qwen3_omni(config):
   if config.video_path:
     video_array, _ = _read_video_decord(config.video_path)
     video_processed, video_grid_thw = preprocess_video(video_array, config)
-    processor_outputs.video_values = video_processed
+    video_values = np.reshape(
+        video_processed,
+        (
+            1,
+            config.num_channels_for_vit,
+            config.temporal_patch_size_for_vit * video_grid_thw[0, 0],
+            config.patch_size_for_vit * video_grid_thw[0, 1],
+            config.patch_size_for_vit * video_grid_thw[0, 2],
+        ),
+    )
+    processor_outputs.video_values = video_values
     processor_outputs.video_grid_thw = video_grid_thw
     processor_outputs.video_second_per_grid = np.asarray([config.temporal_patch_size_for_vit], dtype=np.float32)
     processor_outputs.num_videos = 1  # Only one video for now.
@@ -1143,6 +1159,9 @@ def get_mm_offsets_qwen3_omni(config, processor_output):
   if processor_output.audio_lengths is not None:
     audio_lengths = processor_output.audio_lengths
     for audio_len in audio_lengths:
-      total_offset += int(audio_len) - 1  # -1 for the original <|audio_pad|> token
+      if getattr(config, "use_audio_in_video", False):
+        total_offset += int(audio_len) + 2  # +2 for <|audio_start|> and <|audio_end|>, no <|audio_pad|> to remove
+      else:
+        total_offset += int(audio_len) - 1  # -1 for the original <|audio_pad|> token
 
   return total_offset
diff --git a/tests/unit/qwen3_omni_layers_test.py b/tests/unit/qwen3_omni_layers_test.py
@@ -551,7 +551,9 @@ def test_vision_encoder_single_image(self):
     grid_thw = np.array([[1, h, w]], dtype=np.int64)
     grid_thw_torch = torch.from_numpy(grid_thw)
 
-    torch_output, torch_deep_feats = torch_encoder(torch_hidden_states, grid_thw_torch)
+    torch_encoder_output = torch_encoder(torch_hidden_states, grid_thw_torch)
+    torch_output = torch_encoder_output.pooler_output
+    torch_deep_feats = torch_encoder_output.deepstack_features
     jax_encoder_output, jax_deep_feats = jax_encoder(jax_hidden_states)
     jax_output = jax_projector(jax_encoder_output)
 
@@ -561,8 +563,8 @@ def test_vision_encoder_single_image(self):
     assert_all_close_jax_torch(
         jax_output,
         torch_output,
-        rtol=1e-2,
-        atol=1e-2,
+        rtol=1.5e-2,
+        atol=1.5e-2,
         error_msg="Vision encoder final output differs",
     )
 
@@ -576,8 +578,8 @@ def test_vision_encoder_single_image(self):
       assert_all_close_jax_torch(
           jax_feat,
           torch_feat,
-          rtol=1e-2,
-          atol=1e-2,
+          rtol=1.5e-2,
+          atol=1.5e-2,
           error_msg=f"Deep feature {i} differs",
       )
 
@@ -722,6 +724,16 @@ def test_preprocess_mm_data(self):
     USE_AUDIO_IN_VIDEO = True
     hf_prompt = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
     audios, images, videos = process_mm_info(conversation, use_audio_in_video=USE_AUDIO_IN_VIDEO)
+    if audios is not None:
+      padded_audios = []
+      for audio in audios:
+        chunk_samples = 16000
+        remainder = len(audio) % chunk_samples
+        if remainder > 0:
+          padding_size = chunk_samples - remainder
+          audio = np.pad(audio, (0, padding_size), mode="constant")
+        padded_audios.append(audio)
+      audios = padded_audios
     hf_processor_outputs = processor(
         text=hf_prompt,
         audio=audios,
@@ -749,9 +761,11 @@ def test_preprocess_mm_data(self):
         rtol=1e-2,
         atol=1e-2,
     )
+    hf_pixel_values_videos = np.array(hf_processor_outputs["pixel_values_videos"]).astype(np.float32)
+    mt_video_values = np.array(mt_processor_outputs.video_values).reshape(hf_pixel_values_videos.shape)
     assert np.allclose(
-        mt_processor_outputs.video_values,
-        np.array(hf_processor_outputs["pixel_values_videos"]).astype(np.float32),
+        mt_video_values,
+        hf_pixel_values_videos,
         rtol=5e-2,
         atol=5e-2,
     )