Qwen3-Omni SFT+Eval

hengtaoguo · hengtaoguo · commit b277c408652c · 2026-05-20T17:59:52.000Z
diff --git a/benchmarks/multimodal/multimodal_eval.py b/benchmarks/multimodal/multimodal_eval.py
@@ -150,6 +150,13 @@ def construct_prompt(
         question=parsed_dataset_example.question,
         choices=choices_text if choices_text else "N/A",
     )
+    if config.use_multimodal and "qwen3-omni" in config.model_name:
+      prompt = mm_processor.reformat_prompt(
+          prompt,
+          image_placeholder,
+          config.model_name,
+          num_images=1,
+      )
   elif local_args.ckpt_type == "sft":
     prompt = mm_processor.reformat_prompt(
         parsed_dataset_example.question,
@@ -200,11 +207,31 @@ def main(config, local_args):
     print("\n" + "*" * 50)
 
     # Tokenize the input
-    tokens, true_length = tokenizer.encode(prompt, is_bos=True, prefill_lengths=[prefill_length])
+    is_bos = config.add_bos and getattr(tokenizer, "bos_id", None) is not None
+    tokens, true_length = tokenizer.encode(prompt, is_bos=is_bos, prefill_lengths=[prefill_length])
+    position_ids = None
+    mrope_position_deltas = None
+
     if config.use_multimodal:
       tokens = mm_processor.prepare_text_for_image_fusion(tokens=tokens, config=config, processor_output=processor_output)
       image_offsets = mm_processor.get_image_offsets(config=config, processor_output=processor_output)
       true_length += image_offsets
+
+      if config.use_mrope:
+        from maxtext.multimodal import processor_qwen3_omni  # pylint: disable=import-outside-toplevel
+
+        position_ids, mrope_position_deltas = processor_qwen3_omni.get_rope_index(
+            input_ids=tokens[np.newaxis, :],  # Add batch dimension for processing
+            image_grid_thw=processor_output.pixel_grid_thw,  # pytype: disable=attribute-error
+            video_grid_thw=processor_output.video_grid_thw,  # pytype: disable=attribute-error
+            attention_mask=np.ones_like(tokens)[np.newaxis, :],
+            use_audio_in_video=config.use_audio and getattr(processor_output, "num_videos", 0) > 0,
+            audio_lengths=processor_output.audio_lengths,  # pytype: disable=attribute-error
+            second_per_grids=processor_output.video_second_per_grid,  # pytype: disable=attribute-error
+            spatial_merge_size=config.spatial_merge_size_for_vit,  # pytype: disable=attribute-error
+            position_id_per_seconds=config.position_id_per_seconds,
+        )
+
     if true_length > max_prefill_predict_length:
       max_logging.log(
           f"Warning: Prompt length {true_length} exceeds max prefill length" f" {max_prefill_predict_length}. Truncating."
@@ -216,7 +243,18 @@ def main(config, local_args):
 
     # Perform prefill
     prefill_result, first_token = engine.prefill(
-        params=params, padded_tokens=tokens, images=processor_output.pixel_values, true_length=true_length
+        params=params,
+        padded_tokens=tokens,
+        positions=position_ids,
+        mrope_deltas=mrope_position_deltas,
+        images=processor_output.pixel_values if config.use_multimodal else None,
+        image_masks=getattr(processor_output, "pixel_mask", None)
+        if config.use_multimodal and "llama4" in config.model_name
+        else None,
+        audio_values=getattr(processor_output, "audio_values", None) if config.use_audio else None,
+        audio_masks=getattr(processor_output, "audio_mask", None) if config.use_audio else None,
+        true_length=true_length,
+        slot=0,
     )
     slot = 0
 
@@ -243,8 +281,9 @@ def main(config, local_args):
         break
 
     correct_answer = parsed_dataset_example.answer
+    # If answer parsing fails, fall back to the raw output as the predicted answer for the correctness check.
     if predicted_answer == utils_rl.FALLBACK_ANSWER:
-      predicted_answer = utils_rl.extract_answer(output, tmvp_config)
+      predicted_answer = output
 
     exact_correct, _ = utils_rl.check_correctness(predicted_answer, [correct_answer], tmvp_config)
     is_correct = exact_correct
diff --git a/src/maxtext/input_pipeline/hf_data_processing.py b/src/maxtext/input_pipeline/hf_data_processing.py
@@ -118,6 +118,7 @@ def vision_sft_preprocessing_pipeline(
       add_eos_token=False,
       legacy=False,
       token=config.hf_access_token,
+      extra_special_tokens={},
   )
   pad_id = _get_pad_id(tokenizer)
 
@@ -256,6 +257,7 @@ def preprocessing_pipeline(
       add_eos_token=add_eos if not use_sft else False,
       legacy=False,
       token=hf_access_token,
+      extra_special_tokens={},
   )
 
   dataset = dataset.select_columns(data_column_names)
diff --git a/src/maxtext/input_pipeline/input_pipeline_utils.py b/src/maxtext/input_pipeline/input_pipeline_utils.py
@@ -745,6 +745,9 @@ def _pad_image_and_mask(self, preprocessed_image: mm_utils.PreprocessorOutput) -
     if preprocessed_image.pixel_values is None:
       raise ValueError("Input preprocessed_image must have pixel_values to pad images.")
 
+    if self.config.model_name and self.config.model_name.startswith("qwen3-omni"):
+      return preprocessed_image
+
     # Determine the maximum number of images/masks allowed.
     image_offsets = mm_processor.get_image_offsets(self.config, preprocessed_image)
     single_image_offset = image_offsets // preprocessed_image.pixel_values.shape[0]
diff --git a/src/maxtext/multimodal/processor.py b/src/maxtext/multimodal/processor.py
@@ -68,6 +68,10 @@ def preprocess_image_for_training(image, model_name):
     from maxtext.multimodal.processor_llama4 import preprocess_mm_data_llama4  # pylint: disable=import-outside-toplevel
 
     return preprocess_mm_data_llama4(image)
+  elif model_name in ["qwen3-omni-30b-a3b"]:
+    from maxtext.multimodal.processor_qwen3_omni import preprocess_mm_data_qwen3_omni_for_training  # pylint: disable=import-outside-toplevel
+
+    return preprocess_mm_data_qwen3_omni_for_training(image)
   else:
     raise ValueError(f"Model {model_name} not supported for image preprocessing.")
 
diff --git a/src/maxtext/multimodal/processor_qwen3_omni.py b/src/maxtext/multimodal/processor_qwen3_omni.py
@@ -122,7 +122,7 @@ def smart_resize(
   return h_bar, w_bar
 
 
-def pre_process_qwen3_image(image: np.ndarray | list[np.ndarray], config):
+def pre_process_qwen3_image(image: np.ndarray | list[np.ndarray], config, force_resize=None):
   """Performs a bi-linear resize (with anti-aliasing) and normalizes the image."""
   patch_size = config.patch_size_for_vit
   merge_size = config.spatial_merge_size_for_vit
@@ -135,23 +135,27 @@ def pre_process_qwen3_image(image: np.ndarray | list[np.ndarray], config):
 
   for img in images_in:
     pil_img = Image.fromarray(img)
-    # Qwen3-Omni performs one resize during fetch_image and another resize before patchify.
-    resized_height_1, resized_width_1 = smart_resize(
-        height=img.shape[0],
-        width=img.shape[1],
-        factor=IMAGE_FACTOR,
-        min_pixels=MIN_PIXELS,
-        max_pixels=MAX_PIXELS,
-    )
-    pil_img = pil_img.resize((resized_width_1, resized_height_1))
-    resized_height_2, resized_width_2 = smart_resize(
-        height=resized_height_1,
-        width=resized_width_1,
-        factor=patch_size * merge_size,
-        min_pixels=MIN_PIXELS,
-        max_pixels=MAX_PIXELS,
-    )
-    resized_img_pil = pil_img.resize((resized_width_2, resized_height_2), resample=resample_method)
+    if force_resize is not None:
+      resized_height_2, resized_width_2 = force_resize
+      resized_img_pil = pil_img.resize((resized_width_2, resized_height_2), resample=resample_method)
+    else:
+      # Qwen3-Omni performs one resize during fetch_image and another resize before patchify.
+      resized_height_1, resized_width_1 = smart_resize(
+          height=img.shape[0],
+          width=img.shape[1],
+          factor=IMAGE_FACTOR,
+          min_pixels=MIN_PIXELS,
+          max_pixels=MAX_PIXELS,
+      )
+      pil_img = pil_img.resize((resized_width_1, resized_height_1))
+      resized_height_2, resized_width_2 = smart_resize(
+          height=resized_height_1,
+          width=resized_width_1,
+          factor=patch_size * merge_size,
+          min_pixels=MIN_PIXELS,
+          max_pixels=MAX_PIXELS,
+      )
+      resized_img_pil = pil_img.resize((resized_width_2, resized_height_2), resample=resample_method)
     resized_img_np = np.array(resized_img_pil).astype(np.float32)
 
     img_np = mm_utils.normalize_images(resized_img_np, mean=IMAGE_MEAN, std=IMAGE_STD)
@@ -474,6 +478,35 @@ def pre_process_audio_qwen3_omni(audio_array):
   return audio_features, audio_features_mask
 
 
+def preprocess_mm_data_qwen3_omni_for_training(images):
+  """Force-resizes image(s) to QWEN3_OMNI_IMAGE_SIZE and packs them for Qwen3-Omni SFT using default model constants."""
+
+  class _DefaultConfig:  # stand-in for the config object that pre_process_qwen3_image reads ViT sizes from
+    patch_size_for_vit = 16  # NOTE(review): hard-coded; confirm it matches the model's configured value
+    spatial_merge_size_for_vit = 2  # NOTE(review): hard-coded; confirm it matches the model's configured value
+    temporal_patch_size_for_vit = QWEN3_TEMPORAL_PATCH_SIZE
+
+  images_in = [images] if isinstance(images, np.ndarray) else images  # wrap a lone ndarray into a list
+  pixel_values, pixel_grid_thw = pre_process_qwen3_image(  # force_resize takes the fixed-size branch, not smart_resize
+      images_in, _DefaultConfig(), force_resize=(QWEN3_OMNI_IMAGE_SIZE, QWEN3_OMNI_IMAGE_SIZE)
+  )
+  pixel_values = np.reshape(  # 5-D target shape; row 0 of pixel_grid_thw is applied to every image
+      pixel_values,
+      (
+          len(images_in),
+          3,  # num_channels_for_vit
+          _DefaultConfig.temporal_patch_size_for_vit * pixel_grid_thw[0, 0],
+          _DefaultConfig.patch_size_for_vit * pixel_grid_thw[0, 1],
+          _DefaultConfig.patch_size_for_vit * pixel_grid_thw[0, 2],
+      ),
+  )
+  return Qwen3OmniPreprocessorOutput(
+      num_images=len(images_in),
+      pixel_values=pixel_values,
+      pixel_grid_thw=pixel_grid_thw,
+  )
+
+
 def preprocess_mm_data_qwen3_omni(config):
   """Placeholder for multimodal data preprocessing."""
   processor_outputs = Qwen3OmniPreprocessorOutput()

Original file line number	Diff line number	Diff line change
`@@ -118,6 +118,7 @@ def vision_sft_preprocessing_pipeline(`
`118`	`118`	`add_eos_token=False,`
`119`	`119`	`legacy=False,`
`120`	`120`	`token=config.hf_access_token,`
	`121`	`+ extra_special_tokens={},`
`121`	`122`	`)`
`122`	`123`	`pad_id = _get_pad_id(tokenizer)`
`123`	`124`
`@@ -256,6 +257,7 @@ def preprocessing_pipeline(`
`256`	`257`	`add_eos_token=add_eos if not use_sft else False,`
`257`	`258`	`legacy=False,`
`258`	`259`	`token=hf_access_token,`
	`260`	`+ extra_special_tokens={},`
`259`	`261`	`)`
`260`	`262`
`261`	`263`	`dataset = dataset.select_columns(data_column_names)`