Merge pull request #4011 from AI-Hypercomputer:multimodal_gemma3

Google-ML-Automation · Google-ML-Automation · commit f0842cabeb60 · 2026-05-29T11:36:53.000-07:00
PiperOrigin-RevId: 923520359
diff --git a/src/maxtext/checkpoint_conversion/to_maxtext.py b/src/maxtext/checkpoint_conversion/to_maxtext.py
@@ -924,6 +924,21 @@ def main(
       max_logging.log(f"HuggingFace model loaded. dtypes: {unique_dtypes}")
       print_ram_usage("After full HF model load")
 
+      # transformers>=5.8 removed the intermediate `vision_model` attribute,
+      # so keys are now `model.vision_tower.embeddings.*` instead
+      # of `model.vision_tower.vision_model.embeddings.*`.
+      # Remap to the old format so that the param_mapping continues to work.
+      if eager_load_method == "transformers" and config.use_multimodal:
+        old_prefix = "model.vision_tower.vision_model."
+        new_prefix = "model.vision_tower."
+        needs_remap = any(k.startswith(new_prefix) and not k.startswith(old_prefix) for k in hf_state_dict_numpy)
+        if needs_remap:
+          max_logging.log("Detected new-style key layout; remapping vision_tower keys.")
+          hf_state_dict_numpy = {
+              (old_prefix + k[len(new_prefix) :] if k.startswith(new_prefix) and not k.startswith(old_prefix) else k): v
+              for k, v in hf_state_dict_numpy.items()
+          }
+
       def _eager_getter(key):
         if key not in hf_state_dict_numpy:
           raise ValueError(f"HuggingFace key {key} not found in state_dict.")
diff --git a/src/maxtext/checkpoint_conversion/utils/hf_shape.py b/src/maxtext/checkpoint_conversion/utils/hf_shape.py
@@ -53,7 +53,7 @@ def GEMMA3_HF_WEIGHTS_TO_SHAPE(config):
   vision_patch_size = vision_config["patch_size"]
   vision_num_channels = vision_config["num_channels"]
   vision_image_size = vision_config["image_size"]
-  vision_num_positions = (vision_image_size / vision_patch_size) ** 2
+  vision_num_positions = (vision_image_size // vision_patch_size) ** 2
 
   vocab_size = text_config["vocab_size"]
 
diff --git a/tests/end_to_end/tpu/gemma3/4b/test_gemma3.sh b/tests/end_to_end/tpu/gemma3/4b/test_gemma3.sh
@@ -10,14 +10,14 @@
 
 # Usage:
 # export HF_TOKEN=<your Hugging Face access token>
-# export RUN_ID=$(date +%Y-%m-%d-%H-%M)
+# export RUN_ID=$(date +%Y-%m-%d-%H-%M-%S)
 # bash test_gemma3_to_mt.sh $RUN_ID
 # bash test_gemma3.sh $RUN_ID
 
 
 set -ex
 
-run_id=${1:-$(date +%Y-%m-%d-%H-%M)}
+run_id=${1:-$(date +%Y-%m-%d-%H-%M-%S)}
 MODEL_NAME='gemma3-4b'
 
 # To convert the multimodal model, make sure the use_multimodal is set to be true
diff --git a/tests/end_to_end/tpu/gemma3/4b/test_gemma3_multimodal_sft.sh b/tests/end_to_end/tpu/gemma3/4b/test_gemma3_multimodal_sft.sh
@@ -1,75 +1,87 @@
 #!/bin/bash
 
-# This file contains an end-to-end Airflow nightly test, designed to run once a day on a v5p-8, along with documentation to guide users in getting started with Gemma3-4B.
+# Validates the Gemma3-4B SFT multimodal pipeline using a pre-converted MaxText checkpoint.
 
-# The flow of this file is as follows:
-# 1. Convert the checkpoint downloaded from Hugging Face to make it compatible with MaxText
-# 2. Run multimodal decoding of Gemma3-4B, with the converted checkpoint.
-# 3. Run supervised finetuning (SFT) of Gemma3-4B on ChartQA dataset with the converted checkpoint.
-# 4. Run decoding from the finetuned checkpoint from step 3, seeing the short answer from SFT.
-# 5. Convert the SFT checkpoint back to HuggingFace format.
+# The flow of this script is as follows (numbers match the `Step N` comments below):
+# 1. Install torch and google-jetstream.
+# 2. Run inference on the pre-converted checkpoint.
+# 3. Run SFT of Gemma3-4B on the ChartQA dataset with the converted checkpoint.
+# 4-5. Run inference on the SFT checkpoint, then convert it back to HuggingFace format.
+
+# Usage:
+# export HF_TOKEN=<your Hugging Face access token>
+# export RUN_ID=$(date +%Y-%m-%d-%H-%M-%S)
+# bash test_gemma3_to_mt.sh $RUN_ID true
+# bash test_gemma3_multimodal_sft.sh $RUN_ID
 
 # Note: You can stop at any step if you just want to run part of the flow.
 
 set -ex
-idx=$(date +%Y-%m-%d-%H-%M)
+
+run_id=${1:-$(date +%Y-%m-%d-%H-%M-%S)}
 MODEL_NAME='gemma3-4b'
-export MODEL_VARIATION='4b'
-HF_TOKEN='' # Important!!! Save your hf access token here
-HF_GOLDEN_MODEL='google/gemma-3-4b-pt'
-TOKENIZER_PATH="${MAXTEXT_ASSETS_ROOT:-${MAXTEXT_PKG_DIR:-${MAXTEXT_REPO_ROOT:-$PWD}/src/maxtext/assets/tokenizers}}"'/tokenizer.gemma3'
-# To convert the multimodal model, make sure the use_multimodal is set to be true
-USE_MULTIMODAL=true
-SCAN_LAYERS=false
-SFT_STEPS=10
 
-# Installing torch for deps in forward_pass_logit_checker.py
-python3 -m pip install torch --index-url https://download.pytorch.org/whl/cpu
+# Non-Googlers please remember to point `BASE_OUTPUT_DIRECTORY` to the GCS paths where you have the scanned and unscanned checkpoints stored
+BASE_OUTPUT_DIRECTORY=gs://runner-maxtext-logs/${MODEL_NAME}
+UNSCANNED_CKPT_PATH=${BASE_OUTPUT_DIRECTORY}/to_maxtext/unscanned/${run_id}/0/items
+SCANNED_CKPT_PATH=${BASE_OUTPUT_DIRECTORY}/to_maxtext/scanned/${run_id}/0/items
 
-# After downloading checkpoints, copy them to GCS bucket at $MODEL_BUCKET \
-# Non-Googlers please remember to point these variables to GCS buckets that you own, this script uses internal buckets for testing.
-export MODEL_BUCKET=gs://maxtext-gemma/unified/gemma3
+# Step 1: Install torch and google-jetstream
+python3 -m pip install torch --index-url https://download.pytorch.org/whl/cpu
+python3 -m pip install google-jetstream@https://github.com/AI-Hypercomputer/JetStream/archive/29329e8e73820993f77cfc8efe34eb2a73f5de98.zip --no-deps
 
-# 1. Convert the HuggingFace checkpoint to MaxText unscanned ckpt:
-python3 -m maxtext.checkpoint_conversion.to_maxtext "${MAXTEXT_CONFIGS_DIR:-${MAXTEXT_REPO_ROOT:-$PWD}/src/maxtext/configs}"//base.yml \
+# Step 2: Run inference on the original checkpoint converted from Hugging Face
+python3 -m maxtext.inference.decode \
     model_name=${MODEL_NAME} \
-    hf_access_token=${HF_TOKEN} \
-    base_output_directory=${MODEL_BUCKET}/${MODEL_VARIATION}/unscanned/${idx} \
-    use_multimodal=${USE_MULTIMODAL} \
-    scan_layers=${SCAN_LAYERS}
-
-# 2. Decode the converted checkpoint to make sure it works
-export UNSCANNED_CKPT_PATH=${MODEL_BUCKET}/${MODEL_VARIATION}/unscanned/${idx}/0/items
-python3 -m maxtext.inference.decode "${MAXTEXT_CONFIGS_DIR:-${MAXTEXT_REPO_ROOT:-$PWD}/src/maxtext/configs}"//base.yml model_name=${MODEL_NAME} tokenizer_path=${TOKENIZER_PATH} load_parameters_path=${UNSCANNED_CKPT_PATH} per_device_batch_size=1 run_name=ht_test max_prefill_predict_length=272 max_target_length=300 steps=1 async_checkpointing=false scan_layers=$SCAN_LAYERS use_multimodal=${USE_MULTIMODAL} prompt=\'Describe\ image\ \<start_of_image\>\' image_path=\'tests/assets/test_image.jpg\' attention=\'dot_product\'
+    load_parameters_path=${UNSCANNED_CKPT_PATH} \
+    per_device_batch_size=1 \
+    run_name=${run_id} \
+    max_prefill_predict_length=272 \
+    max_target_length=300 \
+    steps=1 \
+    async_checkpointing=false \
+    scan_layers=false \
+    use_multimodal=True \
+    tokenizer_type=huggingface \
+    prompt=\'Describe\ image\ \<start_of_image\>\' \
+    image_path=\'tests/assets/test_image.jpg\' \
+    attention=\'dot_product\' skip_jax_distributed_system=True
 
-# 3. SFT the MaxText converted checkpoint on ChartQA dataset
-export BASE_OUTPUT_DIRECTORY=${MODEL_BUCKET}/${MODEL_VARIATION}/unscanned/sft
-python -m maxtext.trainers.post_train.sft.train_sft_native "${MAXTEXT_CONFIGS_DIR:-${MAXTEXT_REPO_ROOT:-$PWD}/src/maxtext/configs}"//sft-vision-chartqa.yml \
-    run_name=$idx \
-    model_name=$MODEL_NAME tokenizer_path="google/gemma-3-4b-pt" \
+# Step 3: Run SFT on the MaxText checkpoint on ChartQA dataset
+python3 -m maxtext.trainers.post_train.sft.train_sft_native "${MAXTEXT_CONFIGS_DIR:-${MAXTEXT_REPO_ROOT:-$PWD}/src/maxtext/configs}"/post_train/sft-vision-chartqa.yml \
+    run_name=${run_id} \
+    model_name=${MODEL_NAME} \
     per_device_batch_size=1 \
     max_prefill_predict_length=1024 max_target_length=2048 \
-    steps=$SFT_STEPS \
-    scan_layers=$SCAN_LAYERS async_checkpointing=False \
+    steps=5 \
+    scan_layers=false async_checkpointing=False \
     attention=dot_product \
-    dataset_type=hf hf_path=parquet hf_access_token=$HF_TOKEN \
+    dataset_type=hf hf_path=parquet \
     hf_train_files=gs://aireenmei-multipod/dataset/hf/chartqa/train-* \
-    base_output_directory=$BASE_OUTPUT_DIRECTORY \
-    load_parameters_path=$UNSCANNED_CKPT_PATH \
-    dtype=bfloat16 weight_dtype=bfloat16 sharding_tolerance=0.05
+    base_output_directory=${BASE_OUTPUT_DIRECTORY}/multimodal/sft \
+    load_parameters_path=${UNSCANNED_CKPT_PATH} \
+    dtype=bfloat16 weight_dtype=bfloat16 sharding_tolerance=0.05 \
+    checkpoint_storage_use_zarr3=False checkpoint_storage_use_ocdbt=False
 
-# 4. Decode from the finetuned checkpoint from step 3
-export FINAL_CKPT_STEP=$((SFT_STEPS - 1))
-export FINETUNED_CKPT_PATH=${BASE_OUTPUT_DIRECTORY}/${idx}/checkpoints/${FINAL_CKPT_STEP}/items
-python3 -m maxtext.inference.decode "${MAXTEXT_CONFIGS_DIR:-${MAXTEXT_REPO_ROOT:-$PWD}/src/maxtext/configs}"//base.yml model_name=${MODEL_NAME} tokenizer_path=${TOKENIZER_PATH} load_parameters_path=${FINETUNED_CKPT_PATH} per_device_batch_size=1 run_name=ht_test max_prefill_predict_length=272 max_target_length=300 steps=1 async_checkpointing=false scan_layers=$SCAN_LAYERS use_multimodal=${USE_MULTIMODAL} prompt=\'Describe\ image\ \<start_of_image\>\' image_path=\'tests/assets/test_image.jpg\' attention=\'dot_product\'
+# Step 4: Run inference on the checkpoint generated from the previous run
+python3 -m maxtext.inference.decode \
+    model_name=${MODEL_NAME} \
+    load_parameters_path=${BASE_OUTPUT_DIRECTORY}/multimodal/sft/${run_id}/checkpoints/4/items \
+    per_device_batch_size=1 \
+    run_name=${run_id} \
+    max_prefill_predict_length=272 \
+    max_target_length=300 \
+    steps=1 \
+    async_checkpointing=false \
+    scan_layers=false \
+    use_multimodal=true \
+    prompt=\'Describe\ image\ \<start_of_image\>\' \
+    image_path=\'tests/assets/test_image.jpg\' \
+    attention=\'dot_product\'
 
-# 5. Convert the SFT checkpoint back to HuggingFace format.
-export LOCAL_PATH=./tmp/hf/${MODEL_NAME}/${idx}
-export CKPT_PATH="gs://maxtext-gemma/unified/gemma3/4b/unscanned/sft/2025-08-08-18-28/2025-08-08-18-28/checkpoints/9/items"
-python3 -m maxtext.checkpoint_conversion.to_huggingface "${MAXTEXT_CONFIGS_DIR:-${MAXTEXT_REPO_ROOT:-$PWD}/src/maxtext/configs}"//base.yml \
+# Step 5: Convert the SFT checkpoint back to HuggingFace format
+python3 -m maxtext.checkpoint_conversion.to_huggingface \
     model_name=${MODEL_NAME} \
-    hf_access_token=${HF_TOKEN} \
-    load_parameters_path=${CKPT_PATH} \
-    base_output_directory=${LOCAL_PATH} \
-    use_multimodal=${USE_MULTIMODAL} \
-    scan_layers=$SCAN_LAYERS
+    load_parameters_path=${BASE_OUTPUT_DIRECTORY}/multimodal/sft/${run_id}/checkpoints/4/items \
+    base_output_directory=${BASE_OUTPUT_DIRECTORY}/to_huggingface/unscanned/${run_id} \
+    use_multimodal=true scan_layers=false
diff --git a/tests/end_to_end/tpu/gemma3/4b/test_gemma3_rl.sh b/tests/end_to_end/tpu/gemma3/4b/test_gemma3_rl.sh
@@ -10,14 +10,14 @@
 
 # Usage:
 # export HF_TOKEN=<your Hugging Face access token>
-# export RUN_ID=$(date +%Y-%m-%d-%H-%M)
+# export RUN_ID=$(date +%Y-%m-%d-%H-%M-%S)
 # bash test_gemma3_to_mt.sh $RUN_ID
 # bash test_gemma3_rl.sh $RUN_ID
 
 
 set -ex
 
-run_id=${1:-$(date +%Y-%m-%d-%H-%M)}
+run_id=${1:-$(date +%Y-%m-%d-%H-%M-%S)}
 MODEL_NAME='gemma3-4b'
 
 # Non-Googlers please remember to point `BASE_OUTPUT_DIRECTORY` to the GCS paths where you have the scanned and unscanned checkpoints stored
diff --git a/tests/end_to_end/tpu/gemma3/4b/test_gemma3_sft.sh b/tests/end_to_end/tpu/gemma3/4b/test_gemma3_sft.sh
@@ -10,14 +10,14 @@
 
 # Usage:
 # export HF_TOKEN=<your Hugging Face access token>
-# export RUN_ID=$(date +%Y-%m-%d-%H-%M)
+# export RUN_ID=$(date +%Y-%m-%d-%H-%M-%S)
 # bash test_gemma3_to_mt.sh $RUN_ID
 # bash test_gemma3_sft.sh $RUN_ID
 
 
 set -ex
 
-run_id=${1:-$(date +%Y-%m-%d-%H-%M)}
+run_id=${1:-$(date +%Y-%m-%d-%H-%M-%S)}
 MODEL_NAME='gemma3-4b'
 
 # Non-Googlers please remember to point `BASE_OUTPUT_DIRECTORY` to the GCS paths where you have the scanned and unscanned checkpoints stored
diff --git a/tests/end_to_end/tpu/gemma3/4b/test_gemma3_to_mt.sh b/tests/end_to_end/tpu/gemma3/4b/test_gemma3_to_mt.sh
@@ -7,19 +7,21 @@
 # 2. Convert the HuggingFace checkpoint to MaxText format in both unscanned and scanned formats.
 # 3. Run a forward pass logits check to verify the converted checkpoint matches the original HF model.
 
-# Pre-requisites:
-# 1. Set HF_TOKEN environment variable to your Hugging Face access token with read permissions
-# export HF_TOKEN=<Hugging Face access token>
+# Usage:
+# export HF_TOKEN=<your Hugging Face access token>
+# export RUN_ID=$(date +%Y-%m-%d-%H-%M-%S)
+# bash test_gemma3_to_mt.sh $RUN_ID - to convert the checkpoint and run logit check for non-multimodal version
+# bash test_gemma3_to_mt.sh $RUN_ID true - to convert the checkpoint and run logit check for multimodal version
 
 
 set -ex
 
-run_id=${1:-$(date +%Y-%m-%d-%H-%M)}
+run_id=${1:-$(date +%Y-%m-%d-%H-%M-%S)}
 MODEL_NAME='gemma3-4b'
 HF_GOLDEN_MODEL='google/gemma-3-4b-it'
 
 # To convert the multimodal model, make sure the use_multimodal is set to be true
-USE_MULTIMODAL=false
+USE_MULTIMODAL=${2:-false}
 
 # Non-Googlers please remember to point `BASE_OUTPUT_DIRECTORY` to the GCS paths where you want to store scanned and unscanned checkpoints
 BASE_OUTPUT_DIRECTORY=gs://runner-maxtext-logs/${MODEL_NAME}/to_maxtext
@@ -58,12 +60,14 @@ echo "Scanned checkpoint path: ${SCANNED_CKPT_PATH}"
 # Step 3: Test whether the forward pass logits match the original HF model
 # to get higher precision (eg. float32) run on CPU with `JAX_PLATFORMS=cpu`
 # ToDo: improve forward_pass_logit_checker to test multi-modal prompt
-python3 -m tests.utils.forward_pass_logit_checker \
-    load_parameters_path=${UNSCANNED_CKPT_PATH} \
-    model_name=${MODEL_NAME} \
-    use_multimodal=${USE_MULTIMODAL} \
-    scan_layers=false \
-    --hf_model_path=${HF_GOLDEN_MODEL} \
-    --max_kl_div=0.03 \
-    --run_hf_model=true \
-    hardware=cpu skip_jax_distributed_system=True
+if [ "${USE_MULTIMODAL}" = "false" ]; then
+    python3 -m tests.utils.forward_pass_logit_checker \
+        load_parameters_path=${UNSCANNED_CKPT_PATH} \
+        model_name=${MODEL_NAME} \
+        use_multimodal=${USE_MULTIMODAL} \
+        scan_layers=false \
+        --hf_model_path=${HF_GOLDEN_MODEL} \
+        --max_kl_div=0.03 \
+        --run_hf_model=true \
+        hardware=cpu skip_jax_distributed_system=True
+fi