|
12 | 12 | - .github/workflows/metal.yml |
13 | 13 | - backends/apple/metal/** |
14 | 14 | - backends/aoti/** |
| 15 | + - examples/models/qwen3_5_moe/** |
| 16 | + - extension/llm/export/** |
15 | 17 | workflow_dispatch: |
16 | 18 |
|
17 | 19 | concurrency: |
@@ -59,6 +61,102 @@ jobs: |
59 | 61 | ${CONDA_RUN} python -m unittest backends.apple.metal.tests.test_modules.TestMetalBackendModules |
60 | 62 | echo "::endgroup::" |
61 | 63 |
|
| 64 | + test-metal-qwen35-moe-tiny: |
| 65 | + name: test-metal-qwen35-moe-tiny |
| 66 | + uses: pytorch/test-infra/.github/workflows/macos_job.yml@main |
| 67 | + with: |
| 68 | + runner: macos-m2-stable |
| 69 | + python-version: '3.11' |
| 70 | + submodules: 'recursive' |
| 71 | + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |
| 72 | + timeout: 120 |
| 73 | + script: | |
| 74 | + set -eux |
| 75 | +
|
| 76 | + echo "::group::Setup ExecuTorch" |
| 77 | + PYTHON_EXECUTABLE=python ${CONDA_RUN} EXECUTORCH_BUILD_KERNELS_TORCHAO=1 TORCHAO_BUILD_EXPERIMENTAL_MPS=1 ./install_executorch.sh |
| 78 | + echo "::endgroup::" |
| 79 | +
|
| 80 | + # Isolate Inductor cache per job to prevent PCH conflicts |
| 81 | + export TMPDIR=$(mktemp -d "${RUNNER_TEMP}/tmpdir_XXXXXX") |
| 82 | + export TORCHINDUCTOR_CACHE_DIR=$(mktemp -d "${RUNNER_TEMP}/inductor_cache_XXXXXX") |
| 83 | +
|
| 84 | + echo "::group::Export Qwen 3.5 MoE (tiny model, Metal)" |
| 85 | + ${CONDA_RUN} python -m executorch.examples.models.qwen3_5_moe.export \ |
| 86 | + --tiny-test \ |
| 87 | + --backend metal \ |
| 88 | + --qlinear fpa4w \ |
| 89 | + --output-dir /tmp/qwen35_moe_metal_tiny |
| 90 | + echo "::endgroup::" |
| 91 | +
|
| 92 | + echo "::group::Build Metal runtime and Qwen 3.5 MoE runner" |
| 93 | + ${CONDA_RUN} cmake --workflow --preset llm-release-metal |
| 94 | + cd examples/models/qwen3_5_moe |
| 95 | + ${CONDA_RUN} cmake --workflow --preset qwen3-5-moe-metal |
| 96 | + cd - |
| 97 | + echo "::endgroup::" |
| 98 | +
|
| 99 | + # Create a byte-level tokenizer for the tiny model (vocab_size=256). |
| 100 | + # Maps each byte value to its own token ID so any prompt produces valid IDs. |
| 101 | + ${CONDA_RUN} python - <<'PY' |
| 102 | + import json |
| 103 | + vocab = {chr(i) if 32 <= i < 127 else f'<0x{i:02X}>': i for i in range(256)} |
| 104 | + tokenizer = { |
| 105 | + 'version': '1.0', |
| 106 | + 'model': {'type': 'BPE', 'vocab': vocab, 'merges': []}, |
| 107 | + 'added_tokens': [{'id': i, 'content': chr(i) if 32 <= i < 127 else f'<0x{i:02X}>', 'single_word': False, 'lstrip': False, 'rstrip': False, 'normalized': False, 'special': False} for i in range(256)], |
| 108 | + } |
| 109 | + with open('/tmp/qwen35_moe_metal_tiny/tokenizer.json', 'w') as f: |
| 110 | + json.dump(tokenizer, f) |
| 111 | + print('Created byte-level tokenizer.json') |
| 112 | + PY |
| 113 | +
|
| 114 | + RUNNER=./cmake-out/examples/models/qwen3_5_moe/qwen3_5_moe_runner |
| 115 | + # Patch absolute libomp install name to rpath-based lookup (same as test_model_e2e.sh) |
| 116 | + if otool -L "$RUNNER" | grep -q "/opt/llvm-openmp/lib/libomp.dylib"; then |
| 117 | + install_name_tool -change /opt/llvm-openmp/lib/libomp.dylib @rpath/libomp.dylib "$RUNNER" |
| 118 | + fi |
| 119 | + MODEL=/tmp/qwen35_moe_metal_tiny/model.pte |
| 120 | + TOKENIZER=/tmp/qwen35_moe_metal_tiny/tokenizer.json |
| 121 | +
|
| 122 | + echo "::group::Run Qwen 3.5 MoE inference (T=1 decode)" |
| 123 | + # Single-char prompt → 1 token → exercises decode-only path |
| 124 | + set +e |
| 125 | + OUTPUT=$("$RUNNER" --model_path "$MODEL" --tokenizer_path "$TOKENIZER" \
| 126 | + --prompt "A" --temperature 0 --max_new_tokens 4 2>&1)
| 127 | + RC=$? |
| 128 | + set -e |
| 129 | + echo "$OUTPUT" |
| 130 | + if [ $RC -ne 0 ]; then |
| 131 | + echo "Failed: runner exited with code $RC" |
| 132 | + exit 1 |
| 133 | + fi |
| 134 | + echo "$OUTPUT" | grep -Eq "Prompt tokens: 1([^0-9]|$)" || { echo "Failed: expected 1 prompt token for decode path"; exit 1; }
| 135 | + echo "$OUTPUT" | grep -q "Decode:" || { echo "Failed: decode did not complete"; exit 1; } |
| 136 | + echo "Success: decode completed" |
| 137 | + echo "::endgroup::" |
| 138 | +
|
| 139 | + echo "::group::Run Qwen 3.5 MoE inference (T>2 prefill + decode)" |
| 140 | + set +e |
| 141 | + OUTPUT=$("$RUNNER" --model_path "$MODEL" --tokenizer_path "$TOKENIZER" \
| 142 | + --prompt "one two three" --temperature 0 --max_new_tokens 4 2>&1)
| 143 | + RC=$? |
| 144 | + set -e |
| 145 | + echo "$OUTPUT" |
| 146 | + if [ $RC -ne 0 ]; then |
| 147 | + echo "Failed: runner exited with code $RC" |
| 148 | + exit 1 |
| 149 | + fi |
| 150 | + # Byte-level tokenizer: "one two three" = 13 tokens (13 bytes) |
| 151 | + PROMPT_TOKENS=$(echo "$OUTPUT" | grep -o "Prompt tokens: [0-9]*" | head -1 | grep -o "[0-9]*") |
| 152 | + if [ "${PROMPT_TOKENS:-0}" -le 2 ]; then
| 153 | + echo "Failed: expected >2 prompt tokens for prefill path, got $PROMPT_TOKENS" |
| 154 | + exit 1 |
| 155 | + fi |
| 156 | + echo "$OUTPUT" | grep -q "Decode:" || { echo "Failed: prefill + decode did not complete"; exit 1; } |
| 157 | + echo "Success: prefill ($PROMPT_TOKENS tokens) + decode completed" |
| 158 | + echo "::endgroup::" |
| 159 | +
|
62 | 160 | export-model-metal-artifact: |
63 | 161 | name: export-model-metal-artifact |
64 | 162 | # Skip this job if the pull request is from a fork (HuggingFace secrets are not available) |
|
0 commit comments