Metal CI: Add Qwen 3.5 MoE tiny model integration test

manuelcandales · manuelcandales · commit e40ab21353b5 · 2026-04-21T23:28:37.000-04:00
Export the tiny model with --backend metal, build the C++ runner,
and verify both decode (T=1) and prefill (T>2) complete successfully.
Uses a byte-level tokenizer matching the tiny model's vocab_size=256.

Authored with Claude.
diff --git a/.github/workflows/metal.yml b/.github/workflows/metal.yml
@@ -12,6 +12,8 @@ on:
       - .github/workflows/metal.yml
       - backends/apple/metal/**
       - backends/aoti/**
+      - examples/models/qwen3_5_moe/**
+      - extension/llm/export/**
   workflow_dispatch:
 
 concurrency:
@@ -59,6 +61,110 @@ jobs:
         ${CONDA_RUN} python -m unittest backends.apple.metal.tests.test_modules.TestMetalBackendModules
         echo "::endgroup::"
 
+  # Integration test: exports the tiny Qwen 3.5 MoE model with the Metal backend,
+  # builds the C++ runner, and runs it on a short and a multi-word prompt.
+  test-metal-qwen35-moe-tiny:
+    name: test-metal-qwen35-moe-tiny
+    uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
+    with:
+      runner: macos-m2-stable
+      python-version: '3.11'
+      submodules: 'recursive'
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      timeout: 120
+      script: |
+        set -eux
+
+        echo "::group::Setup ExecuTorch"
+        PYTHON_EXECUTABLE=python ${CONDA_RUN} EXECUTORCH_BUILD_KERNELS_TORCHAO=1 TORCHAO_BUILD_EXPERIMENTAL_MPS=1 ./install_executorch.sh
+        echo "::endgroup::"
+
+        # Isolate Inductor cache per job to prevent PCH conflicts.
+        # Assign and export separately: with "export VAR=$(cmd)" the exit status is
+        # that of export, so a failed mktemp would not trip "set -e".
+        TMPDIR=$(mktemp -d "${RUNNER_TEMP}/tmpdir_XXXXXX")
+        export TMPDIR
+        TORCHINDUCTOR_CACHE_DIR=$(mktemp -d "${RUNNER_TEMP}/inductor_cache_XXXXXX")
+        export TORCHINDUCTOR_CACHE_DIR
+
+        # Export output directory; the tokenizer is written here and the runner
+        # loads model.pte and tokenizer.json from it.
+        MODEL_DIR=/tmp/qwen35_moe_metal_tiny
+
+        echo "::group::Export Qwen 3.5 MoE (tiny model, Metal)"
+        ${CONDA_RUN} python -m executorch.examples.models.qwen3_5_moe.export \
+          --tiny-test \
+          --backend metal \
+          --qlinear fpa4w \
+          --output-dir "${MODEL_DIR}"
+        echo "::endgroup::"
+
+        echo "::group::Build Metal runtime and Qwen 3.5 MoE runner"
+        ${CONDA_RUN} cmake --workflow --preset llm-release-metal
+        cd examples/models/qwen3_5_moe
+        ${CONDA_RUN} cmake --workflow --preset qwen3-5-moe-metal
+        cd -
+        echo "::endgroup::"
+
+        # Create a byte-level tokenizer for the tiny model (vocab_size=256).
+        # Maps each byte value to its own token ID so any prompt produces valid IDs.
+        # The shell expands ${MODEL_DIR} inside the double-quoted Python source.
+        ${CONDA_RUN} python -c "
+        import json
+
+        def token_text(i):
+          # Printable ASCII is its own token text; other bytes use a <0xNN> name.
+          return chr(i) if 32 <= i < 127 else f'<0x{i:02X}>'
+
+        vocab = {token_text(i): i for i in range(256)}
+        added_tokens = [
+          {'id': i, 'content': token_text(i), 'single_word': False, 'lstrip': False, 'rstrip': False, 'normalized': False, 'special': False}
+          for i in range(256)
+        ]
+        tokenizer = {
+          'version': '1.0',
+          'model': {'type': 'BPE', 'vocab': vocab, 'merges': []},
+          'added_tokens': added_tokens,
+        }
+        with open('${MODEL_DIR}/tokenizer.json', 'w') as f:
+          json.dump(tokenizer, f)
+        print('Created byte-level tokenizer.json')
+        "
+
+        # Runs the runner on one prompt and requires a "Decode:" line in its output.
+        #   $1: prompt text
+        #   $2: label used in the Success/Failed messages
+        run_and_check() {
+          local prompt_text="$1" label="$2" runner_output runner_rc=0
+          # "|| runner_rc=$?" stops "set -e" from aborting on a failing runner before
+          # its captured output has been echoed to the job log.
+          runner_output=$(./cmake-out/examples/models/qwen3_5_moe/qwen3_5_moe_runner \
+            --model_path "${MODEL_DIR}/model.pte" \
+            --tokenizer_path "${MODEL_DIR}/tokenizer.json" \
+            --prompt "${prompt_text}" \
+            --temperature 0 \
+            --max_new_tokens 4 2>&1) || runner_rc=$?
+          echo "${runner_output}"
+          if [ "${runner_rc}" -ne 0 ]; then
+            echo "Failed: ${label} did not complete (runner exit code ${runner_rc})"
+            return 1
+          fi
+          if echo "${runner_output}" | grep -q "Decode:"; then
+            echo "Success: ${label} completed"
+          else
+            echo "Failed: ${label} did not complete"
+            return 1
+          fi
+        }
+
+        echo "::group::Run Qwen 3.5 MoE inference (T=1 decode)"
+        run_and_check "Hello" "decode"
+        echo "::endgroup::"
+
+        echo "::group::Run Qwen 3.5 MoE inference (T>2 prefill + decode)"
+        run_and_check "one two three" "prefill + decode"
+        echo "::endgroup::"
+
   export-model-metal-artifact:
     name: export-model-metal-artifact
     # Skip this job if the pull request is from a fork (HuggingFace secrets are not available)