Commit 2330a5a
Metal CI: Add Qwen 3.5 MoE tiny model integration test
Export the tiny model with --backend metal, build the C++ runner, and verify that both decode (T=1) and prefill (T>2) complete successfully. Uses a byte-level tokenizer matching the tiny model's vocab_size=256. Authored with Claude.
1 parent 6be4fb5 commit 2330a5a
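
For context, the tokenizer trick the test relies on is plain byte-level encoding: each of the 256 possible byte values is its own token, so any prompt yields valid IDs and its token count equals its byte count. A minimal sketch of that mapping (illustrative only, not part of the commit; byte_encode is a hypothetical helper):

def byte_encode(prompt: str) -> list[int]:
    # One token per UTF-8 byte; matches the 256-entry vocab the workflow
    # writes to tokenizer.json (chr(i) → i for printable ASCII, <0xNN> otherwise).
    return list(prompt.encode("utf-8"))

# The two CI prompts exercise the two paths the test asserts on:
assert byte_encode("A") == [65]                 # 1 prompt token → decode-only (T=1)
assert len(byte_encode("one two three")) == 13  # 13 prompt tokens → prefill (T>2)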

1 file changed: .github/workflows/metal.yml (98 additions, 0 deletions)
@@ -12,6 +12,8 @@ on:
       - .github/workflows/metal.yml
       - backends/apple/metal/**
       - backends/aoti/**
+      - examples/models/qwen3_5_moe/**
+      - extension/llm/export/**
   workflow_dispatch:
 
 concurrency:
@@ -59,6 +61,102 @@ jobs:
         ${CONDA_RUN} python -m unittest backends.apple.metal.tests.test_modules.TestMetalBackendModules
         echo "::endgroup::"
 
+  test-metal-qwen35-moe-tiny:
+    name: test-metal-qwen35-moe-tiny
+    uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
+    with:
+      runner: macos-m2-stable
+      python-version: '3.11'
+      submodules: 'recursive'
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      timeout: 120
+      script: |
+        set -eux
+
+        echo "::group::Setup ExecuTorch"
+        PYTHON_EXECUTABLE=python ${CONDA_RUN} EXECUTORCH_BUILD_KERNELS_TORCHAO=1 TORCHAO_BUILD_EXPERIMENTAL_MPS=1 ./install_executorch.sh
+        echo "::endgroup::"
+
+        # Isolate Inductor cache per job to prevent PCH conflicts
+        export TMPDIR=$(mktemp -d "${RUNNER_TEMP}/tmpdir_XXXXXX")
+        export TORCHINDUCTOR_CACHE_DIR=$(mktemp -d "${RUNNER_TEMP}/inductor_cache_XXXXXX")
+
+        echo "::group::Export Qwen 3.5 MoE (tiny model, Metal)"
+        ${CONDA_RUN} python -m executorch.examples.models.qwen3_5_moe.export \
+          --tiny-test \
+          --backend metal \
+          --qlinear fpa4w \
+          --output-dir /tmp/qwen35_moe_metal_tiny
+        echo "::endgroup::"
+
+        echo "::group::Build Metal runtime and Qwen 3.5 MoE runner"
+        ${CONDA_RUN} cmake --workflow --preset llm-release-metal
+        cd examples/models/qwen3_5_moe
+        ${CONDA_RUN} cmake --workflow --preset qwen3-5-moe-metal
+        cd -
+        echo "::endgroup::"
+
+        # Create a byte-level tokenizer for the tiny model (vocab_size=256).
+        # Maps each byte value to its own token ID so any prompt produces valid IDs.
+        ${CONDA_RUN} python - <<'PY'
+        import json
+        vocab = {chr(i) if 32 <= i < 127 else f'<0x{i:02X}>': i for i in range(256)}
+        tokenizer = {
+            'version': '1.0',
+            'model': {'type': 'BPE', 'vocab': vocab, 'merges': []},
+            'added_tokens': [{'id': i, 'content': chr(i) if 32 <= i < 127 else f'<0x{i:02X}>', 'single_word': False, 'lstrip': False, 'rstrip': False, 'normalized': False, 'special': False} for i in range(256)],
+        }
+        with open('/tmp/qwen35_moe_metal_tiny/tokenizer.json', 'w') as f:
+            json.dump(tokenizer, f)
+        print('Created byte-level tokenizer.json')
+        PY
+
+        RUNNER=./cmake-out/examples/models/qwen3_5_moe/qwen3_5_moe_runner
+        # Patch absolute libomp install name to rpath-based lookup (same as test_model_e2e.sh)
+        if otool -L "$RUNNER" | grep -q "/opt/llvm-openmp/lib/libomp.dylib"; then
+          install_name_tool -change /opt/llvm-openmp/lib/libomp.dylib @rpath/libomp.dylib "$RUNNER"
+        fi
+        MODEL=/tmp/qwen35_moe_metal_tiny/model.pte
+        TOKENIZER=/tmp/qwen35_moe_metal_tiny/tokenizer.json
+
+        echo "::group::Run Qwen 3.5 MoE inference (T=1 decode)"
+        # Single-char prompt → 1 token → exercises decode-only path
+        set +e
+        OUTPUT=$($RUNNER --model_path $MODEL --tokenizer_path $TOKENIZER \
+          --prompt "A" --temperature 0 --max_new_tokens 4 2>&1)
+        RC=$?
+        set -e
+        echo "$OUTPUT"
+        if [ $RC -ne 0 ]; then
+          echo "Failed: runner exited with code $RC"
+          exit 1
+        fi
+        echo "$OUTPUT" | grep -q "Prompt tokens: 1" || { echo "Failed: expected 1 prompt token for decode path"; exit 1; }
+        echo "$OUTPUT" | grep -q "Decode:" || { echo "Failed: decode did not complete"; exit 1; }
+        echo "Success: decode completed"
+        echo "::endgroup::"
+
+        echo "::group::Run Qwen 3.5 MoE inference (T>2 prefill + decode)"
+        set +e
+        OUTPUT=$($RUNNER --model_path $MODEL --tokenizer_path $TOKENIZER \
+          --prompt "one two three" --temperature 0 --max_new_tokens 4 2>&1)
+        RC=$?
+        set -e
+        echo "$OUTPUT"
+        if [ $RC -ne 0 ]; then
+          echo "Failed: runner exited with code $RC"
+          exit 1
+        fi
+        # Byte-level tokenizer: "one two three" = 13 tokens (13 bytes)
+        PROMPT_TOKENS=$(echo "$OUTPUT" | grep -o "Prompt tokens: [0-9]*" | head -1 | grep -o "[0-9]*")
+        if [ "$PROMPT_TOKENS" -le 2 ]; then
+          echo "Failed: expected >2 prompt tokens for prefill path, got $PROMPT_TOKENS"
+          exit 1
+        fi
+        echo "$OUTPUT" | grep -q "Decode:" || { echo "Failed: prefill + decode did not complete"; exit 1; }
+        echo "Success: prefill ($PROMPT_TOKENS tokens) + decode completed"
+        echo "::endgroup::"
+
   export-model-metal-artifact:
     name: export-model-metal-artifact
     # Skip this job if the pull request is from a fork (HuggingFace secrets are not available)
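
A quick local sanity check on the tokenizer the workflow generates, as a sketch (assumes the output path used above; not part of the commit):

import json

# Inspect the tokenizer.json produced by the workflow's Python heredoc.
with open("/tmp/qwen35_moe_metal_tiny/tokenizer.json") as f:
    tok = json.load(f)

vocab = tok["model"]["vocab"]
assert len(vocab) == 256                           # one entry per byte value
assert sorted(vocab.values()) == list(range(256))  # IDs cover 0..255 exactly
assert vocab["A"] == 65                            # printable ASCII maps chr(i) → i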
