|
12 | 12 | - .github/workflows/metal.yml |
13 | 13 | - backends/apple/metal/** |
14 | 14 | - backends/aoti/** |
| 15 | + - examples/models/qwen3_5_moe/** |
| 16 | + - extension/llm/export/** |
15 | 17 | workflow_dispatch: |
16 | 18 |
|
17 | 19 | concurrency: |
@@ -59,6 +61,102 @@ jobs: |
59 | 61 | ${CONDA_RUN} python -m unittest backends.apple.metal.tests.test_modules.TestMetalBackendModules |
60 | 62 | echo "::endgroup::" |
61 | 63 |
|
| 64 | + test-metal-qwen35-moe-tiny: |
| 65 | + name: test-metal-qwen35-moe-tiny |
| 66 | + uses: pytorch/test-infra/.github/workflows/macos_job.yml@main |
| 67 | + with: |
| 68 | + runner: macos-m2-stable |
| 69 | + python-version: '3.11' |
| 70 | + submodules: 'recursive' |
| 71 | + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |
| 72 | + timeout: 120 |
| 73 | + script: | |
| 74 | + set -eux |
| 75 | +
|
| 76 | + echo "::group::Setup ExecuTorch" |
| 77 | + PYTHON_EXECUTABLE=python ${CONDA_RUN} EXECUTORCH_BUILD_KERNELS_TORCHAO=1 TORCHAO_BUILD_EXPERIMENTAL_MPS=1 ./install_executorch.sh |
| 78 | + echo "::endgroup::" |
| 79 | +
|
| 80 | + # Isolate Inductor cache per job to prevent PCH conflicts |
| 81 | + export TMPDIR=$(mktemp -d "${RUNNER_TEMP}/tmpdir_XXXXXX") |
| 82 | + export TORCHINDUCTOR_CACHE_DIR=$(mktemp -d "${RUNNER_TEMP}/inductor_cache_XXXXXX") |
| 83 | +
|
| 84 | + echo "::group::Export Qwen 3.5 MoE (tiny model, Metal)" |
| 85 | + ${CONDA_RUN} python -m executorch.examples.models.qwen3_5_moe.export \ |
| 86 | + --tiny-test \ |
| 87 | + --backend metal \ |
| 88 | + --qlinear fpa4w \ |
| 89 | + --output-dir /tmp/qwen35_moe_metal_tiny |
| 90 | + echo "::endgroup::" |
| 91 | +
|
| 92 | + echo "::group::Build Metal runtime and Qwen 3.5 MoE runner" |
| 93 | + ${CONDA_RUN} cmake --workflow --preset llm-release-metal |
| 94 | + cd examples/models/qwen3_5_moe |
| 95 | + ${CONDA_RUN} cmake --workflow --preset qwen3-5-moe-metal |
| 96 | + cd - |
| 97 | + echo "::endgroup::" |
| 98 | +
|
| 99 | + # Create a byte-level tokenizer for the tiny model (vocab_size=256). |
| 100 | + # Maps each byte value to its own token ID so any prompt produces valid IDs. |
| 101 | + ${CONDA_RUN} python - <<'PY' |
| 102 | + import json |
| 103 | + vocab = {chr(i) if 32 <= i < 127 else f'<0x{i:02X}>': i for i in range(256)} |
| 104 | + tokenizer = { |
| 105 | + 'version': '1.0', |
| 106 | + 'model': {'type': 'BPE', 'vocab': vocab, 'merges': []}, |
| 107 | + 'added_tokens': [{'id': i, 'content': chr(i) if 32 <= i < 127 else f'<0x{i:02X}>', 'single_word': False, 'lstrip': False, 'rstrip': False, 'normalized': False, 'special': False} for i in range(256)], |
| 108 | + } |
| 109 | + with open('/tmp/qwen35_moe_metal_tiny/tokenizer.json', 'w') as f: |
| 110 | + json.dump(tokenizer, f) |
| 111 | + print('Created byte-level tokenizer.json') |
| 112 | + PY |
| 113 | +
|
| 114 | + RUNNER=./cmake-out/examples/models/qwen3_5_moe/qwen3_5_moe_runner |
| 115 | + # Patch absolute libomp install name to rpath-based lookup (same as test_model_e2e.sh) |
| 116 | + if otool -L "$RUNNER" | grep -q "/opt/llvm-openmp/lib/libomp.dylib"; then |
| 117 | + install_name_tool -change /opt/llvm-openmp/lib/libomp.dylib @rpath/libomp.dylib "$RUNNER" |
| 118 | + fi |
| 119 | + MODEL=/tmp/qwen35_moe_metal_tiny/model.pte |
| 120 | + TOKENIZER=/tmp/qwen35_moe_metal_tiny/tokenizer.json |
| 121 | +
|
| 122 | + echo "::group::Run Qwen 3.5 MoE inference (T=1 decode)" |
| 123 | + # Single-char prompt → 1 token → exercises decode-only path |
| 124 | + set +e |
| 125 | + OUTPUT=$("$RUNNER" --model_path "$MODEL" --tokenizer_path "$TOKENIZER" \
| 126 | + --prompt "A" --temperature 0 --max_new_tokens 4 2>&1)
| 127 | + RC=$? |
| 128 | + set -e |
| 129 | + echo "$OUTPUT" |
| 130 | + if [ $RC -ne 0 ]; then |
| 131 | + echo "Failed: runner exited with code $RC" |
| 132 | + exit 1 |
| 133 | + fi |
| 134 | + echo "$OUTPUT" | grep -Eq "Prompt tokens: 1([^0-9]|$)" || { echo "Failed: expected 1 prompt token for decode path"; exit 1; }
| 135 | + echo "$OUTPUT" | grep -q "Decode:" || { echo "Failed: decode did not complete"; exit 1; } |
| 136 | + echo "Success: decode completed" |
| 137 | + echo "::endgroup::" |
| 138 | +
|
| 139 | + echo "::group::Run Qwen 3.5 MoE inference (T>2 prefill + decode)" |
| 140 | + set +e |
| 141 | + OUTPUT=$("$RUNNER" --model_path "$MODEL" --tokenizer_path "$TOKENIZER" \
| 142 | + --prompt "one two three" --temperature 0 --max_new_tokens 4 2>&1)
| 143 | + RC=$? |
| 144 | + set -e |
| 145 | + echo "$OUTPUT" |
| 146 | + if [ $RC -ne 0 ]; then |
| 147 | + echo "Failed: runner exited with code $RC" |
| 148 | + exit 1 |
| 149 | + fi |
| 150 | + # Byte-level tokenizer: "one two three" = 13 tokens (13 bytes) |
| 151 | + PROMPT_TOKENS=$(echo "$OUTPUT" | grep -o "Prompt tokens: [0-9]*" | head -1 | grep -o "[0-9]*") |
| 152 | + if [ "${PROMPT_TOKENS:-0}" -le 2 ]; then
| 153 | + echo "Failed: expected >2 prompt tokens for prefill path, got $PROMPT_TOKENS" |
| 154 | + exit 1 |
| 155 | + fi |
| 156 | + echo "$OUTPUT" | grep -q "Decode:" || { echo "Failed: prefill + decode did not complete"; exit 1; } |
| 157 | + echo "Success: prefill ($PROMPT_TOKENS tokens) + decode completed" |
| 158 | + echo "::endgroup::" |
| 159 | +
|
62 | 160 | export-model-metal-artifact: |
63 | 161 | name: export-model-metal-artifact |
64 | 162 | # Skip this job if the pull request is from a fork (HuggingFace secrets are not available) |
|
0 commit comments