
Commit 7fdd306

Add Qwen 3.6 MoE model and switch CI to Qwen3.6-35B-A3B-HQQ-INT4 (pytorch#18955)
Qwen 3.6 MoE shares architecture and runner with Qwen 3.5 MoE. Add a stub README pointing to the existing qwen3_5_moe example. Update CI scripts and cuda.yml to use the Qwen 3.6 prequantized checkpoint. Improve qwen3_5_moe README: add quick-start section for prequantized weights, list available prequantized checkpoints, and clean up terminology.
Parent commit: 3998693

5 files changed: 48 additions & 19 deletions

`.ci/scripts/export_model_artifact.sh` (2 additions, 2 deletions)

```diff
@@ -184,7 +184,7 @@ case "$HF_MODEL" in
     PREPROCESSOR_FEATURE_SIZE=""
     PREPROCESSOR_OUTPUT=""
     ;;
-  SocialLocalMobile/Qwen3.5-35B-A3B-HQQ-INT4)
+  SocialLocalMobile/Qwen3.6-35B-A3B-HQQ-INT4)
     MODEL_NAME="qwen3_5_moe"
     TASK=""
     MAX_SEQ_LEN=""
@@ -194,7 +194,7 @@ case "$HF_MODEL" in
     ;;
   *)
     echo "Error: Unsupported model '$HF_MODEL'"
-    echo "Supported models: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, openai/whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}, google/gemma-3-4b-it, Qwen/Qwen3-0.6B, nvidia/diar_streaming_sortformer_4spk-v2, nvidia/parakeet-tdt, facebook/dinov2-small-imagenet1k-1-layer, SocialLocalMobile/Qwen3.5-35B-A3B-HQQ-INT4"
+    echo "Supported models: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, openai/whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}, google/gemma-3-4b-it, Qwen/Qwen3-0.6B, nvidia/diar_streaming_sortformer_4spk-v2, nvidia/parakeet-tdt, facebook/dinov2-small-imagenet1k-1-layer, SocialLocalMobile/Qwen3.6-35B-A3B-HQQ-INT4"
     exit 1
     ;;
 esac
```

`.ci/scripts/test_model_e2e.sh` (2 additions, 2 deletions)

```diff
@@ -216,7 +216,7 @@ case "$HF_MODEL" in
     AUDIO_FILE="test_audio.wav"
     IMAGE_PATH=""
     ;;
-  SocialLocalMobile/Qwen3.5-35B-A3B-HQQ-INT4)
+  SocialLocalMobile/Qwen3.6-35B-A3B-HQQ-INT4)
     MODEL_NAME="qwen3_5_moe"
     RUNNER_TARGET="qwen3_5_moe_runner"
     RUNNER_PATH="qwen3_5_moe"
@@ -230,7 +230,7 @@ case "$HF_MODEL" in
     ;;
   *)
     echo "Error: Unsupported model '$HF_MODEL'"
-    echo "Supported models: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, nvidia/diar_streaming_sortformer_4spk-v2, openai/whisper series (whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}), google/gemma-3-4b-it, Qwen/Qwen3-0.6B, nvidia/parakeet-tdt, facebook/dinov2-small-imagenet1k-1-layer, SocialLocalMobile/Qwen3.5-35B-A3B-HQQ-INT4"
+    echo "Supported models: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, nvidia/diar_streaming_sortformer_4spk-v2, openai/whisper series (whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}), google/gemma-3-4b-it, Qwen/Qwen3-0.6B, nvidia/parakeet-tdt, facebook/dinov2-small-imagenet1k-1-layer, SocialLocalMobile/Qwen3.6-35B-A3B-HQQ-INT4"
     exit 1
     ;;
 esac
```
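Both CI scripts dispatch on `$HF_MODEL` with a shell `case` statement, as the hunks above show. A minimal standalone sketch of that pattern (only the Qwen 3.6 branch is taken from the diff; the real scripts set many more variables per model):

```shell
#!/bin/sh
# Map a Hugging Face model id to the local example name, mirroring the
# case-dispatch pattern in the CI scripts above (condensed sketch).
HF_MODEL="${1:-SocialLocalMobile/Qwen3.6-35B-A3B-HQQ-INT4}"

case "$HF_MODEL" in
  SocialLocalMobile/Qwen3.6-35B-A3B-HQQ-INT4)
    # Qwen 3.6 MoE reuses the Qwen 3.5 MoE export path and runner.
    MODEL_NAME="qwen3_5_moe"
    ;;
  *)
    # Unknown ids fail fast, as in both scripts.
    echo "Error: Unsupported model '$HF_MODEL'" >&2
    exit 1
    ;;
esac

echo "$MODEL_NAME"
```

Keeping the supported-model list and the error message in one `case` per script means adding a model is a two-line change per file, which is exactly what this commit does.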

`.github/workflows/cuda.yml` (8 additions, 8 deletions)

```diff
@@ -180,7 +180,7 @@ jobs:
           - repo: "facebook"
             name: "dinov2-small-imagenet1k-1-layer"
           - repo: "SocialLocalMobile"
-            name: "Qwen3.5-35B-A3B-HQQ-INT4"
+            name: "Qwen3.6-35B-A3B-HQQ-INT4"
         quant:
           - "non-quantized"
           - "quantized-int4-tile-packed"
@@ -194,11 +194,11 @@ jobs:
           # Qwen3.5 MoE uses a prequantized checkpoint, only tile-packed
           - model:
               repo: "SocialLocalMobile"
-              name: "Qwen3.5-35B-A3B-HQQ-INT4"
+              name: "Qwen3.6-35B-A3B-HQQ-INT4"
             quant: "non-quantized"
           - model:
               repo: "SocialLocalMobile"
-              name: "Qwen3.5-35B-A3B-HQQ-INT4"
+              name: "Qwen3.6-35B-A3B-HQQ-INT4"
             quant: "quantized-int4-weight-only"
           # Voxtral Realtime only supports int4-tile-packed on CUDA
           - model:
@@ -254,7 +254,7 @@ jobs:
     with:
       timeout: 90
       secrets-env: EXECUTORCH_HF_TOKEN
-      runner: ${{ matrix.model.name == 'Qwen3.5-35B-A3B-HQQ-INT4' && 'linux.aws.a100' || 'linux.g5.4xlarge.nvidia.gpu' }}
+      runner: ${{ matrix.model.name == 'Qwen3.6-35B-A3B-HQQ-INT4' && 'linux.aws.a100' || 'linux.g5.4xlarge.nvidia.gpu' }}
       gpu-arch-type: cuda
       gpu-arch-version: 12.6
       use-custom-docker-registry: false
@@ -310,7 +310,7 @@ jobs:
           - repo: "facebook"
             name: "dinov2-small-imagenet1k-1-layer"
           - repo: "SocialLocalMobile"
-            name: "Qwen3.5-35B-A3B-HQQ-INT4"
+            name: "Qwen3.6-35B-A3B-HQQ-INT4"
         quant:
           - "non-quantized"
           - "quantized-int4-tile-packed"
@@ -324,11 +324,11 @@ jobs:
           # Qwen3.5 MoE uses a prequantized checkpoint, only tile-packed
           - model:
               repo: "SocialLocalMobile"
-              name: "Qwen3.5-35B-A3B-HQQ-INT4"
+              name: "Qwen3.6-35B-A3B-HQQ-INT4"
             quant: "non-quantized"
           - model:
               repo: "SocialLocalMobile"
-              name: "Qwen3.5-35B-A3B-HQQ-INT4"
+              name: "Qwen3.6-35B-A3B-HQQ-INT4"
             quant: "quantized-int4-weight-only"
           # Voxtral Realtime only supports int4-tile-packed on CUDA
           - model:
@@ -378,7 +378,7 @@ jobs:
             quant: "non-quantized"
     with:
       timeout: 90
-      runner: ${{ matrix.model.name == 'Qwen3.5-35B-A3B-HQQ-INT4' && 'linux.aws.a100' || 'linux.g5.4xlarge.nvidia.gpu' }}
+      runner: ${{ matrix.model.name == 'Qwen3.6-35B-A3B-HQQ-INT4' && 'linux.aws.a100' || 'linux.g5.4xlarge.nvidia.gpu' }}
       gpu-arch-type: cuda
       gpu-arch-version: 12.6
       use-custom-docker-registry: false
```
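The `runner:` lines in this workflow use the GitHub Actions expression idiom `cond && a || b`, which behaves like a ternary as long as `a` is truthy (here it is, a non-empty string literal). A minimal sketch of the pattern with an illustrative matrix (the job name and second matrix entry are assumptions, not taken from this workflow; the runner labels match the diff above):

```yaml
jobs:
  test-model:   # illustrative job name
    strategy:
      matrix:
        model:
          - { repo: "SocialLocalMobile", name: "Qwen3.6-35B-A3B-HQQ-INT4" }
          - { repo: "Qwen", name: "Qwen3-0.6B" }
    # Route only the large MoE checkpoint to an A100 runner;
    # everything else stays on the default GPU runner.
    runs-on: ${{ matrix.model.name == 'Qwen3.6-35B-A3B-HQQ-INT4' && 'linux.aws.a100' || 'linux.g5.4xlarge.nvidia.gpu' }}
    steps:
      - run: echo "Testing ${{ matrix.model.repo }}/${{ matrix.model.name }}"
```

Because the selector keys on the model name, renaming the checkpoint (3.5 to 3.6) requires updating both `runner:` expressions, which is why this commit touches them in two places.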

`examples/models/qwen3_5_moe/README.md` (25 additions, 7 deletions)

````diff
@@ -30,6 +30,24 @@ Export produces a `model.pte` and `aoti_cuda_blob.ptd` containing the
 compiled CUDA kernels and quantized weights. Int4 quantization is
 recommended — the model is too large to fit in VRAM at bf16.
 
+### Quick start: prequantized weights
+
+The fastest path is to export from prequantized weights, which skips
+the slow quantization step entirely.
+
+Prequantized checkpoints are available for download:
+- [SocialLocalMobile/Qwen3.5-35B-A3B-HQQ-INT4](https://huggingface.co/SocialLocalMobile/Qwen3.5-35B-A3B-HQQ-INT4)
+- [SocialLocalMobile/Qwen3.6-35B-A3B-HQQ-INT4](https://huggingface.co/SocialLocalMobile/Qwen3.6-35B-A3B-HQQ-INT4)
+
+```bash
+python export.py --prequantized <path-to-bundle>
+```
+
+See [Generating Prequantized Weights](#generating-prequantized-weights)
+to create your own.
+
+### Quantize and Export
+
 ```bash
 python export.py \
   --model-id Qwen/Qwen3.5-35B-A3B \
@@ -60,7 +78,7 @@ python export.py \
 | `--qlinear-group-size` | `32` | Group size for linear quantization |
 | `--qembedding` | (none) | Embedding quantization: `8w` |
 | `--hqq` | off | Use HQQ scale-only optimization for expert quantization (slower, better accuracy) |
-| `--prequantized` | (none) | Path to prequantized bundle directory (skips quantization) |
+| `--prequantized` | (none) | Path to prequantized checkpoint directory (skips quantization) |
 | `--turboquant` | off | Enable TurboQuant TQ4 KV cache compression (3.8x cache savings) |
 
 ### TurboQuant KV Cache Compression
@@ -72,11 +90,11 @@ KV cache compression (3.8x savings) on the 10 full-attention layers.
 python export.py --prequantized qwen35_moe_int4_hqq --turboquant
 ```
 
-### Prequantized Export
+### Generating Prequantized Weights
 
 Quantization is slow (~30 min with HQQ). To avoid re-quantizing on every
-export, use `quantize_and_save.py` to create a self-contained bundle, then
-export from it:
+export, use `quantize_and_save.py` to create a prequantized checkpoint
+directory, then export from it:
 
 ```bash
 # Step 1: Quantize once (slow)
@@ -88,13 +106,13 @@ python quantize_and_save.py \
   --hqq \
   --output qwen35_moe_int4_hqq
 
-# Step 2: Export from bundle (fast, no --model-dir needed)
+# Step 2: Export from prequantized checkpoint (fast, no --model-dir needed)
 python export.py \
   --prequantized qwen35_moe_int4_hqq
 ```
 
-The bundle contains `model.safetensors`, `config.json`, and tokenizer files.
-It can be uploaded to HuggingFace Hub for easy sharing.
+The output directory contains `model.safetensors`, `config.json`, and
+tokenizer files. It can be uploaded to HuggingFace Hub for easy sharing.
 
 ## Build
 
````

New file: stub README for Qwen 3.6 MoE (11 additions, 0 deletions)

```diff
@@ -0,0 +1,11 @@
+# Qwen 3.6 MoE
+
+Qwen 3.6 MoE uses the same architecture and runner as Qwen 3.5 MoE.
+See [examples/models/qwen3_5_moe](../qwen3_5_moe/) for export, build,
+and inference instructions.
+
+Prequantized INT4 weights are available at
+[SocialLocalMobile/Qwen3.6-35B-A3B-HQQ-INT4](https://huggingface.co/SocialLocalMobile/Qwen3.6-35B-A3B-HQQ-INT4).
+
+**Note:** This model has not been tested or evaluated. It is provided
+mainly for development purposes.
```
