Commit 47e5f99 ("up")
1 parent: 92c541a

2 files changed: 39 additions & 2 deletions

examples/models/qwen3_5_moe/README.md

Lines changed: 25 additions & 1 deletion
````diff
@@ -30,6 +30,16 @@ Export produces a `model.pte` and `aoti_cuda_blob.ptd` containing the
 compiled CUDA kernels and quantized weights. Int4 quantization is
 recommended — the model is too large to fit in VRAM at bf16.
 
+```bash
+python export.py \
+  --model-id Qwen/Qwen3.5-35B-A3B \
+  --output-dir ./qwen35_moe_exports \
+  --qlinear 4w \
+  --qembedding 8w
+```
+
+Or with a local directory:
+
 ```bash
 python export.py \
   --model-dir ~/models/Qwen3.5-35B-A3B \
@@ -42,7 +52,8 @@ python export.py \
 
 | Flag | Default | Description |
 |------|---------|-------------|
-| `--model-dir` | (required) | HuggingFace model directory with `config.json` + safetensors |
+| `--model-id` | (none) | HuggingFace model ID (e.g. `Qwen/Qwen3.5-35B-A3B`). Downloads automatically. |
+| `--model-dir` | (none) | Local HuggingFace model directory with `config.json` + safetensors |
 | `--output-dir` | `./qwen35_moe_exports` | Output directory |
 | `--max-seq-len` | `4096` | KV cache length |
 | `--qlinear` | (none) | Linear layer quantization: `4w`, `8w`, `8da4w`, `8da8w` |
@@ -144,6 +155,17 @@ with MLX custom ops (`mlx::gather_qmm`, `mlx::gated_delta_rule`, `mlx::rope`).
 
 ### Export (MLX)
 
+```bash
+python export.py \
+  --model-id Qwen/Qwen3.5-35B-A3B \
+  --backend mlx \
+  --qlinear 4w \
+  --qlinear-group-size 64 \
+  --output-dir ./qwen35_moe_mlx
+```
+
+Or with a local directory:
+
 ```bash
 python export.py \
   --model-dir ~/models/Qwen3.5-35B-A3B \
@@ -158,6 +180,8 @@ python export.py \
 | Flag | Default | Description |
 |------|---------|-------------|
 | `--backend mlx` | `cuda` | Use MLX backend for Apple Silicon |
+| `--model-id` | (none) | HuggingFace model ID (downloads automatically) |
+| `--model-dir` | (none) | Local model directory |
 | `--qlinear` | (none) | Linear layer quantization: `4w`, `8w` |
 | `--qlinear-group-size` | `32` | Group size (64 recommended for MLX) |
 | `--qembedding` | (none) | Embedding quantization: `8w` |
````

examples/models/qwen3_5_moe/export.py

Lines changed: 14 additions & 1 deletion
```diff
@@ -4,10 +4,11 @@
 Supports CUDA and MLX backends.
 
 Usage:
+    python export.py --model-id Qwen/Qwen3.5-35B-A3B
     python export.py --model-dir /path/to/Qwen3.5-MoE-A3B
     python export.py --model-dir /path/to/model --qlinear 4w
     python export.py --prequantized /path/to/quantized_bundle/
-    python export.py --model-dir /path/to/model --backend mlx --qlinear 4w
+    python export.py --model-id Qwen/Qwen3.5-35B-A3B --backend mlx --qlinear 4w
 """
 
 import argparse
@@ -673,6 +674,11 @@ def main():
         default=None,
         help="HuggingFace model directory (not needed with --prequantized)",
     )
+    parser.add_argument(
+        "--model-id",
+        default=None,
+        help="HuggingFace model ID (e.g. Qwen/Qwen3.5-35B-A3B); downloaded automatically",
+    )
     parser.add_argument(
         "--output-dir", default="./qwen35_moe_exports", help="Output directory"
     )
@@ -731,6 +737,13 @@ def main():
     )
     args = parser.parse_args()
 
+    if args.model_id:
+        if args.model_dir is not None:
+            raise ValueError("Cannot specify --model-dir when --model-id is provided.")
+        from huggingface_hub import snapshot_download
+
+        args.model_dir = snapshot_download(repo_id=args.model_id)
+
     if not args.prequantized and not args.model_dir and not args.tiny_test:
         parser.error(
             "--model-dir is required unless --prequantized or --tiny-test is provided."
```
