Update on "[ET Device Support] CUDA-native Qwen 3.5 MoE inference with device tensor pipeline"

Gasoonjia · Gasoonjia · commit 5863ffa8e5f3 · 2026-04-09T09:13:31.000-07:00
Integrate the ET device tensor pipeline into the Qwen 3.5 MoE model to eliminate unnecessary H2D/D2H copies during inference.

- Export: Multi-method export (`forward` + `sample`) with device memory planning enabled and method-level H2D/D2H skipping.
- Runner: Custom CUDA-native inference loop that keeps logits on the GPU between `forward` and `sample`, reuses CUDA tensors across iterations, and copies only the 8-byte token ID back to the CPU for EOS checking.

Differential Revision: [D100133933](https://our.internmc.facebook.com/intern/diff/D100133933/)

[ghstack-poisoned]
diff --git a/examples/models/qwen3_5_moe/export.py b/examples/models/qwen3_5_moe/export.py
@@ -12,7 +12,12 @@
 
 import torch
 import torch.nn as nn
-from model import FusedMoEExperts, Qwen35MoE, Qwen35MoEConfig
+
+from executorch.examples.models.qwen3_5_moe.model import (
+    FusedMoEExperts,
+    Qwen35MoE,
+    Qwen35MoEConfig,
+)
 
 
 # ---------------------------------------------------------------------------
@@ -56,7 +61,9 @@ def load_prequantized_model(prequantized_dir, max_seq_len=4096):
     Returns:
         (model, config) ready for export.
     """
-    from quantize_and_save import load_quantized_state_dict
+    from executorch.examples.models.qwen3_5_moe.quantize_and_save import (
+        load_quantized_state_dict,
+    )
 
     config_path = os.path.join(prequantized_dir, "config.json")
     safetensors_path = os.path.join(prequantized_dir, "model.safetensors")
@@ -373,6 +380,7 @@ def _apply_turboquant(model, config):
 def export_and_lower(model, config, args):
     """Export model to .pte via torch.export + CUDA backend."""
     import torch._inductor.config as inductor_config
+
     from executorch.backends.cuda.cuda_backend import CudaBackend
     from executorch.backends.cuda.cuda_partitioner import CudaPartitioner
     from executorch.exir import (
diff --git a/examples/models/qwen3_5_moe/model.py b/examples/models/qwen3_5_moe/model.py
@@ -20,7 +20,7 @@
 
 import torch
 import torch.nn as nn
-import torch.nn.functional as F
+from torch.nn import functional as F
 
 
 # ---------------------------------------------------------------------------