Skip to content

Commit e30ef24

Browse files
intel_gpu: fix Qwen3 MoE GEMM3_SWIGLU on MTL-class (non-systolic) iGPU
The FuseVectorizedMOE3GEMM pass was previously gated on supports_immad, causing it to be skipped on MTL-class iGPU (12.70.x, XeHPG, no DPAS). This left raw FP32 weight-decompression chains that overwhelmed propagate_constants with ~56 GB of constant-folding memory. Root cause of the inference failure: moe_3gemm_swiglu_opt uses oneDNN internally (onednn_linear for the gate/up/down matrix multiplications), and oneDNN requires an in-order OCL queue; MTL, however, uses an out-of-order queue by default because use_onednn is false when supports_immad=false. Fix: all three MoE transformation passes (FuseVectorizedMOE3GEMM, ConvertMOEToMOECompressed, FuseMOE3GemmCompressed) now run on all architectures. FuseMOE3GemmCompressed creates MOE3GemmFusedCompressed, which the OCL moe_3gemm_swiglu_opt kernel executes. - Detect MOE3GemmFusedCompressed in apply_model_specific_options and force use_onednn=true so that finalize_impl sets queue_type=in_order, satisfying the oneDNN in-order-queue requirement. - Fix moe_gather validate_impl to accept rank-2 input for models where the batch dimension is pre-flattened (Qwen3-style). - Re-apply the iGPU transfer skip (usm_shared -> usm_device) in network.cpp and program.cpp for integrated GPUs where both allocation types share system DRAM (xe2+ or 12.7x-class MTL/ARL-S). Tested on DUT1486ARLHx (12.70.4 / XeHPG / 64 GB): the model loads in 14 s, generates meaningful tokens, and Unevictable stays below 120 MB. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
1 parent 2f873a5 commit e30ef24

6 files changed

Lines changed: 45 additions & 40 deletions

File tree

src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_gather.hpp

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,11 @@ struct MoeGatherRef : public ImplementationManager {
3636
const auto& out_layout = node.get_output_layout(0);
3737
const auto& input_pshapes = in0_layout.get_partial_shape();
3838

39-
if (input_pshapes.rank() != 3 || input_pshapes[2].is_dynamic()) {
39+
// Accept rank-2 [tokens, hidden] (Qwen3-style, batch already flattened)
40+
// and rank-3 [batch, tokens, hidden]. The kernel only needs the last
41+
// dimension (hidden_size) to be static.
42+
const auto input_rank = input_pshapes.rank().get_length();
43+
if ((input_rank != 2 && input_rank != 3) || input_pshapes[input_rank - 1].is_dynamic()) {
4044
return false;
4145
}
4246

src/plugins/intel_gpu/src/graph/network.cpp

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1085,11 +1085,15 @@ void network::transfer_memory_to_device(std::shared_ptr<primitive_inst> instance
10851085
return;
10861086

10871087
if (alloc_type == allocation_type::usm_host || alloc_type == allocation_type::usm_shared) {
1088-
// usm_device memory does not provide performance benefits on the integrated Xe2+ platforms
1089-
if (get_engine().get_device_info().arch >= gpu_arch::xe2 &&
1090-
get_engine().get_device_info().dev_type == device_type::integrated_gpu) {
1088+
const auto& dev_info = get_engine().get_device_info();
1089+
const bool skip_transfer_on_igpu = dev_info.dev_type == device_type::integrated_gpu &&
1090+
(dev_info.arch >= gpu_arch::xe2 ||
1091+
(dev_info.gfx_ver.major == 12 && dev_info.gfx_ver.minor < 73));
1092+
// On MTL-class and xe2+ integrated GPUs, usm_shared and usm_device
1093+
// live in the same DRAM. Copying constant storage only inflates
1094+
// pinned memory without a corresponding benefit.
1095+
if (skip_transfer_on_igpu)
10911096
return;
1092-
}
10931097

10941098
// Allocate and transfer memory
10951099
auto device_mem = inst_mem.get_engine()->allocate_memory(inst_mem.get_layout(), allocation_type::usm_device, false);

src/plugins/intel_gpu/src/graph/program.cpp

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -734,10 +734,15 @@ void program::transfer_memory_to_device() {
734734
continue;
735735

736736
allocation_type target_alloc_type = alloc_type;
737-
// usm_device memory does not provide performance benefits on the LNL platform
737+
const auto& dev_info = get_engine().get_device_info();
738+
const bool skip_transfer_on_igpu = dev_info.dev_type == device_type::integrated_gpu &&
739+
(dev_info.arch >= gpu_arch::xe2 ||
740+
(dev_info.gfx_ver.major == 12 && dev_info.gfx_ver.minor < 73));
741+
// On MTL-class and xe2+ integrated GPUs, usm_shared and usm_device
742+
// live in the same DRAM. Copying constant storage only inflates
743+
// pinned memory without a corresponding benefit.
738744
if ((alloc_type == allocation_type::usm_host || alloc_type == allocation_type::usm_shared) &&
739-
!(get_engine().get_device_info().arch >= gpu_arch::xe2 &&
740-
get_engine().get_device_info().dev_type == device_type::integrated_gpu)) {
745+
!skip_transfer_on_igpu) {
741746
// Convert to usm_device for performance optimization
742747
target_alloc_type = allocation_type::usm_device;
743748
}

src/plugins/intel_gpu/src/plugin/ops/moe.cpp

Lines changed: 5 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33
//
44
#include "openvino/op/moe.hpp"
55

6-
#include <intel_gpu/primitives/eltwise.hpp>
76
#include <intel_gpu/primitives/moe_gather.hpp>
87
#include <intel_gpu/primitives/moe_scatter_reduction.hpp>
98
#include <intel_gpu/primitives/swiglu.hpp>
@@ -98,31 +97,11 @@ static void CreateMOECompressedOp(ProgramBuilder& p, const std::shared_ptr<ov::o
9897
input_infos.push_back(cldnn::input_info(input));
9998
}
10099
if (config.expert_type == ov::op::internal::MOE::Expert_type::GEMM3_SWIGLU) {
101-
// Create GEMM3_SWIGLU specific primitives
102-
// 0: hidden_states - input tensor with hidden representations
103-
// 1: routing_weights - [num_experts, ...] normalized weights for selected experts
104-
// (input to final multiplication)
105-
// 2: router_topk_output_indices - [..., topk] indices of selected top-k experts
106-
// 3: w0_weight - expert weights for first projection,
107-
// shape [num_experts, inter_size, group_num, group_size]
108-
// 4: w0_scale - expert scale for first projection for compressed experts,
109-
// shape [num_experts, inter_size, group_num, 1]
110-
// 5: w0_zp - expert zp for first projection for compressed experts,
111-
// shape [num_experts, inter_size, group_num, 1]
112-
// 6: w1_weight - expert weights for second projection,
113-
// shape [num_experts, inter_size, group_num, group_size]
114-
// 7: w1_scale - expert scale for second projection for compressed experts,
115-
// shape [num_experts, inter_size, group_num, 1]
116-
// 8: w1_zp - expert zp for second projection for compressed experts,
117-
// shape [num_experts, inter_size, group_num, 1]
118-
// 9: w2_weight - expert weights for final projection,
119-
// shape [num_experts, hidden_size, group_num, group_size]
120-
// 10: w2_scale - expert scale for final projection for compressed experts,
121-
// shape [num_experts, hidden_size, group_num, 1]
122-
// 11: w2_zp - expert zp for final projection for compressed experts,
123-
// shape [num_experts, hidden_size, group_num, 1]
124-
125-
// Use moe_3gemm_fused_compressed to replace it.
100+
// GEMM3_SWIGLU (Qwen3-style MoE) should be handled by FuseMOE3GemmCompressed
101+
// which converts MOECompressed(GEMM3_SWIGLU) → MOE3GemmFusedCompressed executed
102+
// by the OCL moe_3gemm_swiglu_opt kernel on all architectures. If execution
103+
// reaches here the transformation pipeline is misconfigured.
104+
OPENVINO_THROW("[GPU] MOECompressed(GEMM3_SWIGLU) must be handled by FuseMOE3GemmCompressed before program build");
126105
} else {
127106
// Create GEMM2_BIAS_SWIGLU_CLAMP specific primitives
128107
// input0 : input {#tokens, hidden_size}

src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -487,13 +487,17 @@ void TransformationsPipeline::apply(std::shared_ptr<ov::Model> func) {
487487
return (info.arch != cldnn::gpu_arch::xe2) && (info.arch != cldnn::gpu_arch::xe3);
488488
});
489489

490+
// FuseVectorizedMOE3GEMM converts the original vectorized MoE graph
491+
// (separate MatMul + scatter/gather ops) into MOE(GEMM3_SWIGLU) with
492+
// packed INT4 weights. This structural conversion must run on ALL
493+
// architectures so that ConvertMOEToMOECompressed can match the INT4
494+
// constants downstream. Without it the raw FP32 decompression chains
495+
// reach propagate_constants and cause OOM on MTL-class iGPU.
496+
//
497+
// FuseMOE3GemmCompressed converts MOECompressed(GEMM3_SWIGLU) into
498+
// MOE3GemmFusedCompressed, executed by the OCL moe_3gemm_swiglu_opt
499+
// kernel on all architectures including non-systolic (MTL-class) iGPU.
490500
manager.register_pass<ov::pass::FuseVectorizedMOE3GEMM>();
491-
pass_config->set_callback<ov::pass::FuseVectorizedMOE3GEMM>([&](const_node_ptr& root) -> bool {
492-
// Currently moe gemm3 is only supported by systolic-array architectures
493-
auto& engine = m_context->get_engine();
494-
const auto& info = engine.get_device_info();
495-
return (!info.supports_immad);
496-
});
497501

498502
bool is_pa = false;
499503
for (const auto& op : func->get_ops()) {

src/plugins/intel_gpu/src/runtime/execution_config.cpp

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66

77
#include "intel_gpu/op/indirect_sdpa.hpp"
88
#include "intel_gpu/op/kv_cache.hpp"
9+
#include "intel_gpu/op/moe_3gemm_fused_compressed.hpp"
910
#include "intel_gpu/op/sdpa.hpp"
1011
#include "intel_gpu/plugin/remote_context.hpp"
1112
#include "intel_gpu/primitives/paged_attention.hpp"
@@ -215,6 +216,14 @@ void ExecutionConfig::apply_model_specific_options(const IRemoteContext* context
215216
m_use_onednn = true;
216217
}
217218

219+
// moe_3gemm_fused_compressed uses oneDNN internally for matrix multiplications,
220+
// so it requires an in-order queue. Force use_onednn=true here so that
221+
// finalize_impl will set queue_type=in_order regardless of whether the
222+
// hardware supports systolic (supports_immad).
223+
if (ov::is_type<ov::intel_gpu::op::MOE3GemmFusedCompressed>(op)) {
224+
m_use_onednn = true;
225+
}
226+
218227
if (auto multi_subgraph_op = ov::as_type_ptr<ov::op::util::MultiSubGraphOp>(op)) {
219228
for (const auto& sub_graph : multi_subgraph_op->get_functions()) {
220229
for (auto& sub_op : sub_graph->get_ops()) {

0 commit comments

Comments
 (0)