|
3 | 3 | // |
4 | 4 | #include "openvino/op/moe.hpp" |
5 | 5 |
|
6 | | -#include <intel_gpu/primitives/eltwise.hpp> |
7 | 6 | #include <intel_gpu/primitives/moe_gather.hpp> |
8 | 7 | #include <intel_gpu/primitives/moe_scatter_reduction.hpp> |
9 | 8 | #include <intel_gpu/primitives/swiglu.hpp> |
@@ -98,31 +97,11 @@ static void CreateMOECompressedOp(ProgramBuilder& p, const std::shared_ptr<ov::o |
98 | 97 | input_infos.push_back(cldnn::input_info(input)); |
99 | 98 | } |
100 | 99 | if (config.expert_type == ov::op::internal::MOE::Expert_type::GEMM3_SWIGLU) { |
101 | | - // Create GEMM3_SWIGLU specific primitives |
102 | | - // 0: hidden_states - input tensor with hidden representations |
103 | | - // 1: routing_weights - [num_experts, ...] normalized weights for selected experts |
104 | | - // (input to final multiplication) |
105 | | - // 2: router_topk_output_indices - [..., topk] indices of selected top-k experts |
106 | | - // 3: w0_weight - expert weights for first projection, |
107 | | - // shape [num_experts, inter_size, group_num, group_size] |
108 | | - // 4: w0_scale - expert scale for first projection for compressed experts, |
109 | | - // shape [num_experts, inter_size, group_num, 1] |
110 | | - // 5: w0_zp - expert zp for first projection for compressed experts, |
111 | | - // shape [num_experts, inter_size, group_num, 1] |
112 | | - // 6: w1_weight - expert weights for second projection, |
113 | | - // shape [num_experts, inter_size, group_num, group_size] |
114 | | - // 7: w1_scale - expert scale for second projection for compressed experts, |
115 | | - // shape [num_experts, inter_size, group_num, 1] |
116 | | - // 8: w1_zp - expert zp for second projection for compressed experts, |
117 | | - // shape [num_experts, inter_size, group_num, 1] |
118 | | - // 9: w2_weight - expert weights for final projection, |
119 | | - // shape [num_experts, hidden_size, group_num, group_size] |
120 | | - // 10: w2_scale - expert scale for final projection for compressed experts, |
121 | | - // shape [num_experts, hidden_size, group_num, 1] |
122 | | - // 11: w2_zp - expert zp for final projection for compressed experts, |
123 | | - // shape [num_experts, hidden_size, group_num, 1] |
124 | | - |
125 | | - // Use moe_3gemm_fused_compressed to replace it. |
| 100 | + // GEMM3_SWIGLU (Qwen3-style MoE) should be handled by FuseMOE3GemmCompressed |
| 101 | + // which converts MOECompressed(GEMM3_SWIGLU) → MOE3GemmFusedCompressed executed |
| 102 | + // by the OCL moe_3gemm_swiglu_opt kernel on all architectures. If execution |
| 103 | + // reaches here, the transformation pipeline is misconfigured. |
| 104 | + OPENVINO_THROW("[GPU] MOECompressed(GEMM3_SWIGLU) must be handled by FuseMOE3GemmCompressed before program build"); |
126 | 105 | } else { |
127 | 106 | // Create GEMM2_BIAS_SWIGLU_CLAMP specific primitives |
128 | 107 | // input0 : input {#tokens, hidden_size} |
|
0 commit comments