dphnAI
diff --git a/‎aphrodite/_custom_ops.py‎
Lines changed: 12 additions & 0 deletions b/‎aphrodite/_custom_ops.py‎
Lines changed: 12 additions & 0 deletions
diff --git a/‎aphrodite/model_executor/layers/quantization/exl3.py‎
Lines changed: 149 additions & 79 deletions b/‎aphrodite/model_executor/layers/quantization/exl3.py‎
Lines changed: 149 additions & 79 deletions
@@ -860,6 +860,18 @@ def silu_and_mul_per_block_quant(
     return output, scales
 
 
+def silu_mul(out: torch.Tensor, gate: torch.Tensor, up: torch.Tensor) -> None:
+    torch.ops._C.silu_mul(out, gate, up)
+
+
+def make_gate_up_indices(
+    out: torch.Tensor,
+    indices: torch.Tensor,
+    offset: int,
+) -> None:
+    torch.ops._C.make_gate_up_indices(out, indices, offset)
+
+
 # quantization ops
 # awq
 def awq_dequantize(
 
@@ -1054,11 +1054,6 @@ def apply(
         else:
             x_2d = x_2d.contiguous()
 
-        output = torch.zeros(
-            (x_2d.shape[0], layer.hidden_size),
-            device=x_2d.device,
-            dtype=torch.float32,
-        )
         topk_ids = topk_ids.to(torch.long)
         topk_weights = topk_weights.to(torch.float16)
         total_assignments = x_2d.shape[0] * topk_ids.shape[-1]
@@ -1083,6 +1078,11 @@ def apply(
                 x.shape[:-1],
             )
 
+        output = torch.zeros(
+            (x_2d.shape[0], layer.hidden_size),
+            device=x_2d.device,
+            dtype=torch.float32,
+        )
         flat_expert = topk_ids.reshape(-1)
         flat_weight = topk_weights.reshape(-1)
         flat_token = torch.arange(x_2d.shape[0], device=x_2d.device)
@@ -1195,41 +1195,69 @@ def _apply_single_token(
     ) -> torch.Tensor:
         x_3d = x_2d.unsqueeze(0)
 
-        ops.exl3_mgemm(
-            x_3d,
-            layer.exl3_gate_ptrs_trellis,
+        if layer.exl3_fuse_gate_up:
+            ops.make_gate_up_indices(
+                layer.exl3_small_gate_up_ids,
+                topk_ids,
+                layer.local_num_experts,
+            )
+            ops.exl3_mgemm(
+                x_3d,
+                layer.exl3_gate_up_ptrs_trellis,
+                layer.exl3_small_interm_gu,
+                layer.exl3_gate_up_ptrs_suh,
+                layer.exl3_small_yh_gu,
+                layer.exl3_gate_up_ptrs_svh,
+                layer.exl3_small_gate_up_ids,
+                None,
+                layer.exl3_moe_k_gate,
+                -1,
+                layer.exl3_gate_mcg,
+                layer.exl3_gate_mul1,
+                -1,
+                -1,
+                0,
+            )
+        else:
+            ops.exl3_mgemm(
+                x_3d,
+                layer.exl3_gate_ptrs_trellis,
+                layer.exl3_small_interm_g,
+                layer.exl3_gate_ptrs_suh,
+                layer.exl3_small_yh,
+                layer.exl3_gate_ptrs_svh,
+                topk_ids,
+                None,
+                layer.exl3_moe_k_gate,
+                -1,
+                layer.exl3_gate_mcg,
+                layer.exl3_gate_mul1,
+                -1,
+                -1,
+                0,
+            )
+            ops.exl3_mgemm(
+                x_3d,
+                layer.exl3_up_ptrs_trellis,
+                layer.exl3_small_interm_u,
+                layer.exl3_up_ptrs_suh,
+                layer.exl3_small_yh,
+                layer.exl3_up_ptrs_svh,
+                topk_ids,
+                None,
+                layer.exl3_moe_k_up,
+                -1,
+                layer.exl3_up_mcg,
+                layer.exl3_up_mul1,
+                -1,
+                -1,
+                0,
+            )
+        ops.silu_mul(
+            layer.exl3_small_interm_a,
             layer.exl3_small_interm_g,
-            layer.exl3_gate_ptrs_suh,
-            layer.exl3_small_yh,
-            layer.exl3_gate_ptrs_svh,
-            topk_ids,
-            None,
-            layer.exl3_moe_k_gate,
-            -1,
-            layer.exl3_gate_mcg,
-            layer.exl3_gate_mul1,
-            -1,
-            -1,
-            0,
-        )
-        ops.exl3_mgemm(
-            x_3d,
-            layer.exl3_up_ptrs_trellis,
             layer.exl3_small_interm_u,
-            layer.exl3_up_ptrs_suh,
-            layer.exl3_small_yh,
-            layer.exl3_up_ptrs_svh,
-            topk_ids,
-            None,
-            layer.exl3_moe_k_up,
-            -1,
-            layer.exl3_up_mcg,
-            layer.exl3_up_mul1,
-            -1,
-            -1,
-            0,
         )
-        layer.exl3_small_interm_a.copy_(torch.nn.functional.silu(layer.exl3_small_interm_g) * layer.exl3_small_interm_u)
         ops.exl3_mgemm(
             layer.exl3_small_interm_a,
             layer.exl3_down_ptrs_trellis,
@@ -1261,52 +1289,74 @@ def _apply_small_batch(
         original_dtype: torch.dtype,
         original_shape: tuple[int, ...],
     ) -> torch.Tensor:
-        output = torch.empty(
-            (x_2d.shape[0], layer.hidden_size),
-            device=x_2d.device,
-            dtype=torch.float32,
-        )
+        output = layer.exl3_small_batch_out[: x_2d.shape[0]]
         x_3d = x_2d.unsqueeze(1).unsqueeze(1)
         topk_ids_3d = topk_ids.unsqueeze(1)
         topk_weights_3d = topk_weights.unsqueeze(1)
 
         for i in range(x_2d.shape[0]):
-            ops.exl3_mgemm(
-                x_3d[i],
-                layer.exl3_gate_ptrs_trellis,
+            if layer.exl3_fuse_gate_up:
+                ops.make_gate_up_indices(
+                    layer.exl3_small_gate_up_ids,
+                    topk_ids_3d[i],
+                    layer.local_num_experts,
+                )
+                ops.exl3_mgemm(
+                    x_3d[i],
+                    layer.exl3_gate_up_ptrs_trellis,
+                    layer.exl3_small_interm_gu,
+                    layer.exl3_gate_up_ptrs_suh,
+                    layer.exl3_small_yh_gu,
+                    layer.exl3_gate_up_ptrs_svh,
+                    layer.exl3_small_gate_up_ids,
+                    None,
+                    layer.exl3_moe_k_gate,
+                    -1,
+                    layer.exl3_gate_mcg,
+                    layer.exl3_gate_mul1,
+                    -1,
+                    -1,
+                    0,
+                )
+            else:
+                ops.exl3_mgemm(
+                    x_3d[i],
+                    layer.exl3_gate_ptrs_trellis,
+                    layer.exl3_small_interm_g,
+                    layer.exl3_gate_ptrs_suh,
+                    layer.exl3_small_yh,
+                    layer.exl3_gate_ptrs_svh,
+                    topk_ids_3d[i],
+                    None,
+                    layer.exl3_moe_k_gate,
+                    -1,
+                    layer.exl3_gate_mcg,
+                    layer.exl3_gate_mul1,
+                    -1,
+                    -1,
+                    0,
+                )
+                ops.exl3_mgemm(
+                    x_3d[i],
+                    layer.exl3_up_ptrs_trellis,
+                    layer.exl3_small_interm_u,
+                    layer.exl3_up_ptrs_suh,
+                    layer.exl3_small_yh,
+                    layer.exl3_up_ptrs_svh,
+                    topk_ids_3d[i],
+                    None,
+                    layer.exl3_moe_k_up,
+                    -1,
+                    layer.exl3_up_mcg,
+                    layer.exl3_up_mul1,
+                    -1,
+                    -1,
+                    0,
+                )
+            ops.silu_mul(
+                layer.exl3_small_interm_a,
                 layer.exl3_small_interm_g,
-                layer.exl3_gate_ptrs_suh,
-                layer.exl3_small_yh,
-                layer.exl3_gate_ptrs_svh,
-                topk_ids_3d[i],
-                None,
-                layer.exl3_moe_k_gate,
-                -1,
-                layer.exl3_gate_mcg,
-                layer.exl3_gate_mul1,
-                -1,
-                -1,
-                0,
-            )
-            ops.exl3_mgemm(
-                x_3d[i],
-                layer.exl3_up_ptrs_trellis,
                 layer.exl3_small_interm_u,
-                layer.exl3_up_ptrs_suh,
-                layer.exl3_small_yh,
-                layer.exl3_up_ptrs_svh,
-                topk_ids_3d[i],
-                None,
-                layer.exl3_moe_k_up,
-                -1,
-                layer.exl3_up_mcg,
-                layer.exl3_up_mul1,
-                -1,
-                -1,
-                0,
-            )
-            layer.exl3_small_interm_a.copy_(
-                torch.nn.functional.silu(layer.exl3_small_interm_g) * layer.exl3_small_interm_u
             )
             ops.exl3_mgemm(
                 layer.exl3_small_interm_a,
@@ -1357,6 +1407,9 @@ def ptr_tensor(prefix: str, attr: str, shard_id: str):
         layer.exl3_up_ptrs_trellis = ptr_tensor("w13", "trellis", "w3")
         layer.exl3_up_ptrs_suh = ptr_tensor("w13", "suh", "w3")
         layer.exl3_up_ptrs_svh = ptr_tensor("w13", "svh", "w3")
+        layer.exl3_gate_up_ptrs_trellis = torch.cat([layer.exl3_gate_ptrs_trellis, layer.exl3_up_ptrs_trellis])
+        layer.exl3_gate_up_ptrs_suh = torch.cat([layer.exl3_gate_ptrs_suh, layer.exl3_up_ptrs_suh])
+        layer.exl3_gate_up_ptrs_svh = torch.cat([layer.exl3_gate_ptrs_svh, layer.exl3_up_ptrs_svh])
         layer.exl3_down_ptrs_trellis = ptr_tensor("w2", "trellis", "w2")
         layer.exl3_down_ptrs_suh = ptr_tensor("w2", "suh", "w2")
         layer.exl3_down_ptrs_svh = ptr_tensor("w2", "svh", "w2")
@@ -1376,16 +1429,33 @@ def ptr_tensor(prefix: str, attr: str, shard_id: str):
         layer.exl3_up_mul1 = (0, "w3") in layer.w13_mul1.exl3_tensors
         layer.exl3_down_mcg = (0, "w2") in layer.w2_mcg.exl3_tensors
         layer.exl3_down_mul1 = (0, "w2") in layer.w2_mul1.exl3_tensors
+        layer.exl3_fuse_gate_up = (
+            layer.exl3_moe_k_gate == layer.exl3_moe_k_up
+            and layer.exl3_gate_mcg == layer.exl3_up_mcg
+            and layer.exl3_gate_mul1 == layer.exl3_up_mul1
+        )
 
         layer.exl3_small_batch_threshold = min(
             layer.local_num_experts // layer.top_k,
             _EXL3_MOE_MAX_EXPERTS_PER_TOKEN,
         )
-        layer.exl3_small_yh = torch.empty((layer.top_k, 1, layer.hidden_size), dtype=torch.float16, device=device)
-        layer.exl3_small_interm_g = torch.empty((layer.top_k, 1, intermediate_size), dtype=torch.float16, device=device)
-        layer.exl3_small_interm_u = torch.empty((layer.top_k, 1, intermediate_size), dtype=torch.float16, device=device)
+        layer.exl3_small_yh_gu = torch.empty(
+            (layer.top_k * 2, 1, layer.hidden_size), dtype=torch.float16, device=device
+        )
+        layer.exl3_small_interm_gu = torch.empty(
+            (layer.top_k * 2, 1, intermediate_size), dtype=torch.float16, device=device
+        )
+        layer.exl3_small_yh = layer.exl3_small_yh_gu[: layer.top_k]
+        layer.exl3_small_interm_g = layer.exl3_small_interm_gu[: layer.top_k]
+        layer.exl3_small_interm_u = layer.exl3_small_interm_gu[layer.top_k :]
+        layer.exl3_small_gate_up_ids = torch.empty((1, layer.top_k * 2), dtype=torch.long, device=device)
         layer.exl3_small_interm_a = torch.empty((layer.top_k, 1, intermediate_size), dtype=torch.float16, device=device)
         layer.exl3_small_out_d = torch.empty((layer.top_k, 1, layer.hidden_size), dtype=torch.float32, device=device)
+        layer.exl3_small_batch_out = torch.empty(
+            (layer.exl3_small_batch_threshold, layer.hidden_size),
+            dtype=torch.float32,
+            device=device,
+        )
 
         concurrency = max(
             1,