dphnAI
diff --git a/aphrodite/lora/layers/fused_moe.py b/aphrodite/lora/layers/fused_moe.py
Lines changed: 37 additions & 277 deletions
diff --git a/aphrodite/lora/layers/utils.py b/aphrodite/lora/layers/utils.py
Lines changed: 5 additions & 2 deletions
diff --git a/aphrodite/lora/ops/triton_ops/utils.py b/aphrodite/lora/ops/triton_ops/utils.py
Lines changed: 17 additions & 0 deletions
@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import functools
 
 import torch
 import torch.nn as nn
@@ -14,31 +13,17 @@
 )
 from aphrodite.distributed.utils import divide
 from aphrodite.lora.layers.base import BaseLayerWithLoRA
-from aphrodite.lora.ops.triton_ops.utils import get_lora_op_configs
 from aphrodite.model_executor.layers.fused_moe import FusedMoE
-from aphrodite.model_executor.layers.fused_moe.config import (
-    _get_config_dtype_str,
-)
-from aphrodite.model_executor.layers.fused_moe.experts.gpt_oss_triton_kernels_moe import (
-    UnfusedOAITritonExperts,
-)
-from aphrodite.model_executor.layers.fused_moe.fused_marlin_moe import (
-    MarlinExperts,
-)
-from aphrodite.model_executor.layers.fused_moe.fused_moe import (
-    TritonExperts,
-)
 from aphrodite.model_executor.layers.fused_moe.fused_moe_modular_method import (
     FusedMoEModularMethod,
 )
-from aphrodite.model_executor.layers.fused_moe.modular_kernel import (
-    FusedMoEKernel,
-)
+from aphrodite.model_executor.layers.fused_moe.lora_context import MoELoRAContext
+from aphrodite.model_executor.layers.fused_moe.modular_kernel import FusedMoEKernel
 from aphrodite.model_executor.layers.fused_moe.prepare_finalize import (
     MoEPrepareAndFinalizeNoDPEPModular,
 )
 
-from .utils import _get_lora_device, try_get_optimal_moe_lora_config
+from .utils import _get_lora_device
 
 
 class FusedMoEWithLoRA(BaseLayerWithLoRA):
@@ -56,275 +41,46 @@ def __init__(self, base_layer: FusedMoE) -> None:
         # For non-gated MoE (is_act_and_mul=False), only 1 slice is needed
         # since there's only up_proj (w1), not gate_proj + up_proj (w1 + w3)
         self._w13_slices = 2 if base_layer.moe_config.is_act_and_mul else 1
-        self._inject_lora_into_fused_moe()
-
-    def _normalize_keys(self, config: dict[str, int | None]) -> dict[str, int | None]:
-        normalized_config = {}
-        for key, value in config.items():
-            if key.islower():
-                if key.startswith("block_"):
-                    normalized_key = "BLOCK_SIZE_" + key.split("_")[-1].upper()
-                else:
-                    normalized_key = key.upper()
-            else:
-                normalized_key = key
-            normalized_config[normalized_key] = value
-        return normalized_config
-
-    def _get_lora_moe_configs(
-        self,
-        op_prefix: str,
-        num_loras: int,
-        rank: int,
-        num_slices: int,
-        M: int,
-        layer: FusedMoE,
-        top_k: int,
-        config_dtype: str,
-    ):
-        if envs.APHRODITE_TUNED_CONFIG_FOLDER:
-            hidden_size = layer.hidden_size
-            intermediate_size = (
-                self.w2_lora_a_stacked[0].shape[-1] if op_prefix == "w2" else self.w13_lora_b_stacked[0].shape[-2]
-            )
-            shrink_config = get_lora_op_configs(
-                op_type=f"fused_moe_lora_{op_prefix}_shrink",
-                max_loras=num_loras,
-                batch=M,
-                hidden_size=hidden_size,
-                rank=rank,
-                num_slices=num_slices,
-                moe_intermediate_size=intermediate_size,
-            )
-            expand_config = get_lora_op_configs(
-                op_type=f"fused_moe_lora_{op_prefix}_expand",
-                max_loras=num_loras,
-                batch=M,
-                hidden_size=hidden_size,  # lora_a_stacked.shape[-1],
-                rank=rank,
-                num_slices=num_slices,
-                moe_intermediate_size=intermediate_size,  # lora_b_stacked.shape[-2],
-            )
-        else:  # fall back to the default config
-            get_config_func = functools.partial(
-                try_get_optimal_moe_lora_config,
-                w1_shape=layer.w13_weight.shape,
-                w2_shape=layer.w2_weight.shape,
-                rank=rank,
-                top_k=top_k,
-                dtype=config_dtype,
-                M=M,
-                block_shape=layer.quant_method.moe_quant_config.block_shape,
-            )
-            shrink_config = get_config_func(op_type=f"fused_moe_lora_{op_prefix}_shrink")
-            expand_config = get_config_func(op_type=f"fused_moe_lora_{op_prefix}_expand")
-        shrink_config = self._normalize_keys(shrink_config)
-        expand_config = self._normalize_keys(expand_config)
-        return shrink_config, expand_config
-
-    def _inject_lora_into_fused_moe(self):
-        moe_state_dict = {}
-        top_k = self.base_layer.top_k
 
         self.base_layer.ensure_moe_quant_config_init()
-        quant_config = self.base_layer.quant_method.moe_quant_config
-
         if getattr(self.base_layer.quant_method, "supports_internal_mk", False):
-            # Use the existing modular kernel from the quant method
-            m_fused_moe_fn = self.base_layer.quant_method.moe_kernel
+            moe_kernel = self.base_layer.quant_method.moe_kernel
             # Don't let the kernel own shared experts so the runner can
             # overlap them with routed experts via a separate CUDA stream.
-            m_fused_moe_fn.shared_experts = None
+            moe_kernel.shared_experts = None
         else:
-            # Create a new modular kernel via select_gemm_impl.
-            # Don't pass shared_experts to the kernel so the runner can
-            # overlap them with routed experts via a separate CUDA stream.
             prepare_finalize = MoEPrepareAndFinalizeNoDPEPModular()
-            m_fused_moe_fn = FusedMoEKernel(
+            moe_kernel = FusedMoEKernel(
                 prepare_finalize,
                 self.base_layer.quant_method.select_gemm_impl(prepare_finalize, self.base_layer),
             )
-
-        if quant_config.use_mxfp4_w4a16:
-            assert isinstance(
-                m_fused_moe_fn.impl.fused_experts,
-                (MarlinExperts, UnfusedOAITritonExperts),
-            )
-        else:
-            assert isinstance(m_fused_moe_fn.impl.fused_experts, TritonExperts)
-
-        def fwd_decorator(layer, func):
-            def wrapper(*args, **kwargs):
-                moe_state_dict["hidden_states"] = kwargs["hidden_states"]
-                moe_state_dict["topk_ids"] = kwargs["topk_ids"]
-                moe_state_dict["topk_weights"] = kwargs["topk_weights"]
-                moe_state_dict["expert_map"] = kwargs["expert_map"]
-                moe_state_dict["apply_router_weight_on_input"] = kwargs["apply_router_weight_on_input"]
-                result = func(*args, **kwargs)
-                return result
-
-            return wrapper
-
-        def act_decorator(layer, func):
-            def wrapper(*args, **kwargs):
-                _, output, input = args
-
-                hidden_states = moe_state_dict["hidden_states"]
-                topk_weights = moe_state_dict["topk_weights"]
-                curr_topk_ids = moe_state_dict["topk_ids"]
-
-                expert_map = moe_state_dict["expert_map"]
-
-                config_dtype = _get_config_dtype_str(
-                    dtype=hidden_states.dtype,
-                    use_fp8_w8a8=False,
-                    use_int8_w8a16=False,
-                    use_int4_w4a16=False,
-                )
-                num_tokens = hidden_states.size(0)
-                M = num_tokens
-                max_lora_rank = self.w13_lora_a_stacked[0].shape[-2]
-                shrink_config, expand_config = self._get_lora_moe_configs(
-                    op_prefix="w13",
-                    num_loras=self.max_loras,
-                    rank=max_lora_rank,
-                    num_slices=self._w13_slices,
-                    M=M,
-                    layer=layer,
-                    top_k=top_k,
-                    config_dtype=config_dtype,
-                )
-
-                # SPARSITY_FACTOR is a heuristic margin ensuring tokens * top_k
-                # activates only a small fraction of total experts * loras.
-                SPARSITY_FACTOR = 8
-                naive_block_assignment = (
-                    expert_map is None
-                    and num_tokens * top_k * SPARSITY_FACTOR <= self.base_layer.local_num_experts * self.max_loras
-                )
-
-                # get the block size of m from customized config or default config
-                (
-                    token_lora_mapping,
-                    sorted_token_ids_lora,
-                    expert_ids_lora,
-                    num_tokens_post_padded_lora,
-                ) = self.punica_wrapper.moe_lora_align_block_size(
-                    curr_topk_ids,
-                    num_tokens,
-                    shrink_config["BLOCK_SIZE_M"],
-                    self.base_layer.local_num_experts,
-                    self.max_loras,
-                    self.adapter_enabled,
-                    expert_map,
-                    naive_block_assignment=naive_block_assignment,
-                )
-
-                moe_state_dict["sorted_token_ids_lora"] = sorted_token_ids_lora
-                moe_state_dict["expert_ids_lora"] = expert_ids_lora
-                moe_state_dict["num_tokens_post_padded_lora"] = num_tokens_post_padded_lora
-                moe_state_dict["token_lora_mapping"] = token_lora_mapping
-
-                if sorted_token_ids_lora is not None:
-                    expert_ids_lora = expert_ids_lora.view(self.max_loras, -1)
-                    sorted_token_ids_lora = sorted_token_ids_lora.view(self.max_loras, -1)
-                #
-
-                self.punica_wrapper.add_lora_fused_moe(
-                    input.view(-1, top_k, input.shape[-1]),
-                    hidden_states,
-                    self.w13_lora_a_stacked,
-                    self.w13_lora_b_stacked,
-                    topk_weights,
-                    sorted_token_ids_lora,
-                    expert_ids_lora,
-                    num_tokens_post_padded_lora,
-                    max_lora_rank,
-                    top_k,
-                    shrink_config,  ## pass the shrink config
-                    expand_config,  ## pass the expand config
-                    self.adapter_enabled,
-                    fully_sharded=self.fully_sharded,
-                    token_lora_mapping=token_lora_mapping,
-                )
-
-                result = func(*args, **kwargs)
-
-                moe_state_dict["intermediate_cache2"] = output
-                return result
-
-            return wrapper
-
-        def moe_sum_decorator(layer, func):
-            def wrapper(*args, **kwargs):
-                hidden_states = moe_state_dict["hidden_states"]
-                topk_weights = moe_state_dict["topk_weights"]
-
-                config_dtype = _get_config_dtype_str(
-                    dtype=hidden_states.dtype,
-                    use_fp8_w8a8=False,
-                    use_int8_w8a16=False,
-                    use_int4_w4a16=False,
-                )
-                num_tokens = hidden_states.size(0)
-                M = num_tokens
-                max_lora_rank = self.w2_lora_a_stacked[0].shape[-2]
-                shrink_config, expand_config = self._get_lora_moe_configs(
-                    op_prefix="w2",
-                    num_loras=self.max_loras,
-                    rank=max_lora_rank,
-                    num_slices=1,
-                    M=M,
-                    layer=layer,
-                    top_k=top_k,
-                    config_dtype=config_dtype,
-                )
-
-                sorted_token_ids_lora = moe_state_dict["sorted_token_ids_lora"]
-                expert_ids_lora = moe_state_dict["expert_ids_lora"]
-                num_tokens_post_padded_lora = moe_state_dict["num_tokens_post_padded_lora"]
-                token_lora_mapping = moe_state_dict.get("token_lora_mapping")
-
-                if sorted_token_ids_lora is not None:
-                    expert_ids_lora = expert_ids_lora.view(self.max_loras, -1)
-                    sorted_token_ids_lora = sorted_token_ids_lora.view(self.max_loras, -1)
-                intermediate_cache2 = moe_state_dict["intermediate_cache2"]
-                intermediate_cache3 = args[0]
-
-                shard_size_w2 = divide(self.base_layer.hidden_size, self.tp_size)
-
-                self.punica_wrapper.add_lora_fused_moe(
-                    intermediate_cache3,
-                    intermediate_cache2,
-                    self.w2_lora_a_stacked,
-                    self.w2_lora_b_stacked,
-                    topk_weights,
-                    sorted_token_ids_lora,
-                    expert_ids_lora,
-                    num_tokens_post_padded_lora,
-                    max_lora_rank,
-                    top_k,
-                    shrink_config,  ## pass the shrink config
-                    expand_config,  ## pass the expand config
-                    self.adapter_enabled,
-                    True,
-                    fully_sharded=self.fully_sharded,
-                    offset=shard_size_w2 * self.tp_rank if self.fully_sharded else 0,
-                    token_lora_mapping=token_lora_mapping,
-                )
-
-                result = func(*args, **kwargs)
-                return result
-
-            return wrapper
-
-        fused_experts = m_fused_moe_fn.impl.fused_experts
-
-        m_fused_moe_fn.apply = fwd_decorator(self.base_layer, m_fused_moe_fn.apply)
-        fused_experts.activation = act_decorator(self.base_layer, fused_experts.activation)
-        fused_experts.moe_sum = moe_sum_decorator(self.base_layer, fused_experts.moe_sum)
-        # TODO(bnell): find a less intrusive way to handle this.
-        self.base_layer._replace_quant_method(FusedMoEModularMethod(self.base_layer.quant_method, m_fused_moe_fn))
+        assert moe_kernel.supports_lora(), (
+            f"{type(moe_kernel.fused_experts).__name__} does not support LoRA. "
+            "For unquantized MoE, set moe_backend='triton' or moe_backend='auto' "
+            "(auto selects Triton automatically when LoRA is enabled). "
+            "For quantized MoE, mix LoRAExpertsMixin into the experts class "
+            "and consume self._lora_context in apply()."
+        )
+        self._fused_experts = moe_kernel.fused_experts
+        self.base_layer._replace_quant_method(FusedMoEModularMethod(self.base_layer.quant_method, moe_kernel))
+
+    def _build_lora_context(self):
+        return MoELoRAContext(
+            w13_lora_a_stacked=self.w13_lora_a_stacked,
+            w13_lora_b_stacked=self.w13_lora_b_stacked,
+            w2_lora_a_stacked=self.w2_lora_a_stacked,
+            w2_lora_b_stacked=self.w2_lora_b_stacked,
+            adapter_enabled=self.adapter_enabled,
+            max_loras=self.max_loras,
+            top_k=self.base_layer.top_k,
+            w13_num_slices=self._w13_slices,
+            fully_sharded=self.fully_sharded,
+            tp_rank=self.tp_rank,
+            tp_size=self.tp_size,
+            local_num_experts=self.base_layer.local_num_experts,
+            punica_wrapper=self.punica_wrapper,
+            use_tuned_config=bool(envs.APHRODITE_TUNED_CONFIG_FOLDER),
+        )
 
     def _create_lora_a_weights(
         self,
@@ -543,6 +299,10 @@ def set_lora(
             sliced_w2_lora_b, non_blocking=True
         )
 
+    def set_mapping(self, punica_wrapper):
+        super().set_mapping(punica_wrapper)
+        self._fused_experts.set_lora_context(self._build_lora_context())
+
     def forward(self, *args, **kwargs):
         return self.base_layer.forward(*args, **kwargs)
 
 
@@ -88,9 +88,12 @@ def try_get_optimal_moe_lora_config(
     top_k: int,
     dtype: str | None,
     M: int,
-    block_shape: list[int] | None = None,
 ) -> dict[str, int | None]:
-    config = try_get_optimal_moe_config(w1_shape, w2_shape, top_k, dtype, M, block_shape).copy()
+    # LoRA shrink/expand operates on bf16/fp16 adapters regardless of the
+    # base MoE weight's block-wise quantization, so block_shape is omitted
+    # from the config lookup — the non-quantized branch in get_default_config
+    # ignores it anyway.
+    config = try_get_optimal_moe_config(w1_shape, w2_shape, top_k, dtype, M).copy()
     if op_type in [
         "fused_moe_lora_w13_shrink",
         "fused_moe_lora_w2_shrink",
 
@@ -296,3 +296,20 @@ def supports_pdl(device: torch.device | None = None) -> bool:
 def supports_tma(device: torch.device | None = None) -> bool:
     # TMA requires compute capability SM90 or above
     return current_platform.is_cuda() and current_platform.has_device_capability(90)
+
+
+def _normalize_lora_config_keys(
+    config: dict[str, int | None],
+) -> dict[str, int | None]:
+    """Normalize Triton config dict keys to uppercase BLOCK_SIZE_* format."""
+    out: dict[str, int | None] = {}
+    for key, val in config.items():
+        if key.islower():
+            if key.startswith("block_"):
+                nk = "BLOCK_SIZE_" + key.split("_")[-1].upper()
+            else:
+                nk = key.upper()
+        else:
+            nk = key
+        out[nk] = val
+    return out