Skip to content

Commit ca0c282

Browse files
committed
draft
1 parent b1094e7 commit ca0c282

File tree

20 files changed

+152
-218
lines changed

20 files changed

+152
-218
lines changed

lightllm/common/basemodel/basemodel.py

Lines changed: 12 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -11,11 +11,7 @@
1111

1212
from lightllm.common.basemodel.layer_weights.hf_load_utils import load_hf_weights
1313
from lightllm.common.basemodel.infer_struct import InferStateInfo
14-
from lightllm.common.basemodel.routing_manager import (
15-
create_routing_capture_manager,
16-
reset_moe_layer_counter,
17-
get_moe_layer_count,
18-
)
14+
from lightllm.common.basemodel.routing_manager import reset_moe_layer_counter
1915
from lightllm.common.kv_cache_mem_manager import MemoryManager
2016
from lightllm.common.kv_cache_mem_manager.mem_utils import select_mem_manager_class
2117
from lightllm.common.req_manager import ReqManager
@@ -282,45 +278,16 @@ def _init_prefill_cuda_graph(self):
282278
self.prefill_graph.warmup(self)
283279

284280
def _init_custom(self):
    """Model-specific initialization hook; subclasses override as needed."""
    return None
303283

304-
topk = self.config.get("num_experts_per_tok", 1)
305-
num_experts = n_routed_experts
284+
def _post_forward(self, model_input: ModelInput, microbatch_index: int = 0) -> None:
    """Post-forward hook; subclasses override to run per-batch post-processing."""
    return None
306287

307-
# Check if overlap mode is enabled
308-
enable_overlap = getattr(self.args, "enable_decode_microbatch_overlap", False)
309-
310-
logger.info(
311-
f"Initializing routing capture: num_moe_layers={num_moe_layers}, "
312-
f"topk={topk}, num_experts={num_experts}, enable_overlap={enable_overlap}"
313-
)
314-
315-
create_routing_capture_manager(
316-
num_moe_layers=num_moe_layers,
317-
topk=topk,
318-
num_experts=num_experts,
319-
batch_max_tokens=self.max_total_token_num,
320-
kv_cache_size=self.mem_manager.size,
321-
enable_overlap=enable_overlap,
322-
)
323-
return
288+
def _post_forward_dual(self, model_input0: ModelInput, model_input1: ModelInput) -> None:
    """Post-forward hook for dual-microbatch overlap; subclasses override."""
    return None
324291

325292
@torch.no_grad()
326293
def forward(self, model_input: ModelInput):
@@ -332,7 +299,7 @@ def forward(self, model_input: ModelInput):
332299
else:
333300
result = self._decode(model_input)
334301

335-
# Note: flush is now handled by backend layer (ChunkedPrefill, DP, etc.)
302+
self._post_forward(model_input)
336303
return result
337304

338305
def _create_inferstate(self, model_input: ModelInput, microbatch_index: int = 0):
@@ -726,6 +693,7 @@ def microbatch_overlap_prefill(self, model_input0: ModelInput, model_input1: Mod
726693
dist_group_manager.clear_deepep_buffer()
727694
model_output0.prefill_mem_indexes_ready_event = prefill_mem_indexes_ready_event
728695
model_output1.prefill_mem_indexes_ready_event = prefill_mem_indexes_ready_event
696+
self._post_forward_dual(model_input0, model_input1)
729697
return model_output0, model_output1
730698

731699
@torch.no_grad()
@@ -819,6 +787,7 @@ def microbatch_overlap_decode(self, model_input0: ModelInput, model_input1: Mode
819787
infer_state1.init_att_state()
820788

821789
model_output0, model_output1 = self._overlap_tpsp_token_forward(infer_state0, infer_state1=infer_state1)
790+
self._post_forward_dual(model_input0, model_input1)
822791
return model_output0, model_output1
823792

824793
@final
Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
"""Mixin for MoE (Mixture of Experts) models.
2+
3+
Provides R3 (Rollout Router Replay) routing capture functionality for MoE models.
4+
MoE models that want R3 support should inherit from this mixin and call
5+
`_init_routing_capture()` in their `_init_custom()` method.
6+
"""
7+
8+
from lightllm.common.basemodel.batch_objs import ModelInput
9+
from lightllm.common.basemodel.routing_manager import (
10+
create_routing_capture_manager,
11+
get_moe_layer_count,
12+
flush_routing_capture,
13+
flush_routing_capture_dual,
14+
)
15+
from lightllm.utils.log_utils import init_logger
16+
17+
logger = init_logger(__name__)
18+
19+
20+
class MoeModelMixin:
    """Mixin adding R3 (Rollout Router Replay) routing capture to MoE models.

    Usage:
        class MyMoeModel(MoeModelMixin, LlamaTpPartModel):
            def _init_custom(self):
                super()._init_custom()
                self._init_routing_capture()  # Enable R3 if flag is set
    """

    def _init_routing_capture(self) -> None:
        """Enable R3 routing capture when --enable_return_routed_experts is set.

        Call from the model's _init_custom() once weights are loaded, since the
        MoE layer counter is populated during weight initialization.
        NOTE(review): earlier docs called this idempotent, but nothing visible
        here guards against a second create_routing_capture_manager() call —
        confirm before invoking more than once.
        """
        if not getattr(self.args, "enable_return_routed_experts", False):
            return

        # The counter is filled while weights load; zero means no MoE layer was seen.
        moe_layer_count = get_moe_layer_count()
        if moe_layer_count == 0:
            logger.warning(
                "enable_return_routed_experts is set but no MoE layers found. Routing capture will not be enabled."
            )
            return

        # Model families store the expert count under different config keys.
        expert_count = self.config.get("n_routed_experts", self.config.get("num_experts", 0))
        if expert_count == 0:
            logger.warning(
                "enable_return_routed_experts is set but n_routed_experts=0. Routing capture will not be enabled."
            )
            return

        experts_per_token = self.config.get("num_experts_per_tok", 1)
        overlap_enabled = getattr(self.args, "enable_decode_microbatch_overlap", False)

        logger.info(
            f"Initializing routing capture: num_moe_layers={moe_layer_count}, "
            f"topk={experts_per_token}, num_experts={expert_count}, enable_overlap={overlap_enabled}"
        )

        create_routing_capture_manager(
            num_moe_layers=moe_layer_count,
            topk=experts_per_token,
            num_experts=expert_count,
            batch_max_tokens=self.max_total_token_num,
            kv_cache_size=self.mem_manager.size,
            enable_overlap=overlap_enabled,
        )

    def _post_forward(self, model_input: ModelInput, microbatch_index: int = 0) -> None:
        """Flush captured routing data (GPU -> CPU buffer) after a forward pass.

        No-op when R3 capture is not enabled.
        """
        flush_routing_capture(model_input.mem_indexes, microbatch_index)

    def _post_forward_dual(self, model_input0: ModelInput, model_input1: ModelInput) -> None:
        """Flush captured routing data for both microbatches after an
        overlapped forward pass. No-op when R3 capture is not enabled.
        """
        flush_routing_capture_dual(model_input0.mem_indexes, model_input1.mem_indexes)

lightllm/common/basemodel/routing_manager.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -182,3 +182,31 @@ def create_routing_capture_manager(
182182
def get_routing_capture_manager() -> Optional[RoutingCaptureManager]:
183183
"""Get the global routing capture manager."""
184184
return g_routing_capture_manager
185+
186+
187+
def flush_routing_capture(mem_indexes: torch.Tensor, microbatch_index: int = 0) -> None:
    """Flush captured routing data to the CPU buffer when capture is active.

    Call after a forward pass completes; does nothing if R3 capture is off.

    Args:
        mem_indexes: KV cache slot indices for the batch
        microbatch_index: Microbatch index (0 for single batch, 0/1 for overlap)
    """
    manager = g_routing_capture_manager
    if manager is None:
        return
    manager.flush_to_cpu_async(mem_indexes, microbatch_index)
198+
199+
200+
def flush_routing_capture_dual(mem_indexes0: torch.Tensor, mem_indexes1: torch.Tensor) -> None:
    """Flush captured routing data for dual-microbatch overlap mode.

    Call after the forward pass completes for both microbatches;
    does nothing if R3 capture is off.

    Args:
        mem_indexes0: KV cache slot indices for microbatch 0
        mem_indexes1: KV cache slot indices for microbatch 1
    """
    manager = g_routing_capture_manager
    if manager is None:
        return
    # Microbatch index matches the order of the supplied tensors.
    for batch_idx, indexes in enumerate((mem_indexes0, mem_indexes1)):
        manager.flush_to_cpu_async(indexes, microbatch_index=batch_idx)

lightllm/models/deepseek2/layer_weights/transformer_layer_weight.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -246,7 +246,6 @@ def _load_mlp(self, mlp_prefix):
246246

247247
def _init_moe(self):
248248
moe_intermediate_size = self.network_config_["moe_intermediate_size"]
249-
250249
self.moe_gate = ROWMMWeight(
251250
weight_names=f"model.layers.{self.layer_num_}.mlp.gate.weight",
252251
data_type=self.data_type_,

lightllm/models/deepseek2/model.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
from lightllm.models.deepseek2.infer_struct import Deepseek2InferStateInfo
77
from lightllm.models.llama.model import LlamaTpPartModel
88
from lightllm.common.kv_cache_mem_manager.mem_utils import select_mem_manager_class
9+
from lightllm.common.basemodel.moe_model_mixin import MoeModelMixin
910
from lightllm.utils.log_utils import init_logger
1011
from lightllm.utils.envs_utils import enable_env_vars, get_env_start_args, get_added_mtp_kv_layer_num
1112
from lightllm.distributed.communication_op import dist_group_manager
@@ -15,7 +16,7 @@
1516

1617

1718
@ModelRegistry(["deepseek_v2", "deepseek_v3"])
18-
class Deepseek2TpPartModel(LlamaTpPartModel):
19+
class Deepseek2TpPartModel(MoeModelMixin, LlamaTpPartModel):
1920
# weight class
2021
transformer_weight_class = Deepseek2TransformerLayerWeight
2122

@@ -48,6 +49,7 @@ def _init_some_value(self):
4849
def _init_custom(self):
    # DeepSeek-v2/v3 specific init: YaRN rotary embeddings, a DeepEP
    # communication group sized from the expert/hidden config, and
    # R3 routing capture (no-op unless --enable_return_routed_experts).
    self._init_to_get_yarn_rotary()
    dist_group_manager.new_deepep_group(self.config["n_routed_experts"], self.config["hidden_size"])
    self._init_routing_capture()  # R3 routing capture for MoE
5153

5254
def _verify_params(self):
    # Delegates parameter verification to the parent class unchanged.
    return super()._verify_params()

lightllm/models/gpt_oss/layer_infer/transformer_layer_infer.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,6 @@ def _gpt_oss_rmsnorm(self, hidden_states, weight, eps=1e-6):
4242
def _ffn(self, input, infer_state, layer_weight: GptOssTransformerLayerWeight) -> torch.Tensor:
4343
hidden_states = input.view(-1, self.embed_dim_)
4444
num_tokens, hidden_dim = hidden_states.shape
45-
4645
router_logits = layer_weight.moe_gate.mm(hidden_states)
4746
hidden_states = layer_weight.experts.experts(
4847
hidden_states,

lightllm/models/gpt_oss/model.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
from lightllm.models.gpt_oss.layer_weights.transformer_layer_weight import GptOssTransformerLayerWeight
33
from lightllm.models.llama.model import LlamaTpPartModel
44
from lightllm.models.registry import ModelRegistry
5+
from lightllm.common.basemodel.moe_model_mixin import MoeModelMixin
56

67
from lightllm.utils.envs_utils import get_env_start_args
78
from lightllm.utils.log_utils import init_logger
@@ -10,7 +11,7 @@
1011

1112

1213
@ModelRegistry("gpt_oss")
13-
class GptOssTpPartModel(LlamaTpPartModel):
14+
class GptOssTpPartModel(MoeModelMixin, LlamaTpPartModel):
1415
# weight class
1516
transformer_weight_class = GptOssTransformerLayerWeight
1617

@@ -25,3 +26,7 @@ def __init__(self, kvargs):
2526
assert (
2627
get_env_start_args().llm_decode_att_backend[0] == "fa3"
2728
), "For now GPT-OSS type model only support flashattention-3"
29+
30+
def _init_custom(self):
    # Run base-class initialization first, then enable R3 routing capture
    # (no-op unless --enable_return_routed_experts is set).
    super()._init_custom()
    self._init_routing_capture()  # R3 routing capture for MoE

lightllm/models/mixtral/model.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import numpy as np
33
from lightllm.models.registry import ModelRegistry
44
from lightllm.common.basemodel.basemodel import TpPartBaseModel
5+
from lightllm.common.basemodel.moe_model_mixin import MoeModelMixin
56
from lightllm.common.kv_cache_mem_manager import MemoryManager
67
from lightllm.models.llama.layer_infer.post_layer_infer import LlamaPostLayerInfer
78
from lightllm.models.llama.layer_infer.pre_layer_infer import LlamaPreLayerInfer
@@ -16,7 +17,7 @@
1617

1718

1819
@ModelRegistry("mixtral")
19-
class MixtralTpPartModel(TpPartBaseModel):
20+
class MixtralTpPartModel(MoeModelMixin, TpPartBaseModel):
2021
# weight class
2122
pre_and_post_weight_class = LlamaPreAndPostLayerWeight
2223
transformer_weight_class = MixtralTransformerLayerWeight
@@ -45,6 +46,7 @@ def _verify_params(self):
4546

4647
def _init_custom(self):
    # Mixtral-specific init: rotary embeddings plus R3 routing capture
    # (no-op unless --enable_return_routed_experts is set).
    self._init_to_get_rotary()
    self._init_routing_capture()  # R3 routing capture for MoE
    return
4951

5052
def _init_mem_manager(self):

lightllm/models/qwen3_moe/layer_infer/transformer_layer_infer.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -131,7 +131,6 @@ def _moe_ffn(
131131

132132
hidden_states = input.view(-1, self.embed_dim_)
133133
num_tokens, hidden_dim = hidden_states.shape
134-
135134
router_logits = layer_weight.moe_gate.mm(hidden_states)
136135
layer_weight.experts.experts(
137136
hidden_states,

lightllm/models/qwen3_moe/layer_weights/transformer_layer_weight.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,6 @@ def _init_moe(self):
6060
tp_rank=0,
6161
tp_world_size=1,
6262
)
63-
6463
moe_mode = os.getenv("MOE_MODE", "TP")
6564
assert moe_mode in ["EP", "TP"]
6665
if moe_mode == "TP":

0 commit comments

Comments
 (0)