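fix r3: store routing expert ids as uint8 (up to 255 experts) and export routing data as a list of byte values instead of base64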

sufubao · sufubao · commit 3408ca9d9bfc · 2026-02-10T07:10:43.000Z
diff --git a/lightllm/common/basemodel/routing_manager.py b/lightllm/common/basemodel/routing_manager.py
@@ -14,7 +14,7 @@
 
 def routing_dtype_id_to_np(dtype_id: int):
     if dtype_id == 1:
-        return np.int8
+        return np.uint8
     elif dtype_id == 2:
         return np.int16
     return np.int32
@@ -39,8 +39,8 @@ def __init__(
         self.num_experts = num_experts
         self.kv_cache_size = kv_cache_size
 
-        self.dtype = torch.int8 if num_experts <= 127 else torch.int16
-        dtype_bytes = 1 if self.dtype == torch.int8 else 2
+        self.dtype = torch.uint8 if num_experts <= 255 else torch.int16
+        dtype_bytes = 1 if self.dtype == torch.uint8 else 2
 
         # Shape: (num_moe_layers, kv_cache_size, topk) — on CPU to save GPU memory.
         # Written after forward() via flush_to_routing_buffer(), read on request finish.
@@ -57,7 +57,7 @@ def __init__(
             torch.zeros((max_capture_tokens, num_moe_layers, topk), dtype=self.dtype, device="cuda") for _ in range(2)
         ]
 
-        dtype_name = "int8" if self.dtype == torch.int8 else "int16"
+        dtype_name = "uint8" if self.dtype == torch.uint8 else "int16"
         logger.info(
             f"RoutingCaptureManager initialized: {num_moe_layers} MoE layers, topk={topk}, "
             f"routing_buffer(cpu)={routing_buffer_size / 1024 / 1024:.2f}MB, "
@@ -66,11 +66,11 @@ def __init__(
 
     @property
     def np_dtype(self):
-        return np.int8 if self.dtype == torch.int8 else np.int16
+        return np.uint8 if self.dtype == torch.uint8 else np.int16
 
     @property
     def dtype_id(self) -> int:
-        return 1 if self.dtype == torch.int8 else 2
+        return 1 if self.dtype == torch.uint8 else 2
 
     def capture(self, moe_layer_index: int, topk_ids: torch.Tensor, microbatch_index: int = 0) -> None:
         num_tokens = topk_ids.shape[0]
diff --git a/lightllm/server/core/objs/req.py b/lightllm/server/core/objs/req.py
@@ -1,7 +1,6 @@
 import os
 import math
 import ctypes
-import base64
 import numpy as np
 import time
 from .sampling_params import SamplingParams
@@ -289,7 +288,7 @@ def get_routing_metadata(self, num_moe_layers: int, topk: int, dtype_id: int = 1
             return {
                 "shape": list(routing_data.shape),
                 "dtype": str(routing_data.dtype),
-                "data": base64.b64encode(routing_data.tobytes()).decode("ascii"),
+                "data": list(routing_data.tobytes()),
             }
         except Exception as e:
             logger.warning(f"Failed to read routing data for req {self.request_id}: {e}")
diff --git a/test/test_api/test_r3.py b/test/test_api/test_r3.py
@@ -1,7 +1,6 @@
 import sys
 import argparse
 import requests
-import base64
 import numpy as np
 
 
@@ -52,8 +51,7 @@ def test_routing_export(url: str = "http://localhost:8000"):
     shape = routing_info["shape"]
     dtype_str = routing_info["dtype"]
     dtype = np.dtype(dtype_str)
-    data = base64.b64decode(routing_info["data"])
-    routing_array = np.frombuffer(data, dtype=dtype).reshape(shape)
+    routing_array = np.frombuffer(bytes(routing_info["data"]), dtype=dtype).reshape(shape)
 
     print(f"\n{'=' * 50}")
     print("ROUTING CAPTURE SUCCESS!")
@@ -64,20 +62,6 @@ def test_routing_export(url: str = "http://localhost:8000"):
     print(f"Num tokens: {shape[1]}")
     print(f"Top-K: {shape[2]}")
 
-    # Verify dtype is int8 (for models with ≤127 experts) or int16
-    if dtype_str not in ("int8", "int16"):
-        print(f"\nERROR: Expected dtype int8 or int16, got {dtype_str}")
-        print("This suggests dtype optimization is not working correctly.")
-        return False
-    print(f"\nDtype check PASSED: {dtype_str} (compact representation)")
-
-    # Compute payload size savings
-    int32_size = np.prod(shape) * 4
-    actual_size = len(data)
-    savings = (1 - actual_size / int32_size) * 100
-    print(f"Payload: {actual_size} bytes (vs {int32_size} bytes with int32, {savings:.0f}% smaller)")
-
-    print(f"\nSample routing (first layer, first 5 tokens):")
     num_tokens_to_show = shape[1]
     for i in range(num_tokens_to_show):
         print(f"  Token {i}: experts {routing_array[0, i, :].tolist()}")