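fix: rename DeepGEMM FP4 quant method to "deepgemm-fp4fp8-b32", reserve (1 - mem_fraction) of total GPU memory when sizing the KV cache, and discard partial benchmark turns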

niushengxiao · niushengxiao · commit f91690db05cd · 2026-05-28T14:56:35.000+08:00
diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/fused_moe_weight.py b/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/fused_moe_weight.py
@@ -76,7 +76,7 @@ def _maybe_upgrade_quant_method_for_ep_moe(self, quant_method: QuantizationMetho
         if not self.enable_ep_moe:
             return quant_method
 
-        target_method = "deepgemm-fp8fp4-b32" if is_sm100_gpu() else "deepgemm-fp8w8a8-b128"
+        target_method = "deepgemm-fp4fp8-b32" if is_sm100_gpu() else "deepgemm-fp8w8a8-b128"
         if quant_method.method_name == "none":
             from lightllm.common.quantization.registry import QUANTMETHODS
 
diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/impl/deepgemm_impl.py b/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/impl/deepgemm_impl.py
@@ -28,7 +28,7 @@ def _get_ep_num_sms(self) -> int:
         return getattr(dist_group_manager, "ep_num_sms", None) or 0
 
     def _use_sm100_fp4_moe(self) -> bool:
-        return is_sm100_gpu() and self.quant_method.method_name == "deepgemm-fp8fp4-b32"
+        return is_sm100_gpu() and self.quant_method.method_name == "deepgemm-fp4fp8-b32"
 
     def _get_mega_moe_weights(self, w13: WeightPack, w2: WeightPack):
         cache_key = (
diff --git a/lightllm/common/kv_cache_mem_manager/mem_manager.py b/lightllm/common/kv_cache_mem_manager/mem_manager.py
@@ -67,8 +67,7 @@ def profile_size(self, mem_fraction):
 
         torch.cuda.empty_cache()
         world_size = dist.get_world_size()
-
-        available_memory = get_available_gpu_memory(world_size) * mem_fraction
+        available_memory = get_available_gpu_memory(world_size) - get_total_gpu_memory() * (1 - mem_fraction)
         cell_size = self.get_cell_size()
         self.size = int(available_memory * 1024 ** 3 / cell_size)
         if world_size > 1:
diff --git a/lightllm/common/quantization/deepgemm.py b/lightllm/common/quantization/deepgemm.py
@@ -126,7 +126,7 @@ def _create_weight(
         return mm_param, mm_param_list
 
 
-@QUANTMETHODS.register(["deepgemm-fp8fp4-b32"], platform="cuda")
+@QUANTMETHODS.register(["deepgemm-fp4fp8-b32"], platform="cuda")
 class DeepGEMMFP8FP4B32QuantizationMethod(DeepGEMMBaseQuantizationMethod):
     def __init__(self):
         super().__init__()
@@ -139,7 +139,7 @@ def __init__(self):
 
     @property
     def method_name(self):
-        return "deepgemm-fp8fp4-b32"
+        return "deepgemm-fp4fp8-b32"
 
     def quantize(self, weight: torch.Tensor, output: WeightPack):
         from deep_gemm.utils import per_token_cast_to_fp4
@@ -174,7 +174,7 @@ def apply(
         use_custom_tensor_mananger: bool = True,
         bias: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
-        raise NotImplementedError("deepgemm-fp8fp4-b32 is only implemented for fused MoE expert weights")
+        raise NotImplementedError("deepgemm-fp4fp8-b32 is only implemented for fused MoE expert weights")
 
     def _create_weight(
         self, out_dims: Union[int, List[int]], in_dim: int, dtype: torch.dtype, device_id: int, num_experts: int = 1
diff --git a/requirements.txt b/requirements.txt
@@ -98,4 +98,3 @@ nixl==1.1.0
 xformers==0.0.35
 redis==7.3.0
 litellm>=1.52.0,<1.85
-flash-attn-4[13]==4.0.0b14
diff --git a/test/benchmark/service/benchmark_multiturn.py b/test/benchmark/service/benchmark_multiturn.py
@@ -317,20 +317,8 @@ def stream_one_turn(
                 continue
 
             if first_token_time is not None:
-                generated_text = "".join(generated_text_parts)
-                estimated_completion_tokens = len(tokenizer.encode(generated_text, add_special_tokens=False))
-                estimated_completion_tokens = max(estimated_completion_tokens, len(generated_text_parts))
-                print(f"\n[turn warning] {e}; keeping partial turn with estimated usage (attempt={attempt + 1})")
-                return {
-                    "ttft": first_token_time - start_time,
-                    "decode_times": decode_times,
-                    "prompt_tokens": prompt_tokens or prompt_token_len,
-                    "completion_tokens": completion_tokens or estimated_completion_tokens,
-                    "cached_tokens": cached_tokens,
-                    "cached_tokens_reported": cached_tokens_reported,
-                    "usage_estimated": completion_tokens == 0 or prompt_tokens == 0,
-                    "generated_text": generated_text,
-                }
+                print(f"\n[turn warning] {e}; discarding partial turn (attempt={attempt + 1})")
+                return None
 
             print(f"\n[turn exception] {e}")
             return None
@@ -344,15 +332,25 @@ def stream_one_turn(
                 continue
             return None
 
+        generated_text = "".join(generated_text_parts)
+        usage_estimated = False
+        if prompt_tokens == 0:
+            prompt_tokens = prompt_token_len
+            usage_estimated = True
+        if completion_tokens == 0:
+            estimated_completion_tokens = len(tokenizer.encode(generated_text, add_special_tokens=False))
+            completion_tokens = max(estimated_completion_tokens, len(generated_text_parts))
+            usage_estimated = True
+
         return {
             "ttft": first_token_time - start_time,
             "decode_times": decode_times,
             "prompt_tokens": prompt_tokens,
             "completion_tokens": completion_tokens,
             "cached_tokens": cached_tokens,
             "cached_tokens_reported": cached_tokens_reported,
-            "usage_estimated": False,
-            "generated_text": "".join(generated_text_parts),
+            "usage_estimated": usage_estimated,
+            "generated_text": generated_text,
         }
 
     return None
@@ -402,8 +400,9 @@ def run_session(
                 print(
                     f"\rconc={progress_state['concurrency']} "
                     f"finished_turns={progress_state['finished_turns']} "
-                    f"active_sessions={progress_state['active_sessions']}",
+                    f"active_sessions={progress_state['active_sessions']}\033[K",
                     end="",
+                    flush=True,
                 )
             turn_input_len = rng.randint(min_turn_input_increment, turn_input_increment)
             prompt, prompt_len = append_turn_input(