fix: mem_fraction KV-cache sizing, drop flash-attn-4 requirement, discard partial benchmark turns

niushengxiao · niushengxiao · commit ac368f401c69 · 2026-05-27T20:06:24.000+08:00
diff --git a/lightllm/common/kv_cache_mem_manager/mem_manager.py b/lightllm/common/kv_cache_mem_manager/mem_manager.py
@@ -67,8 +67,7 @@ def profile_size(self, mem_fraction):
 
         torch.cuda.empty_cache()
         world_size = dist.get_world_size()
-
-        available_memory = get_available_gpu_memory(world_size) * mem_fraction
+        available_memory = get_available_gpu_memory(world_size) - get_total_gpu_memory() * (1 - mem_fraction)
         cell_size = self.get_cell_size()
         self.size = int(available_memory * 1024 ** 3 / cell_size)
         if world_size > 1:
diff --git a/requirements.txt b/requirements.txt
@@ -98,4 +98,3 @@ nixl==1.1.0
 xformers==0.0.35
 redis==7.3.0
 litellm>=1.52.0,<1.85
-flash-attn-4[13]==4.0.0b14
diff --git a/test/benchmark/service/benchmark_multiturn.py b/test/benchmark/service/benchmark_multiturn.py
@@ -317,20 +317,8 @@ def stream_one_turn(
                 continue
 
             if first_token_time is not None:
-                generated_text = "".join(generated_text_parts)
-                estimated_completion_tokens = len(tokenizer.encode(generated_text, add_special_tokens=False))
-                estimated_completion_tokens = max(estimated_completion_tokens, len(generated_text_parts))
-                print(f"\n[turn warning] {e}; keeping partial turn with estimated usage (attempt={attempt + 1})")
-                return {
-                    "ttft": first_token_time - start_time,
-                    "decode_times": decode_times,
-                    "prompt_tokens": prompt_tokens or prompt_token_len,
-                    "completion_tokens": completion_tokens or estimated_completion_tokens,
-                    "cached_tokens": cached_tokens,
-                    "cached_tokens_reported": cached_tokens_reported,
-                    "usage_estimated": completion_tokens == 0 or prompt_tokens == 0,
-                    "generated_text": generated_text,
-                }
+                print(f"\n[turn warning] {e}; discarding partial turn (attempt={attempt + 1})")
+                return None
 
             print(f"\n[turn exception] {e}")
             return None
@@ -344,15 +332,25 @@ def stream_one_turn(
                 continue
             return None
 
+        generated_text = "".join(generated_text_parts)
+        usage_estimated = False
+        if prompt_tokens == 0:
+            prompt_tokens = prompt_token_len
+            usage_estimated = True
+        if completion_tokens == 0:
+            estimated_completion_tokens = len(tokenizer.encode(generated_text, add_special_tokens=False))
+            completion_tokens = max(estimated_completion_tokens, len(generated_text_parts))
+            usage_estimated = True
+
         return {
             "ttft": first_token_time - start_time,
             "decode_times": decode_times,
             "prompt_tokens": prompt_tokens,
             "completion_tokens": completion_tokens,
             "cached_tokens": cached_tokens,
             "cached_tokens_reported": cached_tokens_reported,
-            "usage_estimated": False,
-            "generated_text": "".join(generated_text_parts),
+            "usage_estimated": usage_estimated,
+            "generated_text": generated_text,
         }
 
     return None
@@ -402,8 +400,9 @@ def run_session(
                 print(
                     f"\rconc={progress_state['concurrency']} "
                     f"finished_turns={progress_state['finished_turns']} "
-                    f"active_sessions={progress_state['active_sessions']}",
+                    f"active_sessions={progress_state['active_sessions']}\033[K",
                     end="",
+                    flush=True,
                 )
             turn_input_len = rng.randint(min_turn_input_increment, turn_input_increment)
             prompt, prompt_len = append_turn_input(