fix(detoken,openai): unhang client on detoken init failure; map "error" to API error path

sufubao · sufubao · commit d7fd4a06d501 · 2026-05-08T16:20:16.000+08:00
- detoken: on _add_new_group_req_index failure, set FINISHED_ERROR, push an
  empty-string sentinel into out_tokens_queue at finish_token_index, set
  can_released_mark, and continue with the rest of the group instead of
  re-raising. Without this the http loop stays blocked (queue empty, no
  finish ever forwarded) and the shm req leaks until client disconnect.
- openai: surface FINISHED_ERROR as a controlled error response. Non-stream
  chat / completions return HTTP 500; streaming chat / completions yield an
  SSE error event followed by [DONE] and stop. Previously "error" leaked
  into ChatCompletionResponseChoice / CompletionChoice whose finish_reason
  literals reject it, raising Pydantic ValidationError.
diff --git a/lightllm/server/api_openai.py b/lightllm/server/api_openai.py
@@ -70,6 +70,19 @@ async def _safe_stream_wrapper(stream_generator):
         yield f"data: {error_data}\n\n"
 
 
+# OpenAI response schemas restrict finish_reason to {"stop","length","tool_calls"} (chat) and
+# {"stop","length"} (completions). The internal FINISHED_ERROR status surfaces as "error" — surface
+# that at the API boundary as a controlled error response rather than letting it leak into the
+# Pydantic models (which would raise ValidationError) or to the client as a silent stop.
+_INTERNAL_ERROR_MESSAGE = "Internal server error during request processing"
+_INTERNAL_ERROR_TYPE = "InternalServerError"
+
+
+def _sse_internal_error_payload() -> str:
+    """Return the JSON-encoded error object used as the data payload of an SSE error event."""
+    return json.dumps({"error": {"message": _INTERNAL_ERROR_MESSAGE, "type": _INTERNAL_ERROR_TYPE}}, ensure_ascii=False)
+
+
 def _serialize_sse_chunk(chunk, choice_nulls=(), response_nulls=()):
     """Serialize a streaming chunk, explicitly including specified null fields."""
     d = chunk.model_dump(exclude_none=True)
@@ -350,6 +363,9 @@ async def chat_completions_impl(request: ChatCompletionRequest, raw_request: Req
                 finish_reason_dict[sub_req_id] = finish_status.get_finish_reason()
                 prompt_tokens_dict[sub_req_id] = metadata["prompt_tokens"]
                 prompt_cache_len_dict[sub_req_id] = metadata.get("prompt_cache_len", 0)
+        if any(r == "error" for r in finish_reason_dict.values()):
+            logger.error(f"internal pipeline error during chat completion group_id={group_request_id}")
+            return create_error_response(HTTPStatus.INTERNAL_SERVER_ERROR, _INTERNAL_ERROR_MESSAGE)
         choices = []
         sub_ids = list(final_output_dict.keys())[: request.n]
         for i in range(request.n):
@@ -468,6 +484,15 @@ async def stream_results() -> AsyncGenerator[bytes, None]:
             delta = request_output
             current_finish_reason = finish_status.get_finish_reason()
 
+            if current_finish_reason == "error":  # FINISHED_ERROR: end the stream with an SSE error event
+                logger.error(
+                    f"internal pipeline error during chat stream group_id={group_request_id} "
+                    f"sub_req_id={sub_req_id}"
+                )
+                yield f"data: {_sse_internal_error_payload()}\n\n".encode("utf-8")  # bytes, like [DONE] below
+                yield "data: [DONE]\n\n".encode("utf-8")
+                return
+
             # Emit the initial role-only chunk once per choice, as required by the
             # OpenAI SSE spec: role appears only in the first delta with content="".
             if not has_emitted_first_chunk[choice_index]:
@@ -877,6 +902,9 @@ async def process_single_prompt(prompt: Union[str, List[int]], prompt_index: int
     tasks = [asyncio.create_task(process_single_prompt(prompt, i)) for i, prompt in enumerate(prompts)]
 
     results = await asyncio.gather(*tasks)
+    if any(r.get("finish_reason") == "error" for r in results):
+        logger.error("internal pipeline error during completion")
+        return create_error_response(HTTPStatus.INTERNAL_SERVER_ERROR, _INTERNAL_ERROR_MESSAGE)
     return _build_completion_response(results, request, created_time, len(prompts) > 1)
 
 
@@ -911,6 +939,15 @@ async def stream_results() -> AsyncGenerator[bytes, None]:
             if finish_status.is_finished():
                 current_finish_reason = finish_status.get_finish_reason()
 
+            if current_finish_reason == "error":
+                logger.error(
+                    f"internal pipeline error during completion stream group_id={group_request_id} "
+                    f"sub_req_id={sub_req_id}"
+                )
+                yield f"data: {_sse_internal_error_payload()}\n\n"
+                yield "data: [DONE]\n\n"
+                return
+
             output_text = request_output
             if request.echo and metadata.get("is_first_token", False):
                 prompt_str = prompt
diff --git a/lightllm/server/detokenization/manager.py b/lightllm/server/detokenization/manager.py
@@ -46,9 +46,10 @@ def _init_get_token_id_to_token_str(self):
         self.token_id_to_token = {token_id: token for token, token_id in self.tokenizer.get_vocab().items()}
         return
 
-    def _add_new_group_req_index(self, recv_obj: GroupReqIndexes):
+    def _add_new_group_req_index(self, recv_obj: GroupReqIndexes) -> int:
         from lightllm.server.core.objs import FinishStatus
 
+        failed_count = 0
         for req_index in recv_obj.shm_req_indexes:
             req = self.shm_req_manager.get_req_obj_by_index(req_index)
             try:
@@ -68,10 +69,23 @@ def _add_new_group_req_index(self, recv_obj: GroupReqIndexes):
                     decode_req.init_token_healing_prefix_str(self.token_id_to_token, self.tokenizer)
 
                 self.req_id_to_out[req.request_id] = decode_req
-            except Exception as e:
+            except Exception:
+                # Init failed (shm link, tokenizer, decode-mode fix, …). Mark the req
+                # finished with an error and push a sentinel into out_tokens_queue so the
+                # http loop forwards a terminal status — otherwise the queue stays empty,
+                # the client hangs until disconnect, and the shm slot leaks because
+                # can_released_mark never gets set. Continue with the rest of the group.
+                logger.exception(f"detokenization init failed for req_id {req.request_id}")
                 req.finish_status.set_status(FinishStatus.FINISHED_ERROR)
-                raise e
-        return
+                req.finish_token_index = req.input_len
+                try:
+                    if not req.out_tokens_queue.is_full():
+                        req.out_tokens_queue.push("", req.input_len, False, 1)
+                except Exception:
+                    logger.exception(f"failed to push error sentinel for req_id {req.request_id}")
+                req.can_released_mark = True
+                failed_count += 1
+        return failed_count
 
     def handle_loop(self):
         try:
@@ -83,9 +97,12 @@ def handle_loop(self):
                         recv_obj: GroupReqIndexes = self.zmq_recv_socket.recv_pyobj(zmq.NOBLOCK)
                         assert isinstance(recv_obj, GroupReqIndexes)
                         try:
-                            self._add_new_group_req_index(recv_obj=recv_obj)
+                            failed_count = self._add_new_group_req_index(recv_obj=recv_obj)
                         except Exception:
                             logger.exception("add new group req index has exception")
+                            failed_count = len(recv_obj.shm_req_indexes)
+                        if failed_count:
+                            # Wake the http loop so it drains the error sentinel(s) we just pushed.
                             self.pub_to_httpserver.send_pyobj(None, protocol=pickle.HIGHEST_PROTOCOL)
 
                     # 当队列中存在较多的请求时，将一次接受的数量上调