Skip to content

Commit 4eab14d

Browse files
authored
Merge pull request #300 from InfiniTensor/issue/299
issue/299 - allow ignoring EOS in server
2 parents f73e18b + cb62ce2 commit 4eab14d

File tree

3 files changed

+36
-15
lines changed

3 files changed

+36
-15
lines changed

python/infinilm/llm/llm.py

Lines changed: 15 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -334,21 +334,23 @@ def _check_request_finished(self, req: InferenceRequest, token_id: int) -> bool:
334334
req.finish_reason = FinishReason.LENGTH
335335
return True
336336

337-
# Check EOS token
338-
eos_ids = req.eos_token_ids or self.eos_token_ids
339-
if eos_ids and token_id in eos_ids:
340-
req.finish_reason = FinishReason.EOS_TOKEN
341-
return True
342-
343-
# Check stop strings
344-
# Remove stop string from generated_text if STOP_STRING finish reason
345-
stop_strings = req.sampling_params.stop or []
346-
for stop_str in stop_strings:
347-
if req.generated_text.endswith(stop_str):
348-
req.generated_text = req.generated_text[: -len(stop_str)]
349-
req.finish_reason = FinishReason.STOP_STRING
337+
if not req.sampling_params.ignore_eos:
338+
# Check EOS token - only stop if ignore_eos is False
339+
eos_ids = req.eos_token_ids or self.eos_token_ids
340+
if eos_ids and token_id in eos_ids:
341+
req.finish_reason = FinishReason.EOS_TOKEN
350342
return True
351343

344+
# While ignoring EOS, stop strings are also ignored to avoid requiring additional arguments for benchmarking.
345+
# Check stop strings
346+
# Remove stop string from generated_text if STOP_STRING is the finishing reason
347+
stop_strings = req.sampling_params.stop or []
348+
for stop_str in stop_strings:
349+
if req.generated_text.endswith(stop_str):
350+
req.generated_text = req.generated_text[: -len(stop_str)]
351+
req.finish_reason = FinishReason.STOP_STRING
352+
return True
353+
352354
return False
353355

354356
def tokenize(self, text: str) -> List[int]:

python/infinilm/llm/sampling_params.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,10 @@ class SamplingParams:
1515
top_k: int = 1
1616
max_tokens: Optional[int] = None
1717
stop: Optional[List[str]] = None
18-
stop_token_ids: Optional[List[int]] = None # Placeholder for future usage, not currently handled
18+
stop_token_ids: Optional[List[int]] = (
19+
None # Placeholder for future usage, not currently handled
20+
)
21+
ignore_eos: bool = False
1922

2023
def __post_init__(self):
2124
if self.stop is None:
@@ -32,4 +35,5 @@ def clone(self) -> "SamplingParams":
3235
max_tokens=self.max_tokens,
3336
stop=self.stop.copy() if self.stop else None,
3437
stop_token_ids=self.stop_token_ids.copy() if self.stop_token_ids else None,
38+
ignore_eos=self.ignore_eos,
3539
)

python/infinilm/server/inference_server.py

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,7 @@ def __init__(
109109
port: int = 8000,
110110
enable_graph: bool = False,
111111
attn_backend: str = "default",
112+
ignore_eos: bool = False,
112113
):
113114
"""Initialize inference server.
114115
@@ -150,6 +151,7 @@ def __init__(
150151
self.port = port
151152
self.enable_graph = enable_graph
152153
self.attn_backend = attn_backend
154+
self.ignore_eos = ignore_eos
153155

154156
self.engine: AsyncLLMEngine = None
155157

@@ -331,6 +333,7 @@ def pick(key: str, default):
331333
top_k=int(pick("top_k", self.top_k)),
332334
max_tokens=int(max_tokens) if max_tokens is not None else None,
333335
stop=stop,
336+
ignore_eos=self.ignore_eos,
334337
)
335338

336339
async def _stream_chat(self, request_id: str, data: dict, http_request: Request):
@@ -382,7 +385,11 @@ async def _stream_chat(self, request_id: str, data: dict, http_request: Request)
382385
# Skip EOS token text for OpenAI API compatibility
383386
# Check if this token is an EOS token by comparing token_id with eos_token_ids
384387
eos_token_ids = self.engine.engine.eos_token_ids
385-
is_eos_token = eos_token_ids and token_output.token_id in eos_token_ids
388+
is_eos_token = (
389+
not sampling_params.ignore_eos
390+
and eos_token_ids
391+
and token_output.token_id in eos_token_ids
392+
)
386393

387394
if not is_eos_token and token_output.token_text:
388395
# Send token
@@ -631,6 +638,13 @@ def parse_args():
631638
choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"],
632639
help="Logging level",
633640
)
641+
parser.add_argument(
642+
"--ignore-eos",
643+
action="store_true",
644+
dest="ignore_eos",
645+
default=False,
646+
help="Ignore EOS token and continue generation",
647+
)
634648

635649
return parser.parse_args()
636650

@@ -688,6 +702,7 @@ def main():
688702
port=args.port,
689703
enable_graph=args.enable_graph,
690704
attn_backend=args.attn,
705+
ignore_eos=args.ignore_eos,
691706
)
692707
server.start()
693708

0 commit comments

Comments (0)