issue/340 - set default max_num_batched_tokens

wangpengcheng · wangpengcheng · commit a7d2c78f5400 · 2026-05-26T07:52:23.000Z
diff --git a/csrc/global_state/infinilm_config.hpp b/csrc/global_state/infinilm_config.hpp
@@ -23,7 +23,7 @@ struct InfinilmConfig {
 
         if (const char *max_num_batched_tokens_env = getenv("INFINILM_MAX_NUM_BATCHED_TOKENS")) {
             max_num_batched_tokens = std::stoi(max_num_batched_tokens_env);
-            ASSERT(max_num_batched_tokens >= 1024 && max_num_batched_tokens < max_position_embeddings);
+            ASSERT(max_num_batched_tokens >= 1024 && max_num_batched_tokens <= max_position_embeddings);
         }
     }
 
diff --git a/python/infinilm/llm/llm.py b/python/infinilm/llm/llm.py
@@ -6,6 +6,7 @@
 - AsyncLLM class for asynchronous streaming (server use)
 """
 
+import os
 import asyncio
 import time
 import uuid
@@ -66,10 +67,20 @@ def __init__(self, config: EngineConfig):
                     f"KV Connector created: {config.kv_transfer_config.kv_connector} "
                     f"(role={config.kv_transfer_config.kv_role})"
                 )
+
+            max_position_embeddings = self.model_runner.model_engine.hf_config[
+                "max_position_embeddings"
+            ]
+            max_num_batched_tokens = int(
+                os.getenv("INFINILM_MAX_NUM_BATCHED_TOKENS", max_position_embeddings)
+            )
+            assert 1024 <= max_num_batched_tokens <= max_position_embeddings
+
             self.scheduler = Scheduler(
                 max_batch_size=config.max_batch_size,
                 num_blocks=config.num_blocks,
                 block_size=config.block_size,
+                max_num_batched_tokens=max_num_batched_tokens,
                 connector=connector,
             )
             logger.info(f"Using Paged KV Cache with num_blocks={config.num_blocks}")
@@ -685,13 +696,13 @@ def add_request(
         elif prompt is not None:
             prompt_token_ids = self.engine.tokenize(prompt)
         else:
-            assert (
-                messages is not None
-            ), "Either messages or prompt/prompt_token_ids must be provided"
+            assert messages is not None, (
+                "Either messages or prompt/prompt_token_ids must be provided"
+            )
 
-            assert (
-                apply_chat_template
-            ), "apply_chat_template needs to be true for multi-role conversation"
+            assert apply_chat_template, (
+                "apply_chat_template needs to be true for multi-role conversation"
+            )
 
             prompt = self.engine.apply_chat_template(
                 messages, add_generation_prompt=add_generation_prompt
diff --git a/python/infinilm/llm/scheduler.py b/python/infinilm/llm/scheduler.py
@@ -41,6 +41,7 @@ def __init__(
         max_batch_size: int = 16,
         num_blocks: int = 512,
         block_size: int = 256,
+        max_num_batched_tokens: int = 1024,
         connector=None,
     ):
         self.waiting_queue = janus.Queue()
@@ -56,13 +57,9 @@ def __init__(
         self.cache_manager = BlockManager(num_blocks=num_blocks, block_size=block_size)
         self.block_size = block_size
 
-        self.connector = connector
+        self.max_num_batched_tokens = max_num_batched_tokens
 
-        assert "INFINILM_MAX_NUM_BATCHED_TOKENS" in os.environ
-        self.max_num_batched_tokens = int(
-            os.getenv("INFINILM_MAX_NUM_BATCHED_TOKENS", 65535)
-        )
-        assert self.max_num_batched_tokens > 1024
+        self.connector = connector
 
     def add_request(self, request: InferenceRequest):
         if request is not None:

Original file line number	Diff line number	Diff line change
`@@ -23,7 +23,7 @@ struct InfinilmConfig {`
`23`	`23`
`24`	`24`	`if (const char *max_num_batched_tokens_env = getenv("INFINILM_MAX_NUM_BATCHED_TOKENS")) {`
`25`	`25`	`max_num_batched_tokens = std::stoi(max_num_batched_tokens_env);`
`26`		`- ASSERT(max_num_batched_tokens >= 1024 && max_num_batched_tokens < max_position_embeddings);`
	`26`	`+ ASSERT(max_num_batched_tokens >= 1024 && max_num_batched_tokens <= max_position_embeddings);`
`27`	`27`	`}`
`28`	`28`	`}`
`29`	`29`