Commit ee262bc

wooway777 authored and PanZezhong1725 committed

issue/204 - support graph in server scripts

1 parent 71fe805 · commit ee262bc

2 files changed

Lines changed: 32 additions & 7 deletions

python/infinilm/llm/llm.py

Lines changed: 16 additions & 6 deletions
@@ -50,6 +50,7 @@ class EngineConfig:
         temperature: Default sampling temperature.
         top_p: Default top-p sampling parameter.
         top_k: Default top-k sampling parameter.
+        enable_graph: Whether to enable graph compiling.
     """

     model_path: str
@@ -63,6 +64,7 @@ class EngineConfig:
     temperature: float = 1.0
     top_p: float = 0.8
     top_k: int = 1
+    enable_graph: bool = False


 class LLMEngine:
@@ -74,11 +76,18 @@ def __init__(self, config: EngineConfig):
         # Initialize device and dtype
         self._init_device()

+        # Initialize KV cache
+        cache_config = PagedKVCacheConfig(
+            num_blocks=config.num_blocks, block_size=config.block_size
+        )
+
         # Initialize model engine
         self.model_engine = InferEngine(
             model_path=config.model_path,
             device=self.device,
             distributed_config=DistConfig(config.tensor_parallel_size),
+            cache_config=cache_config,
+            enable_graph_compiling=config.enable_graph,
         )

         # Load model weights
@@ -92,12 +101,6 @@ def __init__(self, config: EngineConfig):
         )
         self._fix_tokenizer_decoder()

-        # Initialize KV cache
-        cache_config = PagedKVCacheConfig(
-            num_blocks=config.num_blocks, block_size=config.block_size
-        )
-        self.model_engine.reset_cache(cache_config)
-
         # Initialize scheduler
         self.scheduler = Scheduler(
             max_batch_size=config.max_batch_size,
@@ -113,6 +116,7 @@ def __init__(self, config: EngineConfig):
         logger.info(
             f"LLMEngine initialized with model at {config.model_path} "
             f"on device {config.device}"
+            f"enable_graph={config.enable_graph}"
         )

     def _init_device(self):
@@ -308,6 +312,7 @@ def __init__(
         temperature: float = 1.0,
         top_p: float = 0.8,
         top_k: int = 1,
+        enable_graph: bool = False,
     ):
         """Initialize LLM.

@@ -323,6 +328,7 @@ def __init__(
             temperature: Default sampling temperature.
             top_p: Default top-p sampling parameter.
             top_k: Default top-k sampling parameter.
+            enable_graph: Whether to enable graph compiling.
         """
         config = EngineConfig(
             model_path=model_path,
@@ -336,6 +342,7 @@ def __init__(
             temperature=temperature,
             top_p=top_p,
             top_k=top_k,
+            enable_graph=enable_graph,
         )
         self.engine = LLMEngine(config)
         self.config = config
@@ -452,6 +459,7 @@ def __init__(
         temperature: float = 1.0,
         top_p: float = 0.8,
         top_k: int = 1,
+        enable_graph: bool = False,
     ):
         """Initialize AsyncLLMEngine.

@@ -467,6 +475,7 @@ def __init__(
             temperature: Default sampling temperature.
             top_p: Default top-p sampling parameter.
             top_k: Default top-k sampling parameter.
+            enable_graph: Whether to enable graph compiling.
         """
         config = EngineConfig(
             model_path=model_path,
@@ -480,6 +489,7 @@ def __init__(
             temperature=temperature,
             top_p=top_p,
             top_k=top_k,
+            enable_graph=enable_graph,
         )
         self.engine = LLMEngine(config)
         self.config = config

python/infinilm/server/inference_server.py

Lines changed: 16 additions & 1 deletion
@@ -22,7 +22,9 @@
 DEFAULT_REQUEST_TIMEOUT = 1000.0


-def chunk_json(id_, content=None, role=None, finish_reason=None):
+def chunk_json(
+    id_, content=None, role=None, finish_reason=None, model: str = "unknown"
+):
     """Generate JSON chunk for streaming response."""
     delta = {}
     if content:
@@ -65,6 +67,7 @@ def __init__(
         top_k: int = 1,
         host: str = "0.0.0.0",
         port: int = 8000,
+        enable_graph: bool = False,
     ):
         """Initialize inference server.

@@ -82,6 +85,7 @@ def __init__(
             top_k: Default top-k sampling parameter.
             host: Server host address.
             port: Server port number.
+            enable_graph: Whether to enable graph compiling.
         """
         self.model_path = model_path
         self.device = device
@@ -96,6 +100,7 @@ def __init__(
         self.top_k = top_k
         self.host = host
         self.port = port
+        self.enable_graph = enable_graph

         self.engine: AsyncLLMEngine = None

@@ -123,9 +128,11 @@ async def lifespan(app: FastAPI):
                 temperature=self.temperature,
                 top_p=self.top_p,
                 top_k=self.top_k,
+                enable_graph=self.enable_graph,
             )
             self.engine.start()
             logger.info(f"Engine initialized with model at {self.model_path}")
+            logger.info(f" enable_graph: {self.enable_graph}")
             yield
             self.engine.stop()

@@ -407,6 +414,11 @@ def parse_args():
     parser.add_argument("--moore", action="store_true", help="Use Moore device")
     parser.add_argument("--iluvatar", action="store_true", help="Use Iluvatar device")
     parser.add_argument("--cambricon", action="store_true", help="Use Cambricon device")
+    parser.add_argument(
+        "--enable-graph",
+        action="store_true",
+        help="Enable graph compiling",
+    )
     parser.add_argument(
         "--log_level",
         type=str,
@@ -442,6 +454,8 @@ def main():
             "\n"
             "Example: python infinilm.server.inference_server --nvidia --model_path=/data/shared/models/9G7B_MHA/ "
             "--max_tokens=100 --max_batch_size=32 --tp=1 --temperature=1.0 --top_p=0.8 --top_k=1"
+            "\n"
+            "Optional: --enable-paged-attn --enable-graph"
         )
         sys.exit(1)

@@ -459,6 +473,7 @@ def main():
         top_k=args.top_k,
         host=args.host,
         port=args.port,
+        enable_graph=args.enable_graph,
     )
     server.start()
