Merge pull request #260 from InfiniTensor/issue/259

wooway777 · web-flow · commit 3b8e1cb756fe · 2026-03-09T14:15:59.000+08:00
issue/259 - add attn backend option to inference server
diff --git a/python/infinilm/llm/llm.py b/python/infinilm/llm/llm.py
@@ -55,6 +55,7 @@ class EngineConfig:
         top_p: Default top-p sampling parameter.
         top_k: Default top-k sampling parameter.
         enable_graph: Whether to enable graph compiling.
+        attn_backend: Attention backend to use ('default', 'flash-attn').
     """
 
     model_path: str
@@ -71,6 +72,7 @@ class EngineConfig:
     top_p: float = 0.8
     top_k: int = 1
     enable_graph: bool = False
+    attn_backend: str = "default"
 
 
 class LLMEngine:
@@ -88,6 +90,7 @@ def __init__(self, config: EngineConfig):
             device=self.device,
             distributed_config=DistConfig(config.tensor_parallel_size),
             enable_graph_compiling=config.enable_graph,
+            attention_backend=config.attn_backend,
         )
 
         # Load model weights
@@ -383,6 +386,7 @@ def __init__(
         top_p: float = 0.8,
         top_k: int = 1,
         enable_graph: bool = False,
+        attn_backend: str = "default",
     ):
         """Initialize LLM.
 
@@ -401,6 +405,7 @@ def __init__(
             top_p: Default top-p sampling parameter.
             top_k: Default top-k sampling parameter.
             enable_graph: Whether to enable graph compiling.
+            attn_backend: Attention backend to use ('default', 'flash-attn').
         """
         config = EngineConfig(
             model_path=model_path,
@@ -417,6 +422,7 @@ def __init__(
             top_p=top_p,
             top_k=top_k,
             enable_graph=enable_graph,
+            attn_backend=attn_backend,
         )
         self.engine = LLMEngine(config)
         self.config = config
@@ -536,6 +542,7 @@ def __init__(
         top_p: float = 0.8,
         top_k: int = 1,
         enable_graph: bool = False,
+        attn_backend: str = "default",
     ):
         """Initialize AsyncLLMEngine.
 
@@ -554,6 +561,7 @@ def __init__(
             top_p: Default top-p sampling parameter.
             top_k: Default top-k sampling parameter.
             enable_graph: Whether to enable graph compiling.
+            attn_backend: Attention backend to use ('default', 'flash-attn').
         """
         config = EngineConfig(
             model_path=model_path,
@@ -570,6 +578,7 @@ def __init__(
             top_p=top_p,
             top_k=top_k,
             enable_graph=enable_graph,
+            attn_backend=attn_backend,
         )
         self.engine = LLMEngine(config)
         self.config = config
diff --git a/python/infinilm/server/inference_server.py b/python/infinilm/server/inference_server.py
@@ -108,6 +108,7 @@ def __init__(
         host: str = "0.0.0.0",
         port: int = 8000,
         enable_graph: bool = False,
+        attn_backend: str = "default",
     ):
         """Initialize inference server.
 
@@ -128,6 +129,7 @@ def __init__(
             host: Server host address.
             port: Server port number.
             enable_graph: Whether to enable graph compiling.
+            attn_backend: Attention backend to use ('default', 'flash-attn').
         """
         self.model_path = model_path
         # vLLM-like served model id: directory name of model_path
@@ -147,6 +149,7 @@ def __init__(
         self.host = host
         self.port = port
         self.enable_graph = enable_graph
+        self.attn_backend = attn_backend
 
         self.engine: AsyncLLMEngine = None
 
@@ -177,6 +180,7 @@ async def lifespan(app: FastAPI):
                 top_p=self.top_p,
                 top_k=self.top_k,
                 enable_graph=self.enable_graph,
+                attn_backend=self.attn_backend,
             )
             self.engine.start()
             logger.info(f"Engine initialized with model at {self.model_path}")
@@ -613,6 +617,13 @@ def parse_args():
         action="store_true",
         help="Enable graph compiling",
     )
+    parser.add_argument(
+        "--attn",
+        type=str,
+        default="default",
+        choices=["default", "flash-attn"],
+        help="Attention backend to use: 'default' or 'flash-attn'",
+    )
     parser.add_argument(
         "--log_level",
         type=str,
@@ -655,7 +666,7 @@ def main():
             "Example: python infinilm.server.inference_server --nvidia --model_path=/data/shared/models/9G7B_MHA/ "
             "--max_tokens=100 --max_batch_size=32 --tp=1 --temperature=1.0 --top_p=0.8 --top_k=1"
             "\n"
-            "Optional: --enable-paged-attn --enable-graph"
+            "Optional: --enable-paged-attn --enable-graph --attn=default"
         )
         sys.exit(1)
 
@@ -676,6 +687,7 @@ def main():
         host=args.host,
         port=args.port,
         enable_graph=args.enable_graph,
+        attn_backend=args.attn,
     )
     server.start()