Add chunked prefill and prefill CUDA graph

Simon12345777 · Simon12345777 · commit f2c8bab8531d · 2026-05-12T17:45:29.000+08:00
diff --git a/csrc/engine/compiler/general_compiler.cpp b/csrc/engine/compiler/general_compiler.cpp
@@ -1,13 +1,18 @@
 #include "general_compiler.hpp"
 
 namespace infinilm::engine {
-GeneralCompiler::GeneralCompiler(const std::shared_ptr<InfinilmModel> &model, RankBarrier *barrier) : GraphCompiler(model, barrier) {
+GeneralCompiler::GeneralCompiler(const std::shared_ptr<InfinilmModel> &model, RankBarrier *barrier, bool enable_chunk_prefill_graph)
+    : GraphCompiler(model, barrier), enable_chunk_prefill_graph_(enable_chunk_prefill_graph) {
     static_batching_compiler_ = std::make_unique<StaticBatchingCompiler>(model_, barrier);
+    chunk_prefill_compiler_ = std::make_unique<ChunkPrefillCompiler>(model_, barrier);
     paged_compiler_ = std::make_unique<PagedCompiler>(model_, barrier);
 }
 
 void GeneralCompiler::compile() {
     static_batching_compiler_->compile();
+    if (enable_chunk_prefill_graph_) {
+        chunk_prefill_compiler_->compile();
+    }
     paged_compiler_->compile();
 }
 
@@ -19,6 +24,11 @@ GeneralCompiler::Compiled GeneralCompiler::get_compiled(const InfinilmModel::Inp
     if (std::get<0>(result) != nullptr && std::get<1>(result) != nullptr) {
         return result;
     }
+    // chunk-prefill must be checked before decode (decode would also match if chunk_size==1)
+    result = chunk_prefill_compiler_.get()->get_compiled(input);
+    if (std::get<0>(result) != nullptr && std::get<1>(result) != nullptr) {
+        return result;
+    }
     result = paged_compiler_.get()->get_compiled(input);
     return result;
 }
diff --git a/csrc/engine/compiler/general_compiler.hpp b/csrc/engine/compiler/general_compiler.hpp
@@ -1,12 +1,13 @@
 #pragma once
 
+#include "chunk_prefill_compiler.hpp"
 #include "paged_compiler.hpp"
 #include "static_batching_compiler.hpp"
 
 namespace infinilm::engine {
 class GeneralCompiler : public GraphCompiler {
 public:
-    GeneralCompiler(const std::shared_ptr<InfinilmModel> &model, RankBarrier *barrier);
+    GeneralCompiler(const std::shared_ptr<InfinilmModel> &model, RankBarrier *barrier, bool enable_chunk_prefill_graph = false);
 
     void compile() override;
 
@@ -15,5 +16,7 @@ class GeneralCompiler : public GraphCompiler {
 private:
     std::unique_ptr<StaticBatchingCompiler> static_batching_compiler_;
     std::unique_ptr<PagedCompiler> paged_compiler_;
+    std::unique_ptr<ChunkPrefillCompiler> chunk_prefill_compiler_;
+    bool enable_chunk_prefill_graph_;
 };
 } // namespace infinilm::engine
diff --git a/csrc/engine/infer_engine.cpp b/csrc/engine/infer_engine.cpp
@@ -25,6 +25,7 @@ InferEngine::InferEngine(
     infinicore::Device::Type device_type,
     const cache::CacheConfig *cache_config,
     bool enable_graph_compiling,
+    bool enable_chunk_prefill_graph,
     backends::AttentionBackend attention_backend) // Changed parameter
     : communication_group_(distributed_config, device_type),
       legacy_model_config_(config),
@@ -43,6 +44,7 @@ InferEngine::InferEngine(
             cache_config_ != nullptr ? cache_config_.get() : nullptr,
             barrier_.get(),
             enable_graph_compiling,
+            enable_chunk_prefill_graph,
             attention_backend_));
     }
 
@@ -56,6 +58,7 @@ InferEngine::InferEngine(
     infinicore::Device::Type device_type,
     const cache::CacheConfig *cache_config,
     bool enable_graph_compiling,
+    bool enable_chunk_prefill_graph,
     backends::AttentionBackend attention_backend,
     std::optional<infinicore::DataType> kv_cache_dtype) // Changed parameter
     : communication_group_(distributed_config, device_type), attention_backend_(attention_backend) {
@@ -82,6 +85,7 @@ InferEngine::InferEngine(
             cache_config_ != nullptr ? cache_config_.get() : nullptr,
             barrier_.get(),
             enable_graph_compiling,
+            enable_chunk_prefill_graph,
             attention_backend_));
     }
     // Compile the model on all workers
diff --git a/csrc/engine/infer_engine.hpp b/csrc/engine/infer_engine.hpp
@@ -39,6 +39,7 @@ class InferEngine {
         infinicore::Device::Type device_type = infinicore::context::getDevice().getType(),
         const cache::CacheConfig *cache_config = nullptr,
         bool enable_graph_compiling = false,
+        bool enable_chunk_prefill_graph = false,
         backends::AttentionBackend attention_backend = backends::AttentionBackend::Default);
 
     InferEngine(
@@ -47,6 +48,7 @@ class InferEngine {
         infinicore::Device::Type device_type = infinicore::context::getDevice().getType(),
         const cache::CacheConfig *cache_config = nullptr,
         bool enable_graph_compiling = false,
+        bool enable_chunk_prefill_graph = false,
         backends::AttentionBackend attention_backend = backends::AttentionBackend::Default,
         std::optional<infinicore::DataType> kv_cache_dtype = std::nullopt);
 
diff --git a/csrc/engine/rank_worker.cpp b/csrc/engine/rank_worker.cpp
@@ -27,11 +27,13 @@ RankWorker::RankWorker(const InfinilmModel::Config &model_config,
                        const cache::CacheConfig *cache_config,
                        RankBarrier *barrier,
                        bool enable_graph_compiling,
+                       bool enable_chunk_prefill_graph,
                        backends::AttentionBackend attention_backend)
     : legacy_model_config_(model_config),
       rank_info_(rank_info),
       attention_backend_(attention_backend),
       enable_graph_compiling_(enable_graph_compiling),
+      enable_chunk_prefill_graph_(enable_chunk_prefill_graph),
       job_cmd_(Command::INIT),
       has_job_(false),
       job_done_(false),
@@ -56,12 +58,14 @@ RankWorker::RankWorker(
     const cache::CacheConfig *cache_config,
     RankBarrier *barrier,
     bool enable_graph_compiling,
+    bool enable_chunk_prefill_graph,
     backends::AttentionBackend attention_backend)
     : infinilm_config_(infinilm_config),
       model_config_(infinilm_config->model_config),
       rank_info_(rank_info),
       attention_backend_(attention_backend),
       enable_graph_compiling_(enable_graph_compiling),
+      enable_chunk_prefill_graph_(enable_chunk_prefill_graph),
       job_cmd_(Command::INIT),
       has_job_(false),
       job_done_(false),
@@ -303,7 +307,7 @@ void RankWorker::thread_loop() {
                 throw std::runtime_error("Failed to create model");
             }
             if (enable_graph_compiling_) {
-                compiler_ = std::make_unique<GeneralCompiler>(model_, barrier_);
+                compiler_ = std::make_unique<GeneralCompiler>(model_, barrier_, enable_chunk_prefill_graph_);
             }
 
             init_done_ = true;
diff --git a/csrc/engine/rank_worker.hpp b/csrc/engine/rank_worker.hpp
@@ -75,13 +75,15 @@ class RankWorker {
                const cache::CacheConfig *cache_config,
                RankBarrier *barrier,
                bool enable_graph_compiling,
+               bool enable_chunk_prefill_graph,
                backends::AttentionBackend attention_backend);
 
     RankWorker(std::shared_ptr<infinilm::global_state::InfinilmConfig> infinilm_config,
                const distributed::RankInfo &rank_info,
                const cache::CacheConfig *cache_config,
                RankBarrier *barrier,
                bool enable_graph_compiling,
+               bool enable_chunk_prefill_graph,
                backends::AttentionBackend attention_backend);
 
     // Submit a parameter load job and wait until the load completes on the worker thread.
@@ -131,6 +133,7 @@ class RankWorker {
 
     // Graph Compiling
     bool enable_graph_compiling_;
+    bool enable_chunk_prefill_graph_;
     std::unique_ptr<GraphCompiler> compiler_;
 
     // Command for the pending job (protected by mutex_)
diff --git a/csrc/pybind11/engine/engine.hpp b/csrc/pybind11/engine/engine.hpp
@@ -37,20 +37,23 @@ inline void bind_infer_engine(py::module &m) {
                           infinicore::Device::Type dev,
                           std::shared_ptr<const infinilm::cache::CacheConfig> cache_cfg,
                           bool enable_graph_compiling,
+                          bool enable_chunk_prefill_graph,
                           const std::string &attention_backend) {
                  return std::make_shared<InferEngine>(
                      cfg,
                      dist,
                      dev,
                      cache_cfg ? cache_cfg.get() : nullptr,
                      enable_graph_compiling,
+                     enable_chunk_prefill_graph,
                      infinilm::backends::parse_attention_backend(attention_backend));
              }),
              py::arg("config"),
              py::arg("distributed_config") = distributed::DistConfig(),
              py::arg("device_type") = infinicore::context::getDevice().getType(),
              py::arg("cache_config") = py::none(),
              py::arg("enable_graph_compiling") = false,
+             py::arg("enable_chunk_prefill_graph") = false,
              py::arg("attention_backend") = "default")
         .def("load_param", &InferEngine::load_param,
              py::arg("name"), py::arg("param"),
@@ -81,6 +84,7 @@ inline void bind_infer_engine(py::module &m) {
                           infinicore::Device::Type dev,
                           std::shared_ptr<const infinilm::cache::CacheConfig> cache_cfg,
                           bool enable_graph_compiling,
+                          bool enable_chunk_prefill_graph,
                           const std::string &attention_backend,
                           std::optional<infinicore::DataType> kv_cache_dtype) {
                  return std::make_shared<InferEngine>(
@@ -89,6 +93,7 @@ inline void bind_infer_engine(py::module &m) {
                      dev,
                      cache_cfg ? cache_cfg.get() : nullptr,
                      enable_graph_compiling,
+                     enable_chunk_prefill_graph,
                      infinilm::backends::parse_attention_backend(attention_backend),
                      kv_cache_dtype);
              }),
@@ -97,6 +102,7 @@ inline void bind_infer_engine(py::module &m) {
              py::arg("device_type") = infinicore::context::getDevice().getType(),
              py::arg("cache_config") = py::none(),
              py::arg("enable_graph_compiling") = false,
+             py::arg("enable_chunk_prefill_graph") = false,
              py::arg("attention_backend") = "default",
              py::arg("kv_cache_dtype") = py::none())
         .def("load_param", &InferEngine::load_param,
diff --git a/python/infinilm/base_config.py b/python/infinilm/base_config.py
@@ -61,6 +61,8 @@ def __init__(self):
 
         self.attn = self.args.attn
         self.enable_graph = self.args.enable_graph
+        self.enable_chunk_prefill_graph = self.args.enable_chunk_prefill_graph
+        self.chunk_size = self.args.chunk_size
         self.enable_paged_attn = self.args.enable_paged_attn
         self.num_blocks = self.args.num_blocks
         self.block_size = self.args.block_size
@@ -122,6 +124,8 @@ def _add_common_args(self):
             choices=["default", "paged-attn", "flash-attn"],
         )
         self.parser.add_argument("--enable-graph", action="store_true")
+        self.parser.add_argument("--enable-chunk-prefill-graph", action="store_true", help="enable chunk-prefill graph compiling")
+        self.parser.add_argument("--chunk-size", type=int, default=512, help="tokens per chunked-prefill slice (0 to disable)")
         self.parser.add_argument(
             "--enable-paged-attn",
             action="store_true",
diff --git a/python/infinilm/infer_engine.py b/python/infinilm/infer_engine.py
@@ -45,6 +45,7 @@ def __init__(
         distributed_config=DistConfig(1),
         cache_config=None,
         enable_graph_compiling=False,
+        enable_chunk_prefill_graph=False,
         attention_backend="default",
         kv_cache_dtype=None,
     ):
@@ -60,6 +61,7 @@ def __init__(
             device._underlying.type,
             cache_config,
             enable_graph_compiling,
+            enable_chunk_prefill_graph,
             attention_backend,
             (
                 parse_dtype(kv_cache_dtype)._underlying
diff --git a/python/infinilm/llm/llm.py b/python/infinilm/llm/llm.py
@@ -72,6 +72,8 @@ class EngineConfig:
     top_p: float = 0.8
     top_k: int = 1
     enable_graph: bool = False
+    enable_chunk_prefill_graph: bool = False
+    chunk_size: int = 0
     attn_backend: str = "default"
     skip_load: bool = False
 
@@ -91,6 +93,7 @@ def __init__(self, config: EngineConfig):
             device=self.device,
             distributed_config=DistConfig(config.tensor_parallel_size),
             enable_graph_compiling=config.enable_graph,
+            enable_chunk_prefill_graph=config.enable_chunk_prefill_graph,
             attention_backend=config.attn_backend,
         )
 
@@ -167,6 +170,8 @@ def _init_device(self):
 
     def add_request(self, request: InferenceRequest):
         """Add a request to the scheduler."""
+        if self.cache_type == "paged" and self.config.chunk_size > 0:
+            request.chunk_size = self.config.chunk_size
         self.scheduler.add_request(request)
 
     def step(self) -> tuple[list[InferenceRequest], list[tuple]]:
@@ -210,14 +215,39 @@ def _update_requests(
         sampled_tokens: List[int],
     ) -> List[tuple]:
         """Update request status after inference step."""
-        if is_prefill:
+        # Detect a chunked-prefill mid-step: single request, prefill phase,
+        # and this chunk does not yet cover the whole prompt. In that case
+        # we must NOT consume a sampled token, NOT commit prefill blocks,
+        # and re-enqueue the request to keep chunking.
+        chunk_mid_step = (
+            is_prefill
+            and len(requests) == 1
+            and requests[0].is_chunking()
+            and not requests[0].chunk_is_last()
+        )
+
+        if is_prefill and not chunk_mid_step:
             match self.cache_type:
                 case "paged":
                     self.scheduler.cache_manager.reset_req_blocks()
                 case "static":
                     self.scheduler.update_cache()
                 case _:
                     raise ValueError(f"Unsupported cache_type: {self.cache_type}")
+
+        if chunk_mid_step:
+            req = requests[0]
+            req.chunk_prefill_offset += req.chunk_size
+            # If this request was aborted while chunking, drop it.
+            if req.is_aborted():
+                logger.info(
+                    f"Request {req.request_id} aborted by client during chunked-prefill"
+                )
+                return []
+            # Re-enqueue to keep producing chunks; no token sampled yet.
+            self.scheduler.requeue_chunking(req)
+            return []
+
         pending = []
         for req, token_id in zip(requests, sampled_tokens):
             if req.is_aborted():
@@ -227,6 +257,10 @@ def _update_requests(
                 continue
 
             if req.is_prefill:
+                # Clean up chunked-prefill state on the final chunk so the
+                # next forward pass on this request takes the decode path.
+                req.chunk_prefill_offset = 0
+                req.chunk_size = 0
                 req.is_prefill = False
 
             req.generated_token_ids.append(token_id)
@@ -361,6 +395,8 @@ def __init__(
         top_p: float = 0.8,
         top_k: int = 1,
         enable_graph: bool = False,
+        enable_chunk_prefill_graph: bool = False,
+        chunk_size: int = 0,
         attn_backend: str = "default",
         skip_load: bool = False,
     ):
@@ -398,6 +434,8 @@ def __init__(
             top_p=top_p,
             top_k=top_k,
             enable_graph=enable_graph,
+            enable_chunk_prefill_graph=enable_chunk_prefill_graph,
+            chunk_size=chunk_size,
             attn_backend=attn_backend,
             skip_load=skip_load,
         )
@@ -539,6 +577,8 @@ def __init__(
         top_p: float = 0.8,
         top_k: int = 1,
         enable_graph: bool = False,
+        enable_chunk_prefill_graph: bool = False,
+        chunk_size: int = 0,
         attn_backend: str = "default",
     ):
         """Initialize AsyncLLMEngine.
@@ -575,6 +615,8 @@ def __init__(
             top_p=top_p,
             top_k=top_k,
             enable_graph=enable_graph,
+            enable_chunk_prefill_graph=enable_chunk_prefill_graph,
+            chunk_size=chunk_size,
             attn_backend=attn_backend,
         )
         self.engine = LLMEngine(config)
diff --git a/python/infinilm/llm/request.py b/python/infinilm/llm/request.py
@@ -144,6 +144,11 @@ def __init__(
         self.num_cached_tokens: int = 0
         self.num_blocks: int = 0
 
+        # Chunked-prefill state (0 = disabled, otherwise tokens per chunk)
+        self.chunk_size: int = 0
+        # Number of prompt tokens already fed through forward as chunked-prefill
+        self.chunk_prefill_offset: int = 0
+
         # For server use
         self.request_data: Optional[dict] = request_data
         self.http_request: Optional[Any] = http_request
@@ -186,6 +191,18 @@ def get_num_blocks_required(self, block_size: int) -> int:
     def get_max_tokens(self) -> Optional[int]:
         return self.sampling_params.max_tokens
 
+    def is_chunking(self) -> bool:
+        """Return True while in prefill with chunking enabled and a prompt longer than one chunk."""
+        return (
+            self.chunk_size > 0
+            and self.is_prefill
+            and self.prompt_length > self.chunk_size
+        )
+
+    def chunk_is_last(self) -> bool:
+        """Return True if the chunk starting at chunk_prefill_offset reaches the prompt end."""
+        return self.chunk_prefill_offset + self.chunk_size >= self.prompt_length
+
     def is_finished(self) -> bool:
         return self.status in [
             RequestStatus.FINISHED,
diff --git a/python/infinilm/llm/scheduler.py b/python/infinilm/llm/scheduler.py
diff --git a/python/infinilm/processors/basic_llm_processor.py b/python/infinilm/processors/basic_llm_processor.py
diff --git a/python/infinilm/server/inference_server.py b/python/infinilm/server/inference_server.py
diff --git a/scripts/infer_task.py b/scripts/infer_task.py
diff --git a/scripts/launch_server.py b/scripts/launch_server.py