InfiniTensor
diff --git a/‎csrc/engine/rank_worker.cpp‎
Lines changed: 8 additions & 0 deletions b/‎csrc/engine/rank_worker.cpp‎
Lines changed: 8 additions & 0 deletions
diff --git a/‎csrc/global_state/forward_context.hpp‎
Lines changed: 4 additions & 4 deletions b/‎csrc/global_state/forward_context.hpp‎
Lines changed: 4 additions & 4 deletions
diff --git a/‎csrc/global_state/infinilm_config.hpp‎
Lines changed: 7 additions & 2 deletions b/‎csrc/global_state/infinilm_config.hpp‎
Lines changed: 7 additions & 2 deletions
diff --git a/‎csrc/global_state/workspace_manager.hpp‎
Lines changed: 161 additions & 0 deletions b/‎csrc/global_state/workspace_manager.hpp‎
Lines changed: 161 additions & 0 deletions
diff --git a/‎csrc/layers/attention/attention.cpp‎
Lines changed: 52 additions & 26 deletions b/‎csrc/layers/attention/attention.cpp‎
Lines changed: 52 additions & 26 deletions
diff --git a/‎csrc/layers/attention/attention.hpp‎
Lines changed: 5 additions & 8 deletions b/‎csrc/layers/attention/attention.hpp‎
Lines changed: 5 additions & 8 deletions
@@ -278,6 +278,14 @@ void RankWorker::thread_loop() {
             if (!model_) {
                 throw std::runtime_error("Failed to create model");
             }
+
+            infinicore::context::syncStream();
+
+            if (infinilm_config_->enable_workspace_manager) {
+                forward_context_.workspace_manager.finalize_and_bind(rank_info_.device);
+            }
+            infinicore::context::syncStream();
+
             if (enable_graph_compiling_) {
                 compiler_ = std::make_unique<GeneralCompiler>(model_, barrier_);
             }
 
@@ -1,7 +1,9 @@
 #pragma once
 
 #include "../models/infinilm_model.hpp"
-#include <unordered_map>
+#include "../utils.hpp"
+#include "workspace_manager.hpp"
+#include <vector>
 
 namespace infinilm::global_state {
 
@@ -49,9 +51,7 @@ struct ForwardContext {
     AttentionMetadata attn_metadata;
     MultiModalMetadata mm_metadata;
     std::vector<infinicore::Tensor> kv_cache_vec;
-
-    // preallocated workspace for some modules
-    std::unordered_map<std::string, infinicore::Tensor> preallocated_workspace;
+    WorkspaceManager workspace_manager;
 };
 
 void initialize_forward_context(ForwardContext &forward_context);
 
@@ -19,14 +19,19 @@ struct InfinilmConfig {
         : attention_backend(backend),
           model_config(model_config),
           max_num_batched_tokens(max_num_batched_tokens) {
-        const size_t max_position_embeddings = model_config->get<size_t>("max_position_embeddings");
-        ASSERT(max_num_batched_tokens >= 512 && max_num_batched_tokens <= max_position_embeddings);
+
+        if (max_num_batched_tokens > 0) {
+            const size_t max_position_embeddings = model_config->get<size_t>("max_position_embeddings");
+            ASSERT(max_num_batched_tokens >= 512 && max_num_batched_tokens <= max_position_embeddings);
+            enable_workspace_manager = true;
+        }
     }
 
 public:
     infinilm::backends::AttentionBackend attention_backend;
     std::shared_ptr<infinilm::config::ModelConfig> model_config;
     size_t max_num_batched_tokens = 0;
+    bool enable_workspace_manager{false};
 };
 
 /**
 
@@ -0,0 +1,161 @@
+#pragma once
+
+#include "../models/infinilm_model.hpp"
+#include "../utils.hpp"
+#include <algorithm>
+#include <cstdio>
+#include <functional>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+namespace infinilm::global_state {
+
+/**
+ * @brief Unified GPU inference workspace manager.
+ *
+ * Phase 1: modules register buffer layouts via ``register_buffer``.
+ * Phase 2/3: ``finalize_and_bind`` allocates ``scratch_buffer_`` once and binds views into it.
+ */
+class WorkspaceManager {
+public:
+    using BindFn = std::function<void(const infinicore::Tensor &)>;
+
+    WorkspaceManager() = default;
+    ~WorkspaceManager() = default;
+
+    /**
+     * @brief Register a buffer appended at the current scratch_buffer tail.
+     *
+     * @param name Unique cache key; duplicate keys share one slot.
+     * @param shape Tensor shape for the bound view.
+     * @param dtype Element type of the bound view.
+     * @param device Device on which scratch_buffer is allocated (must not be CPU).
+     * @param bind_fn Stored, then invoked in ``finalize_and_bind`` with the bound view.
+     */
+    void register_buffer(const std::string &name,
+                         const infinicore::Shape &shape,
+                         const infinicore::DataType &dtype,
+                         const infinicore::Device &device,
+                         BindFn bind_fn) {
+        register_buffer_impl(name, total_bytes_, shape, dtype, device, std::move(bind_fn), true);
+    }
+
+    /**
+     * @brief Register a buffer pinned at a fixed byte offset.
+     *
+     * @param name Unique cache key; duplicate keys share one slot.
+     * @param offset Byte offset in scratch_buffer (currently only 0 is supported).
+     * @param shape Tensor shape for the bound view.
+     * @param dtype Element type of the bound view.
+     * @param device Device on which scratch_buffer is allocated (must not be CPU).
+     * @param bind_fn Stored, then invoked in ``finalize_and_bind`` with the bound view.
+     */
+    void register_buffer(const std::string &name,
+                         size_t offset,
+                         const infinicore::Shape &shape,
+                         const infinicore::DataType &dtype,
+                         const infinicore::Device &device,
+                         BindFn bind_fn) {
+        ASSERT(0 == offset);
+        register_buffer_impl(name, offset, shape, dtype, device, std::move(bind_fn), false);
+    }
+
+    /**
+     * @brief Allocate scratch_buffer and run all registered bind callbacks (callable once).
+     *
+     * @param device Device on which scratch_buffer is allocated.
+     */
+    void finalize_and_bind(const infinicore::Device &device) {
+        ASSERT(!finalized_);
+        if (total_bytes_ == 0) {
+            finalized_ = true;
+            return;
+        }
+
+        ASSERT(device.getType() != infinicore::Device::Type::CPU);
+
+        scratch_buffer_ = infinicore::Tensor::empty({total_bytes_}, infinicore::DataType::U8, device);
+
+        spdlog::info("WorkspaceManager: finalize_and_bind {:.3f} MB", total_bytes_ / 1024.0 / 1024.0);
+        // Give every slot a typed view at its byte offset inside the single allocation.
+        for (auto &[name, reg] : registrations_) {
+            auto *base_ptr = scratch_buffer_->data() + reg.offset;
+            auto view = infinicore::Tensor::from_blob(static_cast<void *>(base_ptr), reg.shape, reg.dtype, device);
+            inference_buffers_[name] = view;
+            for (auto &bind_fn : reg.bind_callbacks) {
+                bind_fn(view);
+            }
+        }
+
+        finalized_ = true;
+    }
+
+private:
+    /** @brief Metadata for one registered region in scratch_buffer. */
+    struct BufferRegistration {
+        size_t offset{0};
+        size_t aligned_bytes{0};
+        infinicore::Shape shape;
+        infinicore::DataType dtype;
+        infinicore::Device device;
+        std::vector<BindFn> bind_callbacks;
+    };
+    /** @brief Shared registration path; bump_tail selects append vs. pinned-offset size accounting. */
+    void register_buffer_impl(const std::string &name,
+                              size_t offset,
+                              const infinicore::Shape &shape,
+                              const infinicore::DataType &dtype,
+                              const infinicore::Device &device,
+                              BindFn bind_fn,
+                              bool bump_tail) {
+        ASSERT(!finalized_);
+        ASSERT(device.getType() != infinicore::Device::Type::CPU);
+
+        auto compute_numel = [](const infinicore::Shape &shape) {
+            size_t numel = 1;
+            for (const auto dim : shape) {
+                numel *= dim;
+            }
+            return numel;
+        };
+        // Round each region up to 512 bytes; the mask form is only valid for power-of-two alignments.
+        auto align_up = [](size_t n, size_t alignment = 512) {
+            return (n + alignment - 1) & ~(alignment - 1);
+        };
+
+        const size_t actual_bytes = compute_numel(shape) * infinicore::dsize(dtype);
+        const size_t aligned_bytes = align_up(actual_bytes);
+        // The first registration of a name creates its slot; later ones only add a callback.
+        if (registrations_.find(name) == registrations_.end()) {
+            BufferRegistration reg;
+            reg.offset = offset;
+            reg.aligned_bytes = aligned_bytes;
+            reg.shape = shape;
+            reg.dtype = dtype;
+            reg.device = device;
+
+            if (bump_tail) {
+                total_bytes_ += aligned_bytes;
+            } else {
+                total_bytes_ = std::max(total_bytes_, offset + aligned_bytes);
+            }
+            registrations_.emplace(name, std::move(reg));
+        }
+        // Every registration under a shared name must agree on size, shape, dtype and device.
+        auto &reg = registrations_.at(name);
+        ASSERT(reg.aligned_bytes == aligned_bytes);
+        ASSERT(reg.shape == shape);
+        ASSERT(reg.dtype == dtype);
+        ASSERT(reg.device == device);
+        reg.bind_callbacks.push_back(std::move(bind_fn));
+    }
+
+    size_t total_bytes_{0};
+    bool finalized_{false};
+    infinicore::Tensor scratch_buffer_;
+    std::unordered_map<std::string, BufferRegistration> registrations_;
+    std::unordered_map<std::string, infinicore::Tensor> inference_buffers_;
+};
+
+} // namespace infinilm::global_state
@@ -3,6 +3,7 @@
 #include "../../utils.hpp"
 #include "../rotary_embedding/rotary_embedding.hpp"
 #include <string>
+#include <tuple>
 
 namespace infinilm::layers::attention {
 
@@ -48,7 +49,10 @@ Attention::Attention(std::shared_ptr<infinilm::config::ModelConfig> model_config
     init_kv_cache_quant_params(register_fn, device_, kv_cache_k_scale_, kv_cache_v_scale_);
 
     rank_qkv_output_size_ = qkv_proj_->out_features() / static_cast<size_t>(tp_size);
-    this->_initialize_preallocated_workspace();
+    enable_workspace_manager_ = infinilm::global_state::get_infinilm_config().enable_workspace_manager;
+    if (enable_workspace_manager_) {
+        this->_register_inference_buffer();
+    }
 }
 
 infinicore::Tensor Attention::forward(const infinicore::Tensor &positions,
@@ -68,8 +72,13 @@ infinicore::Tensor Attention::forward_static_(const infinicore::Tensor &position
     size_t seq_len = shape[1];
 
     // 1. Project Q, K, V
-    auto qkv_output = max_qkv_output_->narrow({{0, 0, batch_size * seq_len}})->view({batch_size, seq_len, rank_qkv_output_size_});
-    auto [q, k, v] = qkv_proj_->forward_split_(qkv_output, hidden_states_mutable);
+    infinicore::Tensor q, k, v;
+    if (enable_workspace_manager_) {
+        auto qkv_output = max_qkv_output_->narrow({{0, 0, batch_size * seq_len}})->view({batch_size, seq_len, rank_qkv_output_size_});
+        std::tie(q, k, v) = qkv_proj_->forward_split_(qkv_output, hidden_states_mutable);
+    } else {
+        std::tie(q, k, v) = qkv_proj_->forward_split(hidden_states_mutable);
+    }
 
     // 2. Reshape for multi-head attention
     auto q_reshaped = q->view({batch_size, seq_len, num_attention_heads_, head_dim_});
@@ -96,10 +105,13 @@ infinicore::Tensor Attention::forward_static_(const infinicore::Tensor &position
     // 5. Attn Backend calculate
     auto attn_output = attn_->forward(q_rope, k_reshaped, v_reshaped);
 
-    // 7. Project output
-    auto o_output = max_o_output_->narrow({{0, 0, batch_size * seq_len}})->view({batch_size, seq_len, hidden_size_});
-    o_proj_->forward_(o_output, attn_output);
-    return o_output;
+    // 6. Project output
+    if (enable_workspace_manager_) {
+        auto o_output = max_o_output_->narrow({{0, 0, batch_size * seq_len}})->view({batch_size, seq_len, hidden_size_});
+        o_proj_->forward_(o_output, attn_output);
+        return o_output;
+    }
+    return o_proj_->forward(attn_output);
 }
 
 infinicore::Tensor Attention::forward_paged_(const infinicore::Tensor &position_ids,
@@ -114,8 +126,13 @@ infinicore::Tensor Attention::forward_paged_(const infinicore::Tensor &position_
     ASSERT_EQ(batch_size, 1);
 
     // 1. Project Q, K, V
-    auto qkv_output = max_qkv_output_->narrow({{0, 0, seq_len}})->view({1, seq_len, rank_qkv_output_size_});
-    auto [q, k, v] = qkv_proj_->forward_split_(qkv_output, hidden_states_mutable);
+    infinicore::Tensor q, k, v;
+    if (enable_workspace_manager_) {
+        auto qkv_output = max_qkv_output_->narrow({{0, 0, seq_len}})->view({1, seq_len, rank_qkv_output_size_});
+        std::tie(q, k, v) = qkv_proj_->forward_split_(qkv_output, hidden_states_mutable);
+    } else {
+        std::tie(q, k, v) = qkv_proj_->forward_split(hidden_states_mutable);
+    }
 
     // 2. Reshape for multi-head attention
     auto q_reshaped = q->view({seq_len, num_attention_heads_, head_dim_});
@@ -142,35 +159,44 @@ infinicore::Tensor Attention::forward_paged_(const infinicore::Tensor &position_
     auto attn_output = attn_->forward(q_reshaped, k_reshaped, v_reshaped);
 
     // 6. Project output
-    auto o_output = max_o_output_->narrow({{0, 0, seq_len}})->view({1, seq_len, hidden_size_});
-    o_proj_->forward_(o_output, attn_output);
-    return o_output;
+    if (enable_workspace_manager_) {
+        auto o_output = max_o_output_->narrow({{0, 0, seq_len}})->view({1, seq_len, hidden_size_});
+        o_proj_->forward_(o_output, attn_output);
+        return o_output;
+    }
+    return o_proj_->forward(attn_output);
 }
 
-void Attention::_initialize_preallocated_workspace() {
+/**
+ * @brief Register this layer's scratch buffer with the forward context's workspace manager.
+ *
+ * Requests one flat buffer of max_num_batched_tokens * max(rank_qkv_output_size_, hidden_size_)
+ * elements of dtype_ on device_, and supplies a callback that points max_qkv_output_ and
+ * max_o_output_ at it. Both views are narrowed from index 0 of that same buffer, so the
+ * QKV output and the O output overlap in memory.
+ */
+void Attention::_register_inference_buffer() {
     const auto &infinilm_config = infinilm::global_state::get_infinilm_config();
-    auto &preallocated_workspace = infinilm::global_state::get_forward_context().preallocated_workspace;
+    auto &workspace_manager = infinilm::global_state::get_forward_context().workspace_manager;
     const size_t max_num_batched_tokens = infinilm_config.max_num_batched_tokens;
 
+    // Both widths feed the buffer size below, so they must already be set to non-zero values.
+    ASSERT(rank_qkv_output_size_ > 0 && hidden_size_ > 0);
+
+    // The key encodes every value that determines the buffer layout (token budget, both widths,
+    // dtype, device), so layers configured identically build the same key.
     const std::string attention_cache_key = std::string("Attention_max_num_batched_tokens_")
                                           + std::to_string(max_num_batched_tokens) + "_rank_qkv_output_size_"
                                           + std::to_string(rank_qkv_output_size_) + "_hidden_size_"
                                           + std::to_string(hidden_size_) + "_dtype_"
                                           + infinicore::toString(dtype_) + "_device_"
                                           + device_.toString();
 
-    size_t max_output_size = std::max(rank_qkv_output_size_, hidden_size_);
-    if (preallocated_workspace.find(attention_cache_key) == preallocated_workspace.end()) {
-        auto attention_buffer = infinicore::Tensor::empty({max_num_batched_tokens * max_output_size}, dtype_, device_);
-        preallocated_workspace[attention_cache_key] = attention_buffer;
-    }
-
-    auto attention_buffer = preallocated_workspace.at(attention_cache_key);
-    const auto attention_buffer_shape = attention_buffer->shape();
-    ASSERT(attention_buffer_shape[0] == max_num_batched_tokens * max_output_size);
-
-    max_qkv_output_ = attention_buffer->narrow({{0, 0, max_num_batched_tokens * rank_qkv_output_size_}})->view({max_num_batched_tokens, rank_qkv_output_size_});
-    max_o_output_ = attention_buffer->narrow({{0, 0, max_num_batched_tokens * hidden_size_}})->view({max_num_batched_tokens, hidden_size_});
+    // Size the buffer for the wider of the two projections so a single region can serve both.
+    const size_t max_output_size = std::max(rank_qkv_output_size_, hidden_size_);
+    const infinicore::Shape attention_buffer_shape = {max_num_batched_tokens * max_output_size};
+    workspace_manager.register_buffer(
+        attention_cache_key,
+        attention_buffer_shape,
+        dtype_,
+        device_,
+        // Captures `this`: the callback is stored and run later (WorkspaceManager documents that
+        // it is invoked from finalize_and_bind), so this Attention object must still exist at the
+        // same address at that point.
+        [this, max_num_batched_tokens, max_output_size](const infinicore::Tensor &attention_buffer) {
+            // Sanity-check that the bound view has the element count that was requested.
+            const auto attention_buffer_shape = attention_buffer->shape();
+            ASSERT(attention_buffer_shape[0] == max_num_batched_tokens * max_output_size);
+
+            // Two 2-D views over the same storage: rows are tokens, columns are the projection width.
+            max_qkv_output_ = attention_buffer->narrow({{0, 0, max_num_batched_tokens * rank_qkv_output_size_}})
+                                  ->view({max_num_batched_tokens, rank_qkv_output_size_});
+            max_o_output_ = attention_buffer->narrow({{0, 0, max_num_batched_tokens * hidden_size_}})
+                                ->view({max_num_batched_tokens, hidden_size_});
+        });
 }
 
 void init_kv_cache_quant_params(std::function<void(const std::string &, infinicore::nn::Parameter)> register_fn,
 
@@ -5,8 +5,6 @@
 #include "../../global_state/global_state.hpp"
 #include "../linear/linear.hpp"
 #include "backends/attention_layer.hpp"
-#include "infinicore/device.hpp"
-#include "infinicore/dtype.hpp"
 #include "infinicore/nn/module.hpp"
 #include "infinicore/nn/rope.hpp"
 #include "infinicore/tensor.hpp"
@@ -39,7 +37,7 @@ class Attention : public infinicore::nn::Module {
     infinicore::Tensor forward_paged_(const infinicore::Tensor &positions,
                                       const infinicore::Tensor &hidden_states) const;
 
-    void _initialize_preallocated_workspace();
+    void _register_inference_buffer();
 
 protected:
     std::shared_ptr<infinilm::layers::linear::QKVParallelLinear> qkv_proj_;
@@ -61,11 +59,10 @@ class Attention : public infinicore::nn::Module {
     INFINICORE_NN_PARAMETER(kv_cache_v_scale);
 
 private:
-    size_t rank_qkv_output_size_;
-
-    // preallocated workspace for Attention
-    infinicore::Tensor max_qkv_output_;
-    infinicore::Tensor max_o_output_;
+    bool enable_workspace_manager_{false};
+    size_t rank_qkv_output_size_{0};
+    infinicore::Tensor max_qkv_output_; // inference buffer for Attention
+    infinicore::Tensor max_o_output_;   // inference buffer for Attention
 };
 void init_kv_cache_quant_params(std::function<void(const std::string &, infinicore::nn::Parameter)> register_fn,
                                 const infinicore::Device &device,