issue/407 - Refine the code

wangpengcheng · wangpengcheng · commit b4d262b7bcc8 · 2026-06-10T01:26:56.000Z
diff --git a/csrc/engine/rank_worker.cpp b/csrc/engine/rank_worker.cpp
@@ -282,7 +282,8 @@ void RankWorker::thread_loop() {
             infinicore::context::syncStream();
 
             if (infinilm_config_->enable_workspace_manager) {
-                forward_context_.workspace_manager.finalize_and_bind(rank_info_.device);
+                forward_context_.workspace_manager.finalize_and_bind();
+                // forward_context_.workspace_manager.log_registrations();
             }
             infinicore::context::syncStream();
 
diff --git a/csrc/global_state/workspace_manager.hpp b/csrc/global_state/workspace_manager.hpp
@@ -2,20 +2,24 @@
 
 #include "../models/infinilm_model.hpp"
 #include "../utils.hpp"
+#include "parallel_state.hpp"
 #include <algorithm>
-#include <cstdio>
 #include <functional>
+#include <iomanip>
+#include <sstream>
 #include <string>
 #include <unordered_map>
 #include <vector>
 
 namespace infinilm::global_state {
 
 /**
- * @brief Unified GPU inference workspace manager.
+ * @brief Unified GPU inference scratch buffer.
  *
- * Phase 1: modules register buffer layouts via ``register_buffer``.
- * Phase 2/3: ``finalize_and_bind`` allocates ``scratch_buffer_`` and binds views.
+ * Flow: register_buffer -> finalize_and_bind -> log_registrations (optional).
+ * Layout: each slot is either bump (appended at the current tail) or pinned@0 (offset fixed at 0).
+ * Slots may overlap, so scratch_bytes is the maximum span, not the sum of slot sizes. Overlapping
+ * slots are only safe if they are never live at the same time (temporal reuse across forward phases).
  */
 class WorkspaceManager {
 public:
@@ -24,93 +28,168 @@ class WorkspaceManager {
     WorkspaceManager() = default;
     ~WorkspaceManager() = default;
 
-    /**
-     * @brief Register a buffer appended at the current scratch_buffer tail.
-     *
-     * @param name Unique cache key; duplicate keys share one slot.
-     * @param shape Tensor shape for the bound view.
-     * @param dtype Element type of the bound view.
-     * @param device Device on which scratch_buffer is allocated.
-     * @param bind_fn Callback invoked in ``finalize_and_bind`` with the bound view.
-     */
+    /** @brief Register a bump slot at current total_bytes_. Same name reuses one slot. */
     void register_buffer(const std::string &name,
                          const infinicore::Shape &shape,
                          const infinicore::DataType &dtype,
                          const infinicore::Device &device,
                          BindFn bind_fn) {
-        register_buffer_impl(name, total_bytes_, shape, dtype, device, std::move(bind_fn), true);
+        _register_buffer_impl(name, total_bytes_, shape, dtype, device, std::move(bind_fn), true);
     }
 
-    /**
-     * @brief Register a buffer pinned at a fixed byte offset.
-     *
-     * @param name Unique cache key; duplicate keys share one slot.
-     * @param offset Byte offset in scratch_buffer (currently only 0 is supported).
-     * @param shape Tensor shape for the bound view.
-     * @param dtype Element type of the bound view.
-     * @param device Device on which scratch_buffer is allocated.
-     * @param bind_fn Callback invoked in ``finalize_and_bind`` with the bound view.
-     */
+    /** @brief Register a pinned@0 slot (only offset==0). May overlap bump slots. */
     void register_buffer(const std::string &name,
                          size_t offset,
                          const infinicore::Shape &shape,
                          const infinicore::DataType &dtype,
                          const infinicore::Device &device,
                          BindFn bind_fn) {
         ASSERT(0 == offset);
-        register_buffer_impl(name, offset, shape, dtype, device, std::move(bind_fn), false);
+        _register_buffer_impl(name, offset, shape, dtype, device, std::move(bind_fn), false);
     }
 
-    /**
-     * @brief Allocate scratch_buffer and run all registered bind callbacks.
-     *
-     * @param device Device on which scratch_buffer is allocated.
-     */
-    void finalize_and_bind(const infinicore::Device &device) {
+    /** @brief Allocate scratch_buffer_ and run bind callbacks. */
+    void finalize_and_bind() {
         ASSERT(!finalized_);
         if (total_bytes_ == 0) {
             finalized_ = true;
             return;
         }
 
-        ASSERT(device.getType() != infinicore::Device::Type::CPU);
+        auto &rank_device = get_tensor_model_parallel_rank_info().device;
 
-        scratch_buffer_ = infinicore::Tensor::empty({total_bytes_}, infinicore::DataType::U8, device);
+        scratch_buffer_ = infinicore::Tensor::empty({total_bytes_}, infinicore::DataType::U8, rank_device);
 
         spdlog::info("WorkspaceManager: finalize_and_bind {:.3f} MB", total_bytes_ / 1024.0 / 1024.0);
 
-        for (auto &[name, reg] : registrations_) {
+        for (auto &entry : registrations_) {
+            auto &reg = entry.second;
             auto *base_ptr = scratch_buffer_->data() + reg.offset;
-            auto view = infinicore::Tensor::from_blob(static_cast<void *>(base_ptr), reg.shape, reg.dtype, device);
-            inference_buffers_[name] = view;
+            ASSERT(rank_device == reg.device);
+            reg.bound_view = infinicore::Tensor::from_blob(static_cast<void *>(base_ptr), reg.shape, reg.dtype, rank_device);
             for (auto &bind_fn : reg.bind_callbacks) {
-                bind_fn(view);
+                bind_fn(reg.bound_view);
             }
         }
 
         finalized_ = true;
     }
 
+    /** @brief Emit one spdlog::info message listing every registered slot (sorted by offset) with its byte range, overlapping slots, shape, dtype, device and callback count. */
+    void log_registrations() const {
+        size_t total_callbacks = 0; // summed over all slots for the header section
+        for (const auto &entry : registrations_) {
+            total_callbacks += entry.second.bind_callbacks.size();
+        }
+
+        std::vector<std::string> names; // slot names; the position in this vector is the printed slot index
+        names.reserve(registrations_.size());
+        for (const auto &entry : registrations_) {
+            names.push_back(entry.first);
+        }
+        std::sort(names.begin(), names.end(), [this](const std::string &a, const std::string &b) {
+            return registrations_.at(a).offset < registrations_.at(b).offset;
+        }); // ascending offset; order among equal offsets is unspecified (std::sort is not stable)
+
+        std::ostringstream oss;
+        oss << std::fixed << std::setprecision(3); // floating-point MB values print with 3 decimals
+        oss << "\n========== WorkspaceManager registrations ==========\n";
+        oss << "  " << std::setw(16) << std::left << "finalized:" << finalized_ << "\n"; // bool streams as 0/1 (no std::boolalpha)
+        oss << "  " << std::setw(16) << std::left << "slots:" << registrations_.size() << "\n";
+        oss << "  " << std::setw(16) << std::left << "bind_callbacks:" << total_callbacks << "\n";
+        oss << "  " << std::setw(16) << std::left << "scratch_bytes:"
+            << total_bytes_ << " (" << (total_bytes_ / 1024.0 / 1024.0) << " MB)\n";
+        oss << "  note: scratch_bytes=max span; slots may overlap (temporal reuse).\n";
+        oss << "----------------------------------------------------\n";
+
+        auto memory_end = [](const BufferRegistration &reg) {
+            return reg.offset + reg.aligned_bytes; // exclusive end of the slot's aligned span
+        };
+        auto ranges_overlap = [](size_t a_start, size_t a_end, size_t b_start, size_t b_end) {
+            return a_start < b_end && b_start < a_end; // intersection test for half-open ranges [start, end)
+        };
+
+        for (size_t slot_idx = 0; slot_idx < names.size(); ++slot_idx) {
+            const auto &name = names[slot_idx];
+            const auto &reg = registrations_.at(name);
+            const size_t mem_start = reg.offset;
+            const size_t mem_end = memory_end(reg);
+
+            std::string shape_str = "["; // rendered as "[d0, d1, ...]"
+            for (size_t i = 0; i < reg.shape.size(); ++i) {
+                if (i > 0) {
+                    shape_str += ", ";
+                }
+                shape_str += std::to_string(reg.shape[i]);
+            }
+            shape_str += "]";
+
+            std::string overlap_str = "none"; // kept when no other slot intersects this one
+            {
+                std::ostringstream overlap_oss;
+                bool first = true; // doubles as "no overlap found yet"
+                for (size_t other_idx = 0; other_idx < names.size(); ++other_idx) { // compares against every other slot, so the whole report is quadratic in slot count
+                    if (other_idx == slot_idx) {
+                        continue;
+                    }
+                    const auto &other = registrations_.at(names[other_idx]);
+                    if (ranges_overlap(mem_start, mem_end, other.offset, memory_end(other))) {
+                        if (!first) {
+                            overlap_oss << ", ";
+                        }
+                        overlap_oss << "slot " << other_idx; // index in the offset-sorted order used for "[slot N]" below
+                        first = false;
+                    }
+                }
+                if (!first) {
+                    overlap_str = overlap_oss.str();
+                }
+            }
+
+            oss << "  [slot " << slot_idx << "]\n";
+            oss << "    " << std::setw(16) << std::left << "layout:"
+                << (reg.is_bump_tail ? "bump" : "pinned@0") << "\n";
+            oss << "    " << std::setw(16) << std::left << "memory:"
+                << "[" << mem_start << ", " << mem_end << ") "
+                << "(" << (reg.aligned_bytes / 1024.0 / 1024.0) << " MB)\n";
+            oss << "    " << std::setw(16) << std::left << "overlaps:" << overlap_str << "\n";
+            oss << "    " << std::setw(16) << std::left << "name:" << name << "\n";
+            oss << "    " << std::setw(16) << std::left << "shape:" << shape_str << "\n";
+            oss << "    " << std::setw(16) << std::left << "dtype:" << infinicore::toString(reg.dtype) << "\n";
+            oss << "    " << std::setw(16) << std::left << "device:" << reg.device.toString() << "\n";
+            oss << "    " << std::setw(16) << std::left << "bind_callbacks:" << reg.bind_callbacks.size() << "\n";
+            oss << "    " << std::setw(16) << std::left << "bound:" << finalized_ << "\n"; // manager-wide flag, printed identically for every slot
+            if (slot_idx + 1 < names.size()) { // blank separator between slots, none after the last
+                oss << "\n";
+            }
+        }
+        oss << "====================================================\n";
+
+        spdlog::info("{}", oss.str()); // whole report goes out as a single log message
+    }
+
 private:
-    /** @brief Metadata for one registered region in scratch_buffer. */
+    /** @brief Metadata for one registered view into scratch_buffer_. */
     struct BufferRegistration {
-        size_t offset{0};
-        size_t aligned_bytes{0};
-        infinicore::Shape shape;
-        infinicore::DataType dtype;
-        infinicore::Device device;
-        std::vector<BindFn> bind_callbacks;
+        size_t offset{0};                   // view start in scratch_buffer_ (not a unique partition id)
+        size_t aligned_bytes{0};            // view span after alignment; used for scratch size accounting
+        bool is_bump_tail{true};            // true=bump tail slot; false=pinned@0 slot
+        infinicore::Shape shape;            // shape of the bound inference view
+        infinicore::DataType dtype;         // element type of the bound inference view
+        infinicore::Device device;          // device passed at registration (must match rank device)
+        infinicore::Tensor bound_view;      // view into scratch_buffer_; valid after finalize_and_bind
+        std::vector<BindFn> bind_callbacks; // callbacks that bind module tensors to bound_view
     };
 
-    void register_buffer_impl(const std::string &name,
-                              size_t offset,
-                              const infinicore::Shape &shape,
-                              const infinicore::DataType &dtype,
-                              const infinicore::Device &device,
-                              BindFn bind_fn,
-                              bool bump_tail) {
+    void _register_buffer_impl(const std::string &name,
+                               size_t offset,
+                               const infinicore::Shape &shape,
+                               const infinicore::DataType &dtype,
+                               const infinicore::Device &device,
+                               BindFn bind_fn,
+                               bool bump_tail) {
         ASSERT(!finalized_);
-        ASSERT(device.getType() != infinicore::Device::Type::CPU);
+        ASSERT(device == get_tensor_model_parallel_rank_info().device);
 
         auto compute_numel = [](const infinicore::Shape &shape) {
             size_t numel = 1;
@@ -131,6 +210,7 @@ class WorkspaceManager {
             BufferRegistration reg;
             reg.offset = offset;
             reg.aligned_bytes = aligned_bytes;
+            reg.is_bump_tail = bump_tail;
             reg.shape = shape;
             reg.dtype = dtype;
             reg.device = device;
@@ -144,6 +224,7 @@ class WorkspaceManager {
         }
 
         auto &reg = registrations_.at(name);
+        ASSERT(reg.is_bump_tail == bump_tail);
         ASSERT(reg.aligned_bytes == aligned_bytes);
         ASSERT(reg.shape == shape);
         ASSERT(reg.dtype == dtype);
@@ -155,7 +236,6 @@ class WorkspaceManager {
     bool finalized_{false};
     infinicore::Tensor scratch_buffer_;
     std::unordered_map<std::string, BufferRegistration> registrations_;
-    std::unordered_map<std::string, infinicore::Tensor> inference_buffers_;
 };
 
 }; // namespace infinilm::global_state
diff --git a/csrc/layers/mlp/mlp.cpp b/csrc/layers/mlp/mlp.cpp
@@ -87,7 +87,7 @@ void MLP::_register_inference_buffer() {
                                     + infinicore::toString(dtype_) + "_device_"
                                     + device_.toString();
 
-    auto align_up = [](size_t n, size_t alignment = 256) {
+    auto align_up = [](size_t n, size_t alignment = 512) {
         return (n + alignment - 1) & ~(alignment - 1);
     };
 

Original file line number	Diff line number	Diff line change
`@@ -282,7 +282,8 @@ void RankWorker::thread_loop() {`
`282`	`282`	`infinicore::context::syncStream();`
`283`	`283`
`284`	`284`	`if (infinilm_config_->enable_workspace_manager) {`
`285`		`- forward_context_.workspace_manager.finalize_and_bind(rank_info_.device);`
	`285`	`+ forward_context_.workspace_manager.finalize_and_bind();`
	`286`	`+ // forward_context_.workspace_manager.log_registrations();`
`286`	`287`	`}`
`287`	`288`	`infinicore::context::syncStream();`
`288`	`289`