Skip to content

Commit 1c6f631

Browse files
author
wangpengcheng
committed
issue/407 - adjust to enable_workspace_manager
1 parent 4e78ae1 commit 1c6f631

16 files changed

Lines changed: 452 additions & 156 deletions

csrc/engine/rank_worker.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -278,6 +278,14 @@ void RankWorker::thread_loop() {
278278
if (!model_) {
279279
throw std::runtime_error("Failed to create model");
280280
}
281+
282+
infinicore::context::syncStream();
283+
284+
if (infinilm_config_->enable_workspace_manager) {
285+
forward_context_.workspace_manager.finalize_and_bind(rank_info_.device);
286+
}
287+
infinicore::context::syncStream();
288+
281289
if (enable_graph_compiling_) {
282290
compiler_ = std::make_unique<GeneralCompiler>(model_, barrier_);
283291
}

csrc/global_state/forward_context.hpp

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,10 @@
11
#pragma once
22

33
#include "../models/infinilm_model.hpp"
4+
#include "../utils.hpp"
5+
#include "workspace_manager.hpp"
46
#include <unordered_map>
7+
#include <vector>
58

69
namespace infinilm::global_state {
710

@@ -49,9 +52,7 @@ struct ForwardContext {
4952
AttentionMetadata attn_metadata;
5053
MultiModalMetadata mm_metadata;
5154
std::vector<infinicore::Tensor> kv_cache_vec;
52-
53-
// preallocated workspace for some modules
54-
std::unordered_map<std::string, infinicore::Tensor> preallocated_workspace;
55+
WorkspaceManager workspace_manager;
5556
};
5657

5758
void initialize_forward_context(ForwardContext &forward_context);

csrc/global_state/infinilm_config.hpp

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,14 +19,19 @@ struct InfinilmConfig {
1919
: attention_backend(backend),
2020
model_config(model_config),
2121
max_num_batched_tokens(max_num_batched_tokens) {
22-
const size_t max_position_embeddings = model_config->get<size_t>("max_position_embeddings");
23-
ASSERT(max_num_batched_tokens >= 512 && max_num_batched_tokens <= max_position_embeddings);
22+
23+
if (max_num_batched_tokens > 0) {
24+
const size_t max_position_embeddings = model_config->get<size_t>("max_position_embeddings");
25+
ASSERT(max_num_batched_tokens >= 512 && max_num_batched_tokens <= max_position_embeddings);
26+
enable_workspace_manager = true;
27+
}
2428
}
2529

2630
public:
2731
infinilm::backends::AttentionBackend attention_backend;
2832
std::shared_ptr<infinilm::config::ModelConfig> model_config;
2933
size_t max_num_batched_tokens = 0;
34+
bool enable_workspace_manager{false};
3035
};
3136

3237
/**
Lines changed: 134 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,134 @@
1+
#pragma once
2+
3+
#include "../models/infinilm_model.hpp"
4+
#include "../utils.hpp"
5+
#include <algorithm>
6+
#include <cstdio>
7+
#include <functional>
8+
#include <string>
9+
#include <unordered_map>
10+
#include <vector>
11+
12+
namespace infinilm::global_state {
13+
14+
// class WorkspaceManager {
15+
// public:
16+
// using BindFn = std::function<void(const infinicore::Tensor &)>;
17+
18+
// WorkspaceManager() = default;
19+
// ~WorkspaceManager() = default;
20+
21+
// /// Phase 1: append at the current scratch tail (bump allocator).
22+
// void register_buffer(const std::string &name,
23+
// const infinicore::Shape &shape,
24+
// const infinicore::DataType &dtype,
25+
// const infinicore::Device &device,
26+
// BindFn bind_fn) {
27+
// register_buffer_impl(name, total_bytes_, shape, dtype, device, std::move(bind_fn), true);
28+
// }
29+
30+
// /// Phase 1: register at an explicit byte offset (e.g. logits pinned at 0).
31+
// /// ``total_bytes_`` grows to ``max(total_bytes_, offset + aligned_size)`` without bumping other regions.
32+
// void register_buffer(const std::string &name,
33+
// size_t offset,
34+
// const infinicore::Shape &shape,
35+
// const infinicore::DataType &dtype,
36+
// const infinicore::Device &device,
37+
// BindFn bind_fn) {
38+
// ASSERT(0 == offset);
39+
// register_buffer_impl(name, offset, shape, dtype, device, std::move(bind_fn), false);
40+
// }
41+
42+
// /// Phase 2 + 3: allocate one unified arena and invoke all registered bind callbacks.
43+
// void finalize_and_bind(const infinicore::Device &device) {
44+
// ASSERT(!finalized_);
45+
// if (total_bytes_ == 0) {
46+
// finalized_ = true;
47+
// return;
48+
// }
49+
50+
// ASSERT(device.getType() != infinicore::Device::Type::CPU);
51+
52+
// scratch_buffer_ = infinicore::Tensor::empty({total_bytes_}, infinicore::DataType::U8, device);
53+
54+
// spdlog::info("WorkspaceManager: finalize_and_bind {:.3f} MB", total_bytes_ / 1024.0 / 1024.0);
55+
56+
// for (auto &[name, reg] : registrations_) {
57+
// auto *base_ptr = scratch_buffer_->data() + reg.offset;
58+
// auto view = infinicore::Tensor::from_blob(static_cast<void *>(base_ptr), reg.shape, reg.dtype, device);
59+
// inference_buffers_[name] = view;
60+
// for (auto &bind_fn : reg.bind_callbacks) {
61+
// bind_fn(view);
62+
// }
63+
// }
64+
65+
// finalized_ = true;
66+
// }
67+
68+
// private:
69+
// struct BufferRegistration {
70+
// size_t offset{0};
71+
// size_t aligned_bytes{0};
72+
// infinicore::Shape shape;
73+
// infinicore::DataType dtype;
74+
// infinicore::Device device;
75+
// std::vector<BindFn> bind_callbacks;
76+
// };
77+
78+
// void register_buffer_impl(const std::string &name,
79+
// size_t offset,
80+
// const infinicore::Shape &shape,
81+
// const infinicore::DataType &dtype,
82+
// const infinicore::Device &device,
83+
// BindFn bind_fn,
84+
// bool bump_tail) {
85+
// ASSERT(!finalized_);
86+
// ASSERT(device.getType() != infinicore::Device::Type::CPU);
87+
88+
// auto compute_numel = [](const infinicore::Shape &shape) {
89+
// size_t numel = 1;
90+
// for (const auto dim : shape) {
91+
// numel *= dim;
92+
// }
93+
// return numel;
94+
// };
95+
96+
// auto align_up = [](size_t n, size_t alignment = 512) {
97+
// return (n + alignment - 1) & ~(alignment - 1);
98+
// };
99+
100+
// const size_t actual_bytes = compute_numel(shape) * infinicore::dsize(dtype);
101+
// const size_t aligned_bytes = align_up(actual_bytes);
102+
103+
// if (registrations_.find(name) == registrations_.end()) {
104+
// BufferRegistration reg;
105+
// reg.offset = offset;
106+
// reg.aligned_bytes = aligned_bytes;
107+
// reg.shape = shape;
108+
// reg.dtype = dtype;
109+
// reg.device = device;
110+
111+
// if (bump_tail) {
112+
// total_bytes_ += aligned_bytes;
113+
// } else {
114+
// total_bytes_ = std::max(total_bytes_, offset + aligned_bytes);
115+
// }
116+
// registrations_.emplace(name, std::move(reg));
117+
// }
118+
119+
// auto &reg = registrations_.at(name);
120+
// ASSERT(reg.aligned_bytes == aligned_bytes);
121+
// ASSERT(reg.shape == shape);
122+
// ASSERT(reg.dtype == dtype);
123+
// ASSERT(reg.device == device);
124+
// reg.bind_callbacks.push_back(std::move(bind_fn));
125+
// }
126+
127+
// size_t total_bytes_{0};
128+
// bool finalized_{false};
129+
// infinicore::Tensor scratch_buffer_;
130+
// std::unordered_map<std::string, BufferRegistration> registrations_;
131+
// std::unordered_map<std::string, infinicore::Tensor> inference_buffers_;
132+
// };
133+
134+
}; // namespace infinilm::global_state

csrc/layers/attention/attention.cpp

Lines changed: 56 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
#include "../../utils.hpp"
44
#include "../rotary_embedding/rotary_embedding.hpp"
55
#include <string>
6+
#include <tuple>
67

78
namespace infinilm::layers::attention {
89

@@ -48,7 +49,10 @@ Attention::Attention(std::shared_ptr<infinilm::config::ModelConfig> model_config
4849
init_kv_cache_quant_params(register_fn, device_, kv_cache_k_scale_, kv_cache_v_scale_);
4950

5051
rank_qkv_output_size_ = qkv_proj_->out_features() / static_cast<size_t>(tp_size);
51-
this->_initialize_preallocated_workspace();
52+
enable_workspace_manager_ = infinilm::global_state::get_infinilm_config().enable_workspace_manager;
53+
if (enable_workspace_manager_) {
54+
this->_register_inference_buffer();
55+
}
5256
}
5357

5458
infinicore::Tensor Attention::forward(const infinicore::Tensor &positions,
@@ -68,8 +72,15 @@ infinicore::Tensor Attention::forward_static_(const infinicore::Tensor &position
6872
size_t seq_len = shape[1];
6973

7074
// 1. Project Q, K, V
71-
auto qkv_output = max_qkv_output_->narrow({{0, 0, batch_size * seq_len}})->view({batch_size, seq_len, rank_qkv_output_size_});
72-
auto [q, k, v] = qkv_proj_->forward_split_(qkv_output, hidden_states_mutable);
75+
infinicore::Tensor q;
76+
infinicore::Tensor k;
77+
infinicore::Tensor v;
78+
if (enable_workspace_manager_) {
79+
auto qkv_output = max_qkv_output_->narrow({{0, 0, batch_size * seq_len}})->view({batch_size, seq_len, rank_qkv_output_size_});
80+
std::tie(q, k, v) = qkv_proj_->forward_split_(qkv_output, hidden_states_mutable);
81+
} else {
82+
std::tie(q, k, v) = qkv_proj_->forward_split(hidden_states_mutable);
83+
}
7384

7485
// 2. Reshape for multi-head attention
7586
auto q_reshaped = q->view({batch_size, seq_len, num_attention_heads_, head_dim_});
@@ -96,10 +107,13 @@ infinicore::Tensor Attention::forward_static_(const infinicore::Tensor &position
96107
// 5. Attn Backend calculate
97108
auto attn_output = attn_->forward(q_rope, k_reshaped, v_reshaped);
98109

99-
// 7. Project output
100-
auto o_output = max_o_output_->narrow({{0, 0, batch_size * seq_len}})->view({batch_size, seq_len, hidden_size_});
101-
o_proj_->forward_(o_output, attn_output);
102-
return o_output;
110+
// 6. Project output
111+
if (enable_workspace_manager_) {
112+
auto o_output = max_o_output_->narrow({{0, 0, batch_size * seq_len}})->view({batch_size, seq_len, hidden_size_});
113+
o_proj_->forward_(o_output, attn_output);
114+
return o_output;
115+
}
116+
return o_proj_->forward(attn_output);
103117
}
104118

105119
infinicore::Tensor Attention::forward_paged_(const infinicore::Tensor &position_ids,
@@ -114,8 +128,15 @@ infinicore::Tensor Attention::forward_paged_(const infinicore::Tensor &position_
114128
ASSERT_EQ(batch_size, 1);
115129

116130
// 1. Project Q, K, V
117-
auto qkv_output = max_qkv_output_->narrow({{0, 0, seq_len}})->view({1, seq_len, rank_qkv_output_size_});
118-
auto [q, k, v] = qkv_proj_->forward_split_(qkv_output, hidden_states_mutable);
131+
infinicore::Tensor q;
132+
infinicore::Tensor k;
133+
infinicore::Tensor v;
134+
if (enable_workspace_manager_) {
135+
auto qkv_output = max_qkv_output_->narrow({{0, 0, seq_len}})->view({1, seq_len, rank_qkv_output_size_});
136+
std::tie(q, k, v) = qkv_proj_->forward_split_(qkv_output, hidden_states_mutable);
137+
} else {
138+
std::tie(q, k, v) = qkv_proj_->forward_split(hidden_states_mutable);
139+
}
119140

120141
// 2. Reshape for multi-head attention
121142
auto q_reshaped = q->view({seq_len, num_attention_heads_, head_dim_});
@@ -142,35 +163,44 @@ infinicore::Tensor Attention::forward_paged_(const infinicore::Tensor &position_
142163
auto attn_output = attn_->forward(q_reshaped, k_reshaped, v_reshaped);
143164

144165
// 6. Project output
145-
auto o_output = max_o_output_->narrow({{0, 0, seq_len}})->view({1, seq_len, hidden_size_});
146-
o_proj_->forward_(o_output, attn_output);
147-
return o_output;
166+
if (enable_workspace_manager_) {
167+
auto o_output = max_o_output_->narrow({{0, 0, seq_len}})->view({1, seq_len, hidden_size_});
168+
o_proj_->forward_(o_output, attn_output);
169+
return o_output;
170+
}
171+
return o_proj_->forward(attn_output);
148172
}
149173

150-
void Attention::_initialize_preallocated_workspace() {
174+
void Attention::_register_inference_buffer() {
151175
const auto &infinilm_config = infinilm::global_state::get_infinilm_config();
152-
auto &preallocated_workspace = infinilm::global_state::get_forward_context().preallocated_workspace;
176+
auto &workspace_manager = infinilm::global_state::get_forward_context().workspace_manager;
153177
const size_t max_num_batched_tokens = infinilm_config.max_num_batched_tokens;
154178

179+
ASSERT(rank_qkv_output_size_ > 0 && hidden_size_ > 0);
180+
155181
const std::string attention_cache_key = std::string("Attention_max_num_batched_tokens_")
156182
+ std::to_string(max_num_batched_tokens) + "_rank_qkv_output_size_"
157183
+ std::to_string(rank_qkv_output_size_) + "_hidden_size_"
158184
+ std::to_string(hidden_size_) + "_dtype_"
159185
+ infinicore::toString(dtype_) + "_device_"
160186
+ device_.toString();
161187

162-
size_t max_output_size = std::max(rank_qkv_output_size_, hidden_size_);
163-
if (preallocated_workspace.find(attention_cache_key) == preallocated_workspace.end()) {
164-
auto attention_buffer = infinicore::Tensor::empty({max_num_batched_tokens * max_output_size}, dtype_, device_);
165-
preallocated_workspace[attention_cache_key] = attention_buffer;
166-
}
167-
168-
auto attention_buffer = preallocated_workspace.at(attention_cache_key);
169-
const auto attention_buffer_shape = attention_buffer->shape();
170-
ASSERT(attention_buffer_shape[0] == max_num_batched_tokens * max_output_size);
171-
172-
max_qkv_output_ = attention_buffer->narrow({{0, 0, max_num_batched_tokens * rank_qkv_output_size_}})->view({max_num_batched_tokens, rank_qkv_output_size_});
173-
max_o_output_ = attention_buffer->narrow({{0, 0, max_num_batched_tokens * hidden_size_}})->view({max_num_batched_tokens, hidden_size_});
188+
const size_t max_output_size = std::max(rank_qkv_output_size_, hidden_size_);
189+
const infinicore::Shape attention_buffer_shape = {max_num_batched_tokens * max_output_size};
190+
workspace_manager.register_buffer(
191+
attention_cache_key,
192+
attention_buffer_shape,
193+
dtype_,
194+
device_,
195+
[this, max_num_batched_tokens, max_output_size](const infinicore::Tensor &attention_buffer) {
196+
const auto attention_buffer_shape = attention_buffer->shape();
197+
ASSERT(attention_buffer_shape[0] == max_num_batched_tokens * max_output_size);
198+
199+
max_qkv_output_ = attention_buffer->narrow({{0, 0, max_num_batched_tokens * rank_qkv_output_size_}})
200+
->view({max_num_batched_tokens, rank_qkv_output_size_});
201+
max_o_output_ = attention_buffer->narrow({{0, 0, max_num_batched_tokens * hidden_size_}})
202+
->view({max_num_batched_tokens, hidden_size_});
203+
});
174204
}
175205

176206
void init_kv_cache_quant_params(std::function<void(const std::string &, infinicore::nn::Parameter)> register_fn,

csrc/layers/attention/attention.hpp

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ class Attention : public infinicore::nn::Module {
3939
infinicore::Tensor forward_paged_(const infinicore::Tensor &positions,
4040
const infinicore::Tensor &hidden_states) const;
4141

42-
void _initialize_preallocated_workspace();
42+
void _register_inference_buffer();
4343

4444
protected:
4545
std::shared_ptr<infinilm::layers::linear::QKVParallelLinear> qkv_proj_;
@@ -61,7 +61,9 @@ class Attention : public infinicore::nn::Module {
6161
INFINICORE_NN_PARAMETER(kv_cache_v_scale);
6262

6363
private:
64-
size_t rank_qkv_output_size_;
64+
bool enable_workspace_manager_{false};
65+
66+
size_t rank_qkv_output_size_{0};
6567

6668
// preallocated workspace for Attention
6769
infinicore::Tensor max_qkv_output_;

0 commit comments

Comments
 (0)