Skip to content

Commit 429471e

Browse files
issue/343 optimize minicpmv resampler
1 parent 1b05f5f commit 429471e

5 files changed

Lines changed: 63 additions & 72 deletions

File tree

csrc/models/minicpmv/minicpmv_model.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,8 @@ MiniCPMVModel::MiniCPMVModel(std::shared_ptr<infinilm::config::ModelConfig> mode
3333
embed_dim,
3434
num_heads,
3535
vision_cfg.value("hidden_size", 768),
36+
vision_cfg.value("image_size", 224),
37+
vision_cfg.value("patch_size", 16),
3638
dtype,
3739
device);
3840
}

csrc/models/minicpmv/resampler.cpp

Lines changed: 24 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -121,15 +121,20 @@ Resampler::Resampler(size_t num_queries,
121121
size_t embed_dim,
122122
size_t num_heads,
123123
size_t kv_dim,
124+
size_t image_size,
125+
size_t patch_size,
124126
const infinicore::DataType &dtype,
125127
const infinicore::Device &device)
126128
: num_queries_(num_queries),
127129
embed_dim_(embed_dim),
128130
num_heads_(num_heads),
129131
kv_dim_(kv_dim),
132+
image_size_(image_size),
133+
patch_size_(patch_size),
130134
use_kv_proj_(kv_dim != embed_dim) {
131135
INFINICORE_NN_PARAMETER_INIT(query, ({num_queries_, embed_dim_}, dtype, device));
132136
INFINICORE_NN_PARAMETER_INIT(proj, ({embed_dim_, embed_dim_}, dtype, device));
137+
133138
INFINICORE_NN_MODULE_INIT(attn, embed_dim_, num_heads_, dtype, device);
134139
INFINICORE_NN_MODULE_INIT(ln_q, embed_dim_, 1e-6, dtype, device);
135140
INFINICORE_NN_MODULE_INIT(ln_kv, embed_dim_, 1e-6, dtype, device);
@@ -138,6 +143,15 @@ Resampler::Resampler(size_t num_queries,
138143
if (use_kv_proj_) {
139144
INFINICORE_NN_MODULE_INIT(kv_proj, kv_dim_, embed_dim_, false, dtype, device);
140145
}
146+
147+
// Initialize full 2d embeddings with max size, calculate on cpu and copy to gpu
148+
size_t num_patches = image_size_ / patch_size_;
149+
INFINICORE_NN_BUFFER_INIT(embedding_table, ({num_patches, num_patches, embed_dim_}, dtype, device_));
150+
std::vector<float> buf(num_patches * num_patches * embed_dim_);
151+
compute_2d_sincos_pos_embed(buf.data(), embed_dim_, num_patches, num_patches);
152+
auto embedding_table_cpu = infinicore::Tensor::zeros({num_patches, num_patches, embed_dim_}, dtype, infinicore::Device::cpu());
153+
write_pos_embed(embedding_table_cpu->data(), embedding_table_cpu->dtype(), buf.data(), num_patches * num_patches * embed_dim_);
154+
embedding_table_->copy_from(embedding_table_cpu);
141155
}
142156

143157
infinicore::Tensor Resampler::forward(const infinicore::Tensor &x,
@@ -152,32 +166,22 @@ infinicore::Tensor Resampler::forward(const infinicore::Tensor &x,
152166
kv = ln_kv_->forward(kv);
153167

154168
// Build positional embeddings on CPU
155-
std::vector<int64_t> tgt_sizes_host;
156-
157169
auto tgt_cpu = tgt_sizes->to(infinicore::Device::cpu());
158-
auto n = tgt_cpu->numel();
159-
tgt_sizes_host.resize(n);
160-
std::memcpy(tgt_sizes_host.data(), tgt_cpu->data(), n * sizeof(int64_t));
170+
int64_t *tgt_sizes_ptr = (int64_t *)(tgt_cpu->data());
161171

162-
auto pos_cpu = infinicore::Tensor::zeros({batch_size, seq_len, embed_dim_}, kv->dtype(), infinicore::Device::cpu());
163-
auto *pos_ptr = reinterpret_cast<std::byte *>(pos_cpu->data());
164-
const size_t elem_size = pos_cpu->element_size();
172+
auto pos_embeddings = infinicore::Tensor::zeros(kv->shape(), kv->dtype(), kv->device());
165173

166174
for (size_t b = 0; b < batch_size; ++b) {
167-
size_t tgt_h = 1;
168-
size_t tgt_w = seq_len;
169-
if (!tgt_sizes_host.empty()) {
170-
tgt_h = static_cast<size_t>(tgt_sizes_host[b * 2]);
171-
tgt_w = static_cast<size_t>(tgt_sizes_host[b * 2 + 1]);
172-
}
173-
const size_t patch_len = tgt_h * tgt_w;
174-
std::vector<float> buf(patch_len * embed_dim_);
175-
compute_2d_sincos_pos_embed(buf.data(), embed_dim_, tgt_h, tgt_w);
176-
write_pos_embed(pos_ptr + b * seq_len * embed_dim_ * elem_size, pos_cpu->dtype(), buf.data(), patch_len * embed_dim_);
175+
176+
auto tgt_h = static_cast<size_t>(tgt_sizes_ptr[b * 2]);
177+
auto tgt_w = static_cast<size_t>(tgt_sizes_ptr[b * 2 + 1]);
178+
179+
auto src_embeddings = embedding_table_->narrow({{0, 0, tgt_h}, {1, 0, tgt_w}});
180+
auto tgt_embeddings = pos_embeddings->narrow({{0, b, 1}, {1, 0, tgt_h * tgt_w}})->view({tgt_h, tgt_w, embed_dim_});
181+
tgt_embeddings->copy_from(src_embeddings);
177182
}
178183

179-
auto pos = pos_cpu->to(kv->device());
180-
auto kv_with_pos = infinicore::op::add(kv, pos);
184+
auto kv_with_pos = infinicore::op::add(kv, pos_embeddings);
181185

182186
auto q = ln_q_->forward(query_);
183187
if (q->shape().size() == 2) {

csrc/models/minicpmv/resampler.hpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,8 @@ class Resampler : public infinicore::nn::Module {
3939
size_t embed_dim,
4040
size_t num_heads,
4141
size_t kv_dim,
42+
size_t image_size,
43+
size_t patch_size,
4244
const infinicore::DataType &dtype,
4345
const infinicore::Device &device);
4446

@@ -50,10 +52,13 @@ class Resampler : public infinicore::nn::Module {
5052
size_t embed_dim_;
5153
size_t num_heads_;
5254
size_t kv_dim_;
55+
size_t image_size_;
56+
size_t patch_size_;
5357
bool use_kv_proj_;
5458

5559
INFINICORE_NN_PARAMETER(query);
5660
INFINICORE_NN_PARAMETER(proj);
61+
INFINICORE_NN_BUFFER(embedding_table);
5762
INFINICORE_NN_MODULE(infinicore::nn::Linear, kv_proj);
5863
INFINICORE_NN_MODULE(ResamplerAttention, attn);
5964
INFINICORE_NN_MODULE(infinicore::nn::LayerNorm, ln_q);

csrc/pybind11/engine/engine.hpp

Lines changed: 4 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -30,51 +30,6 @@ namespace infinilm::engine {
3030

3131
inline void bind_infer_engine(py::module &m) {
3232
py::class_<InferEngine, std::shared_ptr<InferEngine>> infer_engine(m, "InferEngine");
33-
infer_engine
34-
.def(py::init([](
35-
const InfinilmModel::Config &cfg,
36-
const distributed::DistConfig &dist,
37-
infinicore::Device::Type dev,
38-
std::shared_ptr<const infinilm::cache::CacheConfig> cache_cfg,
39-
bool enable_graph_compiling,
40-
const std::string &attention_backend) {
41-
return std::make_shared<InferEngine>(
42-
cfg,
43-
dist,
44-
dev,
45-
cache_cfg ? cache_cfg.get() : nullptr,
46-
enable_graph_compiling,
47-
infinilm::backends::parse_attention_backend(attention_backend));
48-
}),
49-
py::arg("config"),
50-
py::arg("distributed_config") = distributed::DistConfig(),
51-
py::arg("device_type") = infinicore::context::getDevice().getType(),
52-
py::arg("cache_config") = py::none(),
53-
py::arg("enable_graph_compiling") = false,
54-
py::arg("attention_backend") = "default")
55-
.def("load_param", &InferEngine::load_param,
56-
py::arg("name"), py::arg("param"),
57-
"Load a parameter tensor into all workers (each worker picks its shard)")
58-
.def("state_dict", [](InferEngine &self) {
59-
py::list state_dict_tp_all;
60-
for (const auto &state_dict_tp : self.state_dict()) {
61-
py::dict result;
62-
for (const auto &[name, param] : state_dict_tp) {
63-
result[py::cast(name)] = infinicore::Tensor(param);
64-
}
65-
state_dict_tp_all.append(result);
66-
}
67-
return state_dict_tp_all;
68-
})
69-
.def("process_weights_after_loading", &InferEngine::process_weights_after_loading, "Process the weights after loading on all workers (e.g., for quantization)")
70-
.def(
71-
"forward", [](InferEngine &self, const InferEngine::Input &input) -> InferEngine::Output { return self.forward(input); }, "Run inference on all ranks with arbitrary arguments")
72-
.def(
73-
"reset_cache", [](InferEngine &self, std::shared_ptr<cache::CacheConfig> cfg) { self.reset_cache(cfg ? cfg.get() : nullptr); }, py::arg("cache_config") = py::none())
74-
.def("get_cache_config", [](const InferEngine &self) -> std::shared_ptr<cache::CacheConfig> {
75-
auto cfg = self.get_cache_config();
76-
return cfg ? std::shared_ptr<cache::CacheConfig>(cfg->unique_copy()) : nullptr; })
77-
.def("__repr__", [](const InferEngine &self) { return "<InferEngine: " + std::string(self.get_dist_config()) + ">"; });
7833

7934
infer_engine
8035
.def(py::init([](
@@ -116,8 +71,10 @@ inline void bind_infer_engine(py::module &m) {
11671
return state_dict_tp_all;
11772
})
11873
.def("process_weights_after_loading", &InferEngine::process_weights_after_loading, "Process the weights after loading on all workers (e.g., for quantization)")
119-
.def("forward", [](InferEngine &self, const InferEngine::Input &input) -> InferEngine::Output { return self.forward(input); }, "Run inference on all ranks with arbitrary arguments")
120-
.def("reset_cache", [](InferEngine &self, std::shared_ptr<cache::CacheConfig> cfg) { self.reset_cache(cfg ? cfg.get() : nullptr); }, py::arg("cache_config") = py::none())
74+
.def(
75+
"forward", [](InferEngine &self, const InferEngine::Input &input) -> InferEngine::Output { return self.forward(input); }, "Run inference on all ranks with arbitrary arguments")
76+
.def(
77+
"reset_cache", [](InferEngine &self, std::shared_ptr<cache::CacheConfig> cfg) { self.reset_cache(cfg ? cfg.get() : nullptr); }, py::arg("cache_config") = py::none())
12178
.def("get_cache_config", [](const InferEngine &self) -> std::shared_ptr<cache::CacheConfig> {
12279
auto cfg = self.get_cache_config();
12380
return cfg ? std::shared_ptr<cache::CacheConfig>(cfg->unique_copy()) : nullptr; })

test/service/request.py

Lines changed: 28 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,24 @@ def get_args():
2525
"--port", type=int, default=8000, help="Infer server port, default 8000"
2626
)
2727
parser.add_argument(
28-
"--host", default="127.0.0.1", help="Infer server url, default 127.0.0.1"
28+
"--host",
29+
type=str,
30+
default="127.0.0.1",
31+
help="Infer server url, default 127.0.0.1",
32+
)
33+
34+
parser.add_argument(
35+
"--api-url",
36+
type=str,
37+
default=None,
38+
help="Full service url, if given host and port will be ignored",
39+
)
40+
41+
parser.add_argument(
42+
"--model-name",
43+
type=str,
44+
default="default",
45+
help="Name of the model being served, needed by vllm",
2946
)
3047

3148
return parser.parse_args()
@@ -56,12 +73,12 @@ def build_messages(content_args, system_prompt):
5673
return messages
5774

5875

59-
async def benchmark_user(client, messages):
76+
async def benchmark_user(client, messages, model_name):
6077
try:
6178
print(f" ❓ 提问: {messages}")
6279
start_time = time.time()
6380
stream = await client.chat.completions.create(
64-
model="default",
81+
model=model_name,
6582
messages=messages,
6683
stream=True,
6784
)
@@ -110,8 +127,14 @@ def main():
110127
if not args.content:
111128
args.content = ["text:山东最高的山是?"]
112129
messages = build_messages(args.content, args.system)
113-
client = AsyncOpenAI(base_url=f"http://{args.host}:{args.port}", api_key="default")
114-
asyncio.run(benchmark_user(client, messages))
130+
api_url = (
131+
f"http://{args.api_url}"
132+
if args.api_url is not None
133+
else f"http://{args.host}:{args.port}"
134+
)
135+
136+
client = AsyncOpenAI(base_url=api_url, api_key="default")
137+
asyncio.run(benchmark_user(client, messages, args.model_name))
115138

116139

117140
if __name__ == "__main__":

0 commit comments

Comments
 (0)