Skip to content

Commit fb2f5a4

Browse files
Merge pull request #396 from InfiniTensor/issue/343
feat: MiniCPM-V-2.6 serving
2 parents 4c3e266 + bcbf633 commit fb2f5a4

29 files changed

Lines changed: 1049 additions & 234 deletions

README.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,11 @@
8989
python scripts/test_perf.py --verbose
9090
```
9191

92+
- 单请求推理服务测试
93+
```bash
94+
python test/service/request.py --content="text:Image 1:" --content="image_url:xxx.jpg" --content="text:Image 2:" --content="image_url:xxxx.jpg" --content="text:Compare the 2 images."
95+
```
96+
9297
- 运行推理基准测试(C-Eval/MMLU)
9398

9499
```bash

csrc/engine/infer_engine.cpp

Lines changed: 20 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -89,19 +89,31 @@ InferEngine::Input::to_model_input(infinicore::Device device) const {
8989
-> std::optional<infinicore::Tensor> {
9090
return t.has_value() ? t.value()->to(device) : t;
9191
};
92+
// Helper for optional lists of tensors: returns a copy in which every tensor has been
// passed through `to(device)`.
// - An empty optional is returned as-is (no vector is created).
// - Otherwise a new vector of the same length and element order is returned; the
//   input vector itself is not modified.
auto to_device_vec = [&](const std::optional<std::vector<infinicore::Tensor>> &vec)
93+
-> std::optional<std::vector<infinicore::Tensor>> {
94+
if (!vec.has_value()) {
95+
return vec;
96+
}
97+
std::vector<infinicore::Tensor> result;
98+
// Reserve up front so the push_back loop below does not reallocate.
result.reserve(vec->size());
99+
for (const auto &t : vec.value()) {
100+
result.push_back(t->to(device));
101+
}
102+
return result;
103+
};
92104

93105
infinilm::InfinilmModel::Input input = {
94106
to_device(input_ids), // @todo: on device in the future
95-
to_device(pixel_values),
96107
to_device(position_ids),
97108
to_device(past_sequence_lengths), // @todo: on device in the future
98109
to_device(total_sequence_lengths),
99110
to_device(input_offsets),
100111
to_device(cu_seqlens),
101112
to_device(block_tables),
102113
to_device(slot_mapping),
103-
to_device(image_bound),
104-
to_device(tgt_sizes),
114+
to_device_vec(pixel_values),
115+
to_device_vec(image_bound),
116+
to_device_vec(tgt_sizes),
105117
};
106118

107119
infinilm::global_state::get_forward_context().attn_metadata = {
@@ -110,8 +122,11 @@ InferEngine::Input::to_model_input(infinicore::Device device) const {
110122
input.input_offsets,
111123
input.cu_seqlens,
112124
input.block_tables,
113-
input.slot_mapping,
114-
};
125+
input.slot_mapping};
126+
127+
global_state::get_forward_context().mm_metadata = {
128+
image_req_ids};
129+
115130
return input;
116131
}
117132

csrc/engine/rank_worker.hpp

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -36,8 +36,6 @@ class RankWorker {
3636
struct Input {
3737
/// Token IDs tensor of shape `[batch, seq_len]`.
3838
std::optional<infinicore::Tensor> input_ids;
39-
/// Image pixel values for multi-modal models.
40-
std::optional<infinicore::Tensor> pixel_values;
4139
/// Position IDs tensor of shape `[batch, seq_len]` or `[seq_len]`.
4240
std::optional<infinicore::Tensor> position_ids;
4341
/// Past Lengths of cached sequence for each request, of shape `[num_requests]`.
@@ -52,10 +50,14 @@ class RankWorker {
5250
std::optional<infinicore::Tensor> block_tables;
5351
/// Slot ids for each token `[seq]`. Used for paged cache.
5452
std::optional<infinicore::Tensor> slot_mapping;
53+
/// Image pixel values for multi-modal models.
54+
std::optional<std::vector<infinicore::Tensor>> pixel_values;
5555
/// Image placeholder bounds for MiniCPM-V style replacement.
56-
std::optional<infinicore::Tensor> image_bound;
56+
std::optional<std::vector<infinicore::Tensor>> image_bound;
5757
/// Target patch sizes for each image (MiniCPM-V).
58-
std::optional<infinicore::Tensor> tgt_sizes;
58+
std::optional<std::vector<infinicore::Tensor>> tgt_sizes;
59+
/// Request id within the batch associated with each `pixel_values` entry.
60+
std::optional<std::vector<size_t>> image_req_ids;
5961

6062
float temperature{1};
6163

csrc/global_state/forward_context.hpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,8 +40,13 @@ struct AttentionMetadata {
4040
input.slot_mapping) {}
4141
};
4242

43+
/// Multi-modal (image) metadata carried in the forward context.
struct MultiModalMetadata {
44+
/// Request ids of the image inputs in the current batch; empty optional when not set.
/// NOTE(review): presumably these index the per-request input offsets — confirm against the model code.
std::optional<std::vector<size_t>> image_req_ids;
45+
};
46+
4347
/// State bundled together for a forward pass: attention metadata, multi-modal
/// metadata and the KV cache tensors.
struct ForwardContext {
4448
/// Attention-related metadata for the current forward pass.
AttentionMetadata attn_metadata;
49+
/// Multi-modal metadata (image request ids) for the current forward pass.
MultiModalMetadata mm_metadata;
4550
/// KV cache tensors. NOTE(review): presumably one entry per layer — confirm.
std::vector<infinicore::Tensor> kv_cache_vec;
4651
};
4752

csrc/layers/linear/fused_linear.cpp

Lines changed: 19 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -31,14 +31,14 @@ QKVParallelLinear::QKVParallelLinear(size_t hidden_size,
3131
const infinicore::Device &device,
3232
engine::distributed::RankInfo rank_info)
3333
: infinilm::nn::ColumnParallelLinear(
34-
hidden_size,
35-
calculate_out_feature_size(num_q_head, q_dim, num_k_head, k_dim, num_v_head, v_dim, rank_info),
36-
quantization,
37-
(q_bias || k_bias || v_bias),
38-
dtype,
39-
device,
40-
rank_info.tp_rank,
41-
rank_info.tp_size),
34+
hidden_size,
35+
calculate_out_feature_size(num_q_head, q_dim, num_k_head, k_dim, num_v_head, v_dim, rank_info),
36+
quantization == nullptr ? std::make_shared<infinilm::quantization::NoneQuantization>() : quantization,
37+
(q_bias || k_bias || v_bias),
38+
dtype,
39+
device,
40+
rank_info.tp_rank,
41+
rank_info.tp_size),
4242
q_dim_(q_dim),
4343
k_dim_(k_dim),
4444
v_dim_(v_dim),
@@ -120,7 +120,17 @@ GateUpParallelLinear::GateUpParallelLinear(size_t hidden_size, size_t intermedia
120120
std::shared_ptr<infinilm::quantization::BaseQuantization> quantization,
121121
const infinicore::DataType &dtype, const infinicore::Device &device,
122122
engine::distributed::RankInfo rank_info)
123-
: infinilm::nn::ColumnParallelLinear(hidden_size, intermediate_size * 2, quantization, gate_bias || up_bias, dtype, device, rank_info.tp_rank, rank_info.tp_size), gate_bias_(gate_bias), up_bias_(up_bias) {
123+
: infinilm::nn::ColumnParallelLinear(
124+
hidden_size,
125+
intermediate_size * 2,
126+
quantization == nullptr ? std::make_shared<infinilm::quantization::NoneQuantization>() : quantization,
127+
gate_bias || up_bias,
128+
dtype,
129+
device,
130+
rank_info.tp_rank,
131+
rank_info.tp_size),
132+
gate_bias_(gate_bias),
133+
up_bias_(up_bias) {
124134
if (gate_bias_ != up_bias_) {
125135
throw std::runtime_error("Not supported yet: gate_bias and up_bias should be given at the same time");
126136
}

csrc/layers/linear/fused_linear.hpp

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
#pragma once
22
#include "../../engine/distributed/communication_group.hpp"
3-
#include "linear.hpp"
43
#include "../quantization/quantization.hpp"
4+
#include "linear.hpp"
55
#include <functional>
66

77
namespace infinilm::layers::linear {
@@ -13,15 +13,15 @@ class QKVParallelLinear : public infinilm::nn::ColumnParallelLinear {
1313
size_t q_dim, size_t k_dim, size_t v_dim,
1414
size_t num_q_head, size_t num_k_head, size_t num_v_head,
1515
bool q_bias, bool k_bias, bool v_bias,
16-
std::shared_ptr<infinilm::quantization::BaseQuantization> quantization,
16+
std::shared_ptr<infinilm::quantization::BaseQuantization> quantization = nullptr,
1717
const infinicore::DataType &dtype = infinicore::DataType::F32,
1818
const infinicore::Device &device = infinicore::Device(),
1919
engine::distributed::RankInfo rank_info = engine::distributed::RankInfo());
2020

2121
explicit QKVParallelLinear(size_t hidden_size,
2222
size_t head_dim,
2323
size_t num_q_head, size_t num_kv_head,
24-
std::shared_ptr<infinilm::quantization::BaseQuantization> quantization,
24+
std::shared_ptr<infinilm::quantization::BaseQuantization> quantization = nullptr,
2525
bool bias = false,
2626
const infinicore::DataType &dtype = infinicore::DataType::F32,
2727
const infinicore::Device &device = infinicore::Device(),
@@ -32,7 +32,7 @@ class QKVParallelLinear : public infinilm::nn::ColumnParallelLinear {
3232
size_t num_q_head, size_t num_kv_head,
3333
const std::string &q_name, const std::string &k_name, const std::string &v_name,
3434
RegisterParamFn register_fn,
35-
std::shared_ptr<infinilm::quantization::BaseQuantization> quantization,
35+
std::shared_ptr<infinilm::quantization::BaseQuantization> quantization = nullptr,
3636
bool bias = false,
3737
const infinicore::DataType &dtype = infinicore::DataType::F32,
3838
const infinicore::Device &device = infinicore::Device(),
@@ -84,21 +84,22 @@ class QKVParallelLinear : public infinilm::nn::ColumnParallelLinear {
8484

8585
class GateUpParallelLinear : public infinilm::nn::ColumnParallelLinear {
8686
public:
87-
GateUpParallelLinear(size_t hidden_size, size_t intermediate_size, std::shared_ptr<infinilm::quantization::BaseQuantization> quantization,
87+
GateUpParallelLinear(size_t hidden_size, size_t intermediate_size,
88+
std::shared_ptr<infinilm::quantization::BaseQuantization> quantization = nullptr,
8889
bool bias = false,
8990
const infinicore::DataType &dtype = infinicore::DataType::F32,
9091
const infinicore::Device &device = infinicore::Device(),
9192
engine::distributed::RankInfo rank_info = engine::distributed::RankInfo());
9293

9394
GateUpParallelLinear(size_t hidden_size, size_t intermediate_size, bool gate_bias, bool up_bias,
94-
std::shared_ptr<infinilm::quantization::BaseQuantization> quantization,
95+
std::shared_ptr<infinilm::quantization::BaseQuantization> quantization = nullptr,
9596
const infinicore::DataType &dtype = infinicore::DataType::F32, const infinicore::Device &device = infinicore::Device(),
9697
engine::distributed::RankInfo rank_info = engine::distributed::RankInfo());
9798

9899
GateUpParallelLinear(size_t hidden_size, size_t intermediate_size,
99100
const std::string &gate_name, const std::string &up_name,
100101
RegisterParamFn register_fn,
101-
std::shared_ptr<infinilm::quantization::BaseQuantization> quantization,
102+
std::shared_ptr<infinilm::quantization::BaseQuantization> quantization = nullptr,
102103
bool bias = false,
103104
const infinicore::DataType &dtype = infinicore::DataType::F32,
104105
const infinicore::Device &device = infinicore::Device(),

csrc/layers/quantization/none_quantization.cpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@
44

55
namespace infinilm::quantization {
66

7+
// Default constructor: delegates to the JSON-config constructor with a
// default-constructed nlohmann::json, so a NoneQuantization can be created
// without any quantization config.
NoneQuantization::NoneQuantization() : NoneQuantization(nlohmann::json()) {}
8+
79
std::vector<ParamDescriptor> NoneQuantization::get_param_layout(
810
size_t in_features, size_t out_features,
911
int split_dim, int tp_rank, int tp_size,
@@ -14,8 +16,7 @@ std::vector<ParamDescriptor> NoneQuantization::get_param_layout(
1416
std::vector<ParamDescriptor> descs;
1517
descs.push_back({"weight", {out_features, in_features}, dtype, split_dim, tp_rank, tp_size});
1618
if (bias) {
17-
descs.push_back({"bias", {out_features}, dtype, split_dim >= 0 ? 0 : -1,
18-
split_dim >= 0 ? tp_rank : 0, split_dim >= 0 ? tp_size : 1});
19+
descs.push_back({"bias", {out_features}, dtype, split_dim >= 0 ? 0 : -1, split_dim >= 0 ? tp_rank : 0, split_dim >= 0 ? tp_size : 1});
1920
}
2021
return descs;
2122
}

csrc/layers/quantization/none_quantization.hpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,9 @@ namespace infinilm::quantization {
66
class NoneQuantization : public BaseQuantization {
77
public:
88
explicit NoneQuantization(const nlohmann::json &quant_config)
9-
: BaseQuantization(quant_config) {};
9+
: BaseQuantization(quant_config){};
10+
11+
NoneQuantization();
1012

1113
QuantScheme get_quant_scheme() const override {
1214
return QuantScheme::NONE;

csrc/models/infinilm_model.hpp

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -21,9 +21,6 @@ class InfinilmModel : public infinicore::nn::Module {
2121
struct Input {
2222
/// Token IDs tensor of shape `[batch, seq_len]`.
2323
std::optional<infinicore::Tensor> input_ids;
24-
/// Image pixel values for multi-modal models.
25-
/// Shape is model-specific (e.g. LLaVA: [batch, 3, H, W], MiniCPM-V: [batch, 3, patch, seq_len * patch]).
26-
std::optional<infinicore::Tensor> pixel_values;
2724
/// Position IDs tensor of shape `[batch, seq_len]` or `[seq_len]`.
2825
std::optional<infinicore::Tensor> position_ids;
2926
/// Past Lengths of cached sequence for each request, of shape `[num_requests]`.
@@ -38,12 +35,15 @@ class InfinilmModel : public infinicore::nn::Module {
3835
std::optional<infinicore::Tensor> block_tables;
3936
/// Slot ids for each token `[seq]`. Used for paged cache.
4037
std::optional<infinicore::Tensor> slot_mapping;
38+
/// Image pixel values for multi-modal models.
39+
/// Vector of tensors. Shape is model-specific (e.g. LLaVA: [batch, 3, H, W], MiniCPM-V: [n_patch, 3, filter_H, H * W / filter_H]).
40+
std::optional<std::vector<infinicore::Tensor>> pixel_values;
4141
/// Image placeholder bounds for MiniCPM-V style replacement.
42-
/// Tensor shape: [batch, max_ranges, 2] (start, end).
43-
std::optional<infinicore::Tensor> image_bound;
42+
/// Vector of tensors, each of shape [n_patch, 2].
43+
std::optional<std::vector<infinicore::Tensor>> image_bound;
4444
/// Target patch sizes for each image (MiniCPM-V).
45-
/// Tensor shape: [batch, 2] or [batch, max_slices, 2] if pre-flattened.
46-
std::optional<infinicore::Tensor> tgt_sizes;
45+
/// Vector of tensors, each of shape [n_patch, 2] if pre-flattened.
46+
std::optional<std::vector<infinicore::Tensor>> tgt_sizes;
4747
};
4848

4949
struct Output {

csrc/models/minicpmv/minicpmv_model.cpp

Lines changed: 24 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -33,22 +33,21 @@ MiniCPMVModel::MiniCPMVModel(std::shared_ptr<infinilm::config::ModelConfig> mode
3333
embed_dim,
3434
num_heads,
3535
vision_cfg.value("hidden_size", 768),
36+
vision_cfg.value("image_size", 224),
37+
vision_cfg.value("patch_size", 16),
3638
dtype,
3739
device);
3840
}
3941

40-
infinicore::Tensor MiniCPMVModel::replace_embeddings(const infinicore::Tensor &inputs_embeds,
41-
const infinicore::Tensor &vision_hidden,
42-
const infinicore::Tensor &image_bound) const {
43-
auto out = infinicore::Tensor::empty(inputs_embeds->shape(), inputs_embeds->dtype(), inputs_embeds->device());
44-
out->copy_from(inputs_embeds);
45-
42+
void MiniCPMVModel::replace_embeddings(infinicore::Tensor inputs_embeds,
43+
const infinicore::Tensor &vision_hidden,
44+
const infinicore::Tensor &image_bound) const {
4645
auto bounds_cpu = image_bound->to(infinicore::Device::cpu());
4746
auto batch_size = inputs_embeds->size(0);
4847

4948
ASSERT_EQ(batch_size, 1);
5049
ASSERT_EQ(bounds_cpu->size(0), 1);
51-
auto out_slice = out->squeeze(0);
50+
auto out_slice = inputs_embeds->squeeze(0);
5251
auto bound_slice = bounds_cpu->squeeze(0);
5352
auto vision_len = vision_hidden->size(0);
5453
for (size_t patch = 0; patch < vision_len; ++patch) {
@@ -60,8 +59,6 @@ infinicore::Tensor MiniCPMVModel::replace_embeddings(const infinicore::Tensor &i
6059

6160
out_slice->narrow({{0, size_t(start), size_t(end - start)}})->copy_from(patch_embed);
6261
}
63-
64-
return out;
6562
}
6663

6764
InfinilmModel::Output MiniCPMVModel::forward(const InfinilmModel::Input &input) const {
@@ -70,36 +67,30 @@ InfinilmModel::Output MiniCPMVModel::forward(const InfinilmModel::Input &input)
7067
}
7168
auto input_ids = input.input_ids.value();
7269

73-
if (input.pixel_values.has_value() && input_ids->size(1) > 1) {
74-
if (!input.image_bound.has_value()) {
75-
throw std::runtime_error("MiniCPMVModel: image_bound required for multimodal input");
70+
if (input.pixel_values.has_value() && input.pixel_values.value().size() > 0) {
71+
if (!input.image_bound.has_value() or !input.tgt_sizes.has_value()) {
72+
throw std::runtime_error("MiniCPMVModel: image_bound and tgt_sizes must be provided with pixel_values");
73+
}
74+
if (input.pixel_values->size() != input.image_bound->size() || input.pixel_values->size() != input.tgt_sizes->size()) {
75+
throw std::runtime_error("MiniCPMVModel: pixel_values, image_bound and tgt_sizes must have the same number of elements");
7676
}
77-
auto pixel_values = input.pixel_values.value();
78-
auto vision_embedding = vpm_->forward(pixel_values, input.tgt_sizes);
79-
auto vision_hidden = resampler_->forward(vision_embedding, input.tgt_sizes);
8077

8178
auto inputs_embeds = llm_->model().embed_tokens(input_ids);
82-
auto merged_embeds = replace_embeddings(inputs_embeds, vision_hidden, input.image_bound.value());
83-
84-
infinicore::Tensor position_ids;
85-
if (input.position_ids.has_value()) {
86-
position_ids = input.position_ids.value();
87-
} else {
88-
auto batch = merged_embeds->size(0);
89-
auto seq_len = merged_embeds->size(1);
90-
auto pos_cpu = infinicore::Tensor::zeros({batch, seq_len}, infinicore::DataType::I64, infinicore::Device::cpu());
91-
auto *pos_ptr = reinterpret_cast<int64_t *>(pos_cpu->data());
92-
for (size_t b = 0; b < batch; ++b) {
93-
for (size_t i = 0; i < seq_len; ++i) {
94-
pos_ptr[b * seq_len + i] = static_cast<int64_t>(i);
95-
}
96-
}
97-
position_ids = pos_cpu->to(merged_embeds->device());
79+
80+
// inputs_embeds concatenates the tokens of all requests, while images are processed per request
81+
// slice inputs_embeds using request offsets to get the embedding of each request
82+
infinicore::Tensor input_offsets_cpu = input.input_offsets.value()->to(infinicore::Device::cpu());
83+
int32_t *offsets = (int32_t *)(input_offsets_cpu->data());
84+
for (size_t i : global_state::get_forward_context().mm_metadata.image_req_ids.value()) {
85+
auto pixel_values = input.pixel_values.value().at(i);
86+
auto vision_embedding = vpm_->forward(pixel_values, input.tgt_sizes.value().at(i));
87+
auto vision_hidden = resampler_->forward(vision_embedding, input.tgt_sizes.value().at(i));
88+
replace_embeddings(inputs_embeds->narrow({{1, size_t(offsets[i]), size_t(offsets[i + 1] - offsets[i])}}), vision_hidden, input.image_bound.value().at(i));
9889
}
9990

10091
auto hidden_states = llm_->model().forward_embeds(
101-
merged_embeds,
102-
position_ids);
92+
inputs_embeds,
93+
input.position_ids.value());
10394

10495
auto logits = llm_->logits_from_hidden(hidden_states);
10596
return {logits};

0 commit comments

Comments
 (0)