Skip to content

Commit d493491

Browse files
issue/343 fix batching
1 parent 429471e commit d493491

5 files changed

Lines changed: 15 additions & 20 deletions

File tree

csrc/models/minicpmv/minicpmv_model.cpp

Lines changed: 5 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -39,18 +39,15 @@ MiniCPMVModel::MiniCPMVModel(std::shared_ptr<infinilm::config::ModelConfig> mode
3939
device);
4040
}
4141

42-
infinicore::Tensor MiniCPMVModel::replace_embeddings(const infinicore::Tensor &inputs_embeds,
43-
const infinicore::Tensor &vision_hidden,
44-
const infinicore::Tensor &image_bound) const {
45-
auto out = infinicore::Tensor::empty(inputs_embeds->shape(), inputs_embeds->dtype(), inputs_embeds->device());
46-
out->copy_from(inputs_embeds);
47-
42+
void MiniCPMVModel::replace_embeddings(infinicore::Tensor inputs_embeds,
43+
const infinicore::Tensor &vision_hidden,
44+
const infinicore::Tensor &image_bound) const {
4845
auto bounds_cpu = image_bound->to(infinicore::Device::cpu());
4946
auto batch_size = inputs_embeds->size(0);
5047

5148
ASSERT_EQ(batch_size, 1);
5249
ASSERT_EQ(bounds_cpu->size(0), 1);
53-
auto out_slice = out->squeeze(0);
50+
auto out_slice = inputs_embeds->squeeze(0);
5451
auto bound_slice = bounds_cpu->squeeze(0);
5552
auto vision_len = vision_hidden->size(0);
5653
for (size_t patch = 0; patch < vision_len; ++patch) {
@@ -62,8 +59,6 @@ infinicore::Tensor MiniCPMVModel::replace_embeddings(const infinicore::Tensor &i
6259

6360
out_slice->narrow({{0, size_t(start), size_t(end - start)}})->copy_from(patch_embed);
6461
}
65-
66-
return out;
6762
}
6863

6964
InfinilmModel::Output MiniCPMVModel::forward(const InfinilmModel::Input &input) const {
@@ -90,7 +85,7 @@ InfinilmModel::Output MiniCPMVModel::forward(const InfinilmModel::Input &input)
9085
auto pixel_values = input.pixel_values.value().at(i);
9186
auto vision_embedding = vpm_->forward(pixel_values, input.tgt_sizes.value().at(i));
9287
auto vision_hidden = resampler_->forward(vision_embedding, input.tgt_sizes.value().at(i));
93-
inputs_embeds = replace_embeddings(inputs_embeds->narrow({{1, size_t(offsets[i]), size_t(offsets[i + 1] - offsets[i])}}), vision_hidden, input.image_bound.value().at(i));
88+
replace_embeddings(inputs_embeds->narrow({{1, size_t(offsets[i]), size_t(offsets[i + 1] - offsets[i])}}), vision_hidden, input.image_bound.value().at(i));
9489
}
9590

9691
auto hidden_states = llm_->model().forward_embeds(

csrc/models/minicpmv/minicpmv_model.hpp

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -23,9 +23,9 @@ class MiniCPMVModel : public InfinilmModel {
2323
void reset_cache(const cache::CacheConfig *cache_config) override;
2424

2525
private:
26-
infinicore::Tensor replace_embeddings(const infinicore::Tensor &inputs_embeds,
27-
const infinicore::Tensor &vision_hidden,
28-
const infinicore::Tensor &image_bound) const;
26+
void replace_embeddings(infinicore::Tensor inputs_embeds,
27+
const infinicore::Tensor &vision_hidden,
28+
const infinicore::Tensor &image_bound) const;
2929

3030
std::shared_ptr<infinilm::config::ModelConfig> config_;
3131

csrc/pybind11/engine/engine.hpp

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -72,7 +72,11 @@ inline void bind_infer_engine(py::module &m) {
7272
})
7373
.def("process_weights_after_loading", &InferEngine::process_weights_after_loading, "Process the weights after loading on all workers (e.g., for quantization)")
7474
.def(
75-
"forward", [](InferEngine &self, const InferEngine::Input &input) -> InferEngine::Output { return self.forward(input); }, "Run inference on all ranks with arbitrary arguments")
75+
"forward", [](InferEngine &self, const InferEngine::Input &input) -> InferEngine::Output {
76+
py::gil_scoped_release release;
77+
return self.forward(input);
78+
},
79+
"Run inference on all ranks with arbitrary arguments")
7680
.def(
7781
"reset_cache", [](InferEngine &self, std::shared_ptr<cache::CacheConfig> cfg) { self.reset_cache(cfg ? cfg.get() : nullptr); }, py::arg("cache_config") = py::none())
7882
.def("get_cache_config", [](const InferEngine &self) -> std::shared_ptr<cache::CacheConfig> {

python/infinilm/processors/minicpmv_processor.py

Lines changed: 0 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -155,10 +155,6 @@ def build_model_inputs(
155155
):
156156
import torch
157157

158-
assert len(scheduler_output.scheduled_requests) == 1, (
159-
"Batching is not supported for image inputs yet"
160-
)
161-
162158
num_cached_patch = (
163159
(req.processed_inputs["image_bound"][0][:, 1] <= num_cached)
164160
.sum()

test/service/request.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -39,10 +39,10 @@ def get_args():
3939
)
4040

4141
parser.add_argument(
42-
"--model-name",
42+
"--model",
4343
type=str,
4444
default="default",
45-
help="Name of the model being served, needed by vllm",
45+
help="Name or path of the model being served, needed by vllm",
4646
)
4747

4848
return parser.parse_args()

0 commit comments

Comments
 (0)