@@ -33,22 +33,21 @@ MiniCPMVModel::MiniCPMVModel(std::shared_ptr<infinilm::config::ModelConfig> mode
3333 embed_dim,
3434 num_heads,
3535 vision_cfg.value (" hidden_size" , 768 ),
36+ vision_cfg.value (" image_size" , 224 ),
37+ vision_cfg.value (" patch_size" , 16 ),
3638 dtype,
3739 device);
3840}
3941
40- infinicore::Tensor MiniCPMVModel::replace_embeddings (const infinicore::Tensor &inputs_embeds,
41- const infinicore::Tensor &vision_hidden,
42- const infinicore::Tensor &image_bound) const {
43- auto out = infinicore::Tensor::empty (inputs_embeds->shape (), inputs_embeds->dtype (), inputs_embeds->device ());
44- out->copy_from (inputs_embeds);
45-
42+ void MiniCPMVModel::replace_embeddings (infinicore::Tensor inputs_embeds,
43+ const infinicore::Tensor &vision_hidden,
44+ const infinicore::Tensor &image_bound) const {
4645 auto bounds_cpu = image_bound->to (infinicore::Device::cpu ());
4746 auto batch_size = inputs_embeds->size (0 );
4847
4948 ASSERT_EQ (batch_size, 1 );
5049 ASSERT_EQ (bounds_cpu->size (0 ), 1 );
51- auto out_slice = out ->squeeze (0 );
50+ auto out_slice = inputs_embeds ->squeeze (0 );
5251 auto bound_slice = bounds_cpu->squeeze (0 );
5352 auto vision_len = vision_hidden->size (0 );
5453 for (size_t patch = 0 ; patch < vision_len; ++patch) {
@@ -60,8 +59,6 @@ infinicore::Tensor MiniCPMVModel::replace_embeddings(const infinicore::Tensor &i
6059
6160 out_slice->narrow ({{0 , size_t (start), size_t (end - start)}})->copy_from (patch_embed);
6261 }
63-
64- return out;
6562}
6663
6764InfinilmModel::Output MiniCPMVModel::forward (const InfinilmModel::Input &input) const {
@@ -70,36 +67,30 @@ InfinilmModel::Output MiniCPMVModel::forward(const InfinilmModel::Input &input)
7067 }
7168 auto input_ids = input.input_ids .value ();
7269
73- if (input.pixel_values .has_value () && input_ids->size (1 ) > 1 ) {
74- if (!input.image_bound .has_value ()) {
75- throw std::runtime_error (" MiniCPMVModel: image_bound required for multimodal input" );
70+ if (input.pixel_values .has_value () && input.pixel_values .value ().size () > 0 ) {
71+ if (!input.image_bound .has_value () or !input.tgt_sizes .has_value ()) {
72+ throw std::runtime_error (" MiniCPMVModel: image_bound and tgt_sizes must be provided with pixel_values" );
73+ }
74+ if (input.pixel_values ->size () != input.image_bound ->size () || input.pixel_values ->size () != input.tgt_sizes ->size ()) {
75+ throw std::runtime_error (" MiniCPMVModel: pixel_values, image_bound and tgt_sizes must have the same number of elements" );
7676 }
77- auto pixel_values = input.pixel_values .value ();
78- auto vision_embedding = vpm_->forward (pixel_values, input.tgt_sizes );
79- auto vision_hidden = resampler_->forward (vision_embedding, input.tgt_sizes );
8077
8178 auto inputs_embeds = llm_->model ().embed_tokens (input_ids);
82- auto merged_embeds = replace_embeddings (inputs_embeds, vision_hidden, input.image_bound .value ());
83-
84- infinicore::Tensor position_ids;
85- if (input.position_ids .has_value ()) {
86- position_ids = input.position_ids .value ();
87- } else {
88- auto batch = merged_embeds->size (0 );
89- auto seq_len = merged_embeds->size (1 );
90- auto pos_cpu = infinicore::Tensor::zeros ({batch, seq_len}, infinicore::DataType::I64 , infinicore::Device::cpu ());
91- auto *pos_ptr = reinterpret_cast <int64_t *>(pos_cpu->data ());
92- for (size_t b = 0 ; b < batch; ++b) {
93- for (size_t i = 0 ; i < seq_len; ++i) {
94- pos_ptr[b * seq_len + i] = static_cast <int64_t >(i);
95- }
96- }
97- position_ids = pos_cpu->to (merged_embeds->device ());
79+
80+ // inputs_embeds concatenates the tokens of all requests, while images are processed per request;
81+ // slice inputs_embeds by the request offsets to get the embeddings of each request
82+ infinicore::Tensor input_offsets_cpu = input.input_offsets .value ()->to (infinicore::Device::cpu ());
83+ int32_t *offsets = (int32_t *)(input_offsets_cpu->data ());
84+ for (size_t i : global_state::get_forward_context ().mm_metadata .image_req_ids .value ()) {
85+ auto pixel_values = input.pixel_values .value ().at (i);
86+ auto vision_embedding = vpm_->forward (pixel_values, input.tgt_sizes .value ().at (i));
87+ auto vision_hidden = resampler_->forward (vision_embedding, input.tgt_sizes .value ().at (i));
88+ replace_embeddings (inputs_embeds->narrow ({{1 , size_t (offsets[i]), size_t (offsets[i + 1 ] - offsets[i])}}), vision_hidden, input.image_bound .value ().at (i));
9889 }
9990
10091 auto hidden_states = llm_->model ().forward_embeds (
101- merged_embeds ,
102- position_ids);
92+ inputs_embeds ,
93+ input. position_ids . value () );
10394
10495 auto logits = llm_->logits_from_hidden (hidden_states);
10596 return {logits};
0 commit comments