@@ -114,10 +114,10 @@ llama_pos mtmd_helper_get_n_pos(const mtmd_input_chunks * chunks) {
114114 return n_pos;
115115}
116116
117- void mtmd_helper_image_get_decoder_pos (const mtmd_image_tokens * chunks, mtmd_decoder_pos * out_pos) {
117+ void mtmd_helper_image_get_decoder_pos (const mtmd_image_tokens * chunks, llama_pos pos_0, mtmd_decoder_pos * out_pos) {
118118 size_t n_tokens = mtmd_image_tokens_get_n_tokens (chunks);
119119 for (size_t i = 0 ; i < n_tokens; i++) {
120- out_pos[i] = mtmd_image_tokens_get_decoder_pos (chunks, i);
120+ out_pos[i] = mtmd_image_tokens_get_decoder_pos (chunks, pos_0, i);
121121 }
122122}
123123
@@ -163,15 +163,15 @@ struct decode_embd_batch {
163163 }
164164
165165 // M-RoPE for image
166- void set_position_mrope_2d (llama_pos pos_0, const std::vector<mtmd_decoder_pos> & rel_pos, llama_seq_id seq_id) {
166+ void set_position_mrope_2d (const std::vector<mtmd_decoder_pos> & rel_pos, llama_seq_id seq_id) {
167167 GGML_ASSERT (n_pos_per_embd == 4 );
168168 GGML_ASSERT (!rel_pos.empty () && (int32_t )rel_pos.size () == batch.n_tokens );
169169 seq_id_0[0 ] = seq_id;
170170 for (int32_t i = 0 ; i < batch.n_tokens ; i++) {
171- pos[i ] = pos_0 + rel_pos[i].t ;
172- pos[i + batch.n_tokens ] = pos_0 + rel_pos[i].y ;
173- pos[i + batch.n_tokens * 2 ] = pos_0 + rel_pos[i].x ;
174- pos[i + batch.n_tokens * 3 ] = 0 ; // last pos dim is unused
171+ pos[i ] = rel_pos[i].t ;
172+ pos[i + batch.n_tokens ] = rel_pos[i].y ;
173+ pos[i + batch.n_tokens * 2 ] = rel_pos[i].x ;
174+ pos[i + batch.n_tokens * 3 ] = rel_pos[i]. z ;
175175 }
176176 for (int i = 0 ; i < batch.n_tokens ; i++) {
177177 batch.n_seq_id [i] = 1 ;
@@ -188,7 +188,7 @@ struct decode_embd_batch {
188188 pos[i ] = pos_0 + i;
189189 pos[i + batch.n_tokens ] = pos_0 + i;
190190 pos[i + batch.n_tokens * 2 ] = pos_0 + i;
191- pos[i + batch.n_tokens * 3 ] = 0 ; // last pos dim is unused
191+ pos[i + batch.n_tokens * 3 ] = pos_0 + i;
192192 }
193193 for (int i = 0 ; i < batch.n_tokens ; i++) {
194194 batch.n_seq_id [i] = 1 ;
@@ -268,8 +268,8 @@ int32_t mtmd_helper_decode_image_chunk(
268268 }
269269 const auto n_tokens = mtmd_image_tokens_get_n_tokens (image_tokens);
270270 std::vector<mtmd_decoder_pos> rel_pos (n_tokens);
271- mtmd_helper_image_get_decoder_pos (image_tokens, rel_pos.data ());
272- batch_embd.set_position_mrope_2d (n_past, rel_pos, seq_id);
271+ mtmd_helper_image_get_decoder_pos (image_tokens, n_past, rel_pos.data ());
272+ batch_embd.set_position_mrope_2d (rel_pos, seq_id);
273273 } else if (chunk_type == MTMD_INPUT_CHUNK_TYPE_AUDIO ) {
274274 batch_embd.set_position_mrope_1d (n_past, seq_id);
275275 } else {
0 commit comments