@@ -201,8 +201,6 @@ def preprocess_vlm_conversations(
         - pixel_values: List of pixel values for images in the examples.
         - image_grid_thw: List of image grid tensors.
     """
-    system_prompt = chat_template.system_prompt
-
     # prepare result
     results = {
         "input_ids": [],
@@ -212,16 +210,15 @@ def preprocess_vlm_conversations(
         "image_grid_thw": [],
     }
 
-    # Note: currently, we assume that each example has only one image
-    for i, image in enumerate(examples["image"]):
+    for i, images in enumerate(examples["images"]):
         source = examples["conversations"][i]
         messages = []
         # messages = [{"role": "system", "content": system_prompt}]
         if not source:
             # if the source is None, skip it
             continue
 
-        if not image:
+        if not images:
             text_messages = []
             convroles = ["user", "assistant"]
             for j, sentence in enumerate(source):
@@ -266,26 +263,17 @@ def preprocess_vlm_conversations(
         source = source[1:]
 
         convroles = ["user", "assistant"]
-        has_added_image = False
+        has_added_images = False
         for j, sentence in enumerate(source):
             role = sentence["role"]
             assert role == convroles[j % 2], f"unexpected role {role}"
             if role == "user":
-                # if the message is from user and has image, process the image
-                if not has_added_image:
-                    messages.append(
-                        {
-                            "role": role,
-                            "content": [
-                                {
-                                    "type": "image",
-                                    "image": image,
-                                },
-                                {"type": "text", "text": sentence["content"]},
-                            ],
-                        }
-                    )
-                    has_added_image = True
+                # Insert all images into the first user message
+                if not has_added_images:
+                    content = [{"type": "image", "image": img} for img in images]
+                    content.append({"type": "text", "text": sentence["content"]})
+                    messages.append({"role": role, "content": content})
+                    has_added_images = True
             else:
                 messages.append({"role": role, "content": sentence["content"]})
         else:
@@ -318,7 +306,7 @@ def preprocess_vlm_conversations(
         input_ids = encoding.input_ids[0]
         offsets = encoding.offset_mapping[0]
         pixel_values = encoding.pixel_values
-        image_grid_thw = encoding.image_grid_thw[0]
+        image_grid_thw = encoding.image_grid_thw  # shape: (num_images, 3)
 
         # get conversation with image info for loss mask generation
         decoded_conversation = processor.tokenizer.decode(
@@ -334,7 +322,7 @@ def preprocess_vlm_conversations(
         results["loss_mask"].append(loss_mask[None, :])
         results["attention_mask"].append(torch.ones_like(loss_mask)[None, :])
         results["pixel_values"].append(pixel_values)
-        results["image_grid_thw"].append(image_grid_thw[None, :])
+        results["image_grid_thw"].append(image_grid_thw)
     return results
 
 
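Below is a minimal, self-contained sketch (not part of the diff) of what the multi-image change does: every image in an example is packed into the first user message ahead of its text, and `image_grid_thw` is appended without the old `[0]` indexing, so each entry keeps its `(num_images, 3)` shape. The toy PIL images and grid values are assumptions for illustration only.

```python
import torch
from PIL import Image

# Toy example data; the field names mirror the diff ("images", "content").
images = [Image.new("RGB", (64, 64)), Image.new("RGB", (32, 32))]
sentence = {"role": "user", "content": "Compare the two images."}

# All images go into the first user turn, followed by the text.
content = [{"type": "image", "image": img} for img in images]
content.append({"type": "text", "text": sentence["content"]})
message = {"role": "user", "content": content}

# With several images per example, image_grid_thw already has shape
# (num_images, 3), so it is appended as-is; the values here are made up.
image_grid_thw = torch.tensor([[1, 34, 46], [1, 16, 20]])
results = {"image_grid_thw": []}
results["image_grid_thw"].append(image_grid_thw)
assert results["image_grid_thw"][0].shape == (2, 3)
```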