@@ -202,8 +202,6 @@ def preprocess_vlm_conversations(
202202 - pixel_values: List of pixel values for images in the examples.
203203 - image_grid_thw: List of image grid tensors.
204204 """
205- system_prompt = chat_template .system_prompt
206-
207205 # prepare result
208206 results = {
209207 "input_ids" : [],
@@ -213,16 +211,15 @@ def preprocess_vlm_conversations(
213211 "image_grid_thw" : [],
214212 }
215213
216- # Note: currently, we assume that each example has only one image
217- for i , image in enumerate (examples ["image" ]):
214+ for i , images in enumerate (examples ["images" ]):
218215 source = examples ["conversations" ][i ]
219216 messages = []
220217 # messages = [{"role": "system", "content": system_prompt}]
221218 if not source :
222219 # if the source is None or empty, skip it
223220 continue
224221
225- if not image :
222+ if not images :
226223 text_messages = []
227224 convroles = ["user" , "assistant" ]
228225 for j , sentence in enumerate (source ):
@@ -267,26 +264,17 @@ def preprocess_vlm_conversations(
267264 source = source [1 :]
268265
269266 convroles = ["user" , "assistant" ]
270- has_added_image = False
267+ has_added_images = False
271268 for j , sentence in enumerate (source ):
272269 role = sentence ["role" ]
273270 assert role == convroles [j % 2 ], f"unexpected role { role } "
274271 if role == "user" :
275- # if the message is from user and has image, process the image
276- if not has_added_image :
277- messages .append (
278- {
279- "role" : role ,
280- "content" : [
281- {
282- "type" : "image" ,
283- "image" : image ,
284- },
285- {"type" : "text" , "text" : sentence ["content" ]},
286- ],
287- }
288- )
289- has_added_image = True
272+ # Insert all images into the first user message
273+ if not has_added_images :
274+ content = [{"type" : "image" , "image" : img } for img in images ]
275+ content .append ({"type" : "text" , "text" : sentence ["content" ]})
276+ messages .append ({"role" : role , "content" : content })
277+ has_added_images = True
290278 else :
291279 messages .append ({"role" : role , "content" : sentence ["content" ]})
292280 else :
@@ -319,7 +307,7 @@ def preprocess_vlm_conversations(
319307 input_ids = encoding .input_ids [0 ]
320308 offsets = encoding .offset_mapping [0 ]
321309 pixel_values = encoding .pixel_values
322- image_grid_thw = encoding .image_grid_thw [ 0 ]
310+ image_grid_thw = encoding .image_grid_thw # shape: (num_images, 3)
323311
324312 # get conversation with image info for loss mask generation
325313 decoded_conversation = processor .tokenizer .decode (
@@ -335,7 +323,7 @@ def preprocess_vlm_conversations(
335323 results ["loss_mask" ].append (loss_mask [None , :])
336324 results ["attention_mask" ].append (torch .ones_like (loss_mask )[None , :])
337325 results ["pixel_values" ].append (pixel_values )
338- results ["image_grid_thw" ].append (image_grid_thw [ None , :] )
326+ results ["image_grid_thw" ].append (image_grid_thw )
339327 return results
340328
341329
0 commit comments