Skip to content

Commit cfc260b

Browse files
authored
feat: add process_results_use_image flag and video metadata dict support (#1275)
Two task API enhancements:
1. New `process_results_use_image` config flag: when true, image/video data is preserved in `dataset_no_image` for tasks whose `process_results` needs visual context (e.g., bounding-box verification).
2. Video metadata dict handling in `ConfigurableMessagesTask`: dict visuals that carry `video_start`/`video_end` metadata are passed through to models as the full dict, enabling per-sample temporal range support.
1 parent f54dd28 commit cfc260b

1 file changed

Lines changed: 29 additions & 16 deletions

File tree

lmms_eval/api/task.py

Lines changed: 29 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1157,21 +1157,24 @@ def concat_tar_parts(tar_parts, output_tar):
11571157
if split in [self.config.training_split, self.config.validation_split, self.config.test_split, self.config.fewshot_split]:
11581158
self.dataset[split] = self.config.process_docs(self.dataset[split])
11591159

1160-
# copy dataset, remove image features
1161-
self.dataset_no_image = self.dataset.copy()
1162-
for doc_name in self.dataset_no_image:
1163-
remove_cols = []
1164-
features = self.dataset_no_image[doc_name].features
1165-
# If it is an Image instance or a Sequence of Image instance. Remove it
1166-
for feature in features:
1167-
if isinstance(features[feature], Image):
1168-
remove_cols.append(feature)
1169-
elif isinstance(features[feature], Sequence) and isinstance(features[feature].feature, Image):
1170-
remove_cols.append(feature)
1171-
elif isinstance(features[feature], Audio):
1172-
remove_cols.append(feature)
1173-
for remove_col in remove_cols:
1174-
self.dataset_no_image[doc_name] = self.dataset_no_image[doc_name].remove_columns(remove_col)
1160+
# copy dataset, remove image features (unless process_results needs them)
1161+
if getattr(self.config, "process_results_use_image", False):
1162+
self.dataset_no_image = self.dataset
1163+
else:
1164+
self.dataset_no_image = self.dataset.copy()
1165+
for doc_name in self.dataset_no_image:
1166+
remove_cols = []
1167+
features = self.dataset_no_image[doc_name].features
1168+
# If it is an Image instance or a Sequence of Image instance. Remove it
1169+
for feature in features:
1170+
if isinstance(features[feature], Image):
1171+
remove_cols.append(feature)
1172+
elif isinstance(features[feature], Sequence) and isinstance(features[feature].feature, Image):
1173+
remove_cols.append(feature)
1174+
elif isinstance(features[feature], Audio):
1175+
remove_cols.append(feature)
1176+
for remove_col in remove_cols:
1177+
self.dataset_no_image[doc_name] = self.dataset_no_image[doc_name].remove_columns(remove_col)
11751178

11761179
def has_training_docs(self) -> bool:
11771180
if self.config.training_split is not None:
@@ -1766,7 +1769,17 @@ def auto_doc_to_messages(doc):
17661769
if isinstance(visual, PIL_Image.Image):
17671770
content.append({"type": "image", "url": visual})
17681771
elif isinstance(visual, dict):
1769-
content.append({"type": "audio", "url": visual})
1772+
# Dict visuals carry explicit type (default: video).
1773+
# Metadata keys (video_start, video_end, etc.) are
1774+
# preserved in the url field so they flow through
1775+
# ChatVideoContent → extract_media → fetch_video.
1776+
media_type = visual.get("type", "video")
1777+
has_metadata = any(k in visual for k in ("video_start", "video_end"))
1778+
if has_metadata:
1779+
media_url = visual # pass full dict as url
1780+
else:
1781+
media_url = visual.get("url") or visual.get("path") or visual
1782+
content.append({"type": media_type, "url": media_url})
17701783
elif isinstance(visual, str):
17711784
ext = os.path.splitext(visual)[1].lower()
17721785
if ext in _IMAGE_EXTS:

0 commit comments

Comments
 (0)