Skip to content

Commit a1ba778

Browse files
fix(jumpscore): align message format and video lookup (#1330)
* feat: add jump rope evaluation task * fix(mmmu): lazy-load judge server to avoid OpenAI API key error on module import The judge server was initialized at module import time, causing OpenAI API errors in CI environments where OPENAI_API_KEY is not set. Now the server is created on first use via _get_judge_server() instead. * Revert "fix(mmmu): lazy-load judge server to avoid OpenAI API key error on module import" This reverts commit 18dd0c3. * fix(jump_rope): lazy-load HF dataset snapshot to avoid import-time download snapshot_download was called at module level, causing CI to fail when loading task configs without HF credentials. Moved to _get_cache_dir() which is called on first actual use, following the same pattern as other tasks (e.g. vbvr/utils.py). * fix(mmmu): lazy-load judge server to avoid OpenAI API key error on module import The judge server was initialized at module level, causing an OpenAIError in CI environments where OPENAI_API_KEY is not set. Replaced the top-level initialization with _get_judge_server(), which creates the server on first actual use, consistent with how jump_rope/utils.py handles its HF download. * ci(task-input-ab): gracefully skip comparison when BASE snapshot fails The BASE worktree may contain pre-existing import-time errors (e.g. module-level OpenAI client init requiring OPENAI_API_KEY, or network calls at import time). These cause the BASE capture step to fail, blocking all PRs even when the PR itself introduces no regression. Changes: - Add continue-on-error: true to 'Capture BASE snapshot' step - Update 'Compare snapshots' to skip diff when base.json is absent, printing a clear warning instead of failing the workflow * refactor(jump_rope): rename task directory from jump_rope to jumpscore * Revert "fix(mmmu): lazy-load judge server to avoid OpenAI API key error on module import" This reverts commit 917a3ed. * Revert "ci(task-input-ab): gracefully skip comparison when BASE snapshot fails" This reverts commit 86f7f9a. 
* fix(jumpscore): configure video cache in yaml * fix(jumpscore): expose map metric * fix(jumpscore): align message format and video lookup * fix(jumpscore): remove snapshot cache fallback * fix(jumpscore): support zipped video cache * style: auto-fix lint (black + isort) --------- Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
1 parent 4510f3e commit a1ba778

1 file changed

Lines changed: 44 additions & 7 deletions

File tree

lmms_eval/tasks/jumpscore/utils.py

Lines changed: 44 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,51 @@
11
import json
22
import os
33
import re
4+
import zipfile
45
from collections import defaultdict
6+
from pathlib import Path
57
from typing import Any, Dict, List, Optional, Tuple
68

9+
import yaml
10+
from huggingface_hub import snapshot_download
711
from loguru import logger as eval_logger
812

13+
# Module-level guard so the download/extract path in _ensure_jumpscore_videos
# runs at most once per process, even when no videos could be fetched.
_JUMPSCORE_VIDEOS_READY = False
14+
15+
16+
def _load_dataset_path() -> str:
    """Return ``dataset_path`` from the task YAML that sits next to this module.

    Lines containing ``!function`` are filtered out before parsing because
    ``yaml.safe_load`` cannot resolve that custom tag.
    """
    yaml_file = Path(__file__).parent / "jumpscore.yaml"
    with open(yaml_file, "r") as fh:
        filtered = "".join(line for line in fh if "!function" not in line)
    config = yaml.safe_load(filtered)
    return str(config["dataset_path"])
21+
22+
23+
def _ensure_jumpscore_videos(cache_dir: str) -> None:
    """Make sure the JumpScore videos are available under *cache_dir*.

    No-op once the module-level flag is set or a non-empty ``videos``
    sub-directory already exists. Otherwise the dataset's ``*.zip`` archives
    are pulled from the Hugging Face Hub and unpacked into *cache_dir*.
    The flag is set even when no archive is found, so the (expensive)
    download is attempted at most once per process.
    """
    global _JUMPSCORE_VIDEOS_READY
    if _JUMPSCORE_VIDEOS_READY:
        return

    videos_dir = os.path.join(cache_dir, "videos")
    already_extracted = os.path.isdir(videos_dir) and bool(os.listdir(videos_dir))
    if already_extracted:
        _JUMPSCORE_VIDEOS_READY = True
        return

    repo_dir = snapshot_download(repo_id=_load_dataset_path(), repo_type="dataset", allow_patterns=["*.zip"])
    zip_paths = sorted(os.path.join(repo_dir, entry) for entry in os.listdir(repo_dir) if entry.endswith(".zip"))
    if not zip_paths:
        # Best effort: warn once and never retry within this process.
        eval_logger.warning(f"JumpScore video zip not found in {repo_dir}; expected videos under {videos_dir}.")
        _JUMPSCORE_VIDEOS_READY = True
        return

    os.makedirs(cache_dir, exist_ok=True)
    for zip_path in zip_paths:
        eval_logger.info(f"Extracting JumpScore videos from {zip_path} to {cache_dir}")
        # NOTE(review): extractall on a downloaded archive trusts the Hub repo's
        # member paths; consider validating entries if the repo is not trusted.
        with zipfile.ZipFile(zip_path) as zf:
            zf.extractall(cache_dir)

    _JUMPSCORE_VIDEOS_READY = True
48+
949

1050
def jumpscore_doc_to_visual(doc: Dict[str, Any], lmms_eval_specific_kwargs: Optional[Dict[str, Any]] = None) -> List[str]:
1151
"""Return the local video path for a JumpScore sample."""
@@ -24,6 +64,8 @@ def jumpscore_doc_to_visual(doc: Dict[str, Any], lmms_eval_specific_kwargs: Opti
2464
os.path.join(cache_dir, video_ref),
2565
os.path.join(cache_dir, "videos", video_ref),
2666
]
67+
if not any(os.path.exists(path) for path in candidates):
68+
_ensure_jumpscore_videos(cache_dir)
2769
video_path = next((path for path in candidates if os.path.exists(path)), candidates[0])
2870

2971
if not os.path.exists(video_path):
@@ -48,26 +90,21 @@ def jumpscore_doc_to_target(doc: Dict[str, Any]) -> str:
4890

4991

5092
def jumpscore_doc_to_messages(doc: Dict[str, Any], lmms_eval_specific_kwargs: Optional[Dict[str, Any]] = None) -> List[Dict[str, Any]]:
    """Build the single-turn JumpScore conversation used during evaluation.

    Returns a single user message that pairs the sample's local video with
    the timestamps question produced by ``jumpscore_doc_to_text``.
    """
    kwargs = {} if lmms_eval_specific_kwargs is None else lmms_eval_specific_kwargs

    video_path = jumpscore_doc_to_visual(doc, kwargs)[0]
    timestamps_question = jumpscore_doc_to_text(doc, kwargs)

    user_turn = {
        "role": "user",
        "content": [
            {"type": "video", "url": video_path},
            {"type": "text", "text": timestamps_question},
        ],
    }
    return [user_turn]
72109

73110

0 commit comments

Comments
 (0)