Skip to content

Commit a1ba778

Browse files
fix(jumpscore): align message format and video lookup (#1330)
* feat: add jump rope evaluation task * fix(mmmu): lazy-load judge server to avoid OpenAI API key error on module import The judge server was initialized at module import time, causing OpenAI API errors in CI environments where OPENAI_API_KEY is not set. Now the server is created on first use via _get_judge_server() instead. * Revert "fix(mmmu): lazy-load judge server to avoid OpenAI API key error on module import" This reverts commit 18dd0c3. * fix(jump_rope): lazy-load HF dataset snapshot to avoid import-time download snapshot_download was called at module level, causing CI to fail when loading task configs without HF credentials. Moved to _get_cache_dir() which is called on first actual use, following the same pattern as other tasks (e.g. vbvr/utils.py). * fix(mmmu): lazy-load judge server to avoid OpenAI API key error on module import The judge server was initialized at module level, causing an OpenAIError in CI environments where OPENAI_API_KEY is not set. Replaced the top-level initialization with _get_judge_server(), which creates the server on first actual use, consistent with how jump_rope/utils.py handles its HF download. * ci(task-input-ab): gracefully skip comparison when BASE snapshot fails The BASE worktree may contain pre-existing import-time errors (e.g. module-level OpenAI client init requiring OPENAI_API_KEY, or network calls at import time). These cause the BASE capture step to fail, blocking all PRs even when the PR itself introduces no regression. Changes: - Add continue-on-error: true to 'Capture BASE snapshot' step - Update 'Compare snapshots' to skip diff when base.json is absent, printing a clear warning instead of failing the workflow * refactor(jump_rope): rename task directory from jump_rope to jumpscore * Revert "fix(mmmu): lazy-load judge server to avoid OpenAI API key error on module import" This reverts commit 917a3ed. * Revert "ci(task-input-ab): gracefully skip comparison when BASE snapshot fails" This reverts commit 86f7f9a. 
* fix(jumpscore): configure video cache in yaml * fix(jumpscore): expose map metric * fix(jumpscore): align message format and video lookup * fix(jumpscore): remove snapshot cache fallback * fix(jumpscore): support zipped video cache * style: auto-fix lint (black + isort) --------- Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
1 parent 4510f3e commit a1ba778

1 file changed

Lines changed: 44 additions & 7 deletions

File tree

lmms_eval/tasks/jumpscore/utils.py

Lines changed: 44 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,51 @@
11
import json
22
import os
33
import re
4+
import zipfile
45
from collections import defaultdict
6+
from pathlib import Path
57
from typing import Any, Dict, List, Optional, Tuple
68

9+
import yaml
10+
from huggingface_hub import snapshot_download
711
from loguru import logger as eval_logger
812

13+
# Module-level guard so the download/extract path in _ensure_jumpscore_videos
# runs at most once per process, even when no videos could be fetched.
_JUMPSCORE_VIDEOS_READY = False
14+
15+
16+
def _load_dataset_path() -> str:
    """Return ``dataset_path`` from the task YAML that sits next to this module.

    Lines containing ``!function`` are filtered out before parsing because
    ``yaml.safe_load`` cannot resolve that custom tag.
    """
    yaml_file = Path(__file__).parent / "jumpscore.yaml"
    with open(yaml_file, "r") as fh:
        filtered = "".join(line for line in fh if "!function" not in line)
    config = yaml.safe_load(filtered)
    return str(config["dataset_path"])
21+
22+
23+
def _ensure_jumpscore_videos(cache_dir: str) -> None:
    """Make sure the JumpScore videos are available under *cache_dir*.

    No-op once the module-level flag is set or a non-empty ``videos``
    sub-directory already exists. Otherwise the dataset's ``*.zip`` archives
    are pulled from the Hugging Face Hub and unpacked into *cache_dir*.
    The flag is set even when no archive is found, so the (expensive)
    download is attempted at most once per process.
    """
    global _JUMPSCORE_VIDEOS_READY
    if _JUMPSCORE_VIDEOS_READY:
        return

    videos_dir = os.path.join(cache_dir, "videos")
    already_extracted = os.path.isdir(videos_dir) and bool(os.listdir(videos_dir))
    if already_extracted:
        _JUMPSCORE_VIDEOS_READY = True
        return

    repo_dir = snapshot_download(repo_id=_load_dataset_path(), repo_type="dataset", allow_patterns=["*.zip"])
    zip_paths = sorted(os.path.join(repo_dir, entry) for entry in os.listdir(repo_dir) if entry.endswith(".zip"))
    if not zip_paths:
        # Best effort: warn once and never retry within this process.
        eval_logger.warning(f"JumpScore video zip not found in {repo_dir}; expected videos under {videos_dir}.")
        _JUMPSCORE_VIDEOS_READY = True
        return

    os.makedirs(cache_dir, exist_ok=True)
    for zip_path in zip_paths:
        eval_logger.info(f"Extracting JumpScore videos from {zip_path} to {cache_dir}")
        # NOTE(review): extractall on a downloaded archive trusts the Hub repo's
        # member paths; consider validating entries if the repo is not trusted.
        with zipfile.ZipFile(zip_path) as zf:
            zf.extractall(cache_dir)

    _JUMPSCORE_VIDEOS_READY = True
48+
949

1050
def jumpscore_doc_to_visual(doc: Dict[str, Any], lmms_eval_specific_kwargs: Optional[Dict[str, Any]] = None) -> List[str]:
1151
"""Return the local video path for a JumpScore sample."""
@@ -24,6 +64,8 @@ def jumpscore_doc_to_visual(doc: Dict[str, Any], lmms_eval_specific_kwargs: Opti
2464
os.path.join(cache_dir, video_ref),
2565
os.path.join(cache_dir, "videos", video_ref),
2666
]
67+
if not any(os.path.exists(path) for path in candidates):
68+
_ensure_jumpscore_videos(cache_dir)
2769
video_path = next((path for path in candidates if os.path.exists(path)), candidates[0])
2870

2971
if not os.path.exists(video_path):
@@ -48,26 +90,21 @@ def jumpscore_doc_to_target(doc: Dict[str, Any]) -> str:
4890

4991

5092
def jumpscore_doc_to_messages(doc: Dict[str, Any], lmms_eval_specific_kwargs: Optional[Dict[str, Any]] = None) -> List[Dict[str, Any]]:
    """Build the single-turn JumpScore conversation used during evaluation.

    Returns a single user message that pairs the sample's local video with
    the timestamps question produced by ``jumpscore_doc_to_text``.
    """
    kwargs = {} if lmms_eval_specific_kwargs is None else lmms_eval_specific_kwargs

    video_path = jumpscore_doc_to_visual(doc, kwargs)[0]
    timestamps_question = jumpscore_doc_to_text(doc, kwargs)

    user_turn = {
        "role": "user",
        "content": [
            {"type": "video", "url": video_path},
            {"type": "text", "text": timestamps_question},
        ],
    }
    return [user_turn]
72109

73110

0 commit comments

Comments
 (0)