From d221454058a71632800c912b9550588564c0dd5a Mon Sep 17 00:00:00 2001 From: johan bjorck Date: Wed, 20 May 2026 17:42:08 -0700 Subject: [PATCH 1/2] feat: add CrossPoint-Bench task MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CrossPoint-Bench is a 1,000-sample cross-view point correspondence benchmark covering four sub-tasks at two granularity levels (object / part): - Fine-grained Grounding (161, coordinate output, in-mask hit) - Visibility Reasoning (220, binary MCQ) - Correspondence-Judgement (156, MCQ) - Correspondence-Pointing (463, coordinate output, in-mask hit) Dataset: WangYipu2002/CrossPoint-Bench on HuggingFace. The JSONL is loaded via load_dataset; image files live alongside under image/ and are fetched once via snapshot_download (cached on subsequent calls). Metric: crosspoint_accuracy — task-type-aware scoring (MCQ letter match for the two MCQ subtypes, point-in-mask hit for the coordinate subtypes). Per-type and per-level breakdowns are printed at aggregation time. The CROSSPOINT_COORD_FORMAT env var (absolute|relative_1|relative_1000) controls how coordinate outputs are interpreted before being checked against the ground-truth mask. Default: absolute. Reference: https://arxiv.org/abs/2512.04686 --- .../crosspoint_bench/crosspoint_bench.yaml | 27 ++ lmms_eval/tasks/crosspoint_bench/utils.py | 272 ++++++++++++++++++ 2 files changed, 299 insertions(+) create mode 100644 lmms_eval/tasks/crosspoint_bench/crosspoint_bench.yaml create mode 100644 lmms_eval/tasks/crosspoint_bench/utils.py diff --git a/lmms_eval/tasks/crosspoint_bench/crosspoint_bench.yaml b/lmms_eval/tasks/crosspoint_bench/crosspoint_bench.yaml new file mode 100644 index 000000000..5ee47ef5d --- /dev/null +++ b/lmms_eval/tasks/crosspoint_bench/crosspoint_bench.yaml @@ -0,0 +1,27 @@ +dataset_path: WangYipu2002/CrossPoint-Bench +task: crosspoint_bench +test_split: train +output_type: generate_until +doc_to_visual: !function utils.crosspoint_doc_to_visual +doc_to_text: !function utils.crosspoint_doc_to_text +doc_to_target: !function utils.crosspoint_doc_to_target + +generation_kwargs: + max_new_tokens: 256 + temperature: 0 + top_p: 1.0 + num_beams: 1 + do_sample: false + +process_results: !function utils.crosspoint_process_results +metric_list: + - metric: crosspoint_accuracy + aggregation: !function utils.crosspoint_aggregate_results + higher_is_better: true + +lmms_eval_specific_kwargs: + default: + pre_prompt: "" + post_prompt: "" +metadata: + version: 0.1 diff --git a/lmms_eval/tasks/crosspoint_bench/utils.py b/lmms_eval/tasks/crosspoint_bench/utils.py new file mode 100644 index 000000000..e4f9f5a67 --- /dev/null +++ b/lmms_eval/tasks/crosspoint_bench/utils.py @@ -0,0 +1,272 @@ +"""CrossPoint-Bench task for lmms-eval. + +Cross-view point correspondence benchmark covering four sub-tasks: + +- Fine-grained Grounding (coordinate output, in-mask hit) +- Visibility Reasoning (binary MCQ) +- Correspondence-Judgement (MCQ) +- Correspondence-Pointing (coordinate output, in-mask hit) + +The HF dataset stores only the JSONL; image files live alongside under +``image/`` and are fetched once with ``snapshot_download`` and cached on +disk for subsequent ``doc_to_visual`` lookups. + +Reference: https://arxiv.org/abs/2512.04686 +""" + +from __future__ import annotations + +import base64 +import io +import os +import os.path as osp +import re +from functools import lru_cache +from typing import Any, Dict, List + +import numpy as np +from huggingface_hub import snapshot_download +from PIL import Image + +REPO_ID = "WangYipu2002/CrossPoint-Bench" + +COORDINATE_TASK_TYPES = {"Fine-grained Grounding", "Correspondence-Pointing"} +MCQ_TASK_TYPES = {"Visibility Reasoning", "Correspondence-Judgement"} + +COORDINATE_PROMPT_SUFFIX = " Output the point coordinates in JSON format." + + +# --------------------------------------------------------------------------- +# Image resolution +# --------------------------------------------------------------------------- + +@lru_cache(maxsize=1) +def _image_root() -> str: + """Download (once) and return the local path to the ``image/`` tree.""" + local_dir = snapshot_download( + repo_id=REPO_ID, + repo_type="dataset", + allow_patterns=["image/**", "CrossPoint-Bench.jsonl"], + ) + return osp.join(local_dir, "image") + + +def crosspoint_doc_to_visual(doc: Dict[str, Any]) -> List[Image.Image]: + root = _image_root() + out = [] + for rel in doc["images"]: + path = osp.join(root, rel) + with Image.open(path) as im: + out.append(im.convert("RGB")) + return out + + +# --------------------------------------------------------------------------- +# Prompting +# --------------------------------------------------------------------------- + +def crosspoint_doc_to_text( + doc: Dict[str, Any], + lmms_eval_specific_kwargs: Dict[str, Any] | None = None, +) -> str: + kwargs = lmms_eval_specific_kwargs or {} + pre = kwargs.get("pre_prompt", "") + post = kwargs.get("post_prompt", "") + question = str(doc["question"]) + if doc.get("type") in COORDINATE_TASK_TYPES: + question = question + COORDINATE_PROMPT_SUFFIX + return f"{pre}{question}{post}" + + +def crosspoint_doc_to_target(doc: Dict[str, Any]) -> str: + return str(doc.get("answer", "")) + + +# --------------------------------------------------------------------------- +# Answer extraction +# --------------------------------------------------------------------------- + +def _extract_coordinates_from_json(text: str): + patterns = [ + r'\{[^{}]*"(?:point_2d|point|coordinates?)"\s*:\s*\[([0-9.]+)\s*,\s*([0-9.]+)\]', + r'\{[^{}]*"x"\s*:\s*([0-9.]+)[^{}]*"y"\s*:\s*([0-9.]+)', + r'\{[^{}]*"y"\s*:\s*([0-9.]+)[^{}]*"x"\s*:\s*([0-9.]+)', + ] + for i, pat in enumerate(patterns): + m = re.search(pat, text, re.DOTALL) + if m: + if i == 2: + return float(m.group(2)), float(m.group(1)) + return float(m.group(1)), float(m.group(2)) + return None + + +def _extract_coordinates_from_text(text: str): + patterns = [ + (r']*>', False), + (r'(?:coordinates?|position|location).*?(?:are|is)\s+([0-9.]+),\s*([0-9.]+)', False), + (r'[xX]\s*:\s*([0-9.]+).*?[yY]\s*:\s*([0-9.]+)', False), + (r'[yY]\s*:\s*([0-9.]+).*?[xX]\s*:\s*([0-9.]+)', True), + (r'["\']?x["\']?\s*:\s*([0-9.]+).*?["\']?y["\']?\s*:\s*([0-9.]+)', False), + (r'\[([0-9.]+),\s*([0-9.]+)\]', False), + (r'\(([0-9.]+),\s*([0-9.]+)\)', False), + (r'([0-9.]+),\s*([0-9.]+)', False), + ] + for pat, swapped in patterns: + m = re.search(pat, text, re.DOTALL) + if m: + x, y = float(m.group(1)), float(m.group(2)) + if swapped: + x, y = y, x + return x, y + return None + + +def _extract_coordinates(text: str): + return _extract_coordinates_from_json(text) or _extract_coordinates_from_text(text) + + +def _extract_mcq_letter(text: str): + s = text.strip() + if "" in s: + s = s.split("", 1)[1].strip() + + patterns = [ + r'\\boxed\{(?:\\text\{)?([ABCD])(?:\..*?)?\}', + r'\((?:Choice\s+)?([ABCD])\)', + r'\*\*Answer:\s*([ABCD])\*\*', + r'\*\*([ABCD])\.\s*(?:Yes|No)', + r'(?:answer is|correct answer is|choose)\s*[:\s]*\*?\*?\(?([ABCD])\)?', + r'\*\*([ABCD])\*\*', + r'(?:^|\s|Answer:\s*)\(?([ABCD])\)?\s*[\.:,]', + r'^\s*\(?([ABCD])\)?\s*$', + ] + for pat in patterns: + m = re.search(pat, s, re.IGNORECASE | re.MULTILINE) + if m: + return m.group(1).upper() + + final_patterns = [ + r'(?:Final Answer|Therefore|So|Hence|Thus|The correct answer is|The answer is)' + r'.*?(?:\\boxed\{([ABCD])\}|\*\*([ABCD])\*\*|\(?([ABCD])\)?)', + r'Answer:\s*\(?([ABCD])\)?', + ] + for pat in final_patterns: + m = re.search(pat, s, re.IGNORECASE) + if m: + for g in m.groups(): + if g: + return g.upper() + + letters = re.findall(r'\b([ABCD])\b', s) + if len(letters) == 1: + return letters[0].upper() + return None + + +# --------------------------------------------------------------------------- +# Scoring +# --------------------------------------------------------------------------- + +def _decode_base64_mask(b64: str): + try: + return np.array(Image.open(io.BytesIO(base64.b64decode(b64))).convert("L")) + except Exception: + return None + + +def _point_in_mask(x: int, y: int, mask: np.ndarray, threshold: int = 128) -> bool: + h, w = mask.shape[:2] + if not (0 <= x < w and 0 <= y < h): + return False + return bool(mask[y, x] > threshold) + + +def _image_size(path: str): + try: + with Image.open(path) as im: + return im.size # (w, h) + except Exception: + return None + + +def crosspoint_process_results(doc: Dict[str, Any], results: List[str]) -> Dict[str, Any]: + pred = results[0] if results else "" + task_type = str(doc.get("type", "")) + level = str(doc.get("level", "")) + answer = str(doc.get("answer", "")) + hit = 0 + + if task_type in COORDINATE_TASK_TYPES: + coords = _extract_coordinates(pred) + if coords is not None: + x_raw, y_raw = coords + # Resolve coordinate format. Heuristic + env override (matches the + # vlmevalkit version): default to absolute pixel coordinates. + coord_format = os.environ.get("CROSSPOINT_COORD_FORMAT", "absolute") + img_paths = list(doc.get("images") or []) + dims = None + if img_paths: + root = _image_root() + dims = _image_size(osp.join(root, img_paths[0])) + if dims is not None: + w, h = dims + if coord_format == "relative_1": + x_abs = int(x_raw * w) + y_abs = int(y_raw * h) + elif coord_format == "relative_1000": + x_abs = int((x_raw / 1000.0) * w) + y_abs = int((y_raw / 1000.0) * h) + else: + x_abs, y_abs = int(x_raw), int(y_raw) + else: + x_abs, y_abs = int(x_raw), int(y_raw) + mask = _decode_base64_mask(answer) + if mask is not None: + hit = int(_point_in_mask(x_abs, y_abs, mask)) + elif task_type in MCQ_TASK_TYPES: + letter = _extract_mcq_letter(pred) + if letter is not None: + hit = int(letter == answer.strip().upper()) + + return { + "crosspoint_accuracy": { + "hit": hit, + "type": task_type, + "level": level, + } + } + + +def crosspoint_aggregate_results(results: List[Dict[str, Any]]) -> float: + if not results: + return 0.0 + + # Overall + overall = sum(r["hit"] for r in results) / len(results) * 100 + + # By type + from collections import defaultdict + by_type = defaultdict(list) + by_level = defaultdict(list) + by_type_level = defaultdict(list) + for r in results: + by_type[r["type"]].append(r["hit"]) + by_level[r["level"]].append(r["hit"]) + by_type_level[(r["type"], r["level"])].append(r["hit"]) + + lines = ["", "=" * 70, "CrossPoint-Bench Evaluation Results", "=" * 70, + f" {'Overall':<45} {overall:5.1f}% ({sum(r['hit'] for r in results)}/{len(results)})"] + for k in sorted(by_type): + hits = by_type[k] + lines.append(f" type/{k:<40} {sum(hits)/len(hits)*100:5.1f}% ({sum(hits)}/{len(hits)})") + for k in sorted(by_level): + hits = by_level[k] + lines.append(f" level/{k:<39} {sum(hits)/len(hits)*100:5.1f}% ({sum(hits)}/{len(hits)})") + for (t, lv) in sorted(by_type_level): + hits = by_type_level[(t, lv)] + lines.append(f" {t}/{lv:<45} {sum(hits)/len(hits)*100:5.1f}% ({sum(hits)}/{len(hits)})") + lines.append("=" * 70) + print("\n".join(lines)) + + return overall / 100.0 From 86be087b114de8821fb8194520bca7cf2c120fd9 Mon Sep 17 00:00:00 2001 From: johan bjorck Date: Fri, 22 May 2026 10:44:11 -0700 Subject: [PATCH 2/2] crosspoint_bench: expose per-type/per-level metrics in metric_list Address reviewer feedback on #1349: previously the aggregator just print()ed the per-type and per-level breakdowns to stdout. Now each breakdown is its own entry in metric_list, so results.json carries: - crosspoint_accuracy (overall) - fine_grained_grounding (type) - visibility_reasoning (type) - correspondence_judgement (type) - correspondence_pointing (type) - level_object (level) - level_part (level) process_results returns a single {hit, type, level} item, replicated under every metric key, so each metric's aggregator filters its slice (by type or level) and computes its own mean. --- .../crosspoint_bench/crosspoint_bench.yaml | 20 ++- lmms_eval/tasks/crosspoint_bench/utils.py | 157 ++++++++++-------- 2 files changed, 107 insertions(+), 70 deletions(-) diff --git a/lmms_eval/tasks/crosspoint_bench/crosspoint_bench.yaml b/lmms_eval/tasks/crosspoint_bench/crosspoint_bench.yaml index 5ee47ef5d..08d332dc6 100644 --- a/lmms_eval/tasks/crosspoint_bench/crosspoint_bench.yaml +++ b/lmms_eval/tasks/crosspoint_bench/crosspoint_bench.yaml @@ -16,7 +16,25 @@ generation_kwargs: process_results: !function utils.crosspoint_process_results metric_list: - metric: crosspoint_accuracy - aggregation: !function utils.crosspoint_aggregate_results + aggregation: !function utils.crosspoint_aggregate_overall + higher_is_better: true + - metric: fine_grained_grounding + aggregation: !function utils.crosspoint_aggregate_fine_grained_grounding + higher_is_better: true + - metric: visibility_reasoning + aggregation: !function utils.crosspoint_aggregate_visibility_reasoning + higher_is_better: true + - metric: correspondence_judgement + aggregation: !function utils.crosspoint_aggregate_correspondence_judgement + higher_is_better: true + - metric: correspondence_pointing + aggregation: !function utils.crosspoint_aggregate_correspondence_pointing + higher_is_better: true + - metric: level_object + aggregation: !function utils.crosspoint_aggregate_level_object + higher_is_better: true + - metric: level_part + aggregation: !function utils.crosspoint_aggregate_level_part higher_is_better: true lmms_eval_specific_kwargs: diff --git a/lmms_eval/tasks/crosspoint_bench/utils.py b/lmms_eval/tasks/crosspoint_bench/utils.py index e4f9f5a67..c869b6c2a 100644 --- a/lmms_eval/tasks/crosspoint_bench/utils.py +++ b/lmms_eval/tasks/crosspoint_bench/utils.py @@ -190,83 +190,102 @@ def _image_size(path: str): return None -def crosspoint_process_results(doc: Dict[str, Any], results: List[str]) -> Dict[str, Any]: - pred = results[0] if results else "" +def _score(pred: str, doc: Dict[str, Any]) -> int: task_type = str(doc.get("type", "")) - level = str(doc.get("level", "")) answer = str(doc.get("answer", "")) - hit = 0 if task_type in COORDINATE_TASK_TYPES: coords = _extract_coordinates(pred) - if coords is not None: - x_raw, y_raw = coords - # Resolve coordinate format. Heuristic + env override (matches the - # vlmevalkit version): default to absolute pixel coordinates. - coord_format = os.environ.get("CROSSPOINT_COORD_FORMAT", "absolute") - img_paths = list(doc.get("images") or []) - dims = None - if img_paths: - root = _image_root() - dims = _image_size(osp.join(root, img_paths[0])) - if dims is not None: - w, h = dims - if coord_format == "relative_1": - x_abs = int(x_raw * w) - y_abs = int(y_raw * h) - elif coord_format == "relative_1000": - x_abs = int((x_raw / 1000.0) * w) - y_abs = int((y_raw / 1000.0) * h) - else: - x_abs, y_abs = int(x_raw), int(y_raw) + if coords is None: + return 0 + x_raw, y_raw = coords + coord_format = os.environ.get("CROSSPOINT_COORD_FORMAT", "absolute") + img_paths = list(doc.get("images") or []) + dims = None + if img_paths: + dims = _image_size(osp.join(_image_root(), img_paths[0])) + if dims is not None: + w, h = dims + if coord_format == "relative_1": + x_abs, y_abs = int(x_raw * w), int(y_raw * h) + elif coord_format == "relative_1000": + x_abs, y_abs = int((x_raw / 1000.0) * w), int((y_raw / 1000.0) * h) else: x_abs, y_abs = int(x_raw), int(y_raw) - mask = _decode_base64_mask(answer) - if mask is not None: - hit = int(_point_in_mask(x_abs, y_abs, mask)) - elif task_type in MCQ_TASK_TYPES: + else: + x_abs, y_abs = int(x_raw), int(y_raw) + mask = _decode_base64_mask(answer) + if mask is None: + return 0 + return int(_point_in_mask(x_abs, y_abs, mask)) + + if task_type in MCQ_TASK_TYPES: letter = _extract_mcq_letter(pred) - if letter is not None: - hit = int(letter == answer.strip().upper()) - - return { - "crosspoint_accuracy": { - "hit": hit, - "type": task_type, - "level": level, - } + if letter is None: + return 0 + return int(letter == answer.strip().upper()) + + return 0 + + +# All per-doc metric values share the same {hit, type, level} structure so any +# aggregator can filter on type or level. +_METRIC_KEYS = ( + "crosspoint_accuracy", + "fine_grained_grounding", + "visibility_reasoning", + "correspondence_judgement", + "correspondence_pointing", + "level_object", + "level_part", +) + + +def crosspoint_process_results(doc: Dict[str, Any], results: List[str]) -> Dict[str, Any]: + pred = results[0] if results else "" + item = { + "hit": _score(pred, doc), + "type": str(doc.get("type", "")), + "level": str(doc.get("level", "")), } + return {k: item for k in _METRIC_KEYS} + + +def _mean(xs): + return sum(xs) / len(xs) if xs else 0.0 + + +def crosspoint_aggregate_overall(results: List[Dict[str, Any]]) -> float: + return _mean([r["hit"] for r in results]) + + +def _aggregate_by_type(results, target_type): + return _mean([r["hit"] for r in results if r["type"] == target_type]) + + +def _aggregate_by_level(results, target_level): + return _mean([r["hit"] for r in results if r["level"] == target_level]) + + +def crosspoint_aggregate_fine_grained_grounding(results): + return _aggregate_by_type(results, "Fine-grained Grounding") + + +def crosspoint_aggregate_visibility_reasoning(results): + return _aggregate_by_type(results, "Visibility Reasoning") + + +def crosspoint_aggregate_correspondence_judgement(results): + return _aggregate_by_type(results, "Correspondence-Judgement") + + +def crosspoint_aggregate_correspondence_pointing(results): + return _aggregate_by_type(results, "Correspondence-Pointing") + + +def crosspoint_aggregate_level_object(results): + return _aggregate_by_level(results, "object") -def crosspoint_aggregate_results(results: List[Dict[str, Any]]) -> float: - if not results: - return 0.0 - - # Overall - overall = sum(r["hit"] for r in results) / len(results) * 100 - - # By type - from collections import defaultdict - by_type = defaultdict(list) - by_level = defaultdict(list) - by_type_level = defaultdict(list) - for r in results: - by_type[r["type"]].append(r["hit"]) - by_level[r["level"]].append(r["hit"]) - by_type_level[(r["type"], r["level"])].append(r["hit"]) - - lines = ["", "=" * 70, "CrossPoint-Bench Evaluation Results", "=" * 70, - f" {'Overall':<45} {overall:5.1f}% ({sum(r['hit'] for r in results)}/{len(results)})"] - for k in sorted(by_type): - hits = by_type[k] - lines.append(f" type/{k:<40} {sum(hits)/len(hits)*100:5.1f}% ({sum(hits)}/{len(hits)})") - for k in sorted(by_level): - hits = by_level[k] - lines.append(f" level/{k:<39} {sum(hits)/len(hits)*100:5.1f}% ({sum(hits)}/{len(hits)})") - for (t, lv) in sorted(by_type_level): - hits = by_type_level[(t, lv)] - lines.append(f" {t}/{lv:<45} {sum(hits)/len(hits)*100:5.1f}% ({sum(hits)}/{len(hits)})") - lines.append("=" * 70) - print("\n".join(lines)) - - return overall / 100.0 +def crosspoint_aggregate_level_part(results): + return _aggregate_by_level(results, "part")