Skip to content

Commit df477b1

Browse files
authored
add scibench(math) task (#834)
* add scibench math task * add medqa task * run pre commit
1 parent 036637e commit df477b1

5 files changed

Lines changed: 324 additions & 0 deletions

File tree

lmms_eval/tasks/medqa/medqa.yaml

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
# MEDQA multiple-choice task configuration.
dataset_path: lmms-lab/MEDQA
dataset_kwargs:
  token: True

task: "medqa"
test_split: test
doc_to_target: !function utils.medqa_doc_to_target
# Text-only task: no visual input is passed to the model.
doc_to_visual: null
doc_to_text: !function utils.medqa_doc_to_text
doc_to_choice: !function utils.medqa_doc_to_choice

# pre_prompt / post_prompt are wrapped around the question + options block
# by utils.medqa_doc_to_text.
lmms_eval_specific_kwargs:
  default:
    pre_prompt: ""
    post_prompt: "\nAnswer with the option's letter from the given choices directly: "
metric_list:
  - metric: accuracy
    aggregation: mean
    higher_is_better: true

# Returns {"accuracy": 0.0 | 1.0} per sample.
process_results: !function utils.medqa_process_results

metadata:
  version: 0.0

lmms_eval/tasks/medqa/utils.py

Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,122 @@
1+
import random
2+
from typing import Any, Dict, List
3+
4+
import numpy as np
5+
6+
7+
def medqa_doc_to_text(doc: Dict[str, Any], lmms_eval_specific_kwargs: Dict[str, Any]):
    """Build the multiple-choice prompt for one MEDQA sample.

    Args:
        doc: Sample with a "question" string and an "options" field. Options
            may be a dict keyed by letter ({"A": "...", ...}), a list of
            option strings, or any other value (rendered with ``str``).
        lmms_eval_specific_kwargs: Must contain "pre_prompt" and
            "post_prompt"; they are placed before and after the prompt.

    Returns:
        ``pre_prompt + question + "\\n" + options block + post_prompt``.

    Raises:
        KeyError: If "pre_prompt" or "post_prompt" is missing.
    """
    letters = ["A", "B", "C", "D", "E"]
    # ``or ""`` also covers a present-but-None question.
    question = str(doc.get("question") or "").strip()

    options = doc.get("options")
    if isinstance(options, dict):
        # Only A-E are rendered, always in letter order.
        options_block = "\n".join(f"{key}. {str(options[key]).strip()}" for key in letters if key in options)
    elif isinstance(options, list):
        # zip() stops after five entries, matching the cap applied by
        # medqa_doc_to_choice, instead of raising IndexError on a sixth option.
        options_block = "\n".join(f"{letter}. {str(opt).strip()}" for letter, opt in zip(letters, options))
    else:
        # Fallback: render whatever is there, or nothing when absent.
        options_block = str(options) if options is not None else ""

    pre_prompt = lmms_eval_specific_kwargs["pre_prompt"]
    post_prompt = lmms_eval_specific_kwargs["post_prompt"]
    return f"{pre_prompt}{question}\n{options_block}{post_prompt}"
36+
37+
38+
def medqa_doc_to_target(doc: Dict[str, Any]):
    """Return the ground-truth answer letter for a MEDQA sample.

    Lookup order:
      1. "answer_idx" when it is a single character after trimming
         (upper-cased).
      2. "answer" when it is a single letter A-E after trimming (upper-cased).
      3. "answer" matched as text against the values of the "options" dict;
         the matching key is returned.
      4. The placeholder "A".

    Args:
        doc: Dataset sample.

    Returns:
        The answer letter, or "A" when no answer could be determined.
    """
    # Trim before measuring the length so values like "B " or "\nC" are
    # accepted and a whitespace-only field is not returned as "".
    answer_idx = doc.get("answer_idx")
    if isinstance(answer_idx, str):
        letter = answer_idx.strip().upper()
        if len(letter) == 1:
            return letter

    # Some variants store the letter in "answer" directly.
    ans = doc.get("answer")
    if isinstance(ans, str):
        letter = ans.strip().upper()
        if letter in ("A", "B", "C", "D", "E"):
            return letter

    # Otherwise treat "answer" as option text and map it back to its key.
    options = doc.get("options")
    if isinstance(options, dict) and isinstance(ans, str):
        wanted = ans.strip()
        for key, value in options.items():
            if isinstance(value, str) and value.strip() == wanted:
                return key

    # Placeholder for samples without a usable answer. NOTE: a prediction of
    # "A" will be scored as correct for such samples.
    return "A"
64+
65+
66+
def medqa_doc_to_choice(doc: Dict[str, Any]) -> List[str]:
    """List the answer letters available for a sample.

    A dict of options yields the letters A-E that appear as keys. A list of
    options yields the first ``len(options)`` letters, at most five. Anything
    else (including a dict with none of A-E) yields all five letters.
    """
    letters = ["A", "B", "C", "D", "E"]
    options = doc.get("options")

    if isinstance(options, dict):
        matched = [letter for letter in letters if letter in options]
        if matched:
            return matched
    elif isinstance(options, list):
        count = min(len(options), 5)
        return letters[:count]

    # Default to 5-way when the layout is unknown.
    return letters
77+
78+
79+
def medqa_process_results(doc: Dict[str, Any], result: List[str]):
    """Score one model response against the gold answer letter.

    Args:
        doc: Dataset sample.
        result: Model outputs; only the first entry is used.

    Returns:
        ``{"accuracy": 1.0}`` when the parsed letter equals the gold letter,
        otherwise ``{"accuracy": 0.0}``.
    """
    answer_text = result[0].strip()
    choices = medqa_doc_to_choice(doc)
    predicted = _parse_multi_choice_response(answer_text, choices)
    gold = medqa_doc_to_target(doc)
    if predicted == gold:
        return {"accuracy": 1.0}
    return {"accuracy": 0.0}
90+
91+
92+
def _parse_multi_choice_response(response: str, all_choices: List[str]) -> str:
93+
# Clean punctuation around the response
94+
for ch in [",", ".", "!", "?", ";", ":", "'"]:
95+
response = response.strip(ch)
96+
response = " " + response + " "
97+
98+
candidates = []
99+
# (A) style
100+
for c in all_choices:
101+
if f"({c})" in response:
102+
candidates.append(c)
103+
104+
# plain letter surrounded by spaces
105+
if len(candidates) == 0:
106+
for c in all_choices:
107+
if f" {c} " in response:
108+
candidates.append(c)
109+
110+
# A., B., etc.
111+
if len(candidates) == 0:
112+
for c in all_choices:
113+
if f"{c}." in response:
114+
candidates.append(c)
115+
116+
if len(candidates) == 0:
117+
return random.choice(all_choices)
118+
if len(candidates) > 1:
119+
# choose the last occurrence to mitigate explanations mentioning multiple letters
120+
start_indexes = [response.rfind(f" {can} ") for can in candidates]
121+
return candidates[int(np.argmax(start_indexes))]
122+
return candidates[0]
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
# SciBench zero-shot task configuration.
dataset_path: lmms-lab/SciBench
dataset_kwargs:
  token: True
# The evaluation set is read from the `train` split.
test_split: train
task: "scibench"

doc_to_text: !function utils.scibench_doc_to_text
doc_to_target: "answer_number"

# pre_prompt / post_prompt are placed before / after the problem text by
# utils.scibench_doc_to_text. The scorer looks for a numeric \boxed{...}
# value, which is why the instruction asks for that format.
lmms_eval_specific_kwargs:
  default:
    pre_prompt: |
      Please provide a clear and step-by-step solution for a scientific problem in the categories of Chemistry, Physics, or Mathematics. The problem will specify the unit of measurement, which should not be included in the answer. Express the final answer as a decimal number with three digits after the decimal point. Conclude the answer by stating "The answer is therefore \boxed{[ANSWER]}."
    post_prompt: "\nLet's think step by step."

metric_list:
  - metric: accuracy
    aggregation: mean
    higher_is_better: true

# Returns {"accuracy": 0 | 1} per sample (5% relative tolerance).
process_results: !function utils.scibench_process_results

metadata:
  version: 0.0
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
# SciBench few-shot task configuration. Uses the same scorer as the
# zero-shot `scibench` task; only the prompt builder differs.
#
# NOTE(review): this file previously pointed at lmms-lab/SuperGPQA (split
# `test`). utils.scibench_multishot_doc_to_text reads `problem_text` and
# `unit`, and scoring reads `answer_number` -- the fields used by the
# `scibench` task on lmms-lab/SciBench (split `train`). Confirm against the
# hub dataset before merging.
dataset_path: lmms-lab/SciBench
dataset_kwargs:
  token: True
test_split: train
task: "scibench_multishot"

doc_to_text: !function utils.scibench_multishot_doc_to_text
doc_to_target: "answer_number"

lmms_eval_specific_kwargs:
  default:
    pre_prompt: ""
    post_prompt: ""

metric_list:
  - metric: accuracy
    aggregation: mean
    higher_is_better: true

process_results: !function utils.scibench_process_results

metadata:
  version: 0.0

lmms_eval/tasks/scibench/utils.py

Lines changed: 131 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,131 @@
1+
import re
2+
from math import isclose
3+
from typing import Dict, List, Tuple
4+
5+
# Three worked examples prepended to every few-shot SciBench prompt.
# This must be a raw string: the text is full of LaTeX commands, and in a
# normal literal "\b" (in "\boxed") becomes a backspace and "\f" (in "\frac")
# a form feed, silently corrupting the prompt. The remaining commands
# ("\mathrm", "\circ", "\(", ...) are invalid escape sequences that newer
# Python versions warn about.
FEWSHOT_PROMPT = r"""Problem:
Suppose that $10.0 \mathrm{~mol} \mathrm{C}_2 \mathrm{H}_6(\mathrm{~g})$ is confined to $4.860 \mathrm{dm}^3$ at $27^{\circ} \mathrm{C}$. Predict the pressure exerted by the ethane from the perfect gas.

Solution:
To predict ethane's pressure using the ideal gas law (PV = nRT), I'll convert temperature from 27°C to Kelvin (300.15 K, rounded to 300 K for significant figures), then substitute the given values: 10.0 mol of ethane, volume of 4.860 L, gas constant R = 0.0821 L·atm/(mol·K), and T = 300 K. Rearranging the ideal gas equation to P = nRT/V and calculating: P = (10.0 mol × 0.0821 L·atm/(mol·K) × 300 K) ÷ 4.860 L = 246.3 L·atm ÷ 4.860 L ≈ 50.7 atm. Since temperature has two significant figures, the final pressure is \(\boxed{50.7}\) atm.
Final Answer: The final answer is \(\boxed{50.7}\). I hope it is correct.

Problem:
Assume that all gases are perfect and that data refer to 298.15 K unless otherwise stated. Calculate the change in chemical potential of a perfect gas when its pressure is increased isothermally from $1.8 \mathrm{~atm}$ to $29.5 \mathrm{~atm}$ at $40^{\circ} \mathrm{C}$.

Solution:
To determine the change in chemical potential (Δμ) of a perfect gas during pressure change, I begin with the fundamental relation μ = μ° + RT ln(P/P°), which yields Δμ = RT ln(Pf/Pi) for changes between two states. Converting the given temperature of 40°C to 313.15K and using R = 8.314 J/(mol·K), I calculate Δμ = (8.314 J/(mol·K))(313.15K)ln(29.5/1.8). The pressure ratio 29.5/1.8 ≈ 16.39 gives ln(16.39) ≈ 2.797, so Δμ = 8.314 × 313.15 × 2.797 ≈ 7274.5 J/mol, which rounds to 7.3 kJ/mol.
Final Answer: The final answer is \(\boxed{7.3}\). I hope it is correct.

Problem:
Show that the small angular deviation of $\epsilon$ of a plumb line from the true vertical (i.e., toward the center of Earth) at a point on Earth's surface at a latitude $\lambda$ is $\epsilon = \frac{R\omega^2sin\lambda cos\lambda}{g_0 - R\omega^2 cos^2\lambda}$ where R is the radius of Earth. What is the value (in seconds of arc) of the maximum deviation? Note that the entire denominator in the answer is actually the effective $g$, and $g_0$ denotes the pure gravitational component.

Solution:
To determine the small angular deviation (ε) of a plumb line from true vertical due to Earth's rotation, we analyze the balance of forces at latitude λ: gravitational force (Fg = mg0) toward Earth's center and centrifugal force (Fc = mRω²cosλ) perpendicular to the rotation axis. The centrifugal force resolves into a vertical component (Fc,v = mRω²cos²λ) that reduces effective gravity to g = g0 - Rω²cos²λ, and a horizontal component (Fc,h = mRω²sinλcosλ) pulling toward the equator. The angular deviation equals the ratio of horizontal force to effective gravity: ε = Rω²sinλcosλ/(g0 - Rω²cos²λ). To find maximum deviation, we differentiate with respect to λ and find it occurs at λ = 45°. Using Earth values (R = 6.371×10⁶ m, ω = 7.292×10⁻⁵ rad/s, g0 ≈ 9.81 m/s²), we calculate the numerator at 45° as 1.697×10⁻² m/s² and denominator as 9.793 m/s², yielding εmax = 1.733×10⁻³ rad or approximately 357 arcseconds (6 arcminutes).
Final Answer: The final answer is \(\boxed{6}\). I hope it is correct."""
25+
26+
27+
def scibench_doc_to_text(doc: Dict, lmms_eval_specific_kwargs: Dict) -> str:
    """Build the zero-shot SciBench prompt.

    The problem text is followed by a sentence naming the answer unit when
    the sample's "unit" field is not blank, and the result is wrapped in the
    configured "pre_prompt" and "post_prompt".
    """
    prefix = lmms_eval_specific_kwargs["pre_prompt"]
    suffix = lmms_eval_specific_kwargs["post_prompt"]
    body = doc["problem_text"]
    unit = doc["unit"]
    if unit.strip():
        # The unit is inserted as stored, without trimming.
        body += " The unit of the answer is " + unit + "."
    return f"{prefix}{body}{suffix}"
34+
35+
36+
def extract_boxed_answers(text):
    """Return the first numeric value written as ``boxed{...}`` in *text*.

    Boxed contents that are not a plain number are skipped. Before the
    numeric check, thousands separators ("1,234") are removed and a Unicode
    minus sign is replaced by "-"; trailing-dot decimals ("5.") and exponent
    notation ("1.2e-3") are accepted as well.

    Args:
        text: Model output to scan.

    Returns:
        The normalised number as a string that ``float()`` can parse, or
        ``None`` when no boxed content is numeric.
    """
    for raw in re.findall(r"boxed\{([^}]*)\}", text):
        candidate = raw.strip().replace("−", "-")
        # Only drop commas that group digits in threes, so a boxed list such
        # as "1,2,3" is not mistaken for the number 123.
        candidate = re.sub(r"(?<=\d),(?=\d{3}(?!\d))", "", candidate)
        if re.fullmatch(r"[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?", candidate):
            return candidate
    return None
46+
47+
48+
def remove_not(x):
    """Return the text that follows the last ``10^n`` factor in *x*.

    The factor may be wrapped in "$", use braces around the exponent and be
    surrounded by spaces, e.g. "$10^{-3}$" or " 10^5 ".

    Args:
        x: String to search.

    Returns:
        The remainder after the last power-of-ten factor (possibly ""), or
        ``None`` when *x* contains no such factor.
    """
    # Raw string: the pattern contains "\$", "\ " and "\^", which are invalid
    # escape sequences in a normal string literal.
    power_of_ten = re.compile(r"[\$]?\ *10\^[{]?\ *-?[0-9]+\ *[}]?\ *[\$]?")
    # The pattern cannot match the empty string, so more than one piece
    # means at least one factor was found.
    pieces = power_of_ten.split(x)
    return pieces[-1] if len(pieces) > 1 else None
54+
55+
56+
def cal_not(inputs):
    """Evaluate a ``(mantissa, "10^n")`` pair as a decimal string.

    Args:
        inputs: Two-item iterable: the mantissa as a string and a string
            containing a power of ten such as "10^{-3}" or "10^5".

    Returns:
        ``str(mantissa * 10 ** n)`` on success. When *inputs* cannot be
        unpacked or parsed, "error" is printed and *inputs* is returned
        unchanged.
    """
    try:
        mantissa, power_text = list(inputs)
        # Capture the (optionally negative) exponent directly; braces and
        # spaces around it are optional.
        exponent = re.search(r"10\^[{]?\ *(-?[0-9]+)\ *[}]?", power_text).group(1)
        return str(float(mantissa.strip()) * 10 ** float(exponent))
    except Exception:
        # Best-effort helper: hand malformed input back to the caller.
        # ``Exception`` rather than a bare except so KeyboardInterrupt and
        # SystemExit still propagate.
        print("error")
        return inputs
73+
74+
75+
def parse_not(inputs):
    """Split scientific notation into its mantissa and power-of-ten parts.

    The first separator found, in this order, is used: a literal backslash
    followed by "times", a TAB followed by "imes", then "*".

    Args:
        inputs: String such as "1.5 \\times 10^3" or "2*10^2".

    Returns:
        ``(mantissa, power)`` when exactly one separator occurrence splits
        the string in two; ``("", "")`` for empty input or when the split
        does not give exactly two parts; *inputs* itself (not a tuple) when
        no separator is present.
    """
    if not inputs:
        return "", ""
    # NOTE: the second entry is deliberately NOT a raw string -- "\t" is a TAB
    # there, which is what a LaTeX "\times" turns into after passing through
    # an unescaped string literal.
    separators = ("\\times", "\times", "*")
    try:
        for separator in separators:
            if separator in inputs:
                mantissa, power = inputs.split(separator)
                return mantissa, power
    except (ValueError, TypeError, AttributeError):
        # ValueError: the separator occurs more than once.
        # TypeError / AttributeError: *inputs* is not a string.
        return "", ""
    return inputs
90+
91+
92+
def equiv_with_unit(model_output, answer, unit):
    """Check whether a model answer matches the reference within 5%.

    Commas are removed from *model_output* and its first
    whitespace-separated token is compared with *answer*, so a trailing unit
    ("50.7 atm") does not prevent a match.

    Args:
        model_output: Answer text produced by the model.
        answer: Reference value as a string.
        unit: Unused; kept for signature compatibility.

    Returns:
        ``True`` when both values parse as floats and agree within a
        relative tolerance of 0.05, otherwise ``False``.
    """
    tokens = model_output.replace(",", "").split()
    try:
        expected = float(answer.strip())
    except (AttributeError, ValueError):
        # Without a parsable reference nothing can match.
        return False
    if not tokens:
        return False
    try:
        # A string that parses as a float as a whole has no inner
        # whitespace, so checking the first token covers that case too.
        predicted = float(tokens[0])
    except ValueError:
        return False
    return isclose(predicted, expected, rel_tol=0.05)
108+
109+
110+
def clean_number_string(s):
    """Normalise a number string: drop commas, map "−" to "-", trim."""
    replacements = {ord(","): None, ord("−"): "-"}
    return s.translate(replacements).strip()
112+
113+
114+
def scibench_process_results(doc: Dict, result: List[str]) -> Dict[str, float]:
    """Score one SciBench response.

    The first numeric boxed value in ``result[0]`` is compared with the
    sample's "answer_number" using a relative tolerance of 0.05.

    Returns:
        ``{"accuracy": 1}`` on a match; ``{"accuracy": 0}`` on a mismatch or
        when no numeric boxed value is found.
    """
    boxed = extract_boxed_answers(result[0])
    if not boxed:
        return {"accuracy": 0}

    predicted = float(clean_number_string(boxed))
    expected = float(clean_number_string(doc["answer_number"]))
    matches = isclose(predicted, expected, rel_tol=0.05)
    return {"accuracy": 1 if matches else 0}
123+
124+
125+
def scibench_multishot_doc_to_text(doc: Dict, lmms_eval_specific_kwargs: Dict) -> str:
    """Build the few-shot SciBench prompt.

    The worked examples in ``FEWSHOT_PROMPT`` are followed by the problem
    text, a sentence naming the answer unit when the "unit" field is not
    blank, and the cue "Answer: Let's think step by step.".

    The configured "pre_prompt" and "post_prompt" are now applied around the
    whole prompt instead of being read and discarded; with empty strings for
    both the output is unchanged.

    Raises:
        KeyError: If "pre_prompt" or "post_prompt" is missing.
    """
    pre_prompt = lmms_eval_specific_kwargs["pre_prompt"]
    post_prompt = lmms_eval_specific_kwargs["post_prompt"]
    question = doc["problem_text"]
    if doc["unit"].strip():
        question = question + " The unit of the answer is " + doc["unit"] + "."
    return f"{pre_prompt}{FEWSHOT_PROMPT}\n{question}\nAnswer: Let's think step by step.{post_prompt}"

0 commit comments

Comments
 (0)