Skip to content

Commit 2e5db2b

Browse files
authored
[Evals] 1/N Add more task-related tests (#110)
# What does this PR do? Adds more task-specific tests. The tests are pretty lightweight and basic. The idea is to be able to catch any errors in templating and to have basic E2E correctness checks for each task, to avoid issues like #108. There is some repetition across tests which is intentional: it's best to decouple tests for different tasks. I'll add more tests for other tasks in future PRs. --------- Signed-off-by: SumanthRH <sumanthrh@anyscale.com>
1 parent 8d66b76 commit 2e5db2b

5 files changed

Lines changed: 247 additions & 1 deletion

File tree

tests/evals/tasks/test_aime.py

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
import pytest
2+
3+
from skythought.evals.tasks.aime.aime_handler import AIMETaskHandler
4+
5+
6+
class MockTaskConfig:
    """Minimal stand-in for the AIME task configuration used by these tests."""

    # Template expects a `prompt` placeholder filled by the handler.
    templating_parameters = {"template": "Problem: {prompt}\n\nProvide a numerical answer."}
    answer_key = "answer"
    question_key = "question"
12+
13+
14+
@pytest.mark.parametrize(
    "sample, model_response, is_correct",
    [
        # Correct numerical answer embedded in a sentence.
        (
            {"question": "Find the sum of the first 10 positive integers.", "answer": "55"},
            "The sum is 55",
            True,
        ),
        # Wrong numerical answer.
        (
            {"question": "What is the value of (3^4 - 2^5)?", "answer": "49"},
            "48",
            False,
        ),
    ],
)
def test_check_correctness(sample, model_response, is_correct):
    """Grading should accept/reject the generation against the reference answer."""
    handler = AIMETaskHandler(task_config=MockTaskConfig)
    assert handler.check_correctness(sample, generation=model_response) == is_correct
38+
39+
40+
@pytest.mark.parametrize(
    "problem, expected",
    [
        (
            {
                "question": "Find the sum of the first 10 positive integers.",
                "answer": "4",
            },
            "Problem: Find the sum of the first 10 positive integers.\n\nProvide a numerical answer.",
        ),
    ],
)
def test_generate_prompt(problem, expected):
    """Templating should substitute the question into the AIME prompt template."""
    # Fix: removed a stray debug `print(problem)` left over from development.
    handler = AIMETaskHandler(task_config=MockTaskConfig)
    assert handler.generate_prompt(problem) == expected

tests/evals/tasks/test_amc.py

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
import pytest
2+
3+
from skythought.evals.tasks.amc23.amc23_handler import AMC23TaskHandler
4+
5+
6+
class MockTaskConfig:
    """Minimal stand-in for the AMC23 task configuration used by these tests."""

    templating_parameters = {"template": "Return the answer to the following: {question}"}
    answer_key = "answer"
    question_key = "question"
    choices_key = "choices"
13+
14+
15+
@pytest.mark.parametrize(
    "problem, response, expected",
    [
        (
            {"question": "2+2", "answer": "4"},
            "5",
            False,
        ),
        (
            {"question": "3* 25 percent", "answer": " 75%"},
            "My reply is $0.75.",  # ignores dollar signs and normalizes percentages
            True,
        ),
    ],
)
def test_check_correctness(problem, response, expected):
    """Grading should compare the generation against the reference answer."""
    handler = AMC23TaskHandler(task_config=MockTaskConfig)
    # Fix: evaluate once. The previous version printed the result and then
    # re-ran check_correctness for the assertion (duplicate work + debug print).
    result = handler.check_correctness(problem, generation=response)
    assert result == expected
34+
35+
36+
@pytest.mark.parametrize(
    "sample, rendered",
    [
        (
            {"question": "What is the result of 2+2?", "answer": "4"},
            "Return the answer to the following: What is the result of 2+2?",
        ),
    ],
)
def test_generate_prompt(sample, rendered):
    """The handler should fill the template with the raw question text."""
    handler = AMC23TaskHandler(task_config=MockTaskConfig)
    assert handler.generate_prompt(sample) == rendered

tests/evals/tasks/test_math.py

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,9 @@
44

55

66
class MockTaskConfig:
    """Minimal stand-in for the MATH task configuration used by these tests."""

    templating_parameters = {"template": "Return the answer to the following: {question}"}
    answer_key = "answer"
    question_key = "question"
1012

@@ -42,3 +44,20 @@ def test_check_correctness(
4244
):
4345
handler = MathTaskHandler(task_config=MockTaskConfig)
4446
assert handler.check_correctness(problem, generation=response) == expected
47+
48+
49+
@pytest.mark.parametrize(
    "sample, rendered",
    [
        (
            {"question": "What is the result of 2+2?", "answer": "4"},
            "Return the answer to the following: What is the result of 2+2?",
        ),
    ],
)
def test_generate_prompt(sample, rendered):
    """Templating should substitute the question into the MATH prompt template."""
    handler = MathTaskHandler(task_config=MockTaskConfig)
    assert handler.generate_prompt(sample) == rendered

tests/evals/tasks/test_mmlu.py

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
import pytest
2+
3+
from skythought.evals.tasks.mmlu.mmlu_handler import MMLUTaskHandler
4+
5+
6+
class MockTaskConfig:
    """Minimal MMLU task config: identity template plus dataset key names."""

    templating_parameters = {"template": "{prompt}"}
    answer_key = "answer"
    question_key = "question"
    choices_key = "choices"
11+
12+
13+
@pytest.mark.parametrize(
    "sample, model_response, is_correct",
    [
        # Response names the correct lettered choice (answer index 1 -> "B").
        (
            {
                "question": "What is the capital of France?",
                "choices": "A) London\nB) Paris\nC) Berlin\nD) Madrid",
                "answer": 1,
            },
            "The answer is B) Paris",
            True,
        ),
        # Response picks the wrong letter.
        (
            {
                "question": "Which element has the atomic number 1?",
                "choices": "A) Helium\nB) Oxygen\nC) Hydrogen\nD) Carbon",
                "answer": 2,
            },
            "A",
            False,
        ),
    ],
)
def test_check_correctness(sample, model_response, is_correct):
    """Grading should map the answer index onto the chosen letter."""
    handler = MMLUTaskHandler(task_config=MockTaskConfig)
    assert handler.check_correctness(sample, generation=model_response) == is_correct
39+
40+
41+
@pytest.mark.parametrize(
    "sample, rendered",
    [
        (
            {
                "question": "What is the capital of France?",
                "answer": "B",
                "choices": ["London", "Paris", "Berlin", "Madrid"],
            },
            "What is the capital of France?\nAnswer Choices: (A) London (B) Paris (C) Berlin (D) Madrid",
        ),
    ],
)
def test_generate_prompt(sample, rendered):
    """The handler should append lettered answer choices to the question."""
    handler = MMLUTaskHandler(task_config=MockTaskConfig)
    assert handler.generate_prompt(sample) == rendered

tests/evals/tasks/test_mmlu_pro.py

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
import pytest
2+
3+
from skythought.evals.tasks.mmlu.mmlu_handler import MMLUProTaskHandler
4+
5+
6+
class MockTaskConfig:
    """Minimal MMLU-Pro task config used by these tests."""

    templating_parameters = {"template": "Question: {prompt}"}
    answer_key = "answer"
    question_key = "question"
    choices_key = "choices"
    context_key = "context"
12+
13+
14+
@pytest.mark.parametrize(
    "sample, model_response, is_correct",
    [
        # Response states the correct lettered option.
        (
            {
                "question": "What is the main function of the left ventricle?",
                "choices": "A) Pumps blood to the lungs\nB) Pumps blood to the body\nC) Collects blood from the body\nD) Stores blood",
                "answer": "B",
                "answer_index": 1,
            },
            "B) Pumps blood to the body",
            True,
        ),
        # Response picks a different letter than the reference.
        (
            {
                "question": "What does GDP stand for?",
                "choices": "A) Gross Domestic Product\nB) General Development Plan\nC) Global Distribution Process\nD) Geographic Data Point",
                "answer": "A",
                "answer_index": 0,
            },
            "I think it's B",
            False,
        ),
    ],
)
def test_check_correctness(sample, model_response, is_correct):
    """Grading should compare the chosen letter against the reference answer."""
    handler = MMLUProTaskHandler(task_config=MockTaskConfig)
    assert handler.check_correctness(sample, generation=model_response) == is_correct
42+
43+
44+
@pytest.mark.parametrize(
    "sample, rendered",
    [
        (
            {
                "question": "What is the main function of the left ventricle?",
                "options": [
                    "Pumps blood to the lungs",
                    "Pumps blood to the body",
                    "Collects blood from the body",
                    "Stores blood",
                ],
                "answer": "B",
                "answer_index": 1,
            },
            "Question: What is the main function of the left ventricle?\nAnswer Choices: (A) Pumps blood to the lungs (B) Pumps blood to the body (C) Collects blood from the body (D) Stores blood",
        ),
    ],
)
def test_generate_prompt(sample, rendered):
    """The handler should render the question and lettered options via the template."""
    handler = MMLUProTaskHandler(task_config=MockTaskConfig)
    assert handler.generate_prompt(sample) == rendered

0 commit comments

Comments
 (0)