Skip to content

Commit 2422f24

Browse files
committed
Deepcoder example
1 parent 5ed139a commit 2422f24

3 files changed

Lines changed: 103 additions & 1 deletion

File tree

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
{"prompt": "Write a Python function `add_one` that takes an integer and returns the integer incremented by 1.", "input": "5", "expected_output": "6"}
2+
{"prompt": "Write a Python function `add_one` that takes an integer and returns the integer incremented by 1.", "input": "-2", "expected_output": "-1"}
3+
{"prompt": "Write a Python function `add_one` that takes an integer and returns the integer incremented by 1.", "input": "0", "expected_output": "1"}
4+
{"prompt": "Write a Python function `multiply_by_two` that takes an integer and returns the integer multiplied by 2.", "input": "3", "expected_output": "6"}
5+
{"prompt": "Write a Python function `multiply_by_two` that takes an integer and returns the integer multiplied by 2.", "input": "-4", "expected_output": "-8"}
6+
{"prompt": "Write a Python function `multiply_by_two` that takes an integer and returns the integer multiplied by 2.", "input": "0", "expected_output": "0"}
7+
{"prompt": "Write a Python function `multiply_by_two` that takes an integer and returns the integer multiplied by 2.", "input": "10", "expected_output": "20"}
8+
{"prompt": "Write a Python function `get_length` that takes a list and returns its length.", "input": "[1, 2, 3]", "expected_output": "3"}
9+
{"prompt": "Write a Python function `get_length` that takes a list and returns its length.", "input": "[]", "expected_output": "0"}
10+
{"prompt": "Write a Python function `get_length` that takes a list and returns its length.", "input": "['a', 'b', 'c', 'd']", "expected_output": "4"}

tests/pytest/test_deepcoder.py

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
"""
2+
Pytest test for deepcoder code evaluation using the evaluation_test decorator.
3+
4+
This test demonstrates how to evaluate code correctness by executing Python code locally
5+
and comparing the output against expected results in a pointwise manner.
6+
"""
7+
8+
from typing import Any, Dict, List
9+
10+
from eval_protocol.models import EvaluateResult, EvaluationRow, Message
11+
from eval_protocol.pytest import default_single_turn_rollout_processor, evaluation_test
12+
from eval_protocol.rewards.code_execution import extract_code_blocks, execute_python_code
13+
14+
15+
def deepcoder_dataset_to_evaluation_row(data: List[Dict[str, Any]]) -> List[EvaluationRow]:
    """
    Build one EvaluationRow per deepcoder dataset entry.

    For every entry, the ``prompt`` and ``input`` fields are joined into a
    single user message (``"<prompt> Input: <input>"``) and the
    ``expected_output`` field is stored as the row's ground truth.

    Args:
        data: Dataset entries, each providing ``prompt``, ``input`` and
            ``expected_output`` keys.

    Returns:
        The converted rows, in the same order as ``data``.
    """
    rows: List[EvaluationRow] = []
    for entry in data:
        user_message = Message(role="user", content=f"{entry['prompt']} Input: {entry['input']}")
        rows.append(EvaluationRow(messages=[user_message], ground_truth=entry["expected_output"]))
    return rows
26+
27+
28+
@evaluation_test(
    input_dataset=["tests/pytest/data/deepcoder_dataset.jsonl"],
    dataset_adapter=deepcoder_dataset_to_evaluation_row,
    model=["accounts/fireworks/models/kimi-k2-instruct"],
    rollout_input_params=[{"temperature": 0.0, "max_tokens": 4096}],
    threshold_of_success=0.5,
    rollout_processor=default_single_turn_rollout_processor,
    num_runs=1,
    mode="pointwise",
)
def test_deepcoder_code_evaluation(row: EvaluationRow) -> EvaluationRow:
    """
    Score a single row by running the assistant's Python code and checking its output.

    Steps:
    1. Require that the conversation ends with an assistant message.
    2. Pull the first Python code block out of that message.
    3. Run the code via ``execute_python_code`` with ``timeout=10``.
    4. Compare the whitespace-stripped output against the stripped ground truth.

    Every failure path (no assistant reply, no code block, execution error,
    output mismatch) yields a score of 0.0 with an explanatory reason; an
    exact match yields 1.0.

    Args:
        row: EvaluationRow holding the conversation messages and the expected
            output in ``ground_truth``.

    Returns:
        The same row with ``evaluation_result`` populated.
    """
    conversation = row.messages

    # Guard: the last message must be the model's reply to the user prompt.
    ends_with_assistant = len(conversation) >= 2 and conversation[-1].role == "assistant"
    if not ends_with_assistant:
        row.evaluation_result = EvaluateResult(score=0.0, reason="No assistant response found")
        return row

    reply_text = conversation[-1].content or ""
    expected_output = (row.ground_truth or "").strip()

    # Only the first Python block in the reply is evaluated.
    python_blocks = extract_code_blocks(reply_text, language="python")
    if not python_blocks:
        row.evaluation_result = EvaluateResult(score=0.0, reason="No Python code block found")
        return row

    outcome = execute_python_code(python_blocks[0]["code"], timeout=10)

    if not outcome.get("success", False):
        failure = outcome.get("error", "Code execution failed")
        row.evaluation_result = EvaluateResult(score=0.0, reason=f"Execution error: {failure}")
        return row

    # Treat a missing/None output the same as empty output before comparing.
    actual_output = (outcome.get("output", "") or "").strip()

    if actual_output != expected_output:
        row.evaluation_result = EvaluateResult(
            score=0.0,
            reason=f"❌ Expected: '{expected_output}', Got: '{actual_output}'",
        )
        return row

    row.evaluation_result = EvaluateResult(
        score=1.0,
        reason=f"✅ Output matches: '{actual_output}'",
    )
    return row

tests/pytest/test_markdown_highlighting.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ def markdown_dataset_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evalu
2424
@evaluation_test(
2525
input_dataset=["tests/pytest/data/markdown_dataset.jsonl"],
2626
dataset_adapter=markdown_dataset_to_evaluation_row,
27-
model=["accounts/fireworks/models/llama-v3p1-8b-instruct"],
27+
model=["accounts/fireworks/models/kimi-k2-instruct"],
2828
rollout_input_params=[{"temperature": 0.0, "max_tokens": 4096}],
2929
threshold_of_success=0.5,
3030
rollout_processor=default_single_turn_rollout_processor,

0 commit comments

Comments
 (0)