Deepcoder example

xzrderek · xzrderek · commit 2422f24dce90 · 2025-08-04T11:31:30.000-07:00
diff --git a/tests/pytest/data/deepcoder_dataset.jsonl b/tests/pytest/data/deepcoder_dataset.jsonl
@@ -0,0 +1,10 @@
+{"prompt": "Write a Python function `add_one` that takes an integer and returns the integer incremented by 1.", "input": "5", "expected_output": "6"}
+{"prompt": "Write a Python function `add_one` that takes an integer and returns the integer incremented by 1.", "input": "-2", "expected_output": "-1"}
+{"prompt": "Write a Python function `add_one` that takes an integer and returns the integer incremented by 1.", "input": "0", "expected_output": "1"}
+{"prompt": "Write a Python function `multiply_by_two` that takes an integer and returns the integer multiplied by 2.", "input": "3", "expected_output": "6"}
+{"prompt": "Write a Python function `multiply_by_two` that takes an integer and returns the integer multiplied by 2.", "input": "-4", "expected_output": "-8"}
+{"prompt": "Write a Python function `multiply_by_two` that takes an integer and returns the integer multiplied by 2.", "input": "0", "expected_output": "0"}
+{"prompt": "Write a Python function `multiply_by_two` that takes an integer and returns the integer multiplied by 2.", "input": "10", "expected_output": "20"}
+{"prompt": "Write a Python function `get_length` that takes a list and returns its length.", "input": "[1, 2, 3]", "expected_output": "3"}
+{"prompt": "Write a Python function `get_length` that takes a list and returns its length.", "input": "[]", "expected_output": "0"}
+{"prompt": "Write a Python function `get_length` that takes a list and returns its length.", "input": "['a', 'b', 'c', 'd']", "expected_output": "4"}
diff --git a/tests/pytest/test_deepcoder.py b/tests/pytest/test_deepcoder.py
@@ -0,0 +1,92 @@
+"""
+Pytest test for deepcoder code evaluation using the evaluation_test decorator.
+
+This test demonstrates how to evaluate code correctness by executing Python code locally
+and comparing the output against expected results in a pointwise manner.
+"""
+
+from typing import Any, Dict, List
+
+from eval_protocol.models import EvaluateResult, EvaluationRow, Message
+from eval_protocol.pytest import default_single_turn_rollout_processor, evaluation_test
+from eval_protocol.rewards.code_execution import extract_code_blocks, execute_python_code
+
+
+def deepcoder_dataset_to_evaluation_row(data: List[Dict[str, Any]]) -> List[EvaluationRow]:
+    """
+    Build one single-turn EvaluationRow per deepcoder dataset entry: the user message
+    is the entry's prompt followed by its input; ground truth is its expected output.
+    """
+    rows: List[EvaluationRow] = []
+    for entry in data:
+        # Prompt and input are joined with the literal " Input: " separator.
+        user_message = Message(role="user", content=f"{entry['prompt']} Input: {entry['input']}")
+        rows.append(EvaluationRow(messages=[user_message], ground_truth=entry["expected_output"]))
+    return rows
+
+
+@evaluation_test(
+    input_dataset=["tests/pytest/data/deepcoder_dataset.jsonl"],
+    dataset_adapter=deepcoder_dataset_to_evaluation_row,
+    model=["accounts/fireworks/models/kimi-k2-instruct"],
+    rollout_input_params=[{"temperature": 0.0, "max_tokens": 4096}],
+    threshold_of_success=0.5,
+    rollout_processor=default_single_turn_rollout_processor,
+    num_runs=1,
+    mode="pointwise",
+)
+def test_deepcoder_code_evaluation(row: EvaluationRow) -> EvaluationRow:
+    """
+    Score one rollout by running the assistant's first Python code block locally.
+
+    Scoring steps:
+    1. Require at least two messages, the last one from the assistant.
+    2. Extract the Python code blocks from that reply and take the first one.
+    3. Run it with execute_python_code using timeout=10.
+    4. Compare the stripped execution output with the stripped ground_truth.
+
+    Args:
+        row: EvaluationRow holding the conversation messages, with the expected
+            output in ground_truth.
+
+    Returns:
+        The same row with evaluation_result set: score 1.0 when the outputs match,
+        otherwise 0.0 with a reason describing the failure.
+    """
+
+    def finish(score: float, reason: str) -> EvaluationRow:
+        # Record the verdict on the row and hand the row back.
+        row.evaluation_result = EvaluateResult(score=score, reason=reason)
+        return row
+
+    # A gradable row has at least two messages and ends with an assistant turn.
+    has_reply = len(row.messages) >= 2 and row.messages[-1].role == "assistant"
+    if not has_reply:
+        return finish(0.0, "No assistant response found")
+
+    # Missing content or ground truth is treated as an empty string.
+    reply_text = row.messages[-1].content or ""
+    wanted = (row.ground_truth or "").strip()
+
+    # Only the first Python block found in the reply is executed.
+    python_blocks = extract_code_blocks(reply_text, language="python")
+    if not python_blocks:
+        return finish(0.0, "No Python code block found")
+
+    # Run the snippet locally; an unsuccessful run scores zero.
+    first_block_code = python_blocks[0]["code"]
+    run = execute_python_code(first_block_code, timeout=10)
+    if not run.get("success", False):
+        failure = run.get("error", "Code execution failed")
+        return finish(0.0, f"Execution error: {failure}")
+
+    # Surrounding whitespace is ignored on both sides of the comparison.
+    produced = (run.get("output", "") or "").strip()
+    if produced == wanted:
+        score = 1.0
+        reason = f"✅ Output matches: '{produced}'"
+    else:
+        score = 0.0
+        reason = f"❌ Expected: '{wanted}', Got: '{produced}'"
+
+    return finish(score, reason)
diff --git a/tests/pytest/test_markdown_highlighting.py b/tests/pytest/test_markdown_highlighting.py
@@ -24,7 +24,7 @@ def markdown_dataset_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evalu
 @evaluation_test(
     input_dataset=["tests/pytest/data/markdown_dataset.jsonl"],
     dataset_adapter=markdown_dataset_to_evaluation_row,
-    model=["accounts/fireworks/models/llama-v3p1-8b-instruct"],
+    model=["accounts/fireworks/models/kimi-k2-instruct"],
     rollout_input_params=[{"temperature": 0.0, "max_tokens": 4096}],
     threshold_of_success=0.5,
     rollout_processor=default_single_turn_rollout_processor,