Skip to content

Commit 7d194fe

Browse files
committed
Adding APPS example
1 parent 0964562 commit 7d194fe

5 files changed

Lines changed: 11 additions & 21 deletions

File tree

tests/pytest/data/apps_dataset.jsonl

Lines changed: 0 additions & 3 deletions
This file was deleted.

tests/pytest/data/apps_sample_dataset.jsonl

Lines changed: 3 additions & 0 deletions
Large diffs are not rendered by default.
File renamed without changes.

tests/pytest/test_apps_coding.py

Lines changed: 6 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -19,47 +19,37 @@ def apps_dataset_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evaluatio
1919
"""
2020
return [
2121
EvaluationRow(
22-
messages=[Message(role="user", content=row["prompt"])],
23-
ground_truth=json.dumps({
24-
"inputs": [row["input"] + "\n"], # Add newline for stdin format
25-
"outputs": [row["expected_output"] + "\n"] # Add newline for stdout format
26-
})
22+
messages=[Message(role="user", content=row["question"])],
23+
ground_truth=row["input_output"]
2724
)
2825
for row in data
2926
]
3027

3128

3229
@evaluation_test(
33-
input_dataset=["tests/pytest/data/apps_dataset.jsonl"],
30+
input_dataset=["tests/pytest/data/apps_sample_dataset.jsonl"],
3431
dataset_adapter=apps_dataset_to_evaluation_row,
3532
model=["accounts/fireworks/models/kimi-k2-instruct"],
3633
rollout_input_params=[{"temperature": 0.0, "max_tokens": 4096}],
37-
threshold_of_success=0.5,
34+
threshold_of_success=0.33,
3835
rollout_processor=default_single_turn_rollout_processor,
3936
num_runs=1,
4037
mode="pointwise",
41-
max_dataset_rows=3, # Limit for testing
4238
)
4339
def test_apps_code_evaluation(row: EvaluationRow) -> EvaluationRow:
4440
"""
4541
Evaluation function that tests APPS coding problems using evaluate_apps_solution.
46-
47-
This function:
48-
1. Uses the actual evaluate_apps_solution from apps_coding_reward.py
49-
2. Expects ground_truth as JSON string with "inputs" and "outputs" arrays
50-
3. Returns the evaluation result directly from evaluate_apps_solution
51-
42+
5243
Args:
5344
row: EvaluationRow containing the conversation messages and ground_truth as JSON string
54-
45+
5546
Returns:
5647
EvaluationRow with the evaluation result
5748
"""
5849
# Use evaluate_apps_solution directly
5950
result = evaluate_apps_solution(
6051
messages=row.messages,
6152
ground_truth=row.ground_truth,
62-
execution_timeout=10
6353
)
6454

6555
# Set the evaluation result on the row

tests/pytest/test_basic_coding.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -26,11 +26,11 @@ def coding_dataset_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evaluat
2626

2727

2828
@evaluation_test(
29-
input_dataset=["tests/pytest/data/coding_dataset.jsonl"],
29+
input_dataset=["tests/pytest/data/basic_coding_dataset.jsonl"],
3030
dataset_adapter=coding_dataset_to_evaluation_row,
3131
model=["accounts/fireworks/models/kimi-k2-instruct"],
3232
rollout_input_params=[{"temperature": 0.0, "max_tokens": 4096}],
33-
threshold_of_success=0.5,
33+
threshold_of_success=0.8,
3434
rollout_processor=default_single_turn_rollout_processor,
3535
num_runs=1,
3636
mode="pointwise",

0 commit comments

Comments
 (0)