@@ -19,47 +19,37 @@ def apps_dataset_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evaluatio
1919 """
2020 return [
2121 EvaluationRow (
22- messages = [Message (role = "user" , content = row ["prompt" ])],
23- ground_truth = json .dumps ({
24- "inputs" : [row ["input" ] + "\n " ], # Add newline for stdin format
25- "outputs" : [row ["expected_output" ] + "\n " ] # Add newline for stdout format
26- })
22+ messages = [Message (role = "user" , content = row ["question" ])],
23+ ground_truth = row ["input_output" ]
2724 )
2825 for row in data
2926 ]
3027
3128
3229@evaluation_test (
33- input_dataset = ["tests/pytest/data/apps_dataset .jsonl" ],
30+ input_dataset = ["tests/pytest/data/apps_sample_dataset .jsonl" ],
3431 dataset_adapter = apps_dataset_to_evaluation_row ,
3532 model = ["accounts/fireworks/models/kimi-k2-instruct" ],
3633 rollout_input_params = [{"temperature" : 0.0 , "max_tokens" : 4096 }],
37- threshold_of_success = 0.5 ,
34+ threshold_of_success = 0.33 ,
3835 rollout_processor = default_single_turn_rollout_processor ,
3936 num_runs = 1 ,
4037 mode = "pointwise" ,
41- max_dataset_rows = 3 , # Limit for testing
4238)
4339def test_apps_code_evaluation (row : EvaluationRow ) -> EvaluationRow :
4440 """
4541 Evaluation function that tests APPS coding problems using evaluate_apps_solution.
46-
47- This function:
48- 1. Uses the actual evaluate_apps_solution from apps_coding_reward.py
49- 2. Expects ground_truth as JSON string with "inputs" and "outputs" arrays
50- 3. Returns the evaluation result directly from evaluate_apps_solution
51-
42+
5243 Args:
5344 row: EvaluationRow containing the conversation messages and ground_truth as JSON string
54-
45+
5546 Returns:
5647 EvaluationRow with the evaluation result
5748 """
5849 # Use evaluate_apps_solution directly
5950 result = evaluate_apps_solution (
6051 messages = row .messages ,
6152 ground_truth = row .ground_truth ,
62- execution_timeout = 10
6353 )
6454
6555 # Set the evaluation result on the row
0 commit comments