eval-protocol
diff --git a/tests/pytest/data/apps_dataset.jsonl b/tests/pytest/data/apps_dataset.jsonl
Lines changed: 0 additions & 3 deletions
diff --git a/tests/pytest/data/apps_sample_dataset.jsonl b/tests/pytest/data/apps_sample_dataset.jsonl
Lines changed: 3 additions & 0 deletions
diff --git a/tests/pytest/data/coding_dataset.jsonl b/tests/pytest/data/basic_coding_dataset.jsonl
rename from tests/pytest/data/coding_dataset.jsonl
rename to tests/pytest/data/basic_coding_dataset.jsonl
diff --git a/tests/pytest/test_apps_coding.py b/tests/pytest/test_apps_coding.py
Lines changed: 6 additions & 16 deletions
diff --git a/tests/pytest/test_basic_coding.py b/tests/pytest/test_basic_coding.py
Lines changed: 2 additions & 2 deletions
@@ -19,47 +19,37 @@ def apps_dataset_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evaluatio
     """
     return [
         EvaluationRow(
-            messages=[Message(role="user", content=row["prompt"])], 
-            ground_truth=json.dumps({
-                "inputs": [row["input"] + "\n"],  # Add newline for stdin format
-                "outputs": [row["expected_output"] + "\n"]  # Add newline for stdout format
-            })
+            messages=[Message(role="user", content=row["question"])],
+            ground_truth=row["input_output"]
         )
         for row in data
     ]
 
 
 @evaluation_test(
-    input_dataset=["tests/pytest/data/apps_dataset.jsonl"],
+    input_dataset=["tests/pytest/data/apps_sample_dataset.jsonl"],
     dataset_adapter=apps_dataset_to_evaluation_row,
     model=["accounts/fireworks/models/kimi-k2-instruct"],
     rollout_input_params=[{"temperature": 0.0, "max_tokens": 4096}],
-    threshold_of_success=0.5,
+    threshold_of_success=0.33,
     rollout_processor=default_single_turn_rollout_processor,
     num_runs=1,
     mode="pointwise",
-    max_dataset_rows=3,  # Limit for testing
 )
 def test_apps_code_evaluation(row: EvaluationRow) -> EvaluationRow:
     """
     Evaluation function that tests APPS coding problems using evaluate_apps_solution.
-    
-    This function:
-    1. Uses the actual evaluate_apps_solution from apps_coding_reward.py
-    2. Expects ground_truth as JSON string with "inputs" and "outputs" arrays
-    3. Returns the evaluation result directly from evaluate_apps_solution
-    
+
     Args:
         row: EvaluationRow containing the conversation messages and ground_truth as JSON string
-        
+    
     Returns:
         EvaluationRow with the evaluation result
     """
     # Use evaluate_apps_solution directly
     result = evaluate_apps_solution(
         messages=row.messages,
         ground_truth=row.ground_truth,
-        execution_timeout=10
     )
 
     # Set the evaluation result on the row
 
@@ -26,11 +26,11 @@ def coding_dataset_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evaluat
 
 
 @evaluation_test(
-    input_dataset=["tests/pytest/data/coding_dataset.jsonl"],
+    input_dataset=["tests/pytest/data/basic_coding_dataset.jsonl"],
     dataset_adapter=coding_dataset_to_evaluation_row,
     model=["accounts/fireworks/models/kimi-k2-instruct"],
     rollout_input_params=[{"temperature": 0.0, "max_tokens": 4096}],
-    threshold_of_success=0.5,
+    threshold_of_success=0.8,
     rollout_processor=default_single_turn_rollout_processor,
     num_runs=1,
     mode="pointwise",