# python-sdk/eval_protocol/benchmarks/test_frozen_lake.py at 4a4baa10196bf6042e3479e8823d429b017e20ae · eval-protocol/python-sdk · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
"""
Pytest test for frozen lake evaluation using the evaluation_test decorator.

This test demonstrates how to use frozen lake environments within the pytest framework,
similar to the test_frozen_lake_e2e test but integrated with the pytest evaluation system.
"""

from typing import Any, Dict, List

from eval_protocol.models import EvaluateResult, EvaluationRow, InputMetadata, Message
from eval_protocol.pytest import evaluation_test
from eval_protocol.pytest.default_mcp_gym_rollout_processor import MCPGymRolloutProcessor


def frozen_lake_to_evaluation_row(data: List[Dict[str, Any]]) -> List[EvaluationRow]:
    """
    Build one EvaluationRow per frozen lake dataset entry.

    Each entry's ``system_prompt`` becomes the single system message of the row,
    its ``id`` becomes the row id, and its ``environment_context`` and
    ``user_prompt_template`` are carried over unchanged in ``dataset_info``.

    Args:
        data: Dataset entries; each must provide the keys ``system_prompt``,
            ``id``, ``environment_context`` and ``user_prompt_template``.

    Returns:
        The converted rows, in the same order as ``data``.

    Raises:
        KeyError: If an entry lacks one of the keys listed above.
    """

    def build_row(entry: Dict[str, Any]) -> EvaluationRow:
        # The conversation starts with just the system prompt.
        system_message = Message(role="system", content=entry["system_prompt"])
        metadata = InputMetadata(
            row_id=entry["id"],
            dataset_info={
                "environment_context": entry["environment_context"],
                "user_prompt_template": entry["user_prompt_template"],
            },
        )
        return EvaluationRow(messages=[system_message], input_metadata=metadata)

    return [build_row(entry) for entry in data]


@evaluation_test(
    input_dataset=["tests/pytest/data/frozen_lake_dataset.jsonl"],
    dataset_adapter=frozen_lake_to_evaluation_row,
    completion_params=[
        {
            "temperature": 0.0,
            "max_tokens": 4096,
            "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905",
        }
    ],
    rollout_processor=MCPGymRolloutProcessor(),
    passed_threshold=0.66,
    num_runs=1,
    max_concurrent_rollouts=3,
    mode="pointwise",
    server_script_path="eval_protocol/mcp_servers/frozen_lake/server.py",
)
def test_frozen_lake_evaluation(row: EvaluationRow) -> EvaluationRow:
    """
    Score a single frozen lake rollout from its total reward.

    The row's total reward (``row.get_total_reward()``) is used unchanged as the
    score. A total reward of exactly 1.0 is reported as the agent reaching the
    goal; any other value is reported as the agent not reaching it.

    Args:
        row: EvaluationRow object from frozen lake dataset

    Returns:
        The same EvaluationRow, with ``evaluation_result`` set to the score and
        its explanatory reason.
    """
    total_reward = row.get_total_reward()
    reached_goal = total_reward == 1.0

    row.evaluation_result = EvaluateResult(
        score=total_reward,
        reason="Agent reached the goal" if reached_goal else "Agent did not reach the goal",
    )
    return row