# Source: python-sdk/tests/test_eval_protocol_import.py at b6c1af91b5fd9c3195723c63f91843a9412d11b5 · eval-protocol/python-sdk · GitHub
"""Test that eval_protocol imports work correctly and expose the expected public functionality."""

import importlib
import sys
from unittest.mock import patch

import pytest


class TestRewardProtocolImports:
    """Smoke tests for the public import surface of the ``eval_protocol`` package.

    NOTE(review): earlier revisions of these tests spoke of "both packages" and
    used ``rk_``/``rp_`` prefixed aliases, yet every import targeted
    ``eval_protocol``. Presumably that was left over from a package rename --
    confirm. Assertions that compared an object with itself (and could
    therefore never fail) have been removed; each test now checks only what it
    can actually detect: that a name is importable, present, or behaves as
    expected.
    """

    def test_basic_imports(self):
        """Check that the top-level package can be imported."""
        import eval_protocol

        assert eval_protocol is not None

    def test_version_consistency(self):
        """Check that the package exposes a ``__version__`` attribute."""
        import eval_protocol

        assert hasattr(eval_protocol, "__version__")

    def test_all_exports_consistency(self):
        """Check that the package declares an ``__all__`` list."""
        import eval_protocol

        assert hasattr(eval_protocol, "__all__")

    def test_core_classes_available(self):
        """Check that the core classes can be imported from the package root."""
        # A missing export raises ImportError, which fails the test; the
        # import statement itself is the assertion.
        from eval_protocol import (  # noqa: F401
            EvaluateResult,
            Message,
            MetricResult,
            RewardFunction,
        )

    def test_functions_available(self):
        """Check that the core functions can be imported from the package root."""
        # As above, ImportError on any of these names fails the test.
        from eval_protocol import (  # noqa: F401
            load_jsonl,
            make,
            reward_function,
            rollout,
            test_mcp,
        )

    def test_submodules_available(self):
        """Check that a few key submodules are reachable as package attributes."""
        import eval_protocol

        for submodule in ("models", "auth", "config", "rewards", "mcp"):
            assert hasattr(eval_protocol, submodule), f"eval_protocol has no attribute '{submodule}'"

    def test_star_import_works(self):
        """Check that ``from eval_protocol import *`` exposes the key public names."""
        # ``import *`` is only legal at module level, so it is executed via
        # exec() into a throwaway namespace; this also keeps the imported
        # names out of this test module's globals.
        namespace = {}
        exec("from eval_protocol import *", namespace)

        # Ignore the dunder entries (e.g. ``__builtins__``) that exec() adds.
        exported = {name for name in namespace if not name.startswith("__")}

        for item in ("RewardFunction", "Message", "reward_function", "load_jsonl"):
            assert item in exported, f"'{item}' missing from star import"

    def test_reward_function_decorator_works(self):
        """Check that a function wrapped by ``@reward_function`` returns the expected result."""
        from eval_protocol import EvaluateResult, reward_function

        @reward_function
        def length_reward(response: str, **kwargs) -> EvaluateResult:
            score = len(response) / 10.0
            return EvaluateResult(
                score=score,
                reason=f"Score based on response length: {len(response)} characters",
                is_score_valid=True,
            )

        test_input = "Hello, world!"
        result = length_reward(test_input)

        assert isinstance(result, EvaluateResult)
        assert result.score == len(test_input) / 10.0

    def test_message_class_works(self):
        """Check that ``Message`` can be built from keyword data and keeps its fields."""
        from eval_protocol import Message

        msg_data = {"role": "user", "content": "Hello"}
        msg = Message(**msg_data)

        # Compare against the input rather than against a second instance, so
        # a Message that mangles its fields is actually detected.
        assert msg.role == msg_data["role"]
        assert msg.content == msg_data["content"]

    def test_console_scripts_in_setup(self):
        """Check that the expected console scripts are declared in pyproject.toml."""
        import os

        # Read the file as text instead of parsing/executing any build logic.
        pyproject_path = os.path.join(os.path.dirname(__file__), "..", "pyproject.toml")
        # TOML files are UTF-8 by specification; do not rely on the locale default.
        with open(pyproject_path, "r", encoding="utf-8") as f:
            pyproject_content = f.read()

        expected_scripts = [
            'fireworks-reward = "eval_protocol.cli:main"',
            'eval-protocol = "eval_protocol.cli:main"',
        ]

        for script in expected_scripts:
            assert script in pyproject_content, f"Console script '{script}' not found in pyproject.toml"

    def test_package_structure_in_setup(self):
        """Check that setuptools discovers ``eval_protocol`` and at least one subpackage."""
        import os

        from setuptools import find_packages

        # Anchor discovery at the directory above this test file (the same
        # place test_console_scripts_in_setup looks for pyproject.toml) so the
        # result does not depend on the directory pytest was launched from.
        repo_root = os.path.normpath(os.path.join(os.path.dirname(os.path.abspath(__file__)), os.pardir))
        packages = find_packages(where=repo_root, include=["eval_protocol*"])

        assert "eval_protocol" in packages
        assert any(pkg.startswith("eval_protocol.") for pkg in packages)

    def test_deep_import_consistency(self):
        """Check that names are importable from submodules, or else from the package root."""
        # Only the import sits inside each ``try`` so that nothing but an
        # ImportError from the deep import triggers the fallback. If the
        # fallback import fails as well, the ImportError fails the test.
        try:
            from eval_protocol.models import Message  # noqa: F401
        except ImportError:
            # Submodule imports may be unavailable in some install scenarios;
            # fall back to the package-level export.
            from eval_protocol import Message  # noqa: F401

        try:
            from eval_protocol.auth import get_fireworks_account_id  # noqa: F401
        except ImportError:
            # Fall back to checking that the ``auth`` submodule itself is reachable.
            from eval_protocol import auth  # noqa: F401


class TestRewardProtocolFunctionality:
    """Exercise the core eval_protocol objects: reward functions, messages and helpers."""

    def test_reward_function_creation(self):
        """Decorate a length-based scorer and verify the result it produces."""
        from eval_protocol import EvaluateResult, reward_function

        @reward_function
        def simple_reward(response: str, **kwargs) -> EvaluateResult:
            """Simple reward based on response length."""
            return EvaluateResult(
                score=float(len(response)),
                reason=f"Score based on response length: {len(response)} characters",
                is_score_valid=True,
            )

        outcome = simple_reward("Hello")

        # "Hello" has five characters, so both the score and the reason reflect 5.
        assert isinstance(outcome, EvaluateResult)
        assert outcome.score == 5.0
        assert outcome.is_score_valid is True
        assert "5 characters" in outcome.reason

        # The decorator must hand back something that is still callable.
        assert callable(simple_reward)

    def test_message_creation(self):
        """Build a Message from role/content and read both fields back."""
        from eval_protocol import Message

        fields = {"role": "user", "content": "Test message"}
        message = Message(**fields)

        assert message.role == "user"
        assert message.content == "Test message"

    def test_message_preserves_token_ids(self):
        """Verify that token IDs given to a Message appear unchanged in its dump."""
        from eval_protocol import Message

        message = Message(role="assistant", content="Hi", token_ids=[1, 2], logprobs=[-0.1, -0.2])
        dumped = message.model_dump()

        assert dumped["token_ids"] == [1, 2]

    def test_message_rejects_misaligned_float_logprobs(self):
        """Verify that two token IDs paired with a single float logprob fail validation."""
        import pytest
        from pydantic import ValidationError

        from eval_protocol import Message

        # Two token IDs but only one logprob: the lengths do not line up.
        mismatched = {"token_ids": [1, 2], "logprobs": [-0.1]}

        with pytest.raises(ValidationError):
            Message(role="assistant", content="Hi", **mismatched)

    def test_utility_functions(self):
        """Verify that the utility helpers exported by eval_protocol are callable."""
        from eval_protocol import create_llm_resource, load_jsonl

        for helper in (load_jsonl, create_llm_resource):
            assert callable(helper)


if __name__ == "__main__":
    # Run this file's tests verbosely and propagate pytest's exit status, so a
    # failing run is visible to the calling shell or CI job instead of always
    # exiting with status 0.
    raise SystemExit(pytest.main([__file__, "-v"]))