Skip to content

Commit 90ef351

Browse files
ASRagab and claude committed
fix(ci): add provider integration tests, rename GOOGLE_API_KEY to GEMINI_API_KEY
The CI integration matrix used -k filters (integration_openai, etc.) but no matching tests existed, causing pytest exit code 5 (no tests collected). Added live integration tests for OpenAI, Anthropic, and Gemini providers with skipif guards when API keys are absent. 218 tests passing.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Entire-Checkpoint: 83174226da47
1 parent 289d04f commit 90ef351

2 files changed

Lines changed: 53 additions & 2 deletions

File tree

.github/workflows/ci.yml

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -54,7 +54,7 @@ jobs:
5454
secret_env: ANTHROPIC_API_KEY
5555
pytest_k: integration_anthropic
5656
- provider: google
57-
secret_env: GOOGLE_API_KEY
57+
secret_env: GEMINI_API_KEY
5858
pytest_k: integration_google
5959
steps:
6060
- uses: actions/checkout@v4
@@ -81,6 +81,6 @@ jobs:
8181
env:
8282
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
8383
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
84-
GOOGLE_API_KEY: ${{ secrets.GOOGLE_API_KEY }}
84+
GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }}
8585
# pytest skips tests internally when env vars are absent (via @pytest.mark.skipif)
8686
run: uv run pytest tests/test_llm_judge.py -k "${{ matrix.pytest_k }}" -v

tests/test_llm_judge.py

Lines changed: 51 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -234,6 +234,57 @@ def test_invalid_model_raises(self):
234234
llm_judge_evaluator("maximize quality", model="")
235235

236236

237+
class TestLlmJudgeIntegration:
238+
"""Live integration tests — call real provider APIs.
239+
240+
Each test is named to match the CI matrix's -k filter
241+
(e.g., -k "integration_openai") and skipped when the
242+
corresponding API key is absent.
243+
"""
244+
245+
@pytest.mark.integration
246+
@pytest.mark.skipif(
247+
not __import__("os").environ.get("OPENAI_API_KEY"),
248+
reason="OPENAI_API_KEY required",
249+
)
250+
def test_integration_openai_judge(self):
251+
evaluator = llm_judge_evaluator(
252+
"Score this text on clarity and conciseness. Be strict.",
253+
model="openai/gpt-4o-mini",
254+
)
255+
score, side_info = evaluator("The quick brown fox jumps over the lazy dog.")
256+
assert 0.0 <= score <= 1.0
257+
assert "reasoning" in side_info or "error" not in side_info
258+
259+
@pytest.mark.integration
260+
@pytest.mark.skipif(
261+
not __import__("os").environ.get("ANTHROPIC_API_KEY"),
262+
reason="ANTHROPIC_API_KEY required",
263+
)
264+
def test_integration_anthropic_judge(self):
265+
evaluator = llm_judge_evaluator(
266+
"Score this text on clarity and conciseness. Be strict.",
267+
model="anthropic/claude-haiku-4-5-20251001",
268+
)
269+
score, side_info = evaluator("The quick brown fox jumps over the lazy dog.")
270+
assert 0.0 <= score <= 1.0
271+
assert "reasoning" in side_info or "error" not in side_info
272+
273+
@pytest.mark.integration
274+
@pytest.mark.skipif(
275+
not __import__("os").environ.get("GEMINI_API_KEY"),
276+
reason="GEMINI_API_KEY required",
277+
)
278+
def test_integration_google_judge(self):
279+
evaluator = llm_judge_evaluator(
280+
"Score this text on clarity and conciseness. Be strict.",
281+
model="gemini/gemini-2.0-flash",
282+
)
283+
score, side_info = evaluator("The quick brown fox jumps over the lazy dog.")
284+
assert 0.0 <= score <= 1.0
285+
assert "reasoning" in side_info or "error" not in side_info
286+
287+
237288
class TestComputeWeightedScore:
238289
def test_basic_weighted_average(self):
239290
dims = [{"name": "a", "weight": 0.7}, {"name": "b", "weight": 0.3}]

0 commit comments

Comments (0)