fix(ci): add provider integration tests, rename GOOGLE_API_KEY to GEMINI_API_KEY

ASRagab · claude · ASRagab · commit 90ef35156120 · 2026-02-24T02:47:59.000-05:00
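The CI integration matrix used -k filters (integration_openai, etc.)
but no matching tests existed, causing pytest exit code 5 (no tests
collected). Added live integration tests for OpenAI, Anthropic, and
Gemini providers with skipif guards when API keys are absent. The
Google job's secret is renamed from GOOGLE_API_KEY to GEMINI_API_KEY
so CI exports the same variable the Gemini test's skipif guard reads.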

218 tests passing.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Entire-Checkpoint: 83174226da47
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -54,7 +54,7 @@ jobs:
             secret_env: ANTHROPIC_API_KEY
             pytest_k: integration_anthropic
           - provider: google
-            secret_env: GOOGLE_API_KEY
+            secret_env: GEMINI_API_KEY
             pytest_k: integration_google
     steps:
       - uses: actions/checkout@v4
@@ -81,6 +81,6 @@ jobs:
         env:
           OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
           ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
-          GOOGLE_API_KEY: ${{ secrets.GOOGLE_API_KEY }}
+          GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }}
         # pytest skips tests internally when env vars are absent (via @pytest.mark.skipif)
         run: uv run pytest tests/test_llm_judge.py -k "${{ matrix.pytest_k }}" -v
diff --git a/tests/test_llm_judge.py b/tests/test_llm_judge.py
@@ -234,6 +234,57 @@ def test_invalid_model_raises(self):
             llm_judge_evaluator("maximize quality", model="")
 
 
+class TestLlmJudgeIntegration:
+    """Live integration tests — call real provider APIs.
+
+    Each test is named to match the CI matrix's -k filter
+    (e.g., -k "integration_openai") and skipped when the
+    corresponding API key is absent.
+    """
+
+    @pytest.mark.integration
+    @pytest.mark.skipif(
+        not __import__("os").environ.get("OPENAI_API_KEY"),
+        reason="OPENAI_API_KEY required",
+    )
+    def test_integration_openai_judge(self):
+        evaluator = llm_judge_evaluator(
+            "Score this text on clarity and conciseness. Be strict.",
+            model="openai/gpt-4o-mini",
+        )
+        score, side_info = evaluator("The quick brown fox jumps over the lazy dog.")
+        assert 0.0 <= score <= 1.0
+        assert "reasoning" in side_info or "error" not in side_info
+
+    @pytest.mark.integration
+    @pytest.mark.skipif(
+        not __import__("os").environ.get("ANTHROPIC_API_KEY"),
+        reason="ANTHROPIC_API_KEY required",
+    )
+    def test_integration_anthropic_judge(self):
+        evaluator = llm_judge_evaluator(
+            "Score this text on clarity and conciseness. Be strict.",
+            model="anthropic/claude-haiku-4-5-20251001",
+        )
+        score, side_info = evaluator("The quick brown fox jumps over the lazy dog.")
+        assert 0.0 <= score <= 1.0
+        assert "reasoning" in side_info or "error" not in side_info
+
+    @pytest.mark.integration
+    @pytest.mark.skipif(
+        not __import__("os").environ.get("GEMINI_API_KEY"),
+        reason="GEMINI_API_KEY required",
+    )
+    def test_integration_google_judge(self):
+        evaluator = llm_judge_evaluator(
+            "Score this text on clarity and conciseness. Be strict.",
+            model="gemini/gemini-2.0-flash",
+        )
+        score, side_info = evaluator("The quick brown fox jumps over the lazy dog.")
+        assert 0.0 <= score <= 1.0
+        assert "reasoning" in side_info or "error" not in side_info
+
+
 class TestComputeWeightedScore:
     def test_basic_weighted_average(self):
         dims = [{"name": "a", "weight": 0.7}, {"name": "b", "weight": 0.3}]