
Commit d07c7cd

chore: add LLM minimum acceptance test workflow with CI-managed model matrix
Move LLM provider/model selection from Python-level pytest.mark.parametrize to a GitHub Actions matrix. Each provider/model combo runs as a separate CI job for clear per-model failure visibility.

- Rewrite test_llm_provider.py to read LLM_TEST_PROVIDER/LLM_TEST_MODEL from env vars instead of the hardcoded MODEL_MATRIX
- Mark with pytest.mark.llm, excluded from test-api via -m "not llm"
- Add test-llm-acceptance.yml workflow (daily cron, manual dispatch, or 'llm-tests' label) with a matrix of 14 provider/model combinations
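The commit message pins the test-selection contract to two env vars plus a marker. Below is a minimal sketch of how the rewritten test_llm_provider.py might consume them, assuming only what the message states (LLM_TEST_PROVIDER, LLM_TEST_MODEL, and the llm marker); fixture and test names are illustrative, not taken from the repo.

# Hypothetical sketch: env-var-driven provider/model selection.
# Only LLM_TEST_PROVIDER, LLM_TEST_MODEL and the `llm` marker are confirmed
# by the commit message; everything else here is illustrative.
import os

import pytest

pytestmark = pytest.mark.llm  # excluded from the regular suite via -m "not llm"

PROVIDER = os.environ.get("LLM_TEST_PROVIDER")
MODEL = os.environ.get("LLM_TEST_MODEL")


@pytest.fixture(scope="module")
def llm_target():
    # Each CI matrix job sets exactly one provider/model pair.
    if not PROVIDER or not MODEL:
        pytest.skip("LLM_TEST_PROVIDER / LLM_TEST_MODEL not set")
    return {"provider": PROVIDER, "model": MODEL}


def test_minimum_acceptance_smoke(llm_target):
    # Placeholder assertion; the real test would exercise the provider.
    assert llm_target["provider"] and llm_target["model"]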
1 parent c1eaf71 commit d07c7cd

4 files changed

Lines changed: 288 additions & 315 deletions

.github/workflows/test-llm-acceptance.yml

Lines changed: 137 additions & 0 deletions
@@ -0,0 +1,137 @@
name: LLM Acceptance Tests

on:
  schedule:
    - cron: "30 6 * * *"
  workflow_dispatch:
  pull_request:
    types: [labeled]

jobs:
  test-llm-acceptance:
    # Run on schedule, manual dispatch, or when the 'llm-tests' label is added
    if: >-
      github.event_name == 'schedule' ||
      github.event_name == 'workflow_dispatch' ||
      (github.event_name == 'pull_request' && github.event.label.name == 'llm-tests')
    runs-on: ubuntu-latest
    strategy:
      fail-fast: false
      matrix:
        include:
          # VertexAI (Gemini) models
          - provider: vertexai
            model: google/gemini-2.5-flash
            api_key_env: ""
          - provider: vertexai
            model: google/gemini-2.5-flash-lite
            api_key_env: ""
          # OpenAI models
          - provider: openai
            model: gpt-4o-mini
            api_key_env: OPENAI_API_KEY
          - provider: openai
            model: gpt-4.1-mini
            api_key_env: OPENAI_API_KEY
          - provider: openai
            model: gpt-4.1-nano
            api_key_env: OPENAI_API_KEY
          # Anthropic models
          - provider: anthropic
            model: claude-sonnet-4-20250514
            api_key_env: ANTHROPIC_API_KEY
          - provider: anthropic
            model: claude-haiku-4-20250514
            api_key_env: ANTHROPIC_API_KEY
          # Groq models
          - provider: groq
            model: openai/gpt-oss-20b
            api_key_env: GROQ_API_KEY
          # DeepSeek models
          - provider: deepseek
            model: deepseek-chat
            api_key_env: DEEPSEEK_API_KEY
          # Gemini (direct API) models
          - provider: gemini
            model: gemini-2.5-flash
            api_key_env: GEMINI_API_KEY
          - provider: gemini
            model: gemini-2.5-flash-lite
            api_key_env: GEMINI_API_KEY
          # Bedrock models
          - provider: bedrock
            model: us.amazon.nova-2-lite-v1:0
            api_key_env: ""
    env:
      # Test matrix env vars
      LLM_TEST_PROVIDER: ${{ matrix.provider }}
      LLM_TEST_MODEL: ${{ matrix.model }}
      # Default LLM config (needed for MemoryEngine fixtures)
      HINDSIGHT_API_LLM_PROVIDER: vertexai
      HINDSIGHT_API_LLM_VERTEXAI_SERVICE_ACCOUNT_KEY: /tmp/gcp-credentials.json
      HINDSIGHT_API_LLM_MODEL: google/gemini-2.5-flash-lite
      # API keys - each job only needs one, but we set all so the fixture works
      GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }}
      OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
      ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
      GROQ_API_KEY: ${{ secrets.GROQ_API_KEY }}
      DEEPSEEK_API_KEY: ${{ secrets.DEEPSEEK_API_KEY }}
      COHERE_API_KEY: ${{ secrets.COHERE_API_KEY }}
      AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
      AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
      AWS_REGION_NAME: ${{ secrets.AWS_REGION_NAME }}

    name: ${{ matrix.provider }}/${{ matrix.model }}

    steps:
      - uses: actions/checkout@v6
        with:
          ref: ${{ github.event.pull_request.head.sha || '' }}

      - name: Skip if API key is missing
        if: matrix.api_key_env != '' && !secrets[matrix.api_key_env]
        run: |
          echo "::warning::Skipping ${{ matrix.provider }}/${{ matrix.model }} — ${{ matrix.api_key_env }} secret not set"
          exit 0

      - name: Setup GCP credentials
        run: |
          printf '%s' '${{ secrets.GCP_VERTEXAI_CREDENTIALS }}' > /tmp/gcp-credentials.json
          PROJECT_ID=$(jq -r '.project_id' /tmp/gcp-credentials.json)
          echo "HINDSIGHT_API_LLM_VERTEXAI_PROJECT_ID=$PROJECT_ID" >> $GITHUB_ENV

      - name: Install uv
        uses: astral-sh/setup-uv@v7
        with:
          enable-cache: true
          prune-cache: false

      - name: Set up Python
        uses: actions/setup-python@v6
        with:
          python-version-file: ".python-version"

      - name: Install dependencies
        working-directory: ./hindsight-api-slim
        run: uv sync --frozen --all-extras --index-strategy unsafe-best-match

      - name: Cache HuggingFace models
        uses: actions/cache@v5
        with:
          path: ~/.cache/huggingface
          key: ${{ runner.os }}-huggingface-${{ hashFiles('hindsight-api-slim/pyproject.toml') }}
          restore-keys: |
            ${{ runner.os }}-huggingface-

      - name: Pre-download models
        working-directory: ./hindsight-api-slim
        run: |
          uv run python -c "
          from sentence_transformers import SentenceTransformer, CrossEncoder
          SentenceTransformer('BAAI/bge-small-en-v1.5')
          CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')
          "

      - name: Run LLM acceptance tests
        working-directory: ./hindsight-api-slim
        run: uv run pytest tests/test_llm_provider.py -v --timeout 600
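The env block above exports every provider's key because, per its inline comment, the test fixture resolves whichever one the selected provider needs. A hypothetical sketch of such a fixture follows; the provider-to-key mapping mirrors the matrix above, but the fixture name and return shape are assumptions, not taken from the repo.

# Hypothetical conftest-style helper matching the workflow's env contract.
# The mapping mirrors the CI matrix; vertexai and bedrock authenticate via
# service-account / AWS credentials rather than a single key (api_key_env: "").
import os

import pytest

PROVIDER_KEY_ENV = {
    "openai": "OPENAI_API_KEY",
    "anthropic": "ANTHROPIC_API_KEY",
    "groq": "GROQ_API_KEY",
    "deepseek": "DEEPSEEK_API_KEY",
    "gemini": "GEMINI_API_KEY",
}


@pytest.fixture
def llm_credentials():
    provider = os.environ["LLM_TEST_PROVIDER"]
    model = os.environ["LLM_TEST_MODEL"]
    key_env = PROVIDER_KEY_ENV.get(provider)
    api_key = os.environ.get(key_env) if key_env else None
    if key_env and not api_key:
        pytest.skip(f"{key_env} is not set for provider {provider}")
    return {"provider": provider, "model": model, "api_key": api_key}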

.github/workflows/test.yml

Lines changed: 1 addition & 1 deletion
@@ -1167,7 +1167,7 @@ jobs:

       - name: Run tests
         working-directory: ./hindsight-api-slim
-        run: uv run pytest tests -v
+        run: uv run pytest tests -v -m "not llm"

   test-api-oracle:
     needs: [detect-changes]
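With -m "not llm" keeping paid API calls out of the regular test-api job, a single matrix cell can be reproduced locally by inverting the marker expression. A sketch, assuming it is run from hindsight-api-slim, with example values taken from the CI matrix:

# Hypothetical local runner for one matrix cell; the provider/model values are
# examples from the CI matrix, and the script assumes the current working
# directory is hindsight-api-slim.
import os
import sys

import pytest

os.environ.setdefault("LLM_TEST_PROVIDER", "openai")
os.environ.setdefault("LLM_TEST_MODEL", "gpt-4o-mini")

# Select only llm-marked tests, mirroring the dedicated workflow job.
sys.exit(pytest.main(["tests/test_llm_provider.py", "-m", "llm", "-v"]))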

hindsight-api-slim/pyproject.toml

Lines changed: 1 addition & 0 deletions
@@ -141,6 +141,7 @@ log_cli_date_format = "%Y-%m-%d %H:%M:%S"
 addopts = "--timeout 300 -n 8 --dist loadgroup --durations=10 -v"
 markers = [
     "oracle: Oracle 23ai integration tests (require ORACLE_TEST_DSN env var)",
+    "llm: LLM acceptance tests (require LLM_TEST_PROVIDER and LLM_TEST_MODEL env vars)",
 ]
 asyncio_mode = "auto"
 asyncio_default_fixture_loop_scope = "function"
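Registering the marker in pyproject.toml is what lets pytest filter on it without unknown-mark warnings. A minimal illustration (the test name is hypothetical): tests carrying the marker are selected by -m "llm" in the LLM workflow and deselected by -m "not llm" in test-api.

# Minimal illustration of the registered marker; the test name is hypothetical.
import pytest


@pytest.mark.llm
def test_provider_responds():
    ...  # deselected via -m "not llm", selected via -m "llm"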
