
Commit d07c7cd

chore: add LLM minimum acceptance test workflow with CI-managed model matrix
Move LLM provider/model selection from Python-level pytest.mark.parametrize to a GitHub Actions matrix. Each provider/model combo runs as a separate CI job for clear per-model failure visibility.

- Rewrite test_llm_provider.py to read LLM_TEST_PROVIDER/LLM_TEST_MODEL from env vars instead of the hardcoded MODEL_MATRIX
- Mark with pytest.mark.llm, excluded from test-api via -m "not llm"
- Add test-llm-acceptance.yml workflow (daily cron, manual dispatch, or 'llm-tests' label) with a matrix of 14 provider/model combinations
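The commit message pins the test-selection contract to two env vars plus a marker. Below is a minimal sketch of how the rewritten test_llm_provider.py might consume them, assuming only what the message states (LLM_TEST_PROVIDER, LLM_TEST_MODEL, and the llm marker); fixture and test names are illustrative, not taken from the repo.

# Hypothetical sketch: env-var-driven provider/model selection.
# Only LLM_TEST_PROVIDER, LLM_TEST_MODEL and the `llm` marker are confirmed
# by the commit message; everything else here is illustrative.
import os

import pytest

pytestmark = pytest.mark.llm  # excluded from the regular suite via -m "not llm"

PROVIDER = os.environ.get("LLM_TEST_PROVIDER")
MODEL = os.environ.get("LLM_TEST_MODEL")


@pytest.fixture(scope="module")
def llm_target():
    # Each CI matrix job sets exactly one provider/model pair.
    if not PROVIDER or not MODEL:
        pytest.skip("LLM_TEST_PROVIDER / LLM_TEST_MODEL not set")
    return {"provider": PROVIDER, "model": MODEL}


def test_minimum_acceptance_smoke(llm_target):
    # Placeholder assertion; the real test would exercise the provider.
    assert llm_target["provider"] and llm_target["model"]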
1 parent c1eaf71 commit d07c7cd

4 files changed

Lines changed: 288 additions & 315 deletions

.github/workflows/test-llm-acceptance.yml

Lines changed: 137 additions & 0 deletions
@@ -0,0 +1,137 @@
name: LLM Acceptance Tests

on:
  schedule:
    - cron: "30 6 * * *"
  workflow_dispatch:
  pull_request:
    types: [labeled]

jobs:
  test-llm-acceptance:
    # Run on schedule, manual dispatch, or when the 'llm-tests' label is added
    if: >-
      github.event_name == 'schedule' ||
      github.event_name == 'workflow_dispatch' ||
      (github.event_name == 'pull_request' && github.event.label.name == 'llm-tests')
    runs-on: ubuntu-latest
    strategy:
      fail-fast: false
      matrix:
        include:
          # VertexAI (Gemini) models
          - provider: vertexai
            model: google/gemini-2.5-flash
            api_key_env: ""
          - provider: vertexai
            model: google/gemini-2.5-flash-lite
            api_key_env: ""
          # OpenAI models
          - provider: openai
            model: gpt-4o-mini
            api_key_env: OPENAI_API_KEY
          - provider: openai
            model: gpt-4.1-mini
            api_key_env: OPENAI_API_KEY
          - provider: openai
            model: gpt-4.1-nano
            api_key_env: OPENAI_API_KEY
          # Anthropic models
          - provider: anthropic
            model: claude-sonnet-4-20250514
            api_key_env: ANTHROPIC_API_KEY
          - provider: anthropic
            model: claude-haiku-4-20250514
            api_key_env: ANTHROPIC_API_KEY
          # Groq models
          - provider: groq
            model: openai/gpt-oss-20b
            api_key_env: GROQ_API_KEY
          # DeepSeek models
          - provider: deepseek
            model: deepseek-chat
            api_key_env: DEEPSEEK_API_KEY
          # Gemini (direct API) models
          - provider: gemini
            model: gemini-2.5-flash
            api_key_env: GEMINI_API_KEY
          - provider: gemini
            model: gemini-2.5-flash-lite
            api_key_env: GEMINI_API_KEY
          # Bedrock models
          - provider: bedrock
            model: us.amazon.nova-2-lite-v1:0
            api_key_env: ""
    env:
      # Test matrix env vars
      LLM_TEST_PROVIDER: ${{ matrix.provider }}
      LLM_TEST_MODEL: ${{ matrix.model }}
      # Default LLM config (needed for MemoryEngine fixtures)
      HINDSIGHT_API_LLM_PROVIDER: vertexai
      HINDSIGHT_API_LLM_VERTEXAI_SERVICE_ACCOUNT_KEY: /tmp/gcp-credentials.json
      HINDSIGHT_API_LLM_MODEL: google/gemini-2.5-flash-lite
      # API keys - each job only needs one, but we set all so the fixture works
      GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }}
      OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
      ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
      GROQ_API_KEY: ${{ secrets.GROQ_API_KEY }}
      DEEPSEEK_API_KEY: ${{ secrets.DEEPSEEK_API_KEY }}
      COHERE_API_KEY: ${{ secrets.COHERE_API_KEY }}
      AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
      AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
      AWS_REGION_NAME: ${{ secrets.AWS_REGION_NAME }}

    name: ${{ matrix.provider }}/${{ matrix.model }}

    steps:
      - uses: actions/checkout@v6
        with:
          ref: ${{ github.event.pull_request.head.sha || '' }}

      - name: Skip if API key is missing
        if: matrix.api_key_env != '' && !secrets[matrix.api_key_env]
        run: |
          echo "::warning::Skipping ${{ matrix.provider }}/${{ matrix.model }} — ${{ matrix.api_key_env }} secret not set"
          exit 0

      - name: Setup GCP credentials
        run: |
          printf '%s' '${{ secrets.GCP_VERTEXAI_CREDENTIALS }}' > /tmp/gcp-credentials.json
          PROJECT_ID=$(jq -r '.project_id' /tmp/gcp-credentials.json)
          echo "HINDSIGHT_API_LLM_VERTEXAI_PROJECT_ID=$PROJECT_ID" >> $GITHUB_ENV

      - name: Install uv
        uses: astral-sh/setup-uv@v7
        with:
          enable-cache: true
          prune-cache: false

      - name: Set up Python
        uses: actions/setup-python@v6
        with:
          python-version-file: ".python-version"

      - name: Install dependencies
        working-directory: ./hindsight-api-slim
        run: uv sync --frozen --all-extras --index-strategy unsafe-best-match

      - name: Cache HuggingFace models
        uses: actions/cache@v5
        with:
          path: ~/.cache/huggingface
          key: ${{ runner.os }}-huggingface-${{ hashFiles('hindsight-api-slim/pyproject.toml') }}
          restore-keys: |
            ${{ runner.os }}-huggingface-

      - name: Pre-download models
        working-directory: ./hindsight-api-slim
        run: |
          uv run python -c "
          from sentence_transformers import SentenceTransformer, CrossEncoder
          SentenceTransformer('BAAI/bge-small-en-v1.5')
          CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')
          "

      - name: Run LLM acceptance tests
        working-directory: ./hindsight-api-slim
        run: uv run pytest tests/test_llm_provider.py -v --timeout 600
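The env block above exports every provider's key because, per its inline comment, the test fixture resolves whichever one the selected provider needs. A hypothetical sketch of such a fixture follows; the provider-to-key mapping mirrors the matrix above, but the fixture name and return shape are assumptions, not taken from the repo.

# Hypothetical conftest-style helper matching the workflow's env contract.
# The mapping mirrors the CI matrix; vertexai and bedrock authenticate via
# service-account / AWS credentials rather than a single key (api_key_env: "").
import os

import pytest

PROVIDER_KEY_ENV = {
    "openai": "OPENAI_API_KEY",
    "anthropic": "ANTHROPIC_API_KEY",
    "groq": "GROQ_API_KEY",
    "deepseek": "DEEPSEEK_API_KEY",
    "gemini": "GEMINI_API_KEY",
}


@pytest.fixture
def llm_credentials():
    provider = os.environ["LLM_TEST_PROVIDER"]
    model = os.environ["LLM_TEST_MODEL"]
    key_env = PROVIDER_KEY_ENV.get(provider)
    api_key = os.environ.get(key_env) if key_env else None
    if key_env and not api_key:
        pytest.skip(f"{key_env} is not set for provider {provider}")
    return {"provider": provider, "model": model, "api_key": api_key}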

.github/workflows/test.yml

Lines changed: 1 addition & 1 deletion
@@ -1167,7 +1167,7 @@ jobs:

       - name: Run tests
         working-directory: ./hindsight-api-slim
-        run: uv run pytest tests -v
+        run: uv run pytest tests -v -m "not llm"

   test-api-oracle:
     needs: [detect-changes]
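With -m "not llm" keeping paid API calls out of the regular test-api job, a single matrix cell can be reproduced locally by inverting the marker expression. A sketch, assuming it is run from hindsight-api-slim, with example values taken from the CI matrix:

# Hypothetical local runner for one matrix cell; the provider/model values are
# examples from the CI matrix, and the script assumes the current working
# directory is hindsight-api-slim.
import os
import sys

import pytest

os.environ.setdefault("LLM_TEST_PROVIDER", "openai")
os.environ.setdefault("LLM_TEST_MODEL", "gpt-4o-mini")

# Select only llm-marked tests, mirroring the dedicated workflow job.
sys.exit(pytest.main(["tests/test_llm_provider.py", "-m", "llm", "-v"]))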

hindsight-api-slim/pyproject.toml

Lines changed: 1 addition & 0 deletions
@@ -141,6 +141,7 @@ log_cli_date_format = "%Y-%m-%d %H:%M:%S"
 addopts = "--timeout 300 -n 8 --dist loadgroup --durations=10 -v"
 markers = [
     "oracle: Oracle 23ai integration tests (require ORACLE_TEST_DSN env var)",
+    "llm: LLM acceptance tests (require LLM_TEST_PROVIDER and LLM_TEST_MODEL env vars)",
 ]
 asyncio_mode = "auto"
 asyncio_default_fixture_loop_scope = "function"
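Registering the marker in pyproject.toml is what lets pytest filter on it without unknown-mark warnings. A minimal illustration (the test name is hypothetical): tests carrying the marker are selected by -m "llm" in the LLM workflow and deselected by -m "not llm" in test-api.

# Minimal illustration of the registered marker; the test name is hypothetical.
import pytest


@pytest.mark.llm
def test_provider_responds():
    ...  # deselected via -m "not llm", selected via -m "llm"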
