Skip to content

Commit 8fb5ef3

Browse files
authored
Merge branch 'main' into anx/mcp-header
2 parents 9dd70d6 + 68bc75d commit 8fb5ef3

37 files changed

Lines changed: 2440 additions & 267 deletions

.github/workflows/e2e_tests.yaml

Lines changed: 154 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,154 @@
# E2E integration tests with Lightspeed Core.
#
# For each matrix variant this workflow starts a Lightspeed Core (LSC)
# container, waits until its API answers on port 8080, runs `lightspeed-eval`
# against it and finally checks the PASS count in the generated summary file.
name: E2E Lightspeed Evaluation Integration Tests

on: [push, pull_request]

jobs:
  ##########
  e2e_tests:
    runs-on: ubuntu-latest

    #name: "Lightspeed-stack setup"

    strategy:
      # For local testing use matrix with just one variant, "act" doesn't separate runs
      matrix:
        mode: ["query", "streaming"]
        eval-data: ["tests/integration/test_evaluation_data.yaml"]
        lsc_image_path: ["quay.io/lightspeed-core/lightspeed-stack:latest"]

    name: "E2E Lightspeed Evaluation Test, mode: ${{ matrix.mode }}"

    env:
      # Despite the name, this is used as the *container* name (docker run --name).
      LSC_IMAGE_NAME: "lightspeed-stack-test-mode-${{ matrix.mode }}"
      # Matrix values are exposed as environment variables so that the shell
      # scripts below never have `${{ ... }}` expanded directly into them.
      LSC_IMAGE_PATH: ${{ matrix.lsc_image_path }}
      EVAL_DATA: ${{ matrix.eval-data }}

    steps:
      # Adapted from lightspeed-stack
      - uses: actions/checkout@v4
        with:
          # On pull_request → the fork (or same repo) that opened the PR.
          # On push → falls back to the current repository.
          repository: ${{ github.event.pull_request.head.repo.full_name || github.repository }}

          # On pull_request → the PR head *commit* (reproducible).
          # On push → the pushed commit that triggered the workflow.
          ref: ${{ github.event.pull_request.head.sha || github.sha }}

          # Don't keep credentials when running untrusted PR code under
          # pull_request_target. With the current triggers (push, pull_request)
          # this expression always evaluates to true.
          persist-credentials: ${{ github.event_name != 'pull_request_target' }}

      - name: Verify actual git checkout result
        run: |
          echo "=== Git Status After Checkout ==="
          echo "Remote URLs:"
          git remote -v
          echo ""
          # `git branch --show-current` prints nothing (and still exits 0) on a
          # detached HEAD, so the fallback has to test for an empty string.
          CURRENT_BRANCH="$(git branch --show-current 2>/dev/null || true)"
          echo "Current branch: ${CURRENT_BRANCH:-detached HEAD}"
          echo "Current commit: $(git rev-parse HEAD)"
          echo "Current commit message: $(git log -1 --oneline)"
          echo ""
          echo "=== Recent commits ==="
          git log --oneline -5

      # Run LSC
      # Can't be in onetime separate job -- networking is not shared between jobs
      - name: Run Lightspeed Stack (LSC)
        env:
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
        run: |
          echo "==========Running Lightspeed Core======="
          docker run \
            --name "$LSC_IMAGE_NAME" \
            -p 8080:8080 \
            -v "$(pwd)/tests/integration/lightspeed-stack.yaml:/app-root/lightspeed-stack.yaml:Z" \
            -v "$(pwd)/tests/integration/run.yaml:/app-root/run.yaml:Z" \
            -e OPENAI_API_KEY="${OPENAI_API_KEY}" \
            --detach \
            "$LSC_IMAGE_PATH"
          echo "==========Running Lightspeed Core Done======="

      - name: Show logs from the LSC
        run: |
          sleep 2
          docker container ls -a
          docker logs "$LSC_IMAGE_NAME"

      # Wait for LSC: up to 30 attempts, 2 seconds apart
      - name: Wait for the LSC
        run: |
          echo "Waiting for service on port 8080..."
          for i in {1..30}; do
            if curl --output /dev/null --fail http://localhost:8080/v1/models ; then
              echo "Service is up!"
              exit 0
            fi
            docker logs -n 10 "$LSC_IMAGE_NAME"
            echo "Still waiting..."
            sleep 2
          done

          echo "Service did not start in time"
          exit 1

      # Query mode
      - name: Set query mode
        if: matrix.mode == 'query'
        run: |
          echo "CONFIG=./tests/integration/system-config-query.yaml" >> "$GITHUB_ENV"

      - name: Set streaming mode
        if: matrix.mode == 'streaming'
        run: |
          echo "CONFIG=./tests/integration/system-config-streaming.yaml" >> "$GITHUB_ENV"

      # Dependencies
      - name: Install dependencies for Lightspeed Evaluation
        env:
          TERM: xterm-256color
          FORCE_COLOR: "1"
        run: |
          echo "Installing e2e tests dependencies"
          pip install --break-system-packages uv
          uv sync

      # Run the tests
      - name: Run the tests
        env:
          TERM: xterm-256color
          FORCE_COLOR: "1"
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
        run: |
          echo "============================="
          echo "Running..."
          echo " config: ${CONFIG}"
          echo " LSC image: ${LSC_IMAGE_PATH}"
          echo "============================="
          uv run lightspeed-eval --system-config "${CONFIG}" --eval-data "${EVAL_DATA}"

      # Check the result
      - name: Check test result
        run: |
          # With nullglob an unmatched pattern expands to nothing instead of
          # staying as the literal pattern, so a missing summary file is caught
          # by the count check below.
          shopt -s nullglob
          OUT_FILES=( eval_output/evaluation_*_summary.json )
          if [ "${#OUT_FILES[@]}" -ne 1 ] ; then
            echo "Expected exactly one summary file, found ${#OUT_FILES[@]}:" "${OUT_FILES[@]}"
            exit 1
          fi
          OUT_FILE="${OUT_FILES[0]}"
          PASS="$( jq '.summary_stats.overall.PASS' "${OUT_FILE}" )"
          EXPECTED="1"
          if [ "${PASS}" != "${EXPECTED}" ] ; then
            echo "============"
            echo "Wrong PASS number in ${OUT_FILE}: got ${PASS}, expected ${EXPECTED}"
            echo "============"
            exit 1
          fi


      # Cleanup: runs only when the ACT environment variable is set
      # (local runs with "act"), even if an earlier step failed.
      - name: Stop the LSC if in local devel
        if: ${{ always() && env.ACT }}
        run: |
          echo "Stopping LSC container $LSC_IMAGE_NAME"
          echo "++++++++++++++++++++++"
          docker stop "$LSC_IMAGE_NAME" || true
          docker rm "$LSC_IMAGE_NAME" || true

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -189,3 +189,6 @@ eval_output*/
189189
wip*/
190190

191191
.history/
192+
193+
# Used in e2e tests local testing
194+
.secrets

Makefile

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -42,10 +42,10 @@ check-types: ## Checks type hints in sources
4242
uv run mypy --explicit-package-bases --disallow-untyped-calls --disallow-untyped-defs --disallow-incomplete-defs src/ lsc_agent_eval/src/ tests
4343

4444
black-check:
45-
uv run black . --check
45+
uv run black src tests script lsc_agent_eval --check
4646

4747
black-format:
48-
uv run black .
48+
uv run black src tests script lsc_agent_eval
4949

5050
requirements.txt: pyproject.toml uv.lock ## Generate requirements.txt file containing hashes for all non-devel packages
5151
uv export --no-dev --format requirements-txt --output-file requirements.txt
@@ -82,7 +82,7 @@ docstyle:
8282
uv run pydocstyle -v src tests script lsc_agent_eval
8383

8484
ruff:
85-
uv run ruff check .
85+
uv run ruff check src tests script lsc_agent_eval
8686

8787
bandit: ## Security scanning with Bandit
8888
uv run bandit -r src/lightspeed_evaluation -ll

README.md

Lines changed: 4 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -142,26 +142,20 @@ lightspeed-eval --system-config config/system_api_disabled.yaml --eval-data conf
142142

143143
### Custom Metrics with GEval (from DeepEval)
144144

145-
GEval allows us to define custom evaluation metrics using natural language criteria. Define metrics in `system.yaml` under `metrics_metadata`:
145+
Define custom evaluation metrics in `system.yaml` under `metrics_metadata`. **Criteria** is required; **evaluation_steps** and **rubrics** are optional. Score is 0–1.
146146

147147
```yaml
148148
metrics_metadata:
149149
turn_level:
150150
"geval:custom_metric_name":
151151
criteria: |
152-
Specific criteria for the evaluation.
153-
evaluation_params:
154-
- query
155-
- response
156-
- expected_response # optional
157-
evaluation_steps:
158-
- "Step 1: Check if..."
159-
- "Step 2: Verify that..."
152+
What to evaluate (required).
153+
evaluation_params: [query, response, expected_response]
160154
threshold: 0.7
161155
description: "Metric description"
162156
```
163157
164-
See sample [`system config`](config/system.yaml) for complete examples of `geval:technical_accuracy` and `geval:conversation_coherence`.
158+
See [Configuration → Metrics](docs/configuration.md#metrics) for GEval options (evaluation_steps, rubrics) and [config/system.yaml](config/system.yaml) for full examples.
165159
166160
## ⚙️ Configuration
167161

config/system.yaml

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,9 @@ api:
5858
# Legacy authentication (fallback when mcp_headers.enabled is false)
5959
# Authentication via API_KEY environment variable only for MCP server
6060

61+
# Retry configuration for 429 Too Many Requests API errors
62+
num_retries: 3 # Number of retry attempts (default 3)
63+
6164
# Default metrics metadata
6265
metrics_metadata:
6366
# Turn-level metrics metadata
@@ -130,9 +133,9 @@ metrics_metadata:
130133
distance_measure: "levenshtein" # Options: levenshtein, hamming, jaro, jaro_winkler
131134
default: false # Use custom:answer_correctness for semantic comparison instead
132135

133-
# GEval turn-level metrics
136+
# GEval turn-level metrics (criteria = required; evaluation_steps, rubrics = optional)
134137
"geval:technical_accuracy":
135-
criteria: |
138+
criteria: | # required
136139
Assess whether the response provides technically accurate information,
137140
commands, code, syntax, and follows relevant industry or
138141
domain-specific best practices. The response should
@@ -141,12 +144,19 @@ metrics_metadata:
141144
- query
142145
- response
143146
- expected_response
144-
evaluation_steps:
147+
evaluation_steps: # optional: how to evaluate; if omitted, GEval generates from criteria
145148
- "Verify that the provided syntax (e.g., code, commands, configuration) is valid and follows the language/tool's formatting rules."
146149
- "Check if the response uses appropriate modules, functions, libraries, or parameters for the given task."
147150
- "Assess whether the solution aligns with relevant official documentation or established best practices for the specific domain."
148151
- "Verify the response directly and accurately addresses the user's specific query or task."
149152
- "Check for potential security issues, significant inefficiencies, or anti-patterns."
153+
# rubrics: # optional: score ranges 0-10, non-overlapping, but final score is 0-1; same style as evaluation_steps
154+
# - score_range: [0, 3]
155+
# expected_outcome: "Incorrect or invalid."
156+
# - score_range: [4, 7]
157+
# expected_outcome: "Partially correct or has issues."
158+
# - score_range: [8, 10]
159+
# expected_outcome: "Technically correct and follows best practices."
150160
threshold: 0.7
151161
description: "General technical accuracy of provided commands, code, or technical information"
152162

@@ -166,16 +176,16 @@ metrics_metadata:
166176
threshold: 0.7
167177
description: "How well the model retains information from previous turns"
168178

169-
# GEval conversation-level metrics
179+
# GEval conversation-level metrics (criteria = required; evaluation_steps, rubrics = optional)
170180
"geval:conversation_coherence":
171-
criteria: |
181+
criteria: | # required
172182
Evaluate whether the conversation maintains context and provides coherent
173183
responses across multiple turns. The assistant should reference previous
174184
exchanges and build upon earlier context.
175185
evaluation_params:
176186
- query
177187
- response
178-
evaluation_steps:
188+
evaluation_steps: # optional
179189
- "Check if the assistant remembers information from previous turns"
180190
- "Verify responses build logically on previous context"
181191
- "Assess whether the conversation flows naturally"

0 commit comments

Comments
 (0)