Anxhela21
diff --git a/‎.github/workflows/e2e_tests.yaml‎
Lines changed: 154 additions & 0 deletions b/‎.github/workflows/e2e_tests.yaml‎
Lines changed: 154 additions & 0 deletions
diff --git a/‎.gitignore‎
Lines changed: 3 additions & 0 deletions b/‎.gitignore‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎Makefile‎
Lines changed: 3 additions & 3 deletions b/‎Makefile‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎README.md‎
Lines changed: 4 additions & 10 deletions b/‎README.md‎
Lines changed: 4 additions & 10 deletions
diff --git a/‎config/system.yaml‎
Lines changed: 16 additions & 6 deletions b/‎config/system.yaml‎
Lines changed: 16 additions & 6 deletions
@@ -0,0 +1,154 @@
+# E2E integration tests with Lightspeed Core
+name: E2E Lightspeed Evaluation Integration Tests
+
+on: [push, pull_request]
+
+jobs:
+  ##########
+  e2e_tests:
+    # Display name; one job instance is created per matrix combination.
+    name: "E2E Lightspeed Evaluation Test, mode: ${{ matrix.mode }}"
+
+    runs-on: ubuntu-latest
+
+    strategy:
+      # For local testing use matrix with just one variant, "act" doesn't separate runs
+      matrix:
+        # Evaluation mode; later steps select the system config from it.
+        mode: ["query", "streaming"]
+        # Evaluation data file handed to lightspeed-eval.
+        eval-data: ["tests/integration/test_evaluation_data.yaml"]
+        # Container image of the Lightspeed Core service under test.
+        lsc_image_path: ["quay.io/lightspeed-core/lightspeed-stack:latest"]
+
+    env:
+      # Container name, derived from the matrix mode so each variant gets its own.
+      LSC_IMAGE_NAME: "lightspeed-stack-test-mode-${{ matrix.mode }}"
+
+    steps:
+      # Checkout logic borrowed from lightspeed-stack
+      - uses: actions/checkout@v4
+        with:
+          # On pull_request events → the fork (or same repo) that opened the PR.
+          # On push                → falls back to the current repository.
+          repository: ${{ github.event.pull_request.head.repo.full_name || github.repository }}
+
+          # On pull_request events → the PR head *commit* (reproducible: a branch
+          #                          name could move between the event and the checkout).
+          # On push                → the pushed commit that triggered the workflow.
+          ref: ${{ github.event.pull_request.head.sha || github.sha }}
+
+          # Don't keep credentials when running untrusted PR code under pull_request_target.
+          persist-credentials: ${{ github.event_name != 'pull_request_target' }}
+
+      - name: Verify actual git checkout result
+        run: |
+          # `git branch --show-current` prints nothing (and still exits 0) on a
+          # detached HEAD, so the fallback must test for empty output rather
+          # than for a non-zero exit status.
+          current_branch="$(git branch --show-current 2>/dev/null || true)"
+
+          echo "=== Git Status After Checkout ==="
+          echo "Remote URLs:"
+          git remote -v
+          echo ""
+          echo "Current branch: ${current_branch:-detached HEAD}"
+          echo "Current commit: $(git rev-parse HEAD)"
+          echo "Current commit message: $(git log -1 --oneline)"
+          echo ""
+          echo "=== Recent commits ==="
+          git log --oneline -5
+
+      # Run LSC
+      # Can't be in a one-time separate job -- networking is not shared between jobs
+      - name: Run Lightspeed Stack (LSC)
+        env:
+          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+        run: |
+          echo "==========Running Lightspeed Core======="
+          # Every expansion is quoted so that a workspace path containing spaces
+          # (possible when running locally) is not split into several arguments.
+          # The container is started detached; later steps poll it on port 8080.
+          docker run \
+            --name "${LSC_IMAGE_NAME}" \
+            -p 8080:8080 \
+            -v "$(pwd)/tests/integration/lightspeed-stack.yaml:/app-root/lightspeed-stack.yaml:Z" \
+            -v "$(pwd)/tests/integration/run.yaml:/app-root/run.yaml:Z" \
+            -e OPENAI_API_KEY="${OPENAI_API_KEY}" \
+            --detach \
+            "${{ matrix.lsc_image_path }}"
+          echo "==========Running Lightspeed Core Done======="
+
+      - name: Show logs from the LSC
+        run: |
+          # Short pause, then list all containers and dump the LSC container log.
+          sleep 2
+          docker ps --all
+          docker logs "${LSC_IMAGE_NAME}"
+
+      # Wait for LSC
+      - name: Wait for the LSC
+        run: |
+          # Poll the models endpoint up to 30 times, 2 seconds apart; succeed on
+          # the first successful response, otherwise fail the step.
+          echo "Waiting for service on port 8080..."
+          attempt=1
+          while [ "${attempt}" -le 30 ]; do
+            if curl --fail --output /dev/null http://localhost:8080/v1/models ; then
+              echo "Service is up!"
+              exit 0
+            fi
+            # Show the most recent container output while waiting.
+            docker logs --tail 10 "${LSC_IMAGE_NAME}"
+            echo "Still waiting..."
+            sleep 2
+            attempt=$((attempt + 1))
+          done
+
+          echo "Service did not start in time"
+          exit 1
+
+      # Mode selection: exactly one of the two steps below runs per matrix
+      # variant and exports CONFIG to all following steps via GITHUB_ENV.
+      - name: Set query mode
+        if: ${{ matrix.mode == 'query' }}
+        run: echo "CONFIG=./tests/integration/system-config-query.yaml" >> "$GITHUB_ENV"
+
+      - name: Set streaming mode
+        if: ${{ matrix.mode == 'streaming' }}
+        run: echo "CONFIG=./tests/integration/system-config-streaming.yaml" >> "$GITHUB_ENV"
+
+      # Dependencies
+      - name: Install dependencies for Lightspeed Evaluation
+        env:
+          FORCE_COLOR: "1"
+          TERM: xterm-256color
+        run: |
+          echo "Installing e2e tests dependencies"
+          # uv is installed with pip first, then `uv sync` is run for the project.
+          pip install --break-system-packages uv
+          uv sync
+
+      # Run the tests
+      - name: Run the tests
+        env:
+          FORCE_COLOR: "1"
+          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+          TERM: xterm-256color
+        run: |
+          # Banner with the effective settings; CONFIG comes from the mode step.
+          cat <<EOF
+          =============================
+          Running...
+            config: ${CONFIG}
+            LSC image: ${{ matrix.lsc_image_path }}
+          =============================
+          EOF
+          uv run lightspeed-eval --system-config "${CONFIG}" --eval-data "${{ matrix.eval-data }}"
+
+      # Check the result
+      - name: Check test result
+        run: |
+          # Without nullglob an unmatched pattern stays in the array as a literal
+          # string, so "no summary file" could not be told apart from "one file".
+          shopt -s nullglob
+          OUT_FILES=( eval_output/evaluation_*_summary.json )
+          if [ "${#OUT_FILES[@]}" -ne 1 ] ; then
+            echo "Expected exactly one summary file, found ${#OUT_FILES[@]}:" "${OUT_FILES[@]}"
+            exit 1
+          fi
+          OUT_FILE="${OUT_FILES[0]}"
+          # Operands are quoted so that an empty jq result fails the comparison
+          # instead of becoming a `[` syntax error that the `if` treats as false.
+          PASS="$( jq .summary_stats.overall.PASS "${OUT_FILE}" )"
+          EXPECTED="1"
+          if [ "${PASS}" != "${EXPECTED}" ] ; then
+            echo "============"
+            echo "Wrong PASS number in ${OUT_FILE}: got ${PASS}, expected ${EXPECTED}"
+            echo "============"
+            exit 1
+          fi
+
+
+      # Cleanup
+      # Runs even after a failed step, but only when the ACT variable is set
+      # (presumably by the local "act" runner -- confirm).
+      - name: Stop the LSC if in local devel
+        if: always() && env.ACT
+        run: |
+          echo "Stopping LSC container $LSC_IMAGE_NAME"
+          echo "++++++++++++++++++++++"
+          # Best effort: a container that is already gone must not fail the job.
+          docker stop "${LSC_IMAGE_NAME}" || true
+          docker rm "${LSC_IMAGE_NAME}" || true
@@ -189,3 +189,6 @@ eval_output*/
 wip*/
 
 .history/
+
+# Used in e2e tests local testing
+.secrets
@@ -42,10 +42,10 @@ check-types: ## Checks type hints in sources
 	uv run mypy --explicit-package-bases --disallow-untyped-calls --disallow-untyped-defs --disallow-incomplete-defs src/ lsc_agent_eval/src/ tests
 
 black-check:
-	uv run black . --check
+	uv run black src tests script lsc_agent_eval --check
 
 black-format:
-	uv run black .
+	uv run black src tests script lsc_agent_eval
 
 requirements.txt:	pyproject.toml uv.lock ## Generate requirements.txt file containing hashes for all non-devel packages
 	uv export --no-dev --format requirements-txt --output-file requirements.txt
@@ -82,7 +82,7 @@ docstyle:
 	uv run pydocstyle -v src tests script lsc_agent_eval
 
 ruff:
-	uv run ruff check .
+	uv run ruff check src tests script lsc_agent_eval
 
 bandit: ## Security scanning with Bandit
 	uv run bandit -r src/lightspeed_evaluation -ll
@@ -142,26 +142,20 @@ lightspeed-eval --system-config config/system_api_disabled.yaml --eval-data conf
 
 ### Custom Metrics with GEval (from DeepEval)
 
-GEval allows us to define custom evaluation metrics using natural language criteria. Define metrics in `system.yaml` under `metrics_metadata`:
+Define custom evaluation metrics in `system.yaml` under `metrics_metadata`. **Criteria** is required; **evaluation_steps** and **rubrics** are optional. Score is 0–1.
 
 ```yaml
 metrics_metadata:
   turn_level:
     "geval:custom_metric_name":
       criteria: |
-        Specific criteria for the evaluation.
-      evaluation_params:
-        - query
-        - response
-        - expected_response  # optional
-      evaluation_steps:
-        - "Step 1: Check if..."
-        - "Step 2: Verify that..."
+        What to evaluate (required).
+      evaluation_params: [query, response, expected_response]
       threshold: 0.7
       description: "Metric description"
 ```
 
-See sample [`system config`](config/system.yaml) for complete examples of `geval:technical_accuracy` and `geval:conversation_coherence`.
+See [Configuration → Metrics](docs/configuration.md#metrics) for GEval options (evaluation_steps, rubrics) and [config/system.yaml](config/system.yaml) for full examples.
 
 ## ⚙️ Configuration
 
 
@@ -58,6 +58,9 @@ api:
   # Legacy authentication (fallback when mcp_headers.enabled is false)
   # Authentication via API_KEY environment variable only for MCP server
 
+  # Retry configuration for 429 Too Many Requests API errors
+  num_retries: 3  # Number of retry attempts (default 3)
+
 # Default metrics metadata
 metrics_metadata:
   # Turn-level metrics metadata
@@ -130,9 +133,9 @@ metrics_metadata:
       distance_measure: "levenshtein"  # Options: levenshtein, hamming, jaro, jaro_winkler
       default: false  # Use custom:answer_correctness for semantic comparison instead
 
-    # GEval turn-level metrics 
+    # GEval turn-level metrics (criteria = required; evaluation_steps, rubrics = optional)
     "geval:technical_accuracy":
-      criteria: |
+      criteria: |  # required
         Assess whether the response provides technically accurate information,
         commands, code, syntax, and follows relevant industry or
         domain-specific best practices. The response should
@@ -141,12 +144,19 @@ metrics_metadata:
         - query
         - response
         - expected_response
-      evaluation_steps:
+      evaluation_steps:  # optional: how to evaluate; if omitted, GEval generates from criteria
         - "Verify that the provided syntax (e.g., code, commands, configuration) is valid and follows the language/tool's formatting rules."
         - "Check if the response uses appropriate modules, functions, libraries, or parameters for the given task."
         - "Assess whether the solution aligns with relevant official documentation or established best practices for the specific domain."
         - "Verify the response directly and accurately addresses the user's specific query or task."
         - "Check for potential security issues, significant inefficiencies, or anti-patterns."
+      # rubrics:  # optional: score ranges 0-10, non-overlapping, but final score is 0-1; same style as evaluation_steps
+      #   - score_range: [0, 3]
+      #     expected_outcome: "Incorrect or invalid."
+      #   - score_range: [4, 7]
+      #     expected_outcome: "Partially correct or has issues."
+      #   - score_range: [8, 10]
+      #     expected_outcome: "Technically correct and follows best practices."
       threshold: 0.7
       description: "General technical accuracy of provided commands, code, or technical information"
 
@@ -166,16 +176,16 @@ metrics_metadata:
       threshold: 0.7
       description: "How well the model retains information from previous turns"
 
-    # GEval conversation-level metrics
+    # GEval conversation-level metrics (criteria = required; evaluation_steps, rubrics = optional)
     "geval:conversation_coherence":
-      criteria: |
+      criteria: |  # required
         Evaluate whether the conversation maintains context and provides coherent
         responses across multiple turns. The assistant should reference previous
         exchanges and build upon earlier context.
       evaluation_params:
         - query
         - response
-      evaluation_steps:
+      evaluation_steps:  # optional
         - "Check if the assistant remembers information from previous turns"
         - "Verify responses build logically on previous context"
         - "Assess whether the conversation flows naturally"