# E2E Smoke Test #2
# This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
name: E2E Smoke Test

# Triggers: pushes and pull requests targeting main (skipped when a change
# only touches the ignored paths below), a recurring schedule, and manual
# dispatch from the Actions UI.
on:
  push:
    branches: [main]
    paths-ignore:
      - 'docs/**'
      - '*.md'
  pull_request:
    branches: [main]
    paths-ignore:
      - 'docs/**'
      - '*.md'
  schedule:
    # Run every 6 hours: at 00:00, 06:00, 12:00, and 18:00 UTC
    - cron: '0 */6 * * *'
  workflow_dispatch:  # Allow manual triggering
    inputs:
      # NOTE(review): no step visible in this workflow reads this input yet,
      # so toggling it currently has no effect.
      debug_mode:
        description: 'Enable debug output'
        required: false
        # A real boolean, matching `type: boolean` below.
        default: false
        type: boolean
jobs:
  # Runs the tau-bench airline smoke evaluation and gates the job on the
  # aggregated success rate (agg_score) landing inside a 40%-90% window.
  e2e-smoke-test:
    name: E2E Smoke Test
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - name: Set up Python 3.12
        uses: actions/setup-python@v5
        with:
          python-version: '3.12'

      - name: Install uv
        uses: astral-sh/setup-uv@v6
        with:
          enable-cache: true

      - name: Install the project
        run: uv sync --locked --all-extras --dev

      # NOTE(review): this installs from the moving `main` branch; pin to a
      # tag or commit SHA if reproducible runs are required.
      - name: Install tau2 for testing
        run: uv pip install git+https://github.com/sierra-research/tau2-bench.git@main

      # Runs pytest, then publishes step outputs consumed by the validation
      # step: test_exit_code, summary_found, success_rate, lower_bound_met,
      # upper_bound_met, threshold_met.
      - name: Run E2E Smoke Test
        id: run_test
        env:
          FIREWORKS_API_KEY: ${{ secrets.FIREWORKS_API_KEY }}
          FIREWORKS_ACCOUNT_ID: ${{ secrets.FIREWORKS_ACCOUNT_ID }}
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          PYTHONWARNINGS: 'ignore::DeprecationWarning,ignore::RuntimeWarning'
        run: |
          echo "Running e2e smoke test..."

          # Run the test and capture its exit code. Errors are tolerated here
          # on purpose; the validation step decides whether the job fails.
          set +e  # Don't exit on failure
          uv run pytest tests/test_tau_bench_airline_smoke.py::test_tau_bench_airline_smoke_evaluation \
            -v --tb=short --durations=10 \
            --ep-print-summary \
            --ep-summary-json=ep_summary.json
          TEST_EXIT_CODE=$?
          echo "test_exit_code=$TEST_EXIT_CODE" >> "$GITHUB_OUTPUT"

          # List generated files for debugging
          echo "📁 Generated files:"
          ls -la *.json 2>/dev/null || echo "No JSON files found"
          ls -la ep_summary* 2>/dev/null || echo "No ep_summary files found"

          # EP might generate files with different names; use the first
          # regular file matching ep_summary*.json.
          EP_SUMMARY_FILE=""
          for file in ep_summary*.json; do
            if [ -f "$file" ]; then
              EP_SUMMARY_FILE="$file"
              break
            fi
          done

          if [ -n "$EP_SUMMARY_FILE" ]; then
            echo "summary_found=1" >> "$GITHUB_OUTPUT"
            echo "EP Summary found: $EP_SUMMARY_FILE, parsing..."

            # Log the full summary for debugging (raw if jq cannot parse it)
            echo "EP Summary contents:"
            jq . "$EP_SUMMARY_FILE" 2>/dev/null || cat "$EP_SUMMARY_FILE"

            # 'agg_score' is the aggregated success rate; fall back to 0 when
            # the key is missing or the file cannot be parsed.
            SUCCESS_RATE=$(jq -r '.agg_score // 0' "$EP_SUMMARY_FILE" 2>/dev/null || echo "0")
            echo "success_rate=$SUCCESS_RATE" >> "$GITHUB_OUTPUT"

            # Check if success rate meets thresholds (40% - 90% acceptable range)
            LOWER_BOUND=0.4  # 40%
            UPPER_BOUND=0.9  # 90%
            LOWER_BOUND_MET=$(echo "$SUCCESS_RATE >= $LOWER_BOUND" | bc -l)
            UPPER_BOUND_MET=$(echo "$SUCCESS_RATE <= $UPPER_BOUND" | bc -l)
            # bc prints nothing when it cannot parse the score; count that as
            # "bound not met" instead of publishing an empty output.
            LOWER_BOUND_MET=${LOWER_BOUND_MET:-0}
            UPPER_BOUND_MET=${UPPER_BOUND_MET:-0}
            if [ "$LOWER_BOUND_MET" = "1" ] && [ "$UPPER_BOUND_MET" = "1" ]; then
              THRESHOLD_MET=1
            else
              THRESHOLD_MET=0
            fi
            echo "lower_bound_met=$LOWER_BOUND_MET" >> "$GITHUB_OUTPUT"
            echo "upper_bound_met=$UPPER_BOUND_MET" >> "$GITHUB_OUTPUT"
            echo "threshold_met=$THRESHOLD_MET" >> "$GITHUB_OUTPUT"

            # Extract additional info for display
            NUM_ROWS=$(jq -r '.rows // 0' "$EP_SUMMARY_FILE" 2>/dev/null || echo "0")
            NUM_RUNS=$(jq -r '.num_runs // 0' "$EP_SUMMARY_FILE" 2>/dev/null || echo "0")

            echo "📊 Evaluation Summary:"
            echo " - Success rate (agg_score): $(echo "$SUCCESS_RATE * 100" | bc -l)%"
            echo " - Dataset rows evaluated: $NUM_ROWS"
            echo " - Number of runs: $NUM_RUNS"
            echo " - Lower bound (≥40%) met: $([ "$LOWER_BOUND_MET" = "1" ] && echo "✅ YES" || echo "❌ NO")"
            echo " - Upper bound (≤90%) met: $([ "$UPPER_BOUND_MET" = "1" ] && echo "✅ YES" || echo "❌ NO")"
            echo " - Within acceptable range: $([ "$THRESHOLD_MET" = "1" ] && echo "✅ YES" || echo "❌ NO")"
          else
            echo "❌ No EP summary file found"
            # Publish every output so the validation step can tell a missing
            # summary apart from a genuinely low success rate.
            echo "summary_found=0" >> "$GITHUB_OUTPUT"
            echo "lower_bound_met=0" >> "$GITHUB_OUTPUT"
            echo "upper_bound_met=0" >> "$GITHUB_OUTPUT"
            echo "threshold_met=0" >> "$GITHUB_OUTPUT"
            echo "success_rate=0" >> "$GITHUB_OUTPUT"
          fi

      - name: Upload test results
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: e2e-smoke-test-results-${{ github.run_number }}
          path: |
            ep_summary*.json
            *.log
          retention-days: 7

      # Turns the outputs of the run_test step into the job's pass/fail result.
      - name: Validate test results
        if: always()
        # Step outputs are passed through the environment rather than being
        # interpolated into the script text, so their contents are never
        # parsed as shell code.
        env:
          TEST_EXIT_CODE: ${{ steps.run_test.outputs.test_exit_code }}
          SUMMARY_FOUND: ${{ steps.run_test.outputs.summary_found }}
          THRESHOLD_MET: ${{ steps.run_test.outputs.threshold_met }}
          LOWER_BOUND_MET: ${{ steps.run_test.outputs.lower_bound_met }}
          UPPER_BOUND_MET: ${{ steps.run_test.outputs.upper_bound_met }}
          SUCCESS_RATE: ${{ steps.run_test.outputs.success_rate }}
        run: |
          echo "Validating test results against thresholds..."
          echo "Test exit code: $TEST_EXIT_CODE"
          echo "Threshold met (40%-90%): $THRESHOLD_MET"
          echo "Lower bound met (≥40%): $LOWER_BOUND_MET"
          echo "Upper bound met (≤90%): $UPPER_BOUND_MET"
          echo "Success rate: $SUCCESS_RATE"

          # The success-rate window is the gate: a non-zero pytest exit code
          # alone only produces a warning when the thresholds were met.
          if [ "$THRESHOLD_MET" = "1" ]; then
            if [ "$TEST_EXIT_CODE" != "0" ]; then
              echo "⚠️ E2E smoke test had test execution issues but may have met thresholds"
              echo " - Test exit code: $TEST_EXIT_CODE"
              echo " - Thresholds met: $THRESHOLD_MET"
              echo "✅ Thresholds met despite execution issues - considering this a pass"
            else
              echo "✅ E2E smoke test PASSED"
              echo " - Success rate: ${SUCCESS_RATE:-unknown}"
              echo " - Within acceptable range: 40%-90%"
            fi
            exit 0
          fi

          # From here on the thresholds were not met, so the job fails; the
          # branches below only choose the most accurate explanation.
          if [ "$TEST_EXIT_CODE" != "0" ]; then
            echo "❌ E2E smoke test FAILED"
            echo " - Test execution failed (exit code: ${TEST_EXIT_CODE:-unknown})"
            if [ "$SUMMARY_FOUND" != "1" ]; then
              echo " - No EP summary file was produced, so no success rate is available"
            else
              echo " - Success rate outside acceptable range (required: 40%-90%, actual: ${SUCCESS_RATE:-unknown})"
            fi
          elif [ "$SUMMARY_FOUND" != "1" ]; then
            echo "❌ E2E smoke test FAILED - no EP summary file was produced"
            echo " - Success rate could not be evaluated"
          elif [ "$LOWER_BOUND_MET" != "1" ]; then
            echo "❌ E2E smoke test FAILED - success rate too low"
            echo " - Success rate: ${SUCCESS_RATE:-unknown}"
            echo " - Required: ≥40%"
          elif [ "$UPPER_BOUND_MET" != "1" ]; then
            echo "❌ E2E smoke test FAILED - success rate suspiciously high"
            echo " - Success rate: ${SUCCESS_RATE:-unknown}"
            echo " - Maximum expected: ≤90%"
            echo " - This may indicate test issues or unrealistic performance"
          else
            echo "❌ E2E smoke test FAILED - success rate outside acceptable range"
            echo " - Success rate: ${SUCCESS_RATE:-unknown}"
            echo " - Required range: 40%-90%"
          fi
          exit 1