Scale Test Infrastructure #122
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| name: Scale Test Infrastructure | |
| on: | |
| # Nightly scheduled runs - all scenarios | |
| schedule: | |
| - cron: '0 2 * * *' # 2 AM UTC daily | |
| workflow_dispatch: | |
| inputs: | |
| action: | |
| description: 'Action to perform' | |
| required: true | |
| type: choice | |
| options: | |
| - plan | |
| - apply | |
| - destroy | |
| scale_multiplier: | |
| description: 'Resource multiplier (1=175, 10=1740, 50=8700 resources)' | |
| required: true | |
| type: choice | |
| default: '1' | |
| options: | |
| - '1' | |
| - '5' | |
| - '10' | |
| - '25' | |
| - '50' | |
| scenario: | |
| description: 'Test scenario (triggers specific risks in Overmind)' | |
| required: false | |
| type: choice | |
| default: 'none' | |
| options: | |
| - 'none' | |
| - 'run_all_nightly' # Runs all 7 AWS scenarios at 25x (like scheduled nightly) | |
| # AWS scenarios | |
| - 'lambda_timeout' | |
| - 'shared_sg_open' | |
| - 'vpc_peering_change' | |
| - 'central_sns_change' | |
| - 'combined_network' | |
| - 'combined_all' | |
| - 'combined_max' | |
| # AWS KMS orphan detection | |
| - 'kms_orphan_simulation' | |
| # GCP scenarios (requires cloud_provider=gcp or both) | |
| - 'shared_firewall_open' | |
| - 'central_pubsub_change' | |
| - 'gce_downgrade' | |
| - 'function_timeout' | |
| - 'combined_gcp_all' | |
| cloud_provider: | |
| description: 'Cloud provider to deploy' | |
| required: true | |
| type: choice | |
| default: 'aws' | |
| options: | |
| - 'aws' | |
| - 'gcp' | |
| - 'both' | |
| confirmation: | |
| description: 'For destroy: type DESTROY-SCALE-TEST to confirm' | |
| required: false | |
| type: string | |
| env: | |
| WORKING_DIR: scale-test | |
| jobs: | |
| # ========================================================================= | |
| # Nightly Scheduled Runs - All AWS Scenarios | |
| # ========================================================================= | |
| nightly-scenarios: | |
| name: Nightly - ${{ matrix.scenario }} @ 25x | |
| if: github.event_name == 'schedule' || inputs.scenario == 'run_all_nightly' | |
| runs-on: ubuntu-latest | |
| permissions: | |
| contents: read | |
| id-token: write | |
| concurrency: | |
| group: scale-test-tfstate-${{ matrix.scenario }} | |
| cancel-in-progress: false | |
| strategy: | |
| fail-fast: false # Run all scenarios even if one fails | |
| max-parallel: 1 # Run sequentially - shared Terraform state + unique change per scenario | |
| matrix: | |
| scenario: | |
| - shared_sg_open | |
| - lambda_timeout | |
| - vpc_peering_change | |
| - central_sns_change | |
| - combined_network | |
| - combined_all | |
| - kms_orphan_simulation | |
| env: | |
| TF_VAR_scale_multiplier: 25 | |
| TF_VAR_scenario: ${{ matrix.scenario }} | |
| TF_VAR_cloud_provider: aws | |
| defaults: | |
| run: | |
| working-directory: scale-test | |
| steps: | |
| - name: Checkout repository | |
| uses: actions/checkout@v4 | |
| - name: Setup Terraform | |
| uses: hashicorp/setup-terraform@v3 | |
| with: | |
| terraform_wrapper: false | |
| - name: Configure AWS Credentials | |
| uses: aws-actions/configure-aws-credentials@v4 | |
| with: | |
| audience: sts.amazonaws.com | |
| aws-region: us-east-1 | |
| role-to-assume: ${{ vars.TERRAFORM_DEPLOY_ROLE }} | |
| - name: Configure GCP Credentials | |
| uses: google-github-actions/auth@v2 | |
| with: | |
| credentials_json: ${{ secrets.OVERMIND_SCALE_TEST }} | |
| - name: Setup GCP SDK | |
| uses: google-github-actions/setup-gcloud@v2 | |
| with: | |
| project_id: overmind-scale-test | |
| - name: Terraform Init | |
| run: terraform init -input=false | |
| - name: Terraform Plan | |
| run: | | |
| echo "Planning scenario: ${{ matrix.scenario }}" | |
| terraform plan \ | |
| -compact-warnings \ | |
| -no-color \ | |
| -input=false \ | |
| -lock-timeout=5m \ | |
| -parallelism=100 \ | |
| -out=tfplan | |
| terraform show -json tfplan > tfplan.json | |
| - name: Install Overmind CLI | |
| uses: overmindtech/actions/install-cli@main | |
| continue-on-error: true | |
| with: | |
| version: latest | |
| github-token: ${{ secrets.GITHUB_TOKEN }} | |
| - name: Add Overmind CLI to PATH | |
| working-directory: ${{ github.workspace }} | |
| run: | | |
| if command -v overmind &> /dev/null; then | |
| overmind --version | |
| elif [ -f "$GITHUB_WORKSPACE/overmindtech/overmind" ]; then | |
| chmod +x "$GITHUB_WORKSPACE/overmindtech/overmind" | |
| echo "$GITHUB_WORKSPACE/overmindtech" >> $GITHUB_PATH | |
| "$GITHUB_WORKSPACE/overmindtech/overmind" --version | |
| else | |
| echo "::error::Overmind CLI not found" | |
| exit 1 | |
| fi | |
| - name: Record Overmind start time | |
| id: overmind-start | |
| run: echo "start_time=$(date +%s%3N)" >> $GITHUB_OUTPUT | |
| - name: Submit Plan to Overmind | |
| id: submit-plan | |
| continue-on-error: true | |
| env: | |
| OVM_API_KEY: ${{ secrets.OVM_API_KEY }} | |
| run: | | |
| # Use unique ticket link per scenario to create separate changes | |
| TICKET_LINK="${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}?scenario=${{ matrix.scenario }}" | |
| echo "Submitting plan with unique ticket link: $TICKET_LINK" | |
| # Capture output and extract the URL (should be the last line or contain https://) | |
| overmind changes submit-plan \ | |
| --title "Scale Test - ${{ matrix.scenario }} @ 25x" \ | |
| --description "Nightly scale test for scenario: ${{ matrix.scenario }}" \ | |
| --ticket-link "$TICKET_LINK" \ | |
| --tags "model=risks_v6,scenario=${{ matrix.scenario }}" \ | |
| tfplan.json 2>&1 | tee /tmp/submit-output.txt | |
| # Extract the change URL (line containing https://app.overmind.tech/changes/) | |
| CHANGE_URL=$(grep -oE 'https://app\.overmind\.tech/changes/[a-f0-9-]+' /tmp/submit-output.txt | head -1) | |
| if [ -z "$CHANGE_URL" ]; then | |
| echo "::error::Could not extract change URL from submit-plan output" | |
| cat /tmp/submit-output.txt | |
| exit 1 | |
| fi | |
| echo "change-url=$CHANGE_URL" >> $GITHUB_OUTPUT | |
| echo "Submitted change: $CHANGE_URL" | |
| # Wait for change analysis to complete (fetch the change to trigger wait) | |
| echo "Waiting for change analysis to complete..." | |
| overmind changes get-change \ | |
| --change "$CHANGE_URL" \ | |
| --format markdown \ | |
| > /tmp/change-summary.md | |
| echo "Change analysis complete" | |
| - name: Get change results as JSON | |
| id: get-results | |
| if: steps.submit-plan.outputs.change-url != '' | |
| continue-on-error: true | |
| env: | |
| OVM_API_KEY: ${{ secrets.OVM_API_KEY }} | |
| run: | | |
| echo "Fetching change results from: ${{ steps.submit-plan.outputs.change-url }}" | |
| overmind --version | |
| overmind changes get-change \ | |
| --change "${{ steps.submit-plan.outputs.change-url }}" \ | |
| --format json \ | |
| > change-results.json | |
| # Calculate duration | |
| END_TIME=$(date +%s%3N) | |
| START_TIME=${{ steps.overmind-start.outputs.start_time }} | |
| DURATION_MS=$((END_TIME - START_TIME)) | |
| echo "overmind_duration_ms=$DURATION_MS" >> $GITHUB_OUTPUT | |
| RISK_COUNT=$(jq '.risks | length // 0' change-results.json) | |
| HIGH_RISK_COUNT=$(jq '[.risks[]? | select(.severity == "high" or .severity == "critical")] | length' change-results.json) | |
| MEDIUM_RISK_COUNT=$(jq '[.risks[]? | select(.severity == "medium")] | length' change-results.json) | |
| # Extract blast radius and discovery metrics | |
| BLAST_RADIUS_NODES=$(jq '.change.metadata.numAffectedItems // 0' change-results.json) | |
| BLAST_RADIUS_EDGES=$(jq '.change.metadata.numAffectedEdges // 0' change-results.json) | |
| OBSERVATIONS=$(jq '.change.metadata.total_observations // 0' change-results.json) | |
| HYPOTHESES=$(jq '.hypotheses | length // 0' change-results.json) | |
| echo "risk_count=$RISK_COUNT" >> $GITHUB_OUTPUT | |
| echo "high_risk_count=$HIGH_RISK_COUNT" >> $GITHUB_OUTPUT | |
| echo "medium_risk_count=$MEDIUM_RISK_COUNT" >> $GITHUB_OUTPUT | |
| echo "blast_radius_nodes=$BLAST_RADIUS_NODES" >> $GITHUB_OUTPUT | |
| echo "blast_radius_edges=$BLAST_RADIUS_EDGES" >> $GITHUB_OUTPUT | |
| echo "observations=$OBSERVATIONS" >> $GITHUB_OUTPUT | |
| echo "hypotheses=$HYPOTHESES" >> $GITHUB_OUTPUT | |
| echo "## Nightly Scenario: ${{ matrix.scenario }}" >> $GITHUB_STEP_SUMMARY | |
| echo "- **Total Risks:** $RISK_COUNT" >> $GITHUB_STEP_SUMMARY | |
| echo "- **High/Critical:** $HIGH_RISK_COUNT" >> $GITHUB_STEP_SUMMARY | |
| echo "- **Duration:** $((DURATION_MS / 1000))s" >> $GITHUB_STEP_SUMMARY | |
| echo "- **Blast Radius:** $BLAST_RADIUS_NODES nodes, $BLAST_RADIUS_EDGES edges" >> $GITHUB_STEP_SUMMARY | |
| echo "- **Observations:** $OBSERVATIONS" >> $GITHUB_STEP_SUMMARY | |
| echo "- **Hypotheses:** $HYPOTHESES" >> $GITHUB_STEP_SUMMARY | |
| # ========================================================================= | |
| # PromptFoo Quality Evals (Nightly) | |
| # ========================================================================= | |
| - name: Setup Node.js for PromptFoo | |
| uses: actions/setup-node@v4 | |
| if: ${{ always() && steps.get-results.outcome == 'success' }} | |
| with: | |
| node-version: '20' | |
| - name: Install PromptFoo dependencies | |
| if: ${{ always() && steps.get-results.outcome == 'success' }} | |
| working-directory: scale-test/evals | |
| run: npm install | |
| - name: Run PromptFoo quality evals | |
| id: promptfoo-eval | |
| if: ${{ always() && steps.get-results.outcome == 'success' }} | |
| continue-on-error: true | |
| working-directory: scale-test/evals | |
| env: | |
| OPENAI_API_KEY: ${{ secrets.OPENAI_KEY }} | |
| CHANGE_RESULTS_PATH: ${{ github.workspace }}/scale-test/change-results.json | |
| run: | | |
| echo "Running PromptFoo evals for scenario: ${{ matrix.scenario }}" | |
| # Extract data for eval | |
| RISK_COUNT=$(jq '.risks | length // 0' $CHANGE_RESULTS_PATH) | |
| HIGH_RISK_COUNT=$(jq '[.risks[]? | select(.severity == "high" or .severity == "critical")] | length' $CHANGE_RESULTS_PATH) | |
| RISKS_JSON=$(jq -c '.risks // []' $CHANGE_RESULTS_PATH) | |
| # Run evals | |
| npx promptfoo eval \ | |
| --var "scenario=${{ matrix.scenario }}" \ | |
| --var "risk_count=$RISK_COUNT" \ | |
| --var "high_risk_count=$HIGH_RISK_COUNT" \ | |
| --var "risks_json=$RISKS_JSON" \ | |
| --filter-pattern "${{ matrix.scenario }}" \ | |
| --output eval-results.json | |
| # Parse results for summary | |
| if [ -f "eval-results.json" ]; then | |
| PASS_COUNT=$(jq '[.results[]?.success] | map(select(. == true)) | length' eval-results.json) | |
| TOTAL_COUNT=$(jq '[.results[]?.success] | length' eval-results.json) | |
| echo "eval_pass_count=$PASS_COUNT" >> $GITHUB_OUTPUT | |
| echo "eval_fail_count=$((TOTAL_COUNT - PASS_COUNT))" >> $GITHUB_OUTPUT | |
| echo "" >> $GITHUB_STEP_SUMMARY | |
| echo "### PromptFoo Quality Eval" >> $GITHUB_STEP_SUMMARY | |
| echo "- **Passed:** $PASS_COUNT / $TOTAL_COUNT assertions" >> $GITHUB_STEP_SUMMARY | |
| fi | |
| - name: Send results to dashboard | |
| if: always() && steps.get-results.outcome == 'success' | |
| continue-on-error: true | |
| env: | |
| DASHBOARD_URL: ${{ secrets.SCALE_DASHBOARD_URL }} | |
| DASHBOARD_API_KEY: ${{ secrets.SCALE_DASHBOARD_API_KEY }} | |
| run: | | |
| if [ -z "$DASHBOARD_URL" ] || [ -z "$DASHBOARD_API_KEY" ]; then | |
| echo "Dashboard not configured" | |
| exit 0 | |
| fi | |
| # Get eval results if available | |
| EVAL_PASS_COUNT=${{ steps.promptfoo-eval.outputs.eval_pass_count || 0 }} | |
| EVAL_FAIL_COUNT=${{ steps.promptfoo-eval.outputs.eval_fail_count || 0 }} | |
| TOTAL=$((EVAL_PASS_COUNT + EVAL_FAIL_COUNT)) | |
| EVAL_SCORE=0 | |
| if [ "$TOTAL" -gt 0 ]; then | |
| EVAL_SCORE=$(echo "scale=2; $EVAL_PASS_COUNT / $TOTAL" | bc) | |
| fi | |
| cat <<EOF > /tmp/payload.json | |
| { | |
| "runId": "${{ github.run_id }}-${{ matrix.scenario }}", | |
| "scenario": "${{ matrix.scenario }}", | |
| "cloudProvider": "aws", | |
| "scaleMultiplier": 25, | |
| "overmindDurationMs": ${{ steps.get-results.outputs.overmind_duration_ms || 0 }}, | |
| "riskCount": ${{ steps.get-results.outputs.risk_count || 0 }}, | |
| "highRiskCount": ${{ steps.get-results.outputs.high_risk_count || 0 }}, | |
| "mediumRiskCount": ${{ steps.get-results.outputs.medium_risk_count || 0 }}, | |
| "blastRadiusNodes": ${{ steps.get-results.outputs.blast_radius_nodes || 0 }}, | |
| "blastRadiusEdges": ${{ steps.get-results.outputs.blast_radius_edges || 0 }}, | |
| "observations": ${{ steps.get-results.outputs.observations || 0 }}, | |
| "hypotheses": ${{ steps.get-results.outputs.hypotheses || 0 }}, | |
| "validationPassed": true, | |
| "evalPassCount": $EVAL_PASS_COUNT, | |
| "evalFailCount": $EVAL_FAIL_COUNT, | |
| "evalScore": $EVAL_SCORE, | |
| "workflowRunUrl": "${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" | |
| } | |
| EOF | |
| curl -s -X POST "$DASHBOARD_URL/api/results" \ | |
| -H "Authorization: Bearer $DASHBOARD_API_KEY" \ | |
| -H "Content-Type: application/json" \ | |
| -d @/tmp/payload.json | |
| # ========================================================================= | |
| # Manual Runs - Validate Inputs | |
| # ========================================================================= | |
| validate: | |
| name: Validate Inputs | |
| if: github.event_name == 'workflow_dispatch' | |
| runs-on: ubuntu-latest | |
| steps: | |
| - name: Validate destroy confirmation | |
| if: inputs.action == 'destroy' | |
| run: | | |
| if [ "${{ inputs.confirmation }}" != "DESTROY-SCALE-TEST" ]; then | |
| echo "::error::Destroy requires confirmation. Type 'DESTROY-SCALE-TEST' in the confirmation field." | |
| exit 1 | |
| fi | |
| echo "Destroy confirmation validated" | |
| # ========================================================================= | |
| # Manual Runs - Terraform | |
| # ========================================================================= | |
| terraform: | |
| name: Terraform ${{ inputs.action }} (×${{ inputs.scale_multiplier }}, scenario=${{ inputs.scenario }}) | |
| if: github.event_name == 'workflow_dispatch' | |
| needs: validate | |
| runs-on: ubuntu-latest | |
| permissions: | |
| contents: read | |
| id-token: write | |
| pull-requests: write | |
| concurrency: | |
| group: scale-test-tfstate | |
| cancel-in-progress: false | |
| env: | |
| TF_VAR_scale_multiplier: ${{ inputs.scale_multiplier }} | |
| TF_VAR_scenario: ${{ inputs.scenario }} | |
| TF_VAR_cloud_provider: ${{ inputs.cloud_provider }} | |
| defaults: | |
| run: | |
| working-directory: ${{ env.WORKING_DIR }} | |
| steps: | |
| - name: Checkout repository | |
| uses: actions/checkout@v4 | |
| - name: Setup Terraform | |
| uses: hashicorp/setup-terraform@v3 | |
| with: | |
| terraform_wrapper: false | |
| - name: Configure AWS Credentials | |
| uses: aws-actions/configure-aws-credentials@v4 | |
| with: | |
| audience: sts.amazonaws.com | |
| aws-region: us-east-1 | |
| role-to-assume: ${{ vars.TERRAFORM_DEPLOY_ROLE }} | |
| # Always authenticate with GCP - Terraform validates all providers during init | |
| # even when cloud_provider=aws (no GCP resources created) | |
| - name: Configure GCP Credentials | |
| uses: google-github-actions/auth@v2 | |
| with: | |
| credentials_json: ${{ secrets.OVERMIND_SCALE_TEST }} | |
| - name: Setup GCP SDK | |
| uses: google-github-actions/setup-gcloud@v2 | |
| with: | |
| project_id: overmind-scale-test | |
| - name: Terraform Init | |
| id: init | |
| run: | | |
| terraform version | |
| terraform init -input=false | |
| - name: Terraform Plan | |
| id: plan | |
| if: ${{ inputs.action == 'plan' || inputs.action == 'apply' }} | |
| run: | | |
| set -o pipefail | |
| echo "Planning with scale_multiplier=${{ inputs.scale_multiplier }}, scenario=${{ inputs.scenario }}" | |
| terraform plan \ | |
| -compact-warnings \ | |
| -no-color \ | |
| -input=false \ | |
| -lock-timeout=5m \ | |
| -parallelism=100 \ | |
| -out=tfplan 2>&1 | tee terraform_plan.log | |
| # Generate JSON plan for Overmind | |
| terraform show -json tfplan > tfplan.json | |
| # Output summary | |
| echo "## Terraform Plan Summary" >> $GITHUB_STEP_SUMMARY | |
| echo "- **Scale Multiplier:** ${{ inputs.scale_multiplier }}" >> $GITHUB_STEP_SUMMARY | |
| echo "- **Scenario:** ${{ inputs.scenario }}" >> $GITHUB_STEP_SUMMARY | |
| echo "- **Action:** ${{ inputs.action }}" >> $GITHUB_STEP_SUMMARY | |
| echo "" >> $GITHUB_STEP_SUMMARY | |
| echo "### Resource Changes" >> $GITHUB_STEP_SUMMARY | |
| terraform show -no-color tfplan | grep -E "^(Plan:|No changes)" >> $GITHUB_STEP_SUMMARY || true | |
| - name: Install Overmind CLI | |
| uses: overmindtech/actions/install-cli@main | |
| id: install-cli | |
| continue-on-error: true | |
| with: | |
| version: latest | |
| github-token: ${{ secrets.GITHUB_TOKEN }} | |
| - name: Add Overmind CLI to PATH | |
| id: verify-cli | |
| working-directory: ${{ github.workspace }} | |
| run: | | |
| # The install-cli action downloads to $GITHUB_WORKSPACE/overmindtech/overmind | |
| # We need to add it to PATH for subsequent steps | |
| if command -v overmind &> /dev/null; then | |
| echo "Overmind CLI already in PATH" | |
| overmind --version | |
| elif [ -f "$GITHUB_WORKSPACE/overmindtech/overmind" ]; then | |
| echo "Found CLI at $GITHUB_WORKSPACE/overmindtech/overmind, adding to PATH" | |
| chmod +x "$GITHUB_WORKSPACE/overmindtech/overmind" | |
| echo "$GITHUB_WORKSPACE/overmindtech" >> $GITHUB_PATH | |
| "$GITHUB_WORKSPACE/overmindtech/overmind" --version | |
| else | |
| echo "::error::Overmind CLI not found. Contents of workspace:" | |
| ls -la "$GITHUB_WORKSPACE/" | |
| exit 1 | |
| fi | |
| - name: Record Overmind start time | |
| id: overmind-start | |
| if: ${{ inputs.action == 'plan' || inputs.action == 'apply' }} | |
| run: echo "start_time=$(date +%s%3N)" >> $GITHUB_OUTPUT | |
| - name: Submit Plan to Overmind | |
| uses: overmindtech/actions/submit-plan@main | |
| continue-on-error: true | |
| id: submit-plan | |
| if: ${{ inputs.action == 'plan' || inputs.action == 'apply' }} | |
| with: | |
| ovm-api-key: ${{ secrets.OVM_API_KEY }} | |
| plan-json: ${{ env.WORKING_DIR }}/tfplan.json | |
| tags: 'model=risks_v6' | |
| # ========================================================================= | |
| # Quality Evaluation Steps | |
| # Capture analysis results and validate expected risks for each scenario | |
| # ========================================================================= | |
| - name: Get change results as JSON | |
| id: get-results | |
| if: ${{ steps.submit-plan.outputs.change-url != '' && (inputs.action == 'plan' || inputs.action == 'apply') }} | |
| continue-on-error: true | |
| env: | |
| OVM_API_KEY: ${{ secrets.OVM_API_KEY }} | |
| run: | | |
| echo "Fetching change results from: ${{ steps.submit-plan.outputs.change-url }}" | |
| # Show CLI version for debugging | |
| echo "Using Overmind CLI:" | |
| overmind --version | |
| # Get the full change analysis as JSON | |
| overmind changes get-change \ | |
| --change "${{ steps.submit-plan.outputs.change-url }}" \ | |
| --format json \ | |
| > change-results.json | |
| # Calculate Overmind duration | |
| END_TIME=$(date +%s%3N) | |
| START_TIME=${{ steps.overmind-start.outputs.start_time }} | |
| DURATION_MS=$((END_TIME - START_TIME)) | |
| echo "overmind_duration_ms=$DURATION_MS" >> $GITHUB_OUTPUT | |
| echo "Overmind analysis took ${DURATION_MS}ms" | |
| # Extract key metrics for assertions | |
| RISK_COUNT=$(jq '.risks | length // 0' change-results.json) | |
| HIGH_RISK_COUNT=$(jq '[.risks[]? | select(.severity == "high" or .severity == "critical")] | length' change-results.json) | |
| MEDIUM_RISK_COUNT=$(jq '[.risks[]? | select(.severity == "medium")] | length' change-results.json) | |
| # Extract blast radius and discovery metrics | |
| BLAST_RADIUS_NODES=$(jq '.change.metadata.numAffectedItems // 0' change-results.json) | |
| BLAST_RADIUS_EDGES=$(jq '.change.metadata.numAffectedEdges // 0' change-results.json) | |
| OBSERVATIONS=$(jq '.change.metadata.total_observations // 0' change-results.json) | |
| HYPOTHESES=$(jq '.hypotheses | length // 0' change-results.json) | |
| echo "risk_count=$RISK_COUNT" >> $GITHUB_OUTPUT | |
| echo "high_risk_count=$HIGH_RISK_COUNT" >> $GITHUB_OUTPUT | |
| echo "medium_risk_count=$MEDIUM_RISK_COUNT" >> $GITHUB_OUTPUT | |
| echo "blast_radius_nodes=$BLAST_RADIUS_NODES" >> $GITHUB_OUTPUT | |
| echo "blast_radius_edges=$BLAST_RADIUS_EDGES" >> $GITHUB_OUTPUT | |
| echo "observations=$OBSERVATIONS" >> $GITHUB_OUTPUT | |
| echo "hypotheses=$HYPOTHESES" >> $GITHUB_OUTPUT | |
| echo "## Change Analysis Results" >> $GITHUB_STEP_SUMMARY | |
| echo "- **Total Risks:** $RISK_COUNT" >> $GITHUB_STEP_SUMMARY | |
| echo "- **High/Critical Risks:** $HIGH_RISK_COUNT" >> $GITHUB_STEP_SUMMARY | |
| echo "- **Analysis Duration:** $((DURATION_MS / 1000))s" >> $GITHUB_STEP_SUMMARY | |
| echo "- **Medium Risks:** $MEDIUM_RISK_COUNT" >> $GITHUB_STEP_SUMMARY | |
| echo "- **Blast Radius:** $BLAST_RADIUS_NODES nodes, $BLAST_RADIUS_EDGES edges" >> $GITHUB_STEP_SUMMARY | |
| echo "- **Observations:** $OBSERVATIONS" >> $GITHUB_STEP_SUMMARY | |
| echo "- **Hypotheses:** $HYPOTHESES" >> $GITHUB_STEP_SUMMARY | |
| echo "" >> $GITHUB_STEP_SUMMARY | |
| # List all risks in summary | |
| echo "### Detected Risks" >> $GITHUB_STEP_SUMMARY | |
| if [ "$RISK_COUNT" -gt 0 ]; then | |
| jq -r '.risks[]? | "- **[\(.severity)]** \(.title)"' change-results.json >> $GITHUB_STEP_SUMMARY | |
| else | |
| echo "_No risks detected_" >> $GITHUB_STEP_SUMMARY | |
| fi | |
| # NOTE: Artifact upload disabled for public repo - change-results.json may contain | |
| # internal system prompts. Only tfplan.json is safe to upload if needed. | |
| # - name: Upload analysis results | |
| # uses: actions/upload-artifact@v4 | |
| # if: ${{ always() && (inputs.action == 'plan' || inputs.action == 'apply') }} | |
| # with: | |
| # name: change-analysis-${{ inputs.scenario }}-${{ github.run_id }} | |
| # path: ${{ env.WORKING_DIR }}/tfplan.json | |
| # if-no-files-found: ignore | |
| # ========================================================================= | |
| # PromptFoo Quality Evals | |
| # Run LLM-as-judge evaluation on the risk analysis quality | |
| # ========================================================================= | |
| - name: Setup Node.js for PromptFoo | |
| uses: actions/setup-node@v4 | |
| if: ${{ always() && steps.get-results.outcome == 'success' && inputs.scenario != 'none' }} | |
| with: | |
| node-version: '20' | |
| - name: Install PromptFoo dependencies | |
| if: ${{ always() && steps.get-results.outcome == 'success' && inputs.scenario != 'none' }} | |
| working-directory: ${{ env.WORKING_DIR }}/evals | |
| run: npm install | |
| - name: Run PromptFoo quality evals | |
| id: promptfoo-eval | |
| if: ${{ always() && steps.get-results.outcome == 'success' && inputs.scenario != 'none' }} | |
| continue-on-error: true | |
| working-directory: ${{ env.WORKING_DIR }}/evals | |
| env: | |
| OPENAI_API_KEY: ${{ secrets.OPENAI_KEY }} | |
| CHANGE_RESULTS_PATH: ${{ github.workspace }}/${{ env.WORKING_DIR }}/change-results.json | |
| run: | | |
| echo "Running PromptFoo evals for scenario: ${{ inputs.scenario }}" | |
| echo "Results file: $CHANGE_RESULTS_PATH" | |
| # Verify results file exists | |
| if [ ! -f "$CHANGE_RESULTS_PATH" ]; then | |
| echo "::warning::Change results file not found at $CHANGE_RESULTS_PATH" | |
| exit 0 | |
| fi | |
| # Extract metrics from change-results.json | |
| RISK_COUNT=$(jq '.risks | length // 0' "$CHANGE_RESULTS_PATH") | |
| HIGH_RISK_COUNT=$(jq '[.risks[]? | select(.severity == "high" or .severity == "critical")] | length' "$CHANGE_RESULTS_PATH") | |
| RISKS_JSON=$(jq -c '.risks // []' "$CHANGE_RESULTS_PATH") | |
| echo "Risk count: $RISK_COUNT, High risk count: $HIGH_RISK_COUNT" | |
| # Run evals with pre-extracted data | |
| # --filter-pattern filters tests by description regex | |
| npx promptfoo eval \ | |
| --var "scenario=${{ inputs.scenario }}" \ | |
| --var "risk_count=$RISK_COUNT" \ | |
| --var "high_risk_count=$HIGH_RISK_COUNT" \ | |
| --var "risks_json=$RISKS_JSON" \ | |
| --filter-pattern "${{ inputs.scenario }}" \ | |
| --output eval-results.json \ | |
| --no-cache | |
| # Extract summary metrics | |
| if [ -f eval-results.json ]; then | |
| PASS_COUNT=$(jq '[.results[].success] | map(select(. == true)) | length' eval-results.json) | |
| TOTAL_COUNT=$(jq '[.results[].success] | length' eval-results.json) | |
| echo "" >> $GITHUB_STEP_SUMMARY | |
| echo "### PromptFoo Quality Eval" >> $GITHUB_STEP_SUMMARY | |
| echo "- **Passed:** $PASS_COUNT / $TOTAL_COUNT assertions" >> $GITHUB_STEP_SUMMARY | |
| # Show any failures | |
| jq -r '.results[] | select(.success == false) | "- ❌ \(.description): \(.error // "assertion failed")"' eval-results.json >> $GITHUB_STEP_SUMMARY || true | |
| fi | |
| # NOTE: Artifact upload disabled for public repo - eval results may reference | |
| # internal system prompts from change-results.json | |
| # - name: Upload PromptFoo results | |
| # uses: actions/upload-artifact@v4 | |
| # if: ${{ always() && steps.promptfoo-eval.outcome != 'skipped' }} | |
| # with: | |
| # name: promptfoo-results-${{ inputs.scenario }}-${{ github.run_id }} | |
| # path: ${{ env.WORKING_DIR }}/evals/eval-results.json | |
| # if-no-files-found: ignore | |
| # ========================================================================= | |
| # Send Results to Dashboard | |
| # Posts metrics to private Vercel dashboard for trend analysis | |
| # ========================================================================= | |
| - name: Send results to dashboard | |
| id: send-dashboard | |
| if: ${{ always() && steps.get-results.outcome == 'success' && inputs.scenario != 'none' }} | |
| continue-on-error: true | |
| env: | |
| DASHBOARD_URL: ${{ secrets.SCALE_DASHBOARD_URL }} | |
| DASHBOARD_API_KEY: ${{ secrets.SCALE_DASHBOARD_API_KEY }} | |
| run: | | |
| # Skip if dashboard not configured | |
| if [ -z "$DASHBOARD_URL" ] || [ -z "$DASHBOARD_API_KEY" ]; then | |
| echo "Dashboard not configured, skipping..." | |
| exit 0 | |
| fi | |
| echo "Sending results to dashboard..." | |
| # Get eval results if available | |
| EVAL_PASS_COUNT=0 | |
| EVAL_FAIL_COUNT=0 | |
| EVAL_SCORE=0 | |
| if [ -f "evals/eval-results.json" ]; then | |
| EVAL_PASS_COUNT=$(jq '[.results[]?.success] | map(select(. == true)) | length' evals/eval-results.json 2>/dev/null || echo 0) | |
| EVAL_FAIL_COUNT=$(jq '[.results[]?.success] | map(select(. == false)) | length' evals/eval-results.json 2>/dev/null || echo 0) | |
| TOTAL=$(($EVAL_PASS_COUNT + $EVAL_FAIL_COUNT)) | |
| if [ "$TOTAL" -gt 0 ]; then | |
| EVAL_SCORE=$(echo "scale=2; $EVAL_PASS_COUNT / $TOTAL" | bc) | |
| fi | |
| fi | |
| # Build JSON payload | |
| cat <<EOF > /tmp/dashboard-payload.json | |
| { | |
| "runId": "${{ github.run_id }}", | |
| "scenario": "${{ inputs.scenario }}", | |
| "cloudProvider": "${{ inputs.cloud_provider }}", | |
| "scaleMultiplier": ${{ inputs.scale_multiplier }}, | |
| "overmindDurationMs": ${{ steps.get-results.outputs.overmind_duration_ms || 0 }}, | |
| "riskCount": ${{ steps.get-results.outputs.risk_count || 0 }}, | |
| "highRiskCount": ${{ steps.get-results.outputs.high_risk_count || 0 }}, | |
| "mediumRiskCount": ${{ steps.get-results.outputs.medium_risk_count || 0 }}, | |
| "blastRadiusNodes": ${{ steps.get-results.outputs.blast_radius_nodes || 0 }}, | |
| "blastRadiusEdges": ${{ steps.get-results.outputs.blast_radius_edges || 0 }}, | |
| "observations": ${{ steps.get-results.outputs.observations || 0 }}, | |
| "hypotheses": ${{ steps.get-results.outputs.hypotheses || 0 }}, | |
| "validationPassed": true, | |
| "evalPassCount": $EVAL_PASS_COUNT, | |
| "evalFailCount": $EVAL_FAIL_COUNT, | |
| "evalScore": $EVAL_SCORE, | |
| "workflowRunUrl": "${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" | |
| } | |
| EOF | |
| echo "Payload:" | |
| cat /tmp/dashboard-payload.json | |
| # Send to dashboard API | |
| RESPONSE=$(curl -s -w "\n%{http_code}" -X POST "$DASHBOARD_URL/api/results" \ | |
| -H "Authorization: Bearer $DASHBOARD_API_KEY" \ | |
| -H "Content-Type: application/json" \ | |
| -d @/tmp/dashboard-payload.json) | |
| HTTP_CODE=$(echo "$RESPONSE" | tail -n1) | |
| BODY=$(echo "$RESPONSE" | sed '$d') | |
| if [ "$HTTP_CODE" -ge 200 ] && [ "$HTTP_CODE" -lt 300 ]; then | |
| echo "✅ Results sent to dashboard successfully" | |
| echo "$BODY" | |
| else | |
| echo "⚠️ Failed to send results to dashboard (HTTP $HTTP_CODE)" | |
| echo "$BODY" | |
| # Don't fail the workflow for dashboard issues | |
| fi | |
| # Cost Analysis disabled for scale testing (plan too large) | |
| # - name: Cost Analysis | |
| # uses: overmindtech/cost-signals-action@v1 | |
| # continue-on-error: true | |
| # if: ${{ inputs.action == 'plan' || inputs.action == 'apply' }} | |
| # with: | |
| # overmind-api-key: ${{ secrets.OVM_API_KEY }} | |
| # infracost-api-key: ${{ secrets.INFRACOST_API_KEY }} | |
| # terraform-plan-json: ${{ env.WORKING_DIR }}/tfplan.json | |
| # ticket-link: ${{ steps.submit-plan.outputs.change-url }} | |
| - name: Start Overmind Change | |
| uses: overmindtech/actions/start-change@main | |
| continue-on-error: true | |
| if: ${{ inputs.action == 'apply' }} | |
| with: | |
| ovm-api-key: ${{ secrets.OVM_API_KEY }} | |
| # reauthenticate after potentially long running terraform plan or submit-plan steps as credentials may have expired | |
| - name: Configure AWS Credentials | |
| uses: aws-actions/configure-aws-credentials@v4 | |
| with: | |
| audience: sts.amazonaws.com | |
| aws-region: us-east-1 | |
| role-to-assume: ${{ vars.TERRAFORM_DEPLOY_ROLE }} | |
| - name: Configure GCP Credentials | |
| uses: google-github-actions/auth@v2 | |
| with: | |
| credentials_json: ${{ secrets.OVERMIND_SCALE_TEST }} | |
| - name: Terraform Apply | |
| id: apply | |
| if: ${{ inputs.action == 'apply' }} | |
| run: | | |
| echo "Applying scale test infrastructure (×${{ inputs.scale_multiplier }})" | |
| terraform apply \ | |
| -auto-approve \ | |
| -no-color \ | |
| -input=false \ | |
| -lock-timeout=5m \ | |
| -parallelism=100 \ | |
| tfplan | |
| echo "## Apply Complete" >> $GITHUB_STEP_SUMMARY | |
| echo "Scale test infrastructure deployed with multiplier ×${{ inputs.scale_multiplier }}" >> $GITHUB_STEP_SUMMARY | |
| - name: End Overmind Change | |
| uses: overmindtech/actions/end-change@main | |
| continue-on-error: true | |
| if: ${{ (inputs.action == 'apply') && (success() || failure() || cancelled()) }} | |
| with: | |
| ovm-api-key: ${{ secrets.OVM_API_KEY }} | |
| # reauthenticate after potentially long running end-change step as credentials may have expired | |
| - name: Configure AWS Credentials | |
| uses: aws-actions/configure-aws-credentials@v4 | |
| with: | |
| audience: sts.amazonaws.com | |
| aws-region: us-east-1 | |
| role-to-assume: ${{ vars.TERRAFORM_DEPLOY_ROLE }} | |
| - name: Configure GCP Credentials | |
| uses: google-github-actions/auth@v2 | |
| with: | |
| credentials_json: ${{ secrets.OVERMIND_SCALE_TEST }} | |
| - name: Terraform Destroy | |
| id: destroy | |
| if: ${{ inputs.action == 'destroy' }} | |
| run: | | |
| echo "::warning::Destroying scale test infrastructure (×${{ inputs.scale_multiplier }})" | |
| terraform destroy \ | |
| -auto-approve \ | |
| -no-color \ | |
| -input=false \ | |
| -parallelism=100 \ | |
| -lock-timeout=10m | |
| echo "## Destroy Complete" >> $GITHUB_STEP_SUMMARY | |
| echo "Scale test infrastructure destroyed" >> $GITHUB_STEP_SUMMARY | |
| - name: Output Terraform Summary | |
| if: always() | |
| run: | | |
| echo "" | |
| echo "=== Scale Test Summary ===" | |
| echo "Action: ${{ inputs.action }}" | |
| echo "Multiplier: ${{ inputs.scale_multiplier }}" | |
| echo "Scenario: ${{ inputs.scenario }}" | |
| echo "Status: ${{ job.status }}" |