Scale Test Infrastructure #122

Workflow file for this run

.github/workflows/scale-test.yml at 9e0f91a

	# Scale-test workflow: runs on a nightly cron schedule and on manual dispatch.
	name: Scale Test Infrastructure

	on:
	# Nightly scheduled runs - all scenarios
	schedule:
	- cron: '0 2 * * *' # 2 AM UTC daily

	# Manual runs - the inputs below select the Terraform action, the scale
	# multiplier, the scenario, and the cloud provider.
	workflow_dispatch:
	inputs:
	action:
	description: 'Action to perform'
	required: true
	type: choice
	options:
	- plan
	- apply
	- destroy
	scale_multiplier:
	description: 'Resource multiplier (1=175, 10=1740, 50=8700 resources)'
	required: true
	type: choice
	default: '1'
	options:
	- '1'
	- '5'
	- '10'
	- '25'
	- '50'
	scenario:
	description: 'Test scenario (triggers specific risks in Overmind)'
	required: false
	type: choice
	default: 'none'
	options:
	- 'none'
	- 'run_all_nightly' # Runs all 7 AWS scenarios at 25x (like scheduled nightly)
	# AWS scenarios
	- 'lambda_timeout'
	- 'shared_sg_open'
	- 'vpc_peering_change'
	- 'central_sns_change'
	- 'combined_network'
	- 'combined_all'
	- 'combined_max'
	# AWS KMS orphan detection
	- 'kms_orphan_simulation'
	# GCP scenarios (requires cloud_provider=gcp or both)
	- 'shared_firewall_open'
	- 'central_pubsub_change'
	- 'gce_downgrade'
	- 'function_timeout'
	- 'combined_gcp_all'
	cloud_provider:
	description: 'Cloud provider to deploy'
	required: true
	type: choice
	default: 'aws'
	options:
	- 'aws'
	- 'gcp'
	- 'both'
	# Free-text guard: the validate job compares this value to DESTROY-SCALE-TEST
	# when action is destroy.
	confirmation:
	description: 'For destroy: type DESTROY-SCALE-TEST to confirm'
	required: false
	type: string

	# Workflow-level environment. WORKING_DIR is the directory the manual terraform
	# job uses as its default working directory for run steps.
	env:
	WORKING_DIR: scale-test

	jobs:
	# =========================================================================
	# Nightly Scheduled Runs - All AWS Scenarios
	# =========================================================================
	# One matrix job per scenario. Runs on the cron schedule, or when a manual
	# dispatch selects scenario=run_all_nightly.
	nightly-scenarios:
	name: Nightly - ${{ matrix.scenario }} @ 25x
	if: github.event_name == 'schedule' \|\| inputs.scenario == 'run_all_nightly'
	runs-on: ubuntu-latest
	permissions:
	contents: read
	id-token: write
	# NOTE(review): this group name is per scenario, while the manual terraform job
	# uses the plain 'scale-test-tfstate' group, so nightly jobs do not queue behind
	# manual runs - confirm this is intended.
	concurrency:
	group: scale-test-tfstate-${{ matrix.scenario }}
	cancel-in-progress: false

	strategy:
	fail-fast: false # Run all scenarios even if one fails
	max-parallel: 1 # Run sequentially - shared Terraform state + unique change per scenario
	matrix:
	scenario:
	- shared_sg_open
	- lambda_timeout
	- vpc_peering_change
	- central_sns_change
	- combined_network
	- combined_all
	- kms_orphan_simulation

	# Fixed nightly settings, exported as TF_VAR_* environment variables to every
	# step of this job; only the scenario varies per matrix entry.
	env:
	TF_VAR_scale_multiplier: 25
	TF_VAR_scenario: ${{ matrix.scenario }}
	TF_VAR_cloud_provider: aws

	# Run steps execute in scale-test unless a step sets its own working-directory.
	defaults:
	run:
	working-directory: scale-test

	steps:
	# Toolchain and credentials: check out the repository, install Terraform, and
	# authenticate to AWS and GCP before running terraform init.
	- name: Checkout repository
	uses: actions/checkout@v4

	- name: Setup Terraform
	uses: hashicorp/setup-terraform@v3
	with:
	terraform_wrapper: false

	# Assumes the role named by the TERRAFORM_DEPLOY_ROLE repository variable.
	- name: Configure AWS Credentials
	uses: aws-actions/configure-aws-credentials@v4
	with:
	audience: sts.amazonaws.com
	aws-region: us-east-1
	role-to-assume: ${{ vars.TERRAFORM_DEPLOY_ROLE }}

	# GCP authentication is configured even though TF_VAR_cloud_provider is aws
	# for this job.
	- name: Configure GCP Credentials
	uses: google-github-actions/auth@v2
	with:
	credentials_json: ${{ secrets.OVERMIND_SCALE_TEST }}

	- name: Setup GCP SDK
	uses: google-github-actions/setup-gcloud@v2
	with:
	project_id: overmind-scale-test

	- name: Terraform Init
	run: terraform init -input=false

	# Saves the plan to the tfplan file, then renders it as tfplan.json, which is
	# the file the Overmind submit step passes to the CLI.
	- name: Terraform Plan
	run: \|
	echo "Planning scenario: ${{ matrix.scenario }}"
	terraform plan \
	-compact-warnings \
	-no-color \
	-input=false \
	-lock-timeout=5m \
	-parallelism=100 \
	-out=tfplan

	terraform show -json tfplan > tfplan.json

	# A failed install is tolerated here (continue-on-error); the following step
	# fails the job if no usable CLI can be found.
	# NOTE(review): @main follows a moving branch ref - consider pinning to a tag
	# or commit SHA.
	- name: Install Overmind CLI
	uses: overmindtech/actions/install-cli@main
	continue-on-error: true
	with:
	version: latest
	github-token: ${{ secrets.GITHUB_TOKEN }}

	# Uses the overmind binary already on PATH; otherwise falls back to
	# $GITHUB_WORKSPACE/overmindtech/overmind, marks it executable, and appends its
	# directory to GITHUB_PATH. Runs from the workspace root rather than the job's
	# default working directory.
	- name: Add Overmind CLI to PATH
	working-directory: ${{ github.workspace }}
	run: \|
	if command -v overmind &> /dev/null; then
	overmind --version
	elif [ -f "$GITHUB_WORKSPACE/overmindtech/overmind" ]; then
	chmod +x "$GITHUB_WORKSPACE/overmindtech/overmind"
	echo "$GITHUB_WORKSPACE/overmindtech" >> $GITHUB_PATH
	"$GITHUB_WORKSPACE/overmindtech/overmind" --version
	else
	echo "::error::Overmind CLI not found"
	exit 1
	fi

	# Publishes a start timestamp as the start_time output; the results step
	# subtracts it from its own timestamp to report overmind_duration_ms.
	- name: Record Overmind start time
	id: overmind-start
	run: echo "start_time=$(date +%s%3N)" >> $GITHUB_OUTPUT

	# Submits tfplan.json as a new Overmind change, publishes the change URL as the
	# change-url output, then fetches the change as markdown into
	# /tmp/change-summary.md.
	# NOTE(review): the submit command is piped to tee and the script does not set
	# pipefail, so a CLI failure presumably surfaces only through the missing-URL
	# check below - confirm.
	- name: Submit Plan to Overmind
	id: submit-plan
	continue-on-error: true
	env:
	OVM_API_KEY: ${{ secrets.OVM_API_KEY }}
	run: \|
	# Use unique ticket link per scenario to create separate changes
	TICKET_LINK="${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}?scenario=${{ matrix.scenario }}"

	echo "Submitting plan with unique ticket link: $TICKET_LINK"

	# Capture output and extract the URL (should be the last line or contain https://)
	overmind changes submit-plan \
	--title "Scale Test - ${{ matrix.scenario }} @ 25x" \
	--description "Nightly scale test for scenario: ${{ matrix.scenario }}" \
	--ticket-link "$TICKET_LINK" \
	--tags "model=risks_v6,scenario=${{ matrix.scenario }}" \
	tfplan.json 2>&1 \| tee /tmp/submit-output.txt

	# Extract the change URL (line containing https://app.overmind.tech/changes/)
	CHANGE_URL=$(grep -oE 'https://app\.overmind\.tech/changes/[a-f0-9-]+' /tmp/submit-output.txt \| head -1)

	if [ -z "$CHANGE_URL" ]; then
	echo "::error::Could not extract change URL from submit-plan output"
	cat /tmp/submit-output.txt
	exit 1
	fi

	echo "change-url=$CHANGE_URL" >> $GITHUB_OUTPUT
	echo "Submitted change: $CHANGE_URL"

	# Wait for change analysis to complete (fetch the change to trigger wait)
	echo "Waiting for change analysis to complete..."
	overmind changes get-change \
	--change "$CHANGE_URL" \
	--format markdown \
	> /tmp/change-summary.md

	echo "Change analysis complete"

	# Runs only when the submit step published a change URL. Fetches the change as
	# JSON into change-results.json, derives the duration and the risk, blast-radius,
	# observation, and hypothesis counts with jq, and publishes them as step outputs
	# and as a section of the job summary.
	- name: Get change results as JSON
	id: get-results
	if: steps.submit-plan.outputs.change-url != ''
	continue-on-error: true
	env:
	OVM_API_KEY: ${{ secrets.OVM_API_KEY }}
	run: \|
	echo "Fetching change results from: ${{ steps.submit-plan.outputs.change-url }}"
	overmind --version

	overmind changes get-change \
	--change "${{ steps.submit-plan.outputs.change-url }}" \
	--format json \
	> change-results.json

	# Calculate duration
	END_TIME=$(date +%s%3N)
	START_TIME=${{ steps.overmind-start.outputs.start_time }}
	DURATION_MS=$((END_TIME - START_TIME))
	echo "overmind_duration_ms=$DURATION_MS" >> $GITHUB_OUTPUT

	RISK_COUNT=$(jq '.risks \| length // 0' change-results.json)
	HIGH_RISK_COUNT=$(jq '[.risks[]? \| select(.severity == "high" or .severity == "critical")] \| length' change-results.json)
	MEDIUM_RISK_COUNT=$(jq '[.risks[]? \| select(.severity == "medium")] \| length' change-results.json)

	# Extract blast radius and discovery metrics
	BLAST_RADIUS_NODES=$(jq '.change.metadata.numAffectedItems // 0' change-results.json)
	BLAST_RADIUS_EDGES=$(jq '.change.metadata.numAffectedEdges // 0' change-results.json)
	OBSERVATIONS=$(jq '.change.metadata.total_observations // 0' change-results.json)
	HYPOTHESES=$(jq '.hypotheses \| length // 0' change-results.json)

	echo "risk_count=$RISK_COUNT" >> $GITHUB_OUTPUT
	echo "high_risk_count=$HIGH_RISK_COUNT" >> $GITHUB_OUTPUT
	echo "medium_risk_count=$MEDIUM_RISK_COUNT" >> $GITHUB_OUTPUT
	echo "blast_radius_nodes=$BLAST_RADIUS_NODES" >> $GITHUB_OUTPUT
	echo "blast_radius_edges=$BLAST_RADIUS_EDGES" >> $GITHUB_OUTPUT
	echo "observations=$OBSERVATIONS" >> $GITHUB_OUTPUT
	echo "hypotheses=$HYPOTHESES" >> $GITHUB_OUTPUT

	echo "## Nightly Scenario: ${{ matrix.scenario }}" >> $GITHUB_STEP_SUMMARY
	echo "- Total Risks: $RISK_COUNT" >> $GITHUB_STEP_SUMMARY
	echo "- High/Critical: $HIGH_RISK_COUNT" >> $GITHUB_STEP_SUMMARY
	echo "- Duration: $((DURATION_MS / 1000))s" >> $GITHUB_STEP_SUMMARY
	echo "- Blast Radius: $BLAST_RADIUS_NODES nodes, $BLAST_RADIUS_EDGES edges" >> $GITHUB_STEP_SUMMARY
	echo "- Observations: $OBSERVATIONS" >> $GITHUB_STEP_SUMMARY
	echo "- Hypotheses: $HYPOTHESES" >> $GITHUB_STEP_SUMMARY

	# =========================================================================
	# PromptFoo Quality Evals (Nightly)
	# =========================================================================

	# Node.js and the eval dependencies are installed only when the results step
	# succeeded; always() keeps the condition evaluated after earlier failures.
	- name: Setup Node.js for PromptFoo
	uses: actions/setup-node@v4
	if: ${{ always() && steps.get-results.outcome == 'success' }}
	with:
	node-version: '20'

	# Installs from scale-test/evals, the directory the eval step runs in.
	- name: Install PromptFoo dependencies
	if: ${{ always() && steps.get-results.outcome == 'success' }}
	working-directory: scale-test/evals
	run: npm install

	# Runs the PromptFoo evals for this matrix scenario against
	# change-results.json, then publishes eval_pass_count / eval_fail_count as step
	# outputs (read by the dashboard step) and a section of the job summary.
	- name: Run PromptFoo quality evals
	  id: promptfoo-eval
	  if: ${{ always() && steps.get-results.outcome == 'success' }}
	  continue-on-error: true
	  working-directory: scale-test/evals
	  env:
	    OPENAI_API_KEY: ${{ secrets.OPENAI_KEY }}
	    CHANGE_RESULTS_PATH: ${{ github.workspace }}/scale-test/change-results.json
	  run: |
	    echo "Running PromptFoo evals for scenario: ${{ matrix.scenario }}"

	    # Extract data for eval (path quoted so it survives spaces in the workspace path)
	    RISK_COUNT=$(jq '.risks | length // 0' "$CHANGE_RESULTS_PATH")
	    HIGH_RISK_COUNT=$(jq '[.risks[]? | select(.severity == "high" or .severity == "critical")] | length' "$CHANGE_RESULTS_PATH")
	    RISKS_JSON=$(jq -c '.risks // []' "$CHANGE_RESULTS_PATH")

	    # Run evals. The run shell stops at the first failing command and promptfoo
	    # exits non-zero when a test case fails, so capture the status here instead
	    # of letting it end the script before the counts below are published.
	    EVAL_EXIT=0
	    npx promptfoo eval \
	      --var "scenario=${{ matrix.scenario }}" \
	      --var "risk_count=$RISK_COUNT" \
	      --var "high_risk_count=$HIGH_RISK_COUNT" \
	      --var "risks_json=$RISKS_JSON" \
	      --filter-pattern "${{ matrix.scenario }}" \
	      --output eval-results.json || EVAL_EXIT=$?

	    # Parse results for summary
	    if [ -f "eval-results.json" ]; then
	      PASS_COUNT=$(jq '[.results[]?.success] | map(select(. == true)) | length' eval-results.json)
	      TOTAL_COUNT=$(jq '[.results[]?.success] | length' eval-results.json)
	      echo "eval_pass_count=$PASS_COUNT" >> $GITHUB_OUTPUT
	      echo "eval_fail_count=$((TOTAL_COUNT - PASS_COUNT))" >> $GITHUB_OUTPUT

	      echo "" >> $GITHUB_STEP_SUMMARY
	      echo "### PromptFoo Quality Eval" >> $GITHUB_STEP_SUMMARY
	      echo "- Passed: $PASS_COUNT / $TOTAL_COUNT assertions" >> $GITHUB_STEP_SUMMARY
	    fi

	    # Report promptfoo's own status as the step result (the step is
	    # continue-on-error, so a failed eval still does not fail the job).
	    exit "$EVAL_EXIT"

	# Posts this scenario's metrics to the dashboard API. Skipped quietly when the
	# dashboard secrets are not configured; never fails the job (continue-on-error).
	- name: Send results to dashboard
	  if: always() && steps.get-results.outcome == 'success'
	  continue-on-error: true
	  env:
	    DASHBOARD_URL: ${{ secrets.SCALE_DASHBOARD_URL }}
	    DASHBOARD_API_KEY: ${{ secrets.SCALE_DASHBOARD_API_KEY }}
	  run: |
	    if [ -z "$DASHBOARD_URL" ] || [ -z "$DASHBOARD_API_KEY" ]; then
	      echo "Dashboard not configured"
	      exit 0
	    fi

	    # Get eval results if available (outputs are empty when the eval step was
	    # skipped or published nothing, in which case both counts fall back to 0)
	    EVAL_PASS_COUNT=${{ steps.promptfoo-eval.outputs.eval_pass_count || 0 }}
	    EVAL_FAIL_COUNT=${{ steps.promptfoo-eval.outputs.eval_fail_count || 0 }}
	    TOTAL=$((EVAL_PASS_COUNT + EVAL_FAIL_COUNT))
	    EVAL_SCORE=0
	    if [ "$TOTAL" -gt 0 ]; then
	      EVAL_SCORE=$(echo "scale=2; $EVAL_PASS_COUNT / $TOTAL" | bc)
	    fi

	    cat <<EOF > /tmp/payload.json
	    {
	      "runId": "${{ github.run_id }}-${{ matrix.scenario }}",
	      "scenario": "${{ matrix.scenario }}",
	      "cloudProvider": "aws",
	      "scaleMultiplier": 25,
	      "overmindDurationMs": ${{ steps.get-results.outputs.overmind_duration_ms || 0 }},
	      "riskCount": ${{ steps.get-results.outputs.risk_count || 0 }},
	      "highRiskCount": ${{ steps.get-results.outputs.high_risk_count || 0 }},
	      "mediumRiskCount": ${{ steps.get-results.outputs.medium_risk_count || 0 }},
	      "blastRadiusNodes": ${{ steps.get-results.outputs.blast_radius_nodes || 0 }},
	      "blastRadiusEdges": ${{ steps.get-results.outputs.blast_radius_edges || 0 }},
	      "observations": ${{ steps.get-results.outputs.observations || 0 }},
	      "hypotheses": ${{ steps.get-results.outputs.hypotheses || 0 }},
	      "validationPassed": true,
	      "evalPassCount": $EVAL_PASS_COUNT,
	      "evalFailCount": $EVAL_FAIL_COUNT,
	      "evalScore": $EVAL_SCORE,
	      "workflowRunUrl": "${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
	    }
	    EOF

	    # curl exits 0 on HTTP error responses, so append the status code to the
	    # output and check it explicitly, as the manual job's dashboard step does.
	    RESPONSE=$(curl -s -w "\n%{http_code}" -X POST "$DASHBOARD_URL/api/results" \
	      -H "Authorization: Bearer $DASHBOARD_API_KEY" \
	      -H "Content-Type: application/json" \
	      -d @/tmp/payload.json)

	    HTTP_CODE=$(echo "$RESPONSE" | tail -n1)
	    BODY=$(echo "$RESPONSE" | sed '$d')

	    if [ "$HTTP_CODE" -ge 200 ] && [ "$HTTP_CODE" -lt 300 ]; then
	      echo "Results sent to dashboard successfully"
	      echo "$BODY"
	    else
	      # Surface the rejection as an annotation, but don't fail the workflow
	      # for dashboard issues.
	      echo "::warning::Failed to send results to dashboard (HTTP $HTTP_CODE)"
	      echo "$BODY"
	    fi

	# =========================================================================
	# Manual Runs - Validate Inputs
	# =========================================================================
	validate:
	  # Gate for manually dispatched runs: the terraform job lists this job in
	  # `needs`, so a failed confirmation check stops a destroy before it starts.
	  name: Validate Inputs
	  if: github.event_name == 'workflow_dispatch'
	  runs-on: ubuntu-latest
	  steps:
	    - name: Validate destroy confirmation
	      if: inputs.action == 'destroy'
	      env:
	        # `confirmation` is free-form text. Hand it to the script through the
	        # environment instead of expanding it into the script source, so quotes
	        # or shell metacharacters in the value cannot change what the script runs.
	        CONFIRMATION: ${{ inputs.confirmation }}
	      run: |
	        if [ "$CONFIRMATION" != "DESTROY-SCALE-TEST" ]; then
	          echo "::error::Destroy requires confirmation. Type 'DESTROY-SCALE-TEST' in the confirmation field."
	          exit 1
	        fi
	        echo "Destroy confirmation validated"

	# =========================================================================
	# Manual Runs - Terraform
	# =========================================================================
	terraform:
	  # Manual plan / apply / destroy job for a single scenario, driven by the
	  # workflow_dispatch inputs and gated on the validate job.
	  name: Terraform ${{ inputs.action }} (×${{ inputs.scale_multiplier }}, scenario=${{ inputs.scenario }})
	  # scenario=run_all_nightly is the selector for the nightly-scenarios matrix
	  # job, not a scenario this job should plan or apply; skip this job for it so
	  # the requested action does not also run against the same Terraform state
	  # alongside the matrix.
	  if: github.event_name == 'workflow_dispatch' && inputs.scenario != 'run_all_nightly'
	  needs: validate
	  runs-on: ubuntu-latest
	  permissions:
	    contents: read
	    id-token: write
	    pull-requests: write
	  # Manual runs share one group and are queued rather than cancelled.
	  concurrency:
	    group: scale-test-tfstate
	    cancel-in-progress: false

	  # Dispatch inputs exported as TF_VAR_* environment variables to every step.
	  env:
	    TF_VAR_scale_multiplier: ${{ inputs.scale_multiplier }}
	    TF_VAR_scenario: ${{ inputs.scenario }}
	    TF_VAR_cloud_provider: ${{ inputs.cloud_provider }}

	  # Run steps execute in WORKING_DIR unless a step sets its own working-directory.
	  defaults:
	    run:
	      working-directory: ${{ env.WORKING_DIR }}

	steps:
	# Toolchain and credentials: check out the repository, install Terraform, and
	# authenticate to AWS and GCP before running terraform init.
	- name: Checkout repository
	uses: actions/checkout@v4

	- name: Setup Terraform
	uses: hashicorp/setup-terraform@v3
	with:
	terraform_wrapper: false

	# Assumes the role named by the TERRAFORM_DEPLOY_ROLE repository variable.
	- name: Configure AWS Credentials
	uses: aws-actions/configure-aws-credentials@v4
	with:
	audience: sts.amazonaws.com
	aws-region: us-east-1
	role-to-assume: ${{ vars.TERRAFORM_DEPLOY_ROLE }}

	# Always authenticate with GCP - Terraform validates all providers during init
	# even when cloud_provider=aws (no GCP resources created)
	- name: Configure GCP Credentials
	uses: google-github-actions/auth@v2
	with:
	credentials_json: ${{ secrets.OVERMIND_SCALE_TEST }}

	- name: Setup GCP SDK
	uses: google-github-actions/setup-gcloud@v2
	with:
	project_id: overmind-scale-test

	# Prints the Terraform version to the log before initialising.
	- name: Terraform Init
	id: init
	run: \|
	terraform version
	terraform init -input=false

	# Runs for plan and apply. pipefail is set so a terraform plan failure is not
	# hidden by the pipe to tee. Saves the plan as tfplan, logs it to
	# terraform_plan.log, renders tfplan.json, and writes a summary section whose
	# last line is the "Plan:" / "No changes" line when one is present.
	- name: Terraform Plan
	id: plan
	if: ${{ inputs.action == 'plan' \|\| inputs.action == 'apply' }}
	run: \|
	set -o pipefail
	echo "Planning with scale_multiplier=${{ inputs.scale_multiplier }}, scenario=${{ inputs.scenario }}"
	terraform plan \
	-compact-warnings \
	-no-color \
	-input=false \
	-lock-timeout=5m \
	-parallelism=100 \
	-out=tfplan 2>&1 \| tee terraform_plan.log

	# Generate JSON plan for Overmind
	terraform show -json tfplan > tfplan.json

	# Output summary
	echo "## Terraform Plan Summary" >> $GITHUB_STEP_SUMMARY
	echo "- Scale Multiplier: ${{ inputs.scale_multiplier }}" >> $GITHUB_STEP_SUMMARY
	echo "- Scenario: ${{ inputs.scenario }}" >> $GITHUB_STEP_SUMMARY
	echo "- Action: ${{ inputs.action }}" >> $GITHUB_STEP_SUMMARY
	echo "" >> $GITHUB_STEP_SUMMARY
	echo "### Resource Changes" >> $GITHUB_STEP_SUMMARY
	terraform show -no-color tfplan \| grep -E "^(Plan:\|No changes)" >> $GITHUB_STEP_SUMMARY \|\| true

	# A failed install is tolerated here (continue-on-error); the following step
	# fails the job if no usable CLI can be found. These two steps have no `if`,
	# so they also run for destroy.
	# NOTE(review): @main follows a moving branch ref - consider pinning to a tag
	# or commit SHA.
	- name: Install Overmind CLI
	uses: overmindtech/actions/install-cli@main
	id: install-cli
	continue-on-error: true
	with:
	version: latest
	github-token: ${{ secrets.GITHUB_TOKEN }}

	# Runs from the workspace root. Lists the workspace contents before failing so
	# a missing CLI can be diagnosed from the log.
	- name: Add Overmind CLI to PATH
	id: verify-cli
	working-directory: ${{ github.workspace }}
	run: \|
	# The install-cli action downloads to $GITHUB_WORKSPACE/overmindtech/overmind
	# We need to add it to PATH for subsequent steps

	if command -v overmind &> /dev/null; then
	echo "Overmind CLI already in PATH"
	overmind --version
	elif [ -f "$GITHUB_WORKSPACE/overmindtech/overmind" ]; then
	echo "Found CLI at $GITHUB_WORKSPACE/overmindtech/overmind, adding to PATH"
	chmod +x "$GITHUB_WORKSPACE/overmindtech/overmind"
	echo "$GITHUB_WORKSPACE/overmindtech" >> $GITHUB_PATH
	"$GITHUB_WORKSPACE/overmindtech/overmind" --version
	else
	echo "::error::Overmind CLI not found. Contents of workspace:"
	ls -la "$GITHUB_WORKSPACE/"
	exit 1
	fi

	# Publishes a start timestamp as the start_time output; the results step
	# subtracts it from its own timestamp to report overmind_duration_ms.
	- name: Record Overmind start time
	id: overmind-start
	if: ${{ inputs.action == 'plan' \|\| inputs.action == 'apply' }}
	run: echo "start_time=$(date +%s%3N)" >> $GITHUB_OUTPUT

	# Submits tfplan.json through the submit-plan action (the nightly job calls the
	# CLI directly instead). Later steps read this step's change-url output.
	# Failures are tolerated (continue-on-error).
	- name: Submit Plan to Overmind
	uses: overmindtech/actions/submit-plan@main
	continue-on-error: true
	id: submit-plan
	if: ${{ inputs.action == 'plan' \|\| inputs.action == 'apply' }}
	with:
	ovm-api-key: ${{ secrets.OVM_API_KEY }}
	plan-json: ${{ env.WORKING_DIR }}/tfplan.json
	tags: 'model=risks_v6'

	# =========================================================================
	# Quality Evaluation Steps
	# Capture analysis results and validate expected risks for each scenario
	# =========================================================================

	# Runs for plan/apply when the submit step published a change URL. Fetches the
	# change as JSON into change-results.json, derives the duration and the risk,
	# blast-radius, observation, and hypothesis counts with jq, publishes them as
	# step outputs, and writes a job summary section that lists each risk as
	# "[severity] title".
	- name: Get change results as JSON
	id: get-results
	if: ${{ steps.submit-plan.outputs.change-url != '' && (inputs.action == 'plan' \|\| inputs.action == 'apply') }}
	continue-on-error: true
	env:
	OVM_API_KEY: ${{ secrets.OVM_API_KEY }}
	run: \|
	echo "Fetching change results from: ${{ steps.submit-plan.outputs.change-url }}"

	# Show CLI version for debugging
	echo "Using Overmind CLI:"
	overmind --version

	# Get the full change analysis as JSON
	overmind changes get-change \
	--change "${{ steps.submit-plan.outputs.change-url }}" \
	--format json \
	> change-results.json

	# Calculate Overmind duration
	END_TIME=$(date +%s%3N)
	START_TIME=${{ steps.overmind-start.outputs.start_time }}
	DURATION_MS=$((END_TIME - START_TIME))
	echo "overmind_duration_ms=$DURATION_MS" >> $GITHUB_OUTPUT
	echo "Overmind analysis took ${DURATION_MS}ms"

	# Extract key metrics for assertions
	RISK_COUNT=$(jq '.risks \| length // 0' change-results.json)
	HIGH_RISK_COUNT=$(jq '[.risks[]? \| select(.severity == "high" or .severity == "critical")] \| length' change-results.json)
	MEDIUM_RISK_COUNT=$(jq '[.risks[]? \| select(.severity == "medium")] \| length' change-results.json)

	# Extract blast radius and discovery metrics
	BLAST_RADIUS_NODES=$(jq '.change.metadata.numAffectedItems // 0' change-results.json)
	BLAST_RADIUS_EDGES=$(jq '.change.metadata.numAffectedEdges // 0' change-results.json)
	OBSERVATIONS=$(jq '.change.metadata.total_observations // 0' change-results.json)
	HYPOTHESES=$(jq '.hypotheses \| length // 0' change-results.json)

	echo "risk_count=$RISK_COUNT" >> $GITHUB_OUTPUT
	echo "high_risk_count=$HIGH_RISK_COUNT" >> $GITHUB_OUTPUT
	echo "medium_risk_count=$MEDIUM_RISK_COUNT" >> $GITHUB_OUTPUT
	echo "blast_radius_nodes=$BLAST_RADIUS_NODES" >> $GITHUB_OUTPUT
	echo "blast_radius_edges=$BLAST_RADIUS_EDGES" >> $GITHUB_OUTPUT
	echo "observations=$OBSERVATIONS" >> $GITHUB_OUTPUT
	echo "hypotheses=$HYPOTHESES" >> $GITHUB_OUTPUT

	echo "## Change Analysis Results" >> $GITHUB_STEP_SUMMARY
	echo "- Total Risks: $RISK_COUNT" >> $GITHUB_STEP_SUMMARY
	echo "- High/Critical Risks: $HIGH_RISK_COUNT" >> $GITHUB_STEP_SUMMARY
	echo "- Analysis Duration: $((DURATION_MS / 1000))s" >> $GITHUB_STEP_SUMMARY
	echo "- Medium Risks: $MEDIUM_RISK_COUNT" >> $GITHUB_STEP_SUMMARY
	echo "- Blast Radius: $BLAST_RADIUS_NODES nodes, $BLAST_RADIUS_EDGES edges" >> $GITHUB_STEP_SUMMARY
	echo "- Observations: $OBSERVATIONS" >> $GITHUB_STEP_SUMMARY
	echo "- Hypotheses: $HYPOTHESES" >> $GITHUB_STEP_SUMMARY
	echo "" >> $GITHUB_STEP_SUMMARY

	# List all risks in summary
	echo "### Detected Risks" >> $GITHUB_STEP_SUMMARY
	if [ "$RISK_COUNT" -gt 0 ]; then
	jq -r '.risks[]? \| "- [\(.severity)] \(.title)"' change-results.json >> $GITHUB_STEP_SUMMARY
	else
	echo "_No risks detected_" >> $GITHUB_STEP_SUMMARY
	fi

	# NOTE: Artifact upload disabled for public repo - change-results.json may contain
	# internal system prompts. Only tfplan.json is safe to upload if needed.
	# - name: Upload analysis results
	# uses: actions/upload-artifact@v4
	# if: ${{ always() && (inputs.action == 'plan' \|\| inputs.action == 'apply') }}
	# with:
	# name: change-analysis-${{ inputs.scenario }}-${{ github.run_id }}
	# path: ${{ env.WORKING_DIR }}/tfplan.json
	# if-no-files-found: ignore

	# =========================================================================
	# PromptFoo Quality Evals
	# Run LLM-as-judge evaluation on the risk analysis quality
	# =========================================================================

	# Node.js and the eval dependencies are installed only when the results step
	# succeeded and a scenario other than 'none' was selected.
	- name: Setup Node.js for PromptFoo
	uses: actions/setup-node@v4
	if: ${{ always() && steps.get-results.outcome == 'success' && inputs.scenario != 'none' }}
	with:
	node-version: '20'

	# Installs from the evals directory under WORKING_DIR, where the eval step runs.
	- name: Install PromptFoo dependencies
	if: ${{ always() && steps.get-results.outcome == 'success' && inputs.scenario != 'none' }}
	working-directory: ${{ env.WORKING_DIR }}/evals
	run: npm install

	# Runs the PromptFoo evals for the selected scenario against
	# change-results.json and writes pass/fail counts plus any failed assertions
	# to the job summary. The dashboard step reads eval-results.json directly.
	- name: Run PromptFoo quality evals
	  id: promptfoo-eval
	  if: ${{ always() && steps.get-results.outcome == 'success' && inputs.scenario != 'none' }}
	  continue-on-error: true
	  working-directory: ${{ env.WORKING_DIR }}/evals
	  env:
	    OPENAI_API_KEY: ${{ secrets.OPENAI_KEY }}
	    CHANGE_RESULTS_PATH: ${{ github.workspace }}/${{ env.WORKING_DIR }}/change-results.json
	  run: |
	    echo "Running PromptFoo evals for scenario: ${{ inputs.scenario }}"
	    echo "Results file: $CHANGE_RESULTS_PATH"

	    # Verify results file exists
	    if [ ! -f "$CHANGE_RESULTS_PATH" ]; then
	      echo "::warning::Change results file not found at $CHANGE_RESULTS_PATH"
	      exit 0
	    fi

	    # Extract metrics from change-results.json
	    RISK_COUNT=$(jq '.risks | length // 0' "$CHANGE_RESULTS_PATH")
	    HIGH_RISK_COUNT=$(jq '[.risks[]? | select(.severity == "high" or .severity == "critical")] | length' "$CHANGE_RESULTS_PATH")
	    RISKS_JSON=$(jq -c '.risks // []' "$CHANGE_RESULTS_PATH")

	    echo "Risk count: $RISK_COUNT, High risk count: $HIGH_RISK_COUNT"

	    # Run evals with pre-extracted data
	    # --filter-pattern filters tests by description regex
	    # The run shell stops at the first failing command and promptfoo exits
	    # non-zero when a test case fails, so capture the status here; otherwise the
	    # summary and the failure listing below are skipped exactly when tests fail.
	    EVAL_EXIT=0
	    npx promptfoo eval \
	      --var "scenario=${{ inputs.scenario }}" \
	      --var "risk_count=$RISK_COUNT" \
	      --var "high_risk_count=$HIGH_RISK_COUNT" \
	      --var "risks_json=$RISKS_JSON" \
	      --filter-pattern "${{ inputs.scenario }}" \
	      --output eval-results.json \
	      --no-cache || EVAL_EXIT=$?

	    # Extract summary metrics
	    if [ -f eval-results.json ]; then
	      PASS_COUNT=$(jq '[.results[].success] | map(select(. == true)) | length' eval-results.json)
	      TOTAL_COUNT=$(jq '[.results[].success] | length' eval-results.json)

	      echo "" >> $GITHUB_STEP_SUMMARY
	      echo "### PromptFoo Quality Eval" >> $GITHUB_STEP_SUMMARY
	      echo "- Passed: $PASS_COUNT / $TOTAL_COUNT assertions" >> $GITHUB_STEP_SUMMARY

	      # Show any failures
	      jq -r '.results[] | select(.success == false) | "- ❌ \(.description): \(.error // "assertion failed")"' eval-results.json >> $GITHUB_STEP_SUMMARY || true
	    fi

	    # Report promptfoo's own status as the step result (the step is
	    # continue-on-error, so a failed eval still does not fail the job).
	    exit "$EVAL_EXIT"

	# NOTE: Artifact upload disabled for public repo - eval results may reference
	# internal system prompts from change-results.json
	# - name: Upload PromptFoo results
	# uses: actions/upload-artifact@v4
	# if: ${{ always() && steps.promptfoo-eval.outcome != 'skipped' }}
	# with:
	# name: promptfoo-results-${{ inputs.scenario }}-${{ github.run_id }}
	# path: ${{ env.WORKING_DIR }}/evals/eval-results.json
	# if-no-files-found: ignore

	# =========================================================================
	# Send Results to Dashboard
	# Posts metrics to private Vercel dashboard for trend analysis
	# =========================================================================

	# Posts the run's metrics to the dashboard API. Exits quietly when the dashboard
	# secrets are not configured. Eval counts are read from evals/eval-results.json
	# (relative to the job's working directory) and default to 0 when that file is
	# absent. A non-2xx response is logged but does not fail the step.
	- name: Send results to dashboard
	id: send-dashboard
	if: ${{ always() && steps.get-results.outcome == 'success' && inputs.scenario != 'none' }}
	continue-on-error: true
	env:
	DASHBOARD_URL: ${{ secrets.SCALE_DASHBOARD_URL }}
	DASHBOARD_API_KEY: ${{ secrets.SCALE_DASHBOARD_API_KEY }}
	run: \|
	# Skip if dashboard not configured
	if [ -z "$DASHBOARD_URL" ] \|\| [ -z "$DASHBOARD_API_KEY" ]; then
	echo "Dashboard not configured, skipping..."
	exit 0
	fi

	echo "Sending results to dashboard..."

	# Get eval results if available
	EVAL_PASS_COUNT=0
	EVAL_FAIL_COUNT=0
	EVAL_SCORE=0
	if [ -f "evals/eval-results.json" ]; then
	EVAL_PASS_COUNT=$(jq '[.results[]?.success] \| map(select(. == true)) \| length' evals/eval-results.json 2>/dev/null \|\| echo 0)
	EVAL_FAIL_COUNT=$(jq '[.results[]?.success] \| map(select(. == false)) \| length' evals/eval-results.json 2>/dev/null \|\| echo 0)
	TOTAL=$(($EVAL_PASS_COUNT + $EVAL_FAIL_COUNT))
	if [ "$TOTAL" -gt 0 ]; then
	EVAL_SCORE=$(echo "scale=2; $EVAL_PASS_COUNT / $TOTAL" \| bc)
	fi
	fi

	# Build JSON payload
	cat <<EOF > /tmp/dashboard-payload.json
	{
	"runId": "${{ github.run_id }}",
	"scenario": "${{ inputs.scenario }}",
	"cloudProvider": "${{ inputs.cloud_provider }}",
	"scaleMultiplier": ${{ inputs.scale_multiplier }},
	"overmindDurationMs": ${{ steps.get-results.outputs.overmind_duration_ms \|\| 0 }},
	"riskCount": ${{ steps.get-results.outputs.risk_count \|\| 0 }},
	"highRiskCount": ${{ steps.get-results.outputs.high_risk_count \|\| 0 }},
	"mediumRiskCount": ${{ steps.get-results.outputs.medium_risk_count \|\| 0 }},
	"blastRadiusNodes": ${{ steps.get-results.outputs.blast_radius_nodes \|\| 0 }},
	"blastRadiusEdges": ${{ steps.get-results.outputs.blast_radius_edges \|\| 0 }},
	"observations": ${{ steps.get-results.outputs.observations \|\| 0 }},
	"hypotheses": ${{ steps.get-results.outputs.hypotheses \|\| 0 }},
	"validationPassed": true,
	"evalPassCount": $EVAL_PASS_COUNT,
	"evalFailCount": $EVAL_FAIL_COUNT,
	"evalScore": $EVAL_SCORE,
	"workflowRunUrl": "${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
	}
	EOF

	echo "Payload:"
	cat /tmp/dashboard-payload.json

	# Send to dashboard API
	RESPONSE=$(curl -s -w "\n%{http_code}" -X POST "$DASHBOARD_URL/api/results" \
	-H "Authorization: Bearer $DASHBOARD_API_KEY" \
	-H "Content-Type: application/json" \
	-d @/tmp/dashboard-payload.json)

	HTTP_CODE=$(echo "$RESPONSE" \| tail -n1)
	BODY=$(echo "$RESPONSE" \| sed '$d')

	if [ "$HTTP_CODE" -ge 200 ] && [ "$HTTP_CODE" -lt 300 ]; then
	echo "✅ Results sent to dashboard successfully"
	echo "$BODY"
	else
	echo "⚠️ Failed to send results to dashboard (HTTP $HTTP_CODE)"
	echo "$BODY"
	# Don't fail the workflow for dashboard issues
	fi

	# Cost Analysis disabled for scale testing (plan too large)
	# - name: Cost Analysis
	# uses: overmindtech/cost-signals-action@v1
	# continue-on-error: true
	# if: ${{ inputs.action == 'plan' \|\| inputs.action == 'apply' }}
	# with:
	# overmind-api-key: ${{ secrets.OVM_API_KEY }}
	# infracost-api-key: ${{ secrets.INFRACOST_API_KEY }}
	# terraform-plan-json: ${{ env.WORKING_DIR }}/tfplan.json
	# ticket-link: ${{ steps.submit-plan.outputs.change-url }}

	# Marks the Overmind change as started; runs only for apply and tolerates
	# failure (continue-on-error).
	- name: Start Overmind Change
	uses: overmindtech/actions/start-change@main
	continue-on-error: true
	if: ${{ inputs.action == 'apply' }}
	with:
	ovm-api-key: ${{ secrets.OVM_API_KEY }}

	# reauthenticate after potentially long running terraform plan or submit-plan steps as credentials may have expired
	# These two steps have no `if`, so they run for every action.
	- name: Configure AWS Credentials
	uses: aws-actions/configure-aws-credentials@v4
	with:
	audience: sts.amazonaws.com
	aws-region: us-east-1
	role-to-assume: ${{ vars.TERRAFORM_DEPLOY_ROLE }}

	- name: Configure GCP Credentials
	uses: google-github-actions/auth@v2
	with:
	credentials_json: ${{ secrets.OVERMIND_SCALE_TEST }}

	# Applies the saved tfplan file written by the plan step (which also runs for
	# apply), then records completion in the job summary.
	- name: Terraform Apply
	id: apply
	if: ${{ inputs.action == 'apply' }}
	run: \|
	echo "Applying scale test infrastructure (×${{ inputs.scale_multiplier }})"
	terraform apply \
	-auto-approve \
	-no-color \
	-input=false \
	-lock-timeout=5m \
	-parallelism=100 \
	tfplan

	echo "## Apply Complete" >> $GITHUB_STEP_SUMMARY
	echo "Scale test infrastructure deployed with multiplier ×${{ inputs.scale_multiplier }}" >> $GITHUB_STEP_SUMMARY

	# Closes the Overmind change for apply runs. The success() || failure() ||
	# cancelled() clause keeps this step eligible whether the earlier steps
	# succeeded, failed, or were cancelled.
	- name: End Overmind Change
	uses: overmindtech/actions/end-change@main
	continue-on-error: true
	if: ${{ (inputs.action == 'apply') && (success() \|\| failure() \|\| cancelled()) }}
	with:
	ovm-api-key: ${{ secrets.OVM_API_KEY }}

	# reauthenticate after potentially long running end-change step as credentials may have expired
	# These two steps have no `if`, so they run for every action.
	- name: Configure AWS Credentials
	uses: aws-actions/configure-aws-credentials@v4
	with:
	audience: sts.amazonaws.com
	aws-region: us-east-1
	role-to-assume: ${{ vars.TERRAFORM_DEPLOY_ROLE }}

	- name: Configure GCP Credentials
	uses: google-github-actions/auth@v2
	with:
	credentials_json: ${{ secrets.OVERMIND_SCALE_TEST }}

	# Runs only for action=destroy. The validate job, which this job needs, has
	# already checked the typed confirmation. Uses a longer lock timeout (10m) than
	# the plan and apply steps (5m).
	- name: Terraform Destroy
	id: destroy
	if: ${{ inputs.action == 'destroy' }}
	run: \|
	echo "::warning::Destroying scale test infrastructure (×${{ inputs.scale_multiplier }})"
	terraform destroy \
	-auto-approve \
	-no-color \
	-input=false \
	-parallelism=100 \
	-lock-timeout=10m

	echo "## Destroy Complete" >> $GITHUB_STEP_SUMMARY
	echo "Scale test infrastructure destroyed" >> $GITHUB_STEP_SUMMARY

	# Always runs, so the log ends with the requested action, multiplier, scenario,
	# and the job status even when an earlier step failed.
	- name: Output Terraform Summary
	if: always()
	run: \|
	echo ""
	echo "=== Scale Test Summary ==="
	echo "Action: ${{ inputs.action }}"
	echo "Multiplier: ${{ inputs.scale_multiplier }}"
	echo "Scenario: ${{ inputs.scenario }}"
	echo "Status: ${{ job.status }}"

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Scale Test Infrastructure #122

Workflow file

Scale Test Infrastructure #122

Uh oh!

Workflow file for this run