Knowledge Test #119

Workflow file for this run

.github/workflows/knowledge-test.yml at 73832b9

	name: Knowledge Test

	on:
	  # Scheduled nightly run, timed to follow the scale test.
	  schedule:
	  # 05:00 UTC every day (three hours after the scale test).
	  - cron: "0 5 * * *"

	  # Manual runs may restrict the categories and pick the resource multiplier.
	  workflow_dispatch:
	    inputs:
	      test_filter:
	        type: choice
	        description: "Filter by category (all, create_risk, lower_risk, discover, instruct)"
	        required: false
	        default: "all"
	        options:
	        - "all"
	        - "create_risk"
	        - "lower_risk"
	        - "discover"
	        - "instruct"
	      scale_multiplier:
	        type: choice
	        description: "Resource multiplier (must match scale-test state, default 25)"
	        required: false
	        # Kept as strings: choice options and their default are string values.
	        default: "25"
	        options:
	        - "1"
	        - "5"
	        - "10"
	        - "25"
	        - "50"

	env:
	  # Directory holding the Terraform configuration used by both jobs.
	  WORKING_DIR: scale-test

	# =============================================================================
	# Knowledge Test Workflow
	#
	# All knowledge files live permanently in .overmind/knowledge/ (as a customer
	# would have them). The Overmind CLI picks them up automatically.
	#
	# What varies between tests is the SCENARIO (which Terraform plan change is
	# being analyzed) and the EXPECTED EFFECT of knowledge on the risk output.
	#
	# Each test case has a paired baseline (same scenario, no knowledge) that runs
	# on the regular nightly scale-test workflow. This workflow adds a "with
	# knowledge" run for comparison.
	#
	# IMPORTANT: This workflow shares Terraform state with the nightly scale-test
	# workflow (scale-test/backend.tf). The scale-test runs at 2 AM UTC and applies
	# baseline at scale_multiplier=25. This workflow runs at 5 AM UTC and plans
	# against that existing state -- it does NOT apply. The prepare-baseline job
	# only verifies the state is clean; it never modifies infrastructure.
	#
	# The evaluation question for each test is: given that all 5 knowledge files
	# are available, did Overmind activate the right ones and produce the expected
	# effect for this scenario?
	#
	# Categories:
	# create_risk - Knowledge should cause new or elevated risks
	# lower_risk - Knowledge should reduce or disprove risks
	# discover - Knowledge should surface resources not normally found
	# instruct - Knowledge should add operational context to risks
	#
	# Baselines (no-knowledge runs) are provided by the nightly scale-test
	# workflow, which runs the same scenarios without knowledge files. The scale
	# dashboard compares knowledge test results against those nightly runs.
	#
	# All knowledge files (all 5) are always present. The categories describe
	# what we EXPECT the dominant effect to be for each scenario based on which
	# knowledge files are most relevant.
	# =============================================================================

	jobs:
	# =========================================================================
	# Phase 1: Verify Baseline Infrastructure
	# The nightly scale-test (2 AM) applies baseline at 25x. This job verifies
	# the state is clean before running knowledge tests. No apply is needed.
	# =========================================================================
	prepare-baseline:
	  # Read-only gate for the knowledge tests: runs `terraform plan` with
	  # scenario=none and fails if Terraform reports pending changes or an error.
	  # Nothing is applied here.
	  name: Verify baseline
	  runs-on: ubuntu-latest
	  permissions:
	    contents: read
	    # presumably needed for OIDC role assumption in the AWS credentials step
	    id-token: write
	  concurrency:
	    group: scale-test-tfstate-knowledge
	    cancel-in-progress: false

	  env:
	    # Scheduled runs have no inputs, so fall back to the nightly multiplier.
	    TF_VAR_scale_multiplier: ${{ inputs.scale_multiplier || '25' }}
	    TF_VAR_scenario: none
	    TF_VAR_cloud_provider: aws

	  defaults:
	    run:
	      working-directory: scale-test

	  steps:
	  - name: Checkout repository
	    uses: actions/checkout@v6

	  - name: Setup Terraform
	    uses: hashicorp/setup-terraform@v4
	    with:
	      # Call the terraform binary directly so -detailed-exitcode is not
	      # filtered through the action's wrapper script.
	      terraform_wrapper: false

	  - name: Configure AWS Credentials
	    uses: aws-actions/configure-aws-credentials@v6
	    with:
	      audience: sts.amazonaws.com
	      aws-region: us-east-1
	      role-to-assume: ${{ vars.TERRAFORM_DEPLOY_ROLE }}

	  - name: Configure GCP Credentials
	    uses: google-github-actions/auth@v3
	    with:
	      credentials_json: ${{ secrets.OVERMIND_SCALE_TEST }}

	  - name: Setup GCP SDK
	    uses: google-github-actions/setup-gcloud@v3
	    with:
	      project_id: overmind-scale-test

	  - name: Terraform Init
	    run: terraform init -input=false

	  - name: Verify Baseline Stable
	    run: |
	      echo "Verifying scale-test baseline is stable (scale=${TF_VAR_scale_multiplier})..."
	      echo "The nightly scale-test should have already applied baseline at this multiplier."
	      # -detailed-exitcode: 0 = no changes, 1 = error, 2 = changes pending.
	      # Capture the code with `||` so the shell's errexit does not abort
	      # before the code can be classified.
	      # -input=false keeps CI from blocking on a prompt; -lock-timeout waits
	      # for a concurrent holder of the shared state lock instead of failing
	      # immediately (same flags as the knowledge-test plan).
	      EXIT_CODE=0
	      terraform plan \
	        -var="scenario=none" \
	        -detailed-exitcode \
	        -no-color \
	        -input=false \
	        -lock-timeout=5m || EXIT_CODE=$?
	      if [ "$EXIT_CODE" = "2" ]; then
	        echo "::error::Baseline has pending changes - scale-test may not have run or used a different multiplier"
	        exit 1
	      elif [ "$EXIT_CODE" = "1" ]; then
	        echo "::error::Terraform plan failed"
	        exit 1
	      fi
	      echo "Baseline stable - no changes detected"

	# =========================================================================
	# Phase 2: Knowledge Tests
	#
	# All 5 knowledge files are always present in .overmind/knowledge/.
	# Each test submits a different scenario plan and checks whether Overmind
	# activated the right knowledge and produced the expected effect.
	# =========================================================================
	knowledge-test:
	  # One job per matrix entry. Each entry names the Terraform scenario to plan,
	  # the category used by the test filter, and free-text metadata
	  # (expected_effect, relevant_knowledge) that is forwarded with the results.
	  name: "${{ matrix.test_id }}"
	  needs: prepare-baseline
	  runs-on: ubuntu-latest
	  permissions:
	    contents: read
	    id-token: write
	  concurrency:
	    group: scale-test-tfstate-knowledge
	    cancel-in-progress: false

	  strategy:
	    # A failing test must not cancel the remaining scenarios.
	    fail-fast: false
	    # Matrix entries run strictly one at a time.
	    max-parallel: 1

	    matrix:
	      include:
	      # -----------------------------------------------------------------
	      # Create risk
	      # For these scenarios, specific knowledge files should cause Overmind
	      # to identify risks it wouldn't find without knowledge, or to elevate
	      # the severity of risks it already detects.
	      # -----------------------------------------------------------------
	      - test_id: create-risk-sg-public-exposure
	        scenario: shared_sg_open
	        category: create_risk
	        expected_effect: "security-standards.md should cause Overmind to cite public subnet + public IP as compounding factors, elevating SSH exposure from bad to critical"
	        relevant_knowledge: "security-standards.md"

	      - test_id: create-risk-lambda-sqs-timeout
	        scenario: lambda_timeout
	        category: create_risk
	        expected_effect: "platform-event-pipeline.md should cause Overmind to cite the 6x SQS visibility timeout rule (Lambda needs >= 180s, SQS uses 30s visibility)"
	        relevant_knowledge: "platform-event-pipeline.md"

	      - test_id: create-risk-kms-encryption-compliance
	        scenario: kms_orphan_simulation
	        category: create_risk
	        expected_effect: "security-standards.md should cause Overmind to flag S3 buckets using AES256 instead of required KMS encryption as non-compliant"
	        relevant_knowledge: "security-standards.md"

	      # -----------------------------------------------------------------
	      # Lower risk
	      # For these scenarios, knowledge provides context that a change is
	      # approved or expected, which should lower or disprove the risk.
	      # -----------------------------------------------------------------
	      - test_id: lower-risk-vpc-approved-dns
	        scenario: vpc_peering_change
	        category: lower_risk
	        expected_effect: "multi-region-design.md states DNS resolution on peering is required for service discovery and is the approved architecture"
	        relevant_knowledge: "multi-region-design.md"

	      - test_id: lower-risk-sns-approved-hardening
	        scenario: central_sns_change
	        category: lower_risk
	        expected_effect: "platform-event-pipeline.md states the Deny+StringNotEquals pattern is approved security hardening that does not break internal publishers"
	        relevant_knowledge: "platform-event-pipeline.md"

	      - test_id: lower-risk-lambda-dummy-functions
	        scenario: lambda_timeout
	        category: lower_risk
	        expected_effect: "infrastructure-guide.md states scale-test Lambda functions are dummy handlers that dont process real messages, so timeout is irrelevant"
	        relevant_knowledge: "infrastructure-guide.md"

	      # -----------------------------------------------------------------
	      # Discover
	      # Knowledge should guide Overmind to find resources or relationships
	      # that are not obvious from the Terraform dependency graph alone.
	      # -----------------------------------------------------------------
	      - test_id: discover-sns-ssm-and-publishers
	        scenario: central_sns_change
	        category: discover
	        expected_effect: "multi-region-design.md should help discover SSM parameters with stale SNS ARN references. platform-event-pipeline.md should help discover Lambda publishers across all 4 regions (not just SQS subscribers)."
	        relevant_knowledge: "multi-region-design.md, platform-event-pipeline.md"

	      - test_id: discover-vpc-endpoints
	        scenario: vpc_peering_change
	        category: discover
	        expected_effect: "multi-region-design.md should help discover S3 VPC Gateway Endpoints affected by routing changes (non-obvious dependency chain)"
	        relevant_knowledge: "multi-region-design.md"

	      # -----------------------------------------------------------------
	      # Instruct
	      # Knowledge should add operational context to risks: who to contact,
	      # what process to follow, approval requirements, etc.
	      # -----------------------------------------------------------------
	      - test_id: instruct-kms-security-process
	        scenario: kms_orphan_simulation
	        category: instruct
	        expected_effect: "Risk should mention: Sarah Chen (key custodian), SEC-REVIEW Jira ticket, Security Engineering approval, state-rm danger warning"
	        relevant_knowledge: "security-standards.md, change-approvals.md"

	      - test_id: instruct-sg-firewall-exception
	        scenario: shared_sg_open
	        category: instruct
	        expected_effect: "Risk should mention: firewall exception form URL, Mike Rodriguez (network team), David Kim (VP approval), 48-hour review window for internet-facing changes"
	        relevant_knowledge: "change-approvals.md, security-standards.md"

	      - test_id: instruct-vpc-multi-team-signoff
	        scenario: vpc_peering_change
	        category: instruct
	        expected_effect: "Risk should mention: regional team contacts (james.park, priya.sharma, thomas.mueller, wei.zhang), multi-team sign-off requirement, #cross-region-changes Slack channel"
	        relevant_knowledge: "change-approvals.md"

	      - test_id: instruct-sns-maintenance-window
	        scenario: central_sns_change
	        category: instruct
	        expected_effect: "Risk should mention: Tuesday 2-4 AM UTC maintenance window, Platform-Primary PagerDuty schedule, #platform-ops Slack, runbook URL"
	        relevant_knowledge: "platform-event-pipeline.md"

	  env:
	    # Scheduled runs have no inputs, so fall back to the nightly multiplier.
	    TF_VAR_scale_multiplier: ${{ inputs.scale_multiplier || '25' }}
	    # Selects which scenario the Terraform plan for this matrix entry uses.
	    TF_VAR_scenario: ${{ matrix.scenario }}
	    TF_VAR_cloud_provider: aws

	  defaults:
	    run:
	      working-directory: scale-test

	steps:
	- name: Checkout repository
	  uses: actions/checkout@v6

	# Sets the `skip` output that every later step checks via
	# `steps.filter.outputs.skip`. A filter of "all" never skips; any other
	# filter skips entries whose category differs from it.
	- name: Check test filter
	  id: filter
	  working-directory: ${{ github.workspace }}
	  env:
	    # Values reach the script through the environment instead of being
	    # pasted into its text, so their content cannot alter the shell code.
	    FILTER: ${{ inputs.test_filter || 'all' }}
	    CATEGORY: ${{ matrix.category }}
	    TEST_ID: ${{ matrix.test_id }}
	  run: |
	    if [ "$FILTER" != "all" ] && [ "$FILTER" != "$CATEGORY" ]; then
	      echo "skip=true" >> "$GITHUB_OUTPUT"
	      echo "Skipping $TEST_ID (category=$CATEGORY, filter=$FILTER)"
	    else
	      echo "skip=false" >> "$GITHUB_OUTPUT"
	    fi

	# Fails the job with an explicit error annotation when no *.md knowledge
	# files exist under .overmind/knowledge/; otherwise lists and counts them.
	- name: Verify knowledge files present
	  if: steps.filter.outputs.skip != 'true'
	  working-directory: ${{ github.workspace }}
	  run: |
	    # Expand the glob into an array with nullglob so "no matches" yields an
	    # empty array. Running `ls` on an unmatched glob exits non-zero, which
	    # under the runner's default `bash -e` would end the step before the
	    # error annotation below could be printed.
	    shopt -s nullglob
	    KNOWLEDGE_FILES=(.overmind/knowledge/*.md)
	    FILE_COUNT=${#KNOWLEDGE_FILES[@]}
	    if [ "$FILE_COUNT" -eq 0 ]; then
	      echo "::error::No knowledge files found in .overmind/knowledge/"
	      exit 1
	    fi
	    echo "Knowledge files in .overmind/knowledge/:"
	    ls -la "${KNOWLEDGE_FILES[@]}"
	    echo ""
	    echo "Total: $FILE_COUNT knowledge files"

	# Toolchain and credential setup. Every step is gated on the filter result
	# so that filtered-out matrix entries do no work.
	- name: Setup Terraform
	  uses: hashicorp/setup-terraform@v4
	  if: ${{ steps.filter.outputs.skip != 'true' }}
	  with:
	    terraform_wrapper: false

	- name: Configure AWS Credentials
	  uses: aws-actions/configure-aws-credentials@v6
	  if: ${{ steps.filter.outputs.skip != 'true' }}
	  with:
	    role-to-assume: ${{ vars.TERRAFORM_DEPLOY_ROLE }}
	    aws-region: us-east-1
	    audience: sts.amazonaws.com

	- name: Configure GCP Credentials
	  uses: google-github-actions/auth@v3
	  if: ${{ steps.filter.outputs.skip != 'true' }}
	  with:
	    credentials_json: ${{ secrets.OVERMIND_SCALE_TEST }}

	- name: Setup GCP SDK
	  uses: google-github-actions/setup-gcloud@v3
	  if: ${{ steps.filter.outputs.skip != 'true' }}
	  with:
	    project_id: overmind-scale-test

	# Runs in the job's default working directory (scale-test).
	- name: Terraform Init
	  if: ${{ steps.filter.outputs.skip != 'true' }}
	  run: terraform init -input=false

	# =====================================================================
	# Terraform Plan
	# Plans the matrix scenario (selected through TF_VAR_scenario in the job
	# env) and exports the plan as tfplan.json for submission to Overmind.
	# =====================================================================
	- name: Terraform Plan
	  id: plan
	  if: steps.filter.outputs.skip != 'true'
	  env:
	    # Free-form matrix text is handed over via the environment so quotes,
	    # `$` or backticks in it cannot break or alter the script.
	    SCENARIO: ${{ matrix.scenario }}
	    EXPECTED_EFFECT: ${{ matrix.expected_effect }}
	  run: |
	    echo "Planning scenario: $SCENARIO"
	    echo "Knowledge files are in .overmind/knowledge/ (always present)"
	    echo "Expected: $EXPECTED_EFFECT"
	    terraform plan \
	      -compact-warnings \
	      -no-color \
	      -input=false \
	      -lock-timeout=5m \
	      -parallelism=100 \
	      -out=tfplan

	    # JSON rendering of the saved plan; this is the file that gets submitted.
	    terraform show -json tfplan > tfplan.json

	# =====================================================================
	# Submit to Overmind (CLI picks up .overmind/knowledge/ automatically)
	# =====================================================================
	# A failed install does not stop the job here; the following step checks
	# whether an `overmind` binary is actually available.
	- name: Install Overmind CLI
	  uses: overmindtech/actions/install-cli@main
	  if: ${{ steps.filter.outputs.skip != 'true' }}
	  continue-on-error: true
	  with:
	    github-token: ${{ secrets.GITHUB_TOKEN }}
	    version: latest

	# Makes sure `overmind` is callable: uses it if already on PATH, otherwise
	# falls back to a binary at $GITHUB_WORKSPACE/overmindtech/overmind and adds
	# that directory to PATH for later steps. Fails if neither exists.
	- name: Add Overmind CLI to PATH
	  if: steps.filter.outputs.skip != 'true'
	  working-directory: ${{ github.workspace }}
	  run: |
	    if command -v overmind &> /dev/null; then
	      overmind --version
	    elif [ -f "$GITHUB_WORKSPACE/overmindtech/overmind" ]; then
	      chmod +x "$GITHUB_WORKSPACE/overmindtech/overmind"
	      echo "$GITHUB_WORKSPACE/overmindtech" >> "$GITHUB_PATH"
	      "$GITHUB_WORKSPACE/overmindtech/overmind" --version
	    else
	      echo "::error::Overmind CLI not found"
	      exit 1
	    fi

	# Millisecond timestamp taken just before submission; the results step
	# subtracts it from its own timestamp to report the analysis duration.
	- name: Record start time
	  if: steps.filter.outputs.skip != 'true'
	  id: start-time
	  run: echo "start_time=$(date +%s%3N)" >> "$GITHUB_OUTPUT"

	# Submits tfplan.json to Overmind, publishes the resulting change URL as the
	# `change-url` step output, then fetches the change as markdown.
	# continue-on-error keeps the job going so later steps can decide what to do
	# based on whether `change-url` was set.
	- name: Submit Plan to Overmind
	  id: submit-plan
	  if: steps.filter.outputs.skip != 'true'
	  continue-on-error: true
	  env:
	    OVM_API_KEY: ${{ secrets.OVM_API_KEY }}
	    # Matrix values are passed through the environment rather than pasted
	    # into the script text.
	    TEST_ID: ${{ matrix.test_id }}
	    CATEGORY: ${{ matrix.category }}
	    SCENARIO: ${{ matrix.scenario }}
	  run: |
	    # The default step shell is `bash -e` without pipefail, so a pipeline
	    # reports only the status of `tee`. Enable pipefail to see the CLI's
	    # own exit status.
	    set -o pipefail

	    TICKET_LINK="${GITHUB_SERVER_URL}/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID}?test=${TEST_ID}"

	    SUBMIT_STATUS=0
	    overmind changes submit-plan \
	      --title "Knowledge Test - ${TEST_ID}" \
	      --description "Knowledge test: ${CATEGORY} | scenario=${SCENARIO}" \
	      --ticket-link "$TICKET_LINK" \
	      --tags "model=risks_v6,test_type=knowledge,category=${CATEGORY},scenario=${SCENARIO}" \
	      tfplan.json 2>&1 | tee /tmp/submit-output.txt || SUBMIT_STATUS=$?

	    # Report a failing CLI exit, but still look for a change URL: the
	    # URL check below remains the deciding condition, as before.
	    if [ "$SUBMIT_STATUS" -ne 0 ]; then
	      echo "::warning::overmind submit-plan exited with status $SUBMIT_STATUS"
	    fi

	    # `|| true`: with pipefail on, a grep without matches must not abort
	    # the script before the explicit error message below.
	    CHANGE_URL=$(grep -oE 'https://app\.overmind\.tech/changes/[a-f0-9-]+' /tmp/submit-output.txt | head -1 || true)

	    if [ -z "$CHANGE_URL" ]; then
	      echo "::error::Could not extract change URL"
	      cat /tmp/submit-output.txt
	      exit 1
	    fi

	    echo "change-url=$CHANGE_URL" >> "$GITHUB_OUTPUT"
	    echo "Submitted: $CHANGE_URL"

	    echo "Waiting for analysis to complete..."
	    overmind changes get-change \
	      --change "$CHANGE_URL" \
	      --format markdown \
	      > /tmp/change-summary.md

	# =====================================================================
	# Collect Results
	# Fetches the change as JSON, derives counters from it with jq, publishes
	# them as step outputs, stores the risk/hypothesis lists under /tmp for the
	# dashboard step, and writes a short job summary.
	# Runs only when the submit step produced a change URL.
	# =====================================================================
	- name: Get change results
	  id: get-results
	  if: steps.submit-plan.outputs.change-url != ''
	  continue-on-error: true
	  env:
	    OVM_API_KEY: ${{ secrets.OVM_API_KEY }}
	    # Expression values are passed through the environment rather than
	    # pasted into the script text.
	    CHANGE_URL: ${{ steps.submit-plan.outputs.change-url }}
	    START_TIME: ${{ steps.start-time.outputs.start_time }}
	    TEST_ID: ${{ matrix.test_id }}
	    CATEGORY: ${{ matrix.category }}
	    SCENARIO: ${{ matrix.scenario }}
	    EXPECTED_EFFECT: ${{ matrix.expected_effect }}
	    RELEVANT_KNOWLEDGE: ${{ matrix.relevant_knowledge }}
	  run: |
	    overmind changes get-change \
	      --change "$CHANGE_URL" \
	      --format json \
	      > change-results.json

	    # Duration spans from the "Record start time" step to this point (ms).
	    END_TIME=$(date +%s%3N)
	    DURATION_MS=$((END_TIME - START_TIME))
	    echo "overmind_duration_ms=$DURATION_MS" >> "$GITHUB_OUTPUT"

	    # `(x // []) | length` applies the default to the list itself, so a
	    # missing or null field counts as zero.
	    RISK_COUNT=$(jq '(.risks // []) | length' change-results.json)
	    HIGH_RISK_COUNT=$(jq '[.risks[]? | select(.severity == "high")] | length' change-results.json)
	    MEDIUM_RISK_COUNT=$(jq '[.risks[]? | select(.severity == "medium")] | length' change-results.json)
	    LOW_RISK_COUNT=$(jq '[.risks[]? | select(.severity == "low")] | length' change-results.json)
	    BLAST_RADIUS_NODES=$(jq '.change.metadata.numAffectedItems // 0' change-results.json)
	    BLAST_RADIUS_EDGES=$(jq '.change.metadata.numAffectedEdges // 0' change-results.json)
	    OBSERVATIONS=$(jq '.change.metadata.total_observations // 0' change-results.json)
	    HYPOTHESES=$(jq '(.hypotheses // []) | length' change-results.json)

	    # Files read back by the "Send results to dashboard" step.
	    jq -c '[.risks[]? | {title: .title, severity: .severity, description: .description}]' change-results.json > /tmp/risks.json
	    jq -c '.risks // []' change-results.json > /tmp/risks-full.json
	    jq -c '.hypotheses // []' change-results.json > /tmp/hypotheses-full.json

	    {
	      echo "risk_count=$RISK_COUNT"
	      echo "high_risk_count=$HIGH_RISK_COUNT"
	      echo "medium_risk_count=$MEDIUM_RISK_COUNT"
	      echo "low_risk_count=$LOW_RISK_COUNT"
	      echo "blast_radius_nodes=$BLAST_RADIUS_NODES"
	      echo "blast_radius_edges=$BLAST_RADIUS_EDGES"
	      echo "observations=$OBSERVATIONS"
	      echo "hypotheses=$HYPOTHESES"
	    } >> "$GITHUB_OUTPUT"

	    # Step summary
	    {
	      echo "## $TEST_ID"
	      echo "Category: $CATEGORY | Scenario: $SCENARIO"
	      echo ""
	      echo "- Risks: $RISK_COUNT (high=$HIGH_RISK_COUNT, med=$MEDIUM_RISK_COUNT, low=$LOW_RISK_COUNT)"
	      echo "- Blast Radius: $BLAST_RADIUS_NODES nodes, $BLAST_RADIUS_EDGES edges"
	      echo "- Duration: $((DURATION_MS / 1000))s"
	      echo ""
	      echo "Expected effect: $EXPECTED_EFFECT"
	      echo ""
	      echo "Relevant knowledge: $RELEVANT_KNOWLEDGE"
	      echo ""
	      if [ "$RISK_COUNT" -gt 0 ]; then
	        echo "### Detected Risks"
	        jq -r '.risks[]? | "- [\(.severity)] \(.title)"' change-results.json
	      else
	        echo "_No risks detected_"
	      fi
	    } >> "$GITHUB_STEP_SUMMARY"

	# =====================================================================
	# Send to Dashboard
	# Includes knowledge metadata for LLM-based evaluation on dashboard side.
	# Builds one JSON payload from the get-results outputs and the /tmp files
	# written by that step, then POSTs it to the dashboard. Skips quietly when
	# the dashboard secrets are not configured.
	# =====================================================================
	- name: Send results to dashboard
	  if: always() && steps.get-results.outcome == 'success'
	  continue-on-error: true
	  env:
	    DASHBOARD_URL: ${{ secrets.SCALE_DASHBOARD_URL }}
	    DASHBOARD_API_KEY: ${{ secrets.SCALE_DASHBOARD_API_KEY }}
	    # Expression values are passed through the environment rather than
	    # pasted into the script text.
	    TEST_ID: ${{ matrix.test_id }}
	    SCENARIO: ${{ matrix.scenario }}
	    CATEGORY: ${{ matrix.category }}
	    EXPECTED_EFFECT: ${{ matrix.expected_effect }}
	    RELEVANT_KNOWLEDGE: ${{ matrix.relevant_knowledge }}
	    SCALE_MULTIPLIER: ${{ inputs.scale_multiplier || '25' }}
	    # Numeric outputs default to 0 when get-results did not set them.
	    OVERMIND_DURATION_MS: ${{ steps.get-results.outputs.overmind_duration_ms || 0 }}
	    RISK_COUNT: ${{ steps.get-results.outputs.risk_count || 0 }}
	    HIGH_RISK_COUNT: ${{ steps.get-results.outputs.high_risk_count || 0 }}
	    MEDIUM_RISK_COUNT: ${{ steps.get-results.outputs.medium_risk_count || 0 }}
	    LOW_RISK_COUNT: ${{ steps.get-results.outputs.low_risk_count || 0 }}
	    BLAST_RADIUS_NODES: ${{ steps.get-results.outputs.blast_radius_nodes || 0 }}
	    BLAST_RADIUS_EDGES: ${{ steps.get-results.outputs.blast_radius_edges || 0 }}
	    OBSERVATIONS: ${{ steps.get-results.outputs.observations || 0 }}
	    HYPOTHESES: ${{ steps.get-results.outputs.hypotheses || 0 }}
	  run: |
	    if [ -z "$DASHBOARD_URL" ] || [ -z "$DASHBOARD_API_KEY" ]; then
	      echo "Dashboard not configured, skipping..."
	      exit 0
	    fi

	    # Fall back to empty lists if a file from get-results is missing.
	    RISKS_JSON=$(cat /tmp/risks.json 2>/dev/null || echo '[]')
	    RISKS_FULL=$(cat /tmp/risks-full.json 2>/dev/null || echo '[]')
	    HYPOTHESES_FULL=$(cat /tmp/hypotheses-full.json 2>/dev/null || echo '[]')

	    RUN_URL="${GITHUB_SERVER_URL}/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID}"

	    # --arg passes strings, --argjson passes numbers/arrays as JSON values.
	    jq -n \
	      --arg runId "${GITHUB_RUN_ID}-${TEST_ID}" \
	      --arg testId "$TEST_ID" \
	      --arg testType "knowledge" \
	      --arg scenario "$SCENARIO" \
	      --arg category "$CATEGORY" \
	      --arg expectedEffect "$EXPECTED_EFFECT" \
	      --arg relevantKnowledge "$RELEVANT_KNOWLEDGE" \
	      --arg cloudProvider "aws" \
	      --argjson scaleMultiplier "$SCALE_MULTIPLIER" \
	      --argjson overmindDurationMs "$OVERMIND_DURATION_MS" \
	      --argjson riskCount "$RISK_COUNT" \
	      --argjson highRiskCount "$HIGH_RISK_COUNT" \
	      --argjson mediumRiskCount "$MEDIUM_RISK_COUNT" \
	      --argjson lowRiskCount "$LOW_RISK_COUNT" \
	      --argjson blastRadiusNodes "$BLAST_RADIUS_NODES" \
	      --argjson blastRadiusEdges "$BLAST_RADIUS_EDGES" \
	      --argjson observations "$OBSERVATIONS" \
	      --argjson hypotheses "$HYPOTHESES" \
	      --argjson risks "$RISKS_JSON" \
	      --argjson risksFull "$RISKS_FULL" \
	      --argjson hypothesesFull "$HYPOTHESES_FULL" \
	      --arg workflowRunUrl "$RUN_URL" \
	      '{
	        runId: $runId,
	        testId: $testId,
	        testType: $testType,
	        scenario: $scenario,
	        category: $category,
	        expectedEffect: $expectedEffect,
	        relevantKnowledge: $relevantKnowledge,
	        cloudProvider: $cloudProvider,
	        scaleMultiplier: $scaleMultiplier,
	        overmindDurationMs: $overmindDurationMs,
	        riskCount: $riskCount,
	        highRiskCount: $highRiskCount,
	        mediumRiskCount: $mediumRiskCount,
	        lowRiskCount: $lowRiskCount,
	        risks: $risks,
	        risksFull: $risksFull,
	        hypothesesFull: $hypothesesFull,
	        blastRadiusNodes: $blastRadiusNodes,
	        blastRadiusEdges: $blastRadiusEdges,
	        observations: $observations,
	        hypotheses: $hypotheses,
	        workflowRunUrl: $workflowRunUrl
	      }' > /tmp/payload.json

	    # --fail turns an HTTP 4xx/5xx answer into a non-zero exit so a rejected
	    # upload shows up as a failed step; --show-error prints the reason even
	    # though --silent hides the progress meter.
	    curl --silent --show-error --fail -X POST "$DASHBOARD_URL/api/knowledge-results" \
	      -H "Authorization: Bearer $DASHBOARD_API_KEY" \
	      -H "Content-Type: application/json" \
	      -d @/tmp/payload.json

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Knowledge Test #119

Workflow file

Knowledge Test #119

Uh oh!

Workflow file for this run