Skip to content

Commit 9552584

Browse files
authored
feat(perfcheck): harden v2 weekly performance harness (#487)
Signed-off-by: Roel de Cort <roel.decort@adfinis.com>
1 parent a695068 commit 9552584

45 files changed

Lines changed: 7666 additions & 1420 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.github/workflows/perf-baseline-capture.yml

Lines changed: 132 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,18 @@ name: Performance Baseline Capture
33
on:
44
workflow_dispatch:
55
inputs:
6+
scenarios:
7+
description: Comma-separated scenarios to capture, or "all".
8+
required: false
9+
default: lifecycle-convergence,tenant-churn
610
runs:
711
description: Number of runs per scenario (baseline capture).
812
required: false
913
default: "5"
14+
warmups:
15+
description: Number of warmup runs per scenario.
16+
required: false
17+
default: "1"
1018
scenario_timeout:
1119
description: Timeout per scenario (Go duration, e.g. 90m).
1220
required: false
@@ -25,10 +33,66 @@ concurrency:
2533
cancel-in-progress: true
2634

2735
jobs:
36+
plan:
37+
name: Plan baseline capture
38+
runs-on: ubuntu-24.04
39+
outputs:
40+
scenarios: ${{ steps.scenarios.outputs.scenarios }}
41+
steps:
42+
- name: Resolve selected scenarios
43+
id: scenarios
44+
env:
45+
INPUT_SCENARIOS: ${{ inputs.scenarios }}
46+
run: |
47+
python3 - <<'PY' >> "${GITHUB_OUTPUT}"
"""Resolve the workflow's scenario selection into a JSON list.

Reads the comma-separated selection from the INPUT_SCENARIOS environment
variable and prints one ``scenarios=<json array>`` line on stdout. On an
unknown or empty selection it prints a diagnostic on stderr and exits with
status 1 without printing anything on stdout.
"""
import json
import os
import sys

# Accepted scenario names; this is also the order used when "all" is selected.
AVAILABLE_SCENARIOS = [
    "lifecycle-convergence",
    "tenant-churn",
    "backup",
    "restore",
    "rolling-upgrade",
]


def resolve_scenarios(raw, available=AVAILABLE_SCENARIOS):
    """Return the ordered, de-duplicated scenario names selected by ``raw``.

    Args:
        raw: Comma-separated scenario names. An empty or whitespace-only
            string, or the word "all", selects every available scenario.
        available: Accepted scenario names, in the order used for "all".

    Returns:
        A new list of scenario names in first-seen order.

    Raises:
        ValueError: If a name is not in ``available``, or if nothing is
            selected (for example when ``raw`` consists only of commas).
    """
    raw = raw.strip()
    if not raw or raw == "all":
        # Copy so callers can never mutate the shared list of known names.
        return list(available)

    selected = []
    for part in raw.split(","):
        name = part.strip()
        if not name:
            # Tolerate stray commas such as "backup,,restore" or "backup,".
            continue
        if name == "all":
            # "all" anywhere in the list overrides the names collected so far.
            return list(available)
        if name not in available:
            raise ValueError(
                f"Unknown scenario {name!r}; available: {', '.join(available)}"
            )
        if name not in selected:
            selected.append(name)

    if not selected:
        raise ValueError("No scenarios selected")
    return selected


def main():
    """Print the ``scenarios=...`` line, or report the error and exit 1."""
    try:
        selected = resolve_scenarios(os.environ.get("INPUT_SCENARIOS", ""))
    except ValueError as err:
        print(err, file=sys.stderr)
        sys.exit(1)
    print(f"scenarios={json.dumps(selected)}")


# A script fed to "python3 -" on stdin runs as __main__, so this still executes.
if __name__ == "__main__":
    main()
85+
PY
86+
2887
perf-baseline:
29-
name: Capture Performance Baseline (kind)
88+
name: Capture ${{ matrix.scenario }} (kind)
89+
needs: plan
3090
runs-on: ubuntu-24.04
31-
timeout-minutes: 420
91+
timeout-minutes: 180
92+
strategy:
93+
fail-fast: false
94+
matrix:
95+
scenario: ${{ fromJson(needs.plan.outputs.scenarios) }}
3296
steps:
3397
- name: Checkout
3498
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
@@ -52,31 +116,86 @@ jobs:
52116
with:
53117
install-dir: bin/
54118

55-
- name: Capture baseline and regenerate thresholds
119+
- name: Capture v2 baseline samples
56120
env:
57121
GOFLAGS: -mod=vendor
58-
PERF_RUNS: ${{ inputs.runs }}
122+
PERF_SAMPLES: ${{ inputs.runs }}
123+
PERF_WARMUPS: ${{ inputs.warmups }}
124+
PERF_SCENARIOS: ${{ matrix.scenario }}
59125
PERF_NODE_IMAGE: ${{ inputs.node_image }}
60126
PERF_SCENARIO_TIMEOUT: ${{ inputs.scenario_timeout }}
61-
PERF_BASELINE_OUT: hack/perf/baseline/kind-v1.34.3-baseline.json
62-
PERF_THRESHOLDS_OUT: hack/perf/thresholds/kind-v1.34.3.yaml
127+
PERF_BASELINE_DIR: hack/perf/v2/baselines
128+
PERF_POLICY_FILE: hack/perf/v2/policies/weekly.yaml
129+
PERF_ARTIFACT_DIR: dist/perf
63130
run: make perf-baseline
64131

65-
- name: Verify captured thresholds
132+
- name: Render captured baseline report
133+
if: always()
66134
env:
67135
GOFLAGS: -mod=vendor
136+
PERF_SCENARIOS: ${{ matrix.scenario }}
68137
PERF_NODE_IMAGE: ${{ inputs.node_image }}
69138
PERF_SCENARIO_TIMEOUT: ${{ inputs.scenario_timeout }}
70-
PERF_THRESHOLDS_OUT: hack/perf/thresholds/kind-v1.34.3.yaml
71-
run: make verify-perf
139+
PERF_BASELINE_DIR: hack/perf/v2/baselines
140+
PERF_POLICY_FILE: hack/perf/v2/policies/weekly.yaml
141+
PERF_ARTIFACT_DIR: dist/perf
142+
run: make perf-v2-report
143+
144+
- name: Preserve scenario report
145+
if: always()
146+
run: |
147+
mkdir -p "dist/perf/reports/${{ matrix.scenario }}"
148+
if [[ -f dist/perf/summary.json ]]; then
149+
cp dist/perf/summary.json "dist/perf/reports/${{ matrix.scenario }}/summary.json"
150+
fi
151+
if [[ -f dist/perf/report.md ]]; then
152+
cp dist/perf/report.md "dist/perf/reports/${{ matrix.scenario }}/report.md"
153+
fi
72154
73-
- name: Upload baseline artifacts
155+
- name: Upload scenario baseline artifacts
74156
if: always()
75157
uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
76158
with:
77-
name: perf-baseline-kind-v1.34.3-${{ github.run_id }}-${{ github.run_attempt }}
159+
name: perf-baseline-kind-v1.34.3-${{ matrix.scenario }}-${{ github.run_id }}-${{ github.run_attempt }}
78160
path: |
79-
hack/perf/baseline/kind-v1.34.3-baseline.json
80-
hack/perf/thresholds/kind-v1.34.3.yaml
161+
hack/perf/v2/baselines/${{ matrix.scenario }}/
162+
dist/perf/scenarios/${{ matrix.scenario }}/
163+
dist/perf/kubeconfigs/
164+
dist/perf/reports/${{ matrix.scenario }}/
165+
if-no-files-found: warn
166+
retention-days: 14
167+
168+
combine-artifacts:
169+
name: Combine baseline artifacts
170+
if: always()
171+
needs: perf-baseline
172+
runs-on: ubuntu-24.04
173+
steps:
174+
- name: Download scenario artifacts
175+
continue-on-error: true
176+
uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1
177+
with:
178+
pattern: perf-baseline-kind-v1.34.3-*-${{ github.run_id }}-${{ github.run_attempt }}
179+
path: combined
180+
merge-multiple: true
181+
# Append a Markdown list of the captured baseline JSON files to the job summary.
- name: Summarize captured baselines
  run: |
    {
      echo "## Captured baselines"
      echo
      baselines=""
      if [[ -d combined/hack/perf/v2/baselines ]]; then
        baselines="$(find combined/hack/perf/v2/baselines -type f -name '*.json' | sort | sed 's#^combined/#- #')"
      fi
      # The directory can exist without any JSON in it; report that explicitly
      # instead of leaving an empty list under the heading.
      if [[ -n "${baselines}" ]]; then
        printf '%s\n' "${baselines}"
      else
        echo "- No baseline JSON files were captured."
      fi
    } >> "${GITHUB_STEP_SUMMARY}"
193+
194+
- name: Upload combined baseline artifacts
195+
if: always()
196+
uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
197+
with:
198+
name: perf-baseline-kind-v1.34.3-${{ github.run_id }}-${{ github.run_attempt }}
199+
path: combined/
81200
if-no-files-found: warn
82201
retention-days: 14

.github/workflows/perf-weekly.yml

Lines changed: 154 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ on:
66
workflow_dispatch:
77

88
permissions:
9+
actions: read
910
contents: read
1011
issues: write
1112

@@ -15,9 +16,18 @@ concurrency:
1516

1617
jobs:
1718
verify-perf:
18-
name: Verify Performance Thresholds (kind v1.34.3)
19+
name: Verify ${{ matrix.scenario }} (kind v1.34.3)
1920
runs-on: ubuntu-24.04
2021
timeout-minutes: 180
22+
strategy:
23+
fail-fast: false
24+
matrix:
25+
scenario:
26+
- lifecycle-convergence
27+
- tenant-churn
28+
- backup
29+
- restore
30+
- rolling-upgrade
2131
steps:
2232
- name: Checkout
2333
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
@@ -40,37 +50,172 @@ jobs:
4050
with:
4151
install-dir: bin/
4252

43-
- name: Verify full performance thresholds
53+
- name: Verify v2 performance baseline
4454
env:
4555
GOFLAGS: -mod=vendor
56+
PERF_SCENARIOS: ${{ matrix.scenario }}
4657
PERF_NODE_IMAGE: kindest/node:v1.34.3
4758
PERF_SCENARIO_TIMEOUT: 90m
48-
PERF_THRESHOLDS_OUT: hack/perf/thresholds/kind-v1.34.3.yaml
59+
PERF_BASELINE_DIR: hack/perf/v2/baselines
60+
PERF_POLICY_FILE: hack/perf/v2/policies/weekly.yaml
61+
PERF_ARTIFACT_DIR: dist/perf
62+
PERF_TENANT_CHURN_COUNT: 10
4963
run: make verify-perf
5064

65+
- name: Upload scenario performance artifacts
66+
if: always()
67+
uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
68+
with:
69+
name: perf-weekly-kind-v1.34.3-${{ matrix.scenario }}-${{ github.run_id }}-${{ github.run_attempt }}
70+
path: dist/perf/
71+
if-no-files-found: warn
72+
retention-days: 14
73+
74+
summarize-perf:
75+
name: Summarize Weekly Performance
76+
runs-on: ubuntu-24.04
77+
needs: verify-perf
78+
if: ${{ always() }}
79+
steps:
80+
- name: Checkout
81+
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
82+
83+
- name: Setup Go
84+
uses: actions/setup-go@4a3601121dd01d1626a1e23e37211e3254c1c06c # v6.4.0
85+
with:
86+
go-version-file: go.mod
87+
check-latest: true
88+
cache: true
89+
90+
- name: Download scenario artifacts
91+
uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1
92+
with:
93+
pattern: perf-weekly-kind-v1.34.3-*-${{ github.run_id }}-${{ github.run_attempt }}
94+
path: dist/perf
95+
merge-multiple: true
96+
97+
- name: Download previous weekly summary
98+
id: previous-summary
99+
if: ${{ github.event_name == 'schedule' }}
100+
env:
101+
GH_TOKEN: ${{ github.token }}
102+
run: |
103+
set -euo pipefail
104+
mkdir -p dist/previous-perf
105+
found=""
106+
mapfile -t run_ids < <(
107+
gh run list \
108+
--workflow "perf-weekly.yml" \
109+
--branch "${GITHUB_REF_NAME}" \
110+
--event schedule \
111+
--status completed \
112+
--json databaseId \
113+
--jq '.[].databaseId' \
114+
--limit 20
115+
)
116+
for run_id in "${run_ids[@]}"; do
117+
if [[ "${run_id}" == "${GITHUB_RUN_ID}" ]]; then
118+
continue
119+
fi
120+
rm -rf dist/previous-perf/*
121+
if gh run download "${run_id}" \
122+
--pattern "perf-weekly-summary-kind-v1.34.3-${run_id}-*" \
123+
--dir dist/previous-perf >/dev/null 2>&1; then
124+
candidate="$(find dist/previous-perf -name summary.json -type f | sort | head -n 1 || true)"
125+
if [[ -n "${candidate}" ]]; then
126+
found="${candidate}"
127+
break
128+
fi
129+
fi
130+
done
131+
echo "summary=${found}" >> "${GITHUB_OUTPUT}"
132+
133+
- name: Render merged v2 report
134+
env:
135+
GOFLAGS: -mod=vendor
136+
PERF_SCENARIOS: all
137+
PERF_BASELINE_DIR: hack/perf/v2/baselines
138+
PERF_POLICY_FILE: hack/perf/v2/policies/weekly.yaml
139+
PERF_ARTIFACT_DIR: dist/perf
140+
PERF_PREVIOUS_SUMMARY: ${{ steps.previous-summary.outputs.summary }}
141+
PERF_REPORT_FAIL_ON_FAILURES: true
142+
run: make perf-v2-report
143+
144+
- name: Upload merged performance report
145+
if: always()
146+
uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
147+
with:
148+
name: perf-weekly-summary-kind-v1.34.3-${{ github.run_id }}-${{ github.run_attempt }}
149+
path: dist/perf/
150+
if-no-files-found: warn
151+
retention-days: 14
152+
51153
open-regression-issue:
52154
name: Open Weekly Regression Issue
53155
runs-on: ubuntu-24.04
54-
needs: verify-perf
55-
# `always()` is required so this job still runs after the failed `needs` job.
56-
if: ${{ always() && github.event_name == 'schedule' && needs.verify-perf.result == 'failure' }}
156+
needs:
157+
- verify-perf
158+
- summarize-perf
159+
# `always()` is required so this job still runs after the failed matrix job.
160+
if: ${{ always() && github.event_name == 'schedule' && (needs.verify-perf.result == 'failure' || needs.summarize-perf.result == 'failure') }}
57161
steps:
# Best effort: the merged summary artifact may be missing, and a download by
# name fails in that case. Keep the job going so the issue is still opened; the
# script below falls back to a generic message when dist/perf/summary.json
# does not exist.
- name: Download merged performance report
  continue-on-error: true
  uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1
  with:
    name: perf-weekly-summary-kind-v1.34.3-${{ github.run_id }}-${{ github.run_attempt }}
    path: dist/perf
167+
58168
- name: Open or update issue
59169
uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v8.0.1
60170
with:
61171
script: |
172+
const fs = require("fs");
62173
const owner = context.repo.owner;
63174
const repo = context.repo.repo;
64-
const title = "Weekly performance regression detected";
175+
const title = "Weekly performance finding detected";
65176
const runUrl = `${context.serverUrl}/${owner}/${repo}/actions/runs/${context.runId}`;
177+
const summaryPath = "dist/perf/summary.json";
178+
const artifactName = `perf-weekly-summary-kind-v1.34.3-${context.runId}-${process.env.GITHUB_RUN_ATTEMPT}`;
179+
180+
let statusLines = ["- Summary artifact unavailable; inspect workflow logs and scenario artifacts."];
181+
let findingLines = ["- Summary artifact unavailable; inspect workflow logs and scenario artifacts."];
182+
183+
if (fs.existsSync(summaryPath)) {
184+
const summary = JSON.parse(fs.readFileSync(summaryPath, "utf8"));
185+
statusLines = Object.entries(summary.scenarios || {})
186+
.sort(([left], [right]) => left.localeCompare(right))
187+
.map(([scenario, data]) => `- \`${scenario}\`: \`${data.status}\` (${data.samples} samples)`);
188+
189+
const failures = [];
190+
for (const [scenario, data] of Object.entries(summary.scenarios || {})) {
191+
for (const finding of data.findings || []) {
192+
if (finding.severity !== "fail") {
193+
continue;
194+
}
195+
const measurement = finding.measurement ? ` \`${finding.measurement}\`` : "";
196+
failures.push(`- \`${scenario}\`${measurement}: ${finding.message}`);
197+
}
198+
}
199+
findingLines = failures.length > 0
200+
? failures.sort()
201+
: ["- No fail-severity finding was present in the merged summary."];
202+
}
203+
66204
const body = [
67-
"The weekly `verify-perf` run detected a regression.",
205+
"The weekly v2 performance matrix produced a failing finding.",
68206
"",
69207
`- Workflow run: ${runUrl}`,
70208
`- Commit: ${context.sha}`,
71209
`- Trigger time (UTC): ${new Date().toISOString()}`,
210+
`- Merged artifact: \`${artifactName}\``,
211+
"",
212+
"## Failing Findings",
213+
...findingLines,
214+
"",
215+
"## Scenario Status",
216+
...statusLines,
72217
"",
73-
"Please inspect metrics and update baseline/thresholds only with justification.",
218+
"Inspect the uploaded perfcheck v2 artifacts before updating baselines.",
74219
].join("\n");
75220
76221
const openIssues = await github.paginate(github.rest.issues.listForRepo, {

0 commit comments

Comments
 (0)