Skip to content

Commit 6975e81

Browse files
authored
test(perfcheck): harden baseline capture artifacts (#491)
Signed-off-by: Roel de Cort <roel.decort@adfinis.com>
1 parent bc24feb commit 6975e81

9 files changed

Lines changed: 177 additions & 46 deletions

File tree

.github/workflows/perf-baseline-capture.yml

Lines changed: 13 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -23,6 +23,14 @@ on:
2323
description: Kind node image used for capture and verification.
2424
required: false
2525
default: kindest/node:v1.34.3
26+
continue_on_sample_error:
27+
description: Continue collecting remaining samples after a failed setup or scenario sample.
28+
required: false
29+
default: "true"
30+
minimum_successful_samples:
31+
description: Minimum passing measured samples required before writing a scenario baseline.
32+
required: false
33+
default: "3"
2634

2735
permissions:
2836
contents: read
@@ -116,6 +124,9 @@ jobs:
116124
with:
117125
install-dir: bin/
118126

127+
- name: Remove checked-in scenario baseline
128+
run: rm -rf "hack/perf/v2/baselines/${{ matrix.scenario }}"
129+
119130
- name: Capture v2 baseline samples
120131
env:
121132
GOFLAGS: -mod=vendor
@@ -127,6 +138,8 @@ jobs:
127138
PERF_BASELINE_DIR: hack/perf/v2/baselines
128139
PERF_POLICY_FILE: hack/perf/v2/policies/weekly.yaml
129140
PERF_ARTIFACT_DIR: dist/perf
141+
PERF_CONTINUE_ON_SAMPLE_ERROR: ${{ inputs.continue_on_sample_error }}
142+
PERF_MIN_SUCCESSFUL_SAMPLES: ${{ inputs.minimum_successful_samples }}
130143
run: make perf-baseline
131144

132145
- name: Render captured baseline report

hack/perfcheck/analysis.go

Lines changed: 13 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -604,6 +604,7 @@ func readRunSummary(path string) (runSummaryDocument, error) {
604604
func writeScenarioBaseline(opts options, scenario scenarioSpec, samples []sampleDocument) error {
605605
values := make(map[string][]float64)
606606
var environment runEnvironment
607+
successfulSamples := 0
607608
allowedMeasurements := scenarioMeasurementSet(scenario)
608609
for _, sample := range samples {
609610
if environment == (runEnvironment{}) {
@@ -612,6 +613,7 @@ func writeScenarioBaseline(opts options, scenario scenarioSpec, samples []sample
612613
if sample.Warmup || sample.Status != sampleStatusPass {
613614
continue
614615
}
616+
successfulSamples++
615617
for key, value := range sample.Measurements {
616618
if math.IsNaN(value) || math.IsInf(value, 0) {
617619
continue
@@ -622,6 +624,17 @@ func writeScenarioBaseline(opts options, scenario scenarioSpec, samples []sample
622624
values[key] = append(values[key], value)
623625
}
624626
}
627+
if opts.MinimumSuccessfulSamples > 0 && successfulSamples < opts.MinimumSuccessfulSamples {
628+
return fmt.Errorf(
629+
"scenario %q produced %d passing measured samples, need at least %d",
630+
scenario.Name,
631+
successfulSamples,
632+
opts.MinimumSuccessfulSamples,
633+
)
634+
}
635+
if len(values) == 0 {
636+
return fmt.Errorf("scenario %q produced no baseline measurements", scenario.Name)
637+
}
625638

626639
summary := make(map[string]measurementSummary, len(values))
627640
for key, metricValues := range values {

hack/perfcheck/analysis_test.go

Lines changed: 52 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -21,6 +21,58 @@ func TestSummarizeValues(t *testing.T) {
2121
}
2222
}
2323

24+
func TestWriteScenarioBaselineRequiresMinimumSuccessfulSamples(t *testing.T) {
25+
t.Parallel()
26+
27+
opts := defaultOptions("capture")
28+
opts.BaselineDir = t.TempDir()
29+
opts.MinimumSuccessfulSamples = 2
30+
scenario := scenarioSpec{
31+
Name: "lifecycle-convergence",
32+
Primary: []string{metricClusterAvailableSeconds},
33+
}
34+
samples := []sampleDocument{
35+
{
36+
Version: versionV2,
37+
Scenario: scenario.Name,
38+
Sample: 1,
39+
Status: sampleStatusPass,
40+
Environment: runEnvironment{Commit: "abc123"},
41+
Measurements: map[string]float64{
42+
metricClusterAvailableSeconds: 30,
43+
},
44+
},
45+
{
46+
Version: versionV2,
47+
Scenario: scenario.Name,
48+
Sample: 2,
49+
Status: sampleStatusScenarioError,
50+
Error: "setup failed",
51+
Measurements: map[string]float64{},
52+
},
53+
}
54+
55+
err := writeScenarioBaseline(opts, scenario, samples)
56+
if err == nil {
57+
t.Fatalf("expected minimum successful sample error")
58+
}
59+
if !strings.Contains(err.Error(), "produced 1 passing measured samples, need at least 2") {
60+
t.Fatalf("unexpected error: %v", err)
61+
}
62+
63+
opts.MinimumSuccessfulSamples = 1
64+
if err := writeScenarioBaseline(opts, scenario, samples); err != nil {
65+
t.Fatalf("write baseline with enough samples: %v", err)
66+
}
67+
doc, err := readScenarioBaseline(opts, scenario.Name)
68+
if err != nil {
69+
t.Fatalf("read baseline: %v", err)
70+
}
71+
if got := len(doc.Samples[metricClusterAvailableSeconds]); got != 1 {
72+
t.Fatalf("baseline sample count = %d, want 1", got)
73+
}
74+
}
75+
2476
func TestCompareMeasurementsRequiresAbsoluteAndRelativeRegression(t *testing.T) {
2577
current := map[string]measurementSummary{
2678
metricSampleTotalSeconds: {Median: 125, UpperSample: 125, Min: 125, Max: 125, Count: 3},

hack/perfcheck/main.go

Lines changed: 9 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -173,6 +173,12 @@ func bindExecutionFlags(fs *flag.FlagSet, opts *options) {
173173
false,
174174
"continue running remaining samples after a scenario or measurement error",
175175
)
176+
fs.IntVar(
177+
&opts.MinimumSuccessfulSamples,
178+
"minimum-successful-samples",
179+
0,
180+
"minimum passing measured samples required before writing a scenario baseline; 0 disables the guard",
181+
)
176182
fs.BoolVar(&opts.SkipImageBuild, "skip-image-build", false, "skip image build when supported by the executor")
177183
fs.StringVar(&opts.OperatorImage, "operator-image", opts.OperatorImage, "operator image for native scenarios")
178184
fs.StringVar(&opts.ConfigInitImage, "config-init-image", opts.ConfigInitImage, "config-init image")
@@ -263,6 +269,9 @@ func finalizeOptions(opts options) (options, error) {
263269
if opts.TenantChurnCount < 1 {
264270
return options{}, fmt.Errorf("tenant-churn-count must be >= 1")
265271
}
272+
if opts.MinimumSuccessfulSamples < 0 {
273+
return options{}, fmt.Errorf("minimum-successful-samples must be >= 0")
274+
}
266275
return opts, nil
267276
}
268277

hack/perfcheck/main_test.go

Lines changed: 11 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -50,6 +50,17 @@ func TestFinalizeOptionsRejectsInvalidTenantChurnCount(t *testing.T) {
5050
}
5151
}
5252

53+
func TestFinalizeOptionsRejectsInvalidMinimumSuccessfulSamples(t *testing.T) {
54+
t.Parallel()
55+
56+
opts := defaultOptions("capture")
57+
opts.MinimumSuccessfulSamples = -1
58+
59+
if _, err := finalizeOptions(opts); err == nil {
60+
t.Fatalf("expected minimum-successful-samples validation error")
61+
}
62+
}
63+
5364
func TestDefaultRollingUpgradeSourceUsesPatchUpgrade(t *testing.T) {
5465
t.Setenv("PERF_UPGRADE_FROM_VERSION", "")
5566
t.Setenv("PERF_UPGRADE_FROM_IMAGE", "")

hack/perfcheck/native_setup.go

Lines changed: 15 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -194,15 +194,28 @@ func installCertManagerIfNeeded(ctx context.Context, opts options, cluster strin
194194
}
195195
if _, err := nativeKubectl(ctx, opts, cluster,
196196
"wait",
197+
"--for",
198+
"condition=Established",
199+
"crd/certificates.cert-manager.io",
200+
"crd/issuers.cert-manager.io",
201+
"--timeout",
202+
"2m",
203+
); err != nil {
204+
return fmt.Errorf("wait for cert-manager CRDs: %w", err)
205+
}
206+
if _, err := nativeKubectl(ctx, opts, cluster,
207+
"wait",
208+
"deployment.apps/cert-manager",
209+
"deployment.apps/cert-manager-cainjector",
197210
"deployment.apps/cert-manager-webhook",
198211
"--for",
199212
"condition=Available",
200213
"--namespace",
201214
"cert-manager",
202215
"--timeout",
203-
"5m",
216+
"10m",
204217
); err != nil {
205-
return fmt.Errorf("wait for cert-manager webhook: %w", err)
218+
return fmt.Errorf("wait for cert-manager deployments: %w", err)
206219
}
207220
return nil
208221
}

hack/perfcheck/runner.go

Lines changed: 18 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -47,10 +47,22 @@ func runCapture(opts options) error {
4747
}
4848
}
4949

50+
failedSamples := 0
5051
for _, sample := range samples {
51-
if sample.Status != sampleStatusPass && !sample.Warmup {
52+
if sample.Status == sampleStatusPass || sample.Warmup {
53+
continue
54+
}
55+
failedSamples++
56+
}
57+
if failedSamples > 0 {
58+
if !opts.ContinueOnSampleError {
5259
return fmt.Errorf("capture completed with scenario errors; inspect %s", opts.ArtifactDir)
5360
}
61+
fmt.Fprintf(
62+
os.Stderr,
63+
"warning: capture completed with %d failed measured samples; baselines use passing samples only\n",
64+
failedSamples,
65+
)
5466
}
5567
fmt.Printf("wrote v2 baselines under %s\n", opts.BaselineDir)
5668
fmt.Printf("wrote v2 sample artifacts under %s\n", opts.ArtifactDir)
@@ -189,6 +201,9 @@ func executeScenarioSample(
189201
if prepareErr != nil {
190202
sample.Status = sampleStatusScenarioError
191203
sample.Error = prepareErr.Error()
204+
if err := collectKubernetesArtifacts(opts, scenarioDir, cluster, sample, ""); err != nil {
205+
fmt.Fprintf(os.Stderr, "warning: collecting Kubernetes setup artifacts failed: %v\n", err)
206+
}
192207
return finishAndWriteSample(opts, sample)
193208
}
194209
defer func() {
@@ -1080,6 +1095,8 @@ func collectKubernetesArtifacts(
10801095
args []string
10811096
}{
10821097
{"pods.json", append([]string{"get", "pods"}, scopeArgs...)},
1098+
{"deployments.json", append([]string{"get", "deployments"}, scopeArgs...)},
1099+
{"replicasets.json", append([]string{"get", "replicasets"}, scopeArgs...)},
10831100
{"jobs.json", append([]string{"get", "jobs"}, scopeArgs...)},
10841101
{"events.json", append([]string{"get", "events"}, scopeArgs...)},
10851102
{"statefulsets.json", append([]string{"get", "statefulsets"}, scopeArgs...)},

hack/perfcheck/types.go

Lines changed: 44 additions & 43 deletions
Original file line number | Diff line number | Diff line change
@@ -271,49 +271,50 @@ type analysisFinding struct {
271271
}
272272

273273
type options struct {
274-
Mode string
275-
ScenarioNames []string
276-
ScenarioPath string
277-
PolicyPath string
278-
BaselineDir string
279-
ArtifactDir string
280-
PreviousSummaryPath string
281-
SummaryOut string
282-
ReportOut string
283-
FailOnFailures bool
284-
RunID string
285-
EnvironmentID string
286-
NodeImage string
287-
KindBin string
288-
MakeBin string
289-
ScenarioTimeout time.Duration
290-
ClusterTimeout time.Duration
291-
CleanupTimeout time.Duration
292-
KeepOnFailure bool
293-
ContinueOnSampleError bool
294-
SamplesOverride int
295-
WarmupsOverride int
296-
ExistingClusterContext string
297-
Namespace string
298-
NamespacePrefix string
299-
SkipImageBuild bool
300-
OperatorImage string
301-
ConfigInitImage string
302-
BackupExecutorImage string
303-
UpgradeExecutorImage string
304-
OpenBaoVersion string
305-
OpenBaoImage string
306-
UpgradeFromVersion string
307-
UpgradeFromImage string
308-
UpgradeToVersion string
309-
UpgradeToImage string
310-
APIServerCIDR string
311-
StorageClass string
312-
TenantChurnCount int
313-
OperatorNS string
314-
MetricsService string
315-
ServiceAccount string
316-
BindingName string
274+
Mode string
275+
ScenarioNames []string
276+
ScenarioPath string
277+
PolicyPath string
278+
BaselineDir string
279+
ArtifactDir string
280+
PreviousSummaryPath string
281+
SummaryOut string
282+
ReportOut string
283+
FailOnFailures bool
284+
RunID string
285+
EnvironmentID string
286+
NodeImage string
287+
KindBin string
288+
MakeBin string
289+
ScenarioTimeout time.Duration
290+
ClusterTimeout time.Duration
291+
CleanupTimeout time.Duration
292+
KeepOnFailure bool
293+
ContinueOnSampleError bool
294+
SamplesOverride int
295+
WarmupsOverride int
296+
ExistingClusterContext string
297+
Namespace string
298+
NamespacePrefix string
299+
SkipImageBuild bool
300+
OperatorImage string
301+
ConfigInitImage string
302+
BackupExecutorImage string
303+
UpgradeExecutorImage string
304+
OpenBaoVersion string
305+
OpenBaoImage string
306+
UpgradeFromVersion string
307+
UpgradeFromImage string
308+
UpgradeToVersion string
309+
UpgradeToImage string
310+
APIServerCIDR string
311+
StorageClass string
312+
TenantChurnCount int
313+
MinimumSuccessfulSamples int
314+
OperatorNS string
315+
MetricsService string
316+
ServiceAccount string
317+
BindingName string
317318
}
318319

319320
type metricsSnapshot struct {

mk/development.mk

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -429,6 +429,7 @@ PERF_ENVIRONMENT ?= kind-v1.34.3
429429
PERF_PREVIOUS_SUMMARY ?=
430430
PERF_REPORT_FAIL_ON_FAILURES ?= false
431431
PERF_CONTINUE_ON_SAMPLE_ERROR ?= false
432+
PERF_MIN_SUCCESSFUL_SAMPLES ?= 0
432433
PERF_OPERATOR_IMAGE ?= example.com/openbao-operator:0.0.1
433434
PERF_CONFIG_INIT_IMAGE ?= openbao-init:dev
434435
PERF_BACKUP_EXECUTOR_IMAGE ?= openbao-backup:dev
@@ -692,6 +693,7 @@ perf-v2-capture: ## Capture v2 performance samples and update per-scenario distr
692693
--environment="$(PERF_ENVIRONMENT)" \
693694
--scenario-timeout="$(PERF_SCENARIO_TIMEOUT)" \
694695
--continue-on-sample-error="$(PERF_CONTINUE_ON_SAMPLE_ERROR)" \
696+
--minimum-successful-samples="$(PERF_MIN_SUCCESSFUL_SAMPLES)" \
695697
--operator-image="$(PERF_OPERATOR_IMAGE)" \
696698
--config-init-image="$(PERF_CONFIG_INIT_IMAGE)" \
697699
--backup-executor-image="$(PERF_BACKUP_EXECUTOR_IMAGE)" \

0 commit comments

Comments
 (0)