Skip to content

Commit 0412516

Browse files
committed
Restrict test retries to an allowlist of test names
Replace the image-tag-based and hardcoded retry eligibility logic in shouldRetryTest() with a single YAML allowlist (retry_allowed_tests.yaml) that is the sole source of truth for which tests may retry on failure. Previously, any test from a permitted image tag could retry, and 6 specific tests had hardcoded exceptions. Now, a test must be explicitly listed in the YAML file to get a retry. Tests not on the list must pass on the first attempt. The allowlist is populated from BigQuery retry_statistics data: all 1395 tests that historically failed first but passed on retry are included. The 6 previously hardcoded exception tests (OCPBUGS-57477, OCPBUGS-57665, OCPBUGS-57658, OCPBUGS-61287, OCPBUGS-61734, OCPBUGS-66967) are covered by the BigQuery data and no longer need special treatment in Go code. The list should be reduced over time as flaky tests are fixed. To disable retries entirely, empty the YAML list.
1 parent 355916d commit 0412516

3 files changed

Lines changed: 1495 additions & 47 deletions

File tree

pkg/test/ginkgo/retries.go

Lines changed: 30 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,41 @@
11
package ginkgo
22

33
import (
4-
"context"
4+
_ "embed"
55
"fmt"
66
"os"
77
"path/filepath"
88
"strconv"
9-
"strings"
109
"time"
1110

1211
"github.com/openshift/origin/pkg/dataloader"
1312
"github.com/sirupsen/logrus"
13+
"sigs.k8s.io/yaml"
1414
)
1515

16+
//go:embed retry_allowed_tests.yaml
17+
var retryAllowedTestsYAML []byte
18+
19+
// retryAllowedTestsConfig represents the YAML structure for the retry allowlist.
20+
type retryAllowedTestsConfig struct {
21+
Tests []string `json:"tests"`
22+
}
23+
24+
// loadRetryAllowedTests parses the embedded YAML and returns the set of test names
25+
// that are permitted to retry on failure.
26+
func loadRetryAllowedTests() map[string]bool {
27+
var config retryAllowedTestsConfig
28+
if err := yaml.Unmarshal(retryAllowedTestsYAML, &config); err != nil {
29+
logrus.WithError(err).Error("Failed to parse retry_allowed_tests.yaml, no tests will be allowed to retry")
30+
return map[string]bool{}
31+
}
32+
allowed := make(map[string]bool, len(config.Tests))
33+
for _, test := range config.Tests {
34+
allowed[test] = true
35+
}
36+
return allowed
37+
}
38+
1639
const (
1740
defaultRetryStrategy = "once"
1841

@@ -60,12 +83,12 @@ type RetryStrategy interface {
6083
}
6184

6285
type RetryOnceStrategy struct {
63-
PermittedRetryImageTags []string
86+
AllowedRetryTests map[string]bool
6487
}
6588

6689
func NewRetryOnceStrategy() *RetryOnceStrategy {
6790
return &RetryOnceStrategy{
68-
PermittedRetryImageTags: []string{"tests"}, // tests = openshift-tests image
91+
AllowedRetryTests: loadRetryAllowedTests(),
6992
}
7093
}
7194

@@ -112,51 +135,11 @@ func (s *RetryOnceStrategy) DecideOutcome(attempts []*testCase) RetryOutcome {
112135
}
113136

114137
func (s *RetryOnceStrategy) shouldRetryTest(test *testCase) bool {
115-
// Internal tests (no binary) are eligible for retry, we shouldn't really have any of these
116-
// now that origin is also an extension.
117-
if test.binary == nil {
138+
if s.AllowedRetryTests[test.name] {
139+
logrus.WithField("test", test.name).Debug("Test is in the retry allowlist, permitting retry")
118140
return true
119141
}
120-
121-
tlog := logrus.WithField("test", test.name)
122-
123-
// Test retries were disabled for some suites when they moved to OTE. This exposed small numbers of tests that
124-
// were actually flaky and nobody knew. We attempted to fix these, a few did not make it in time. Restore
125-
// retries for specific test names so the overall suite can continue to not retry.
126-
retryTestNames := []string{
127-
"[sig-instrumentation] Metrics should grab all metrics from kubelet /metrics/resource endpoint [Suite:openshift/conformance/parallel] [Suite:k8s]", // https://issues.redhat.com/browse/OCPBUGS-57477
128-
"[sig-network] Services should be rejected for evicted pods (no endpoints exist) [Suite:openshift/conformance/parallel] [Suite:k8s]", // https://issues.redhat.com/browse/OCPBUGS-57665
129-
"[sig-node] Pods Extended Pod Container lifecycle evicted pods should be terminal [Suite:openshift/conformance/parallel] [Suite:k8s]", // https://issues.redhat.com/browse/OCPBUGS-57658
130-
"[sig-cli] Kubectl logs all pod logs the Deployment has 2 replicas and each pod has 2 containers should get logs from each pod and each container in Deployment [Suite:openshift/conformance/parallel] [Suite:k8s]", // https://issues.redhat.com/browse/OCPBUGS-61287
131-
"[sig-cli] Kubectl Port forwarding Shutdown client connection while the remote stream is writing data to the port-forward connection port-forward should keep working after detect broken connection [Suite:openshift/conformance/parallel] [Suite:k8s]", // https://issues.redhat.com/browse/OCPBUGS-61734
132-
"[sig-storage] OCP CSI Volumes [Driver: csi-hostpath-groupsnapshot] [OCPFeatureGate:VolumeGroupSnapshot] [Testpattern: (delete policy)] volumegroupsnapshottable [Feature:volumegroupsnapshot] VolumeGroupSnapshottable should create snapshots for multiple volumes in a pod", // https://issues.redhat.com/browse/OCPBUGS-66967
133-
}
134-
for _, rtn := range retryTestNames {
135-
if test.name == rtn {
136-
tlog.Debug("test has an exception allowing retry")
137-
return true
138-
}
139-
}
140-
141-
// Get extension info to check if it's from a permitted image
142-
info, err := test.binary.Info(context.Background())
143-
if err != nil {
144-
tlog.WithError(err).
145-
Debug("Failed to get binary info, skipping retry")
146-
return false
147-
}
148-
149-
// Check if the test's source image is in the permitted retry list
150-
for _, permittedTag := range s.PermittedRetryImageTags {
151-
if strings.Contains(info.Source.SourceImage, permittedTag) {
152-
tlog.WithField("image", info.Source.SourceImage).
153-
Debug("Permitting retry")
154-
return true
155-
}
156-
}
157-
158-
tlog.WithField("image", info.Source.SourceImage).
159-
Debug("Test not eligible for retry based on image tag")
142+
logrus.WithField("test", test.name).Debug("Test is not in the retry allowlist, retry not permitted")
160143
return false
161144
}
162145

pkg/test/ginkgo/retries_test.go

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,58 @@ import (
55
"time"
66
)
77

8+
func TestRetryOnceStrategy_ShouldRetryTest_Allowlist(t *testing.T) {
9+
tests := []struct {
10+
name string
11+
allowedTests map[string]bool
12+
testName string
13+
wantRetry bool
14+
}{
15+
{
16+
name: "test_on_allowlist_gets_retry",
17+
allowedTests: map[string]bool{"[sig-network] some flaky test": true},
18+
testName: "[sig-network] some flaky test",
19+
wantRetry: true,
20+
},
21+
{
22+
name: "test_not_on_allowlist_gets_no_retry",
23+
allowedTests: map[string]bool{"[sig-network] some flaky test": true},
24+
testName: "[sig-node] some other test",
25+
wantRetry: false,
26+
},
27+
{
28+
name: "empty_allowlist_blocks_all",
29+
allowedTests: map[string]bool{},
30+
testName: "[sig-network] some flaky test",
31+
wantRetry: false,
32+
},
33+
}
34+
35+
for _, tt := range tests {
36+
t.Run(tt.name, func(t *testing.T) {
37+
strategy := &RetryOnceStrategy{AllowedRetryTests: tt.allowedTests}
38+
tc := &testCase{name: tt.testName, failed: true}
39+
40+
if got := strategy.GetMaxRetries(tc); (got == 1) != tt.wantRetry {
41+
t.Errorf("GetMaxRetries() = %d, wantRetry = %v", got, tt.wantRetry)
42+
}
43+
44+
attempts := []*testCase{tc}
45+
if got := strategy.ShouldContinue(tc, attempts, 2); got != tt.wantRetry {
46+
t.Errorf("ShouldContinue() = %v, wantRetry = %v", got, tt.wantRetry)
47+
}
48+
})
49+
}
50+
}
51+
52+
func TestLoadRetryAllowedTests(t *testing.T) {
53+
// Verify the embedded YAML file can be parsed without error.
54+
tests := loadRetryAllowedTests()
55+
if tests == nil {
56+
t.Error("loadRetryAllowedTests returned nil, expected an initialized map")
57+
}
58+
}
59+
860
// createAttempts creates a slice of test attempts with the specified number of successes and failures
961
func createAttempts(successes, failures int) []*testCase {
1062
var attempts []*testCase

0 commit comments

Comments
 (0)