Skip to content

Commit f7fa00e

Browse files
committed
Restrict test retries to an allowlist of CI jobs
Add a job-level allowlist for the "once" retry strategy. Only jobs explicitly listed in retry_allowed_jobs.yaml are permitted to retry failed tests. All other CI jobs must pass on the first attempt. | Scenario | Retries? | |-----------------------------------|----------| | Job listed in the YAML allowlist | Yes | | Job NOT in the allowlist | No | | JOB_NAME not set (local/dev run) | Yes | | Empty allowlist (jobs: []) | No | The allowlist starts empty, meaning no CI job can retry today. To allow retries for a job, add its name to pkg/test/ginkgo/retry_allowed_jobs.yaml: jobs: - periodic-ci-openshift-release-master-ci-4.17-e2e-aws - periodic-ci-openshift-release-master-ci-4.17-e2e-gcp The file is embedded at compile time via go:embed, so changes require rebuilding openshift-tests.
1 parent 355916d commit f7fa00e

3 files changed

Lines changed: 163 additions & 1 deletion

File tree

pkg/test/ginkgo/retries.go

Lines changed: 45 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ package ginkgo
22

33
import (
44
"context"
5+
_ "embed"
56
"fmt"
67
"os"
78
"path/filepath"
@@ -11,8 +12,32 @@ import (
1112

1213
"github.com/openshift/origin/pkg/dataloader"
1314
"github.com/sirupsen/logrus"
15+
"sigs.k8s.io/yaml"
1416
)
1517

18+
//go:embed retry_allowed_jobs.yaml
19+
var retryAllowedJobsYAML []byte
20+
21+
// retryAllowedJobsConfig represents the YAML structure for the retry allowlist.
22+
type retryAllowedJobsConfig struct {
23+
Jobs []string `json:"jobs"`
24+
}
25+
26+
// loadRetryAllowedJobs parses the embedded YAML and returns the set of job names
27+
// that are permitted to retry failed tests.
28+
func loadRetryAllowedJobs() map[string]bool {
29+
var config retryAllowedJobsConfig
30+
if err := yaml.Unmarshal(retryAllowedJobsYAML, &config); err != nil {
31+
logrus.WithError(err).Error("Failed to parse retry_allowed_jobs.yaml, no jobs will be allowed to retry")
32+
return map[string]bool{}
33+
}
34+
allowed := make(map[string]bool, len(config.Jobs))
35+
for _, job := range config.Jobs {
36+
allowed[job] = true
37+
}
38+
return allowed
39+
}
40+
1641
const (
1742
defaultRetryStrategy = "once"
1843

@@ -61,11 +86,13 @@ type RetryStrategy interface {
6186

6287
type RetryOnceStrategy struct {
6388
PermittedRetryImageTags []string
89+
AllowedRetryJobs map[string]bool
6490
}
6591

6692
func NewRetryOnceStrategy() *RetryOnceStrategy {
6793
return &RetryOnceStrategy{
6894
PermittedRetryImageTags: []string{"tests"}, // tests = openshift-tests image
95+
AllowedRetryJobs: loadRetryAllowedJobs(),
6996
}
7097
}
7198

@@ -74,7 +101,24 @@ func (s *RetryOnceStrategy) Name() string {
74101
}
75102

76103
func (s *RetryOnceStrategy) ShouldAttemptRetries(failing []*testCase, suite *TestSuite) bool {
77-
return len(failing) > 0 && len(failing) <= suite.MaximumAllowedFlakes
104+
if len(failing) == 0 || len(failing) > suite.MaximumAllowedFlakes {
105+
return false
106+
}
107+
108+
jobName := os.Getenv("JOB_NAME")
109+
if jobName == "" {
110+
// When running outside of CI (no JOB_NAME set), allow retries.
111+
logrus.Info("JOB_NAME not set, allowing retries")
112+
return true
113+
}
114+
115+
if !s.AllowedRetryJobs[jobName] {
116+
logrus.Infof("Job %q is not in the retry allowlist, retries disabled", jobName)
117+
return false
118+
}
119+
120+
logrus.Infof("Job %q is in the retry allowlist, retries permitted", jobName)
121+
return true
78122
}
79123

80124
func (s *RetryOnceStrategy) GetMaxRetries(testCase *testCase) int {

pkg/test/ginkgo/retries_test.go

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,108 @@ import (
55
"time"
66
)
77

8+
func TestRetryOnceStrategy_ShouldAttemptRetries_Allowlist(t *testing.T) {
9+
suite := &TestSuite{MaximumAllowedFlakes: 15}
10+
failing := []*testCase{{name: "some-test", failed: true}}
11+
12+
tests := []struct {
13+
name string
14+
allowedJobs map[string]bool
15+
jobNameEnv string
16+
expected bool
17+
description string
18+
}{
19+
{
20+
name: "allowed_job_should_retry",
21+
allowedJobs: map[string]bool{"periodic-ci-openshift-e2e-aws": true},
22+
jobNameEnv: "periodic-ci-openshift-e2e-aws",
23+
expected: true,
24+
description: "Job in the allowlist should be permitted to retry",
25+
},
26+
{
27+
name: "disallowed_job_should_not_retry",
28+
allowedJobs: map[string]bool{"periodic-ci-openshift-e2e-aws": true},
29+
jobNameEnv: "periodic-ci-openshift-e2e-gcp",
30+
expected: false,
31+
description: "Job not in the allowlist should not be permitted to retry",
32+
},
33+
{
34+
name: "empty_allowlist_should_not_retry",
35+
allowedJobs: map[string]bool{},
36+
jobNameEnv: "periodic-ci-openshift-e2e-aws",
37+
expected: false,
38+
description: "Empty allowlist should block all jobs from retrying",
39+
},
40+
{
41+
name: "no_job_name_should_allow_retry",
42+
allowedJobs: map[string]bool{},
43+
jobNameEnv: "",
44+
expected: true,
45+
description: "When JOB_NAME is not set (local run), retries should be allowed",
46+
},
47+
{
48+
name: "no_failures_should_not_retry",
49+
allowedJobs: map[string]bool{"periodic-ci-openshift-e2e-aws": true},
50+
jobNameEnv: "periodic-ci-openshift-e2e-aws",
51+
expected: false,
52+
description: "No failing tests means no retries regardless of allowlist",
53+
},
54+
}
55+
56+
for _, tt := range tests {
57+
t.Run(tt.name, func(t *testing.T) {
58+
strategy := &RetryOnceStrategy{
59+
PermittedRetryImageTags: []string{"tests"},
60+
AllowedRetryJobs: tt.allowedJobs,
61+
}
62+
63+
t.Setenv("JOB_NAME", tt.jobNameEnv)
64+
65+
testFailing := failing
66+
if tt.name == "no_failures_should_not_retry" {
67+
testFailing = nil
68+
}
69+
70+
result := strategy.ShouldAttemptRetries(testFailing, suite)
71+
if result != tt.expected {
72+
t.Errorf("%s: expected %v, got %v", tt.description, tt.expected, result)
73+
}
74+
})
75+
}
76+
}
77+
78+
func TestRetryOnceStrategy_ShouldAttemptRetries_ExceedsMaxFlakes(t *testing.T) {
79+
suite := &TestSuite{MaximumAllowedFlakes: 2}
80+
81+
// 3 failures exceeds MaximumAllowedFlakes of 2
82+
failing := []*testCase{
83+
{name: "test-1", failed: true},
84+
{name: "test-2", failed: true},
85+
{name: "test-3", failed: true},
86+
}
87+
88+
strategy := &RetryOnceStrategy{
89+
PermittedRetryImageTags: []string{"tests"},
90+
AllowedRetryJobs: map[string]bool{"my-job": true},
91+
}
92+
93+
t.Setenv("JOB_NAME", "my-job")
94+
95+
result := strategy.ShouldAttemptRetries(failing, suite)
96+
if result != false {
97+
t.Errorf("Should not retry when failures exceed MaximumAllowedFlakes, got %v", result)
98+
}
99+
}
100+
101+
func TestLoadRetryAllowedJobs(t *testing.T) {
102+
// This tests that the embedded YAML file can be parsed without error.
103+
// The actual content depends on the YAML file, but parsing should not panic.
104+
jobs := loadRetryAllowedJobs()
105+
if jobs == nil {
106+
t.Error("loadRetryAllowedJobs returned nil, expected an initialized map")
107+
}
108+
}
109+
8110
// createAttempts creates a slice of test attempts with the specified number of successes and failures
9111
func createAttempts(successes, failures int) []*testCase {
10112
var attempts []*testCase
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
# Jobs that are allowed to retry failed tests once.
2+
# Any job NOT in this list must pass all tests on the first attempt.
3+
#
4+
# This list should be reviewed periodically and reduced over time.
5+
# To determine which jobs should remain here, query BigQuery:
6+
#
7+
# SELECT JobName, COUNT(*) AS total_retried_tests,
8+
# COUNTIF(FinalOutcome = 'flaky') AS passed_on_retry
9+
# FROM retry_statistics
10+
# WHERE _PARTITIONDATE >= DATE_SUB(CURRENT_DATE(), INTERVAL 30 DAY)
11+
# GROUP BY JobName
12+
# ORDER BY passed_on_retry DESC
13+
#
14+
# Jobs with passed_on_retry > 0 are benefiting from retries and should stay.
15+
# Jobs with passed_on_retry = 0 (or absent) should be removed from this list.
16+
jobs: []

0 commit comments

Comments
 (0)