|
1 | 1 | package ginkgo |
2 | 2 |
|
3 | 3 | import ( |
4 | | - "context" |
| 4 | + _ "embed" |
5 | 5 | "fmt" |
6 | 6 | "os" |
7 | 7 | "path/filepath" |
8 | 8 | "strconv" |
9 | | - "strings" |
10 | 9 | "time" |
11 | 10 |
|
12 | 11 | "github.com/openshift/origin/pkg/dataloader" |
13 | 12 | "github.com/sirupsen/logrus" |
| 13 | + "sigs.k8s.io/yaml" |
14 | 14 | ) |
15 | 15 |
|
| 16 | +//go:embed retry_allowed_tests.yaml |
| 17 | +var retryAllowedTestsYAML []byte |
| 18 | + |
// retryAllowedTestsConfig mirrors the top-level layout of the retry allowlist
// document: a single "tests" key holding a list of test names.
//
// The field is tagged with `json` rather than `yaml` because the document is
// decoded with sigs.k8s.io/yaml, which converts YAML to JSON and then relies on
// the JSON struct tags.
type retryAllowedTestsConfig struct {
	// Tests lists the names of the tests that may be retried.
	Tests []string `json:"tests"`
}
| 23 | + |
| 24 | +// loadRetryAllowedTests parses the embedded YAML and returns the set of test names |
| 25 | +// that are permitted to retry on failure. |
| 26 | +func loadRetryAllowedTests() map[string]bool { |
| 27 | + var config retryAllowedTestsConfig |
| 28 | + if err := yaml.Unmarshal(retryAllowedTestsYAML, &config); err != nil { |
| 29 | + logrus.WithError(err).Error("Failed to parse retry_allowed_tests.yaml, no tests will be allowed to retry") |
| 30 | + return map[string]bool{} |
| 31 | + } |
| 32 | + allowed := make(map[string]bool, len(config.Tests)) |
| 33 | + for _, test := range config.Tests { |
| 34 | + allowed[test] = true |
| 35 | + } |
| 36 | + return allowed |
| 37 | +} |
| 38 | + |
16 | 39 | const ( |
17 | 40 | defaultRetryStrategy = "once" |
18 | 41 |
|
@@ -60,12 +83,12 @@ type RetryStrategy interface { |
60 | 83 | } |
61 | 84 |
|
// RetryOnceStrategy is a retry strategy whose retry eligibility is decided by
// an allowlist of test names.
type RetryOnceStrategy struct {
	// AllowedRetryTests is the set of test names permitted to retry. A test is
	// eligible only when its exact name maps to true; a nil map permits nothing.
	AllowedRetryTests map[string]bool
}
65 | 88 |
|
66 | 89 | func NewRetryOnceStrategy() *RetryOnceStrategy { |
67 | 90 | return &RetryOnceStrategy{ |
68 | | - PermittedRetryImageTags: []string{"tests"}, // tests = openshift-tests image |
| 91 | + AllowedRetryTests: loadRetryAllowedTests(), |
69 | 92 | } |
70 | 93 | } |
71 | 94 |
|
@@ -112,51 +135,11 @@ func (s *RetryOnceStrategy) DecideOutcome(attempts []*testCase) RetryOutcome { |
112 | 135 | } |
113 | 136 |
|
114 | 137 | func (s *RetryOnceStrategy) shouldRetryTest(test *testCase) bool { |
115 | | - // Internal tests (no binary) are eligible for retry, we shouldn't really have any of these |
116 | | - // now that origin is also an extension. |
117 | | - if test.binary == nil { |
| 138 | + if s.AllowedRetryTests[test.name] { |
| 139 | + logrus.WithField("test", test.name).Debug("Test is in the retry allowlist, permitting retry") |
118 | 140 | return true |
119 | 141 | } |
120 | | - |
121 | | - tlog := logrus.WithField("test", test.name) |
122 | | - |
123 | | - // Test retries were disabled for some suites when they moved to OTE. This exposed small numbers of tests that |
124 | | - // were actually flaky and nobody knew. We attempted to fix these, a few did not make it in time. Restore |
125 | | - // retries for specific test names so the overall suite can continue to not retry. |
126 | | - retryTestNames := []string{ |
127 | | - "[sig-instrumentation] Metrics should grab all metrics from kubelet /metrics/resource endpoint [Suite:openshift/conformance/parallel] [Suite:k8s]", // https://issues.redhat.com/browse/OCPBUGS-57477 |
128 | | - "[sig-network] Services should be rejected for evicted pods (no endpoints exist) [Suite:openshift/conformance/parallel] [Suite:k8s]", // https://issues.redhat.com/browse/OCPBUGS-57665 |
129 | | - "[sig-node] Pods Extended Pod Container lifecycle evicted pods should be terminal [Suite:openshift/conformance/parallel] [Suite:k8s]", // https://issues.redhat.com/browse/OCPBUGS-57658 |
130 | | - "[sig-cli] Kubectl logs all pod logs the Deployment has 2 replicas and each pod has 2 containers should get logs from each pod and each container in Deployment [Suite:openshift/conformance/parallel] [Suite:k8s]", // https://issues.redhat.com/browse/OCPBUGS-61287 |
131 | | - "[sig-cli] Kubectl Port forwarding Shutdown client connection while the remote stream is writing data to the port-forward connection port-forward should keep working after detect broken connection [Suite:openshift/conformance/parallel] [Suite:k8s]", // https://issues.redhat.com/browse/OCPBUGS-61734 |
132 | | - "[sig-storage] OCP CSI Volumes [Driver: csi-hostpath-groupsnapshot] [OCPFeatureGate:VolumeGroupSnapshot] [Testpattern: (delete policy)] volumegroupsnapshottable [Feature:volumegroupsnapshot] VolumeGroupSnapshottable should create snapshots for multiple volumes in a pod", // https://issues.redhat.com/browse/OCPBUGS-66967 |
133 | | - } |
134 | | - for _, rtn := range retryTestNames { |
135 | | - if test.name == rtn { |
136 | | - tlog.Debug("test has an exception allowing retry") |
137 | | - return true |
138 | | - } |
139 | | - } |
140 | | - |
141 | | - // Get extension info to check if it's from a permitted image |
142 | | - info, err := test.binary.Info(context.Background()) |
143 | | - if err != nil { |
144 | | - tlog.WithError(err). |
145 | | - Debug("Failed to get binary info, skipping retry") |
146 | | - return false |
147 | | - } |
148 | | - |
149 | | - // Check if the test's source image is in the permitted retry list |
150 | | - for _, permittedTag := range s.PermittedRetryImageTags { |
151 | | - if strings.Contains(info.Source.SourceImage, permittedTag) { |
152 | | - tlog.WithField("image", info.Source.SourceImage). |
153 | | - Debug("Permitting retry") |
154 | | - return true |
155 | | - } |
156 | | - } |
157 | | - |
158 | | - tlog.WithField("image", info.Source.SourceImage). |
159 | | - Debug("Test not eligible for retry based on image tag") |
| 142 | + logrus.WithField("test", test.name).Debug("Test is not in the retry allowlist, retry not permitted") |
160 | 143 | return false |
161 | 144 | } |
162 | 145 |
|
|
0 commit comments