Skip to content

Commit c5b337a

Browse files
ciaranRoche and claude committed
HYPERFLEET-1062 - fix: remove transient state assertions from E2E tests
E2E tests were asserting on transient intermediate states (Reconciled=False, Applied=False) that could already be over before the first poll, causing flaky tier0-nightly failures. These assertions were replaced with final-state-only validation, per distributed systems testing best practices. Also reduced the cl-job sleep duration from 15s to 1s.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent d3a9552 commit c5b337a

5 files changed

Lines changed: 52 additions & 234 deletions

File tree

e2e/cluster/creation.go

Lines changed: 36 additions & 138 deletions
Original file line number | Diff line number | Diff line change
@@ -2,7 +2,6 @@ package cluster
22

33
import (
44
"context"
5-
"time"
65

76
"github.com/onsi/ginkgo/v2"
87
. "github.com/onsi/gomega" //nolint:staticcheck // dot import for test readability
@@ -35,28 +34,10 @@ var _ = ginkgo.Describe("[Suite: cluster][baseline] Cluster Resource Type Lifecy
3534

3635
ginkgo.Describe("Basic Workflow Validation", ginkgo.Label(labels.Tier0), func() {
3736
// This test validates the end-to-end cluster lifecycle workflow:
38-
// 1. Cluster creation via API with initial condition validation
39-
// 2. Required adapter execution with comprehensive metadata validation
40-
// 3. Final cluster state verification (Reconciled and Available conditions)
37+
// 1. Required adapter execution with comprehensive metadata validation
38+
// 2. Final cluster state verification (Reconciled and Available conditions)
4139
ginkgo.It("should validate complete workflow from creation to Reconciled state",
4240
func(ctx context.Context) {
43-
ginkgo.By("Verify initial status of cluster")
44-
// Verify initial conditions are False, indicating workflow has not completed yet
45-
// This ensures the cluster starts in the correct initial state
46-
cluster, err := h.Client.GetCluster(ctx, clusterID)
47-
Expect(err).NotTo(HaveOccurred(), "failed to get cluster")
48-
Expect(cluster.Status).NotTo(BeNil(), "cluster status should be present")
49-
50-
hasReconciledFalse := h.HasResourceCondition(cluster.Status.Conditions,
51-
client.ConditionTypeReconciled, openapi.ResourceConditionStatusFalse)
52-
Expect(hasReconciledFalse).To(BeTrue(),
53-
"initial cluster conditions should have Reconciled=False")
54-
55-
hasAvailableFalse := h.HasResourceCondition(cluster.Status.Conditions,
56-
client.ConditionTypeAvailable, openapi.ResourceConditionStatusFalse)
57-
Expect(hasAvailableFalse).To(BeTrue(),
58-
"initial cluster conditions should have Available=False")
59-
6041
ginkgo.By("Verify required adapter execution results")
6142
// Validate required adapters from config have completed successfully
6243
// If an adapter fails, we can identify which specific adapter failed
@@ -238,137 +219,54 @@ var _ = ginkgo.Describe("[Suite: cluster][baseline] Cluster Resource Type Lifecy
238219
})
239220

240221
ginkgo.Describe("Adapter Dependency Relationships Workflow Validation", ginkgo.Label(labels.Tier0), func() {
241-
// This test validates adapter dependency relationships:
242-
// 1. During cl-job execution: cl-deployment Applied=False and Available=Unknown (never False)
243-
// 2. After cl-job completes: cl-deployment can proceed (no validation on Available during execution)
244-
// 3. Eventually: cl-deployment Available becomes True (success)
222+
// This test validates that dependent adapters complete successfully.
223+
// cl-deployment depends on cl-job — the workflow engine enforces this ordering.
224+
// We validate the final converged state: both adapters reach Applied=True, Available=True, Health=True.
245225
ginkgo.It("should validate cl-deployment dependency on cl-job with comprehensive condition checks",
246226
func(ctx context.Context) {
247-
pollingInterval := "1s"
248-
249-
ginkgo.By("Verify cl-deployment initial state and dependency waiting behavior")
250-
// Capture cl-deployment's initial waiting state
251-
// Poll until cl-deployment appears in the statuses
252-
var foundInitialState bool
227+
ginkgo.By("Verify cl-job and cl-deployment both reach final converged state")
253228
Eventually(func(g Gomega) {
254-
foundInitialState = false
255229
statuses, err := h.Client.GetClusterStatuses(ctx, clusterID)
256230
g.Expect(err).NotTo(HaveOccurred(), "failed to get cluster statuses")
257231

258-
// Find cl-deployment adapter
232+
adapterMap := make(map[string]openapi.AdapterStatus)
259233
for _, adapter := range statuses.Items {
260-
if adapter.Adapter == "cl-deployment" {
261-
foundInitialState = true
262-
263-
// Verify initial waiting state
264-
hasAppliedFalse := h.HasAdapterCondition(
265-
adapter.Conditions,
266-
client.ConditionTypeApplied,
267-
openapi.AdapterConditionStatusFalse,
268-
)
269-
g.Expect(hasAppliedFalse).To(BeTrue(),
270-
"cl-deployment Applied condition should be False initially (waiting for cl-job)")
271-
272-
hasAvailableUnknown := h.HasAdapterCondition(
273-
adapter.Conditions,
274-
client.ConditionTypeAvailable,
275-
openapi.AdapterConditionStatusUnknown,
276-
)
277-
g.Expect(hasAvailableUnknown).To(BeTrue(),
278-
"cl-deployment Available condition should be Unknown initially (waiting for cl-job)")
279-
280-
hasHealthTrue := h.HasAdapterCondition(
281-
adapter.Conditions,
282-
client.ConditionTypeHealth,
283-
openapi.AdapterConditionStatusTrue,
284-
)
285-
g.Expect(hasHealthTrue).To(BeTrue(),
286-
"cl-deployment Health condition should be True (adapter is healthy, just waiting)")
287-
288-
return
289-
}
234+
adapterMap[adapter.Adapter] = adapter
290235
}
291-
g.Expect(foundInitialState).To(BeTrue(), "cl-deployment adapter should appear in statuses")
292-
}, h.Cfg.Timeouts.Adapter.Processing, pollingInterval).Should(Succeed())
293-
294-
ginkgo.By("Verify dependency: cl-deployment Applied=False and Available=Unknown during cl-job execution")
295-
// Poll continuously until cl-deployment Available becomes True:
296-
// - Before cl-job Available=True: verify cl-deployment Applied=False and Available!=False
297-
// - After cl-job Available=True: only wait for cl-deployment Available=True
298-
// - Exit when cl-deployment Available=True
299-
timeout := time.After(h.Cfg.Timeouts.Adapter.Processing)
300-
ticker := time.NewTicker(1 * time.Second)
301-
defer ticker.Stop()
302-
303-
var jobAvailableReachedTrue bool
304-
305-
pollLoop:
306-
for {
307-
select {
308-
case <-timeout:
309-
ginkgo.Fail("Timed out waiting for cl-deployment Available condition to become True")
310-
case <-ticker.C:
311-
statuses, err := h.Client.GetClusterStatuses(ctx, clusterID)
312-
Expect(err).NotTo(HaveOccurred(), "failed to get cluster statuses")
313-
314-
var jobAvailableTrue bool
315-
var deploymentAppliedTrue bool
316-
var deploymentAvailableTrue bool
317-
var deploymentAvailableFalse bool
318-
319-
for _, adapter := range statuses.Items {
320-
if adapter.Adapter == "cl-job" {
321-
jobAvailableTrue = h.HasAdapterCondition(
322-
adapter.Conditions,
323-
client.ConditionTypeAvailable,
324-
openapi.AdapterConditionStatusTrue,
325-
)
326-
}
327-
if adapter.Adapter == "cl-deployment" {
328-
deploymentAppliedTrue = h.HasAdapterCondition(
329-
adapter.Conditions,
330-
client.ConditionTypeApplied,
331-
openapi.AdapterConditionStatusTrue,
332-
)
333-
deploymentAvailableTrue = h.HasAdapterCondition(
334-
adapter.Conditions,
335-
client.ConditionTypeAvailable,
336-
openapi.AdapterConditionStatusTrue,
337-
)
338-
deploymentAvailableFalse = h.HasAdapterCondition(
339-
adapter.Conditions,
340-
client.ConditionTypeAvailable,
341-
openapi.AdapterConditionStatusFalse,
342-
)
343-
}
344-
}
345236

346-
// Track when cl-job Available first becomes True
347-
if jobAvailableTrue && !jobAvailableReachedTrue {
348-
jobAvailableReachedTrue = true
349-
ginkgo.GinkgoWriter.Printf("cl-job Available=True reached, cl-deployment can now proceed\n")
350-
}
237+
for _, name := range []string{"cl-job", "cl-deployment"} {
238+
adapter, exists := adapterMap[name]
239+
g.Expect(exists).To(BeTrue(), "adapter %s should be present in statuses", name)
351240

352-
// Validate dependency enforcement: only check while cl-job is still executing
353-
if !jobAvailableReachedTrue {
354-
// cl-deployment should not start applying resources until cl-job completes
355-
Expect(deploymentAppliedTrue).To(BeFalse(),
356-
"cl-deployment Applied should remain False while cl-job Available is not True yet")
241+
g.Expect(adapter.ObservedGeneration).To(Equal(int32(1)),
242+
"adapter %s should have observed_generation=1 for new creation request", name)
357243

358-
// cl-deployment Available should stay Unknown (not False) while waiting for cl-job
359-
Expect(deploymentAvailableFalse).To(BeFalse(),
360-
"cl-deployment Available must be Unknown (not False) during cl-job execution")
361-
}
244+
g.Expect(h.HasAdapterCondition(adapter.Conditions,
245+
client.ConditionTypeApplied, openapi.AdapterConditionStatusTrue)).To(BeTrue(),
246+
"adapter %s should have Applied=True", name)
247+
248+
g.Expect(h.HasAdapterCondition(adapter.Conditions,
249+
client.ConditionTypeAvailable, openapi.AdapterConditionStatusTrue)).To(BeTrue(),
250+
"adapter %s should have Available=True", name)
362251

363-
// Exit when cl-deployment Available becomes True (workflow complete)
364-
if deploymentAvailableTrue {
365-
ginkgo.GinkgoWriter.Printf("cl-deployment Available=True reached, dependency validation successful\n")
366-
break pollLoop
252+
g.Expect(h.HasAdapterCondition(adapter.Conditions,
253+
client.ConditionTypeHealth, openapi.AdapterConditionStatusTrue)).To(BeTrue(),
254+
"adapter %s should have Health=True", name)
255+
256+
for _, condition := range adapter.Conditions {
257+
g.Expect(condition.Reason).NotTo(BeNil(),
258+
"adapter %s condition %s should have non-nil reason", name, condition.Type)
259+
g.Expect(*condition.Reason).NotTo(BeEmpty(),
260+
"adapter %s condition %s should have non-empty reason", name, condition.Type)
261+
g.Expect(condition.Message).NotTo(BeNil(),
262+
"adapter %s condition %s should have non-nil message", name, condition.Type)
263+
g.Expect(*condition.Message).NotTo(BeEmpty(),
264+
"adapter %s condition %s should have non-empty message", name, condition.Type)
265+
g.Expect(condition.LastTransitionTime).NotTo(BeZero(),
266+
"adapter %s condition %s should have valid last_transition_time", name, condition.Type)
367267
}
368268
}
369-
}
370-
371-
ginkgo.GinkgoWriter.Printf("Successfully validated cl-deployment dependency on cl-job with correct condition transitions\n")
269+
}, h.Cfg.Timeouts.Adapter.Processing, h.Cfg.Polling.Interval).Should(Succeed())
372270
})
373271
})
374272

e2e/nodepool/creation.go

Lines changed: 2 additions & 30 deletions
Original file line number | Diff line number | Diff line change
@@ -3,7 +3,6 @@ package nodepool
33
import (
44
"context"
55
"fmt"
6-
"time"
76

87
"github.com/onsi/ginkgo/v2"
98
. "github.com/onsi/gomega" //nolint:staticcheck // dot import for test readability
@@ -43,37 +42,10 @@ var _ = ginkgo.Describe("[Suite: nodepool][baseline] NodePool Resource Type Life
4342

4443
ginkgo.Describe("Basic Workflow Validation", ginkgo.Label(labels.Tier0), func() {
4544
// This test validates the end-to-end nodepool lifecycle workflow:
46-
// 1. Initial condition validation (Reconciled=False, Available=False)
47-
// 2. Required adapter execution with comprehensive metadata validation
48-
// 3. Final nodepool state verification (Reconciled and Available conditions)
45+
// 1. Required adapter execution with comprehensive metadata validation
46+
// 2. Final nodepool state verification (Reconciled and Available conditions)
4947
ginkgo.It("should validate complete workflow from creation to Reconciled state",
5048
func(ctx context.Context) {
51-
var err error
52-
53-
ginkgo.By("Verify initial status of nodepool")
54-
// Verify initial conditions are False, indicating workflow has not completed yet
55-
// This ensures the nodepool starts in the correct initial state
56-
// Use Eventually to handle race conditions where conditions might not be populated yet
57-
initStatusPollInterval := time.Second
58-
initCheckTimeout := 3 * time.Second
59-
Eventually(func(g Gomega) {
60-
61-
np, err := h.Client.GetNodePool(ctx, clusterID, nodepoolID)
62-
g.Expect(err).NotTo(HaveOccurred(), "failed to get nodepool")
63-
g.Expect(np.Status).NotTo(BeNil(), "nodepool status should be present")
64-
g.Expect(np.Status.Conditions).NotTo(BeEmpty(), "conditions should be populated")
65-
66-
hasReconciledFalse := h.HasResourceCondition(np.Status.Conditions,
67-
client.ConditionTypeReconciled, openapi.ResourceConditionStatusFalse)
68-
g.Expect(hasReconciledFalse).To(BeTrue(),
69-
"initial nodepool conditions should have Reconciled=False")
70-
71-
hasAvailableFalse := h.HasResourceCondition(np.Status.Conditions,
72-
client.ConditionTypeAvailable, openapi.ResourceConditionStatusFalse)
73-
g.Expect(hasAvailableFalse).To(BeTrue(),
74-
"initial nodepool conditions should have Available=False")
75-
}, initCheckTimeout, initStatusPollInterval).Should(Succeed())
76-
7749
ginkgo.By("Verify required adapter execution results")
7850
// Validate required adapters from config have completed successfully
7951
// If an adapter fails, we can identify which specific adapter failed

test-design/testcases/cluster.md

Lines changed: 10 additions & 50 deletions
Original file line number | Diff line number | Diff line change
@@ -58,20 +58,8 @@ curl -X POST ${API_URL}/api/hyperfleet/v1/clusters \
5858

5959
**Expected Result:**
6060
- Response includes the created cluster ID and initial metadata
61-
- Initial cluster conditions have `status: False` for both condition `{"type": "Reconciled"}` and `{"type": "Available"}`
6261

63-
#### Step 2: Verify initial status of cluster
64-
**Action:**
65-
- Poll cluster status for initial response
66-
```bash
67-
curl -X GET ${API_URL}/api/hyperfleet/v1/clusters/{cluster_id}
68-
```
69-
70-
**Expected Result:**
71-
- Cluster `Reconciled` condition `status: False`
72-
- Cluster `Available` condition `status: False`
73-
74-
#### Step 3: Verify required adapter execution results
62+
#### Step 2: Verify required adapter execution results
7563

7664
**Action:**
7765
- Retrieve adapter statuses information:
@@ -100,7 +88,7 @@ curl -X GET ${API_URL}/api/hyperfleet/v1/clusters/{cluster_id}/statuses
10088
- Config file: `configs/config.yaml` under `adapters.cluster`
10189
- Environment variable: `HYPERFLEET_ADAPTERS_CLUSTER` (comma-separated list)
10290

103-
#### Step 4: Verify final cluster state
91+
#### Step 3: Verify final cluster state
10492

10593
**Action:**
10694
- Wait for cluster Reconciled condition to transition to True
@@ -110,12 +98,11 @@ curl -X GET ${API_URL}/api/hyperfleet/v1/clusters/{cluster_id}
11098
```
11199

112100
**Expected Result:**
113-
- Cluster `Reconciled` condition transitions from `status: False` to `status: True`
114101
- Final cluster conditions have `status: True` for both condition `{"type": "Reconciled"}` and `{"type": "Available"}`
115102
- Validate that the observedGeneration for the Reconciled and Available conditions is 1 for a new creation request
116103
- This confirms the cluster has reached the desired end state
117104

118-
#### Step 5: Cleanup resources
105+
#### Step 4: Cleanup resources
119106

120107
**Action:**
121108
- Delete the cluster via the API:
@@ -252,7 +239,7 @@ kubectl delete namespace {cluster_id} --ignore-not-found
252239

253240
### Description
254241

255-
This test validates that CLM correctly handles adapter dependency relationships when processing a clusters resource request. Specifically, it verifies the dependency relationship where the cl-deployment adapter depends on the cl-job adapter completion. The test continuously polls and validates throughout the workflow period to ensure: (1) cl-deployment's Applied condition remains False until cl-job's Available condition reaches True, enforcing the dependency precondition; (2) during cl-job execution, cl-deployment's Available condition stays Unknown (never False), confirming the adapter waits correctly without attempting execution; (3) successful completion with cl-deployment's Available eventually transitioning to True. This validation demonstrates that the workflow engine properly enforces adapter dependencies and ensures dependent adapters wait for prerequisites before executing.
242+
This test validates that CLM correctly handles adapter dependency relationships when processing a clusters resource request. Specifically, it verifies the dependency relationship where the cl-deployment adapter depends on the cl-job adapter completion. The test validates the final converged state: both cl-job and cl-deployment reach Applied=True, Available=True, and Health=True. Dependency ordering is enforced by the workflow engine and validated at the unit/integration test level; the E2E test confirms the end-to-end outcome.
256243

257244
---
258245

@@ -291,47 +278,20 @@ curl -X POST ${API_URL}/api/hyperfleet/v1/clusters \
291278
**Expected Result:**
292279
- API returns successful response
293280

294-
#### Step 2: Verify cl-deployment initial state and dependency waiting behavior
295-
296-
**Action:**
297-
- Poll adapter statuses to capture cl-deployment's initial waiting state:
298-
```bash
299-
curl -X GET ${API_URL}/api/hyperfleet/v1/clusters/{cluster_id}/statuses
300-
```
301-
302-
**Expected Result:**
303-
At the initial state (when cl-deployment first appears in statuses):
304-
- Response returns HTTP 200 (OK) status code
305-
- The `cl-deployment` adapter is present with initial waiting state:
306-
- `Applied` condition has `status: "False"` (deployment hasn't been applied yet, waiting for cl-job dependency)
307-
- `Available` condition has `status: "Unknown"` (deployment hasn't been applied yet)
308-
- `Health` condition has `status: "True"` (adapter itself is healthy, just waiting)
309-
310-
#### Step 3: Verify dependency relationship and condition transitions throughout entire workflow
281+
#### Step 2: Verify cl-job and cl-deployment both reach final state
311282

312283
**Action:**
313-
- Continuously poll adapter statuses from the initial state until cl-deployment completes:
284+
- Poll adapter statuses until both adapters reach their final state:
314285
```bash
315286
curl -X GET ${API_URL}/api/hyperfleet/v1/clusters/{cluster_id}/statuses
316287
```
317288

318289
**Expected Result:**
319-
Throughout the entire period (from initial state until cl-deployment completes), validate the following on each poll:
320-
321-
**Validation 1 - Dependency enforcement (during cl-job execution):**
322-
- While `cl-job` adapter's `Available` condition has NOT reached `status: "True"`:
323-
- The `cl-deployment` adapter's `Applied` condition must remain `status: "False"`
324-
- The `cl-deployment` adapter's `Available` condition must remain `status: "Unknown"` (never `status: "False"`)
325-
- This validates that cl-deployment waits for cl-job to complete without attempting to apply resources
326-
327-
**Validation 2 - Success condition:**
328-
- Once `cl-job` adapter's `Available` reaches `status: "True"`, cl-deployment can proceed with execution
329-
- Once `cl-deployment` completes execution, its `Available` condition eventually becomes `status: "True"`
330-
- This confirms the complete dependency workflow succeeded
290+
- Both `cl-job` and `cl-deployment` adapters are present in the statuses response
291+
- Each adapter has all three conditions with `status: "True"`: `Applied`, `Available`, `Health`
292+
- This confirms the dependency workflow completed successfully end-to-end
331293

332-
**Note:** After cl-job completes, cl-deployment's `Available` condition may temporarily be `False` (e.g., `MinimumReplicasUnavailable` during deployment startup) before becoming `True`, which is expected behavior and not validated.
333-
334-
#### Step 4: Cleanup resources
294+
#### Step 3: Cleanup resources
335295

336296
**Action:**
337297
- Delete the cluster via the API:

0 commit comments

Comments
 (0)