Skip to content

Commit 8c2f948

Browse files
tmshort and claude
committed
feat(experimental): run catalogd and operator-controller with 2 replicas
The experimental e2e suite uses a 2-node kind cluster, making it a natural fit to validate HA behaviour. Set replicas=2 for both components in helm/experimental.yaml so the experimental and experimental-e2e manifests exercise the multi-replica path end-to-end. This is safe for operator-controller (no leader-only HTTP servers) and for catalogd now that the catalog server starts on all pods via NeedLeaderElection=false, preventing the rolling-update deadlock that would arise if the server were leader-only. Also adds a @CatalogdHA experimental e2e scenario that force-deletes the catalogd leader pod and verifies that a new leader is elected and the catalog resumes serving. The scenario is gated on a 2-node cluster (detected in BeforeSuite and reflected in the featureGates map), so it is automatically skipped in the standard 1-node e2e suite. The experimental e2e timeout is bumped from 20m to 25m to accommodate leader re-election time (~163s worst case). Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> Signed-off-by: Todd Short <tshort@redhat.com>
1 parent a375d74 commit 8c2f948

8 files changed

Lines changed: 111 additions & 5 deletions

File tree

Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -316,7 +316,7 @@ test-experimental-e2e: COVERAGE_NAME := experimental-e2e
316316
test-experimental-e2e: export MANIFEST := $(EXPERIMENTAL_RELEASE_MANIFEST)
317317
test-experimental-e2e: export INSTALL_DEFAULT_CATALOGS := false
318318
test-experimental-e2e: PROMETHEUS_VALUES := helm/prom_experimental.yaml
319-
test-experimental-e2e: E2E_TIMEOUT := 20m
319+
test-experimental-e2e: E2E_TIMEOUT := 25m
320320
test-experimental-e2e: run-internal prometheus e2e e2e-coverage kind-clean #HELP Run experimental e2e test suite on local kind cluster
321321

322322
.PHONY: prometheus

helm/experimental.yaml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@
77
# to pull in resources or additions
88
options:
99
operatorController:
10+
deployment:
11+
replicas: 2
1012
features:
1113
enabled:
1214
- SingleOwnNamespaceInstallSupport
@@ -20,6 +22,8 @@ options:
2022
# Use with {{- if has "FeatureGate" .Values.options.catalogd.features.enabled }}
2123
# to pull in resources or additions
2224
catalogd:
25+
deployment:
26+
replicas: 2
2327
features:
2428
enabled:
2529
- APIV1MetasHandler

manifests/experimental-e2e.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2621,7 +2621,7 @@ metadata:
26212621
namespace: olmv1-system
26222622
spec:
26232623
minReadySeconds: 5
2624-
replicas: 1
2624+
replicas: 2
26252625
strategy:
26262626
type: RollingUpdate
26272627
rollingUpdate:
@@ -2772,7 +2772,7 @@ metadata:
27722772
name: operator-controller-controller-manager
27732773
namespace: olmv1-system
27742774
spec:
2775-
replicas: 1
2775+
replicas: 2
27762776
strategy:
27772777
type: RollingUpdate
27782778
rollingUpdate:

manifests/experimental.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2541,7 +2541,7 @@ metadata:
25412541
namespace: olmv1-system
25422542
spec:
25432543
minReadySeconds: 5
2544-
replicas: 1
2544+
replicas: 2
25452545
strategy:
25462546
type: RollingUpdate
25472547
rollingUpdate:
@@ -2679,7 +2679,7 @@ metadata:
26792679
name: operator-controller-controller-manager
26802680
namespace: olmv1-system
26812681
spec:
2682-
replicas: 1
2682+
replicas: 2
26832683
strategy:
26842684
type: RollingUpdate
26852685
rollingUpdate:

test/e2e/features/ha.feature

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
Feature: HA failover for catalogd
2+
3+
When catalogd is deployed with multiple replicas, the remaining pods must
4+
elect a new leader and resume serving catalogs if the leader's node is lost.
5+
6+
Background:
7+
Given OLM is available
8+
And an image registry is available
9+
10+
@CatalogdHA
11+
Scenario: Catalogd resumes serving catalogs after leader node failure
12+
Given a catalog "test" with packages:
13+
| package | version | channel | replaces | contents |
14+
| test | 1.0.0 | stable | | CRD, Deployment, ConfigMap |
15+
And catalogd is ready to reconcile resources
16+
And catalog "test" is reconciled
17+
When the catalogd leader pod is force-deleted
18+
Then a new catalogd leader is elected
19+
And catalog "test" reports Serving as True with Reason Available

test/e2e/steps/ha_steps.go

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
package steps
2+
3+
import (
4+
"context"
5+
"fmt"
6+
"os"
7+
"strings"
8+
9+
"k8s.io/component-base/featuregate"
10+
)
11+
12+
// catalogdHAFeature gates scenarios that require a multi-node cluster.
13+
// It is set to true in BeforeSuite when the cluster has at least 2 nodes,
14+
// which is the case for the experimental e2e suite (kind-config-2node.yaml)
15+
// but not the standard suite.
16+
const catalogdHAFeature featuregate.Feature = "CatalogdHA"
17+
18+
// containerRuntime returns the container runtime from CONTAINER_RUNTIME, defaulting to "docker".
19+
func containerRuntime() string {
20+
if rt := os.Getenv("CONTAINER_RUNTIME"); rt != "" {
21+
return rt
22+
}
23+
return "docker"
24+
}
25+
26+
// CatalogdLeaderPodIsForceDeleted force-deletes the catalogd leader pod to simulate leader loss.
27+
// The pod is identified from sc.leaderPods["catalogd"] (populated by a prior
28+
// "catalogd is ready to reconcile resources" step). Force-deletion is equivalent to
29+
// an abrupt process crash: the lease is no longer renewed and the surviving pod
30+
// acquires leadership after the lease expires.
31+
//
32+
// Note: stopping the kind node container is not used here because both nodes in the
33+
// experimental 2-node cluster are control-plane nodes that run etcd — stopping either
34+
// would break etcd quorum and make the API server unreachable for the rest of the test.
35+
func CatalogdLeaderPodIsForceDeleted(ctx context.Context) error {
36+
sc := scenarioCtx(ctx)
37+
leaderPod := sc.leaderPods["catalogd"]
38+
if leaderPod == "" {
39+
return fmt.Errorf("catalogd leader pod not found in scenario context; run 'catalogd is ready to reconcile resources' first")
40+
}
41+
42+
logger.Info("Force-deleting catalogd leader pod", "pod", leaderPod)
43+
if _, err := k8sClient("delete", "pod", leaderPod, "-n", olmNamespace,
44+
"--force", "--grace-period=0"); err != nil {
45+
return fmt.Errorf("failed to force-delete catalogd leader pod %q: %w", leaderPod, err)
46+
}
47+
return nil
48+
}
49+
50+
// NewCatalogdLeaderIsElected polls the catalogd leader election lease until the holder
51+
// identity changes to a pod other than the deleted leader. It updates
52+
// sc.leaderPods["catalogd"] with the new leader pod name.
53+
func NewCatalogdLeaderIsElected(ctx context.Context) error {
54+
sc := scenarioCtx(ctx)
55+
oldLeader := sc.leaderPods["catalogd"]
56+
57+
waitFor(ctx, func() bool {
58+
holder, err := k8sClient("get", "lease", leaseNames["catalogd"], "-n", olmNamespace,
59+
"-o", "jsonpath={.spec.holderIdentity}")
60+
if err != nil || holder == "" {
61+
return false
62+
}
63+
newPod := strings.Split(strings.TrimSpace(holder), "_")[0]
64+
if newPod == oldLeader {
65+
return false
66+
}
67+
sc.leaderPods["catalogd"] = newPod
68+
logger.Info("New catalogd leader elected", "pod", newPod)
69+
return true
70+
})
71+
return nil
72+
}

test/e2e/steps/hooks.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ import (
88
"os/exec"
99
"regexp"
1010
"strconv"
11+
"strings"
1112
"sync"
1213

1314
"github.com/cucumber/godog"
@@ -90,6 +91,7 @@ var (
9091
features.HelmChartSupport: false,
9192
features.BoxcutterRuntime: false,
9293
features.DeploymentConfig: false,
94+
catalogdHAFeature: false,
9395
}
9496
logger logr.Logger
9597
)
@@ -152,6 +154,12 @@ func BeforeSuite() {
152154
}
153155
}
154156
}
157+
// Enable HA scenarios when the cluster has at least 2 nodes.
158+
if out, err := k8sClient("get", "nodes", "--no-headers", "-o", "name"); err == nil &&
159+
len(strings.Fields(strings.TrimSpace(out))) >= 2 {
160+
featureGates[catalogdHAFeature] = true
161+
}
162+
155163
logger.Info(fmt.Sprintf("Enabled feature gates: %v", featureGates))
156164
}
157165

test/e2e/steps/steps.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -194,6 +194,9 @@ func RegisterSteps(sc *godog.ScenarioContext) {
194194
sc.Step(`^(?i)the "([^"]+)" component is configured with HTTPS_PROXY "([^"]+)"$`, ConfigureDeploymentWithHTTPSProxy)
195195
sc.Step(`^(?i)the "([^"]+)" component is configured with HTTPS_PROXY pointing to a recording proxy$`, StartRecordingProxyAndConfigureDeployment)
196196
sc.Step(`^(?i)the recording proxy received a CONNECT request for the catalogd service$`, RecordingProxyReceivedCONNECTForCatalogd)
197+
198+
sc.Step(`^(?i)the catalogd leader pod is force-deleted$`, CatalogdLeaderPodIsForceDeleted)
199+
sc.Step(`^(?i)a new catalogd leader is elected$`, NewCatalogdLeaderIsElected)
197200
}
198201

199202
func init() {

0 commit comments

Comments
 (0)