Skip to content

Commit e0a9ee8

Browse files
authored
Merge pull request #577 from cockroachdb/nishanth/operator-advanced-tests
[CRDB-53973] e2e/operator: advanced e2e tests for bare-metal infrastructure(kind).
2 parents 15efecd + 6ab0067 commit e0a9ee8

8 files changed

Lines changed: 1841 additions & 61 deletions

Makefile

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -148,6 +148,13 @@ test/single-cluster/up: bin/k3d
148148
test/multi-cluster/down: bin/k3d
149149
./tests/k3d/dev-multi-cluster.sh down
150150

151+
# Runs TestOperatorInSingleRegion from ./tests/e2e/operator/singleRegion/... with
# PROVIDER=kind and TEST_ADVANCED_FEATURES=true, a 90m go test timeout, and
# $(PWD)/bin prepended to PATH. On failure it echoes the exit code and exits 1.
test/nightly-e2e/advanced/single-region: bin/cockroach bin/kubectl bin/helm build/self-signer bin/kind
152+
@PATH="$(PWD)/bin:${PATH}" PROVIDER=kind TEST_ADVANCED_FEATURES=true go test -timeout 90m -v -test.run TestOperatorInSingleRegion ./tests/e2e/operator/singleRegion/... || (echo "Advanced single-region tests failed with exit code $$?" && exit 1)
153+
154+
# Runs TestOperatorInMultiRegion from ./tests/e2e/operator/multiRegion/... with
# PROVIDER=kind and TEST_ADVANCED_FEATURES=true, a 90m go test timeout, and
# $(PWD)/bin prepended to PATH. On failure it echoes the exit code and exits 1.
test/nightly-e2e/advanced/multi-region: bin/cockroach bin/kubectl bin/helm build/self-signer bin/kind
155+
@PATH="$(PWD)/bin:${PATH}" PROVIDER=kind TEST_ADVANCED_FEATURES=true go test -timeout 90m -v -test.run TestOperatorInMultiRegion ./tests/e2e/operator/multiRegion/... || (echo "Advanced multi-region tests failed with exit code $$?" && exit 1)
156+
157+
151158
test/lint: bin/helm ## lint the helm chart
152159
@build/lint.sh && \
153160
bin/helm lint cockroachdb && \

tests/e2e/migrate/helm_chart_to_cockroach_enterprise_operator_test.go

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ import (
1010

1111
api "github.com/cockroachdb/cockroach-operator/apis/v1alpha1"
1212
"github.com/cockroachdb/helm-charts/tests/e2e/operator"
13+
"github.com/cockroachdb/helm-charts/tests/e2e/operator/infra"
1314
"github.com/cockroachdb/helm-charts/tests/testutil"
1415
"github.com/cockroachdb/helm-charts/tests/testutil/migration"
1516
"github.com/gruntwork-io/terratest/modules/helm"
@@ -59,6 +60,21 @@ func init() {
5960
}
6061
}
6162

63+
// providerCloudRegion returns the cloud region for the active provider.
64+
func providerCloudRegion() string {
65+
p := strings.TrimSpace(strings.ToLower(os.Getenv("PROVIDER")))
66+
switch p {
67+
case "k3d", "": // default to k3d when PROVIDER is unset
68+
return infra.RegionCodes[infra.ProviderK3D][0]
69+
case "kind":
70+
return infra.RegionCodes[infra.ProviderKind][0]
71+
case "gcp":
72+
return infra.RegionCodes[infra.ProviderGCP][0]
73+
default:
74+
return ""
75+
}
76+
}
77+
6278
func TestHelmChartToOperatorMigration(t *testing.T) {
6379
h := newHelmChartToOperator()
6480
t.Run("helm chart to cockroach enterprise operator migration", h.TestDefaultMigration)
@@ -145,7 +161,7 @@ func (h *HelmChartToOperator) TestDefaultMigration(t *testing.T) {
145161
k8s.RunKubectl(t, kubectlOptions, "delete", "priorityclass", "crdb-critical")
146162
}()
147163

148-
operator.InstallCockroachDBEnterpriseOperator(t, kubectlOptions)
164+
operator.InstallCockroachDBEnterpriseOperator(t, kubectlOptions, providerCloudRegion())
149165
defer func() {
150166
t.Log("Uninstall the cockroachdb enterprise operator")
151167
operator.UninstallCockroachDBEnterpriseOperator(t, kubectlOptions)
@@ -263,7 +279,7 @@ func (h *HelmChartToOperator) TestCertManagerMigration(t *testing.T) {
263279
k8s.RunKubectl(t, kubectlOptions, "delete", "priorityclass", "crdb-critical")
264280
}()
265281

266-
operator.InstallCockroachDBEnterpriseOperator(t, kubectlOptions)
282+
operator.InstallCockroachDBEnterpriseOperator(t, kubectlOptions, providerCloudRegion())
267283
defer func() {
268284
t.Log("Uninstall the cockroachdb enterprise operator")
269285
operator.UninstallCockroachDBEnterpriseOperator(t, kubectlOptions)
@@ -381,7 +397,7 @@ func (h *HelmChartToOperator) TestDedicatedLogsPVCMigration(t *testing.T) {
381397
k8s.RunKubectl(t, kubectlOptions, "delete", "priorityclass", "crdb-critical")
382398
}()
383399

384-
operator.InstallCockroachDBEnterpriseOperator(t, kubectlOptions)
400+
operator.InstallCockroachDBEnterpriseOperator(t, kubectlOptions, providerCloudRegion())
385401
defer func() {
386402
t.Log("Uninstall the cockroachdb enterprise operator")
387403
operator.UninstallCockroachDBEnterpriseOperator(t, kubectlOptions)
@@ -476,7 +492,7 @@ func (h *HelmChartToOperator) TestPCRPrimaryMigration(t *testing.T) {
476492
k8s.RunKubectl(t, kubectlOptions, "delete", "priorityclass", "crdb-critical")
477493
}()
478494

479-
operator.InstallCockroachDBEnterpriseOperator(t, kubectlOptions)
495+
operator.InstallCockroachDBEnterpriseOperator(t, kubectlOptions, providerCloudRegion())
480496
defer func() {
481497
t.Log("Uninstall the cockroachdb enterprise operator")
482498
operator.UninstallCockroachDBEnterpriseOperator(t, kubectlOptions)

tests/e2e/migrate/public_operator_to_cockroach_enterprise_operator_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -156,7 +156,7 @@ func (o *PublicOperatorToCockroachEnterpriseOperator) TestDefaultMigration(t *te
156156
k8s.KubectlApply(t, kubectlOptions, filepath.Join(manifestsDirPath, "rbac.yaml"))
157157

158158
t.Log("Install the cockroachdb enterprise operator")
159-
operator.InstallCockroachDBEnterpriseOperator(t, kubectlOptions)
159+
operator.InstallCockroachDBEnterpriseOperator(t, kubectlOptions, providerCloudRegion())
160160
defer func() {
161161
t.Log("Uninstall the cockroachdb enterprise operator")
162162
operator.UninstallCockroachDBEnterpriseOperator(t, kubectlOptions)
Lines changed: 250 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,250 @@
1+
package multiRegion
2+
3+
import (
4+
"fmt"
5+
"strings"
6+
"testing"
7+
8+
"github.com/cockroachdb/helm-charts/tests/e2e/operator"
9+
"github.com/gruntwork-io/terratest/modules/k8s"
10+
"github.com/gruntwork-io/terratest/modules/random"
11+
"github.com/stretchr/testify/require"
12+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
13+
)
14+
15+
// TestWALFailoverMultiRegion tests WAL failover with different paths in each region
16+
// Region 0: WAL failover enabled with custom path
17+
// Region 1: WAL failover disabled
18+
func (r *multiRegion) TestWALFailoverMultiRegion(t *testing.T) {
19+
// Setup namespaces and CA for each region
20+
cleanup := r.SetupMultiClusterWithCA(t)
21+
defer cleanup()
22+
23+
// Region 0: Install with WAL failover enabled
24+
cluster0 := r.Clusters[0]
25+
walPath0 := "/cockroach/wal-region-0"
26+
27+
t.Logf("Installing region 0 (%s) with WAL failover enabled at path %s", cluster0, walPath0)
28+
config0 := operator.AdvancedInstallConfig{
29+
WALFailoverEnabled: true,
30+
WALFailoverSize: "5Gi",
31+
CustomValues: map[string]string{
32+
"cockroachdb.crdbCluster.walFailoverSpec.path": walPath0,
33+
},
34+
}
35+
r.InstallChartsWithAdvancedConfig(t, cluster0, 0, config0)
36+
37+
// Region 1: Install without WAL failover
38+
cluster1 := r.Clusters[1]
39+
t.Logf("Installing region 1 (%s) without WAL failover", cluster1)
40+
config1 := operator.AdvancedInstallConfig{}
41+
r.InstallChartsWithAdvancedConfig(t, cluster1, 1, config1)
42+
43+
// Validate CockroachDB cluster health in both regions
44+
for _, cluster := range r.Clusters {
45+
r.ValidateCRDB(t, cluster)
46+
}
47+
48+
// Validate multi-region setup
49+
r.ValidateMultiRegionSetup(t)
50+
51+
// Validate WAL failover in region 0
52+
t.Log("Validating WAL failover in region 0")
53+
r.ValidateWALFailover(t, cluster0, &operator.AdvancedValidationConfig{
54+
WALFailover: operator.WALFailoverValidation{
55+
CustomPath: walPath0,
56+
},
57+
})
58+
59+
// Validate NO WAL failover in region 1
60+
t.Log("Validating NO WAL failover in region 1")
61+
kubeConfig, _ := r.GetCurrentContext(t)
62+
kubectlOptions1 := k8s.NewKubectlOptions(cluster1, kubeConfig, r.Namespace[cluster1])
63+
64+
pods := k8s.ListPods(t, kubectlOptions1, metav1.ListOptions{
65+
LabelSelector: operator.LabelSelector,
66+
})
67+
require.True(t, len(pods) > 0, "No CockroachDB pods found in region 1")
68+
69+
podCommand, err := k8s.RunKubectlAndGetOutputE(t, kubectlOptions1,
70+
"get", "pod", pods[0].Name, "-o", "jsonpath={.spec.containers[?(@.name=='cockroachdb')].command}")
71+
require.NoError(t, err)
72+
require.NotContains(t, podCommand, "--wal-failover", "Region 1 should not have WAL failover enabled")
73+
t.Log("Confirmed region 1 does not have WAL failover")
74+
75+
t.Logf("WAL failover multi-region test completed successfully")
76+
}
77+
78+
// TestEncryptionAtRestMultiRegion tests encryption at rest with different secrets per region
79+
// Region 0: Encryption enabled with secret "cmek-key-secret-region-0"
80+
// Region 1: Encryption disabled (no encryption)
81+
func (r *multiRegion) TestEncryptionAtRestMultiRegion(t *testing.T) {
82+
// Setup namespaces and CA for each region
83+
cleanup := r.SetupMultiClusterWithCA(t)
84+
defer cleanup()
85+
86+
// Generate encryption key for region 0
87+
encryptionKeyB64 := r.GenerateEncryptionKey(t)
88+
t.Logf("Generated encryption key for region 0 (base64 length: %d)", len(encryptionKeyB64))
89+
90+
// Region 0: Install with encryption at rest enabled
91+
cluster0 := r.Clusters[0]
92+
secretName0 := "cmek-key-secret-region-0"
93+
94+
encryptionRegions0 := []map[string]interface{}{
95+
{
96+
"code": r.RegionCodes[0],
97+
"cloudProvider": r.Provider,
98+
"nodes": r.NodeCount,
99+
"namespace": r.Namespace[cluster0],
100+
"domain": operator.CustomDomains[0],
101+
"encryptionAtRest": map[string]interface{}{
102+
"platform": "UNKNOWN_KEY_TYPE",
103+
"keySecretName": secretName0,
104+
},
105+
},
106+
}
107+
108+
t.Logf("Installing region 0 (%s) with encryption at rest enabled", cluster0)
109+
config0 := operator.AdvancedInstallConfig{
110+
EncryptionEnabled: true,
111+
EncryptionKeySecret: encryptionKeyB64,
112+
EncryptionKeySecretName: secretName0,
113+
CustomRegions: encryptionRegions0,
114+
}
115+
r.InstallChartsWithAdvancedConfig(t, cluster0, 0, config0)
116+
117+
// Region 1: Install without encryption
118+
cluster1 := r.Clusters[1]
119+
t.Logf("Installing region 1 (%s) without encryption at rest", cluster1)
120+
config1 := operator.AdvancedInstallConfig{}
121+
r.InstallChartsWithAdvancedConfig(t, cluster1, 1, config1)
122+
123+
// Validate CockroachDB cluster health in both regions
124+
for _, cluster := range r.Clusters {
125+
r.ValidateCRDB(t, cluster)
126+
}
127+
128+
// Validate multi-region setup
129+
r.ValidateMultiRegionSetup(t)
130+
131+
// Validate encryption in region 0
132+
t.Log("Validating encryption at rest in region 0")
133+
r.ValidateEncryptionAtRest(t, cluster0, &operator.AdvancedValidationConfig{
134+
EncryptionAtRest: operator.EncryptionAtRestValidation{
135+
SecretName: secretName0,
136+
},
137+
})
138+
139+
// Validate NO encryption in region 1
140+
t.Log("Validating NO encryption at rest in region 1")
141+
kubeConfig, _ := r.GetCurrentContext(t)
142+
kubectlOptions1 := k8s.NewKubectlOptions(cluster1, kubeConfig, r.Namespace[cluster1])
143+
144+
pods := k8s.ListPods(t, kubectlOptions1, metav1.ListOptions{
145+
LabelSelector: operator.LabelSelector,
146+
})
147+
require.True(t, len(pods) > 0, "No CockroachDB pods found in region 1")
148+
149+
podCommand, err := k8s.RunKubectlAndGetOutputE(t, kubectlOptions1,
150+
"get", "pod", pods[0].Name, "-o", "jsonpath={.spec.containers[?(@.name=='cockroachdb')].command}")
151+
require.NoError(t, err)
152+
require.NotContains(t, podCommand, "--enterprise-encryption", "Region 1 should not have encryption enabled")
153+
t.Log("Confirmed region 1 does not have encryption at rest")
154+
155+
t.Logf("Encryption at rest multi-region test completed successfully")
156+
}
157+
158+
// TestPCRMultiRegion tests Physical Cluster Replication with multi-region setup
159+
// Creates a multi-region primary cluster, then creates a standby cluster and tests failover/failback
160+
func (r *multiRegion) TestPCRMultiRegion(t *testing.T) {
161+
// Creating random namespace for primary multi-region cluster
162+
for _, cluster := range r.Clusters {
163+
r.Namespace[cluster] = fmt.Sprintf("%s-primary-%s", operator.Namespace, strings.ToLower(random.UniqueId()))
164+
}
165+
166+
// Create CA certificate once for all clusters
167+
cleanupCA := r.RequireCACertificate(t)
168+
defer cleanupCA()
169+
170+
var standbyNamespace string
171+
172+
// Capture primary namespaces now. During standby installation r.Namespace[Clusters[0]]
173+
// is temporarily overwritten with the standby namespace. If the test fails before
174+
// restoring it, CleanupResources would clean up the wrong namespace and leak the
175+
// primary namespace. Restoring here ensures correct cleanup regardless of failure point.
176+
primaryNS := make(map[string]string)
177+
for _, cluster := range r.Clusters {
178+
primaryNS[cluster] = r.Namespace[cluster]
179+
}
180+
defer func() {
181+
for cluster, ns := range primaryNS {
182+
r.Namespace[cluster] = ns
183+
}
184+
r.CleanupResources(t)
185+
}()
186+
defer func() {
187+
if standbyNamespace != "" {
188+
kubeConfig, _ := r.GetCurrentContext(t)
189+
// Standby is always installed on Clusters[0]; use its context explicitly.
190+
kubectlOptions := k8s.NewKubectlOptions(r.Clusters[0], kubeConfig, standbyNamespace)
191+
if err := k8s.DeleteNamespaceE(t, kubectlOptions, standbyNamespace); err != nil {
192+
t.Logf("Warning: failed to delete standby namespace %s (cluster may be unreachable): %v", standbyNamespace, err)
193+
}
194+
}
195+
}()
196+
197+
// Step 1: Install primary multi-region cluster
198+
t.Log("Installing primary multi-region cluster")
199+
for i, cluster := range r.Clusters {
200+
primaryConfig := operator.AdvancedInstallConfig{
201+
VirtualClusterMode: "primary",
202+
}
203+
r.InstallChartsWithAdvancedConfig(t, cluster, i, primaryConfig)
204+
}
205+
206+
// Validate primary cluster health in all regions
207+
for _, cluster := range r.Clusters {
208+
r.ValidateCRDB(t, cluster)
209+
}
210+
211+
// Validate multi-region setup
212+
r.ValidateMultiRegionSetup(t)
213+
t.Log("Primary multi-region cluster is healthy")
214+
215+
// Step 2: Install standby cluster (single region for simplicity)
216+
t.Log("Installing standby cluster")
217+
standbyCluster := r.Clusters[0] // Use first cluster for standby
218+
standbyNamespace = fmt.Sprintf("%s-standby-%s", operator.Namespace, strings.ToLower(random.UniqueId()))
219+
220+
// Temporarily update namespace for standby installation
221+
originalNamespace := r.Namespace[standbyCluster]
222+
r.Namespace[standbyCluster] = standbyNamespace
223+
224+
standbyConfig := operator.AdvancedInstallConfig{
225+
VirtualClusterMode: "standby",
226+
SkipOperatorInstall: true, // Operator already installed
227+
}
228+
r.InstallChartsWithAdvancedConfig(t, standbyCluster, 0, standbyConfig)
229+
230+
// Validate standby cluster
231+
r.VirtualClusterModeStandby = true
232+
r.ValidateCRDB(t, standbyCluster)
233+
r.VirtualClusterModeStandby = false
234+
t.Log("Standby cluster is healthy")
235+
236+
// Step 3: Set up replication and test failover/failback
237+
t.Log("Testing PCR failover and failback")
238+
r.ValidatePCR(t, &operator.AdvancedValidationConfig{
239+
PCR: operator.PCRValidation{
240+
Cluster: standbyCluster,
241+
PrimaryNamespace: originalNamespace,
242+
StandbyNamespace: standbyNamespace,
243+
},
244+
})
245+
246+
// Restore original namespace
247+
r.Namespace[standbyCluster] = originalNamespace
248+
249+
t.Logf("PCR multi-region test completed successfully")
250+
}

tests/e2e/operator/multiRegion/cockroachdb_multi_region_e2e_test.go

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -83,14 +83,21 @@ func TestOperatorInMultiRegion(t *testing.T) {
8383
// Set up infrastructure for this provider once.
8484
cloudProvider.SetUpInfra(t)
8585

86-
testCases := map[string]func(*testing.T){
87-
"TestHelmInstall": providerRegion.TestHelmInstall,
88-
"TestHelmUpgrade": providerRegion.TestHelmUpgrade,
89-
"TestClusterRollingRestart": providerRegion.TestClusterRollingRestart,
90-
"TestKillingCockroachNode": providerRegion.TestKillingCockroachNode,
91-
"TestClusterScaleUp": func(t *testing.T) { providerRegion.TestClusterScaleUp(t, cloudProvider) },
86+
// Build test cases based on TEST_ADVANCED_FEATURES environment variable
87+
testCases := make(map[string]func(*testing.T))
88+
89+
// Run only advanced test cases when TEST_ADVANCED_FEATURES is enabled
90+
if os.Getenv("TEST_ADVANCED_FEATURES") == "true" {
91+
testCases["TestWALFailoverMultiRegion"] = providerRegion.TestWALFailoverMultiRegion
92+
testCases["TestEncryptionAtRestMultiRegion"] = providerRegion.TestEncryptionAtRestMultiRegion
93+
testCases["TestPCRMultiRegion"] = providerRegion.TestPCRMultiRegion
94+
} else {
95+
testCases["TestHelmInstall"] = providerRegion.TestHelmInstall
96+
testCases["TestHelmUpgrade"] = providerRegion.TestHelmUpgrade
97+
testCases["TestClusterRollingRestart"] = providerRegion.TestClusterRollingRestart
98+
testCases["TestKillingCockroachNode"] = providerRegion.TestKillingCockroachNode
99+
testCases["TestClusterScaleUp"] = func(t *testing.T) { providerRegion.TestClusterScaleUp(t, cloudProvider) }
92100
}
93-
94101
// Run tests sequentially within a provider.
95102
var testFailed bool
96103
for name, method := range testCases {

0 commit comments

Comments
 (0)