Skip to content

Commit dfef36d

Browse files
committed
e2e/operator: Fix GCP single-region and multi-region test infrastructure
1 parent e0a9ee8 commit dfef36d

4 files changed

Lines changed: 128 additions & 14 deletions

File tree

tests/e2e/operator/infra/common.go

Lines changed: 10 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -302,8 +302,17 @@ func UpdateKubeconfigGCP(t *testing.T, projectID, region, clusterName, alias str
302302
return fmt.Errorf("failed to get GCP credentials for cluster %s: %w", clusterName, err)
303303
}
304304

305-
// Step 2: Rename context
305+
// Step 2: Disable TLS verification for this cluster entry to avoid x509 errors from GKE's internal CA (insecure; acceptable only for e2e test clusters).
306306
longContextName := fmt.Sprintf("gke_%s_%s_%s", projectID, region, clusterName)
307+
skipTLSCmd := exec.Command("kubectl", "config", "set-cluster", longContextName,
308+
"--insecure-skip-tls-verify=true")
309+
output, err = skipTLSCmd.CombinedOutput()
310+
if err != nil {
311+
t.Logf("kubectl set-cluster insecure-skip-tls-verify command failed. Output:\n%s\n", string(output))
312+
return fmt.Errorf("failed to set insecure-skip-tls-verify for cluster %s: %w", clusterName, err)
313+
}
314+
315+
// Step 3: Rename context
307316
renameCmd := exec.Command("kubectl", "config", "rename-context", longContextName, alias)
308317
output, err = renameCmd.CombinedOutput()
309318
if err != nil {

tests/e2e/operator/infra/gcp.go

Lines changed: 80 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -2,6 +2,7 @@ package infra
22

33
import (
44
"context"
5+
"encoding/json"
56
"errors"
67
"fmt"
78
"net/http"
@@ -14,6 +15,7 @@ import (
1415

1516
"github.com/gruntwork-io/terratest/modules/k8s"
1617
"github.com/gruntwork-io/terratest/modules/random"
18+
"github.com/gruntwork-io/terratest/modules/retry"
1719
"github.com/stretchr/testify/require"
1820
"google.golang.org/api/compute/v1"
1921
"google.golang.org/api/container/v1"
@@ -44,6 +46,12 @@ const (
4446
// Default project ID to use if not specified in the environment.
4547
const defaultProjectID = "helm-testing"
4648

49+
// getNodeServiceAccount returns the GKE node service account from the environment.
50+
// If empty, gcloud falls back to the project's default Compute Engine service account.
51+
func getNodeServiceAccount() string {
52+
return os.Getenv("GCP_NODE_SERVICE_ACCOUNT")
53+
}
54+
4755
// getProjectID returns the GCP project ID from the environment variable or falls back to default.
4856
func getProjectID() string {
4957
if projectID := os.Getenv("GCP_PROJECT_ID"); projectID != "" {
@@ -227,6 +235,72 @@ func (r *GcpRegion) SetUpInfra(t *testing.T) {
227235
require.NoError(t, err)
228236
err = r.deployAndConfigureCoreDNS(t, kubeConfigPath)
229237
require.NoError(t, err, "failed to deploy and configure CoreDNS")
238+
239+
if r.IsMultiRegion {
240+
for _, clusterName := range r.Clusters {
241+
err = patchKubeDNSForCustomDomains(t, clusterName, kubeConfigPath)
242+
require.NoError(t, err, "failed to patch kube-dns for custom domains on cluster %s", clusterName)
243+
}
244+
}
245+
}
246+
247+
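// patchKubeDNSForCustomDomains patches the kube-dns ConfigMap in kube-system so that every domain in operator.CustomDomains has a stubDomains entry pointing at the kube-dns-upstream ClusterIP,
248+
// then force-deletes the node-local-dns pods and waits for the daemonset rollout so the new config is picked up. A failed force-delete is only logged; every other failure is returned as an error.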
249+
func patchKubeDNSForCustomDomains(t *testing.T, clusterName, kubeConfigPath string) error {
250+
kubectlOpts := k8s.NewKubectlOptions(clusterName, kubeConfigPath, "kube-system")
251+
252+
clusterIP, err := k8s.RunKubectlAndGetOutputE(t, kubectlOpts,
253+
"get", "service", "kube-dns-upstream", "-o", "jsonpath={.spec.clusterIP}")
254+
if err != nil {
255+
return fmt.Errorf("failed to get kube-dns-upstream ClusterIP on cluster %s: %w", clusterName, err)
256+
}
257+
clusterIP = strings.TrimSpace(clusterIP)
258+
if clusterIP == "" {
259+
return fmt.Errorf("kube-dns-upstream service has no ClusterIP on cluster %s", clusterName)
260+
}
261+
t.Logf("[gcp] kube-dns-upstream ClusterIP on cluster %s: %s", clusterName, clusterIP)
262+
263+
stubDomains := make(map[string][]string)
264+
for _, domain := range operator.CustomDomains {
265+
stubDomains[domain] = []string{clusterIP}
266+
}
267+
stubDomainsJSON, err := json.Marshal(stubDomains)
268+
if err != nil {
269+
return fmt.Errorf("failed to marshal stubDomains: %w", err)
270+
}
271+
272+
type configMapPatch struct {
273+
Data map[string]string `json:"data"`
274+
}
275+
patchJSON, err := json.Marshal(configMapPatch{Data: map[string]string{
276+
"stubDomains": string(stubDomainsJSON),
277+
}})
278+
if err != nil {
279+
return fmt.Errorf("failed to marshal ConfigMap patch: %w", err)
280+
}
281+
282+
if err := k8s.RunKubectlE(t, kubectlOpts, "patch", "configmap", "kube-dns",
283+
"--type=merge", "-p", string(patchJSON)); err != nil {
284+
return fmt.Errorf("failed to patch kube-dns ConfigMap on cluster %s: %w", clusterName, err)
285+
}
286+
t.Logf("[gcp] Patched kube-dns ConfigMap with stubDomains on cluster %s", clusterName)
287+
288+
if err := k8s.RunKubectlE(t, kubectlOpts, "delete", "pods",
289+
"-l", "k8s-app=node-local-dns", "--grace-period=0", "--force"); err != nil {
290+
t.Logf("[gcp] Warning: force-delete of node-local-dns pods failed (may be harmless): %v", err)
291+
}
292+
293+
_, err = retry.DoWithRetryE(t, "wait for node-local-dns rollout", defaultRetries, defaultRetryInterval,
294+
func() (string, error) {
295+
return k8s.RunKubectlAndGetOutputE(t, kubectlOpts,
296+
"rollout", "status", "daemonset/node-local-dns", "--timeout=10s")
297+
})
298+
if err != nil {
299+
return fmt.Errorf("node-local-dns did not become ready after patching on cluster %s: %w", clusterName, err)
300+
}
301+
302+
t.Logf("[gcp] node-local-dns ready with stub domains for custom cluster domains on cluster %s", clusterName)
303+
return nil
230304
}
231305

232306
// TeardownInfra deletes all GCP resources created by SetUpInfra.
@@ -627,16 +701,20 @@ func createGKERegionalCluster(ctx context.Context, client *container.Service, se
627701
"--tags", strings.Join([]string{defaultNodeTag}, ","), // Join tags if there are multiple
628702
"--enable-master-authorized-networks",
629703
"--master-authorized-networks", strings.Join([]string{"0.0.0.0/0"}, ","),
630-
"--num-nodes", fmt.Sprint(defaultNodesPerZone),
704+
"--num-nodes", fmt.Sprint(defaultNodesPerZone+1), // 2 nodes/zone avoids autoscaling during TestClusterScaleUp
631705
"--min-nodes", fmt.Sprint(defaultNodesPerZone),
632-
"--max-nodes", fmt.Sprint(defaultNodesPerZone + 1), // Needed for scaling cluster
706+
"--max-nodes", fmt.Sprint(defaultNodesPerZone + 2), // headroom above initial count
633707
"--enable-autoscaling", // Enable autoscaling
634708
"--autoprovisioning-network-tags", strings.Join([]string{autoprovisioningNodeTag}, ","),
635709
"--machine-type", gcpDefaultMachineType,
636710
"--disk-size", "30GB", // Limit disk size to 30GB
637711
"--quiet", // Suppress interactive prompts
638712
}
639713

714+
if sa := getNodeServiceAccount(); sa != "" {
715+
args = append(args, "--service-account", sa)
716+
}
717+
640718
cmd := exec.Command("gcloud", args...)
641719

642720
// Stream gcloud's stdout/stderr directly for real-time visibility into the long-running creation process.

tests/e2e/operator/region.go

Lines changed: 34 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -34,6 +34,7 @@ const (
3434
testOperatorRepo = "cockroachdb-operator"
3535
testInitContainerRepo = "init-container"
3636
testInotifywaitRepo = "inotifywait"
37+
DefaultClusterDomain = "cluster.local"
3738
)
3839

3940
var (
@@ -136,9 +137,14 @@ func (r *Region) InstallCharts(t *testing.T, cluster string, index int) {
136137
InstallCockroachDBEnterpriseOperator(t, kubectlOptions, r.RegionCodes[index])
137138
}
138139

140+
clusterDomain := DefaultClusterDomain
141+
if r.IsMultiRegion {
142+
clusterDomain = CustomDomains[index]
143+
}
144+
139145
if r.IsCertManager {
140146
crdbOp = PatchHelmValues(map[string]string{
141-
"cockroachdb.clusterDomain": CustomDomains[index],
147+
"cockroachdb.clusterDomain": clusterDomain,
142148
"cockroachdb.tls.enabled": "true",
143149
"cockroachdb.tls.selfSigner.enabled": "false",
144150
"cockroachdb.tls.certManager.enabled": "true",
@@ -147,7 +153,7 @@ func (r *Region) InstallCharts(t *testing.T, cluster string, index int) {
147153
})
148154
} else {
149155
crdbOp = PatchHelmValues(map[string]string{
150-
"cockroachdb.clusterDomain": CustomDomains[index],
156+
"cockroachdb.clusterDomain": clusterDomain,
151157
"cockroachdb.tls.enabled": "true",
152158
"cockroachdb.tls.selfSigner.caProvided": "true",
153159
"cockroachdb.tls.selfSigner.caSecret": customCASecret,
@@ -165,6 +171,11 @@ func (r *Region) InstallCharts(t *testing.T, cluster string, index int) {
165171

166172
helm.Install(t, crdbOptions, helmChartPath, ReleaseName)
167173

174+
k8s.WaitUntilServiceAvailable(t, kubectlOptions, "cockroachdb-join", 30, 5*time.Second)
175+
err := k8s.RunKubectlE(t, kubectlOptions, "patch", "service", "cockroachdb-join",
176+
"--type=merge", "-p", `{"spec":{"publishNotReadyAddresses":true}}`)
177+
require.NoError(t, err)
178+
168179
serviceName := "cockroachdb-public"
169180
k8s.WaitUntilServiceAvailable(t, kubectlOptions, serviceName, 30, 5*time.Second)
170181
}
@@ -357,6 +368,8 @@ func (r *Region) ValidateCRDBContainerResources(t *testing.T, kubectlOptions *k8
357368

358369
// CreateCACertificate creates CA cert and key at the same path.
359370
func (r *Region) CreateCACertificate(t *testing.T) error {
371+
r.CleanUpCACertificate(t)
372+
360373
// Create CA secret in all regions.
361374
cmd := shell.Command{
362375
Command: "cockroach",
@@ -487,10 +500,14 @@ func (r *Region) createOperatorRegions(index int, nodes int, customDomains map[i
487500
"nodes": nodes,
488501
"namespace": r.Namespace[r.Clusters[i]],
489502
}
490-
if len(r.Clusters) > i && r.Clusters[i] != "" {
491-
if domain, ok := customDomains[i]; ok {
492-
region["domain"] = domain
503+
if r.IsMultiRegion {
504+
if len(r.Clusters) > i && r.Clusters[i] != "" {
505+
if domain, ok := customDomains[i]; ok {
506+
region["domain"] = domain
507+
}
493508
}
509+
} else {
510+
region["domain"] = DefaultClusterDomain
494511
}
495512
return region
496513
}
@@ -786,8 +803,12 @@ func (r *Region) BaseRegionConfig(cluster string, index int) map[string]interfac
786803
"nodes": r.NodeCount,
787804
"namespace": r.Namespace[cluster],
788805
}
789-
if domain, ok := CustomDomains[index]; ok {
790-
region["domain"] = domain
806+
if r.IsMultiRegion {
807+
if domain, ok := CustomDomains[index]; ok {
808+
region["domain"] = domain
809+
}
810+
} else {
811+
region["domain"] = DefaultClusterDomain
791812
}
792813
return region
793814
}
@@ -909,9 +930,14 @@ func (r *Region) InstallChartsWithAdvancedConfig(t *testing.T, cluster string, i
909930
}
910931
}
911932

933+
clusterDomain := DefaultClusterDomain
934+
if r.IsMultiRegion {
935+
clusterDomain = CustomDomains[index]
936+
}
937+
912938
// Build helm values
913939
helmValues := PatchHelmValues(map[string]string{
914-
"cockroachdb.clusterDomain": CustomDomains[index],
940+
"cockroachdb.clusterDomain": clusterDomain,
915941
"cockroachdb.tls.selfSigner.caProvided": "true",
916942
"cockroachdb.tls.selfSigner.caSecret": customCASecret,
917943
})

tests/testutil/require.go

Lines changed: 4 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -57,8 +57,8 @@ func RequireClusterToBeReadyEventuallyTimeout(t *testing.T, crdbCluster Cockroac
5757
func(ctx context.Context) (bool, error) {
5858
ss, err := fetchStatefulSet(crdbCluster.K8sClient, crdbCluster.StatefulSetName, crdbCluster.Namespace)
5959
if err != nil {
60-
t.Logf("error fetching stateful set")
61-
return false, err
60+
t.Logf("transient error fetching stateful set, will retry: %v", err)
61+
return false, nil
6262
}
6363

6464
if ss == nil {
@@ -87,7 +87,8 @@ func RequireCRDBClusterToBeReadyEventuallyTimeout(t *testing.T, opts *k8s.Kubect
8787
LabelSelector: "app=cockroachdb",
8888
})
8989
if err != nil {
90-
return false, err
90+
t.Logf("transient error listing pods, will retry: %v", err)
91+
return false, nil
9192
}
9293

9394
if len(pods) != crdbCluster.DesiredNodes {

0 commit comments

Comments
 (0)