Skip to content

Commit c8aef41

Browse files
committed
additional e2e tests for resize_controlplane
1 parent 4a5969d commit c8aef41

5 files changed

Lines changed: 236 additions & 6 deletions

File tree

.pipelines/ci.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -369,7 +369,7 @@ stages:
369369
# Override the E2E label for IndividualCI/BatchedCI (i.e. not manually
370370
# run/PR jobs) to run all non-smoke, non-slow tasks (default is !smoke&&!regressiontest)
371371
- bash: |
372-
echo "##vso[task.setvariable variable=E2E_LABEL]!smoke"
372+
echo "##vso[task.setvariable variable=E2E_LABEL]!smoke&&!slow"
373373
displayName: Enable regression tests in CI
374374
condition: in(variables['Build.Reason'], 'IndividualCI', 'BatchedCI')
375375

Makefile

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ COMMIT = $(shell git rev-parse --short=7 HEAD)$(shell [[ $$(git status --porcela
99
ARO_IMAGE_BASE = ${RP_IMAGE_ACR}.azurecr.io/aro
1010
E2E_FLAGS ?= -test.v --ginkgo.vv --ginkgo.timeout 180m --ginkgo.flake-attempts=2 --ginkgo.junit-report=e2e-report.xml
1111
E2E_LABEL ?= !smoke&&!regressiontest
12+
E2E_FOKUS ?=
1213
GO_FLAGS ?= -tags=containers_image_openpgp,exclude_graphdriver_btrfs,exclude_graphdriver_devicemapper
1314
OC ?= oc
1415

@@ -300,6 +301,10 @@ tunnel:
300301
e2e.test:
301302
go test ./test/e2e/... -tags e2e,codec.safe -c -ldflags "-X github.com/Azure/ARO-RP/pkg/util/version.GitCommit=$(VERSION)" -o e2e.test
302303

304+
.PHONY: e2e
305+
e2e:
306+
go test ./test/e2e/... -tags e2e,codec.safe -timeout 180m --ginkgo.v --ginkgo.flake-attempts=2 -ginkgo.label-filter="$(E2E_LABEL)" -ginkgo.focus="$(E2E_FOKUS)" -v
307+
303308
.PHONY: e2etools
304309
e2etools:
305310
CGO_ENABLED=0 go build -ldflags "-X github.com/Azure/ARO-RP/pkg/util/version.GitCommit=$(VERSION)" ./hack/cluster
@@ -325,7 +330,7 @@ validate-go: validate-go-action $(GOLANGCI_LINT)
325330

326331
.PHONY: validate-go-action
327332
validate-go-action: validate-imports validate-lint-go-fix validate-gh-actions
328-
go run ./hack/licenses -validate -ignored-go vendor,pkg/client,.git -ignored-python python/client,python/az/aro/azext_aro/aaz,vendor,.git
333+
go run ./hack/licenses -validate -ignored-go vendor,pkg/client,.git -ignored-python python/client,python/az/aro/azext_aro/aaz,python/az/aro/build/lib/azext_aro,vendor,.git
329334
@[ -z "$$(ls pkg/util/*.go 2>/dev/null)" ] || (echo error: go files are not allowed in pkg/util, use a subpackage; exit 1)
330335
@[ -z "$$(find . -name "*:*")" ] || (echo error: filenames with colons are not allowed on Windows, please rename; exit 1)
331336
@sha256sum --quiet -c .sha256sum || (echo error: client library is stale, please run make client; exit 1)

test/e2e/adminapi_resize_controlplane.go

Lines changed: 220 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,127 @@ package e2e
55

66
import (
77
"context"
8+
"encoding/json"
9+
"fmt"
10+
"math"
811
"net/http"
912
"net/url"
13+
"slices"
14+
"strings"
15+
"time"
1016

1117
. "github.com/onsi/ginkgo/v2"
1218
. "github.com/onsi/gomega"
19+
20+
corev1 "k8s.io/api/core/v1"
21+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
22+
"k8s.io/apimachinery/pkg/types"
23+
24+
"github.com/Azure/azure-sdk-for-go/services/compute/mgmt/2020-06-01/compute"
25+
26+
machinev1beta1 "github.com/openshift/api/machine/v1beta1"
27+
28+
"github.com/Azure/ARO-RP/pkg/api"
29+
"github.com/Azure/ARO-RP/pkg/api/validate"
30+
"github.com/Azure/ARO-RP/pkg/util/stringutils"
1331
)
1432

33+
const (
34+
masterMachineRoleLabelSelector = "machine.openshift.io/cluster-api-machine-role=master"
35+
machineLabelInstanceType = "machine.openshift.io/instance-type"
36+
nodeLabelInstanceType = "node.kubernetes.io/instance-type"
37+
)
38+
39+
func getControlPlaneVMs(ctx context.Context) []compute.VirtualMachine {
40+
oc, err := clients.OpenshiftClusters.Get(ctx, vnetResourceGroup, clusterName)
41+
Expect(err).NotTo(HaveOccurred())
42+
clusterResourceGroup := stringutils.LastTokenByte(*oc.ClusterProfile.ResourceGroupID, '/')
43+
vms, err := clients.VirtualMachines.List(ctx, clusterResourceGroup)
44+
Expect(err).NotTo(HaveOccurred())
45+
return slices.DeleteFunc(vms, func(vm compute.VirtualMachine) bool {
46+
Expect(vm.Name).ToNot(BeNil())
47+
return !strings.Contains(*vm.Name, "master")
48+
})
49+
}
50+
51+
// getControlPlaneVMSize retrieves the VM size of one of the control plane
52+
// (master) VMs in the cluster by listing all VMs in the cluster resource group
53+
// and returning the size of the first VM whose name contains "master".
54+
func getControlPlaneVMSize(ctx context.Context) string {
55+
vms := getControlPlaneVMs(ctx)
56+
Expect(vms).NotTo(BeEmpty())
57+
Expect(vms[0].HardwareProfile).NotTo(BeNil())
58+
return string(vms[0].HardwareProfile.VMSize)
59+
}
60+
61+
// nextLargerSupportedMasterVMSize returns the supported master VM size in the
62+
// same family as currentVMSize that has the smallest core count strictly
63+
// greater than currentVMSize's core count. It returns an error if currentVMSize
64+
// is not in the supported master list, or if no larger size exists in the same
65+
// family.
66+
func nextLargerSupportedMasterVMSize(currentVMSize string) (string, error) {
67+
supportedMasterSizes := validate.SupportedVMSizesByRole(validate.VMRoleMaster)
68+
currentInfo, ok := supportedMasterSizes[api.VMSize(currentVMSize)]
69+
if !ok {
70+
return "", fmt.Errorf("current VM size %q is not in the supported master list", currentVMSize)
71+
}
72+
73+
targetSku := ""
74+
targetCores := math.MaxInt
75+
for size, info := range supportedMasterSizes {
76+
if info.Family != currentInfo.Family {
77+
continue
78+
}
79+
if info.CoreCount <= currentInfo.CoreCount {
80+
continue
81+
}
82+
if info.CoreCount < targetCores {
83+
targetCores = info.CoreCount
84+
targetSku = string(size)
85+
}
86+
}
87+
88+
if targetSku == "" {
89+
return "", fmt.Errorf("no supported master VM size larger than %q (family %s, %d cores) is available", currentVMSize, currentInfo.Family, currentInfo.CoreCount)
90+
}
91+
return targetSku, nil
92+
}
93+
94+
// validateMasterVMSizeLabels makes sure that master machine and node Resources in the cluster have the correct vmsize labels. It verifies that the following are equal to the targetSku
95+
// - metadata.labels."machine.openshift.io/instance-type" for machine
96+
// - spec.ProviderSpec.value.vmSize for machine
97+
// - metadata.labels."node.kubernetes.io/instance-type" for node
98+
// for each of the master nodes
99+
//
100+
// There is no return value, as this is supposed to be called directly from ginkgo test cases. This function validates the labels via [github.com/onsi/gomega.Expect] statements
101+
func validateMasterVMSizeLabels(ctx context.Context, targetSku string) {
102+
masterMachinesList, err := clients.MachineAPI.MachineV1beta1().Machines("openshift-machine-api").List(ctx, metav1.ListOptions{
103+
LabelSelector: masterMachineRoleLabelSelector,
104+
})
105+
Expect(err).ToNot(HaveOccurred())
106+
107+
for _, ma := range masterMachinesList.Items {
108+
By(fmt.Sprintf("Checking machine and node labels for %s", ma.GetName()))
109+
sizeLabelVal, ok := ma.GetObjectMeta().GetLabels()[machineLabelInstanceType]
110+
Expect(ok).To(BeTrue())
111+
Expect(sizeLabelVal).To(Equal(targetSku))
112+
113+
var machineProvSpec machinev1beta1.AzureMachineProviderSpec
114+
Expect(json.Unmarshal(ma.Spec.ProviderSpec.Value.Raw, &machineProvSpec)).ToNot(HaveOccurred())
115+
Expect(machineProvSpec.VMSize).To(Equal(targetSku))
116+
117+
Expect(ma.Status.NodeRef).ToNot(BeNil())
118+
119+
var curNode corev1.Node
120+
err = clients.KubeClient.Get(ctx, types.NamespacedName{Name: ma.Status.NodeRef.Name}, &curNode)
121+
Expect(err).ToNot(HaveOccurred())
122+
123+
nodeSizeLabelVal, ok := curNode.GetLabels()[nodeLabelInstanceType]
124+
Expect(ok).To(BeTrue())
125+
Expect(nodeSizeLabelVal).To(Equal(targetSku))
126+
}
127+
}
128+
15129
var _ = Describe("[Admin API] Resize control plane", func() {
16130
BeforeEach(skipIfNotInDevelopmentEnv)
17131

@@ -39,4 +153,110 @@ var _ = Describe("[Admin API] Resize control plane", func() {
39153
Expect(err).NotTo(HaveOccurred())
40154
Expect(resp.StatusCode).To(Equal(http.StatusBadRequest))
41155
})
156+
157+
It("should not resize when size is already the same", func(ctx context.Context) {
158+
By("Getting the current machine size")
159+
preResizeVMSize := getControlPlaneVMSize(ctx)
160+
Expect(preResizeVMSize).ToNot(BeZero())
161+
162+
By(fmt.Sprintf("Resizing to the current machine size: %s", preResizeVMSize))
163+
164+
params := url.Values{
165+
"deallocateVM": []string{"false"},
166+
"vmSize": []string{preResizeVMSize},
167+
}
168+
169+
resp, err := adminRequest(ctx, http.MethodPost,
170+
"/admin"+clusterResourceID+"/resizecontrolplane",
171+
params, true, nil, nil)
172+
Expect(err).NotTo(HaveOccurred())
173+
Expect(resp.StatusCode).To(Equal(http.StatusOK))
174+
175+
controlPlaneVms := getControlPlaneVMs(ctx)
176+
Expect(controlPlaneVms).ToNot(BeEmpty())
177+
for _, vm := range controlPlaneVms {
178+
Expect(vm.HardwareProfile).ToNot(BeNil())
179+
Expect(string(vm.HardwareProfile.VMSize)).To(Equal(preResizeVMSize))
180+
}
181+
})
182+
183+
It("Should not attempt to resize if there is no quota", func(ctx context.Context) {
184+
By("Finding a supported Master VM Size without Quota")
185+
usageRes, err := clients.Usages.List(ctx, _env.Location())
186+
Expect(err).ToNot(HaveOccurred())
187+
supportedSizes := validate.SupportedVMSizesByRole(validate.VMRoleMaster)
188+
// looking for supported vms with 0 quota
189+
targetSku := ""
190+
for size, sizeInfo := range supportedSizes {
191+
for _, u := range usageRes {
192+
if u.Name == nil ||
193+
u.Name.Value == nil ||
194+
*u.Name.Value != sizeInfo.Family ||
195+
u.Limit == nil {
196+
continue
197+
}
198+
199+
if *u.Limit == 0 {
200+
targetSku = size.String()
201+
}
202+
}
203+
}
204+
205+
if targetSku == "" {
206+
Skip("Can't run test. No supported SKU without quota found")
207+
}
208+
209+
By(fmt.Sprintf("Trying to resize controlplane vms to %s", targetSku))
210+
params := url.Values{
211+
"deallocateVM": []string{"false"},
212+
"vmSize": []string{targetSku},
213+
}
214+
215+
out := api.CloudError{}
216+
resp, err := adminRequest(ctx, http.MethodPost, "/admin"+clusterResourceID+"/resizecontrolplane", params, true, nil, &out)
217+
218+
Expect(err).NotTo(HaveOccurred())
219+
Expect(resp.StatusCode).To(Equal(http.StatusBadRequest))
220+
Expect(out.Message).To(Equal("Pre-flight validation failed."))
221+
Expect(out.Details).To(HaveLen(1))
222+
Expect(out.Details[0].Code).To(Equal("ResourceQuotaExceeded"))
223+
})
224+
225+
It("should do the resize when target size is different", Label(slow), FlakeAttempts(1), Serial, func(ctx context.Context) {
226+
By("Getting the current machine size")
227+
preResizeVMSize := getControlPlaneVMSize(ctx)
228+
Expect(preResizeVMSize).ToNot(BeZero())
229+
230+
// Pick the next-larger VM size within the same family from the
231+
// supported-master list. This keeps the resize on a well-tested size
232+
// while avoiding arbitrary family swaps.
233+
targetSku, err := nextLargerSupportedMasterVMSize(preResizeVMSize)
234+
if err != nil {
235+
Skip(err.Error())
236+
}
237+
238+
By(fmt.Sprintf("Resizing from %s to %s", preResizeVMSize, targetSku))
239+
params := url.Values{
240+
"deallocateVM": []string{"false"},
241+
"vmSize": []string{targetSku},
242+
}
243+
244+
resp, err := adminRequest(ctx, http.MethodPost, "/admin"+clusterResourceID+"/resizecontrolplane", params, true, nil, nil)
245+
246+
Expect(err).NotTo(HaveOccurred())
247+
Expect(resp.StatusCode).To(Equal(http.StatusOK))
248+
249+
By("Validating vm size after resize")
250+
controlPlaneVms := getControlPlaneVMs(ctx)
251+
Expect(controlPlaneVms).ToNot(BeEmpty())
252+
for _, vm := range controlPlaneVms {
253+
Expect(vm.HardwareProfile).ToNot(BeNil())
254+
Expect(string(vm.HardwareProfile.VMSize)).To(Equal(targetSku))
255+
Expect(vm.ProvisioningState).ToNot(BeNil())
256+
Expect(*vm.ProvisioningState).To(Equal(string(compute.ProvisioningStateSucceeded)))
257+
}
258+
259+
By("Validating machine and node labels")
260+
validateMasterVMSizeLabels(ctx, targetSku)
261+
}, NodeTimeout(30*time.Minute))
42262
})

test/e2e/setup.go

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,9 @@ const (
8080
// These tests focus on core OCP health, ARO-specific customizations,
8181
// and Azure integration.
8282
install = "install"
83+
// slow is for tests that take a long time to run. They should be skipped
84+
// except when their execution is explicitly enabled via a label selector.
85+
slow = "slow"
8386
)
8487

8588
//go:embed static_resources
@@ -104,13 +107,14 @@ type clientSet struct {
104107
Subnet armnetwork.SubnetsClient
105108
VirtualNetworks armnetwork.VirtualNetworksClient
106109
Storage storage.AccountsClient
110+
Usages compute.UsageClient
107111

108112
Dynamic dynamic.Client
109113
RestConfig *rest.Config
110114
HiveRestConfig *rest.Config
111115
Monitoring monitoringclient.Interface
112116
Kubernetes kubernetes.Interface
113-
Client client.Client
117+
KubeClient client.Client
114118
MachineAPI machineclient.Interface
115119
MachineConfig mcoclient.Interface
116120
Route routeclient.Interface
@@ -537,12 +541,13 @@ func newClientSet(ctx context.Context) (*clientSet, error) {
537541
Subnet: subnetsClient,
538542
VirtualNetworks: virtualNetworksClient,
539543
Storage: storage.NewAccountsClient(_env.Environment(), _env.SubscriptionID(), authorizer),
544+
Usages: compute.NewUsageClient(_env.Environment(), _env.SubscriptionID(), authorizer),
540545

541546
RestConfig: restconfig,
542547
HiveRestConfig: hiveRestConfig,
543548
Kubernetes: cli,
544549
Dynamic: dynamiccli,
545-
Client: controllerRuntimeClient,
550+
KubeClient: controllerRuntimeClient,
546551
Monitoring: monitoring,
547552
MachineAPI: machineapicli,
548553
MachineConfig: mcocli,

test/e2e/update.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ var _ = Describe("Update clusters", func() {
3737
Name: "openshift-azure-operator",
3838
},
3939
}
40-
err := clients.Client.Delete(ctx, cr)
40+
err := clients.KubeClient.Delete(ctx, cr)
4141
Expect(err).NotTo(HaveOccurred())
4242

4343
By("sending the PATCH request to update the cluster")
@@ -46,7 +46,7 @@ var _ = Describe("Update clusters", func() {
4646

4747
By("checking that the CredentialsRequest has been recreated")
4848
cr = &cloudcredentialv1.CredentialsRequest{}
49-
err = clients.Client.Get(ctx, crNamespacedName, cr)
49+
err = clients.KubeClient.Get(ctx, crNamespacedName, cr)
5050
Expect(err).NotTo(HaveOccurred())
5151
})
5252

0 commit comments

Comments
 (0)