From 1117ff7467407b0bb4c9d6ef6d7c329d0c09a38e Mon Sep 17 00:00:00 2001 From: Fabian Wiesel Date: Thu, 18 Jun 2026 11:42:22 +0200 Subject: [PATCH 1/2] Apply offboarding taint once VMs are evicted Gate on Evicting=False to avoid racing with live-migration. Watch Hypervisor status changes (evictingConditionChangedPredicate) so the taint is applied promptly when the condition transitions. --- internal/controller/constants.go | 5 ++ .../gardener_node_lifecycle_controller.go | 81 ++++++++++++++++++- ...gardener_node_lifecycle_controller_test.go | 59 ++++++++++++++ 3 files changed, 144 insertions(+), 1 deletion(-) diff --git a/internal/controller/constants.go b/internal/controller/constants.go index f4119dc6..3617d02e 100644 --- a/internal/controller/constants.go +++ b/internal/controller/constants.go @@ -21,4 +21,9 @@ package controller const ( labelHypervisor = "nova.openstack.cloud.sap/virt-driver" testAggregateName = "tenant_filter_tests" + + // taintKeyOffboarding is used as a NoExecute taint. nova-compute and + // neutron agent pods do not tolerate it (the kvm-node-agent and the + // signalling pod do), so applying it forces those agents off the node. + taintKeyOffboarding = "kvm.cloud.sap/offboarding" ) diff --git a/internal/controller/gardener_node_lifecycle_controller.go b/internal/controller/gardener_node_lifecycle_controller.go index 9966651c..1af657fe 100644 --- a/internal/controller/gardener_node_lifecycle_controller.go +++ b/internal/controller/gardener_node_lifecycle_controller.go @@ -33,9 +33,13 @@ import ( v1 "k8s.io/client-go/applyconfigurations/meta/v1" policyv1ac "k8s.io/client-go/applyconfigurations/policy/v1" ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/builder" k8sclient "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/apiutil" + "sigs.k8s.io/controller-runtime/pkg/event" + "sigs.k8s.io/controller-runtime/pkg/handler" logger "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/predicate" kvmv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1" ) @@ -85,6 +89,19 @@ func (r *GardenerNodeLifecycleController) Reconcile(ctx context.Context, req ctr return ctrl.Result{}, nil } + // Apply the offboarding taint once VMs are gone; gate on Evicting=False + // to avoid racing with live-migration. + if hv.Spec.Maintenance == kvmv1.MaintenanceTermination && + meta.IsStatusConditionFalse(hv.Status.Conditions, kvmv1.ConditionTypeEvicting) { + patched, err := r.ensureOffboardingTaint(ctx, node) + if err != nil { + return ctrl.Result{}, fmt.Errorf("failed to ensure offboarding taint: %w", err) + } + if patched { + return ctrl.Result{}, nil + } + } + // We do not care about the particular value, as long as it isn't an error var minAvailable int32 = 1 @@ -115,6 +132,30 @@ func (r *GardenerNodeLifecycleController) Reconcile(ctx context.Context, req ctr return ctrl.Result{}, nil } +// ensureOffboardingTaint adds the offboarding NoExecute taint if not already +// present. Returns true when a patch was issued (caller should return early). +func (r *GardenerNodeLifecycleController) ensureOffboardingTaint(ctx context.Context, node *corev1.Node) (bool, error) { + for _, t := range node.Spec.Taints { + if t.Key == taintKeyOffboarding && t.Effect == corev1.TaintEffectNoExecute { + return false, nil + } + } + + log := logger.FromContext(ctx) + log.Info("Adding offboarding taint to node", + "node", node.Name, + "taint", taintKeyOffboarding, + "effect", corev1.TaintEffectNoExecute) + + // StrategicMergeFrom merges taints by key, preserving concurrent additions. + patch := k8sclient.StrategicMergeFrom(node.DeepCopy()) + node.Spec.Taints = append(node.Spec.Taints, corev1.Taint{ + Key: taintKeyOffboarding, + Effect: corev1.TaintEffectNoExecute, + }) + return true, r.Patch(ctx, node, patch, k8sclient.FieldOwner(MaintenanceControllerName)) +} + func (r *GardenerNodeLifecycleController) ensureBlockingPodDisruptionBudget(ctx context.Context, node *corev1.Node, minAvailable int32) error { name := nameForNode(node) nodeLabels := labelsForNode(node) @@ -226,10 +267,48 @@ func (r *GardenerNodeLifecycleController) SetupWithManager(mgr ctrl.Manager, nam _ = logger.FromContext(ctx) r.namespace = namespace + hypervisorToNode := handler.EnqueueRequestForOwner(mgr.GetScheme(), mgr.GetRESTMapper(), &corev1.Node{}) + + // Maintenance=termination bumps generation; Evicting status changes do not. + hypervisorRelevantChange := predicate.Or( + predicate.GenerationChangedPredicate{}, + evictingConditionChangedPredicate{}, + ) + return ctrl.NewControllerManagedBy(mgr). Named(MaintenanceControllerName). For(&corev1.Node{}). - Owns(&appsv1.Deployment{}). // trigger the r.Reconcile whenever an Own-ed deployment is created/updated/deleted + Watches(&kvmv1.Hypervisor{}, hypervisorToNode, + builder.WithPredicates(hypervisorRelevantChange), + ). + Owns(&appsv1.Deployment{}). Owns(&policyv1.PodDisruptionBudget{}). Complete(r) } + +// evictingConditionChangedPredicate complements GenerationChangedPredicate, +// which ignores status-only updates. +type evictingConditionChangedPredicate struct { + predicate.Funcs +} + +func (evictingConditionChangedPredicate) Update(e event.UpdateEvent) bool { + if e.ObjectOld == nil || e.ObjectNew == nil { + return false + } + oldHv, ok1 := e.ObjectOld.(*kvmv1.Hypervisor) + newHv, ok2 := e.ObjectNew.(*kvmv1.Hypervisor) + if !ok1 || !ok2 { + return false + } + oldCond := meta.FindStatusCondition(oldHv.Status.Conditions, kvmv1.ConditionTypeEvicting) + newCond := meta.FindStatusCondition(newHv.Status.Conditions, kvmv1.ConditionTypeEvicting) + switch { + case oldCond == nil && newCond == nil: + return false + case oldCond == nil || newCond == nil: + return true + default: + return oldCond.Status != newCond.Status + } +} diff --git a/internal/controller/gardener_node_lifecycle_controller_test.go b/internal/controller/gardener_node_lifecycle_controller_test.go index 205b51fa..ace31b80 100644 --- a/internal/controller/gardener_node_lifecycle_controller_test.go +++ b/internal/controller/gardener_node_lifecycle_controller_test.go @@ -207,4 +207,63 @@ var _ = Describe("Gardener Maintenance Controller", func() { Expect(pdb.Spec.MinAvailable).To(HaveField("IntVal", BeNumerically("==", int32(0)))) }) }) + + Context("Offboarding taint", func() { + findOffboardingTaint := func(node *corev1.Node) *corev1.Taint { + for i := range node.Spec.Taints { + t := &node.Spec.Taints[i] + if t.Key == taintKeyOffboarding && t.Effect == corev1.TaintEffectNoExecute { + return t + } + } + return nil + } + + When("the hypervisor is in maintenance termination and the VMs have been evicted", func() { + BeforeEach(func(ctx SpecContext) { + hypervisor := &kvmv1.Hypervisor{} + Expect(k8sClient.Get(ctx, name, hypervisor)).To(Succeed()) + hypervisor.Spec.Maintenance = kvmv1.MaintenanceTermination + Expect(k8sClient.Update(ctx, hypervisor)).To(Succeed()) + + meta.SetStatusCondition(&hypervisor.Status.Conditions, metav1.Condition{ + Type: kvmv1.ConditionTypeEvicting, + Status: metav1.ConditionFalse, + Reason: "Succeeded", + Message: "All VMs evicted", + }) + Expect(k8sClient.Status().Update(ctx, hypervisor)).To(Succeed()) + }) + + It("should add the offboarding NoExecute taint to the node", func(ctx SpecContext) { + _, err := controller.Reconcile(ctx, reconcileReq) + Expect(err).NotTo(HaveOccurred()) + + node := &corev1.Node{} + Expect(k8sClient.Get(ctx, name, node)).To(Succeed()) + taint := findOffboardingTaint(node) + Expect(taint).NotTo(BeNil()) + Expect(taint.Key).To(Equal(taintKeyOffboarding)) + Expect(taint.Effect).To(Equal(corev1.TaintEffectNoExecute)) + }) + + It("should be idempotent: a second reconcile must not add a duplicate taint", func(ctx SpecContext) { + _, err := controller.Reconcile(ctx, reconcileReq) + Expect(err).NotTo(HaveOccurred()) + _, err = controller.Reconcile(ctx, reconcileReq) + Expect(err).NotTo(HaveOccurred()) + + node := &corev1.Node{} + Expect(k8sClient.Get(ctx, name, node)).To(Succeed()) + + count := 0 + for _, t := range node.Spec.Taints { + if t.Key == taintKeyOffboarding { + count++ + } + } + Expect(count).To(Equal(1)) + }) + }) + }) }) From 2f3157a84fd2e0c8bd99f4a5dcdc2638b90489e7 Mon Sep 17 00:00:00 2001 From: Fabian Wiesel Date: Thu, 18 Jun 2026 11:46:03 +0200 Subject: [PATCH 2/2] Gate compute-service deletion on agent pods being evicted A running nova-compute pod would re-register the service immediately after deletion, undoing the offboarding. Introduce AgentPodsEvicted condition computed by HypervisorController and block service deletion in the offboarding controller until it is True. The condition is only evaluated once VM eviction has finished (Evicting=False) and the offboarding taint is present on the node. Pods are listed per-namespace via --agent-namespaces (chart default: monsoon3; the operator refuses to start without it) in pages of 100, with the pod cache disabled so the operator does not start a cluster-wide pod informer. RequeueAfter is used to poll since pods are not watched. --- api/v1/hypervisor_types.go | 5 + .../templates/deployment.yaml | 2 + .../templates/role.yaml | 6 + .../openstack-hypervisor-operator/values.yaml | 2 + cmd/main.go | 37 ++ go.mod | 2 +- internal/controller/hypervisor_controller.go | 146 +++++++- .../controller/hypervisor_controller_test.go | 345 ++++++++++++++++++ internal/controller/offboarding_controller.go | 13 +- .../controller/offboarding_controller_test.go | 61 ++++ internal/global/global.go | 5 + 11 files changed, 610 insertions(+), 14 deletions(-) diff --git a/api/v1/hypervisor_types.go b/api/v1/hypervisor_types.go index 34f73b76..9a78b4d0 100644 --- a/api/v1/hypervisor_types.go +++ b/api/v1/hypervisor_types.go @@ -71,6 +71,11 @@ const ( // ConditionTypeAggregatesUpdated is the type of condition for aggregates updated status of a hypervisor ConditionTypeAggregatesUpdated = "AggregatesUpdated" + + // ConditionTypeAgentPodsEvicted gates compute-service deletion during + // offboarding: a running nova-compute pod would otherwise re-register + // the service we are about to delete. + ConditionTypeAgentPodsEvicted = "AgentPodsEvicted" ) // Condition Reasons diff --git a/charts/openstack-hypervisor-operator/templates/deployment.yaml b/charts/openstack-hypervisor-operator/templates/deployment.yaml index b37cfca1..f82077cb 100644 --- a/charts/openstack-hypervisor-operator/templates/deployment.yaml +++ b/charts/openstack-hypervisor-operator/templates/deployment.yaml @@ -45,6 +45,8 @@ spec: value: {{ quote .Values.controllerManager.manager.env.certificateIssuerName }} - name: LABEL_SELECTOR value: {{ quote .Values.controllerManager.manager.env.labelSelector }} + - name: AGENT_NAMESPACES + value: {{ quote .Values.controllerManager.manager.env.agentNamespaces }} - name: KUBERNETES_CLUSTER_DOMAIN value: {{ quote .Values.kubernetesClusterDomain }} image: {{ .Values.controllerManager.manager.image.repository }}:{{ .Chart.AppVersion }} diff --git a/charts/openstack-hypervisor-operator/templates/role.yaml b/charts/openstack-hypervisor-operator/templates/role.yaml index edd53191..3b1ecda8 100644 --- a/charts/openstack-hypervisor-operator/templates/role.yaml +++ b/charts/openstack-hypervisor-operator/templates/role.yaml @@ -20,6 +20,12 @@ rules: - nodes/status verbs: - get +- apiGroups: + - "" + resources: + - pods + verbs: + - list - apiGroups: - apps resources: diff --git a/charts/openstack-hypervisor-operator/values.yaml b/charts/openstack-hypervisor-operator/values.yaml index a64814aa..cc4f686c 100644 --- a/charts/openstack-hypervisor-operator/values.yaml +++ b/charts/openstack-hypervisor-operator/values.yaml @@ -7,6 +7,7 @@ controllerManager: - --certificate-namespace=$(CERTIFICATE_NAMESPACE) - --certificate-issuer-name=$(CERTIFICATE_ISSUER_NAME) - --label-selector=$(LABEL_SELECTOR) + - --agent-namespaces=$(AGENT_NAMESPACES) containerSecurityContext: allowPrivilegeEscalation: false capabilities: @@ -16,6 +17,7 @@ controllerManager: certificateIssuerName: nova-hypervisor-agents-ca-issuer certificateNamespace: monsoon3 labelSelector: "" + agentNamespaces: "monsoon3" osAuthUrl: "" osProjectDomainName: "" osProjectName: "" diff --git a/cmd/main.go b/cmd/main.go index 6efef160..8bf875f7 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -25,6 +25,7 @@ import ( "fmt" "os" gruntime "runtime" + "strings" "github.com/sapcc/go-api-declarations/bininfo" // Import all Kubernetes client auth plugins (e.g. Azure, GCP, OIDC, etc.) @@ -97,6 +98,10 @@ func main() { "If set, HTTP/2 will be enabled for the metrics and webhook servers") flag.StringVar(&global.LabelSelector, "label-selector", "", "Label selector to filter watched resources (namely nodes).") + var agentNamespacesFlag string + flag.StringVar(&agentNamespacesFlag, "agent-namespaces", "", + "Comma-separated list of namespaces to search for agent pods (nova-compute, neutron) during offboarding.") + flag.StringVar(&certificateNamespace, "certificate-namespace", "monsoon3", "The namespace for the certificates. ") flag.StringVar(&certificateIssuerName, "certificate-issuer-name", "nova-hypervisor-agents-ca-issuer", "Name of the certificate issuer.") @@ -111,6 +116,8 @@ func main() { opts.BindFlags(flag.CommandLine) flag.Parse() + // Handle --version before validating required runtime flags so that + // `./manager --version` works without providing --agent-namespaces. if version { fmt.Printf("%s %s (%s/%s) %s\n", bininfo.Component(), bininfo.VersionOr("devel"), gruntime.GOOS, gruntime.GOARCH, @@ -118,6 +125,29 @@ func main() { os.Exit(0) } + if agentNamespacesFlag != "" { + // Deduplicate. Without this, repeated entries (e.g. from templated + // values) would cause redundant pod-list calls and inflate the + // AgentPodsEvicted condition message with duplicate pod names. + seen := map[string]struct{}{} + for ns := range strings.SplitSeq(agentNamespacesFlag, ",") { + ns = strings.TrimSpace(ns) + if ns == "" { + continue + } + if _, ok := seen[ns]; ok { + continue + } + seen[ns] = struct{}{} + global.AgentNamespaces = append(global.AgentNamespaces, ns) + } + } + + if len(global.AgentNamespaces) == 0 { + setupLog.Error(errors.New("--agent-namespaces is required"), "invalid configuration") + os.Exit(1) + } + if certificateIssuerName == "" { setupLog.Error(errors.New("certificate-issuer-name cannot be empty"), "invalid certificate issuer name") os.Exit(1) @@ -226,6 +256,13 @@ func main() { // Optionally configure the cache to listen/watch for specific labeled resources only Cache: cacheOptions, + // Pods are listed directly (not cached) to avoid a cluster-wide pod + // informer — a single large namespace would cause an OOM on startup. + Client: client.Options{ + Cache: &client.CacheOptions{ + DisableFor: []client.Object{&corev1.Pod{}}, + }, + }, }) if err != nil { diff --git a/go.mod b/go.mod index 4d7fc577..5ddfce0a 100644 --- a/go.mod +++ b/go.mod @@ -43,7 +43,7 @@ require ( github.com/felixge/httpsnoop v1.0.4 // indirect github.com/fsnotify/fsnotify v1.9.0 // indirect github.com/fxamacker/cbor/v2 v2.9.0 // indirect - github.com/go-logr/logr v1.4.3 // indirect + github.com/go-logr/logr v1.4.3 github.com/go-logr/stdr v1.2.2 // indirect github.com/go-logr/zapr v1.3.0 // indirect github.com/go-openapi/jsonpointer v0.22.4 // indirect diff --git a/internal/controller/hypervisor_controller.go b/internal/controller/hypervisor_controller.go index 138aa0b0..0ee57305 100644 --- a/internal/controller/hypervisor_controller.go +++ b/internal/controller/hypervisor_controller.go @@ -21,13 +21,17 @@ import ( "context" "fmt" "slices" + "sort" "strings" + "time" + "github.com/go-logr/logr" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/equality" "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/api/meta" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/fields" "k8s.io/apimachinery/pkg/labels" "k8s.io/apimachinery/pkg/runtime" ctrl "sigs.k8s.io/controller-runtime" @@ -69,6 +73,7 @@ type HypervisorController struct { // +kubebuilder:rbac:groups="",resources=nodes,verbs=get;list;watch // +kubebuilder:rbac:groups="",resources=nodes/status,verbs=get +// +kubebuilder:rbac:groups="",resources=pods,verbs=list // +kubebuilder:rbac:groups=kvm.cloud.sap,resources=hypervisors,verbs=get;list;watch;create;update;patch;delete // +kubebuilder:rbac:groups=kvm.cloud.sap,resources=hypervisors/status,verbs=get;update;patch @@ -100,8 +105,22 @@ func (hv *HypervisorController) Reconcile(ctx context.Context, req ctrl.Request) } // continue with creation } else { - // update Status if needed - base := hypervisor.DeepCopy() + // First, propagate spec/metadata derived from the Node (labels, + // annotations -> aggregates/traits, lifecycle). This must run on + // every reconcile, including those where status will also change + // (e.g. AgentPodsEvicted=False during termination); otherwise the + // Hypervisor spec/labels go stale. + specBase := hypervisor.DeepCopy() + syncLabelsAndAnnotations(nodeLabels, hypervisor, node) + if !equality.Semantic.DeepEqual(hypervisor, specBase) { + if err := hv.Patch(ctx, hypervisor, k8sclient.MergeFromWithOptions(specBase, + k8sclient.MergeFromWithOptimisticLock{}), k8sclient.FieldOwner(HypervisorControllerName)); err != nil { + return ctrl.Result{}, err + } + } + + // Then, compute and persist any status changes derived from the Node. + statusBase := hypervisor.DeepCopy() // transfer internal IP for _, address := range node.Status.Addresses { @@ -123,26 +142,43 @@ func (hv *HypervisorController) Reconcile(ctx context.Context, req ctrl.Request) }) } - if !equality.Semantic.DeepEqual(hypervisor, base) { + // Only evaluate after VM eviction; a spurious True on a fresh node + // (agents not yet scheduled) would be misleading. + var statusRequeueAfter time.Duration + if hypervisor.Spec.Maintenance == kvmv1.MaintenanceTermination && + meta.IsStatusConditionFalse(hypervisor.Status.Conditions, kvmv1.ConditionTypeEvicting) && + nodeHasOffboardingTaint(node) { + cond, err := hv.computeAgentPodsEvictedCondition(ctx, log, node.Name) + if err != nil { + return ctrl.Result{}, fmt.Errorf("failed to compute %s condition: %w", kvmv1.ConditionTypeAgentPodsEvicted, err) + } + meta.SetStatusCondition(&hypervisor.Status.Conditions, cond) + if cond.Status == metav1.ConditionFalse { + // No pod watch — rely on periodic requeue. + statusRequeueAfter = defaultPollTime + } + } + + if !equality.Semantic.DeepEqual(hypervisor, statusBase) { // Capture values to apply - only mutate fields this controller owns newInternalIP := hypervisor.Status.InternalIP terminatingCondition := meta.FindStatusCondition(hypervisor.Status.Conditions, kvmv1.ConditionTypeTerminating) + agentPodsCondition := meta.FindStatusCondition(hypervisor.Status.Conditions, kvmv1.ConditionTypeAgentPodsEvicted) - return ctrl.Result{}, utils.PatchHypervisorStatusWithRetry(ctx, hv.Client, hypervisor.Name, HypervisorControllerName, func(h *kvmv1.Hypervisor) { + if err := utils.PatchHypervisorStatusWithRetry(ctx, hv.Client, hypervisor.Name, HypervisorControllerName, func(h *kvmv1.Hypervisor) { h.Status.InternalIP = newInternalIP if terminatingCondition != nil { meta.SetStatusCondition(&h.Status.Conditions, *terminatingCondition) } - }) - } - - syncLabelsAndAnnotations(nodeLabels, hypervisor, node) - if equality.Semantic.DeepEqual(hypervisor, base) { - return ctrl.Result{}, nil + if agentPodsCondition != nil { + meta.SetStatusCondition(&h.Status.Conditions, *agentPodsCondition) + } + }); err != nil { + return ctrl.Result{}, err + } } - return ctrl.Result{}, hv.Patch(ctx, hypervisor, k8sclient.MergeFromWithOptions(base, - k8sclient.MergeFromWithOptimisticLock{}), k8sclient.FieldOwner(HypervisorControllerName)) + return ctrl.Result{RequeueAfter: statusRequeueAfter}, nil } syncLabelsAndAnnotations(nodeLabels, hypervisor, node) @@ -262,3 +298,89 @@ func transportLabels(source, destination *metav1.ObjectMeta) { } } } + +// nodeHasOffboardingTaint reports whether the offboarding NoExecute taint has +// been applied to the node. The pod list is only meaningful after that point — +// before it, no agents have been evicted yet. +func nodeHasOffboardingTaint(node *corev1.Node) bool { + for _, t := range node.Spec.Taints { + if t.Key == taintKeyOffboarding && t.Effect == corev1.TaintEffectNoExecute { + return true + } + } + return false +} + +// computeAgentPodsEvictedCondition checks whether pods that would be evicted +// by the offboarding taint are still running. Only call once Evicting=False. +func (hv *HypervisorController) computeAgentPodsEvictedCondition(ctx context.Context, log logr.Logger, nodeName string) (metav1.Condition, error) { + offboardingTaint := corev1.Taint{ + Key: taintKeyOffboarding, + Effect: corev1.TaintEffectNoExecute, + } + + var agentPods []string + for _, ns := range global.AgentNamespaces { + var continueToken string + for { + pods := &corev1.PodList{} + if err := hv.List(ctx, pods, + k8sclient.InNamespace(ns), + k8sclient.MatchingFieldsSelector{ + Selector: fields.OneTermEqualSelector("spec.nodeName", nodeName), + }, + &k8sclient.ListOptions{Limit: 100, Continue: continueToken}, + ); err != nil { + return metav1.Condition{}, fmt.Errorf("failed to list pods on node %s in namespace %q: %w", nodeName, ns, err) + } + + for _, pod := range pods.Items { + if pod.Status.Phase == corev1.PodSucceeded || pod.Status.Phase == corev1.PodFailed { + continue + } + if podToleratesTaint(log, &pod, &offboardingTaint) { + continue + } + agentPods = append(agentPods, pod.Namespace+"/"+pod.Name) + } + + if pods.Continue == "" { + break + } + continueToken = pods.Continue + } + } + + if len(agentPods) == 0 { + return metav1.Condition{ + Type: kvmv1.ConditionTypeAgentPodsEvicted, + Status: metav1.ConditionTrue, + Reason: "NoAgentPods", + Message: "No agent pods are running on this node", + }, nil + } + + sort.Strings(agentPods) + return metav1.Condition{ + Type: kvmv1.ConditionTypeAgentPodsEvicted, + Status: metav1.ConditionFalse, + Reason: "AgentPodsRunning", + Message: fmt.Sprintf("%d agent pod(s) still running on node: %s", len(agentPods), strings.Join(agentPods, ", ")), + }, nil +} + +// podToleratesTaint reports whether the pod tolerates the taint indefinitely. +// Tolerations with a finite TolerationSeconds are excluded: the pod will +// eventually be evicted and must not be treated as safe to ignore. +func podToleratesTaint(log logr.Logger, pod *corev1.Pod, taint *corev1.Taint) bool { + for i := range pod.Spec.Tolerations { + t := &pod.Spec.Tolerations[i] + if t.TolerationSeconds != nil { + continue + } + if t.ToleratesTaint(log, taint, false) { + return true + } + } + return false +} diff --git a/internal/controller/hypervisor_controller_test.go b/internal/controller/hypervisor_controller_test.go index 9e1657bf..f99ffa35 100644 --- a/internal/controller/hypervisor_controller_test.go +++ b/internal/controller/hypervisor_controller_test.go @@ -18,17 +18,28 @@ limitations under the License. package controller import ( + "fmt" + "sync/atomic" + . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/meta" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/fields" "k8s.io/apimachinery/pkg/types" + k8sscheme "k8s.io/client-go/kubernetes/scheme" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" kvmv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1" + "github.com/cobaltcore-dev/openstack-hypervisor-operator/internal/global" ) +// envtest does not actually GC pods when their namespace is deleted, so +// each spec gets a fresh namespace via this counter to keep them isolated. +var agentNamespaceCounter atomic.Uint64 + var _ = Describe("Hypervisor Controller", func() { const ( resourceName = "other-node" @@ -402,4 +413,338 @@ var _ = Describe("Hypervisor Controller", func() { Expect(hypervisor.Status.InternalIP).To(Equal("192.168.1.100")) }) }) + + Context("AgentPodsEvicted condition", func() { + var agentNamespace string + + BeforeEach(func(ctx SpecContext) { + agentNamespace = fmt.Sprintf("agent-ns-%d", agentNamespaceCounter.Add(1)) + ns := &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: agentNamespace}} + Expect(client.IgnoreAlreadyExists(k8sClient.Create(ctx, ns))).To(Succeed()) + DeferCleanup(func(ctx SpecContext) { + Expect(client.IgnoreNotFound(k8sClient.Delete(ctx, ns))).To(Succeed()) + }) + + // Restrict pod listing to the test namespace. + global.AgentNamespaces = []string{agentNamespace} + DeferCleanup(func() { global.AgentNamespaces = nil }) + + // The condition is only computed during termination. + _, err := hypervisorController.Reconcile(ctx, ctrl.Request{ + NamespacedName: types.NamespacedName{Name: resource.Name}, + }) + Expect(err).NotTo(HaveOccurred()) + + hypervisor := &kvmv1.Hypervisor{} + Expect(k8sClient.Get(ctx, hypervisorName, hypervisor)).To(Succeed()) + hypervisor.Spec.Maintenance = kvmv1.MaintenanceTermination + hypervisor.Spec.LifecycleEnabled = true + Expect(k8sClient.Update(ctx, hypervisor)).To(Succeed()) + + // Default for these specs: VM eviction is done. Subcontexts + // that exercise the "not yet done" path override this. + Expect(k8sClient.Get(ctx, hypervisorName, hypervisor)).To(Succeed()) + meta.SetStatusCondition(&hypervisor.Status.Conditions, metav1.Condition{ + Type: kvmv1.ConditionTypeEvicting, + Status: metav1.ConditionFalse, + Reason: "Succeeded", + Message: "All VMs evicted", + }) + Expect(k8sClient.Status().Update(ctx, hypervisor)).To(Succeed()) + + // The pod list is only issued once the offboarding taint is on the node. + node := &corev1.Node{} + Expect(k8sClient.Get(ctx, types.NamespacedName{Name: resource.Name}, node)).To(Succeed()) + base := node.DeepCopy() + node.Spec.Taints = append(node.Spec.Taints, corev1.Taint{ + Key: taintKeyOffboarding, + Effect: corev1.TaintEffectNoExecute, + }) + Expect(k8sClient.Patch(ctx, node, client.MergeFrom(base))).To(Succeed()) + DeferCleanup(func(ctx SpecContext) { + fresh := &corev1.Node{} + if err := k8sClient.Get(ctx, types.NamespacedName{Name: resource.Name}, fresh); err == nil { + base := fresh.DeepCopy() + taints := fresh.Spec.Taints[:0] + for _, t := range fresh.Spec.Taints { + if t.Key != taintKeyOffboarding { + taints = append(taints, t) + } + } + fresh.Spec.Taints = taints + Expect(client.IgnoreNotFound(k8sClient.Patch(ctx, fresh, client.MergeFrom(base)))).To(Succeed()) + } + }) + }) + + createPod := func(ctx SpecContext, name, namespace string, phase corev1.PodPhase, tolerations ...corev1.Toleration) *corev1.Pod { + pod := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: namespace, + }, + Spec: corev1.PodSpec{ + NodeName: resource.Name, + Containers: []corev1.Container{ + {Name: "main", Image: "registry.example.com/whatever:latest"}, + }, + Tolerations: tolerations, + }, + Status: corev1.PodStatus{Phase: phase}, + } + Expect(k8sClient.Create(ctx, pod)).To(Succeed()) + Expect(k8sClient.Get(ctx, types.NamespacedName{Name: name, Namespace: namespace}, pod)).To(Succeed()) + pod.Status.Phase = phase + Expect(k8sClient.Status().Update(ctx, pod)).To(Succeed()) + DeferCleanup(func(ctx SpecContext) { + Expect(client.IgnoreNotFound(k8sClient.Delete(ctx, pod))).To(Succeed()) + }) + return pod + } + + When("only pods that tolerate the offboarding taint are running", func() { + BeforeEach(func(ctx SpecContext) { + createPod(ctx, "tolerator", agentNamespace, corev1.PodRunning, corev1.Toleration{ + Key: taintKeyOffboarding, + Operator: corev1.TolerationOpExists, + Effect: corev1.TaintEffectNoExecute, + }) + // Phase=Succeeded must not count regardless of tolerations. + createPod(ctx, "old-job", agentNamespace, corev1.PodSucceeded) + }) + + It("should set AgentPodsEvicted=True without requeue", func(ctx SpecContext) { + result, err := hypervisorController.Reconcile(ctx, ctrl.Request{ + NamespacedName: types.NamespacedName{Name: resource.Name}, + }) + Expect(err).NotTo(HaveOccurred()) + Expect(result.RequeueAfter).To(BeZero()) + + hypervisor := &kvmv1.Hypervisor{} + Expect(k8sClient.Get(ctx, hypervisorName, hypervisor)).To(Succeed()) + Expect(hypervisor.Status.Conditions).To(ContainElement( + SatisfyAll( + HaveField("Type", kvmv1.ConditionTypeAgentPodsEvicted), + HaveField("Status", metav1.ConditionTrue), + HaveField("Reason", "NoAgentPods"), + ), + )) + }) + }) + + When("an agent pod is running on the node and VM eviction is done", func() { + BeforeEach(func(ctx SpecContext) { + createPod(ctx, "nova-compute-xyz", agentNamespace, corev1.PodRunning) + }) + + It("should set AgentPodsEvicted=False with reason AgentPodsRunning and requeue", func(ctx SpecContext) { + result, err := hypervisorController.Reconcile(ctx, ctrl.Request{ + NamespacedName: types.NamespacedName{Name: resource.Name}, + }) + Expect(err).NotTo(HaveOccurred()) + Expect(result.RequeueAfter).To(Equal(defaultPollTime)) + + hypervisor := &kvmv1.Hypervisor{} + Expect(k8sClient.Get(ctx, hypervisorName, hypervisor)).To(Succeed()) + Expect(hypervisor.Status.Conditions).To(ContainElement( + SatisfyAll( + HaveField("Type", kvmv1.ConditionTypeAgentPodsEvicted), + HaveField("Status", metav1.ConditionFalse), + HaveField("Reason", "AgentPodsRunning"), + HaveField("Message", ContainSubstring("nova-compute-xyz")), + ), + )) + }) + + It("should also flush InternalIP and Terminating in the same reconcile", func(ctx SpecContext) { + // Stage a fresh InternalIP and a node-level Terminating condition + // that the same reconcile pass would normally pick up. The + // AgentPodsEvicted=False branch must not skip persisting these + // fields; otherwise the Hypervisor status remains stale until a + // later reconcile. + node := &corev1.Node{} + Expect(k8sClient.Get(ctx, types.NamespacedName{Name: resource.Name}, node)).To(Succeed()) + base := node.DeepCopy() + node.Status.Addresses = []corev1.NodeAddress{ + {Type: corev1.NodeInternalIP, Address: "192.168.42.7"}, + } + node.Status.Conditions = append(node.Status.Conditions, corev1.NodeCondition{ + Type: "Terminating", + Status: corev1.ConditionTrue, + Reason: terminatingReason, + Message: "Node is terminating", + }) + Expect(k8sClient.Status().Patch(ctx, node, client.MergeFrom(base))).To(Succeed()) + + result, err := hypervisorController.Reconcile(ctx, ctrl.Request{ + NamespacedName: types.NamespacedName{Name: resource.Name}, + }) + Expect(err).NotTo(HaveOccurred()) + Expect(result.RequeueAfter).To(Equal(defaultPollTime)) + + hypervisor := &kvmv1.Hypervisor{} + Expect(k8sClient.Get(ctx, hypervisorName, hypervisor)).To(Succeed()) + + By("persisting AgentPodsEvicted=False") + Expect(hypervisor.Status.Conditions).To(ContainElement( + SatisfyAll( + HaveField("Type", kvmv1.ConditionTypeAgentPodsEvicted), + HaveField("Status", metav1.ConditionFalse), + HaveField("Reason", "AgentPodsRunning"), + ), + )) + + By("persisting the freshly observed InternalIP") + Expect(hypervisor.Status.InternalIP).To(Equal("192.168.42.7")) + + By("persisting the propagated Terminating condition") + Expect(hypervisor.Status.Conditions).To(ContainElement( + SatisfyAll( + HaveField("Type", kvmv1.ConditionTypeTerminating), + HaveField("Status", metav1.ConditionTrue), + HaveField("Reason", terminatingReason), + ), + )) + }) + + It("should still propagate node label changes to the Hypervisor", func(ctx SpecContext) { + // Stage a node label that the reconcile pass would normally + // transport to the Hypervisor metadata via + // syncLabelsAndAnnotations. The AgentPodsEvicted=False branch + // must not skip this propagation; otherwise the Hypervisor + // labels remain stale during termination. + // + // (The Hypervisor spec is immutable while + // maintenance=='termination', so only metadata.Labels — not + // spec-derived fields like aggregates/customTraits — can + // legitimately change in this state.) + node := &corev1.Node{} + Expect(k8sClient.Get(ctx, types.NamespacedName{Name: resource.Name}, node)).To(Succeed()) + base := node.DeepCopy() + node.Labels[workerGroupLabel] = workerGroupValue + Expect(k8sClient.Patch(ctx, node, client.MergeFrom(base))).To(Succeed()) + + result, err := hypervisorController.Reconcile(ctx, ctrl.Request{ + NamespacedName: types.NamespacedName{Name: resource.Name}, + }) + Expect(err).NotTo(HaveOccurred()) + Expect(result.RequeueAfter).To(Equal(defaultPollTime), + "AgentPodsEvicted=False must still drive a periodic requeue") + + hypervisor := &kvmv1.Hypervisor{} + Expect(k8sClient.Get(ctx, hypervisorName, hypervisor)).To(Succeed()) + + By("transporting the node label to the Hypervisor") + Expect(hypervisor.Labels).To(HaveKeyWithValue(workerGroupLabel, workerGroupValue)) + }) + }) + + When("the only non-tolerating pod is already being deleted", func() { + // A finalizer keeps the API object around with DeletionTimestamp + // set, simulating a pod whose containers are shutting down but + // whose deletion is blocked on some unrelated finalizer. + const finalizer = "test.kvm.cloud.sap/keep-alive" + + BeforeEach(func(ctx SpecContext) { + pod := createPod(ctx, "nova-compute-deleting", agentNamespace, corev1.PodRunning) + pod.Finalizers = []string{finalizer} + Expect(k8sClient.Update(ctx, pod)).To(Succeed()) + Expect(k8sClient.Delete(ctx, pod)).To(Succeed()) + + DeferCleanup(func(ctx SpecContext) { + fresh := &corev1.Pod{} + if err := k8sClient.Get(ctx, types.NamespacedName{Name: pod.Name, Namespace: pod.Namespace}, fresh); err == nil { + fresh.Finalizers = nil + Expect(client.IgnoreNotFound(k8sClient.Update(ctx, fresh))).To(Succeed()) + } + }) + + // Sanity: the pod must still exist with DeletionTimestamp. + fresh := &corev1.Pod{} + Expect(k8sClient.Get(ctx, types.NamespacedName{Name: pod.Name, Namespace: pod.Namespace}, fresh)).To(Succeed()) + Expect(fresh.DeletionTimestamp).NotTo(BeNil()) + }) + + It("should set AgentPodsEvicted=False (deletion-pending pod still counts as running)", func(ctx SpecContext) { + result, err := hypervisorController.Reconcile(ctx, ctrl.Request{ + NamespacedName: types.NamespacedName{Name: resource.Name}, + }) + Expect(err).NotTo(HaveOccurred()) + Expect(result.RequeueAfter).To(Equal(defaultPollTime)) + + hypervisor := &kvmv1.Hypervisor{} + Expect(k8sClient.Get(ctx, hypervisorName, hypervisor)).To(Succeed()) + Expect(hypervisor.Status.Conditions).To(ContainElement( + SatisfyAll( + HaveField("Type", kvmv1.ConditionTypeAgentPodsEvicted), + HaveField("Status", metav1.ConditionFalse), + HaveField("Reason", "AgentPodsRunning"), + HaveField("Message", ContainSubstring("nova-compute-deleting")), + ), + )) + }) + }) + }) +}) + +var _ = Describe("computeAgentPodsEvictedCondition field selector", func() { + // This test verifies that the pod list issued by computeAgentPodsEvictedCondition + // uses the spec.nodeName field selector so that only pods on the target node + // are returned, not all pods in the cluster. + It("should only list pods scheduled on the target node", func(ctx SpecContext) { + // Use a client with DisableFor pods, mirroring production config. + uncachedClient, err := client.New(cfg, client.Options{ + Scheme: k8sscheme.Scheme, + Cache: &client.CacheOptions{ + DisableFor: []client.Object{&corev1.Pod{}}, + }, + }) + Expect(err).NotTo(HaveOccurred()) + + ns := &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: "field-selector-test"}} + Expect(client.IgnoreAlreadyExists(k8sClient.Create(ctx, ns))).To(Succeed()) + DeferCleanup(func(ctx SpecContext) { + Expect(client.IgnoreNotFound(k8sClient.Delete(ctx, ns))).To(Succeed()) + }) + + // Create a pod on "target-node". + onTarget := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{Name: "on-target", Namespace: ns.Name}, + Spec: corev1.PodSpec{ + NodeName: "target-node", + Containers: []corev1.Container{{Name: "c", Image: "registry.example.com/img:latest"}}, + }, + } + Expect(k8sClient.Create(ctx, onTarget)).To(Succeed()) + DeferCleanup(func(ctx SpecContext) { + Expect(client.IgnoreNotFound(k8sClient.Delete(ctx, onTarget))).To(Succeed()) + }) + + // Create a pod on a different node — must not appear in results. + onOther := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{Name: "on-other", Namespace: ns.Name}, + Spec: corev1.PodSpec{ + NodeName: "other-node", + Containers: []corev1.Container{{Name: "c", Image: "registry.example.com/img:latest"}}, + }, + } + Expect(k8sClient.Create(ctx, onOther)).To(Succeed()) + DeferCleanup(func(ctx SpecContext) { + Expect(client.IgnoreNotFound(k8sClient.Delete(ctx, onOther))).To(Succeed()) + }) + + pods := &corev1.PodList{} + Expect(uncachedClient.List(ctx, pods, + client.MatchingFieldsSelector{ + Selector: fields.OneTermEqualSelector("spec.nodeName", "target-node"), + }, + )).To(Succeed()) + + names := make([]string, len(pods.Items)) + for i, p := range pods.Items { + names[i] = p.Name + } + Expect(names).To(ConsistOf("on-target"), + "field selector spec.nodeName must filter server-side; got pods: %v", names) + }) }) diff --git a/internal/controller/offboarding_controller.go b/internal/controller/offboarding_controller.go index d838471b..39ee92b5 100644 --- a/internal/controller/offboarding_controller.go +++ b/internal/controller/offboarding_controller.go @@ -93,6 +93,18 @@ func (r *HypervisorOffboardingReconciler) Reconcile(ctx context.Context, req ctr return ctrl.Result{}, nil } + // A still-running nova-compute agent would re-register the service we + // are about to delete, undoing the offboarding. This gate must be checked + // before any external OpenStack call so that ErrNoHypervisor cannot mark + // offboarding complete while agent pods are still running on the node. + if !meta.IsStatusConditionTrue(hv.Status.Conditions, kvmv1.ConditionTypeAgentPodsEvicted) { + msg := "Waiting for agent pods (nova-compute, neutron) to be evicted from node" + if cond := meta.FindStatusCondition(hv.Status.Conditions, kvmv1.ConditionTypeAgentPodsEvicted); cond != nil && cond.Message != "" { + msg = "Waiting for agent pods to be evicted: " + cond.Message + } + return r.setOffboardingCondition(ctx, hv, msg) + } + hypervisor, err := openstack.GetHypervisorByName(ctx, r.computeClient, hostname, true) if err != nil { if errors.Is(err, openstack.ErrNoHypervisor) { @@ -127,7 +139,6 @@ func (r *HypervisorOffboardingReconciler) Reconcile(ctx context.Context, req ctr return r.setOffboardingCondition(ctx, hv, msg) } - // Deleting and evicted, so better delete the service err = services.Delete(ctx, r.computeClient, hypervisor.Service.ID).ExtractErr() if err != nil && !gophercloud.ResponseCodeIs(err, http.StatusNotFound) { msg := fmt.Sprintf("cannot delete service %s due to %v", hypervisor.Service.ID, err) diff --git a/internal/controller/offboarding_controller_test.go b/internal/controller/offboarding_controller_test.go index 06c38c45..cf2cc1eb 100644 --- a/internal/controller/offboarding_controller_test.go +++ b/internal/controller/offboarding_controller_test.go @@ -141,6 +141,13 @@ var _ = Describe("Offboarding Controller", func() { Reason: "dontcare", Message: "dontcare", }) + // HypervisorController-side signal that agent pods are gone. + meta.SetStatusCondition(&hv.Status.Conditions, metav1.Condition{ + Type: kvmv1.ConditionTypeAgentPodsEvicted, + Status: metav1.ConditionTrue, + Reason: "NoAgentPods", + Message: "No agent pods are running on this node", + }) Expect(k8sClient.Status().Update(ctx, hv)).To(Succeed()) By("Mocking OpenStack API endpoints") @@ -312,6 +319,14 @@ var _ = Describe("Offboarding Controller", func() { Reason: "Offboarding", Message: "Hypervisor is being offboarded, removing host from nova", }) + // Bypass the pod-eviction gate; it is exercised by its own + // case below. + meta.SetStatusCondition(&hypervisor.Status.Conditions, metav1.Condition{ + Type: kvmv1.ConditionTypeAgentPodsEvicted, + Status: metav1.ConditionTrue, + Reason: "NoAgentPods", + Message: "No agent pods are running on this node", + }) Expect(k8sClient.Status().Update(ctx, hypervisor)).To(Succeed()) }) @@ -400,6 +415,52 @@ var _ = Describe("Offboarding Controller", func() { }) }) + Context("When agent pods are still running on the node", func() { + var getHypervisorsCalled int + + BeforeEach(func(ctx SpecContext) { + getHypervisorsCalled = 0 + fakeServer.Mux.HandleFunc("GET /os-hypervisors/detail", func(w http.ResponseWriter, r *http.Request) { + getHypervisorsCalled++ + w.Header().Add("Content-Type", "application/json") + w.WriteHeader(http.StatusOK) + fmt.Fprintf(w, `{ + "hypervisors": [{ + "id": "c48f6247-abe4-4a24-824e-ea39e108874f", + "hypervisor_hostname": "node-test", + "hypervisor_version": 2002000, + "state": "up", + "status": "enabled", + "running_vms": 0, + "service": { + "id": "service-1234", + "host": "node-test" + } + }] + }`) + }) + + By("Overriding AgentPodsEvicted to False") + hypervisor := &kvmv1.Hypervisor{} + Expect(k8sClient.Get(ctx, resourceName, hypervisor)).To(Succeed()) + meta.SetStatusCondition(&hypervisor.Status.Conditions, metav1.Condition{ + Type: kvmv1.ConditionTypeAgentPodsEvicted, + Status: metav1.ConditionFalse, + Reason: "AgentPodsRunning", + Message: "1 agent pod(s) still running on node: monsoon3/nova-compute-xyz", + }) + Expect(k8sClient.Status().Update(ctx, hypervisor)).To(Succeed()) + }) + + It("should wait for the agent pods to be evicted and not delete the compute service", func(ctx SpecContext) { + _, err := offboardingReconciler.Reconcile(ctx, reconcileReq) + Expect(err).NotTo(HaveOccurred()) + sharedOffboardingErrorCheck(ctx, "Waiting for agent pods to be evicted") + Expect(getHypervisorsCalled).To(Equal(0), + "external hypervisor lookup must be short-circuited by the AgentPodsEvicted gate") + }) + }) + Context("When deleting service fails", func() { BeforeEach(func() { fakeServer.Mux.HandleFunc("GET /os-hypervisors/detail", func(w http.ResponseWriter, r *http.Request) { diff --git a/internal/global/global.go b/internal/global/global.go index 270e1e0b..3a687b62 100644 --- a/internal/global/global.go +++ b/internal/global/global.go @@ -20,4 +20,9 @@ package global var ( // LabelSelector is a custom label that is used to select resources managed by the operator. LabelSelector = "" + + // AgentNamespaces is the list of namespaces in which agent pods (nova-compute, + // neutron) are scheduled. The pod list during offboarding is restricted to + // these namespaces. Must be non-empty; set via --agent-namespaces. + AgentNamespaces []string )