diff --git a/api/v1/hypervisor_types.go b/api/v1/hypervisor_types.go index 34f73b76..9a78b4d0 100644 --- a/api/v1/hypervisor_types.go +++ b/api/v1/hypervisor_types.go @@ -71,6 +71,11 @@ const ( // ConditionTypeAggregatesUpdated is the type of condition for aggregates updated status of a hypervisor ConditionTypeAggregatesUpdated = "AggregatesUpdated" + + // ConditionTypeAgentPodsEvicted gates compute-service deletion during + // offboarding: a running nova-compute pod would otherwise re-register + // the service we are about to delete. + ConditionTypeAgentPodsEvicted = "AgentPodsEvicted" ) // Condition Reasons diff --git a/charts/openstack-hypervisor-operator/templates/deployment.yaml b/charts/openstack-hypervisor-operator/templates/deployment.yaml index b37cfca1..f82077cb 100644 --- a/charts/openstack-hypervisor-operator/templates/deployment.yaml +++ b/charts/openstack-hypervisor-operator/templates/deployment.yaml @@ -45,6 +45,8 @@ spec: value: {{ quote .Values.controllerManager.manager.env.certificateIssuerName }} - name: LABEL_SELECTOR value: {{ quote .Values.controllerManager.manager.env.labelSelector }} + - name: AGENT_NAMESPACES + value: {{ quote .Values.controllerManager.manager.env.agentNamespaces }} - name: KUBERNETES_CLUSTER_DOMAIN value: {{ quote .Values.kubernetesClusterDomain }} image: {{ .Values.controllerManager.manager.image.repository }}:{{ .Chart.AppVersion }} diff --git a/charts/openstack-hypervisor-operator/templates/role.yaml b/charts/openstack-hypervisor-operator/templates/role.yaml index edd53191..3b1ecda8 100644 --- a/charts/openstack-hypervisor-operator/templates/role.yaml +++ b/charts/openstack-hypervisor-operator/templates/role.yaml @@ -20,6 +20,12 @@ rules: - nodes/status verbs: - get +- apiGroups: + - "" + resources: + - pods + verbs: + - list - apiGroups: - apps resources: diff --git a/charts/openstack-hypervisor-operator/values.yaml b/charts/openstack-hypervisor-operator/values.yaml index a64814aa..cc4f686c 100644 --- a/charts/openstack-hypervisor-operator/values.yaml +++ b/charts/openstack-hypervisor-operator/values.yaml @@ -7,6 +7,7 @@ controllerManager: - --certificate-namespace=$(CERTIFICATE_NAMESPACE) - --certificate-issuer-name=$(CERTIFICATE_ISSUER_NAME) - --label-selector=$(LABEL_SELECTOR) + - --agent-namespaces=$(AGENT_NAMESPACES) containerSecurityContext: allowPrivilegeEscalation: false capabilities: @@ -16,6 +17,7 @@ controllerManager: certificateIssuerName: nova-hypervisor-agents-ca-issuer certificateNamespace: monsoon3 labelSelector: "" + agentNamespaces: "monsoon3" osAuthUrl: "" osProjectDomainName: "" osProjectName: "" diff --git a/cmd/main.go b/cmd/main.go index 6efef160..8bf875f7 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -25,6 +25,7 @@ import ( "fmt" "os" gruntime "runtime" + "strings" "github.com/sapcc/go-api-declarations/bininfo" // Import all Kubernetes client auth plugins (e.g. Azure, GCP, OIDC, etc.) @@ -97,6 +98,10 @@ func main() { "If set, HTTP/2 will be enabled for the metrics and webhook servers") flag.StringVar(&global.LabelSelector, "label-selector", "", "Label selector to filter watched resources (namely nodes).") + var agentNamespacesFlag string + flag.StringVar(&agentNamespacesFlag, "agent-namespaces", "", + "Comma-separated list of namespaces to search for agent pods (nova-compute, neutron) during offboarding.") + flag.StringVar(&certificateNamespace, "certificate-namespace", "monsoon3", "The namespace for the certificates. ") flag.StringVar(&certificateIssuerName, "certificate-issuer-name", "nova-hypervisor-agents-ca-issuer", "Name of the certificate issuer.") @@ -111,6 +116,8 @@ func main() { opts.BindFlags(flag.CommandLine) flag.Parse() + // Handle --version before validating required runtime flags so that + // `./manager --version` works without providing --agent-namespaces. if version { fmt.Printf("%s %s (%s/%s) %s\n", bininfo.Component(), bininfo.VersionOr("devel"), gruntime.GOOS, gruntime.GOARCH, @@ -118,6 +125,29 @@ func main() { os.Exit(0) } + if agentNamespacesFlag != "" { + // Deduplicate. Without this, repeated entries (e.g. from templated + // values) would cause redundant pod-list calls and inflate the + // AgentPodsEvicted condition message with duplicate pod names. + seen := map[string]struct{}{} + for ns := range strings.SplitSeq(agentNamespacesFlag, ",") { + ns = strings.TrimSpace(ns) + if ns == "" { + continue + } + if _, ok := seen[ns]; ok { + continue + } + seen[ns] = struct{}{} + global.AgentNamespaces = append(global.AgentNamespaces, ns) + } + } + + if len(global.AgentNamespaces) == 0 { + setupLog.Error(errors.New("--agent-namespaces is required"), "invalid configuration") + os.Exit(1) + } + if certificateIssuerName == "" { setupLog.Error(errors.New("certificate-issuer-name cannot be empty"), "invalid certificate issuer name") os.Exit(1) @@ -226,6 +256,13 @@ func main() { // Optionally configure the cache to listen/watch for specific labeled resources only Cache: cacheOptions, + // Pods are listed directly (not cached) to avoid a cluster-wide pod + // informer — a single large namespace would cause an OOM on startup. + Client: client.Options{ + Cache: &client.CacheOptions{ + DisableFor: []client.Object{&corev1.Pod{}}, + }, + }, }) if err != nil { diff --git a/go.mod b/go.mod index 4d7fc577..5ddfce0a 100644 --- a/go.mod +++ b/go.mod @@ -43,7 +43,7 @@ require ( github.com/felixge/httpsnoop v1.0.4 // indirect github.com/fsnotify/fsnotify v1.9.0 // indirect github.com/fxamacker/cbor/v2 v2.9.0 // indirect - github.com/go-logr/logr v1.4.3 // indirect + github.com/go-logr/logr v1.4.3 github.com/go-logr/stdr v1.2.2 // indirect github.com/go-logr/zapr v1.3.0 // indirect github.com/go-openapi/jsonpointer v0.22.4 // indirect diff --git a/internal/controller/constants.go b/internal/controller/constants.go index f4119dc6..3617d02e 100644 --- a/internal/controller/constants.go +++ b/internal/controller/constants.go @@ -21,4 +21,9 @@ package controller const ( labelHypervisor = "nova.openstack.cloud.sap/virt-driver" testAggregateName = "tenant_filter_tests" + + // taintKeyOffboarding is used as a NoExecute taint. nova-compute and + // neutron agent pods do not tolerate it (the kvm-node-agent and the + // signalling pod do), so applying it forces those agents off the node. + taintKeyOffboarding = "kvm.cloud.sap/offboarding" ) diff --git a/internal/controller/gardener_node_lifecycle_controller.go b/internal/controller/gardener_node_lifecycle_controller.go index 9966651c..1af657fe 100644 --- a/internal/controller/gardener_node_lifecycle_controller.go +++ b/internal/controller/gardener_node_lifecycle_controller.go @@ -33,9 +33,13 @@ import ( v1 "k8s.io/client-go/applyconfigurations/meta/v1" policyv1ac "k8s.io/client-go/applyconfigurations/policy/v1" ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/builder" k8sclient "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/apiutil" + "sigs.k8s.io/controller-runtime/pkg/event" + "sigs.k8s.io/controller-runtime/pkg/handler" logger "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/predicate" kvmv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1" ) @@ -85,6 +89,19 @@ func (r *GardenerNodeLifecycleController) Reconcile(ctx context.Context, req ctr return ctrl.Result{}, nil } + // Apply the offboarding taint once VMs are gone; gate on Evicting=False + // to avoid racing with live-migration. + if hv.Spec.Maintenance == kvmv1.MaintenanceTermination && + meta.IsStatusConditionFalse(hv.Status.Conditions, kvmv1.ConditionTypeEvicting) { + patched, err := r.ensureOffboardingTaint(ctx, node) + if err != nil { + return ctrl.Result{}, fmt.Errorf("failed to ensure offboarding taint: %w", err) + } + if patched { + return ctrl.Result{}, nil + } + } + // We do not care about the particular value, as long as it isn't an error var minAvailable int32 = 1 @@ -115,6 +132,30 @@ func (r *GardenerNodeLifecycleController) Reconcile(ctx context.Context, req ctr return ctrl.Result{}, nil } +// ensureOffboardingTaint adds the offboarding NoExecute taint if not already +// present. Returns true when a patch was issued (caller should return early). +func (r *GardenerNodeLifecycleController) ensureOffboardingTaint(ctx context.Context, node *corev1.Node) (bool, error) { + for _, t := range node.Spec.Taints { + if t.Key == taintKeyOffboarding && t.Effect == corev1.TaintEffectNoExecute { + return false, nil + } + } + + log := logger.FromContext(ctx) + log.Info("Adding offboarding taint to node", + "node", node.Name, + "taint", taintKeyOffboarding, + "effect", corev1.TaintEffectNoExecute) + + // StrategicMergeFrom merges taints by key, preserving concurrent additions. + patch := k8sclient.StrategicMergeFrom(node.DeepCopy()) + node.Spec.Taints = append(node.Spec.Taints, corev1.Taint{ + Key: taintKeyOffboarding, + Effect: corev1.TaintEffectNoExecute, + }) + return true, r.Patch(ctx, node, patch, k8sclient.FieldOwner(MaintenanceControllerName)) +} + func (r *GardenerNodeLifecycleController) ensureBlockingPodDisruptionBudget(ctx context.Context, node *corev1.Node, minAvailable int32) error { name := nameForNode(node) nodeLabels := labelsForNode(node) @@ -226,10 +267,48 @@ func (r *GardenerNodeLifecycleController) SetupWithManager(mgr ctrl.Manager, nam _ = logger.FromContext(ctx) r.namespace = namespace + hypervisorToNode := handler.EnqueueRequestForOwner(mgr.GetScheme(), mgr.GetRESTMapper(), &corev1.Node{}) + + // Maintenance=termination bumps generation; Evicting status changes do not. + hypervisorRelevantChange := predicate.Or( + predicate.GenerationChangedPredicate{}, + evictingConditionChangedPredicate{}, + ) + return ctrl.NewControllerManagedBy(mgr). Named(MaintenanceControllerName). For(&corev1.Node{}). - Owns(&appsv1.Deployment{}). // trigger the r.Reconcile whenever an Own-ed deployment is created/updated/deleted + Watches(&kvmv1.Hypervisor{}, hypervisorToNode, + builder.WithPredicates(hypervisorRelevantChange), + ). + Owns(&appsv1.Deployment{}). Owns(&policyv1.PodDisruptionBudget{}). Complete(r) } + +// evictingConditionChangedPredicate complements GenerationChangedPredicate, +// which ignores status-only updates. +type evictingConditionChangedPredicate struct { + predicate.Funcs +} + +func (evictingConditionChangedPredicate) Update(e event.UpdateEvent) bool { + if e.ObjectOld == nil || e.ObjectNew == nil { + return false + } + oldHv, ok1 := e.ObjectOld.(*kvmv1.Hypervisor) + newHv, ok2 := e.ObjectNew.(*kvmv1.Hypervisor) + if !ok1 || !ok2 { + return false + } + oldCond := meta.FindStatusCondition(oldHv.Status.Conditions, kvmv1.ConditionTypeEvicting) + newCond := meta.FindStatusCondition(newHv.Status.Conditions, kvmv1.ConditionTypeEvicting) + switch { + case oldCond == nil && newCond == nil: + return false + case oldCond == nil || newCond == nil: + return true + default: + return oldCond.Status != newCond.Status + } +} diff --git a/internal/controller/gardener_node_lifecycle_controller_test.go b/internal/controller/gardener_node_lifecycle_controller_test.go index 205b51fa..ace31b80 100644 --- a/internal/controller/gardener_node_lifecycle_controller_test.go +++ b/internal/controller/gardener_node_lifecycle_controller_test.go @@ -207,4 +207,63 @@ var _ = Describe("Gardener Maintenance Controller", func() { Expect(pdb.Spec.MinAvailable).To(HaveField("IntVal", BeNumerically("==", int32(0)))) }) }) + + Context("Offboarding taint", func() { + findOffboardingTaint := func(node *corev1.Node) *corev1.Taint { + for i := range node.Spec.Taints { + t := &node.Spec.Taints[i] + if t.Key == taintKeyOffboarding && t.Effect == corev1.TaintEffectNoExecute { + return t + } + } + return nil + } + + When("the hypervisor is in maintenance termination and the VMs have been evicted", func() { + BeforeEach(func(ctx SpecContext) { + hypervisor := &kvmv1.Hypervisor{} + Expect(k8sClient.Get(ctx, name, hypervisor)).To(Succeed()) + hypervisor.Spec.Maintenance = kvmv1.MaintenanceTermination + Expect(k8sClient.Update(ctx, hypervisor)).To(Succeed()) + + meta.SetStatusCondition(&hypervisor.Status.Conditions, metav1.Condition{ + Type: kvmv1.ConditionTypeEvicting, + Status: metav1.ConditionFalse, + Reason: "Succeeded", + Message: "All VMs evicted", + }) + Expect(k8sClient.Status().Update(ctx, hypervisor)).To(Succeed()) + }) + + It("should add the offboarding NoExecute taint to the node", func(ctx SpecContext) { + _, err := controller.Reconcile(ctx, reconcileReq) + Expect(err).NotTo(HaveOccurred()) + + node := &corev1.Node{} + Expect(k8sClient.Get(ctx, name, node)).To(Succeed()) + taint := findOffboardingTaint(node) + Expect(taint).NotTo(BeNil()) + Expect(taint.Key).To(Equal(taintKeyOffboarding)) + Expect(taint.Effect).To(Equal(corev1.TaintEffectNoExecute)) + }) + + It("should be idempotent: a second reconcile must not add a duplicate taint", func(ctx SpecContext) { + _, err := controller.Reconcile(ctx, reconcileReq) + Expect(err).NotTo(HaveOccurred()) + _, err = controller.Reconcile(ctx, reconcileReq) + Expect(err).NotTo(HaveOccurred()) + + node := &corev1.Node{} + Expect(k8sClient.Get(ctx, name, node)).To(Succeed()) + + count := 0 + for _, t := range node.Spec.Taints { + if t.Key == taintKeyOffboarding { + count++ + } + } + Expect(count).To(Equal(1)) + }) + }) + }) }) diff --git a/internal/controller/hypervisor_controller.go b/internal/controller/hypervisor_controller.go index 138aa0b0..0ee57305 100644 --- a/internal/controller/hypervisor_controller.go +++ b/internal/controller/hypervisor_controller.go @@ -21,13 +21,17 @@ import ( "context" "fmt" "slices" + "sort" "strings" + "time" + "github.com/go-logr/logr" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/equality" "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/api/meta" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/fields" "k8s.io/apimachinery/pkg/labels" "k8s.io/apimachinery/pkg/runtime" ctrl "sigs.k8s.io/controller-runtime" @@ -69,6 +73,7 @@ type HypervisorController struct { // +kubebuilder:rbac:groups="",resources=nodes,verbs=get;list;watch // +kubebuilder:rbac:groups="",resources=nodes/status,verbs=get +// +kubebuilder:rbac:groups="",resources=pods,verbs=list // +kubebuilder:rbac:groups=kvm.cloud.sap,resources=hypervisors,verbs=get;list;watch;create;update;patch;delete // +kubebuilder:rbac:groups=kvm.cloud.sap,resources=hypervisors/status,verbs=get;update;patch @@ -100,8 +105,22 @@ func (hv *HypervisorController) Reconcile(ctx context.Context, req ctrl.Request) } // continue with creation } else { - // update Status if needed - base := hypervisor.DeepCopy() + // First, propagate spec/metadata derived from the Node (labels, + // annotations -> aggregates/traits, lifecycle). This must run on + // every reconcile, including those where status will also change + // (e.g. AgentPodsEvicted=False during termination); otherwise the + // Hypervisor spec/labels go stale. + specBase := hypervisor.DeepCopy() + syncLabelsAndAnnotations(nodeLabels, hypervisor, node) + if !equality.Semantic.DeepEqual(hypervisor, specBase) { + if err := hv.Patch(ctx, hypervisor, k8sclient.MergeFromWithOptions(specBase, + k8sclient.MergeFromWithOptimisticLock{}), k8sclient.FieldOwner(HypervisorControllerName)); err != nil { + return ctrl.Result{}, err + } + } + + // Then, compute and persist any status changes derived from the Node. + statusBase := hypervisor.DeepCopy() // transfer internal IP for _, address := range node.Status.Addresses { @@ -123,26 +142,43 @@ func (hv *HypervisorController) Reconcile(ctx context.Context, req ctrl.Request) }) } - if !equality.Semantic.DeepEqual(hypervisor, base) { + // Only evaluate after VM eviction; a spurious True on a fresh node + // (agents not yet scheduled) would be misleading. + var statusRequeueAfter time.Duration + if hypervisor.Spec.Maintenance == kvmv1.MaintenanceTermination && + meta.IsStatusConditionFalse(hypervisor.Status.Conditions, kvmv1.ConditionTypeEvicting) && + nodeHasOffboardingTaint(node) { + cond, err := hv.computeAgentPodsEvictedCondition(ctx, log, node.Name) + if err != nil { + return ctrl.Result{}, fmt.Errorf("failed to compute %s condition: %w", kvmv1.ConditionTypeAgentPodsEvicted, err) + } + meta.SetStatusCondition(&hypervisor.Status.Conditions, cond) + if cond.Status == metav1.ConditionFalse { + // No pod watch — rely on periodic requeue. + statusRequeueAfter = defaultPollTime + } + } + + if !equality.Semantic.DeepEqual(hypervisor, statusBase) { // Capture values to apply - only mutate fields this controller owns newInternalIP := hypervisor.Status.InternalIP terminatingCondition := meta.FindStatusCondition(hypervisor.Status.Conditions, kvmv1.ConditionTypeTerminating) + agentPodsCondition := meta.FindStatusCondition(hypervisor.Status.Conditions, kvmv1.ConditionTypeAgentPodsEvicted) - return ctrl.Result{}, utils.PatchHypervisorStatusWithRetry(ctx, hv.Client, hypervisor.Name, HypervisorControllerName, func(h *kvmv1.Hypervisor) { + if err := utils.PatchHypervisorStatusWithRetry(ctx, hv.Client, hypervisor.Name, HypervisorControllerName, func(h *kvmv1.Hypervisor) { h.Status.InternalIP = newInternalIP if terminatingCondition != nil { meta.SetStatusCondition(&h.Status.Conditions, *terminatingCondition) } - }) - } - - syncLabelsAndAnnotations(nodeLabels, hypervisor, node) - if equality.Semantic.DeepEqual(hypervisor, base) { - return ctrl.Result{}, nil + if agentPodsCondition != nil { + meta.SetStatusCondition(&h.Status.Conditions, *agentPodsCondition) + } + }); err != nil { + return ctrl.Result{}, err + } } - return ctrl.Result{}, hv.Patch(ctx, hypervisor, k8sclient.MergeFromWithOptions(base, - k8sclient.MergeFromWithOptimisticLock{}), k8sclient.FieldOwner(HypervisorControllerName)) + return ctrl.Result{RequeueAfter: statusRequeueAfter}, nil } syncLabelsAndAnnotations(nodeLabels, hypervisor, node) @@ -262,3 +298,89 @@ func transportLabels(source, destination *metav1.ObjectMeta) { } } } + +// nodeHasOffboardingTaint reports whether the offboarding NoExecute taint has +// been applied to the node. The pod list is only meaningful after that point — +// before it, no agents have been evicted yet. +func nodeHasOffboardingTaint(node *corev1.Node) bool { + for _, t := range node.Spec.Taints { + if t.Key == taintKeyOffboarding && t.Effect == corev1.TaintEffectNoExecute { + return true + } + } + return false +} + +// computeAgentPodsEvictedCondition checks whether pods that would be evicted +// by the offboarding taint are still running. Only call once Evicting=False. +func (hv *HypervisorController) computeAgentPodsEvictedCondition(ctx context.Context, log logr.Logger, nodeName string) (metav1.Condition, error) { + offboardingTaint := corev1.Taint{ + Key: taintKeyOffboarding, + Effect: corev1.TaintEffectNoExecute, + } + + var agentPods []string + for _, ns := range global.AgentNamespaces { + var continueToken string + for { + pods := &corev1.PodList{} + if err := hv.List(ctx, pods, + k8sclient.InNamespace(ns), + k8sclient.MatchingFieldsSelector{ + Selector: fields.OneTermEqualSelector("spec.nodeName", nodeName), + }, + &k8sclient.ListOptions{Limit: 100, Continue: continueToken}, + ); err != nil { + return metav1.Condition{}, fmt.Errorf("failed to list pods on node %s in namespace %q: %w", nodeName, ns, err) + } + + for _, pod := range pods.Items { + if pod.Status.Phase == corev1.PodSucceeded || pod.Status.Phase == corev1.PodFailed { + continue + } + if podToleratesTaint(log, &pod, &offboardingTaint) { + continue + } + agentPods = append(agentPods, pod.Namespace+"/"+pod.Name) + } + + if pods.Continue == "" { + break + } + continueToken = pods.Continue + } + } + + if len(agentPods) == 0 { + return metav1.Condition{ + Type: kvmv1.ConditionTypeAgentPodsEvicted, + Status: metav1.ConditionTrue, + Reason: "NoAgentPods", + Message: "No agent pods are running on this node", + }, nil + } + + sort.Strings(agentPods) + return metav1.Condition{ + Type: kvmv1.ConditionTypeAgentPodsEvicted, + Status: metav1.ConditionFalse, + Reason: "AgentPodsRunning", + Message: fmt.Sprintf("%d agent pod(s) still running on node: %s", len(agentPods), strings.Join(agentPods, ", ")), + }, nil +} + +// podToleratesTaint reports whether the pod tolerates the taint indefinitely. +// Tolerations with a finite TolerationSeconds are excluded: the pod will +// eventually be evicted and must not be treated as safe to ignore. +func podToleratesTaint(log logr.Logger, pod *corev1.Pod, taint *corev1.Taint) bool { + for i := range pod.Spec.Tolerations { + t := &pod.Spec.Tolerations[i] + if t.TolerationSeconds != nil { + continue + } + if t.ToleratesTaint(log, taint, false) { + return true + } + } + return false +} diff --git a/internal/controller/hypervisor_controller_test.go b/internal/controller/hypervisor_controller_test.go index 9e1657bf..f99ffa35 100644 --- a/internal/controller/hypervisor_controller_test.go +++ b/internal/controller/hypervisor_controller_test.go @@ -18,17 +18,28 @@ limitations under the License. package controller import ( + "fmt" + "sync/atomic" + . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/meta" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/fields" "k8s.io/apimachinery/pkg/types" + k8sscheme "k8s.io/client-go/kubernetes/scheme" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" kvmv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1" + "github.com/cobaltcore-dev/openstack-hypervisor-operator/internal/global" ) +// envtest does not actually GC pods when their namespace is deleted, so +// each spec gets a fresh namespace via this counter to keep them isolated. +var agentNamespaceCounter atomic.Uint64 + var _ = Describe("Hypervisor Controller", func() { const ( resourceName = "other-node" @@ -402,4 +413,338 @@ var _ = Describe("Hypervisor Controller", func() { Expect(hypervisor.Status.InternalIP).To(Equal("192.168.1.100")) }) }) + + Context("AgentPodsEvicted condition", func() { + var agentNamespace string + + BeforeEach(func(ctx SpecContext) { + agentNamespace = fmt.Sprintf("agent-ns-%d", agentNamespaceCounter.Add(1)) + ns := &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: agentNamespace}} + Expect(client.IgnoreAlreadyExists(k8sClient.Create(ctx, ns))).To(Succeed()) + DeferCleanup(func(ctx SpecContext) { + Expect(client.IgnoreNotFound(k8sClient.Delete(ctx, ns))).To(Succeed()) + }) + + // Restrict pod listing to the test namespace. + global.AgentNamespaces = []string{agentNamespace} + DeferCleanup(func() { global.AgentNamespaces = nil }) + + // The condition is only computed during termination. + _, err := hypervisorController.Reconcile(ctx, ctrl.Request{ + NamespacedName: types.NamespacedName{Name: resource.Name}, + }) + Expect(err).NotTo(HaveOccurred()) + + hypervisor := &kvmv1.Hypervisor{} + Expect(k8sClient.Get(ctx, hypervisorName, hypervisor)).To(Succeed()) + hypervisor.Spec.Maintenance = kvmv1.MaintenanceTermination + hypervisor.Spec.LifecycleEnabled = true + Expect(k8sClient.Update(ctx, hypervisor)).To(Succeed()) + + // Default for these specs: VM eviction is done. Subcontexts + // that exercise the "not yet done" path override this. + Expect(k8sClient.Get(ctx, hypervisorName, hypervisor)).To(Succeed()) + meta.SetStatusCondition(&hypervisor.Status.Conditions, metav1.Condition{ + Type: kvmv1.ConditionTypeEvicting, + Status: metav1.ConditionFalse, + Reason: "Succeeded", + Message: "All VMs evicted", + }) + Expect(k8sClient.Status().Update(ctx, hypervisor)).To(Succeed()) + + // The pod list is only issued once the offboarding taint is on the node. + node := &corev1.Node{} + Expect(k8sClient.Get(ctx, types.NamespacedName{Name: resource.Name}, node)).To(Succeed()) + base := node.DeepCopy() + node.Spec.Taints = append(node.Spec.Taints, corev1.Taint{ + Key: taintKeyOffboarding, + Effect: corev1.TaintEffectNoExecute, + }) + Expect(k8sClient.Patch(ctx, node, client.MergeFrom(base))).To(Succeed()) + DeferCleanup(func(ctx SpecContext) { + fresh := &corev1.Node{} + if err := k8sClient.Get(ctx, types.NamespacedName{Name: resource.Name}, fresh); err == nil { + base := fresh.DeepCopy() + taints := fresh.Spec.Taints[:0] + for _, t := range fresh.Spec.Taints { + if t.Key != taintKeyOffboarding { + taints = append(taints, t) + } + } + fresh.Spec.Taints = taints + Expect(client.IgnoreNotFound(k8sClient.Patch(ctx, fresh, client.MergeFrom(base)))).To(Succeed()) + } + }) + }) + + createPod := func(ctx SpecContext, name, namespace string, phase corev1.PodPhase, tolerations ...corev1.Toleration) *corev1.Pod { + pod := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: namespace, + }, + Spec: corev1.PodSpec{ + NodeName: resource.Name, + Containers: []corev1.Container{ + {Name: "main", Image: "registry.example.com/whatever:latest"}, + }, + Tolerations: tolerations, + }, + Status: corev1.PodStatus{Phase: phase}, + } + Expect(k8sClient.Create(ctx, pod)).To(Succeed()) + Expect(k8sClient.Get(ctx, types.NamespacedName{Name: name, Namespace: namespace}, pod)).To(Succeed()) + pod.Status.Phase = phase + Expect(k8sClient.Status().Update(ctx, pod)).To(Succeed()) + DeferCleanup(func(ctx SpecContext) { + Expect(client.IgnoreNotFound(k8sClient.Delete(ctx, pod))).To(Succeed()) + }) + return pod + } + + When("only pods that tolerate the offboarding taint are running", func() { + BeforeEach(func(ctx SpecContext) { + createPod(ctx, "tolerator", agentNamespace, corev1.PodRunning, corev1.Toleration{ + Key: taintKeyOffboarding, + Operator: corev1.TolerationOpExists, + Effect: corev1.TaintEffectNoExecute, + }) + // Phase=Succeeded must not count regardless of tolerations. + createPod(ctx, "old-job", agentNamespace, corev1.PodSucceeded) + }) + + It("should set AgentPodsEvicted=True without requeue", func(ctx SpecContext) { + result, err := hypervisorController.Reconcile(ctx, ctrl.Request{ + NamespacedName: types.NamespacedName{Name: resource.Name}, + }) + Expect(err).NotTo(HaveOccurred()) + Expect(result.RequeueAfter).To(BeZero()) + + hypervisor := &kvmv1.Hypervisor{} + Expect(k8sClient.Get(ctx, hypervisorName, hypervisor)).To(Succeed()) + Expect(hypervisor.Status.Conditions).To(ContainElement( + SatisfyAll( + HaveField("Type", kvmv1.ConditionTypeAgentPodsEvicted), + HaveField("Status", metav1.ConditionTrue), + HaveField("Reason", "NoAgentPods"), + ), + )) + }) + }) + + When("an agent pod is running on the node and VM eviction is done", func() { + BeforeEach(func(ctx SpecContext) { + createPod(ctx, "nova-compute-xyz", agentNamespace, corev1.PodRunning) + }) + + It("should set AgentPodsEvicted=False with reason AgentPodsRunning and requeue", func(ctx SpecContext) { + result, err := hypervisorController.Reconcile(ctx, ctrl.Request{ + NamespacedName: types.NamespacedName{Name: resource.Name}, + }) + Expect(err).NotTo(HaveOccurred()) + Expect(result.RequeueAfter).To(Equal(defaultPollTime)) + + hypervisor := &kvmv1.Hypervisor{} + Expect(k8sClient.Get(ctx, hypervisorName, hypervisor)).To(Succeed()) + Expect(hypervisor.Status.Conditions).To(ContainElement( + SatisfyAll( + HaveField("Type", kvmv1.ConditionTypeAgentPodsEvicted), + HaveField("Status", metav1.ConditionFalse), + HaveField("Reason", "AgentPodsRunning"), + HaveField("Message", ContainSubstring("nova-compute-xyz")), + ), + )) + }) + + It("should also flush InternalIP and Terminating in the same reconcile", func(ctx SpecContext) { + // Stage a fresh InternalIP and a node-level Terminating condition + // that the same reconcile pass would normally pick up. The + // AgentPodsEvicted=False branch must not skip persisting these + // fields; otherwise the Hypervisor status remains stale until a + // later reconcile. + node := &corev1.Node{} + Expect(k8sClient.Get(ctx, types.NamespacedName{Name: resource.Name}, node)).To(Succeed()) + base := node.DeepCopy() + node.Status.Addresses = []corev1.NodeAddress{ + {Type: corev1.NodeInternalIP, Address: "192.168.42.7"}, + } + node.Status.Conditions = append(node.Status.Conditions, corev1.NodeCondition{ + Type: "Terminating", + Status: corev1.ConditionTrue, + Reason: terminatingReason, + Message: "Node is terminating", + }) + Expect(k8sClient.Status().Patch(ctx, node, client.MergeFrom(base))).To(Succeed()) + + result, err := hypervisorController.Reconcile(ctx, ctrl.Request{ + NamespacedName: types.NamespacedName{Name: resource.Name}, + }) + Expect(err).NotTo(HaveOccurred()) + Expect(result.RequeueAfter).To(Equal(defaultPollTime)) + + hypervisor := &kvmv1.Hypervisor{} + Expect(k8sClient.Get(ctx, hypervisorName, hypervisor)).To(Succeed()) + + By("persisting AgentPodsEvicted=False") + Expect(hypervisor.Status.Conditions).To(ContainElement( + SatisfyAll( + HaveField("Type", kvmv1.ConditionTypeAgentPodsEvicted), + HaveField("Status", metav1.ConditionFalse), + HaveField("Reason", "AgentPodsRunning"), + ), + )) + + By("persisting the freshly observed InternalIP") + Expect(hypervisor.Status.InternalIP).To(Equal("192.168.42.7")) + + By("persisting the propagated Terminating condition") + Expect(hypervisor.Status.Conditions).To(ContainElement( + SatisfyAll( + HaveField("Type", kvmv1.ConditionTypeTerminating), + HaveField("Status", metav1.ConditionTrue), + HaveField("Reason", terminatingReason), + ), + )) + }) + + It("should still propagate node label changes to the Hypervisor", func(ctx SpecContext) { + // Stage a node label that the reconcile pass would normally + // transport to the Hypervisor metadata via + // syncLabelsAndAnnotations. The AgentPodsEvicted=False branch + // must not skip this propagation; otherwise the Hypervisor + // labels remain stale during termination. + // + // (The Hypervisor spec is immutable while + // maintenance=='termination', so only metadata.Labels — not + // spec-derived fields like aggregates/customTraits — can + // legitimately change in this state.) + node := &corev1.Node{} + Expect(k8sClient.Get(ctx, types.NamespacedName{Name: resource.Name}, node)).To(Succeed()) + base := node.DeepCopy() + node.Labels[workerGroupLabel] = workerGroupValue + Expect(k8sClient.Patch(ctx, node, client.MergeFrom(base))).To(Succeed()) + + result, err := hypervisorController.Reconcile(ctx, ctrl.Request{ + NamespacedName: types.NamespacedName{Name: resource.Name}, + }) + Expect(err).NotTo(HaveOccurred()) + Expect(result.RequeueAfter).To(Equal(defaultPollTime), + "AgentPodsEvicted=False must still drive a periodic requeue") + + hypervisor := &kvmv1.Hypervisor{} + Expect(k8sClient.Get(ctx, hypervisorName, hypervisor)).To(Succeed()) + + By("transporting the node label to the Hypervisor") + Expect(hypervisor.Labels).To(HaveKeyWithValue(workerGroupLabel, workerGroupValue)) + }) + }) + + When("the only non-tolerating pod is already being deleted", func() { + // A finalizer keeps the API object around with DeletionTimestamp + // set, simulating a pod whose containers are shutting down but + // whose deletion is blocked on some unrelated finalizer. + const finalizer = "test.kvm.cloud.sap/keep-alive" + + BeforeEach(func(ctx SpecContext) { + pod := createPod(ctx, "nova-compute-deleting", agentNamespace, corev1.PodRunning) + pod.Finalizers = []string{finalizer} + Expect(k8sClient.Update(ctx, pod)).To(Succeed()) + Expect(k8sClient.Delete(ctx, pod)).To(Succeed()) + + DeferCleanup(func(ctx SpecContext) { + fresh := &corev1.Pod{} + if err := k8sClient.Get(ctx, types.NamespacedName{Name: pod.Name, Namespace: pod.Namespace}, fresh); err == nil { + fresh.Finalizers = nil + Expect(client.IgnoreNotFound(k8sClient.Update(ctx, fresh))).To(Succeed()) + } + }) + + // Sanity: the pod must still exist with DeletionTimestamp. + fresh := &corev1.Pod{} + Expect(k8sClient.Get(ctx, types.NamespacedName{Name: pod.Name, Namespace: pod.Namespace}, fresh)).To(Succeed()) + Expect(fresh.DeletionTimestamp).NotTo(BeNil()) + }) + + It("should set AgentPodsEvicted=False (deletion-pending pod still counts as running)", func(ctx SpecContext) { + result, err := hypervisorController.Reconcile(ctx, ctrl.Request{ + NamespacedName: types.NamespacedName{Name: resource.Name}, + }) + Expect(err).NotTo(HaveOccurred()) + Expect(result.RequeueAfter).To(Equal(defaultPollTime)) + + hypervisor := &kvmv1.Hypervisor{} + Expect(k8sClient.Get(ctx, hypervisorName, hypervisor)).To(Succeed()) + Expect(hypervisor.Status.Conditions).To(ContainElement( + SatisfyAll( + HaveField("Type", kvmv1.ConditionTypeAgentPodsEvicted), + HaveField("Status", metav1.ConditionFalse), + HaveField("Reason", "AgentPodsRunning"), + HaveField("Message", ContainSubstring("nova-compute-deleting")), + ), + )) + }) + }) + }) +}) + +var _ = Describe("computeAgentPodsEvictedCondition field selector", func() { + // This test verifies that the pod list issued by computeAgentPodsEvictedCondition + // uses the spec.nodeName field selector so that only pods on the target node + // are returned, not all pods in the cluster. + It("should only list pods scheduled on the target node", func(ctx SpecContext) { + // Use a client with DisableFor pods, mirroring production config. + uncachedClient, err := client.New(cfg, client.Options{ + Scheme: k8sscheme.Scheme, + Cache: &client.CacheOptions{ + DisableFor: []client.Object{&corev1.Pod{}}, + }, + }) + Expect(err).NotTo(HaveOccurred()) + + ns := &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: "field-selector-test"}} + Expect(client.IgnoreAlreadyExists(k8sClient.Create(ctx, ns))).To(Succeed()) + DeferCleanup(func(ctx SpecContext) { + Expect(client.IgnoreNotFound(k8sClient.Delete(ctx, ns))).To(Succeed()) + }) + + // Create a pod on "target-node". + onTarget := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{Name: "on-target", Namespace: ns.Name}, + Spec: corev1.PodSpec{ + NodeName: "target-node", + Containers: []corev1.Container{{Name: "c", Image: "registry.example.com/img:latest"}}, + }, + } + Expect(k8sClient.Create(ctx, onTarget)).To(Succeed()) + DeferCleanup(func(ctx SpecContext) { + Expect(client.IgnoreNotFound(k8sClient.Delete(ctx, onTarget))).To(Succeed()) + }) + + // Create a pod on a different node — must not appear in results. + onOther := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{Name: "on-other", Namespace: ns.Name}, + Spec: corev1.PodSpec{ + NodeName: "other-node", + Containers: []corev1.Container{{Name: "c", Image: "registry.example.com/img:latest"}}, + }, + } + Expect(k8sClient.Create(ctx, onOther)).To(Succeed()) + DeferCleanup(func(ctx SpecContext) { + Expect(client.IgnoreNotFound(k8sClient.Delete(ctx, onOther))).To(Succeed()) + }) + + pods := &corev1.PodList{} + Expect(uncachedClient.List(ctx, pods, + client.MatchingFieldsSelector{ + Selector: fields.OneTermEqualSelector("spec.nodeName", "target-node"), + }, + )).To(Succeed()) + + names := make([]string, len(pods.Items)) + for i, p := range pods.Items { + names[i] = p.Name + } + Expect(names).To(ConsistOf("on-target"), + "field selector spec.nodeName must filter server-side; got pods: %v", names) + }) }) diff --git a/internal/controller/offboarding_controller.go b/internal/controller/offboarding_controller.go index d838471b..39ee92b5 100644 --- a/internal/controller/offboarding_controller.go +++ b/internal/controller/offboarding_controller.go @@ -93,6 +93,18 @@ func (r *HypervisorOffboardingReconciler) Reconcile(ctx context.Context, req ctr return ctrl.Result{}, nil } + // A still-running nova-compute agent would re-register the service we + // are about to delete, undoing the offboarding. This gate must be checked + // before any external OpenStack call so that ErrNoHypervisor cannot mark + // offboarding complete while agent pods are still running on the node. + if !meta.IsStatusConditionTrue(hv.Status.Conditions, kvmv1.ConditionTypeAgentPodsEvicted) { + msg := "Waiting for agent pods (nova-compute, neutron) to be evicted from node" + if cond := meta.FindStatusCondition(hv.Status.Conditions, kvmv1.ConditionTypeAgentPodsEvicted); cond != nil && cond.Message != "" { + msg = "Waiting for agent pods to be evicted: " + cond.Message + } + return r.setOffboardingCondition(ctx, hv, msg) + } + hypervisor, err := openstack.GetHypervisorByName(ctx, r.computeClient, hostname, true) if err != nil { if errors.Is(err, openstack.ErrNoHypervisor) { @@ -127,7 +139,6 @@ func (r *HypervisorOffboardingReconciler) Reconcile(ctx context.Context, req ctr return r.setOffboardingCondition(ctx, hv, msg) } - // Deleting and evicted, so better delete the service err = services.Delete(ctx, r.computeClient, hypervisor.Service.ID).ExtractErr() if err != nil && !gophercloud.ResponseCodeIs(err, http.StatusNotFound) { msg := fmt.Sprintf("cannot delete service %s due to %v", hypervisor.Service.ID, err) diff --git a/internal/controller/offboarding_controller_test.go b/internal/controller/offboarding_controller_test.go index 06c38c45..cf2cc1eb 100644 --- a/internal/controller/offboarding_controller_test.go +++ b/internal/controller/offboarding_controller_test.go @@ -141,6 +141,13 @@ var _ = Describe("Offboarding Controller", func() { Reason: "dontcare", Message: "dontcare", }) + // HypervisorController-side signal that agent pods are gone. + meta.SetStatusCondition(&hv.Status.Conditions, metav1.Condition{ + Type: kvmv1.ConditionTypeAgentPodsEvicted, + Status: metav1.ConditionTrue, + Reason: "NoAgentPods", + Message: "No agent pods are running on this node", + }) Expect(k8sClient.Status().Update(ctx, hv)).To(Succeed()) By("Mocking OpenStack API endpoints") @@ -312,6 +319,14 @@ var _ = Describe("Offboarding Controller", func() { Reason: "Offboarding", Message: "Hypervisor is being offboarded, removing host from nova", }) + // Bypass the pod-eviction gate; it is exercised by its own + // case below. + meta.SetStatusCondition(&hypervisor.Status.Conditions, metav1.Condition{ + Type: kvmv1.ConditionTypeAgentPodsEvicted, + Status: metav1.ConditionTrue, + Reason: "NoAgentPods", + Message: "No agent pods are running on this node", + }) Expect(k8sClient.Status().Update(ctx, hypervisor)).To(Succeed()) }) @@ -400,6 +415,52 @@ var _ = Describe("Offboarding Controller", func() { }) }) + Context("When agent pods are still running on the node", func() { + var getHypervisorsCalled int + + BeforeEach(func(ctx SpecContext) { + getHypervisorsCalled = 0 + fakeServer.Mux.HandleFunc("GET /os-hypervisors/detail", func(w http.ResponseWriter, r *http.Request) { + getHypervisorsCalled++ + w.Header().Add("Content-Type", "application/json") + w.WriteHeader(http.StatusOK) + fmt.Fprintf(w, `{ + "hypervisors": [{ + "id": "c48f6247-abe4-4a24-824e-ea39e108874f", + "hypervisor_hostname": "node-test", + "hypervisor_version": 2002000, + "state": "up", + "status": "enabled", + "running_vms": 0, + "service": { + "id": "service-1234", + "host": "node-test" + } + }] + }`) + }) + + By("Overriding AgentPodsEvicted to False") + hypervisor := &kvmv1.Hypervisor{} + Expect(k8sClient.Get(ctx, resourceName, hypervisor)).To(Succeed()) + meta.SetStatusCondition(&hypervisor.Status.Conditions, metav1.Condition{ + Type: kvmv1.ConditionTypeAgentPodsEvicted, + Status: metav1.ConditionFalse, + Reason: "AgentPodsRunning", + Message: "1 agent pod(s) still running on node: monsoon3/nova-compute-xyz", + }) + Expect(k8sClient.Status().Update(ctx, hypervisor)).To(Succeed()) + }) + + It("should wait for the agent pods to be evicted and not delete the compute service", func(ctx SpecContext) { + _, err := offboardingReconciler.Reconcile(ctx, reconcileReq) + Expect(err).NotTo(HaveOccurred()) + sharedOffboardingErrorCheck(ctx, "Waiting for agent pods to be evicted") + Expect(getHypervisorsCalled).To(Equal(0), + "external hypervisor lookup must be short-circuited by the AgentPodsEvicted gate") + }) + }) + Context("When deleting service fails", func() { BeforeEach(func() { fakeServer.Mux.HandleFunc("GET /os-hypervisors/detail", func(w http.ResponseWriter, r *http.Request) { diff --git a/internal/global/global.go b/internal/global/global.go index 270e1e0b..3a687b62 100644 --- a/internal/global/global.go +++ b/internal/global/global.go @@ -20,4 +20,9 @@ package global var ( // LabelSelector is a custom label that is used to select resources managed by the operator. LabelSelector = "" + + // AgentNamespaces is the list of namespaces in which agent pods (nova-compute, + // neutron) are scheduled. The pod list during offboarding is restricted to + // these namespaces. Must be non-empty; set via --agent-namespaces. + AgentNamespaces []string )