Skip to content

Commit 6b461ab

Browse files
committed
Gate compute-service deletion on agent pods being evicted
A running nova-compute pod would re-register the service immediately after deletion, undoing the offboarding. Introduce AgentPodsEvicted condition computed by HypervisorController and block service deletion in the offboarding controller until it is True. The condition is only evaluated once VM eviction has finished (Evicting=False) and the offboarding taint is present on the node. Pods are listed per-namespace via --agent-namespaces (chart default: monsoon3; the operator refuses to start without it) in pages of 100, with the pod cache disabled so the operator does not start a cluster-wide pod informer. RequeueAfter is used to poll since pods are not watched.
1 parent 0e8e58c commit 6b461ab

6 files changed

Lines changed: 175 additions & 24 deletions

File tree

charts/openstack-hypervisor-operator/templates/deployment.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,8 @@ spec:
4545
value: {{ quote .Values.controllerManager.manager.env.certificateIssuerName }}
4646
- name: LABEL_SELECTOR
4747
value: {{ quote .Values.controllerManager.manager.env.labelSelector }}
48+
- name: AGENT_NAMESPACES
49+
value: {{ quote .Values.controllerManager.manager.env.agentNamespaces }}
4850
- name: KUBERNETES_CLUSTER_DOMAIN
4951
value: {{ quote .Values.kubernetesClusterDomain }}
5052
image: {{ .Values.controllerManager.manager.image.repository }}:{{ .Chart.AppVersion }}

charts/openstack-hypervisor-operator/values.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ controllerManager:
77
- --certificate-namespace=$(CERTIFICATE_NAMESPACE)
88
- --certificate-issuer-name=$(CERTIFICATE_ISSUER_NAME)
99
- --label-selector=$(LABEL_SELECTOR)
10+
- --agent-namespaces=$(AGENT_NAMESPACES)
1011
containerSecurityContext:
1112
allowPrivilegeEscalation: false
1213
capabilities:
@@ -16,6 +17,7 @@ controllerManager:
1617
certificateIssuerName: nova-hypervisor-agents-ca-issuer
1718
certificateNamespace: monsoon3
1819
labelSelector: ""
20+
agentNamespaces: "monsoon3"
1921
osAuthUrl: ""
2022
osProjectDomainName: ""
2123
osProjectName: ""

cmd/main.go

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ import (
2525
"fmt"
2626
"os"
2727
gruntime "runtime"
28+
"strings"
2829

2930
"github.com/sapcc/go-api-declarations/bininfo"
3031
// Import all Kubernetes client auth plugins (e.g. Azure, GCP, OIDC, etc.)
@@ -97,6 +98,10 @@ func main() {
9798
"If set, HTTP/2 will be enabled for the metrics and webhook servers")
9899
flag.StringVar(&global.LabelSelector, "label-selector", "", "Label selector to filter watched resources (namely nodes).")
99100

101+
var agentNamespacesFlag string
102+
flag.StringVar(&agentNamespacesFlag, "agent-namespaces", "",
103+
"Comma-separated list of namespaces to search for agent pods (nova-compute, neutron) during offboarding.")
104+
100105
flag.StringVar(&certificateNamespace, "certificate-namespace", "monsoon3", "The namespace for the certificates. ")
101106
flag.StringVar(&certificateIssuerName, "certificate-issuer-name", "nova-hypervisor-agents-ca-issuer",
102107
"Name of the certificate issuer.")
@@ -111,6 +116,19 @@ func main() {
111116
opts.BindFlags(flag.CommandLine)
112117
flag.Parse()
113118

119+
if agentNamespacesFlag != "" {
120+
for ns := range strings.SplitSeq(agentNamespacesFlag, ",") {
121+
if ns = strings.TrimSpace(ns); ns != "" {
122+
global.AgentNamespaces = append(global.AgentNamespaces, ns)
123+
}
124+
}
125+
}
126+
127+
if len(global.AgentNamespaces) == 0 {
128+
setupLog.Error(nil, "--agent-namespaces is required")
129+
os.Exit(1)
130+
}
131+
114132
if version {
115133
fmt.Printf("%s %s (%s/%s) %s\n",
116134
bininfo.Component(), bininfo.VersionOr("devel"), gruntime.GOOS, gruntime.GOARCH,
@@ -226,6 +244,13 @@ func main() {
226244

227245
// Optionally configure the cache to listen/watch for specific labeled resources only
228246
Cache: cacheOptions,
247+
// Pods are listed directly (not cached) to avoid a cluster-wide pod
248+
// informer — a single large namespace would cause an OOM on startup.
249+
Client: client.Options{
250+
Cache: &client.CacheOptions{
251+
DisableFor: []client.Object{&corev1.Pod{}},
252+
},
253+
},
229254
})
230255

231256
if err != nil {

internal/controller/hypervisor_controller.go

Lines changed: 42 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -130,7 +130,8 @@ func (hv *HypervisorController) Reconcile(ctx context.Context, req ctrl.Request)
130130
// Only evaluate after VM eviction; a spurious True on a fresh node
131131
// (agents not yet scheduled) would be misleading.
132132
if hypervisor.Spec.Maintenance == kvmv1.MaintenanceTermination &&
133-
meta.IsStatusConditionFalse(hypervisor.Status.Conditions, kvmv1.ConditionTypeEvicting) {
133+
meta.IsStatusConditionFalse(hypervisor.Status.Conditions, kvmv1.ConditionTypeEvicting) &&
134+
nodeHasOffboardingTaint(node) {
134135
cond, err := hv.computeAgentPodsEvictedCondition(ctx, log, node.Name)
135136
if err != nil {
136137
return ctrl.Result{}, fmt.Errorf("failed to compute %s condition: %w", kvmv1.ConditionTypeAgentPodsEvicted, err)
@@ -291,35 +292,56 @@ func transportLabels(source, destination *metav1.ObjectMeta) {
291292
}
292293
}
293294

295+
// nodeHasOffboardingTaint reports whether the offboarding NoExecute taint has
296+
// been applied to the node. The pod list is only meaningful after that point —
297+
// before it, no agents have been evicted yet.
298+
func nodeHasOffboardingTaint(node *corev1.Node) bool {
299+
for _, t := range node.Spec.Taints {
300+
if t.Key == taintKeyOffboarding && t.Effect == corev1.TaintEffectNoExecute {
301+
return true
302+
}
303+
}
304+
return false
305+
}
306+
294307
// computeAgentPodsEvictedCondition checks whether pods that would be evicted
295308
// by the offboarding taint are still running. Only call once Evicting=False.
296309
func (hv *HypervisorController) computeAgentPodsEvictedCondition(ctx context.Context, log logr.Logger, nodeName string) (metav1.Condition, error) {
297-
pods := &corev1.PodList{}
298-
if err := hv.List(ctx, pods, k8sclient.MatchingFieldsSelector{
299-
Selector: fields.OneTermEqualSelector("spec.nodeName", nodeName),
300-
}); err != nil {
301-
return metav1.Condition{}, fmt.Errorf("failed to list pods on node %s: %w", nodeName, err)
302-
}
303-
304310
offboardingTaint := corev1.Taint{
305311
Key: taintKeyOffboarding,
306312
Effect: corev1.TaintEffectNoExecute,
307313
}
308314

309315
var agentPods []string
310-
for _, pod := range pods.Items {
311-
if pod.Status.Phase == corev1.PodSucceeded || pod.Status.Phase == corev1.PodFailed {
312-
continue
313-
}
314-
// Deletion in progress; don't wait for API object removal
315-
// (may be blocked on unrelated finalizers).
316-
if pod.DeletionTimestamp != nil {
317-
continue
318-
}
319-
if podToleratesTaint(log, &pod, &offboardingTaint) {
320-
continue
316+
for _, ns := range global.AgentNamespaces {
317+
var continueToken string
318+
for {
319+
pods := &corev1.PodList{}
320+
if err := hv.List(ctx, pods,
321+
k8sclient.InNamespace(ns),
322+
k8sclient.MatchingFieldsSelector{
323+
Selector: fields.OneTermEqualSelector("spec.nodeName", nodeName),
324+
},
325+
&k8sclient.ListOptions{Limit: 100, Continue: continueToken},
326+
); err != nil {
327+
return metav1.Condition{}, fmt.Errorf("failed to list pods on node %s in namespace %q: %w", nodeName, ns, err)
328+
}
329+
330+
for _, pod := range pods.Items {
331+
if pod.Status.Phase == corev1.PodSucceeded || pod.Status.Phase == corev1.PodFailed {
332+
continue
333+
}
334+
if podToleratesTaint(log, &pod, &offboardingTaint) {
335+
continue
336+
}
337+
agentPods = append(agentPods, pod.Namespace+"/"+pod.Name)
338+
}
339+
340+
if pods.Continue == "" {
341+
break
342+
}
343+
continueToken = pods.Continue
321344
}
322-
agentPods = append(agentPods, pod.Namespace+"/"+pod.Name)
323345
}
324346

325347
if len(agentPods) == 0 {

internal/controller/hypervisor_controller_test.go

Lines changed: 99 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -26,11 +26,14 @@ import (
2626
corev1 "k8s.io/api/core/v1"
2727
"k8s.io/apimachinery/pkg/api/meta"
2828
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
29+
"k8s.io/apimachinery/pkg/fields"
2930
"k8s.io/apimachinery/pkg/types"
31+
k8sscheme "k8s.io/client-go/kubernetes/scheme"
3032
ctrl "sigs.k8s.io/controller-runtime"
3133
"sigs.k8s.io/controller-runtime/pkg/client"
3234

3335
kvmv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1"
36+
"github.com/cobaltcore-dev/openstack-hypervisor-operator/internal/global"
3437
)
3538

3639
// envtest does not actually GC pods when their namespace is deleted, so
@@ -422,6 +425,10 @@ var _ = Describe("Hypervisor Controller", func() {
422425
Expect(client.IgnoreNotFound(k8sClient.Delete(ctx, ns))).To(Succeed())
423426
})
424427

428+
// Restrict pod listing to the test namespace.
429+
global.AgentNamespaces = []string{agentNamespace}
430+
DeferCleanup(func() { global.AgentNamespaces = nil })
431+
425432
// The condition is only computed during termination.
426433
_, err := hypervisorController.Reconcile(ctx, ctrl.Request{
427434
NamespacedName: types.NamespacedName{Name: resource.Name},
@@ -444,6 +451,30 @@ var _ = Describe("Hypervisor Controller", func() {
444451
Message: "All VMs evicted",
445452
})
446453
Expect(k8sClient.Status().Update(ctx, hypervisor)).To(Succeed())
454+
455+
// The pod list is only issued once the offboarding taint is on the node.
456+
node := &corev1.Node{}
457+
Expect(k8sClient.Get(ctx, types.NamespacedName{Name: resource.Name}, node)).To(Succeed())
458+
base := node.DeepCopy()
459+
node.Spec.Taints = append(node.Spec.Taints, corev1.Taint{
460+
Key: taintKeyOffboarding,
461+
Effect: corev1.TaintEffectNoExecute,
462+
})
463+
Expect(k8sClient.Patch(ctx, node, client.MergeFrom(base))).To(Succeed())
464+
DeferCleanup(func(ctx SpecContext) {
465+
fresh := &corev1.Node{}
466+
if err := k8sClient.Get(ctx, types.NamespacedName{Name: resource.Name}, fresh); err == nil {
467+
base := fresh.DeepCopy()
468+
taints := fresh.Spec.Taints[:0]
469+
for _, t := range fresh.Spec.Taints {
470+
if t.Key != taintKeyOffboarding {
471+
taints = append(taints, t)
472+
}
473+
}
474+
fresh.Spec.Taints = taints
475+
Expect(client.IgnoreNotFound(k8sClient.Patch(ctx, fresh, client.MergeFrom(base)))).To(Succeed())
476+
}
477+
})
447478
})
448479

449480
createPod := func(ctx SpecContext, name, namespace string, phase corev1.PodPhase, tolerations ...corev1.Toleration) *corev1.Pod {
@@ -552,22 +583,86 @@ var _ = Describe("Hypervisor Controller", func() {
552583
Expect(fresh.DeletionTimestamp).NotTo(BeNil())
553584
})
554585

555-
It("should set AgentPodsEvicted=True (deletion-pending pod is treated as gone)", func(ctx SpecContext) {
556-
_, err := hypervisorController.Reconcile(ctx, ctrl.Request{
586+
It("should set AgentPodsEvicted=False (deletion-pending pod still counts as running)", func(ctx SpecContext) {
587+
result, err := hypervisorController.Reconcile(ctx, ctrl.Request{
557588
NamespacedName: types.NamespacedName{Name: resource.Name},
558589
})
559590
Expect(err).NotTo(HaveOccurred())
591+
Expect(result.RequeueAfter).To(Equal(defaultPollTime))
560592

561593
hypervisor := &kvmv1.Hypervisor{}
562594
Expect(k8sClient.Get(ctx, hypervisorName, hypervisor)).To(Succeed())
563595
Expect(hypervisor.Status.Conditions).To(ContainElement(
564596
SatisfyAll(
565597
HaveField("Type", kvmv1.ConditionTypeAgentPodsEvicted),
566-
HaveField("Status", metav1.ConditionTrue),
567-
HaveField("Reason", "NoAgentPods"),
598+
HaveField("Status", metav1.ConditionFalse),
599+
HaveField("Reason", "AgentPodsRunning"),
600+
HaveField("Message", ContainSubstring("nova-compute-deleting")),
568601
),
569602
))
570603
})
571604
})
572605
})
573606
})
607+
608+
var _ = Describe("computeAgentPodsEvictedCondition field selector", func() {
609+
// This test verifies that the pod list issued by computeAgentPodsEvictedCondition
610+
// uses the spec.nodeName field selector so that only pods on the target node
611+
// are returned, not all pods in the cluster.
612+
It("should only list pods scheduled on the target node", func(ctx SpecContext) {
613+
// Use a client with DisableFor pods, mirroring production config.
614+
uncachedClient, err := client.New(cfg, client.Options{
615+
Scheme: k8sscheme.Scheme,
616+
Cache: &client.CacheOptions{
617+
DisableFor: []client.Object{&corev1.Pod{}},
618+
},
619+
})
620+
Expect(err).NotTo(HaveOccurred())
621+
622+
ns := &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: "field-selector-test"}}
623+
Expect(client.IgnoreAlreadyExists(k8sClient.Create(ctx, ns))).To(Succeed())
624+
DeferCleanup(func(ctx SpecContext) {
625+
Expect(client.IgnoreNotFound(k8sClient.Delete(ctx, ns))).To(Succeed())
626+
})
627+
628+
// Create a pod on "target-node".
629+
onTarget := &corev1.Pod{
630+
ObjectMeta: metav1.ObjectMeta{Name: "on-target", Namespace: ns.Name},
631+
Spec: corev1.PodSpec{
632+
NodeName: "target-node",
633+
Containers: []corev1.Container{{Name: "c", Image: "registry.example.com/img:latest"}},
634+
},
635+
}
636+
Expect(k8sClient.Create(ctx, onTarget)).To(Succeed())
637+
DeferCleanup(func(ctx SpecContext) {
638+
Expect(client.IgnoreNotFound(k8sClient.Delete(ctx, onTarget))).To(Succeed())
639+
})
640+
641+
// Create a pod on a different node — must not appear in results.
642+
onOther := &corev1.Pod{
643+
ObjectMeta: metav1.ObjectMeta{Name: "on-other", Namespace: ns.Name},
644+
Spec: corev1.PodSpec{
645+
NodeName: "other-node",
646+
Containers: []corev1.Container{{Name: "c", Image: "registry.example.com/img:latest"}},
647+
},
648+
}
649+
Expect(k8sClient.Create(ctx, onOther)).To(Succeed())
650+
DeferCleanup(func(ctx SpecContext) {
651+
Expect(client.IgnoreNotFound(k8sClient.Delete(ctx, onOther))).To(Succeed())
652+
})
653+
654+
pods := &corev1.PodList{}
655+
Expect(uncachedClient.List(ctx, pods,
656+
client.MatchingFieldsSelector{
657+
Selector: fields.OneTermEqualSelector("spec.nodeName", "target-node"),
658+
},
659+
)).To(Succeed())
660+
661+
names := make([]string, len(pods.Items))
662+
for i, p := range pods.Items {
663+
names[i] = p.Name
664+
}
665+
Expect(names).To(ConsistOf("on-target"),
666+
"field selector spec.nodeName must filter server-side; got pods: %v", names)
667+
})
668+
})

internal/global/global.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,4 +20,9 @@ package global
2020
var (
2121
// LabelSelector is a custom label that is used to select resources managed by the operator.
2222
LabelSelector = ""
23+
24+
// AgentNamespaces is the list of namespaces in which agent pods (nova-compute,
25+
// neutron) are scheduled. The pod list during offboarding is restricted to
26+
// these namespaces. Must be non-empty; set via --agent-namespaces.
27+
AgentNamespaces []string
2328
)

0 commit comments

Comments
 (0)