Skip to content

Commit 9217eec

Browse files
committed
Gate compute-service deletion on agent pods being evicted
A running nova-compute pod would re-register the service immediately after deletion, undoing the offboarding. Introduce AgentPodsEvicted condition computed by HypervisorController and block service deletion in the offboarding controller until it is True. The condition is only evaluated once VM eviction has finished (Evicting=False) and the offboarding taint is present on the node. Pods are listed per-namespace via --agent-namespaces (chart default: monsoon3; the operator refuses to start without it) in pages of 100, with the pod cache disabled so the operator does not start a cluster-wide pod informer. RequeueAfter is used to poll since pods are not watched.
1 parent 1117ff7 commit 9217eec

11 files changed

Lines changed: 555 additions & 4 deletions

File tree

api/v1/hypervisor_types.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,11 @@ const (
7171

7272
// ConditionTypeAggregatesUpdated is the type of condition for aggregates updated status of a hypervisor
7373
ConditionTypeAggregatesUpdated = "AggregatesUpdated"
74+
75+
// ConditionTypeAgentPodsEvicted gates compute-service deletion during
76+
// offboarding: a running nova-compute pod would otherwise re-register
77+
// the service we are about to delete.
78+
ConditionTypeAgentPodsEvicted = "AgentPodsEvicted"
7479
)
7580

7681
// Condition Reasons

charts/openstack-hypervisor-operator/templates/deployment.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,8 @@ spec:
4545
value: {{ quote .Values.controllerManager.manager.env.certificateIssuerName }}
4646
- name: LABEL_SELECTOR
4747
value: {{ quote .Values.controllerManager.manager.env.labelSelector }}
48+
- name: AGENT_NAMESPACES
49+
value: {{ quote .Values.controllerManager.manager.env.agentNamespaces }}
4850
- name: KUBERNETES_CLUSTER_DOMAIN
4951
value: {{ quote .Values.kubernetesClusterDomain }}
5052
image: {{ .Values.controllerManager.manager.image.repository }}:{{ .Chart.AppVersion }}

charts/openstack-hypervisor-operator/templates/role.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,12 @@ rules:
2020
- nodes/status
2121
verbs:
2222
- get
23+
- apiGroups:
24+
- ""
25+
resources:
26+
- pods
27+
verbs:
28+
- list
2329
- apiGroups:
2430
- apps
2531
resources:

charts/openstack-hypervisor-operator/values.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ controllerManager:
77
- --certificate-namespace=$(CERTIFICATE_NAMESPACE)
88
- --certificate-issuer-name=$(CERTIFICATE_ISSUER_NAME)
99
- --label-selector=$(LABEL_SELECTOR)
10+
- --agent-namespaces=$(AGENT_NAMESPACES)
1011
containerSecurityContext:
1112
allowPrivilegeEscalation: false
1213
capabilities:
@@ -16,6 +17,7 @@ controllerManager:
1617
certificateIssuerName: nova-hypervisor-agents-ca-issuer
1718
certificateNamespace: monsoon3
1819
labelSelector: ""
20+
agentNamespaces: "monsoon3"
1921
osAuthUrl: ""
2022
osProjectDomainName: ""
2123
osProjectName: ""

cmd/main.go

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ import (
2525
"fmt"
2626
"os"
2727
gruntime "runtime"
28+
"strings"
2829

2930
"github.com/sapcc/go-api-declarations/bininfo"
3031
// Import all Kubernetes client auth plugins (e.g. Azure, GCP, OIDC, etc.)
@@ -97,6 +98,10 @@ func main() {
9798
"If set, HTTP/2 will be enabled for the metrics and webhook servers")
9899
flag.StringVar(&global.LabelSelector, "label-selector", "", "Label selector to filter watched resources (namely nodes).")
99100

101+
var agentNamespacesFlag string
102+
flag.StringVar(&agentNamespacesFlag, "agent-namespaces", "",
103+
"Comma-separated list of namespaces to search for agent pods (nova-compute, neutron) during offboarding.")
104+
100105
flag.StringVar(&certificateNamespace, "certificate-namespace", "monsoon3", "The namespace for the certificates. ")
101106
flag.StringVar(&certificateIssuerName, "certificate-issuer-name", "nova-hypervisor-agents-ca-issuer",
102107
"Name of the certificate issuer.")
@@ -111,13 +116,28 @@ func main() {
111116
opts.BindFlags(flag.CommandLine)
112117
flag.Parse()
113118

119+
// Handle --version before validating required runtime flags so that
120+
// `./manager --version` works without providing --agent-namespaces.
114121
if version {
115122
fmt.Printf("%s %s (%s/%s) %s\n",
116123
bininfo.Component(), bininfo.VersionOr("devel"), gruntime.GOOS, gruntime.GOARCH,
117124
bininfo.CommitOr("edge"))
118125
os.Exit(0)
119126
}
120127

128+
if agentNamespacesFlag != "" {
129+
for ns := range strings.SplitSeq(agentNamespacesFlag, ",") {
130+
if ns = strings.TrimSpace(ns); ns != "" {
131+
global.AgentNamespaces = append(global.AgentNamespaces, ns)
132+
}
133+
}
134+
}
135+
136+
if len(global.AgentNamespaces) == 0 {
137+
setupLog.Error(errors.New("--agent-namespaces is required"), "invalid configuration")
138+
os.Exit(1)
139+
}
140+
121141
if certificateIssuerName == "" {
122142
setupLog.Error(errors.New("certificate-issuer-name cannot be empty"), "invalid certificate issuer name")
123143
os.Exit(1)
@@ -226,6 +246,13 @@ func main() {
226246

227247
// Optionally configure the cache to listen/watch for specific labeled resources only
228248
Cache: cacheOptions,
249+
// Pods are listed directly (not cached) to avoid a cluster-wide pod
250+
// informer — a single large namespace would cause an OOM on startup.
251+
Client: client.Options{
252+
Cache: &client.CacheOptions{
253+
DisableFor: []client.Object{&corev1.Pod{}},
254+
},
255+
},
229256
})
230257

231258
if err != nil {

go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ require (
4343
github.com/felixge/httpsnoop v1.0.4 // indirect
4444
github.com/fsnotify/fsnotify v1.9.0 // indirect
4545
github.com/fxamacker/cbor/v2 v2.9.0 // indirect
46-
github.com/go-logr/logr v1.4.3 // indirect
46+
github.com/go-logr/logr v1.4.3
4747
github.com/go-logr/stdr v1.2.2 // indirect
4848
github.com/go-logr/zapr v1.3.0 // indirect
4949
github.com/go-openapi/jsonpointer v0.22.4 // indirect

internal/controller/hypervisor_controller.go

Lines changed: 120 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,13 +21,17 @@ import (
2121
"context"
2222
"fmt"
2323
"slices"
24+
"sort"
2425
"strings"
26+
"time"
2527

28+
"github.com/go-logr/logr"
2629
corev1 "k8s.io/api/core/v1"
2730
"k8s.io/apimachinery/pkg/api/equality"
2831
"k8s.io/apimachinery/pkg/api/errors"
2932
"k8s.io/apimachinery/pkg/api/meta"
3033
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
34+
"k8s.io/apimachinery/pkg/fields"
3135
"k8s.io/apimachinery/pkg/labels"
3236
"k8s.io/apimachinery/pkg/runtime"
3337
ctrl "sigs.k8s.io/controller-runtime"
@@ -69,6 +73,7 @@ type HypervisorController struct {
6973

7074
// +kubebuilder:rbac:groups="",resources=nodes,verbs=get;list;watch
7175
// +kubebuilder:rbac:groups="",resources=nodes/status,verbs=get
76+
// +kubebuilder:rbac:groups="",resources=pods,verbs=list
7277
// +kubebuilder:rbac:groups=kvm.cloud.sap,resources=hypervisors,verbs=get;list;watch;create;update;patch;delete
7378
// +kubebuilder:rbac:groups=kvm.cloud.sap,resources=hypervisors/status,verbs=get;update;patch
7479

@@ -123,17 +128,44 @@ func (hv *HypervisorController) Reconcile(ctx context.Context, req ctrl.Request)
123128
})
124129
}
125130

131+
// Only evaluate after VM eviction; a spurious True on a fresh node
132+
// (agents not yet scheduled) would be misleading.
133+
var statusRequeueAfter time.Duration
134+
if hypervisor.Spec.Maintenance == kvmv1.MaintenanceTermination &&
135+
meta.IsStatusConditionFalse(hypervisor.Status.Conditions, kvmv1.ConditionTypeEvicting) &&
136+
nodeHasOffboardingTaint(node) {
137+
cond, err := hv.computeAgentPodsEvictedCondition(ctx, log, node.Name)
138+
if err != nil {
139+
return ctrl.Result{}, fmt.Errorf("failed to compute %s condition: %w", kvmv1.ConditionTypeAgentPodsEvicted, err)
140+
}
141+
meta.SetStatusCondition(&hypervisor.Status.Conditions, cond)
142+
if cond.Status == metav1.ConditionFalse {
143+
// No pod watch — rely on periodic requeue.
144+
statusRequeueAfter = defaultPollTime
145+
}
146+
}
147+
126148
if !equality.Semantic.DeepEqual(hypervisor, base) {
127149
// Capture values to apply - only mutate fields this controller owns
128150
newInternalIP := hypervisor.Status.InternalIP
129151
terminatingCondition := meta.FindStatusCondition(hypervisor.Status.Conditions, kvmv1.ConditionTypeTerminating)
152+
agentPodsCondition := meta.FindStatusCondition(hypervisor.Status.Conditions, kvmv1.ConditionTypeAgentPodsEvicted)
130153

131-
return ctrl.Result{}, utils.PatchHypervisorStatusWithRetry(ctx, hv.Client, hypervisor.Name, HypervisorControllerName, func(h *kvmv1.Hypervisor) {
154+
if err := utils.PatchHypervisorStatusWithRetry(ctx, hv.Client, hypervisor.Name, HypervisorControllerName, func(h *kvmv1.Hypervisor) {
132155
h.Status.InternalIP = newInternalIP
133156
if terminatingCondition != nil {
134157
meta.SetStatusCondition(&h.Status.Conditions, *terminatingCondition)
135158
}
136-
})
159+
if agentPodsCondition != nil {
160+
meta.SetStatusCondition(&h.Status.Conditions, *agentPodsCondition)
161+
}
162+
}); err != nil {
163+
return ctrl.Result{}, err
164+
}
165+
return ctrl.Result{RequeueAfter: statusRequeueAfter}, nil
166+
}
167+
if statusRequeueAfter > 0 {
168+
return ctrl.Result{RequeueAfter: statusRequeueAfter}, nil
137169
}
138170

139171
syncLabelsAndAnnotations(nodeLabels, hypervisor, node)
@@ -262,3 +294,89 @@ func transportLabels(source, destination *metav1.ObjectMeta) {
262294
}
263295
}
264296
}
297+
298+
// nodeHasOffboardingTaint reports whether the offboarding NoExecute taint has
299+
// been applied to the node. The pod list is only meaningful after that point —
300+
// before it, no agents have been evicted yet.
301+
func nodeHasOffboardingTaint(node *corev1.Node) bool {
302+
for _, t := range node.Spec.Taints {
303+
if t.Key == taintKeyOffboarding && t.Effect == corev1.TaintEffectNoExecute {
304+
return true
305+
}
306+
}
307+
return false
308+
}
309+
310+
// computeAgentPodsEvictedCondition checks whether pods that would be evicted
311+
// by the offboarding taint are still running. Only call once Evicting=False.
312+
func (hv *HypervisorController) computeAgentPodsEvictedCondition(ctx context.Context, log logr.Logger, nodeName string) (metav1.Condition, error) {
313+
offboardingTaint := corev1.Taint{
314+
Key: taintKeyOffboarding,
315+
Effect: corev1.TaintEffectNoExecute,
316+
}
317+
318+
var agentPods []string
319+
for _, ns := range global.AgentNamespaces {
320+
var continueToken string
321+
for {
322+
pods := &corev1.PodList{}
323+
if err := hv.List(ctx, pods,
324+
k8sclient.InNamespace(ns),
325+
k8sclient.MatchingFieldsSelector{
326+
Selector: fields.OneTermEqualSelector("spec.nodeName", nodeName),
327+
},
328+
&k8sclient.ListOptions{Limit: 100, Continue: continueToken},
329+
); err != nil {
330+
return metav1.Condition{}, fmt.Errorf("failed to list pods on node %s in namespace %q: %w", nodeName, ns, err)
331+
}
332+
333+
for _, pod := range pods.Items {
334+
if pod.Status.Phase == corev1.PodSucceeded || pod.Status.Phase == corev1.PodFailed {
335+
continue
336+
}
337+
if podToleratesTaint(log, &pod, &offboardingTaint) {
338+
continue
339+
}
340+
agentPods = append(agentPods, pod.Namespace+"/"+pod.Name)
341+
}
342+
343+
if pods.Continue == "" {
344+
break
345+
}
346+
continueToken = pods.Continue
347+
}
348+
}
349+
350+
if len(agentPods) == 0 {
351+
return metav1.Condition{
352+
Type: kvmv1.ConditionTypeAgentPodsEvicted,
353+
Status: metav1.ConditionTrue,
354+
Reason: "NoAgentPods",
355+
Message: "No agent pods are running on this node",
356+
}, nil
357+
}
358+
359+
sort.Strings(agentPods)
360+
return metav1.Condition{
361+
Type: kvmv1.ConditionTypeAgentPodsEvicted,
362+
Status: metav1.ConditionFalse,
363+
Reason: "AgentPodsRunning",
364+
Message: fmt.Sprintf("%d agent pod(s) still running on node: %s", len(agentPods), strings.Join(agentPods, ", ")),
365+
}, nil
366+
}
367+
368+
// podToleratesTaint reports whether the pod tolerates the taint indefinitely.
369+
// Tolerations with a finite TolerationSeconds are excluded: the pod will
370+
// eventually be evicted and must not be treated as safe to ignore.
371+
func podToleratesTaint(log logr.Logger, pod *corev1.Pod, taint *corev1.Taint) bool {
372+
for i := range pod.Spec.Tolerations {
373+
t := &pod.Spec.Tolerations[i]
374+
if t.TolerationSeconds != nil {
375+
continue
376+
}
377+
if t.ToleratesTaint(log, taint, false) {
378+
return true
379+
}
380+
}
381+
return false
382+
}

0 commit comments

Comments
 (0)