Skip to content

Commit a20a49f

Browse files
committed
Gate compute-service deletion on agent pods being evicted
A running nova-compute pod would re-register the service immediately after deletion, undoing the offboarding. Introduce AgentPodsEvicted condition computed by HypervisorController and block service deletion in the offboarding controller until it is True. The condition is only evaluated once VM eviction has finished (Evicting=False) and the offboarding taint is present on the node. Pods are listed per-namespace via --agent-namespaces (chart default: monsoon3; the operator refuses to start without it) in pages of 100, with the pod cache disabled so the operator does not start a cluster-wide pod informer. RequeueAfter is used to poll since pods are not watched.
1 parent 1117ff7 commit a20a49f

11 files changed

Lines changed: 491 additions & 2 deletions

File tree

api/v1/hypervisor_types.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,11 @@ const (
7171

7272
// ConditionTypeAggregatesUpdated is the type of condition for aggregates updated status of a hypervisor
7373
ConditionTypeAggregatesUpdated = "AggregatesUpdated"
74+
75+
// ConditionTypeAgentPodsEvicted gates compute-service deletion during
76+
// offboarding: a running nova-compute pod would otherwise re-register
77+
// the service we are about to delete.
78+
ConditionTypeAgentPodsEvicted = "AgentPodsEvicted"
7479
)
7580

7681
// Condition Reasons

charts/openstack-hypervisor-operator/templates/deployment.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,8 @@ spec:
4545
value: {{ quote .Values.controllerManager.manager.env.certificateIssuerName }}
4646
- name: LABEL_SELECTOR
4747
value: {{ quote .Values.controllerManager.manager.env.labelSelector }}
48+
- name: AGENT_NAMESPACES
49+
value: {{ quote .Values.controllerManager.manager.env.agentNamespaces }}
4850
- name: KUBERNETES_CLUSTER_DOMAIN
4951
value: {{ quote .Values.kubernetesClusterDomain }}
5052
image: {{ .Values.controllerManager.manager.image.repository }}:{{ .Chart.AppVersion }}

charts/openstack-hypervisor-operator/templates/role.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,12 @@ rules:
2020
- nodes/status
2121
verbs:
2222
- get
23+
- apiGroups:
24+
- ""
25+
resources:
26+
- pods
27+
verbs:
28+
- list
2329
- apiGroups:
2430
- apps
2531
resources:

charts/openstack-hypervisor-operator/values.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ controllerManager:
77
- --certificate-namespace=$(CERTIFICATE_NAMESPACE)
88
- --certificate-issuer-name=$(CERTIFICATE_ISSUER_NAME)
99
- --label-selector=$(LABEL_SELECTOR)
10+
- --agent-namespaces=$(AGENT_NAMESPACES)
1011
containerSecurityContext:
1112
allowPrivilegeEscalation: false
1213
capabilities:
@@ -16,6 +17,7 @@ controllerManager:
1617
certificateIssuerName: nova-hypervisor-agents-ca-issuer
1718
certificateNamespace: monsoon3
1819
labelSelector: ""
20+
agentNamespaces: "monsoon3"
1921
osAuthUrl: ""
2022
osProjectDomainName: ""
2123
osProjectName: ""

cmd/main.go

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ import (
2525
"fmt"
2626
"os"
2727
gruntime "runtime"
28+
"strings"
2829

2930
"github.com/sapcc/go-api-declarations/bininfo"
3031
// Import all Kubernetes client auth plugins (e.g. Azure, GCP, OIDC, etc.)
@@ -97,6 +98,10 @@ func main() {
9798
"If set, HTTP/2 will be enabled for the metrics and webhook servers")
9899
flag.StringVar(&global.LabelSelector, "label-selector", "", "Label selector to filter watched resources (namely nodes).")
99100

101+
var agentNamespacesFlag string
102+
flag.StringVar(&agentNamespacesFlag, "agent-namespaces", "",
103+
"Comma-separated list of namespaces to search for agent pods (nova-compute, neutron) during offboarding.")
104+
100105
flag.StringVar(&certificateNamespace, "certificate-namespace", "monsoon3", "The namespace for the certificates. ")
101106
flag.StringVar(&certificateIssuerName, "certificate-issuer-name", "nova-hypervisor-agents-ca-issuer",
102107
"Name of the certificate issuer.")
@@ -111,13 +116,28 @@ func main() {
111116
opts.BindFlags(flag.CommandLine)
112117
flag.Parse()
113118

119+
// Handle --version before validating required runtime flags so that
120+
// `./manager --version` works without providing --agent-namespaces.
114121
if version {
115122
fmt.Printf("%s %s (%s/%s) %s\n",
116123
bininfo.Component(), bininfo.VersionOr("devel"), gruntime.GOOS, gruntime.GOARCH,
117124
bininfo.CommitOr("edge"))
118125
os.Exit(0)
119126
}
120127

128+
if agentNamespacesFlag != "" {
129+
for ns := range strings.SplitSeq(agentNamespacesFlag, ",") {
130+
if ns = strings.TrimSpace(ns); ns != "" {
131+
global.AgentNamespaces = append(global.AgentNamespaces, ns)
132+
}
133+
}
134+
}
135+
136+
if len(global.AgentNamespaces) == 0 {
137+
setupLog.Error(errors.New("--agent-namespaces is required"), "invalid configuration")
138+
os.Exit(1)
139+
}
140+
121141
if certificateIssuerName == "" {
122142
setupLog.Error(errors.New("certificate-issuer-name cannot be empty"), "invalid certificate issuer name")
123143
os.Exit(1)
@@ -226,6 +246,13 @@ func main() {
226246

227247
// Optionally configure the cache to listen/watch for specific labeled resources only
228248
Cache: cacheOptions,
249+
// Pods are listed directly (not cached) to avoid a cluster-wide pod
250+
// informer — a single large namespace would cause an OOM on startup.
251+
Client: client.Options{
252+
Cache: &client.CacheOptions{
253+
DisableFor: []client.Object{&corev1.Pod{}},
254+
},
255+
},
229256
})
230257

231258
if err != nil {

go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ require (
4343
github.com/felixge/httpsnoop v1.0.4 // indirect
4444
github.com/fsnotify/fsnotify v1.9.0 // indirect
4545
github.com/fxamacker/cbor/v2 v2.9.0 // indirect
46-
github.com/go-logr/logr v1.4.3 // indirect
46+
github.com/go-logr/logr v1.4.3
4747
github.com/go-logr/stdr v1.2.2 // indirect
4848
github.com/go-logr/zapr v1.3.0 // indirect
4949
github.com/go-openapi/jsonpointer v0.22.4 // indirect

internal/controller/hypervisor_controller.go

Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,13 +21,16 @@ import (
2121
"context"
2222
"fmt"
2323
"slices"
24+
"sort"
2425
"strings"
2526

27+
"github.com/go-logr/logr"
2628
corev1 "k8s.io/api/core/v1"
2729
"k8s.io/apimachinery/pkg/api/equality"
2830
"k8s.io/apimachinery/pkg/api/errors"
2931
"k8s.io/apimachinery/pkg/api/meta"
3032
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
33+
"k8s.io/apimachinery/pkg/fields"
3134
"k8s.io/apimachinery/pkg/labels"
3235
"k8s.io/apimachinery/pkg/runtime"
3336
ctrl "sigs.k8s.io/controller-runtime"
@@ -69,6 +72,7 @@ type HypervisorController struct {
6972

7073
// +kubebuilder:rbac:groups="",resources=nodes,verbs=get;list;watch
7174
// +kubebuilder:rbac:groups="",resources=nodes/status,verbs=get
75+
// +kubebuilder:rbac:groups="",resources=pods,verbs=list
7276
// +kubebuilder:rbac:groups=kvm.cloud.sap,resources=hypervisors,verbs=get;list;watch;create;update;patch;delete
7377
// +kubebuilder:rbac:groups=kvm.cloud.sap,resources=hypervisors/status,verbs=get;update;patch
7478

@@ -123,16 +127,41 @@ func (hv *HypervisorController) Reconcile(ctx context.Context, req ctrl.Request)
123127
})
124128
}
125129

130+
// Only evaluate after VM eviction; a spurious True on a fresh node
131+
// (agents not yet scheduled) would be misleading.
132+
if hypervisor.Spec.Maintenance == kvmv1.MaintenanceTermination &&
133+
meta.IsStatusConditionFalse(hypervisor.Status.Conditions, kvmv1.ConditionTypeEvicting) &&
134+
nodeHasOffboardingTaint(node) {
135+
cond, err := hv.computeAgentPodsEvictedCondition(ctx, log, node.Name)
136+
if err != nil {
137+
return ctrl.Result{}, fmt.Errorf("failed to compute %s condition: %w", kvmv1.ConditionTypeAgentPodsEvicted, err)
138+
}
139+
meta.SetStatusCondition(&hypervisor.Status.Conditions, cond)
140+
if cond.Status == metav1.ConditionFalse {
141+
// No pod watch — rely on periodic requeue.
142+
if err := utils.PatchHypervisorStatusWithRetry(ctx, hv.Client, hypervisor.Name, HypervisorControllerName, func(h *kvmv1.Hypervisor) {
143+
meta.SetStatusCondition(&h.Status.Conditions, cond)
144+
}); err != nil {
145+
return ctrl.Result{}, err
146+
}
147+
return ctrl.Result{RequeueAfter: defaultPollTime}, nil
148+
}
149+
}
150+
126151
if !equality.Semantic.DeepEqual(hypervisor, base) {
127152
// Capture values to apply - only mutate fields this controller owns
128153
newInternalIP := hypervisor.Status.InternalIP
129154
terminatingCondition := meta.FindStatusCondition(hypervisor.Status.Conditions, kvmv1.ConditionTypeTerminating)
155+
agentPodsCondition := meta.FindStatusCondition(hypervisor.Status.Conditions, kvmv1.ConditionTypeAgentPodsEvicted)
130156

131157
return ctrl.Result{}, utils.PatchHypervisorStatusWithRetry(ctx, hv.Client, hypervisor.Name, HypervisorControllerName, func(h *kvmv1.Hypervisor) {
132158
h.Status.InternalIP = newInternalIP
133159
if terminatingCondition != nil {
134160
meta.SetStatusCondition(&h.Status.Conditions, *terminatingCondition)
135161
}
162+
if agentPodsCondition != nil {
163+
meta.SetStatusCondition(&h.Status.Conditions, *agentPodsCondition)
164+
}
136165
})
137166
}
138167

@@ -262,3 +291,89 @@ func transportLabels(source, destination *metav1.ObjectMeta) {
262291
}
263292
}
264293
}
294+
295+
// nodeHasOffboardingTaint reports whether the offboarding NoExecute taint has
296+
// been applied to the node. The pod list is only meaningful after that point —
297+
// before it, no agents have been evicted yet.
298+
func nodeHasOffboardingTaint(node *corev1.Node) bool {
299+
for _, t := range node.Spec.Taints {
300+
if t.Key == taintKeyOffboarding && t.Effect == corev1.TaintEffectNoExecute {
301+
return true
302+
}
303+
}
304+
return false
305+
}
306+
307+
// computeAgentPodsEvictedCondition checks whether pods that would be evicted
308+
// by the offboarding taint are still running. Only call once Evicting=False.
309+
func (hv *HypervisorController) computeAgentPodsEvictedCondition(ctx context.Context, log logr.Logger, nodeName string) (metav1.Condition, error) {
310+
offboardingTaint := corev1.Taint{
311+
Key: taintKeyOffboarding,
312+
Effect: corev1.TaintEffectNoExecute,
313+
}
314+
315+
var agentPods []string
316+
for _, ns := range global.AgentNamespaces {
317+
var continueToken string
318+
for {
319+
pods := &corev1.PodList{}
320+
if err := hv.List(ctx, pods,
321+
k8sclient.InNamespace(ns),
322+
k8sclient.MatchingFieldsSelector{
323+
Selector: fields.OneTermEqualSelector("spec.nodeName", nodeName),
324+
},
325+
&k8sclient.ListOptions{Limit: 100, Continue: continueToken},
326+
); err != nil {
327+
return metav1.Condition{}, fmt.Errorf("failed to list pods on node %s in namespace %q: %w", nodeName, ns, err)
328+
}
329+
330+
for _, pod := range pods.Items {
331+
if pod.Status.Phase == corev1.PodSucceeded || pod.Status.Phase == corev1.PodFailed {
332+
continue
333+
}
334+
if podToleratesTaint(log, &pod, &offboardingTaint) {
335+
continue
336+
}
337+
agentPods = append(agentPods, pod.Namespace+"/"+pod.Name)
338+
}
339+
340+
if pods.Continue == "" {
341+
break
342+
}
343+
continueToken = pods.Continue
344+
}
345+
}
346+
347+
if len(agentPods) == 0 {
348+
return metav1.Condition{
349+
Type: kvmv1.ConditionTypeAgentPodsEvicted,
350+
Status: metav1.ConditionTrue,
351+
Reason: "NoAgentPods",
352+
Message: "No agent pods are running on this node",
353+
}, nil
354+
}
355+
356+
sort.Strings(agentPods)
357+
return metav1.Condition{
358+
Type: kvmv1.ConditionTypeAgentPodsEvicted,
359+
Status: metav1.ConditionFalse,
360+
Reason: "AgentPodsRunning",
361+
Message: fmt.Sprintf("%d agent pod(s) still running on node: %s", len(agentPods), strings.Join(agentPods, ", ")),
362+
}, nil
363+
}
364+
365+
// podToleratesTaint reports whether the pod tolerates the taint indefinitely.
366+
// Tolerations with a finite TolerationSeconds are excluded: the pod will
367+
// eventually be evicted and must not be treated as safe to ignore.
368+
func podToleratesTaint(log logr.Logger, pod *corev1.Pod, taint *corev1.Taint) bool {
369+
for i := range pod.Spec.Tolerations {
370+
t := &pod.Spec.Tolerations[i]
371+
if t.TolerationSeconds != nil {
372+
continue
373+
}
374+
if t.ToleratesTaint(log, taint, false) {
375+
return true
376+
}
377+
}
378+
return false
379+
}

0 commit comments

Comments
 (0)