Skip to content

Commit 2f3157a

Browse files
committed
Gate compute-service deletion on agent pods being evicted
A running nova-compute pod would re-register the service immediately after deletion, undoing the offboarding. Introduce AgentPodsEvicted condition computed by HypervisorController and block service deletion in the offboarding controller until it is True. The condition is only evaluated once VM eviction has finished (Evicting=False) and the offboarding taint is present on the node. Pods are listed per-namespace via --agent-namespaces (chart default: monsoon3; the operator refuses to start without it) in pages of 100, with the pod cache disabled so the operator does not start a cluster-wide pod informer. RequeueAfter is used to poll since pods are not watched.
1 parent 1117ff7 commit 2f3157a

11 files changed

Lines changed: 610 additions & 14 deletions

File tree

api/v1/hypervisor_types.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,11 @@ const (
7171

7272
// ConditionTypeAggregatesUpdated is the type of condition for aggregates updated status of a hypervisor
7373
ConditionTypeAggregatesUpdated = "AggregatesUpdated"
74+
75+
// ConditionTypeAgentPodsEvicted gates compute-service deletion during
76+
// offboarding: a running nova-compute pod would otherwise re-register
77+
// the service we are about to delete.
78+
ConditionTypeAgentPodsEvicted = "AgentPodsEvicted"
7479
)
7580

7681
// Condition Reasons

charts/openstack-hypervisor-operator/templates/deployment.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,8 @@ spec:
4545
value: {{ quote .Values.controllerManager.manager.env.certificateIssuerName }}
4646
- name: LABEL_SELECTOR
4747
value: {{ quote .Values.controllerManager.manager.env.labelSelector }}
48+
- name: AGENT_NAMESPACES
49+
value: {{ quote .Values.controllerManager.manager.env.agentNamespaces }}
4850
- name: KUBERNETES_CLUSTER_DOMAIN
4951
value: {{ quote .Values.kubernetesClusterDomain }}
5052
image: {{ .Values.controllerManager.manager.image.repository }}:{{ .Chart.AppVersion }}

charts/openstack-hypervisor-operator/templates/role.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,12 @@ rules:
2020
- nodes/status
2121
verbs:
2222
- get
23+
- apiGroups:
24+
- ""
25+
resources:
26+
- pods
27+
verbs:
28+
- list
2329
- apiGroups:
2430
- apps
2531
resources:

charts/openstack-hypervisor-operator/values.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ controllerManager:
77
- --certificate-namespace=$(CERTIFICATE_NAMESPACE)
88
- --certificate-issuer-name=$(CERTIFICATE_ISSUER_NAME)
99
- --label-selector=$(LABEL_SELECTOR)
10+
- --agent-namespaces=$(AGENT_NAMESPACES)
1011
containerSecurityContext:
1112
allowPrivilegeEscalation: false
1213
capabilities:
@@ -16,6 +17,7 @@ controllerManager:
1617
certificateIssuerName: nova-hypervisor-agents-ca-issuer
1718
certificateNamespace: monsoon3
1819
labelSelector: ""
20+
agentNamespaces: "monsoon3"
1921
osAuthUrl: ""
2022
osProjectDomainName: ""
2123
osProjectName: ""

cmd/main.go

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ import (
2525
"fmt"
2626
"os"
2727
gruntime "runtime"
28+
"strings"
2829

2930
"github.com/sapcc/go-api-declarations/bininfo"
3031
// Import all Kubernetes client auth plugins (e.g. Azure, GCP, OIDC, etc.)
@@ -97,6 +98,10 @@ func main() {
9798
"If set, HTTP/2 will be enabled for the metrics and webhook servers")
9899
flag.StringVar(&global.LabelSelector, "label-selector", "", "Label selector to filter watched resources (namely nodes).")
99100

101+
var agentNamespacesFlag string
102+
flag.StringVar(&agentNamespacesFlag, "agent-namespaces", "",
103+
"Comma-separated list of namespaces to search for agent pods (nova-compute, neutron) during offboarding.")
104+
100105
flag.StringVar(&certificateNamespace, "certificate-namespace", "monsoon3", "The namespace for the certificates. ")
101106
flag.StringVar(&certificateIssuerName, "certificate-issuer-name", "nova-hypervisor-agents-ca-issuer",
102107
"Name of the certificate issuer.")
@@ -111,13 +116,38 @@ func main() {
111116
opts.BindFlags(flag.CommandLine)
112117
flag.Parse()
113118

119+
// Handle --version before validating required runtime flags so that
120+
// `./manager --version` works without providing --agent-namespaces.
114121
if version {
115122
fmt.Printf("%s %s (%s/%s) %s\n",
116123
bininfo.Component(), bininfo.VersionOr("devel"), gruntime.GOOS, gruntime.GOARCH,
117124
bininfo.CommitOr("edge"))
118125
os.Exit(0)
119126
}
120127

128+
if agentNamespacesFlag != "" {
129+
// Deduplicate. Without this, repeated entries (e.g. from templated
130+
// values) would cause redundant pod-list calls and inflate the
131+
// AgentPodsEvicted condition message with duplicate pod names.
132+
seen := map[string]struct{}{}
133+
for ns := range strings.SplitSeq(agentNamespacesFlag, ",") {
134+
ns = strings.TrimSpace(ns)
135+
if ns == "" {
136+
continue
137+
}
138+
if _, ok := seen[ns]; ok {
139+
continue
140+
}
141+
seen[ns] = struct{}{}
142+
global.AgentNamespaces = append(global.AgentNamespaces, ns)
143+
}
144+
}
145+
146+
if len(global.AgentNamespaces) == 0 {
147+
setupLog.Error(errors.New("--agent-namespaces is required"), "invalid configuration")
148+
os.Exit(1)
149+
}
150+
121151
if certificateIssuerName == "" {
122152
setupLog.Error(errors.New("certificate-issuer-name cannot be empty"), "invalid certificate issuer name")
123153
os.Exit(1)
@@ -226,6 +256,13 @@ func main() {
226256

227257
// Optionally configure the cache to listen/watch for specific labeled resources only
228258
Cache: cacheOptions,
259+
// Pods are listed directly (not cached) to avoid a cluster-wide pod
260+
// informer — a single large namespace would cause an OOM on startup.
261+
Client: client.Options{
262+
Cache: &client.CacheOptions{
263+
DisableFor: []client.Object{&corev1.Pod{}},
264+
},
265+
},
229266
})
230267

231268
if err != nil {

go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ require (
4343
github.com/felixge/httpsnoop v1.0.4 // indirect
4444
github.com/fsnotify/fsnotify v1.9.0 // indirect
4545
github.com/fxamacker/cbor/v2 v2.9.0 // indirect
46-
github.com/go-logr/logr v1.4.3 // indirect
46+
github.com/go-logr/logr v1.4.3
4747
github.com/go-logr/stdr v1.2.2 // indirect
4848
github.com/go-logr/zapr v1.3.0 // indirect
4949
github.com/go-openapi/jsonpointer v0.22.4 // indirect

internal/controller/hypervisor_controller.go

Lines changed: 134 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -21,13 +21,17 @@ import (
2121
"context"
2222
"fmt"
2323
"slices"
24+
"sort"
2425
"strings"
26+
"time"
2527

28+
"github.com/go-logr/logr"
2629
corev1 "k8s.io/api/core/v1"
2730
"k8s.io/apimachinery/pkg/api/equality"
2831
"k8s.io/apimachinery/pkg/api/errors"
2932
"k8s.io/apimachinery/pkg/api/meta"
3033
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
34+
"k8s.io/apimachinery/pkg/fields"
3135
"k8s.io/apimachinery/pkg/labels"
3236
"k8s.io/apimachinery/pkg/runtime"
3337
ctrl "sigs.k8s.io/controller-runtime"
@@ -69,6 +73,7 @@ type HypervisorController struct {
6973

7074
// +kubebuilder:rbac:groups="",resources=nodes,verbs=get;list;watch
7175
// +kubebuilder:rbac:groups="",resources=nodes/status,verbs=get
76+
// +kubebuilder:rbac:groups="",resources=pods,verbs=list
7277
// +kubebuilder:rbac:groups=kvm.cloud.sap,resources=hypervisors,verbs=get;list;watch;create;update;patch;delete
7378
// +kubebuilder:rbac:groups=kvm.cloud.sap,resources=hypervisors/status,verbs=get;update;patch
7479

@@ -100,8 +105,22 @@ func (hv *HypervisorController) Reconcile(ctx context.Context, req ctrl.Request)
100105
}
101106
// continue with creation
102107
} else {
103-
// update Status if needed
104-
base := hypervisor.DeepCopy()
108+
// First, propagate spec/metadata derived from the Node (labels,
109+
// annotations -> aggregates/traits, lifecycle). This must run on
110+
// every reconcile, including those where status will also change
111+
// (e.g. AgentPodsEvicted=False during termination); otherwise the
112+
// Hypervisor spec/labels go stale.
113+
specBase := hypervisor.DeepCopy()
114+
syncLabelsAndAnnotations(nodeLabels, hypervisor, node)
115+
if !equality.Semantic.DeepEqual(hypervisor, specBase) {
116+
if err := hv.Patch(ctx, hypervisor, k8sclient.MergeFromWithOptions(specBase,
117+
k8sclient.MergeFromWithOptimisticLock{}), k8sclient.FieldOwner(HypervisorControllerName)); err != nil {
118+
return ctrl.Result{}, err
119+
}
120+
}
121+
122+
// Then, compute and persist any status changes derived from the Node.
123+
statusBase := hypervisor.DeepCopy()
105124

106125
// transfer internal IP
107126
for _, address := range node.Status.Addresses {
@@ -123,26 +142,43 @@ func (hv *HypervisorController) Reconcile(ctx context.Context, req ctrl.Request)
123142
})
124143
}
125144

126-
if !equality.Semantic.DeepEqual(hypervisor, base) {
145+
// Only evaluate after VM eviction; a spurious True on a fresh node
146+
// (agents not yet scheduled) would be misleading.
147+
var statusRequeueAfter time.Duration
148+
if hypervisor.Spec.Maintenance == kvmv1.MaintenanceTermination &&
149+
meta.IsStatusConditionFalse(hypervisor.Status.Conditions, kvmv1.ConditionTypeEvicting) &&
150+
nodeHasOffboardingTaint(node) {
151+
cond, err := hv.computeAgentPodsEvictedCondition(ctx, log, node.Name)
152+
if err != nil {
153+
return ctrl.Result{}, fmt.Errorf("failed to compute %s condition: %w", kvmv1.ConditionTypeAgentPodsEvicted, err)
154+
}
155+
meta.SetStatusCondition(&hypervisor.Status.Conditions, cond)
156+
if cond.Status == metav1.ConditionFalse {
157+
// No pod watch — rely on periodic requeue.
158+
statusRequeueAfter = defaultPollTime
159+
}
160+
}
161+
162+
if !equality.Semantic.DeepEqual(hypervisor, statusBase) {
127163
// Capture values to apply - only mutate fields this controller owns
128164
newInternalIP := hypervisor.Status.InternalIP
129165
terminatingCondition := meta.FindStatusCondition(hypervisor.Status.Conditions, kvmv1.ConditionTypeTerminating)
166+
agentPodsCondition := meta.FindStatusCondition(hypervisor.Status.Conditions, kvmv1.ConditionTypeAgentPodsEvicted)
130167

131-
return ctrl.Result{}, utils.PatchHypervisorStatusWithRetry(ctx, hv.Client, hypervisor.Name, HypervisorControllerName, func(h *kvmv1.Hypervisor) {
168+
if err := utils.PatchHypervisorStatusWithRetry(ctx, hv.Client, hypervisor.Name, HypervisorControllerName, func(h *kvmv1.Hypervisor) {
132169
h.Status.InternalIP = newInternalIP
133170
if terminatingCondition != nil {
134171
meta.SetStatusCondition(&h.Status.Conditions, *terminatingCondition)
135172
}
136-
})
137-
}
138-
139-
syncLabelsAndAnnotations(nodeLabels, hypervisor, node)
140-
if equality.Semantic.DeepEqual(hypervisor, base) {
141-
return ctrl.Result{}, nil
173+
if agentPodsCondition != nil {
174+
meta.SetStatusCondition(&h.Status.Conditions, *agentPodsCondition)
175+
}
176+
}); err != nil {
177+
return ctrl.Result{}, err
178+
}
142179
}
143180

144-
return ctrl.Result{}, hv.Patch(ctx, hypervisor, k8sclient.MergeFromWithOptions(base,
145-
k8sclient.MergeFromWithOptimisticLock{}), k8sclient.FieldOwner(HypervisorControllerName))
181+
return ctrl.Result{RequeueAfter: statusRequeueAfter}, nil
146182
}
147183

148184
syncLabelsAndAnnotations(nodeLabels, hypervisor, node)
@@ -262,3 +298,89 @@ func transportLabels(source, destination *metav1.ObjectMeta) {
262298
}
263299
}
264300
}
301+
302+
// nodeHasOffboardingTaint reports whether the offboarding NoExecute taint has
303+
// been applied to the node. The pod list is only meaningful after that point —
304+
// before it, no agents have been evicted yet.
305+
func nodeHasOffboardingTaint(node *corev1.Node) bool {
306+
for _, t := range node.Spec.Taints {
307+
if t.Key == taintKeyOffboarding && t.Effect == corev1.TaintEffectNoExecute {
308+
return true
309+
}
310+
}
311+
return false
312+
}
313+
314+
// computeAgentPodsEvictedCondition checks whether pods that would be evicted
315+
// by the offboarding taint are still running. Only call once Evicting=False.
316+
func (hv *HypervisorController) computeAgentPodsEvictedCondition(ctx context.Context, log logr.Logger, nodeName string) (metav1.Condition, error) {
317+
offboardingTaint := corev1.Taint{
318+
Key: taintKeyOffboarding,
319+
Effect: corev1.TaintEffectNoExecute,
320+
}
321+
322+
var agentPods []string
323+
for _, ns := range global.AgentNamespaces {
324+
var continueToken string
325+
for {
326+
pods := &corev1.PodList{}
327+
if err := hv.List(ctx, pods,
328+
k8sclient.InNamespace(ns),
329+
k8sclient.MatchingFieldsSelector{
330+
Selector: fields.OneTermEqualSelector("spec.nodeName", nodeName),
331+
},
332+
&k8sclient.ListOptions{Limit: 100, Continue: continueToken},
333+
); err != nil {
334+
return metav1.Condition{}, fmt.Errorf("failed to list pods on node %s in namespace %q: %w", nodeName, ns, err)
335+
}
336+
337+
for _, pod := range pods.Items {
338+
if pod.Status.Phase == corev1.PodSucceeded || pod.Status.Phase == corev1.PodFailed {
339+
continue
340+
}
341+
if podToleratesTaint(log, &pod, &offboardingTaint) {
342+
continue
343+
}
344+
agentPods = append(agentPods, pod.Namespace+"/"+pod.Name)
345+
}
346+
347+
if pods.Continue == "" {
348+
break
349+
}
350+
continueToken = pods.Continue
351+
}
352+
}
353+
354+
if len(agentPods) == 0 {
355+
return metav1.Condition{
356+
Type: kvmv1.ConditionTypeAgentPodsEvicted,
357+
Status: metav1.ConditionTrue,
358+
Reason: "NoAgentPods",
359+
Message: "No agent pods are running on this node",
360+
}, nil
361+
}
362+
363+
sort.Strings(agentPods)
364+
return metav1.Condition{
365+
Type: kvmv1.ConditionTypeAgentPodsEvicted,
366+
Status: metav1.ConditionFalse,
367+
Reason: "AgentPodsRunning",
368+
Message: fmt.Sprintf("%d agent pod(s) still running on node: %s", len(agentPods), strings.Join(agentPods, ", ")),
369+
}, nil
370+
}
371+
372+
// podToleratesTaint reports whether the pod tolerates the taint indefinitely.
373+
// Tolerations with a finite TolerationSeconds are excluded: the pod will
374+
// eventually be evicted and must not be treated as safe to ignore.
375+
func podToleratesTaint(log logr.Logger, pod *corev1.Pod, taint *corev1.Taint) bool {
376+
for i := range pod.Spec.Tolerations {
377+
t := &pod.Spec.Tolerations[i]
378+
if t.TolerationSeconds != nil {
379+
continue
380+
}
381+
if t.ToleratesTaint(log, taint, false) {
382+
return true
383+
}
384+
}
385+
return false
386+
}

0 commit comments

Comments
 (0)