Skip to content

Commit 38c7019

Browse files
committed
[CASCL-1386] Create temporary PodDisruptionBudgets during eviction
Part of a stack splitting #3026 (too large to review in one piece) into small pieces that each build and pass tests on their own. The command is fully functional only once the whole stack lands.
1 parent 72d803c commit 38c7019

4 files changed

Lines changed: 847 additions & 7 deletions

File tree

cmd/kubectl-datadog/autoscaling/cluster/evict/pdb.go

Lines changed: 340 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,15 +2,353 @@ package evict
22

33
import (
44
"context"
5+
"crypto/sha256"
6+
"encoding/hex"
7+
"errors"
8+
"fmt"
9+
"log"
10+
"reflect"
11+
"slices"
12+
"strings"
513

14+
"github.com/samber/lo"
15+
appsv1 "k8s.io/api/apps/v1"
16+
corev1 "k8s.io/api/core/v1"
17+
policyv1 "k8s.io/api/policy/v1"
18+
apierrors "k8s.io/apimachinery/pkg/api/errors"
19+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
20+
"k8s.io/apimachinery/pkg/runtime"
21+
"k8s.io/apimachinery/pkg/util/intstr"
22+
"k8s.io/apimachinery/pkg/util/validation"
623
"k8s.io/client-go/kubernetes"
24+
"k8s.io/client-go/tools/pager"
725
"sigs.k8s.io/controller-runtime/pkg/client"
26+
27+
commonk8s "github.com/DataDog/datadog-operator/cmd/kubectl-datadog/autoscaling/cluster/common/k8s"
28+
)
29+
30+
// Label keys used to mark PodDisruptionBudgets created by this command. Both
31+
// must be present for cleanup to consider the PDB ours — this makes the
32+
// cleanup safe against accidentally removing a user PDB with a colliding name.
33+
const (
34+
pdbManagedByLabelKey = "app.kubernetes.io/managed-by"
35+
pdbManagedByLabelValue = "kubectl-datadog"
36+
pdbTempLabelKey = "autoscaling.datadoghq.com/temporary-pdb"
37+
pdbTempLabelValue = "true"
38+
// pdbNameSuffix is appended to the controller name to form the temp PDB
39+
// name.
40+
pdbNameSuffix = "-evict-legacy"
841
)
942

1043
func ensureTempPDBs(ctx context.Context, clientset kubernetes.Interface, ctrlClient client.Client, targets []Target, dryRun bool) error {
11-
panic("TODO: ensureTempPDBs — implemented in PR https://github.com/DataDog/datadog-operator/pull/3178")
44+
allNodes := lo.FlatMap(targets, func(t Target, _ int) []string { return t.Nodes })
45+
nodeSet := lo.SliceToMap(allNodes, func(n string) (string, struct{}) { return n, struct{}{} })
46+
if len(nodeSet) == 0 {
47+
return nil
48+
}
49+
50+
controllers, err := discoverControllers(ctx, clientset, nodeSet)
51+
if err != nil {
52+
return fmt.Errorf("failed to discover controllers: %w", err)
53+
}
54+
if len(controllers) == 0 {
55+
return nil
56+
}
57+
58+
// Group controllers by namespace to amortize the per-namespace PDB list.
59+
byNamespace := make(map[string][]controllerInfo)
60+
for _, c := range controllers {
61+
byNamespace[c.Namespace] = append(byNamespace[c.Namespace], c)
62+
}
63+
64+
var errs []error
65+
for ns, ctrls := range byNamespace {
66+
existing, err := listNamespacePDBs(ctx, clientset, ns)
67+
if err != nil {
68+
errs = append(errs, fmt.Errorf("namespace %s: failed to list PDBs: %w", ns, err))
69+
continue
70+
}
71+
for _, c := range ctrls {
72+
if hasUserPDB(existing, c.Selector) {
73+
continue
74+
}
75+
if err := createTempPDB(ctx, ctrlClient, c, dryRun); err != nil {
76+
errs = append(errs, fmt.Errorf("controller %s/%s/%s: %w", c.Namespace, c.Kind, c.Name, err))
77+
}
78+
}
79+
}
80+
return errors.Join(errs...)
1281
}
1382

1483
func cleanupTempPDBs(ctx context.Context, ctrlClient client.Client, dryRun bool) error {
15-
panic("TODO: cleanupTempPDBs — implemented in PR https://github.com/DataDog/datadog-operator/pull/3178")
84+
list := &policyv1.PodDisruptionBudgetList{}
85+
if err := ctrlClient.List(ctx, list, client.MatchingLabels{
86+
pdbManagedByLabelKey: pdbManagedByLabelValue,
87+
pdbTempLabelKey: pdbTempLabelValue,
88+
}); err != nil {
89+
return fmt.Errorf("failed to list temporary PDBs: %w", err)
90+
}
91+
if len(list.Items) == 0 {
92+
return nil
93+
}
94+
var errs []error
95+
for i := range list.Items {
96+
pdb := &list.Items[i]
97+
if dryRun {
98+
log.Printf("[dry-run] would delete PDB %s/%s", pdb.Namespace, pdb.Name)
99+
continue
100+
}
101+
if err := commonk8s.Delete(ctx, ctrlClient, pdb); err != nil {
102+
errs = append(errs, fmt.Errorf("PDB %s/%s: %w", pdb.Namespace, pdb.Name, err))
103+
}
104+
}
105+
return errors.Join(errs...)
106+
}
107+
108+
// reclaimLeakedTempPDBs deletes temporary PDBs left behind by an interrupted
109+
// previous run. It is a no-op when temp-PDB management is disabled. Failures are
110+
// logged rather than returned: this runs on the no-op exit path (nothing left
111+
// to evict), where reclaiming a leftover PDB is best-effort, not a reason to
112+
// fail the command.
113+
func reclaimLeakedTempPDBs(ctx context.Context, ctrlClient client.Client, ensurePDBs, dryRun bool) {
114+
if !ensurePDBs {
115+
return
116+
}
117+
if err := cleanupTempPDBs(ctx, ctrlClient, dryRun); err != nil {
118+
log.Printf("Warning: failed to cleanup leftover temporary PDBs: %v", err)
119+
}
120+
}
121+
122+
// controllerKey identifies a top-level controller that owns evictable pods on
123+
// our target nodes. It is the dedup key in the seen map and the identity half
124+
// of controllerInfo.
125+
type controllerKey struct {
126+
Namespace string
127+
Kind string // "Deployment", "StatefulSet", "ReplicaSet"
128+
Name string
129+
}
130+
131+
// controllerInfo is a controllerKey plus the controller's pod selector — what a
132+
// PDB would match on.
133+
type controllerInfo struct {
134+
controllerKey
135+
Selector *metav1.LabelSelector
136+
}
137+
138+
// discoverControllers lists every Pod cluster-wide once (paginated) and, for
139+
// each Pod scheduled on one of the target nodes, resolves the top-level
140+
// controller. Listing once and filtering client-side avoids the N-API-calls
141+
// problem of doing one List per node, which on a large legacy fleet would
142+
// dominate the command's wall-clock time. The resulting slice contains each
143+
// controller at most once.
144+
func discoverControllers(ctx context.Context, clientset kubernetes.Interface, nodeSet map[string]struct{}) ([]controllerInfo, error) {
145+
seen := make(map[controllerKey]*metav1.LabelSelector)
146+
depCache := make(map[client.ObjectKey]*appsv1.Deployment)
147+
rsCache := make(map[client.ObjectKey]*appsv1.ReplicaSet)
148+
stsCache := make(map[client.ObjectKey]*appsv1.StatefulSet)
149+
150+
p := pager.New(func(ctx context.Context, opts metav1.ListOptions) (runtime.Object, error) {
151+
return clientset.CoreV1().Pods(metav1.NamespaceAll).List(ctx, opts)
152+
})
153+
if err := p.EachListItem(ctx, metav1.ListOptions{}, func(obj runtime.Object) error {
154+
pod := obj.(*corev1.Pod)
155+
if _, onTarget := nodeSet[pod.Spec.NodeName]; !onTarget {
156+
return nil
157+
}
158+
if shouldSkipEviction(pod) {
159+
return nil
160+
}
161+
info, err := resolveTopLevelController(ctx, clientset, pod, depCache, rsCache, stsCache)
162+
if err != nil {
163+
if apierrors.IsNotFound(err) {
164+
log.Printf("Warning: controller for pod %s/%s not found (deleted mid-scan); skipping", pod.Namespace, pod.Name)
165+
return nil
166+
}
167+
return fmt.Errorf("resolve controller for pod %s/%s: %w", pod.Namespace, pod.Name, err)
168+
}
169+
if info == nil {
170+
return nil
171+
}
172+
seen[info.controllerKey] = info.Selector
173+
return nil
174+
}); err != nil {
175+
return nil, fmt.Errorf("list pods: %w", err)
176+
}
177+
178+
return lo.MapToSlice(seen, func(key controllerKey, selector *metav1.LabelSelector) controllerInfo {
179+
return controllerInfo{controllerKey: key, Selector: selector}
180+
}), nil
181+
}
182+
183+
// resolveTopLevelController walks a Pod's owner chain up to the workload
184+
// controller (Deployment > ReplicaSet > Pod, StatefulSet > Pod). Returns nil
185+
// for Pods whose top-level owner is not a stable workload — Jobs (TTL-managed),
186+
// DaemonSets (already skipped at eviction), or bare Pods.
187+
func resolveTopLevelController(
188+
ctx context.Context,
189+
clientset kubernetes.Interface,
190+
pod *corev1.Pod,
191+
depCache map[client.ObjectKey]*appsv1.Deployment,
192+
rsCache map[client.ObjectKey]*appsv1.ReplicaSet,
193+
stsCache map[client.ObjectKey]*appsv1.StatefulSet,
194+
) (*controllerInfo, error) {
195+
owner := metav1.GetControllerOf(pod)
196+
if owner == nil {
197+
return nil, nil
198+
}
199+
switch owner.Kind {
200+
case "ReplicaSet":
201+
rs, err := getCached(ctx, rsCache, pod.Namespace, owner.Name, clientset.AppsV1().ReplicaSets(pod.Namespace).Get)
202+
if err != nil {
203+
return nil, err
204+
}
205+
rsOwner := metav1.GetControllerOf(rs)
206+
if rsOwner != nil && rsOwner.Kind == "Deployment" {
207+
dep, err := getCached(ctx, depCache, pod.Namespace, rsOwner.Name, clientset.AppsV1().Deployments(pod.Namespace).Get)
208+
if err == nil {
209+
return &controllerInfo{
210+
controllerKey: controllerKey{Namespace: pod.Namespace, Kind: "Deployment", Name: dep.Name},
211+
Selector: dep.Spec.Selector,
212+
}, nil
213+
}
214+
if !apierrors.IsNotFound(err) {
215+
return nil, err
216+
}
217+
}
218+
return &controllerInfo{
219+
controllerKey: controllerKey{Namespace: pod.Namespace, Kind: "ReplicaSet", Name: rs.Name},
220+
Selector: rs.Spec.Selector,
221+
}, nil
222+
case "StatefulSet":
223+
sts, err := getCached(ctx, stsCache, pod.Namespace, owner.Name, clientset.AppsV1().StatefulSets(pod.Namespace).Get)
224+
if err != nil {
225+
return nil, err
226+
}
227+
return &controllerInfo{
228+
controllerKey: controllerKey{Namespace: pod.Namespace, Kind: "StatefulSet", Name: sts.Name},
229+
Selector: sts.Spec.Selector,
230+
}, nil
231+
default:
232+
// DaemonSet (skipped before reaching here), Job (TTL), CronJob,
233+
// custom controllers — none get a temporary PDB.
234+
return nil, nil
235+
}
236+
}
237+
238+
// getCached returns the object identified by (ns, name) from cache, fetching it
239+
// via get — the namespace-bound typed clientset accessor, e.g.
240+
// clientset.AppsV1().ReplicaSets(ns).Get — and populating the cache on a miss.
241+
// T is inferred from the cache's value type, collapsing the per-kind getters
242+
// into a single generic lookup.
243+
func getCached[T any](
244+
ctx context.Context,
245+
cache map[client.ObjectKey]*T,
246+
ns, name string,
247+
get func(ctx context.Context, name string, opts metav1.GetOptions) (*T, error),
248+
) (*T, error) {
249+
key := client.ObjectKey{Namespace: ns, Name: name}
250+
if obj, ok := cache[key]; ok {
251+
return obj, nil
252+
}
253+
obj, err := get(ctx, name, metav1.GetOptions{})
254+
if err != nil {
255+
return nil, err
256+
}
257+
cache[key] = obj
258+
return obj, nil
259+
}
260+
261+
// listNamespacePDBs returns every PDB in the namespace. Used to detect
262+
// pre-existing user PDBs covering a controller we'd otherwise PDB-protect.
263+
func listNamespacePDBs(ctx context.Context, clientset kubernetes.Interface, namespace string) ([]policyv1.PodDisruptionBudget, error) {
264+
list, err := clientset.PolicyV1().PodDisruptionBudgets(namespace).List(ctx, metav1.ListOptions{})
265+
if err != nil {
266+
return nil, err
267+
}
268+
return list.Items, nil
269+
}
270+
271+
// hasUserPDB returns true when an existing non-temporary PDB has the same
272+
// selector as the controller's pod selector. This is a conservative
273+
// equality check: a broader user PDB will NOT be detected, and we'll create
274+
// our own. Eviction will then respect the most restrictive of the two,
275+
// preserving the user's intent.
276+
func hasUserPDB(existing []policyv1.PodDisruptionBudget, controllerSelector *metav1.LabelSelector) bool {
277+
if controllerSelector == nil {
278+
return false
279+
}
280+
return slices.ContainsFunc(existing, func(pdb policyv1.PodDisruptionBudget) bool {
281+
return !isTemporaryPDB(&pdb) && reflect.DeepEqual(pdb.Spec.Selector, controllerSelector)
282+
})
283+
}
284+
285+
func isTemporaryPDB(pdb *policyv1.PodDisruptionBudget) bool {
286+
return pdb.Labels[pdbManagedByLabelKey] == pdbManagedByLabelValue &&
287+
pdb.Labels[pdbTempLabelKey] == pdbTempLabelValue
288+
}
289+
290+
// createTempPDB writes (or no-ops if our PDB already exists) a temporary
291+
// PodDisruptionBudget with maxUnavailable: 1. Existing PDBs that aren't ours
292+
// at the same name are left alone with a logged warning — that's a name
293+
// collision the user must resolve.
294+
func createTempPDB(ctx context.Context, ctrlClient client.Client, c controllerInfo, dryRun bool) error {
295+
name := tempPDBName(c.Kind, c.Name)
296+
if dryRun {
297+
log.Printf("[dry-run] would create PDB %s/%s (maxUnavailable: 1, selector: %s)", c.Namespace, name, metav1.FormatLabelSelector(c.Selector))
298+
return nil
299+
}
300+
301+
existing := &policyv1.PodDisruptionBudget{}
302+
err := ctrlClient.Get(ctx, client.ObjectKey{Namespace: c.Namespace, Name: name}, existing)
303+
switch {
304+
case err == nil:
305+
if !isTemporaryPDB(existing) {
306+
log.Printf("Warning: PDB %s/%s exists but is not labelled as temporary; leaving it untouched", c.Namespace, name)
307+
return nil
308+
}
309+
// Our PDB from a previous (possibly crashed) run. Leave as-is; the
310+
// cleanup step will remove it at the end of the current run.
311+
return nil
312+
case !apierrors.IsNotFound(err):
313+
return fmt.Errorf("failed to get PDB %s/%s: %w", c.Namespace, name, err)
314+
}
315+
316+
maxUnavailable := intstr.FromInt(1)
317+
pdb := &policyv1.PodDisruptionBudget{
318+
TypeMeta: metav1.TypeMeta{APIVersion: "policy/v1", Kind: "PodDisruptionBudget"},
319+
ObjectMeta: metav1.ObjectMeta{
320+
Name: name,
321+
Namespace: c.Namespace,
322+
Labels: map[string]string{
323+
pdbManagedByLabelKey: pdbManagedByLabelValue,
324+
pdbTempLabelKey: pdbTempLabelValue,
325+
},
326+
},
327+
Spec: policyv1.PodDisruptionBudgetSpec{
328+
Selector: c.Selector.DeepCopy(),
329+
MaxUnavailable: &maxUnavailable,
330+
},
331+
}
332+
if err := ctrlClient.Create(ctx, pdb); err != nil {
333+
return fmt.Errorf("failed to create PDB %s/%s: %w", c.Namespace, name, err)
334+
}
335+
log.Printf("Created temporary PDB %s/%s for %s/%s (maxUnavailable: 1).", c.Namespace, name, c.Kind, c.Name)
336+
return nil
337+
}
338+
339+
// tempPDBName builds a DNS-label-safe PDB name. When the base is too long to
340+
// fit the 63-char limit (with the suffix), it is truncated and a short hash of
341+
// the full base is appended, so two controllers of the same kind+namespace
342+
// whose names share a truncated prefix don't collide on the same PDB name.
343+
func tempPDBName(kind, controllerName string) string {
344+
base := strings.ToLower(kind) + "-" + controllerName
345+
if len(base)+len(pdbNameSuffix) <= validation.DNS1123LabelMaxLength {
346+
return base + pdbNameSuffix
347+
}
348+
const hashLen = 8
349+
sum := sha256.Sum256([]byte(base))
350+
hash := hex.EncodeToString(sum[:hashLen/2]) // hashLen/2 bytes → hashLen hex chars
351+
// -1 leaves room for the '-' separating the truncated base from the hash.
352+
keep := validation.DNS1123LabelMaxLength - len(pdbNameSuffix) - hashLen - 1
353+
return base[:keep] + "-" + hash + pdbNameSuffix
16354
}

0 commit comments

Comments
 (0)