@@ -2,15 +2,353 @@ package evict
22
33import (
44 "context"
5+ "crypto/sha256"
6+ "encoding/hex"
7+ "errors"
8+ "fmt"
9+ "log"
10+ "reflect"
11+ "slices"
12+ "strings"
513
14+ "github.com/samber/lo"
15+ appsv1 "k8s.io/api/apps/v1"
16+ corev1 "k8s.io/api/core/v1"
17+ policyv1 "k8s.io/api/policy/v1"
18+ apierrors "k8s.io/apimachinery/pkg/api/errors"
19+ metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
20+ "k8s.io/apimachinery/pkg/runtime"
21+ "k8s.io/apimachinery/pkg/util/intstr"
22+ "k8s.io/apimachinery/pkg/util/validation"
623 "k8s.io/client-go/kubernetes"
24+ "k8s.io/client-go/tools/pager"
725 "sigs.k8s.io/controller-runtime/pkg/client"
26+
27+ commonk8s "github.com/DataDog/datadog-operator/cmd/kubectl-datadog/autoscaling/cluster/common/k8s"
28+ )
29+
30+ // Label keys used to mark PodDisruptionBudgets created by this command. Both
31+ // must be present for cleanup to consider the PDB ours — this makes the
32+ // cleanup safe against accidentally removing a user PDB with a colliding name.
33+ const (
34+ pdbManagedByLabelKey = "app.kubernetes.io/managed-by"
35+ pdbManagedByLabelValue = "kubectl-datadog"
36+ pdbTempLabelKey = "autoscaling.datadoghq.com/temporary-pdb"
37+ pdbTempLabelValue = "true"
38+ // pdbNameSuffix is appended to the controller name to form the temp PDB
39+ // name.
40+ pdbNameSuffix = "-evict-legacy"
841)
942
1043func ensureTempPDBs (ctx context.Context , clientset kubernetes.Interface , ctrlClient client.Client , targets []Target , dryRun bool ) error {
11- panic ("TODO: ensureTempPDBs — implemented in PR https://github.com/DataDog/datadog-operator/pull/3178" )
44+ allNodes := lo .FlatMap (targets , func (t Target , _ int ) []string { return t .Nodes })
45+ nodeSet := lo .SliceToMap (allNodes , func (n string ) (string , struct {}) { return n , struct {}{} })
46+ if len (nodeSet ) == 0 {
47+ return nil
48+ }
49+
50+ controllers , err := discoverControllers (ctx , clientset , nodeSet )
51+ if err != nil {
52+ return fmt .Errorf ("failed to discover controllers: %w" , err )
53+ }
54+ if len (controllers ) == 0 {
55+ return nil
56+ }
57+
58+ // Group controllers by namespace to amortize the per-namespace PDB list.
59+ byNamespace := make (map [string ][]controllerInfo )
60+ for _ , c := range controllers {
61+ byNamespace [c .Namespace ] = append (byNamespace [c .Namespace ], c )
62+ }
63+
64+ var errs []error
65+ for ns , ctrls := range byNamespace {
66+ existing , err := listNamespacePDBs (ctx , clientset , ns )
67+ if err != nil {
68+ errs = append (errs , fmt .Errorf ("namespace %s: failed to list PDBs: %w" , ns , err ))
69+ continue
70+ }
71+ for _ , c := range ctrls {
72+ if hasUserPDB (existing , c .Selector ) {
73+ continue
74+ }
75+ if err := createTempPDB (ctx , ctrlClient , c , dryRun ); err != nil {
76+ errs = append (errs , fmt .Errorf ("controller %s/%s/%s: %w" , c .Namespace , c .Kind , c .Name , err ))
77+ }
78+ }
79+ }
80+ return errors .Join (errs ... )
1281}
1382
1483func cleanupTempPDBs (ctx context.Context , ctrlClient client.Client , dryRun bool ) error {
15- panic ("TODO: cleanupTempPDBs — implemented in PR https://github.com/DataDog/datadog-operator/pull/3178" )
84+ list := & policyv1.PodDisruptionBudgetList {}
85+ if err := ctrlClient .List (ctx , list , client.MatchingLabels {
86+ pdbManagedByLabelKey : pdbManagedByLabelValue ,
87+ pdbTempLabelKey : pdbTempLabelValue ,
88+ }); err != nil {
89+ return fmt .Errorf ("failed to list temporary PDBs: %w" , err )
90+ }
91+ if len (list .Items ) == 0 {
92+ return nil
93+ }
94+ var errs []error
95+ for i := range list .Items {
96+ pdb := & list .Items [i ]
97+ if dryRun {
98+ log .Printf ("[dry-run] would delete PDB %s/%s" , pdb .Namespace , pdb .Name )
99+ continue
100+ }
101+ if err := commonk8s .Delete (ctx , ctrlClient , pdb ); err != nil {
102+ errs = append (errs , fmt .Errorf ("PDB %s/%s: %w" , pdb .Namespace , pdb .Name , err ))
103+ }
104+ }
105+ return errors .Join (errs ... )
106+ }
107+
108+ // reclaimLeakedTempPDBs deletes temporary PDBs left behind by an interrupted
109+ // previous run. It is a no-op when temp-PDB management is disabled. Failures are
110+ // logged rather than returned: this runs on the no-op exit path (nothing left
111+ // to evict), where reclaiming a leftover PDB is best-effort, not a reason to
112+ // fail the command.
113+ func reclaimLeakedTempPDBs (ctx context.Context , ctrlClient client.Client , ensurePDBs , dryRun bool ) {
114+ if ! ensurePDBs {
115+ return
116+ }
117+ if err := cleanupTempPDBs (ctx , ctrlClient , dryRun ); err != nil {
118+ log .Printf ("Warning: failed to cleanup leftover temporary PDBs: %v" , err )
119+ }
120+ }
121+
122+ // controllerKey identifies a top-level controller that owns evictable pods on
123+ // our target nodes. It is the dedup key in the seen map and the identity half
124+ // of controllerInfo.
125+ type controllerKey struct {
126+ Namespace string
127+ Kind string // "Deployment", "StatefulSet", "ReplicaSet"
128+ Name string
129+ }
130+
131+ // controllerInfo is a controllerKey plus the controller's pod selector — what a
132+ // PDB would match on.
133+ type controllerInfo struct {
134+ controllerKey
135+ Selector * metav1.LabelSelector
136+ }
137+
138+ // discoverControllers lists every Pod cluster-wide once (paginated) and, for
139+ // each Pod scheduled on one of the target nodes, resolves the top-level
140+ // controller. Listing once and filtering client-side avoids the N-API-calls
141+ // problem of doing one List per node, which on a large legacy fleet would
142+ // dominate the command's wall-clock time. The resulting slice contains each
143+ // controller at most once.
144+ func discoverControllers (ctx context.Context , clientset kubernetes.Interface , nodeSet map [string ]struct {}) ([]controllerInfo , error ) {
145+ seen := make (map [controllerKey ]* metav1.LabelSelector )
146+ depCache := make (map [client.ObjectKey ]* appsv1.Deployment )
147+ rsCache := make (map [client.ObjectKey ]* appsv1.ReplicaSet )
148+ stsCache := make (map [client.ObjectKey ]* appsv1.StatefulSet )
149+
150+ p := pager .New (func (ctx context.Context , opts metav1.ListOptions ) (runtime.Object , error ) {
151+ return clientset .CoreV1 ().Pods (metav1 .NamespaceAll ).List (ctx , opts )
152+ })
153+ if err := p .EachListItem (ctx , metav1.ListOptions {}, func (obj runtime.Object ) error {
154+ pod := obj .(* corev1.Pod )
155+ if _ , onTarget := nodeSet [pod .Spec .NodeName ]; ! onTarget {
156+ return nil
157+ }
158+ if shouldSkipEviction (pod ) {
159+ return nil
160+ }
161+ info , err := resolveTopLevelController (ctx , clientset , pod , depCache , rsCache , stsCache )
162+ if err != nil {
163+ if apierrors .IsNotFound (err ) {
164+ log .Printf ("Warning: controller for pod %s/%s not found (deleted mid-scan); skipping" , pod .Namespace , pod .Name )
165+ return nil
166+ }
167+ return fmt .Errorf ("resolve controller for pod %s/%s: %w" , pod .Namespace , pod .Name , err )
168+ }
169+ if info == nil {
170+ return nil
171+ }
172+ seen [info .controllerKey ] = info .Selector
173+ return nil
174+ }); err != nil {
175+ return nil , fmt .Errorf ("list pods: %w" , err )
176+ }
177+
178+ return lo .MapToSlice (seen , func (key controllerKey , selector * metav1.LabelSelector ) controllerInfo {
179+ return controllerInfo {controllerKey : key , Selector : selector }
180+ }), nil
181+ }
182+
183+ // resolveTopLevelController walks a Pod's owner chain up to the workload
184+ // controller (Deployment > ReplicaSet > Pod, StatefulSet > Pod). Returns nil
185+ // for Pods whose top-level owner is not a stable workload — Jobs (TTL-managed),
186+ // DaemonSets (already skipped at eviction), or bare Pods.
187+ func resolveTopLevelController (
188+ ctx context.Context ,
189+ clientset kubernetes.Interface ,
190+ pod * corev1.Pod ,
191+ depCache map [client.ObjectKey ]* appsv1.Deployment ,
192+ rsCache map [client.ObjectKey ]* appsv1.ReplicaSet ,
193+ stsCache map [client.ObjectKey ]* appsv1.StatefulSet ,
194+ ) (* controllerInfo , error ) {
195+ owner := metav1 .GetControllerOf (pod )
196+ if owner == nil {
197+ return nil , nil
198+ }
199+ switch owner .Kind {
200+ case "ReplicaSet" :
201+ rs , err := getCached (ctx , rsCache , pod .Namespace , owner .Name , clientset .AppsV1 ().ReplicaSets (pod .Namespace ).Get )
202+ if err != nil {
203+ return nil , err
204+ }
205+ rsOwner := metav1 .GetControllerOf (rs )
206+ if rsOwner != nil && rsOwner .Kind == "Deployment" {
207+ dep , err := getCached (ctx , depCache , pod .Namespace , rsOwner .Name , clientset .AppsV1 ().Deployments (pod .Namespace ).Get )
208+ if err == nil {
209+ return & controllerInfo {
210+ controllerKey : controllerKey {Namespace : pod .Namespace , Kind : "Deployment" , Name : dep .Name },
211+ Selector : dep .Spec .Selector ,
212+ }, nil
213+ }
214+ if ! apierrors .IsNotFound (err ) {
215+ return nil , err
216+ }
217+ }
218+ return & controllerInfo {
219+ controllerKey : controllerKey {Namespace : pod .Namespace , Kind : "ReplicaSet" , Name : rs .Name },
220+ Selector : rs .Spec .Selector ,
221+ }, nil
222+ case "StatefulSet" :
223+ sts , err := getCached (ctx , stsCache , pod .Namespace , owner .Name , clientset .AppsV1 ().StatefulSets (pod .Namespace ).Get )
224+ if err != nil {
225+ return nil , err
226+ }
227+ return & controllerInfo {
228+ controllerKey : controllerKey {Namespace : pod .Namespace , Kind : "StatefulSet" , Name : sts .Name },
229+ Selector : sts .Spec .Selector ,
230+ }, nil
231+ default :
232+ // DaemonSet (skipped before reaching here), Job (TTL), CronJob,
233+ // custom controllers — none get a temporary PDB.
234+ return nil , nil
235+ }
236+ }
237+
238+ // getCached returns the object identified by (ns, name) from cache, fetching it
239+ // via get — the namespace-bound typed clientset accessor, e.g.
240+ // clientset.AppsV1().ReplicaSets(ns).Get — and populating the cache on a miss.
241+ // T is inferred from the cache's value type, collapsing the per-kind getters
242+ // into a single generic lookup.
243+ func getCached [T any ](
244+ ctx context.Context ,
245+ cache map [client.ObjectKey ]* T ,
246+ ns , name string ,
247+ get func (ctx context.Context , name string , opts metav1.GetOptions ) (* T , error ),
248+ ) (* T , error ) {
249+ key := client.ObjectKey {Namespace : ns , Name : name }
250+ if obj , ok := cache [key ]; ok {
251+ return obj , nil
252+ }
253+ obj , err := get (ctx , name , metav1.GetOptions {})
254+ if err != nil {
255+ return nil , err
256+ }
257+ cache [key ] = obj
258+ return obj , nil
259+ }
260+
261+ // listNamespacePDBs returns every PDB in the namespace. Used to detect
262+ // pre-existing user PDBs covering a controller we'd otherwise PDB-protect.
263+ func listNamespacePDBs (ctx context.Context , clientset kubernetes.Interface , namespace string ) ([]policyv1.PodDisruptionBudget , error ) {
264+ list , err := clientset .PolicyV1 ().PodDisruptionBudgets (namespace ).List (ctx , metav1.ListOptions {})
265+ if err != nil {
266+ return nil , err
267+ }
268+ return list .Items , nil
269+ }
270+
271+ // hasUserPDB returns true when an existing non-temporary PDB has the same
272+ // selector as the controller's pod selector. This is a conservative
273+ // equality check: a broader user PDB will NOT be detected, and we'll create
274+ // our own. Eviction will then respect the most restrictive of the two,
275+ // preserving the user's intent.
276+ func hasUserPDB (existing []policyv1.PodDisruptionBudget , controllerSelector * metav1.LabelSelector ) bool {
277+ if controllerSelector == nil {
278+ return false
279+ }
280+ return slices .ContainsFunc (existing , func (pdb policyv1.PodDisruptionBudget ) bool {
281+ return ! isTemporaryPDB (& pdb ) && reflect .DeepEqual (pdb .Spec .Selector , controllerSelector )
282+ })
283+ }
284+
285+ func isTemporaryPDB (pdb * policyv1.PodDisruptionBudget ) bool {
286+ return pdb .Labels [pdbManagedByLabelKey ] == pdbManagedByLabelValue &&
287+ pdb .Labels [pdbTempLabelKey ] == pdbTempLabelValue
288+ }
289+
290+ // createTempPDB writes (or no-ops if our PDB already exists) a temporary
291+ // PodDisruptionBudget with maxUnavailable: 1. Existing PDBs that aren't ours
292+ // at the same name are left alone with a logged warning — that's a name
293+ // collision the user must resolve.
294+ func createTempPDB (ctx context.Context , ctrlClient client.Client , c controllerInfo , dryRun bool ) error {
295+ name := tempPDBName (c .Kind , c .Name )
296+ if dryRun {
297+ log .Printf ("[dry-run] would create PDB %s/%s (maxUnavailable: 1, selector: %s)" , c .Namespace , name , metav1 .FormatLabelSelector (c .Selector ))
298+ return nil
299+ }
300+
301+ existing := & policyv1.PodDisruptionBudget {}
302+ err := ctrlClient .Get (ctx , client.ObjectKey {Namespace : c .Namespace , Name : name }, existing )
303+ switch {
304+ case err == nil :
305+ if ! isTemporaryPDB (existing ) {
306+ log .Printf ("Warning: PDB %s/%s exists but is not labelled as temporary; leaving it untouched" , c .Namespace , name )
307+ return nil
308+ }
309+ // Our PDB from a previous (possibly crashed) run. Leave as-is; the
310+ // cleanup step will remove it at the end of the current run.
311+ return nil
312+ case ! apierrors .IsNotFound (err ):
313+ return fmt .Errorf ("failed to get PDB %s/%s: %w" , c .Namespace , name , err )
314+ }
315+
316+ maxUnavailable := intstr .FromInt (1 )
317+ pdb := & policyv1.PodDisruptionBudget {
318+ TypeMeta : metav1.TypeMeta {APIVersion : "policy/v1" , Kind : "PodDisruptionBudget" },
319+ ObjectMeta : metav1.ObjectMeta {
320+ Name : name ,
321+ Namespace : c .Namespace ,
322+ Labels : map [string ]string {
323+ pdbManagedByLabelKey : pdbManagedByLabelValue ,
324+ pdbTempLabelKey : pdbTempLabelValue ,
325+ },
326+ },
327+ Spec : policyv1.PodDisruptionBudgetSpec {
328+ Selector : c .Selector .DeepCopy (),
329+ MaxUnavailable : & maxUnavailable ,
330+ },
331+ }
332+ if err := ctrlClient .Create (ctx , pdb ); err != nil {
333+ return fmt .Errorf ("failed to create PDB %s/%s: %w" , c .Namespace , name , err )
334+ }
335+ log .Printf ("Created temporary PDB %s/%s for %s/%s (maxUnavailable: 1)." , c .Namespace , name , c .Kind , c .Name )
336+ return nil
337+ }
338+
339+ // tempPDBName builds a DNS-label-safe PDB name. When the base is too long to
340+ // fit the 63-char limit (with the suffix), it is truncated and a short hash of
341+ // the full base is appended, so two controllers of the same kind+namespace
342+ // whose names share a truncated prefix don't collide on the same PDB name.
343+ func tempPDBName (kind , controllerName string ) string {
344+ base := strings .ToLower (kind ) + "-" + controllerName
345+ if len (base )+ len (pdbNameSuffix ) <= validation .DNS1123LabelMaxLength {
346+ return base + pdbNameSuffix
347+ }
348+ const hashLen = 8
349+ sum := sha256 .Sum256 ([]byte (base ))
350+ hash := hex .EncodeToString (sum [:hashLen / 2 ]) // hashLen/2 bytes → hashLen hex chars
351+ // -1 leaves room for the '-' separating the truncated base from the hash.
352+ keep := validation .DNS1123LabelMaxLength - len (pdbNameSuffix ) - hashLen - 1
353+ return base [:keep ] + "-" + hash + pdbNameSuffix
16354}
0 commit comments