fix: failed to drain unmanaged clusterqueue

thxCode · thxCode · commit 1ef027e2e35b · 2026-06-17T21:53:04.000+08:00
Signed-off-by: thxCode <thxcode0824@gmail.com>
diff --git a/.claude/skills/gpustack-operator-e2e/SKILL.md b/.claude/skills/gpustack-operator-e2e/SKILL.md
@@ -244,6 +244,43 @@ kubectl -n "$NS" patch nodefeature "${NODE}-gpustack-worker" --type=merge \
 kubectl -n default delete instance gpustack-e2e-instance
 ```
 
+### 4b. Managed toggle — a *second, independent* drain trigger (run when the change touches the ResourceFlavor/Cohort Node-watch)
+
+Excluding a node from management (`gpustack.ai/managed=false`) must drain its single-node
+ResourceFlavors with the **same** chain as §4 (flavor `schedule.gpustack.ai/drain=true` → ClusterQueue
+`HoldAndDrain` → the InstanceType's running Instances `spec.stop=true`). What is non-obvious is that it
+is a *different trigger on a different code path*:
+
+- A §4 capacity reshape changes a *feature label*, so any feature-prefix predicate fires. **A managed
+  toggle changes only `gpustack.ai/managed`** — no feature label — so it drains **only if** the
+  `ResourceFlavorReconciler`/`CohortReconciler` Node-watch `UpdateFunc` predicates include
+  `systemname.ManagedLabelKey` in their `mapx.EqualWithStringPrefix(...)`
+  (`pkg/worker/controllers/worker/{resourceflavor,cohort}.go`). Missing it is the historical bug: the
+  flavor is never enqueued or drained, while the ClusterQueue silently recomputes to a misleading
+  `0/-1` (Active but negative-remaining) quota and the Instance keeps running.
+- **Restart masks it.** The `For`-watch start-up resync re-reconciles every ResourceFlavor, so a freshly
+  (re)started operator drains the orphan regardless of the predicate. Verify against a **continuously
+  running** operator — do not restart between the toggle and the assertion.
+- Toggle via the NodeFeature, not the node (§4 explains why: NFD reverts a direct node label). The unit
+  cases `unmanaged node drains flavor` / `unmanaged node deletes cohort` only guard the index filter,
+  **not** the predicate — so this live check is the only guard for the enqueue path.
+
+```bash
+NS=gpustack-system; NODE=$(kubectl get nodes -o jsonpath='{.items[0].metadata.name}')
+before=$(kubectl get node "$NODE" -o jsonpath='{.metadata.labels.gpustack\.ai/managed}')
+
+# Toggle out of management; afterwards poll the §4 chain yourself (flavor drain → CQ HoldAndDrain → Instance stop).
+kubectl -n "$NS" patch nodefeature "${NODE}-gpustack-worker" --type=merge \
+  -p '{"spec":{"labels":{"gpustack.ai/managed":"false"}}}'
+
+# Restore (skip if doing a full §6 teardown).
+kubectl -n "$NS" patch nodefeature "${NODE}-gpustack-worker" --type=merge \
+  -p "{\"spec\":{\"labels\":{\"gpustack.ai/managed\":\"${before:-true}\"}}}"
+```
+
+> Toggling a node that hosts a *running* Instance stops that Instance (`spec.stop=true`), so on a shared
+> cluster pick a node whose Instances you can disrupt (or one with none, to assert the flavor/CQ drain alone).
+
 ## 5. Optional — simulated accelerator & drain-recycle (accelerated chain)
 
 This exercises the accelerated chain and the drain-recycle behavior (the `ResourceFlavor` tombstone,
diff --git a/pkg/manager/config.go b/pkg/manager/config.go
@@ -27,7 +27,6 @@ import (
 
 type Config struct {
 	InformerCacheResyncPeriod time.Duration
-	AggressiveEventFiltering  bool
 	LoopbackKubeConfigPath    string
 	LoopbackKubeRestConfig    rest.Config
 	LoopbackKubeHTTPClient    *http.Client
@@ -120,11 +119,10 @@ func (c *Config) Apply(ctx context.Context) (*Manager, error) {
 			return nil, fmt.Errorf("create controller manager: %w", err)
 		}
 		ctrlManager = CtrlManager{
-			Manager:                  rawCtrlManager,
-			aggressiveEventFiltering: c.AggressiveEventFiltering,
-			options:                  ctrlMgrOpts,
-			httpClient:               c.LoopbackKubeHTTPClient,
-			indexedFields:            sets.Set[string]{},
+			Manager:       rawCtrlManager,
+			options:       ctrlMgrOpts,
+			httpClient:    c.LoopbackKubeHTTPClient,
+			indexedFields: sets.Set[string]{},
 		}
 	}
 
diff --git a/pkg/manager/helper.go b/pkg/manager/helper.go
@@ -21,11 +21,10 @@ type (
 	// CtrlManager is a wrapper around ctrl.Manager.
 	CtrlManager struct {
 		ctrl.Manager
-		aggressiveEventFiltering bool
-		options                  ctrl.Options
-		httpClient               *http.Client
-		disableController        bool
-		indexedFields            sets.Set[string]
+		options           ctrl.Options
+		httpClient        *http.Client
+		disableController bool
+		indexedFields     sets.Set[string]
 	}
 
 	// RepeatableCtrlFieldIndexer is a wrapper around ctrlcli.FieldIndexer.
@@ -111,11 +110,6 @@ func (m CtrlManager) GetLeaderElectionNamespacedName() types.NamespacedName {
 	}
 }
 
-// AllowAggressiveEventFiltering returns whether aggressive event filtering is allowed.
-func (m CtrlManager) AllowAggressiveEventFiltering() bool {
-	return m.aggressiveEventFiltering
-}
-
 // _CtrlManagerSentinel is a ctrlmgr.Runnable implementation for observing
 // whether the ctrl.Manager is started.
 type _CtrlManagerSentinel struct {
diff --git a/pkg/manager/option.go b/pkg/manager/option.go
@@ -27,7 +27,6 @@ type Options struct {
 	// Control.
 	GopoolWorkerFactor        int
 	InformerCacheResyncPeriod time.Duration
-	AggressiveEventFiltering  bool
 
 	// Connect Kubernetes.
 	KubeConnTimeout        time.Duration
@@ -83,8 +82,6 @@ func (o *Options) AddFlags(fs *pflag.FlagSet, opts ...FlagOption) {
 			"it is calculated by the number of CPU cores multiplied by this factor.")
 	fs.DurationVar(&o.InformerCacheResyncPeriod, "informer-cache-resync-period", o.InformerCacheResyncPeriod,
 		"the period at which the informer's cache is resynced.")
-	fs.BoolVar(&o.AggressiveEventFiltering, "aggressive-event-filtering", o.AggressiveEventFiltering,
-		"indicates to reduce event filtering threshold to make the controllers more aggressive to react to the changes of the cluster.")
 
 	// Connect Kubernetes.
 	fs.DurationVar(&o.KubeConnTimeout, "kube-conn-timeout", o.KubeConnTimeout,
@@ -220,7 +217,6 @@ func (o *Options) Complete(ctx context.Context) (*Config, error) {
 
 	return &Config{
 		InformerCacheResyncPeriod: o.InformerCacheResyncPeriod,
-		AggressiveEventFiltering:  o.AggressiveEventFiltering,
 		LoopbackKubeConfigPath:    lpCfgPath,
 		LoopbackKubeRestConfig:    *lpRestCfg,
 		LoopbackKubeHTTPClient:    lpHttpCli,
diff --git a/pkg/worker/controllers/worker/clusterqueue.go b/pkg/worker/controllers/worker/clusterqueue.go
@@ -449,7 +449,6 @@ func (r *ClusterQueueReconciler) SetupController(ctx context.Context, opts contr
 	r.APIReader = opts.Manager.GetAPIReader()
 
 	dedupWindow := ctrlhandlerx.NewDedupWindow[ctrlreconcile.Request]()
-	aggressive := opts.Manager.AllowAggressiveEventFiltering()
 
 	return ctrl.NewControllerManagedBy(opts.Manager).
 		Named("clusterqueue").
@@ -510,15 +509,11 @@ func (r *ClusterQueueReconciler) SetupController(ctx context.Context, opts contr
 				// - updated if its feature labels or allocatable resources have changed.
 				ctrlpredicate.Funcs{
 					UpdateFunc: func(e ctrlevent.UpdateEvent) bool {
-						if aggressive {
-							return true
-						}
-
 						oldNd, newNd := e.ObjectOld.(*core.Node), e.ObjectNew.(*core.Node)
 						if newNd.DeletionTimestamp == nil {
 							// Fire when feature labels have changed.
 							if !mapx.EqualWithStringPrefix(oldNd.Labels, newNd.Labels,
-								systemname.LabelPrefix,
+								systemname.ManagedLabelKey,
 								nodefeature.FeatureLabelPrefix,
 								nodefeature.GeneralFeatureLabelPrefix,
 								nodefeature.AcceleratableFeatureLabelPrefix) {
diff --git a/pkg/worker/controllers/worker/cohort.go b/pkg/worker/controllers/worker/cohort.go
@@ -195,7 +195,6 @@ func (r *CohortReconciler) SetupController(ctx context.Context, opts controller.
 	r.Client = opts.Manager.GetClient()
 
 	dedupWindow := ctrlhandlerx.NewDedupWindow[ctrlreconcile.Request]()
-	aggressive := opts.Manager.AllowAggressiveEventFiltering()
 
 	return ctrl.NewControllerManagedBy(opts.Manager).
 		Named("cohort").
@@ -211,17 +210,14 @@ func (r *CohortReconciler) SetupController(ctx context.Context, opts controller.
 				// Trigger reconciliation when a Node is:
 				// - created.
 				// - deleted.
-				// - updated if its feature labels have changed.
+				// - updated if its managed mark or feature labels have changed.
 				ctrlpredicate.Funcs{
 					UpdateFunc: func(e ctrlevent.UpdateEvent) bool {
-						if aggressive {
-							return true
-						}
-
 						oldNd, newNd := e.ObjectOld.(*core.Node), e.ObjectNew.(*core.Node)
 						if newNd.DeletionTimestamp == nil {
-							// Fire when feature labels have changed.
+							// Fire when the managed mark or feature labels have changed.
 							return !mapx.EqualWithStringPrefix(oldNd.Labels, newNd.Labels,
+								systemname.ManagedLabelKey,
 								nodefeature.FeatureLabelPrefix,
 								nodefeature.GeneralFeatureLabelPrefix,
 								nodefeature.AcceleratableFeatureLabelPrefix)
diff --git a/pkg/worker/controllers/worker/cohort_test.go b/pkg/worker/controllers/worker/cohort_test.go
@@ -17,6 +17,7 @@ import (
 	"gpustack.ai/gpustack/pkg/kubeclients/kubernetes/scheme"
 	"gpustack.ai/gpustack/pkg/nodefeature"
 	"gpustack.ai/gpustack/pkg/systemmeta"
+	"gpustack.ai/gpustack/pkg/systemname"
 )
 
 // newInstanceTypeClusterQueue builds an "instancetypes" ClusterQueue that
@@ -57,6 +58,7 @@ func TestCohortReconciler_Reconcile(t *testing.T) {
 		name string
 
 		withNode         bool
+		nodeUnmanaged    bool // node present but gpustack.ai/managed=false
 		withCohort       bool
 		withClusterQueue bool
 
@@ -82,14 +84,29 @@ func TestCohortReconciler_Reconcile(t *testing.T) {
 			name:       "no node no ClusterQueue deletes cohort",
 			withCohort: true,
 		},
+		{
+			// A Node exists but is no longer managed (gpustack.ai/managed=false), so
+			// indexNodeByCohortProfile excludes it. With no ClusterQueue either, the
+			// cohort is idle and must be deleted. Guards the index's managed filter
+			// (the path a node leaving management relies on); does NOT exercise the
+			// Node-watch predicate.
+			name:          "unmanaged node deletes cohort",
+			withNode:      true,
+			nodeUnmanaged: true,
+			withCohort:    true,
+		},
 	}
 
 	for _, c := range cases {
 		c := c
 		t.Run(c.name, func(t *testing.T) {
 			var objs []ctrlcli.Object
 			if c.withNode {
-				objs = append(objs, newGeneralNode("node-1"))
+				nd := newGeneralNode("node-1")
+				if c.nodeUnmanaged {
+					nd.Labels[systemname.ManagedLabelKey] = "false"
+				}
+				objs = append(objs, nd)
 			}
 			if c.withCohort {
 				objs = append(objs, &kueue.Cohort{ObjectMeta: meta.ObjectMeta{Name: cohortName}})
diff --git a/pkg/worker/controllers/worker/nodefeature.go b/pkg/worker/controllers/worker/nodefeature.go
@@ -108,7 +108,6 @@ func (r *NodeFeatureReconciler) Reconcile(ctx context.Context, req ctrl.Request)
 func (r *NodeFeatureReconciler) SetupController(_ context.Context, opts controller.SetupOptions) error {
 	r.Client = opts.Manager.GetClient()
 
-	aggressive := opts.Manager.AllowAggressiveEventFiltering()
 	return ctrl.NewControllerManagedBy(opts.Manager).
 		Named("nodefeature").
 		For(
@@ -122,10 +121,6 @@ func (r *NodeFeatureReconciler) SetupController(_ context.Context, opts controll
 						return false
 					},
 					UpdateFunc: func(e ctrlevent.UpdateEvent) bool {
-						if aggressive {
-							return e.ObjectNew.GetDeletionTimestamp() == nil
-						}
-
 						oldNd, newNd := e.ObjectOld.(*core.Node), e.ObjectNew.(*core.Node)
 						if newNd.DeletionTimestamp == nil {
 							// Fire when labels have changed.
diff --git a/pkg/worker/controllers/worker/resourceflavor.go b/pkg/worker/controllers/worker/resourceflavor.go
@@ -236,8 +236,6 @@ func (r *ResourceFlavorReconciler) SetupController(ctx context.Context, opts con
 
 	r.Client = opts.Manager.GetClient()
 
-	aggressive := opts.Manager.AllowAggressiveEventFiltering()
-
 	return ctrl.NewControllerManagedBy(opts.Manager).
 		Named("resourceflavor").
 		For(
@@ -295,17 +293,15 @@ func (r *ResourceFlavorReconciler) SetupController(ctx context.Context, opts con
 				// Trigger reconciliation when a Node is:
 				// - created.
 				// - deleted (so a flavor losing its last Node gets drained).
-				// - updated if its feature labels or taints have changed.
+				// - updated if its managed mark, feature labels or taints have
+				//   changed (a node leaving management drains its orphaned flavors).
 				ctrlpredicate.Funcs{
 					UpdateFunc: func(e ctrlevent.UpdateEvent) bool {
-						if aggressive {
-							return e.ObjectNew.GetDeletionTimestamp() == nil
-						}
-
 						oldNd, newNd := e.ObjectOld.(*core.Node), e.ObjectNew.(*core.Node)
 						if newNd.DeletionTimestamp == nil {
-							// Fire when labels have changed.
+							// Fire when the managed mark or feature labels have changed.
 							if !mapx.EqualWithStringPrefix(oldNd.Labels, newNd.Labels,
+								systemname.ManagedLabelKey,
 								nodefeature.FeatureLabelPrefix,
 								nodefeature.GeneralFeatureLabelPrefix,
 								nodefeature.AcceleratableFeatureLabelPrefix) {
diff --git a/pkg/worker/controllers/worker/resourceflavor_test.go b/pkg/worker/controllers/worker/resourceflavor_test.go
@@ -89,6 +89,7 @@ func TestResourceFlavorReconciler_Reconcile(t *testing.T) {
 		withFlavor     bool
 		flavorDraining bool
 		withNode       bool
+		nodeUnmanaged  bool // node present but gpustack.ai/managed=false
 
 		wantExists   bool
 		wantDraining bool // _ResourceFlavorDrainAnnoKey present
@@ -128,6 +129,18 @@ func TestResourceFlavorReconciler_Reconcile(t *testing.T) {
 			withNode:   true,
 			wantExists: true,
 		},
+		{
+			// A Node exists but is no longer managed (gpustack.ai/managed=false), so
+			// indexNodeByFlavorProfile excludes it: the flavor is orphaned and must be
+			// marked draining. Guards the index's managed filter (the path a node
+			// leaving management relies on); does NOT exercise the Node-watch predicate.
+			name:          "unmanaged node drains flavor",
+			withFlavor:    true,
+			withNode:      true,
+			nodeUnmanaged: true,
+			wantExists:    true,
+			wantDraining:  true,
+		},
 	}
 
 	for _, c := range cases {
@@ -138,7 +151,11 @@ func TestResourceFlavorReconciler_Reconcile(t *testing.T) {
 				objs = append(objs, newNodesResourceFlavor(flavorName, c.flavorDraining))
 			}
 			if c.withNode {
-				objs = append(objs, newGeneralNode("node-1"))
+				nd := newGeneralNode("node-1")
+				if c.nodeUnmanaged {
+					nd.Labels[systemname.ManagedLabelKey] = "false"
+				}
+				objs = append(objs, nd)
 			}
 			cli := buildFlavorClient(objs...)