Skip to content

Commit e82e571

Browse files
committed
require explicit enablement of awaiting csi driver in the untaint controller
1 parent 6bec367 commit e82e571

10 files changed

Lines changed: 150 additions & 127 deletions

File tree

cmd/main.go

Lines changed: 32 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -148,6 +148,7 @@ type options struct {
148148
datadogGenericResourceEnabled bool
149149
datadogCSIDriverEnabled bool
150150
untaintControllerEnabled bool
151+
untaintControllerWaitForCSIDriver bool
151152

152153
// Secret Backend options
153154
secretBackendCommand string
@@ -188,6 +189,8 @@ func (opts *options) Parse() {
188189
flag.BoolVar(&opts.datadogGenericResourceEnabled, "datadogGenericResourceEnabled", false, "Enable the DatadogGenericResource controller")
189190
flag.BoolVar(&opts.datadogCSIDriverEnabled, "datadogCSIDriverEnabled", false, "Enable the DatadogCSIDriver controller")
190191
flag.BoolVar(&opts.untaintControllerEnabled, "untaintControllerEnabled", false, "Enable the Untaint controller")
192+
flag.BoolVar(&opts.untaintControllerWaitForCSIDriver, "untaintControllerWaitForCSIDriver", false,
193+
"When true (requires --untaintControllerEnabled), the Untaint controller removes the startup taint only after both the node Agent and Datadog CSI node-server pods are Ready. Requires Pod watch coverage of CSI namespaces (DD_CSIDRIVER_WATCH_NAMESPACE).")
191194

192195
// DatadogAgentInternal
193196
flag.BoolVar(&opts.createControllerRevisions, "createControllerRevisions", false, "Enable creation of ControllerRevision snapshots on each DDA spec change")
@@ -235,6 +238,10 @@ func run(opts *options) error {
235238
}
236239
version.PrintVersionLogs(setupLog)
237240

241+
if opts.untaintControllerWaitForCSIDriver && !opts.untaintControllerEnabled {
242+
return setupErrorf(setupLog, fmt.Errorf("invalid flags"), "--untaintControllerWaitForCSIDriver requires --untaintControllerEnabled=true")
243+
}
244+
238245
// submits the maximum go routine setting as a metric
239246
metrics.MaxGoroutines.Set(float64(opts.maximumGoroutines))
240247

@@ -287,15 +294,16 @@ func run(opts *options) error {
287294
RenewDeadline: &renewDeadline,
288295
RetryPeriod: &retryPeriod,
289296
Cache: config.CacheOptions(setupLog, config.WatchOptions{
290-
DatadogAgentEnabled: opts.datadogAgentEnabled,
291-
DatadogMonitorEnabled: opts.datadogMonitorEnabled,
292-
DatadogSLOEnabled: opts.datadogSLOEnabled,
293-
DatadogAgentProfileEnabled: opts.datadogAgentProfileEnabled,
294-
IntrospectionEnabled: opts.introspectionEnabled,
295-
DatadogDashboardEnabled: opts.datadogDashboardEnabled,
296-
DatadogGenericResourceEnabled: opts.datadogGenericResourceEnabled,
297-
DatadogCSIDriverEnabled: opts.datadogCSIDriverEnabled,
298-
UntaintControllerEnabled: opts.untaintControllerEnabled,
297+
DatadogAgentEnabled: opts.datadogAgentEnabled,
298+
DatadogMonitorEnabled: opts.datadogMonitorEnabled,
299+
DatadogSLOEnabled: opts.datadogSLOEnabled,
300+
DatadogAgentProfileEnabled: opts.datadogAgentProfileEnabled,
301+
IntrospectionEnabled: opts.introspectionEnabled,
302+
DatadogDashboardEnabled: opts.datadogDashboardEnabled,
303+
DatadogGenericResourceEnabled: opts.datadogGenericResourceEnabled,
304+
DatadogCSIDriverEnabled: opts.datadogCSIDriverEnabled,
305+
UntaintControllerEnabled: opts.untaintControllerEnabled,
306+
UntaintControllerWaitForCSIDriver: opts.untaintControllerWaitForCSIDriver,
299307
}),
300308
// UsePriorityQueue makes all controllers use the priority queue, which
301309
// directly registers workqueue metrics into controller-runtime's metrics
@@ -366,20 +374,21 @@ func run(opts *options) error {
366374
CanaryAutoPauseMaxSlowStartDuration: opts.edsCanaryAutoPauseMaxSlowStartDuration,
367375
MaxPodSchedulerFailure: opts.edsMaxPodSchedulerFailure,
368376
},
369-
SupportCilium: opts.supportCilium,
370-
CredsManager: credsManager,
371-
DatadogAgentEnabled: opts.datadogAgentEnabled,
372-
CreateControllerRevisions: opts.createControllerRevisions && opts.datadogAgentEnabled,
373-
DatadogMonitorEnabled: opts.datadogMonitorEnabled,
374-
DatadogSLOEnabled: opts.datadogSLOEnabled,
375-
OperatorMetricsEnabled: opts.operatorMetricsEnabled,
376-
V2APIEnabled: true,
377-
IntrospectionEnabled: opts.introspectionEnabled,
378-
DatadogAgentProfileEnabled: opts.datadogAgentProfileEnabled,
379-
DatadogDashboardEnabled: opts.datadogDashboardEnabled,
380-
DatadogGenericResourceEnabled: opts.datadogGenericResourceEnabled,
381-
DatadogCSIDriverEnabled: opts.datadogCSIDriverEnabled,
382-
UntaintControllerEnabled: opts.untaintControllerEnabled,
377+
SupportCilium: opts.supportCilium,
378+
CredsManager: credsManager,
379+
DatadogAgentEnabled: opts.datadogAgentEnabled,
380+
CreateControllerRevisions: opts.createControllerRevisions && opts.datadogAgentEnabled,
381+
DatadogMonitorEnabled: opts.datadogMonitorEnabled,
382+
DatadogSLOEnabled: opts.datadogSLOEnabled,
383+
OperatorMetricsEnabled: opts.operatorMetricsEnabled,
384+
V2APIEnabled: true,
385+
IntrospectionEnabled: opts.introspectionEnabled,
386+
DatadogAgentProfileEnabled: opts.datadogAgentProfileEnabled,
387+
DatadogDashboardEnabled: opts.datadogDashboardEnabled,
388+
DatadogGenericResourceEnabled: opts.datadogGenericResourceEnabled,
389+
DatadogCSIDriverEnabled: opts.datadogCSIDriverEnabled,
390+
UntaintControllerEnabled: opts.untaintControllerEnabled,
391+
UntaintControllerWaitForCSIDriver: opts.untaintControllerWaitForCSIDriver,
383392
}
384393

385394
versionInfo, platformInfo, err := getVersionAndPlatformInfo(rest.CopyConfig(mgr.GetConfig()))

docs/untaint_controller.md

Lines changed: 25 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -10,13 +10,13 @@ readiness criteria are met (see below), or after a configurable timeout. It is
1010
intended to run alongside a separate mechanism (cluster-autoscaler hook, CCM,
1111
admission webhook, etc.) that adds the taint to new nodes.
1212

13-
**With `--untaintControllerEnabled` only** (or with `--datadogCSIDriverEnabled=false`):
13+
**With `--untaintControllerEnabled=true` only** (and without `--untaintControllerWaitForCSIDriver`):
1414
the controller removes the taint once the **node Agent** pod
1515
(`agent.datadoghq.com/component=agent`) on that node is `Ready`. Agent pods are
1616
listed in the operator's agent watch namespaces (`WATCH_NAMESPACE` /
1717
`DD_AGENT_WATCH_NAMESPACE`).
1818

19-
**With both `--untaintControllerEnabled=true` and `--datadogCSIDriverEnabled=true`:**
19+
**With `--untaintControllerEnabled=true` and `--untaintControllerWaitForCSIDriver=true`:**
2020
the controller waits until **both** the node Agent and **CSI
2121
node-server** pod (`app=datadog-csi-driver-node-server`) on the node are
2222
`Ready` before removing the taint. The taint stays until both are
@@ -25,19 +25,26 @@ satisfied or a timeout fires. The operator's Pod informer then watches the
2525
pods in those namespaces—keep namespaces tight). Ensure CSI namespaces are
2626
covered so the controller can list CSI pod status.
2727

28+
**`--datadogCSIDriverEnabled`** only controls whether the **DatadogCSIDriver**
29+
controller runs; it does **not** by itself turn on dual-readiness untaint.
30+
Enable `--untaintControllerWaitForCSIDriver` only when you actually deploy CSI
31+
node-server pods on tainted nodes (for example via a `DatadogCSIDriver` CR with
32+
the operator's CSI controller enabled, or another install path that produces
33+
the same pod labels).
34+
2835
If a required pod never reaches Ready on a tainted node, a configurable timeout
2936
policy ensures the node is never permanently unschedulable. Two clocks cover
3037
the main failure modes:
3138

3239
- **Readiness timeout** — at least one Agent pod is on the node but the Agent
33-
is not Ready yet, **or** (with CSI enabled) the Agent is Ready but a CSI
40+
is not Ready yet, **or** (with `--untaintControllerWaitForCSIDriver`) the Agent is Ready but a CSI
3441
node-server pod exists on the node and is not Ready. Clock: latest
3542
`pod.Status.StartTime` among **Agent** pods in the first case, and among **CSI
3643
node-server** pods only in the second (the Agent’s age does not shorten the
3744
wait for CSI). Pod recreation restarts the window; container restarts inside the
3845
same pod do not.
39-
- **Scheduling timeout** — no Agent pod is on the node, **or** (with CSI
40-
enabled) the Agent is Ready but **no** CSI node-server pod is on the node
46+
- **Scheduling timeout** — no Agent pod is on the node, **or** (with `--untaintControllerWaitForCSIDriver`)
47+
the Agent is Ready but **no** CSI node-server pod is on the node
4148
yet. Clock: `node.metadata.creationTimestamp`. Covers DaemonSets that never
4249
schedule onto the node (taint not tolerated, missing labels, CSI still pulling,
4350
etc.).
@@ -62,18 +69,23 @@ manager:
6269
```yaml
6370
args:
6471
- --untaintControllerEnabled=true
72+
# Optional: require CSI node-server Ready before untainting (see Overview).
73+
- --untaintControllerWaitForCSIDriver=true
6574
```
6675
67-
| `--untaintControllerEnabled` | `--datadogCSIDriverEnabled` | Behavior |
68-
| ----------------------------- | --------------------------- | -------- |
69-
| `false` | any | Untaint controller off; no startup toleration injection for this feature on Agent or CSI. |
70-
| `true` | `false` | Agent-only readiness and Agent DaemonSet toleration (default historical behavior). |
71-
| `true` | `true` | Wait for Agent **and** CSI node-server Ready; widened Pod cache (agent + `DD_CSIDRIVER_WATCH_NAMESPACE` namespaces); toleration on Agent and CSI DaemonSets. |
76+
| `--untaintControllerEnabled` | `--untaintControllerWaitForCSIDriver` | Behavior |
77+
| ----------------------------- | ------------------------------------- | -------- |
78+
| `false` | `false` | Untaint controller off; no Agent startup toleration for this feature. |
79+
| `true` | `false` | Agent-only readiness; Agent DaemonSet startup toleration injected. |
80+
| `true` | `true` | Wait for Agent **and** CSI node-server Ready; widened Pod cache (agent + `DD_CSIDRIVER_WATCH_NAMESPACE` namespaces); startup toleration on Agent and, when the DatadogCSIDriver controller is enabled, on the CSI node DaemonSet. |
81+
82+
`--untaintControllerWaitForCSIDriver` requires `--untaintControllerEnabled=true` (the operator exits on invalid combinations).
7283

73-
When this flag is enabled, the operator injects a toleration for
84+
When `--untaintControllerEnabled` is enabled, the operator injects a toleration for
7485
`agent.datadoghq.com/not-ready=presence:NoSchedule` into the node Agent
7586
DaemonSet (or ExtendedDaemonSet) pod template, unless an equivalent toleration
76-
is already present. When **`--datadogCSIDriverEnabled`** is also true, the same
87+
is already present. When **`--untaintControllerWaitForCSIDriver`** is also true **and**
88+
the DatadogCSIDriver controller is running (`--datadogCSIDriverEnabled=true`), the same
7789
toleration is injected into the **Datadog CSI node-server** DaemonSet pod
7890
template so the CSI workload can schedule on tainted nodes before the taint is
7991
removed.
@@ -104,7 +116,7 @@ Metrics, under the `untaint` Prometheus subsystem:
104116
Kubernetes Events (gated by `DD_UNTAINT_CONTROLLER_EVENTS_ENABLED=true`):
105117

106118
- `TaintRemoved` (Normal) — taint removed after the Agent became Ready, or (when
107-
the Datadog CSI driver controller is also enabled) after both the Agent and
119+
`--untaintControllerWaitForCSIDriver` is enabled) after both the Agent and
108120
CSI node-server pods became Ready.
109121
- `UntaintTimeout` — a timeout fired. Normal under `remove`, Warning under `keep`. Message carries the reason, elapsed time, and policy.
110122

internal/controller/datadogcsidriver/controller.go

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -36,19 +36,19 @@ const (
3636

3737
// Reconciler reconciles a DatadogCSIDriver object
3838
type Reconciler struct {
39-
client client.Client
40-
scheme *runtime.Scheme
41-
recorder record.EventRecorder
42-
untaintControllerEnabled bool
39+
client client.Client
40+
scheme *runtime.Scheme
41+
recorder record.EventRecorder
42+
untaintInjectCSIStartupToleration bool
4343
}
4444

4545
// NewReconciler creates a new DatadogCSIDriver reconciler
46-
func NewReconciler(client client.Client, scheme *runtime.Scheme, recorder record.EventRecorder, untaintControllerEnabled bool) *Reconciler {
46+
func NewReconciler(client client.Client, scheme *runtime.Scheme, recorder record.EventRecorder, untaintInjectCSIStartupToleration bool) *Reconciler {
4747
return &Reconciler{
48-
client: client,
49-
scheme: scheme,
50-
recorder: recorder,
51-
untaintControllerEnabled: untaintControllerEnabled,
48+
client: client,
49+
scheme: scheme,
50+
recorder: recorder,
51+
untaintInjectCSIStartupToleration: untaintInjectCSIStartupToleration,
5252
}
5353
}
5454

@@ -203,7 +203,7 @@ func (r *Reconciler) reconcileCSIDriver(ctx context.Context, instance *v1alpha1.
203203
func (r *Reconciler) reconcileDaemonSet(ctx context.Context, instance *v1alpha1.DatadogCSIDriver) error {
204204
logger := ctrl.LoggerFrom(ctx)
205205
desired := buildDaemonSet(instance)
206-
if r.untaintControllerEnabled {
206+
if r.untaintInjectCSIStartupToleration {
207207
componentagent.EnsureAgentNotReadyStartupToleration(logger, &desired.Spec.Template.Spec)
208208
}
209209

internal/controller/datadogcsidriver/controller_test.go

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ const (
3737
testName = "datadog-csi"
3838
)
3939

40-
func newTestReconciler(t *testing.T, untaintControllerEnabled bool, objects ...client.Object) (*Reconciler, client.Client) {
40+
func newTestReconciler(t *testing.T, injectCSIStartupToleration bool, objects ...client.Object) (*Reconciler, client.Client) {
4141
t.Helper()
4242
s := scheme.Scheme
4343
s.AddKnownTypes(v1alpha1.GroupVersion,
@@ -54,7 +54,7 @@ func newTestReconciler(t *testing.T, untaintControllerEnabled bool, objects ...c
5454
// Set the default controller-runtime logger so ctrl.LoggerFrom(ctx) works in tests
5555
ctrl.SetLogger(zap.New(zap.UseDevMode(true)))
5656
recorder := record.NewFakeRecorder(10)
57-
r := NewReconciler(c, s, recorder, untaintControllerEnabled)
57+
r := NewReconciler(c, s, recorder, injectCSIStartupToleration)
5858

5959
return r, c
6060
}
@@ -489,7 +489,7 @@ func TestReconcile_CSIDriverSpecDriftIsReconciled(t *testing.T) {
489489
assert.Contains(t, csiDriver.Spec.VolumeLifecycleModes, storagev1.VolumeLifecycleEphemeral)
490490
}
491491

492-
func TestReconcile_DaemonSetIncludesStartupTolerationWhenUntaintEnabled(t *testing.T) {
492+
func TestReconcile_DaemonSetIncludesStartupTolerationWhenUntaintWaitForCSI(t *testing.T) {
493493
instance := defaultCSIDriverCR()
494494
r, c := newTestReconciler(t, true, instance)
495495
ctx := context.Background()
@@ -508,7 +508,7 @@ func TestReconcile_DaemonSetIncludesStartupTolerationWhenUntaintEnabled(t *testi
508508
"expected %+v in %+v", want, ds.Spec.Template.Spec.Tolerations)
509509
}
510510

511-
func TestReconcile_DaemonSetOmitsStartupTolerationWhenUntaintDisabled(t *testing.T) {
511+
func TestReconcile_DaemonSetOmitsStartupTolerationWhenUntaintCoordinationOff(t *testing.T) {
512512
instance := defaultCSIDriverCR()
513513
r, c := newTestReconciler(t, false, instance)
514514
ctx := context.Background()

internal/controller/datadogcsidriver_controller.go

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -27,11 +27,11 @@ import (
2727

2828
// DatadogCSIDriverReconciler reconciles a DatadogCSIDriver object.
2929
type DatadogCSIDriverReconciler struct {
30-
Client client.Client
31-
Scheme *runtime.Scheme
32-
Recorder record.EventRecorder
33-
UntaintControllerEnabled bool
34-
internal *datadogcsidriver.Reconciler
30+
Client client.Client
31+
Scheme *runtime.Scheme
32+
Recorder record.EventRecorder
33+
UntaintInjectCSIStartupToleration bool
34+
internal *datadogcsidriver.Reconciler
3535
}
3636

3737
// RBACs for DatadogCSIDriver objects
@@ -61,7 +61,7 @@ func (r *DatadogCSIDriverReconciler) Reconcile(ctx context.Context, instance *da
6161

6262
// SetupWithManager creates a new DatadogCSIDriver controller.
6363
func (r *DatadogCSIDriverReconciler) SetupWithManager(mgr ctrl.Manager) error {
64-
r.internal = datadogcsidriver.NewReconciler(r.Client, r.Scheme, r.Recorder, r.UntaintControllerEnabled)
64+
r.internal = datadogcsidriver.NewReconciler(r.Client, r.Scheme, r.Recorder, r.UntaintInjectCSIStartupToleration)
6565

6666
or := reconcile.AsReconciler[*datadoghqv1alpha1.DatadogCSIDriver](r.Client, r)
6767
return ctrl.NewControllerManagedBy(mgr).

internal/controller/setup.go

Lines changed: 23 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -34,22 +34,23 @@ const (
3434

3535
// SetupOptions defines options for setting up controllers to ease testing
3636
type SetupOptions struct {
37-
SupportExtendedDaemonset ExtendedDaemonsetOptions
38-
SupportCilium bool
39-
CredsManager *config.CredentialManager
40-
DatadogAgentEnabled bool
41-
DatadogMonitorEnabled bool
42-
DatadogSLOEnabled bool
43-
OperatorMetricsEnabled bool
44-
V2APIEnabled bool
45-
IntrospectionEnabled bool
46-
DatadogAgentProfileEnabled bool
47-
OtelAgentEnabled bool
48-
DatadogDashboardEnabled bool
49-
DatadogGenericResourceEnabled bool
50-
CreateControllerRevisions bool
51-
DatadogCSIDriverEnabled bool
52-
UntaintControllerEnabled bool
37+
SupportExtendedDaemonset ExtendedDaemonsetOptions
38+
SupportCilium bool
39+
CredsManager *config.CredentialManager
40+
DatadogAgentEnabled bool
41+
DatadogMonitorEnabled bool
42+
DatadogSLOEnabled bool
43+
OperatorMetricsEnabled bool
44+
V2APIEnabled bool
45+
IntrospectionEnabled bool
46+
DatadogAgentProfileEnabled bool
47+
OtelAgentEnabled bool
48+
DatadogDashboardEnabled bool
49+
DatadogGenericResourceEnabled bool
50+
CreateControllerRevisions bool
51+
DatadogCSIDriverEnabled bool
52+
UntaintControllerEnabled bool
53+
UntaintControllerWaitForCSIDriver bool
5354
}
5455

5556
// ExtendedDaemonsetOptions defines ExtendedDaemonset options
@@ -250,7 +251,7 @@ func startUntaint(logger logr.Logger, mgr manager.Manager, _ kubernetes.Platform
250251
mgr.GetClient(),
251252
ctrl.Log.WithName("controllers").WithName(untaintControllerName),
252253
mgr.GetEventRecorderFor(untaintControllerName),
253-
options.DatadogCSIDriverEnabled,
254+
options.UntaintControllerWaitForCSIDriver,
254255
)
255256
if err != nil {
256257
return fmt.Errorf("untaint controller setup: %w", err)
@@ -279,9 +280,10 @@ func startDatadogCSIDriver(logger logr.Logger, mgr manager.Manager, pInfo kubern
279280
}
280281

281282
return (&DatadogCSIDriverReconciler{
282-
Client: mgr.GetClient(),
283-
Scheme: mgr.GetScheme(),
284-
Recorder: mgr.GetEventRecorderFor(csiDriverControllerName),
285-
UntaintControllerEnabled: options.UntaintControllerEnabled,
283+
Client: mgr.GetClient(),
284+
Scheme: mgr.GetScheme(),
285+
Recorder: mgr.GetEventRecorderFor(csiDriverControllerName),
286+
// Inject the startup toleration into the CSI node DaemonSet only when the untaint controller is enabled and configured to wait for the CSI driver.
287+
UntaintInjectCSIStartupToleration: options.UntaintControllerEnabled && options.UntaintControllerWaitForCSIDriver,
286288
}).SetupWithManager(mgr)
287289
}

0 commit comments

Comments
 (0)