Skip to content

Commit 8df2250

Browse files
authored
[CONTP-1610][CONTP-1611] Wait for CSI driver node server pod readiness in untaint controller if csi feature is enabled (#3096)
* [CONTP-1610][CONTP-1611] Wait for CSI driver node server pod readiness in untaint controller if csi feature is enabled
* require explicit enablement of awaiting csi driver in the untaint controller
* increase unit test coverage
* simplify timeout reconciliation: merge agent and csi into one condition
* address final nits
1 parent f36836a commit 8df2250

13 files changed

Lines changed: 780 additions & 185 deletions

cmd/main.go

Lines changed: 32 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -148,6 +148,7 @@ type options struct {
148148
datadogGenericResourceEnabled bool
149149
datadogCSIDriverEnabled bool
150150
untaintControllerEnabled bool
151+
untaintControllerWaitForCSIDriver bool
151152

152153
// Secret Backend options
153154
secretBackendCommand string
@@ -188,6 +189,8 @@ func (opts *options) Parse() {
188189
flag.BoolVar(&opts.datadogGenericResourceEnabled, "datadogGenericResourceEnabled", false, "Enable the DatadogGenericResource controller")
189190
flag.BoolVar(&opts.datadogCSIDriverEnabled, "datadogCSIDriverEnabled", false, "Enable the DatadogCSIDriver controller")
190191
flag.BoolVar(&opts.untaintControllerEnabled, "untaintControllerEnabled", false, "Enable the Untaint controller")
192+
flag.BoolVar(&opts.untaintControllerWaitForCSIDriver, "untaintControllerWaitForCSIDriver", false,
193+
"When true (requires --untaintControllerEnabled), the Untaint controller removes the startup taint only after both the node Agent and Datadog CSI node-server pods are Ready. Requires Pod watch coverage of CSI namespaces (DD_CSIDRIVER_WATCH_NAMESPACE).")
191194

192195
// DatadogAgentInternal
193196
flag.BoolVar(&opts.createControllerRevisions, "createControllerRevisions", false, "Enable creation of ControllerRevision snapshots on each DDA spec change")
@@ -235,6 +238,10 @@ func run(opts *options) error {
235238
}
236239
version.PrintVersionLogs(setupLog)
237240

241+
if opts.untaintControllerWaitForCSIDriver && !opts.untaintControllerEnabled {
242+
return setupErrorf(setupLog, fmt.Errorf("invalid flags"), "--untaintControllerWaitForCSIDriver requires --untaintControllerEnabled=true")
243+
}
244+
238245
// submits the maximum go routine setting as a metric
239246
metrics.MaxGoroutines.Set(float64(opts.maximumGoroutines))
240247

@@ -287,15 +294,16 @@ func run(opts *options) error {
287294
RenewDeadline: &renewDeadline,
288295
RetryPeriod: &retryPeriod,
289296
Cache: config.CacheOptions(setupLog, config.WatchOptions{
290-
DatadogAgentEnabled: opts.datadogAgentEnabled,
291-
DatadogMonitorEnabled: opts.datadogMonitorEnabled,
292-
DatadogSLOEnabled: opts.datadogSLOEnabled,
293-
DatadogAgentProfileEnabled: opts.datadogAgentProfileEnabled,
294-
IntrospectionEnabled: opts.introspectionEnabled,
295-
DatadogDashboardEnabled: opts.datadogDashboardEnabled,
296-
DatadogGenericResourceEnabled: opts.datadogGenericResourceEnabled,
297-
DatadogCSIDriverEnabled: opts.datadogCSIDriverEnabled,
298-
UntaintControllerEnabled: opts.untaintControllerEnabled,
297+
DatadogAgentEnabled: opts.datadogAgentEnabled,
298+
DatadogMonitorEnabled: opts.datadogMonitorEnabled,
299+
DatadogSLOEnabled: opts.datadogSLOEnabled,
300+
DatadogAgentProfileEnabled: opts.datadogAgentProfileEnabled,
301+
IntrospectionEnabled: opts.introspectionEnabled,
302+
DatadogDashboardEnabled: opts.datadogDashboardEnabled,
303+
DatadogGenericResourceEnabled: opts.datadogGenericResourceEnabled,
304+
DatadogCSIDriverEnabled: opts.datadogCSIDriverEnabled,
305+
UntaintControllerEnabled: opts.untaintControllerEnabled,
306+
UntaintControllerWaitForCSIDriver: opts.untaintControllerWaitForCSIDriver,
299307
}),
300308
// UsePriorityQueue makes all controllers use the priority queue, which
301309
// directly registers workqueue metrics into controller-runtime's metrics
@@ -366,20 +374,21 @@ func run(opts *options) error {
366374
CanaryAutoPauseMaxSlowStartDuration: opts.edsCanaryAutoPauseMaxSlowStartDuration,
367375
MaxPodSchedulerFailure: opts.edsMaxPodSchedulerFailure,
368376
},
369-
SupportCilium: opts.supportCilium,
370-
CredsManager: credsManager,
371-
DatadogAgentEnabled: opts.datadogAgentEnabled,
372-
CreateControllerRevisions: opts.createControllerRevisions && opts.datadogAgentEnabled,
373-
DatadogMonitorEnabled: opts.datadogMonitorEnabled,
374-
DatadogSLOEnabled: opts.datadogSLOEnabled,
375-
OperatorMetricsEnabled: opts.operatorMetricsEnabled,
376-
V2APIEnabled: true,
377-
IntrospectionEnabled: opts.introspectionEnabled,
378-
DatadogAgentProfileEnabled: opts.datadogAgentProfileEnabled,
379-
DatadogDashboardEnabled: opts.datadogDashboardEnabled,
380-
DatadogGenericResourceEnabled: opts.datadogGenericResourceEnabled,
381-
DatadogCSIDriverEnabled: opts.datadogCSIDriverEnabled,
382-
UntaintControllerEnabled: opts.untaintControllerEnabled,
377+
SupportCilium: opts.supportCilium,
378+
CredsManager: credsManager,
379+
DatadogAgentEnabled: opts.datadogAgentEnabled,
380+
CreateControllerRevisions: opts.createControllerRevisions && opts.datadogAgentEnabled,
381+
DatadogMonitorEnabled: opts.datadogMonitorEnabled,
382+
DatadogSLOEnabled: opts.datadogSLOEnabled,
383+
OperatorMetricsEnabled: opts.operatorMetricsEnabled,
384+
V2APIEnabled: true,
385+
IntrospectionEnabled: opts.introspectionEnabled,
386+
DatadogAgentProfileEnabled: opts.datadogAgentProfileEnabled,
387+
DatadogDashboardEnabled: opts.datadogDashboardEnabled,
388+
DatadogGenericResourceEnabled: opts.datadogGenericResourceEnabled,
389+
DatadogCSIDriverEnabled: opts.datadogCSIDriverEnabled,
390+
UntaintControllerEnabled: opts.untaintControllerEnabled,
391+
UntaintControllerWaitForCSIDriver: opts.untaintControllerWaitForCSIDriver,
383392
}
384393

385394
versionInfo, platformInfo, err := getVersionAndPlatformInfo(rest.CopyConfig(mgr.GetConfig()))

docs/untaint_controller.md

Lines changed: 65 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -5,27 +5,55 @@ This feature was introduced in Datadog Operator v1.28 and is currently in previe
55
## Overview
66

77
The Untaint controller watches Kubernetes Nodes carrying the taint
8-
`agent.datadoghq.com/not-ready=presence:NoSchedule` and removes it once the
9-
Datadog Agent pod on that node is `Ready`. It is intended to run alongside a
10-
separate mechanism (cluster-autoscaler hook, CCM, admission webhook, etc.)
11-
that adds the taint to new nodes. The use case is keeping workloads off a
12-
node until the Datadog Agent is Ready, and recovering gracefully if the Agent never
13-
becomes Ready.
14-
15-
Agent pods are matched by the label `agent.datadoghq.com/component=agent` in
16-
the operator's watched namespaces (`WATCH_NAMESPACE` /
8+
`agent.datadoghq.com/not-ready=presence:NoSchedule` and removes it when
9+
readiness criteria are met (see below), or after a configurable timeout. It is
10+
intended to run alongside a separate mechanism (cluster-autoscaler hook, CCM,
11+
admission webhook, etc.) that adds the taint to new nodes.
12+
13+
**With `--untaintControllerEnabled=true` only** (and without `--untaintControllerWaitForCSIDriver`):
14+
the controller removes the taint once the **node Agent** pod
15+
(`agent.datadoghq.com/component=agent`) on that node is `Ready`. Agent pods are
16+
listed in the operator's agent watch namespaces (`WATCH_NAMESPACE` /
1717
`DD_AGENT_WATCH_NAMESPACE`).
1818

19-
If the Agent pod never reaches Ready on a tainted node, a configurable timeout
19+
**With `--untaintControllerEnabled=true` and `--untaintControllerWaitForCSIDriver=true`:**
20+
the controller waits until **both** the node Agent and **CSI
21+
node-server** pod (`app=datadog-csi-driver-node-server`) on the node are
22+
`Ready` before removing the taint. The taint stays until both are
23+
satisfied or a timeout fires. The operator's Pod informer then watches the
24+
**union** of `DD_AGENT_WATCH_NAMESPACE` and `DD_CSIDRIVER_WATCH_NAMESPACE` (all
25+
pods in those namespaces, so keep the namespace list narrow). Ensure CSI namespaces are
26+
covered so the controller can list CSI pod status.
27+
28+
**`--datadogCSIDriverEnabled`** only controls whether the **DatadogCSIDriver**
29+
controller runs; it does **not** by itself turn on dual-readiness untaint.
30+
Enable `--untaintControllerWaitForCSIDriver` only when you actually deploy CSI
31+
node-server pods on tainted nodes (for example via a `DatadogCSIDriver` CR with
32+
the operator's CSI controller enabled, or another install path that produces
33+
the same pod labels).
34+
35+
If a required pod never reaches Ready on a tainted node, a configurable timeout
2036
policy ensures the node is never permanently unschedulable. Two clocks cover
21-
the two failure modes:
22-
23-
- **Readiness timeout** — the Agent pod is on the node but not Ready. Clock:
24-
`pod.Status.StartTime`. Pod recreation restarts the window; container
25-
restarts inside the same pod do not.
26-
- **Scheduling timeout** — no Agent pod is on the node. Clock:
27-
`node.metadata.creationTimestamp`. The expected path when a DaemonSet never
28-
schedules a pod onto the node (taint not tolerated, missing labels, etc.).
37+
the main failure modes:
38+
39+
- **Readiness timeout** — at least one Agent pod is on the node but the Agent
40+
is not Ready yet, **or** (with `--untaintControllerWaitForCSIDriver`) at least
41+
one Agent and one CSI node-server pod are on the node, each has
42+
`pod.Status.StartTime` set, and at least one of them is not Ready. Clock: the
43+
**later** of the latest `StartTime` among Agent pods on the node and the latest
44+
`StartTime` among CSI node-server pods on the node (so a recent restart on
45+
either workload resets the window). Agent-only mode (no wait-for-CSI) still
46+
uses only Agent `StartTime` for this clock.
47+
- **Scheduling timeout** — no Agent pod is on the node, **or** (with wait-for-CSI)
48+
no CSI node-server pod is on the node yet. Clock: `node.metadata.creationTimestamp`.
49+
Covers DaemonSets that never schedule onto the node (taint not tolerated,
50+
missing labels, CSI still pulling, etc.).
51+
- **(Wait-for-CSI only)** If **both** an Agent pod and a CSI node-server pod are on
52+
the node but **either** still lacks `StartTime`, the controller **requeues**
53+
after the readiness-timeout duration (coarse poll, same idea as agent-only when
54+
`StartTime` is not populated yet)—it does **not** use the scheduling clock here,
55+
so an old node does not instantly hit a scheduling timeout while waiting for
56+
`StartTime` to appear.
2957

3058
A pod-recreation crash-loop faster than the readiness window can hold a node
3159
tainted indefinitely; run with `policy=keep` and alert on
@@ -47,14 +75,26 @@ manager:
4775
```yaml
4876
args:
4977
- --untaintControllerEnabled=true
78+
# Optional: require CSI node-server Ready before untainting (see Overview).
79+
- --untaintControllerWaitForCSIDriver=true
5080
```
5181
52-
When this flag is enabled, the operator also injects a toleration for
82+
| `--untaintControllerEnabled` | `--untaintControllerWaitForCSIDriver` | Behavior |
83+
| ----------------------------- | ------------------------------------- | -------- |
84+
| `false` | any | Untaint controller off; no Agent startup toleration for this feature. |
85+
| `true` | `false` | Agent-only readiness; Agent DaemonSet startup toleration injected. |
86+
| `true` | `true` | Wait for Agent **and** CSI node-server Ready; widened Pod cache (agent + `DD_CSIDRIVER_WATCH_NAMESPACE` namespaces); startup toleration on Agent and, when the DatadogCSIDriver controller is enabled, on the CSI node DaemonSet. |
87+
88+
`--untaintControllerWaitForCSIDriver` requires `--untaintControllerEnabled=true` (the operator exits on invalid combinations).
89+
90+
When `--untaintControllerEnabled` is true, the operator injects a toleration for
5391
`agent.datadoghq.com/not-ready=presence:NoSchedule` into the node Agent
5492
DaemonSet (or ExtendedDaemonSet) pod template, unless an equivalent toleration
55-
is already present. This avoids a deadlock where the node stays tainted because
56-
the Agent pod cannot schedule without the toleration, especially when admission
57-
webhook auto-injection is not in use.
93+
is already present. When **`--untaintControllerWaitForCSIDriver`** is also true **and**
94+
the DatadogCSIDriver controller is running (`--datadogCSIDriverEnabled=true`), the same
95+
toleration is injected into the **Datadog CSI node-server** DaemonSet pod
96+
template so the CSI workload can schedule on tainted nodes before the taint is
97+
removed.
5898

5999
## Configuration
60100

@@ -81,6 +121,8 @@ Metrics, under the `untaint` Prometheus subsystem:
81121

82122
Kubernetes Events (gated by `DD_UNTAINT_CONTROLLER_EVENTS_ENABLED=true`):
83123

84-
- `TaintRemoved` (Normal) — taint removed because the Agent pod became Ready.
124+
- `TaintRemoved` (Normal) — taint removed after the Agent became Ready, or (when
125+
`--untaintControllerWaitForCSIDriver` is enabled) after both the Agent and
126+
CSI node-server pods became Ready.
85127
- `UntaintTimeout` — a timeout fired. Normal under `remove`, Warning under `keep`. Message carries the reason, elapsed time, and policy.
86128

internal/controller/datadogcsidriver/const.go

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,14 @@
66
package datadogcsidriver
77

88
const (
9+
// AppLabelKey is the Kubernetes label key on CSI node-server pods.
10+
AppLabelKey = "app"
11+
// NodeServerDaemonSetAppValue is the label value identifying CSI node-server pods
12+
// (and the default DaemonSet name).
13+
NodeServerDaemonSetAppValue = "datadog-csi-driver-node-server"
14+
915
// csiDsName is the default name of the CSI driver DaemonSet
10-
csiDsName = "datadog-csi-driver-node-server"
16+
csiDsName = NodeServerDaemonSetAppValue
1117
// csiDriverName is the default name of the CSIDriver Kubernetes object
1218
csiDriverName = "k8s.csi.datadoghq.com"
1319
// defaultCSIDriverImageName is the default CSI driver container image name
@@ -53,7 +59,6 @@ const (
5359
csiDriverPort = int32(5000)
5460

5561
// Pod labels
56-
appLabelKey = "app"
5762
admissionControllerEnabledLabel = "admission.datadoghq.com/enabled"
5863

5964
// CSIDriver annotations

internal/controller/datadogcsidriver/controller.go

Lines changed: 14 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ import (
2626

2727
"github.com/DataDog/datadog-operator/api/datadoghq/v1alpha1"
2828
"github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1"
29+
componentagent "github.com/DataDog/datadog-operator/internal/controller/datadogagent/component/agent"
2930
)
3031

3132
const (
@@ -35,17 +36,19 @@ const (
3536

3637
// Reconciler reconciles a DatadogCSIDriver object
3738
type Reconciler struct {
38-
client client.Client
39-
scheme *runtime.Scheme
40-
recorder record.EventRecorder
39+
client client.Client
40+
scheme *runtime.Scheme
41+
recorder record.EventRecorder
42+
untaintInjectCSIStartupToleration bool
4143
}
4244

4345
// NewReconciler creates a new DatadogCSIDriver reconciler
44-
func NewReconciler(client client.Client, scheme *runtime.Scheme, recorder record.EventRecorder) *Reconciler {
46+
func NewReconciler(client client.Client, scheme *runtime.Scheme, recorder record.EventRecorder, untaintInjectCSIStartupToleration bool) *Reconciler {
4547
return &Reconciler{
46-
client: client,
47-
scheme: scheme,
48-
recorder: recorder,
48+
client: client,
49+
scheme: scheme,
50+
recorder: recorder,
51+
untaintInjectCSIStartupToleration: untaintInjectCSIStartupToleration,
4952
}
5053
}
5154

@@ -200,13 +203,16 @@ func (r *Reconciler) reconcileCSIDriver(ctx context.Context, instance *v1alpha1.
200203
}
201204

202205
func (r *Reconciler) reconcileDaemonSet(ctx context.Context, instance *v1alpha1.DatadogCSIDriver) error {
206+
logger := ctrl.LoggerFrom(ctx)
203207
desired := buildDaemonSet(instance)
208+
if r.untaintInjectCSIStartupToleration {
209+
componentagent.EnsureAgentNotReadyStartupToleration(logger, &desired.Spec.Template.Spec)
210+
}
204211

205212
if err := controllerutil.SetControllerReference(instance, desired, r.scheme); err != nil {
206213
return fmt.Errorf("setting owner reference: %w", err)
207214
}
208215

209-
logger := ctrl.LoggerFrom(ctx)
210216
nsName := types.NamespacedName{Name: desired.Name, Namespace: desired.Namespace}
211217
current := &appsv1.DaemonSet{}
212218
err := r.client.Get(ctx, nsName, current)

0 commit comments

Comments
 (0)