Skip to content

Commit d60ff16

Browse files
levan-mjanine-c
andauthored
Untaint controller (#2753)
* [draft] untaint controller * Fixes, improvements, doc * Update docs/untaint_controller.md Co-authored-by: Janine Chan <64388808+janine-c@users.noreply.github.com> * Apply suggestions from code review Co-authored-by: Janine Chan <64388808+janine-c@users.noreply.github.com> * doc and makefile fixes * PR feedback * improve inline doc * Apply suggestion from @levan-m --------- Co-authored-by: Janine Chan <64388808+janine-c@users.noreply.github.com>
1 parent dad893d commit d60ff16

12 files changed

Lines changed: 1634 additions & 9 deletions

cmd/main.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -147,6 +147,7 @@ type options struct {
147147
datadogDashboardEnabled bool
148148
datadogGenericResourceEnabled bool
149149
datadogCSIDriverEnabled bool
150+
untaintControllerEnabled bool
150151

151152
// Secret Backend options
152153
secretBackendCommand string
@@ -186,6 +187,7 @@ func (opts *options) Parse() {
186187
flag.BoolVar(&opts.datadogDashboardEnabled, "datadogDashboardEnabled", false, "Enable the DatadogDashboard controller")
187188
flag.BoolVar(&opts.datadogGenericResourceEnabled, "datadogGenericResourceEnabled", false, "Enable the DatadogGenericResource controller")
188189
flag.BoolVar(&opts.datadogCSIDriverEnabled, "datadogCSIDriverEnabled", false, "Enable the DatadogCSIDriver controller")
190+
flag.BoolVar(&opts.untaintControllerEnabled, "untaintControllerEnabled", false, "Enable the Untaint controller")
189191

190192
// DatadogAgentInternal
191193
flag.BoolVar(&opts.createControllerRevisions, "createControllerRevisions", false, "Enable creation of ControllerRevision snapshots on each DDA spec change")
@@ -293,6 +295,7 @@ func run(opts *options) error {
293295
DatadogDashboardEnabled: opts.datadogDashboardEnabled,
294296
DatadogGenericResourceEnabled: opts.datadogGenericResourceEnabled,
295297
DatadogCSIDriverEnabled: opts.datadogCSIDriverEnabled,
298+
UntaintControllerEnabled: opts.untaintControllerEnabled,
296299
}),
297300
// UsePriorityQueue makes all controllers use the priority queue, which
298301
// directly registers workqueue metrics into controller-runtime's metrics
@@ -376,6 +379,7 @@ func run(opts *options) error {
376379
DatadogDashboardEnabled: opts.datadogDashboardEnabled,
377380
DatadogGenericResourceEnabled: opts.datadogGenericResourceEnabled,
378381
DatadogCSIDriverEnabled: opts.datadogCSIDriverEnabled,
382+
UntaintControllerEnabled: opts.untaintControllerEnabled,
379383
}
380384

381385
versionInfo, platformInfo, err := getVersionAndPlatformInfo(rest.CopyConfig(mgr.GetConfig()))

docs/untaint_controller.md

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
# Untaint Controller
2+
3+
This feature was introduced in Datadog Operator v1.28 and is currently in preview.
4+
5+
## Overview
6+
7+
The Untaint controller watches Kubernetes Nodes carrying the taint
8+
`agent.datadoghq.com/not-ready=presence:NoSchedule` and removes it once the
9+
Datadog Agent pod on that node is `Ready`. It is intended to run alongside a
10+
separate mechanism (cluster-autoscaler hook, CCM, admission webhook, etc.)
11+
that adds the taint to new nodes. The use case is keeping workloads off a
12+
node until the Datadog Agent is Ready, and recovering gracefully if the Agent never
13+
becomes Ready.
14+
15+
Agent pods are matched by the label `agent.datadoghq.com/component=agent` in
16+
the operator's watched namespaces (`WATCH_NAMESPACE` /
17+
`DD_AGENT_WATCH_NAMESPACE`).
18+
19+
If the Agent pod never reaches Ready on a tainted node, a configurable timeout
20+
policy ensures the node is never permanently unschedulable. Two clocks cover
21+
the two failure modes:
22+
23+
- **Readiness timeout** — the Agent pod is on the node but not Ready. Clock:
24+
`pod.Status.StartTime`. Pod recreation restarts the window; container
25+
restarts inside the same pod do not.
26+
- **Scheduling timeout** — no Agent pod is on the node. Clock:
27+
`node.metadata.creationTimestamp`. The expected path when a DaemonSet never
28+
schedules a pod onto the node (taint not tolerated, missing labels, etc.).
29+
30+
A pod-recreation crash-loop faster than the readiness window can hold a node
31+
tainted indefinitely; run with `policy=keep` and alert on
32+
`untaint_taint_timeouts_total{policy="keep"}` to catch this.
33+
34+
The controller removes only this fixed taint and does not add it; both
35+
timeouts are global and cannot be tuned per Node (Group), DDA, or DAP.
36+
37+
## Prerequisites
38+
39+
- Operator v1.x+
40+
- Tested on Kubernetes 1.27.0+
41+
42+
## Enable the Untaint controller
43+
44+
The Untaint controller is disabled by default. Enable it on the operator
45+
manager:
46+
47+
```yaml
48+
args:
49+
- --untaintControllerEnabled=true
50+
```
51+
52+
## Configuration
53+
54+
All other tuning knobs are environment variables on the operator pod. Values
55+
use Go's `time.ParseDuration` format (`90s`, `5m`, `1h`, etc.). Invalid values
56+
fail the controller startup with an ERROR log; other controllers continue to
57+
start normally.
58+
59+
60+
| Env var | Default | Description |
61+
| ------------------------------------------ | -------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
62+
| `DD_UNTAINT_CONTROLLER_TIMEOUT` | `10m` | Readiness timeout. Tune to the upper bound of legitimate agent startup on your nodes; 2–5m is often enough on clusters with cached images. |
63+
| `DD_UNTAINT_CONTROLLER_SCHEDULING_TIMEOUT` | `5m` | Scheduling timeout. Set larger than your scheduler retry window; raise it on clusters with large pending queues or aggressive autoscaling. |
64+
| `DD_UNTAINT_CONTROLLER_TIMEOUT_POLICY` | `remove` | Action when a timeout fires. `remove` untaints the node anyway (favors scheduling availability over telemetry; lowest operational risk). `keep` leaves the taint in place and emits a Warning event (favors telemetry; pair with an alert on the timeout counter to surface stuck nodes). |
65+
| `DD_UNTAINT_CONTROLLER_EVENTS_ENABLED` | `false` | Emit Kubernetes Events on Nodes for taint removals and timeout decisions. |
66+
67+
## Observability
68+
69+
Metrics, under the `untaint` Prometheus subsystem:
70+
71+
- `untaint_taint_removals_total` — counter, every taint removal regardless of cause.
72+
- `untaint_taint_removal_latency_seconds` — histogram, time between pod Ready and taint removal.
73+
- `untaint_taint_timeouts_total{reason, policy}` — counter, timeout decisions. `reason` in {`readiness`, `scheduling`}; `policy` in {`remove`, `keep`}. Alert on `policy="keep"` to investigate stuck nodes.
74+
75+
Kubernetes Events (gated by `DD_UNTAINT_CONTROLLER_EVENTS_ENABLED=true`):
76+
77+
- `TaintRemoved` (Normal) — taint removed because the Agent pod became Ready.
78+
- `UntaintTimeout` — a timeout fired. Normal under `remove`, Warning under `keep`. Message carries the reason, elapsed time, and policy.
79+

go.work.sum

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1081,6 +1081,7 @@ github.com/ncw/swift v1.0.47 h1:4DQRPj35Y41WogBxyhOXlrI37nzGlyEcsforeudyYPQ=
10811081
github.com/nelsam/hel/v2 v2.3.3 h1:Z3TAKd9JS3BoKi6fW+d1bKD2Mf0FzTqDUEAwLWzYPRQ=
10821082
github.com/nelsam/hel/v2 v2.3.3/go.mod h1:1ZTGfU2PFTOd5mx22i5O0Lc2GY933lQ2wb/ggy+rL3w=
10831083
github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e h1:fD57ERR4JtEqsWbfPhv4DMiApHyliiK5xCTNVSPiaAs=
1084+
github.com/nxadm/tail v1.4.4 h1:DQuhQpB1tVlglWS2hLQ5OV6B5r8aGxSrPc5Qo6uTN78=
10841085
github.com/nxadm/tail v1.4.8/go.mod h1:+ncqLTQzXmGhMZNUePPaPqPvBxHAIsmXswZKocGu+AU=
10851086
github.com/oklog/ulid v1.3.1 h1:EGfNDEx6MqHz8B3uNV6QAib1UR2Lm97sHi3ocA6ESJ4=
10861087
github.com/onsi/ginkgo v1.12.1 h1:mFwc4LvZ0xpSvDZ3E+k8Yte0hLOMxXUlP+yXtJqkYfQ=

internal/controller/metrics/const.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ package metrics
88
const (
99
datadogAgentSubsystem = "datadogagent"
1010
datadogAgentProfileSubsystem = "datadogagentprofile"
11+
untaintSubsystem = "untaint"
1112

1213
TrueValue = 1.0
1314
FalseValue = 0.0
Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
// Unless explicitly stated otherwise all files in this repository are licensed
2+
// under the Apache License Version 2.0.
3+
// This product includes software developed at Datadog (https://www.datadoghq.com/).
4+
// Copyright 2016-present Datadog, Inc.
5+
6+
package metrics
7+
8+
import (
9+
"github.com/prometheus/client_golang/prometheus"
10+
"sigs.k8s.io/controller-runtime/pkg/metrics"
11+
)
12+
13+
// Label values for TaintTimeoutsTotal.
14+
const (
15+
// UntaintTimeoutReasonReadiness signals that a pod existed on the node but
16+
// never became Ready within --untaintControllerTimeout.
17+
UntaintTimeoutReasonReadiness = "readiness"
18+
// UntaintTimeoutReasonScheduling signals that no agent pod was scheduled on
19+
// the node within --untaintControllerSchedulingTimeout.
20+
UntaintTimeoutReasonScheduling = "scheduling"
21+
22+
// UntaintTimeoutPolicyRemove untaints the node despite the agent not being ready.
23+
UntaintTimeoutPolicyRemove = "remove"
24+
// UntaintTimeoutPolicyKeep leaves the taint in place but emits observability signals.
25+
UntaintTimeoutPolicyKeep = "keep"
26+
)
27+
28+
var (
29+
// TaintRemovalsTotal is the total number of taints removed from nodes.
30+
TaintRemovalsTotal = prometheus.NewCounter(
31+
prometheus.CounterOpts{
32+
Subsystem: untaintSubsystem,
33+
Name: "taint_removals_total",
34+
Help: "Total number of taints removed from nodes",
35+
},
36+
)
37+
38+
// TaintRemovalLatency is the time between agent pod becoming Ready and taint removal.
39+
TaintRemovalLatency = prometheus.NewHistogram(
40+
prometheus.HistogramOpts{
41+
Subsystem: untaintSubsystem,
42+
Name: "taint_removal_latency_seconds",
43+
Help: "Time between agent pod becoming Ready and taint removal from the node",
44+
Buckets: prometheus.DefBuckets,
45+
},
46+
)
47+
48+
// TaintTimeoutsTotal counts timeout decisions broken down by reason and policy.
49+
TaintTimeoutsTotal = prometheus.NewCounterVec(
50+
prometheus.CounterOpts{
51+
Subsystem: untaintSubsystem,
52+
Name: "taint_timeouts_total",
53+
Help: "Total number of untaint-controller timeout decisions, by reason and policy",
54+
},
55+
[]string{"reason", "policy"},
56+
)
57+
58+
// TaintRemovalErrorsTotal counts hard errors encountered while attempting to
59+
// remove the taint (apiserver Patch failures, JSON marshal failures, …).
60+
// Benign optimistic-concurrency races (IsConflict/IsInvalid) are NOT counted
61+
// here — they're handled by requeueing. Inspect the operator's ERROR-level
62+
// logs for the specific failure cause.
63+
TaintRemovalErrorsTotal = prometheus.NewCounter(
64+
prometheus.CounterOpts{
65+
Subsystem: untaintSubsystem,
66+
Name: "taint_removal_errors_total",
67+
Help: "Total number of errors encountered while attempting to remove the agent-not-ready taint from a node",
68+
},
69+
)
70+
)
71+
72+
func init() {
73+
metrics.Registry.MustRegister(TaintRemovalsTotal)
74+
metrics.Registry.MustRegister(TaintRemovalLatency)
75+
metrics.Registry.MustRegister(TaintTimeoutsTotal)
76+
metrics.Registry.MustRegister(TaintRemovalErrorsTotal)
77+
}

internal/controller/setup.go

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
package controller
77

88
import (
9+
"fmt"
910
"time"
1011

1112
"github.com/go-logr/logr"
@@ -48,6 +49,7 @@ type SetupOptions struct {
4849
DatadogGenericResourceEnabled bool
4950
CreateControllerRevisions bool
5051
DatadogCSIDriverEnabled bool
52+
UntaintControllerEnabled bool
5153
}
5254

5355
// ExtendedDaemonsetOptions defines ExtendedDaemonset options
@@ -77,6 +79,7 @@ var controllerStarters = map[string]starterFunc{
7779
dashboardControllerName: startDatadogDashboard,
7880
genericResourceControllerName: startDatadogGenericResource,
7981
csiDriverControllerName: startDatadogCSIDriver,
82+
untaintControllerName: startUntaint,
8083
}
8184

8285
// SetupControllers starts all controllers (also used by e2e tests)
@@ -236,6 +239,22 @@ func startDatadogSLO(logger logr.Logger, mgr manager.Manager, pInfo kubernetes.P
236239
return sloReconciler.SetupWithManager(mgr)
237240
}
238241

242+
func startUntaint(logger logr.Logger, mgr manager.Manager, _ kubernetes.PlatformInfo, options SetupOptions, _ datadog.MetricsForwardersManager) error {
243+
if !options.UntaintControllerEnabled {
244+
logger.Info("Feature disabled, not starting the controller", "controller", untaintControllerName)
245+
return nil
246+
}
247+
reconciler, err := NewUntaintReconciler(
248+
mgr.GetClient(),
249+
ctrl.Log.WithName("controllers").WithName(untaintControllerName),
250+
mgr.GetEventRecorderFor(untaintControllerName),
251+
)
252+
if err != nil {
253+
return fmt.Errorf("untaint controller setup: %w", err)
254+
}
255+
return reconciler.SetupWithManager(mgr)
256+
}
257+
239258
func startDatadogAgentProfiles(logger logr.Logger, mgr manager.Manager, pInfo kubernetes.PlatformInfo, options SetupOptions, metricForwardersMgr datadog.MetricsForwardersManager) error {
240259
if !options.DatadogAgentProfileEnabled {
241260
logger.Info("Feature disabled, not starting the controller", "controller", profileControllerName)

internal/controller/setup_test.go

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
// Unless explicitly stated otherwise all files in this repository are licensed
2+
// under the Apache License Version 2.0.
3+
// This product includes software developed at Datadog (https://www.datadoghq.com/).
4+
// Copyright 2016-present Datadog, Inc.
5+
6+
package controller
7+
8+
import (
9+
"errors"
10+
"testing"
11+
12+
"github.com/go-logr/logr"
13+
"github.com/stretchr/testify/assert"
14+
"sigs.k8s.io/controller-runtime/pkg/log"
15+
"sigs.k8s.io/controller-runtime/pkg/manager"
16+
17+
"github.com/DataDog/datadog-operator/pkg/controller/utils/datadog"
18+
"github.com/DataDog/datadog-operator/pkg/kubernetes"
19+
)
20+
21+
// TestSetupControllers_StarterErrorsAreBestEffort confirms that starter
22+
// failures are logged at ERROR by SetupControllers but never propagated up:
23+
// one controller's misconfiguration must not bring the whole operator down
24+
// or prevent other controllers from starting. The untaint controller follows
25+
// this same best-effort pattern.
26+
func TestSetupControllers_StarterErrorsAreBestEffort(t *testing.T) {
27+
originalStarters := controllerStarters
28+
t.Cleanup(func() { controllerStarters = originalStarters })
29+
30+
failing := func(logr.Logger, manager.Manager, kubernetes.PlatformInfo, SetupOptions, datadog.MetricsForwardersManager) error {
31+
return errors.New("simulated starter failure")
32+
}
33+
controllerStarters = map[string]starterFunc{
34+
agentControllerName: failing,
35+
untaintControllerName: failing,
36+
}
37+
38+
assert.NoError(t, SetupControllers(
39+
log.Log,
40+
nil,
41+
kubernetes.PlatformInfo{},
42+
SetupOptions{UntaintControllerEnabled: true},
43+
))
44+
}

internal/controller/suite_v2_test.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,13 @@ var _ = BeforeSuite(func(ctx context.Context) {
8888
node2 := testutils.NewNode("node2", nil)
8989
Expect(k8sClient.Create(context.Background(), node2)).Should(Succeed())
9090

91+
// Configure the untaint controller via its env vars before
92+
// SetupControllers is called (NewUntaintReconciler reads them on construction).
93+
Expect(os.Setenv(EnvEventsEnabled, "true")).Should(Succeed())
94+
Expect(os.Setenv(EnvReadinessTimeout, (4 * time.Second).String())).Should(Succeed())
95+
Expect(os.Setenv(EnvSchedulingTimeout, (4 * time.Second).String())).Should(Succeed())
96+
Expect(os.Setenv(EnvTimeoutPolicy, string(PolicyRemove))).Should(Succeed())
97+
9198
// Start controllers
9299
mgr, err := ctrl.NewManager(cfg, ctrl.Options{
93100
Scheme: scheme.Scheme,
@@ -107,6 +114,7 @@ var _ = BeforeSuite(func(ctx context.Context) {
107114
DatadogMonitorEnabled: true,
108115
DatadogAgentProfileEnabled: true,
109116
V2APIEnabled: true,
117+
UntaintControllerEnabled: true,
110118
}
111119

112120
dummyPlatformInfo := kubernetes.PlatformInfo{}

0 commit comments

Comments
 (0)