Skip to content

Commit 39d6bd3

Browse files
adel121OliviaShoup
andauthored
[CONTP-1609] Auto-inject agent toleration when untaint controller is enabled (#3086)
* [CONTP-1609] Auto-inject agent toleration when untaint controller is enabled * extract taint key and value to a shared package * use builtin toleration detection api * fix lint * increase test coverage * fix formatting * fix lint * Update docs/untaint_controller.md Co-authored-by: Olivia Shoup <116908616+OliviaShoup@users.noreply.github.com> * address review comments --------- Co-authored-by: Olivia Shoup <116908616+OliviaShoup@users.noreply.github.com>
1 parent 19e90a6 commit 39d6bd3

13 files changed

Lines changed: 283 additions & 32 deletions

docs/untaint_controller.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,13 @@ args:
4949
- --untaintControllerEnabled=true
5050
```
5151
52+
When this flag is enabled, the operator also injects a toleration for
53+
`agent.datadoghq.com/not-ready=presence:NoSchedule` into the node Agent
54+
DaemonSet (or ExtendedDaemonSet) pod template, unless an equivalent toleration
55+
is already present. This avoids a deadlock where the node stays tainted because
56+
the Agent pod cannot schedule without the toleration, especially when admission
57+
webhook auto-injection is not in use.
58+
5259
## Configuration
5360

5461
All other tuning knobs are environment variables on the operator pod. Values
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
// Unless explicitly stated otherwise all files in this repository are licensed
2+
// under the Apache License Version 2.0.
3+
// This product includes software developed at Datadog (https://www.datadoghq.com/).
4+
// Copyright 2016-present Datadog, Inc.
5+
6+
package agent
7+
8+
import (
9+
"github.com/go-logr/logr"
10+
corev1 "k8s.io/api/core/v1"
11+
12+
"github.com/DataDog/datadog-operator/pkg/untaint"
13+
)
14+
15+
// podToleratesAgentNotReadyStartup reports whether tolerations tolerate the
16+
// agent-not-ready startup taint, using the same rules as the scheduler/kubelet
17+
// (corev1.Toleration.ToleratesTaint). Comparison-operator tolerations (Lt/Gt) are
18+
// ignored unless the cluster enables that feature (we pass false).
19+
func podToleratesAgentNotReadyStartup(logger logr.Logger, tolerations []corev1.Toleration) bool {
20+
taint := untaint.AgentNotReadyTaint()
21+
for i := range tolerations {
22+
if tolerations[i].ToleratesTaint(logger, &taint, false) {
23+
return true
24+
}
25+
}
26+
return false
27+
}
28+
29+
// EnsureAgentNotReadyStartupToleration appends the agent-not-ready Equal toleration
30+
// when not already tolerated per Kubernetes toleration matching.
31+
func EnsureAgentNotReadyStartupToleration(logger logr.Logger, spec *corev1.PodSpec) {
32+
if podToleratesAgentNotReadyStartup(logger, spec.Tolerations) {
33+
return
34+
}
35+
spec.Tolerations = append(spec.Tolerations, untaint.AgentNotReadyEqualToleration())
36+
}
Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
// Unless explicitly stated otherwise all files in this repository are licensed
2+
// under the Apache License Version 2.0.
3+
// This product includes software developed at Datadog (https://www.datadoghq.com/).
4+
// Copyright 2016-present Datadog, Inc.
5+
6+
package agent
7+
8+
import (
9+
"testing"
10+
11+
"github.com/go-logr/logr"
12+
"github.com/stretchr/testify/require"
13+
corev1 "k8s.io/api/core/v1"
14+
15+
"github.com/DataDog/datadog-operator/pkg/untaint"
16+
)
17+
18+
func TestEnsureAgentNotReadyStartupToleration_idempotent(t *testing.T) {
19+
spec := &corev1.PodSpec{}
20+
want := untaint.AgentNotReadyEqualToleration()
21+
log := logr.Discard()
22+
23+
EnsureAgentNotReadyStartupToleration(log, spec)
24+
require.Len(t, spec.Tolerations, 1)
25+
require.Equal(t, want, spec.Tolerations[0])
26+
27+
EnsureAgentNotReadyStartupToleration(log, spec)
28+
require.Len(t, spec.Tolerations, 1)
29+
}
30+
31+
func TestEnsureAgentNotReadyStartupToleration_skipIfUserEqual(t *testing.T) {
32+
want := untaint.AgentNotReadyEqualToleration()
33+
spec := &corev1.PodSpec{Tolerations: []corev1.Toleration{want}}
34+
35+
EnsureAgentNotReadyStartupToleration(logr.Discard(), spec)
36+
require.Len(t, spec.Tolerations, 1)
37+
}
38+
39+
func TestEnsureAgentNotReadyStartupToleration_skipIfUserExists(t *testing.T) {
40+
spec := &corev1.PodSpec{
41+
Tolerations: []corev1.Toleration{
42+
{Key: untaint.AgentNotReadyTaintKey, Operator: corev1.TolerationOpExists, Effect: corev1.TaintEffectNoSchedule},
43+
},
44+
}
45+
46+
EnsureAgentNotReadyStartupToleration(logr.Discard(), spec)
47+
require.Len(t, spec.Tolerations, 1)
48+
}
49+
50+
func TestEnsureAgentNotReadyStartupToleration_equalEmptyOperator(t *testing.T) {
51+
spec := &corev1.PodSpec{
52+
Tolerations: []corev1.Toleration{
53+
{Key: untaint.AgentNotReadyTaintKey, Operator: "", Value: untaint.AgentNotReadyTaintValue, Effect: corev1.TaintEffectNoSchedule},
54+
},
55+
}
56+
57+
EnsureAgentNotReadyStartupToleration(logr.Discard(), spec)
58+
require.Len(t, spec.Tolerations, 1)
59+
}
60+
61+
func TestEnsureAgentNotReadyStartupToleration_skipIfExistsAllKeys(t *testing.T) {
62+
// Empty key + Exists matches every taint (corev1.Toleration.ToleratesTaint).
63+
spec := &corev1.PodSpec{
64+
Tolerations: []corev1.Toleration{
65+
{Operator: corev1.TolerationOpExists},
66+
},
67+
}
68+
69+
EnsureAgentNotReadyStartupToleration(logr.Discard(), spec)
70+
require.Len(t, spec.Tolerations, 1)
71+
}

internal/controller/datadogagent/controller.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,7 @@ type ReconcilerOptions struct {
7474
OperatorMetricsEnabled bool
7575
IntrospectionEnabled bool
7676
DatadogAgentProfileEnabled bool
77+
UntaintControllerEnabled bool
7778
DatadogCSIDriverEnabled bool
7879
CreateControllerRevisions bool
7980
// ExperimentTimeout overrides ExperimentDefaultTimeout. Zero means use the default.

internal/controller/datadogagent/controller_v2_test.go

Lines changed: 73 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ import (
99
"context"
1010
"fmt"
1111
"reflect"
12+
"slices"
1213
"sort"
1314
"testing"
1415
"time"
@@ -44,6 +45,7 @@ import (
4445
"github.com/DataDog/datadog-operator/pkg/images"
4546
"github.com/DataDog/datadog-operator/pkg/kubernetes"
4647
"github.com/DataDog/datadog-operator/pkg/testutils"
48+
"github.com/DataDog/datadog-operator/pkg/untaint"
4749
pkgutils "github.com/DataDog/datadog-operator/pkg/utils"
4850
)
4951

@@ -61,6 +63,17 @@ type testCase struct {
6163
introspectionEnabled bool // For introspection tests
6264
}
6365

66+
// ddaiReconcilerOptionsFromDDA mirrors setup.go wiring so DDAI tests behave like production
67+
// for flags shared with the DatadogAgent reconciler (e.g. UntaintControllerEnabled).
68+
func ddaiReconcilerOptionsFromDDA(opts ReconcilerOptions) datadogagentinternal.ReconcilerOptions {
69+
return datadogagentinternal.ReconcilerOptions{
70+
ExtendedDaemonsetOptions: opts.ExtendedDaemonsetOptions,
71+
SupportCilium: opts.SupportCilium,
72+
OperatorMetricsEnabled: opts.OperatorMetricsEnabled,
73+
UntaintControllerEnabled: opts.UntaintControllerEnabled,
74+
}
75+
}
76+
6477
// runTestCases runs test cases
6578
func runTestCases(t *testing.T, tests []testCase, testFunc func(t *testing.T, tt testCase, opts ReconcilerOptions)) {
6679
for _, tt := range tests {
@@ -106,7 +119,7 @@ func runDDAReconcilerTest(t *testing.T, tt testCase, opts ReconcilerOptions) {
106119
r.initializeComponentRegistry()
107120

108121
ri := datadogagentinternal.NewReconciler(
109-
datadogagentinternal.ReconcilerOptions{},
122+
ddaiReconcilerOptionsFromDDA(opts),
110123
c,
111124
kubernetes.PlatformInfo{},
112125
s,
@@ -179,7 +192,7 @@ func runFullReconcilerTest(t *testing.T, tt testCase, opts ReconcilerOptions) {
179192
r.initializeComponentRegistry()
180193

181194
ri := datadogagentinternal.NewReconciler(
182-
datadogagentinternal.ReconcilerOptions{},
195+
ddaiReconcilerOptionsFromDDA(opts),
183196
c,
184197
kubernetes.PlatformInfo{},
185198
s,
@@ -248,6 +261,64 @@ func buildClient(t *testing.T, tt testCase, s *runtime.Scheme) client.Client {
248261
return builder.Build()
249262
}
250263

264+
func TestReconcileDDA_UntaintController_injectsAgentNotReadyToleration(t *testing.T) {
265+
const resourcesName = "foo"
266+
const resourcesNamespace = "bar"
267+
const dsName = "foo-agent"
268+
defaultRequeueDuration := 15 * time.Second
269+
270+
wantTol := untaint.AgentNotReadyEqualToleration()
271+
tt := testCase{
272+
name: "untaint controller enabled injects agent-not-ready toleration on node agent DS",
273+
loadFunc: func(c client.Client) *v2alpha1.DatadogAgent {
274+
dda := testutils.NewInitializedDatadogAgentBuilder(resourcesNamespace, resourcesName).
275+
Build()
276+
_ = c.Create(context.TODO(), dda)
277+
return dda
278+
},
279+
want: reconcile.Result{RequeueAfter: defaultRequeueDuration},
280+
wantErr: false,
281+
wantFunc: func(t *testing.T, c client.Client) {
282+
ds := &appsv1.DaemonSet{}
283+
err := c.Get(context.TODO(), types.NamespacedName{Namespace: resourcesNamespace, Name: dsName}, ds)
284+
assert.NoError(t, err)
285+
assert.True(t, slices.ContainsFunc(ds.Spec.Template.Spec.Tolerations, func(tol corev1.Toleration) bool {
286+
return reflect.DeepEqual(tol, wantTol)
287+
}), "expected injected toleration %+v in %+v", wantTol, ds.Spec.Template.Spec.Tolerations)
288+
},
289+
}
290+
runDDAReconcilerTest(t, tt, ReconcilerOptions{UntaintControllerEnabled: true})
291+
}
292+
293+
func TestReconcileDDA_UntaintController_disabledDoesNotInjectAgentNotReadyToleration(t *testing.T) {
294+
const resourcesName = "foo"
295+
const resourcesNamespace = "bar"
296+
const dsName = "foo-agent"
297+
defaultRequeueDuration := 15 * time.Second
298+
299+
wantTol := untaint.AgentNotReadyEqualToleration()
300+
tt := testCase{
301+
name: "untaint controller disabled does not inject agent-not-ready toleration on node agent DS",
302+
loadFunc: func(c client.Client) *v2alpha1.DatadogAgent {
303+
dda := testutils.NewInitializedDatadogAgentBuilder(resourcesNamespace, resourcesName).
304+
Build()
305+
_ = c.Create(context.TODO(), dda)
306+
return dda
307+
},
308+
want: reconcile.Result{RequeueAfter: defaultRequeueDuration},
309+
wantErr: false,
310+
wantFunc: func(t *testing.T, c client.Client) {
311+
ds := &appsv1.DaemonSet{}
312+
err := c.Get(context.TODO(), types.NamespacedName{Namespace: resourcesNamespace, Name: dsName}, ds)
313+
assert.NoError(t, err)
314+
assert.False(t, slices.ContainsFunc(ds.Spec.Template.Spec.Tolerations, func(tol corev1.Toleration) bool {
315+
return reflect.DeepEqual(tol, wantTol)
316+
}), "did not expect injected toleration %+v in %+v", wantTol, ds.Spec.Template.Spec.Tolerations)
317+
},
318+
}
319+
runDDAReconcilerTest(t, tt, ReconcilerOptions{UntaintControllerEnabled: false})
320+
}
321+
251322
func TestReconcileDatadogAgentV2_Reconcile(t *testing.T) {
252323
const resourcesName = "foo"
253324
const resourcesNamespace = "bar"

internal/controller/datadogagentinternal/controller.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,7 @@ type ReconcilerOptions struct {
6767
ExtendedDaemonsetOptions componentagent.ExtendedDaemonsetOptions
6868
SupportCilium bool
6969
OperatorMetricsEnabled bool
70+
UntaintControllerEnabled bool
7071
}
7172

7273
// Reconciler is the internal reconciler for Datadog Agent

internal/controller/datadogagentinternal/controller_reconcile_agent.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,10 @@ func (r *Reconciler) reconcileV2Agent(ctx context.Context, requiredComponents fe
9595

9696
experimental.ApplyExperimentalOverrides(objLogger, ddai, podManagers)
9797

98+
if r.options.UntaintControllerEnabled {
99+
componentagent.EnsureAgentNotReadyStartupToleration(objLogger, &podManagers.PodTemplateSpec().Spec)
100+
}
101+
98102
if disabledByOverride {
99103
if agentEnabled {
100104
// The override supersedes what's set in requiredComponents; update status to reflect the conflict
@@ -162,6 +166,10 @@ func (r *Reconciler) reconcileV2Agent(ctx context.Context, requiredComponents fe
162166

163167
experimental.ApplyExperimentalOverrides(objLogger, ddai, podManagers)
164168

169+
if r.options.UntaintControllerEnabled {
170+
componentagent.EnsureAgentNotReadyStartupToleration(objLogger, &podManagers.PodTemplateSpec().Spec)
171+
}
172+
165173
if disabledByOverride {
166174
if agentEnabled {
167175
// The override supersedes what's set in requiredComponents; update status to reflect the conflict

internal/controller/setup.go

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,7 @@ func startDatadogAgent(logger logr.Logger, mgr manager.Manager, pInfo kubernetes
130130
OperatorMetricsEnabled: options.OperatorMetricsEnabled,
131131
IntrospectionEnabled: options.IntrospectionEnabled,
132132
DatadogAgentProfileEnabled: options.DatadogAgentProfileEnabled,
133+
UntaintControllerEnabled: options.UntaintControllerEnabled,
133134
DatadogCSIDriverEnabled: options.DatadogCSIDriverEnabled,
134135
CreateControllerRevisions: options.CreateControllerRevisions,
135136
},
@@ -163,8 +164,9 @@ func startDatadogAgentInternal(logger logr.Logger, mgr manager.Manager, pInfo ku
163164
CanaryAutoFailEnabled: options.SupportExtendedDaemonset.CanaryAutoFailEnabled,
164165
CanaryAutoFailMaxRestarts: int32(options.SupportExtendedDaemonset.CanaryAutoFailMaxRestarts),
165166
},
166-
SupportCilium: options.SupportCilium,
167-
OperatorMetricsEnabled: options.OperatorMetricsEnabled,
167+
SupportCilium: options.SupportCilium,
168+
OperatorMetricsEnabled: options.OperatorMetricsEnabled,
169+
UntaintControllerEnabled: options.UntaintControllerEnabled,
168170
},
169171
}).SetupWithManager(mgr, metricForwardersMgr)
170172
}

internal/controller/untaint_controller.go

Lines changed: 5 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ import (
1010
"encoding/json"
1111
"fmt"
1212
"os"
13+
"slices"
1314
"time"
1415

1516
"github.com/go-logr/logr"
@@ -30,6 +31,7 @@ import (
3031
"github.com/DataDog/datadog-operator/api/datadoghq/common"
3132
"github.com/DataDog/datadog-operator/internal/controller/metrics"
3233
"github.com/DataDog/datadog-operator/pkg/constants"
34+
"github.com/DataDog/datadog-operator/pkg/untaint"
3335
)
3436

3537
// Environment variables consumed by the untaint controller. Reading them here
@@ -45,11 +47,6 @@ const (
4547
const (
4648
untaintControllerName = "Untaint"
4749

48-
// Fixed taint to remove
49-
agentNotReadyTaintKey = "agent.datadoghq.com/not-ready"
50-
agentNotReadyTaintValue = "presence"
51-
agentNotReadyTaintEffect = corev1.TaintEffectNoSchedule
52-
5350
// untaintPodNodeIndex is a controller-scoped field-indexer key for caching
5451
// pods by their spec.nodeName. Using a namespaced key (rather than the
5552
// bare "spec.nodeName") avoids collisions with any other controller that
@@ -232,7 +229,7 @@ func (r *UntaintReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct
232229
}
233230
if r.eventsEnabled {
234231
r.recorder.Eventf(node, corev1.EventTypeNormal, "TaintRemoved",
235-
"Removed taint %s from node %s after agent became ready", agentNotReadyTaintKey, node.Name)
232+
"Removed taint %s from node %s after agent became ready", untaint.AgentNotReadyTaintKey, node.Name)
236233
}
237234
return ctrl.Result{}, nil
238235
}
@@ -351,12 +348,7 @@ func latestPodStartTime(pods []corev1.Pod) (time.Time, bool) {
351348

352349
// hasTaint returns true if the node has the agent-not-ready taint.
353350
func hasTaint(node *corev1.Node) bool {
354-
for _, t := range node.Spec.Taints {
355-
if t.Key == agentNotReadyTaintKey && t.Value == agentNotReadyTaintValue && t.Effect == agentNotReadyTaintEffect {
356-
return true
357-
}
358-
}
359-
return false
351+
return slices.ContainsFunc(node.Spec.Taints, untaint.IsAgentNotReadyTaint)
360352
}
361353

362354
type jsonPatchOp struct {
@@ -376,7 +368,7 @@ func (r *UntaintReconciler) removeTaint(ctx context.Context, node *corev1.Node)
376368
}
377369
newTaints := make([]corev1.Taint, 0, len(currentTaints))
378370
for _, t := range currentTaints {
379-
if t.Key == agentNotReadyTaintKey && t.Value == agentNotReadyTaintValue && t.Effect == agentNotReadyTaintEffect {
371+
if untaint.IsAgentNotReadyTaint(t) {
380372
continue
381373
}
382374
newTaints = append(newTaints, t)

internal/controller/untaint_controller_integration_test.go

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ import (
2121

2222
"github.com/DataDog/datadog-operator/api/datadoghq/common"
2323
"github.com/DataDog/datadog-operator/pkg/constants"
24+
"github.com/DataDog/datadog-operator/pkg/untaint"
2425
)
2526

2627
// Suite-level timeouts (see suite_v2_test.go):
@@ -39,11 +40,7 @@ const (
3940
func makeTaintedNode(ctx context.Context, name string) func() {
4041
node := &corev1.Node{
4142
ObjectMeta: metav1.ObjectMeta{Name: name},
42-
Spec: corev1.NodeSpec{Taints: []corev1.Taint{{
43-
Key: agentNotReadyTaintKey,
44-
Value: agentNotReadyTaintValue,
45-
Effect: agentNotReadyTaintEffect,
46-
}}},
43+
Spec: corev1.NodeSpec{Taints: []corev1.Taint{untaint.AgentNotReadyTaint()}},
4744
}
4845
Expect(k8sClient.Create(ctx, node)).Should(Succeed())
4946
return func() { _ = k8sClient.Delete(ctx, node) }

0 commit comments

Comments
 (0)