Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions docs/untaint_controller.md
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,13 @@ args:
- --untaintControllerEnabled=true
```

When this flag is enabled, the operator also injects a toleration for
`agent.datadoghq.com/not-ready=presence:NoSchedule` into the node Agent
DaemonSet (or ExtendedDaemonSet) pod template, unless an equivalent toleration
is already present. This avoids a deadlock where the node stays tainted because
the Agent pod cannot schedule without the toleration, especially when admission
webhook auto-injection is not in use.
Comment thread
adel121 marked this conversation as resolved.

## Configuration

All other tuning knobs are environment variables on the operator pod. Values
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
// Unless explicitly stated otherwise all files in this repository are licensed
// under the Apache License Version 2.0.
// This product includes software developed at Datadog (https://www.datadoghq.com/).
// Copyright 2016-present Datadog, Inc.

package agent

import (
"github.com/go-logr/logr"
corev1 "k8s.io/api/core/v1"

"github.com/DataDog/datadog-operator/pkg/untaint"
)

// podToleratesAgentNotReadyStartup reports whether tolerations tolerate the
// agent-not-ready startup taint, using the same rules as the scheduler/kubelet
// (corev1.Toleration.ToleratesTaint). Comparison-operator tolerations (Lt/Gt) are
// ignored unless the cluster enables that feature (we pass false).
func podToleratesAgentNotReadyStartup(logger logr.Logger, tolerations []corev1.Toleration) bool {
taint := untaint.AgentNotReadyTaint()
for i := range tolerations {
if tolerations[i].ToleratesTaint(logger, &taint, false) {
return true
}
}
return false
}

// EnsureAgentNotReadyStartupToleration appends the agent-not-ready Equal toleration
// when not already tolerated per Kubernetes toleration matching.
func EnsureAgentNotReadyStartupToleration(logger logr.Logger, spec *corev1.PodSpec) {
if podToleratesAgentNotReadyStartup(logger, spec.Tolerations) {
return
}
spec.Tolerations = append(spec.Tolerations, untaint.AgentNotReadyEqualToleration())
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
// Unless explicitly stated otherwise all files in this repository are licensed
// under the Apache License Version 2.0.
// This product includes software developed at Datadog (https://www.datadoghq.com/).
// Copyright 2016-present Datadog, Inc.

package agent

import (
"testing"

"github.com/go-logr/logr"
"github.com/stretchr/testify/require"
corev1 "k8s.io/api/core/v1"

"github.com/DataDog/datadog-operator/pkg/untaint"
)

func TestEnsureAgentNotReadyStartupToleration_idempotent(t *testing.T) {
spec := &corev1.PodSpec{}
want := untaint.AgentNotReadyEqualToleration()
log := logr.Discard()

EnsureAgentNotReadyStartupToleration(log, spec)
require.Len(t, spec.Tolerations, 1)
require.Equal(t, want, spec.Tolerations[0])

EnsureAgentNotReadyStartupToleration(log, spec)
require.Len(t, spec.Tolerations, 1)
}

func TestEnsureAgentNotReadyStartupToleration_skipIfUserEqual(t *testing.T) {
want := untaint.AgentNotReadyEqualToleration()
spec := &corev1.PodSpec{Tolerations: []corev1.Toleration{want}}

EnsureAgentNotReadyStartupToleration(logr.Discard(), spec)
require.Len(t, spec.Tolerations, 1)
}

func TestEnsureAgentNotReadyStartupToleration_skipIfUserExists(t *testing.T) {
spec := &corev1.PodSpec{
Tolerations: []corev1.Toleration{
{Key: untaint.AgentNotReadyTaintKey, Operator: corev1.TolerationOpExists, Effect: corev1.TaintEffectNoSchedule},
},
}

EnsureAgentNotReadyStartupToleration(logr.Discard(), spec)
require.Len(t, spec.Tolerations, 1)
}

func TestEnsureAgentNotReadyStartupToleration_equalEmptyOperator(t *testing.T) {
spec := &corev1.PodSpec{
Tolerations: []corev1.Toleration{
{Key: untaint.AgentNotReadyTaintKey, Operator: "", Value: untaint.AgentNotReadyTaintValue, Effect: corev1.TaintEffectNoSchedule},
},
}

EnsureAgentNotReadyStartupToleration(logr.Discard(), spec)
require.Len(t, spec.Tolerations, 1)
}

func TestEnsureAgentNotReadyStartupToleration_skipIfExistsAllKeys(t *testing.T) {
// Empty key + Exists matches every taint (corev1.Toleration.ToleratesTaint).
spec := &corev1.PodSpec{
Tolerations: []corev1.Toleration{
{Operator: corev1.TolerationOpExists},
},
}

EnsureAgentNotReadyStartupToleration(logr.Discard(), spec)
require.Len(t, spec.Tolerations, 1)
}
1 change: 1 addition & 0 deletions internal/controller/datadogagent/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ type ReconcilerOptions struct {
OperatorMetricsEnabled bool
IntrospectionEnabled bool
DatadogAgentProfileEnabled bool
UntaintControllerEnabled bool
DatadogCSIDriverEnabled bool
CreateControllerRevisions bool
// ExperimentTimeout overrides ExperimentDefaultTimeout. Zero means use the default.
Expand Down
75 changes: 73 additions & 2 deletions internal/controller/datadogagent/controller_v2_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import (
"context"
"fmt"
"reflect"
"slices"
"sort"
"testing"
"time"
Expand Down Expand Up @@ -44,6 +45,7 @@ import (
"github.com/DataDog/datadog-operator/pkg/images"
"github.com/DataDog/datadog-operator/pkg/kubernetes"
"github.com/DataDog/datadog-operator/pkg/testutils"
"github.com/DataDog/datadog-operator/pkg/untaint"
pkgutils "github.com/DataDog/datadog-operator/pkg/utils"
)

Expand All @@ -61,6 +63,17 @@ type testCase struct {
introspectionEnabled bool // For introspection tests
}

// ddaiReconcilerOptionsFromDDA mirrors setup.go wiring so DDAI tests behave like production
// for flags shared with the DatadogAgent reconciler (e.g. UntaintControllerEnabled).
func ddaiReconcilerOptionsFromDDA(opts ReconcilerOptions) datadogagentinternal.ReconcilerOptions {
return datadogagentinternal.ReconcilerOptions{
ExtendedDaemonsetOptions: opts.ExtendedDaemonsetOptions,
SupportCilium: opts.SupportCilium,
OperatorMetricsEnabled: opts.OperatorMetricsEnabled,
UntaintControllerEnabled: opts.UntaintControllerEnabled,
}
}

// runTestCases runs test cases
func runTestCases(t *testing.T, tests []testCase, testFunc func(t *testing.T, tt testCase, opts ReconcilerOptions)) {
for _, tt := range tests {
Expand Down Expand Up @@ -106,7 +119,7 @@ func runDDAReconcilerTest(t *testing.T, tt testCase, opts ReconcilerOptions) {
r.initializeComponentRegistry()

ri := datadogagentinternal.NewReconciler(
datadogagentinternal.ReconcilerOptions{},
ddaiReconcilerOptionsFromDDA(opts),
c,
kubernetes.PlatformInfo{},
s,
Expand Down Expand Up @@ -179,7 +192,7 @@ func runFullReconcilerTest(t *testing.T, tt testCase, opts ReconcilerOptions) {
r.initializeComponentRegistry()

ri := datadogagentinternal.NewReconciler(
datadogagentinternal.ReconcilerOptions{},
ddaiReconcilerOptionsFromDDA(opts),
c,
kubernetes.PlatformInfo{},
s,
Expand Down Expand Up @@ -248,6 +261,64 @@ func buildClient(t *testing.T, tt testCase, s *runtime.Scheme) client.Client {
return builder.Build()
}

func TestReconcileDDA_UntaintController_injectsAgentNotReadyToleration(t *testing.T) {
const resourcesName = "foo"
const resourcesNamespace = "bar"
const dsName = "foo-agent"
defaultRequeueDuration := 15 * time.Second

wantTol := untaint.AgentNotReadyEqualToleration()
tt := testCase{
name: "untaint controller enabled injects agent-not-ready toleration on node agent DS",

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

for completeness would be nice to have case asserting tolerations aren't added when feature is disabled.

loadFunc: func(c client.Client) *v2alpha1.DatadogAgent {
dda := testutils.NewInitializedDatadogAgentBuilder(resourcesNamespace, resourcesName).
Build()
_ = c.Create(context.TODO(), dda)
return dda
},
want: reconcile.Result{RequeueAfter: defaultRequeueDuration},
wantErr: false,
wantFunc: func(t *testing.T, c client.Client) {
ds := &appsv1.DaemonSet{}
err := c.Get(context.TODO(), types.NamespacedName{Namespace: resourcesNamespace, Name: dsName}, ds)
assert.NoError(t, err)
assert.True(t, slices.ContainsFunc(ds.Spec.Template.Spec.Tolerations, func(tol corev1.Toleration) bool {
return reflect.DeepEqual(tol, wantTol)
}), "expected injected toleration %+v in %+v", wantTol, ds.Spec.Template.Spec.Tolerations)
},
}
runDDAReconcilerTest(t, tt, ReconcilerOptions{UntaintControllerEnabled: true})
}

func TestReconcileDDA_UntaintController_disabledDoesNotInjectAgentNotReadyToleration(t *testing.T) {
const resourcesName = "foo"
const resourcesNamespace = "bar"
const dsName = "foo-agent"
defaultRequeueDuration := 15 * time.Second

wantTol := untaint.AgentNotReadyEqualToleration()
tt := testCase{
name: "untaint controller disabled does not inject agent-not-ready toleration on node agent DS",
loadFunc: func(c client.Client) *v2alpha1.DatadogAgent {
dda := testutils.NewInitializedDatadogAgentBuilder(resourcesNamespace, resourcesName).
Build()
_ = c.Create(context.TODO(), dda)
return dda
},
want: reconcile.Result{RequeueAfter: defaultRequeueDuration},
wantErr: false,
wantFunc: func(t *testing.T, c client.Client) {
ds := &appsv1.DaemonSet{}
err := c.Get(context.TODO(), types.NamespacedName{Namespace: resourcesNamespace, Name: dsName}, ds)
assert.NoError(t, err)
assert.False(t, slices.ContainsFunc(ds.Spec.Template.Spec.Tolerations, func(tol corev1.Toleration) bool {
return reflect.DeepEqual(tol, wantTol)
}), "did not expect injected toleration %+v in %+v", wantTol, ds.Spec.Template.Spec.Tolerations)
},
}
runDDAReconcilerTest(t, tt, ReconcilerOptions{UntaintControllerEnabled: false})
}

func TestReconcileDatadogAgentV2_Reconcile(t *testing.T) {
const resourcesName = "foo"
const resourcesNamespace = "bar"
Expand Down
1 change: 1 addition & 0 deletions internal/controller/datadogagentinternal/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ type ReconcilerOptions struct {
ExtendedDaemonsetOptions componentagent.ExtendedDaemonsetOptions
SupportCilium bool
OperatorMetricsEnabled bool
UntaintControllerEnabled bool
}

// Reconciler is the internal reconciler for Datadog Agent
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,10 @@ func (r *Reconciler) reconcileV2Agent(ctx context.Context, requiredComponents fe

experimental.ApplyExperimentalOverrides(objLogger, ddai, podManagers)

if r.options.UntaintControllerEnabled {
componentagent.EnsureAgentNotReadyStartupToleration(objLogger, &podManagers.PodTemplateSpec().Spec)
}

if disabledByOverride {
if agentEnabled {
// The override supersedes what's set in requiredComponents; update status to reflect the conflict
Expand Down Expand Up @@ -162,6 +166,10 @@ func (r *Reconciler) reconcileV2Agent(ctx context.Context, requiredComponents fe

experimental.ApplyExperimentalOverrides(objLogger, ddai, podManagers)

if r.options.UntaintControllerEnabled {
componentagent.EnsureAgentNotReadyStartupToleration(objLogger, &podManagers.PodTemplateSpec().Spec)
}

if disabledByOverride {
if agentEnabled {
// The override supersedes what's set in requiredComponents; update status to reflect the conflict
Expand Down
6 changes: 4 additions & 2 deletions internal/controller/setup.go
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,7 @@ func startDatadogAgent(logger logr.Logger, mgr manager.Manager, pInfo kubernetes
OperatorMetricsEnabled: options.OperatorMetricsEnabled,
IntrospectionEnabled: options.IntrospectionEnabled,
DatadogAgentProfileEnabled: options.DatadogAgentProfileEnabled,
UntaintControllerEnabled: options.UntaintControllerEnabled,
DatadogCSIDriverEnabled: options.DatadogCSIDriverEnabled,
CreateControllerRevisions: options.CreateControllerRevisions,
},
Expand Down Expand Up @@ -163,8 +164,9 @@ func startDatadogAgentInternal(logger logr.Logger, mgr manager.Manager, pInfo ku
CanaryAutoFailEnabled: options.SupportExtendedDaemonset.CanaryAutoFailEnabled,
CanaryAutoFailMaxRestarts: int32(options.SupportExtendedDaemonset.CanaryAutoFailMaxRestarts),
},
SupportCilium: options.SupportCilium,
OperatorMetricsEnabled: options.OperatorMetricsEnabled,
SupportCilium: options.SupportCilium,
OperatorMetricsEnabled: options.OperatorMetricsEnabled,
UntaintControllerEnabled: options.UntaintControllerEnabled,
},
}).SetupWithManager(mgr, metricForwardersMgr)
}
Expand Down
18 changes: 5 additions & 13 deletions internal/controller/untaint_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import (
"encoding/json"
"fmt"
"os"
"slices"
"time"

"github.com/go-logr/logr"
Expand All @@ -30,6 +31,7 @@ import (
"github.com/DataDog/datadog-operator/api/datadoghq/common"
"github.com/DataDog/datadog-operator/internal/controller/metrics"
"github.com/DataDog/datadog-operator/pkg/constants"
"github.com/DataDog/datadog-operator/pkg/untaint"
)

// Environment variables consumed by the untaint controller. Reading them here
Expand All @@ -45,11 +47,6 @@ const (
const (
untaintControllerName = "Untaint"

// Fixed taint to remove
agentNotReadyTaintKey = "agent.datadoghq.com/not-ready"
agentNotReadyTaintValue = "presence"
agentNotReadyTaintEffect = corev1.TaintEffectNoSchedule

// untaintPodNodeIndex is a controller-scoped field-indexer key for caching
// pods by their spec.nodeName. Using a namespaced key (rather than the
// bare "spec.nodeName") avoids collisions with any other controller that
Expand Down Expand Up @@ -232,7 +229,7 @@ func (r *UntaintReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct
}
if r.eventsEnabled {
r.recorder.Eventf(node, corev1.EventTypeNormal, "TaintRemoved",
"Removed taint %s from node %s after agent became ready", agentNotReadyTaintKey, node.Name)
"Removed taint %s from node %s after agent became ready", untaint.AgentNotReadyTaintKey, node.Name)
}
return ctrl.Result{}, nil
}
Expand Down Expand Up @@ -351,12 +348,7 @@ func latestPodStartTime(pods []corev1.Pod) (time.Time, bool) {

// hasTaint returns true if the node has the agent-not-ready taint.
func hasTaint(node *corev1.Node) bool {
for _, t := range node.Spec.Taints {
if t.Key == agentNotReadyTaintKey && t.Value == agentNotReadyTaintValue && t.Effect == agentNotReadyTaintEffect {
return true
}
}
return false
return slices.ContainsFunc(node.Spec.Taints, untaint.IsAgentNotReadyTaint)
}

type jsonPatchOp struct {
Expand All @@ -376,7 +368,7 @@ func (r *UntaintReconciler) removeTaint(ctx context.Context, node *corev1.Node)
}
newTaints := make([]corev1.Taint, 0, len(currentTaints))
for _, t := range currentTaints {
if t.Key == agentNotReadyTaintKey && t.Value == agentNotReadyTaintValue && t.Effect == agentNotReadyTaintEffect {
if untaint.IsAgentNotReadyTaint(t) {
continue
}
newTaints = append(newTaints, t)
Expand Down
7 changes: 2 additions & 5 deletions internal/controller/untaint_controller_integration_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ import (

"github.com/DataDog/datadog-operator/api/datadoghq/common"
"github.com/DataDog/datadog-operator/pkg/constants"
"github.com/DataDog/datadog-operator/pkg/untaint"
)

// Suite-level timeouts (see suite_v2_test.go):
Expand All @@ -39,11 +40,7 @@ const (
func makeTaintedNode(ctx context.Context, name string) func() {
node := &corev1.Node{
ObjectMeta: metav1.ObjectMeta{Name: name},
Spec: corev1.NodeSpec{Taints: []corev1.Taint{{
Key: agentNotReadyTaintKey,
Value: agentNotReadyTaintValue,
Effect: agentNotReadyTaintEffect,
}}},
Spec: corev1.NodeSpec{Taints: []corev1.Taint{untaint.AgentNotReadyTaint()}},
}
Expect(k8sClient.Create(ctx, node)).Should(Succeed())
return func() { _ = k8sClient.Delete(ctx, node) }
Expand Down
Loading
Loading