Skip to content

Commit f49eb24

Browse files
authored
Support oomKill, tcpQueueLength features on GKE COS via provider annotation (#3011)
* Drop provider arg from Feature interface methods CONTP-1577
* Fix control plane monitoring feature
* provider capabilities basic framework
* Support providers.eks.ec2.useHostnameFromFile
* Support oomKill, tcpQueueLength features on GKE COS via provider annotation
1 parent d9789bb commit f49eb24

5 files changed

Lines changed: 197 additions & 2 deletions

File tree

internal/controller/datadogagent/controller_v2_test.go

Lines changed: 151 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1380,6 +1380,157 @@ func Test_AutopilotOverrides(t *testing.T) {
13801380
runTestCases(t, tests, runFullReconcilerTest)
13811381
}
13821382

1383+
// Test_COSProviderOverrides verifies that the GKE COS provider strips the
1384+
// `src` HostPath volume (and its system-probe mount) that oomkill and
1385+
// tcpqueuelength would otherwise add — the host has no /usr/src on COS nodes.
1386+
// The provider value flows from the DDA's `datadoghq.com/provider` annotation,
1387+
// or from the DAP's annotation propagated onto the per-profile DDAI.
1388+
func Test_COSProviderOverrides(t *testing.T) {
1389+
const resourcesName, resourcesNamespace = "foo", "bar"
1390+
const defaultDsName = "foo-agent"
1391+
const profileName = "cos-profile"
1392+
const profileDsName = "cos-profile-agent"
1393+
1394+
defaultRequeueDuration := 15 * time.Second
1395+
1396+
cosProfile := &v1alpha1.DatadogAgentProfile{
1397+
ObjectMeta: metav1.ObjectMeta{
1398+
Name: profileName,
1399+
Namespace: resourcesNamespace,
1400+
Annotations: map[string]string{
1401+
kubernetes.ProviderAnnotationKey: kubernetes.GKECosProvider,
1402+
},
1403+
},
1404+
Spec: v1alpha1.DatadogAgentProfileSpec{
1405+
ProfileAffinity: &v1alpha1.ProfileAffinity{
1406+
ProfileNodeAffinity: []corev1.NodeSelectorRequirement{
1407+
{
1408+
Key: "foo",
1409+
Operator: corev1.NodeSelectorOpIn,
1410+
Values: []string{"cos-profile"},
1411+
},
1412+
},
1413+
},
1414+
// Config is required by the DAP webhook validator. We don't need
1415+
// any spec changes — the COS provider is signalled via the
1416+
// metadata.annotations propagated to the DDAI.
1417+
Config: &v2alpha1.DatadogAgentSpec{},
1418+
},
1419+
}
1420+
1421+
// assertVolumes asserts the modules volume is always present (oomkill +
1422+
// tcpqueuelength add it unconditionally) and the src volume is present iff
1423+
// wantSrc is true.
1424+
assertVolumes := func(t *testing.T, c client.Client, ns, name string, wantSrc bool) {
1425+
t.Helper()
1426+
ds := &appsv1.DaemonSet{}
1427+
err := c.Get(context.TODO(), types.NamespacedName{Namespace: ns, Name: name}, ds)
1428+
assert.NoError(t, err, "Failed to get DaemonSet %s/%s", ns, name)
1429+
1430+
var sp *corev1.Container
1431+
for i, ctn := range ds.Spec.Template.Spec.Containers {
1432+
if ctn.Name == string(apicommon.SystemProbeContainerName) {
1433+
sp = &ds.Spec.Template.Spec.Containers[i]
1434+
break
1435+
}
1436+
}
1437+
assert.NotNil(t, sp, "system-probe container not found on DaemonSet %s/%s", ns, name)
1438+
1439+
hasModulesMount, hasSrcMount := false, false
1440+
for _, m := range sp.VolumeMounts {
1441+
if m.Name == common.ModulesVolumeName {
1442+
hasModulesMount = true
1443+
}
1444+
if m.Name == common.SrcVolumeName {
1445+
hasSrcMount = true
1446+
}
1447+
}
1448+
assert.True(t, hasModulesMount, "system-probe modules volume mount missing on %s/%s", ns, name)
1449+
assert.Equal(t, wantSrc, hasSrcMount, "system-probe src volume mount: want=%v got=%v on %s/%s", wantSrc, hasSrcMount, ns, name)
1450+
1451+
hasModulesVol, hasSrcVol := false, false
1452+
for _, v := range ds.Spec.Template.Spec.Volumes {
1453+
if v.Name == common.ModulesVolumeName {
1454+
hasModulesVol = true
1455+
}
1456+
if v.Name == common.SrcVolumeName {
1457+
hasSrcVol = true
1458+
}
1459+
}
1460+
assert.True(t, hasModulesVol, "pod-level modules volume missing on %s/%s", ns, name)
1461+
assert.Equal(t, wantSrc, hasSrcVol, "pod-level src volume: want=%v got=%v on %s/%s", wantSrc, hasSrcVol, ns, name)
1462+
}
1463+
1464+
// buildDDA returns a DDA with oomkill + tcpqueuelength enabled. Caller
1465+
// may layer annotations via opts.
1466+
buildDDA := func(annotations map[string]string) *v2alpha1.DatadogAgent {
1467+
b := testutils.NewInitializedDatadogAgentBuilder(resourcesNamespace, resourcesName).
1468+
WithOOMKillEnabled(true)
1469+
if len(annotations) > 0 {
1470+
b = b.WithAnnotations(annotations)
1471+
}
1472+
dda := b.Build()
1473+
dda.Spec.Features.TCPQueueLength = &v2alpha1.TCPQueueLengthFeatureConfig{
1474+
Enabled: ptr.To(true),
1475+
}
1476+
return dda
1477+
}
1478+
1479+
tests := []testCase{
1480+
{
1481+
name: "[cos] baseline DDA no annotation: src volume present on default DS",
1482+
loadFunc: func(c client.Client) *v2alpha1.DatadogAgent {
1483+
dda := buildDDA(nil)
1484+
_ = c.Create(context.TODO(), dda)
1485+
return dda
1486+
},
1487+
want: reconcile.Result{RequeueAfter: defaultRequeueDuration},
1488+
wantErr: false,
1489+
wantFunc: func(t *testing.T, c client.Client) {
1490+
assertVolumes(t, c, resourcesNamespace, defaultDsName, true)
1491+
},
1492+
},
1493+
{
1494+
name: "[cos] DDA with gke-cos annotation strips src volume on default DS",
1495+
loadFunc: func(c client.Client) *v2alpha1.DatadogAgent {
1496+
dda := buildDDA(map[string]string{
1497+
kubernetes.ProviderAnnotationKey: kubernetes.GKECosProvider,
1498+
})
1499+
_ = c.Create(context.TODO(), dda)
1500+
return dda
1501+
},
1502+
want: reconcile.Result{RequeueAfter: defaultRequeueDuration},
1503+
wantErr: false,
1504+
wantFunc: func(t *testing.T, c client.Client) {
1505+
assertVolumes(t, c, resourcesNamespace, defaultDsName, false)
1506+
},
1507+
},
1508+
{
1509+
name: "[cos] DDA without annotation, DAP with gke-cos strips src on profile DS only",
1510+
clientBuilder: fake.NewClientBuilder().
1511+
WithStatusSubresource(&v2alpha1.DatadogAgent{}, &v1alpha1.DatadogAgentProfile{}, &v1alpha1.DatadogAgentInternal{}).
1512+
WithObjects(cosProfile),
1513+
loadFunc: func(c client.Client) *v2alpha1.DatadogAgent {
1514+
dda := buildDDA(nil)
1515+
_ = c.Create(context.TODO(), dda)
1516+
return dda
1517+
},
1518+
profile: cosProfile,
1519+
profilesEnabled: true,
1520+
want: reconcile.Result{RequeueAfter: defaultRequeueDuration},
1521+
wantErr: false,
1522+
wantFunc: func(t *testing.T, c client.Client) {
1523+
// Profile DDAI inherited the DAP's COS annotation → src stripped.
1524+
assertVolumes(t, c, resourcesNamespace, profileDsName, false)
1525+
// Default DDAI has no provider annotation → src present.
1526+
assertVolumes(t, c, resourcesNamespace, defaultDsName, true)
1527+
},
1528+
},
1529+
}
1530+
1531+
runTestCases(t, tests, runFullReconcilerTest)
1532+
}
1533+
13831534
func verifyDaemonsetContainers(t *testing.T, c client.Client, resourcesNamespace, dsName string, expectedContainers []string) {
13841535
ds := &appsv1.DaemonSet{}
13851536
err := c.Get(context.TODO(), types.NamespacedName{Namespace: resourcesNamespace, Name: dsName}, ds)

internal/controller/datadogagent/feature/oomkill/feature.go

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@ import (
1717
"github.com/DataDog/datadog-operator/internal/controller/datadogagent/component/agent"
1818
"github.com/DataDog/datadog-operator/internal/controller/datadogagent/feature"
1919
"github.com/DataDog/datadog-operator/internal/controller/datadogagent/object/volume"
20+
"github.com/DataDog/datadog-operator/internal/controller/datadogagent/providercaps"
21+
"github.com/DataDog/datadog-operator/pkg/kubernetes"
2022
)
2123

2224
func init() {
@@ -34,6 +36,17 @@ func buildOOMKillFeature(options *feature.Options) feature.Feature {
3436

3537
type oomKillFeature struct{}
3638

39+
// NodeAgentProviderCapabilities returns provider-conditional pod-template
40+
// mutations for the node agent. On GKE COS, /usr/src does not exist on host
41+
// nodes; strip the src volume + mounts so the pod schedules successfully.
42+
func (f *oomKillFeature) NodeAgentProviderCapabilities() providercaps.NodeAgentProviderCapabilities {
43+
return providercaps.NodeAgentProviderCapabilities{
44+
kubernetes.GKECosProvider: {
45+
RemoveVolumes: []string{common.SrcVolumeName},
46+
},
47+
}
48+
}
49+
3750
// ID returns the ID of the Feature
3851
func (f *oomKillFeature) ID() feature.IDType {
3952
return feature.OOMKillIDType
@@ -81,7 +94,8 @@ func (f *oomKillFeature) ManageNodeAgent(managers feature.PodTemplateManagers) e
8194
managers.VolumeMount().AddVolumeMountToContainer(&modulesVolMount, apicommon.SystemProbeContainerName)
8295
managers.Volume().AddVolume(&modulesVol)
8396

84-
// src volume mount
97+
// src volume mount — stripped on GKE COS by NodeAgentProviderCapabilities
98+
// (host nodes have no /usr/src).
8599
srcVol, srcVolMount := volume.GetVolumes(common.SrcVolumeName, common.SrcVolumePath, common.SrcVolumePath, true)
86100
managers.VolumeMount().AddVolumeMountToContainer(&srcVolMount, apicommon.SystemProbeContainerName)
87101
managers.Volume().AddVolume(&srcVol)

internal/controller/datadogagent/feature/tcpqueuelength/feature.go

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@ import (
1717
"github.com/DataDog/datadog-operator/internal/controller/datadogagent/component/agent"
1818
"github.com/DataDog/datadog-operator/internal/controller/datadogagent/feature"
1919
"github.com/DataDog/datadog-operator/internal/controller/datadogagent/object/volume"
20+
"github.com/DataDog/datadog-operator/internal/controller/datadogagent/providercaps"
21+
"github.com/DataDog/datadog-operator/pkg/kubernetes"
2022
)
2123

2224
func init() {
@@ -34,6 +36,17 @@ func buildTCPQueueLengthFeature(options *feature.Options) feature.Feature {
3436

3537
type tcpQueueLengthFeature struct{}
3638

39+
// NodeAgentProviderCapabilities returns provider-conditional pod-template
40+
// mutations for the node agent. On GKE COS, /usr/src does not exist on host
41+
// nodes; strip the src volume + mounts so the pod schedules successfully.
42+
func (f *tcpQueueLengthFeature) NodeAgentProviderCapabilities() providercaps.NodeAgentProviderCapabilities {
43+
return providercaps.NodeAgentProviderCapabilities{
44+
kubernetes.GKECosProvider: {
45+
RemoveVolumes: []string{common.SrcVolumeName},
46+
},
47+
}
48+
}
49+
3750
// ID returns the ID of the Feature
3851
func (f *tcpQueueLengthFeature) ID() feature.IDType {
3952
return feature.TCPQueueLengthIDType
@@ -84,7 +97,8 @@ func (f *tcpQueueLengthFeature) ManageNodeAgent(managers feature.PodTemplateMana
8497
managers.VolumeMount().AddVolumeMountToContainer(&modulesVolMount, apicommon.SystemProbeContainerName)
8598
managers.Volume().AddVolume(&modulesVol)
8699

87-
// src volume mount
100+
// src volume mount — stripped on GKE COS by NodeAgentProviderCapabilities
101+
// (host nodes have no /usr/src).
88102
srcVol, srcVolMount := volume.GetVolumes(common.SrcVolumeName, common.SrcVolumePath, common.SrcVolumePath, true)
89103
managers.VolumeMount().AddVolumeMountToContainer(&srcVolMount, apicommon.SystemProbeContainerName)
90104
managers.Volume().AddVolume(&srcVol)

internal/controller/datadogagent/profile.go

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ import (
2525
"github.com/DataDog/datadog-operator/pkg/agentprofile"
2626
"github.com/DataDog/datadog-operator/pkg/constants"
2727
"github.com/DataDog/datadog-operator/pkg/controller/utils/comparison"
28+
"github.com/DataDog/datadog-operator/pkg/kubernetes"
2829
)
2930

3031
func sendProfileEnabledMetric(enabled bool) {
@@ -263,6 +264,16 @@ func setProfileDDAIMeta(ddai *v1alpha1.DatadogAgentInternal, profile *v1alpha1.D
263264
}
264265
ddai.Labels[constants.ProfileLabelKey] = profile.Name
265266
}
267+
// Propagate the provider annotation from the profile onto the DDAI so a
268+
// DAP can declare a provider that differs from the DDA (e.g. a GKE COS
269+
// node pool selected by the profile). The profile value overrides the
270+
// DDA-inherited value when set.
271+
if v, ok := profile.GetAnnotations()[kubernetes.ProviderAnnotationKey]; ok {
272+
if ddai.Annotations == nil {
273+
ddai.Annotations = make(map[string]string)
274+
}
275+
ddai.Annotations[kubernetes.ProviderAnnotationKey] = v
276+
}
266277
return nil
267278
}
268279

pkg/kubernetes/provider.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,11 @@ const (
3232
// GKECloudProvider GKE CloudProvider name
3333
GKECloudProvider = "gke"
3434

35+
// GKECosProvider is the full provider string for GKE on Container-Optimized OS
36+
// nodes (matches the `{cloudProvider}-{value}` convention from
37+
// generateValidProviderName). Used as a NodeAgentProviderCapabilities map key.
38+
GKECosProvider = "gke-cos"
39+
3540
// GKEProviderLabel is the GKE node label used to determine the node's provider
3641
GKEProviderLabel = "cloud.google.com/gke-os-distribution"
3742

0 commit comments

Comments
 (0)