Skip to content

Commit 05d8537

Browse files
authored
feat: node-exporter into vhd build (#7704)
1 parent e9c1a9c commit 05d8537

97 files changed

Lines changed: 921 additions & 201 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.github/renovate.json

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -215,6 +215,17 @@
215215
"nilo19"
216216
]
217217
},
218+
{
219+
"matchPackageNames": [
220+
"node-exporter-kubernetes"
221+
],
222+
"matchUpdateTypes": [
223+
"patch"
224+
],
225+
"automerge": false,
226+
"enabled": true,
227+
"groupName": "node-exporter-kubernetes"
228+
},
218229
{
219230
"matchPackageNames": [
220231
"containernetworking/azure-cni"

e2e/validation.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ func ValidateCommonLinux(ctx context.Context, s *Scenario) {
4646
ValidateRxBufferDefault(ctx, s)
4747
ValidateKernelLogs(ctx, s)
4848
ValidateScriptlessCSECmd(ctx, s)
49+
ValidateNodeExporter(ctx, s)
4950

5051
ValidateSysctlConfig(ctx, s, map[string]string{
5152
"net.ipv4.tcp_retries2": "8",

e2e/validators.go

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1405,6 +1405,67 @@ func ValidateNodeProblemDetector(ctx context.Context, s *Scenario) {
14051405
execScriptOnVMForScenarioValidateExitCode(ctx, s, strings.Join(command, "\n"), 0, "Node Problem Detector (NPD) service validation failed")
14061406
}
14071407

1408+
func ValidateNodeExporter(ctx context.Context, s *Scenario) {
1409+
s.T.Helper()
1410+
1411+
skipFile := "/etc/node-exporter.d/skip_vhd_node_exporter"
1412+
serviceName := "node-exporter.service"
1413+
1414+
// Check if node-exporter is installed on this VHD by looking for the skip sentinel file.
1415+
// The skip file is only present on VHDs that have node-exporter installed (Ubuntu, Mariner, Azure Linux).
1416+
// Flatcar, OSGuard, and older VHDs do not have node-exporter installed and will not have the skip file.
1417+
if !fileExist(ctx, s, skipFile) {
1418+
s.T.Logf("Skipping node-exporter validation: sentinel file %s not found (VHD does not have node-exporter installed)", skipFile)
1419+
return
1420+
}
1421+
1422+
s.T.Logf("skip_vhd_node_exporter sentinel file found, validating node-exporter installation")
1423+
1424+
// Validate service is running
1425+
ValidateSystemdUnitIsRunning(ctx, s, serviceName)
1426+
ValidateSystemdUnitIsNotFailed(ctx, s, serviceName)
1427+
1428+
// Validate service is enabled
1429+
execScriptOnVMForScenarioValidateExitCode(ctx, s, fmt.Sprintf("systemctl is-enabled %s", serviceName), 0, fmt.Sprintf("%s should be enabled", serviceName))
1430+
1431+
// Validate binary exists and is executable
1432+
// The binary is installed at /usr/bin and symlinked to /opt/bin for consistency with other binaries (kubelet, etc.)
1433+
ValidateFileExists(ctx, s, "/usr/bin/node-exporter")
1434+
ValidateFileExists(ctx, s, "/opt/bin/node-exporter")
1435+
ValidateFileExists(ctx, s, "/opt/bin/node-exporter-startup.sh")
1436+
1437+
// Validate configuration files exist
1438+
ValidateFileExists(ctx, s, skipFile)
1439+
ValidateFileExists(ctx, s, "/etc/node-exporter.d/web-config.yml")
1440+
1441+
// Validate that node-exporter is listening on port 19100
1442+
// We verify the port is open using ss/netstat rather than making a full mTLS request,
1443+
// since the e2e test environment may not have the correct client certs set up.
1444+
// The mTLS configuration is validated by checking that the web-config.yml exists
1445+
// and contains the expected TLS settings.
1446+
s.T.Logf("Validating node-exporter is listening on port 19100")
1447+
command := []string{
1448+
"set -ex",
1449+
"NODE_IP=$(hostname -I | awk '{print $1}')",
1450+
// Verify node-exporter is listening on port 19100
1451+
"ss -tlnp | grep -q ':19100' || netstat -tlnp | grep -q ':19100'",
1452+
}
1453+
execScriptOnVMForScenarioValidateExitCode(ctx, s, strings.Join(command, "\n"), 0, "node-exporter should be listening on port 19100")
1454+
1455+
// Verify the web-config.yml has proper TLS configuration
1456+
s.T.Logf("Validating node-exporter TLS configuration")
1457+
tlsCommand := []string{
1458+
"set -ex",
1459+
// Verify web-config.yml contains TLS settings
1460+
"grep -q 'tls_server_config' /etc/node-exporter.d/web-config.yml",
1461+
"grep -q 'client_auth_type' /etc/node-exporter.d/web-config.yml",
1462+
"grep -q 'client_ca_file' /etc/node-exporter.d/web-config.yml",
1463+
}
1464+
execScriptOnVMForScenarioValidateExitCode(ctx, s, strings.Join(tlsCommand, "\n"), 0, "node-exporter TLS config should be properly configured")
1465+
1466+
s.T.Logf("node-exporter validation passed")
1467+
}
1468+
14081469
func ValidateNPDFilesystemCorruption(ctx context.Context, s *Scenario) {
14091470
command := []string{
14101471
"set -ex",

parts/common/components.json

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2045,6 +2045,61 @@
20452045
}
20462046
}
20472047
}
2048+
},
2049+
{
2050+
"name": "node-exporter",
2051+
"downloadLocation": "/opt/node-exporter",
2052+
"downloadURIs": {
2053+
"ubuntu": {
2054+
"r2404": {
2055+
"versionsV2": [
2056+
{
2057+
"renovateTag": "name=node-exporter-kubernetes, repository=production, os=ubuntu, release=24.04",
2058+
"latestVersion": "1.9.1-ubuntu24.04u12"
2059+
}
2060+
]
2061+
},
2062+
"r2204": {
2063+
"versionsV2": [
2064+
{
2065+
"renovateTag": "name=node-exporter-kubernetes, repository=production, os=ubuntu, release=22.04",
2066+
"latestVersion": "1.9.1-ubuntu22.04u12"
2067+
}
2068+
]
2069+
},
2070+
"r2004": {
2071+
"versionsV2": [
2072+
{
2073+
"renovateTag": "name=node-exporter-kubernetes, repository=production, os=ubuntu, release=20.04",
2074+
"latestVersion": "1.9.1-ubuntu20.04u12"
2075+
}
2076+
]
2077+
}
2078+
},
2079+
"azurelinux": {
2080+
"v3.0": {
2081+
"versionsV2": [
2082+
{
2083+
"renovateTag": "RPM_registry=https://packages.microsoft.com/azurelinux/3.0/prod/cloud-native/x86_64/repodata, name=node-exporter-kubernetes, os=azurelinux, release=3.0",
2084+
"latestVersion": "1.9.1-12.azl3"
2085+
}
2086+
]
2087+
},
2088+
"OSGUARD/v3.0": {
2089+
"versionsV2": []
2090+
}
2091+
},
2092+
"mariner": {
2093+
"current": {
2094+
"versionsV2": []
2095+
}
2096+
},
2097+
"flatcar": {
2098+
"current": {
2099+
"versionsV2": []
2100+
}
2101+
}
2102+
}
20482103
}
20492104
],
20502105
"OCIArtifacts": [

parts/linux/cloud-init/artifacts/cse_config.sh

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -807,6 +807,25 @@ EOF
807807
systemctlEnableAndStart mig-partition 300
808808
}
809809

810+
# configureNodeExporter enables and starts the node-exporter service and the
# node-exporter-restart.path unit.
# It is a no-op (returns 0) when the skip_vhd_node_exporter sentinel file is
# missing, which is how this script detects that node-exporter was not
# installed on the VHD.
# Returns ERR_NODE_EXPORTER_START_FAIL if either unit fails to enable/start.
configureNodeExporter() {
    echo "Configuring Node Exporter"

    # Guard: without the sentinel file there is nothing to configure.
    [ -f /etc/node-exporter.d/skip_vhd_node_exporter ] || {
        echo "Node Exporter assets not found on this VHD (missing /etc/node-exporter.d/skip_vhd_node_exporter); skipping configuration."
        return 0
    }

    # Start the exporter itself first.
    systemctlEnableAndStart node-exporter 30 || {
        echo "Failed to start node-exporter service"
        return $ERR_NODE_EXPORTER_START_FAIL
    }

    # Then start the path unit that accompanies it.
    systemctlEnableAndStart node-exporter-restart.path 30 || {
        echo "Failed to start node-exporter-restart.path"
        return $ERR_NODE_EXPORTER_START_FAIL
    }

    echo "Node Exporter started successfully"
}
828+
810829
ensureSysctl() {
811830
SYSCTL_CONFIG_FILE=/etc/sysctl.d/999-sysctl-aks.conf
812831
mkdir -p "$(dirname "${SYSCTL_CONFIG_FILE}")"

parts/linux/cloud-init/artifacts/cse_helpers.sh

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,8 @@ ERR_ENABLE_MANAGED_GPU_EXPERIENCE=123 # Error configuring managed GPU experience
7878
# Error code 124 is returned when a `timeout` command times out, and --preserve-status is not specified: https://man7.org/linux/man-pages/man1/timeout.1.html
7979
ERR_VHD_BUILD_ERROR=125 # Reserved for VHD CI exit conditions
8080

81+
ERR_NODE_EXPORTER_START_FAIL=128 # Error starting or enabling node-exporter service
82+
8183
ERR_SWAP_CREATE_FAIL=130 # Error allocating swap file
8284
ERR_SWAP_CREATE_INSUFFICIENT_DISK_SPACE=131 # Error insufficient disk space for swap file creation
8385

parts/linux/cloud-init/artifacts/cse_main.sh

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -473,6 +473,8 @@ function nodePrep {
473473

474474
logs_to_events "AKS.CSE.ensureKubelet" ensureKubelet
475475

476+
logs_to_events "AKS.CSE.configureNodeExporter" configureNodeExporter
477+
476478
if $REBOOTREQUIRED; then
477479
echo 'reboot required, rebooting node in 1 minute'
478480
/bin/bash -c "shutdown -r 1 &"
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
tls_server_config:
2+
cert_file: "/etc/kubernetes/certs/kubeletserver.crt"
3+
key_file: "/etc/kubernetes/certs/kubeletserver.key"
4+
client_auth_type: "RequireAndVerifyClientCert"
5+
client_ca_file: "/etc/kubernetes/certs/ca.crt"
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
[Path]
2+
# Watch server cert paths - one will exist depending on whether kubelet serving cert rotation is enabled
3+
# Rotation enabled: kubelet-server-current.pem (symlink updated on rotation)
4+
# Rotation disabled: kubeletserver.crt (static cert)
5+
PathModified=/var/lib/kubelet/pki/kubelet-server-current.pem
6+
PathModified=/etc/kubernetes/certs/kubeletserver.crt
7+
8+
[Install]
9+
WantedBy=multi-user.target
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
[Unit]
2+
Description=Restart node-exporter when certificates change
3+
4+
[Service]
5+
Type=oneshot
6+
ExecStart=/bin/systemctl restart node-exporter.service

0 commit comments

Comments
 (0)