Skip to content

Commit 6faa578

Browse files
chmill-zzkwaksaewon
authored and committed
feat: node-exporter into vhd build (#7704)
1 parent 7c27026 commit 6faa578

97 files changed

Lines changed: 921 additions & 201 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.github/renovate.json

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -215,6 +215,17 @@
215215
"nilo19"
216216
]
217217
},
218+
{
219+
"matchPackageNames": [
220+
"node-exporter-kubernetes"
221+
],
222+
"matchUpdateTypes": [
223+
"patch"
224+
],
225+
"automerge": false,
226+
"enabled": true,
227+
"groupName": "node-exporter-kubernetes"
228+
},
218229
{
219230
"matchPackageNames": [
220231
"containernetworking/azure-cni"

e2e/validation.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,7 @@ func ValidateCommonLinux(ctx context.Context, s *Scenario) {
7878
ValidateRxBufferDefault(ctx, s)
7979
ValidateKernelLogs(ctx, s)
8080
ValidateScriptlessCSECmd(ctx, s)
81+
ValidateNodeExporter(ctx, s)
8182

8283
ValidateSysctlConfig(ctx, s, map[string]string{
8384
"net.ipv4.tcp_retries2": "8",

e2e/validators.go

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1731,6 +1731,67 @@ func ValidateNodeProblemDetector(ctx context.Context, s *Scenario) {
17311731
execScriptOnVMForScenarioValidateExitCode(ctx, s, strings.Join(command, "\n"), 0, "Node Problem Detector (NPD) service validation failed")
17321732
}
17331733

1734+
func ValidateNodeExporter(ctx context.Context, s *Scenario) {
1735+
s.T.Helper()
1736+
1737+
skipFile := "/etc/node-exporter.d/skip_vhd_node_exporter"
1738+
serviceName := "node-exporter.service"
1739+
1740+
// Check if node-exporter is installed on this VHD by looking for the skip sentinel file.
1741+
// The skip file is only present on VHDs that have node-exporter installed (Ubuntu, Mariner, Azure Linux).
1742+
// Flatcar, OSGuard, and older VHDs do not have node-exporter installed and will not have the skip file.
1743+
if !fileExist(ctx, s, skipFile) {
1744+
s.T.Logf("Skipping node-exporter validation: sentinel file %s not found (VHD does not have node-exporter installed)", skipFile)
1745+
return
1746+
}
1747+
1748+
s.T.Logf("skip_vhd_node_exporter sentinel file found, validating node-exporter installation")
1749+
1750+
// Validate service is running
1751+
ValidateSystemdUnitIsRunning(ctx, s, serviceName)
1752+
ValidateSystemdUnitIsNotFailed(ctx, s, serviceName)
1753+
1754+
// Validate service is enabled
1755+
execScriptOnVMForScenarioValidateExitCode(ctx, s, fmt.Sprintf("systemctl is-enabled %s", serviceName), 0, fmt.Sprintf("%s should be enabled", serviceName))
1756+
1757+
// Validate binary exists and is executable
1758+
// The binary is installed at /usr/bin and symlinked to /opt/bin for consistency with other binaries (kubelet, etc.)
1759+
ValidateFileExists(ctx, s, "/usr/bin/node-exporter")
1760+
ValidateFileExists(ctx, s, "/opt/bin/node-exporter")
1761+
ValidateFileExists(ctx, s, "/opt/bin/node-exporter-startup.sh")
1762+
1763+
// Validate configuration files exist
1764+
ValidateFileExists(ctx, s, skipFile)
1765+
ValidateFileExists(ctx, s, "/etc/node-exporter.d/web-config.yml")
1766+
1767+
// Validate that node-exporter is listening on port 19100
1768+
// We verify the port is open using ss/netstat rather than making a full mTLS request,
1769+
// since the e2e test environment may not have the correct client certs set up.
1770+
// The mTLS configuration is validated by checking that the web-config.yml exists
1771+
// and contains the expected TLS settings.
1772+
s.T.Logf("Validating node-exporter is listening on port 19100")
1773+
command := []string{
1774+
"set -ex",
1775+
"NODE_IP=$(hostname -I | awk '{print $1}')",
1776+
// Verify node-exporter is listening on port 19100
1777+
"ss -tlnp | grep -q ':19100' || netstat -tlnp | grep -q ':19100'",
1778+
}
1779+
execScriptOnVMForScenarioValidateExitCode(ctx, s, strings.Join(command, "\n"), 0, "node-exporter should be listening on port 19100")
1780+
1781+
// Verify the web-config.yml has proper TLS configuration
1782+
s.T.Logf("Validating node-exporter TLS configuration")
1783+
tlsCommand := []string{
1784+
"set -ex",
1785+
// Verify web-config.yml contains TLS settings
1786+
"grep -q 'tls_server_config' /etc/node-exporter.d/web-config.yml",
1787+
"grep -q 'client_auth_type' /etc/node-exporter.d/web-config.yml",
1788+
"grep -q 'client_ca_file' /etc/node-exporter.d/web-config.yml",
1789+
}
1790+
execScriptOnVMForScenarioValidateExitCode(ctx, s, strings.Join(tlsCommand, "\n"), 0, "node-exporter TLS config should be properly configured")
1791+
1792+
s.T.Logf("node-exporter validation passed")
1793+
}
1794+
17341795
func ValidateNPDFilesystemCorruption(ctx context.Context, s *Scenario) {
17351796
command := []string{
17361797
"set -ex",

parts/common/components.json

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2045,6 +2045,61 @@
20452045
}
20462046
}
20472047
}
2048+
},
2049+
{
2050+
"name": "node-exporter",
2051+
"downloadLocation": "/opt/node-exporter",
2052+
"downloadURIs": {
2053+
"ubuntu": {
2054+
"r2404": {
2055+
"versionsV2": [
2056+
{
2057+
"renovateTag": "name=node-exporter-kubernetes, repository=production, os=ubuntu, release=24.04",
2058+
"latestVersion": "1.9.1-ubuntu24.04u12"
2059+
}
2060+
]
2061+
},
2062+
"r2204": {
2063+
"versionsV2": [
2064+
{
2065+
"renovateTag": "name=node-exporter-kubernetes, repository=production, os=ubuntu, release=22.04",
2066+
"latestVersion": "1.9.1-ubuntu22.04u12"
2067+
}
2068+
]
2069+
},
2070+
"r2004": {
2071+
"versionsV2": [
2072+
{
2073+
"renovateTag": "name=node-exporter-kubernetes, repository=production, os=ubuntu, release=20.04",
2074+
"latestVersion": "1.9.1-ubuntu20.04u12"
2075+
}
2076+
]
2077+
}
2078+
},
2079+
"azurelinux": {
2080+
"v3.0": {
2081+
"versionsV2": [
2082+
{
2083+
"renovateTag": "RPM_registry=https://packages.microsoft.com/azurelinux/3.0/prod/cloud-native/x86_64/repodata, name=node-exporter-kubernetes, os=azurelinux, release=3.0",
2084+
"latestVersion": "1.9.1-12.azl3"
2085+
}
2086+
]
2087+
},
2088+
"OSGUARD/v3.0": {
2089+
"versionsV2": []
2090+
}
2091+
},
2092+
"mariner": {
2093+
"current": {
2094+
"versionsV2": []
2095+
}
2096+
},
2097+
"flatcar": {
2098+
"current": {
2099+
"versionsV2": []
2100+
}
2101+
}
2102+
}
20482103
}
20492104
],
20502105
"OCIArtifacts": [

parts/linux/cloud-init/artifacts/cse_config.sh

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -807,6 +807,25 @@ EOF
807807
systemctlEnableAndStart mig-partition 300
808808
}
809809

810+
# configureNodeExporter enables and starts the node-exporter service and the
# node-exporter-restart.path unit.
# The work is skipped (return 0) when the skip_vhd_node_exporter sentinel file
# is missing. Returns ERR_NODE_EXPORTER_START_FAIL when either unit cannot be
# enabled and started.
configureNodeExporter() {
    echo "Configuring Node Exporter"

    # The sentinel file marks VHDs that carry node-exporter; without it there is nothing to configure.
    [ -f /etc/node-exporter.d/skip_vhd_node_exporter ] || {
        echo "Node Exporter assets not found on this VHD (missing /etc/node-exporter.d/skip_vhd_node_exporter); skipping configuration."
        return 0
    }

    # Start the main service first, then the path unit; bail out on the first failure.
    systemctlEnableAndStart node-exporter 30 || {
        echo "Failed to start node-exporter service"
        return $ERR_NODE_EXPORTER_START_FAIL
    }
    systemctlEnableAndStart node-exporter-restart.path 30 || {
        echo "Failed to start node-exporter-restart.path"
        return $ERR_NODE_EXPORTER_START_FAIL
    }

    echo "Node Exporter started successfully"
}
828+
810829
ensureSysctl() {
811830
SYSCTL_CONFIG_FILE=/etc/sysctl.d/999-sysctl-aks.conf
812831
mkdir -p "$(dirname "${SYSCTL_CONFIG_FILE}")"

parts/linux/cloud-init/artifacts/cse_helpers.sh

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,8 @@ ERR_ENABLE_MANAGED_GPU_EXPERIENCE=123 # Error configuring managed GPU experience
7878
# Error code 124 is returned when a `timeout` command times out, and --preserve-status is not specified: https://man7.org/linux/man-pages/man1/timeout.1.html
7979
ERR_VHD_BUILD_ERROR=125 # Reserved for VHD CI exit conditions
8080

81+
ERR_NODE_EXPORTER_START_FAIL=128 # Error starting or enabling node-exporter service
82+
8183
ERR_SWAP_CREATE_FAIL=130 # Error allocating swap file
8284
ERR_SWAP_CREATE_INSUFFICIENT_DISK_SPACE=131 # Error insufficient disk space for swap file creation
8385

parts/linux/cloud-init/artifacts/cse_main.sh

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -498,6 +498,8 @@ function nodePrep {
498498

499499
logs_to_events "AKS.CSE.ensureKubelet" ensureKubelet
500500

501+
logs_to_events "AKS.CSE.configureNodeExporter" configureNodeExporter
502+
501503
if $REBOOTREQUIRED; then
502504
echo 'reboot required, rebooting node in 1 minute'
503505
/bin/bash -c "shutdown -r 1 &"
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
tls_server_config:
2+
cert_file: "/etc/kubernetes/certs/kubeletserver.crt"
3+
key_file: "/etc/kubernetes/certs/kubeletserver.key"
4+
client_auth_type: "RequireAndVerifyClientCert"
5+
client_ca_file: "/etc/kubernetes/certs/ca.crt"
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
[Path]
2+
# Watch server cert paths - one will exist depending on whether kubelet serving cert rotation is enabled
3+
# Rotation enabled: kubelet-server-current.pem (symlink updated on rotation)
4+
# Rotation disabled: kubeletserver.crt (static cert)
5+
PathModified=/var/lib/kubelet/pki/kubelet-server-current.pem
6+
PathModified=/etc/kubernetes/certs/kubeletserver.crt
7+
8+
[Install]
9+
WantedBy=multi-user.target
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
[Unit]
2+
Description=Restart node-exporter when certificates change
3+
4+
[Service]
5+
Type=oneshot
6+
ExecStart=/bin/systemctl restart node-exporter.service

0 commit comments

Comments
 (0)