Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions parts/linux/cloud-init/artifacts/cse_helpers.sh
Original file line number Diff line number Diff line change
Expand Up @@ -1124,6 +1124,33 @@ removeKubeletNodeLabel() {
fi
}

# For Harvest VMs (and potentially other oversubscribed SKUs), the sku-cpu label set by the RP
# reflects the nominal vCPU count from the CRP SKU API, but the guest OS may see more cores.
# This function compares the label value against nproc output and rewrites the label to the
# core count visible to the OS, which is what kubelet reports as capacity.cpu.
#
# Globals:
#   KUBELET_NODE_LABELS (read, written only when a correction is made) - comma-separated
#   list of key=value node labels.
# Outputs:
#   A warning on stdout when nproc yields no usable core count, or a notice on stdout for
#   each corrected label entry.
# Returns:
#   Always 0; a failed detection is treated as "nothing to fix".
fixSkuCpuLabel() {
    local sku_cpu_key="kubernetes.azure.com/sku-cpu"

    local actual_cores
    actual_cores=$(nproc 2>/dev/null) || actual_cores=""
    # Only accept a positive decimal integer. An arithmetic test such as `[ "$x" -eq 0 ]`
    # errors out on non-numeric text, which would let garbage through into the label.
    if ! [[ "$actual_cores" =~ ^[1-9][0-9]*$ ]]; then
        echo "WARNING: could not detect actual CPU cores via nproc, skipping sku-cpu fix"
        return 0
    fi

    # Walk the list one entry at a time so only an entry whose key is exactly
    # kubernetes.azure.com/sku-cpu is touched. Look-alike keys (for example
    # x-kubernetes.azure.com/sku-cpu) and values that merely start with the old value
    # are left alone, and the order and separators of the list are preserved.
    local remaining="${KUBELET_NODE_LABELS:-}"
    local rebuilt=""
    local label=""
    local current_value=""
    local changed="false"
    while true; do
        label="${remaining%%,*}"
        if [[ "$label" == "${sku_cpu_key}="* ]]; then
            current_value="${label#"${sku_cpu_key}="}"
            # An empty value is treated like a missing label: nothing to fix.
            if [ -n "$current_value" ] && [ "$current_value" != "$actual_cores" ]; then
                echo "Correcting sku-cpu label from ${current_value} to ${actual_cores} (actual cores from nproc)"
                label="${sku_cpu_key}=${actual_cores}"
                changed="true"
            fi
        fi
        rebuilt+="$label"
        # Stop after the last entry; otherwise re-emit the separator and advance.
        [[ "$remaining" == *,* ]] || break
        rebuilt+=","
        remaining="${remaining#*,}"
    done

    # Leave the variable untouched (even if unset) when no entry needed correcting.
    if [ "$changed" = "true" ]; then
        KUBELET_NODE_LABELS="$rebuilt"
    fi
    return 0
}

# generate kubenode binary registry url from acs-mirror url
updateKubeBinaryRegistryURL() {
# if rp already passes registry url, then directly use the registry url that rp passes
Expand Down
4 changes: 4 additions & 0 deletions parts/linux/cloud-init/artifacts/cse_main.sh
Original file line number Diff line number Diff line change
Expand Up @@ -555,6 +555,10 @@ function nodePrep {
addKubeletNodeLabel "kubernetes.azure.com/localdns-exporter=enabled"
fi

# For Harvest VMs (and potentially other oversubscribed SKUs), the RP-provided sku-cpu label
# reflects the nominal SKU vCPU count, but the guest OS may see more cores. If the label is
# present and differs from nproc output, correct it to match.
fixSkuCpuLabel

logs_to_events "AKS.CSE.ensureKubelet" ensureKubelet

# Configure localdns metrics exporter socket after ensureKubelet.
Expand Down
24 changes: 24 additions & 0 deletions spec/parts/linux/cloud-init/artifacts/cse_helpers_spec.sh
Original file line number Diff line number Diff line change
Expand Up @@ -234,6 +234,30 @@ Describe 'cse_helpers.sh'
End
End

Describe 'fixSkuCpuLabel'
    # Default mock for the examples below: the guest OS reports 16 cores.
    # Individual examples override nproc when they need a different answer.
    nproc() { echo "16"; }

    It 'should correct sku-cpu when nproc reports more cores than the label'
        KUBELET_NODE_LABELS="agentpool=harvest,kubernetes.azure.com/sku-cpu=8,kubernetes.azure.com/agentpool=harvest"
        When call fixSkuCpuLabel
        The stdout should include 'Correcting sku-cpu label from 8 to 16'
        The variable KUBELET_NODE_LABELS should equal 'agentpool=harvest,kubernetes.azure.com/sku-cpu=16,kubernetes.azure.com/agentpool=harvest'
    End

    It 'should correct sku-cpu when it is the only label'
        KUBELET_NODE_LABELS="kubernetes.azure.com/sku-cpu=8"
        When call fixSkuCpuLabel
        The stdout should include 'Correcting sku-cpu label from 8 to 16'
        The variable KUBELET_NODE_LABELS should equal 'kubernetes.azure.com/sku-cpu=16'
    End

    It 'should not modify sku-cpu when it already matches nproc'
        nproc() { echo "8"; }
        KUBELET_NODE_LABELS="agentpool=pool1,kubernetes.azure.com/sku-cpu=8,kubernetes.azure.com/agentpool=pool1"
        When call fixSkuCpuLabel
        The variable KUBELET_NODE_LABELS should equal 'agentpool=pool1,kubernetes.azure.com/sku-cpu=8,kubernetes.azure.com/agentpool=pool1'
    End

    It 'should do nothing when sku-cpu label is not present'
        KUBELET_NODE_LABELS="agentpool=pool1,kubernetes.azure.com/agentpool=pool1"
        When call fixSkuCpuLabel
        The variable KUBELET_NODE_LABELS should equal 'agentpool=pool1,kubernetes.azure.com/agentpool=pool1'
    End

    # The detection-failure path must warn and leave the labels exactly as they were.
    It 'should warn and leave labels untouched when nproc reports nothing'
        nproc() { echo ""; }
        KUBELET_NODE_LABELS="agentpool=pool1,kubernetes.azure.com/sku-cpu=8,kubernetes.azure.com/agentpool=pool1"
        When call fixSkuCpuLabel
        The stdout should include 'WARNING: could not detect actual CPU cores via nproc'
        The variable KUBELET_NODE_LABELS should equal 'agentpool=pool1,kubernetes.azure.com/sku-cpu=8,kubernetes.azure.com/agentpool=pool1'
    End

    It 'should warn and leave labels untouched when nproc reports zero cores'
        nproc() { echo "0"; }
        KUBELET_NODE_LABELS="agentpool=pool1,kubernetes.azure.com/sku-cpu=8,kubernetes.azure.com/agentpool=pool1"
        When call fixSkuCpuLabel
        The stdout should include 'WARNING: could not detect actual CPU cores via nproc'
        The variable KUBELET_NODE_LABELS should equal 'agentpool=pool1,kubernetes.azure.com/sku-cpu=8,kubernetes.azure.com/agentpool=pool1'
    End
End

Describe 'assert_refresh_token'
# Helper function to create a mock JWT token
# Usage: create_mock_jwt_token '{"permissions":{"actions":["read","pull"]}}'
Expand Down
26 changes: 26 additions & 0 deletions staging/cse/windows/kubeletfunc.ps1
Original file line number Diff line number Diff line change
Expand Up @@ -260,6 +260,32 @@ function Remove-KubeletNodeLabel {
$global:KubeletNodeLabels = $filtered -join ","
}

# For Harvest VMs (and potentially other oversubscribed SKUs), the sku-cpu label set by the RP
# reflects the nominal vCPU count from the CRP SKU API, but the guest OS may see more cores.
# This function compares the label value against the logical processor count reported by
# Win32_ComputerSystem and, on mismatch, replaces the label in $global:KubeletNodeLabels.
#
# Reads and (only when a correction is made) writes $global:KubeletNodeLabels, a
# comma-separated list of key=value labels. The corrected sku-cpu label is appended at the
# end of the list. Returns nothing; logs through Write-Log.
function Fix-SkuCpuLabel {
    $actualCores = (Get-CimInstance -ClassName Win32_ComputerSystem).NumberOfLogicalProcessors
    if (-not $actualCores -or $actualCores -eq 0) {
        Write-Log "WARNING: could not detect actual CPU cores, skipping sku-cpu fix"
        return
    }

    $skuCpuPattern = '^kubernetes\.azure\.com/sku-cpu='

    # @(...) forces array context. A pipeline that yields exactly one item returns a scalar,
    # and "+=" on a scalar string concatenates text instead of appending an element.
    $labelList = @($global:KubeletNodeLabels -split ",")
    $skuCpuLabel = @($labelList | Where-Object { $_ -match $skuCpuPattern })
    if ($skuCpuLabel.Count -eq 0) {
        # Label not present - nothing to fix.
        return
    }

    $currentValue = ($skuCpuLabel[0] -split '=', 2)[1]
    if ($currentValue -ne "$actualCores") {
        Write-Log "Correcting sku-cpu label from $currentValue to $actualCores (actual logical processors)"
        # Drop every existing sku-cpu entry, then append the corrected one.
        $otherLabels = @($labelList | Where-Object { $_ -notmatch $skuCpuPattern })
        $labelList = $otherLabels + "kubernetes.azure.com/sku-cpu=$actualCores"
        $global:KubeletNodeLabels = $labelList -join ","
    }
}

function Get-TagValue {
Param(
[Parameter(Mandatory=$true)][string]
Expand Down
25 changes: 25 additions & 0 deletions staging/cse/windows/kubeletfunc.tests.ps1
Original file line number Diff line number Diff line change
Expand Up @@ -400,3 +400,28 @@ Describe 'Remove-KubeletNodeLabel' {
Compare-Object $global:KubeletNodeLabels $expected | Should -Be $null
}
}

Describe 'Fix-SkuCpuLabel' {
    It "Should correct sku-cpu when actual cores differ from label value" {
        Mock Get-CimInstance -MockWith { [PSCustomObject]@{ NumberOfLogicalProcessors = 16 } }
        $global:KubeletNodeLabels = "agentpool=harvest,kubernetes.azure.com/sku-cpu=8,kubernetes.azure.com/agentpool=harvest"
        Fix-SkuCpuLabel
        $global:KubeletNodeLabels | Should -Match 'kubernetes\.azure\.com/sku-cpu=16'
        $global:KubeletNodeLabels | Should -Not -Match 'sku-cpu=8'

        # Compare whole entries so that the unrelated labels are proven to survive intact
        # and no entry has been fused with its neighbour.
        $labels = @($global:KubeletNodeLabels -split ',')
        $labels.Count | Should -Be 3
        $labels | Should -Contain 'agentpool=harvest'
        $labels | Should -Contain 'kubernetes.azure.com/agentpool=harvest'
        $labels | Should -Contain 'kubernetes.azure.com/sku-cpu=16'
    }

    It "Should not modify sku-cpu when it already matches actual cores" {
        Mock Get-CimInstance -MockWith { [PSCustomObject]@{ NumberOfLogicalProcessors = 8 } }
        $global:KubeletNodeLabels = "agentpool=pool1,kubernetes.azure.com/sku-cpu=8,kubernetes.azure.com/agentpool=pool1"
        $expected = $global:KubeletNodeLabels
        Fix-SkuCpuLabel
        # Assert the full string: a regex match alone would not notice reordering or loss
        # of the other labels.
        $global:KubeletNodeLabels | Should -Be $expected
    }

    It "Should do nothing when sku-cpu label is not present" {
        Mock Get-CimInstance -MockWith { [PSCustomObject]@{ NumberOfLogicalProcessors = 16 } }
        $global:KubeletNodeLabels = "agentpool=pool1,kubernetes.azure.com/agentpool=pool1"
        $expected = $global:KubeletNodeLabels
        Fix-SkuCpuLabel
        $global:KubeletNodeLabels | Should -Be $expected
    }

    It "Should leave labels untouched when the processor count cannot be detected" {
        Mock Get-CimInstance -MockWith { [PSCustomObject]@{ NumberOfLogicalProcessors = 0 } }
        $global:KubeletNodeLabels = "agentpool=pool1,kubernetes.azure.com/sku-cpu=8,kubernetes.azure.com/agentpool=pool1"
        $expected = $global:KubeletNodeLabels
        Fix-SkuCpuLabel
        $global:KubeletNodeLabels | Should -Be $expected
    }
}
3 changes: 3 additions & 0 deletions staging/cse/windows/provisioningscripts/kubeletstart.ps1
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,9 @@ $global:ExternalNetwork = "ext"
$global:CNIConfig = "$CNIConfig"
$global:NetworkPlugin = $Global:ClusterConfiguration.Cni.Name
$global:KubeletNodeLabels = $Global:ClusterConfiguration.Kubernetes.Kubelet.NodeLabels

# Correct sku-cpu label if it doesn't match actual logical processor count (e.g., Harvest VMs)
Fix-SkuCpuLabel
$global:IsSkipCleanupNetwork = [System.Convert]::ToBoolean($Global:ClusterConfiguration.Services.IsSkipCleanupNetwork)

$global:EnableSecureTLSBootstrapping = [System.Convert]::ToBoolean($Global:ClusterConfiguration.Kubernetes.Kubelet.SecureTLSBootstrapArgs.Enabled)
Expand Down
Loading