Skip to content

Commit 625cc86

Browse files
authored
feat: adding support for amdama (supernova) gpus. (#7927)
1 parent 7e2e16b commit 625cc86

72 files changed

Lines changed: 345 additions & 198 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

e2e/aks_model.go

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -245,14 +245,29 @@ func getFirewall(ctx context.Context, location, firewallSubnetID, publicIPID str
245245
TargetFqdns: []*string{to.Ptr(mooncakeMAR), to.Ptr(mooncakeMARData)},
246246
}
247247

248+
// Needed for access to download.microsoft.com
249+
// This is currently only needed by the Supernova (MA35D) SKU GPU tests
250+
// Driver install code in setupAmdAma() depends on this
251+
dmcRule := armnetwork.AzureFirewallApplicationRule{
252+
Name: to.Ptr("dmc-fqdn"),
253+
SourceAddresses: []*string{to.Ptr("*")},
254+
Protocols: []*armnetwork.AzureFirewallApplicationRuleProtocol{
255+
{
256+
ProtocolType: to.Ptr(armnetwork.AzureFirewallApplicationRuleProtocolTypeHTTPS),
257+
Port: to.Ptr[int32](443),
258+
},
259+
},
260+
TargetFqdns: []*string{to.Ptr("download.microsoft.com")},
261+
}
262+
248263
appRuleCollection := armnetwork.AzureFirewallApplicationRuleCollection{
249264
Name: to.Ptr("aksfwar"),
250265
Properties: &armnetwork.AzureFirewallApplicationRuleCollectionPropertiesFormat{
251266
Priority: to.Ptr[int32](100),
252267
Action: &armnetwork.AzureFirewallRCAction{
253268
Type: to.Ptr(armnetwork.AzureFirewallRCActionTypeAllow),
254269
},
255-
Rules: []*armnetwork.AzureFirewallApplicationRule{&aksAppRule, &blobStorageAppRule, &mooncakeMARRule},
270+
Rules: []*armnetwork.AzureFirewallApplicationRule{&aksAppRule, &blobStorageAppRule, &mooncakeMARRule, &dmcRule},
256271
},
257272
}
258273

e2e/scenario_gpu_managed_experience_test.go

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -276,7 +276,7 @@ func Test_Ubuntu2404_NvidiaDevicePluginRunning(t *testing.T) {
276276
ValidateNvidiaDevicePluginServiceRunning(ctx, s)
277277

278278
// Validate that GPU resources are advertised by the device plugin
279-
ValidateNodeAdvertisesGPUResources(ctx, s, 1)
279+
ValidateNodeAdvertisesGPUResources(ctx, s, 1, "nvidia.com/gpu")
280280

281281
// Validate that GPU workloads can be scheduled
282282
ValidateGPUWorkloadSchedulable(ctx, s, 1)
@@ -351,7 +351,7 @@ func Test_Ubuntu2204_NvidiaDevicePluginRunning(t *testing.T) {
351351
ValidateNvidiaDevicePluginServiceRunning(ctx, s)
352352

353353
// Validate that GPU resources are advertised by the device plugin
354-
ValidateNodeAdvertisesGPUResources(ctx, s, 1)
354+
ValidateNodeAdvertisesGPUResources(ctx, s, 1, "nvidia.com/gpu")
355355

356356
// Validate that GPU workloads can be scheduled
357357
ValidateGPUWorkloadSchedulable(ctx, s, 1)
@@ -425,7 +425,7 @@ func Test_AzureLinux3_NvidiaDevicePluginRunning(t *testing.T) {
425425
ValidateNvidiaDevicePluginServiceRunning(ctx, s)
426426

427427
// Validate that GPU resources are advertised by the device plugin
428-
ValidateNodeAdvertisesGPUResources(ctx, s, 1)
428+
ValidateNodeAdvertisesGPUResources(ctx, s, 1, "nvidia.com/gpu")
429429

430430
// Validate that GPU workloads can be scheduled
431431
ValidateGPUWorkloadSchedulable(ctx, s, 1)
@@ -501,7 +501,7 @@ func Test_Ubuntu2404_NvidiaDevicePluginRunning_MIG(t *testing.T) {
501501
ValidateMIGInstancesCreated(ctx, s, "MIG 2g.20gb")
502502

503503
// Validate that GPU resources are advertised by the device plugin
504-
ValidateNodeAdvertisesGPUResources(ctx, s, 3)
504+
ValidateNodeAdvertisesGPUResources(ctx, s, 3, "nvidia.com/gpu")
505505

506506
// Validate that MIG workloads can be scheduled
507507
ValidateGPUWorkloadSchedulable(ctx, s, 3)
@@ -572,7 +572,7 @@ func Test_Ubuntu2204_NvidiaDevicePluginRunning_WithoutVMSSTag(t *testing.T) {
572572
ValidateNvidiaDevicePluginServiceRunning(ctx, s)
573573

574574
// Validate that GPU resources are advertised by the device plugin
575-
ValidateNodeAdvertisesGPUResources(ctx, s, 1)
575+
ValidateNodeAdvertisesGPUResources(ctx, s, 1, "nvidia.com/gpu")
576576

577577
// Validate that GPU workloads can be scheduled
578578
ValidateGPUWorkloadSchedulable(ctx, s, 1)

e2e/scenario_test.go

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1410,6 +1410,60 @@ func Test_AzureLinuxV3_MessageOfTheDay_Scriptless(t *testing.T) {
14101410
})
14111411
}
14121412

1413+
func Test_AzureLinuxV3_MA35D(t *testing.T) {
1414+
RunScenario(t, &Scenario{
1415+
Description: "Tests that a node using AzureLinuxV3 can support MA35D SKU",
1416+
Config: Config{
1417+
Cluster: ClusterKubenet,
1418+
VHD: config.VHDAzureLinuxV3Gen2,
1419+
BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) {
1420+
nbc.ContainerService.Properties.AgentPoolProfiles[0].VMSize = "Standard_NM16ads_MA35D"
1421+
nbc.AgentPoolProfile.VMSize = "Standard_NM16ads_MA35D"
1422+
},
1423+
VMConfigMutator: func(vmss *armcompute.VirtualMachineScaleSet) {
1424+
vmss.SKU.Name = to.Ptr("Standard_NM16ads_MA35D")
1425+
},
1426+
Validator: func(ctx context.Context, s *Scenario) {
1427+
ValidateNonEmptyDirectory(ctx, s, "/sys/devices/virtual/misc/ama_transcoder0")
1428+
ValidateNonEmptyDirectory(ctx, s, "/opt/amd/ama/ma35/")
1429+
ValidateSystemdUnitIsRunning(ctx, s, "amdama-device-plugin.service")
1430+
ValidateNodeAdvertisesGPUResources(ctx, s, 1, "squat.ai/amdama")
1431+
},
1432+
},
1433+
// No MA35D GPU capacity in West US, so using East US
1434+
Location: "eastus",
1435+
K8sSystemPoolSKU: "Standard_D2s_v3",
1436+
})
1437+
}
1438+
1439+
func Test_AzureLinuxV3_MA35D_Scriptless(t *testing.T) {
1440+
RunScenario(t, &Scenario{
1441+
Description: "Tests that a node using AzureLinuxV3 can support MA35D SKU",
1442+
Tags: Tags{
1443+
Scriptless: true,
1444+
},
1445+
Config: Config{
1446+
Cluster: ClusterKubenet,
1447+
VHD: config.VHDAzureLinuxV3Gen2,
1448+
AKSNodeConfigMutator: func(config *aksnodeconfigv1.Configuration) {
1449+
config.VmSize = "Standard_NM16ads_MA35D"
1450+
},
1451+
VMConfigMutator: func(vmss *armcompute.VirtualMachineScaleSet) {
1452+
vmss.SKU.Name = to.Ptr("Standard_NM16ads_MA35D")
1453+
},
1454+
Validator: func(ctx context.Context, s *Scenario) {
1455+
ValidateNonEmptyDirectory(ctx, s, "/sys/devices/virtual/misc/ama_transcoder0")
1456+
ValidateNonEmptyDirectory(ctx, s, "/opt/amd/ama/ma35/")
1457+
ValidateSystemdUnitIsRunning(ctx, s, "amdama-device-plugin.service")
1458+
ValidateNodeAdvertisesGPUResources(ctx, s, 1, "squat.ai/amdama")
1459+
},
1460+
},
1461+
// No MA35D GPU capacity in West US, so using East US
1462+
Location: "eastus",
1463+
K8sSystemPoolSKU: "Standard_D2s_v3",
1464+
})
1465+
}
1466+
14131467
func Test_AzureLinuxV3LocalDns_Disabled_Scriptless(t *testing.T) {
14141468
RunScenario(t, &Scenario{
14151469
Description: "Tests that a node using a AzureLinuxV3 can be bootstrapped with localdns disabled",

e2e/validators.go

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1461,12 +1461,11 @@ func ValidateNvidiaDevicePluginServiceRunning(ctx context.Context, s *Scenario)
14611461
execScriptOnVMForScenarioValidateExitCode(ctx, s, strings.Join(command, "\n"), 0, "NVIDIA device plugin systemd service should be active and enabled")
14621462
}
14631463

1464-
func ValidateNodeAdvertisesGPUResources(ctx context.Context, s *Scenario, gpuCountExpected int64) {
1464+
func ValidateNodeAdvertisesGPUResources(ctx context.Context, s *Scenario, gpuCountExpected int64, resourceName string) {
14651465
s.T.Helper()
14661466
s.T.Logf("validating that node advertises GPU resources")
1467-
resourceName := "nvidia.com/gpu"
14681467

1469-
// First, wait for the nvidia.com/gpu resource to be available
1468+
// First, wait for the GPU resource to be available
14701469
waitUntilResourceAvailable(ctx, s, resourceName)
14711470

14721471
// Get the node using the Kubernetes client from the test framework

parts/linux/cloud-init/artifacts/cse_config.sh

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -975,6 +975,71 @@ ensureGPUDrivers() {
975975
fi
976976
}
977977

978+
# Install AMD AMA core SW package for MA35D (Supernova GPU SKU), retrying on failure.
# Note that this depends on access to download.microsoft.com, so network-isolated clusters are not supported.
#
# Arguments:
#   $1 - maximum number of install attempts
#   $2 - seconds to sleep between attempts
#   $3 - timeout; accepted so the call shape matches the other dnf helpers, but not applied here
# Returns:
#   0 once an attempt succeeds, 1 when the final attempt fails.
dnf_install_amd_ama_core() {
    # Keep the bookkeeping variables local so they do not clobber same-named globals in the caller.
    local retries=$1
    local wait_sleep=$2
    local timeout=$3
    local core_rpm_url="https://download.microsoft.com/download/16b04fa7-883e-4a94-88c2-801881a47b28/amd-ama-core_1.3.0-2503242033-amd64.rpm"
    local i

    for i in $(seq 1 "$retries"); do
        # RPM_FRONTEND env variable needed to disable license agreement prompt
        if RPM_FRONTEND=noninteractive dnf install -y "$core_rpm_url"; then
            break
        fi
        if [ "$i" -eq "$retries" ]; then
            return 1
        fi
        sleep "$wait_sleep"
        # Refresh repo metadata before the next attempt.
        dnf_makecache
    done
    echo "Executed dnf install AMD AMA core package $i times"
}
994+
995+
# Install AMD AMA drivers/SW for MA35D (Supernova GPU SKU)
# Note that this depends on access to download.microsoft.com, so network-isolated clusters are not supported
#
# Steps (Mariner / Azure Linux only; returns early when isARM64 reports 1):
#   1. install the AMD package repo and the amd-ama-driver 1.3.0 build matching the running kernel
#   2. install the prerequisites and the AMD AMA core package
#   3. install the AKS device plugin
#   4. configure huge pages and restart kubelet if it is already active
# Exits the script with ERR_AMDAMA_INSTALL_FAIL when a package cannot be installed, or with
# ERR_AMDAMA_DRIVER_NOT_FOUND when no driver package matches the current kernel version.
setupAmdAma() {
    if [ "$(isARM64)" -eq 1 ]; then
        return
    fi

    # Only Mariner / Azure Linux is handled; other distros are left untouched.
    if ! isMarinerOrAzureLinux "$OS"; then
        return 0
    fi

    # Install driver - currently version 1.3.0 is supported
    if ! dnf_install 30 1 600 azurelinux-repos-amd; then
        echo "Unable to install Azure Linux AMD package repo, exiting..."
        exit $ERR_AMDAMA_INSTALL_FAIL
    fi
    # The kernel release is matched with its dashes turned into dots; when several
    # packages match, the one that sorts highest by version is selected.
    KERNEL_VERSION=$(uname -r | sed 's/-/./g')
    AMD_AMA_DRIVER_PACKAGE=$(dnf repoquery -y --available "amd-ama-driver-1.3.0*" | grep -E "amd-ama-driver-[0-9]+.*_$KERNEL_VERSION" | sort -V | tail -n 1)
    if [ -z "$AMD_AMA_DRIVER_PACKAGE" ]; then
        echo "Unable to find AMD AMA driver package for current kernel version, exiting..."
        exit $ERR_AMDAMA_DRIVER_NOT_FOUND
    fi
    if ! dnf_install 30 1 600 "$AMD_AMA_DRIVER_PACKAGE"; then
        echo "Unable to install AMD AMA driver package, exiting..."
        exit $ERR_AMDAMA_INSTALL_FAIL
    fi

    # Install core package
    if ! dnf_install 30 1 600 azurelinux-repos-extended libzip; then
        echo "Unable to install Azure Linux packages required for AMD AMA core package, exiting..."
        exit $ERR_AMDAMA_INSTALL_FAIL
    fi
    if ! dnf_install_amd_ama_core 30 1 600; then
        echo "Unable to install AMD AMA core package, exiting..."
        exit $ERR_AMDAMA_INSTALL_FAIL
    fi

    # Install AKS device plugin
    if ! dnf_install 30 1 600 amdama-device-plugin.x86_64; then
        echo "Unable to install AMD AMA AKS device plugin package, exiting..."
        exit $ERR_AMDAMA_INSTALL_FAIL
    fi

    # Configure huge pages: write the sysctl drop-in and also set the value on the running kernel
    echo 'vm.nr_hugepages=4096' > /etc/sysctl.d/99-ama_transcoder.conf
    echo 4096 > /proc/sys/vm/nr_hugepages
    # Restart kubelet only if it is already running.
    if [ "$(systemctl is-active kubelet)" = "active" ]; then
        systemctl restart kubelet
    fi
}
1042+
9781043
disableSSH() {
9791044
# On ubuntu, the ssh service is named "ssh.service"
9801045
systemctlDisableAndStop ssh || exit $ERR_DISABLE_SSH

parts/linux/cloud-init/artifacts/cse_helpers.sh

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,8 @@ ERR_GPU_DEVICE_PLUGIN_START_FAIL=86 # nvidia device plugin could not be started
5252
ERR_GPU_INFO_ROM_CORRUPTED=87 # info ROM corrupted error when executing nvidia-smi
5353
ERR_SGX_DRIVERS_INSTALL_TIMEOUT=90 # Timeout waiting for SGX prereqs to download
5454
ERR_SGX_DRIVERS_START_FAIL=91 # Failed to execute SGX driver binary
55+
ERR_AMDAMA_DRIVER_NOT_FOUND=95 # AMD AMA driver package not found for current kernel version
56+
ERR_AMDAMA_INSTALL_FAIL=96 # Unable to install AMD AMA package
5557
ERR_APT_DAILY_TIMEOUT=98 # Timeout waiting for apt daily updates
5658
ERR_APT_UPDATE_TIMEOUT=99 # Timeout waiting for apt-get update to complete
5759
ERR_CSE_PROVISION_SCRIPT_NOT_READY_TIMEOUT=100 # Timeout waiting for cloud-init to place this script on the vm
@@ -717,6 +719,13 @@ get_imds_vm_tag_value() {
717719
echo "${tag_value,,}"
718720
}
719721

722+
# Succeeds (status 0) when get_compute_sku reports exactly Standard_NM16ads_MA35D,
# and fails (status 1) for any other value. The comparison is case-sensitive.
isAmdAmaEnabledNode() {
    # The exit status of the test is the function's return status.
    [ "$(get_compute_sku)" = "Standard_NM16ads_MA35D" ]
}
728+
720729
should_skip_nvidia_drivers() {
721730
set -x
722731
# Case-insensitive match for both tag name and value

parts/linux/cloud-init/artifacts/cse_main.sh

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -417,6 +417,11 @@ function nodePrep {
417417
echo $(date),$(hostname), "End configuring GPU drivers"
418418
fi
419419

420+
# Install and configure AMD AMA (Supernova) drivers if this is an AMA node
421+
if isAmdAmaEnabledNode; then
422+
logs_to_events "AKS.CSE.setupAmdAma" setupAmdAma
423+
fi
424+
420425
VALIDATION_ERR=0
421426

422427
# TODO(djsly): Look at leveraging the `aks-check-network.sh` script for this validation instead of duplicating the logic here

0 commit comments

Comments
 (0)