Skip to content
Merged
4 changes: 4 additions & 0 deletions src/azure-cli/azure/cli/command_modules/acs/_consts.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,10 @@
CONST_GPU_INSTANCE_PROFILE_MIG4_G = "MIG4g"
CONST_GPU_INSTANCE_PROFILE_MIG7_G = "MIG7g"

# gpu driver installation modes (values for the agent pool gpu profile's driver field)
# note: the "None" mode is the literal string "None", not Python's None
CONST_GPU_DRIVER_INSTALL = "Install"
CONST_GPU_DRIVER_NONE = "None"

# consts for ManagedCluster
# load balancer sku
CONST_LOAD_BALANCER_SKU_BASIC = "basic"
Expand Down
3 changes: 3 additions & 0 deletions src/azure-cli/azure/cli/command_modules/acs/_help.py
Original file line number Diff line number Diff line change
Expand Up @@ -1663,6 +1663,9 @@
- name: --if-none-match
type: string
short-summary: Set to '*' to allow a new agentpool to be created, but to prevent updating an existing agentpool. Other values will be ignored.
- name: --gpu-driver
type: string
short-summary: Whether to install driver for GPU node pool. Possible values are "Install" or "None". Default is "Install".

examples:
- name: Create a nodepool in an existing AKS cluster with ephemeral os enabled.
Expand Down
7 changes: 7 additions & 0 deletions src/azure-cli/azure/cli/command_modules/acs/_params.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
CONST_GPU_INSTANCE_PROFILE_MIG1_G, CONST_GPU_INSTANCE_PROFILE_MIG2_G,
CONST_GPU_INSTANCE_PROFILE_MIG3_G, CONST_GPU_INSTANCE_PROFILE_MIG4_G,
CONST_GPU_INSTANCE_PROFILE_MIG7_G, CONST_LOAD_BALANCER_SKU_BASIC,
CONST_GPU_DRIVER_INSTALL, CONST_GPU_DRIVER_NONE,
CONST_LOAD_BALANCER_SKU_STANDARD, CONST_MANAGED_CLUSTER_SKU_TIER_FREE,
CONST_MANAGED_CLUSTER_SKU_TIER_STANDARD, CONST_MANAGED_CLUSTER_SKU_TIER_PREMIUM,
CONST_NETWORK_DATAPLANE_AZURE, CONST_NETWORK_DATAPLANE_CILIUM,
Expand Down Expand Up @@ -192,6 +193,11 @@
CONST_GPU_INSTANCE_PROFILE_MIG7_G,
]

# choices accepted by the gpu_driver argument
gpu_driver_install_modes = [
    CONST_GPU_DRIVER_INSTALL,
    CONST_GPU_DRIVER_NONE,
]

nrg_lockdown_restriction_levels = [
CONST_NRG_LOCKDOWN_RESTRICTION_LEVEL_READONLY,
CONST_NRG_LOCKDOWN_RESTRICTION_LEVEL_UNRESTRICTED,
Expand Down Expand Up @@ -824,6 +830,7 @@ def load_arguments(self, _):
c.argument('enable_secure_boot', action='store_true')
c.argument("if_match")
c.argument("if_none_match")
c.argument('gpu_driver', arg_type=get_enum_type(gpu_driver_install_modes))

with self.argument_context('aks nodepool update', resource_type=ResourceType.MGMT_CONTAINERSERVICE, operation_group='agent_pools') as c:
c.argument('enable_cluster_autoscaler', options_list=[
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1551,6 +1551,35 @@ def get_if_none_match(self) -> str:
"""
return self.raw_param.get("if_none_match")

def _get_gpu_driver(self) -> Union[str, None]:
    """Internal getter for the gpu_driver parameter.

    Starts from the raw value passed by the command. In create mode, a driver that is
    already set on the gpu_profile of the attached `agentpool` object takes precedence.

    :return: string or None
    """
    # the raw parameter is always read first, regardless of mode
    driver = self.raw_param.get("gpu_driver")

    # only create mode consults the `agentpool` object
    if self.decorator_mode != DecoratorMode.CREATE:
        return driver

    pool = self.agentpool
    # the getattr default covers models without a gpu_profile attribute (backward compatibility)
    profile = getattr(pool, "gpu_profile", None) if pool else None
    if profile and profile.driver is not None:
        driver = profile.driver

    # neither dynamic completion nor validation is needed for this parameter
    return driver

def get_gpu_driver(self) -> Union[str, None]:
    """Public getter for gpu_driver; delegates to the internal `_get_gpu_driver`.

    :return: string or None
    """
    gpu_driver = self._get_gpu_driver()
    return gpu_driver


class AKSAgentPoolAddDecorator:
def __init__(
Expand Down Expand Up @@ -1915,6 +1944,22 @@ def set_up_agentpool_windows_profile(self, agentpool: AgentPool) -> AgentPool:

return agentpool

def set_up_gpu_profile(self, agentpool: AgentPool) -> AgentPool:
    """Attach a GPU profile to the AgentPool object when a gpu driver mode is available.

    The AgentPool object is returned unchanged when the context yields no gpu driver value.

    :return: the AgentPool object
    """
    self._ensure_agentpool(agentpool)

    driver = self.context.get_gpu_driver()
    # nothing to set: leave gpu_profile as it is
    if not driver:
        return agentpool

    # build the GPUProfile, fill in the driver mode, then attach it to the agentpool
    profile = self.models.GPUProfile()
    profile.driver = driver
    agentpool.gpu_profile = profile
    return agentpool

def construct_agentpool_profile_default(self, bypass_restore_defaults: bool = False) -> AgentPool:
"""The overall controller used to construct the AgentPool profile by default.

Expand Down Expand Up @@ -1959,6 +2004,8 @@ def construct_agentpool_profile_default(self, bypass_restore_defaults: bool = Fa
agentpool = self.set_up_agentpool_security_profile(agentpool)
# set up message of the day
agentpool = self.set_up_motd(agentpool)
# set up gpu profile
agentpool = self.set_up_gpu_profile(agentpool)
# restore defaults
if not bypass_restore_defaults:
agentpool = self._restore_defaults_in_agentpool(agentpool)
Expand Down
2 changes: 2 additions & 0 deletions src/azure-cli/azure/cli/command_modules/acs/custom.py
Original file line number Diff line number Diff line change
Expand Up @@ -2424,6 +2424,8 @@ def aks_agentpool_add(
# etag headers
if_match=None,
if_none_match=None,
# gpu driver
gpu_driver=None,
):
# DO NOT MOVE: get all the original parameters and save them as a dictionary
raw_parameters = locals()
Expand Down

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -690,6 +690,23 @@ def common_get_node_public_ip_prefix_id(self):
ctx_1.attach_agentpool(agentpool)
self.assertEqual(ctx_1.get_node_public_ip_prefix_id(), "test_node_public_ip_prefix_id")

def common_get_gpu_driver(self):
    # no raw value and no attached agentpool: the getter yields None
    raw_params = AKSAgentPoolParamDict({"gpu_driver": None})
    ctx = AKSAgentPoolContext(
        self.cmd,
        raw_params,
        self.models,
        DecoratorMode.CREATE,
        self.agentpool_decorator_mode,
    )
    self.assertEqual(ctx.get_gpu_driver(), None)

    # after attaching an agentpool that carries a gpu profile, its driver is returned
    gpu_profile = self.models.GPUProfile(driver="Install")
    agentpool = self.create_initialized_agentpool_instance(gpu_profile=gpu_profile)
    ctx.attach_agentpool(agentpool)
    self.assertEqual(ctx.get_gpu_driver(), "Install")

def common_get_node_count_and_enable_cluster_autoscaler_min_max_count(
self,
):
Expand Down Expand Up @@ -1788,6 +1805,9 @@ def test_get_if_match(self):
def test_get_if_none_match(self):
self.get_if_none_match()

def test_get_gpu_driver(self):
# delegates to the shared common_get_gpu_driver check
self.common_get_gpu_driver()

class AKSAgentPoolContextManagedClusterModeTestCase(AKSAgentPoolContextCommonTestCase):
Comment thread
FumingZhang marked this conversation as resolved.
def setUp(self):
self.cli_ctx = MockCLI()
Expand Down Expand Up @@ -2431,6 +2451,28 @@ def common_set_up_agentpool_security_profile(self):
)
self.assertEqual(dec_agentpool_1, ground_truth_agentpool_1)

def common_set_up_gpu_profile(self):
    raw_params = {"gpu_driver": "Install"}
    decorator = AKSAgentPoolAddDecorator(
        self.cmd,
        self.client,
        raw_params,
        self.resource_type,
        self.agentpool_decorator_mode,
    )
    # passing the wrong agentpool object must be rejected
    with self.assertRaises(CLIInternalError):
        decorator.set_up_gpu_profile(None)

    # run the setter on a fresh agentpool, then restore defaults before comparing
    initial_agentpool = self.create_initialized_agentpool_instance(restore_defaults=False)
    decorator.context.attach_agentpool(initial_agentpool)
    actual_agentpool = self._restore_defaults_in_agentpool(
        decorator.set_up_gpu_profile(initial_agentpool)
    )

    expected_agentpool = self.create_initialized_agentpool_instance(
        gpu_profile=self.models.GPUProfile(driver="Install"),
    )
    self.assertEqual(actual_agentpool, expected_agentpool)

class AKSAgentPoolAddDecoratorStandaloneModeTestCase(AKSAgentPoolAddDecoratorCommonTestCase):
def setUp(self):
self.cli_ctx = MockCLI()
Expand Down Expand Up @@ -2481,6 +2523,9 @@ def test_set_up_agentpool_windows_profile(self):

def test_set_up_agentpool_security_profile(self):
self.common_set_up_agentpool_security_profile()

def test_set_up_gpu_profile(self):
# delegates to the shared common_set_up_gpu_profile check
self.common_set_up_gpu_profile()

def test_construct_agentpool_profile_default(self):
import inspect
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12264,3 +12264,82 @@ def test_aks_network_isolated_cluster(self, resource_group, resource_group_locat
self.cmd("aks delete -g {resource_group} -n {aks_name_1} --yes --no-wait", checks=[self.is_empty()])
self.cmd("aks delete -g {resource_group} -n {aks_name_2} --yes --no-wait", checks=[self.is_empty()])
self.cmd("aks delete -g {resource_group} -n {aks_name_3} --yes --no-wait", checks=[self.is_empty()])


# Scenario test: create a cluster, then add a node pool with --gpu-driver None and another
# with --gpu-driver Install, checking gpuProfile.driver in each command's output.
@AllowLargeResponse()
@AKSCustomResourceGroupPreparer(
random_name_length=17, name_prefix="clitest", location="westus3"
)
def test_aks_create_gpu_driver_flow(self, resource_group, resource_group_location):
# reset the count so in replay mode the random names will start with 0
self.test_resources_count = 0
aks_name = self.create_random_name("cliakstest", 16)
node_pool_name = self.create_random_name("c", 6)
node_pool_name_second = self.create_random_name("c", 6)
self.kwargs.update(
{
"resource_group": resource_group,
"name": aks_name,
"dns_name_prefix": self.create_random_name("cliaksdns", 16),
"location": resource_group_location,
"resource_type": "Microsoft.ContainerService/ManagedClusters",
"node_pool_name": node_pool_name,
"node_pool_name_second": node_pool_name_second,
"ssh_key_value": self.generate_ssh_keys(),
"node_vm_size": "standard_nc6s_v3"
}
)

# 1. create the cluster
create_cmd = (
"aks create --resource-group={resource_group} --name={name} --location={location} "
"--enable-managed-identity "
"--ssh-key-value={ssh_key_value} "
)
self.cmd(create_cmd, checks=[
self.check('provisioningState', 'Succeeded')
])

# 2. add the first nodepool with --gpu-driver None
self.cmd(
"aks nodepool add "
"--resource-group={resource_group} "
"--cluster-name={name} "
"--name={node_pool_name} "
"--node-vm-size={node_vm_size} "
"-c 1 "
"--os-type Linux "
"--gpu-driver None",
checks=[
self.check("provisioningState", "Succeeded"),
self.check("gpuProfile.driver", "None"),
],
)

# delete the first nodepool before adding the second one
self.cmd(
"aks nodepool delete --resource-group={resource_group} --cluster-name={name} --name={node_pool_name}",
checks=[self.is_empty()],
)

# 3. add the second nodepool with --gpu-driver Install
self.cmd(
"aks nodepool add "
"--resource-group={resource_group} "
"--cluster-name={name} "
"--name={node_pool_name_second} "
"--node-vm-size={node_vm_size} "
"-c 1 "
"--os-type Linux "
"--gpu-driver Install",
checks=[
self.check("provisioningState", "Succeeded"),
self.check("gpuProfile.driver", "Install"),
],
)

# delete the cluster (without waiting for completion)
self.cmd(
"aks delete -g {resource_group} -n {name} --yes --no-wait",
checks=[self.is_empty()],
)
Loading