Skip to content

Commit 95ed957

Browse files
author
Ace Eldeib
authored
[AKS] az aks: Add --gpu-instance-profile for Nvidia multi-instan… (#23501)
1 parent 6d83c07 commit 95ed957

8 files changed

Lines changed: 3436 additions & 1 deletion

File tree

src/azure-cli/azure/cli/command_modules/acs/_consts.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,13 @@
4141
CONST_DEFAULT_NODE_VM_SIZE = "Standard_DS2_v2"
4242
CONST_DEFAULT_WINDOWS_NODE_VM_SIZE = "Standard_D2s_v3"
4343

44+
# gpu instance
45+
CONST_GPU_INSTANCE_PROFILE_MIG1_G = "MIG1g"
46+
CONST_GPU_INSTANCE_PROFILE_MIG2_G = "MIG2g"
47+
CONST_GPU_INSTANCE_PROFILE_MIG3_G = "MIG3g"
48+
CONST_GPU_INSTANCE_PROFILE_MIG4_G = "MIG4g"
49+
CONST_GPU_INSTANCE_PROFILE_MIG7_G = "MIG7g"
50+
4451
# consts for ManagedCluster
4552
# load balancer sku
4653
CONST_LOAD_BALANCER_SKU_BASIC = "basic"

src/azure-cli/azure/cli/command_modules/acs/_help.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -547,6 +547,9 @@
547547
- name: --http-proxy-config
548548
type: string
549549
short-summary: HTTP Proxy configuration for this cluster.
550+
- name: --gpu-instance-profile
551+
type: string
552+
short-summary: GPU instance profile to partition multi-gpu Nvidia GPUs.
550553
551554
examples:
552555
- name: Create a Kubernetes cluster with an existing SSH public key.
@@ -1099,6 +1102,9 @@
10991102
- name: --host-group-id
11001103
type: string
11011104
short-summary: The fully qualified dedicated host group id used to provision agent node pool.
1105+
- name: --gpu-instance-profile
1106+
type: string
1107+
short-summary: GPU instance profile to partition multi-gpu Nvidia GPUs.
11021108
examples:
11031109
- name: Create a nodepool in an existing AKS cluster with ephemeral os enabled.
11041110
text: az aks nodepool add -g MyResourceGroup -n nodepool1 --cluster-name MyManagedCluster --node-osdisk-type Ephemeral --node-osdisk-size 48

src/azure-cli/azure/cli/command_modules/acs/_params.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,10 @@
2424
CONST_SCALE_DOWN_MODE_DELETE, CONST_SCALE_SET_PRIORITY_REGULAR,
2525
CONST_SCALE_SET_PRIORITY_SPOT, CONST_SPOT_EVICTION_POLICY_DEALLOCATE,
2626
CONST_SPOT_EVICTION_POLICY_DELETE, CONST_STABLE_UPGRADE_CHANNEL,
27-
CONST_AZURE_KEYVAULT_NETWORK_ACCESS_PUBLIC, CONST_AZURE_KEYVAULT_NETWORK_ACCESS_PRIVATE)
27+
CONST_AZURE_KEYVAULT_NETWORK_ACCESS_PUBLIC, CONST_AZURE_KEYVAULT_NETWORK_ACCESS_PRIVATE,
28+
CONST_GPU_INSTANCE_PROFILE_MIG1_G, CONST_GPU_INSTANCE_PROFILE_MIG2_G,
29+
CONST_GPU_INSTANCE_PROFILE_MIG3_G, CONST_GPU_INSTANCE_PROFILE_MIG4_G,
30+
CONST_GPU_INSTANCE_PROFILE_MIG7_G)
2831
from azure.cli.command_modules.acs._validators import (
2932
validate_acr, validate_agent_pool_name, validate_assign_identity,
3033
validate_assign_kubelet_identity, validate_azure_keyvault_kms_key_id,
@@ -116,6 +119,14 @@
116119

117120
keyvault_network_access_types = [CONST_AZURE_KEYVAULT_NETWORK_ACCESS_PUBLIC, CONST_AZURE_KEYVAULT_NETWORK_ACCESS_PRIVATE]
118121

122+
gpu_instance_profiles = [
123+
CONST_GPU_INSTANCE_PROFILE_MIG1_G,
124+
CONST_GPU_INSTANCE_PROFILE_MIG2_G,
125+
CONST_GPU_INSTANCE_PROFILE_MIG3_G,
126+
CONST_GPU_INSTANCE_PROFILE_MIG4_G,
127+
CONST_GPU_INSTANCE_PROFILE_MIG7_G,
128+
]
129+
119130

120131
def load_arguments(self, _):
121132

@@ -327,6 +338,7 @@ def load_arguments(self, _):
327338
c.argument('yes', options_list=['--yes', '-y'], help='Do not prompt for confirmation.', action='store_true')
328339
c.argument('host_group_id', validator=validate_host_group_id)
329340
c.argument('http_proxy_config')
341+
c.argument('gpu_instance_profile', arg_type=get_enum_type(gpu_instance_profiles))
330342

331343
with self.argument_context('aks update') as c:
332344
# managed cluster parameters
@@ -493,6 +505,7 @@ def load_arguments(self, _):
493505
c.argument('kubelet_config')
494506
c.argument('linux_os_config')
495507
c.argument('host_group_id', validator=validate_host_group_id)
508+
c.argument('gpu_instance_profile', arg_type=get_enum_type(gpu_instance_profiles))
496509

497510
with self.argument_context('aks nodepool update', resource_type=ResourceType.MGMT_CONTAINERSERVICE, operation_group='agent_pools') as c:
498511
c.argument('enable_cluster_autoscaler', options_list=[

src/azure-cli/azure/cli/command_modules/acs/agentpool_decorator.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1200,6 +1200,25 @@ def get_no_wait(self) -> bool:
12001200
# this parameter does not need validation
12011201
return no_wait
12021202

1203+
def get_gpu_instance_profile(self) -> Union[str, None]:
1204+
"""Obtain the value of gpu_instance_profile.
1205+
1206+
:return: string or None
1207+
"""
1208+
# read the original value passed by the command
1209+
gpu_instance_profile = self.raw_param.get("gpu_instance_profile")
1210+
# try to read the property value corresponding to the parameter from the `agentpool` object
1211+
if (
1212+
self.agentpool and
1213+
hasattr(self.agentpool, "gpu_instance_profile") and
1214+
self.agentpool.gpu_instance_profile is not None
1215+
):
1216+
gpu_instance_profile = self.agentpool.gpu_instance_profile
1217+
1218+
# this parameter does not need dynamic completion
1219+
# this parameter does not need validation
1220+
return gpu_instance_profile
1221+
12031222

12041223
class AKSAgentPoolAddDecorator:
12051224
def __init__(
@@ -1469,6 +1488,16 @@ def set_up_custom_node_config(self, agentpool: AgentPool) -> AgentPool:
14691488
agentpool.linux_os_config = self.context.get_linux_os_config()
14701489
return agentpool
14711490

1491+
def set_up_gpu_properties(self, agentpool: AgentPool) -> AgentPool:
1492+
"""Set up gpu related properties for the AgentPool object.
1493+
1494+
:return: the AgentPool object
1495+
"""
1496+
self._ensure_agentpool(agentpool)
1497+
1498+
agentpool.gpu_instance_profile = self.context.get_gpu_instance_profile()
1499+
return agentpool
1500+
14721501
def construct_agentpool_profile_default(self, bypass_restore_defaults: bool = False) -> AgentPool:
14731502
"""The overall controller used to construct the AgentPool profile by default.
14741503
@@ -1501,6 +1530,8 @@ def construct_agentpool_profile_default(self, bypass_restore_defaults: bool = Fa
15011530
agentpool = self.set_up_vm_properties(agentpool)
15021531
# set up custom node config
15031532
agentpool = self.set_up_custom_node_config(agentpool)
1533+
# set up gpu instance profile
1534+
agentpool = self.set_up_gpu_properties(agentpool)
15041535
# restore defaults
15051536
if not bypass_restore_defaults:
15061537
agentpool = self._restore_defaults_in_agentpool(agentpool)

src/azure-cli/azure/cli/command_modules/acs/custom.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1580,6 +1580,7 @@ def aks_create(
15801580
yes=False,
15811581
aks_custom_headers=None,
15821582
host_group_id=None,
1583+
gpu_instance_profile=None,
15831584
):
15841585
# DO NOT MOVE: get all the original parameters and save them as a dictionary
15851586
raw_parameters = locals()
@@ -2982,6 +2983,7 @@ def aks_agentpool_add(
29822983
no_wait=False,
29832984
aks_custom_headers=None,
29842985
host_group_id=None,
2986+
gpu_instance_profile=None,
29852987
):
29862988
# DO NOT MOVE: get all the original parameters and save them as a dictionary
29872989
raw_parameters = locals()

0 commit comments

Comments
 (0)