Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/aks-preview/HISTORY.rst
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ To release a new version, please select a new version number (usually plus 1 to

Pending
+++++++
* Add option `--enable-managed-gpu` to `az aks nodepool add` and `az aks nodepool update` to enable the Managed GPU experience on a node pool.

19.0.0b27
+++++++
Expand Down
4 changes: 4 additions & 0 deletions src/aks-preview/azext_aks_preview/_consts.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,10 @@
CONST_GPU_DRIVER_INSTALL = "Install"
CONST_GPU_DRIVER_NONE = "None"

# gpu management mode
CONST_GPU_MANAGEMENT_MODE_MANAGED = "Managed"
CONST_GPU_MANAGEMENT_MODE_UNMANAGED = "Unmanaged"

# consts for ManagedCluster
# load balancer sku
CONST_LOAD_BALANCER_SKU_BASIC = "basic"
Expand Down
6 changes: 6 additions & 0 deletions src/aks-preview/azext_aks_preview/_help.py
Original file line number Diff line number Diff line change
Expand Up @@ -2184,6 +2184,9 @@
- name: --enable-artifact-streaming
type: bool
short-summary: Enable artifact streaming for VirtualMachineScaleSets managed by a node pool, to speed up the cold-start of containers on a node through on-demand image loading. To use this feature, container images must also enable artifact streaming on ACR. If not specified, the default is false.
- name: --enable-managed-gpu
type: bool
short-summary: Enable the Managed GPU experience, which installs additional components like DCGM metrics for monitoring on top of the GPU driver. For more details, visit aka.ms/aks/managed-gpu.
- name: --skip-gpu-driver-install
type: bool
short-summary: To skip GPU driver auto installation by AKS on a nodepool using GPU vm size if customers want to manage GPU driver installation by their own. If not specified, the default is false.
Expand Down Expand Up @@ -2400,6 +2403,9 @@
- name: --enable-artifact-streaming
type: bool
short-summary: Enable artifact streaming for VirtualMachineScaleSets managed by a node pool, to speed up the cold-start of containers on a node through on-demand image loading. To use this feature, container images must also enable artifact streaming on ACR. If not specified, the default is false.
- name: --enable-managed-gpu
type: bool
short-summary: Enable the Managed GPU experience, which installs additional components like DCGM metrics for monitoring on top of the GPU driver. For more details, visit aka.ms/aks/managed-gpu.
- name: --os-sku
type: string
short-summary: The os-sku of the agent node pool.
Expand Down
12 changes: 12 additions & 0 deletions src/aks-preview/azext_aks_preview/_params.py
Original file line number Diff line number Diff line change
Expand Up @@ -2026,6 +2026,12 @@ def load_arguments(self, _):
validator=validate_artifact_streaming,
is_preview=True,
)
c.argument(
"enable_managed_gpu",
action="store_true",
is_preview=True,
help="Enable the Managed GPU experience, which installs additional components like DCGM metrics for monitoring on top of the GPU driver.",
)
c.argument(
"node_public_ip_tags",
arg_type=tags_type,
Expand Down Expand Up @@ -2135,6 +2141,12 @@ def load_arguments(self, _):
validator=validate_artifact_streaming,
is_preview=True,
)
c.argument(
"enable_managed_gpu",
action="store_true",
is_preview=True,
help="Enable the Managed GPU experience, which installs additional components like DCGM metrics for monitoring on top of the GPU driver.",
)
c.argument(
"os_sku",
arg_type=get_enum_type(node_os_skus_update),
Expand Down
55 changes: 55 additions & 0 deletions src/aks-preview/azext_aks_preview/agentpool_decorator.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@
CONST_MANAGED_CLUSTER_SKU_NAME_AUTOMATIC,
CONST_SSH_ACCESS_LOCALUSER,
CONST_GPU_DRIVER_NONE,
CONST_GPU_MANAGEMENT_MODE_MANAGED,
CONST_GPU_MANAGEMENT_MODE_UNMANAGED,
CONST_NODEPOOL_MODE_MANAGEDSYSTEM,
CONST_NODEPOOL_MODE_MACHINES,
)
Expand Down Expand Up @@ -587,6 +589,24 @@ def get_enable_artifact_streaming(self) -> bool:
enable_artifact_streaming = self.agentpool.artifact_streaming_profile.enabled
return enable_artifact_streaming

def get_enable_managed_gpu(self) -> Union[bool, None]:
    """Obtain the value of enable_managed_gpu.

    The value passed by the command is read from the raw parameters first. In create mode only, if the
    attached `agentpool` object already carries a non-None `gpu_profile.nvidia.management_mode`, the result is
    instead whether that mode equals CONST_GPU_MANAGEMENT_MODE_MANAGED.

    :return: bool or None (None when the raw parameter is None and no value was taken from `agentpool`)
    """
    # read the original value passed by the command
    enable_managed_gpu = self.raw_param.get("enable_managed_gpu")
    # In create mode, try to read the property value corresponding to the parameter from the `agentpool` object
    if self.decorator_mode == DecoratorMode.CREATE:
        if (
            self.agentpool and
            self.agentpool.gpu_profile is not None and
            self.agentpool.gpu_profile.nvidia is not None and
            self.agentpool.gpu_profile.nvidia.management_mode is not None
        ):
            enable_managed_gpu = (
                self.agentpool.gpu_profile.nvidia.management_mode == CONST_GPU_MANAGEMENT_MODE_MANAGED
            )
    return enable_managed_gpu
Comment thread
runzhen marked this conversation as resolved.
Outdated

def get_pod_ip_allocation_mode(self: bool = False) -> Union[str, None]:
"""Get the value of pod_ip_allocation_mode.
:return: str or None
Expand Down Expand Up @@ -1276,6 +1296,20 @@ def set_up_artifact_streaming(self, agentpool: AgentPool) -> AgentPool:
agentpool.artifact_streaming_profile.enabled = True
return agentpool

def set_up_managed_gpu(self, agentpool: AgentPool) -> AgentPool:
    """Set up managed GPU property for the AgentPool object.

    When the context reports enable_managed_gpu as truthy, make sure `gpu_profile` and `gpu_profile.nvidia`
    exist on the agentpool (creating empty models where they are None) and set the nvidia management mode to
    CONST_GPU_MANAGEMENT_MODE_MANAGED. Otherwise no GPU property is changed.

    :return: the AgentPool object
    """
    self._ensure_agentpool(agentpool)

    # nothing to configure unless the option was requested
    if not self.context.get_enable_managed_gpu():
        return agentpool

    # create the intermediate profile objects on demand, keeping any that already exist
    if agentpool.gpu_profile is None:
        agentpool.gpu_profile = self.models.GPUProfile()  # pylint: disable=no-member
    gpu_profile = agentpool.gpu_profile
    if gpu_profile.nvidia is None:
        gpu_profile.nvidia = self.models.NvidiaGPUProfile()  # pylint: disable=no-member
    gpu_profile.nvidia.management_mode = CONST_GPU_MANAGEMENT_MODE_MANAGED
    return agentpool
Comment thread
runzhen marked this conversation as resolved.

def set_up_ssh_access(self, agentpool: AgentPool) -> AgentPool:
self._ensure_agentpool(agentpool)

Expand Down Expand Up @@ -1510,6 +1544,8 @@ def construct_agentpool_profile_preview(self) -> AgentPool:
agentpool = self.set_up_init_taints(agentpool)
# set up artifact streaming
agentpool = self.set_up_artifact_streaming(agentpool)
# set up managed gpu
agentpool = self.set_up_managed_gpu(agentpool)
# set up skip_gpu_driver_install
agentpool = self.set_up_skip_gpu_driver_install(agentpool)
# set up gpu profile
Expand Down Expand Up @@ -1688,6 +1724,22 @@ def update_artifact_streaming(self, agentpool: AgentPool) -> AgentPool:
agentpool.artifact_streaming_profile.enabled = True
return agentpool

def update_managed_gpu(self, agentpool: AgentPool) -> AgentPool:
    """Update managed GPU property for the AgentPool object.

    If the context reports enable_managed_gpu as truthy, the nvidia management mode of the agentpool's GPU
    profile is set to CONST_GPU_MANAGEMENT_MODE_MANAGED, creating `gpu_profile` and `gpu_profile.nvidia` first
    when they are None. If it is falsy, the existing GPU profile is left as it is.

    :return: the AgentPool object
    """
    self._ensure_agentpool(agentpool)

    # option not requested: keep whatever the agentpool currently has
    if not self.context.get_enable_managed_gpu():
        return agentpool

    # fill in missing profile objects without replacing existing ones
    if agentpool.gpu_profile is None:
        agentpool.gpu_profile = self.models.GPUProfile()  # pylint: disable=no-member
    current_gpu_profile = agentpool.gpu_profile
    if current_gpu_profile.nvidia is None:
        current_gpu_profile.nvidia = self.models.NvidiaGPUProfile()  # pylint: disable=no-member
    current_gpu_profile.nvidia.management_mode = CONST_GPU_MANAGEMENT_MODE_MANAGED
    return agentpool

def update_os_sku(self, agentpool: AgentPool) -> AgentPool:
self._ensure_agentpool(agentpool)

Expand Down Expand Up @@ -1812,6 +1864,9 @@ def update_agentpool_profile_preview(self, agentpools: List[AgentPool] = None) -
# update artifact streaming
agentpool = self.update_artifact_streaming(agentpool)

# update managed gpu
agentpool = self.update_managed_gpu(agentpool)

# update secure boot
agentpool = self.update_secure_boot(agentpool)

Expand Down
2 changes: 2 additions & 0 deletions src/aks-preview/azext_aks_preview/custom.py
Original file line number Diff line number Diff line change
Expand Up @@ -1903,6 +1903,7 @@ def aks_agentpool_add(
asg_ids=None,
node_public_ip_tags=None,
enable_artifact_streaming=False,
enable_managed_gpu=False,
skip_gpu_driver_install=False,
gpu_driver=None,
driver_type=None,
Expand Down Expand Up @@ -1977,6 +1978,7 @@ def aks_agentpool_update(
allowed_host_ports=None,
asg_ids=None,
enable_artifact_streaming=False,
enable_managed_gpu=False,
os_sku=None,
ssh_access=None,
yes=False,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@
CONST_MANAGED_CLUSTER_SKU_NAME_BASE,
CONST_MANAGED_CLUSTER_SKU_NAME_AUTOMATIC,
CONST_GPU_DRIVER_NONE,
CONST_GPU_MANAGEMENT_MODE_MANAGED,
CONST_GPU_MANAGEMENT_MODE_UNMANAGED,
CONST_NODEPOOL_MODE_MANAGEDSYSTEM,
CONST_NODEPOOL_MODE_MACHINES,
)
Expand Down Expand Up @@ -257,6 +259,45 @@ def common_get_enable_artifact_streaming(self):
ctx_2.attach_agentpool(agentpool_2)
self.assertEqual(ctx_2.get_enable_artifact_streaming(), None)

def common_get_enable_managed_gpu(self):
    """Check get_enable_managed_gpu in create and update mode, before and after attaching an agentpool."""

    def build_managed_gpu_agentpool():
        # agentpool whose nvidia GPU profile is already in managed mode
        return self.create_initialized_agentpool_instance(
            gpu_profile=self.models.GPUProfile(
                nvidia=self.models.NvidiaGPUProfile(
                    management_mode=CONST_GPU_MANAGEMENT_MODE_MANAGED
                )
            )
        )

    # (decorator mode, value expected once the managed agentpool is attached):
    # only create mode picks the value up from the attached agentpool
    scenarios = (
        (DecoratorMode.CREATE, True),
        (DecoratorMode.UPDATE, None),
    )
    for mode, expected_after_attach in scenarios:
        ctx = AKSPreviewAgentPoolContext(
            self.cmd,
            AKSAgentPoolParamDict({"enable_managed_gpu": None}),
            self.models,
            mode,
            self.agentpool_decorator_mode,
        )
        # default: parameter is None and no agentpool has been attached yet
        self.assertEqual(ctx.get_enable_managed_gpu(), None)
        ctx.attach_agentpool(build_managed_gpu_agentpool())
        self.assertEqual(ctx.get_enable_managed_gpu(), expected_after_attach)

def common_get_pod_ip_allocation_mode(self):
# default
ctx_1 = AKSPreviewAgentPoolContext(
Expand Down Expand Up @@ -1037,6 +1078,9 @@ def test_get_workload_runtime(self):
def test_get_enable_artifact_streaming(self):
    # run the scenario implemented in common_get_enable_artifact_streaming
    self.common_get_enable_artifact_streaming()

def test_get_enable_managed_gpu(self):
    # run the scenario implemented in common_get_enable_managed_gpu
    self.common_get_enable_managed_gpu()

def test_get_pod_ip_allocation_mode(self):
    # run the scenario implemented in common_get_pod_ip_allocation_mode
    self.common_get_pod_ip_allocation_mode()

Expand Down Expand Up @@ -1130,6 +1174,9 @@ def test_get_workload_runtime(self):

def test_get_enable_artifact_streaming(self):
    # run the scenario implemented in common_get_enable_artifact_streaming
    self.common_get_enable_artifact_streaming()

Comment thread
runzhen marked this conversation as resolved.
def test_get_enable_managed_gpu(self):
    # run the scenario implemented in common_get_enable_managed_gpu
    self.common_get_enable_managed_gpu()

def test_get_pod_ip_allocation_mode(self):
    # run the scenario implemented in common_get_pod_ip_allocation_mode
    self.common_get_pod_ip_allocation_mode()
Expand Down Expand Up @@ -1450,6 +1497,30 @@ def common_set_up_artifact_streaming(self):
)
self.assertEqual(dec_agentpool_1, ground_truth_agentpool_1)

def common_set_up_managed_gpu(self):
    """Check that set_up_managed_gpu puts the nvidia GPU profile in managed mode when the option is enabled."""
    decorator = AKSPreviewAgentPoolAddDecorator(
        self.cmd,
        self.client,
        {"enable_managed_gpu": True},
        self.resource_type,
        self.agentpool_decorator_mode,
    )
    # fail on passing the wrong agentpool object
    with self.assertRaises(CLIInternalError):
        decorator.set_up_managed_gpu(None)

    initial_agentpool = self.create_initialized_agentpool_instance(restore_defaults=False)
    decorator.context.attach_agentpool(initial_agentpool)
    # restore the defaults afterwards so the result can be compared with a default-initialized instance
    actual_agentpool = self._restore_defaults_in_agentpool(
        decorator.set_up_managed_gpu(initial_agentpool)
    )

    managed_gpu_profile = self.models.GPUProfile(
        nvidia=self.models.NvidiaGPUProfile(
            management_mode=CONST_GPU_MANAGEMENT_MODE_MANAGED
        )
    )
    expected_agentpool = self.create_initialized_agentpool_instance(gpu_profile=managed_gpu_profile)
    self.assertEqual(actual_agentpool, expected_agentpool)

def common_set_up_skip_gpu_driver_install(self):
dec_1 = AKSPreviewAgentPoolAddDecorator(
self.cmd,
Expand Down Expand Up @@ -1999,6 +2070,9 @@ def test_set_up_gpu_propertes(self):
def test_set_up_artifact_streaming(self):
    # run the scenario implemented in common_set_up_artifact_streaming
    self.common_set_up_artifact_streaming()

def test_set_up_managed_gpu(self):
    # run the scenario implemented in common_set_up_managed_gpu
    self.common_set_up_managed_gpu()

def test_set_up_skip_gpu_driver_install(self):
    # run the scenario implemented in common_set_up_skip_gpu_driver_install
    self.common_set_up_skip_gpu_driver_install()

Expand Down Expand Up @@ -2144,6 +2218,9 @@ def test_set_up_gpu_propertes(self):

def test_set_up_artifact_streaming(self):
    # run the scenario implemented in common_set_up_artifact_streaming
    self.common_set_up_artifact_streaming()

Comment thread
runzhen marked this conversation as resolved.
def test_set_up_managed_gpu(self):
    # run the scenario implemented in common_set_up_managed_gpu
    self.common_set_up_managed_gpu()

def test_set_up_skip_gpu_driver_install(self):
    # run the scenario implemented in common_set_up_skip_gpu_driver_install
    self.common_set_up_skip_gpu_driver_install()
Expand Down Expand Up @@ -2349,6 +2426,57 @@ def common_update_artifact_streaming(self):
)
self.assertEqual(dec_agentpool_2, grond_truth_agentpool_2)

def common_update_managed_gpu(self):
    """Check update_managed_gpu both without and with the enable_managed_gpu option."""
    # option not specified: an agentpool that is already in managed mode must come back unchanged
    dec_1 = AKSPreviewAgentPoolUpdateDecorator(
        self.cmd,
        self.client,
        {"enable_managed_gpu": None},
        self.resource_type,
        self.agentpool_decorator_mode,
    )
    # fail on passing the wrong agentpool object
    with self.assertRaises(CLIInternalError):
        dec_1.update_managed_gpu(None)
    agentpool_1 = self.create_initialized_agentpool_instance(
        gpu_profile=self.models.GPUProfile(
            nvidia=self.models.NvidiaGPUProfile(
                management_mode=CONST_GPU_MANAGEMENT_MODE_MANAGED
            )
        )
    )
    dec_1.context.attach_agentpool(agentpool_1)
    dec_agentpool_1 = dec_1.update_managed_gpu(agentpool_1)
    ground_truth_agentpool_1 = self.create_initialized_agentpool_instance(
        gpu_profile=self.models.GPUProfile(
            nvidia=self.models.NvidiaGPUProfile(
                management_mode=CONST_GPU_MANAGEMENT_MODE_MANAGED
            )
        )
    )
    self.assertEqual(dec_agentpool_1, ground_truth_agentpool_1)

    # option enabled: an agentpool without a GPU profile must end up in managed mode
    dec_2 = AKSPreviewAgentPoolUpdateDecorator(
        self.cmd,
        self.client,
        {"enable_managed_gpu": True},
        self.resource_type,
        self.agentpool_decorator_mode,
    )
    # fail on passing the wrong agentpool object
    with self.assertRaises(CLIInternalError):
        dec_2.update_managed_gpu(None)
    agentpool_2 = self.create_initialized_agentpool_instance()
    dec_2.context.attach_agentpool(agentpool_2)
    dec_agentpool_2 = dec_2.update_managed_gpu(agentpool_2)
    ground_truth_agentpool_2 = self.create_initialized_agentpool_instance(
        gpu_profile=self.models.GPUProfile(
            nvidia=self.models.NvidiaGPUProfile(
                management_mode=CONST_GPU_MANAGEMENT_MODE_MANAGED
            )
        )
    )
    self.assertEqual(dec_agentpool_2, ground_truth_agentpool_2)

def common_update_secure_boot(self):
dec_1 = AKSPreviewAgentPoolUpdateDecorator(
self.cmd,
Expand Down Expand Up @@ -2849,6 +2977,9 @@ def setUp(self):
def test_update_artifact_streaming(self):
    # run the scenario implemented in common_update_artifact_streaming
    self.common_update_artifact_streaming()

def test_update_managed_gpu(self):
    # run the scenario implemented in common_update_managed_gpu
    self.common_update_managed_gpu()

def test_update_secure_boot(self):
    # run the scenario implemented in common_update_secure_boot
    self.common_update_secure_boot()

Expand Down Expand Up @@ -2941,6 +3072,9 @@ def setUp(self):

def test_update_artifact_streaming(self):
    # run the scenario implemented in common_update_artifact_streaming
    self.common_update_artifact_streaming()

Comment thread
runzhen marked this conversation as resolved.
def test_update_managed_gpu(self):
    # run the scenario implemented in common_update_managed_gpu
    self.common_update_managed_gpu()

def test_update_secure_boot(self):
    # run the scenario implemented in common_update_secure_boot
    self.common_update_secure_boot()
Expand Down
Loading
Loading