Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion src/aks-preview/HISTORY.rst
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,10 @@ To release a new version, please select a new version number (usually plus 1 to
Pending
+++++++
* Fix `match_condition` kwarg leaking to HTTP transport by overriding `put_mc` and `add_agentpool` to pass `if_match` / `if_none_match` directly to the vendored SDK. This fixes the compatibility issue that arises because the azure-cli/acs module has adopted TypeSpec-emitted SDKs while azure-cli-extensions/aks-preview still uses the autorest-emitted SDK.
+ `az aks list-vm-skus`: New command to list available VM SKUs for AKS clusters in a given region.
* `az aks list-vm-skus`: New command to list available VM SKUs for AKS clusters in a given region.
* Add managed GPU enablement option to node pool property in `az aks nodepool add` and `az aks nodepool update`.



19.0.0b27
+++++++
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,9 @@
],
"gpu, no quota": [
"test_aks_nodepool_add_with_gpu_instance_profile",
"test_aks_gpu_driver_type"
"test_aks_gpu_driver_type",
"test_aks_nodepool_add_with_enable_managed_gpu",
"test_aks_nodepool_update_with_enable_managed_gpu"
],
"pod ip allocation mode static block, missing feature registration": [
"test_aks_create_with_pod_ip_allocation_mode_static_block"
Expand Down
4 changes: 4 additions & 0 deletions src/aks-preview/azext_aks_preview/_consts.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,10 @@
CONST_GPU_DRIVER_INSTALL = "Install"
CONST_GPU_DRIVER_NONE = "None"

# gpu management mode
# Values assigned to `gpu_profile.nvidia.management_mode` on an AgentPool object.
CONST_GPU_MANAGEMENT_MODE_MANAGED = "Managed"
CONST_GPU_MANAGEMENT_MODE_UNMANAGED = "Unmanaged"

# consts for ManagedCluster
# load balancer sku
CONST_LOAD_BALANCER_SKU_BASIC = "basic"
Expand Down
6 changes: 6 additions & 0 deletions src/aks-preview/azext_aks_preview/_help.py
Original file line number Diff line number Diff line change
Expand Up @@ -2203,6 +2203,9 @@
- name: --enable-artifact-streaming
type: bool
short-summary: Enable artifact streaming for VirtualMachineScaleSets managed by a node pool, to speed up the cold-start of containers on a node through on-demand image loading. To use this feature, container images must also enable artifact streaming on ACR. If not specified, the default is false.
- name: --enable-managed-gpu
type: bool
short-summary: Enable the Managed GPU experience, which installs additional components like DCGM metrics for monitoring on top of the GPU driver. For more details, visit aka.ms/aks/managed-gpu.
- name: --skip-gpu-driver-install
type: bool
short-summary: To skip GPU driver auto installation by AKS on a nodepool using GPU vm size if customers want to manage GPU driver installation on their own. If not specified, the default is false.
Expand Down Expand Up @@ -2419,6 +2422,9 @@
- name: --enable-artifact-streaming
type: bool
short-summary: Enable artifact streaming for VirtualMachineScaleSets managed by a node pool, to speed up the cold-start of containers on a node through on-demand image loading. To use this feature, container images must also enable artifact streaming on ACR. If not specified, the default is false.
- name: --enable-managed-gpu
type: bool
short-summary: Enable the Managed GPU experience, which installs additional components like DCGM metrics for monitoring on top of the GPU driver. For more details, visit aka.ms/aks/managed-gpu.
- name: --os-sku
type: string
short-summary: The os-sku of the agent node pool.
Expand Down
12 changes: 12 additions & 0 deletions src/aks-preview/azext_aks_preview/_params.py
Original file line number Diff line number Diff line change
Expand Up @@ -2031,6 +2031,12 @@ def load_arguments(self, _):
validator=validate_artifact_streaming,
is_preview=True,
)
c.argument(
"enable_managed_gpu",
action="store_true",
is_preview=True,
help="Enable the Managed GPU experience.",
)
c.argument(
"node_public_ip_tags",
arg_type=tags_type,
Expand Down Expand Up @@ -2140,6 +2146,12 @@ def load_arguments(self, _):
validator=validate_artifact_streaming,
is_preview=True,
)
c.argument(
"enable_managed_gpu",
action="store_true",
is_preview=True,
help="Enable the Managed GPU experience.",
)
c.argument(
"os_sku",
arg_type=get_enum_type(node_os_skus_update),
Expand Down
67 changes: 67 additions & 0 deletions src/aks-preview/azext_aks_preview/agentpool_decorator.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,10 @@
CONST_DEFAULT_WINDOWS_VMS_VM_SIZE,
CONST_MANAGED_CLUSTER_SKU_NAME_AUTOMATIC,
CONST_SSH_ACCESS_LOCALUSER,
CONST_GPU_DRIVER_INSTALL,
CONST_GPU_DRIVER_NONE,
CONST_GPU_MANAGEMENT_MODE_MANAGED,
CONST_GPU_MANAGEMENT_MODE_UNMANAGED,
CONST_NODEPOOL_MODE_MANAGEDSYSTEM,
CONST_NODEPOOL_MODE_MACHINES,
)
Expand Down Expand Up @@ -587,6 +590,27 @@ def get_enable_artifact_streaming(self) -> bool:
enable_artifact_streaming = self.agentpool.artifact_streaming_profile.enabled
return enable_artifact_streaming

def get_enable_managed_gpu(self) -> Union[bool, None]:
    """Resolve whether the Managed GPU experience is requested.

    The raw `enable_managed_gpu` command parameter is the starting value. In
    create mode, a management mode already present on the attached `agentpool`
    object (under `gpu_profile.nvidia`) takes precedence and is converted to a
    bool by comparing it with the "Managed" constant.

    :return: bool or None
    """
    # value exactly as supplied by the command
    requested = self.raw_param.get("enable_managed_gpu")

    # only create mode consults the `agentpool` object
    if self.decorator_mode != DecoratorMode.CREATE:
        return requested

    pool = self.agentpool
    if not pool:
        return requested

    gpu_profile = pool.gpu_profile
    if gpu_profile is None:
        return requested

    nvidia_profile = gpu_profile.nvidia
    if nvidia_profile is None or nvidia_profile.management_mode is None:
        return requested

    # the value stored on the object overrides the raw parameter
    return nvidia_profile.management_mode == CONST_GPU_MANAGEMENT_MODE_MANAGED

def get_pod_ip_allocation_mode(self: bool = False) -> Union[str, None]:
"""Get the value of pod_ip_allocation_mode.
:return: str or None
Expand Down Expand Up @@ -1276,6 +1300,21 @@ def set_up_artifact_streaming(self, agentpool: AgentPool) -> AgentPool:
agentpool.artifact_streaming_profile.enabled = True
return agentpool

def set_up_managed_gpu(self, agentpool: AgentPool) -> AgentPool:
    """Apply the managed GPU setting to a new AgentPool object.

    When managed GPU is enabled, the GPU profile (and its nested nvidia
    profile) is created if absent, the nvidia management mode is set to
    "Managed" and the GPU driver is set to "Install". Otherwise the object
    is returned unchanged.

    :return: the AgentPool object
    """
    self._ensure_agentpool(agentpool)

    # nothing to configure unless the feature was requested
    if not self.context.get_enable_managed_gpu():
        return agentpool

    if agentpool.gpu_profile is None:
        agentpool.gpu_profile = self.models.GPUProfile()  # pylint: disable=no-member
    gpu_profile = agentpool.gpu_profile

    if gpu_profile.nvidia is None:
        gpu_profile.nvidia = self.models.NvidiaGPUProfile()  # pylint: disable=no-member

    gpu_profile.nvidia.management_mode = CONST_GPU_MANAGEMENT_MODE_MANAGED
    gpu_profile.driver = CONST_GPU_DRIVER_INSTALL
    return agentpool
Comment thread
runzhen marked this conversation as resolved.

def set_up_ssh_access(self, agentpool: AgentPool) -> AgentPool:
self._ensure_agentpool(agentpool)

Expand Down Expand Up @@ -1510,6 +1549,8 @@ def construct_agentpool_profile_preview(self) -> AgentPool:
agentpool = self.set_up_init_taints(agentpool)
# set up artifact streaming
agentpool = self.set_up_artifact_streaming(agentpool)
# set up managed gpu
agentpool = self.set_up_managed_gpu(agentpool)
# set up skip_gpu_driver_install
agentpool = self.set_up_skip_gpu_driver_install(agentpool)
# set up gpu profile
Expand Down Expand Up @@ -1704,6 +1745,29 @@ def update_artifact_streaming(self, agentpool: AgentPool) -> AgentPool:
agentpool.artifact_streaming_profile.enabled = True
return agentpool

def update_managed_gpu(self, agentpool: AgentPool) -> AgentPool:
    """Apply the managed GPU setting to an existing AgentPool object.

    - None: the object is returned untouched.
    - truthy: the GPU/nvidia profiles are created if absent, the management
      mode is set to "Managed" and the GPU driver is set to "Install".
    - falsy (not None): the management mode is set to "Unmanaged", but only
      when both a GPU profile and an nvidia profile already exist.

    :return: the AgentPool object
    """
    self._ensure_agentpool(agentpool)

    managed_gpu = self.context.get_enable_managed_gpu()
    if managed_gpu is None:
        return agentpool

    # NOTE(review): any non-None falsy value lands here, not only an explicit
    # opt-out. If omitting the flag must leave the mode untouched, confirm the
    # command passes None (rather than False) in that case.
    if not managed_gpu:
        if agentpool.gpu_profile and agentpool.gpu_profile.nvidia:
            agentpool.gpu_profile.nvidia.management_mode = CONST_GPU_MANAGEMENT_MODE_UNMANAGED
        return agentpool

    if agentpool.gpu_profile is None:
        agentpool.gpu_profile = self.models.GPUProfile()  # pylint: disable=no-member
    gpu_profile = agentpool.gpu_profile

    if gpu_profile.nvidia is None:
        gpu_profile.nvidia = self.models.NvidiaGPUProfile()  # pylint: disable=no-member

    gpu_profile.nvidia.management_mode = CONST_GPU_MANAGEMENT_MODE_MANAGED
    gpu_profile.driver = CONST_GPU_DRIVER_INSTALL
    return agentpool

def update_os_sku(self, agentpool: AgentPool) -> AgentPool:
self._ensure_agentpool(agentpool)

Expand Down Expand Up @@ -1828,6 +1892,9 @@ def update_agentpool_profile_preview(self, agentpools: List[AgentPool] = None) -
# update artifact streaming
agentpool = self.update_artifact_streaming(agentpool)

# update managed gpu
agentpool = self.update_managed_gpu(agentpool)

# update secure boot
agentpool = self.update_secure_boot(agentpool)

Expand Down
2 changes: 2 additions & 0 deletions src/aks-preview/azext_aks_preview/custom.py
Original file line number Diff line number Diff line change
Expand Up @@ -1919,6 +1919,7 @@ def aks_agentpool_add(
asg_ids=None,
node_public_ip_tags=None,
enable_artifact_streaming=False,
enable_managed_gpu=False,
skip_gpu_driver_install=False,
gpu_driver=None,
driver_type=None,
Expand Down Expand Up @@ -1993,6 +1994,7 @@ def aks_agentpool_update(
allowed_host_ports=None,
asg_ids=None,
enable_artifact_streaming=False,
enable_managed_gpu=False,
os_sku=None,
ssh_access=None,
yes=False,
Expand Down
Loading
Loading