From dd07270b29135813434b296ffb81095078631de2 Mon Sep 17 00:00:00 2001 From: Runzhen Wang Date: Thu, 19 Mar 2026 04:50:46 +0000 Subject: [PATCH 01/14] add managed gpu --- .devcontainer/devcontainer.json | 2 +- src/aks-preview/HISTORY.rst | 1 + src/aks-preview/azext_aks_preview/_consts.py | 4 + src/aks-preview/azext_aks_preview/_help.py | 6 + src/aks-preview/azext_aks_preview/_params.py | 12 ++ .../azext_aks_preview/agentpool_decorator.py | 55 +++++++ src/aks-preview/azext_aks_preview/custom.py | 2 + .../tests/latest/test_agentpool_decorator.py | 134 ++++++++++++++++++ .../tests/latest/test_aks_commands.py | 115 +++++++++++++++ 9 files changed, 330 insertions(+), 1 deletion(-) diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index 32534bc7b07..9c2f13958e3 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -4,7 +4,7 @@ "features": { "ghcr.io/devcontainers/features/github-cli:1": {} }, - "workspaceFolder": "/workspaces", + "workspaceFolder": "/workspaces/${localWorkspaceFolderBasename}", "onCreateCommand": "python -m venv venv", "postCreateCommand": "REPO_NAME=$(basename $GITHUB_REPOSITORY) && cat $REPO_NAME/.devcontainer/login.sh >> ~/.bashrc && cp $REPO_NAME/.devcontainer/setup.sh easy_setup.sh && chmod +x easy_setup.sh", "hostRequirements": { diff --git a/src/aks-preview/HISTORY.rst b/src/aks-preview/HISTORY.rst index c0b711103a5..c09e8a32a32 100644 --- a/src/aks-preview/HISTORY.rst +++ b/src/aks-preview/HISTORY.rst @@ -11,6 +11,7 @@ To release a new version, please select a new version number (usually plus 1 to Pending +++++++ +* Add managed GPU enablement option to node pool property in `az aks nodepool add` and `az aks nodepool update`. 19.0.0b27 +++++++ diff --git a/src/aks-preview/azext_aks_preview/_consts.py b/src/aks-preview/azext_aks_preview/_consts.py index 9947552d15a..10c430a9ce6 100644 --- a/src/aks-preview/azext_aks_preview/_consts.py +++ b/src/aks-preview/azext_aks_preview/_consts.py @@ -76,6 +76,10 @@ CONST_GPU_DRIVER_INSTALL = "Install" CONST_GPU_DRIVER_NONE = "None" +# gpu management mode +CONST_GPU_MANAGEMENT_MODE_MANAGED = "Managed" +CONST_GPU_MANAGEMENT_MODE_UNMANAGED = "Unmanaged" + # consts for ManagedCluster # load balancer sku CONST_LOAD_BALANCER_SKU_BASIC = "basic" diff --git a/src/aks-preview/azext_aks_preview/_help.py b/src/aks-preview/azext_aks_preview/_help.py index cc7cce3865f..ea85e1449a5 100644 --- a/src/aks-preview/azext_aks_preview/_help.py +++ b/src/aks-preview/azext_aks_preview/_help.py @@ -2184,6 +2184,9 @@ - name: --enable-artifact-streaming type: bool short-summary: Enable artifact streaming for VirtualMachineScaleSets managed by a node pool, to speed up the cold-start of containers on a node through on-demand image loading. To use this feature, container images must also enable artifact streaming on ACR. If not specified, the default is false. + - name: --enable-managed-gpu + type: bool + short-summary: Enable the Managed GPU experience, which installs additional components like DCGM metrics for monitoring on top of the GPU driver. For more details, visit aka.ms/aks/managed-gpu. - name: --skip-gpu-driver-install type: bool short-summary: To skip GPU driver auto installation by AKS on a nodepool using GPU vm size if customers want to manage GPU driver installation by their own. If not specified, the default is false. @@ -2400,6 +2403,9 @@ - name: --enable-artifact-streaming type: bool short-summary: Enable artifact streaming for VirtualMachineScaleSets managed by a node pool, to speed up the cold-start of containers on a node through on-demand image loading. To use this feature, container images must also enable artifact streaming on ACR. If not specified, the default is false. + - name: --enable-managed-gpu + type: bool + short-summary: Enable the Managed GPU experience, which installs additional components like DCGM metrics for monitoring on top of the GPU driver. For more details, visit aka.ms/aks/managed-gpu. - name: --os-sku type: string short-summary: The os-sku of the agent node pool. diff --git a/src/aks-preview/azext_aks_preview/_params.py b/src/aks-preview/azext_aks_preview/_params.py index 9e5d78cde69..9d6381b02e9 100644 --- a/src/aks-preview/azext_aks_preview/_params.py +++ b/src/aks-preview/azext_aks_preview/_params.py @@ -2026,6 +2026,12 @@ def load_arguments(self, _): validator=validate_artifact_streaming, is_preview=True, ) + c.argument( + "enable_managed_gpu", + action="store_true", + is_preview=True, + help="Enable the Managed GPU experience, which installs additional components like DCGM metrics for monitoring on top of the GPU driver.", + ) c.argument( "node_public_ip_tags", arg_type=tags_type, @@ -2135,6 +2141,12 @@ def load_arguments(self, _): validator=validate_artifact_streaming, is_preview=True, ) + c.argument( + "enable_managed_gpu", + action="store_true", + is_preview=True, + help="Enable the Managed GPU experience, which installs additional components like DCGM metrics for monitoring on top of the GPU driver.", + ) c.argument( "os_sku", arg_type=get_enum_type(node_os_skus_update), diff --git a/src/aks-preview/azext_aks_preview/agentpool_decorator.py b/src/aks-preview/azext_aks_preview/agentpool_decorator.py index 1c7ad86415f..ee8164ae33f 100644 --- a/src/aks-preview/azext_aks_preview/agentpool_decorator.py +++ b/src/aks-preview/azext_aks_preview/agentpool_decorator.py @@ -45,6 +45,8 @@ CONST_MANAGED_CLUSTER_SKU_NAME_AUTOMATIC, CONST_SSH_ACCESS_LOCALUSER, CONST_GPU_DRIVER_NONE, + CONST_GPU_MANAGEMENT_MODE_MANAGED, + CONST_GPU_MANAGEMENT_MODE_UNMANAGED, CONST_NODEPOOL_MODE_MANAGEDSYSTEM, CONST_NODEPOOL_MODE_MACHINES, ) @@ -587,6 +589,24 @@ def get_enable_artifact_streaming(self) -> bool: enable_artifact_streaming = self.agentpool.artifact_streaming_profile.enabled return enable_artifact_streaming + def get_enable_managed_gpu(self) -> bool: + """Obtain the value of enable_managed_gpu. + :return: bool + """ + + # read the original value passed by the command + enable_managed_gpu = self.raw_param.get("enable_managed_gpu") + # In create mode, try to read the property value corresponding to the parameter from the `agentpool` object + if self.decorator_mode == DecoratorMode.CREATE: + if ( + self.agentpool and + self.agentpool.gpu_profile is not None and + self.agentpool.gpu_profile.nvidia is not None and + self.agentpool.gpu_profile.nvidia.management_mode is not None + ): + enable_managed_gpu = self.agentpool.gpu_profile.nvidia.management_mode == CONST_GPU_MANAGEMENT_MODE_MANAGED + return enable_managed_gpu + def get_pod_ip_allocation_mode(self: bool = False) -> Union[str, None]: """Get the value of pod_ip_allocation_mode. :return: str or None @@ -1276,6 +1296,20 @@ def set_up_artifact_streaming(self, agentpool: AgentPool) -> AgentPool: agentpool.artifact_streaming_profile.enabled = True return agentpool + def set_up_managed_gpu(self, agentpool: AgentPool) -> AgentPool: + """Set up managed GPU property for the AgentPool object.""" + self._ensure_agentpool(agentpool) + + enable_managed_gpu = self.context.get_enable_managed_gpu() + + if enable_managed_gpu: + if agentpool.gpu_profile is None: + agentpool.gpu_profile = self.models.GPUProfile() # pylint: disable=no-member + if agentpool.gpu_profile.nvidia is None: + agentpool.gpu_profile.nvidia = self.models.NvidiaGPUProfile() # pylint: disable=no-member + agentpool.gpu_profile.nvidia.management_mode = CONST_GPU_MANAGEMENT_MODE_MANAGED + return agentpool + def set_up_ssh_access(self, agentpool: AgentPool) -> AgentPool: self._ensure_agentpool(agentpool) @@ -1510,6 +1544,8 @@ def construct_agentpool_profile_preview(self) -> AgentPool: agentpool = self.set_up_init_taints(agentpool) # set up artifact streaming agentpool = self.set_up_artifact_streaming(agentpool) + # set up managed gpu + agentpool = self.set_up_managed_gpu(agentpool) # set up skip_gpu_driver_install agentpool = self.set_up_skip_gpu_driver_install(agentpool) # set up gpu profile @@ -1688,6 +1724,22 @@ def update_artifact_streaming(self, agentpool: AgentPool) -> AgentPool: agentpool.artifact_streaming_profile.enabled = True return agentpool + def update_managed_gpu(self, agentpool: AgentPool) -> AgentPool: + """Update managed GPU property for the AgentPool object. + :return: the AgentPool object + """ + self._ensure_agentpool(agentpool) + + enable_managed_gpu = self.context.get_enable_managed_gpu() + + if enable_managed_gpu: + if agentpool.gpu_profile is None: + agentpool.gpu_profile = self.models.GPUProfile() # pylint: disable=no-member + if agentpool.gpu_profile.nvidia is None: + agentpool.gpu_profile.nvidia = self.models.NvidiaGPUProfile() # pylint: disable=no-member + agentpool.gpu_profile.nvidia.management_mode = CONST_GPU_MANAGEMENT_MODE_MANAGED + return agentpool + def update_os_sku(self, agentpool: AgentPool) -> AgentPool: self._ensure_agentpool(agentpool) @@ -1812,6 +1864,9 @@ def update_agentpool_profile_preview(self, agentpools: List[AgentPool] = None) - # update artifact streaming agentpool = self.update_artifact_streaming(agentpool) + # update managed gpu + agentpool = self.update_managed_gpu(agentpool) + # update secure boot agentpool = self.update_secure_boot(agentpool) diff --git a/src/aks-preview/azext_aks_preview/custom.py b/src/aks-preview/azext_aks_preview/custom.py index de53ffb9648..42195d5450e 100644 --- a/src/aks-preview/azext_aks_preview/custom.py +++ b/src/aks-preview/azext_aks_preview/custom.py @@ -1903,6 +1903,7 @@ def aks_agentpool_add( asg_ids=None, node_public_ip_tags=None, enable_artifact_streaming=False, + enable_managed_gpu=False, skip_gpu_driver_install=False, gpu_driver=None, driver_type=None, @@ -1977,6 +1978,7 @@ def aks_agentpool_update( allowed_host_ports=None, asg_ids=None, enable_artifact_streaming=False, + enable_managed_gpu=False, os_sku=None, ssh_access=None, yes=False, diff --git a/src/aks-preview/azext_aks_preview/tests/latest/test_agentpool_decorator.py b/src/aks-preview/azext_aks_preview/tests/latest/test_agentpool_decorator.py index f211311be2d..18378a5acb6 100644 --- a/src/aks-preview/azext_aks_preview/tests/latest/test_agentpool_decorator.py +++ b/src/aks-preview/azext_aks_preview/tests/latest/test_agentpool_decorator.py @@ -36,6 +36,8 @@ CONST_MANAGED_CLUSTER_SKU_NAME_BASE, CONST_MANAGED_CLUSTER_SKU_NAME_AUTOMATIC, CONST_GPU_DRIVER_NONE, + CONST_GPU_MANAGEMENT_MODE_MANAGED, + CONST_GPU_MANAGEMENT_MODE_UNMANAGED, CONST_NODEPOOL_MODE_MANAGEDSYSTEM, CONST_NODEPOOL_MODE_MACHINES, ) @@ -257,6 +259,45 @@ def common_get_enable_artifact_streaming(self): ctx_2.attach_agentpool(agentpool_2) self.assertEqual(ctx_2.get_enable_artifact_streaming(), None) + def common_get_enable_managed_gpu(self): + # default + ctx_1 = AKSPreviewAgentPoolContext( + self.cmd, + AKSAgentPoolParamDict({"enable_managed_gpu": None}), + self.models, + DecoratorMode.CREATE, + self.agentpool_decorator_mode, + ) + self.assertEqual(ctx_1.get_enable_managed_gpu(), None) + agentpool_1 = self.create_initialized_agentpool_instance( + gpu_profile=self.models.GPUProfile( + nvidia=self.models.NvidiaGPUProfile( + management_mode=CONST_GPU_MANAGEMENT_MODE_MANAGED + ) + ) + ) + ctx_1.attach_agentpool(agentpool_1) + self.assertEqual(ctx_1.get_enable_managed_gpu(), True) + + # default + ctx_2 = AKSPreviewAgentPoolContext( + self.cmd, + AKSAgentPoolParamDict({"enable_managed_gpu": None}), + self.models, + DecoratorMode.UPDATE, + self.agentpool_decorator_mode, + ) + self.assertEqual(ctx_2.get_enable_managed_gpu(), None) + agentpool_2 = self.create_initialized_agentpool_instance( + gpu_profile=self.models.GPUProfile( + nvidia=self.models.NvidiaGPUProfile( + management_mode=CONST_GPU_MANAGEMENT_MODE_MANAGED + ) + ) + ) + ctx_2.attach_agentpool(agentpool_2) + self.assertEqual(ctx_2.get_enable_managed_gpu(), None) + def common_get_pod_ip_allocation_mode(self): # default ctx_1 = AKSPreviewAgentPoolContext( @@ -1037,6 +1078,9 @@ def test_get_workload_runtime(self): def test_get_enable_artifact_streaming(self): self.common_get_enable_artifact_streaming() + def test_get_enable_managed_gpu(self): + self.common_get_enable_managed_gpu() + def test_get_pod_ip_allocation_mode(self): self.common_get_pod_ip_allocation_mode() @@ -1130,6 +1174,9 @@ def test_get_workload_runtime(self): def test_get_enable_artifact_streaming(self): self.common_get_enable_artifact_streaming() + + def test_get_enable_managed_gpu(self): + self.common_get_enable_managed_gpu() def test_get_pod_ip_allocation_mode(self): self.common_get_pod_ip_allocation_mode() @@ -1450,6 +1497,30 @@ def common_set_up_artifact_streaming(self): ) self.assertEqual(dec_agentpool_1, ground_truth_agentpool_1) + def common_set_up_managed_gpu(self): + dec_1 = AKSPreviewAgentPoolAddDecorator( + self.cmd, + self.client, + {"enable_managed_gpu": True}, + self.resource_type, + self.agentpool_decorator_mode, + ) + # fail on passing the wrong agentpool object + with self.assertRaises(CLIInternalError): + dec_1.set_up_managed_gpu(None) + agentpool_1 = self.create_initialized_agentpool_instance(restore_defaults=False) + dec_1.context.attach_agentpool(agentpool_1) + dec_agentpool_1 = dec_1.set_up_managed_gpu(agentpool_1) + dec_agentpool_1 = self._restore_defaults_in_agentpool(dec_agentpool_1) + ground_truth_agentpool_1 = self.create_initialized_agentpool_instance( + gpu_profile=self.models.GPUProfile( + nvidia=self.models.NvidiaGPUProfile( + management_mode=CONST_GPU_MANAGEMENT_MODE_MANAGED + ) + ) + ) + self.assertEqual(dec_agentpool_1, ground_truth_agentpool_1) + def common_set_up_skip_gpu_driver_install(self): dec_1 = AKSPreviewAgentPoolAddDecorator( self.cmd, @@ -1999,6 +2070,9 @@ def test_set_up_gpu_propertes(self): def test_set_up_artifact_streaming(self): self.common_set_up_artifact_streaming() + def test_set_up_managed_gpu(self): + self.common_set_up_managed_gpu() + def test_set_up_skip_gpu_driver_install(self): self.common_set_up_skip_gpu_driver_install() @@ -2144,6 +2218,9 @@ def test_set_up_gpu_propertes(self): def test_set_up_artifact_streaming(self): self.common_set_up_artifact_streaming() + + def test_set_up_managed_gpu(self): + self.common_set_up_managed_gpu() def test_set_up_skip_gpu_driver_install(self): self.common_set_up_skip_gpu_driver_install() @@ -2349,6 +2426,57 @@ def common_update_artifact_streaming(self): ) self.assertEqual(dec_agentpool_2, grond_truth_agentpool_2) + def common_update_managed_gpu(self): + dec_1 = AKSPreviewAgentPoolUpdateDecorator( + self.cmd, + self.client, + {"enable_managed_gpu": None}, + self.resource_type, + self.agentpool_decorator_mode, + ) + # fail on passing the wrong agentpool object + with self.assertRaises(CLIInternalError): + dec_1.update_managed_gpu(None) + agentpool_1 = self.create_initialized_agentpool_instance( + gpu_profile=self.models.GPUProfile( + nvidia=self.models.NvidiaGPUProfile( + management_mode=CONST_GPU_MANAGEMENT_MODE_MANAGED + ) + ) + ) + dec_1.context.attach_agentpool(agentpool_1) + dec_agentpool_1 = dec_1.update_managed_gpu(agentpool_1) + grond_truth_agentpool_1 = self.create_initialized_agentpool_instance( + gpu_profile=self.models.GPUProfile( + nvidia=self.models.NvidiaGPUProfile( + management_mode=CONST_GPU_MANAGEMENT_MODE_MANAGED + ) + ) + ) + self.assertEqual(dec_agentpool_1, grond_truth_agentpool_1) + + dec_2 = AKSPreviewAgentPoolUpdateDecorator( + self.cmd, + self.client, + {"enable_managed_gpu": True}, + self.resource_type, + self.agentpool_decorator_mode, + ) + # fail on passing the wrong agentpool object + with self.assertRaises(CLIInternalError): + dec_2.update_managed_gpu(None) + agentpool_2 = self.create_initialized_agentpool_instance() + dec_2.context.attach_agentpool(agentpool_2) + dec_agentpool_2 = dec_2.update_managed_gpu(agentpool_2) + grond_truth_agentpool_2 = self.create_initialized_agentpool_instance( + gpu_profile=self.models.GPUProfile( + nvidia=self.models.NvidiaGPUProfile( + management_mode=CONST_GPU_MANAGEMENT_MODE_MANAGED + ) + ) + ) + self.assertEqual(dec_agentpool_2, grond_truth_agentpool_2) + def common_update_secure_boot(self): dec_1 = AKSPreviewAgentPoolUpdateDecorator( self.cmd, @@ -2849,6 +2977,9 @@ def setUp(self): def test_update_artifact_streaming(self): self.common_update_artifact_streaming() + def test_update_managed_gpu(self): + self.common_update_managed_gpu() + def test_update_secure_boot(self): self.common_update_secure_boot() @@ -2941,6 +3072,9 @@ def setUp(self): def test_update_artifact_streaming(self): self.common_update_artifact_streaming() + + def test_update_managed_gpu(self): + self.common_update_managed_gpu() def test_update_secure_boot(self): self.common_update_secure_boot() diff --git a/src/aks-preview/azext_aks_preview/tests/latest/test_aks_commands.py b/src/aks-preview/azext_aks_preview/tests/latest/test_aks_commands.py index b80a0d09d54..94bf8541da0 100644 --- a/src/aks-preview/azext_aks_preview/tests/latest/test_aks_commands.py +++ b/src/aks-preview/azext_aks_preview/tests/latest/test_aks_commands.py @@ -6868,6 +6868,60 @@ def test_aks_nodepool_add_with_artifact_streaming( checks=[self.is_empty()], ) + @live_only() + @AllowLargeResponse() + @AKSCustomResourceGroupPreparer( + random_name_length=17, name_prefix="clitest", location="westus3" + ) + def test_aks_nodepool_add_with_enable_managed_gpu( + self, resource_group, resource_group_location + ): + aks_name = self.create_random_name("cliakstest", 16) + nodepool_name = self.create_random_name("n", 6) + + self.kwargs.update( + { + "resource_group": resource_group, + "name": aks_name, + "location": resource_group_location, + "ssh_key_value": self.generate_ssh_keys(), + "node_pool_name": nodepool_name, + "node_vm_size": "Standard_NC6s_v3", + } + ) + + # create + create_cmd = ( + "aks create --resource-group={resource_group} --name={name} " + "--ssh-key-value={ssh_key_value} " + ) + + self.cmd( + create_cmd, + checks=[ + self.check("provisioningState", "Succeeded"), + ], + ) + + # nodepool add + self.cmd( + "aks nodepool add --resource-group={resource_group} --cluster-name={name} --name={node_pool_name} " + "--node-vm-size={node_vm_size} --node-count 1" + "--enable-managed-gpu", + checks=[ + self.check("provisioningState", "Succeeded"), + self.check( + "gpuProfile.nvidia.managementMode", "Managed" + ), + ], + ) + + # delete + self.cmd( + "aks delete -g {resource_group} -n {name} --yes --no-wait", + checks=[self.is_empty()], + ) + @AllowLargeResponse() @AKSCustomResourceGroupPreparer( random_name_length=17, name_prefix="clitest", location="eastus" @@ -16443,6 +16497,67 @@ def test_aks_nodepool_update_with_artifact_streaming( ], ) + @live_only() + @AllowLargeResponse() + @AKSCustomResourceGroupPreparer( + random_name_length=17, name_prefix="clitest", location="westus3" + ) + def test_aks_nodepool_update_with_enable_managed_gpu( + self, resource_group, resource_group_location + ): + aks_name = self.create_random_name("cliakstest", 16) + nodepool_name = self.create_random_name("n", 6) + + self.kwargs.update( + { + "resource_group": resource_group, + "name": aks_name, + "location": resource_group_location, + "ssh_key_value": self.generate_ssh_keys(), + "node_pool_name": nodepool_name, + "node_vm_size": "Standard_NC6s_v3", + } + ) + + self.cmd( + "aks create " + "--resource-group={resource_group} " + "--name={name} " + "--location={location} " + "--ssh-key-value={ssh_key_value} " + "--nodepool-name={node_pool_name} " + "--node-count=1 " + "--node-vm-size={node_vm_size}", + checks=[ + self.check("provisioningState", "Succeeded"), + ], + ) + + self.cmd( + "aks nodepool update " + "--resource-group={resource_group} " + "--cluster-name={name} " + "--name={node_pool_name} " + "--enable-managed-gpu", + checks=[ + self.check("provisioningState", "Succeeded"), + self.check( + "gpuProfile.nvidia.managementMode", "Managed" + ), + ], + ) + + # delete + cmd = ( + "aks delete --resource-group={resource_group} --name={name} --yes --no-wait" + ) + self.cmd( + cmd, + checks=[ + self.is_empty(), + ], + ) + @AllowLargeResponse() @AKSCustomResourceGroupPreparer( random_name_length=17, name_prefix="clitest", location="westus2" From 374e8d3bdb1a90dadd5d782f43d092bd8b2b6327 Mon Sep 17 00:00:00 2001 From: Runzhen Wang Date: Thu, 19 Mar 2026 17:40:26 +0000 Subject: [PATCH 02/14] managed gpu --- .devcontainer/devcontainer.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index 9c2f13958e3..32534bc7b07 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -4,7 +4,7 @@ "features": { "ghcr.io/devcontainers/features/github-cli:1": {} }, - "workspaceFolder": "/workspaces/${localWorkspaceFolderBasename}", + "workspaceFolder": "/workspaces", "onCreateCommand": "python -m venv venv", "postCreateCommand": "REPO_NAME=$(basename $GITHUB_REPOSITORY) && cat $REPO_NAME/.devcontainer/login.sh >> ~/.bashrc && cp $REPO_NAME/.devcontainer/setup.sh easy_setup.sh && chmod +x easy_setup.sh", "hostRequirements": { From 0dfbc071c7a9ebfbdc0b476e52d09e105bdacbb5 Mon Sep 17 00:00:00 2001 From: Runzhen Date: Thu, 19 Mar 2026 10:54:40 -0700 Subject: [PATCH 03/14] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- .../azext_aks_preview/tests/latest/test_aks_commands.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/aks-preview/azext_aks_preview/tests/latest/test_aks_commands.py b/src/aks-preview/azext_aks_preview/tests/latest/test_aks_commands.py index 94bf8541da0..48d1af3b1fa 100644 --- a/src/aks-preview/azext_aks_preview/tests/latest/test_aks_commands.py +++ b/src/aks-preview/azext_aks_preview/tests/latest/test_aks_commands.py @@ -6907,7 +6907,7 @@ def test_aks_nodepool_add_with_enable_managed_gpu( self.cmd( "aks nodepool add --resource-group={resource_group} --cluster-name={name} --name={node_pool_name} " "--node-vm-size={node_vm_size} --node-count 1" - "--enable-managed-gpu", + " --enable-managed-gpu", checks=[ self.check("provisioningState", "Succeeded"), self.check( From 1938f190e08e317bc781098033087c61696accbd Mon Sep 17 00:00:00 2001 From: Runzhen Wang Date: Thu, 19 Mar 2026 20:30:19 +0000 Subject: [PATCH 04/14] managed gpu --- .../tests/latest/test_agentpool_decorator.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/aks-preview/azext_aks_preview/tests/latest/test_agentpool_decorator.py b/src/aks-preview/azext_aks_preview/tests/latest/test_agentpool_decorator.py index 18378a5acb6..fc4dfa66329 100644 --- a/src/aks-preview/azext_aks_preview/tests/latest/test_agentpool_decorator.py +++ b/src/aks-preview/azext_aks_preview/tests/latest/test_agentpool_decorator.py @@ -2399,12 +2399,12 @@ def common_update_artifact_streaming(self): ) dec_1.context.attach_agentpool(agentpool_1) dec_agentpool_1 = dec_1.update_artifact_streaming(agentpool_1) - grond_truth_agentpool_1 = self.create_initialized_agentpool_instance( + ground_truth_agentpool_1 = self.create_initialized_agentpool_instance( artifact_streaming_profile=self.models.AgentPoolArtifactStreamingProfile( enabled=True ) ) - self.assertEqual(dec_agentpool_1, grond_truth_agentpool_1) + self.assertEqual(dec_agentpool_1, ground_truth_agentpool_1) dec_2 = AKSPreviewAgentPoolUpdateDecorator( self.cmd, @@ -2419,12 +2419,12 @@ def common_update_artifact_streaming(self): agentpool_2 = self.create_initialized_agentpool_instance() dec_2.context.attach_agentpool(agentpool_2) dec_agentpool_2 = dec_2.update_artifact_streaming(agentpool_2) - grond_truth_agentpool_2 = self.create_initialized_agentpool_instance( + ground_truth_agentpool_2 = self.create_initialized_agentpool_instance( artifact_streaming_profile=self.models.AgentPoolArtifactStreamingProfile( enabled=True ) ) - self.assertEqual(dec_agentpool_2, grond_truth_agentpool_2) + self.assertEqual(dec_agentpool_2, ground_truth_agentpool_2) def common_update_managed_gpu(self): dec_1 = AKSPreviewAgentPoolUpdateDecorator( @@ -2446,14 +2446,14 @@ def common_update_managed_gpu(self): ) dec_1.context.attach_agentpool(agentpool_1) dec_agentpool_1 = dec_1.update_managed_gpu(agentpool_1) - grond_truth_agentpool_1 = self.create_initialized_agentpool_instance( + ground_truth_agentpool_1 = self.create_initialized_agentpool_instance( gpu_profile=self.models.GPUProfile( nvidia=self.models.NvidiaGPUProfile( management_mode=CONST_GPU_MANAGEMENT_MODE_MANAGED ) ) ) - self.assertEqual(dec_agentpool_1, grond_truth_agentpool_1) + self.assertEqual(dec_agentpool_1, ground_truth_agentpool_1) dec_2 = AKSPreviewAgentPoolUpdateDecorator( self.cmd, From 6a490510a97dc838916cee625e45fdae9e5202ae Mon Sep 17 00:00:00 2001 From: Runzhen Wang Date: Thu, 19 Mar 2026 20:45:02 +0000 Subject: [PATCH 05/14] managed gpu --- src/aks-preview/azext_aks_preview/_params.py | 4 ++-- src/aks-preview/azext_aks_preview/agentpool_decorator.py | 7 ++++++- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/src/aks-preview/azext_aks_preview/_params.py b/src/aks-preview/azext_aks_preview/_params.py index 9d6381b02e9..40d980e73c6 100644 --- a/src/aks-preview/azext_aks_preview/_params.py +++ b/src/aks-preview/azext_aks_preview/_params.py @@ -2030,7 +2030,7 @@ def load_arguments(self, _): "enable_managed_gpu", action="store_true", is_preview=True, - help="Enable the Managed GPU experience, which installs additional components like DCGM metrics for monitoring on top of the GPU driver.", + help="Enable the Managed GPU experience.", ) c.argument( "node_public_ip_tags", @@ -2145,7 +2145,7 @@ def load_arguments(self, _): "enable_managed_gpu", action="store_true", is_preview=True, - help="Enable the Managed GPU experience, which installs additional components like DCGM metrics for monitoring on top of the GPU driver.", + help="Enable the Managed GPU experience.", ) c.argument( "os_sku", diff --git a/src/aks-preview/azext_aks_preview/agentpool_decorator.py b/src/aks-preview/azext_aks_preview/agentpool_decorator.py index b60e573a5d7..64d91bb7f81 100644 --- a/src/aks-preview/azext_aks_preview/agentpool_decorator.py +++ b/src/aks-preview/azext_aks_preview/agentpool_decorator.py @@ -596,6 +596,9 @@ def get_enable_managed_gpu(self) -> bool: # read the original value passed by the command enable_managed_gpu = self.raw_param.get("enable_managed_gpu") + if enable_managed_gpu is None: + enable_managed_gpu = False + # In create mode, try to read the property value corresponding to the parameter from the `agentpool` object if self.decorator_mode == DecoratorMode.CREATE: if ( @@ -604,7 +607,9 @@ def get_enable_managed_gpu(self) -> bool: self.agentpool.gpu_profile.nvidia is not None and self.agentpool.gpu_profile.nvidia.management_mode is not None ): - enable_managed_gpu = self.agentpool.gpu_profile.nvidia.management_mode == CONST_GPU_MANAGEMENT_MODE_MANAGED + enable_managed_gpu = ( + self.agentpool.gpu_profile.nvidia.management_mode == CONST_GPU_MANAGEMENT_MODE_MANAGED + ) return enable_managed_gpu def get_pod_ip_allocation_mode(self: bool = False) -> Union[str, None]: From 1abc43ce920de48f6c27c7a922983a02b317b8e7 Mon Sep 17 00:00:00 2001 From: Runzhen Wang Date: Thu, 19 Mar 2026 20:55:47 +0000 Subject: [PATCH 06/14] managed gpu --- src/aks-preview/azext_aks_preview/agentpool_decorator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/aks-preview/azext_aks_preview/agentpool_decorator.py b/src/aks-preview/azext_aks_preview/agentpool_decorator.py index 64d91bb7f81..52194ae35a6 100644 --- a/src/aks-preview/azext_aks_preview/agentpool_decorator.py +++ b/src/aks-preview/azext_aks_preview/agentpool_decorator.py @@ -598,7 +598,7 @@ def get_enable_managed_gpu(self) -> bool: enable_managed_gpu = self.raw_param.get("enable_managed_gpu") if enable_managed_gpu is None: enable_managed_gpu = False - + # In create mode, try to read the property value corresponding to the parameter from the `agentpool` object if self.decorator_mode == DecoratorMode.CREATE: if ( From 719d15f8ab81eddd3eed53c349264cc0607f641e Mon Sep 17 00:00:00 2001 From: Runzhen Wang Date: Thu, 19 Mar 2026 21:17:27 +0000 Subject: [PATCH 07/14] managed gpu --- src/aks-preview/azext_aks_preview/agentpool_decorator.py | 1 - .../azext_aks_preview/tests/latest/test_agentpool_decorator.py | 1 - .../azext_aks_preview/tests/latest/test_aks_commands.py | 2 +- 3 files changed, 1 insertion(+), 3 deletions(-) diff --git a/src/aks-preview/azext_aks_preview/agentpool_decorator.py b/src/aks-preview/azext_aks_preview/agentpool_decorator.py index 52194ae35a6..72dd3f9b0ff 100644 --- a/src/aks-preview/azext_aks_preview/agentpool_decorator.py +++ b/src/aks-preview/azext_aks_preview/agentpool_decorator.py @@ -46,7 +46,6 @@ CONST_SSH_ACCESS_LOCALUSER, CONST_GPU_DRIVER_NONE, CONST_GPU_MANAGEMENT_MODE_MANAGED, - CONST_GPU_MANAGEMENT_MODE_UNMANAGED, CONST_NODEPOOL_MODE_MANAGEDSYSTEM, CONST_NODEPOOL_MODE_MACHINES, ) diff --git a/src/aks-preview/azext_aks_preview/tests/latest/test_agentpool_decorator.py b/src/aks-preview/azext_aks_preview/tests/latest/test_agentpool_decorator.py index fc4dfa66329..b6efa08b599 100644 --- a/src/aks-preview/azext_aks_preview/tests/latest/test_agentpool_decorator.py +++ b/src/aks-preview/azext_aks_preview/tests/latest/test_agentpool_decorator.py @@ -37,7 +37,6 @@ CONST_MANAGED_CLUSTER_SKU_NAME_AUTOMATIC, CONST_GPU_DRIVER_NONE, CONST_GPU_MANAGEMENT_MODE_MANAGED, - CONST_GPU_MANAGEMENT_MODE_UNMANAGED, CONST_NODEPOOL_MODE_MANAGEDSYSTEM, CONST_NODEPOOL_MODE_MACHINES, ) diff --git a/src/aks-preview/azext_aks_preview/tests/latest/test_aks_commands.py b/src/aks-preview/azext_aks_preview/tests/latest/test_aks_commands.py index 48d1af3b1fa..56a59b814a5 100644 --- a/src/aks-preview/azext_aks_preview/tests/latest/test_aks_commands.py +++ b/src/aks-preview/azext_aks_preview/tests/latest/test_aks_commands.py @@ -6906,7 +6906,7 @@ def test_aks_nodepool_add_with_enable_managed_gpu( # nodepool add self.cmd( "aks nodepool add --resource-group={resource_group} --cluster-name={name} --name={node_pool_name} " - "--node-vm-size={node_vm_size} --node-count 1" + "--node-vm-size={node_vm_size} --node-count 1 " " --enable-managed-gpu", checks=[ self.check("provisioningState", "Succeeded"), From 0499eb0940a80220583272de25f22ba59fa77454 Mon Sep 17 00:00:00 2001 From: Runzhen Wang Date: Thu, 19 Mar 2026 21:42:11 +0000 Subject: [PATCH 08/14] managed gpu --- .../azext_aks_preview/agentpool_decorator.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/src/aks-preview/azext_aks_preview/agentpool_decorator.py b/src/aks-preview/azext_aks_preview/agentpool_decorator.py index 72dd3f9b0ff..36cac026f2b 100644 --- a/src/aks-preview/azext_aks_preview/agentpool_decorator.py +++ b/src/aks-preview/azext_aks_preview/agentpool_decorator.py @@ -46,6 +46,7 @@ CONST_SSH_ACCESS_LOCALUSER, CONST_GPU_DRIVER_NONE, CONST_GPU_MANAGEMENT_MODE_MANAGED, + CONST_GPU_MANAGEMENT_MODE_UNMANAGED, CONST_NODEPOOL_MODE_MANAGEDSYSTEM, CONST_NODEPOOL_MODE_MACHINES, ) @@ -1752,12 +1753,16 @@ def update_managed_gpu(self, agentpool: AgentPool) -> AgentPool: enable_managed_gpu = self.context.get_enable_managed_gpu() + if agentpool.gpu_profile is None: + agentpool.gpu_profile = self.models.GPUProfile() # pylint: disable=no-member + if agentpool.gpu_profile.nvidia is None: + agentpool.gpu_profile.nvidia = self.models.NvidiaGPUProfile() # pylint: disable=no-member + if enable_managed_gpu: - if agentpool.gpu_profile is None: - agentpool.gpu_profile = self.models.GPUProfile() # pylint: disable=no-member - if agentpool.gpu_profile.nvidia is None: - agentpool.gpu_profile.nvidia = self.models.NvidiaGPUProfile() # pylint: disable=no-member agentpool.gpu_profile.nvidia.management_mode = CONST_GPU_MANAGEMENT_MODE_MANAGED + else: + agentpool.gpu_profile.nvidia.management_mode = CONST_GPU_MANAGEMENT_MODE_UNMANAGED + return agentpool def update_os_sku(self, agentpool: AgentPool) -> AgentPool: From 5881f7b609ca54e21800e628b1a7afcd68f5138e Mon Sep 17 00:00:00 2001 From: Runzhen Wang Date: Fri, 20 Mar 2026 01:04:39 +0000 Subject: [PATCH 09/14] managed gpu --- src/aks-preview/azext_aks_preview/agentpool_decorator.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/aks-preview/azext_aks_preview/agentpool_decorator.py b/src/aks-preview/azext_aks_preview/agentpool_decorator.py index 36cac026f2b..e8c39b5e088 100644 --- a/src/aks-preview/azext_aks_preview/agentpool_decorator.py +++ b/src/aks-preview/azext_aks_preview/agentpool_decorator.py @@ -589,15 +589,13 @@ def get_enable_artifact_streaming(self) -> bool: enable_artifact_streaming = self.agentpool.artifact_streaming_profile.enabled return enable_artifact_streaming - def get_enable_managed_gpu(self) -> bool: + def get_enable_managed_gpu(self) -> Union[bool, None]: """Obtain the value of enable_managed_gpu. :return: bool """ # read the original value passed by the command enable_managed_gpu = self.raw_param.get("enable_managed_gpu") - if enable_managed_gpu is None: - enable_managed_gpu = False # In create mode, try to read the property value corresponding to the parameter from the `agentpool` object if self.decorator_mode == DecoratorMode.CREATE: @@ -1752,6 +1750,8 @@ def update_managed_gpu(self, agentpool: AgentPool) -> AgentPool: self._ensure_agentpool(agentpool) enable_managed_gpu = self.context.get_enable_managed_gpu() + if enable_managed_gpu is None: + return agentpool if agentpool.gpu_profile is None: agentpool.gpu_profile = self.models.GPUProfile() # pylint: disable=no-member From 572aa0e1c5e6df823d9068fbaf8e4484eae21bf5 Mon Sep 17 00:00:00 2001 From: Runzhen Wang Date: Fri, 20 Mar 2026 21:27:50 +0000 Subject: [PATCH 10/14] managed gpu --- .../azext_aks_preview/agentpool_decorator.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/aks-preview/azext_aks_preview/agentpool_decorator.py b/src/aks-preview/azext_aks_preview/agentpool_decorator.py index e8c39b5e088..31f818b9eb8 100644 --- a/src/aks-preview/azext_aks_preview/agentpool_decorator.py +++ b/src/aks-preview/azext_aks_preview/agentpool_decorator.py @@ -1753,15 +1753,15 @@ def update_managed_gpu(self, agentpool: AgentPool) -> AgentPool: if enable_managed_gpu is None: return agentpool - if agentpool.gpu_profile is None: - agentpool.gpu_profile = self.models.GPUProfile() # pylint: disable=no-member - if agentpool.gpu_profile.nvidia is None: - agentpool.gpu_profile.nvidia = self.models.NvidiaGPUProfile() # pylint: disable=no-member - if enable_managed_gpu: + if agentpool.gpu_profile is None: + agentpool.gpu_profile = self.models.GPUProfile() # pylint: disable=no-member + if agentpool.gpu_profile.nvidia is None: + agentpool.gpu_profile.nvidia = self.models.NvidiaGPUProfile() # pylint: disable=no-member agentpool.gpu_profile.nvidia.management_mode = CONST_GPU_MANAGEMENT_MODE_MANAGED else: - agentpool.gpu_profile.nvidia.management_mode = CONST_GPU_MANAGEMENT_MODE_UNMANAGED + if agentpool.gpu_profile and agentpool.gpu_profile.nvidia: + agentpool.gpu_profile.nvidia.management_mode = CONST_GPU_MANAGEMENT_MODE_UNMANAGED return agentpool From cc79e1021ae6082bec8297dd7456842b3a8c4dfc Mon Sep 17 00:00:00 2001 From: Runzhen Wang Date: Mon, 23 Mar 2026 04:29:24 +0000 Subject: [PATCH 11/14] gpu --- .../latest/test_update_agentpool_profile_preview.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/src/aks-preview/azext_aks_preview/tests/latest/test_update_agentpool_profile_preview.py b/src/aks-preview/azext_aks_preview/tests/latest/test_update_agentpool_profile_preview.py index 7df5619d3ac..a9d5f9548ab 100644 --- a/src/aks-preview/azext_aks_preview/tests/latest/test_update_agentpool_profile_preview.py +++ b/src/aks-preview/azext_aks_preview/tests/latest/test_update_agentpool_profile_preview.py @@ -123,6 +123,7 @@ def test_update_agentpool_profile_preview_default_behavior(self): # Mock all the update methods to return the agentpool unchanged decorator.update_network_profile = Mock(return_value=agentpool) decorator.update_artifact_streaming = Mock(return_value=agentpool) + decorator.update_managed_gpu = Mock(return_value=agentpool) decorator.update_secure_boot = Mock(return_value=agentpool) decorator.update_vtpm = Mock(return_value=agentpool) decorator.update_os_sku = Mock(return_value=agentpool) @@ -146,6 +147,7 @@ def test_update_agentpool_profile_preview_default_behavior(self): # Verify that all update methods were called decorator.update_network_profile.assert_called_once_with(agentpool) decorator.update_artifact_streaming.assert_called_once_with(agentpool) + decorator.update_managed_gpu.assert_called_once_with(agentpool) decorator.update_secure_boot.assert_called_once_with(agentpool) decorator.update_vtpm.assert_called_once_with(agentpool) decorator.update_os_sku.assert_called_once_with(agentpool) @@ -187,6 +189,7 @@ def test_update_agentpool_profile_preview_with_agentpools_parameter(self): # Mock all the update methods to return the agentpool unchanged decorator.update_network_profile = Mock(return_value=agentpool) decorator.update_artifact_streaming = Mock(return_value=agentpool) + decorator.update_managed_gpu = Mock(return_value=agentpool) decorator.update_secure_boot = Mock(return_value=agentpool) decorator.update_vtpm = Mock(return_value=agentpool) decorator.update_os_sku = Mock(return_value=agentpool) @@ -238,6 +241,7 @@ def test_update_agentpool_profile_preview_managed_system_mode(self): # Mock all the update methods (they should not be called for ManagedSystem mode) decorator.update_network_profile = Mock() decorator.update_artifact_streaming = Mock() + decorator.update_managed_gpu = Mock() decorator.update_secure_boot = Mock() decorator.update_vtpm = Mock() decorator.update_os_sku = Mock() @@ -267,6 +271,7 @@ def test_update_agentpool_profile_preview_managed_system_mode(self): # Verify that none of the update methods were called for ManagedSystem mode decorator.update_network_profile.assert_not_called() decorator.update_artifact_streaming.assert_not_called() + decorator.update_managed_gpu.assert_not_called() decorator.update_secure_boot.assert_not_called() decorator.update_vtpm.assert_not_called() decorator.update_os_sku.assert_not_called() @@ -345,6 +350,7 @@ def test_update_agentpool_profile_preview_system_mode_regular_flow(self): # Mock all the update methods to return the agentpool unchanged decorator.update_network_profile = Mock(return_value=agentpool) decorator.update_artifact_streaming = Mock(return_value=agentpool) + decorator.update_managed_gpu = Mock(return_value=agentpool) decorator.update_secure_boot = Mock(return_value=agentpool) decorator.update_vtpm = Mock(return_value=agentpool) decorator.update_os_sku = Mock(return_value=agentpool) @@ -366,6 +372,7 @@ def test_update_agentpool_profile_preview_system_mode_regular_flow(self): # Verify that all update methods were called for System mode decorator.update_network_profile.assert_called_once_with(agentpool) decorator.update_artifact_streaming.assert_called_once_with(agentpool) + decorator.update_managed_gpu.assert_called_once_with(agentpool) decorator.update_secure_boot.assert_called_once_with(agentpool) decorator.update_vtpm.assert_called_once_with(agentpool) decorator.update_os_sku.assert_called_once_with(agentpool) @@ -412,6 +419,7 @@ def mock_method(pool): decorator.update_network_profile = create_mock_update_method("update_network_profile") decorator.update_artifact_streaming = create_mock_update_method("update_artifact_streaming") + decorator.update_managed_gpu = create_mock_update_method("update_managed_gpu") decorator.update_secure_boot = create_mock_update_method("update_secure_boot") decorator.update_vtpm = create_mock_update_method("update_vtpm") decorator.update_os_sku = create_mock_update_method("update_os_sku") @@ -430,6 +438,7 @@ def mock_method(pool): expected_order = [ "update_network_profile", "update_artifact_streaming", + "update_managed_gpu", "update_secure_boot", "update_vtpm", "update_os_sku", @@ -478,6 +487,7 @@ def track_and_return(pool): decorator.update_network_profile = create_tracking_mock("update_network_profile") decorator.update_artifact_streaming = create_tracking_mock("update_artifact_streaming") + decorator.update_managed_gpu = create_tracking_mock("update_managed_gpu") decorator.update_secure_boot = create_tracking_mock("update_secure_boot") decorator.update_vtpm = create_tracking_mock("update_vtpm") decorator.update_os_sku = create_tracking_mock("update_os_sku") @@ -547,7 +557,7 @@ def test_update_agentpool_profile_preview_mixed_modes_scenario(self): # Mock all update methods update_methods = [ - 'update_network_profile', 'update_artifact_streaming', + 'update_network_profile', 'update_artifact_streaming', 'update_managed_gpu', 'update_secure_boot', 'update_vtpm', 'update_os_sku', 'update_fips_image', 'update_ssh_access', 'update_localdns_profile', 'update_auto_scaler_properties_vms', 'update_upgrade_strategy', 'update_blue_green_upgrade_settings', 'update_gpu_profile' @@ -613,6 +623,7 @@ def test_update_agentpool_profile_preview_managed_cluster_mode(self): # Mock all the update methods decorator.update_network_profile = Mock(return_value=agentpool) decorator.update_artifact_streaming = Mock(return_value=agentpool) + decorator.update_managed_gpu = Mock(return_value=agentpool) decorator.update_secure_boot = Mock(return_value=agentpool) decorator.update_vtpm = Mock(return_value=agentpool) decorator.update_os_sku = Mock(return_value=agentpool) From 23b7f62e1a7bce4a44fa9ca66e6ba67944b1ce6d Mon Sep 17 00:00:00 2001 From: Runzhen Wang Date: Tue, 24 Mar 2026 00:16:44 +0000 Subject: [PATCH 12/14] managed gpu --- .../azcli_aks_live_test/configs/ext_matrix_default.json | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/aks-preview/azcli_aks_live_test/configs/ext_matrix_default.json b/src/aks-preview/azcli_aks_live_test/configs/ext_matrix_default.json index cbec9591737..77b2ec6534a 100644 --- a/src/aks-preview/azcli_aks_live_test/configs/ext_matrix_default.json +++ b/src/aks-preview/azcli_aks_live_test/configs/ext_matrix_default.json @@ -22,7 +22,10 @@ ], "gpu, no quota": [ "test_aks_nodepool_add_with_gpu_instance_profile", - "test_aks_gpu_driver_type" + "test_aks_gpu_driver_type", + "test_aks_nodepool_add_with_enable_managed_gpu", + "test_aks_nodepool_update_with_enable_managed_gpu" + ], "pod ip allocation mode static block, missing feature registration": [ "test_aks_create_with_pod_ip_allocation_mode_static_block" From 5a2e3a24a8b3b5fe5d283c38c5d86313edc0ff37 Mon Sep 17 00:00:00 2001 From: Runzhen Wang Date: Tue, 24 Mar 2026 01:15:52 +0000 Subject: [PATCH 13/14] gpu --- .../azcli_aks_live_test/configs/ext_matrix_default.json | 1 - 1 file changed, 1 deletion(-) diff --git a/src/aks-preview/azcli_aks_live_test/configs/ext_matrix_default.json b/src/aks-preview/azcli_aks_live_test/configs/ext_matrix_default.json index 77b2ec6534a..e3e54bb1540 100644 --- a/src/aks-preview/azcli_aks_live_test/configs/ext_matrix_default.json +++ b/src/aks-preview/azcli_aks_live_test/configs/ext_matrix_default.json @@ -25,7 +25,6 @@ "test_aks_gpu_driver_type", "test_aks_nodepool_add_with_enable_managed_gpu", "test_aks_nodepool_update_with_enable_managed_gpu" - ], "pod ip allocation mode static block, missing feature registration": [ "test_aks_create_with_pod_ip_allocation_mode_static_block" From 0fb29824b852b910b1dd4bf666e1a42cf3fd72d4 Mon Sep 17 00:00:00 2001 From: Runzhen Wang Date: Tue, 24 Mar 2026 19:49:42 +0000 Subject: [PATCH 14/14] gpu --- src/aks-preview/azext_aks_preview/agentpool_decorator.py | 3 +++ .../azext_aks_preview/tests/latest/test_agentpool_decorator.py | 2 ++ .../azext_aks_preview/tests/latest/test_aks_commands.py | 2 ++ 3 files changed, 7 insertions(+) diff --git a/src/aks-preview/azext_aks_preview/agentpool_decorator.py b/src/aks-preview/azext_aks_preview/agentpool_decorator.py index 31f818b9eb8..6446ac61edc 100644 --- a/src/aks-preview/azext_aks_preview/agentpool_decorator.py +++ b/src/aks-preview/azext_aks_preview/agentpool_decorator.py @@ -44,6 +44,7 @@ CONST_DEFAULT_WINDOWS_VMS_VM_SIZE, CONST_MANAGED_CLUSTER_SKU_NAME_AUTOMATIC, CONST_SSH_ACCESS_LOCALUSER, + CONST_GPU_DRIVER_INSTALL, CONST_GPU_DRIVER_NONE, CONST_GPU_MANAGEMENT_MODE_MANAGED, CONST_GPU_MANAGEMENT_MODE_UNMANAGED, @@ -1311,6 +1312,7 @@ def set_up_managed_gpu(self, agentpool: AgentPool) -> AgentPool: if agentpool.gpu_profile.nvidia is None: agentpool.gpu_profile.nvidia = self.models.NvidiaGPUProfile() # pylint: disable=no-member agentpool.gpu_profile.nvidia.management_mode = CONST_GPU_MANAGEMENT_MODE_MANAGED + agentpool.gpu_profile.driver = CONST_GPU_DRIVER_INSTALL return agentpool def set_up_ssh_access(self, agentpool: AgentPool) -> AgentPool: @@ -1759,6 +1761,7 @@ def update_managed_gpu(self, agentpool: AgentPool) -> AgentPool: if agentpool.gpu_profile.nvidia is None: agentpool.gpu_profile.nvidia = self.models.NvidiaGPUProfile() # pylint: disable=no-member agentpool.gpu_profile.nvidia.management_mode = CONST_GPU_MANAGEMENT_MODE_MANAGED + agentpool.gpu_profile.driver = CONST_GPU_DRIVER_INSTALL else: if agentpool.gpu_profile and agentpool.gpu_profile.nvidia: agentpool.gpu_profile.nvidia.management_mode = CONST_GPU_MANAGEMENT_MODE_UNMANAGED diff --git a/src/aks-preview/azext_aks_preview/tests/latest/test_agentpool_decorator.py b/src/aks-preview/azext_aks_preview/tests/latest/test_agentpool_decorator.py index b6efa08b599..8bbe8fe9028 100644 --- a/src/aks-preview/azext_aks_preview/tests/latest/test_agentpool_decorator.py +++ b/src/aks-preview/azext_aks_preview/tests/latest/test_agentpool_decorator.py @@ -1513,6 +1513,7 @@ def common_set_up_managed_gpu(self): dec_agentpool_1 = self._restore_defaults_in_agentpool(dec_agentpool_1) ground_truth_agentpool_1 = self.create_initialized_agentpool_instance( gpu_profile=self.models.GPUProfile( + driver=CONST_GPU_DRIVER_INSTALL, nvidia=self.models.NvidiaGPUProfile( management_mode=CONST_GPU_MANAGEMENT_MODE_MANAGED ) @@ -2469,6 +2470,7 @@ def common_update_managed_gpu(self): dec_agentpool_2 = dec_2.update_managed_gpu(agentpool_2) grond_truth_agentpool_2 = self.create_initialized_agentpool_instance( gpu_profile=self.models.GPUProfile( + driver=CONST_GPU_DRIVER_INSTALL, nvidia=self.models.NvidiaGPUProfile( management_mode=CONST_GPU_MANAGEMENT_MODE_MANAGED ) diff --git a/src/aks-preview/azext_aks_preview/tests/latest/test_aks_commands.py b/src/aks-preview/azext_aks_preview/tests/latest/test_aks_commands.py index d8b35e16386..9ffe956d6d6 100644 --- a/src/aks-preview/azext_aks_preview/tests/latest/test_aks_commands.py +++ b/src/aks-preview/azext_aks_preview/tests/latest/test_aks_commands.py @@ -6910,6 +6910,7 @@ def test_aks_nodepool_add_with_enable_managed_gpu( " --enable-managed-gpu", checks=[ self.check("provisioningState", "Succeeded"), + self.check("gpuProfile.driver", "Install"), self.check( "gpuProfile.nvidia.managementMode", "Managed" ), @@ -16541,6 +16542,7 @@ def test_aks_nodepool_update_with_enable_managed_gpu( "--enable-managed-gpu", checks=[ self.check("provisioningState", "Succeeded"), + self.check("gpuProfile.driver", "Install"), self.check( "gpuProfile.nvidia.managementMode", "Managed" ),