Skip to content

Commit 0ff4a5e

Browse files
authored
add osdisk full cache feature (#9875)
1 parent 8db0a28 commit 0ff4a5e

8 files changed

Lines changed: 272 additions & 3 deletions

File tree

src/aks-preview/HISTORY.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ To release a new version, please select a new version number (usually plus 1 to
1111

1212
Pending
1313
+++++++
14+
* `az aks create` and `az aks nodepool add`: Add `--enable-osdisk-full-caching` (preview) to enable the full-cache ephemeral OS disk feature for a node pool. Requires registration of the AFEC feature `Microsoft.ContainerService/FullCachePreview`. The property is immutable after node pool creation.
1415

1516
21.0.0b1
1617
++++++

src/aks-preview/azext_aks_preview/_help.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,14 @@
5555
- name: --node-osdisk-type
5656
type: string
5757
short-summary: OS disk type to be used for machines in a given agent pool. Defaults to 'Ephemeral' when possible in conjunction with VM size and OS disk size. May not be changed for this pool after creation. ('Ephemeral' or 'Managed')
58+
- name: --enable-osdisk-fc --enable-osdisk-full-caching
59+
type: bool
60+
short-summary: Enable the full-cache ephemeral OS disk feature for the default node pool.
61+
long-summary: |-
62+
When enabled, the entire operating system is cached on the local
63+
ephemeral OS disk to mitigate E17 events caused by network failures.
64+
Requires Ephemeral OS disk and a VM size with sufficient cache.
65+
This property is immutable after the node pool is created.
5866
- name: --node-osdisk-diskencryptionset-id -d
5967
type: string
6068
short-summary: ResourceId of the disk encryption set to use for enabling encryption at rest on agent node os disk.
@@ -2100,6 +2108,14 @@
21002108
- name: --node-osdisk-type
21012109
type: string
21022110
short-summary: OS disk type to be used for machines in a given agent pool. Defaults to 'Ephemeral' when possible in conjunction with VM size and OS disk size. May not be changed for this pool after creation. ('Ephemeral' or 'Managed')
2111+
- name: --enable-osdisk-fc --enable-osdisk-full-caching
2112+
type: bool
2113+
short-summary: Enable the full-cache ephemeral OS disk feature for the node pool.
2114+
long-summary: |-
2115+
When enabled, the entire operating system is cached on the local
2116+
ephemeral OS disk to mitigate E17 events caused by network failures.
2117+
Requires Ephemeral OS disk and a VM size with sufficient cache.
2118+
This property is immutable after the node pool is created.
21032119
- name: --max-pods -m
21042120
type: int
21052121
short-summary: The maximum number of pods deployable to a node.

src/aks-preview/azext_aks_preview/_params.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -238,6 +238,7 @@
238238
validate_force_upgrade_disable_and_enable_parameters,
239239
validate_azure_service_mesh_revision,
240240
validate_artifact_streaming,
241+
validate_os_disk_full_caching,
241242
validate_custom_endpoints,
242243
validate_bootstrap_container_registry_resource_id,
243244
validate_gateway_prefix_size,
@@ -895,6 +896,13 @@ def load_arguments(self, _):
895896
)
896897
c.argument("node_osdisk_type", arg_type=get_enum_type(node_os_disk_types))
897898
c.argument("node_osdisk_size", type=int)
899+
c.argument(
900+
"enable_os_disk_full_caching",
901+
options_list=["--enable-osdisk-full-caching", "--enable-osdisk-fc"],
902+
action="store_true",
903+
validator=validate_os_disk_full_caching,
904+
is_preview=True,
905+
)
898906
c.argument("max_pods", type=int, options_list=["--max-pods", "-m"])
899907
c.argument("vm_set_type", validator=validate_vm_set_type)
900908
c.argument(
@@ -2062,6 +2070,13 @@ def load_arguments(self, _):
20622070
c.argument("node_taints", validator=validate_nodepool_taints)
20632071
c.argument("node_osdisk_type", arg_type=get_enum_type(node_os_disk_types))
20642072
c.argument("node_osdisk_size", type=int)
2073+
c.argument(
2074+
"enable_os_disk_full_caching",
2075+
options_list=["--enable-osdisk-full-caching", "--enable-osdisk-fc"],
2076+
action="store_true",
2077+
validator=validate_os_disk_full_caching,
2078+
is_preview=True,
2079+
)
20652080
# upgrade strategy
20662081
c.argument("upgrade_strategy", arg_type=get_enum_type(upgrade_strategies))
20672082
# rolling upgrade params

src/aks-preview/azext_aks_preview/_validators.py

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,8 @@
2121
CONST_MANAGED_CLUSTER_SKU_TIER_STANDARD,
2222
CONST_NETWORK_POD_IP_ALLOCATION_MODE_DYNAMIC_INDIVIDUAL,
2323
CONST_NETWORK_POD_IP_ALLOCATION_MODE_STATIC_BLOCK,
24-
CONST_NODEPOOL_MODE_GATEWAY, CONST_OS_SKU_AZURELINUX,
24+
CONST_NODEPOOL_MODE_GATEWAY, CONST_OS_DISK_TYPE_MANAGED,
25+
CONST_OS_SKU_AZURELINUX,
2526
CONST_OS_SKU_CBLMARINER, CONST_OS_SKU_MARINER)
2627
from azext_aks_preview._helpers import _fuzzy_match
2728
from azure.cli.core import keys
@@ -953,6 +954,22 @@ def validate_asm_egress_name(namespace):
953954
)
954955

955956

957+
def validate_os_disk_full_caching(namespace):
    """Fail fast when full-cache OS disk is combined with a Managed OS disk.

    The check only fires when ``enable_os_disk_full_caching`` is truthy on the
    parsed namespace AND ``node_osdisk_type`` equals
    ``CONST_OS_DISK_TYPE_MANAGED``. An unset OS disk type is allowed through,
    as is any value other than Managed.

    :param namespace: parsed argument namespace; both attributes are read with
        ``getattr`` defaults, so a namespace lacking them is accepted.
    :raises ArgumentUsageError: when the flag is set together with an
        explicitly Managed OS disk type.
    """
    full_caching_requested = getattr(namespace, "enable_os_disk_full_caching", False)
    os_disk_type = getattr(namespace, "node_osdisk_type", None)
    # Rejecting the combination here gives a clearer error than waiting for
    # the request to be refused server-side.
    if full_caching_requested and os_disk_type == CONST_OS_DISK_TYPE_MANAGED:
        raise ArgumentUsageError(
            "--enable-osdisk-full-caching requires Ephemeral OS disk; "
            "it cannot be used with --node-osdisk-type Managed."
        )
971+
972+
956973
def validate_artifact_streaming(namespace):
957974
"""Validates artifact streaming flags for mutual exclusivity and OS support."""
958975
enable_artifact_streaming = getattr(namespace, "enable_artifact_streaming", False)

src/aks-preview/azext_aks_preview/agentpool_decorator.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -606,6 +606,19 @@ def get_enable_artifact_streaming(self) -> bool:
606606
)
607607
return enable_artifact_streaming
608608

609+
def get_enable_os_disk_full_caching(self) -> bool:
    """Resolve the effective value of enable_os_disk_full_caching.

    The raw parameter is the baseline. In CREATE mode only, a non-None value
    already present on the attached agentpool takes precedence over it; in
    every other mode the raw parameter is returned untouched.

    NOTE(review): the raw lookup uses ``.get`` without a default, so the
    result is presumably None when the key is absent -- confirm against
    callers that rely on a strict bool.

    :return: bool
    """
    raw_value = self.raw_param.get("enable_os_disk_full_caching")
    in_create_mode = self.decorator_mode == DecoratorMode.CREATE
    if not in_create_mode:
        return raw_value

    attached = self.agentpool
    # An explicit False on the attached agentpool still wins over the raw
    # parameter; only None means "not set".
    if attached and attached.enable_os_disk_full_caching is not None:
        return attached.enable_os_disk_full_caching
    return raw_value
621+
609622
def get_enable_managed_gpu(self) -> Union[bool, None]:
610623
"""Obtain the value of enable_managed_gpu.
611624
:return: bool
@@ -1327,6 +1340,14 @@ def set_up_artifact_streaming(self, agentpool: AgentPool) -> AgentPool:
13271340
agentpool.artifact_streaming_profile.enabled = True
13281341
return agentpool
13291342

1343+
def set_up_os_disk_full_caching(self, agentpool: AgentPool) -> AgentPool:
    """Apply the enable_os_disk_full_caching setting to the AgentPool object.

    The property is written as True only when the context reports a truthy
    value; otherwise the agentpool is returned without touching the field.

    :param agentpool: the AgentPool being constructed; checked first via
        ``_ensure_agentpool``.
    :return: the same AgentPool object
    """
    self._ensure_agentpool(agentpool)

    full_caching_requested = self.context.get_enable_os_disk_full_caching()
    if not full_caching_requested:
        # Leave the field as-is rather than writing an explicit False.
        return agentpool

    agentpool.enable_os_disk_full_caching = True
    return agentpool
1350+
13301351
def set_up_managed_gpu(self, agentpool: AgentPool) -> AgentPool:
13311352
"""Set up managed GPU property for the AgentPool object."""
13321353
self._ensure_agentpool(agentpool)
@@ -1606,6 +1627,8 @@ def construct_agentpool_profile_preview(self) -> AgentPool:
16061627
agentpool = self.set_up_init_taints(agentpool)
16071628
# set up artifact streaming
16081629
agentpool = self.set_up_artifact_streaming(agentpool)
1630+
# set up os disk full caching
1631+
agentpool = self.set_up_os_disk_full_caching(agentpool)
16091632
# set up managed gpu
16101633
agentpool = self.set_up_managed_gpu(agentpool)
16111634
# set up skip_gpu_driver_install

src/aks-preview/azext_aks_preview/custom.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1065,6 +1065,7 @@ def aks_create(
10651065
nodepool_initialization_taints=None,
10661066
node_osdisk_type=None,
10671067
node_osdisk_size=0,
1068+
enable_os_disk_full_caching=False,
10681069
vm_set_type=None,
10691070
zones=None,
10701071
ppg=None,
@@ -1934,6 +1935,7 @@ def aks_agentpool_add(
19341935
node_taints=None,
19351936
node_osdisk_type=None,
19361937
node_osdisk_size=0,
1938+
enable_os_disk_full_caching=False,
19371939
max_surge=None,
19381940
drain_timeout=None,
19391941
node_soak_duration=None,

src/aks-preview/azext_aks_preview/tests/latest/test_agentpool_decorator.py

Lines changed: 104 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -258,6 +258,61 @@ def common_get_enable_artifact_streaming(self):
258258
ctx_2.attach_agentpool(agentpool_2)
259259
self.assertEqual(ctx_2.get_enable_artifact_streaming(), None)
260260

261+
def common_get_enable_os_disk_full_caching(self):
    """Shared checks for the context getter get_enable_os_disk_full_caching.

    Covers the default raw value, the CREATE-mode precedence of the attached
    agentpool over the raw parameter (in both directions), and the UPDATE-mode
    behaviour where the attached agentpool is ignored.
    """

    def build_context(raw_value, mode):
        # Every scenario differs only in the raw flag value and decorator mode.
        return AKSPreviewAgentPoolContext(
            self.cmd,
            AKSAgentPoolParamDict({"enable_os_disk_full_caching": raw_value}),
            self.models,
            mode,
            self.agentpool_decorator_mode,
        )

    # default: store_true flag not provided -> raw is False
    default_ctx = build_context(False, DecoratorMode.CREATE)
    self.assertEqual(default_ctx.get_enable_os_disk_full_caching(), False)
    # CREATE: value on attached agentpool overrides default False
    pool_with_caching = self.create_initialized_agentpool_instance(
        enable_os_disk_full_caching=True
    )
    default_ctx.attach_agentpool(pool_with_caching)
    self.assertEqual(default_ctx.get_enable_os_disk_full_caching(), True)

    # explicit True from raw param, no attached agentpool
    explicit_ctx = build_context(True, DecoratorMode.CREATE)
    self.assertEqual(explicit_ctx.get_enable_os_disk_full_caching(), True)

    # priority: raw True is overridden by attached agentpool False in CREATE mode
    overridden_ctx = build_context(True, DecoratorMode.CREATE)
    pool_without_caching = self.create_initialized_agentpool_instance(
        enable_os_disk_full_caching=False
    )
    overridden_ctx.attach_agentpool(pool_without_caching)
    self.assertEqual(overridden_ctx.get_enable_os_disk_full_caching(), False)

    # UPDATE mode gate: attached agentpool MUST NOT override raw_param
    update_ctx = build_context(False, DecoratorMode.UPDATE)
    existing_pool = self.create_initialized_agentpool_instance(
        enable_os_disk_full_caching=True
    )
    update_ctx.attach_agentpool(existing_pool)
    self.assertEqual(update_ctx.get_enable_os_disk_full_caching(), False)
315+
261316
def common_get_disable_artifact_streaming(self):
262317
# default
263318
ctx_1 = AKSPreviewAgentPoolContext(
@@ -1155,6 +1210,9 @@ def test_get_workload_runtime(self):
11551210
def test_get_enable_artifact_streaming(self):
11561211
self.common_get_enable_artifact_streaming()
11571212

1213+
def test_get_enable_os_disk_full_caching(self):
1214+
self.common_get_enable_os_disk_full_caching()
1215+
11581216
def test_get_disable_artifact_streaming(self):
11591217
self.common_get_disable_artifact_streaming()
11601218

@@ -1257,7 +1315,10 @@ def test_get_workload_runtime(self):
12571315

12581316
def test_get_enable_artifact_streaming(self):
12591317
self.common_get_enable_artifact_streaming()
1260-
1318+
1319+
def test_get_enable_os_disk_full_caching(self):
1320+
self.common_get_enable_os_disk_full_caching()
1321+
12611322
def test_get_enable_managed_gpu(self):
12621323
self.common_get_enable_managed_gpu()
12631324

@@ -1583,6 +1644,41 @@ def common_set_up_artifact_streaming(self):
15831644
)
15841645
self.assertEqual(dec_agentpool_1, ground_truth_agentpool_1)
15851646

1647+
def common_set_up_os_disk_full_caching(self):
    """Shared checks for the decorator step set_up_os_disk_full_caching.

    Verifies that a None agentpool is rejected with CLIInternalError, that a
    False raw flag leaves the agentpool equal to a freshly initialized one,
    and that a True raw flag yields an agentpool with the property set.
    """

    def build_decorator(raw_value):
        # Both scenarios differ only in the raw flag value.
        return AKSPreviewAgentPoolAddDecorator(
            self.cmd,
            self.client,
            {"enable_os_disk_full_caching": raw_value},
            self.resource_type,
            self.agentpool_decorator_mode,
        )

    # default: store_true flag not provided -> raw is False -> field stays unset
    unset_decorator = build_decorator(False)
    with self.assertRaises(CLIInternalError):
        unset_decorator.set_up_os_disk_full_caching(None)
    unset_pool = self.create_initialized_agentpool_instance(restore_defaults=False)
    unset_decorator.context.attach_agentpool(unset_pool)
    unset_result = self._restore_defaults_in_agentpool(
        unset_decorator.set_up_os_disk_full_caching(unset_pool)
    )
    expected_unset = self.create_initialized_agentpool_instance()
    self.assertEqual(unset_result, expected_unset)

    # explicit True -> field set to True
    enabled_decorator = build_decorator(True)
    enabled_pool = self.create_initialized_agentpool_instance(restore_defaults=False)
    enabled_decorator.context.attach_agentpool(enabled_pool)
    enabled_result = self._restore_defaults_in_agentpool(
        enabled_decorator.set_up_os_disk_full_caching(enabled_pool)
    )
    expected_enabled = self.create_initialized_agentpool_instance(
        enable_os_disk_full_caching=True
    )
    self.assertEqual(enabled_result, expected_enabled)
1681+
15861682
def common_set_up_managed_gpu(self):
15871683
dec_1 = AKSPreviewAgentPoolAddDecorator(
15881684
self.cmd,
@@ -2157,6 +2253,9 @@ def test_set_up_gpu_propertes(self):
21572253
def test_set_up_artifact_streaming(self):
21582254
self.common_set_up_artifact_streaming()
21592255

2256+
def test_set_up_os_disk_full_caching(self):
2257+
self.common_set_up_os_disk_full_caching()
2258+
21602259
def test_set_up_managed_gpu(self):
21612260
self.common_set_up_managed_gpu()
21622261

@@ -2305,7 +2404,10 @@ def test_set_up_gpu_propertes(self):
23052404

23062405
def test_set_up_artifact_streaming(self):
23072406
self.common_set_up_artifact_streaming()
2308-
2407+
2408+
def test_set_up_os_disk_full_caching(self):
2409+
self.common_set_up_os_disk_full_caching()
2410+
23092411
def test_set_up_managed_gpu(self):
23102412
self.common_set_up_managed_gpu()
23112413

0 commit comments

Comments
 (0)