Skip to content

Commit d169483

Browse files
Add blue green upgrade support for aks-preview (#8999)
1 parent b0d843e commit d169483

14 files changed

Lines changed: 7025 additions & 7 deletions

src/aks-preview/HISTORY.rst

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,9 @@ To release a new version, please select a new version number (usually plus 1 to
1212
Pending
1313
+++++++
1414
* Add machine command `az aks machine add` to add a machine to an existing machine pool.
15+
* Add blue-green upgrade strategy support for AKS node pools:
16+
- `az aks nodepool add/update/upgrade`: Add `--upgrade-strategy` parameter to switch between rolling and blue-green nodepool upgrades.
17+
- `az aks nodepool add/update/upgrade`: Add `--drain-batch-size`, `--drain-timeout-bg`, `--batch-soak-duration`, `--final-soak-duration` parameters to configure blue-green upgrade settings.
1518

1619
18.0.0b37
1720
+++++++

src/aks-preview/azext_aks_preview/_consts.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,10 @@
109109
CONST_NODE_IMAGE_UPGRADE_CHANNEL = "node-image"
110110
CONST_NONE_UPGRADE_CHANNEL = "none"
111111

112+
# consts for upgrade strategy
113+
CONST_UPGRADE_STRATEGY_ROLLING = "Rolling"
114+
CONST_UPGRADE_STRATEGY_BLUE_GREEN = "BlueGreen"
115+
112116
# consts for node os upgrade channel
113117
CONST_NODE_OS_CHANNEL_NODE_IMAGE = "NodeImage"
114118
CONST_NODE_OS_CHANNEL_NONE = "None"

src/aks-preview/azext_aks_preview/_help.py

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2073,6 +2073,24 @@
20732073
- name: --localdns-config
20742074
type: string
20752075
short-summary: Set the localDNS Profile for a nodepool with a JSON config file.
2076+
- name: --upgrade-strategy
2077+
type: string
2078+
short-summary: Upgrade strategy for the node pool. Allowed values are "Rolling" or "BlueGreen". Default is "Rolling".
2079+
- name: --drain-batch-size
2080+
type: string
2081+
short-summary: Number or percentage of nodes to drain per batch during blue-green upgrades. Accepts an integer (e.g. '5') or percentage (e.g. '50%'). Default is 10%.
2082+
long-summary: |-
2083+
Specifies how many nodes to drain in each batch during a blue-green upgrade. Must be a non-zero value, either as an integer (e.g. '5') or a percentage (e.g. '50%') of the total blue nodes at the start of the upgrade. Fractional nodes are rounded up. For more details and best practices, see https://learn.microsoft.com/en-us/azure/aks/upgrade-cluster
2084+
- name: --drain-timeout-bg
2085+
type: int
2086+
short-summary: Timeout (in minutes) to evict pods and gracefully terminate per node during blue-green upgrades. Default is 30 minutes.
2087+
long-summary: Maximum time (in minutes) to wait for pod eviction and graceful termination per node during blue-green upgrades. Honors pod disruption budgets. If exceeded, the upgrade fails. Default is 30 minutes.
2088+
- name: --batch-soak-duration
2089+
type: int
2090+
short-summary: Wait time (in minutes) after draining a batch of nodes before proceeding to the next batch. Default is 15 minutes. Only for blue-green upgrades.
2091+
- name: --final-soak-duration
2092+
type: int
2093+
short-summary: Wait time (in minutes) after all old nodes are drained before removing them. Default is 60 minutes. Only for blue-green upgrades.
20762094
examples:
20772095
- name: Create a nodepool in an existing AKS cluster with ephemeral os enabled.
20782096
text: az aks nodepool add -g MyResourceGroup -n nodepool1 --cluster-name MyManagedCluster --node-osdisk-type Ephemeral --node-osdisk-size 48
@@ -2094,6 +2112,8 @@
20942112
text: az aks nodepool add -g MyResourceGroup -n nodepool1 --cluster-name MyManagedCluster --vm-set-type VirtualMachines --vm-sizes "Standard_D4s_v3,Standard_D8s_v3" --node-count 3
20952113
- name: Create a nodepool with ManagedSystem mode
20962114
text: az aks nodepool add -g MyResourceGroup -n managedsystem1 --cluster-name MyManagedCluster --mode ManagedSystem
2115+
- name: Create a node pool with blue-green upgrade strategy and default parameters
2116+
text: az aks nodepool add -g MyResourceGroup -n nodepool1 --cluster-name MyManagedCluster --upgrade-strategy BlueGreen
20972117
"""
20982118

20992119
helps['aks nodepool scale'] = """
@@ -2148,6 +2168,24 @@
21482168
- name: --undrainable-node-behavior
21492169
type: string
21502170
short-summary: Define the behavior for undrainable nodes during upgrade. The value should be "Cordon" or "Schedule". The default value is "Schedule".
2171+
- name: --upgrade-strategy
2172+
type: string
2173+
short-summary: Upgrade strategy for the node pool. Allowed values are "Rolling" or "BlueGreen". Default is "Rolling".
2174+
- name: --drain-batch-size
2175+
type: string
2176+
short-summary: Number or percentage of nodes to drain per batch during blue-green upgrades. Accepts an integer (e.g. '5') or percentage (e.g. '50%'). Default is 10%.
2177+
long-summary: |-
2178+
Specifies how many nodes to drain in each batch during a blue-green upgrade. Must be a non-zero value, either as an integer (e.g. '5') or a percentage (e.g. '50%') of the total blue nodes at the start of the upgrade. Fractional nodes are rounded up. For more details and best practices, see: https://learn.microsoft.com/en-us/azure/aks/upgrade-cluster
2179+
- name: --drain-timeout-bg
2180+
type: int
2181+
short-summary: Timeout (in minutes) to evict pods and gracefully terminate per node during blue-green upgrades. Default is 30 minutes.
2182+
long-summary: Maximum time (in minutes) to wait for pod eviction and graceful termination per node during blue-green upgrades. Honors pod disruption budgets. If exceeded, the upgrade fails. Default is 30 minutes.
2183+
- name: --batch-soak-duration
2184+
type: int
2185+
short-summary: Wait time (in minutes) after draining a batch of nodes before proceeding to the next batch. Default is 15 minutes. Only for blue-green upgrades.
2186+
- name: --final-soak-duration
2187+
type: int
2188+
short-summary: Wait time (in minutes) after all old nodes are drained before removing them. Default is 60 minutes. Only for blue-green upgrades.
21512189
"""
21522190

21532191
helps['aks nodepool update'] = """
@@ -2254,6 +2292,24 @@
22542292
- name: --node-vm-size -s
22552293
type: string
22562294
short-summary: VM size for Kubernetes nodes. Only configurable when updating the autoscale settings of a VirtualMachines node pool.
2295+
- name: --upgrade-strategy
2296+
type: string
2297+
short-summary: Upgrade strategy for the node pool. Allowed values are "Rolling" or "BlueGreen". Default is "Rolling".
2298+
- name: --drain-batch-size
2299+
type: string
2300+
short-summary: Number or percentage of nodes to drain per batch during blue-green upgrades. Accepts an integer (e.g. '5') or percentage (e.g. '50%'). Default is 10%.
2301+
long-summary: |-
2302+
Specifies how many nodes to drain in each batch during a blue-green upgrade. Must be a non-zero value, either as an integer (e.g. '5') or a percentage (e.g. '50%') of the total blue nodes at the start of the upgrade. Fractional nodes are rounded up. For more details and best practices, see: https://learn.microsoft.com/en-us/azure/aks/upgrade-cluster
2303+
- name: --drain-timeout-bg
2304+
type: int
2305+
short-summary: Timeout (in minutes) to evict pods and gracefully terminate per node during blue-green upgrades. Default is 30 minutes.
2306+
long-summary: Maximum time (in minutes) to wait for pod eviction and graceful termination per node during blue-green upgrades. Honors pod disruption budgets. If exceeded, the upgrade fails. Default is 30 minutes.
2307+
- name: --batch-soak-duration
2308+
type: int
2309+
short-summary: Wait time (in minutes) after draining a batch of nodes before proceeding to the next batch. Default is 15 minutes. Only for blue-green upgrades.
2310+
- name: --final-soak-duration
2311+
type: int
2312+
short-summary: Wait time (in minutes) after all old nodes are drained before removing them. Default is 60 minutes. Only for blue-green upgrades.
22572313
examples:
22582314
- name: Reconcile the nodepool back to its current state.
22592315
text: az aks nodepool update -g MyResourceGroup -n nodepool1 --cluster-name MyManagedCluster
@@ -2267,6 +2323,8 @@
22672323
text: az aks nodepool update --mode System -g MyResourceGroup -n nodepool1 --cluster-name MyManagedCluster
22682324
- name: Update cluster autoscaler vm size, min-count and max-count for virtual machines node pool
22692325
text: az aks nodepool update -g MyResourceGroup -n nodepool1 --cluster-name MyManagedCluster --update-cluster-autoscaler --node-vm-size "Standard_D2s_v3" --min-count 2 --max-count 4
2326+
- name: Update a node pool with blue-green upgrade settings
2327+
text: az aks nodepool update -g MyResourceGroup -n nodepool1 --cluster-name MyManagedCluster --drain-batch-size 50% --drain-timeout-bg 5 --batch-soak-duration 10 --final-soak-duration 10
22702328
"""
22712329

22722330
helps['aks nodepool get-upgrades'] = """

src/aks-preview/azext_aks_preview/_params.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -152,6 +152,8 @@
152152
CONST_ADVANCED_NETWORKPOLICIES_L7,
153153
CONST_TRANSIT_ENCRYPTION_TYPE_NONE,
154154
CONST_TRANSIT_ENCRYPTION_TYPE_WIREGUARD,
155+
CONST_UPGRADE_STRATEGY_ROLLING,
156+
CONST_UPGRADE_STRATEGY_BLUE_GREEN,
155157
)
156158

157159
from azext_aks_preview._validators import (
@@ -223,6 +225,7 @@
223225
validate_gateway_prefix_size,
224226
validate_max_unavailable,
225227
validate_max_blocked_nodes,
228+
validate_drain_batch_size,
226229
validate_resource_group_parameter,
227230
validate_location_resource_group_cluster_parameters,
228231
)
@@ -507,6 +510,11 @@
507510
CONST_GPU_DRIVER_TYPE_GRID,
508511
]
509512

513+
upgrade_strategies = [
514+
CONST_UPGRADE_STRATEGY_ROLLING,
515+
CONST_UPGRADE_STRATEGY_BLUE_GREEN,
516+
]
517+
510518

511519
def load_arguments(self, _):
512520
acr_arg_type = CLIArgumentType(metavar="ACR_NAME_OR_RESOURCE_ID")
@@ -1678,12 +1686,20 @@ def load_arguments(self, _):
16781686
c.argument("node_taints", validator=validate_nodepool_taints)
16791687
c.argument("node_osdisk_type", arg_type=get_enum_type(node_os_disk_types))
16801688
c.argument("node_osdisk_size", type=int)
1689+
# upgrade strategy
1690+
c.argument("upgrade_strategy", arg_type=get_enum_type(upgrade_strategies))
1691+
# rolling upgrade parameters
16811692
c.argument("max_surge", validator=validate_max_surge)
16821693
c.argument("drain_timeout", type=int)
16831694
c.argument("node_soak_duration", type=int)
16841695
c.argument("undrainable_node_behavior")
16851696
c.argument("max_unavailable", validator=validate_max_unavailable)
16861697
c.argument("max_blocked_nodes", validator=validate_max_blocked_nodes)
1698+
# blue-green upgrade parameters
1699+
c.argument("drain_batch_size", validator=validate_drain_batch_size)
1700+
c.argument("drain_timeout_bg", type=int)
1701+
c.argument("batch_soak_duration", type=int)
1702+
c.argument("final_soak_duration", type=int)
16871703
c.argument("mode", arg_type=get_enum_type(node_mode_types))
16881704
c.argument("scale_down_mode", arg_type=get_enum_type(scale_down_modes))
16891705
c.argument("max_pods", type=int, options_list=["--max-pods", "-m"])
@@ -1816,12 +1832,20 @@ def load_arguments(self, _):
18161832
c.argument("labels", nargs="*", validator=validate_nodepool_labels)
18171833
c.argument("tags", tags_type)
18181834
c.argument("node_taints", validator=validate_nodepool_taints)
1835+
# upgrade strategy
1836+
c.argument("upgrade_strategy", arg_type=get_enum_type(upgrade_strategies))
1837+
# rolling upgrade parameters
18191838
c.argument("max_surge", validator=validate_max_surge)
18201839
c.argument("drain_timeout", type=int)
18211840
c.argument("node_soak_duration", type=int)
18221841
c.argument("undrainable_node_behavior")
18231842
c.argument("max_unavailable", validator=validate_max_unavailable)
18241843
c.argument("max_blocked_nodes", validator=validate_max_blocked_nodes)
1844+
# blue-green upgrade parameters
1845+
c.argument("drain_batch_size", validator=validate_drain_batch_size)
1846+
c.argument("drain_timeout_bg", type=int)
1847+
c.argument("batch_soak_duration", type=int)
1848+
c.argument("final_soak_duration", type=int)
18251849
c.argument("mode", arg_type=get_enum_type(node_mode_types))
18261850
c.argument("scale_down_mode", arg_type=get_enum_type(scale_down_modes))
18271851
# extensions
@@ -1898,12 +1922,20 @@ def load_arguments(self, _):
18981922
)
18991923

19001924
with self.argument_context("aks nodepool upgrade") as c:
1925+
# upgrade strategy
1926+
c.argument("upgrade_strategy", arg_type=get_enum_type(upgrade_strategies))
1927+
# rolling upgrade parameters
19011928
c.argument("max_surge", validator=validate_max_surge)
19021929
c.argument("drain_timeout", type=int)
19031930
c.argument("node_soak_duration", type=int)
19041931
c.argument("undrainable_node_behavior")
19051932
c.argument("max_unavailable", validator=validate_max_unavailable)
19061933
c.argument("max_blocked_nodes", validator=validate_max_blocked_nodes)
1934+
# blue-green upgrade parameters
1935+
c.argument("drain_batch_size", validator=validate_drain_batch_size)
1936+
c.argument("drain_timeout_bg", type=int)
1937+
c.argument("batch_soak_duration", type=int)
1938+
c.argument("final_soak_duration", type=int)
19071939
c.argument("snapshot_id", validator=validate_snapshot_id)
19081940
c.argument(
19091941
"yes",

src/aks-preview/azext_aks_preview/_validators.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -530,6 +530,23 @@ def validate_max_blocked_nodes(namespace):
530530
raise InvalidArgumentValueError('--max-blocked-nodes should be an int or percentage')
531531

532532

533+
def validate_drain_batch_size(namespace):
    """Validate --drain-batch-size as a positive integer or percentage.

    Accepts values such as "5" or "50%". When the option was not supplied
    (the attribute is None) the namespace is left untouched. Only a single
    trailing '%' is treated as the percent sign, so malformed input such as
    "50%%" is rejected instead of being silently accepted.

    Raises:
        InvalidArgumentValueError: if the value is not an integer or a
            percentage, or if the parsed number is zero or negative.
    """
    raw_value = namespace.drain_batch_size
    if raw_value is None:
        return

    # Drop at most one '%'; anything left over (e.g. a second '%') must make
    # the int() parse below fail rather than be stripped away.
    number_text = raw_value[:-1] if raw_value.endswith('%') else raw_value

    try:
        value = int(number_text)
    except ValueError:
        # The original parse error adds nothing for the CLI user, so suppress it.
        raise InvalidArgumentValueError(
            '--drain-batch-size should be an integer or percentage (e.g., "5" or "50%")'
        ) from None

    if value <= 0:
        raise InvalidArgumentValueError('--drain-batch-size must be a non-zero value')
548+
549+
533550
def validate_assign_identity(namespace):
534551
if namespace.assign_identity is not None:
535552
if namespace.assign_identity == '':

0 commit comments

Comments
 (0)