From 4ce6c11f2265742bb990f01b14f15f39382f04e4 Mon Sep 17 00:00:00 2001 From: "aks-node-assistant[bot]" <190555641+aks-node-assistant[bot]@users.noreply.github.com> Date: Thu, 12 Mar 2026 17:07:29 -0700 Subject: [PATCH 01/26] feat: bump windows image version for 2026-03B (#8074) Co-authored-by: Jane Jung Co-authored-by: janenotjung-hue <107402425+janenotjung-hue@users.noreply.github.com> Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> Co-authored-by: aks-node-assistant[bot] <190555641+aks-node-assistant[bot]@users.noreply.github.com> From 52e7bab024ef0c849b60484375064fccab53bbbb Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Sun, 15 Mar 2026 22:08:49 -0700 Subject: [PATCH 02/26] feat(rcv1p): unify cert bootstrap flow and add Windows CA refresh task https://eng.ms/docs/products/onecert-certificates-key-vault-and-dsms/onecert-customer-guide/autorotationandecr/overviewrcv https://eng.ms/docs/products/onecert-certificates-key-vault-and-dsms/onecert-customer-guide/autorotationandecr/rcv1ptsg cse_cmd.sh.gtpl: derive cert endpoint mode from target cloud and always run custom-cloud init script. cse_cmd.sh: same mode logic as template; remove LOCATION export. init-aks-custom-cloud.sh: merged legacy + operation-requests logic into one script with distro-aware cert install paths. parts/linux/cloud-init/artifacts/init-aks-custom-cloud-mariner.sh: removed (merged into unified script). parts/linux/cloud-init/artifacts/init-aks-custom-cloud-operation-requests.sh: removed (merged into unified script). parts/linux/cloud-init/artifacts/init-aks-custom-cloud-operation-requests-mariner.sh: removed (merged into unified script). const.go: keep only unified custom-cloud init script constant. variables.go: simplify script selection to always use unified init script. kubernetesfunc.ps1: add location-aware CA retrieval (legacy/rcv1p) and scheduled refresh task registration helper. 
kuberneteswindowssetup.ps1: pass location to CA retrieval and register refresh task for custom cloud. --- aks-node-controller/parser/helper.go | 7 +- .../parser/templates/cse_cmd.sh.gtpl | 1 + .../init-aks-custom-cloud-mariner.sh | 186 --------- ...custom-cloud-operation-requests-mariner.sh | 236 ------------ ...nit-aks-custom-cloud-operation-requests.sh | 346 ----------------- .../artifacts/init-aks-custom-cloud.sh | 358 ++++++++++++++++-- parts/windows/kuberneteswindowssetup.ps1 | 4 +- pkg/agent/const.go | 9 +- pkg/agent/variables.go | 19 +- staging/cse/windows/kubernetesfunc.ps1 | 132 +++++-- 10 files changed, 454 insertions(+), 844 deletions(-) delete mode 100644 parts/linux/cloud-init/artifacts/init-aks-custom-cloud-mariner.sh delete mode 100644 parts/linux/cloud-init/artifacts/init-aks-custom-cloud-operation-requests-mariner.sh delete mode 100644 parts/linux/cloud-init/artifacts/init-aks-custom-cloud-operation-requests.sh diff --git a/aks-node-controller/parser/helper.go b/aks-node-controller/parser/helper.go index f5644dcda02..ab9e6210ccf 100644 --- a/aks-node-controller/parser/helper.go +++ b/aks-node-controller/parser/helper.go @@ -64,6 +64,7 @@ func getFuncMap() template.FuncMap { return template.FuncMap{ "getInitAKSCustomCloudFilepath": getInitAKSCustomCloudFilepath, "getIsAksCustomCloud": getIsAksCustomCloud, + "getCloudLocation": getCloudLocation, } } @@ -538,11 +539,15 @@ func getIsAksCustomCloud(customCloudConfig *aksnodeconfigv1.CustomCloudConfig) b return strings.EqualFold(customCloudConfig.GetCustomCloudEnvName(), helpers.AksCustomCloudName) } +func getCloudLocation(v *aksnodeconfigv1.Configuration) string { + return strings.ToLower(strings.Join(strings.Fields(v.GetClusterConfig().GetLocation()), "")) +} + /* GetCloudTargetEnv determines and returns whether the region is a sovereign cloud which have their own data compliance regulations (China/Germany/USGov) or standard. */ // Azure public cloud. 
func getCloudTargetEnv(v *aksnodeconfigv1.Configuration) string { - loc := strings.ToLower(strings.Join(strings.Fields(v.GetClusterConfig().GetLocation()), "")) + loc := getCloudLocation(v) switch { case strings.HasPrefix(loc, "china"): return "AzureChinaCloud" diff --git a/aks-node-controller/parser/templates/cse_cmd.sh.gtpl b/aks-node-controller/parser/templates/cse_cmd.sh.gtpl index b1359b071d9..d685a3444da 100644 --- a/aks-node-controller/parser/templates/cse_cmd.sh.gtpl +++ b/aks-node-controller/parser/templates/cse_cmd.sh.gtpl @@ -3,4 +3,5 @@ echo $(date),$(hostname) > ${PROVISION_OUTPUT}; REPO_DEPOT_ENDPOINT="{{.CustomCloudConfig.RepoDepotEndpoint}}" {{getInitAKSCustomCloudFilepath}} >> /var/log/azure/cluster-provision.log 2>&1; {{end}} +LOCATION="{{getCloudLocation .}}" /usr/bin/nohup /bin/bash -c "/bin/bash /opt/azure/containers/provision_start.sh" diff --git a/parts/linux/cloud-init/artifacts/init-aks-custom-cloud-mariner.sh b/parts/linux/cloud-init/artifacts/init-aks-custom-cloud-mariner.sh deleted file mode 100644 index 587da9ba270..00000000000 --- a/parts/linux/cloud-init/artifacts/init-aks-custom-cloud-mariner.sh +++ /dev/null @@ -1,186 +0,0 @@ -#!/bin/bash -set -x -mkdir -p /root/AzureCACertificates - -IS_MARINER=0 -IS_AZURELINUX=0 -# shellcheck disable=SC3010 -if [[ -f /etc/os-release ]]; then - . 
/etc/os-release - # shellcheck disable=SC3010 - if [[ $NAME == *"Mariner"* ]]; then - IS_MARINER=1 - elif [[ $NAME == *"Microsoft Azure Linux"* ]]; then - IS_AZURELINUX=1 - else - echo "Unknown Linux distribution" - exit 1 - fi -else - echo "Unsupported operating system" - exit 1 -fi - -echo "distribution is $distribution" -echo "Running on $NAME" - -# http://168.63.129.16 is a constant for the host's wireserver endpoint -certs=$(curl "http://168.63.129.16/machine?comp=acmspackage&type=cacertificates&ext=json") -IFS_backup=$IFS -IFS=$'\r\n' -certNames=($(echo $certs | grep -oP '(?<=Name\": \")[^\"]*')) -certBodies=($(echo $certs | grep -oP '(?<=CertBody\": \")[^\"]*')) -for i in ${!certBodies[@]}; do - echo ${certBodies[$i]} | sed 's/\\r\\n/\n/g' | sed 's/\\//g' > "/root/AzureCACertificates/$(echo ${certNames[$i]} | sed 's/.cer/.crt/g')" -done -IFS=$IFS_backup - -cp /root/AzureCACertificates/*.crt /etc/pki/ca-trust/source/anchors/ -/usr/bin/update-ca-trust - -# This section creates a cron job to poll for refreshed CA certs daily -# It can be removed if not needed or desired -action=${1:-init} -if [ "$action" = "ca-refresh" ]; then - exit -fi - -scriptPath=$0 -# Determine an absolute, canonical path to this script for use in cron. -if command -v readlink >/dev/null 2>&1; then - # Use readlink -f when available to resolve the canonical path; fall back to $0 on error. - scriptPath="$(readlink -f "$0" 2>/dev/null || printf '%s' "$0")" -fi - -if ! crontab -l 2>/dev/null | grep -q "\"$scriptPath\" ca-refresh"; then - # Quote the script path in the cron entry to avoid issues with spaces or special characters. - if ! 
(crontab -l 2>/dev/null ; printf '%s\n' "0 19 * * * \"$scriptPath\" ca-refresh") | crontab -; then - echo "Failed to install ca-refresh cron job via crontab" >&2 - fi -fi - -cloud-init status --wait - -function init_mariner_repo_depot { - local repodepot_endpoint=$1 - echo "Adding [extended] repo" - cp /etc/yum.repos.d/mariner-extras.repo /etc/yum.repos.d/mariner-extended.repo - sed -i -e "s|extras|extended|" /etc/yum.repos.d/mariner-extended.repo - sed -i -e "s|Extras|Extended|" /etc/yum.repos.d/mariner-extended.repo - - echo "Adding [nvidia] repo" - cp /etc/yum.repos.d/mariner-extras.repo /etc/yum.repos.d/mariner-nvidia.repo - sed -i -e "s|extras|nvidia|" /etc/yum.repos.d/mariner-nvidia.repo - sed -i -e "s|Extras|Nvidia|" /etc/yum.repos.d/mariner-nvidia.repo - - echo "Adding [cloud-native] repo" - cp /etc/yum.repos.d/mariner-extras.repo /etc/yum.repos.d/mariner-cloud-native.repo - sed -i -e "s|extras|cloud-native|" /etc/yum.repos.d/mariner-cloud-native.repo - sed -i -e "s|Extras|Cloud-Native|" /etc/yum.repos.d/mariner-cloud-native.repo - - echo "Pointing Mariner repos at RepoDepot..." - for f in /etc/yum.repos.d/*.repo - do - sed -i -e "s|https://packages.microsoft.com|${repodepot_endpoint}/mariner/packages.microsoft.com|" $f - echo "$f modified." - done - echo "Mariner repo setup complete." 
-} - -function init_azurelinux_repo_depot { - local repodepot_endpoint=$1 - repos=("amd" "base" "cloud-native" "extended" "ms-non-oss" "ms-oss" "nvidia") - - # tbd maybe we do this a bit nicer - rm -f /etc/yum.repos.d/azurelinux* - - for repo in "${repos[@]}"; do - output_file="/etc/yum.repos.d/azurelinux-${repo}.repo" - repo_content=( - "[azurelinux-official-$repo]" - "name=Azure Linux Official $repo \$releasever \$basearch" - "baseurl=$repodepot_endpoint/azurelinux/\$releasever/prod/$repo/\$basearch" - "gpgkey=file:///etc/pki/rpm-gpg/MICROSOFT-RPM-GPG-KEY" - "gpgcheck=1" - "repo_gpgcheck=1" - "enabled=1" - "skip_if_unavailable=True" - "sslverify=1" - ) - - rm -f "$output_file" - - for line in "${repo_content[@]}"; do - echo "$line" >> "$output_file" - done - - echo "File '$output_file' has been created." - done - echo "Azure Linux repo setup complete." -} - -dnf_makecache() { - local retries=10 - local dnf_makecache_output=/tmp/dnf-makecache.out - local i - for i in $(seq 1 $retries); do - ! (dnf makecache -y 2>&1 | tee $dnf_makecache_output | grep -E "^([WE]:.*)|([eE]rr.*)$") && \ - cat $dnf_makecache_output && break || \ - cat $dnf_makecache_output - if [ $i -eq $retries ]; then - return 1 - else sleep 5 - fi - done - echo "Executed dnf makecache -y $i times" -} - -marinerRepoDepotEndpoint="$(echo "${REPO_DEPOT_ENDPOINT}" | sed 's/\/ubuntu//')" -if [ -z "$marinerRepoDepotEndpoint" ]; then - >&2 echo "repo depot endpoint empty while running custom-cloud init script" -else - # logic taken from https://repodepot.azure.com/scripts/cloud-init/setup_repodepot.sh - if [ "$IS_MARINER" -eq 1 ]; then - echo "Initializing Mariner repo depot settings..." - init_mariner_repo_depot ${marinerRepoDepotEndpoint} - dnf_makecache || exit 1 - elif [ "$IS_AZURELINUX" -eq 1 ]; then - echo "Initializing Azure Linux repo depot settings..." 
- init_azurelinux_repo_depot ${marinerRepoDepotEndpoint} - dnf_makecache || exit 1 - else - echo "No customizations for distribution: $NAME" - fi -fi - -# Set the chrony config to use the PHC /dev/ptp0 clock -cat > /etc/chrony.conf < "/root/AzureCACertificates/$cert_filename" - echo "Successfully saved certificate: $cert_filename" - else - echo "Warning: Failed to retrieve certificate content for $cert_filename" - fi - done -} - -# Process root certificates -process_cert_operations "operationrequestsroot" - -# Process intermediate certificates -process_cert_operations "operationrequestsintermediate" - -# Copy all certificate files to the Mariner/AzureLinux system certificate directory -cp /root/AzureCACertificates/*.crt /etc/pki/ca-trust/source/anchors/ - -# Update the system certificate store using Mariner/AzureLinux command -/usr/bin/update-ca-trust - -# This section creates a cron job to poll for refreshed CA certs daily -# It can be removed if not needed or desired -action=${1:-init} -if [ "$action" = "ca-refresh" ]; then - exit -fi - -scriptPath=$0 -# Determine an absolute, canonical path to this script for use in cron. -if command -v readlink >/dev/null 2>&1; then - # Use readlink -f when available to resolve the canonical path; fall back to $0 on error. - scriptPath="$(readlink -f "$0" 2>/dev/null || printf '%s' "$0")" -fi - -if ! crontab -l 2>/dev/null | grep -q "\"$scriptPath\" ca-refresh"; then - # Quote the script path in the cron entry to avoid issues with spaces or special characters. - if ! 
(crontab -l 2>/dev/null ; printf '%s\n' "0 19 * * * \"$scriptPath\" ca-refresh") | crontab -; then - echo "Failed to install ca-refresh cron job via crontab" >&2 - fi -fi - -function init_mariner_repo_depot { - local repodepot_endpoint=$1 - echo "Adding [extended] repo" - cp /etc/yum.repos.d/mariner-extras.repo /etc/yum.repos.d/mariner-extended.repo - sed -i -e "s|extras|extended|" /etc/yum.repos.d/mariner-extended.repo - sed -i -e "s|Extras|Extended|" /etc/yum.repos.d/mariner-extended.repo - - echo "Adding [nvidia] repo" - cp /etc/yum.repos.d/mariner-extras.repo /etc/yum.repos.d/mariner-nvidia.repo - sed -i -e "s|extras|nvidia|" /etc/yum.repos.d/mariner-nvidia.repo - sed -i -e "s|Extras|Nvidia|" /etc/yum.repos.d/mariner-nvidia.repo - - echo "Adding [cloud-native] repo" - cp /etc/yum.repos.d/mariner-extras.repo /etc/yum.repos.d/mariner-cloud-native.repo - sed -i -e "s|extras|cloud-native|" /etc/yum.repos.d/mariner-cloud-native.repo - sed -i -e "s|Extras|Cloud-Native|" /etc/yum.repos.d/mariner-cloud-native.repo - - echo "Pointing Mariner repos at RepoDepot..." - for f in /etc/yum.repos.d/*.repo - do - sed -i -e "s|https://packages.microsoft.com|${repodepot_endpoint}/mariner/packages.microsoft.com|" $f - echo "$f modified." - done - echo "Mariner repo setup complete." 
-} - -function init_azurelinux_repo_depot { - local repodepot_endpoint=$1 - repos=("amd" "base" "cloud-native" "extended" "ms-non-oss" "ms-oss" "nvidia") - - # tbd maybe we do this a bit nicer - rm -f /etc/yum.repos.d/azurelinux* - - for repo in "${repos[@]}"; do - output_file="/etc/yum.repos.d/azurelinux-${repo}.repo" - repo_content=( - "[azurelinux-official-$repo]" - "name=Azure Linux Official $repo \$releasever \$basearch" - "baseurl=$repodepot_endpoint/azurelinux/\$releasever/prod/$repo/\$basearch" - "gpgkey=file:///etc/pki/rpm-gpg/MICROSOFT-RPM-GPG-KEY" - "gpgcheck=1" - "repo_gpgcheck=1" - "enabled=1" - "skip_if_unavailable=True" - "sslverify=1" - ) - - rm -f "$output_file" - - for line in "${repo_content[@]}"; do - echo "$line" >> "$output_file" - done - - echo "File '$output_file' has been created." - done -} - -cloud-init status --wait - -dnf_makecache() { - local retries=10 - local dnf_makecache_output=/tmp/dnf-makecache.out - local i - for i in $(seq 1 $retries); do - ! (dnf makecache -y 2>&1 | tee $dnf_makecache_output | grep -E "^([WE]:.*)|([eE]rr.*)$") && \ - cat $dnf_makecache_output && break || \ - cat $dnf_makecache_output - if [ $i -eq $retries ]; then - return 1 - else sleep 5 - fi - done - echo "Executed dnf makecache -y $i times" -} - -marinerRepoDepotEndpoint="$(echo "${REPO_DEPOT_ENDPOINT}" | sed 's/\/ubuntu//')" -if [ -z "$marinerRepoDepotEndpoint" ]; then - >&2 echo "repo depot endpoint empty while running custom-cloud init script" -else - # logic taken from https://repodepot.azure.com/scripts/cloud-init/setup_repodepot.sh - if [ "$IS_MARINER" -eq 1 ]; then - echo "Initializing Mariner repo depot settings..." - init_mariner_repo_depot ${marinerRepoDepotEndpoint} - dnf_makecache || exit 1 - elif [ "$IS_AZURELINUX" -eq 1 ]; then - echo "Initializing Azure Linux repo depot settings..." 
- init_azurelinux_repo_depot ${marinerRepoDepotEndpoint} - dnf_makecache || exit 1 - else - echo "No customizations for distribution: $NAME" - fi -fi - -#EOF diff --git a/parts/linux/cloud-init/artifacts/init-aks-custom-cloud-operation-requests.sh b/parts/linux/cloud-init/artifacts/init-aks-custom-cloud-operation-requests.sh deleted file mode 100644 index 99ae86d0242..00000000000 --- a/parts/linux/cloud-init/artifacts/init-aks-custom-cloud-operation-requests.sh +++ /dev/null @@ -1,346 +0,0 @@ -#!/bin/bash -set -x -mkdir -p /root/AzureCACertificates - -IS_FLATCAR=0 -IS_UBUNTU=0 -IS_ACL=0 -# shellcheck disable=SC3010 -if [[ -f /etc/os-release ]]; then - . /etc/os-release - # shellcheck disable=SC3010 - if [[ $NAME == *"Ubuntu"* ]]; then - IS_UBUNTU=1 - elif [[ $ID == *"flatcar"* ]]; then - IS_FLATCAR=1 - elif [[ $ID == "azurecontainerlinux" ]] || { [[ $ID == "azurelinux" ]] && [[ ${VARIANT_ID:-} == "azurecontainerlinux" ]]; }; then - IS_ACL=1 - else - echo "Unknown Linux distribution" - exit 1 - fi -else - echo "Unsupported operating system" - exit 1 -fi - -echo "distribution is $distribution" -echo "Running on $NAME" - -# http://168.63.129.16 is a constant for the host's wireserver endpoint -WIRESERVER_ENDPOINT="http://168.63.129.16" - -# Function to make HTTP request with retry logic for rate limiting -make_request_with_retry() { - local url="$1" - local max_retries=10 - local retry_delay=3 - local attempt=1 - - local response - while [ $attempt -le $max_retries ]; do - response=$(curl -f --no-progress-meter "$url") - local request_status=$? 
- - if echo "$response" | grep -q "RequestRateLimitExceeded"; then - sleep $retry_delay - retry_delay=$((retry_delay * 2)) - attempt=$((attempt + 1)) - elif [ $request_status -ne 0 ]; then - sleep $retry_delay - attempt=$((attempt + 1)) - else - echo "$response" - return 0 - fi - done - - echo "exhausted all retries, last response: $response" - return 1 -} - -# Function to process certificate operations from a given endpoint -process_cert_operations() { - local endpoint_type="$1" - local operation_response - - echo "Retrieving certificate operations for type: $endpoint_type" - operation_response=$(make_request_with_retry "${WIRESERVER_ENDPOINT}/machine?comp=acmspackage&type=$endpoint_type&ext=json") - local request_status=$? - if [ -z "$operation_response" ] || [ $request_status -ne 0 ]; then - echo "Warning: No response received or request failed for: ${WIRESERVER_ENDPOINT}/machine?comp=acmspackage&type=$endpoint_type&ext=json" - return - fi - - # Extract ResourceFileName values from the JSON response - local cert_filenames - mapfile -t cert_filenames < <(echo "$operation_response" | grep -oP '(?<="ResouceFileName": ")[^"]*') - - if [ ${#cert_filenames[@]} -eq 0 ]; then - echo "No certificate filenames found in response for $endpoint_type" - return - fi - - # Process each certificate file - for cert_filename in "${cert_filenames[@]}"; do - echo "Processing certificate file: $cert_filename" - - # Extract filename and extension - local filename="${cert_filename%.*}" - local extension="${cert_filename##*.}" - - echo "Downloading certificate: filename=$filename, extension=$extension" - - # Retrieve the actual certificate content with retry logic - local cert_content - cert_content=$(make_request_with_retry "${WIRESERVER_ENDPOINT}/machine?comp=acmspackage&type=$filename&ext=$extension") - local request_status=$? 
- if [ -z "$cert_content" ] || [ $request_status -ne 0 ]; then - echo "Warning: No response received or request failed for: ${WIRESERVER_ENDPOINT}/machine?comp=acmspackage&type=$filename&ext=$extension" - continue - fi - - if [ -n "$cert_content" ]; then - # Save the certificate to the appropriate location - echo "$cert_content" > "/root/AzureCACertificates/$cert_filename" - echo "Successfully saved certificate: $cert_filename" - else - echo "Warning: Failed to retrieve certificate content for $cert_filename" - fi - done -} - -# Process root certificates -process_cert_operations "operationrequestsroot" - -# Process intermediate certificates -process_cert_operations "operationrequestsintermediate" - -if [ "$IS_ACL" -eq 1 ]; then - cp /root/AzureCACertificates/*.crt /etc/pki/ca-trust/source/anchors/ - update-ca-trust -elif [ "${IS_FLATCAR}" -eq 0 ]; then - # Copy all certificate files to the system certificate directory - cp /root/AzureCACertificates/*.crt /usr/local/share/ca-certificates/ - - # Update the system certificate store - update-ca-certificates - - # This copies the updated bundle to the location used by OpenSSL which is commonly used - cp /etc/ssl/certs/ca-certificates.crt /usr/lib/ssl/cert.pem -else - for cert in /root/AzureCACertificates/*.crt; do - destcert="${cert##*/}" - destcert="${destcert%.*}.pem" - cp "$cert" /etc/ssl/certs/"$destcert" - done - update-ca-certificates -fi - - - -# This section creates a cron job to poll for refreshed CA certs daily -# It can be removed if not needed or desired -action=${1:-init} -if [ "$action" = "ca-refresh" ]; then - exit -fi - -function init_ubuntu_main_repo_depot { - local repodepot_endpoint="$1" - # Initialize directory for keys - mkdir -p /etc/apt/keyrings - - # This copies the updated bundle to the location used by OpenSSL which is commonly used - echo "Copying updated bundle to OpenSSL .pem file..." - cp /etc/ssl/certs/ca-certificates.crt /usr/lib/ssl/cert.pem - echo "Updated bundle copied." 
- - # Back up sources.list and sources.list.d contents - mkdir -p /etc/apt/backup/ - if [ -f "/etc/apt/sources.list" ]; then - mv /etc/apt/sources.list /etc/apt/backup/ - fi - for sources_file in /etc/apt/sources.list.d/*; do - if [ -f "$sources_file" ]; then - mv "$sources_file" /etc/apt/backup/ - fi - done - - # Set location of sources file - . /etc/os-release - aptSourceFile="/etc/apt/sources.list.d/ubuntu.sources" - - # Create main sources file - cat < /etc/apt/sources.list.d/ubuntu.sources - -Types: deb -URIs: ${repodepot_endpoint}/ubuntu -Suites: ${VERSION_CODENAME} ${VERSION_CODENAME}-updates ${VERSION_CODENAME}-backports ${VERSION_CODENAME}-security -Components: main universe restricted multiverse -Signed-By: /usr/share/keyrings/ubuntu-archive-keyring.gpg -EOF - - # Update the apt sources file using the RepoDepot Ubuntu URL for this cloud. Update it by replacing - # all urls with the RepoDepot Ubuntu url - ubuntuUrl=${repodepot_endpoint}/ubuntu - echo "Converting URLs in $aptSourceFile to RepoDepot URLs..." - sed -i "s,https\?://.[^ ]*,$ubuntuUrl,g" $aptSourceFile - echo "apt source URLs converted, see new file below:" - echo "" - echo "-----" - cat $aptSourceFile - echo "-----" - echo "" -} - -function check_url { - local url=$1 - echo "Checking url: $url" - - # Use curl to check the URL and capture both stdout and stderr - curl_exit_code=$(curl -s --head --request GET $url) - # Check the exit status of curl - # shellcheck disable=SC3010 - if [[ $? -ne 0 ]] || echo "$curl_exit_code" | grep -E "404 Not Found" > /dev/null; then - echo "ERROR: $url is not available. 
Please manually check if the url is valid before re-running script" - exit 1 - fi -} - -function write_to_sources_file { - local sources_list_d_file=$1 - local source_uri=$2 - shift 2 - local key_paths=("$@") - - sources_file_path="/etc/apt/sources.list.d/${sources_list_d_file}.sources" - ubuntuDist=$(lsb_release -c | awk '{print $2}') - - tee -a $sources_file_path < /dev/null - echo "$key_name key added to keyring." -} - -function derive_key_paths { - local key_names=("$@") - local key_paths=() - - for key_name in "${key_names[@]}"; do - key_paths+=("/etc/apt/keyrings/${key_name}.gpg") - done - - echo "${key_paths[*]}" -} - -function add_ms_keys { - # Add the Microsoft package server keys to keyring. - echo "Adding Microsoft keys to keyring..." - - add_key_ubuntu microsoft.asc - add_key_ubuntu msopentech.asc -} - -function aptget_update { - echo "apt-get updating..." - echo "note: depending on how many sources have been added this may take a couple minutes..." - if apt-get update | grep -q "404 Not Found"; then - echo "ERROR: apt-get update failed to find all sources. Please validate the sources or remove bad sources from your sources and try again." - exit 1 - else - echo "apt-get update complete!" - fi -} - -function init_ubuntu_pmc_repo_depot { - local repodepot_endpoint="$1" - # Add Microsoft packages source to the azure specific sources.list. - echo "Adding the packages.microsoft.com Ubuntu-$ubuntuRel repo..." - - microsoftPackageSource="$repodepot_endpoint/microsoft/ubuntu/$ubuntuRel/prod" - check_url $microsoftPackageSource - write_to_sources_file microsoft-prod $microsoftPackageSource $(derive_key_paths microsoft.asc msopentech.asc) - write_to_sources_file microsoft-prod-testing $microsoftPackageSource $(derive_key_paths microsoft.asc msopentech.asc) - echo "Ubuntu ($ubuntuRel) repo added." 
- echo "Adding packages.microsoft.com keys" - add_ms_keys $repodepot_endpoint -} - -if [ "$IS_UBUNTU" -eq 1 ]; then - scriptPath=$0 - # Determine an absolute, canonical path to this script for use in cron. - if command -v readlink >/dev/null 2>&1; then - # Use readlink -f when available to resolve the canonical path; fall back to $0 on error. - scriptPath="$(readlink -f "$0" 2>/dev/null || printf '%s' "$0")" - fi - - if ! crontab -l 2>/dev/null | grep -q "\"$scriptPath\" ca-refresh"; then - # Quote the script path in the cron entry to avoid issues with spaces or special characters. - if ! (crontab -l 2>/dev/null ; printf '%s\n' "0 19 * * * \"$scriptPath\" ca-refresh") | crontab -; then - echo "Failed to install ca-refresh cron job via crontab" >&2 - fi - fi - - cloud-init status --wait - rootRepoDepotEndpoint="$(echo "${REPO_DEPOT_ENDPOINT}" | sed 's/\/ubuntu//')" - # logic taken from https://repodepot.azure.com/scripts/cloud-init/setup_repodepot.sh - ubuntuRel=$(lsb_release --release | awk '{print $2}') - ubuntuDist=$(lsb_release -c | awk '{print $2}') - # initialize archive.ubuntu.com repo - init_ubuntu_main_repo_depot ${rootRepoDepotEndpoint} - init_ubuntu_pmc_repo_depot ${rootRepoDepotEndpoint} - # update apt list - echo "Running apt-get update" - aptget_update -elif [ "$IS_FLATCAR" -eq 1 ] || [ "$IS_ACL" -eq 1 ]; then - script_path="$(readlink -f "$0")" - svc="/etc/systemd/system/azure-ca-refresh.service" - tmr="/etc/systemd/system/azure-ca-refresh.timer" - - cat >"$svc" <"$tmr" < "/root/AzureCACertificates/$(echo ${certNames[$i]} | sed "s/.cer/.${ext}/g")" -done -IFS=$IFS_backup +WIRESERVER_ENDPOINT="http://168.63.129.16" + +function make_request_with_retry { + local url="$1" + local max_retries=10 + local retry_delay=3 + local attempt=1 + + local response + while [ $attempt -le $max_retries ]; do + response=$(curl -f --no-progress-meter "$url") + local request_status=$? 
+ + if echo "$response" | grep -q "RequestRateLimitExceeded"; then + sleep $retry_delay + retry_delay=$((retry_delay * 2)) + attempt=$((attempt + 1)) + elif [ $request_status -ne 0 ]; then + sleep $retry_delay + attempt=$((attempt + 1)) + else + echo "$response" + return 0 + fi + done -if [ "$IS_ACL" -eq 1 ]; then - cp /root/AzureCACertificates/*.crt /etc/pki/ca-trust/source/anchors/ - update-ca-trust -elif [ "$IS_FLATCAR" -eq 1 ]; then - cp /root/AzureCACertificates/*.pem /etc/ssl/certs/ - update-ca-certificates -else - cp /root/AzureCACertificates/*.crt /usr/local/share/ca-certificates/ - update-ca-certificates + echo "exhausted all retries, last response: $response" + return 1 +} - # This copies the updated bundle to the location used by OpenSSL which is commonly used - cp /etc/ssl/certs/ca-certificates.crt /usr/lib/ssl/cert.pem +function is_opted_in_for_root_certs { + local opt_in_response + + opt_in_response=$(make_request_with_retry "${WIRESERVER_ENDPOINT}/acms/isOptedInForRootCerts") + local request_status=$? 
+ if [ $request_status -ne 0 ] || [ -z "$opt_in_response" ]; then + echo "Warning: failed to determine IsOptedInForRootCerts state" + return 1 + fi + + if echo "$opt_in_response" | grep -q "IsOptedInForRootCerts=true"; then + echo "IsOptedInForRootCerts=true" + return 0 + fi + + echo "Skipping custom cloud root cert installation because IsOptedInForRootCerts is not true" + return 1 +} + +function get_trust_store_dir { + if [ "$IS_ACL" -eq 1 ] || [ "$IS_MARINER" -eq 1 ] || [ "$IS_AZURELINUX" -eq 1 ]; then + echo "/etc/pki/ca-trust/source/anchors" + elif [ "$IS_FLATCAR" -eq 1 ]; then + echo "/etc/ssl/certs" + else + echo "/usr/local/share/ca-certificates" + fi +} + +function debug_print_trust_store { + local stage="$1" + local trust_store_dir + + trust_store_dir=$(get_trust_store_dir) + echo "Trust store contents ${stage} cert copy: ${trust_store_dir}" + ls -al "$trust_store_dir" || true +} + +function retrieve_legacy_certs { + local certs + local cert_names + local cert_bodies + local i + + certs=$(make_request_with_retry "${WIRESERVER_ENDPOINT}/machine?comp=acmspackage&type=cacertificates&ext=json") + if [ -z "$certs" ]; then + echo "Warning: failed to retrieve legacy custom cloud certificates" + return 1 + fi + + IFS_backup=$IFS + IFS=$'\r\n' + cert_names=($(echo $certs | grep -oP '(?<=Name\": \")[^\"]*')) + cert_bodies=($(echo $certs | grep -oP '(?<=CertBody\": \")[^\"]*')) + for i in ${!cert_bodies[@]}; do + echo ${cert_bodies[$i]} | sed 's/\\r\\n/\n/g' | sed 's/\\//g' > "/root/AzureCACertificates/$(echo ${cert_names[$i]} | sed 's/.cer/.crt/g')" + done + IFS=$IFS_backup +} + +function process_cert_operations { + local endpoint_type="$1" + local operation_response + + echo "Retrieving certificate operations for type: $endpoint_type" + operation_response=$(make_request_with_retry "${WIRESERVER_ENDPOINT}/machine?comp=acmspackage&type=$endpoint_type&ext=json") + local request_status=$? 
+ if [ -z "$operation_response" ] || [ $request_status -ne 0 ]; then + echo "Warning: No response received or request failed for: ${WIRESERVER_ENDPOINT}/machine?comp=acmspackage&type=$endpoint_type&ext=json" + return 1 + fi + + local cert_filenames + mapfile -t cert_filenames < <(echo "$operation_response" | grep -oP '(?<="ResouceFileName": ")[^"]*') + + if [ ${#cert_filenames[@]} -eq 0 ]; then + echo "No certificate filenames found in response for $endpoint_type" + return 1 + fi + + for cert_filename in "${cert_filenames[@]}"; do + echo "Processing certificate file: $cert_filename" + + local filename="${cert_filename%.*}" + local extension="${cert_filename##*.}" + local cert_content + + cert_content=$(make_request_with_retry "${WIRESERVER_ENDPOINT}/machine?comp=acmspackage&type=$filename&ext=$extension") + local request_status=$? + if [ -z "$cert_content" ] || [ $request_status -ne 0 ]; then + echo "Warning: No response received or request failed for: ${WIRESERVER_ENDPOINT}/machine?comp=acmspackage&type=$filename&ext=$extension" + continue + fi + + echo "$cert_content" > "/root/AzureCACertificates/$cert_filename" + echo "Successfully saved certificate: $cert_filename" + done +} + +function retrieve_rcv1p_certs { + process_cert_operations "operationrequestsroot" || return 1 + process_cert_operations "operationrequestsintermediate" || return 1 +} + +function install_certs_to_trust_store { + mkdir -p /root/AzureCACertificates + + debug_print_trust_store "before" + + if [ "$IS_ACL" -eq 1 ] || [ "$IS_MARINER" -eq 1 ] || [ "$IS_AZURELINUX" -eq 1 ]; then + cp /root/AzureCACertificates/*.crt /etc/pki/ca-trust/source/anchors/ + update-ca-trust + elif [ "$IS_FLATCAR" -eq 1 ]; then + for cert in /root/AzureCACertificates/*.crt; do + destcert="${cert##*/}" + destcert="${destcert%.*}.pem" + cp "$cert" /etc/ssl/certs/"$destcert" + done + update-ca-certificates + else + cp /root/AzureCACertificates/*.crt /usr/local/share/ca-certificates/ + update-ca-certificates + + # This 
copies the updated bundle to the location used by OpenSSL which is commonly used + cp /etc/ssl/certs/ca-certificates.crt /usr/lib/ssl/cert.pem + fi + + debug_print_trust_store "after" +} + +# Certificate refresh behavior summary: +# - legacy mode directly attempts certificate download from wireserver and only in ussec and usnat regions. +# - rcv1p mode first checks IsOptedInForRootCerts, then downloads only when opted in. +# - Wireserver failures are treated as non-fatal, and cert trust-store updates are skipped gracefully. + +location_normalized="${LOCATION,,}" +location_normalized="${location_normalized//[[:space:]]/}" +if [ -z "$location_normalized" ]; then + echo "Warning: LOCATION is empty; defaulting custom cloud certificate endpoint mode to rcv1p" +fi + +cert_endpoint_mode="rcv1p" +case "$location_normalized" in + ussec*|usnat*) cert_endpoint_mode="legacy" ;; +esac +echo "Using custom cloud certificate endpoint mode: ${cert_endpoint_mode}" +rm -f /root/AzureCACertificates/* +if [ "$cert_endpoint_mode" = "legacy" ]; then + if retrieve_legacy_certs; then + install_certs_to_trust_store + else + echo "Warning: failed to retrieve legacy certificates from wireserver; continuing without trust store updates" + fi +elif [ "$cert_endpoint_mode" = "rcv1p" ]; then + if is_opted_in_for_root_certs; then + if retrieve_rcv1p_certs; then + install_certs_to_trust_store + else + echo "Warning: failed to retrieve rcv1p certificates from wireserver; continuing without trust store updates" + fi + fi fi # This section creates a cron job to poll for refreshed CA certs daily @@ -201,7 +371,80 @@ function init_ubuntu_pmc_repo_depot { add_ms_keys $repodepot_endpoint } -if [ "$IS_UBUNTU" -eq 1 ]; then +function init_mariner_repo_depot { + local repodepot_endpoint=$1 + echo "Adding [extended] repo" + cp /etc/yum.repos.d/mariner-extras.repo /etc/yum.repos.d/mariner-extended.repo + sed -i -e "s|extras|extended|" /etc/yum.repos.d/mariner-extended.repo + sed -i -e "s|Extras|Extended|" 
/etc/yum.repos.d/mariner-extended.repo + + echo "Adding [nvidia] repo" + cp /etc/yum.repos.d/mariner-extras.repo /etc/yum.repos.d/mariner-nvidia.repo + sed -i -e "s|extras|nvidia|" /etc/yum.repos.d/mariner-nvidia.repo + sed -i -e "s|Extras|Nvidia|" /etc/yum.repos.d/mariner-nvidia.repo + + echo "Adding [cloud-native] repo" + cp /etc/yum.repos.d/mariner-extras.repo /etc/yum.repos.d/mariner-cloud-native.repo + sed -i -e "s|extras|cloud-native|" /etc/yum.repos.d/mariner-cloud-native.repo + sed -i -e "s|Extras|Cloud-Native|" /etc/yum.repos.d/mariner-cloud-native.repo + + echo "Pointing Mariner repos at RepoDepot..." + for f in /etc/yum.repos.d/*.repo; do + sed -i -e "s|https://packages.microsoft.com|${repodepot_endpoint}/mariner/packages.microsoft.com|" $f + echo "$f modified." + done + echo "Mariner repo setup complete." +} + +function init_azurelinux_repo_depot { + local repodepot_endpoint=$1 + local repos=("amd" "base" "cloud-native" "extended" "ms-non-oss" "ms-oss" "nvidia") + + rm -f /etc/yum.repos.d/azurelinux* + + for repo in "${repos[@]}"; do + output_file="/etc/yum.repos.d/azurelinux-${repo}.repo" + repo_content=( + "[azurelinux-official-$repo]" + "name=Azure Linux Official $repo \$releasever \$basearch" + "baseurl=$repodepot_endpoint/azurelinux/\$releasever/prod/$repo/\$basearch" + "gpgkey=file:///etc/pki/rpm-gpg/MICROSOFT-RPM-GPG-KEY" + "gpgcheck=1" + "repo_gpgcheck=1" + "enabled=1" + "skip_if_unavailable=True" + "sslverify=1" + ) + + rm -f "$output_file" + + for line in "${repo_content[@]}"; do + echo "$line" >> "$output_file" + done + + echo "File '$output_file' has been created." + done + echo "Azure Linux repo setup complete." +} + +function dnf_makecache { + local retries=10 + local dnf_makecache_output=/tmp/dnf-makecache.out + local i + for i in $(seq 1 $retries); do + ! 
(dnf makecache -y 2>&1 | tee $dnf_makecache_output | grep -E "^([WE]:.*)|([eE]rr.*)$") && \ + cat $dnf_makecache_output && break || \ + cat $dnf_makecache_output + if [ $i -eq $retries ]; then + return 1 + else + sleep 5 + fi + done + echo "Executed dnf makecache -y $i times" +} + +if [ "$IS_UBUNTU" -eq 1 ] || [ "$IS_MARINER" -eq 1 ] || [ "$IS_AZURELINUX" -eq 1 ]; then scriptPath=$0 # Determine an absolute, canonical path to this script for use in cron. if command -v readlink >/dev/null 2>&1; then @@ -260,11 +503,72 @@ EOF systemctl enable --now azure-ca-refresh.timer fi +if [ "$IS_UBUNTU" -eq 1 ]; then + rootRepoDepotEndpoint="$(echo "${REPO_DEPOT_ENDPOINT}" | sed 's/\/ubuntu//')" + if [ -n "$rootRepoDepotEndpoint" ]; then + cloud-init status --wait + ubuntuRel=$(lsb_release --release | awk '{print $2}') + ubuntuDist=$(lsb_release -c | awk '{print $2}') + init_ubuntu_main_repo_depot ${rootRepoDepotEndpoint} + init_ubuntu_pmc_repo_depot ${rootRepoDepotEndpoint} + echo "Running apt-get update" + aptget_update + else + echo "REPO_DEPOT_ENDPOINT empty, skipping Ubuntu RepoDepot initialization" + fi +elif [ "$IS_MARINER" -eq 1 ] || [ "$IS_AZURELINUX" -eq 1 ]; then + cloud-init status --wait + + marinerRepoDepotEndpoint="$(echo "${REPO_DEPOT_ENDPOINT}" | sed 's/\/ubuntu//')" + if [ -z "$marinerRepoDepotEndpoint" ]; then + >&2 echo "repo depot endpoint empty while running custom-cloud init script" + else + if [ "$IS_MARINER" -eq 1 ]; then + echo "Initializing Mariner repo depot settings..." + init_mariner_repo_depot ${marinerRepoDepotEndpoint} + dnf_makecache || exit 1 + else + echo "Initializing Azure Linux repo depot settings..." + init_azurelinux_repo_depot ${marinerRepoDepotEndpoint} + dnf_makecache || exit 1 + fi + fi +fi + # Disable systemd-timesyncd and install chrony and uses local time source # ACL has PTP clock config compiled into chronyd with no config file or sourcedir directives, # so it uses only the local PTP clock and has no DHCP-injectable NTP sources. 
if [ "$IS_ACL" -eq 1 ]; then echo "Skipping chrony configuration for ACL (PTP clock baked into chronyd, no external NTP sources)" +elif [ "$IS_MARINER" -eq 1 ] || [ "$IS_AZURELINUX" -eq 1 ]; then +cat > /etc/chrony.conf < $certFilePath + } + + return $true } - Write-Log "Convert CA certificates rawdata" - $caCerts=($rawData.Content) | ConvertFrom-Json - if ([string]::IsNullOrEmpty($caCerts)) { - Set-ExitCode -ExitCode $global:WINDOWS_CSE_ERROR_EMPTY_CA_CERTIFICATES -ErrorMessage "CA certificates rawdata is empty" + $optInUri = 'http://168.63.129.16/acms/isOptedInForRootCerts' + $optInResponse = Retry-Command -Command 'Invoke-WebRequest' -Args @{Uri=$optInUri; UseBasicParsing=$true} -Retries 5 -RetryDelaySeconds 10 + if (($optInResponse.Content -notmatch 'IsOptedInForRootCerts=true')) { + Write-Log "Skipping custom cloud root cert installation because IsOptedInForRootCerts is not true" + return $false } - $certificates = $caCerts.Certificates - for ($index = 0; $index -lt $certificates.Length ; $index++) { - $name=$certificates[$index].Name - $certFilePath = Join-Path $caFolder $name - Write-Log "Write certificate $name to $certFilePath" - $certificates[$index].CertBody > $certFilePath + $operationRequestTypes = @("operationrequestsroot", "operationrequestsintermediate") + $downloadedAny = $false + + foreach ($requestType in $operationRequestTypes) { + $operationRequestUri = "http://168.63.129.16/machine?comp=acmspackage&type=$requestType&ext=json" + $operationResponse = Retry-Command -Command 'Invoke-WebRequest' -Args @{Uri=$operationRequestUri; UseBasicParsing=$true} -Retries 5 -RetryDelaySeconds 10 + $operationJson = ($operationResponse.Content) | ConvertFrom-Json + + if ($null -eq $operationJson -or $null -eq $operationJson.OperationRequests) { + Write-Log "Warning: no operation requests found for $requestType" + continue + } + + foreach ($operation in $operationJson.OperationRequests) { + $resourceFileName = $operation.ResouceFileName + if 
([string]::IsNullOrEmpty($resourceFileName)) { + continue + } + + $resourceType = [IO.Path]::GetFileNameWithoutExtension($resourceFileName) + $resourceExt = [IO.Path]::GetExtension($resourceFileName).TrimStart('.') + $resourceUri = "http://168.63.129.16/machine?comp=acmspackage&type=$resourceType&ext=$resourceExt" + + $certContentResponse = Retry-Command -Command 'Invoke-WebRequest' -Args @{Uri=$resourceUri; UseBasicParsing=$true} -Retries 5 -RetryDelaySeconds 10 + if ([string]::IsNullOrEmpty($certContentResponse.Content)) { + Write-Log "Warning: empty certificate content for $resourceFileName" + continue + } + + $certFilePath = Join-Path $caFolder $resourceFileName + Write-Log "Write certificate $resourceFileName to $certFilePath" + $certContentResponse.Content > $certFilePath + $downloadedAny = $true + } + } + + if (-not $downloadedAny) { + Write-Log "Warning: no CA certificates were downloaded in rcv1p mode" } + + return $downloadedAny } catch { - # Catch all exceptions in this function. NOTE: exit cannot be caught. - Set-ExitCode -ExitCode $global:WINDOWS_CSE_ERROR_GET_CA_CERTIFICATES -ErrorMessage $_ + Write-Log "Warning: failed to retrieve CA certificates. 
Error: $_" + return $false } } From a183741bcc6a34faecd22002c2ae63c8daa6b70f Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Wed, 18 Mar 2026 14:08:50 -0700 Subject: [PATCH 03/26] feat: enhance CA certificates refresh task with endpoint mode based on location --- .../artifacts/init-aks-custom-cloud.sh | 33 ++++++++++++------- parts/windows/kuberneteswindowssetup.ps1 | 4 ++- staging/cse/windows/kubernetesfunc.ps1 | 15 +++++---- 3 files changed, 34 insertions(+), 18 deletions(-) diff --git a/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh b/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh index fab9e105975..9f3b4fe479e 100644 --- a/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh +++ b/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh @@ -198,16 +198,28 @@ function install_certs_to_trust_store { # - rcv1p mode first checks IsOptedInForRootCerts, then downloads only when opted in. # - Wireserver failures are treated as non-fatal, and cert trust-store updates are skipped gracefully. 
-location_normalized="${LOCATION,,}" -location_normalized="${location_normalized//[[:space:]]/}" -if [ -z "$location_normalized" ]; then - echo "Warning: LOCATION is empty; defaulting custom cloud certificate endpoint mode to rcv1p" +# Action values: +# - init: normal provisioning path +# - ca-refresh: scheduled refresh path +action=${1:-init} +requested_cert_endpoint_mode="${2:-}" + +cert_endpoint_mode="" +if [ "$action" = "ca-refresh" ] && [ -n "$requested_cert_endpoint_mode" ]; then + cert_endpoint_mode="${requested_cert_endpoint_mode,,}" +else + location_normalized="${LOCATION,,}" + location_normalized="${location_normalized//[[:space:]]/}" + if [ -z "$location_normalized" ]; then + echo "Warning: LOCATION is empty; defaulting custom cloud certificate endpoint mode to rcv1p" + fi + + cert_endpoint_mode="rcv1p" + case "$location_normalized" in + ussec*|usnat*) cert_endpoint_mode="legacy" ;; + esac fi -cert_endpoint_mode="rcv1p" -case "$location_normalized" in - ussec*|usnat*) cert_endpoint_mode="legacy" ;; -esac echo "Using custom cloud certificate endpoint mode: ${cert_endpoint_mode}" rm -f /root/AzureCACertificates/* if [ "$cert_endpoint_mode" = "legacy" ]; then @@ -228,7 +240,6 @@ fi # This section creates a cron job to poll for refreshed CA certs daily # It can be removed if not needed or desired -action=${1:-init} if [ "$action" = "ca-refresh" ]; then exit fi @@ -454,7 +465,7 @@ if [ "$IS_UBUNTU" -eq 1 ] || [ "$IS_MARINER" -eq 1 ] || [ "$IS_AZURELINUX" -eq 1 if ! crontab -l 2>/dev/null | grep -q "\"$scriptPath\" ca-refresh"; then # Quote the script path in the cron entry to avoid issues with spaces or special characters. - if ! (crontab -l 2>/dev/null ; printf '%s\n' "0 19 * * * \"$scriptPath\" ca-refresh") | crontab -; then + if ! 
(crontab -l 2>/dev/null ; printf '%s\n' "0 19 * * * \"$scriptPath\" ca-refresh \"$cert_endpoint_mode\"") | crontab -; then echo "Failed to install ca-refresh cron job via crontab" >&2 fi fi @@ -483,7 +494,7 @@ Wants=network-online.target [Service] Type=oneshot -ExecStart=$script_path ca-refresh +ExecStart=$script_path ca-refresh $cert_endpoint_mode EOF cat >"$tmr" < Date: Wed, 18 Mar 2026 17:14:10 -0700 Subject: [PATCH 04/26] feat: add tests for certificate endpoint mode handling in AKS custom cloud spec --- .../artifacts/init_aks_custom_cloud_spec.sh | 39 +++++ staging/cse/windows/kubernetesfunc.tests.ps1 | 147 ++++++++++++++++++ 2 files changed, 186 insertions(+) create mode 100644 spec/parts/linux/cloud-init/artifacts/init_aks_custom_cloud_spec.sh create mode 100644 staging/cse/windows/kubernetesfunc.tests.ps1 diff --git a/spec/parts/linux/cloud-init/artifacts/init_aks_custom_cloud_spec.sh b/spec/parts/linux/cloud-init/artifacts/init_aks_custom_cloud_spec.sh new file mode 100644 index 00000000000..f00709306c2 --- /dev/null +++ b/spec/parts/linux/cloud-init/artifacts/init_aks_custom_cloud_spec.sh @@ -0,0 +1,39 @@ +#!/bin/bash + +Describe 'init-aks-custom-cloud.sh refresh mode wiring' + script_path='./parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh' + + It 'parses action and optional requested cert endpoint mode arguments' + When run grep -Eq '^action=\$\{1:-init\}$' "$script_path" + The status should eq 0 + + When run grep -Eq '^requested_cert_endpoint_mode="\$\{2:-\}"$' "$script_path" + The status should eq 0 + End + + It 'uses requested mode during ca-refresh when provided' + When run grep -Eq '^if \[ "\$action" = "ca-refresh" \] && \[ -n "\$requested_cert_endpoint_mode" \]; then$' "$script_path" + The status should eq 0 + + When run grep -Eq '^\s*cert_endpoint_mode="\$\{requested_cert_endpoint_mode,,\}"$' "$script_path" + The status should eq 0 + End + + It 'exits early in ca-refresh mode after certificate refresh logic' + When run grep -Eq '^if \[ 
"\$action" = "ca-refresh" \]; then$' "$script_path" + The status should eq 0 + + When run grep -Eq '^\s*exit$' "$script_path" + The status should eq 0 + End + + It 'passes cert endpoint mode into cron refresh command' + When run grep -Eq 'ca-refresh "\$cert_endpoint_mode"' "$script_path" + The status should eq 0 + End + + It 'passes cert endpoint mode into systemd refresh command' + When run grep -Eq '^ExecStart=\$script_path ca-refresh \$cert_endpoint_mode$' "$script_path" + The status should eq 0 + End +End diff --git a/staging/cse/windows/kubernetesfunc.tests.ps1 b/staging/cse/windows/kubernetesfunc.tests.ps1 new file mode 100644 index 00000000000..ba14ebb48ef --- /dev/null +++ b/staging/cse/windows/kubernetesfunc.tests.ps1 @@ -0,0 +1,147 @@ +if (-not (Get-PSDrive -Name C -ErrorAction SilentlyContinue)) { + New-PSDrive -Name C -PSProvider FileSystem -Root ([System.IO.Path]::GetTempPath()) | Out-Null +} + +function Write-Log { + param($Message) + Write-Host "$Message" +} + +function Logs-To-Event { + param($TaskName, $TaskMessage) + Write-Host "$TaskName $TaskMessage" +} + +function Set-ExitCode { + param($ExitCode, $ErrorMessage) + throw "Unexpected Set-ExitCode: $ExitCode $ErrorMessage" +} + +function Create-Directory { + param($FullPath, $DirectoryUsage) + if (-not (Test-Path $FullPath)) { + New-Item -Path $FullPath -ItemType Directory -Force | Out-Null + } +} + +function Get-ScheduledTask { + param($TaskName, $ErrorAction) +} + +function New-ScheduledTaskAction { + param($Execute, $Argument) +} + +function New-ScheduledTaskPrincipal { + param($UserId, $LogonType, $RunLevel) +} + +function New-JobTrigger { + param([switch]$Daily, $At, $DaysInterval) +} + +function New-ScheduledTask { + param($Action, $Principal, $Trigger, $Description) +} + +function Register-ScheduledTask { + param($TaskName, $InputObject) +} + +. $PSScriptRoot\..\..\..\parts\windows\windowscsehelper.ps1 +. 
$PSCommandPath.Replace('.tests.ps1', '.ps1') + +Describe 'Get-CustomCloudCertEndpointModeFromLocation' { + It 'returns legacy for ussec regions' { + Get-CustomCloudCertEndpointModeFromLocation -Location 'ussecwest' | Should Be 'legacy' + } + + It 'returns legacy for usnat regions' { + Get-CustomCloudCertEndpointModeFromLocation -Location 'usnatcentral' | Should Be 'legacy' + } + + It 'returns rcv1p for public regions' { + Get-CustomCloudCertEndpointModeFromLocation -Location 'southcentralus' | Should Be 'rcv1p' + } + + It 'handles mixed-case input' { + Get-CustomCloudCertEndpointModeFromLocation -Location 'UsSeCeast' | Should Be 'legacy' + } +} + +Describe 'Register-CACertificatesRefreshTask' { + BeforeEach { + $script:lastScheduledTaskArgument = $null + + Mock Logs-To-Event + Mock Write-Log + Mock New-ScheduledTaskPrincipal -MockWith { return @{ Kind = 'principal' } } + Mock New-JobTrigger -MockWith { return @{ Kind = 'trigger' } } + Mock New-ScheduledTask -MockWith { return @{ Kind = 'definition' } } + Mock Register-ScheduledTask + Mock New-ScheduledTaskAction -MockWith { + param($Execute, $Argument) + $script:lastScheduledTaskArgument = $Argument + return @{ Execute = $Execute; Argument = $Argument } + } + } + + It 'skips registration when the task already exists' { + Mock Get-ScheduledTask -MockWith { return @{ TaskName = 'aks-ca-certs-refresh-task' } } + + Register-CACertificatesRefreshTask -Location 'southcentralus' -CertEndpointMode 'rcv1p' + + Assert-MockCalled -CommandName Register-ScheduledTask -Exactly -Times 0 + Assert-MockCalled -CommandName New-ScheduledTaskAction -Exactly -Times 0 + } + + It 'creates a scheduled task that passes the explicit cert endpoint mode' { + Mock Get-ScheduledTask -MockWith { return $null } + + Register-CACertificatesRefreshTask -Location 'southcentralus' -CertEndpointMode 'rcv1p' + + Assert-MockCalled -CommandName Register-ScheduledTask -Exactly -Times 1 + $script:lastScheduledTaskArgument | Should Match 
([regex]::Escape("Get-CACertificates -Location 'southcentralus' -CertEndpointMode 'rcv1p'")) + } +} + +Describe 'Get-CACertificates' { + BeforeEach { + Mock Write-Log + Mock Create-Directory -MockWith { + param($FullPath, $DirectoryUsage) + if (-not (Test-Path $FullPath)) { + New-Item -Path $FullPath -ItemType Directory -Force | Out-Null + } + } + + if (Test-Path 'C:\ca') { + Remove-Item -Path 'C:\ca' -Recurse -Force + } + } + + It 'uses the legacy endpoint when CertEndpointMode is legacy regardless of location' { + Mock Retry-Command -MockWith { + param($Command, $Args, $Retries, $RetryDelaySeconds) + return [PSCustomObject]@{ + Content = '{"Certificates":[{"Name":"legacy.crt","CertBody":"legacy-body"}]}' + } + } + + $result = Get-CACertificates -Location 'southcentralus' -CertEndpointMode 'legacy' + + $result | Should Be $true + Assert-MockCalled -CommandName Retry-Command -Exactly -Times 1 -ParameterFilter { $Args.Uri -eq 'http://168.63.129.16/machine?comp=acmspackage&type=cacertificates&ext=json' } + Assert-MockCalled -CommandName Retry-Command -Exactly -Times 0 -ParameterFilter { $Args.Uri -eq 'http://168.63.129.16/acms/isOptedInForRootCerts' } + } + + It 'returns false when certificate retrieval throws' { + Mock Retry-Command -MockWith { + throw 'simulated retrieval failure' + } + + $result = Get-CACertificates -Location 'ussecwest' -CertEndpointMode 'rcv1p' + + $result | Should Be $false + } +} From f32dc9f957682f886f1606871a3280a5903d993d Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Thu, 19 Mar 2026 12:44:29 -0700 Subject: [PATCH 05/26] feat: simplify certificate endpoint mode handling and refresh task registration --- .../artifacts/init-aks-custom-cloud.sh | 41 ++++++++----------- parts/windows/kuberneteswindowssetup.ps1 | 4 +- .../artifacts/init_aks_custom_cloud_spec.sh | 21 ++++++---- staging/cse/windows/kubernetesfunc.ps1 | 15 +++---- staging/cse/windows/kubernetesfunc.tests.ps1 | 14 +++---- 5 files changed, 44 insertions(+), 51 deletions(-) 
diff --git a/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh b/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh index 9f3b4fe479e..c7176be2393 100644 --- a/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh +++ b/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh @@ -198,28 +198,19 @@ function install_certs_to_trust_store { # - rcv1p mode first checks IsOptedInForRootCerts, then downloads only when opted in. # - Wireserver failures are treated as non-fatal, and cert trust-store updates are skipped gracefully. -# Action values: -# - init: normal provisioning path -# - ca-refresh: scheduled refresh path -action=${1:-init} -requested_cert_endpoint_mode="${2:-}" - -cert_endpoint_mode="" -if [ "$action" = "ca-refresh" ] && [ -n "$requested_cert_endpoint_mode" ]; then - cert_endpoint_mode="${requested_cert_endpoint_mode,,}" -else - location_normalized="${LOCATION,,}" - location_normalized="${location_normalized//[[:space:]]/}" - if [ -z "$location_normalized" ]; then - echo "Warning: LOCATION is empty; defaulting custom cloud certificate endpoint mode to rcv1p" - fi +refresh_location="${2:-${LOCATION}}" - cert_endpoint_mode="rcv1p" - case "$location_normalized" in - ussec*|usnat*) cert_endpoint_mode="legacy" ;; - esac +location_normalized="${refresh_location,,}" +location_normalized="${location_normalized//[[:space:]]/}" +if [ -z "$location_normalized" ]; then + echo "Warning: LOCATION is empty; defaulting custom cloud certificate endpoint mode to rcv1p" fi +cert_endpoint_mode="rcv1p" +case "$location_normalized" in + ussec*|usnat*) cert_endpoint_mode="legacy" ;; +esac + echo "Using custom cloud certificate endpoint mode: ${cert_endpoint_mode}" rm -f /root/AzureCACertificates/* if [ "$cert_endpoint_mode" = "legacy" ]; then @@ -238,8 +229,12 @@ elif [ "$cert_endpoint_mode" = "rcv1p" ]; then fi fi -# This section creates a cron job to poll for refreshed CA certs daily -# It can be removed if not needed or desired +# In ca-refresh mode 
(invoked by the scheduled cron/systemd task with the location as arg), +# only the cert refresh above is needed; exit before running the full init path. +# Action values: +# - init (default): full provisioning path +# - ca-refresh : periodic refresh path; location is passed as arg to avoid env dependency +action=${1:-init} if [ "$action" = "ca-refresh" ]; then exit fi @@ -465,7 +460,7 @@ if [ "$IS_UBUNTU" -eq 1 ] || [ "$IS_MARINER" -eq 1 ] || [ "$IS_AZURELINUX" -eq 1 if ! crontab -l 2>/dev/null | grep -q "\"$scriptPath\" ca-refresh"; then # Quote the script path in the cron entry to avoid issues with spaces or special characters. - if ! (crontab -l 2>/dev/null ; printf '%s\n' "0 19 * * * \"$scriptPath\" ca-refresh \"$cert_endpoint_mode\"") | crontab -; then + if ! (crontab -l 2>/dev/null ; printf '%s\n' "0 19 * * * \"$scriptPath\" ca-refresh \"$LOCATION\"") | crontab -; then echo "Failed to install ca-refresh cron job via crontab" >&2 fi fi @@ -494,7 +489,7 @@ Wants=network-online.target [Service] Type=oneshot -ExecStart=$script_path ca-refresh $cert_endpoint_mode +ExecStart=$script_path ca-refresh $LOCATION EOF cat >"$tmr" < Date: Thu, 19 Mar 2026 13:04:03 -0700 Subject: [PATCH 06/26] feat: implement conditional CA certificates refresh task registration for legacy and opted-in rcv1p modes --- .../artifacts/init-aks-custom-cloud.sh | 29 +++++++++------ parts/windows/kuberneteswindowssetup.ps1 | 4 ++- .../artifacts/init_aks_custom_cloud_spec.sh | 11 ++++++ staging/cse/windows/kubernetesfunc.ps1 | 21 +++++++++++ staging/cse/windows/kubernetesfunc.tests.ps1 | 36 +++++++++++++++++++ 5 files changed, 89 insertions(+), 12 deletions(-) diff --git a/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh b/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh index c7176be2393..eeb01c392fe 100644 --- a/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh +++ b/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh @@ -212,8 +212,10 @@ case 
"$location_normalized" in esac echo "Using custom cloud certificate endpoint mode: ${cert_endpoint_mode}" +install_ca_refresh_schedule=0 rm -f /root/AzureCACertificates/* if [ "$cert_endpoint_mode" = "legacy" ]; then + install_ca_refresh_schedule=1 if retrieve_legacy_certs; then install_certs_to_trust_store else @@ -221,6 +223,7 @@ if [ "$cert_endpoint_mode" = "legacy" ]; then fi elif [ "$cert_endpoint_mode" = "rcv1p" ]; then if is_opted_in_for_root_certs; then + install_ca_refresh_schedule=1 if retrieve_rcv1p_certs; then install_certs_to_trust_store else @@ -458,10 +461,12 @@ if [ "$IS_UBUNTU" -eq 1 ] || [ "$IS_MARINER" -eq 1 ] || [ "$IS_AZURELINUX" -eq 1 scriptPath="$(readlink -f "$0" 2>/dev/null || printf '%s' "$0")" fi - if ! crontab -l 2>/dev/null | grep -q "\"$scriptPath\" ca-refresh"; then - # Quote the script path in the cron entry to avoid issues with spaces or special characters. - if ! (crontab -l 2>/dev/null ; printf '%s\n' "0 19 * * * \"$scriptPath\" ca-refresh \"$LOCATION\"") | crontab -; then - echo "Failed to install ca-refresh cron job via crontab" >&2 + if [ "$install_ca_refresh_schedule" -eq 1 ]; then + if ! crontab -l 2>/dev/null | grep -q "\"$scriptPath\" ca-refresh"; then + # Quote the script path in the cron entry to avoid issues with spaces or special characters. + if ! 
(crontab -l 2>/dev/null ; printf '%s\n' "0 19 * * * \"$scriptPath\" ca-refresh \"$LOCATION\"") | crontab -; then + echo "Failed to install ca-refresh cron job via crontab" >&2 + fi fi fi @@ -477,11 +482,12 @@ if [ "$IS_UBUNTU" -eq 1 ] || [ "$IS_MARINER" -eq 1 ] || [ "$IS_AZURELINUX" -eq 1 echo "Running apt-get update" aptget_update elif [ "$IS_FLATCAR" -eq 1 ] || [ "$IS_ACL" -eq 1 ]; then - script_path="$(readlink -f "$0")" - svc="/etc/systemd/system/azure-ca-refresh.service" - tmr="/etc/systemd/system/azure-ca-refresh.timer" + if [ "$install_ca_refresh_schedule" -eq 1 ]; then + script_path="$(readlink -f "$0")" + svc="/etc/systemd/system/azure-ca-refresh.service" + tmr="/etc/systemd/system/azure-ca-refresh.timer" - cat >"$svc" <"$svc" <"$tmr" <"$tmr" < Date: Thu, 19 Mar 2026 14:54:49 -0700 Subject: [PATCH 07/26] feat: enhance CA certificates refresh task registration for legacy CSE packages --- parts/windows/kuberneteswindowssetup.ps1 | 9 ++++++++- .../cloud-init/artifacts/init_aks_custom_cloud_spec.sh | 10 +++++----- staging/cse/windows/kubernetesfunc.tests.ps1 | 3 --- 3 files changed, 13 insertions(+), 9 deletions(-) diff --git a/parts/windows/kuberneteswindowssetup.ps1 b/parts/windows/kuberneteswindowssetup.ps1 index 766d85e8bc1..80d8e0ecdea 100644 --- a/parts/windows/kuberneteswindowssetup.ps1 +++ b/parts/windows/kuberneteswindowssetup.ps1 @@ -486,7 +486,14 @@ function BasePrep { Adjust-DynamicPortRange Register-LogsCleanupScriptTask Register-NodeResetScriptTask - if (Should-InstallCACertificatesRefreshTask -Location $Location) { + # Guard against older CSE packages that do not yet export Should-InstallCACertificatesRefreshTask. + # If the function is absent (old package), fall back to the previous unconditional behaviour so + # that legacy/ussec/usnat clusters continue to register the refresh task. 
+ if (Get-Command -Name Should-InstallCACertificatesRefreshTask -ErrorAction Ignore) { + if (Should-InstallCACertificatesRefreshTask -Location $Location) { + Register-CACertificatesRefreshTask -Location $Location + } + } elseif (Get-Command -Name Register-CACertificatesRefreshTask -ErrorAction Ignore) { Register-CACertificatesRefreshTask -Location $Location } diff --git a/spec/parts/linux/cloud-init/artifacts/init_aks_custom_cloud_spec.sh b/spec/parts/linux/cloud-init/artifacts/init_aks_custom_cloud_spec.sh index f85f580a8cc..8b54975d51b 100644 --- a/spec/parts/linux/cloud-init/artifacts/init_aks_custom_cloud_spec.sh +++ b/spec/parts/linux/cloud-init/artifacts/init_aks_custom_cloud_spec.sh @@ -18,7 +18,7 @@ Describe 'init-aks-custom-cloud.sh refresh mode wiring' When run grep -Eq '^location_normalized="\$\{refresh_location,,\}"$' "$script_path" The status should eq 0 - When run grep -Eq 'ussec\*\|usnat\*\) cert_endpoint_mode="legacy"' "$script_path" + When run grep -Eq 'ussec\*|usnat\*\) cert_endpoint_mode="legacy"' "$script_path" The status should eq 0 End @@ -26,10 +26,10 @@ Describe 'init-aks-custom-cloud.sh refresh mode wiring' When run grep -Eq '^install_ca_refresh_schedule=0$' "$script_path" The status should eq 0 - When run grep -Eq '^\s*install_ca_refresh_schedule=1$' "$script_path" + When run grep -Eq '^[[:space:]]*install_ca_refresh_schedule=1$' "$script_path" The status should eq 0 - When run grep -Eq '^\s*if \[ "\$install_ca_refresh_schedule" -eq 1 \]; then$' "$script_path" + When run grep -Eq '^[[:space:]]*if \[ "\$install_ca_refresh_schedule" -eq 1 \]; then$' "$script_path" The status should eq 0 End @@ -37,12 +37,12 @@ Describe 'init-aks-custom-cloud.sh refresh mode wiring' When run grep -Eq '^if \[ "\$action" = "ca-refresh" \]; then$' "$script_path" The status should eq 0 - When run grep -Eq '^\s*exit$' "$script_path" + When run grep -Eq '^[[:space:]]*exit$' "$script_path" The status should eq 0 End It 'passes LOCATION directly into cron refresh 
command' - When run grep -Eq 'ca-refresh \\\\"\$LOCATION\\\\"' "$script_path" + When run grep -Eq 'ca-refresh \\"\$LOCATION\\"' "$script_path" The status should eq 0 End diff --git a/staging/cse/windows/kubernetesfunc.tests.ps1 b/staging/cse/windows/kubernetesfunc.tests.ps1 index 948cd229dc0..8b062a273d0 100644 --- a/staging/cse/windows/kubernetesfunc.tests.ps1 +++ b/staging/cse/windows/kubernetesfunc.tests.ps1 @@ -74,7 +74,6 @@ Describe 'Register-CACertificatesRefreshTask' { $script:lastScheduledTaskArgument = $null Mock Logs-To-Event - Mock Write-Log Mock New-ScheduledTaskPrincipal -MockWith { return @{ Kind = 'principal' } } Mock New-JobTrigger -MockWith { return @{ Kind = 'trigger' } } Mock New-ScheduledTask -MockWith { return @{ Kind = 'definition' } } @@ -107,7 +106,6 @@ Describe 'Register-CACertificatesRefreshTask' { Describe 'Should-InstallCACertificatesRefreshTask' { BeforeEach { - Mock Write-Log } It 'returns true for legacy regions without calling the opt-in endpoint' { @@ -143,7 +141,6 @@ Describe 'Should-InstallCACertificatesRefreshTask' { Describe 'Get-CACertificates' { BeforeEach { - Mock Write-Log Mock Create-Directory -MockWith { param($FullPath, $DirectoryUsage) if (-not (Test-Path $FullPath)) { From b83899d61c5d1e94edbcce137c5ac0ae170816ee Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Thu, 19 Mar 2026 23:27:10 -0700 Subject: [PATCH 08/26] feat: update tests for certificate endpoint mode handling and refresh schedule installation --- .../artifacts/init_aks_custom_cloud_spec.sh | 12 ++++++++++-- staging/cse/windows/kubernetesfunc.tests.ps1 | 7 +++++-- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/spec/parts/linux/cloud-init/artifacts/init_aks_custom_cloud_spec.sh b/spec/parts/linux/cloud-init/artifacts/init_aks_custom_cloud_spec.sh index 8b54975d51b..58812659856 100644 --- a/spec/parts/linux/cloud-init/artifacts/init_aks_custom_cloud_spec.sh +++ b/spec/parts/linux/cloud-init/artifacts/init_aks_custom_cloud_spec.sh @@ 
-17,26 +17,34 @@ Describe 'init-aks-custom-cloud.sh refresh mode wiring' It 'always derives cert endpoint mode from refresh_location' When run grep -Eq '^location_normalized="\$\{refresh_location,,\}"$' "$script_path" The status should eq 0 + End + It 'maps ussec/usnat locations to legacy cert endpoint mode' When run grep -Eq 'ussec\*|usnat\*\) cert_endpoint_mode="legacy"' "$script_path" The status should eq 0 End - It 'installs refresh schedule only for legacy mode or opted-in rcv1p mode' + It 'initializes refresh schedule installation as disabled' When run grep -Eq '^install_ca_refresh_schedule=0$' "$script_path" The status should eq 0 + End + It 'enables refresh schedule installation for eligible certificate modes' When run grep -Eq '^[[:space:]]*install_ca_refresh_schedule=1$' "$script_path" The status should eq 0 + End + It 'gates refresh schedule installation on install_ca_refresh_schedule' When run grep -Eq '^[[:space:]]*if \[ "\$install_ca_refresh_schedule" -eq 1 \]; then$' "$script_path" The status should eq 0 End - It 'exits early in ca-refresh mode after certificate refresh logic' + It 'checks for ca-refresh mode after certificate refresh logic' When run grep -Eq '^if \[ "\$action" = "ca-refresh" \]; then$' "$script_path" The status should eq 0 + End + It 'exits early in ca-refresh mode after certificate refresh logic' When run grep -Eq '^[[:space:]]*exit$' "$script_path" The status should eq 0 End diff --git a/staging/cse/windows/kubernetesfunc.tests.ps1 b/staging/cse/windows/kubernetesfunc.tests.ps1 index 8b062a273d0..42e15c4fc25 100644 --- a/staging/cse/windows/kubernetesfunc.tests.ps1 +++ b/staging/cse/windows/kubernetesfunc.tests.ps1 @@ -48,8 +48,11 @@ function Register-ScheduledTask { param($TaskName, $InputObject) } -. $PSScriptRoot\..\..\..\parts\windows\windowscsehelper.ps1 -. 
$PSCommandPath.Replace('.tests.ps1', '.ps1') +$helperScriptPath = Join-Path $PSScriptRoot '..\..\..\parts\windows\windowscsehelper.ps1' +$scriptUnderTestPath = Join-Path $PSScriptRoot 'kubernetesfunc.ps1' + +. $helperScriptPath +. $scriptUnderTestPath Describe 'Get-CustomCloudCertEndpointModeFromLocation' { It 'returns legacy for ussec regions' { From 650fedc248c689e612992fdf6edadccdefe94e75 Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Fri, 20 Mar 2026 07:42:47 -0700 Subject: [PATCH 09/26] feat: refactor test setup functions for improved readability and consistency --- staging/cse/windows/kubernetesfunc.tests.ps1 | 102 +++++++++---------- 1 file changed, 51 insertions(+), 51 deletions(-) diff --git a/staging/cse/windows/kubernetesfunc.tests.ps1 b/staging/cse/windows/kubernetesfunc.tests.ps1 index 42e15c4fc25..3f9f403666b 100644 --- a/staging/cse/windows/kubernetesfunc.tests.ps1 +++ b/staging/cse/windows/kubernetesfunc.tests.ps1 @@ -1,58 +1,64 @@ -if (-not (Get-PSDrive -Name C -ErrorAction SilentlyContinue)) { - New-PSDrive -Name C -PSProvider FileSystem -Root ([System.IO.Path]::GetTempPath()) | Out-Null -} +BeforeAll { + if (-not (Get-PSDrive -Name C -ErrorAction SilentlyContinue)) { + New-PSDrive -Name C -PSProvider FileSystem -Root ([System.IO.Path]::GetTempPath()) | Out-Null + } -function Write-Log { - param($Message) - Write-Host "$Message" -} + function Write-Log { + param($Message) + Write-Host "$Message" + } -function Logs-To-Event { - param($TaskName, $TaskMessage) - Write-Host "$TaskName $TaskMessage" -} + function Logs-To-Event { + param($TaskName, $TaskMessage) + Write-Host "$TaskName $TaskMessage" + } -function Set-ExitCode { - param($ExitCode, $ErrorMessage) - throw "Unexpected Set-ExitCode: $ExitCode $ErrorMessage" -} + function Set-ExitCode { + param($ExitCode, $ErrorMessage) + throw "Unexpected Set-ExitCode: $ExitCode $ErrorMessage" + } -function Create-Directory { - param($FullPath, $DirectoryUsage) - if (-not (Test-Path $FullPath)) { - 
New-Item -Path $FullPath -ItemType Directory -Force | Out-Null + function Create-Directory { + param($FullPath, $DirectoryUsage) + if (-not (Test-Path $FullPath)) { + New-Item -Path $FullPath -ItemType Directory -Force | Out-Null + } } -} -function Get-ScheduledTask { - param($TaskName, $ErrorAction) -} + function Get-ScheduledTask { + param($TaskName, $ErrorAction) + } -function New-ScheduledTaskAction { - param($Execute, $Argument) -} + function New-ScheduledTaskAction { + param($Execute, $Argument) + } -function New-ScheduledTaskPrincipal { - param($UserId, $LogonType, $RunLevel) -} + function New-ScheduledTaskPrincipal { + param($UserId, $LogonType, $RunLevel) + } -function New-JobTrigger { - param([switch]$Daily, $At, $DaysInterval) -} + function New-JobTrigger { + param([switch]$Daily, $At, $DaysInterval) + } -function New-ScheduledTask { - param($Action, $Principal, $Trigger, $Description) -} + function New-ScheduledTask { + param($Action, $Principal, $Trigger, $Description) + } -function Register-ScheduledTask { - param($TaskName, $InputObject) -} + function Register-ScheduledTask { + param($TaskName, $InputObject) + } + + function Retry-Command { + param($Command, $Args, $Retries, $RetryDelaySeconds) + } -$helperScriptPath = Join-Path $PSScriptRoot '..\..\..\parts\windows\windowscsehelper.ps1' -$scriptUnderTestPath = Join-Path $PSScriptRoot 'kubernetesfunc.ps1' + $helperScriptPath = Join-Path $PSScriptRoot '..\..\..\parts\windows\windowscsehelper.ps1' + $scriptUnderTestPath = Join-Path $PSScriptRoot 'kubernetesfunc.ps1' -. $helperScriptPath -. $scriptUnderTestPath + . $helperScriptPath + . 
$scriptUnderTestPath +} Describe 'Get-CustomCloudCertEndpointModeFromLocation' { It 'returns legacy for ussec regions' { @@ -76,11 +82,11 @@ Describe 'Register-CACertificatesRefreshTask' { BeforeEach { $script:lastScheduledTaskArgument = $null - Mock Logs-To-Event + Mock Logs-To-Event -MockWith { } Mock New-ScheduledTaskPrincipal -MockWith { return @{ Kind = 'principal' } } Mock New-JobTrigger -MockWith { return @{ Kind = 'trigger' } } Mock New-ScheduledTask -MockWith { return @{ Kind = 'definition' } } - Mock Register-ScheduledTask + Mock Register-ScheduledTask -MockWith { } Mock New-ScheduledTaskAction -MockWith { param($Execute, $Argument) $script:lastScheduledTaskArgument = $Argument @@ -109,6 +115,7 @@ Describe 'Register-CACertificatesRefreshTask' { Describe 'Should-InstallCACertificatesRefreshTask' { BeforeEach { + Mock Retry-Command -MockWith { } } It 'returns true for legacy regions without calling the opt-in endpoint' { @@ -144,13 +151,6 @@ Describe 'Should-InstallCACertificatesRefreshTask' { Describe 'Get-CACertificates' { BeforeEach { - Mock Create-Directory -MockWith { - param($FullPath, $DirectoryUsage) - if (-not (Test-Path $FullPath)) { - New-Item -Path $FullPath -ItemType Directory -Force | Out-Null - } - } - if (Test-Path 'C:\ca') { Remove-Item -Path 'C:\ca' -Recurse -Force } From 1e3d32e2fa850d3ee36aad35ef3d951e601b065f Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Fri, 20 Mar 2026 08:52:23 -0700 Subject: [PATCH 10/26] feat: update Get-CustomCloudCertEndpointModeFromLocation to clarify endpoint mode handling for legacy and rcv1p regions --- staging/cse/windows/kubernetesfunc.ps1 | 2 ++ staging/cse/windows/kubernetesfunc.tests.ps1 | 20 ++++++++++---------- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/staging/cse/windows/kubernetesfunc.ps1 b/staging/cse/windows/kubernetesfunc.ps1 index 5ae9df1e217..023542b6f3c 100644 --- a/staging/cse/windows/kubernetesfunc.ps1 +++ b/staging/cse/windows/kubernetesfunc.ps1 @@ -275,11 
+275,13 @@ function Get-CustomCloudCertEndpointModeFromLocation { $Location ) + # ussec/usnat regions still use the legacy certificate endpoint contract. $normalizedLocation = $Location.ToLowerInvariant() if ($normalizedLocation.StartsWith("ussec") -or $normalizedLocation.StartsWith("usnat")) { return "legacy" } + # All other regions use the rcv1p endpoint mode with opt-in gating. return "rcv1p" } diff --git a/staging/cse/windows/kubernetesfunc.tests.ps1 b/staging/cse/windows/kubernetesfunc.tests.ps1 index 3f9f403666b..2e95cef1338 100644 --- a/staging/cse/windows/kubernetesfunc.tests.ps1 +++ b/staging/cse/windows/kubernetesfunc.tests.ps1 @@ -62,19 +62,19 @@ BeforeAll { Describe 'Get-CustomCloudCertEndpointModeFromLocation' { It 'returns legacy for ussec regions' { - Get-CustomCloudCertEndpointModeFromLocation -Location 'ussecwest' | Should Be 'legacy' + Get-CustomCloudCertEndpointModeFromLocation -Location 'ussecwest' | Should -Be 'legacy' } It 'returns legacy for usnat regions' { - Get-CustomCloudCertEndpointModeFromLocation -Location 'usnatcentral' | Should Be 'legacy' + Get-CustomCloudCertEndpointModeFromLocation -Location 'usnatcentral' | Should -Be 'legacy' } It 'returns rcv1p for public regions' { - Get-CustomCloudCertEndpointModeFromLocation -Location 'southcentralus' | Should Be 'rcv1p' + Get-CustomCloudCertEndpointModeFromLocation -Location 'southcentralus' | Should -Be 'rcv1p' } It 'handles mixed-case input' { - Get-CustomCloudCertEndpointModeFromLocation -Location 'UsSeCeast' | Should Be 'legacy' + Get-CustomCloudCertEndpointModeFromLocation -Location 'UsSeCeast' | Should -Be 'legacy' } } @@ -109,7 +109,7 @@ Describe 'Register-CACertificatesRefreshTask' { Register-CACertificatesRefreshTask -Location 'southcentralus' Assert-MockCalled -CommandName Register-ScheduledTask -Exactly -Times 1 - $script:lastScheduledTaskArgument | Should Match ([regex]::Escape("Get-CACertificates -Location 'southcentralus'")) + $script:lastScheduledTaskArgument | Should -Match 
([regex]::Escape("Get-CACertificates -Location 'southcentralus'")) } } @@ -123,7 +123,7 @@ Describe 'Should-InstallCACertificatesRefreshTask' { $result = Should-InstallCACertificatesRefreshTask -Location 'ussecwest' - $result | Should Be $true + $result | Should -Be $true Assert-MockCalled -CommandName Retry-Command -Exactly -Times 0 } @@ -134,7 +134,7 @@ Describe 'Should-InstallCACertificatesRefreshTask' { $result = Should-InstallCACertificatesRefreshTask -Location 'southcentralus' - $result | Should Be $true + $result | Should -Be $true Assert-MockCalled -CommandName Retry-Command -Exactly -Times 1 -ParameterFilter { $Args.Uri -eq 'http://168.63.129.16/acms/isOptedInForRootCerts' } } @@ -145,7 +145,7 @@ Describe 'Should-InstallCACertificatesRefreshTask' { $result = Should-InstallCACertificatesRefreshTask -Location 'southcentralus' - $result | Should Be $false + $result | Should -Be $false } } @@ -166,7 +166,7 @@ Describe 'Get-CACertificates' { $result = Get-CACertificates -Location 'ussecwest' - $result | Should Be $true + $result | Should -Be $true Assert-MockCalled -CommandName Retry-Command -Exactly -Times 1 -ParameterFilter { $Args.Uri -eq 'http://168.63.129.16/machine?comp=acmspackage&type=cacertificates&ext=json' } Assert-MockCalled -CommandName Retry-Command -Exactly -Times 0 -ParameterFilter { $Args.Uri -eq 'http://168.63.129.16/acms/isOptedInForRootCerts' } } @@ -178,6 +178,6 @@ Describe 'Get-CACertificates' { $result = Get-CACertificates -Location 'southcentralus' - $result | Should Be $false + $result | Should -Be $false } } From acb915657cc0857eeb7354f259db446fff6d296b Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Fri, 20 Mar 2026 09:52:25 -0700 Subject: [PATCH 11/26] feat: enhance tests for Should-InstallCACertificatesRefreshTask and Get-CACertificates to verify URI handling --- staging/cse/windows/kubernetesfunc.tests.ps1 | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git 
a/staging/cse/windows/kubernetesfunc.tests.ps1 b/staging/cse/windows/kubernetesfunc.tests.ps1 index 2e95cef1338..8186bfabc4c 100644 --- a/staging/cse/windows/kubernetesfunc.tests.ps1 +++ b/staging/cse/windows/kubernetesfunc.tests.ps1 @@ -128,14 +128,18 @@ Describe 'Should-InstallCACertificatesRefreshTask' { } It 'returns true for rcv1p regions when opt-in is enabled' { + $script:lastRetryUri = $null Mock Retry-Command -MockWith { + param($Command, $Args, $Retries, $RetryDelaySeconds) + $script:lastRetryUri = $PSBoundParameters['Args'].Uri return [PSCustomObject]@{ Content = 'IsOptedInForRootCerts=true' } } $result = Should-InstallCACertificatesRefreshTask -Location 'southcentralus' $result | Should -Be $true - Assert-MockCalled -CommandName Retry-Command -Exactly -Times 1 -ParameterFilter { $Args.Uri -eq 'http://168.63.129.16/acms/isOptedInForRootCerts' } + Assert-MockCalled -CommandName Retry-Command -Exactly -Times 1 + $script:lastRetryUri | Should -Be 'http://168.63.129.16/acms/isOptedInForRootCerts' } It 'returns false for rcv1p regions when opt-in is disabled' { @@ -157,8 +161,10 @@ Describe 'Get-CACertificates' { } It 'uses the legacy endpoint when location is a ussec/usnat region' { + $script:retryUris = @() Mock Retry-Command -MockWith { param($Command, $Args, $Retries, $RetryDelaySeconds) + $script:retryUris += $PSBoundParameters['Args'].Uri return [PSCustomObject]@{ Content = '{"Certificates":[{"Name":"legacy.crt","CertBody":"legacy-body"}]}' } @@ -167,8 +173,9 @@ Describe 'Get-CACertificates' { $result = Get-CACertificates -Location 'ussecwest' $result | Should -Be $true - Assert-MockCalled -CommandName Retry-Command -Exactly -Times 1 -ParameterFilter { $Args.Uri -eq 'http://168.63.129.16/machine?comp=acmspackage&type=cacertificates&ext=json' } - Assert-MockCalled -CommandName Retry-Command -Exactly -Times 0 -ParameterFilter { $Args.Uri -eq 'http://168.63.129.16/acms/isOptedInForRootCerts' } + Assert-MockCalled -CommandName Retry-Command -Exactly -Times 
1 + $script:retryUris | Should -Contain 'http://168.63.129.16/machine?comp=acmspackage&type=cacertificates&ext=json' + $script:retryUris | Should -Not -Contain 'http://168.63.129.16/acms/isOptedInForRootCerts' } It 'returns false when certificate retrieval throws' { From c496a58848b3b95b7f0a73c4e6a13a69d3d6760d Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Wed, 25 Mar 2026 16:58:14 -0700 Subject: [PATCH 12/26] feat: update cse_cmd.sh and cse_cmd.sh.gtpl to ensure consistent logging of custom cloud file paths --- aks-node-controller/parser/templates/cse_cmd.sh.gtpl | 2 +- parts/linux/cloud-init/artifacts/cse_cmd.sh | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/aks-node-controller/parser/templates/cse_cmd.sh.gtpl b/aks-node-controller/parser/templates/cse_cmd.sh.gtpl index d685a3444da..42376814388 100644 --- a/aks-node-controller/parser/templates/cse_cmd.sh.gtpl +++ b/aks-node-controller/parser/templates/cse_cmd.sh.gtpl @@ -1,7 +1,7 @@ echo $(date),$(hostname) > ${PROVISION_OUTPUT}; {{if getIsAksCustomCloud .CustomCloudConfig}} REPO_DEPOT_ENDPOINT="{{.CustomCloudConfig.RepoDepotEndpoint}}" -{{getInitAKSCustomCloudFilepath}} >> /var/log/azure/cluster-provision.log 2>&1; {{end}} LOCATION="{{getCloudLocation .}}" +{{getInitAKSCustomCloudFilepath}} >> /var/log/azure/cluster-provision.log 2>&1; /usr/bin/nohup /bin/bash -c "/bin/bash /opt/azure/containers/provision_start.sh" diff --git a/parts/linux/cloud-init/artifacts/cse_cmd.sh b/parts/linux/cloud-init/artifacts/cse_cmd.sh index 550ac950c37..7a034efa98f 100644 --- a/parts/linux/cloud-init/artifacts/cse_cmd.sh +++ b/parts/linux/cloud-init/artifacts/cse_cmd.sh @@ -18,8 +18,9 @@ fi; {{end}} {{if IsAKSCustomCloud}} REPO_DEPOT_ENDPOINT="{{AKSCustomCloudRepoDepotEndpoint}}" -{{GetInitAKSCustomCloudFilepath}} >> /var/log/azure/cluster-provision.log 2>&1; {{end}} +LOCATION={{GetVariable "location"}} +{{GetInitAKSCustomCloudFilepath}} >> /var/log/azure/cluster-provision.log 2>&1; 
ADMINUSER={{GetParameter "linuxAdminUsername"}} MOBY_VERSION={{GetParameter "mobyVersion"}} TENANT_ID={{GetVariable "tenantID"}} @@ -32,7 +33,6 @@ KUBEPROXY_URL={{GetParameter "kubeProxySpec"}} APISERVER_PUBLIC_KEY={{GetParameter "apiServerCertificate"}} SUBSCRIPTION_ID={{GetVariable "subscriptionId"}} RESOURCE_GROUP={{GetVariable "resourceGroup"}} -LOCATION={{GetVariable "location"}} VM_TYPE={{GetVariable "vmType"}} SUBNET={{GetVariable "subnetName"}} NETWORK_SECURITY_GROUP={{GetVariable "nsgName"}} From 1071513ba798acb792b218c3eac877a9924c0fa3 Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Thu, 26 Mar 2026 12:55:30 -0700 Subject: [PATCH 13/26] feat: update CA certificates functions for backward compatibility with optional Location parameter --- staging/cse/windows/kubernetesfunc.ps1 | 35 +++++++++---- staging/cse/windows/kubernetesfunc.tests.ps1 | 52 ++++++++++++++++++++ 2 files changed, 78 insertions(+), 9 deletions(-) diff --git a/staging/cse/windows/kubernetesfunc.ps1 b/staging/cse/windows/kubernetesfunc.ps1 index 023542b6f3c..d9852e4288d 100644 --- a/staging/cse/windows/kubernetesfunc.ps1 +++ b/staging/cse/windows/kubernetesfunc.ps1 @@ -71,8 +71,8 @@ function Register-NodeResetScriptTask { function Register-CACertificatesRefreshTask { Param( - [Parameter(Mandatory = $true)][string] - $Location + [Parameter(Mandatory = $false)][string] + $Location = "" ) Logs-To-Event -TaskName "AKS.WindowsCSE.RegisterCACertificatesRefreshTask" -TaskMessage "Start to register CA certificates refresh task" @@ -84,7 +84,13 @@ function Register-CACertificatesRefreshTask { return } - $refreshCommand = "& { . 'C:\AzureData\windows\windowscsehelper.ps1'; . 'C:\AzureData\windows\kubernetesfunc.ps1'; Get-CACertificates -Location '$Location' | Out-Null }" + # Include -Location only when it was provided, so older VHDs whose Get-CACertificates + # does not accept -Location can still execute the scheduled task successfully. 
+ if ([string]::IsNullOrEmpty($Location)) { + $refreshCommand = "& { . 'C:\AzureData\windows\windowscsehelper.ps1'; . 'C:\AzureData\windows\kubernetesfunc.ps1'; Get-CACertificates | Out-Null }" + } else { + $refreshCommand = "& { . 'C:\AzureData\windows\windowscsehelper.ps1'; . 'C:\AzureData\windows\kubernetesfunc.ps1'; Get-CACertificates -Location '$Location' | Out-Null }" + } $action = New-ScheduledTaskAction -Execute "powershell.exe" -Argument "-NoProfile -NonInteractive -ExecutionPolicy Bypass -Command `"$refreshCommand`"" $principal = New-ScheduledTaskPrincipal -UserId SYSTEM -LogonType ServiceAccount -RunLevel Highest $trigger = New-JobTrigger -Daily -At "19:00" -DaysInterval 1 @@ -287,10 +293,14 @@ function Get-CustomCloudCertEndpointModeFromLocation { function Should-InstallCACertificatesRefreshTask { Param( - [Parameter(Mandatory = $true)][string] - $Location + [Parameter(Mandatory = $false)][string] + $Location = "" ) + # When Location is not supplied (older callers), default to legacy mode. + if ([string]::IsNullOrEmpty($Location)) { + return $true + } $certEndpointMode = Get-CustomCloudCertEndpointModeFromLocation -Location $Location if ($certEndpointMode -eq "legacy") { return $true @@ -308,15 +318,22 @@ function Should-InstallCACertificatesRefreshTask { function Get-CACertificates { Param( - [Parameter(Mandatory = $true)][string] - $Location + [Parameter(Mandatory = $false)][string] + $Location = "" ) $caFolder = "C:\ca" Create-Directory -FullPath $caFolder -DirectoryUsage "storing CA certificates" - $certEndpointMode = Get-CustomCloudCertEndpointModeFromLocation -Location $Location - Write-Log "Get CA certificates. Location: $Location. EndpointMode: $certEndpointMode" + # When Location is not supplied (older callers), fall back to the legacy endpoint + # which was the original behavior before the rcv1p changes. + if ([string]::IsNullOrEmpty($Location)) { + $certEndpointMode = "legacy" + Write-Log "Get CA certificates. 
Location not provided, defaulting to legacy endpoint mode" + } else { + $certEndpointMode = Get-CustomCloudCertEndpointModeFromLocation -Location $Location + Write-Log "Get CA certificates. Location: $Location. EndpointMode: $certEndpointMode" + } try { if ($certEndpointMode -eq "legacy") { diff --git a/staging/cse/windows/kubernetesfunc.tests.ps1 b/staging/cse/windows/kubernetesfunc.tests.ps1 index 8186bfabc4c..8ada13ee440 100644 --- a/staging/cse/windows/kubernetesfunc.tests.ps1 +++ b/staging/cse/windows/kubernetesfunc.tests.ps1 @@ -187,4 +187,56 @@ Describe 'Get-CACertificates' { $result | Should -Be $false } + + It 'falls back to legacy endpoint when called without -Location (backward compat)' { + $script:retryUris = @() + Mock Retry-Command -MockWith { + param($Command, $Args, $Retries, $RetryDelaySeconds) + $script:retryUris += $PSBoundParameters['Args'].Uri + return [PSCustomObject]@{ + Content = '{"Certificates":[{"Name":"compat.crt","CertBody":"compat-body"}]}' + } + } + + $result = Get-CACertificates + + $result | Should -Be $true + Assert-MockCalled -CommandName Retry-Command -Exactly -Times 1 + $script:retryUris | Should -Contain 'http://168.63.129.16/machine?comp=acmspackage&type=cacertificates&ext=json' + } +} + +Describe 'Should-InstallCACertificatesRefreshTask - backward compat' { + It 'returns true when called without -Location (backward compat)' { + $result = Should-InstallCACertificatesRefreshTask + + $result | Should -Be $true + } +} + +Describe 'Register-CACertificatesRefreshTask - backward compat' { + BeforeEach { + $script:lastScheduledTaskArgument = $null + + Mock Logs-To-Event -MockWith { } + Mock New-ScheduledTaskPrincipal -MockWith { return @{ Kind = 'principal' } } + Mock New-JobTrigger -MockWith { return @{ Kind = 'trigger' } } + Mock New-ScheduledTask -MockWith { return @{ Kind = 'definition' } } + Mock Register-ScheduledTask -MockWith { } + Mock New-ScheduledTaskAction -MockWith { + param($Execute, $Argument) + 
$script:lastScheduledTaskArgument = $Argument + return @{ Execute = $Execute; Argument = $Argument } + } + } + + It 'creates a scheduled task without -Location when called without it (backward compat)' { + Mock Get-ScheduledTask -MockWith { return $null } + + Register-CACertificatesRefreshTask + + Assert-MockCalled -CommandName Register-ScheduledTask -Exactly -Times 1 + $script:lastScheduledTaskArgument | Should -Match ([regex]::Escape("Get-CACertificates |")) + $script:lastScheduledTaskArgument | Should -Not -Match "Location" + } } From af55d5cd4a40524723c82373ceeaf119e8e6ad31 Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Fri, 27 Mar 2026 09:10:22 -0700 Subject: [PATCH 14/26] feat: remove deprecated Ubuntu repository initialization logic from init-aks-custom-cloud.sh --- .../cloud-init/artifacts/init-aks-custom-cloud.sh | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh b/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh index eeb01c392fe..0c5487da414 100644 --- a/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh +++ b/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh @@ -469,18 +469,6 @@ if [ "$IS_UBUNTU" -eq 1 ] || [ "$IS_MARINER" -eq 1 ] || [ "$IS_AZURELINUX" -eq 1 fi fi fi - - cloud-init status --wait - rootRepoDepotEndpoint="$(echo "${REPO_DEPOT_ENDPOINT}" | sed 's/\/ubuntu//')" - # logic taken from https://repodepot.azure.com/scripts/cloud-init/setup_repodepot.sh - ubuntuRel=$(lsb_release --release | awk '{print $2}') - ubuntuDist=$(lsb_release -c | awk '{print $2}') - # initialize archive.ubuntu.com repo - init_ubuntu_main_repo_depot ${rootRepoDepotEndpoint} - init_ubuntu_pmc_repo_depot ${rootRepoDepotEndpoint} - # update apt list - echo "Running apt-get update" - aptget_update elif [ "$IS_FLATCAR" -eq 1 ] || [ "$IS_ACL" -eq 1 ]; then if [ "$install_ca_refresh_schedule" -eq 1 ]; then script_path="$(readlink -f "$0")" From 
d3408c26f70d29522e7f24c2d87f683a043113f5 Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Thu, 2 Apr 2026 14:18:47 -0700 Subject: [PATCH 15/26] Split init-aks-custom-cloud.sh to fix Flatcar/ACL customData size limit The unified init-aks-custom-cloud.sh script (~22KB) pushed Flatcar and ACL VM customData over Azure's 87,380 character limit, causing 16 E2E failures. Split the script into two files: - init-aks-custom-cloud.sh: cert refresh + scheduling (included for all clouds) - init-aks-custom-cloud-repos.sh: repo depot + chrony (custom cloud only) The main script sources the repos script at runtime if present. For non-custom-cloud VMs, only the smaller main script is embedded, reducing base64(gzip) size from 8,736 to 4,424 chars (-4,312 chars). Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../artifacts/init-aks-custom-cloud-repos.sh | 358 ++++++++++++++++++ .../artifacts/init-aks-custom-cloud.sh | 350 +---------------- parts/linux/cloud-init/nodecustomdata.yml | 7 + pkg/agent/baker.go | 3 + pkg/agent/const.go | 5 +- pkg/agent/variables.go | 3 + 6 files changed, 381 insertions(+), 345 deletions(-) create mode 100644 parts/linux/cloud-init/artifacts/init-aks-custom-cloud-repos.sh diff --git a/parts/linux/cloud-init/artifacts/init-aks-custom-cloud-repos.sh b/parts/linux/cloud-init/artifacts/init-aks-custom-cloud-repos.sh new file mode 100644 index 00000000000..0c68d513568 --- /dev/null +++ b/parts/linux/cloud-init/artifacts/init-aks-custom-cloud-repos.sh @@ -0,0 +1,358 @@ +#!/bin/bash +# This script handles repo depot initialization and chrony configuration for +# AKS custom cloud environments. It is sourced by init-aks-custom-cloud.sh and +# inherits all variables from it (IS_UBUNTU, IS_MARINER, IS_AZURELINUX, +# IS_FLATCAR, IS_ACL, REPO_DEPOT_ENDPOINT, etc.). +# +# This script is only included in custom cloud images to keep the base +# customData size small for non-custom-cloud scenarios. 
+ +set -x + +function init_ubuntu_main_repo_depot { + local repodepot_endpoint="$1" + # Initialize directory for keys + mkdir -p /etc/apt/keyrings + + # This copies the updated bundle to the location used by OpenSSL which is commonly used + echo "Copying updated bundle to OpenSSL .pem file..." + cp /etc/ssl/certs/ca-certificates.crt /usr/lib/ssl/cert.pem + echo "Updated bundle copied." + + # Back up sources.list and sources.list.d contents + mkdir -p /etc/apt/backup/ + if [ -f "/etc/apt/sources.list" ]; then + mv /etc/apt/sources.list /etc/apt/backup/ + fi + for sources_file in /etc/apt/sources.list.d/*; do + if [ -f "$sources_file" ]; then + mv "$sources_file" /etc/apt/backup/ + fi + done + + # Set location of sources file + . /etc/os-release + aptSourceFile="/etc/apt/sources.list.d/ubuntu.sources" + + # Create main sources file + cat < /etc/apt/sources.list.d/ubuntu.sources + +Types: deb +URIs: ${repodepot_endpoint}/ubuntu +Suites: ${VERSION_CODENAME} ${VERSION_CODENAME}-updates ${VERSION_CODENAME}-backports ${VERSION_CODENAME}-security +Components: main universe restricted multiverse +Signed-By: /usr/share/keyrings/ubuntu-archive-keyring.gpg +EOF + + # Update the apt sources file using the RepoDepot Ubuntu URL for this cloud. Update it by replacing + # all urls with the RepoDepot Ubuntu url + ubuntuUrl=${repodepot_endpoint}/ubuntu + echo "Converting URLs in $aptSourceFile to RepoDepot URLs..." + sed -i "s,https\?://.[^ ]*,$ubuntuUrl,g" $aptSourceFile + echo "apt source URLs converted, see new file below:" + echo "" + echo "-----" + cat $aptSourceFile + echo "-----" + echo "" +} + +function check_url { + local url=$1 + echo "Checking url: $url" + + # Use curl to check the URL and capture both stdout and stderr + curl_exit_code=$(curl -s --head --request GET $url) + # Check the exit status of curl + # shellcheck disable=SC3010 + if [[ $? -ne 0 ]] || echo "$curl_exit_code" | grep -E "404 Not Found" > /dev/null; then + echo "ERROR: $url is not available. 
Please manually check if the url is valid before re-running script" + exit 1 + fi +} + +function write_to_sources_file { + local sources_list_d_file=$1 + local source_uri=$2 + shift 2 + local key_paths=("$@") + + sources_file_path="/etc/apt/sources.list.d/${sources_list_d_file}.sources" + ubuntuDist=$(lsb_release -c | awk '{print $2}') + + tee -a $sources_file_path < /dev/null + echo "$key_name key added to keyring." +} + +function derive_key_paths { + local key_names=("$@") + local key_paths=() + + for key_name in "${key_names[@]}"; do + key_paths+=("/etc/apt/keyrings/${key_name}.gpg") + done + + echo "${key_paths[*]}" +} + +function add_ms_keys { + # Add the Microsoft package server keys to keyring. + echo "Adding Microsoft keys to keyring..." + + add_key_ubuntu microsoft.asc + add_key_ubuntu msopentech.asc +} + +function aptget_update { + echo "apt-get updating..." + echo "note: depending on how many sources have been added this may take a couple minutes..." + if apt-get update | grep -q "404 Not Found"; then + echo "ERROR: apt-get update failed to find all sources. Please validate the sources or remove bad sources from your sources and try again." + exit 1 + else + echo "apt-get update complete!" + fi +} + +function init_ubuntu_pmc_repo_depot { + local repodepot_endpoint="$1" + # Add Microsoft packages source to the azure specific sources.list. + echo "Adding the packages.microsoft.com Ubuntu-$ubuntuRel repo..." + + microsoftPackageSource="$repodepot_endpoint/microsoft/ubuntu/$ubuntuRel/prod" + check_url $microsoftPackageSource + write_to_sources_file microsoft-prod $microsoftPackageSource $(derive_key_paths microsoft.asc msopentech.asc) + write_to_sources_file microsoft-prod-testing $microsoftPackageSource $(derive_key_paths microsoft.asc msopentech.asc) + echo "Ubuntu ($ubuntuRel) repo added." 
+ echo "Adding packages.microsoft.com keys" + add_ms_keys $repodepot_endpoint +} + +function init_mariner_repo_depot { + local repodepot_endpoint=$1 + echo "Adding [extended] repo" + cp /etc/yum.repos.d/mariner-extras.repo /etc/yum.repos.d/mariner-extended.repo + sed -i -e "s|extras|extended|" /etc/yum.repos.d/mariner-extended.repo + sed -i -e "s|Extras|Extended|" /etc/yum.repos.d/mariner-extended.repo + + echo "Adding [nvidia] repo" + cp /etc/yum.repos.d/mariner-extras.repo /etc/yum.repos.d/mariner-nvidia.repo + sed -i -e "s|extras|nvidia|" /etc/yum.repos.d/mariner-nvidia.repo + sed -i -e "s|Extras|Nvidia|" /etc/yum.repos.d/mariner-nvidia.repo + + echo "Adding [cloud-native] repo" + cp /etc/yum.repos.d/mariner-extras.repo /etc/yum.repos.d/mariner-cloud-native.repo + sed -i -e "s|extras|cloud-native|" /etc/yum.repos.d/mariner-cloud-native.repo + sed -i -e "s|Extras|Cloud-Native|" /etc/yum.repos.d/mariner-cloud-native.repo + + echo "Pointing Mariner repos at RepoDepot..." + for f in /etc/yum.repos.d/*.repo; do + sed -i -e "s|https://packages.microsoft.com|${repodepot_endpoint}/mariner/packages.microsoft.com|" $f + echo "$f modified." + done + echo "Mariner repo setup complete." +} + +function init_azurelinux_repo_depot { + local repodepot_endpoint=$1 + local repos=("amd" "base" "cloud-native" "extended" "ms-non-oss" "ms-oss" "nvidia") + + rm -f /etc/yum.repos.d/azurelinux* + + for repo in "${repos[@]}"; do + output_file="/etc/yum.repos.d/azurelinux-${repo}.repo" + repo_content=( + "[azurelinux-official-$repo]" + "name=Azure Linux Official $repo \$releasever \$basearch" + "baseurl=$repodepot_endpoint/azurelinux/\$releasever/prod/$repo/\$basearch" + "gpgkey=file:///etc/pki/rpm-gpg/MICROSOFT-RPM-GPG-KEY" + "gpgcheck=1" + "repo_gpgcheck=1" + "enabled=1" + "skip_if_unavailable=True" + "sslverify=1" + ) + + rm -f "$output_file" + + for line in "${repo_content[@]}"; do + echo "$line" >> "$output_file" + done + + echo "File '$output_file' has been created." 
+ done + echo "Azure Linux repo setup complete." +} + +function dnf_makecache { + local retries=10 + local dnf_makecache_output=/tmp/dnf-makecache.out + local i + for i in $(seq 1 $retries); do + ! (dnf makecache -y 2>&1 | tee $dnf_makecache_output | grep -E "^([WE]:.*)|([eE]rr.*)$") && \ + cat $dnf_makecache_output && break || \ + cat $dnf_makecache_output + if [ $i -eq $retries ]; then + return 1 + else + sleep 5 + fi + done + echo "Executed dnf makecache -y $i times" +} + +if [ "$IS_UBUNTU" -eq 1 ]; then + rootRepoDepotEndpoint="$(echo "${REPO_DEPOT_ENDPOINT}" | sed 's/\/ubuntu//')" + if [ -n "$rootRepoDepotEndpoint" ]; then + cloud-init status --wait + ubuntuRel=$(lsb_release --release | awk '{print $2}') + ubuntuDist=$(lsb_release -c | awk '{print $2}') + init_ubuntu_main_repo_depot ${rootRepoDepotEndpoint} + init_ubuntu_pmc_repo_depot ${rootRepoDepotEndpoint} + echo "Running apt-get update" + aptget_update + else + echo "REPO_DEPOT_ENDPOINT empty, skipping Ubuntu RepoDepot initialization" + fi +elif [ "$IS_MARINER" -eq 1 ] || [ "$IS_AZURELINUX" -eq 1 ]; then + cloud-init status --wait + + marinerRepoDepotEndpoint="$(echo "${REPO_DEPOT_ENDPOINT}" | sed 's/\/ubuntu//')" + if [ -z "$marinerRepoDepotEndpoint" ]; then + >&2 echo "repo depot endpoint empty while running custom-cloud init script" + else + if [ "$IS_MARINER" -eq 1 ]; then + echo "Initializing Mariner repo depot settings..." + init_mariner_repo_depot ${marinerRepoDepotEndpoint} + dnf_makecache || exit 1 + else + echo "Initializing Azure Linux repo depot settings..." + init_azurelinux_repo_depot ${marinerRepoDepotEndpoint} + dnf_makecache || exit 1 + fi + fi +fi + +# Disable systemd-timesyncd and install chrony and uses local time source +# ACL has PTP clock config compiled into chronyd with no config file or sourcedir directives, +# so it uses only the local PTP clock and has no DHCP-injectable NTP sources. 
+if [ "$IS_ACL" -eq 1 ]; then + echo "Skipping chrony configuration for ACL (PTP clock baked into chronyd, no external NTP sources)" +elif [ "$IS_MARINER" -eq 1 ] || [ "$IS_AZURELINUX" -eq 1 ]; then +cat > /etc/chrony.conf < $chrony_conf < /etc/apt/sources.list.d/ubuntu.sources - -Types: deb -URIs: ${repodepot_endpoint}/ubuntu -Suites: ${VERSION_CODENAME} ${VERSION_CODENAME}-updates ${VERSION_CODENAME}-backports ${VERSION_CODENAME}-security -Components: main universe restricted multiverse -Signed-By: /usr/share/keyrings/ubuntu-archive-keyring.gpg -EOF - - # Update the apt sources file using the RepoDepot Ubuntu URL for this cloud. Update it by replacing - # all urls with the RepoDepot Ubuntu url - ubuntuUrl=${repodepot_endpoint}/ubuntu - echo "Converting URLs in $aptSourceFile to RepoDepot URLs..." - sed -i "s,https\?://.[^ ]*,$ubuntuUrl,g" $aptSourceFile - echo "apt source URLs converted, see new file below:" - echo "" - echo "-----" - cat $aptSourceFile - echo "-----" - echo "" -} - -function check_url { - local url=$1 - echo "Checking url: $url" - - # Use curl to check the URL and capture both stdout and stderr - curl_exit_code=$(curl -s --head --request GET $url) - # Check the exit status of curl - # shellcheck disable=SC3010 - if [[ $? -ne 0 ]] || echo "$curl_exit_code" | grep -E "404 Not Found" > /dev/null; then - echo "ERROR: $url is not available. Please manually check if the url is valid before re-running script" - exit 1 - fi -} - -function write_to_sources_file { - local sources_list_d_file=$1 - local source_uri=$2 - shift 2 - local key_paths=("$@") - - sources_file_path="/etc/apt/sources.list.d/${sources_list_d_file}.sources" - ubuntuDist=$(lsb_release -c | awk '{print $2}') - - tee -a $sources_file_path < /dev/null - echo "$key_name key added to keyring." 
-} - -function derive_key_paths { - local key_names=("$@") - local key_paths=() - - for key_name in "${key_names[@]}"; do - key_paths+=("/etc/apt/keyrings/${key_name}.gpg") - done - - echo "${key_paths[*]}" -} - -function add_ms_keys { - # Add the Microsoft package server keys to keyring. - echo "Adding Microsoft keys to keyring..." - - add_key_ubuntu microsoft.asc - add_key_ubuntu msopentech.asc -} - -function aptget_update { - echo "apt-get updating..." - echo "note: depending on how many sources have been added this may take a couple minutes..." - if apt-get update | grep -q "404 Not Found"; then - echo "ERROR: apt-get update failed to find all sources. Please validate the sources or remove bad sources from your sources and try again." - exit 1 - else - echo "apt-get update complete!" - fi -} - -function init_ubuntu_pmc_repo_depot { - local repodepot_endpoint="$1" - # Add Microsoft packages source to the azure specific sources.list. - echo "Adding the packages.microsoft.com Ubuntu-$ubuntuRel repo..." - - microsoftPackageSource="$repodepot_endpoint/microsoft/ubuntu/$ubuntuRel/prod" - check_url $microsoftPackageSource - write_to_sources_file microsoft-prod $microsoftPackageSource $(derive_key_paths microsoft.asc msopentech.asc) - write_to_sources_file microsoft-prod-testing $microsoftPackageSource $(derive_key_paths microsoft.asc msopentech.asc) - echo "Ubuntu ($ubuntuRel) repo added." 
- echo "Adding packages.microsoft.com keys" - add_ms_keys $repodepot_endpoint -} - -function init_mariner_repo_depot { - local repodepot_endpoint=$1 - echo "Adding [extended] repo" - cp /etc/yum.repos.d/mariner-extras.repo /etc/yum.repos.d/mariner-extended.repo - sed -i -e "s|extras|extended|" /etc/yum.repos.d/mariner-extended.repo - sed -i -e "s|Extras|Extended|" /etc/yum.repos.d/mariner-extended.repo - - echo "Adding [nvidia] repo" - cp /etc/yum.repos.d/mariner-extras.repo /etc/yum.repos.d/mariner-nvidia.repo - sed -i -e "s|extras|nvidia|" /etc/yum.repos.d/mariner-nvidia.repo - sed -i -e "s|Extras|Nvidia|" /etc/yum.repos.d/mariner-nvidia.repo - - echo "Adding [cloud-native] repo" - cp /etc/yum.repos.d/mariner-extras.repo /etc/yum.repos.d/mariner-cloud-native.repo - sed -i -e "s|extras|cloud-native|" /etc/yum.repos.d/mariner-cloud-native.repo - sed -i -e "s|Extras|Cloud-Native|" /etc/yum.repos.d/mariner-cloud-native.repo - - echo "Pointing Mariner repos at RepoDepot..." - for f in /etc/yum.repos.d/*.repo; do - sed -i -e "s|https://packages.microsoft.com|${repodepot_endpoint}/mariner/packages.microsoft.com|" $f - echo "$f modified." - done - echo "Mariner repo setup complete." -} - -function init_azurelinux_repo_depot { - local repodepot_endpoint=$1 - local repos=("amd" "base" "cloud-native" "extended" "ms-non-oss" "ms-oss" "nvidia") - - rm -f /etc/yum.repos.d/azurelinux* - - for repo in "${repos[@]}"; do - output_file="/etc/yum.repos.d/azurelinux-${repo}.repo" - repo_content=( - "[azurelinux-official-$repo]" - "name=Azure Linux Official $repo \$releasever \$basearch" - "baseurl=$repodepot_endpoint/azurelinux/\$releasever/prod/$repo/\$basearch" - "gpgkey=file:///etc/pki/rpm-gpg/MICROSOFT-RPM-GPG-KEY" - "gpgcheck=1" - "repo_gpgcheck=1" - "enabled=1" - "skip_if_unavailable=True" - "sslverify=1" - ) - - rm -f "$output_file" - - for line in "${repo_content[@]}"; do - echo "$line" >> "$output_file" - done - - echo "File '$output_file' has been created." 
- done - echo "Azure Linux repo setup complete." -} - -function dnf_makecache { - local retries=10 - local dnf_makecache_output=/tmp/dnf-makecache.out - local i - for i in $(seq 1 $retries); do - ! (dnf makecache -y 2>&1 | tee $dnf_makecache_output | grep -E "^([WE]:.*)|([eE]rr.*)$") && \ - cat $dnf_makecache_output && break || \ - cat $dnf_makecache_output - if [ $i -eq $retries ]; then - return 1 - else - sleep 5 - fi - done - echo "Executed dnf makecache -y $i times" -} - if [ "$IS_UBUNTU" -eq 1 ] || [ "$IS_MARINER" -eq 1 ] || [ "$IS_AZURELINUX" -eq 1 ]; then scriptPath=$0 # Determine an absolute, canonical path to this script for use in cron. @@ -504,139 +293,12 @@ EOF fi fi -if [ "$IS_UBUNTU" -eq 1 ]; then - rootRepoDepotEndpoint="$(echo "${REPO_DEPOT_ENDPOINT}" | sed 's/\/ubuntu//')" - if [ -n "$rootRepoDepotEndpoint" ]; then - cloud-init status --wait - ubuntuRel=$(lsb_release --release | awk '{print $2}') - ubuntuDist=$(lsb_release -c | awk '{print $2}') - init_ubuntu_main_repo_depot ${rootRepoDepotEndpoint} - init_ubuntu_pmc_repo_depot ${rootRepoDepotEndpoint} - echo "Running apt-get update" - aptget_update - else - echo "REPO_DEPOT_ENDPOINT empty, skipping Ubuntu RepoDepot initialization" - fi -elif [ "$IS_MARINER" -eq 1 ] || [ "$IS_AZURELINUX" -eq 1 ]; then - cloud-init status --wait - - marinerRepoDepotEndpoint="$(echo "${REPO_DEPOT_ENDPOINT}" | sed 's/\/ubuntu//')" - if [ -z "$marinerRepoDepotEndpoint" ]; then - >&2 echo "repo depot endpoint empty while running custom-cloud init script" - else - if [ "$IS_MARINER" -eq 1 ]; then - echo "Initializing Mariner repo depot settings..." - init_mariner_repo_depot ${marinerRepoDepotEndpoint} - dnf_makecache || exit 1 - else - echo "Initializing Azure Linux repo depot settings..." 
- init_azurelinux_repo_depot ${marinerRepoDepotEndpoint} - dnf_makecache || exit 1 - fi - fi -fi - -# Disable systemd-timesyncd and install chrony and uses local time source -# ACL has PTP clock config compiled into chronyd with no config file or sourcedir directives, -# so it uses only the local PTP clock and has no DHCP-injectable NTP sources. -if [ "$IS_ACL" -eq 1 ]; then - echo "Skipping chrony configuration for ACL (PTP clock baked into chronyd, no external NTP sources)" -elif [ "$IS_MARINER" -eq 1 ] || [ "$IS_AZURELINUX" -eq 1 ]; then -cat > /etc/chrony.conf < $chrony_conf < Date: Mon, 13 Apr 2026 11:27:13 -0700 Subject: [PATCH 16/26] feat(e2e): add RCV1P cert mode end-to-end tests Add e2e test infrastructure and scenarios to validate RCV1P (Root Certificate V1P) certificate endpoint mode across all supported Linux distros and Windows versions. Infrastructure changes: - Introduce ClusterInfra struct to decouple cluster lifecycle functions from the default Azure subscription, enabling per-subscription clients - Refactor ~20 functions in cluster.go, kube.go, and aks_model.go to accept ClusterInfra instead of hardcoding config.Azure - Add NewAzureClientForSubscription() to construct ARM clients for any subscription, replacing the single-subscription NewAzureClient() - Add CreateVMManagedIdentityInRG() for identity-only creation without blob storage (RCV1P subscription doesn't need shared storage) - Add ClusterRCV1PKubenet cached cluster function and RCV1P-specific resource group/identity cache entries Config and pipeline: - Add RCV1P_SUBSCRIPTION_ID env var to config, with lazy-init of RCV1PAzure client and helper functions - Pass RCV1P_SUBSCRIPTION_ID through e2e-template.yaml and e2e_run.sh - Add dedicated e2e-rcv1p.yaml pipeline with daily schedule Test scenarios: - Linux: Ubuntu 22.04, Ubuntu 24.04, AzureLinux V3, Flatcar, ACL - Windows: Server 2022, 23H2, 2025 - All tests skip gracefully when RCV1P_SUBSCRIPTION_ID is unset Validators: - 
ValidateRCV1PCertMode (Linux): checks provisioning log for rcv1p mode, verifies certs in /root/AzureCACertificates, validates distro-specific trust store updates, confirms cron/systemd refresh schedule - ValidateRCV1PCertModeWindows: checks C:\AzureCACertificates directory and scheduled refresh task Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .pipelines/e2e-rcv1p.yaml | 19 ++ .pipelines/scripts/e2e_run.sh | 2 + .pipelines/templates/e2e-template.yaml | 1 + e2e/aks_model.go | 29 +-- e2e/cache.go | 47 +++- e2e/cluster.go | 124 ++++++----- e2e/config/azure.go | 96 ++++---- e2e/config/config.go | 23 ++ e2e/kube.go | 14 +- e2e/scenario_rcv1p_test.go | 210 ++++++++++++++++++ e2e/scenario_rcv1p_win_test.go | 91 ++++++++ e2e/test_helpers.go | 37 ++- e2e/types.go | 68 ++++++ e2e/validators.go | 107 +++++++++ e2e/vmss.go | 37 +-- .../artifacts/init-aks-custom-cloud.sh | 3 +- .../artifacts/init_aks_custom_cloud_spec.sh | 2 +- staging/cse/windows/kubernetesfunc.ps1 | 3 +- 18 files changed, 752 insertions(+), 161 deletions(-) create mode 100644 .pipelines/e2e-rcv1p.yaml create mode 100644 e2e/scenario_rcv1p_test.go create mode 100644 e2e/scenario_rcv1p_win_test.go diff --git a/.pipelines/e2e-rcv1p.yaml b/.pipelines/e2e-rcv1p.yaml new file mode 100644 index 00000000000..5fdf9d3a5ee --- /dev/null +++ b/.pipelines/e2e-rcv1p.yaml @@ -0,0 +1,19 @@ +name: $(Date:yyyyMMdd)$(Rev:.r) +variables: + TAGS_TO_RUN: "rcv1pcertmode=true" + SKIP_E2E_TESTS: false + E2E_GO_TEST_TIMEOUT: "75m" +schedules: + - cron: "0 11 * * *" + displayName: Daily 3am PST + branches: + include: + - main + always: true +trigger: none +pr: none +jobs: + - template: ./templates/e2e-template.yaml + parameters: + name: RCV1P Cert Mode Tests + IgnoreScenariosWithMissingVhd: false diff --git a/.pipelines/scripts/e2e_run.sh b/.pipelines/scripts/e2e_run.sh index 1dcea264298..097fe250756 100644 --- a/.pipelines/scripts/e2e_run.sh +++ b/.pipelines/scripts/e2e_run.sh @@ -35,6 +35,7 @@ 
VHD_BUILD_ID="${VHD_BUILD_ID:-}" IGNORE_SCENARIOS_WITH_MISSING_VHD="${IGNORE_SCENARIOS_WITH_MISSING_VHD:-}" LOGGING_DIR="${LOGGING_DIR:-}" E2E_SUBSCRIPTION_ID="${E2E_SUBSCRIPTION_ID:-}" +RCV1P_SUBSCRIPTION_ID="${RCV1P_SUBSCRIPTION_ID:-}" ENABLE_SECURE_TLS_BOOTSTRAPPING="${ENABLE_SECURE_TLS_BOOTSTRAPPING:-true}" TAGS_TO_SKIP="${TAGS_TO_SKIP:-}" TAGS_TO_RUN="${TAGS_TO_RUN:-}" @@ -47,6 +48,7 @@ echo "VHD_BUILD_ID: ${VHD_BUILD_ID}" echo "IGNORE_SCENARIOS_WITH_MISSING_VHD: ${IGNORE_SCENARIOS_WITH_MISSING_VHD}" echo "LOGGING_DIR: ${LOGGING_DIR}" echo "E2E_SUBSCRIPTION_ID: ${E2E_SUBSCRIPTION_ID}" +echo "RCV1P_SUBSCRIPTION_ID: ${RCV1P_SUBSCRIPTION_ID}" echo "ENABLE_SECURE_TLS_BOOTSTRAPPING: ${ENABLE_SECURE_TLS_BOOTSTRAPPING}" echo "TAGS_TO_SKIP: ${TAGS_TO_SKIP}" echo "TAGS_TO_RUN: ${TAGS_TO_RUN}" diff --git a/.pipelines/templates/e2e-template.yaml b/.pipelines/templates/e2e-template.yaml index 3b4fad643d7..26d659f77ae 100644 --- a/.pipelines/templates/e2e-template.yaml +++ b/.pipelines/templates/e2e-template.yaml @@ -38,6 +38,7 @@ jobs: displayName: Run AgentBaker E2E env: E2E_SUBSCRIPTION_ID: $(E2E_SUBSCRIPTION_ID) + RCV1P_SUBSCRIPTION_ID: $(RCV1P_SUBSCRIPTION_ID) SYS_SSH_PUBLIC_KEY: $(SYS_SSH_PUBLIC_KEY) SYS_SSH_PRIVATE_KEY_B64: $(SYS_SSH_PRIVATE_KEY_B64) BUILD_SRC_DIR: $(System.DefaultWorkingDirectory) diff --git a/e2e/aks_model.go b/e2e/aks_model.go index f7e1a90c333..b8a670c43a9 100644 --- a/e2e/aks_model.go +++ b/e2e/aks_model.go @@ -299,22 +299,23 @@ func getFirewall(ctx context.Context, location, firewallSubnetID, publicIPID str } func addFirewallRules( - ctx context.Context, clusterModel *armcontainerservice.ManagedCluster, + ctx context.Context, infra *ClusterInfra, clusterModel *armcontainerservice.ManagedCluster, ) error { location := *clusterModel.Location defer toolkit.LogStepCtx(ctx, "adding firewall rules")() rg := *clusterModel.Properties.NodeResourceGroup - vnet, err := getClusterVNet(ctx, rg) + vnet, err := getClusterVNet(ctx, infra, rg) if err != nil { 
return err } - // For kubenet, the AKS-managed route table must stay attached so that pod - // routes (managed by cloud-provider-azure) and firewall routes coexist. - // For Azure CNI variants, the subnet may not have any route table, so we - // create and associate a dedicated one before adding the firewall routes. - aksSubnetResp, err := config.Azure.Subnet.Get(ctx, rg, vnet.name, "aks-subnet", nil) + // Find the AKS-managed route table currently associated with the subnet. + // We add firewall routes directly to this table so that both pod routes + // (managed by cloud-provider-azure) and firewall routes coexist. Creating + // a separate route table and swapping the subnet association disconnects + // the pod routes and breaks kubenet networking. + aksSubnetResp, err := infra.Azure.Subnet.Get(ctx, rg, vnet.name, "aks-subnet", nil) if err != nil { return fmt.Errorf("failed to get AKS subnet: %w", err) } @@ -332,7 +333,7 @@ func addFirewallRules( } toolkit.Logf(ctx, "Creating subnet %s in VNet %s", firewallSubnetName, vnet.name) - subnetPoller, err := config.Azure.Subnet.BeginCreateOrUpdate( + subnetPoller, err := infra.Azure.Subnet.BeginCreateOrUpdate( ctx, rg, vnet.name, @@ -365,7 +366,7 @@ func addFirewallRules( } toolkit.Logf(ctx, "Creating public IP %s", publicIPName) - pipPoller, err := config.Azure.PublicIPAddresses.BeginCreateOrUpdate( + pipPoller, err := infra.Azure.PublicIPAddresses.BeginCreateOrUpdate( ctx, rg, publicIPName, @@ -386,7 +387,7 @@ func addFirewallRules( firewallName := "abe2e-fw" firewall := getFirewall(ctx, location, firewallSubnetID, publicIPID) - fwPoller, err := config.Azure.AzureFirewall.BeginCreateOrUpdate(ctx, rg, firewallName, *firewall, nil) + fwPoller, err := infra.Azure.AzureFirewall.BeginCreateOrUpdate(ctx, rg, firewallName, *firewall, nil) if err != nil { return fmt.Errorf("failed to start Firewall creation: %w", err) } @@ -432,7 +433,7 @@ func addFirewallRules( for _, route := range firewallRoutes { toolkit.Logf(ctx, "Adding 
route %q to AKS route table %q", *route.Name, aksRTName) - poller, err := config.Azure.Routes.BeginCreateOrUpdate(ctx, rg, aksRTName, *route.Name, route, nil) + poller, err := infra.Azure.Routes.BeginCreateOrUpdate(ctx, rg, aksRTName, *route.Name, route, nil) if err != nil { return fmt.Errorf("failed to start adding route %q: %w", *route.Name, err) } @@ -510,7 +511,7 @@ func addPrivateAzureContainerRegistry(ctx context.Context, cluster *armcontainer if err := createPrivateAzureContainerRegistryPullSecret(ctx, cluster, kube, resourceGroupName, isNonAnonymousPull); err != nil { return fmt.Errorf("create private acr pull secret: %w", err) } - vnet, err := getClusterVNet(ctx, *cluster.Properties.NodeResourceGroup) + vnet, err := getClusterVNet(ctx, DefaultClusterInfra, *cluster.Properties.NodeResourceGroup) if err != nil { return err } @@ -531,7 +532,7 @@ func addNetworkIsolatedSettings(ctx context.Context, clusterModel *armcontainers location := *clusterModel.Location defer toolkit.LogStepCtx(ctx, fmt.Sprintf("Adding network settings for network isolated cluster %s in rg %s", *clusterModel.Name, *clusterModel.Properties.NodeResourceGroup)) - vnet, err := getClusterVNet(ctx, *clusterModel.Properties.NodeResourceGroup) + vnet, err := getClusterVNet(ctx, DefaultClusterInfra, *clusterModel.Properties.NodeResourceGroup) if err != nil { return err } @@ -678,7 +679,7 @@ func createPrivateAzureContainerRegistry(ctx context.Context, cluster *armcontai } // if ACR gets recreated so should the cluster toolkit.Logf(ctx, "Private ACR deleted, deleting cluster %s", *cluster.Name) - if err := deleteCluster(ctx, *cluster.Name, resourceGroup); err != nil { + if err := deleteCluster(ctx, DefaultClusterInfra, *cluster.Name, resourceGroup); err != nil { return fmt.Errorf("failed to delete cluster: %w", err) } } else { diff --git a/e2e/cache.go b/e2e/cache.go index 1b07d383815..777acdaf559 100644 --- a/e2e/cache.go +++ b/e2e/cache.go @@ -10,6 +10,7 @@ import ( 
"github.com/Azure/azure-sdk-for-go/sdk/azcore" "github.com/Azure/azure-sdk-for-go/sdk/azcore/to" "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute/v7" + "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/resources/armresources/v3" ) // cachedFunc creates a thread-safe memoized version of a function. @@ -150,56 +151,67 @@ func clusterLatestKubernetesVersion(ctx context.Context, request ClusterRequest) if err != nil { return nil, fmt.Errorf("getting latest kubernetes version cluster model: %w", err) } - return prepareCluster(ctx, model, false, false) + return prepareCluster(ctx, DefaultClusterInfra, model, false, false) } var ClusterKubenet = cachedFunc(clusterKubenet) // clusterKubenet creates a basic cluster using kubenet networking func clusterKubenet(ctx context.Context, request ClusterRequest) (*Cluster, error) { - return prepareCluster(ctx, getKubenetClusterModel("abe2e-kubenet-v4", request.Location, request.K8sSystemPoolSKU), false, false) + return prepareCluster(ctx, DefaultClusterInfra, getKubenetClusterModel("abe2e-kubenet-v4", request.Location, request.K8sSystemPoolSKU), false, false) } var ClusterAzureNetwork = cachedFunc(clusterAzureNetwork) // clusterAzureNetwork creates a cluster with Azure CNI networking func clusterAzureNetwork(ctx context.Context, request ClusterRequest) (*Cluster, error) { - return prepareCluster(ctx, getAzureNetworkClusterModel("abe2e-azure-network-v3", request.Location, request.K8sSystemPoolSKU), false, false) + return prepareCluster(ctx, DefaultClusterInfra, getAzureNetworkClusterModel("abe2e-azure-network-v3", request.Location, request.K8sSystemPoolSKU), false, false) } var ClusterAzureBootstrapProfileCache = cachedFunc(clusterAzureBootstrapProfileCache) // clusterAzureBootstrapProfileCache creates a cluster with bootstrap profile cache but without network isolation func clusterAzureBootstrapProfileCache(ctx context.Context, request ClusterRequest) (*Cluster, error) { - return prepareCluster(ctx, 
getAzureNetworkClusterModel("abe2e-azure-bootstrapprofile-cache-v1", request.Location, request.K8sSystemPoolSKU), false, true) + return prepareCluster(ctx, DefaultClusterInfra, getAzureNetworkClusterModel("abe2e-azure-bootstrapprofile-cache-v1", request.Location, request.K8sSystemPoolSKU), false, true) } var ClusterAzureNetworkIsolated = cachedFunc(clusterAzureNetworkIsolated) // clusterAzureNetworkIsolated creates a networkisolated Azure network cluster (no internet access) func clusterAzureNetworkIsolated(ctx context.Context, request ClusterRequest) (*Cluster, error) { - return prepareCluster(ctx, getAzureNetworkClusterModel("abe2e-azure-networkisolated-v1", request.Location, request.K8sSystemPoolSKU), true, false) + return prepareCluster(ctx, DefaultClusterInfra, getAzureNetworkClusterModel("abe2e-azure-networkisolated-v1", request.Location, request.K8sSystemPoolSKU), true, false) } var ClusterAzureOverlayNetwork = cachedFunc(clusterAzureOverlayNetwork) // clusterAzureOverlayNetwork creates a cluster with Azure CNI Overlay networking func clusterAzureOverlayNetwork(ctx context.Context, request ClusterRequest) (*Cluster, error) { - return prepareCluster(ctx, getAzureOverlayNetworkClusterModel("abe2e-azure-overlay-network-v3", request.Location, request.K8sSystemPoolSKU), false, false) + return prepareCluster(ctx, DefaultClusterInfra, getAzureOverlayNetworkClusterModel("abe2e-azure-overlay-network-v3", request.Location, request.K8sSystemPoolSKU), false, false) } var ClusterAzureOverlayNetworkDualStack = cachedFunc(clusterAzureOverlayNetworkDualStack) // clusterAzureOverlayNetworkDualStack creates a dual-stack (IPv4+IPv6) Azure CNI Overlay cluster func clusterAzureOverlayNetworkDualStack(ctx context.Context, request ClusterRequest) (*Cluster, error) { - return prepareCluster(ctx, getAzureOverlayNetworkDualStackClusterModel("abe2e-azure-overlay-dualstack-v3", request.Location, request.K8sSystemPoolSKU), false, false) + return prepareCluster(ctx, DefaultClusterInfra, 
getAzureOverlayNetworkDualStackClusterModel("abe2e-azure-overlay-dualstack-v3", request.Location, request.K8sSystemPoolSKU), false, false) } var ClusterCiliumNetwork = cachedFunc(clusterCiliumNetwork) // clusterCiliumNetwork creates a cluster with Cilium CNI networking func clusterCiliumNetwork(ctx context.Context, request ClusterRequest) (*Cluster, error) { - return prepareCluster(ctx, getCiliumNetworkClusterModel("abe2e-cilium-network-v3", request.Location, request.K8sSystemPoolSKU), false, false) + return prepareCluster(ctx, DefaultClusterInfra, getCiliumNetworkClusterModel("abe2e-cilium-network-v3", request.Location, request.K8sSystemPoolSKU), false, false) +} + +var ClusterRCV1PKubenet = cachedFunc(clusterRCV1PKubenet) + +// clusterRCV1PKubenet creates a kubenet cluster in the RCV1P subscription for cert mode testing. +func clusterRCV1PKubenet(ctx context.Context, request ClusterRequest) (*Cluster, error) { + infra := RCV1PClusterInfra() + if infra == nil { + return nil, fmt.Errorf("RCV1P_SUBSCRIPTION_ID not set, cannot create RCV1P cluster") + } + return prepareCluster(ctx, infra, getKubenetClusterModel("abe2e-rcv1p-kubenet-v1", request.Location, request.K8sSystemPoolSKU), false, false) } // isNotFoundErr checks if an error represents a "not found" response from Azure API @@ -228,6 +240,25 @@ var CachedEnsureResourceGroup = cachedFunc(ensureResourceGroup) var CachedCreateVMManagedIdentity = cachedFunc(config.Azure.CreateVMManagedIdentity) var CachedCompileAndUploadAKSNodeController = cachedFunc(compileAndUploadAKSNodeController) +// CachedRCV1PEnsureResourceGroup creates the resource group in the RCV1P subscription. +var CachedRCV1PEnsureResourceGroup = cachedFunc(ensureRCV1PResourceGroup) + +// CachedRCV1PCreateVMManagedIdentity creates a VM managed identity in the RCV1P subscription. 
+var CachedRCV1PCreateVMManagedIdentity = cachedFunc(func(ctx context.Context, location string) (string, error) { + if config.RCV1PAzure == nil { + return "", fmt.Errorf("RCV1P_SUBSCRIPTION_ID not set") + } + return config.RCV1PAzure.CreateVMManagedIdentityInRG(ctx, config.RCV1PResourceGroupName(location), location) +}) + +func ensureRCV1PResourceGroup(ctx context.Context, location string) (armresources.ResourceGroup, error) { + infra := RCV1PClusterInfra() + if infra == nil { + return armresources.ResourceGroup{}, fmt.Errorf("RCV1P_SUBSCRIPTION_ID not set") + } + return ensureResourceGroupWithInfra(ctx, infra, location) +} + // VMSizeSKURequest is the cache key for Resource SKU lookups by VM size and location. type VMSizeSKURequest struct { Location string diff --git a/e2e/cluster.go b/e2e/cluster.go index 238b8f7f544..7cb0e627c3f 100644 --- a/e2e/cluster.go +++ b/e2e/cluster.go @@ -64,14 +64,14 @@ func (c *Cluster) MaxPodsPerNode() (int, error) { // This function contains complex concurrent orchestration — keep it as // minimal as possible and push all non-trivial logic into the individual // task functions it calls. 
-func prepareCluster(ctx context.Context, clusterModel *armcontainerservice.ManagedCluster, isNetworkIsolated, attachPrivateAcr bool) (*Cluster, error) { +func prepareCluster(ctx context.Context, infra *ClusterInfra, clusterModel *armcontainerservice.ManagedCluster, isNetworkIsolated, attachPrivateAcr bool) (*Cluster, error) { defer toolkit.LogStepCtx(ctx, "preparing cluster")() ctx, cancel := context.WithTimeout(ctx, config.Config.TestTimeoutCluster) defer cancel() clusterModel.Name = to.Ptr(fmt.Sprintf("%s-%s", *clusterModel.Name, hash(clusterModel))) - cluster, err := getOrCreateCluster(ctx, clusterModel) + cluster, err := getOrCreateCluster(ctx, infra, clusterModel) if err != nil { return nil, fmt.Errorf("get or create cluster: %w", err) } @@ -82,11 +82,11 @@ func prepareCluster(ctx context.Context, clusterModel *armcontainerservice.Manag // finish before other subnet writes (firewall / network-isolated setup) // to avoid Azure VNet serialisation races. bastion := dag.Go(g, func(ctx context.Context) (*Bastion, error) { - return getOrCreateBastion(ctx, cluster) + return getOrCreateBastion(ctx, infra, cluster) }) dag.Run(g, func(ctx context.Context) error { return ensureMaintenanceConfiguration(ctx, cluster) }) - subnet := dag.Go(g, func(ctx context.Context) (string, error) { return getClusterSubnetID(ctx, cluster) }) - kube := dag.Go(g, func(ctx context.Context) (*Kubeclient, error) { return getClusterKubeClient(ctx, cluster) }) + subnet := dag.Go(g, func(ctx context.Context) (string, error) { return getClusterSubnetID(ctx, infra, cluster) }) + kube := dag.Go(g, func(ctx context.Context) (*Kubeclient, error) { return getClusterKubeClient(ctx, infra, cluster) }) identity := dag.Go(g, func(ctx context.Context) (*armcontainerservice.UserAssignedIdentity, error) { return getClusterKubeletIdentity(ctx, cluster) }) @@ -98,12 +98,12 @@ func prepareCluster(ctx context.Context, clusterModel *armcontainerservice.Manag // objects whose backing VMSS no longer exist. 
var networkDeps []dag.Dep if !isNetworkIsolated { - networkDeps = append(networkDeps, dag.Run(g, func(ctx context.Context) error { return addFirewallRules(ctx, cluster) }, bastion)) + networkDeps = append(networkDeps, dag.Run(g, func(ctx context.Context) error { return addFirewallRules(ctx, infra, cluster) }, bastion)) } if isNetworkIsolated { networkDeps = append(networkDeps, dag.Run(g, func(ctx context.Context) error { return addNetworkIsolatedSettings(ctx, cluster) }, bastion)) } - dag.Run1(g, kube, func(ctx context.Context, k *Kubeclient) error { return collectGarbageVMSS(ctx, cluster, k) }, networkDeps...) + dag.Run1(g, kube, func(ctx context.Context, k *Kubeclient) error { return collectGarbageVMSS(ctx, infra, cluster, k) }, networkDeps...) needACR := isNetworkIsolated || attachPrivateAcr acrNonAnon := dag.Run2(g, kube, identity, addACR(cluster, needACR, true)) acrAnon := dag.Run2(g, kube, identity, addACR(cluster, needACR, false)) @@ -113,6 +113,7 @@ func prepareCluster(ctx context.Context, clusterModel *armcontainerservice.Manag if err := g.Wait(); err != nil { return nil, fmt.Errorf("prepare cluster tasks: %w", err) } + return &Cluster{ Model: cluster, Kube: kube.MustGet(), @@ -235,9 +236,10 @@ func hash(cluster *armcontainerservice.ManagedCluster) string { return hexHash[:5] } -func getOrCreateCluster(ctx context.Context, cluster *armcontainerservice.ManagedCluster) (*armcontainerservice.ManagedCluster, error) { +func getOrCreateCluster(ctx context.Context, infra *ClusterInfra, cluster *armcontainerservice.ManagedCluster) (*armcontainerservice.ManagedCluster, error) { defer toolkit.LogStepCtxf(ctx, "get or create cluster %s", *cluster.Name)() - existingCluster, err := getExistingCluster(ctx, *cluster.Location, *cluster.Name) + rgName := infra.ResourceGroupName(*cluster.Location) + existingCluster, err := getExistingCluster(ctx, infra, rgName, *cluster.Name) if err != nil { return nil, fmt.Errorf("failed to get existing cluster %q: %w, and wont retry", 
*cluster.Name, err) } @@ -247,13 +249,12 @@ func getOrCreateCluster(ctx context.Context, cluster *armcontainerservice.Manage return existingCluster, nil } - return createNewAKSClusterWithRetry(ctx, cluster) + return createNewAKSClusterWithRetry(ctx, infra, rgName, cluster) } // isExistingCluster checks if an AKS cluster exists. return the cluster only if its provisioning state is Succeeded and can be used. non-nil error if not retriable -func getExistingCluster(ctx context.Context, location, clusterName string) (*armcontainerservice.ManagedCluster, error) { - resourceGroupName := config.ResourceGroupName(location) - existingCluster, err := config.Azure.AKS.Get(ctx, resourceGroupName, clusterName, nil) +func getExistingCluster(ctx context.Context, infra *ClusterInfra, resourceGroupName, clusterName string) (*armcontainerservice.ManagedCluster, error) { + existingCluster, err := infra.Azure.AKS.Get(ctx, resourceGroupName, clusterName, nil) var azErr *azcore.ResponseError if errors.As(err, &azErr) { if azErr.StatusCode == 404 { @@ -266,7 +267,7 @@ func getExistingCluster(ctx context.Context, location, clusterName string) (*arm switch *existingCluster.Properties.ProvisioningState { case "Succeeded": - nodeRGExists, err := isExistingResourceGroup(ctx, *existingCluster.Properties.NodeResourceGroup) + nodeRGExists, err := isExistingResourceGroup(ctx, infra, *existingCluster.Properties.NodeResourceGroup) if err != nil { return nil, err @@ -278,28 +279,28 @@ func getExistingCluster(ctx context.Context, location, clusterName string) (*arm fallthrough case "Failed": toolkit.Logf(ctx, "##vso[task.logissue type=warning;]Cluster %s in Failed state, deleting", clusterName) - if err := deleteCluster(ctx, clusterName, resourceGroupName); err != nil { + if err := deleteCluster(ctx, infra, clusterName, resourceGroupName); err != nil { return nil, err } // Wait for Azure to confirm cluster is fully deleted before allowing recreation. 
// This prevents "Reconcile managed identity credential failed" errors where Azure's // backend still has stale references to the old cluster during the new cluster's // identity reconciliation process. - if err := waitForClusterDeletion(ctx, clusterName, resourceGroupName); err != nil { + if err := waitForClusterDeletion(ctx, infra, clusterName, resourceGroupName); err != nil { return nil, fmt.Errorf("failed waiting for cluster deletion: %w", err) } return nil, nil default: // other provisioning state, deleting, , stopping,,cancaled,cancelling,"Creating", "Updating", "Scaling", "Migrating", "Upgrading", "Starting", "Restoring": .. plus many others. toolkit.Logf(ctx, "##vso[task.logissue type=warning;]Unexpected cluster provisioning state %s: %s", clusterName, *existingCluster.Properties.ProvisioningState) - return waitUntilClusterReady(ctx, clusterName, location) + return waitUntilClusterReady(ctx, infra, clusterName, resourceGroupName) } } -func deleteCluster(ctx context.Context, clusterName, resourceGroupName string) error { +func deleteCluster(ctx context.Context, infra *ClusterInfra, clusterName, resourceGroupName string) error { defer toolkit.LogStepCtxf(ctx, "deleting cluster %s", clusterName)() // beileih: why do we do this? 
- _, err := config.Azure.AKS.Get(ctx, resourceGroupName, clusterName, nil) + _, err := infra.Azure.AKS.Get(ctx, resourceGroupName, clusterName, nil) if err != nil { var azErr *azcore.ResponseError if errors.As(err, &azErr) && azErr.StatusCode == 404 { @@ -309,7 +310,7 @@ func deleteCluster(ctx context.Context, clusterName, resourceGroupName string) e return fmt.Errorf("failed to retrieve cluster while trying to delete it %q: %w", clusterName, err) } - pollerResp, err := config.Azure.AKS.BeginDelete(ctx, resourceGroupName, clusterName, nil) + pollerResp, err := infra.Azure.AKS.BeginDelete(ctx, resourceGroupName, clusterName, nil) if err != nil { return fmt.Errorf("failed to delete cluster %q: %w", clusterName, err) } @@ -320,9 +321,9 @@ func deleteCluster(ctx context.Context, clusterName, resourceGroupName string) e return nil } -func waitForClusterDeletion(ctx context.Context, clusterName, resourceGroupName string) error { +func waitForClusterDeletion(ctx context.Context, infra *ClusterInfra, clusterName, resourceGroupName string) error { return wait.PollUntilContextCancel(ctx, 5*time.Second, true, func(ctx context.Context) (bool, error) { - _, err := config.Azure.AKS.Get(ctx, resourceGroupName, clusterName, nil) + _, err := infra.Azure.AKS.Get(ctx, resourceGroupName, clusterName, nil) if err != nil { var azErr *azcore.ResponseError if errors.As(err, &azErr) && azErr.StatusCode == 404 { @@ -334,11 +335,11 @@ func waitForClusterDeletion(ctx context.Context, clusterName, resourceGroupName }) } -func waitUntilClusterReady(ctx context.Context, name, location string) (*armcontainerservice.ManagedCluster, error) { +func waitUntilClusterReady(ctx context.Context, infra *ClusterInfra, name, resourceGroupName string) (*armcontainerservice.ManagedCluster, error) { var cluster armcontainerservice.ManagedClustersClientGetResponse err := wait.PollUntilContextCancel(ctx, time.Second, true, func(ctx context.Context) (bool, error) { var err error - cluster, err = 
config.Azure.AKS.Get(ctx, config.ResourceGroupName(location), name, nil) + cluster, err = infra.Azure.AKS.Get(ctx, resourceGroupName, name, nil) if err != nil { return false, err } @@ -357,8 +358,8 @@ func waitUntilClusterReady(ctx context.Context, name, location string) (*armcont return &cluster.ManagedCluster, nil } -func isExistingResourceGroup(ctx context.Context, resourceGroupName string) (bool, error) { - rgExistence, err := config.Azure.ResourceGroup.CheckExistence(ctx, resourceGroupName, nil) +func isExistingResourceGroup(ctx context.Context, infra *ClusterInfra, resourceGroupName string) (bool, error) { + rgExistence, err := infra.Azure.ResourceGroup.CheckExistence(ctx, resourceGroupName, nil) if err != nil { return false, fmt.Errorf("failed to get RG %q: %w", resourceGroupName, err) } @@ -366,11 +367,11 @@ func isExistingResourceGroup(ctx context.Context, resourceGroupName string) (boo return rgExistence.Success, nil } -func createNewAKSCluster(ctx context.Context, cluster *armcontainerservice.ManagedCluster) (*armcontainerservice.ManagedCluster, error) { +func createNewAKSCluster(ctx context.Context, infra *ClusterInfra, rgName string, cluster *armcontainerservice.ManagedCluster) (*armcontainerservice.ManagedCluster, error) { // Note, it seems like the operation still can start a trigger a new operation even if nothing has changes - pollerResp, err := config.Azure.AKS.BeginCreateOrUpdate( + pollerResp, err := infra.Azure.AKS.BeginCreateOrUpdate( ctx, - config.ResourceGroupName(*cluster.Location), + rgName, *cluster.Name, *cluster, nil, @@ -391,16 +392,16 @@ func createNewAKSCluster(ctx context.Context, cluster *armcontainerservice.Manag // that retries creating a cluster if it fails with a 409 Conflict error // clusters are reused, and sometimes a cluster can be in UPDATING or DELETING state // simple retry should be sufficient to avoid such conflicts -func createNewAKSClusterWithRetry(ctx context.Context, cluster *armcontainerservice.ManagedCluster) 
(*armcontainerservice.ManagedCluster, error) { +func createNewAKSClusterWithRetry(ctx context.Context, infra *ClusterInfra, rgName string, cluster *armcontainerservice.ManagedCluster) (*armcontainerservice.ManagedCluster, error) { maxRetries := 10 retryInterval := 30 * time.Second var lastErr error for attempt := 0; attempt < maxRetries; attempt++ { if attempt > 0 { - toolkit.Logf(ctx, "Attempt %d: creating or updating cluster %s in region %s and rg %s", attempt+1, *cluster.Name, *cluster.Location, config.ResourceGroupName(*cluster.Location)) + toolkit.Logf(ctx, "Attempt %d: creating or updating cluster %s in region %s and rg %s", attempt+1, *cluster.Name, *cluster.Location, rgName) } - createdCluster, err := createNewAKSCluster(ctx, cluster) + createdCluster, err := createNewAKSCluster(ctx, infra, rgName, cluster) if err == nil { return createdCluster, nil } @@ -443,7 +444,8 @@ func ensureMaintenanceConfiguration(ctx context.Context, cluster *armcontainerse } func createNewMaintenanceConfiguration(ctx context.Context, cluster *armcontainerservice.ManagedCluster) (*armcontainerservice.MaintenanceConfiguration, error) { - toolkit.Logf(ctx, "creating maintenance configuration for cluster %s in rg %s", *cluster.Name, config.ResourceGroupName(*cluster.Location)) + rgName := config.ResourceGroupName(*cluster.Location) + toolkit.Logf(ctx, "creating maintenance configuration for cluster %s in rg %s", *cluster.Name, rgName) maintenance := armcontainerservice.MaintenanceConfiguration{ Properties: &armcontainerservice.MaintenanceConfigurationProperties{ MaintenanceWindow: &armcontainerservice.MaintenanceWindow{ @@ -465,7 +467,7 @@ func createNewMaintenanceConfiguration(ctx context.Context, cluster *armcontaine }, } - _, err := config.Azure.Maintenance.CreateOrUpdate(ctx, config.ResourceGroupName(*cluster.Location), *cluster.Name, "default", maintenance, nil) + _, err := config.Azure.Maintenance.CreateOrUpdate(ctx, rgName, *cluster.Name, "default", maintenance, nil) if err != 
nil { return nil, fmt.Errorf("failed to create maintenance configuration: %w", err) } @@ -473,23 +475,23 @@ func createNewMaintenanceConfiguration(ctx context.Context, cluster *armcontaine return &maintenance, nil } -func getOrCreateBastion(ctx context.Context, cluster *armcontainerservice.ManagedCluster) (*Bastion, error) { +func getOrCreateBastion(ctx context.Context, infra *ClusterInfra, cluster *armcontainerservice.ManagedCluster) (*Bastion, error) { nodeRG := *cluster.Properties.NodeResourceGroup bastionName := fmt.Sprintf("%s-bastion", *cluster.Name) - existing, err := config.Azure.BastionHosts.Get(ctx, nodeRG, bastionName, nil) + existing, err := infra.Azure.BastionHosts.Get(ctx, nodeRG, bastionName, nil) var azErr *azcore.ResponseError if errors.As(err, &azErr) && azErr.StatusCode == http.StatusNotFound { - return createNewBastion(ctx, cluster) + return createNewBastion(ctx, infra, cluster) } if err != nil { return nil, fmt.Errorf("failed to get bastion %q in rg %q: %w", bastionName, nodeRG, err) } - return NewBastion(config.Azure.Credential, config.Config.SubscriptionID, nodeRG, *existing.BastionHost.Properties.DNSName), nil + return NewBastion(infra.Azure.Credential, infra.SubscriptionID, nodeRG, *existing.BastionHost.Properties.DNSName), nil } -func createNewBastion(ctx context.Context, cluster *armcontainerservice.ManagedCluster) (*Bastion, error) { +func createNewBastion(ctx context.Context, infra *ClusterInfra, cluster *armcontainerservice.ManagedCluster) (*Bastion, error) { nodeRG := *cluster.Properties.NodeResourceGroup location := *cluster.Location bastionName := fmt.Sprintf("%s-bastion", *cluster.Name) @@ -497,7 +499,7 @@ func createNewBastion(ctx context.Context, cluster *armcontainerservice.ManagedC publicIPName := fmt.Sprintf("%s-bastion-pip", *cluster.Name) publicIPName = sanitizeAzureResourceName(publicIPName) - vnet, err := getClusterVNet(ctx, nodeRG) + vnet, err := getClusterVNet(ctx, infra, nodeRG) if err != nil { return nil, 
fmt.Errorf("get cluster vnet in rg %q: %w", nodeRG, err) } @@ -511,7 +513,7 @@ func createNewBastion(ctx context.Context, cluster *armcontainerservice.ManagedC } var bastionSubnetID string - bastionSubnet, subnetGetErr := config.Azure.Subnet.Get(ctx, nodeRG, vnet.name, bastionSubnetName, nil) + bastionSubnet, subnetGetErr := infra.Azure.Subnet.Get(ctx, nodeRG, vnet.name, bastionSubnetName, nil) if subnetGetErr != nil { var subnetAzErr *azcore.ResponseError if !errors.As(subnetGetErr, &subnetAzErr) || subnetAzErr.StatusCode != http.StatusNotFound { @@ -524,7 +526,7 @@ func createNewBastion(ctx context.Context, cluster *armcontainerservice.ManagedC AddressPrefix: to.Ptr(bastionSubnetPrefix), }, } - subnetPoller, err := config.Azure.Subnet.BeginCreateOrUpdate(ctx, nodeRG, vnet.name, bastionSubnetName, subnetParams, nil) + subnetPoller, err := infra.Azure.Subnet.BeginCreateOrUpdate(ctx, nodeRG, vnet.name, bastionSubnetName, subnetParams, nil) if err != nil { return nil, fmt.Errorf("failed to start creating bastion subnet: %w", err) } @@ -549,7 +551,7 @@ func createNewBastion(ctx context.Context, cluster *armcontainerservice.ManagedC } toolkit.Logf(ctx, "creating bastion public IP %s (rg %s)", publicIPName, nodeRG) - pipPoller, err := config.Azure.PublicIPAddresses.BeginCreateOrUpdate(ctx, nodeRG, publicIPName, pipParams, nil) + pipPoller, err := infra.Azure.PublicIPAddresses.BeginCreateOrUpdate(ctx, nodeRG, publicIPName, pipParams, nil) if err != nil { return nil, fmt.Errorf("failed to start creating bastion public IP: %w", err) } @@ -586,7 +588,7 @@ func createNewBastion(ctx context.Context, cluster *armcontainerservice.ManagedC } toolkit.Logf(ctx, "creating bastion %s (native client/tunneling enabled) in rg %s", bastionName, nodeRG) - bastionPoller, err := config.Azure.BastionHosts.BeginCreateOrUpdate(ctx, nodeRG, bastionName, bastionHost, nil) + bastionPoller, err := infra.Azure.BastionHosts.BeginCreateOrUpdate(ctx, nodeRG, bastionName, bastionHost, nil) if err != 
nil { return nil, fmt.Errorf("failed to start creating bastion: %w", err) } @@ -595,23 +597,23 @@ func createNewBastion(ctx context.Context, cluster *armcontainerservice.ManagedC return nil, fmt.Errorf("failed to create bastion: %w", err) } - bastion := NewBastion(config.Azure.Credential, config.Config.SubscriptionID, nodeRG, *resp.BastionHost.Properties.DNSName) + bastion := NewBastion(infra.Azure.Credential, infra.SubscriptionID, nodeRG, *resp.BastionHost.Properties.DNSName) - if err := verifyBastion(ctx, cluster, bastion); err != nil { + if err := verifyBastion(ctx, infra, cluster, bastion); err != nil { return nil, fmt.Errorf("failed to verify bastion: %w", err) } return bastion, nil } -func verifyBastion(ctx context.Context, cluster *armcontainerservice.ManagedCluster, bastion *Bastion) error { +func verifyBastion(ctx context.Context, infra *ClusterInfra, cluster *armcontainerservice.ManagedCluster, bastion *Bastion) error { nodeRG := *cluster.Properties.NodeResourceGroup - vmssName, err := getSystemPoolVMSSName(ctx, cluster) + vmssName, err := getSystemPoolVMSSName(ctx, infra, cluster) if err != nil { return err } var vmssVM *armcompute.VirtualMachineScaleSetVM - pager := config.Azure.VMSSVM.NewListPager(nodeRG, vmssName, nil) + pager := infra.Azure.VMSSVM.NewListPager(nodeRG, vmssName, nil) if pager.More() { page, err := pager.NextPage(ctx) if err != nil { @@ -622,7 +624,7 @@ func verifyBastion(ctx context.Context, cluster *armcontainerservice.ManagedClus } } - vmPrivateIP, err := getPrivateIPFromVMSSVM(ctx, nodeRG, vmssName, *vmssVM.InstanceID) + vmPrivateIP, err := getPrivateIPFromVMSSVMWithClient(ctx, infra.Azure, nodeRG, vmssName, *vmssVM.InstanceID) ctx, cancel := context.WithCancel(ctx) defer cancel() @@ -644,7 +646,7 @@ func verifyBastion(ctx context.Context, cluster *armcontainerservice.ManagedClus return fmt.Errorf("Executed ssh on wrong VM, Expected %s: %s", vmssName, result.stdout) } -func getSystemPoolVMSSName(ctx context.Context, cluster 
*armcontainerservice.ManagedCluster) (string, error) { +func getSystemPoolVMSSName(ctx context.Context, infra *ClusterInfra, cluster *armcontainerservice.ManagedCluster) (string, error) { nodeRG := *cluster.Properties.NodeResourceGroup var systemPoolName string for _, pool := range cluster.Properties.AgentPoolProfiles { @@ -652,7 +654,7 @@ func getSystemPoolVMSSName(ctx context.Context, cluster *armcontainerservice.Man systemPoolName = *pool.Name } } - pager := config.Azure.VMSS.NewListPager(nodeRG, nil) + pager := infra.Azure.VMSS.NewListPager(nodeRG, nil) if pager.More() { page, err := pager.NextPage(ctx) if err != nil { @@ -684,8 +686,8 @@ type VNet struct { subnetId string } -func getClusterVNet(ctx context.Context, mcResourceGroupName string) (VNet, error) { - pager := config.Azure.VNet.NewListPager(mcResourceGroupName, nil) +func getClusterVNet(ctx context.Context, infra *ClusterInfra, mcResourceGroupName string) (VNet, error) { + pager := infra.Azure.VNet.NewListPager(mcResourceGroupName, nil) for pager.More() { nextResult, err := pager.NextPage(ctx) if err != nil { @@ -701,13 +703,13 @@ func getClusterVNet(ctx context.Context, mcResourceGroupName string) (VNet, erro return VNet{}, fmt.Errorf("failed to find aks vnet") } -func collectGarbageVMSS(ctx context.Context, cluster *armcontainerservice.ManagedCluster, kube *Kubeclient) error { +func collectGarbageVMSS(ctx context.Context, infra *ClusterInfra, cluster *armcontainerservice.ManagedCluster, kube *Kubeclient) error { defer toolkit.LogStepCtx(ctx, "collecting garbage VMSS")() rg := *cluster.Properties.NodeResourceGroup // Build a set of all existing VMSS names while deleting old ones. 
existingVMSS := map[string]struct{}{} - pager := config.Azure.VMSS.NewListPager(rg, nil) + pager := infra.Azure.VMSS.NewListPager(rg, nil) for pager.More() { page, err := pager.NextPage(ctx) if err != nil { @@ -730,7 +732,7 @@ func collectGarbageVMSS(ctx context.Context, cluster *armcontainerservice.Manage continue } - _, err := config.Azure.VMSS.BeginDelete(ctx, rg, *vmss.Name, &armcompute.VirtualMachineScaleSetsClientBeginDeleteOptions{ + _, err := infra.Azure.VMSS.BeginDelete(ctx, rg, *vmss.Name, &armcompute.VirtualMachineScaleSetsClientBeginDeleteOptions{ ForceDeletion: to.Ptr(true), }) if err != nil { @@ -790,8 +792,12 @@ func collectGarbageNodes(ctx context.Context, kube *Kubeclient, existingVMSS map } func ensureResourceGroup(ctx context.Context, location string) (armresources.ResourceGroup, error) { - resourceGroupName := config.ResourceGroupName(location) - rg, err := config.Azure.ResourceGroup.CreateOrUpdate( + return ensureResourceGroupWithInfra(ctx, DefaultClusterInfra, location) +} + +func ensureResourceGroupWithInfra(ctx context.Context, infra *ClusterInfra, location string) (armresources.ResourceGroup, error) { + resourceGroupName := infra.ResourceGroupName(location) + rg, err := infra.Azure.ResourceGroup.CreateOrUpdate( ctx, resourceGroupName, armresources.ResourceGroup{ diff --git a/e2e/config/azure.go b/e2e/config/azure.go index d0de6f04619..847db25a269 100644 --- a/e2e/config/azure.go +++ b/e2e/config/azure.go @@ -117,6 +117,10 @@ func NewHttpClient() *http.Client { } func NewAzureClient() (*AzureClient, error) { + return NewAzureClientForSubscription(Config.SubscriptionID) +} + +func NewAzureClientForSubscription(subscriptionID string) (*AzureClient, error) { httpClient := NewHttpClient() logger := runtime.NewLogPolicy(&policy.LogOptions{ IncludeBody: true, @@ -155,193 +159,183 @@ func NewAzureClient() (*AzureClient, error) { return nil, fmt.Errorf("create core client: %w", err) } - cloud.PublicIPAddresses, err = 
armnetwork.NewPublicIPAddressesClient(Config.SubscriptionID, credential, opts) + cloud.PublicIPAddresses, err = armnetwork.NewPublicIPAddressesClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("create public ip addresses client: %w", err) } - cloud.BastionHosts, err = armnetwork.NewBastionHostsClient(Config.SubscriptionID, credential, opts) - if err != nil { - return nil, fmt.Errorf("create bastion hosts client: %w", err) - } - - cloud.BastionHosts, err = armnetwork.NewBastionHostsClient(Config.SubscriptionID, credential, opts) + cloud.BastionHosts, err = armnetwork.NewBastionHostsClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("create bastion hosts client: %w", err) } - cloud.RegistriesClient, err = armcontainerregistry.NewRegistriesClient(Config.SubscriptionID, credential, opts) + cloud.RegistriesClient, err = armcontainerregistry.NewRegistriesClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("failed to create registry client: %w", err) } - cloud.CacheRulesClient, err = armcontainerregistry.NewCacheRulesClient(Config.SubscriptionID, credential, opts) + cloud.CacheRulesClient, err = armcontainerregistry.NewCacheRulesClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("failed to create cache rules client: %w", err) } - cloud.PrivateEndpointClient, err = armnetwork.NewPrivateEndpointsClient(Config.SubscriptionID, credential, opts) + cloud.PrivateEndpointClient, err = armnetwork.NewPrivateEndpointsClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("failed to create private endpoint client: %w", err) } - cloud.PrivateZonesClient, err = armprivatedns.NewPrivateZonesClient(Config.SubscriptionID, credential, opts) + cloud.PrivateZonesClient, err = armprivatedns.NewPrivateZonesClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("failed to create private dns zones client: %w", err) } - 
cloud.VirutalNetworkLinksClient, err = armprivatedns.NewVirtualNetworkLinksClient(Config.SubscriptionID, credential, opts) + cloud.VirutalNetworkLinksClient, err = armprivatedns.NewVirtualNetworkLinksClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("failed to create virtual network links client: %w", err) } - cloud.RecordSetClient, err = armprivatedns.NewRecordSetsClient(Config.SubscriptionID, credential, opts) + cloud.RecordSetClient, err = armprivatedns.NewRecordSetsClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("failed to create record set client: %w", err) } - cloud.PrivateDNSZoneGroup, err = armnetwork.NewPrivateDNSZoneGroupsClient(Config.SubscriptionID, credential, opts) + cloud.PrivateDNSZoneGroup, err = armnetwork.NewPrivateDNSZoneGroupsClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("failed to create private dns zone group client: %w", err) } - cloud.SecurityGroup, err = armnetwork.NewSecurityGroupsClient(Config.SubscriptionID, credential, opts) + cloud.SecurityGroup, err = armnetwork.NewSecurityGroupsClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("create security group client: %w", err) } - cloud.Subnet, err = armnetwork.NewSubnetsClient(Config.SubscriptionID, credential, opts) + cloud.Subnet, err = armnetwork.NewSubnetsClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("create subnet client: %w", err) } - cloud.RouteTables, err = armnetwork.NewRouteTablesClient(Config.SubscriptionID, credential, opts) + cloud.RouteTables, err = armnetwork.NewRouteTablesClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("create route tables client: %w", err) } - cloud.Routes, err = armnetwork.NewRoutesClient(Config.SubscriptionID, credential, opts) + cloud.Routes, err = armnetwork.NewRoutesClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("create routes 
client: %w", err) } - cloud.AKS, err = armcontainerservice.NewManagedClustersClient(Config.SubscriptionID, credential, opts) + cloud.AKS, err = armcontainerservice.NewManagedClustersClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("create aks client: %w", err) } - cloud.Maintenance, err = armcontainerservice.NewMaintenanceConfigurationsClient(Config.SubscriptionID, credential, opts) + cloud.Maintenance, err = armcontainerservice.NewMaintenanceConfigurationsClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("failed to create maintenance client: %w", err) } - cloud.NetworkInterfaces, err = armnetwork.NewInterfacesClient(Config.SubscriptionID, credential, opts) + cloud.NetworkInterfaces, err = armnetwork.NewInterfacesClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("create network interfaces client: %w", err) } - cloud.VMSS, err = armcompute.NewVirtualMachineScaleSetsClient(Config.SubscriptionID, credential, opts) + cloud.VMSS, err = armcompute.NewVirtualMachineScaleSetsClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("create vmss client: %w", err) } - cloud.VMSSVM, err = armcompute.NewVirtualMachineScaleSetVMsClient(Config.SubscriptionID, credential, opts) + cloud.VMSSVM, err = armcompute.NewVirtualMachineScaleSetVMsClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("create vmss vm client: %w", err) } - cloud.VMs, err = armcompute.NewVirtualMachinesClient(Config.SubscriptionID, credential, opts) + cloud.VMs, err = armcompute.NewVirtualMachinesClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("create vms client: %w", err) } - cloud.Images, err = armcompute.NewImagesClient(Config.SubscriptionID, credential, opts) + cloud.Images, err = armcompute.NewImagesClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("create images client: %w", err) } - cloud.Snapshots, err = 
armcompute.NewSnapshotsClient(Config.SubscriptionID, credential, opts) + cloud.Snapshots, err = armcompute.NewSnapshotsClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("create snapshots client: %w", err) } - cloud.GalleryImages, err = armcompute.NewGalleryImagesClient(Config.SubscriptionID, credential, opts) + cloud.GalleryImages, err = armcompute.NewGalleryImagesClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("create gallery images client: %w", err) } - cloud.GalleryImageVersions, err = armcompute.NewGalleryImageVersionsClient(Config.SubscriptionID, credential, opts) + cloud.GalleryImageVersions, err = armcompute.NewGalleryImageVersionsClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("create gallery image versions client: %w", err) } - cloud.Resource, err = armresources.NewClient(Config.SubscriptionID, credential, opts) + cloud.Resource, err = armresources.NewClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("create resource client: %w", err) } - cloud.ResourceGroup, err = armresources.NewResourceGroupsClient(Config.SubscriptionID, credential, opts) + cloud.ResourceGroup, err = armresources.NewResourceGroupsClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("create resource group client: %w", err) } - cloud.VNet, err = armnetwork.NewVirtualNetworksClient(Config.SubscriptionID, credential, opts) + cloud.VNet, err = armnetwork.NewVirtualNetworksClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("create vnet client: %w", err) } - cloud.AzureFirewall, err = armnetwork.NewAzureFirewallsClient(Config.SubscriptionID, credential, opts) + cloud.AzureFirewall, err = armnetwork.NewAzureFirewallsClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("create firewall client: %w", err) } - cloud.PublicIPAddresses, err = 
armnetwork.NewPublicIPAddressesClient(Config.SubscriptionID, credential, opts) - if err != nil { - return nil, fmt.Errorf("create public ip addresses client: %w", err) - } - cloud.Blob, err = azblob.NewClient(Config.BlobStorageAccountURL(), credential, nil) if err != nil { return nil, fmt.Errorf("create blob container client: %w", err) } - cloud.StorageContainers, err = armstorage.NewBlobContainersClient(Config.SubscriptionID, credential, opts) + cloud.StorageContainers, err = armstorage.NewBlobContainersClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("create blob container client: %w", err) } - cloud.RoleAssignments, err = armauthorization.NewRoleAssignmentsClient(Config.SubscriptionID, credential, opts) + cloud.RoleAssignments, err = armauthorization.NewRoleAssignmentsClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("create role assignment client: %w", err) } - cloud.UserAssignedIdentities, err = armmsi.NewUserAssignedIdentitiesClient(Config.SubscriptionID, credential, nil) + cloud.UserAssignedIdentities, err = armmsi.NewUserAssignedIdentitiesClient(subscriptionID, credential, nil) if err != nil { return nil, fmt.Errorf("create user assigned identities client: %w", err) } - cloud.StorageAccounts, err = armstorage.NewAccountsClient(Config.SubscriptionID, credential, nil) + cloud.StorageAccounts, err = armstorage.NewAccountsClient(subscriptionID, credential, nil) if err != nil { return nil, fmt.Errorf("create storage accounts client: %w", err) } - cloud.VMSSVMRunCommands, err = armcompute.NewVirtualMachineScaleSetVMRunCommandsClient(Config.SubscriptionID, credential, opts) + cloud.VMSSVMRunCommands, err = armcompute.NewVirtualMachineScaleSetVMRunCommandsClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("create vmss vm run command client: %w", err) } - cloud.VMExtensionImages, err = armcompute.NewVirtualMachineExtensionImagesClient(Config.SubscriptionID, credential, opts) + 
cloud.VMExtensionImages, err = armcompute.NewVirtualMachineExtensionImagesClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("create vm extension images client: %w", err) } - cloud.ResourceSKUs, err = armcompute.NewResourceSKUsClient(Config.SubscriptionID, credential, opts) + cloud.ResourceSKUs, err = armcompute.NewResourceSKUsClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("create resource skus client: %w", err) } // Ensure the gallery exists - cloud.Galleries, err = armcompute.NewGalleriesClient(Config.SubscriptionID, credential, opts) + cloud.Galleries, err = armcompute.NewGalleriesClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("failed to create galleries client: %w", err) } @@ -419,6 +413,18 @@ func (a *AzureClient) CreateVMManagedIdentity(ctx context.Context, identityLocat return *identity.Properties.ClientID, nil } +// CreateVMManagedIdentityInRG creates a VM managed identity in the specified resource group +// without creating blob storage infrastructure (which belongs to the default subscription). 
+func (a *AzureClient) CreateVMManagedIdentityInRG(ctx context.Context, resourceGroupName, location string) (string, error) { + identity, err := a.UserAssignedIdentities.CreateOrUpdate(ctx, resourceGroupName, VMIdentityName, armmsi.Identity{ + Location: to.Ptr(location), + }, nil) + if err != nil { + return "", fmt.Errorf("create managed identity in RG %s: %w", resourceGroupName, err) + } + return *identity.Properties.ClientID, nil +} + func (a *AzureClient) createBlobStorageAccount(ctx context.Context) error { poller, err := a.StorageAccounts.BeginCreate(ctx, ResourceGroupName(Config.DefaultLocation), Config.BlobStorageAccount(), armstorage.AccountCreateParameters{ Kind: to.Ptr(armstorage.KindStorageV2), diff --git a/e2e/config/config.go b/e2e/config/config.go index d61db484c6e..bd3f9c677c2 100644 --- a/e2e/config/config.go +++ b/e2e/config/config.go @@ -29,6 +29,10 @@ var ( Azure = mustNewAzureClient() VMIdentityName = "abe2e-vm-identity" + // RCV1PAzure is created eagerly by this package's init() when RCV1PSubscriptionID is set and resolved; it is nil otherwise. + // It provides Azure clients bound to the PlatformSettingsOverride-registered subscription.
+ RCV1PAzure *AzureClient + DefaultPollUntilDoneOptions = &runtime.PollUntilDoneOptions{ Frequency: time.Second, } @@ -40,6 +44,14 @@ func ResourceGroupName(location string) string { return "abe2e-" + location } +func RCV1PResourceGroupName(location string) string { + return "abe2e-rcv1p-" + location +} + +func (c *Configuration) RCV1PVMIdentityResourceID(location string) string { + return fmt.Sprintf("/subscriptions/%s/resourceGroups/%s/providers/Microsoft.ManagedIdentity/userAssignedIdentities/%s", c.RCV1PSubscriptionID, RCV1PResourceGroupName(location), VMIdentityName) +} + func PrivateACRNameNotAnon(location string) string { return "abe2eprivatenonanon" + location // will have anonymous pull enabled } @@ -88,6 +100,7 @@ type Configuration struct { TestTimeoutCluster time.Duration `env:"TEST_TIMEOUT_CLUSTER" envDefault:"20m"` TestTimeoutVMSS time.Duration `env:"TEST_TIMEOUT_VMSS" envDefault:"17m"` WindowsAdminPassword string `env:"WINDOWS_ADMIN_PASSWORD"` + RCV1PSubscriptionID string `env:"RCV1P_SUBSCRIPTION_ID"` } func (c *Configuration) BlobStorageAccount() string { @@ -169,6 +182,16 @@ func mustLoadConfig() *Configuration { return cfg } +func init() { + if Config.RCV1PSubscriptionID != "" && !strings.HasPrefix(Config.RCV1PSubscriptionID, "$(") { + client, err := NewAzureClientForSubscription(Config.RCV1PSubscriptionID) + if err != nil { + panic(fmt.Sprintf("failed to create RCV1P Azure client: %v", err)) + } + RCV1PAzure = client + } +} + // Returns a newly generated RSA public/private key pair with the private key in PEM format. 
func mustGetNewRSAKeyPair() ([]byte, []byte, string) { // Generate new key pair diff --git a/e2e/kube.go b/e2e/kube.go index 87a260d4b4a..de18eb8837f 100644 --- a/e2e/kube.go +++ b/e2e/kube.go @@ -39,10 +39,10 @@ const ( podNetworkDebugAppLabel = "debugnonhost-mariner-tolerated" ) -func getClusterKubeClient(ctx context.Context, cluster *armcontainerservice.ManagedCluster) (*Kubeclient, error) { - resourceGroupName := config.ResourceGroupName(*cluster.Location) +func getClusterKubeClient(ctx context.Context, infra *ClusterInfra, cluster *armcontainerservice.ManagedCluster) (*Kubeclient, error) { + resourceGroupName := infra.ResourceGroupName(*cluster.Location) clusterName := *cluster.Name - data, err := getClusterKubeconfigBytes(ctx, resourceGroupName, clusterName) + data, err := getClusterKubeconfigBytes(ctx, infra, resourceGroupName, clusterName) if err != nil { return nil, fmt.Errorf("get cluster kubeconfig bytes: %w", err) } @@ -276,8 +276,8 @@ func logPodDebugInfo(ctx context.Context, kube *Kubeclient, pod *corev1.Pod) { toolkit.Log(ctx, string(info)) } -func getClusterKubeconfigBytes(ctx context.Context, resourceGroupName, clusterName string) ([]byte, error) { - credentialList, err := config.Azure.AKS.ListClusterAdminCredentials(ctx, resourceGroupName, clusterName, nil) +func getClusterKubeconfigBytes(ctx context.Context, infra *ClusterInfra, resourceGroupName, clusterName string) ([]byte, error) { + credentialList, err := infra.Azure.AKS.ListClusterAdminCredentials(ctx, resourceGroupName, clusterName, nil) if err != nil { return nil, fmt.Errorf("list cluster admin credentials: %w", err) } @@ -445,9 +445,9 @@ func daemonsetDebug(ctx context.Context, deploymentName, targetNodeLabel, privat } } -func getClusterSubnetID(ctx context.Context, cluster *armcontainerservice.ManagedCluster) (string, error) { +func getClusterSubnetID(ctx context.Context, infra *ClusterInfra, cluster *armcontainerservice.ManagedCluster) (string, error) { mcResourceGroupName := 
*cluster.Properties.NodeResourceGroup - pager := config.Azure.VNet.NewListPager(mcResourceGroupName, nil) + pager := infra.Azure.VNet.NewListPager(mcResourceGroupName, nil) for pager.More() { nextResult, err := pager.NextPage(ctx) if err != nil { diff --git a/e2e/scenario_rcv1p_test.go b/e2e/scenario_rcv1p_test.go new file mode 100644 index 00000000000..817b63cba8e --- /dev/null +++ b/e2e/scenario_rcv1p_test.go @@ -0,0 +1,210 @@ +// scenario_rcv1p_test.go contains end-to-end tests for the RCV1P (Root Certificate V1P) cert mode +// on Linux distros. RCV1P is the next-generation mechanism for distributing Azure root CA certificates +// to AKS nodes. Instead of relying on hardcoded certificate bundles, RCV1P queries the Azure wireserver +// at provisioning time to download the latest root certificates and installs them into the OS trust store. +// +// These tests require: +// - A dedicated subscription (RCV1P_SUBSCRIPTION_ID) with the Microsoft.Compute/PlatformSettingsOverride +// feature flag registered, which enables the wireserver certificate endpoint. +// - The VM opt-in tag "platformsettings.host_environment.service.platform_optedin_for_rootcerts=true" +// on each VMSS, which tells wireserver to serve certificates to this specific VM. +// +// Both conditions must be met: the subscription feature enables the endpoint, and the VM tag grants +// per-VM access. Without the tag, wireserver returns IsOptedInForRootCerts=false. +// +// The positive tests (Test_RCV1P_<distro>) verify that certificates are downloaded, installed into +// the distro-specific trust store, and a refresh schedule is created. The negative test +// (Test_RCV1P_NotOptedIn) verifies that omitting the VM tag correctly prevents cert installation.
+package e2e + +import ( + "context" + "strings" + "testing" + + "github.com/Azure/agentbaker/e2e/config" + "github.com/Azure/agentbaker/pkg/agent/datamodel" + "github.com/Azure/azure-sdk-for-go/sdk/azcore/to" + "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute/v7" +) + +// rcv1pOptInTag is the ARM tag that must be set on the VM resource for wireserver to serve +// root certificates. Without this tag, wireserver returns IsOptedInForRootCerts=false even +// if the subscription has the PlatformSettingsOverride feature registered. +const rcv1pOptInTag = "platformsettings.host_environment.service.platform_optedin_for_rootcerts" + +// skipIfRCV1PNotConfigured skips the test when the RCV1P subscription is not configured. +// This happens in regular CI runs where the RCV1P variable group is not linked, causing +// Azure DevOps to pass the literal unexpanded string "$(RCV1P_SUBSCRIPTION_ID)". +func skipIfRCV1PNotConfigured(t *testing.T) { + t.Helper() + subID := config.Config.RCV1PSubscriptionID + if subID == "" || strings.HasPrefix(subID, "$(") { + t.Skip("RCV1P_SUBSCRIPTION_ID not set or not resolved, skipping RCV1P cert mode test") + } +} + +// rcv1pOptInVMConfigMutator sets the platform opt-in tag on the VMSS so that wireserver +// will serve root certificates to this VM during provisioning. +func rcv1pOptInVMConfigMutator(vmss *armcompute.VirtualMachineScaleSet) { + if vmss.Tags == nil { + vmss.Tags = map[string]*string{} + } + vmss.Tags[rcv1pOptInTag] = to.Ptr("true") +} + +// Test_RCV1P_Ubuntu2204 validates RCV1P cert download and trust store installation on Ubuntu 22.04. +// Ubuntu uses /usr/local/share/ca-certificates/ as the cert drop folder and update-ca-certificates +// to rebuild the trust bundle. 
+func Test_RCV1P_Ubuntu2204(t *testing.T) { + skipIfRCV1PNotConfigured(t) + RunScenario(t, &Scenario{ + Description: "Tests RCV1P cert mode on Ubuntu 22.04 with VM opt-in tag", + AzureClient: config.RCV1PAzure, + SubscriptionID: config.Config.RCV1PSubscriptionID, + Tags: Tags{ + RCV1PCertMode: true, + }, + Config: Config{ + Cluster: ClusterRCV1PKubenet, + VHD: config.VHDUbuntu2204Gen2Containerd, + VMConfigMutator: rcv1pOptInVMConfigMutator, + BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) { + }, + Validator: func(ctx context.Context, s *Scenario) { + ValidateRCV1PCertMode(ctx, s) + }, + }, + }) +} + +// Test_RCV1P_Ubuntu2404 validates RCV1P cert download and trust store installation on Ubuntu 24.04. +// Covers the newer Ubuntu LTS release to ensure the cert endpoint and trust store integration +// work correctly across Ubuntu versions. +func Test_RCV1P_Ubuntu2404(t *testing.T) { + skipIfRCV1PNotConfigured(t) + RunScenario(t, &Scenario{ + Description: "Tests RCV1P cert mode on Ubuntu 24.04 with VM opt-in tag", + AzureClient: config.RCV1PAzure, + SubscriptionID: config.Config.RCV1PSubscriptionID, + Tags: Tags{ + RCV1PCertMode: true, + }, + Config: Config{ + Cluster: ClusterRCV1PKubenet, + VHD: config.VHDUbuntu2404Gen2Containerd, + VMConfigMutator: rcv1pOptInVMConfigMutator, + BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) { + }, + Validator: func(ctx context.Context, s *Scenario) { + ValidateRCV1PCertMode(ctx, s) + }, + }, + }) +} + +// Test_RCV1P_AzureLinuxV3 validates RCV1P on Azure Linux V3, which uses a different trust store +// layout (/etc/pki/ca-trust/source/anchors/) and update command (update-ca-trust) than Ubuntu. +// This ensures the provisioning script correctly detects the distro and uses the right paths. 
+func Test_RCV1P_AzureLinuxV3(t *testing.T) { + skipIfRCV1PNotConfigured(t) + RunScenario(t, &Scenario{ + Description: "Tests RCV1P cert mode on Azure Linux V3 with VM opt-in tag", + AzureClient: config.RCV1PAzure, + SubscriptionID: config.Config.RCV1PSubscriptionID, + Tags: Tags{ + RCV1PCertMode: true, + }, + Config: Config{ + Cluster: ClusterRCV1PKubenet, + VHD: config.VHDAzureLinuxV3Gen2, + VMConfigMutator: rcv1pOptInVMConfigMutator, + BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) { + }, + Validator: func(ctx context.Context, s *Scenario) { + ValidateRCV1PCertMode(ctx, s) + }, + }, + }) +} + +// Test_RCV1P_Flatcar validates RCV1P on Flatcar Container Linux, which has a read-only root +// filesystem and requires certificates to be placed in /etc/ssl/certs/ as .pem files. +// This is the most constrained environment for cert installation. +func Test_RCV1P_Flatcar(t *testing.T) { + skipIfRCV1PNotConfigured(t) + RunScenario(t, &Scenario{ + Description: "Tests RCV1P cert mode on Flatcar with VM opt-in tag", + AzureClient: config.RCV1PAzure, + SubscriptionID: config.Config.RCV1PSubscriptionID, + Tags: Tags{ + RCV1PCertMode: true, + }, + Config: Config{ + Cluster: ClusterRCV1PKubenet, + VHD: config.VHDFlatcarGen2, + VMConfigMutator: rcv1pOptInVMConfigMutator, + BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) { + }, + Validator: func(ctx context.Context, s *Scenario) { + ValidateRCV1PCertMode(ctx, s) + }, + }, + }) +} + +// Test_RCV1P_ACL validates RCV1P on Azure Container Linux (ACL), which shares the same +// trust store layout as Azure Linux (/etc/pki/ca-trust/). ACL requires Trusted Launch, +// so the VMConfigMutator combines both the TrustedLaunch and opt-in tag settings. 
+func Test_RCV1P_ACL(t *testing.T) {
+	skipIfRCV1PNotConfigured(t)
+	// The VM config mutator first adds Trusted Launch to the VMSS properties, then applies the opt-in mutator.
+	scenario := &Scenario{
+		Description:    "Tests RCV1P cert mode on ACL with VM opt-in tag",
+		AzureClient:    config.RCV1PAzure,
+		SubscriptionID: config.Config.RCV1PSubscriptionID,
+		Tags:           Tags{RCV1PCertMode: true},
+		Config: Config{
+			Cluster: ClusterRCV1PKubenet,
+			VHD:     config.VHDACLGen2TL,
+			VMConfigMutator: func(model *armcompute.VirtualMachineScaleSet) {
+				model.Properties = addTrustedLaunchToVMSS(model.Properties)
+				rcv1pOptInVMConfigMutator(model)
+			},
+			BootstrapConfigMutator: func(_ *datamodel.NodeBootstrappingConfiguration) {
+			},
+			Validator: func(ctx context.Context, sc *Scenario) {
+				ValidateRCV1PCertMode(ctx, sc)
+			},
+		},
+	}
+	RunScenario(t, scenario)
+}
+
+// Test_RCV1P_NotOptedIn is a negative test that validates the VM opt-in tag is required
+// for cert installation. The VM is created in the RCV1P subscription (which has
+// PlatformSettingsOverride registered) but WITHOUT the opt-in tag on the VMSS.
+// This verifies that wireserver returns IsOptedInForRootCerts=false and the provisioning
+// script correctly skips certificate download and trust store installation.
+// This test is critical because it proves the two-layer access control works:
+// subscription feature alone is not sufficient — the VM must also be explicitly tagged.
+func Test_RCV1P_NotOptedIn(t *testing.T) {
+	skipIfRCV1PNotConfigured(t)
+	// No VMConfigMutator is set, so rcv1pOptInVMConfigMutator is never applied to this VMSS.
+	scenario := &Scenario{
+		Description:    "Tests RCV1P cert mode without VM opt-in tag; expects no cert installation",
+		AzureClient:    config.RCV1PAzure,
+		SubscriptionID: config.Config.RCV1PSubscriptionID,
+		Tags:           Tags{RCV1PCertMode: true},
+		Config: Config{
+			Cluster: ClusterRCV1PKubenet,
+			VHD:     config.VHDUbuntu2204Gen2Containerd,
+			BootstrapConfigMutator: func(_ *datamodel.NodeBootstrappingConfiguration) {
+			},
+			Validator: func(ctx context.Context, sc *Scenario) {
+				ValidateRCV1PNotOptedIn(ctx, sc)
+			},
+		},
+	}
+	RunScenario(t, scenario)
+}
diff --git a/e2e/scenario_rcv1p_win_test.go b/e2e/scenario_rcv1p_win_test.go
new file mode 100644
index 00000000000..55a35b584bb
--- /dev/null
+++ b/e2e/scenario_rcv1p_win_test.go
@@ -0,0 +1,91 @@
+// scenario_rcv1p_win_test.go contains end-to-end tests for the RCV1P cert mode on Windows.
+// Windows uses a different cert installation path than Linux: certificates are downloaded to
+// C:\ca and imported into the Windows certificate store (Cert:\LocalMachine\Root) via
+// Import-Certificate. A scheduled task (aks-ca-certs-refresh-task) is registered to
+// periodically refresh the certificates.
+//
+// These tests run against the same RCV1P subscription and require the same VM opt-in tag
+// as the Linux tests (see scenario_rcv1p_test.go for details on the two-layer access control).
+package e2e
+
+import (
+	"context"
+	"testing"
+
+	"github.com/Azure/agentbaker/e2e/config"
+	"github.com/Azure/agentbaker/pkg/agent/datamodel"
+	"github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute/v7"
+)
+
+// Test_RCV1P_Windows2022 validates RCV1P cert download and Windows certificate store
+// installation on Windows Server 2022.
+func Test_RCV1P_Windows2022(t *testing.T) {
+	skipIfRCV1PNotConfigured(t)
+	// Target the RCV1P client and subscription; rcv1pOptInVMConfigMutator is applied to the VMSS model.
+	scenario := &Scenario{
+		Description:    "Tests RCV1P cert mode on Windows Server 2022 with VM opt-in tag",
+		AzureClient:    config.RCV1PAzure,
+		SubscriptionID: config.Config.RCV1PSubscriptionID,
+		Tags:           Tags{RCV1PCertMode: true},
+		Config: Config{
+			Cluster:                ClusterRCV1PKubenet,
+			VHD:                    config.VHDWindows2022Containerd,
+			VMConfigMutator:        rcv1pOptInVMConfigMutator,
+			BootstrapConfigMutator: EmptyBootstrapConfigMutator,
+			Validator: func(ctx context.Context, sc *Scenario) {
+				ValidateRCV1PCertModeWindows(ctx, sc)
+			},
+		},
+	}
+	RunScenario(t, scenario)
+}
+
+// Test_RCV1P_Windows23H2 runs the opted-in RCV1P scenario on the Windows Server 23H2 VHD.
+func Test_RCV1P_Windows23H2(t *testing.T) {
+	skipIfRCV1PNotConfigured(t)
+	// Same shape as the Windows Server 2022 scenario; only the description and the VHD differ.
+	scenario := &Scenario{
+		Description:    "Tests RCV1P cert mode on Windows Server 23H2 with VM opt-in tag",
+		AzureClient:    config.RCV1PAzure,
+		SubscriptionID: config.Config.RCV1PSubscriptionID,
+		Tags:           Tags{RCV1PCertMode: true},
+		Config: Config{
+			Cluster:                ClusterRCV1PKubenet,
+			VHD:                    config.VHDWindows23H2,
+			VMConfigMutator:        rcv1pOptInVMConfigMutator,
+			BootstrapConfigMutator: EmptyBootstrapConfigMutator,
+			Validator: func(ctx context.Context, sc *Scenario) {
+				ValidateRCV1PCertModeWindows(ctx, sc)
+			},
+		},
+	}
+	RunScenario(t, scenario)
+}
+
+// Test_RCV1P_Windows2025 validates RCV1P on Windows Server 2025. This SKU requires
+// Trusted Launch, so the VMConfigMutator combines both TrustedLaunch and opt-in tag settings.
+func Test_RCV1P_Windows2025(t *testing.T) {
+	skipIfRCV1PNotConfigured(t)
+	// The VM config mutator first adds Trusted Launch to the VMSS properties, then applies the opt-in mutator.
+	scenario := &Scenario{
+		Description:    "Tests RCV1P cert mode on Windows Server 2025 with VM opt-in tag",
+		AzureClient:    config.RCV1PAzure,
+		SubscriptionID: config.Config.RCV1PSubscriptionID,
+		Tags:           Tags{RCV1PCertMode: true},
+		Config: Config{
+			Cluster: ClusterRCV1PKubenet,
+			VHD:     config.VHDWindows2025,
+			VMConfigMutator: func(model *armcompute.VirtualMachineScaleSet) {
+				model.Properties = addTrustedLaunchToVMSS(model.Properties)
+				rcv1pOptInVMConfigMutator(model)
+			},
+			BootstrapConfigMutator: func(cfg *datamodel.NodeBootstrappingConfiguration) {
+				Windows2025BootstrapConfigMutator(t, cfg)
+			},
+			Validator: func(ctx context.Context, sc *Scenario) {
+				ValidateRCV1PCertModeWindows(ctx, sc)
+			},
+		},
+	}
+	RunScenario(t, scenario)
+}
diff --git a/e2e/test_helpers.go b/e2e/test_helpers.go index a84442d32f1..9eb9818e32d 100644 --- a/e2e/test_helpers.go +++ b/e2e/test_helpers.go @@ -211,10 +211,24 @@ func runScenario(t testing.TB, s *Scenario) error { ctx := newTestCtx(t) maybeSkipScenario(ctx, t, s) - _, err := CachedEnsureResourceGroup(ctx, s.Location) - require.NoError(t, err) - _, err = CachedCreateVMManagedIdentity(ctx, s.Location) - require.NoError(t, err) + if s.AzureClient != nil { + // RCV1P scenario: ensure RG and identity in the RCV1P subscription + _, err := CachedRCV1PEnsureResourceGroup(ctx, s.Location) + require.NoError(t, err) + _, err = CachedRCV1PCreateVMManagedIdentity(ctx, s.Location) + require.NoError(t, err) + // Also ensure default subscription infra (RG + identity + blob storage) is provisioned, + // since Windows log extraction on failure uploads to the default subscription's blob storage.
+ _, err = CachedEnsureResourceGroup(ctx, s.Location) + require.NoError(t, err) + _, err = CachedCreateVMManagedIdentity(ctx, s.Location) + require.NoError(t, err) + } else { + _, err := CachedEnsureResourceGroup(ctx, s.Location) + require.NoError(t, err) + _, err = CachedCreateVMManagedIdentity(ctx, s.Location) + require.NoError(t, err) + } s.T = t ctrruntimelog.SetLogger(zap.New()) @@ -261,6 +275,11 @@ func prepareAKSNode(ctx context.Context, s *Scenario) (*ScenarioVM, error) { nbc, err := getBaseNBC(s.T, s.Runtime.Cluster, s.VHD) require.NoError(s.T, err) + // Override subscription ID for RCV1P scenarios + if s.SubscriptionID != "" { + nbc.SubscriptionID = s.SubscriptionID + } + nbc.EnableScriptlessCSECmd = true if s.Runtime != nil && s.Runtime.EnableScriptlessNBCCSECmd { nbc.EnableScriptlessNBCCSECmd = true @@ -610,7 +629,7 @@ func RunCommand(ctx context.Context, s *Scenario, command string) (armcompute.Ru toolkit.Logf(ctx, "Command %q took %s", command, elapsed) }() - runPoller, err := config.Azure.VMSSVM.BeginRunCommand(ctx, *s.Runtime.Cluster.Model.Properties.NodeResourceGroup, s.Runtime.VMSSName, *s.Runtime.VM.VM.InstanceID, armcompute.RunCommandInput{ + runPoller, err := s.GetAzure().VMSSVM.BeginRunCommand(ctx, *s.Runtime.Cluster.Model.Properties.NodeResourceGroup, s.Runtime.VMSSName, *s.Runtime.VM.VM.InstanceID, armcompute.RunCommandInput{ CommandID: func() *string { if s.IsWindows() { return to.Ptr("RunPowerShellScript") @@ -639,11 +658,11 @@ func CreateImage(ctx context.Context, s *Scenario) *config.Image { require.NoErrorf(s.T, err, "failed to run sysprep on Windows VM for image creation") } - vm, err := config.Azure.VMSSVM.Get(ctx, *s.Runtime.Cluster.Model.Properties.NodeResourceGroup, s.Runtime.VMSSName, *s.Runtime.VM.VM.InstanceID, &armcompute.VirtualMachineScaleSetVMsClientGetOptions{}) + vm, err := s.GetAzure().VMSSVM.Get(ctx, *s.Runtime.Cluster.Model.Properties.NodeResourceGroup, s.Runtime.VMSSName, *s.Runtime.VM.VM.InstanceID, 
&armcompute.VirtualMachineScaleSetVMsClientGetOptions{}) require.NoError(s.T, err, "Failed to get VMSS VM for image creation") s.T.Log("Deallocating VMSS VM...") - poll, err := config.Azure.VMSSVM.BeginDeallocate(ctx, *s.Runtime.Cluster.Model.Properties.NodeResourceGroup, s.Runtime.VMSSName, *s.Runtime.VM.VM.InstanceID, nil) + poll, err := s.GetAzure().VMSSVM.BeginDeallocate(ctx, *s.Runtime.Cluster.Model.Properties.NodeResourceGroup, s.Runtime.VMSSName, *s.Runtime.VM.VM.InstanceID, nil) require.NoError(s.T, err, "Failed to begin deallocate") _, err = poll.PollUntilDone(ctx, nil) require.NoError(s.T, err, "Failed to deallocate") @@ -690,7 +709,7 @@ func CreateSIGImageVersionFromDisk(ctx context.Context, s *Scenario, version str // Create the image version directly from the disk s.T.Logf("Creating gallery image version: %s in %s", version, *image.ID) - createVersionOp, err := config.Azure.GalleryImageVersions.BeginCreateOrUpdate(ctx, rg, *gallery.Name, *image.Name, version, armcompute.GalleryImageVersion{ + createVersionOp, err := s.GetAzure().GalleryImageVersions.BeginCreateOrUpdate(ctx, rg, *gallery.Name, *image.Name, version, armcompute.GalleryImageVersion{ Location: to.Ptr(s.Location), Properties: &armcompute.GalleryImageVersionProperties{ StorageProfile: &armcompute.GalleryImageVersionStorageProfile{ @@ -726,7 +745,7 @@ func CreateSIGImageVersionFromDisk(ctx context.Context, s *Scenario, version str customVHD := *s.Config.VHD customVHD.Name = *image.Name // Use the architecture-specific image name customVHD.Gallery = &config.Gallery{ - SubscriptionID: config.Config.SubscriptionID, + SubscriptionID: s.GetSubscriptionID(), ResourceGroupName: rg, Name: *gallery.Name, } diff --git a/e2e/types.go b/e2e/types.go index 9643b167470..c2356f903e7 100644 --- a/e2e/types.go +++ b/e2e/types.go @@ -21,6 +21,33 @@ import ( "golang.org/x/crypto/ssh" ) +// ClusterInfra captures the Azure infrastructure scope for cluster operations. 
+// It allows cluster creation and management to target different subscriptions.
+type ClusterInfra struct {
+	Azure             *config.AzureClient
+	SubscriptionID    string
+	ResourceGroupName func(location string) string
+}
+
+// DefaultClusterInfra is the ClusterInfra built from config.Azure, config.Config.SubscriptionID and config.ResourceGroupName.
+var DefaultClusterInfra = &ClusterInfra{
+	Azure:             config.Azure,
+	SubscriptionID:    config.Config.SubscriptionID,
+	ResourceGroupName: config.ResourceGroupName,
+}
+
+// RCV1PClusterInfra builds the RCV1P ClusterInfra; it returns nil when config.RCV1PAzure is nil.
+func RCV1PClusterInfra() *ClusterInfra {
+	if client := config.RCV1PAzure; client != nil {
+		return &ClusterInfra{
+			Azure:             client,
+			SubscriptionID:    config.Config.RCV1PSubscriptionID,
+			ResourceGroupName: config.RCV1PResourceGroupName,
+		}
+	}
+	return nil
+}
+
 type Tags struct { Name string ImageName string @@ -35,6 +62,7 @@ type Tags struct { Scriptless bool VHDCaching bool MockAzureChinaCloud bool + RCV1PCertMode bool VMSeriesCoverageTest bool } @@ -128,6 +156,14 @@ type Scenario struct { // a default size will be used. K8sSystemPoolSKU string + // AzureClient overrides the default config.Azure client for this scenario. + // When nil, config.Azure is used. + AzureClient *config.AzureClient + + // SubscriptionID overrides the default config.Config.SubscriptionID for this scenario. + // When empty, config.Config.SubscriptionID is used. + SubscriptionID string + // Runtime contains the runtime state of the scenario. It's populated in the beginning of the test run Runtime *ScenarioRuntime T testing.TB @@ -410,3 +446,35 @@ func (s *Scenario) IsWindows() bool { func (s *Scenario) IsLinux() bool { return !s.IsWindows() } + +// GetAzure returns the AzureClient for this scenario, falling back to the default config.Azure.
+func (s *Scenario) GetAzure() *config.AzureClient {
+	if s.AzureClient == nil {
+		return config.Azure
+	}
+	return s.AzureClient
+}
+
+// GetSubscriptionID returns s.SubscriptionID when it is set and config.Config.SubscriptionID otherwise.
+func (s *Scenario) GetSubscriptionID() string {
+	if s.SubscriptionID == "" {
+		return config.Config.SubscriptionID
+	}
+	return s.SubscriptionID
+}
+
+// GetResourceGroupName returns the RCV1P resource group name only when SubscriptionID is set and differs from the default.
+func (s *Scenario) GetResourceGroupName() string {
+	if s.SubscriptionID == "" || s.SubscriptionID == config.Config.SubscriptionID {
+		return config.ResourceGroupName(s.Location)
+	}
+	return config.RCV1PResourceGroupName(s.Location)
+}
+
+// GetVMIdentityResourceID returns the RCV1P VM identity ID only when SubscriptionID is set and differs from the default.
+func (s *Scenario) GetVMIdentityResourceID() string {
+	if s.SubscriptionID == "" || s.SubscriptionID == config.Config.SubscriptionID {
+		return config.Config.VMIdentityResourceID(s.Location)
+	}
+	return config.Config.RCV1PVMIdentityResourceID(s.Location)
+}
diff --git a/e2e/validators.go b/e2e/validators.go index de8db1ca15a..8bbfbd9813e 100644 --- a/e2e/validators.go +++ b/e2e/validators.go @@ -412,6 +412,13 @@ func ValidateNonEmptyDirectory(ctx context.Context, s *Scenario, dirName string) execScriptOnVMForScenarioValidateExitCode(ctx, s, strings.Join(command, "\n"), 0, "either could not find expected file, or something went wrong") } +func ValidateEmptyDirectory(ctx context.Context, s *Scenario, dirName string) { + s.T.Helper() + command := fmt.Sprintf("[ -d %s ] && [ -z \"$(ls -A %s)\" ]", dirName, dirName) + execScriptOnVMForScenarioValidateExitCode(ctx, s, command, 0, + fmt.Sprintf("expected directory %s to be empty or not exist", dirName)) +} + func ValidateInspektorGadget(ctx context.Context, s *Scenario) { s.T.Helper() @@ -2293,3 +2300,103 @@ func ValidateAlgifAeadMitigation(ctx context.Context, s *Scenario) { execScriptOnVMForScenarioValidateExitCode(ctx,
s, script, 0, "CVE-2026-31431 (algif_aead) mitigation validation failed")
 }
+
+// ValidateRCV1PCertMode checks a provisioned Linux node for the rcv1p cert flow: the provisioning log
+// entries, downloaded certificates, trust store contents, and a scheduled refresh (timer or cron).
+func ValidateRCV1PCertMode(ctx context.Context, s *Scenario) {
+	s.T.Helper()
+	const provisionLog = "/var/log/azure/cluster-provision.log"
+
+	// The provisioning log must show that the rcv1p endpoint mode was selected ...
+	ValidateFileHasContent(ctx, s, provisionLog, "Using custom cloud certificate endpoint mode: rcv1p")
+	// ... and that the opt-in check reported IsOptedInForRootCerts=true.
+	ValidateFileHasContent(ctx, s, provisionLog, "IsOptedInForRootCerts=true")
+
+	// Certificates are expected in the download directory; an empty directory fails
+	// the scenario.
+	ValidateNonEmptyDirectory(ctx, s, "/root/AzureCACertificates")
+
+	// The distro-specific trust store directory must list at least one .crt or .pem file.
+	storeDir := rcv1pTrustStoreDir(s)
+	listCerts := fmt.Sprintf("sudo ls -1 %s/*.crt 2>/dev/null || sudo ls -1 %s/*.pem 2>/dev/null", storeDir, storeDir)
+	missingCerts := fmt.Sprintf("expected certificates in trust store directory %s", storeDir)
+	execScriptOnVMForScenarioValidateExitCode(ctx, s,
+		listCerts, 0, missingCerts)
+
+	// A refresh schedule must exist: the azure-ca-refresh systemd timer on Flatcar and ACL,
+	// a ca-refresh entry in root's crontab on every other distro.
+	usesSystemdTimer := s.VHD.Flatcar || s.VHD.OS == config.OSACL
+	if !usesSystemdTimer {
+		execScriptOnVMForScenarioValidateExitCode(ctx, s,
+			"sudo crontab -l 2>/dev/null | grep -q ca-refresh",
+			0, "expected ca-refresh cron entry")
+		return
+	}
+	execScriptOnVMForScenarioValidateExitCode(ctx, s,
+		"systemctl is-enabled azure-ca-refresh.timer",
+		0, "expected azure-ca-refresh.timer to be enabled")
+}
+
+// rcv1pTrustStoreDir returns the OS trust store directory for the given scenario's distro.
+func rcv1pTrustStoreDir(s *Scenario) string {
+	distro := s.VHD.OS
+	if distro == config.OSFlatcar {
+		return "/etc/ssl/certs"
+	}
+	if distro == config.OSMariner || distro == config.OSAzureLinux || distro == config.OSACL {
+		return "/etc/pki/ca-trust/source/anchors"
+	}
+	// Ubuntu and every other OS value use this directory.
+	return "/usr/local/share/ca-certificates"
+}
+
+// ValidateRCV1PCertModeWindows checks a provisioned Windows node for the RCV1P cert artifacts:
+// at least one file in C:\ca and a registered aks-ca-certs-refresh-task scheduled task.
+func ValidateRCV1PCertModeWindows(ctx context.Context, s *Scenario) {
+	s.T.Helper()
+
+	// C:\ca must exist and contain at least one file; the certificate store itself is not inspected.
+	caFolderScript := strings.Join([]string{
+		"$ErrorActionPreference = 'Stop'",
+		"$caFolder = 'C:\\ca'",
+		"if (-not (Test-Path $caFolder)) { throw 'CA certificates folder C:\\ca does not exist' }",
+		"$certs = Get-ChildItem -Path $caFolder -File",
+		"if ($certs.Count -eq 0) { throw 'No certificates found in C:\\ca folder' }",
+		"Write-Host \"Found $($certs.Count) certificate(s) in $caFolder\"",
+	}, "\n")
+	execScriptOnVMForScenarioValidateExitCode(ctx, s, caFolderScript, 0,
+		"expected certificates in C:\\ca")
+
+	// The aks-ca-certs-refresh-task scheduled task must be registered.
+	refreshTaskScript := strings.Join([]string{
+		"$ErrorActionPreference = 'Stop'",
+		"$task = Get-ScheduledTask -TaskName 'aks-ca-certs-refresh-task' -ErrorAction SilentlyContinue",
+		"if (-not $task) { throw 'aks-ca-certs-refresh-task scheduled task not found' }",
+		"Write-Host \"Scheduled task found: $($task.TaskName) (State: $($task.State))\"",
+	}, "\n")
+	execScriptOnVMForScenarioValidateExitCode(ctx, s, refreshTaskScript, 0,
+		"expected aks-ca-certs-refresh-task scheduled task")
+}
+
+// ValidateRCV1PNotOptedIn validates that when the VM does NOT have the opt-in tag,
+// wireserver returns IsOptedInForRootCerts=false and no certificates are installed,
+// even in the RCV1P subscription with PlatformSettingsOverride registered.
+func ValidateRCV1PNotOptedIn(ctx context.Context, s *Scenario) {
+	s.T.Helper()
+	const provisionLog = "/var/log/azure/cluster-provision.log"
+
+	// The rcv1p endpoint mode must still have been selected ...
+	ValidateFileHasContent(ctx, s, provisionLog, "Using custom cloud certificate endpoint mode: rcv1p")
+	// ... but the provisioning script must log that root cert installation was skipped.
+	ValidateFileHasContent(ctx, s, provisionLog,
+		"Skipping custom cloud root cert installation because IsOptedInForRootCerts is not true")
+
+	// No certificates may have been downloaded.
+	ValidateEmptyDirectory(ctx, s, "/root/AzureCACertificates")
+
+	// grep must exit 1, i.e. root's crontab has no ca-refresh entry. Only cron is checked;
+	// the azure-ca-refresh systemd timer is not inspected here.
+	execScriptOnVMForScenarioValidateExitCode(ctx, s,
+		"sudo crontab -l 2>/dev/null | grep -q ca-refresh",
+		1, "expected no ca-refresh cron entry when not opted in")
+}
diff --git a/e2e/vmss.go b/e2e/vmss.go index d9260bf6407..d7a35696735 100644 --- a/e2e/vmss.go +++ b/e2e/vmss.go @@ -383,13 +383,13 @@ func createVMSSModel(ctx context.Context, s *Scenario) armcompute.VirtualMachine if config.Config.IsLocalBuild() { s.T.Logf( "VMSS portal link: https://ms.portal.azure.com/#@microsoft.onmicrosoft.com/resource/subscriptions/%s/resourceGroups/%s/providers/Microsoft.Compute/virtualMachineScaleSets/%s/overview", - config.Config.SubscriptionID, + s.GetSubscriptionID(), *cluster.Model.Properties.NodeResourceGroup, s.Runtime.VMSSName, ) s.T.Logf( "Managed cluster portal link: https://ms.portal.azure.com/#@microsoft.onmicrosoft.com/resource/subscriptions/%s/resourceGroups/%s/providers/Microsoft.ContainerService/managedClusters/%s/overview", - config.Config.SubscriptionID, + s.GetSubscriptionID(), *cluster.Model.Properties.NodeResourceGroup, *cluster.Model.Name, ) @@ -401,8 +401,8 @@ func createVMSSModel(ctx context.Context, s *Scenario) armcompute.VirtualMachine model.Identity = &armcompute.VirtualMachineScaleSetIdentity{ Type: to.Ptr(armcompute.ResourceIdentityTypeSystemAssignedUserAssigned), UserAssignedIdentities:
map[string]*armcompute.UserAssignedIdentitiesValue{ - *s.Runtime.Cluster.KubeletIdentity.ResourceID: {}, - config.Config.VMIdentityResourceID(s.Location): {}, + *s.Runtime.Cluster.KubeletIdentity.ResourceID: {}, + s.GetVMIdentityResourceID(): {}, }, } @@ -475,7 +475,7 @@ func CreateVMSSWithRetry(ctx context.Context, s *Scenario) (*ScenarioVM, error) func CreateVMSS(ctx context.Context, s *Scenario, resourceGroupName string) (*ScenarioVM, error) { defer toolkit.LogStepCtxf(ctx, "creating VMSS %s", s.Runtime.VMSSName)() vm := &ScenarioVM{} - operation, err := config.Azure.VMSS.BeginCreateOrUpdate( + operation, err := s.GetAzure().VMSS.BeginCreateOrUpdate( ctx, resourceGroupName, s.Runtime.VMSSName, @@ -492,7 +492,7 @@ func CreateVMSS(ctx context.Context, s *Scenario, resourceGroupName string) (*Sc return vm, fmt.Errorf("failed to wait for VMSS VM: %w", err) } - vm.PrivateIP, err = getPrivateIPFromVMSSVM(ctx, resourceGroupName, s.Runtime.VMSSName, *vm.VM.InstanceID) + vm.PrivateIP, err = getPrivateIPFromVMSSVM(ctx, s, resourceGroupName, s.Runtime.VMSSName, *vm.VM.InstanceID) if err != nil { return vm, fmt.Errorf("failed to get VM private IP address: %w", err) } @@ -549,7 +549,7 @@ func waitForVMRunningState(ctx context.Context, s *Scenario, vmssVM *armcompute. 
var lastErr error for { // Get the updated VM with instance view to check power state - vm, err := config.Azure.VMSSVM.Get(ctxTimeout, *s.Runtime.Cluster.Model.Properties.NodeResourceGroup, s.Runtime.VMSSName, *vmssVM.InstanceID, &armcompute.VirtualMachineScaleSetVMsClientGetOptions{ + vm, err := s.GetAzure().VMSSVM.Get(ctxTimeout, *s.Runtime.Cluster.Model.Properties.NodeResourceGroup, s.Runtime.VMSSName, *vmssVM.InstanceID, &armcompute.VirtualMachineScaleSetVMsClientGetOptions{ Expand: to.Ptr(armcompute.InstanceViewTypesInstanceView), }) @@ -592,7 +592,7 @@ func waitForVMSSVM(ctx context.Context, s *Scenario) (*armcompute.VirtualMachine var lastErr error for { - pager := config.Azure.VMSSVM.NewListPager(*s.Runtime.Cluster.Model.Properties.NodeResourceGroup, s.Runtime.VMSSName, &armcompute.VirtualMachineScaleSetVMsClientListOptions{ + pager := s.GetAzure().VMSSVM.NewListPager(*s.Runtime.Cluster.Model.Properties.NodeResourceGroup, s.Runtime.VMSSName, &armcompute.VirtualMachineScaleSetVMsClientListOptions{ Expand: to.Ptr("instanceView"), }) @@ -622,9 +622,14 @@ func waitForVMSSVM(ctx context.Context, s *Scenario) (*armcompute.VirtualMachine } // getPrivateIPFromVMSSVM extracts the private IP address from a VMSS VM by querying its network interfaces. -func getPrivateIPFromVMSSVM(ctx context.Context, resourceGroup, vmssName, instanceID string) (string, error) { +func getPrivateIPFromVMSSVM(ctx context.Context, s *Scenario, resourceGroup, vmssName, instanceID string) (string, error) { + return getPrivateIPFromVMSSVMWithClient(ctx, s.GetAzure(), resourceGroup, vmssName, instanceID) +} + +// getPrivateIPFromVMSSVMWithClient extracts the private IP using the given Azure client. 
+func getPrivateIPFromVMSSVMWithClient(ctx context.Context, azure *config.AzureClient, resourceGroup, vmssName, instanceID string) (string, error) { // Query the network interface to get the IP configuration - pager := config.Azure.NetworkInterfaces.NewListVirtualMachineScaleSetVMNetworkInterfacesPager( + pager := azure.NetworkInterfaces.NewListVirtualMachineScaleSetVMNetworkInterfacesPager( resourceGroup, vmssName, instanceID, @@ -708,7 +713,7 @@ func extractBootDiagnostics(ctx context.Context, s *Scenario) error { return nil } - pager := config.Azure.VMSSVM.NewListPager(*s.Runtime.Cluster.Model.Properties.NodeResourceGroup, s.Runtime.VMSSName, nil) + pager := s.GetAzure().VMSSVM.NewListPager(*s.Runtime.Cluster.Model.Properties.NodeResourceGroup, s.Runtime.VMSSName, nil) for pager.More() { page, err := pager.NextPage(ctx) if err != nil { @@ -717,7 +722,7 @@ func extractBootDiagnostics(ctx context.Context, s *Scenario) error { for _, vmInstance := range page.Value { // Get boot diagnostics data - bootDiagResp, err := config.Azure.VMSSVM.RetrieveBootDiagnosticsData(ctx, *s.Runtime.Cluster.Model.Properties.NodeResourceGroup, s.Runtime.VMSSName, *vmInstance.InstanceID, nil) + bootDiagResp, err := s.GetAzure().VMSSVM.RetrieveBootDiagnosticsData(ctx, *s.Runtime.Cluster.Model.Properties.NodeResourceGroup, s.Runtime.VMSSName, *vmInstance.InstanceID, nil) if err != nil { return fmt.Errorf("failed to get boot diagnostics for VM %s: %v", *vmInstance.InstanceID, err) } @@ -857,7 +862,7 @@ func extractLogsFromVMWindows(ctx context.Context, s *Scenario) { ctx, cancel := context.WithTimeout(ctx, 4*time.Minute) defer cancel() - pager := config.Azure.VMSSVM.NewListPager(*s.Runtime.Cluster.Model.Properties.NodeResourceGroup, s.Runtime.VMSSName, nil) + pager := s.GetAzure().VMSSVM.NewListPager(*s.Runtime.Cluster.Model.Properties.NodeResourceGroup, s.Runtime.VMSSName, nil) page, err := pager.NextPage(ctx) if err != nil { s.T.Logf("failed to list VMSS instances: %s", err) @@ -871,7 
+876,7 @@ func extractLogsFromVMWindows(ctx context.Context, s *Scenario) { blobPrefix := s.Runtime.VMSSName blobUrl := config.Config.BlobStorageAccountURL() + "/" + config.Config.BlobContainer + "/" + blobPrefix - client := config.Azure.VMSSVMRunCommands + client := s.GetAzure().VMSSVMRunCommands // Invoke the RunCommand on the VMSS instance s.T.Logf("uploading windows logs to blob storage at %s, may take a few minutes", blobUrl) @@ -970,7 +975,7 @@ func deleteVMSS(ctx context.Context, s *Scenario) { } return } - _, err := config.Azure.VMSS.BeginDelete(ctx, *s.Runtime.Cluster.Model.Properties.NodeResourceGroup, s.Runtime.VMSSName, &armcompute.VirtualMachineScaleSetsClientBeginDeleteOptions{ + _, err := s.GetAzure().VMSS.BeginDelete(ctx, *s.Runtime.Cluster.Model.Properties.NodeResourceGroup, s.Runtime.VMSSName, &armcompute.VirtualMachineScaleSetsClientBeginDeleteOptions{ ForceDeletion: to.Ptr(true), }) if err != nil { @@ -1173,7 +1178,7 @@ func getBaseVMSSModel(s *Scenario, customData, cseCmd string) armcompute.Virtual ID: to.Ptr( fmt.Sprintf( loadBalancerBackendAddressPoolIDTemplate, - config.Config.SubscriptionID, + s.GetSubscriptionID(), *s.Runtime.Cluster.Model.Properties.NodeResourceGroup, ), ), diff --git a/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh b/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh index 2fd36c81434..862c2f09b6c 100644 --- a/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh +++ b/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh @@ -43,7 +43,7 @@ function make_request_with_retry { local response while [ $attempt -le $max_retries ]; do - response=$(curl -f --no-progress-meter "$url") + response=$(curl -f --no-progress-meter --connect-timeout 10 --max-time 30 "$url") local request_status=$? 
if echo "$response" | grep -q "RequestRateLimitExceeded"; then @@ -213,6 +213,7 @@ esac echo "Using custom cloud certificate endpoint mode: ${cert_endpoint_mode}" install_ca_refresh_schedule=0 +mkdir -p /root/AzureCACertificates rm -f /root/AzureCACertificates/* if [ "$cert_endpoint_mode" = "legacy" ]; then install_ca_refresh_schedule=1 diff --git a/spec/parts/linux/cloud-init/artifacts/init_aks_custom_cloud_spec.sh b/spec/parts/linux/cloud-init/artifacts/init_aks_custom_cloud_spec.sh index 58812659856..13e0f33e188 100644 --- a/spec/parts/linux/cloud-init/artifacts/init_aks_custom_cloud_spec.sh +++ b/spec/parts/linux/cloud-init/artifacts/init_aks_custom_cloud_spec.sh @@ -20,7 +20,7 @@ Describe 'init-aks-custom-cloud.sh refresh mode wiring' End It 'maps ussec/usnat locations to legacy cert endpoint mode' - When run grep -Eq 'ussec\*|usnat\*\) cert_endpoint_mode="legacy"' "$script_path" + When run grep -Eq 'ussec\*\|usnat\*\) cert_endpoint_mode="legacy"' "$script_path" The status should eq 0 End diff --git a/staging/cse/windows/kubernetesfunc.ps1 b/staging/cse/windows/kubernetesfunc.ps1 index d9852e4288d..56df5977e87 100644 --- a/staging/cse/windows/kubernetesfunc.ps1 +++ b/staging/cse/windows/kubernetesfunc.ps1 @@ -89,7 +89,8 @@ function Register-CACertificatesRefreshTask { if ([string]::IsNullOrEmpty($Location)) { $refreshCommand = "& { . 'C:\AzureData\windows\windowscsehelper.ps1'; . 'C:\AzureData\windows\kubernetesfunc.ps1'; Get-CACertificates | Out-Null }" } else { - $refreshCommand = "& { . 'C:\AzureData\windows\windowscsehelper.ps1'; . 'C:\AzureData\windows\kubernetesfunc.ps1'; Get-CACertificates -Location '$Location' | Out-Null }" + $escapedLocation = $Location -replace "'", "''" + $refreshCommand = "& { . 'C:\AzureData\windows\windowscsehelper.ps1'; . 
'C:\AzureData\windows\kubernetesfunc.ps1'; Get-CACertificates -Location '$escapedLocation' | Out-Null }" } $action = New-ScheduledTaskAction -Execute "powershell.exe" -Argument "-NoProfile -NonInteractive -ExecutionPolicy Bypass -Command `"$refreshCommand`"" $principal = New-ScheduledTaskPrincipal -UserId SYSTEM -LogonType ServiceAccount -RunLevel Highest From 85ec0f6107aa749edce2dacc080074e782e2642e Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Tue, 14 Apr 2026 14:55:57 -0700 Subject: [PATCH 17/26] Address PR review feedback: fix multi-subscription, validation, and error handling - e2e/cluster.go: Pass ClusterInfra to ensureMaintenanceConfiguration and createNewMaintenanceConfiguration so RCV1P clusters use the correct subscription and resource group instead of the global default. - e2e/validators.go: Fix ValidateEmptyDirectory shell predicate to succeed when the directory is missing (not just when empty), matching the error message. Also quote dirName in the shell command. - staging/cse/windows/kubernetesfunc.ps1: Add -FailOnError switch to Get-CACertificates so initial provisioning fails fast on cert retrieval errors while the scheduled refresh task remains non-fatal. - parts/windows/kuberneteswindowssetup.ps1: Call Get-CACertificates with -FailOnError during initial provisioning. - staging/cse/windows/kubernetesfunc.tests.ps1: Add tests for -FailOnError behavior (exception and empty data paths). 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- e2e/cluster.go | 15 ++++++++------- e2e/validators.go | 2 +- parts/windows/kuberneteswindowssetup.ps1 | 2 +- staging/cse/windows/kubernetesfunc.ps1 | 10 +++++++++- staging/cse/windows/kubernetesfunc.tests.ps1 | 18 ++++++++++++++++++ 5 files changed, 37 insertions(+), 10 deletions(-) diff --git a/e2e/cluster.go b/e2e/cluster.go index 7cb0e627c3f..6c91096475d 100644 --- a/e2e/cluster.go +++ b/e2e/cluster.go @@ -84,7 +84,7 @@ func prepareCluster(ctx context.Context, infra *ClusterInfra, clusterModel *armc bastion := dag.Go(g, func(ctx context.Context) (*Bastion, error) { return getOrCreateBastion(ctx, infra, cluster) }) - dag.Run(g, func(ctx context.Context) error { return ensureMaintenanceConfiguration(ctx, cluster) }) + dag.Run(g, func(ctx context.Context) error { return ensureMaintenanceConfiguration(ctx, infra, cluster) }) subnet := dag.Go(g, func(ctx context.Context) (string, error) { return getClusterSubnetID(ctx, infra, cluster) }) kube := dag.Go(g, func(ctx context.Context) (*Kubeclient, error) { return getClusterKubeClient(ctx, infra, cluster) }) identity := dag.Go(g, func(ctx context.Context) (*armcontainerservice.UserAssignedIdentity, error) { @@ -427,11 +427,12 @@ func createNewAKSClusterWithRetry(ctx context.Context, infra *ClusterInfra, rgNa return nil, fmt.Errorf("failed to create cluster after %d attempts due to persistent 409 Conflict: %w", maxRetries, lastErr) } -func ensureMaintenanceConfiguration(ctx context.Context, cluster *armcontainerservice.ManagedCluster) error { - _, err := config.Azure.Maintenance.Get(ctx, config.ResourceGroupName(*cluster.Location), *cluster.Name, "default", nil) +func ensureMaintenanceConfiguration(ctx context.Context, infra *ClusterInfra, cluster *armcontainerservice.ManagedCluster) error { + rgName := infra.ResourceGroupName(*cluster.Location) + _, err := infra.Azure.Maintenance.Get(ctx, rgName, *cluster.Name, "default", nil) var azErr 
*azcore.ResponseError if errors.As(err, &azErr) && azErr.StatusCode == 404 { - _, err = createNewMaintenanceConfiguration(ctx, cluster) + _, err = createNewMaintenanceConfiguration(ctx, infra, cluster) if err != nil { return fmt.Errorf("creating maintenance configuration for cluster %q: %w", *cluster.Name, err) } @@ -443,8 +444,8 @@ func ensureMaintenanceConfiguration(ctx context.Context, cluster *armcontainerse return nil } -func createNewMaintenanceConfiguration(ctx context.Context, cluster *armcontainerservice.ManagedCluster) (*armcontainerservice.MaintenanceConfiguration, error) { - rgName := config.ResourceGroupName(*cluster.Location) +func createNewMaintenanceConfiguration(ctx context.Context, infra *ClusterInfra, cluster *armcontainerservice.ManagedCluster) (*armcontainerservice.MaintenanceConfiguration, error) { + rgName := infra.ResourceGroupName(*cluster.Location) toolkit.Logf(ctx, "creating maintenance configuration for cluster %s in rg %s", *cluster.Name, rgName) maintenance := armcontainerservice.MaintenanceConfiguration{ Properties: &armcontainerservice.MaintenanceConfigurationProperties{ @@ -467,7 +468,7 @@ func createNewMaintenanceConfiguration(ctx context.Context, cluster *armcontaine }, } - _, err := config.Azure.Maintenance.CreateOrUpdate(ctx, rgName, *cluster.Name, "default", maintenance, nil) + _, err := infra.Azure.Maintenance.CreateOrUpdate(ctx, rgName, *cluster.Name, "default", maintenance, nil) if err != nil { return nil, fmt.Errorf("failed to create maintenance configuration: %w", err) } diff --git a/e2e/validators.go b/e2e/validators.go index 8bbfbd9813e..8555274a502 100644 --- a/e2e/validators.go +++ b/e2e/validators.go @@ -414,7 +414,7 @@ func ValidateNonEmptyDirectory(ctx context.Context, s *Scenario, dirName string) func ValidateEmptyDirectory(ctx context.Context, s *Scenario, dirName string) { s.T.Helper() - command := fmt.Sprintf("[ -d %s ] && [ -z \"$(ls -A %s)\" ]", dirName, dirName) + command := fmt.Sprintf("! 
[ -d '%s' ] || [ -z \"$(ls -A '%s')\" ]", dirName, dirName) execScriptOnVMForScenarioValidateExitCode(ctx, s, command, 0, fmt.Sprintf("expected directory %s to be empty or not exist", dirName)) } diff --git a/parts/windows/kuberneteswindowssetup.ps1 b/parts/windows/kuberneteswindowssetup.ps1 index 80d8e0ecdea..8169a281973 100644 --- a/parts/windows/kuberneteswindowssetup.ps1 +++ b/parts/windows/kuberneteswindowssetup.ps1 @@ -444,7 +444,7 @@ function BasePrep { {{end}} - Get-CACertificates -Location $Location + Get-CACertificates -Location $Location -FailOnError Write-CACert -CACertificate $global:CACertificate ` -KubeDir $global:KubeDir diff --git a/staging/cse/windows/kubernetesfunc.ps1 b/staging/cse/windows/kubernetesfunc.ps1 index 56df5977e87..159161153f0 100644 --- a/staging/cse/windows/kubernetesfunc.ps1 +++ b/staging/cse/windows/kubernetesfunc.ps1 @@ -320,7 +320,9 @@ function Should-InstallCACertificatesRefreshTask { function Get-CACertificates { Param( [Parameter(Mandatory = $false)][string] - $Location = "" + $Location = "", + [Parameter(Mandatory = $false)][switch] + $FailOnError ) $caFolder = "C:\ca" @@ -342,6 +344,9 @@ function Get-CACertificates { $rawData = Retry-Command -Command 'Invoke-WebRequest' -Args @{Uri=$uri; UseBasicParsing=$true} -Retries 5 -RetryDelaySeconds 10 $caCerts = ($rawData.Content) | ConvertFrom-Json if ($null -eq $caCerts -or $null -eq $caCerts.Certificates -or $caCerts.Certificates.Length -eq 0) { + if ($FailOnError) { + throw "CA certificates rawdata is empty for legacy endpoint" + } Write-Log "Warning: CA certificates rawdata is empty for legacy endpoint" return $false } @@ -406,6 +411,9 @@ function Get-CACertificates { return $downloadedAny } catch { + if ($FailOnError) { + throw "Failed to retrieve CA certificates. Error: $_" + } Write-Log "Warning: failed to retrieve CA certificates. 
Error: $_" return $false } diff --git a/staging/cse/windows/kubernetesfunc.tests.ps1 b/staging/cse/windows/kubernetesfunc.tests.ps1 index 8ada13ee440..42accc39c51 100644 --- a/staging/cse/windows/kubernetesfunc.tests.ps1 +++ b/staging/cse/windows/kubernetesfunc.tests.ps1 @@ -188,6 +188,24 @@ Describe 'Get-CACertificates' { $result | Should -Be $false } + It 'throws when certificate retrieval fails with -FailOnError' { + Mock Retry-Command -MockWith { + throw 'simulated retrieval failure' + } + + { Get-CACertificates -Location 'southcentralus' -FailOnError } | Should -Throw '*Failed to retrieve CA certificates*' + } + + It 'throws when legacy endpoint returns empty data with -FailOnError' { + Mock Retry-Command -MockWith { + return [PSCustomObject]@{ + Content = '{"Certificates":[]}' + } + } + + { Get-CACertificates -Location 'ussecwest' -FailOnError } | Should -Throw '*CA certificates rawdata is empty*' + } + It 'falls back to legacy endpoint when called without -Location (backward compat)' { $script:retryUris = @() Mock Retry-Command -MockWith { From 4d8af283f8bb34b3a79dd904223102b85eb22944 Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Tue, 14 Apr 2026 15:09:03 -0700 Subject: [PATCH 18/26] Add Windows not-opted-in negative test for RCV1P cert mode Add Test_RCV1P_Windows_NotOptedIn which creates a Windows VM in the RCV1P subscription without the opt-in tag and validates that: - C:\ca is empty or does not exist (no certificates downloaded) - aks-ca-certs-refresh-task scheduled task is not registered This mirrors the existing Linux Test_RCV1P_NotOptedIn test to ensure the two-layer access control (subscription feature + VM tag) works on Windows. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- e2e/scenario_rcv1p_win_test.go | 25 +++++++++++++++++++++++++ e2e/validators.go | 27 +++++++++++++++++++++++++++ 2 files changed, 52 insertions(+) diff --git a/e2e/scenario_rcv1p_win_test.go b/e2e/scenario_rcv1p_win_test.go index 55a35b584bb..73c3851671d 100644 --- a/e2e/scenario_rcv1p_win_test.go +++ b/e2e/scenario_rcv1p_win_test.go @@ -89,3 +89,28 @@ func Test_RCV1P_Windows2025(t *testing.T) { }, }) } + +// Test_RCV1P_Windows_NotOptedIn is a negative test that validates the VM opt-in tag is required +// for cert installation on Windows. The VM is created in the RCV1P subscription (which has +// PlatformSettingsOverride registered) but WITHOUT the opt-in tag on the VMSS. +// This verifies that wireserver returns IsOptedInForRootCerts=false and the provisioning +// script correctly skips certificate download and refresh task registration. +func Test_RCV1P_Windows_NotOptedIn(t *testing.T) { + skipIfRCV1PNotConfigured(t) + RunScenario(t, &Scenario{ + Description: "Tests RCV1P cert mode on Windows without VM opt-in tag; expects no cert installation", + AzureClient: config.RCV1PAzure, + SubscriptionID: config.Config.RCV1PSubscriptionID, + Tags: Tags{ + RCV1PCertMode: true, + }, + Config: Config{ + Cluster: ClusterRCV1PKubenet, + VHD: config.VHDWindows2022Containerd, + BootstrapConfigMutator: EmptyBootstrapConfigMutator, + Validator: func(ctx context.Context, s *Scenario) { + ValidateRCV1PNotOptedInWindows(ctx, s) + }, + }, + }) +} diff --git a/e2e/validators.go b/e2e/validators.go index 8555274a502..c975c46d057 100644 --- a/e2e/validators.go +++ b/e2e/validators.go @@ -2400,3 +2400,30 @@ func ValidateRCV1PNotOptedIn(ctx context.Context, s *Scenario) { "sudo crontab -l 2>/dev/null | grep -q ca-refresh", 1, "expected no ca-refresh cron entry when not opted in") } + +// ValidateRCV1PNotOptedInWindows validates that when the Windows VM does NOT have the opt-in tag, +// no certificates are installed to 
C:\ca and no refresh scheduled task is registered, +// even in the RCV1P subscription with PlatformSettingsOverride registered. +func ValidateRCV1PNotOptedInWindows(ctx context.Context, s *Scenario) { + s.T.Helper() + + // Validate C:\ca is empty or does not exist + command := []string{ + "$ErrorActionPreference = 'Stop'", + "$caFolder = 'C:\\ca'", + "if ((Test-Path $caFolder) -and @(Get-ChildItem -Path $caFolder -File).Count -gt 0) { throw 'Expected C:\\ca to be empty or not exist, but found certificates' }", + "Write-Host 'C:\\ca is empty or does not exist as expected'", + } + execScriptOnVMForScenarioValidateExitCode(ctx, s, strings.Join(command, "\n"), 0, + "expected C:\\ca to be empty or not exist when not opted in") + + // Validate no refresh scheduled task was registered + command = []string{ + "$ErrorActionPreference = 'Stop'", + "$task = Get-ScheduledTask -TaskName 'aks-ca-certs-refresh-task' -ErrorAction SilentlyContinue", + "if ($task) { throw 'Expected no aks-ca-certs-refresh-task but found one' }", + "Write-Host 'No aks-ca-certs-refresh-task found as expected'", + } + execScriptOnVMForScenarioValidateExitCode(ctx, s, strings.Join(command, "\n"), 0, + "expected no aks-ca-certs-refresh-task scheduled task when not opted in") +} From c1264485a8b5ec4558f4da891d81e6e9e1af1d15 Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Wed, 15 Apr 2026 17:32:31 -0700 Subject: [PATCH 19/26] e2e: add VM instance-level tag update for RCV1P wireserver opt-in Wireserver checks tags on the individual VMSS VM instance, not the VMSS resource-level tags. Add VMInstanceTags field to Config and update the VM instance after it appears in the API but before CSE completes. This ensures wireserver sees the opt-in tag when init-aks-custom-cloud.sh queries IsOptedInForRootCerts during provisioning. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- e2e/scenario_rcv1p_test.go | 30 ++++++++++++++++++++++-------- e2e/scenario_rcv1p_win_test.go | 3 +++ e2e/types.go | 6 ++++++ e2e/vmss.go | 29 +++++++++++++++++++++++++++++ 4 files changed, 60 insertions(+), 8 deletions(-) diff --git a/e2e/scenario_rcv1p_test.go b/e2e/scenario_rcv1p_test.go index 817b63cba8e..0bb927798ae 100644 --- a/e2e/scenario_rcv1p_test.go +++ b/e2e/scenario_rcv1p_test.go @@ -44,8 +44,9 @@ func skipIfRCV1PNotConfigured(t *testing.T) { } } -// rcv1pOptInVMConfigMutator sets the platform opt-in tag on the VMSS so that wireserver -// will serve root certificates to this VM during provisioning. +// rcv1pOptInVMConfigMutator sets the platform opt-in tag on the VMSS resource level. +// Note: For wireserver to recognize the tag, it must also be set on the individual VM instance. +// Use VMInstanceTags in the Config to set instance-level tags (applied after VM creation). func rcv1pOptInVMConfigMutator(vmss *armcompute.VirtualMachineScaleSet) { if vmss.Tags == nil { vmss.Tags = map[string]*string{} @@ -53,6 +54,14 @@ func rcv1pOptInVMConfigMutator(vmss *armcompute.VirtualMachineScaleSet) { vmss.Tags[rcv1pOptInTag] = to.Ptr("true") } +// rcv1pVMInstanceTags returns the tags that must be set on individual VM instances +// for wireserver to serve root certificates. +func rcv1pVMInstanceTags() map[string]*string { + return map[string]*string{ + rcv1pOptInTag: to.Ptr("true"), + } +} + // Test_RCV1P_Ubuntu2204 validates RCV1P cert download and trust store installation on Ubuntu 22.04. // Ubuntu uses /usr/local/share/ca-certificates/ as the cert drop folder and update-ca-certificates // to rebuild the trust bundle. 
@@ -66,9 +75,10 @@ func Test_RCV1P_Ubuntu2204(t *testing.T) { RCV1PCertMode: true, }, Config: Config{ - Cluster: ClusterRCV1PKubenet, - VHD: config.VHDUbuntu2204Gen2Containerd, + Cluster: ClusterRCV1PKubenet, + VHD: config.VHDUbuntu2204Gen2Containerd, VMConfigMutator: rcv1pOptInVMConfigMutator, + VMInstanceTags: rcv1pVMInstanceTags(), BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) { }, Validator: func(ctx context.Context, s *Scenario) { @@ -91,9 +101,10 @@ func Test_RCV1P_Ubuntu2404(t *testing.T) { RCV1PCertMode: true, }, Config: Config{ - Cluster: ClusterRCV1PKubenet, - VHD: config.VHDUbuntu2404Gen2Containerd, + Cluster: ClusterRCV1PKubenet, + VHD: config.VHDUbuntu2404Gen2Containerd, VMConfigMutator: rcv1pOptInVMConfigMutator, + VMInstanceTags: rcv1pVMInstanceTags(), BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) { }, Validator: func(ctx context.Context, s *Scenario) { @@ -116,9 +127,10 @@ func Test_RCV1P_AzureLinuxV3(t *testing.T) { RCV1PCertMode: true, }, Config: Config{ - Cluster: ClusterRCV1PKubenet, - VHD: config.VHDAzureLinuxV3Gen2, + Cluster: ClusterRCV1PKubenet, + VHD: config.VHDAzureLinuxV3Gen2, VMConfigMutator: rcv1pOptInVMConfigMutator, + VMInstanceTags: rcv1pVMInstanceTags(), BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) { }, Validator: func(ctx context.Context, s *Scenario) { @@ -144,6 +156,7 @@ func Test_RCV1P_Flatcar(t *testing.T) { Cluster: ClusterRCV1PKubenet, VHD: config.VHDFlatcarGen2, VMConfigMutator: rcv1pOptInVMConfigMutator, + VMInstanceTags: rcv1pVMInstanceTags(), BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) { }, Validator: func(ctx context.Context, s *Scenario) { @@ -172,6 +185,7 @@ func Test_RCV1P_ACL(t *testing.T) { vmss.Properties = addTrustedLaunchToVMSS(vmss.Properties) rcv1pOptInVMConfigMutator(vmss) }, + VMInstanceTags: rcv1pVMInstanceTags(), BootstrapConfigMutator: func(nbc 
*datamodel.NodeBootstrappingConfiguration) { }, Validator: func(ctx context.Context, s *Scenario) { diff --git a/e2e/scenario_rcv1p_win_test.go b/e2e/scenario_rcv1p_win_test.go index 73c3851671d..0932ae5f97b 100644 --- a/e2e/scenario_rcv1p_win_test.go +++ b/e2e/scenario_rcv1p_win_test.go @@ -32,6 +32,7 @@ func Test_RCV1P_Windows2022(t *testing.T) { Cluster: ClusterRCV1PKubenet, VHD: config.VHDWindows2022Containerd, VMConfigMutator: rcv1pOptInVMConfigMutator, + VMInstanceTags: rcv1pVMInstanceTags(), BootstrapConfigMutator: EmptyBootstrapConfigMutator, Validator: func(ctx context.Context, s *Scenario) { ValidateRCV1PCertModeWindows(ctx, s) @@ -54,6 +55,7 @@ func Test_RCV1P_Windows23H2(t *testing.T) { Cluster: ClusterRCV1PKubenet, VHD: config.VHDWindows23H2, VMConfigMutator: rcv1pOptInVMConfigMutator, + VMInstanceTags: rcv1pVMInstanceTags(), BootstrapConfigMutator: EmptyBootstrapConfigMutator, Validator: func(ctx context.Context, s *Scenario) { ValidateRCV1PCertModeWindows(ctx, s) @@ -80,6 +82,7 @@ func Test_RCV1P_Windows2025(t *testing.T) { vmss.Properties = addTrustedLaunchToVMSS(vmss.Properties) rcv1pOptInVMConfigMutator(vmss) }, + VMInstanceTags: rcv1pVMInstanceTags(), BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) { Windows2025BootstrapConfigMutator(t, nbc) }, diff --git a/e2e/types.go b/e2e/types.go index c2356f903e7..666ef4715eb 100644 --- a/e2e/types.go +++ b/e2e/types.go @@ -253,6 +253,12 @@ type Config struct { // This prevents the Guest Agent from sweeping events before they can be read. // Only set this on CSE performance test scenarios. EagerCSETimingExtraction bool + + // VMInstanceTags are tags applied directly to VMSS VM instances after creation via BeginUpdate. + // This is needed for features like RCV1P where wireserver checks tags on the individual VM instance, + // not the VMSS resource-level tags. 
These tags are applied after the VM appears in the API but + // before CSE completes, giving wireserver time to see them before the provisioning scripts query it. + VMInstanceTags map[string]*string } func (s *Scenario) PrepareAKSNodeConfig() { diff --git a/e2e/vmss.go b/e2e/vmss.go index d7a35696735..cbf094f9d93 100644 --- a/e2e/vmss.go +++ b/e2e/vmss.go @@ -492,6 +492,12 @@ func CreateVMSS(ctx context.Context, s *Scenario, resourceGroupName string) (*Sc return vm, fmt.Errorf("failed to wait for VMSS VM: %w", err) } + if len(s.Config.VMInstanceTags) > 0 { + if err := updateVMInstanceTags(ctx, s, resourceGroupName, s.Runtime.VMSSName, *vm.VM.InstanceID, s.Config.VMInstanceTags); err != nil { + return vm, fmt.Errorf("failed to update VM instance tags: %w", err) + } + } + vm.PrivateIP, err = getPrivateIPFromVMSSVM(ctx, s, resourceGroupName, s.Runtime.VMSSName, *vm.VM.InstanceID) if err != nil { return vm, fmt.Errorf("failed to get VM private IP address: %w", err) @@ -538,6 +544,29 @@ func CreateVMSS(ctx context.Context, s *Scenario, resourceGroupName string) (*Sc }, nil } +// updateVMInstanceTags updates tags on an individual VMSS VM instance. This is used for features +// like RCV1P where wireserver checks tags on the VM instance level, not the VMSS resource level. +// The update is done after the VM appears in the API but before CSE completes, ensuring the tags +// are visible to wireserver before provisioning scripts query it. 
+func updateVMInstanceTags(ctx context.Context, s *Scenario, resourceGroupName, vmssName, instanceID string, tags map[string]*string) error { + defer toolkit.LogStepCtxf(ctx, "updating VM instance %s/%s tags", vmssName, instanceID)() + + poller, err := s.GetAzure().VMSSVM.BeginUpdate(ctx, resourceGroupName, vmssName, instanceID, + armcompute.VirtualMachineScaleSetVM{ + Tags: tags, + }, nil) + if err != nil { + return fmt.Errorf("failed to begin VM instance tag update: %w", err) + } + + _, err = poller.PollUntilDone(ctx, config.DefaultPollUntilDoneOptions) + if err != nil { + return fmt.Errorf("failed to complete VM instance tag update: %w", err) + } + + return nil +} + // waitForVMRunningState polls until the VM reaches "Running" power state or the timeout elapses. func waitForVMRunningState(ctx context.Context, s *Scenario, vmssVM *armcompute.VirtualMachineScaleSetVM) error { ctxTimeout, cancel := context.WithTimeout(ctx, 3*time.Minute) From f2fe7cc9ba5e32193192bec41decb43f62d9c271 Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Wed, 15 Apr 2026 21:21:17 -0700 Subject: [PATCH 20/26] e2e: use JSON injection for VM profile tags at VMSS creation time The previous approach of updating VM instance tags after creation had a race condition: the BeginUpdate took ~108s, but CSE ran init-aks-custom-cloud.sh and queried wireserver before the tag update completed. Now we marshal the VMSS model to JSON, inject tags into virtualMachineProfile, and send a raw ARM PUT request via the SDK pipeline. This ensures the tags are present at VMSS creation time and propagate to VM instances before CSE boots. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- e2e/vmss.go | 147 ++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 125 insertions(+), 22 deletions(-) diff --git a/e2e/vmss.go b/e2e/vmss.go index cbf094f9d93..3194c364fe5 100644 --- a/e2e/vmss.go +++ b/e2e/vmss.go @@ -24,6 +24,8 @@ import ( "github.com/Azure/agentbaker/pkg/agent" "github.com/Azure/agentbaker/pkg/agent/datamodel" "github.com/Azure/azure-sdk-for-go/sdk/azcore" + azruntime "github.com/Azure/azure-sdk-for-go/sdk/azcore/runtime" + "github.com/Azure/azure-sdk-for-go/sdk/azcore/streaming" "github.com/Azure/azure-sdk-for-go/sdk/azcore/to" "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute/v7" "github.com/stretchr/testify/require" @@ -475,11 +477,21 @@ func CreateVMSSWithRetry(ctx context.Context, s *Scenario) (*ScenarioVM, error) func CreateVMSS(ctx context.Context, s *Scenario, resourceGroupName string) (*ScenarioVM, error) { defer toolkit.LogStepCtxf(ctx, "creating VMSS %s", s.Runtime.VMSSName)() vm := &ScenarioVM{} + + vmssModel := createVMSSModel(ctx, s) + + // When VMInstanceTags are configured, we need to inject tags into + // virtualMachineProfile which the Go SDK doesn't expose for Uniform mode VMSS. + // We marshal the model to JSON, inject the tags, and send a raw ARM PUT request. 
+ if len(s.Config.VMInstanceTags) > 0 { + return createVMSSWithProfileTags(ctx, s, resourceGroupName, vmssModel, vm) + } + operation, err := s.GetAzure().VMSS.BeginCreateOrUpdate( ctx, resourceGroupName, s.Runtime.VMSSName, - createVMSSModel(ctx, s), + vmssModel, nil, ) if err != nil { @@ -492,12 +504,6 @@ func CreateVMSS(ctx context.Context, s *Scenario, resourceGroupName string) (*Sc return vm, fmt.Errorf("failed to wait for VMSS VM: %w", err) } - if len(s.Config.VMInstanceTags) > 0 { - if err := updateVMInstanceTags(ctx, s, resourceGroupName, s.Runtime.VMSSName, *vm.VM.InstanceID, s.Config.VMInstanceTags); err != nil { - return vm, fmt.Errorf("failed to update VM instance tags: %w", err) - } - } - vm.PrivateIP, err = getPrivateIPFromVMSSVM(ctx, s, resourceGroupName, s.Runtime.VMSSName, *vm.VM.InstanceID) if err != nil { return vm, fmt.Errorf("failed to get VM private IP address: %w", err) @@ -544,27 +550,124 @@ func CreateVMSS(ctx context.Context, s *Scenario, resourceGroupName string) (*Sc }, nil } -// updateVMInstanceTags updates tags on an individual VMSS VM instance. This is used for features -// like RCV1P where wireserver checks tags on the VM instance level, not the VMSS resource level. -// The update is done after the VM appears in the API but before CSE completes, ensuring the tags -// are visible to wireserver before provisioning scripts query it. -func updateVMInstanceTags(ctx context.Context, s *Scenario, resourceGroupName, vmssName, instanceID string, tags map[string]*string) error { - defer toolkit.LogStepCtxf(ctx, "updating VM instance %s/%s tags", vmssName, instanceID)() - - poller, err := s.GetAzure().VMSSVM.BeginUpdate(ctx, resourceGroupName, vmssName, instanceID, - armcompute.VirtualMachineScaleSetVM{ - Tags: tags, - }, nil) +// createVMSSWithProfileTags creates a VMSS using a raw ARM PUT request, injecting tags into +// virtualMachineProfile that the Go SDK doesn't expose for Uniform mode VMSS. 
This is needed +// for features like RCV1P where wireserver checks VM instance-level tags: the tags must be +// present at VMSS creation time so they propagate to VM instances before CSE runs. +func createVMSSWithProfileTags(ctx context.Context, s *Scenario, resourceGroupName string, vmssModel armcompute.VirtualMachineScaleSet, vm *ScenarioVM) (*ScenarioVM, error) { + defer toolkit.LogStepCtxf(ctx, "creating VMSS %s with VM profile tags", s.Runtime.VMSSName)() + + // Marshal the typed model to a generic map so we can inject virtualMachineProfile.tags + vmssJSON, err := json.Marshal(vmssModel) if err != nil { - return fmt.Errorf("failed to begin VM instance tag update: %w", err) + return vm, fmt.Errorf("failed to marshal VMSS model: %w", err) } - _, err = poller.PollUntilDone(ctx, config.DefaultPollUntilDoneOptions) + var vmssMap map[string]interface{} + if err := json.Unmarshal(vmssJSON, &vmssMap); err != nil { + return vm, fmt.Errorf("failed to unmarshal VMSS model to map: %w", err) + } + + // Inject tags into properties.virtualMachineProfile + props, ok := vmssMap["properties"].(map[string]interface{}) + if !ok { + return vm, fmt.Errorf("VMSS model missing 'properties' field") + } + vmProfile, ok := props["virtualMachineProfile"].(map[string]interface{}) + if !ok { + return vm, fmt.Errorf("VMSS model missing 'properties.virtualMachineProfile' field") + } + vmProfile["tags"] = s.Config.VMInstanceTags + s.T.Logf("injected VM profile tags: %v", s.Config.VMInstanceTags) + + // Re-marshal the modified model + modifiedBody, err := json.Marshal(vmssMap) if err != nil { - return fmt.Errorf("failed to complete VM instance tag update: %w", err) + return vm, fmt.Errorf("failed to marshal modified VMSS model: %w", err) } - return nil + // Build the ARM resource URL + subscriptionID := s.SubscriptionID + if subscriptionID == "" { + subscriptionID = config.Config.SubscriptionID + } + resourceURL := 
fmt.Sprintf("https://management.azure.com/subscriptions/%s/resourceGroups/%s/providers/Microsoft.Compute/virtualMachineScaleSets/%s?api-version=2025-04-01", + subscriptionID, resourceGroupName, s.Runtime.VMSSName) + + // Send raw PUT request via the SDK pipeline (includes auth, retry, logging) + req, err := azruntime.NewRequest(ctx, "PUT", resourceURL) + if err != nil { + return vm, fmt.Errorf("failed to create ARM request: %w", err) + } + req.Raw().Header.Set("Content-Type", "application/json") + if err := req.SetBody(streaming.NopCloser(bytes.NewReader(modifiedBody)), "application/json"); err != nil { + return vm, fmt.Errorf("failed to set request body: %w", err) + } + + resp, err := s.GetAzure().Core.Pipeline().Do(req) + if err != nil { + return vm, fmt.Errorf("failed to send VMSS creation request: %w", err) + } + if resp.StatusCode != 200 && resp.StatusCode != 201 { + body, _ := io.ReadAll(resp.Body) + resp.Body.Close() + return vm, fmt.Errorf("VMSS creation failed with status %d: %s", resp.StatusCode, string(body)) + } + + // Create a poller for the async operation + poller, err := azruntime.NewPoller[armcompute.VirtualMachineScaleSetsClientCreateOrUpdateResponse](resp, s.GetAzure().Core.Pipeline(), nil) + if err != nil { + return vm, fmt.Errorf("failed to create VMSS creation poller: %w", err) + } + + // Wait for VMSS VM to appear before extracting the private IP + vm.VM, err = waitForVMSSVM(ctx, s) + if err != nil { + return vm, fmt.Errorf("failed to wait for VMSS VM: %w", err) + } + + vm.PrivateIP, err = getPrivateIPFromVMSSVM(ctx, s, resourceGroupName, s.Runtime.VMSSName, *vm.VM.InstanceID) + if err != nil { + return vm, fmt.Errorf("failed to get VM private IP address: %w", err) + } + + s.T.Cleanup(func() { + defer cleanupBastionTunnel(vm.SSHClient) + cleanupVMSS(ctx, s, vm) + }) + + result := "SSH Instructions: (may take a few minutes for the VM to be ready for SSH)\n========================\n" + if config.Config.KeepVMSS { + s.T.Logf("VM will be 
preserved after the test finishes, PLEASE MANUALLY DELETE THE VMSS. Set KEEP_VMSS=false to delete it automatically after the test finishes\n") + } else { + s.T.Logf("VM will be automatically deleted after the test finishes, to preserve it for debugging purposes set KEEP_VMSS=true or pause the test with a breakpoint before the test finishes or failed\n") + } + result += fmt.Sprintf(`az network bastion ssh --target-resource-id "%s" --name "%s-bastion" --resource-group %s --auth-type ssh-key --username azureuser --ssh-key %s`, *vm.VM.ID, *s.Runtime.Cluster.Model.Name, *s.Runtime.Cluster.Model.Properties.NodeResourceGroup, config.VMSSHPrivateKeyFileName) + "\n" + s.T.Log(result) + + vmssResp, err := poller.PollUntilDone(ctx, config.DefaultPollUntilDoneOptions) + if !s.Config.SkipSSHConnectivityValidation { + var bastErr error + vm.SSHClient, bastErr = DialSSHOverBastion(ctx, s.Runtime.Cluster.Bastion, vm.PrivateIP, config.VMSSHPrivateKey) + if bastErr != nil { + return vm, fmt.Errorf("failed to start bastion tunnel: %w", bastErr) + } + } + if err != nil { + return vm, err + } + + err = waitForVMRunningState(ctx, s, vm.VM) + if err != nil { + return vm, fmt.Errorf("failed to wait for VM to reach running state: %w", err) + } + + return &ScenarioVM{ + VMSS: &vmssResp.VirtualMachineScaleSet, + PrivateIP: vm.PrivateIP, + VM: vm.VM, + SSHClient: vm.SSHClient, + }, nil } // waitForVMRunningState polls until the VM reaches "Running" power state or the timeout elapses. From 069911cd537c10648df40920894f3ab489732a62 Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Thu, 16 Apr 2026 00:18:21 -0700 Subject: [PATCH 21/26] e2e: use lightweight PATCH for VM instance tags instead of JSON injection The ARM API does not support virtualMachineProfile.tags for Uniform mode VMSS (400 BadRequest). Instead, use a lightweight PATCH request to update tags on the VM instance after it appears. 
PATCH only modifies the tags property and should complete in seconds, unlike BeginUpdate which triggers a full model update (~108s). Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- e2e/vmss.go | 149 +++++++++++++++------------------------------------- 1 file changed, 43 insertions(+), 106 deletions(-) diff --git a/e2e/vmss.go b/e2e/vmss.go index 3194c364fe5..c24fee9bee8 100644 --- a/e2e/vmss.go +++ b/e2e/vmss.go @@ -478,20 +478,11 @@ func CreateVMSS(ctx context.Context, s *Scenario, resourceGroupName string) (*Sc defer toolkit.LogStepCtxf(ctx, "creating VMSS %s", s.Runtime.VMSSName)() vm := &ScenarioVM{} - vmssModel := createVMSSModel(ctx, s) - - // When VMInstanceTags are configured, we need to inject tags into - // virtualMachineProfile which the Go SDK doesn't expose for Uniform mode VMSS. - // We marshal the model to JSON, inject the tags, and send a raw ARM PUT request. - if len(s.Config.VMInstanceTags) > 0 { - return createVMSSWithProfileTags(ctx, s, resourceGroupName, vmssModel, vm) - } - operation, err := s.GetAzure().VMSS.BeginCreateOrUpdate( ctx, resourceGroupName, s.Runtime.VMSSName, - vmssModel, + createVMSSModel(ctx, s), nil, ) if err != nil { @@ -504,6 +495,15 @@ func CreateVMSS(ctx context.Context, s *Scenario, resourceGroupName string) (*Sc return vm, fmt.Errorf("failed to wait for VMSS VM: %w", err) } + // Apply VM instance tags via lightweight PATCH before CSE queries wireserver. + // This is needed for features like RCV1P where wireserver checks tags on the + // individual VM instance, not the VMSS resource-level tags. 
+ if len(s.Config.VMInstanceTags) > 0 { + if err := patchVMInstanceTags(ctx, s, resourceGroupName, s.Runtime.VMSSName, *vm.VM.InstanceID, s.Config.VMInstanceTags); err != nil { + return vm, fmt.Errorf("failed to patch VM instance tags: %w", err) + } + } + vm.PrivateIP, err = getPrivateIPFromVMSSVM(ctx, s, resourceGroupName, s.Runtime.VMSSName, *vm.VM.InstanceID) if err != nil { return vm, fmt.Errorf("failed to get VM private IP address: %w", err) @@ -550,124 +550,61 @@ func CreateVMSS(ctx context.Context, s *Scenario, resourceGroupName string) (*Sc }, nil } -// createVMSSWithProfileTags creates a VMSS using a raw ARM PUT request, injecting tags into -// virtualMachineProfile that the Go SDK doesn't expose for Uniform mode VMSS. This is needed -// for features like RCV1P where wireserver checks VM instance-level tags: the tags must be -// present at VMSS creation time so they propagate to VM instances before CSE runs. -func createVMSSWithProfileTags(ctx context.Context, s *Scenario, resourceGroupName string, vmssModel armcompute.VirtualMachineScaleSet, vm *ScenarioVM) (*ScenarioVM, error) { - defer toolkit.LogStepCtxf(ctx, "creating VMSS %s with VM profile tags", s.Runtime.VMSSName)() - - // Marshal the typed model to a generic map so we can inject virtualMachineProfile.tags - vmssJSON, err := json.Marshal(vmssModel) - if err != nil { - return vm, fmt.Errorf("failed to marshal VMSS model: %w", err) - } +// patchVMInstanceTags sends a lightweight PATCH request to update tags on a VMSS VM instance. +// This is much faster than BeginUpdate (which triggers a full model update) because it only +// modifies the tags property. The PATCH typically completes in seconds rather than minutes. 
+func patchVMInstanceTags(ctx context.Context, s *Scenario, resourceGroupName, vmssName, instanceID string, tags map[string]*string) error { + defer toolkit.LogStepCtxf(ctx, "patching VM instance %s/%s tags", vmssName, instanceID)() - var vmssMap map[string]interface{} - if err := json.Unmarshal(vmssJSON, &vmssMap); err != nil { - return vm, fmt.Errorf("failed to unmarshal VMSS model to map: %w", err) - } - - // Inject tags into properties.virtualMachineProfile - props, ok := vmssMap["properties"].(map[string]interface{}) - if !ok { - return vm, fmt.Errorf("VMSS model missing 'properties' field") - } - vmProfile, ok := props["virtualMachineProfile"].(map[string]interface{}) - if !ok { - return vm, fmt.Errorf("VMSS model missing 'properties.virtualMachineProfile' field") - } - vmProfile["tags"] = s.Config.VMInstanceTags - s.T.Logf("injected VM profile tags: %v", s.Config.VMInstanceTags) - - // Re-marshal the modified model - modifiedBody, err := json.Marshal(vmssMap) - if err != nil { - return vm, fmt.Errorf("failed to marshal modified VMSS model: %w", err) - } - - // Build the ARM resource URL subscriptionID := s.SubscriptionID if subscriptionID == "" { subscriptionID = config.Config.SubscriptionID } - resourceURL := fmt.Sprintf("https://management.azure.com/subscriptions/%s/resourceGroups/%s/providers/Microsoft.Compute/virtualMachineScaleSets/%s?api-version=2025-04-01", - subscriptionID, resourceGroupName, s.Runtime.VMSSName) - // Send raw PUT request via the SDK pipeline (includes auth, retry, logging) - req, err := azruntime.NewRequest(ctx, "PUT", resourceURL) - if err != nil { - return vm, fmt.Errorf("failed to create ARM request: %w", err) - } - req.Raw().Header.Set("Content-Type", "application/json") - if err := req.SetBody(streaming.NopCloser(bytes.NewReader(modifiedBody)), "application/json"); err != nil { - return vm, fmt.Errorf("failed to set request body: %w", err) - } + resourceURL := 
fmt.Sprintf("https://management.azure.com/subscriptions/%s/resourceGroups/%s/providers/Microsoft.Compute/virtualMachineScaleSets/%s/virtualMachines/%s?api-version=2025-04-01", + subscriptionID, resourceGroupName, vmssName, instanceID) - resp, err := s.GetAzure().Core.Pipeline().Do(req) - if err != nil { - return vm, fmt.Errorf("failed to send VMSS creation request: %w", err) - } - if resp.StatusCode != 200 && resp.StatusCode != 201 { - body, _ := io.ReadAll(resp.Body) - resp.Body.Close() - return vm, fmt.Errorf("VMSS creation failed with status %d: %s", resp.StatusCode, string(body)) - } + body := struct { + Tags map[string]*string `json:"tags"` + }{Tags: tags} - // Create a poller for the async operation - poller, err := azruntime.NewPoller[armcompute.VirtualMachineScaleSetsClientCreateOrUpdateResponse](resp, s.GetAzure().Core.Pipeline(), nil) + bodyJSON, err := json.Marshal(body) if err != nil { - return vm, fmt.Errorf("failed to create VMSS creation poller: %w", err) + return fmt.Errorf("failed to marshal tag patch body: %w", err) } - // Wait for VMSS VM to appear before extracting the private IP - vm.VM, err = waitForVMSSVM(ctx, s) + req, err := azruntime.NewRequest(ctx, "PATCH", resourceURL) if err != nil { - return vm, fmt.Errorf("failed to wait for VMSS VM: %w", err) + return fmt.Errorf("failed to create PATCH request: %w", err) + } + if err := req.SetBody(streaming.NopCloser(bytes.NewReader(bodyJSON)), "application/json"); err != nil { + return fmt.Errorf("failed to set request body: %w", err) } - vm.PrivateIP, err = getPrivateIPFromVMSSVM(ctx, s, resourceGroupName, s.Runtime.VMSSName, *vm.VM.InstanceID) + resp, err := s.GetAzure().Core.Pipeline().Do(req) if err != nil { - return vm, fmt.Errorf("failed to get VM private IP address: %w", err) + return fmt.Errorf("failed to send PATCH request: %w", err) } - s.T.Cleanup(func() { - defer cleanupBastionTunnel(vm.SSHClient) - cleanupVMSS(ctx, s, vm) - }) - - result := "SSH Instructions: (may take a few minutes 
for the VM to be ready for SSH)\n========================\n" - if config.Config.KeepVMSS { - s.T.Logf("VM will be preserved after the test finishes, PLEASE MANUALLY DELETE THE VMSS. Set KEEP_VMSS=false to delete it automatically after the test finishes\n") - } else { - s.T.Logf("VM will be automatically deleted after the test finishes, to preserve it for debugging purposes set KEEP_VMSS=true or pause the test with a breakpoint before the test finishes or failed\n") + if resp.StatusCode != 200 && resp.StatusCode != 202 { + respBody, _ := io.ReadAll(resp.Body) + resp.Body.Close() + return fmt.Errorf("PATCH VM instance tags failed with status %d: %s", resp.StatusCode, string(respBody)) } - result += fmt.Sprintf(`az network bastion ssh --target-resource-id "%s" --name "%s-bastion" --resource-group %s --auth-type ssh-key --username azureuser --ssh-key %s`, *vm.VM.ID, *s.Runtime.Cluster.Model.Name, *s.Runtime.Cluster.Model.Properties.NodeResourceGroup, config.VMSSHPrivateKeyFileName) + "\n" - s.T.Log(result) - vmssResp, err := poller.PollUntilDone(ctx, config.DefaultPollUntilDoneOptions) - if !s.Config.SkipSSHConnectivityValidation { - var bastErr error - vm.SSHClient, bastErr = DialSSHOverBastion(ctx, s.Runtime.Cluster.Bastion, vm.PrivateIP, config.VMSSHPrivateKey) - if bastErr != nil { - return vm, fmt.Errorf("failed to start bastion tunnel: %w", bastErr) + // If 202 Accepted, poll until complete + if resp.StatusCode == 202 { + poller, err := azruntime.NewPoller[struct{}](resp, s.GetAzure().Core.Pipeline(), nil) + if err != nil { + return fmt.Errorf("failed to create poller for tag PATCH: %w", err) + } + _, err = poller.PollUntilDone(ctx, config.DefaultPollUntilDoneOptions) + if err != nil { + return fmt.Errorf("failed to complete tag PATCH: %w", err) } - } - if err != nil { - return vm, err - } - - err = waitForVMRunningState(ctx, s, vm.VM) - if err != nil { - return vm, fmt.Errorf("failed to wait for VM to reach running state: %w", err) } - return &ScenarioVM{ - 
VMSS: &vmssResp.VirtualMachineScaleSet, - PrivateIP: vm.PrivateIP, - VM: vm.VM, - SSHClient: vm.SSHClient, - }, nil + return nil } // waitForVMRunningState polls until the VM reaches "Running" power state or the timeout elapses. From 4a5f81d036be7e08c98523a5618bc075ca47ebcd Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Thu, 16 Apr 2026 10:40:31 -0700 Subject: [PATCH 22/26] Revert "e2e: use lightweight PATCH for VM instance tags instead of JSON injection" This reverts commit 03efe783c5dad08baa425e4fa43eaed022eb3dd2. --- e2e/vmss.go | 149 +++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 106 insertions(+), 43 deletions(-) diff --git a/e2e/vmss.go b/e2e/vmss.go index c24fee9bee8..3194c364fe5 100644 --- a/e2e/vmss.go +++ b/e2e/vmss.go @@ -478,11 +478,20 @@ func CreateVMSS(ctx context.Context, s *Scenario, resourceGroupName string) (*Sc defer toolkit.LogStepCtxf(ctx, "creating VMSS %s", s.Runtime.VMSSName)() vm := &ScenarioVM{} + vmssModel := createVMSSModel(ctx, s) + + // When VMInstanceTags are configured, we need to inject tags into + // virtualMachineProfile which the Go SDK doesn't expose for Uniform mode VMSS. + // We marshal the model to JSON, inject the tags, and send a raw ARM PUT request. + if len(s.Config.VMInstanceTags) > 0 { + return createVMSSWithProfileTags(ctx, s, resourceGroupName, vmssModel, vm) + } + operation, err := s.GetAzure().VMSS.BeginCreateOrUpdate( ctx, resourceGroupName, s.Runtime.VMSSName, - createVMSSModel(ctx, s), + vmssModel, nil, ) if err != nil { @@ -495,15 +504,6 @@ func CreateVMSS(ctx context.Context, s *Scenario, resourceGroupName string) (*Sc return vm, fmt.Errorf("failed to wait for VMSS VM: %w", err) } - // Apply VM instance tags via lightweight PATCH before CSE queries wireserver. - // This is needed for features like RCV1P where wireserver checks tags on the - // individual VM instance, not the VMSS resource-level tags. 
- if len(s.Config.VMInstanceTags) > 0 { - if err := patchVMInstanceTags(ctx, s, resourceGroupName, s.Runtime.VMSSName, *vm.VM.InstanceID, s.Config.VMInstanceTags); err != nil { - return vm, fmt.Errorf("failed to patch VM instance tags: %w", err) - } - } - vm.PrivateIP, err = getPrivateIPFromVMSSVM(ctx, s, resourceGroupName, s.Runtime.VMSSName, *vm.VM.InstanceID) if err != nil { return vm, fmt.Errorf("failed to get VM private IP address: %w", err) @@ -550,61 +550,124 @@ func CreateVMSS(ctx context.Context, s *Scenario, resourceGroupName string) (*Sc }, nil } -// patchVMInstanceTags sends a lightweight PATCH request to update tags on a VMSS VM instance. -// This is much faster than BeginUpdate (which triggers a full model update) because it only -// modifies the tags property. The PATCH typically completes in seconds rather than minutes. -func patchVMInstanceTags(ctx context.Context, s *Scenario, resourceGroupName, vmssName, instanceID string, tags map[string]*string) error { - defer toolkit.LogStepCtxf(ctx, "patching VM instance %s/%s tags", vmssName, instanceID)() +// createVMSSWithProfileTags creates a VMSS using a raw ARM PUT request, injecting tags into +// virtualMachineProfile that the Go SDK doesn't expose for Uniform mode VMSS. This is needed +// for features like RCV1P where wireserver checks VM instance-level tags: the tags must be +// present at VMSS creation time so they propagate to VM instances before CSE runs. 
+func createVMSSWithProfileTags(ctx context.Context, s *Scenario, resourceGroupName string, vmssModel armcompute.VirtualMachineScaleSet, vm *ScenarioVM) (*ScenarioVM, error) { + defer toolkit.LogStepCtxf(ctx, "creating VMSS %s with VM profile tags", s.Runtime.VMSSName)() + + // Marshal the typed model to a generic map so we can inject virtualMachineProfile.tags + vmssJSON, err := json.Marshal(vmssModel) + if err != nil { + return vm, fmt.Errorf("failed to marshal VMSS model: %w", err) + } + var vmssMap map[string]interface{} + if err := json.Unmarshal(vmssJSON, &vmssMap); err != nil { + return vm, fmt.Errorf("failed to unmarshal VMSS model to map: %w", err) + } + + // Inject tags into properties.virtualMachineProfile + props, ok := vmssMap["properties"].(map[string]interface{}) + if !ok { + return vm, fmt.Errorf("VMSS model missing 'properties' field") + } + vmProfile, ok := props["virtualMachineProfile"].(map[string]interface{}) + if !ok { + return vm, fmt.Errorf("VMSS model missing 'properties.virtualMachineProfile' field") + } + vmProfile["tags"] = s.Config.VMInstanceTags + s.T.Logf("injected VM profile tags: %v", s.Config.VMInstanceTags) + + // Re-marshal the modified model + modifiedBody, err := json.Marshal(vmssMap) + if err != nil { + return vm, fmt.Errorf("failed to marshal modified VMSS model: %w", err) + } + + // Build the ARM resource URL subscriptionID := s.SubscriptionID if subscriptionID == "" { subscriptionID = config.Config.SubscriptionID } + resourceURL := fmt.Sprintf("https://management.azure.com/subscriptions/%s/resourceGroups/%s/providers/Microsoft.Compute/virtualMachineScaleSets/%s?api-version=2025-04-01", + subscriptionID, resourceGroupName, s.Runtime.VMSSName) - resourceURL := fmt.Sprintf("https://management.azure.com/subscriptions/%s/resourceGroups/%s/providers/Microsoft.Compute/virtualMachineScaleSets/%s/virtualMachines/%s?api-version=2025-04-01", - subscriptionID, resourceGroupName, vmssName, instanceID) - - body := struct { - Tags 
map[string]*string `json:"tags"` - }{Tags: tags} + // Send raw PUT request via the SDK pipeline (includes auth, retry, logging) + req, err := azruntime.NewRequest(ctx, "PUT", resourceURL) + if err != nil { + return vm, fmt.Errorf("failed to create ARM request: %w", err) + } + req.Raw().Header.Set("Content-Type", "application/json") + if err := req.SetBody(streaming.NopCloser(bytes.NewReader(modifiedBody)), "application/json"); err != nil { + return vm, fmt.Errorf("failed to set request body: %w", err) + } - bodyJSON, err := json.Marshal(body) + resp, err := s.GetAzure().Core.Pipeline().Do(req) if err != nil { - return fmt.Errorf("failed to marshal tag patch body: %w", err) + return vm, fmt.Errorf("failed to send VMSS creation request: %w", err) + } + if resp.StatusCode != 200 && resp.StatusCode != 201 { + body, _ := io.ReadAll(resp.Body) + resp.Body.Close() + return vm, fmt.Errorf("VMSS creation failed with status %d: %s", resp.StatusCode, string(body)) } - req, err := azruntime.NewRequest(ctx, "PATCH", resourceURL) + // Create a poller for the async operation + poller, err := azruntime.NewPoller[armcompute.VirtualMachineScaleSetsClientCreateOrUpdateResponse](resp, s.GetAzure().Core.Pipeline(), nil) if err != nil { - return fmt.Errorf("failed to create PATCH request: %w", err) + return vm, fmt.Errorf("failed to create VMSS creation poller: %w", err) } - if err := req.SetBody(streaming.NopCloser(bytes.NewReader(bodyJSON)), "application/json"); err != nil { - return fmt.Errorf("failed to set request body: %w", err) + + // Wait for VMSS VM to appear before extracting the private IP + vm.VM, err = waitForVMSSVM(ctx, s) + if err != nil { + return vm, fmt.Errorf("failed to wait for VMSS VM: %w", err) } - resp, err := s.GetAzure().Core.Pipeline().Do(req) + vm.PrivateIP, err = getPrivateIPFromVMSSVM(ctx, s, resourceGroupName, s.Runtime.VMSSName, *vm.VM.InstanceID) if err != nil { - return fmt.Errorf("failed to send PATCH request: %w", err) + return vm, fmt.Errorf("failed 
to get VM private IP address: %w", err) } - if resp.StatusCode != 200 && resp.StatusCode != 202 { - respBody, _ := io.ReadAll(resp.Body) - resp.Body.Close() - return fmt.Errorf("PATCH VM instance tags failed with status %d: %s", resp.StatusCode, string(respBody)) + s.T.Cleanup(func() { + defer cleanupBastionTunnel(vm.SSHClient) + cleanupVMSS(ctx, s, vm) + }) + + result := "SSH Instructions: (may take a few minutes for the VM to be ready for SSH)\n========================\n" + if config.Config.KeepVMSS { + s.T.Logf("VM will be preserved after the test finishes, PLEASE MANUALLY DELETE THE VMSS. Set KEEP_VMSS=false to delete it automatically after the test finishes\n") + } else { + s.T.Logf("VM will be automatically deleted after the test finishes, to preserve it for debugging purposes set KEEP_VMSS=true or pause the test with a breakpoint before the test finishes or failed\n") } + result += fmt.Sprintf(`az network bastion ssh --target-resource-id "%s" --name "%s-bastion" --resource-group %s --auth-type ssh-key --username azureuser --ssh-key %s`, *vm.VM.ID, *s.Runtime.Cluster.Model.Name, *s.Runtime.Cluster.Model.Properties.NodeResourceGroup, config.VMSSHPrivateKeyFileName) + "\n" + s.T.Log(result) - // If 202 Accepted, poll until complete - if resp.StatusCode == 202 { - poller, err := azruntime.NewPoller[struct{}](resp, s.GetAzure().Core.Pipeline(), nil) - if err != nil { - return fmt.Errorf("failed to create poller for tag PATCH: %w", err) - } - _, err = poller.PollUntilDone(ctx, config.DefaultPollUntilDoneOptions) - if err != nil { - return fmt.Errorf("failed to complete tag PATCH: %w", err) + vmssResp, err := poller.PollUntilDone(ctx, config.DefaultPollUntilDoneOptions) + if !s.Config.SkipSSHConnectivityValidation { + var bastErr error + vm.SSHClient, bastErr = DialSSHOverBastion(ctx, s.Runtime.Cluster.Bastion, vm.PrivateIP, config.VMSSHPrivateKey) + if bastErr != nil { + return vm, fmt.Errorf("failed to start bastion tunnel: %w", bastErr) } } + if err != nil { + 
return vm, err + } - return nil + err = waitForVMRunningState(ctx, s, vm.VM) + if err != nil { + return vm, fmt.Errorf("failed to wait for VM to reach running state: %w", err) + } + + return &ScenarioVM{ + VMSS: &vmssResp.VirtualMachineScaleSet, + PrivateIP: vm.PrivateIP, + VM: vm.VM, + SSHClient: vm.SSHClient, + }, nil } // waitForVMRunningState polls until the VM reaches "Running" power state or the timeout elapses. From 2f04b0904397f5da7e0cb5845a5511fdad2a52ec Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Thu, 16 Apr 2026 11:29:58 -0700 Subject: [PATCH 23/26] e2e: use Microsoft.Resources/tags API for VM instance tag patching For Uniform mode VMSS, VM instance tags cannot be set at creation time: - The Go SDK (armcompute v7.x) does not expose a Tags field on VirtualMachineScaleSetVMProfile. - The ARM API rejects virtualMachineProfile.tags for Uniform mode VMSS with: 'Could not find member tags on object of type VirtualMachineProfile'. - PATCH on the Compute VM instance endpoint returns 405 Method Not Allowed. - BeginUpdate (PUT) works but takes ~108s for a full VM model reconciliation, causing a race condition: CSE runs init-aks-custom-cloud.sh and queries wireserver before the tag update completes. Use the Microsoft.Resources/tags API instead, which provides a lightweight PATCH endpoint (/{resourceId}/providers/Microsoft.Resources/tags/default) that updates only tags without triggering a full VM update. The Merge operation adds tags without replacing existing ones. Also moves s.T.Cleanup() registration to immediately after waitForVMSSVM() so the VMSS is always cleaned up even if tag patching or subsequent steps fail, preventing orphaned VMSS resources. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- e2e/vmss.go | 162 ++++++++++++++++------------------------------------ 1 file changed, 50 insertions(+), 112 deletions(-) diff --git a/e2e/vmss.go b/e2e/vmss.go index 3194c364fe5..fb2df22eb15 100644 --- a/e2e/vmss.go +++ b/e2e/vmss.go @@ -478,20 +478,11 @@ func CreateVMSS(ctx context.Context, s *Scenario, resourceGroupName string) (*Sc defer toolkit.LogStepCtxf(ctx, "creating VMSS %s", s.Runtime.VMSSName)() vm := &ScenarioVM{} - vmssModel := createVMSSModel(ctx, s) - - // When VMInstanceTags are configured, we need to inject tags into - // virtualMachineProfile which the Go SDK doesn't expose for Uniform mode VMSS. - // We marshal the model to JSON, inject the tags, and send a raw ARM PUT request. - if len(s.Config.VMInstanceTags) > 0 { - return createVMSSWithProfileTags(ctx, s, resourceGroupName, vmssModel, vm) - } - operation, err := s.GetAzure().VMSS.BeginCreateOrUpdate( ctx, resourceGroupName, s.Runtime.VMSSName, - vmssModel, + createVMSSModel(ctx, s), nil, ) if err != nil { @@ -504,16 +495,27 @@ func CreateVMSS(ctx context.Context, s *Scenario, resourceGroupName string) (*Sc return vm, fmt.Errorf("failed to wait for VMSS VM: %w", err) } - vm.PrivateIP, err = getPrivateIPFromVMSSVM(ctx, s, resourceGroupName, s.Runtime.VMSSName, *vm.VM.InstanceID) - if err != nil { - return vm, fmt.Errorf("failed to get VM private IP address: %w", err) - } - + // Register cleanup early so the VMSS is always deleted even if subsequent steps + // (tag patching, IP lookup, etc.) fail — preventing orphaned VMSS resources. s.T.Cleanup(func() { defer cleanupBastionTunnel(vm.SSHClient) cleanupVMSS(ctx, s, vm) }) + // Apply VM instance tags via the Microsoft.Resources/tags API before CSE queries + // wireserver. This is needed for features like RCV1P where wireserver checks tags + // on the individual VM instance, not the VMSS resource-level tags. 
+ if len(s.Config.VMInstanceTags) > 0 { + if err := patchVMInstanceTags(ctx, s, resourceGroupName, s.Runtime.VMSSName, *vm.VM.InstanceID, s.Config.VMInstanceTags); err != nil { + return vm, fmt.Errorf("failed to patch VM instance tags: %w", err) + } + } + + vm.PrivateIP, err = getPrivateIPFromVMSSVM(ctx, s, resourceGroupName, s.Runtime.VMSSName, *vm.VM.InstanceID) + if err != nil { + return vm, fmt.Errorf("failed to get VM private IP address: %w", err) + } + result := "SSH Instructions: (may take a few minutes for the VM to be ready for SSH)\n========================\n" if config.Config.KeepVMSS { s.T.Logf("VM will be preserved after the test finishes, PLEASE MANUALLY DELETE THE VMSS. Set KEEP_VMSS=false to delete it automatically after the test finishes\n") @@ -550,126 +552,62 @@ func CreateVMSS(ctx context.Context, s *Scenario, resourceGroupName string) (*Sc }, nil } -// createVMSSWithProfileTags creates a VMSS using a raw ARM PUT request, injecting tags into -// virtualMachineProfile that the Go SDK doesn't expose for Uniform mode VMSS. This is needed -// for features like RCV1P where wireserver checks VM instance-level tags: the tags must be -// present at VMSS creation time so they propagate to VM instances before CSE runs. 
-func createVMSSWithProfileTags(ctx context.Context, s *Scenario, resourceGroupName string, vmssModel armcompute.VirtualMachineScaleSet, vm *ScenarioVM) (*ScenarioVM, error) { - defer toolkit.LogStepCtxf(ctx, "creating VMSS %s with VM profile tags", s.Runtime.VMSSName)() - - // Marshal the typed model to a generic map so we can inject virtualMachineProfile.tags - vmssJSON, err := json.Marshal(vmssModel) - if err != nil { - return vm, fmt.Errorf("failed to marshal VMSS model: %w", err) - } - - var vmssMap map[string]interface{} - if err := json.Unmarshal(vmssJSON, &vmssMap); err != nil { - return vm, fmt.Errorf("failed to unmarshal VMSS model to map: %w", err) - } - - // Inject tags into properties.virtualMachineProfile - props, ok := vmssMap["properties"].(map[string]interface{}) - if !ok { - return vm, fmt.Errorf("VMSS model missing 'properties' field") - } - vmProfile, ok := props["virtualMachineProfile"].(map[string]interface{}) - if !ok { - return vm, fmt.Errorf("VMSS model missing 'properties.virtualMachineProfile' field") - } - vmProfile["tags"] = s.Config.VMInstanceTags - s.T.Logf("injected VM profile tags: %v", s.Config.VMInstanceTags) - - // Re-marshal the modified model - modifiedBody, err := json.Marshal(vmssMap) - if err != nil { - return vm, fmt.Errorf("failed to marshal modified VMSS model: %w", err) - } +// patchVMInstanceTags uses the Microsoft.Resources/tags API to merge tags onto a VMSS VM +// instance. This is a lightweight PATCH that only modifies tags without triggering a full +// VM model update, completing in seconds rather than the ~108s that BeginUpdate takes. 
+func patchVMInstanceTags(ctx context.Context, s *Scenario, resourceGroupName, vmssName, instanceID string, tags map[string]*string) error { + defer toolkit.LogStepCtxf(ctx, "patching VM instance %s/%s tags via Resources API", vmssName, instanceID)() - // Build the ARM resource URL subscriptionID := s.SubscriptionID if subscriptionID == "" { subscriptionID = config.Config.SubscriptionID } - resourceURL := fmt.Sprintf("https://management.azure.com/subscriptions/%s/resourceGroups/%s/providers/Microsoft.Compute/virtualMachineScaleSets/%s?api-version=2025-04-01", - subscriptionID, resourceGroupName, s.Runtime.VMSSName) - // Send raw PUT request via the SDK pipeline (includes auth, retry, logging) - req, err := azruntime.NewRequest(ctx, "PUT", resourceURL) - if err != nil { - return vm, fmt.Errorf("failed to create ARM request: %w", err) - } - req.Raw().Header.Set("Content-Type", "application/json") - if err := req.SetBody(streaming.NopCloser(bytes.NewReader(modifiedBody)), "application/json"); err != nil { - return vm, fmt.Errorf("failed to set request body: %w", err) - } + // The Microsoft.Resources/tags API allows lightweight tag updates on any Azure resource. + // Using "Merge" operation to add/update tags without replacing existing ones. 
+ resourceURL := fmt.Sprintf( + "https://management.azure.com/subscriptions/%s/resourceGroups/%s/providers/Microsoft.Compute/virtualMachineScaleSets/%s/virtualMachines/%s/providers/Microsoft.Resources/tags/default?api-version=2021-04-01", + subscriptionID, resourceGroupName, vmssName, instanceID, + ) - resp, err := s.GetAzure().Core.Pipeline().Do(req) - if err != nil { - return vm, fmt.Errorf("failed to send VMSS creation request: %w", err) - } - if resp.StatusCode != 200 && resp.StatusCode != 201 { - body, _ := io.ReadAll(resp.Body) - resp.Body.Close() - return vm, fmt.Errorf("VMSS creation failed with status %d: %s", resp.StatusCode, string(body)) + body := struct { + Operation string `json:"operation"` + Properties struct { + Tags map[string]*string `json:"tags"` + } `json:"properties"` + }{ + Operation: "Merge", } + body.Properties.Tags = tags - // Create a poller for the async operation - poller, err := azruntime.NewPoller[armcompute.VirtualMachineScaleSetsClientCreateOrUpdateResponse](resp, s.GetAzure().Core.Pipeline(), nil) + bodyJSON, err := json.Marshal(body) if err != nil { - return vm, fmt.Errorf("failed to create VMSS creation poller: %w", err) + return fmt.Errorf("failed to marshal tag patch body: %w", err) } - // Wait for VMSS VM to appear before extracting the private IP - vm.VM, err = waitForVMSSVM(ctx, s) + req, err := azruntime.NewRequest(ctx, "PATCH", resourceURL) if err != nil { - return vm, fmt.Errorf("failed to wait for VMSS VM: %w", err) + return fmt.Errorf("failed to create PATCH request: %w", err) } - - vm.PrivateIP, err = getPrivateIPFromVMSSVM(ctx, s, resourceGroupName, s.Runtime.VMSSName, *vm.VM.InstanceID) - if err != nil { - return vm, fmt.Errorf("failed to get VM private IP address: %w", err) - } - - s.T.Cleanup(func() { - defer cleanupBastionTunnel(vm.SSHClient) - cleanupVMSS(ctx, s, vm) - }) - - result := "SSH Instructions: (may take a few minutes for the VM to be ready for SSH)\n========================\n" - if 
config.Config.KeepVMSS { - s.T.Logf("VM will be preserved after the test finishes, PLEASE MANUALLY DELETE THE VMSS. Set KEEP_VMSS=false to delete it automatically after the test finishes\n") - } else { - s.T.Logf("VM will be automatically deleted after the test finishes, to preserve it for debugging purposes set KEEP_VMSS=true or pause the test with a breakpoint before the test finishes or failed\n") + if err := req.SetBody(streaming.NopCloser(bytes.NewReader(bodyJSON)), "application/json"); err != nil { + return fmt.Errorf("failed to set request body: %w", err) } - result += fmt.Sprintf(`az network bastion ssh --target-resource-id "%s" --name "%s-bastion" --resource-group %s --auth-type ssh-key --username azureuser --ssh-key %s`, *vm.VM.ID, *s.Runtime.Cluster.Model.Name, *s.Runtime.Cluster.Model.Properties.NodeResourceGroup, config.VMSSHPrivateKeyFileName) + "\n" - s.T.Log(result) - vmssResp, err := poller.PollUntilDone(ctx, config.DefaultPollUntilDoneOptions) - if !s.Config.SkipSSHConnectivityValidation { - var bastErr error - vm.SSHClient, bastErr = DialSSHOverBastion(ctx, s.Runtime.Cluster.Bastion, vm.PrivateIP, config.VMSSHPrivateKey) - if bastErr != nil { - return vm, fmt.Errorf("failed to start bastion tunnel: %w", bastErr) - } - } + resp, err := s.GetAzure().Core.Pipeline().Do(req) if err != nil { - return vm, err + return fmt.Errorf("failed to send tag PATCH request: %w", err) } - err = waitForVMRunningState(ctx, s, vm.VM) - if err != nil { - return vm, fmt.Errorf("failed to wait for VM to reach running state: %w", err) + if resp.StatusCode != 200 { + respBody, _ := io.ReadAll(resp.Body) + resp.Body.Close() + return fmt.Errorf("tag PATCH failed with status %d: %s", resp.StatusCode, string(respBody)) } - return &ScenarioVM{ - VMSS: &vmssResp.VirtualMachineScaleSet, - PrivateIP: vm.PrivateIP, - VM: vm.VM, - SSHClient: vm.SSHClient, - }, nil + return nil } + // waitForVMRunningState polls until the VM reaches "Running" power state or the timeout elapses. 
func waitForVMRunningState(ctx context.Context, s *Scenario, vmssVM *armcompute.VirtualMachineScaleSetVM) error { ctxTimeout, cancel := context.WithTimeout(ctx, 3*time.Minute) From 467850d1f4a97b1502a32da1014dfedd42664dc4 Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Thu, 16 Apr 2026 15:02:02 -0700 Subject: [PATCH 24/26] e2e: use BeginUpdate + deferred CSE for VM instance tagging Replace the Microsoft.Resources/tags API approach (which returns 405 on Uniform VMSS VM instances) with BeginUpdate (full PUT) + deferred CSE. For scenarios requiring VM instance tags (e.g., RCV1P): 1. Create VMSS without CSE extension profile 2. Wait for VMSS creation to complete 3. Apply tags via VMSSVM.BeginUpdate (~108s full PUT) 4. Re-add CSE extension via a second BeginCreateOrUpdate This ensures wireserver sees the per-VM-instance tags before CSE queries it. The delay is acceptable for E2E validation; production would use a different approach (e.g., AKS RP sets tags pre-boot). Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- e2e/vmss.go | 120 +++++++++++++++++++++++++++++----------------------- 1 file changed, 67 insertions(+), 53 deletions(-) diff --git a/e2e/vmss.go b/e2e/vmss.go index fb2df22eb15..799b7d34345 100644 --- a/e2e/vmss.go +++ b/e2e/vmss.go @@ -24,8 +24,6 @@ import ( "github.com/Azure/agentbaker/pkg/agent" "github.com/Azure/agentbaker/pkg/agent/datamodel" "github.com/Azure/azure-sdk-for-go/sdk/azcore" - azruntime "github.com/Azure/azure-sdk-for-go/sdk/azcore/runtime" - "github.com/Azure/azure-sdk-for-go/sdk/azcore/streaming" "github.com/Azure/azure-sdk-for-go/sdk/azcore/to" "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute/v7" "github.com/stretchr/testify/require" @@ -478,11 +476,25 @@ func CreateVMSS(ctx context.Context, s *Scenario, resourceGroupName string) (*Sc defer toolkit.LogStepCtxf(ctx, "creating VMSS %s", s.Runtime.VMSSName)() vm := &ScenarioVM{} + model := createVMSSModel(ctx, s) + + // For scenarios 
that need VM instance tags (e.g., RCV1P), we must apply tags + // before CSE runs because wireserver checks per-VM-instance tags. The only + // working method for Uniform VMSS is BeginUpdate (full PUT), which takes ~108s. + // To avoid the race, we strip the CSE extension before creation, apply tags + // via BeginUpdate, then re-add the extension in a second update. + var deferredExtensionProfile *armcompute.VirtualMachineScaleSetExtensionProfile + if len(s.Config.VMInstanceTags) > 0 && model.Properties.VirtualMachineProfile.ExtensionProfile != nil { + deferredExtensionProfile = model.Properties.VirtualMachineProfile.ExtensionProfile + model.Properties.VirtualMachineProfile.ExtensionProfile = nil + toolkit.Logf(ctx, "deferring CSE extension until VM instance tags are applied") + } + operation, err := s.GetAzure().VMSS.BeginCreateOrUpdate( ctx, resourceGroupName, s.Runtime.VMSSName, - createVMSSModel(ctx, s), + model, nil, ) if err != nil { @@ -496,18 +508,45 @@ func CreateVMSS(ctx context.Context, s *Scenario, resourceGroupName string) (*Sc } // Register cleanup early so the VMSS is always deleted even if subsequent steps - // (tag patching, IP lookup, etc.) fail — preventing orphaned VMSS resources. + // (tag update, IP lookup, etc.) fail — preventing orphaned VMSS resources. s.T.Cleanup(func() { defer cleanupBastionTunnel(vm.SSHClient) cleanupVMSS(ctx, s, vm) }) - // Apply VM instance tags via the Microsoft.Resources/tags API before CSE queries - // wireserver. This is needed for features like RCV1P where wireserver checks tags - // on the individual VM instance, not the VMSS resource-level tags. + // Wait for initial VMSS creation to fully complete before applying tags. + vmssResp, err := operation.PollUntilDone(ctx, config.DefaultPollUntilDoneOptions) + if err != nil { + return vm, fmt.Errorf("failed to create VMSS: %w", err) + } + + // Apply VM instance tags via BeginUpdate (full PUT) and then re-add CSE. 
+ // This is needed for features like RCV1P where wireserver checks tags on + // the individual VM instance, not the VMSS resource-level tags. if len(s.Config.VMInstanceTags) > 0 { - if err := patchVMInstanceTags(ctx, s, resourceGroupName, s.Runtime.VMSSName, *vm.VM.InstanceID, s.Config.VMInstanceTags); err != nil { - return vm, fmt.Errorf("failed to patch VM instance tags: %w", err) + if err := updateVMInstanceTags(ctx, s, resourceGroupName, s.Runtime.VMSSName, *vm.VM.InstanceID, s.Config.VMInstanceTags); err != nil { + return vm, fmt.Errorf("failed to update VM instance tags: %w", err) + } + + // Re-add CSE extension now that tags are in place. + if deferredExtensionProfile != nil { + toolkit.Logf(ctx, "re-adding CSE extension after tags are applied") + vmssResp.VirtualMachineScaleSet.Properties.VirtualMachineProfile.ExtensionProfile = deferredExtensionProfile + cseOp, err := s.GetAzure().VMSS.BeginCreateOrUpdate( + ctx, + resourceGroupName, + s.Runtime.VMSSName, + vmssResp.VirtualMachineScaleSet, + nil, + ) + if err != nil { + return vm, fmt.Errorf("failed to begin adding CSE extension: %w", err) + } + vmssResp2, err := cseOp.PollUntilDone(ctx, config.DefaultPollUntilDoneOptions) + if err != nil { + return vm, fmt.Errorf("failed to add CSE extension: %w", err) + } + vmssResp = vmssResp2 } } @@ -526,7 +565,6 @@ func CreateVMSS(ctx context.Context, s *Scenario, resourceGroupName string) (*Sc result += fmt.Sprintf(`az network bastion ssh --target-resource-id "%s" --name "%s-bastion" --resource-group %s --auth-type ssh-key --username azureuser --ssh-key %s`, *vm.VM.ID, *s.Runtime.Cluster.Model.Name, *s.Runtime.Cluster.Model.Properties.NodeResourceGroup, config.VMSSHPrivateKeyFileName) + "\n" s.T.Log(result) - vmssResp, err := operation.PollUntilDone(ctx, config.DefaultPollUntilDoneOptions) if !s.Config.SkipSSHConnectivityValidation { var bastErr error vm.SSHClient, bastErr = DialSSHOverBastion(ctx, s.Runtime.Cluster.Bastion, vm.PrivateIP, config.VMSSHPrivateKey) @@ 
-534,9 +572,6 @@ func CreateVMSS(ctx context.Context, s *Scenario, resourceGroupName string) (*Sc return vm, fmt.Errorf("failed to start bastion tunnel: %w", bastErr) } } - if err != nil { - return vm, err - } // Wait for VM to be in "Running" power state before proceeding err = waitForVMRunningState(ctx, s, vm.VM) @@ -552,56 +587,35 @@ func CreateVMSS(ctx context.Context, s *Scenario, resourceGroupName string) (*Sc }, nil } -// patchVMInstanceTags uses the Microsoft.Resources/tags API to merge tags onto a VMSS VM -// instance. This is a lightweight PATCH that only modifies tags without triggering a full -// VM model update, completing in seconds rather than the ~108s that BeginUpdate takes. -func patchVMInstanceTags(ctx context.Context, s *Scenario, resourceGroupName, vmssName, instanceID string, tags map[string]*string) error { - defer toolkit.LogStepCtxf(ctx, "patching VM instance %s/%s tags via Resources API", vmssName, instanceID)() - - subscriptionID := s.SubscriptionID - if subscriptionID == "" { - subscriptionID = config.Config.SubscriptionID - } - - // The Microsoft.Resources/tags API allows lightweight tag updates on any Azure resource. - // Using "Merge" operation to add/update tags without replacing existing ones. - resourceURL := fmt.Sprintf( - "https://management.azure.com/subscriptions/%s/resourceGroups/%s/providers/Microsoft.Compute/virtualMachineScaleSets/%s/virtualMachines/%s/providers/Microsoft.Resources/tags/default?api-version=2021-04-01", - subscriptionID, resourceGroupName, vmssName, instanceID, - ) - - body := struct { - Operation string `json:"operation"` - Properties struct { - Tags map[string]*string `json:"tags"` - } `json:"properties"` - }{ - Operation: "Merge", - } - body.Properties.Tags = tags +// updateVMInstanceTags uses BeginUpdate (full PUT) to set tags on a VMSS VM instance. +// This is the only method that works for Uniform mode VMSS — PATCH and Microsoft.Resources/tags +// API both return 405 at this scope. 
The operation takes ~108s as it triggers full VM model +// reconciliation. This is acceptable for E2E tests where we defer CSE until tags are in place. +func updateVMInstanceTags(ctx context.Context, s *Scenario, resourceGroupName, vmssName, instanceID string, tags map[string]*string) error { + defer toolkit.LogStepCtxf(ctx, "updating VM instance %s/%s/%s tags via BeginUpdate", resourceGroupName, vmssName, instanceID)() - bodyJSON, err := json.Marshal(body) + // Get current VM instance to preserve existing state + currentVM, err := s.GetAzure().VMSSVM.Get(ctx, resourceGroupName, vmssName, instanceID, &armcompute.VirtualMachineScaleSetVMsClientGetOptions{}) if err != nil { - return fmt.Errorf("failed to marshal tag patch body: %w", err) + return fmt.Errorf("failed to get current VM instance: %w", err) } - req, err := azruntime.NewRequest(ctx, "PATCH", resourceURL) - if err != nil { - return fmt.Errorf("failed to create PATCH request: %w", err) + // Merge new tags with any existing tags + if currentVM.Tags == nil { + currentVM.Tags = make(map[string]*string) } - if err := req.SetBody(streaming.NopCloser(bytes.NewReader(bodyJSON)), "application/json"); err != nil { - return fmt.Errorf("failed to set request body: %w", err) + for k, v := range tags { + currentVM.Tags[k] = v } - resp, err := s.GetAzure().Core.Pipeline().Do(req) + poller, err := s.GetAzure().VMSSVM.BeginUpdate(ctx, resourceGroupName, vmssName, instanceID, currentVM.VirtualMachineScaleSetVM, nil) if err != nil { - return fmt.Errorf("failed to send tag PATCH request: %w", err) + return fmt.Errorf("failed to begin VM instance tag update: %w", err) } - if resp.StatusCode != 200 { - respBody, _ := io.ReadAll(resp.Body) - resp.Body.Close() - return fmt.Errorf("tag PATCH failed with status %d: %s", resp.StatusCode, string(respBody)) + _, err = poller.PollUntilDone(ctx, config.DefaultPollUntilDoneOptions) + if err != nil { + return fmt.Errorf("failed to complete VM instance tag update: %w", err) } return nil 
From 9636e822a868403adcbe9c467e00d94f76dbda6d Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Thu, 16 Apr 2026 19:42:21 -0700 Subject: [PATCH 25/26] e2e: add feature flag check for RCV1P subscription Verify Microsoft.Compute/PlatformSettingsOverride is registered on the RCV1P subscription before running tests. This fails fast with a clear error if the feature flag is missing, rather than letting tests run and fail with opaque wireserver responses. The check runs once per test run via sync.Once. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- e2e/scenario_rcv1p_test.go | 56 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/e2e/scenario_rcv1p_test.go b/e2e/scenario_rcv1p_test.go index 0bb927798ae..eed7cf43ffd 100644 --- a/e2e/scenario_rcv1p_test.go +++ b/e2e/scenario_rcv1p_test.go @@ -19,11 +19,15 @@ package e2e import ( "context" + "fmt" + "io" "strings" + "sync" "testing" "github.com/Azure/agentbaker/e2e/config" "github.com/Azure/agentbaker/pkg/agent/datamodel" + azruntime "github.com/Azure/azure-sdk-for-go/sdk/azcore/runtime" "github.com/Azure/azure-sdk-for-go/sdk/azcore/to" "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute/v7" ) @@ -36,12 +40,64 @@ const rcv1pOptInTag = "platformsettings.host_environment.service.platform_optedi // skipIfRCV1PNotConfigured skips the test when the RCV1P subscription is not configured. // This happens in regular CI runs where the RCV1P variable group is not linked, causing // Azure DevOps to pass the literal unexpanded string "$(RCV1P_SUBSCRIPTION_ID)". +// It also verifies the Microsoft.Compute/PlatformSettingsOverride feature flag is registered. 
func skipIfRCV1PNotConfigured(t *testing.T) { t.Helper() subID := config.Config.RCV1PSubscriptionID if subID == "" || strings.HasPrefix(subID, "$(") { t.Skip("RCV1P_SUBSCRIPTION_ID not set or not resolved, skipping RCV1P cert mode test") } + checkPlatformSettingsOverrideFeatureFlag(t, subID) +} + +var ( + featureFlagCheckOnce sync.Once + featureFlagCheckResult error +) + +// checkPlatformSettingsOverrideFeatureFlag verifies the Microsoft.Compute/PlatformSettingsOverride +// feature flag is registered on the given subscription. This is a prerequisite for wireserver to +// serve root certificates. The check runs only once per test run. +func checkPlatformSettingsOverrideFeatureFlag(t *testing.T, subscriptionID string) { + t.Helper() + featureFlagCheckOnce.Do(func() { + featureFlagCheckResult = verifyFeatureFlag(t.Context(), subscriptionID) + }) + if featureFlagCheckResult != nil { + t.Fatalf("RCV1P feature flag check failed: %v", featureFlagCheckResult) + } +} + +func verifyFeatureFlag(ctx context.Context, subscriptionID string) error { + url := fmt.Sprintf( + "https://management.azure.com/subscriptions/%s/providers/Microsoft.Features/providers/Microsoft.Compute/features/PlatformSettingsOverride?api-version=2021-07-01", + subscriptionID, + ) + + req, err := azruntime.NewRequest(ctx, "GET", url) + if err != nil { + return fmt.Errorf("failed to create feature flag request: %w", err) + } + + resp, err := config.RCV1PAzure.Core.Pipeline().Do(req) + if err != nil { + return fmt.Errorf("failed to query feature flag: %w", err) + } + defer resp.Body.Close() + + body, _ := io.ReadAll(resp.Body) + bodyStr := string(body) + + if resp.StatusCode != 200 { + return fmt.Errorf("feature flag query returned status %d: %s", resp.StatusCode, bodyStr) + } + + if !strings.Contains(bodyStr, `"Registered"`) { + return fmt.Errorf("Microsoft.Compute/PlatformSettingsOverride is NOT registered on subscription %s (response: %s); "+ + "wireserver will not serve root certificates without this 
feature flag", subscriptionID, bodyStr) + } + + return nil } // rcv1pOptInVMConfigMutator sets the platform opt-in tag on the VMSS resource level. From 0dd86062e3bcb6100c8ea2cadb56574188d7973e Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Thu, 16 Apr 2026 19:46:34 -0700 Subject: [PATCH 26/26] REVERT ME: poll wireserver IsOptedInForRootCerts with retry loop Experimental commit to validate whether wireserver detects VM instance tags applied via BeginUpdate after VM creation. Polls for up to ~5 minutes (30x10s). Wireserver reads IsOptedInForRootCerts from the Fabric Controller goal state (CCF/ContainerConfig), NOT directly from ARM tags. The flow is: BeginUpdate -> ARM model update -> FC generates new CCF with platformsettings.host_environment.service.platform_optedin_for_rootcerts -> FC pushes CCF to host agent -> wireserver reflects new state. FC goal state propagation can take several minutes, so the polling window is set to ~5 minutes to give adequate time for detection. Logs the full wireserver response on each attempt for diagnostics. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../artifacts/init-aks-custom-cloud.sh | 42 ++++++++++++++----- 1 file changed, 31 insertions(+), 11 deletions(-) diff --git a/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh b/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh index 862c2f09b6c..c63e0bc5df9 100644 --- a/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh +++ b/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh @@ -65,20 +65,40 @@ function make_request_with_retry { function is_opted_in_for_root_certs { local opt_in_response + local request_status + local poll_attempt=1 + local max_poll_attempts=30 + local poll_interval=10 + + # Poll wireserver for up to ~5 minutes to allow platform metadata to sync. + # The VM instance tag triggers a Fabric Controller goal state (CCF) update, + # which must propagate to the host agent before wireserver can reflect it. 
+ # FC goal state propagation can take several minutes in practice. + while [ $poll_attempt -le $max_poll_attempts ]; do + echo "is_opted_in_for_root_certs: poll attempt ${poll_attempt}/${max_poll_attempts}" + + opt_in_response=$(make_request_with_retry "${WIRESERVER_ENDPOINT}/acms/isOptedInForRootCerts") + request_status=$? + + echo "is_opted_in_for_root_certs: wireserver response (status=${request_status}): '${opt_in_response}'" + + if [ $request_status -ne 0 ] || [ -z "$opt_in_response" ]; then + echo "Warning: failed to determine IsOptedInForRootCerts state on attempt ${poll_attempt}" + elif echo "$opt_in_response" | grep -q "IsOptedInForRootCerts=true"; then + echo "IsOptedInForRootCerts=true (found on attempt ${poll_attempt})" + return 0 + fi - opt_in_response=$(make_request_with_retry "${WIRESERVER_ENDPOINT}/acms/isOptedInForRootCerts") - local request_status=$? - if [ $request_status -ne 0 ] || [ -z "$opt_in_response" ]; then - echo "Warning: failed to determine IsOptedInForRootCerts state" - return 1 - fi + if [ $poll_attempt -lt $max_poll_attempts ]; then + echo "is_opted_in_for_root_certs: not opted in yet, waiting ${poll_interval}s before retry..." + sleep $poll_interval + fi - if echo "$opt_in_response" | grep -q "IsOptedInForRootCerts=true"; then - echo "IsOptedInForRootCerts=true" - return 0 - fi + poll_attempt=$((poll_attempt + 1)) + done - echo "Skipping custom cloud root cert installation because IsOptedInForRootCerts is not true" + echo "Skipping custom cloud root cert installation because IsOptedInForRootCerts is not true after ${max_poll_attempts} attempts" + echo "Last wireserver response: '${opt_in_response}'" return 1 }