diff --git a/.pipelines/e2e-rcv1p.yaml b/.pipelines/e2e-rcv1p.yaml new file mode 100644 index 00000000000..5fdf9d3a5ee --- /dev/null +++ b/.pipelines/e2e-rcv1p.yaml @@ -0,0 +1,19 @@ +name: $(Date:yyyyMMdd)$(Rev:.r) +variables: + TAGS_TO_RUN: "rcv1pcertmode=true" + SKIP_E2E_TESTS: false + E2E_GO_TEST_TIMEOUT: "75m" +schedules: + - cron: "0 11 * * *" + displayName: Daily 3am PST + branches: + include: + - main + always: true +trigger: none +pr: none +jobs: + - template: ./templates/e2e-template.yaml + parameters: + name: RCV1P Cert Mode Tests + IgnoreScenariosWithMissingVhd: false diff --git a/.pipelines/scripts/e2e_run.sh b/.pipelines/scripts/e2e_run.sh index 1dcea264298..097fe250756 100644 --- a/.pipelines/scripts/e2e_run.sh +++ b/.pipelines/scripts/e2e_run.sh @@ -35,6 +35,7 @@ VHD_BUILD_ID="${VHD_BUILD_ID:-}" IGNORE_SCENARIOS_WITH_MISSING_VHD="${IGNORE_SCENARIOS_WITH_MISSING_VHD:-}" LOGGING_DIR="${LOGGING_DIR:-}" E2E_SUBSCRIPTION_ID="${E2E_SUBSCRIPTION_ID:-}" +RCV1P_SUBSCRIPTION_ID="${RCV1P_SUBSCRIPTION_ID:-}" ENABLE_SECURE_TLS_BOOTSTRAPPING="${ENABLE_SECURE_TLS_BOOTSTRAPPING:-true}" TAGS_TO_SKIP="${TAGS_TO_SKIP:-}" TAGS_TO_RUN="${TAGS_TO_RUN:-}" @@ -47,6 +48,7 @@ echo "VHD_BUILD_ID: ${VHD_BUILD_ID}" echo "IGNORE_SCENARIOS_WITH_MISSING_VHD: ${IGNORE_SCENARIOS_WITH_MISSING_VHD}" echo "LOGGING_DIR: ${LOGGING_DIR}" echo "E2E_SUBSCRIPTION_ID: ${E2E_SUBSCRIPTION_ID}" +echo "RCV1P_SUBSCRIPTION_ID: ${RCV1P_SUBSCRIPTION_ID}" echo "ENABLE_SECURE_TLS_BOOTSTRAPPING: ${ENABLE_SECURE_TLS_BOOTSTRAPPING}" echo "TAGS_TO_SKIP: ${TAGS_TO_SKIP}" echo "TAGS_TO_RUN: ${TAGS_TO_RUN}" diff --git a/.pipelines/templates/e2e-template.yaml b/.pipelines/templates/e2e-template.yaml index 3b4fad643d7..26d659f77ae 100644 --- a/.pipelines/templates/e2e-template.yaml +++ b/.pipelines/templates/e2e-template.yaml @@ -38,6 +38,7 @@ jobs: displayName: Run AgentBaker E2E env: E2E_SUBSCRIPTION_ID: $(E2E_SUBSCRIPTION_ID) + RCV1P_SUBSCRIPTION_ID: $(RCV1P_SUBSCRIPTION_ID) SYS_SSH_PUBLIC_KEY: 
$(SYS_SSH_PUBLIC_KEY) SYS_SSH_PRIVATE_KEY_B64: $(SYS_SSH_PRIVATE_KEY_B64) BUILD_SRC_DIR: $(System.DefaultWorkingDirectory) diff --git a/aks-node-controller/parser/helper.go b/aks-node-controller/parser/helper.go index f5644dcda02..ab9e6210ccf 100644 --- a/aks-node-controller/parser/helper.go +++ b/aks-node-controller/parser/helper.go @@ -64,6 +64,7 @@ func getFuncMap() template.FuncMap { return template.FuncMap{ "getInitAKSCustomCloudFilepath": getInitAKSCustomCloudFilepath, "getIsAksCustomCloud": getIsAksCustomCloud, + "getCloudLocation": getCloudLocation, } } @@ -538,11 +539,15 @@ func getIsAksCustomCloud(customCloudConfig *aksnodeconfigv1.CustomCloudConfig) b return strings.EqualFold(customCloudConfig.GetCustomCloudEnvName(), helpers.AksCustomCloudName) } +func getCloudLocation(v *aksnodeconfigv1.Configuration) string { + return strings.ToLower(strings.Join(strings.Fields(v.GetClusterConfig().GetLocation()), "")) +} + /* GetCloudTargetEnv determines and returns whether the region is a sovereign cloud which have their own data compliance regulations (China/Germany/USGov) or standard. */ // Azure public cloud. 
func getCloudTargetEnv(v *aksnodeconfigv1.Configuration) string { - loc := strings.ToLower(strings.Join(strings.Fields(v.GetClusterConfig().GetLocation()), "")) + loc := getCloudLocation(v) switch { case strings.HasPrefix(loc, "china"): return "AzureChinaCloud" diff --git a/aks-node-controller/parser/templates/cse_cmd.sh.gtpl b/aks-node-controller/parser/templates/cse_cmd.sh.gtpl index b1359b071d9..42376814388 100644 --- a/aks-node-controller/parser/templates/cse_cmd.sh.gtpl +++ b/aks-node-controller/parser/templates/cse_cmd.sh.gtpl @@ -1,6 +1,7 @@ echo $(date),$(hostname) > ${PROVISION_OUTPUT}; {{if getIsAksCustomCloud .CustomCloudConfig}} REPO_DEPOT_ENDPOINT="{{.CustomCloudConfig.RepoDepotEndpoint}}" -{{getInitAKSCustomCloudFilepath}} >> /var/log/azure/cluster-provision.log 2>&1; {{end}} +LOCATION="{{getCloudLocation .}}" +{{getInitAKSCustomCloudFilepath}} >> /var/log/azure/cluster-provision.log 2>&1; {{end}} /usr/bin/nohup /bin/bash -c "/bin/bash /opt/azure/containers/provision_start.sh" diff --git a/e2e/aks_model.go b/e2e/aks_model.go index f7e1a90c333..b8a670c43a9 100644 --- a/e2e/aks_model.go +++ b/e2e/aks_model.go @@ -299,22 +299,23 @@ func getFirewall(ctx context.Context, location, firewallSubnetID, publicIPID str } func addFirewallRules( - ctx context.Context, clusterModel *armcontainerservice.ManagedCluster, + ctx context.Context, infra *ClusterInfra, clusterModel *armcontainerservice.ManagedCluster, ) error { location := *clusterModel.Location defer toolkit.LogStepCtx(ctx, "adding firewall rules")() rg := *clusterModel.Properties.NodeResourceGroup - vnet, err := getClusterVNet(ctx, rg) + vnet, err := getClusterVNet(ctx, infra, rg) if err != nil { return err } - // For kubenet, the AKS-managed route table must stay attached so that pod - // routes (managed by cloud-provider-azure) and firewall routes coexist. - // For Azure CNI variants, the subnet may not have any route table, so we - // create and associate a dedicated one before adding the firewall routes.
- aksSubnetResp, err := config.Azure.Subnet.Get(ctx, rg, vnet.name, "aks-subnet", nil) + // Find the AKS-managed route table currently associated with the subnet. + // We add firewall routes directly to this table so that both pod routes + // (managed by cloud-provider-azure) and firewall routes coexist. Creating + // a separate route table and swapping the subnet association disconnects + // the pod routes and breaks kubenet networking. + aksSubnetResp, err := infra.Azure.Subnet.Get(ctx, rg, vnet.name, "aks-subnet", nil) if err != nil { return fmt.Errorf("failed to get AKS subnet: %w", err) } @@ -332,7 +333,7 @@ func addFirewallRules( } toolkit.Logf(ctx, "Creating subnet %s in VNet %s", firewallSubnetName, vnet.name) - subnetPoller, err := config.Azure.Subnet.BeginCreateOrUpdate( + subnetPoller, err := infra.Azure.Subnet.BeginCreateOrUpdate( ctx, rg, vnet.name, @@ -365,7 +366,7 @@ func addFirewallRules( } toolkit.Logf(ctx, "Creating public IP %s", publicIPName) - pipPoller, err := config.Azure.PublicIPAddresses.BeginCreateOrUpdate( + pipPoller, err := infra.Azure.PublicIPAddresses.BeginCreateOrUpdate( ctx, rg, publicIPName, @@ -386,7 +387,7 @@ func addFirewallRules( firewallName := "abe2e-fw" firewall := getFirewall(ctx, location, firewallSubnetID, publicIPID) - fwPoller, err := config.Azure.AzureFirewall.BeginCreateOrUpdate(ctx, rg, firewallName, *firewall, nil) + fwPoller, err := infra.Azure.AzureFirewall.BeginCreateOrUpdate(ctx, rg, firewallName, *firewall, nil) if err != nil { return fmt.Errorf("failed to start Firewall creation: %w", err) } @@ -432,7 +433,7 @@ func addFirewallRules( for _, route := range firewallRoutes { toolkit.Logf(ctx, "Adding route %q to AKS route table %q", *route.Name, aksRTName) - poller, err := config.Azure.Routes.BeginCreateOrUpdate(ctx, rg, aksRTName, *route.Name, route, nil) + poller, err := infra.Azure.Routes.BeginCreateOrUpdate(ctx, rg, aksRTName, *route.Name, route, nil) if err != nil { return fmt.Errorf("failed to start adding 
route %q: %w", *route.Name, err) } @@ -510,7 +511,7 @@ func addPrivateAzureContainerRegistry(ctx context.Context, cluster *armcontainer if err := createPrivateAzureContainerRegistryPullSecret(ctx, cluster, kube, resourceGroupName, isNonAnonymousPull); err != nil { return fmt.Errorf("create private acr pull secret: %w", err) } - vnet, err := getClusterVNet(ctx, *cluster.Properties.NodeResourceGroup) + vnet, err := getClusterVNet(ctx, DefaultClusterInfra, *cluster.Properties.NodeResourceGroup) if err != nil { return err } @@ -531,7 +532,7 @@ func addNetworkIsolatedSettings(ctx context.Context, clusterModel *armcontainers location := *clusterModel.Location defer toolkit.LogStepCtx(ctx, fmt.Sprintf("Adding network settings for network isolated cluster %s in rg %s", *clusterModel.Name, *clusterModel.Properties.NodeResourceGroup)) - vnet, err := getClusterVNet(ctx, *clusterModel.Properties.NodeResourceGroup) + vnet, err := getClusterVNet(ctx, DefaultClusterInfra, *clusterModel.Properties.NodeResourceGroup) if err != nil { return err } @@ -678,7 +679,7 @@ func createPrivateAzureContainerRegistry(ctx context.Context, cluster *armcontai } // if ACR gets recreated so should the cluster toolkit.Logf(ctx, "Private ACR deleted, deleting cluster %s", *cluster.Name) - if err := deleteCluster(ctx, *cluster.Name, resourceGroup); err != nil { + if err := deleteCluster(ctx, DefaultClusterInfra, *cluster.Name, resourceGroup); err != nil { return fmt.Errorf("failed to delete cluster: %w", err) } } else { diff --git a/e2e/cache.go b/e2e/cache.go index 1b07d383815..777acdaf559 100644 --- a/e2e/cache.go +++ b/e2e/cache.go @@ -10,6 +10,7 @@ import ( "github.com/Azure/azure-sdk-for-go/sdk/azcore" "github.com/Azure/azure-sdk-for-go/sdk/azcore/to" "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute/v7" + "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/resources/armresources/v3" ) // cachedFunc creates a thread-safe memoized version of a function. 
@@ -150,56 +151,67 @@ func clusterLatestKubernetesVersion(ctx context.Context, request ClusterRequest) if err != nil { return nil, fmt.Errorf("getting latest kubernetes version cluster model: %w", err) } - return prepareCluster(ctx, model, false, false) + return prepareCluster(ctx, DefaultClusterInfra, model, false, false) } var ClusterKubenet = cachedFunc(clusterKubenet) // clusterKubenet creates a basic cluster using kubenet networking func clusterKubenet(ctx context.Context, request ClusterRequest) (*Cluster, error) { - return prepareCluster(ctx, getKubenetClusterModel("abe2e-kubenet-v4", request.Location, request.K8sSystemPoolSKU), false, false) + return prepareCluster(ctx, DefaultClusterInfra, getKubenetClusterModel("abe2e-kubenet-v4", request.Location, request.K8sSystemPoolSKU), false, false) } var ClusterAzureNetwork = cachedFunc(clusterAzureNetwork) // clusterAzureNetwork creates a cluster with Azure CNI networking func clusterAzureNetwork(ctx context.Context, request ClusterRequest) (*Cluster, error) { - return prepareCluster(ctx, getAzureNetworkClusterModel("abe2e-azure-network-v3", request.Location, request.K8sSystemPoolSKU), false, false) + return prepareCluster(ctx, DefaultClusterInfra, getAzureNetworkClusterModel("abe2e-azure-network-v3", request.Location, request.K8sSystemPoolSKU), false, false) } var ClusterAzureBootstrapProfileCache = cachedFunc(clusterAzureBootstrapProfileCache) // clusterAzureBootstrapProfileCache creates a cluster with bootstrap profile cache but without network isolation func clusterAzureBootstrapProfileCache(ctx context.Context, request ClusterRequest) (*Cluster, error) { - return prepareCluster(ctx, getAzureNetworkClusterModel("abe2e-azure-bootstrapprofile-cache-v1", request.Location, request.K8sSystemPoolSKU), false, true) + return prepareCluster(ctx, DefaultClusterInfra, getAzureNetworkClusterModel("abe2e-azure-bootstrapprofile-cache-v1", request.Location, request.K8sSystemPoolSKU), false, true) } var 
ClusterAzureNetworkIsolated = cachedFunc(clusterAzureNetworkIsolated) // clusterAzureNetworkIsolated creates a networkisolated Azure network cluster (no internet access) func clusterAzureNetworkIsolated(ctx context.Context, request ClusterRequest) (*Cluster, error) { - return prepareCluster(ctx, getAzureNetworkClusterModel("abe2e-azure-networkisolated-v1", request.Location, request.K8sSystemPoolSKU), true, false) + return prepareCluster(ctx, DefaultClusterInfra, getAzureNetworkClusterModel("abe2e-azure-networkisolated-v1", request.Location, request.K8sSystemPoolSKU), true, false) } var ClusterAzureOverlayNetwork = cachedFunc(clusterAzureOverlayNetwork) // clusterAzureOverlayNetwork creates a cluster with Azure CNI Overlay networking func clusterAzureOverlayNetwork(ctx context.Context, request ClusterRequest) (*Cluster, error) { - return prepareCluster(ctx, getAzureOverlayNetworkClusterModel("abe2e-azure-overlay-network-v3", request.Location, request.K8sSystemPoolSKU), false, false) + return prepareCluster(ctx, DefaultClusterInfra, getAzureOverlayNetworkClusterModel("abe2e-azure-overlay-network-v3", request.Location, request.K8sSystemPoolSKU), false, false) } var ClusterAzureOverlayNetworkDualStack = cachedFunc(clusterAzureOverlayNetworkDualStack) // clusterAzureOverlayNetworkDualStack creates a dual-stack (IPv4+IPv6) Azure CNI Overlay cluster func clusterAzureOverlayNetworkDualStack(ctx context.Context, request ClusterRequest) (*Cluster, error) { - return prepareCluster(ctx, getAzureOverlayNetworkDualStackClusterModel("abe2e-azure-overlay-dualstack-v3", request.Location, request.K8sSystemPoolSKU), false, false) + return prepareCluster(ctx, DefaultClusterInfra, getAzureOverlayNetworkDualStackClusterModel("abe2e-azure-overlay-dualstack-v3", request.Location, request.K8sSystemPoolSKU), false, false) } var ClusterCiliumNetwork = cachedFunc(clusterCiliumNetwork) // clusterCiliumNetwork creates a cluster with Cilium CNI networking func clusterCiliumNetwork(ctx 
context.Context, request ClusterRequest) (*Cluster, error) { - return prepareCluster(ctx, getCiliumNetworkClusterModel("abe2e-cilium-network-v3", request.Location, request.K8sSystemPoolSKU), false, false) + return prepareCluster(ctx, DefaultClusterInfra, getCiliumNetworkClusterModel("abe2e-cilium-network-v3", request.Location, request.K8sSystemPoolSKU), false, false) +} + +var ClusterRCV1PKubenet = cachedFunc(clusterRCV1PKubenet) + +// clusterRCV1PKubenet creates a kubenet cluster in the RCV1P subscription for cert mode testing. +func clusterRCV1PKubenet(ctx context.Context, request ClusterRequest) (*Cluster, error) { + infra := RCV1PClusterInfra() + if infra == nil { + return nil, fmt.Errorf("RCV1P_SUBSCRIPTION_ID not set, cannot create RCV1P cluster") + } + return prepareCluster(ctx, infra, getKubenetClusterModel("abe2e-rcv1p-kubenet-v1", request.Location, request.K8sSystemPoolSKU), false, false) } // isNotFoundErr checks if an error represents a "not found" response from Azure API @@ -228,6 +240,25 @@ var CachedEnsureResourceGroup = cachedFunc(ensureResourceGroup) var CachedCreateVMManagedIdentity = cachedFunc(config.Azure.CreateVMManagedIdentity) var CachedCompileAndUploadAKSNodeController = cachedFunc(compileAndUploadAKSNodeController) +// CachedRCV1PEnsureResourceGroup creates the resource group in the RCV1P subscription. +var CachedRCV1PEnsureResourceGroup = cachedFunc(ensureRCV1PResourceGroup) + +// CachedRCV1PCreateVMManagedIdentity creates a VM managed identity in the RCV1P subscription. 
+var CachedRCV1PCreateVMManagedIdentity = cachedFunc(func(ctx context.Context, location string) (string, error) { + if config.RCV1PAzure == nil { + return "", fmt.Errorf("RCV1P_SUBSCRIPTION_ID not set") + } + return config.RCV1PAzure.CreateVMManagedIdentityInRG(ctx, config.RCV1PResourceGroupName(location), location) +}) + +func ensureRCV1PResourceGroup(ctx context.Context, location string) (armresources.ResourceGroup, error) { + infra := RCV1PClusterInfra() + if infra == nil { + return armresources.ResourceGroup{}, fmt.Errorf("RCV1P_SUBSCRIPTION_ID not set") + } + return ensureResourceGroupWithInfra(ctx, infra, location) +} + // VMSizeSKURequest is the cache key for Resource SKU lookups by VM size and location. type VMSizeSKURequest struct { Location string diff --git a/e2e/cluster.go b/e2e/cluster.go index 238b8f7f544..6c91096475d 100644 --- a/e2e/cluster.go +++ b/e2e/cluster.go @@ -64,14 +64,14 @@ func (c *Cluster) MaxPodsPerNode() (int, error) { // This function contains complex concurrent orchestration — keep it as // minimal as possible and push all non-trivial logic into the individual // task functions it calls. 
-func prepareCluster(ctx context.Context, clusterModel *armcontainerservice.ManagedCluster, isNetworkIsolated, attachPrivateAcr bool) (*Cluster, error) { +func prepareCluster(ctx context.Context, infra *ClusterInfra, clusterModel *armcontainerservice.ManagedCluster, isNetworkIsolated, attachPrivateAcr bool) (*Cluster, error) { defer toolkit.LogStepCtx(ctx, "preparing cluster")() ctx, cancel := context.WithTimeout(ctx, config.Config.TestTimeoutCluster) defer cancel() clusterModel.Name = to.Ptr(fmt.Sprintf("%s-%s", *clusterModel.Name, hash(clusterModel))) - cluster, err := getOrCreateCluster(ctx, clusterModel) + cluster, err := getOrCreateCluster(ctx, infra, clusterModel) if err != nil { return nil, fmt.Errorf("get or create cluster: %w", err) } @@ -82,11 +82,11 @@ func prepareCluster(ctx context.Context, clusterModel *armcontainerservice.Manag // finish before other subnet writes (firewall / network-isolated setup) // to avoid Azure VNet serialisation races. bastion := dag.Go(g, func(ctx context.Context) (*Bastion, error) { - return getOrCreateBastion(ctx, cluster) + return getOrCreateBastion(ctx, infra, cluster) }) - dag.Run(g, func(ctx context.Context) error { return ensureMaintenanceConfiguration(ctx, cluster) }) - subnet := dag.Go(g, func(ctx context.Context) (string, error) { return getClusterSubnetID(ctx, cluster) }) - kube := dag.Go(g, func(ctx context.Context) (*Kubeclient, error) { return getClusterKubeClient(ctx, cluster) }) + dag.Run(g, func(ctx context.Context) error { return ensureMaintenanceConfiguration(ctx, infra, cluster) }) + subnet := dag.Go(g, func(ctx context.Context) (string, error) { return getClusterSubnetID(ctx, infra, cluster) }) + kube := dag.Go(g, func(ctx context.Context) (*Kubeclient, error) { return getClusterKubeClient(ctx, infra, cluster) }) identity := dag.Go(g, func(ctx context.Context) (*armcontainerservice.UserAssignedIdentity, error) { return getClusterKubeletIdentity(ctx, cluster) }) @@ -98,12 +98,12 @@ func prepareCluster(ctx 
context.Context, clusterModel *armcontainerservice.Manag // objects whose backing VMSS no longer exist. var networkDeps []dag.Dep if !isNetworkIsolated { - networkDeps = append(networkDeps, dag.Run(g, func(ctx context.Context) error { return addFirewallRules(ctx, cluster) }, bastion)) + networkDeps = append(networkDeps, dag.Run(g, func(ctx context.Context) error { return addFirewallRules(ctx, infra, cluster) }, bastion)) } if isNetworkIsolated { networkDeps = append(networkDeps, dag.Run(g, func(ctx context.Context) error { return addNetworkIsolatedSettings(ctx, cluster) }, bastion)) } - dag.Run1(g, kube, func(ctx context.Context, k *Kubeclient) error { return collectGarbageVMSS(ctx, cluster, k) }, networkDeps...) + dag.Run1(g, kube, func(ctx context.Context, k *Kubeclient) error { return collectGarbageVMSS(ctx, infra, cluster, k) }, networkDeps...) needACR := isNetworkIsolated || attachPrivateAcr acrNonAnon := dag.Run2(g, kube, identity, addACR(cluster, needACR, true)) acrAnon := dag.Run2(g, kube, identity, addACR(cluster, needACR, false)) @@ -113,6 +113,7 @@ func prepareCluster(ctx context.Context, clusterModel *armcontainerservice.Manag if err := g.Wait(); err != nil { return nil, fmt.Errorf("prepare cluster tasks: %w", err) } + return &Cluster{ Model: cluster, Kube: kube.MustGet(), @@ -235,9 +236,10 @@ func hash(cluster *armcontainerservice.ManagedCluster) string { return hexHash[:5] } -func getOrCreateCluster(ctx context.Context, cluster *armcontainerservice.ManagedCluster) (*armcontainerservice.ManagedCluster, error) { +func getOrCreateCluster(ctx context.Context, infra *ClusterInfra, cluster *armcontainerservice.ManagedCluster) (*armcontainerservice.ManagedCluster, error) { defer toolkit.LogStepCtxf(ctx, "get or create cluster %s", *cluster.Name)() - existingCluster, err := getExistingCluster(ctx, *cluster.Location, *cluster.Name) + rgName := infra.ResourceGroupName(*cluster.Location) + existingCluster, err := getExistingCluster(ctx, infra, rgName, 
*cluster.Name) if err != nil { return nil, fmt.Errorf("failed to get existing cluster %q: %w, and wont retry", *cluster.Name, err) } @@ -247,13 +249,12 @@ func getOrCreateCluster(ctx context.Context, cluster *armcontainerservice.Manage return existingCluster, nil } - return createNewAKSClusterWithRetry(ctx, cluster) + return createNewAKSClusterWithRetry(ctx, infra, rgName, cluster) } // isExistingCluster checks if an AKS cluster exists. return the cluster only if its provisioning state is Succeeded and can be used. non-nil error if not retriable -func getExistingCluster(ctx context.Context, location, clusterName string) (*armcontainerservice.ManagedCluster, error) { - resourceGroupName := config.ResourceGroupName(location) - existingCluster, err := config.Azure.AKS.Get(ctx, resourceGroupName, clusterName, nil) +func getExistingCluster(ctx context.Context, infra *ClusterInfra, resourceGroupName, clusterName string) (*armcontainerservice.ManagedCluster, error) { + existingCluster, err := infra.Azure.AKS.Get(ctx, resourceGroupName, clusterName, nil) var azErr *azcore.ResponseError if errors.As(err, &azErr) { if azErr.StatusCode == 404 { @@ -266,7 +267,7 @@ func getExistingCluster(ctx context.Context, location, clusterName string) (*arm switch *existingCluster.Properties.ProvisioningState { case "Succeeded": - nodeRGExists, err := isExistingResourceGroup(ctx, *existingCluster.Properties.NodeResourceGroup) + nodeRGExists, err := isExistingResourceGroup(ctx, infra, *existingCluster.Properties.NodeResourceGroup) if err != nil { return nil, err @@ -278,28 +279,28 @@ func getExistingCluster(ctx context.Context, location, clusterName string) (*arm fallthrough case "Failed": toolkit.Logf(ctx, "##vso[task.logissue type=warning;]Cluster %s in Failed state, deleting", clusterName) - if err := deleteCluster(ctx, clusterName, resourceGroupName); err != nil { + if err := deleteCluster(ctx, infra, clusterName, resourceGroupName); err != nil { return nil, err } // Wait for Azure to 
confirm cluster is fully deleted before allowing recreation. // This prevents "Reconcile managed identity credential failed" errors where Azure's // backend still has stale references to the old cluster during the new cluster's // identity reconciliation process. - if err := waitForClusterDeletion(ctx, clusterName, resourceGroupName); err != nil { + if err := waitForClusterDeletion(ctx, infra, clusterName, resourceGroupName); err != nil { return nil, fmt.Errorf("failed waiting for cluster deletion: %w", err) } return nil, nil default: // other provisioning state, deleting, , stopping,,cancaled,cancelling,"Creating", "Updating", "Scaling", "Migrating", "Upgrading", "Starting", "Restoring": .. plus many others. toolkit.Logf(ctx, "##vso[task.logissue type=warning;]Unexpected cluster provisioning state %s: %s", clusterName, *existingCluster.Properties.ProvisioningState) - return waitUntilClusterReady(ctx, clusterName, location) + return waitUntilClusterReady(ctx, infra, clusterName, resourceGroupName) } } -func deleteCluster(ctx context.Context, clusterName, resourceGroupName string) error { +func deleteCluster(ctx context.Context, infra *ClusterInfra, clusterName, resourceGroupName string) error { defer toolkit.LogStepCtxf(ctx, "deleting cluster %s", clusterName)() // beileih: why do we do this? 
- _, err := config.Azure.AKS.Get(ctx, resourceGroupName, clusterName, nil) + _, err := infra.Azure.AKS.Get(ctx, resourceGroupName, clusterName, nil) if err != nil { var azErr *azcore.ResponseError if errors.As(err, &azErr) && azErr.StatusCode == 404 { @@ -309,7 +310,7 @@ func deleteCluster(ctx context.Context, clusterName, resourceGroupName string) e return fmt.Errorf("failed to retrieve cluster while trying to delete it %q: %w", clusterName, err) } - pollerResp, err := config.Azure.AKS.BeginDelete(ctx, resourceGroupName, clusterName, nil) + pollerResp, err := infra.Azure.AKS.BeginDelete(ctx, resourceGroupName, clusterName, nil) if err != nil { return fmt.Errorf("failed to delete cluster %q: %w", clusterName, err) } @@ -320,9 +321,9 @@ func deleteCluster(ctx context.Context, clusterName, resourceGroupName string) e return nil } -func waitForClusterDeletion(ctx context.Context, clusterName, resourceGroupName string) error { +func waitForClusterDeletion(ctx context.Context, infra *ClusterInfra, clusterName, resourceGroupName string) error { return wait.PollUntilContextCancel(ctx, 5*time.Second, true, func(ctx context.Context) (bool, error) { - _, err := config.Azure.AKS.Get(ctx, resourceGroupName, clusterName, nil) + _, err := infra.Azure.AKS.Get(ctx, resourceGroupName, clusterName, nil) if err != nil { var azErr *azcore.ResponseError if errors.As(err, &azErr) && azErr.StatusCode == 404 { @@ -334,11 +335,11 @@ func waitForClusterDeletion(ctx context.Context, clusterName, resourceGroupName }) } -func waitUntilClusterReady(ctx context.Context, name, location string) (*armcontainerservice.ManagedCluster, error) { +func waitUntilClusterReady(ctx context.Context, infra *ClusterInfra, name, resourceGroupName string) (*armcontainerservice.ManagedCluster, error) { var cluster armcontainerservice.ManagedClustersClientGetResponse err := wait.PollUntilContextCancel(ctx, time.Second, true, func(ctx context.Context) (bool, error) { var err error - cluster, err = 
config.Azure.AKS.Get(ctx, config.ResourceGroupName(location), name, nil) + cluster, err = infra.Azure.AKS.Get(ctx, resourceGroupName, name, nil) if err != nil { return false, err } @@ -357,8 +358,8 @@ func waitUntilClusterReady(ctx context.Context, name, location string) (*armcont return &cluster.ManagedCluster, nil } -func isExistingResourceGroup(ctx context.Context, resourceGroupName string) (bool, error) { - rgExistence, err := config.Azure.ResourceGroup.CheckExistence(ctx, resourceGroupName, nil) +func isExistingResourceGroup(ctx context.Context, infra *ClusterInfra, resourceGroupName string) (bool, error) { + rgExistence, err := infra.Azure.ResourceGroup.CheckExistence(ctx, resourceGroupName, nil) if err != nil { return false, fmt.Errorf("failed to get RG %q: %w", resourceGroupName, err) } @@ -366,11 +367,11 @@ func isExistingResourceGroup(ctx context.Context, resourceGroupName string) (boo return rgExistence.Success, nil } -func createNewAKSCluster(ctx context.Context, cluster *armcontainerservice.ManagedCluster) (*armcontainerservice.ManagedCluster, error) { +func createNewAKSCluster(ctx context.Context, infra *ClusterInfra, rgName string, cluster *armcontainerservice.ManagedCluster) (*armcontainerservice.ManagedCluster, error) { // Note, it seems like the operation still can start a trigger a new operation even if nothing has changes - pollerResp, err := config.Azure.AKS.BeginCreateOrUpdate( + pollerResp, err := infra.Azure.AKS.BeginCreateOrUpdate( ctx, - config.ResourceGroupName(*cluster.Location), + rgName, *cluster.Name, *cluster, nil, @@ -391,16 +392,16 @@ func createNewAKSCluster(ctx context.Context, cluster *armcontainerservice.Manag // that retries creating a cluster if it fails with a 409 Conflict error // clusters are reused, and sometimes a cluster can be in UPDATING or DELETING state // simple retry should be sufficient to avoid such conflicts -func createNewAKSClusterWithRetry(ctx context.Context, cluster *armcontainerservice.ManagedCluster) 
(*armcontainerservice.ManagedCluster, error) { +func createNewAKSClusterWithRetry(ctx context.Context, infra *ClusterInfra, rgName string, cluster *armcontainerservice.ManagedCluster) (*armcontainerservice.ManagedCluster, error) { maxRetries := 10 retryInterval := 30 * time.Second var lastErr error for attempt := 0; attempt < maxRetries; attempt++ { if attempt > 0 { - toolkit.Logf(ctx, "Attempt %d: creating or updating cluster %s in region %s and rg %s", attempt+1, *cluster.Name, *cluster.Location, config.ResourceGroupName(*cluster.Location)) + toolkit.Logf(ctx, "Attempt %d: creating or updating cluster %s in region %s and rg %s", attempt+1, *cluster.Name, *cluster.Location, rgName) } - createdCluster, err := createNewAKSCluster(ctx, cluster) + createdCluster, err := createNewAKSCluster(ctx, infra, rgName, cluster) if err == nil { return createdCluster, nil } @@ -426,11 +427,12 @@ func createNewAKSClusterWithRetry(ctx context.Context, cluster *armcontainerserv return nil, fmt.Errorf("failed to create cluster after %d attempts due to persistent 409 Conflict: %w", maxRetries, lastErr) } -func ensureMaintenanceConfiguration(ctx context.Context, cluster *armcontainerservice.ManagedCluster) error { - _, err := config.Azure.Maintenance.Get(ctx, config.ResourceGroupName(*cluster.Location), *cluster.Name, "default", nil) +func ensureMaintenanceConfiguration(ctx context.Context, infra *ClusterInfra, cluster *armcontainerservice.ManagedCluster) error { + rgName := infra.ResourceGroupName(*cluster.Location) + _, err := infra.Azure.Maintenance.Get(ctx, rgName, *cluster.Name, "default", nil) var azErr *azcore.ResponseError if errors.As(err, &azErr) && azErr.StatusCode == 404 { - _, err = createNewMaintenanceConfiguration(ctx, cluster) + _, err = createNewMaintenanceConfiguration(ctx, infra, cluster) if err != nil { return fmt.Errorf("creating maintenance configuration for cluster %q: %w", *cluster.Name, err) } @@ -442,8 +444,9 @@ func ensureMaintenanceConfiguration(ctx 
context.Context, cluster *armcontainerse return nil } -func createNewMaintenanceConfiguration(ctx context.Context, cluster *armcontainerservice.ManagedCluster) (*armcontainerservice.MaintenanceConfiguration, error) { - toolkit.Logf(ctx, "creating maintenance configuration for cluster %s in rg %s", *cluster.Name, config.ResourceGroupName(*cluster.Location)) +func createNewMaintenanceConfiguration(ctx context.Context, infra *ClusterInfra, cluster *armcontainerservice.ManagedCluster) (*armcontainerservice.MaintenanceConfiguration, error) { + rgName := infra.ResourceGroupName(*cluster.Location) + toolkit.Logf(ctx, "creating maintenance configuration for cluster %s in rg %s", *cluster.Name, rgName) maintenance := armcontainerservice.MaintenanceConfiguration{ Properties: &armcontainerservice.MaintenanceConfigurationProperties{ MaintenanceWindow: &armcontainerservice.MaintenanceWindow{ @@ -465,7 +468,7 @@ func createNewMaintenanceConfiguration(ctx context.Context, cluster *armcontaine }, } - _, err := config.Azure.Maintenance.CreateOrUpdate(ctx, config.ResourceGroupName(*cluster.Location), *cluster.Name, "default", maintenance, nil) + _, err := infra.Azure.Maintenance.CreateOrUpdate(ctx, rgName, *cluster.Name, "default", maintenance, nil) if err != nil { return nil, fmt.Errorf("failed to create maintenance configuration: %w", err) } @@ -473,23 +476,23 @@ func createNewMaintenanceConfiguration(ctx context.Context, cluster *armcontaine return &maintenance, nil } -func getOrCreateBastion(ctx context.Context, cluster *armcontainerservice.ManagedCluster) (*Bastion, error) { +func getOrCreateBastion(ctx context.Context, infra *ClusterInfra, cluster *armcontainerservice.ManagedCluster) (*Bastion, error) { nodeRG := *cluster.Properties.NodeResourceGroup bastionName := fmt.Sprintf("%s-bastion", *cluster.Name) - existing, err := config.Azure.BastionHosts.Get(ctx, nodeRG, bastionName, nil) + existing, err := infra.Azure.BastionHosts.Get(ctx, nodeRG, bastionName, nil) var azErr 
*azcore.ResponseError if errors.As(err, &azErr) && azErr.StatusCode == http.StatusNotFound { - return createNewBastion(ctx, cluster) + return createNewBastion(ctx, infra, cluster) } if err != nil { return nil, fmt.Errorf("failed to get bastion %q in rg %q: %w", bastionName, nodeRG, err) } - return NewBastion(config.Azure.Credential, config.Config.SubscriptionID, nodeRG, *existing.BastionHost.Properties.DNSName), nil + return NewBastion(infra.Azure.Credential, infra.SubscriptionID, nodeRG, *existing.BastionHost.Properties.DNSName), nil } -func createNewBastion(ctx context.Context, cluster *armcontainerservice.ManagedCluster) (*Bastion, error) { +func createNewBastion(ctx context.Context, infra *ClusterInfra, cluster *armcontainerservice.ManagedCluster) (*Bastion, error) { nodeRG := *cluster.Properties.NodeResourceGroup location := *cluster.Location bastionName := fmt.Sprintf("%s-bastion", *cluster.Name) @@ -497,7 +500,7 @@ func createNewBastion(ctx context.Context, cluster *armcontainerservice.ManagedC publicIPName := fmt.Sprintf("%s-bastion-pip", *cluster.Name) publicIPName = sanitizeAzureResourceName(publicIPName) - vnet, err := getClusterVNet(ctx, nodeRG) + vnet, err := getClusterVNet(ctx, infra, nodeRG) if err != nil { return nil, fmt.Errorf("get cluster vnet in rg %q: %w", nodeRG, err) } @@ -511,7 +514,7 @@ func createNewBastion(ctx context.Context, cluster *armcontainerservice.ManagedC } var bastionSubnetID string - bastionSubnet, subnetGetErr := config.Azure.Subnet.Get(ctx, nodeRG, vnet.name, bastionSubnetName, nil) + bastionSubnet, subnetGetErr := infra.Azure.Subnet.Get(ctx, nodeRG, vnet.name, bastionSubnetName, nil) if subnetGetErr != nil { var subnetAzErr *azcore.ResponseError if !errors.As(subnetGetErr, &subnetAzErr) || subnetAzErr.StatusCode != http.StatusNotFound { @@ -524,7 +527,7 @@ func createNewBastion(ctx context.Context, cluster *armcontainerservice.ManagedC AddressPrefix: to.Ptr(bastionSubnetPrefix), }, } - subnetPoller, err := 
config.Azure.Subnet.BeginCreateOrUpdate(ctx, nodeRG, vnet.name, bastionSubnetName, subnetParams, nil) + subnetPoller, err := infra.Azure.Subnet.BeginCreateOrUpdate(ctx, nodeRG, vnet.name, bastionSubnetName, subnetParams, nil) if err != nil { return nil, fmt.Errorf("failed to start creating bastion subnet: %w", err) } @@ -549,7 +552,7 @@ func createNewBastion(ctx context.Context, cluster *armcontainerservice.ManagedC } toolkit.Logf(ctx, "creating bastion public IP %s (rg %s)", publicIPName, nodeRG) - pipPoller, err := config.Azure.PublicIPAddresses.BeginCreateOrUpdate(ctx, nodeRG, publicIPName, pipParams, nil) + pipPoller, err := infra.Azure.PublicIPAddresses.BeginCreateOrUpdate(ctx, nodeRG, publicIPName, pipParams, nil) if err != nil { return nil, fmt.Errorf("failed to start creating bastion public IP: %w", err) } @@ -586,7 +589,7 @@ func createNewBastion(ctx context.Context, cluster *armcontainerservice.ManagedC } toolkit.Logf(ctx, "creating bastion %s (native client/tunneling enabled) in rg %s", bastionName, nodeRG) - bastionPoller, err := config.Azure.BastionHosts.BeginCreateOrUpdate(ctx, nodeRG, bastionName, bastionHost, nil) + bastionPoller, err := infra.Azure.BastionHosts.BeginCreateOrUpdate(ctx, nodeRG, bastionName, bastionHost, nil) if err != nil { return nil, fmt.Errorf("failed to start creating bastion: %w", err) } @@ -595,23 +598,23 @@ func createNewBastion(ctx context.Context, cluster *armcontainerservice.ManagedC return nil, fmt.Errorf("failed to create bastion: %w", err) } - bastion := NewBastion(config.Azure.Credential, config.Config.SubscriptionID, nodeRG, *resp.BastionHost.Properties.DNSName) + bastion := NewBastion(infra.Azure.Credential, infra.SubscriptionID, nodeRG, *resp.BastionHost.Properties.DNSName) - if err := verifyBastion(ctx, cluster, bastion); err != nil { + if err := verifyBastion(ctx, infra, cluster, bastion); err != nil { return nil, fmt.Errorf("failed to verify bastion: %w", err) } return bastion, nil } -func verifyBastion(ctx 
context.Context, cluster *armcontainerservice.ManagedCluster, bastion *Bastion) error { +func verifyBastion(ctx context.Context, infra *ClusterInfra, cluster *armcontainerservice.ManagedCluster, bastion *Bastion) error { nodeRG := *cluster.Properties.NodeResourceGroup - vmssName, err := getSystemPoolVMSSName(ctx, cluster) + vmssName, err := getSystemPoolVMSSName(ctx, infra, cluster) if err != nil { return err } var vmssVM *armcompute.VirtualMachineScaleSetVM - pager := config.Azure.VMSSVM.NewListPager(nodeRG, vmssName, nil) + pager := infra.Azure.VMSSVM.NewListPager(nodeRG, vmssName, nil) if pager.More() { page, err := pager.NextPage(ctx) if err != nil { @@ -622,7 +625,7 @@ func verifyBastion(ctx context.Context, cluster *armcontainerservice.ManagedClus } } - vmPrivateIP, err := getPrivateIPFromVMSSVM(ctx, nodeRG, vmssName, *vmssVM.InstanceID) + vmPrivateIP, err := getPrivateIPFromVMSSVMWithClient(ctx, infra.Azure, nodeRG, vmssName, *vmssVM.InstanceID) ctx, cancel := context.WithCancel(ctx) defer cancel() @@ -644,7 +647,7 @@ func verifyBastion(ctx context.Context, cluster *armcontainerservice.ManagedClus return fmt.Errorf("Executed ssh on wrong VM, Expected %s: %s", vmssName, result.stdout) } -func getSystemPoolVMSSName(ctx context.Context, cluster *armcontainerservice.ManagedCluster) (string, error) { +func getSystemPoolVMSSName(ctx context.Context, infra *ClusterInfra, cluster *armcontainerservice.ManagedCluster) (string, error) { nodeRG := *cluster.Properties.NodeResourceGroup var systemPoolName string for _, pool := range cluster.Properties.AgentPoolProfiles { @@ -652,7 +655,7 @@ func getSystemPoolVMSSName(ctx context.Context, cluster *armcontainerservice.Man systemPoolName = *pool.Name } } - pager := config.Azure.VMSS.NewListPager(nodeRG, nil) + pager := infra.Azure.VMSS.NewListPager(nodeRG, nil) if pager.More() { page, err := pager.NextPage(ctx) if err != nil { @@ -684,8 +687,8 @@ type VNet struct { subnetId string } -func getClusterVNet(ctx context.Context, 
mcResourceGroupName string) (VNet, error) { - pager := config.Azure.VNet.NewListPager(mcResourceGroupName, nil) +func getClusterVNet(ctx context.Context, infra *ClusterInfra, mcResourceGroupName string) (VNet, error) { + pager := infra.Azure.VNet.NewListPager(mcResourceGroupName, nil) for pager.More() { nextResult, err := pager.NextPage(ctx) if err != nil { @@ -701,13 +704,13 @@ func getClusterVNet(ctx context.Context, mcResourceGroupName string) (VNet, erro return VNet{}, fmt.Errorf("failed to find aks vnet") } -func collectGarbageVMSS(ctx context.Context, cluster *armcontainerservice.ManagedCluster, kube *Kubeclient) error { +func collectGarbageVMSS(ctx context.Context, infra *ClusterInfra, cluster *armcontainerservice.ManagedCluster, kube *Kubeclient) error { defer toolkit.LogStepCtx(ctx, "collecting garbage VMSS")() rg := *cluster.Properties.NodeResourceGroup // Build a set of all existing VMSS names while deleting old ones. existingVMSS := map[string]struct{}{} - pager := config.Azure.VMSS.NewListPager(rg, nil) + pager := infra.Azure.VMSS.NewListPager(rg, nil) for pager.More() { page, err := pager.NextPage(ctx) if err != nil { @@ -730,7 +733,7 @@ func collectGarbageVMSS(ctx context.Context, cluster *armcontainerservice.Manage continue } - _, err := config.Azure.VMSS.BeginDelete(ctx, rg, *vmss.Name, &armcompute.VirtualMachineScaleSetsClientBeginDeleteOptions{ + _, err := infra.Azure.VMSS.BeginDelete(ctx, rg, *vmss.Name, &armcompute.VirtualMachineScaleSetsClientBeginDeleteOptions{ ForceDeletion: to.Ptr(true), }) if err != nil { @@ -790,8 +793,12 @@ func collectGarbageNodes(ctx context.Context, kube *Kubeclient, existingVMSS map } func ensureResourceGroup(ctx context.Context, location string) (armresources.ResourceGroup, error) { - resourceGroupName := config.ResourceGroupName(location) - rg, err := config.Azure.ResourceGroup.CreateOrUpdate( + return ensureResourceGroupWithInfra(ctx, DefaultClusterInfra, location) +} + +func ensureResourceGroupWithInfra(ctx 
context.Context, infra *ClusterInfra, location string) (armresources.ResourceGroup, error) { + resourceGroupName := infra.ResourceGroupName(location) + rg, err := infra.Azure.ResourceGroup.CreateOrUpdate( ctx, resourceGroupName, armresources.ResourceGroup{ diff --git a/e2e/config/azure.go b/e2e/config/azure.go index d0de6f04619..847db25a269 100644 --- a/e2e/config/azure.go +++ b/e2e/config/azure.go @@ -117,6 +117,10 @@ func NewHttpClient() *http.Client { } func NewAzureClient() (*AzureClient, error) { + return NewAzureClientForSubscription(Config.SubscriptionID) +} + +func NewAzureClientForSubscription(subscriptionID string) (*AzureClient, error) { httpClient := NewHttpClient() logger := runtime.NewLogPolicy(&policy.LogOptions{ IncludeBody: true, @@ -155,193 +159,183 @@ func NewAzureClient() (*AzureClient, error) { return nil, fmt.Errorf("create core client: %w", err) } - cloud.PublicIPAddresses, err = armnetwork.NewPublicIPAddressesClient(Config.SubscriptionID, credential, opts) + cloud.PublicIPAddresses, err = armnetwork.NewPublicIPAddressesClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("create public ip addresses client: %w", err) } - cloud.BastionHosts, err = armnetwork.NewBastionHostsClient(Config.SubscriptionID, credential, opts) - if err != nil { - return nil, fmt.Errorf("create bastion hosts client: %w", err) - } - - cloud.BastionHosts, err = armnetwork.NewBastionHostsClient(Config.SubscriptionID, credential, opts) + cloud.BastionHosts, err = armnetwork.NewBastionHostsClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("create bastion hosts client: %w", err) } - cloud.RegistriesClient, err = armcontainerregistry.NewRegistriesClient(Config.SubscriptionID, credential, opts) + cloud.RegistriesClient, err = armcontainerregistry.NewRegistriesClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("failed to create registry client: %w", err) } - cloud.CacheRulesClient, err = 
armcontainerregistry.NewCacheRulesClient(Config.SubscriptionID, credential, opts) + cloud.CacheRulesClient, err = armcontainerregistry.NewCacheRulesClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("failed to create cache rules client: %w", err) } - cloud.PrivateEndpointClient, err = armnetwork.NewPrivateEndpointsClient(Config.SubscriptionID, credential, opts) + cloud.PrivateEndpointClient, err = armnetwork.NewPrivateEndpointsClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("failed to create private endpoint client: %w", err) } - cloud.PrivateZonesClient, err = armprivatedns.NewPrivateZonesClient(Config.SubscriptionID, credential, opts) + cloud.PrivateZonesClient, err = armprivatedns.NewPrivateZonesClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("failed to create private dns zones client: %w", err) } - cloud.VirutalNetworkLinksClient, err = armprivatedns.NewVirtualNetworkLinksClient(Config.SubscriptionID, credential, opts) + cloud.VirutalNetworkLinksClient, err = armprivatedns.NewVirtualNetworkLinksClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("failed to create virtual network links client: %w", err) } - cloud.RecordSetClient, err = armprivatedns.NewRecordSetsClient(Config.SubscriptionID, credential, opts) + cloud.RecordSetClient, err = armprivatedns.NewRecordSetsClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("failed to create record set client: %w", err) } - cloud.PrivateDNSZoneGroup, err = armnetwork.NewPrivateDNSZoneGroupsClient(Config.SubscriptionID, credential, opts) + cloud.PrivateDNSZoneGroup, err = armnetwork.NewPrivateDNSZoneGroupsClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("failed to create private dns zone group client: %w", err) } - cloud.SecurityGroup, err = armnetwork.NewSecurityGroupsClient(Config.SubscriptionID, credential, opts) + cloud.SecurityGroup, err 
= armnetwork.NewSecurityGroupsClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("create security group client: %w", err) } - cloud.Subnet, err = armnetwork.NewSubnetsClient(Config.SubscriptionID, credential, opts) + cloud.Subnet, err = armnetwork.NewSubnetsClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("create subnet client: %w", err) } - cloud.RouteTables, err = armnetwork.NewRouteTablesClient(Config.SubscriptionID, credential, opts) + cloud.RouteTables, err = armnetwork.NewRouteTablesClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("create route tables client: %w", err) } - cloud.Routes, err = armnetwork.NewRoutesClient(Config.SubscriptionID, credential, opts) + cloud.Routes, err = armnetwork.NewRoutesClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("create routes client: %w", err) } - cloud.AKS, err = armcontainerservice.NewManagedClustersClient(Config.SubscriptionID, credential, opts) + cloud.AKS, err = armcontainerservice.NewManagedClustersClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("create aks client: %w", err) } - cloud.Maintenance, err = armcontainerservice.NewMaintenanceConfigurationsClient(Config.SubscriptionID, credential, opts) + cloud.Maintenance, err = armcontainerservice.NewMaintenanceConfigurationsClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("failed to create maintenance client: %w", err) } - cloud.NetworkInterfaces, err = armnetwork.NewInterfacesClient(Config.SubscriptionID, credential, opts) + cloud.NetworkInterfaces, err = armnetwork.NewInterfacesClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("create network interfaces client: %w", err) } - cloud.VMSS, err = armcompute.NewVirtualMachineScaleSetsClient(Config.SubscriptionID, credential, opts) + cloud.VMSS, err = armcompute.NewVirtualMachineScaleSetsClient(subscriptionID, 
credential, opts) if err != nil { return nil, fmt.Errorf("create vmss client: %w", err) } - cloud.VMSSVM, err = armcompute.NewVirtualMachineScaleSetVMsClient(Config.SubscriptionID, credential, opts) + cloud.VMSSVM, err = armcompute.NewVirtualMachineScaleSetVMsClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("create vmss vm client: %w", err) } - cloud.VMs, err = armcompute.NewVirtualMachinesClient(Config.SubscriptionID, credential, opts) + cloud.VMs, err = armcompute.NewVirtualMachinesClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("create vms client: %w", err) } - cloud.Images, err = armcompute.NewImagesClient(Config.SubscriptionID, credential, opts) + cloud.Images, err = armcompute.NewImagesClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("create images client: %w", err) } - cloud.Snapshots, err = armcompute.NewSnapshotsClient(Config.SubscriptionID, credential, opts) + cloud.Snapshots, err = armcompute.NewSnapshotsClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("create snapshots client: %w", err) } - cloud.GalleryImages, err = armcompute.NewGalleryImagesClient(Config.SubscriptionID, credential, opts) + cloud.GalleryImages, err = armcompute.NewGalleryImagesClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("create gallery images client: %w", err) } - cloud.GalleryImageVersions, err = armcompute.NewGalleryImageVersionsClient(Config.SubscriptionID, credential, opts) + cloud.GalleryImageVersions, err = armcompute.NewGalleryImageVersionsClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("create gallery image versions client: %w", err) } - cloud.Resource, err = armresources.NewClient(Config.SubscriptionID, credential, opts) + cloud.Resource, err = armresources.NewClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("create resource client: %w", err) } - 
cloud.ResourceGroup, err = armresources.NewResourceGroupsClient(Config.SubscriptionID, credential, opts) + cloud.ResourceGroup, err = armresources.NewResourceGroupsClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("create resource group client: %w", err) } - cloud.VNet, err = armnetwork.NewVirtualNetworksClient(Config.SubscriptionID, credential, opts) + cloud.VNet, err = armnetwork.NewVirtualNetworksClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("create vnet client: %w", err) } - cloud.AzureFirewall, err = armnetwork.NewAzureFirewallsClient(Config.SubscriptionID, credential, opts) + cloud.AzureFirewall, err = armnetwork.NewAzureFirewallsClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("create firewall client: %w", err) } - cloud.PublicIPAddresses, err = armnetwork.NewPublicIPAddressesClient(Config.SubscriptionID, credential, opts) - if err != nil { - return nil, fmt.Errorf("create public ip addresses client: %w", err) - } - cloud.Blob, err = azblob.NewClient(Config.BlobStorageAccountURL(), credential, nil) if err != nil { return nil, fmt.Errorf("create blob container client: %w", err) } - cloud.StorageContainers, err = armstorage.NewBlobContainersClient(Config.SubscriptionID, credential, opts) + cloud.StorageContainers, err = armstorage.NewBlobContainersClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("create blob container client: %w", err) } - cloud.RoleAssignments, err = armauthorization.NewRoleAssignmentsClient(Config.SubscriptionID, credential, opts) + cloud.RoleAssignments, err = armauthorization.NewRoleAssignmentsClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("create role assignment client: %w", err) } - cloud.UserAssignedIdentities, err = armmsi.NewUserAssignedIdentitiesClient(Config.SubscriptionID, credential, nil) + cloud.UserAssignedIdentities, err = 
armmsi.NewUserAssignedIdentitiesClient(subscriptionID, credential, nil) if err != nil { return nil, fmt.Errorf("create user assigned identities client: %w", err) } - cloud.StorageAccounts, err = armstorage.NewAccountsClient(Config.SubscriptionID, credential, nil) + cloud.StorageAccounts, err = armstorage.NewAccountsClient(subscriptionID, credential, nil) if err != nil { return nil, fmt.Errorf("create storage accounts client: %w", err) } - cloud.VMSSVMRunCommands, err = armcompute.NewVirtualMachineScaleSetVMRunCommandsClient(Config.SubscriptionID, credential, opts) + cloud.VMSSVMRunCommands, err = armcompute.NewVirtualMachineScaleSetVMRunCommandsClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("create vmss vm run command client: %w", err) } - cloud.VMExtensionImages, err = armcompute.NewVirtualMachineExtensionImagesClient(Config.SubscriptionID, credential, opts) + cloud.VMExtensionImages, err = armcompute.NewVirtualMachineExtensionImagesClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("create vm extension images client: %w", err) } - cloud.ResourceSKUs, err = armcompute.NewResourceSKUsClient(Config.SubscriptionID, credential, opts) + cloud.ResourceSKUs, err = armcompute.NewResourceSKUsClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("create resource skus client: %w", err) } // Ensure the gallery exists - cloud.Galleries, err = armcompute.NewGalleriesClient(Config.SubscriptionID, credential, opts) + cloud.Galleries, err = armcompute.NewGalleriesClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("failed to create galleries client: %w", err) } @@ -419,6 +413,18 @@ func (a *AzureClient) CreateVMManagedIdentity(ctx context.Context, identityLocat return *identity.Properties.ClientID, nil } +// CreateVMManagedIdentityInRG creates a VM managed identity in the specified resource group +// without creating blob storage infrastructure (which belongs to the 
default subscription). +func (a *AzureClient) CreateVMManagedIdentityInRG(ctx context.Context, resourceGroupName, location string) (string, error) { + identity, err := a.UserAssignedIdentities.CreateOrUpdate(ctx, resourceGroupName, VMIdentityName, armmsi.Identity{ + Location: to.Ptr(location), + }, nil) + if err != nil { + return "", fmt.Errorf("create managed identity in RG %s: %w", resourceGroupName, err) + } + return *identity.Properties.ClientID, nil +} + func (a *AzureClient) createBlobStorageAccount(ctx context.Context) error { poller, err := a.StorageAccounts.BeginCreate(ctx, ResourceGroupName(Config.DefaultLocation), Config.BlobStorageAccount(), armstorage.AccountCreateParameters{ Kind: to.Ptr(armstorage.KindStorageV2), diff --git a/e2e/config/config.go b/e2e/config/config.go index d61db484c6e..bd3f9c677c2 100644 --- a/e2e/config/config.go +++ b/e2e/config/config.go @@ -29,6 +29,10 @@ var ( Azure = mustNewAzureClient() VMIdentityName = "abe2e-vm-identity" + // RCV1PAzure is lazily initialized when RCV1PSubscriptionID is set. + // It provides Azure clients bound to the PlatformSettingsOverride-registered subscription. 
+ RCV1PAzure *AzureClient + DefaultPollUntilDoneOptions = &runtime.PollUntilDoneOptions{ Frequency: time.Second, } @@ -40,6 +44,14 @@ func ResourceGroupName(location string) string { return "abe2e-" + location } +func RCV1PResourceGroupName(location string) string { + return "abe2e-rcv1p-" + location +} + +func (c *Configuration) RCV1PVMIdentityResourceID(location string) string { + return fmt.Sprintf("/subscriptions/%s/resourceGroups/%s/providers/Microsoft.ManagedIdentity/userAssignedIdentities/%s", c.RCV1PSubscriptionID, RCV1PResourceGroupName(location), VMIdentityName) +} + func PrivateACRNameNotAnon(location string) string { return "abe2eprivatenonanon" + location // will have anonymous pull enabled } @@ -88,6 +100,7 @@ type Configuration struct { TestTimeoutCluster time.Duration `env:"TEST_TIMEOUT_CLUSTER" envDefault:"20m"` TestTimeoutVMSS time.Duration `env:"TEST_TIMEOUT_VMSS" envDefault:"17m"` WindowsAdminPassword string `env:"WINDOWS_ADMIN_PASSWORD"` + RCV1PSubscriptionID string `env:"RCV1P_SUBSCRIPTION_ID"` } func (c *Configuration) BlobStorageAccount() string { @@ -169,6 +182,16 @@ func mustLoadConfig() *Configuration { return cfg } +func init() { + if Config.RCV1PSubscriptionID != "" && !strings.HasPrefix(Config.RCV1PSubscriptionID, "$(") { + client, err := NewAzureClientForSubscription(Config.RCV1PSubscriptionID) + if err != nil { + panic(fmt.Sprintf("failed to create RCV1P Azure client: %v", err)) + } + RCV1PAzure = client + } +} + // Returns a newly generated RSA public/private key pair with the private key in PEM format. 
func mustGetNewRSAKeyPair() ([]byte, []byte, string) { // Generate new key pair diff --git a/e2e/kube.go b/e2e/kube.go index 87a260d4b4a..de18eb8837f 100644 --- a/e2e/kube.go +++ b/e2e/kube.go @@ -39,10 +39,10 @@ const ( podNetworkDebugAppLabel = "debugnonhost-mariner-tolerated" ) -func getClusterKubeClient(ctx context.Context, cluster *armcontainerservice.ManagedCluster) (*Kubeclient, error) { - resourceGroupName := config.ResourceGroupName(*cluster.Location) +func getClusterKubeClient(ctx context.Context, infra *ClusterInfra, cluster *armcontainerservice.ManagedCluster) (*Kubeclient, error) { + resourceGroupName := infra.ResourceGroupName(*cluster.Location) clusterName := *cluster.Name - data, err := getClusterKubeconfigBytes(ctx, resourceGroupName, clusterName) + data, err := getClusterKubeconfigBytes(ctx, infra, resourceGroupName, clusterName) if err != nil { return nil, fmt.Errorf("get cluster kubeconfig bytes: %w", err) } @@ -276,8 +276,8 @@ func logPodDebugInfo(ctx context.Context, kube *Kubeclient, pod *corev1.Pod) { toolkit.Log(ctx, string(info)) } -func getClusterKubeconfigBytes(ctx context.Context, resourceGroupName, clusterName string) ([]byte, error) { - credentialList, err := config.Azure.AKS.ListClusterAdminCredentials(ctx, resourceGroupName, clusterName, nil) +func getClusterKubeconfigBytes(ctx context.Context, infra *ClusterInfra, resourceGroupName, clusterName string) ([]byte, error) { + credentialList, err := infra.Azure.AKS.ListClusterAdminCredentials(ctx, resourceGroupName, clusterName, nil) if err != nil { return nil, fmt.Errorf("list cluster admin credentials: %w", err) } @@ -445,9 +445,9 @@ func daemonsetDebug(ctx context.Context, deploymentName, targetNodeLabel, privat } } -func getClusterSubnetID(ctx context.Context, cluster *armcontainerservice.ManagedCluster) (string, error) { +func getClusterSubnetID(ctx context.Context, infra *ClusterInfra, cluster *armcontainerservice.ManagedCluster) (string, error) { mcResourceGroupName := 
*cluster.Properties.NodeResourceGroup - pager := config.Azure.VNet.NewListPager(mcResourceGroupName, nil) + pager := infra.Azure.VNet.NewListPager(mcResourceGroupName, nil) for pager.More() { nextResult, err := pager.NextPage(ctx) if err != nil { diff --git a/e2e/scenario_rcv1p_test.go b/e2e/scenario_rcv1p_test.go new file mode 100644 index 00000000000..eed7cf43ffd --- /dev/null +++ b/e2e/scenario_rcv1p_test.go @@ -0,0 +1,280 @@ +// scenario_rcv1p_test.go contains end-to-end tests for the RCV1P (Root Certificate V1P) cert mode +// on Linux distros. RCV1P is the next-generation mechanism for distributing Azure root CA certificates +// to AKS nodes. Instead of relying on hardcoded certificate bundles, RCV1P queries the Azure wireserver +// at provisioning time to download the latest root certificates and installs them into the OS trust store. +// +// These tests require: +// - A dedicated subscription (RCV1P_SUBSCRIPTION_ID) with the Microsoft.Compute/PlatformSettingsOverride +// feature flag registered, which enables the wireserver certificate endpoint. +// - The VM opt-in tag "platformsettings.host_environment.service.platform_optedin_for_rootcerts=true" +// on each VMSS, which tells wireserver to serve certificates to this specific VM. +// +// Both conditions must be met: the subscription feature enables the endpoint, and the VM tag grants +// per-VM access. Without the tag, wireserver returns IsOptedInForRootCerts=false. +// +// The positive tests (Test_RCV1P_) verify that certificates are downloaded, installed into +// the distro-specific trust store, and a refresh schedule is created. The negative test +// (Test_RCV1P_NotOptedIn) verifies that omitting the VM tag correctly prevents cert installation. 
+package e2e + +import ( + "context" + "fmt" + "io" + "strings" + "sync" + "testing" + + "github.com/Azure/agentbaker/e2e/config" + "github.com/Azure/agentbaker/pkg/agent/datamodel" + azruntime "github.com/Azure/azure-sdk-for-go/sdk/azcore/runtime" + "github.com/Azure/azure-sdk-for-go/sdk/azcore/to" + "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute/v7" +) + +// rcv1pOptInTag is the ARM tag that must be set on the VM resource for wireserver to serve +// root certificates. Without this tag, wireserver returns IsOptedInForRootCerts=false even +// if the subscription has the PlatformSettingsOverride feature registered. +const rcv1pOptInTag = "platformsettings.host_environment.service.platform_optedin_for_rootcerts" + +// skipIfRCV1PNotConfigured skips the test when the RCV1P subscription is not configured. +// This happens in regular CI runs where the RCV1P variable group is not linked, causing +// Azure DevOps to pass the literal unexpanded string "$(RCV1P_SUBSCRIPTION_ID)". +// It also verifies the Microsoft.Compute/PlatformSettingsOverride feature flag is registered. +func skipIfRCV1PNotConfigured(t *testing.T) { + t.Helper() + subID := config.Config.RCV1PSubscriptionID + if subID == "" || strings.HasPrefix(subID, "$(") { + t.Skip("RCV1P_SUBSCRIPTION_ID not set or not resolved, skipping RCV1P cert mode test") + } + checkPlatformSettingsOverrideFeatureFlag(t, subID) +} + +var ( + featureFlagCheckOnce sync.Once + featureFlagCheckResult error +) + +// checkPlatformSettingsOverrideFeatureFlag verifies the Microsoft.Compute/PlatformSettingsOverride +// feature flag is registered on the given subscription. This is a prerequisite for wireserver to +// serve root certificates. The check runs only once per test run. 
+func checkPlatformSettingsOverrideFeatureFlag(t *testing.T, subscriptionID string) { + t.Helper() + featureFlagCheckOnce.Do(func() { + featureFlagCheckResult = verifyFeatureFlag(t.Context(), subscriptionID) + }) + if featureFlagCheckResult != nil { + t.Fatalf("RCV1P feature flag check failed: %v", featureFlagCheckResult) + } +} + +func verifyFeatureFlag(ctx context.Context, subscriptionID string) error { + url := fmt.Sprintf( + "https://management.azure.com/subscriptions/%s/providers/Microsoft.Features/providers/Microsoft.Compute/features/PlatformSettingsOverride?api-version=2021-07-01", + subscriptionID, + ) + + req, err := azruntime.NewRequest(ctx, "GET", url) + if err != nil { + return fmt.Errorf("failed to create feature flag request: %w", err) + } + + resp, err := config.RCV1PAzure.Core.Pipeline().Do(req) + if err != nil { + return fmt.Errorf("failed to query feature flag: %w", err) + } + defer resp.Body.Close() + + body, _ := io.ReadAll(resp.Body) + bodyStr := string(body) + + if resp.StatusCode != 200 { + return fmt.Errorf("feature flag query returned status %d: %s", resp.StatusCode, bodyStr) + } + + if !strings.Contains(bodyStr, `"Registered"`) { + return fmt.Errorf("Microsoft.Compute/PlatformSettingsOverride is NOT registered on subscription %s (response: %s); "+ + "wireserver will not serve root certificates without this feature flag", subscriptionID, bodyStr) + } + + return nil +} + +// rcv1pOptInVMConfigMutator sets the platform opt-in tag on the VMSS resource level. +// Note: For wireserver to recognize the tag, it must also be set on the individual VM instance. +// Use VMInstanceTags in the Config to set instance-level tags (applied after VM creation). 
+func rcv1pOptInVMConfigMutator(vmss *armcompute.VirtualMachineScaleSet) { + if vmss.Tags == nil { + vmss.Tags = map[string]*string{} + } + vmss.Tags[rcv1pOptInTag] = to.Ptr("true") +} + +// rcv1pVMInstanceTags returns the tags that must be set on individual VM instances +// for wireserver to serve root certificates. +func rcv1pVMInstanceTags() map[string]*string { + return map[string]*string{ + rcv1pOptInTag: to.Ptr("true"), + } +} + +// Test_RCV1P_Ubuntu2204 validates RCV1P cert download and trust store installation on Ubuntu 22.04. +// Ubuntu uses /usr/local/share/ca-certificates/ as the cert drop folder and update-ca-certificates +// to rebuild the trust bundle. +func Test_RCV1P_Ubuntu2204(t *testing.T) { + skipIfRCV1PNotConfigured(t) + RunScenario(t, &Scenario{ + Description: "Tests RCV1P cert mode on Ubuntu 22.04 with VM opt-in tag", + AzureClient: config.RCV1PAzure, + SubscriptionID: config.Config.RCV1PSubscriptionID, + Tags: Tags{ + RCV1PCertMode: true, + }, + Config: Config{ + Cluster: ClusterRCV1PKubenet, + VHD: config.VHDUbuntu2204Gen2Containerd, + VMConfigMutator: rcv1pOptInVMConfigMutator, + VMInstanceTags: rcv1pVMInstanceTags(), + BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) { + }, + Validator: func(ctx context.Context, s *Scenario) { + ValidateRCV1PCertMode(ctx, s) + }, + }, + }) +} + +// Test_RCV1P_Ubuntu2404 validates RCV1P cert download and trust store installation on Ubuntu 24.04. +// Covers the newer Ubuntu LTS release to ensure the cert endpoint and trust store integration +// work correctly across Ubuntu versions. 
+func Test_RCV1P_Ubuntu2404(t *testing.T) { + skipIfRCV1PNotConfigured(t) + RunScenario(t, &Scenario{ + Description: "Tests RCV1P cert mode on Ubuntu 24.04 with VM opt-in tag", + AzureClient: config.RCV1PAzure, + SubscriptionID: config.Config.RCV1PSubscriptionID, + Tags: Tags{ + RCV1PCertMode: true, + }, + Config: Config{ + Cluster: ClusterRCV1PKubenet, + VHD: config.VHDUbuntu2404Gen2Containerd, + VMConfigMutator: rcv1pOptInVMConfigMutator, + VMInstanceTags: rcv1pVMInstanceTags(), + BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) { + }, + Validator: func(ctx context.Context, s *Scenario) { + ValidateRCV1PCertMode(ctx, s) + }, + }, + }) +} + +// Test_RCV1P_AzureLinuxV3 validates RCV1P on Azure Linux V3, which uses a different trust store +// layout (/etc/pki/ca-trust/source/anchors/) and update command (update-ca-trust) than Ubuntu. +// This ensures the provisioning script correctly detects the distro and uses the right paths. +func Test_RCV1P_AzureLinuxV3(t *testing.T) { + skipIfRCV1PNotConfigured(t) + RunScenario(t, &Scenario{ + Description: "Tests RCV1P cert mode on Azure Linux V3 with VM opt-in tag", + AzureClient: config.RCV1PAzure, + SubscriptionID: config.Config.RCV1PSubscriptionID, + Tags: Tags{ + RCV1PCertMode: true, + }, + Config: Config{ + Cluster: ClusterRCV1PKubenet, + VHD: config.VHDAzureLinuxV3Gen2, + VMConfigMutator: rcv1pOptInVMConfigMutator, + VMInstanceTags: rcv1pVMInstanceTags(), + BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) { + }, + Validator: func(ctx context.Context, s *Scenario) { + ValidateRCV1PCertMode(ctx, s) + }, + }, + }) +} + +// Test_RCV1P_Flatcar validates RCV1P on Flatcar Container Linux, which has a read-only root +// filesystem and requires certificates to be placed in /etc/ssl/certs/ as .pem files. +// This is the most constrained environment for cert installation. 
+func Test_RCV1P_Flatcar(t *testing.T) { + skipIfRCV1PNotConfigured(t) + RunScenario(t, &Scenario{ + Description: "Tests RCV1P cert mode on Flatcar with VM opt-in tag", + AzureClient: config.RCV1PAzure, + SubscriptionID: config.Config.RCV1PSubscriptionID, + Tags: Tags{ + RCV1PCertMode: true, + }, + Config: Config{ + Cluster: ClusterRCV1PKubenet, + VHD: config.VHDFlatcarGen2, + VMConfigMutator: rcv1pOptInVMConfigMutator, + VMInstanceTags: rcv1pVMInstanceTags(), + BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) { + }, + Validator: func(ctx context.Context, s *Scenario) { + ValidateRCV1PCertMode(ctx, s) + }, + }, + }) +} + +// Test_RCV1P_ACL validates RCV1P on Azure Container Linux (ACL), which shares the same +// trust store layout as Azure Linux (/etc/pki/ca-trust/). ACL requires Trusted Launch, +// so the VMConfigMutator combines both the TrustedLaunch and opt-in tag settings. +func Test_RCV1P_ACL(t *testing.T) { + skipIfRCV1PNotConfigured(t) + RunScenario(t, &Scenario{ + Description: "Tests RCV1P cert mode on ACL with VM opt-in tag", + AzureClient: config.RCV1PAzure, + SubscriptionID: config.Config.RCV1PSubscriptionID, + Tags: Tags{ + RCV1PCertMode: true, + }, + Config: Config{ + Cluster: ClusterRCV1PKubenet, + VHD: config.VHDACLGen2TL, + VMConfigMutator: func(vmss *armcompute.VirtualMachineScaleSet) { + vmss.Properties = addTrustedLaunchToVMSS(vmss.Properties) + rcv1pOptInVMConfigMutator(vmss) + }, + VMInstanceTags: rcv1pVMInstanceTags(), + BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) { + }, + Validator: func(ctx context.Context, s *Scenario) { + ValidateRCV1PCertMode(ctx, s) + }, + }, + }) +} + +// Test_RCV1P_NotOptedIn is a negative test that validates the VM opt-in tag is required +// for cert installation. The VM is created in the RCV1P subscription (which has +// PlatformSettingsOverride registered) but WITHOUT the opt-in tag on the VMSS. 
+// This verifies that wireserver returns IsOptedInForRootCerts=false and the provisioning +// script correctly skips certificate download and trust store installation. +// This test is critical because it proves the two-layer access control works: +// subscription feature alone is not sufficient — the VM must also be explicitly tagged. +func Test_RCV1P_NotOptedIn(t *testing.T) { + skipIfRCV1PNotConfigured(t) + RunScenario(t, &Scenario{ + Description: "Tests RCV1P cert mode without VM opt-in tag; expects no cert installation", + AzureClient: config.RCV1PAzure, + SubscriptionID: config.Config.RCV1PSubscriptionID, + Tags: Tags{ + RCV1PCertMode: true, + }, + Config: Config{ + Cluster: ClusterRCV1PKubenet, + VHD: config.VHDUbuntu2204Gen2Containerd, + BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) { + }, + Validator: func(ctx context.Context, s *Scenario) { + ValidateRCV1PNotOptedIn(ctx, s) + }, + }, + }) +} diff --git a/e2e/scenario_rcv1p_win_test.go b/e2e/scenario_rcv1p_win_test.go new file mode 100644 index 00000000000..0932ae5f97b --- /dev/null +++ b/e2e/scenario_rcv1p_win_test.go @@ -0,0 +1,119 @@ +// scenario_rcv1p_win_test.go contains end-to-end tests for the RCV1P cert mode on Windows. +// Windows uses a different cert installation path than Linux: certificates are downloaded to +// C:\ca and imported into the Windows certificate store (Cert:\LocalMachine\Root) via +// Import-Certificate. A scheduled task (aks-ca-certs-refresh-task) is registered to +// periodically refresh the certificates. +// +// These tests run against the same RCV1P subscription and require the same VM opt-in tag +// as the Linux tests (see scenario_rcv1p_test.go for details on the two-layer access control). 
+package e2e + +import ( + "context" + "testing" + + "github.com/Azure/agentbaker/e2e/config" + "github.com/Azure/agentbaker/pkg/agent/datamodel" + "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute/v7" +) + +// Test_RCV1P_Windows2022 validates RCV1P cert download and Windows certificate store +// installation on Windows Server 2022. +func Test_RCV1P_Windows2022(t *testing.T) { + skipIfRCV1PNotConfigured(t) + RunScenario(t, &Scenario{ + Description: "Tests RCV1P cert mode on Windows Server 2022 with VM opt-in tag", + AzureClient: config.RCV1PAzure, + SubscriptionID: config.Config.RCV1PSubscriptionID, + Tags: Tags{ + RCV1PCertMode: true, + }, + Config: Config{ + Cluster: ClusterRCV1PKubenet, + VHD: config.VHDWindows2022Containerd, + VMConfigMutator: rcv1pOptInVMConfigMutator, + VMInstanceTags: rcv1pVMInstanceTags(), + BootstrapConfigMutator: EmptyBootstrapConfigMutator, + Validator: func(ctx context.Context, s *Scenario) { + ValidateRCV1PCertModeWindows(ctx, s) + }, + }, + }) +} + +// Test_RCV1P_Windows23H2 validates RCV1P on Windows Server 23H2, the annual channel release. +func Test_RCV1P_Windows23H2(t *testing.T) { + skipIfRCV1PNotConfigured(t) + RunScenario(t, &Scenario{ + Description: "Tests RCV1P cert mode on Windows Server 23H2 with VM opt-in tag", + AzureClient: config.RCV1PAzure, + SubscriptionID: config.Config.RCV1PSubscriptionID, + Tags: Tags{ + RCV1PCertMode: true, + }, + Config: Config{ + Cluster: ClusterRCV1PKubenet, + VHD: config.VHDWindows23H2, + VMConfigMutator: rcv1pOptInVMConfigMutator, + VMInstanceTags: rcv1pVMInstanceTags(), + BootstrapConfigMutator: EmptyBootstrapConfigMutator, + Validator: func(ctx context.Context, s *Scenario) { + ValidateRCV1PCertModeWindows(ctx, s) + }, + }, + }) +} + +// Test_RCV1P_Windows2025 validates RCV1P on Windows Server 2025. This SKU requires +// Trusted Launch, so the VMConfigMutator combines both TrustedLaunch and opt-in tag settings. 
+func Test_RCV1P_Windows2025(t *testing.T) { + skipIfRCV1PNotConfigured(t) + RunScenario(t, &Scenario{ + Description: "Tests RCV1P cert mode on Windows Server 2025 with VM opt-in tag", + AzureClient: config.RCV1PAzure, + SubscriptionID: config.Config.RCV1PSubscriptionID, + Tags: Tags{ + RCV1PCertMode: true, + }, + Config: Config{ + Cluster: ClusterRCV1PKubenet, + VHD: config.VHDWindows2025, + VMConfigMutator: func(vmss *armcompute.VirtualMachineScaleSet) { + vmss.Properties = addTrustedLaunchToVMSS(vmss.Properties) + rcv1pOptInVMConfigMutator(vmss) + }, + VMInstanceTags: rcv1pVMInstanceTags(), + BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) { + Windows2025BootstrapConfigMutator(t, nbc) + }, + Validator: func(ctx context.Context, s *Scenario) { + ValidateRCV1PCertModeWindows(ctx, s) + }, + }, + }) +} + +// Test_RCV1P_Windows_NotOptedIn is a negative test that validates the VM opt-in tag is required +// for cert installation on Windows. The VM is created in the RCV1P subscription (which has +// PlatformSettingsOverride registered) but WITHOUT the opt-in tag on the VMSS. +// This verifies that wireserver returns IsOptedInForRootCerts=false and the provisioning +// script correctly skips certificate download and refresh task registration. 
+func Test_RCV1P_Windows_NotOptedIn(t *testing.T) { + skipIfRCV1PNotConfigured(t) + RunScenario(t, &Scenario{ + Description: "Tests RCV1P cert mode on Windows without VM opt-in tag; expects no cert installation", + AzureClient: config.RCV1PAzure, + SubscriptionID: config.Config.RCV1PSubscriptionID, + Tags: Tags{ + RCV1PCertMode: true, + }, + Config: Config{ + Cluster: ClusterRCV1PKubenet, + VHD: config.VHDWindows2022Containerd, + BootstrapConfigMutator: EmptyBootstrapConfigMutator, + Validator: func(ctx context.Context, s *Scenario) { + ValidateRCV1PNotOptedInWindows(ctx, s) + }, + }, + }) +} diff --git a/e2e/test_helpers.go b/e2e/test_helpers.go index a84442d32f1..9eb9818e32d 100644 --- a/e2e/test_helpers.go +++ b/e2e/test_helpers.go @@ -211,10 +211,24 @@ func runScenario(t testing.TB, s *Scenario) error { ctx := newTestCtx(t) maybeSkipScenario(ctx, t, s) - _, err := CachedEnsureResourceGroup(ctx, s.Location) - require.NoError(t, err) - _, err = CachedCreateVMManagedIdentity(ctx, s.Location) - require.NoError(t, err) + if s.AzureClient != nil { + // RCV1P scenario: ensure RG and identity in the RCV1P subscription + _, err := CachedRCV1PEnsureResourceGroup(ctx, s.Location) + require.NoError(t, err) + _, err = CachedRCV1PCreateVMManagedIdentity(ctx, s.Location) + require.NoError(t, err) + // Also ensure default subscription infra (RG + identity + blob storage) is provisioned, + // since Windows log extraction on failure uploads to the default subscription's blob storage. 
+ _, err = CachedEnsureResourceGroup(ctx, s.Location) + require.NoError(t, err) + _, err = CachedCreateVMManagedIdentity(ctx, s.Location) + require.NoError(t, err) + } else { + _, err := CachedEnsureResourceGroup(ctx, s.Location) + require.NoError(t, err) + _, err = CachedCreateVMManagedIdentity(ctx, s.Location) + require.NoError(t, err) + } s.T = t ctrruntimelog.SetLogger(zap.New()) @@ -261,6 +275,11 @@ func prepareAKSNode(ctx context.Context, s *Scenario) (*ScenarioVM, error) { nbc, err := getBaseNBC(s.T, s.Runtime.Cluster, s.VHD) require.NoError(s.T, err) + // Override subscription ID for RCV1P scenarios + if s.SubscriptionID != "" { + nbc.SubscriptionID = s.SubscriptionID + } + nbc.EnableScriptlessCSECmd = true if s.Runtime != nil && s.Runtime.EnableScriptlessNBCCSECmd { nbc.EnableScriptlessNBCCSECmd = true @@ -610,7 +629,7 @@ func RunCommand(ctx context.Context, s *Scenario, command string) (armcompute.Ru toolkit.Logf(ctx, "Command %q took %s", command, elapsed) }() - runPoller, err := config.Azure.VMSSVM.BeginRunCommand(ctx, *s.Runtime.Cluster.Model.Properties.NodeResourceGroup, s.Runtime.VMSSName, *s.Runtime.VM.VM.InstanceID, armcompute.RunCommandInput{ + runPoller, err := s.GetAzure().VMSSVM.BeginRunCommand(ctx, *s.Runtime.Cluster.Model.Properties.NodeResourceGroup, s.Runtime.VMSSName, *s.Runtime.VM.VM.InstanceID, armcompute.RunCommandInput{ CommandID: func() *string { if s.IsWindows() { return to.Ptr("RunPowerShellScript") @@ -639,11 +658,11 @@ func CreateImage(ctx context.Context, s *Scenario) *config.Image { require.NoErrorf(s.T, err, "failed to run sysprep on Windows VM for image creation") } - vm, err := config.Azure.VMSSVM.Get(ctx, *s.Runtime.Cluster.Model.Properties.NodeResourceGroup, s.Runtime.VMSSName, *s.Runtime.VM.VM.InstanceID, &armcompute.VirtualMachineScaleSetVMsClientGetOptions{}) + vm, err := s.GetAzure().VMSSVM.Get(ctx, *s.Runtime.Cluster.Model.Properties.NodeResourceGroup, s.Runtime.VMSSName, *s.Runtime.VM.VM.InstanceID, 
&armcompute.VirtualMachineScaleSetVMsClientGetOptions{}) require.NoError(s.T, err, "Failed to get VMSS VM for image creation") s.T.Log("Deallocating VMSS VM...") - poll, err := config.Azure.VMSSVM.BeginDeallocate(ctx, *s.Runtime.Cluster.Model.Properties.NodeResourceGroup, s.Runtime.VMSSName, *s.Runtime.VM.VM.InstanceID, nil) + poll, err := s.GetAzure().VMSSVM.BeginDeallocate(ctx, *s.Runtime.Cluster.Model.Properties.NodeResourceGroup, s.Runtime.VMSSName, *s.Runtime.VM.VM.InstanceID, nil) require.NoError(s.T, err, "Failed to begin deallocate") _, err = poll.PollUntilDone(ctx, nil) require.NoError(s.T, err, "Failed to deallocate") @@ -690,7 +709,7 @@ func CreateSIGImageVersionFromDisk(ctx context.Context, s *Scenario, version str // Create the image version directly from the disk s.T.Logf("Creating gallery image version: %s in %s", version, *image.ID) - createVersionOp, err := config.Azure.GalleryImageVersions.BeginCreateOrUpdate(ctx, rg, *gallery.Name, *image.Name, version, armcompute.GalleryImageVersion{ + createVersionOp, err := s.GetAzure().GalleryImageVersions.BeginCreateOrUpdate(ctx, rg, *gallery.Name, *image.Name, version, armcompute.GalleryImageVersion{ Location: to.Ptr(s.Location), Properties: &armcompute.GalleryImageVersionProperties{ StorageProfile: &armcompute.GalleryImageVersionStorageProfile{ @@ -726,7 +745,7 @@ func CreateSIGImageVersionFromDisk(ctx context.Context, s *Scenario, version str customVHD := *s.Config.VHD customVHD.Name = *image.Name // Use the architecture-specific image name customVHD.Gallery = &config.Gallery{ - SubscriptionID: config.Config.SubscriptionID, + SubscriptionID: s.GetSubscriptionID(), ResourceGroupName: rg, Name: *gallery.Name, } diff --git a/e2e/types.go b/e2e/types.go index 9643b167470..666ef4715eb 100644 --- a/e2e/types.go +++ b/e2e/types.go @@ -21,6 +21,33 @@ import ( "golang.org/x/crypto/ssh" ) +// ClusterInfra captures the Azure infrastructure scope for cluster operations. 
+// It allows cluster creation and management to target different subscriptions. +type ClusterInfra struct { + Azure *config.AzureClient + SubscriptionID string + ResourceGroupName func(location string) string +} + +// DefaultClusterInfra uses the default subscription and resource group naming. +var DefaultClusterInfra = &ClusterInfra{ + Azure: config.Azure, + SubscriptionID: config.Config.SubscriptionID, + ResourceGroupName: config.ResourceGroupName, +} + +// RCV1PClusterInfra returns the ClusterInfra for the RCV1P subscription, or nil if not configured. +func RCV1PClusterInfra() *ClusterInfra { + if config.RCV1PAzure == nil { + return nil + } + return &ClusterInfra{ + Azure: config.RCV1PAzure, + SubscriptionID: config.Config.RCV1PSubscriptionID, + ResourceGroupName: config.RCV1PResourceGroupName, + } +} + type Tags struct { Name string ImageName string @@ -35,6 +62,7 @@ type Tags struct { Scriptless bool VHDCaching bool MockAzureChinaCloud bool + RCV1PCertMode bool VMSeriesCoverageTest bool } @@ -128,6 +156,14 @@ type Scenario struct { // a default size will be used. K8sSystemPoolSKU string + // AzureClient overrides the default config.Azure client for this scenario. + // When nil, config.Azure is used. + AzureClient *config.AzureClient + + // SubscriptionID overrides the default config.Config.SubscriptionID for this scenario. + // When empty, config.Config.SubscriptionID is used. + SubscriptionID string + // Runtime contains the runtime state of the scenario. It's populated in the beginning of the test run Runtime *ScenarioRuntime T testing.TB @@ -217,6 +253,12 @@ type Config struct { // This prevents the Guest Agent from sweeping events before they can be read. // Only set this on CSE performance test scenarios. EagerCSETimingExtraction bool + + // VMInstanceTags are tags applied directly to VMSS VM instances after creation via BeginUpdate. 
+ // This is needed for features like RCV1P where wireserver checks tags on the individual VM instance, + // not the VMSS resource-level tags. These tags are applied after the VM appears in the API but + // before CSE completes, giving wireserver time to see them before the provisioning scripts query it. + VMInstanceTags map[string]*string } func (s *Scenario) PrepareAKSNodeConfig() { @@ -410,3 +452,35 @@ func (s *Scenario) IsWindows() bool { func (s *Scenario) IsLinux() bool { return !s.IsWindows() } + +// GetAzure returns the AzureClient for this scenario, falling back to the default config.Azure. +func (s *Scenario) GetAzure() *config.AzureClient { + if s.AzureClient != nil { + return s.AzureClient + } + return config.Azure +} + +// GetSubscriptionID returns the subscription ID for this scenario, falling back to config.Config.SubscriptionID. +func (s *Scenario) GetSubscriptionID() string { + if s.SubscriptionID != "" { + return s.SubscriptionID + } + return config.Config.SubscriptionID +} + +// GetResourceGroupName returns the resource group name for this scenario's location. +func (s *Scenario) GetResourceGroupName() string { + if s.SubscriptionID != "" && s.SubscriptionID != config.Config.SubscriptionID { + return config.RCV1PResourceGroupName(s.Location) + } + return config.ResourceGroupName(s.Location) +} + +// GetVMIdentityResourceID returns the VM identity resource ID for this scenario. 
+func (s *Scenario) GetVMIdentityResourceID() string { + if s.SubscriptionID != "" && s.SubscriptionID != config.Config.SubscriptionID { + return config.Config.RCV1PVMIdentityResourceID(s.Location) + } + return config.Config.VMIdentityResourceID(s.Location) +} diff --git a/e2e/validators.go b/e2e/validators.go index de8db1ca15a..c975c46d057 100644 --- a/e2e/validators.go +++ b/e2e/validators.go @@ -412,6 +412,13 @@ func ValidateNonEmptyDirectory(ctx context.Context, s *Scenario, dirName string) execScriptOnVMForScenarioValidateExitCode(ctx, s, strings.Join(command, "\n"), 0, "either could not find expected file, or something went wrong") } +func ValidateEmptyDirectory(ctx context.Context, s *Scenario, dirName string) { + s.T.Helper() + command := fmt.Sprintf("! [ -d '%s' ] || [ -z \"$(ls -A '%s')\" ]", dirName, dirName) + execScriptOnVMForScenarioValidateExitCode(ctx, s, command, 0, + fmt.Sprintf("expected directory %s to be empty or not exist", dirName)) +} + func ValidateInspektorGadget(ctx context.Context, s *Scenario) { s.T.Helper() @@ -2293,3 +2300,130 @@ func ValidateAlgifAeadMitigation(ctx context.Context, s *Scenario) { execScriptOnVMForScenarioValidateExitCode(ctx, s, script, 0, "CVE-2026-31431 (algif_aead) mitigation validation failed") } + +// ValidateRCV1PCertMode validates that the rcv1p certificate endpoint mode was used during +// Linux node provisioning, certificates were downloaded and installed, and a refresh task was scheduled. 
+func ValidateRCV1PCertMode(ctx context.Context, s *Scenario) { + s.T.Helper() + + // Validate the provisioning log shows rcv1p mode was selected + ValidateFileHasContent(ctx, s, "/var/log/azure/cluster-provision.log", + "Using custom cloud certificate endpoint mode: rcv1p") + + // Validate the subscription is opted in for root certs + ValidateFileHasContent(ctx, s, "/var/log/azure/cluster-provision.log", + "IsOptedInForRootCerts=true") + + // Validate certificates were downloaded + ValidateNonEmptyDirectory(ctx, s, "/root/AzureCACertificates") + + // Validate trust store was updated (distro-specific path) + trustStoreDir := rcv1pTrustStoreDir(s) + execScriptOnVMForScenarioValidateExitCode(ctx, s, + fmt.Sprintf("sudo ls -1 %s/*.crt 2>/dev/null || sudo ls -1 %s/*.pem 2>/dev/null", trustStoreDir, trustStoreDir), + 0, fmt.Sprintf("expected certificates in trust store directory %s", trustStoreDir)) + + // Validate refresh schedule was created (cron or systemd timer depending on distro) + if s.VHD.Flatcar || s.VHD.OS == config.OSACL { + // Flatcar and ACL use systemd timer + execScriptOnVMForScenarioValidateExitCode(ctx, s, + "systemctl is-enabled azure-ca-refresh.timer", + 0, "expected azure-ca-refresh.timer to be enabled") + } else { + // Ubuntu, Mariner, AzureLinux use cron + execScriptOnVMForScenarioValidateExitCode(ctx, s, + "sudo crontab -l 2>/dev/null | grep -q ca-refresh", + 0, "expected ca-refresh cron entry") + } +} + +// rcv1pTrustStoreDir returns the OS trust store directory for the given scenario's distro. 
+func rcv1pTrustStoreDir(s *Scenario) string { + switch s.VHD.OS { + case config.OSMariner, config.OSAzureLinux, config.OSACL: + return "/etc/pki/ca-trust/source/anchors" + case config.OSFlatcar: + return "/etc/ssl/certs" + default: + // Ubuntu and anything else + return "/usr/local/share/ca-certificates" + } +} + +// ValidateRCV1PCertModeWindows validates that the rcv1p certificate endpoint mode was used during +// Windows node provisioning, certificates were downloaded and installed, and a refresh task was scheduled. +func ValidateRCV1PCertModeWindows(ctx context.Context, s *Scenario) { + s.T.Helper() + + // Validate CA certificates were installed to the Windows certificate store + command := []string{ + "$ErrorActionPreference = 'Stop'", + "$caFolder = 'C:\\ca'", + "if (-not (Test-Path $caFolder)) { throw 'CA certificates folder C:\\ca does not exist' }", + "$certs = Get-ChildItem -Path $caFolder -File", + "if ($certs.Count -eq 0) { throw 'No certificates found in C:\\ca folder' }", + "Write-Host \"Found $($certs.Count) certificate(s) in $caFolder\"", + } + execScriptOnVMForScenarioValidateExitCode(ctx, s, strings.Join(command, "\n"), 0, + "expected certificates in C:\\ca") + + // Validate the refresh scheduled task exists + command = []string{ + "$ErrorActionPreference = 'Stop'", + "$task = Get-ScheduledTask -TaskName 'aks-ca-certs-refresh-task' -ErrorAction SilentlyContinue", + "if (-not $task) { throw 'aks-ca-certs-refresh-task scheduled task not found' }", + "Write-Host \"Scheduled task found: $($task.TaskName) (State: $($task.State))\"", + } + execScriptOnVMForScenarioValidateExitCode(ctx, s, strings.Join(command, "\n"), 0, + "expected aks-ca-certs-refresh-task scheduled task") +} + +// ValidateRCV1PNotOptedIn validates that when the VM does NOT have the opt-in tag, +// wireserver returns IsOptedInForRootCerts=false and no certificates are installed, +// even in the RCV1P subscription with PlatformSettingsOverride registered. 
+func ValidateRCV1PNotOptedIn(ctx context.Context, s *Scenario) { + s.T.Helper() + + // Validate the provisioning log shows rcv1p mode was selected + ValidateFileHasContent(ctx, s, "/var/log/azure/cluster-provision.log", + "Using custom cloud certificate endpoint mode: rcv1p") + + // Validate wireserver reported not opted in + ValidateFileHasContent(ctx, s, "/var/log/azure/cluster-provision.log", + "Skipping custom cloud root cert installation because IsOptedInForRootCerts is not true") + + // Validate no certificates were downloaded + ValidateEmptyDirectory(ctx, s, "/root/AzureCACertificates") + + // Validate no refresh schedule was created + execScriptOnVMForScenarioValidateExitCode(ctx, s, + "sudo crontab -l 2>/dev/null | grep -q ca-refresh", + 1, "expected no ca-refresh cron entry when not opted in") +} + +// ValidateRCV1PNotOptedInWindows validates that when the Windows VM does NOT have the opt-in tag, +// no certificates are installed to C:\ca and no refresh scheduled task is registered, +// even in the RCV1P subscription with PlatformSettingsOverride registered. 
+func ValidateRCV1PNotOptedInWindows(ctx context.Context, s *Scenario) { + s.T.Helper() + + // Validate C:\ca is empty or does not exist + command := []string{ + "$ErrorActionPreference = 'Stop'", + "$caFolder = 'C:\\ca'", + "if ((Test-Path $caFolder) -and @(Get-ChildItem -Path $caFolder -File).Count -gt 0) { throw 'Expected C:\\ca to be empty or not exist, but found certificates' }", + "Write-Host 'C:\\ca is empty or does not exist as expected'", + } + execScriptOnVMForScenarioValidateExitCode(ctx, s, strings.Join(command, "\n"), 0, + "expected C:\\ca to be empty or not exist when not opted in") + + // Validate no refresh scheduled task was registered + command = []string{ + "$ErrorActionPreference = 'Stop'", + "$task = Get-ScheduledTask -TaskName 'aks-ca-certs-refresh-task' -ErrorAction SilentlyContinue", + "if ($task) { throw 'Expected no aks-ca-certs-refresh-task but found one' }", + "Write-Host 'No aks-ca-certs-refresh-task found as expected'", + } + execScriptOnVMForScenarioValidateExitCode(ctx, s, strings.Join(command, "\n"), 0, + "expected no aks-ca-certs-refresh-task scheduled task when not opted in") +} diff --git a/e2e/vmss.go b/e2e/vmss.go index d9260bf6407..799b7d34345 100644 --- a/e2e/vmss.go +++ b/e2e/vmss.go @@ -383,13 +383,13 @@ func createVMSSModel(ctx context.Context, s *Scenario) armcompute.VirtualMachine if config.Config.IsLocalBuild() { s.T.Logf( "VMSS portal link: https://ms.portal.azure.com/#@microsoft.onmicrosoft.com/resource/subscriptions/%s/resourceGroups/%s/providers/Microsoft.Compute/virtualMachineScaleSets/%s/overview", - config.Config.SubscriptionID, + s.GetSubscriptionID(), *cluster.Model.Properties.NodeResourceGroup, s.Runtime.VMSSName, ) s.T.Logf( "Managed cluster portal link: https://ms.portal.azure.com/#@microsoft.onmicrosoft.com/resource/subscriptions/%s/resourceGroups/%s/providers/Microsoft.ContainerService/managedClusters/%s/overview", - config.Config.SubscriptionID, + s.GetSubscriptionID(), 
*cluster.Model.Properties.NodeResourceGroup, *cluster.Model.Name, ) @@ -401,8 +401,8 @@ func createVMSSModel(ctx context.Context, s *Scenario) armcompute.VirtualMachine model.Identity = &armcompute.VirtualMachineScaleSetIdentity{ Type: to.Ptr(armcompute.ResourceIdentityTypeSystemAssignedUserAssigned), UserAssignedIdentities: map[string]*armcompute.UserAssignedIdentitiesValue{ - *s.Runtime.Cluster.KubeletIdentity.ResourceID: {}, - config.Config.VMIdentityResourceID(s.Location): {}, + *s.Runtime.Cluster.KubeletIdentity.ResourceID: {}, + s.GetVMIdentityResourceID(): {}, }, } @@ -475,11 +475,26 @@ func CreateVMSSWithRetry(ctx context.Context, s *Scenario) (*ScenarioVM, error) func CreateVMSS(ctx context.Context, s *Scenario, resourceGroupName string) (*ScenarioVM, error) { defer toolkit.LogStepCtxf(ctx, "creating VMSS %s", s.Runtime.VMSSName)() vm := &ScenarioVM{} - operation, err := config.Azure.VMSS.BeginCreateOrUpdate( + + model := createVMSSModel(ctx, s) + + // For scenarios that need VM instance tags (e.g., RCV1P), we must apply tags + // before CSE runs because wireserver checks per-VM-instance tags. The only + // working method for Uniform VMSS is BeginUpdate (full PUT), which takes ~108s. + // To avoid the race, we strip the CSE extension before creation, apply tags + // via BeginUpdate, then re-add the extension in a second update. 
+ var deferredExtensionProfile *armcompute.VirtualMachineScaleSetExtensionProfile + if len(s.Config.VMInstanceTags) > 0 && model.Properties.VirtualMachineProfile.ExtensionProfile != nil { + deferredExtensionProfile = model.Properties.VirtualMachineProfile.ExtensionProfile + model.Properties.VirtualMachineProfile.ExtensionProfile = nil + toolkit.Logf(ctx, "deferring CSE extension until VM instance tags are applied") + } + + operation, err := s.GetAzure().VMSS.BeginCreateOrUpdate( ctx, resourceGroupName, s.Runtime.VMSSName, - createVMSSModel(ctx, s), + model, nil, ) if err != nil { @@ -492,16 +507,54 @@ func CreateVMSS(ctx context.Context, s *Scenario, resourceGroupName string) (*Sc return vm, fmt.Errorf("failed to wait for VMSS VM: %w", err) } - vm.PrivateIP, err = getPrivateIPFromVMSSVM(ctx, resourceGroupName, s.Runtime.VMSSName, *vm.VM.InstanceID) - if err != nil { - return vm, fmt.Errorf("failed to get VM private IP address: %w", err) - } - + // Register cleanup early so the VMSS is always deleted even if subsequent steps + // (tag update, IP lookup, etc.) fail — preventing orphaned VMSS resources. s.T.Cleanup(func() { defer cleanupBastionTunnel(vm.SSHClient) cleanupVMSS(ctx, s, vm) }) + // Wait for initial VMSS creation to fully complete before applying tags. + vmssResp, err := operation.PollUntilDone(ctx, config.DefaultPollUntilDoneOptions) + if err != nil { + return vm, fmt.Errorf("failed to create VMSS: %w", err) + } + + // Apply VM instance tags via BeginUpdate (full PUT) and then re-add CSE. + // This is needed for features like RCV1P where wireserver checks tags on + // the individual VM instance, not the VMSS resource-level tags. + if len(s.Config.VMInstanceTags) > 0 { + if err := updateVMInstanceTags(ctx, s, resourceGroupName, s.Runtime.VMSSName, *vm.VM.InstanceID, s.Config.VMInstanceTags); err != nil { + return vm, fmt.Errorf("failed to update VM instance tags: %w", err) + } + + // Re-add CSE extension now that tags are in place. 
+ if deferredExtensionProfile != nil { + toolkit.Logf(ctx, "re-adding CSE extension after tags are applied") + vmssResp.VirtualMachineScaleSet.Properties.VirtualMachineProfile.ExtensionProfile = deferredExtensionProfile + cseOp, err := s.GetAzure().VMSS.BeginCreateOrUpdate( + ctx, + resourceGroupName, + s.Runtime.VMSSName, + vmssResp.VirtualMachineScaleSet, + nil, + ) + if err != nil { + return vm, fmt.Errorf("failed to begin adding CSE extension: %w", err) + } + vmssResp2, err := cseOp.PollUntilDone(ctx, config.DefaultPollUntilDoneOptions) + if err != nil { + return vm, fmt.Errorf("failed to add CSE extension: %w", err) + } + vmssResp = vmssResp2 + } + } + + vm.PrivateIP, err = getPrivateIPFromVMSSVM(ctx, s, resourceGroupName, s.Runtime.VMSSName, *vm.VM.InstanceID) + if err != nil { + return vm, fmt.Errorf("failed to get VM private IP address: %w", err) + } + result := "SSH Instructions: (may take a few minutes for the VM to be ready for SSH)\n========================\n" if config.Config.KeepVMSS { s.T.Logf("VM will be preserved after the test finishes, PLEASE MANUALLY DELETE THE VMSS. 
Set KEEP_VMSS=false to delete it automatically after the test finishes\n") @@ -512,7 +565,6 @@ func CreateVMSS(ctx context.Context, s *Scenario, resourceGroupName string) (*Sc result += fmt.Sprintf(`az network bastion ssh --target-resource-id "%s" --name "%s-bastion" --resource-group %s --auth-type ssh-key --username azureuser --ssh-key %s`, *vm.VM.ID, *s.Runtime.Cluster.Model.Name, *s.Runtime.Cluster.Model.Properties.NodeResourceGroup, config.VMSSHPrivateKeyFileName) + "\n" s.T.Log(result) - vmssResp, err := operation.PollUntilDone(ctx, config.DefaultPollUntilDoneOptions) if !s.Config.SkipSSHConnectivityValidation { var bastErr error vm.SSHClient, bastErr = DialSSHOverBastion(ctx, s.Runtime.Cluster.Bastion, vm.PrivateIP, config.VMSSHPrivateKey) @@ -520,9 +572,6 @@ func CreateVMSS(ctx context.Context, s *Scenario, resourceGroupName string) (*Sc return vm, fmt.Errorf("failed to start bastion tunnel: %w", bastErr) } } - if err != nil { - return vm, err - } // Wait for VM to be in "Running" power state before proceeding err = waitForVMRunningState(ctx, s, vm.VM) @@ -538,6 +587,41 @@ func CreateVMSS(ctx context.Context, s *Scenario, resourceGroupName string) (*Sc }, nil } +// updateVMInstanceTags uses BeginUpdate (full PUT) to set tags on a VMSS VM instance. +// This is the only method that works for Uniform mode VMSS — PATCH and Microsoft.Resources/tags +// API both return 405 at this scope. The operation takes ~108s as it triggers full VM model +// reconciliation. This is acceptable for E2E tests where we defer CSE until tags are in place. 
+func updateVMInstanceTags(ctx context.Context, s *Scenario, resourceGroupName, vmssName, instanceID string, tags map[string]*string) error { + defer toolkit.LogStepCtxf(ctx, "updating VM instance %s/%s/%s tags via BeginUpdate", resourceGroupName, vmssName, instanceID)() + + // Get current VM instance to preserve existing state + currentVM, err := s.GetAzure().VMSSVM.Get(ctx, resourceGroupName, vmssName, instanceID, &armcompute.VirtualMachineScaleSetVMsClientGetOptions{}) + if err != nil { + return fmt.Errorf("failed to get current VM instance: %w", err) + } + + // Merge new tags with any existing tags + if currentVM.Tags == nil { + currentVM.Tags = make(map[string]*string) + } + for k, v := range tags { + currentVM.Tags[k] = v + } + + poller, err := s.GetAzure().VMSSVM.BeginUpdate(ctx, resourceGroupName, vmssName, instanceID, currentVM.VirtualMachineScaleSetVM, nil) + if err != nil { + return fmt.Errorf("failed to begin VM instance tag update: %w", err) + } + + _, err = poller.PollUntilDone(ctx, config.DefaultPollUntilDoneOptions) + if err != nil { + return fmt.Errorf("failed to complete VM instance tag update: %w", err) + } + + return nil +} + + // waitForVMRunningState polls until the VM reaches "Running" power state or the timeout elapses. func waitForVMRunningState(ctx context.Context, s *Scenario, vmssVM *armcompute.VirtualMachineScaleSetVM) error { ctxTimeout, cancel := context.WithTimeout(ctx, 3*time.Minute) @@ -549,7 +633,7 @@ func waitForVMRunningState(ctx context.Context, s *Scenario, vmssVM *armcompute. 
var lastErr error for { // Get the updated VM with instance view to check power state - vm, err := config.Azure.VMSSVM.Get(ctxTimeout, *s.Runtime.Cluster.Model.Properties.NodeResourceGroup, s.Runtime.VMSSName, *vmssVM.InstanceID, &armcompute.VirtualMachineScaleSetVMsClientGetOptions{ + vm, err := s.GetAzure().VMSSVM.Get(ctxTimeout, *s.Runtime.Cluster.Model.Properties.NodeResourceGroup, s.Runtime.VMSSName, *vmssVM.InstanceID, &armcompute.VirtualMachineScaleSetVMsClientGetOptions{ Expand: to.Ptr(armcompute.InstanceViewTypesInstanceView), }) @@ -592,7 +676,7 @@ func waitForVMSSVM(ctx context.Context, s *Scenario) (*armcompute.VirtualMachine var lastErr error for { - pager := config.Azure.VMSSVM.NewListPager(*s.Runtime.Cluster.Model.Properties.NodeResourceGroup, s.Runtime.VMSSName, &armcompute.VirtualMachineScaleSetVMsClientListOptions{ + pager := s.GetAzure().VMSSVM.NewListPager(*s.Runtime.Cluster.Model.Properties.NodeResourceGroup, s.Runtime.VMSSName, &armcompute.VirtualMachineScaleSetVMsClientListOptions{ Expand: to.Ptr("instanceView"), }) @@ -622,9 +706,14 @@ func waitForVMSSVM(ctx context.Context, s *Scenario) (*armcompute.VirtualMachine } // getPrivateIPFromVMSSVM extracts the private IP address from a VMSS VM by querying its network interfaces. -func getPrivateIPFromVMSSVM(ctx context.Context, resourceGroup, vmssName, instanceID string) (string, error) { +func getPrivateIPFromVMSSVM(ctx context.Context, s *Scenario, resourceGroup, vmssName, instanceID string) (string, error) { + return getPrivateIPFromVMSSVMWithClient(ctx, s.GetAzure(), resourceGroup, vmssName, instanceID) +} + +// getPrivateIPFromVMSSVMWithClient extracts the private IP using the given Azure client. 
+func getPrivateIPFromVMSSVMWithClient(ctx context.Context, azure *config.AzureClient, resourceGroup, vmssName, instanceID string) (string, error) { // Query the network interface to get the IP configuration - pager := config.Azure.NetworkInterfaces.NewListVirtualMachineScaleSetVMNetworkInterfacesPager( + pager := azure.NetworkInterfaces.NewListVirtualMachineScaleSetVMNetworkInterfacesPager( resourceGroup, vmssName, instanceID, @@ -708,7 +797,7 @@ func extractBootDiagnostics(ctx context.Context, s *Scenario) error { return nil } - pager := config.Azure.VMSSVM.NewListPager(*s.Runtime.Cluster.Model.Properties.NodeResourceGroup, s.Runtime.VMSSName, nil) + pager := s.GetAzure().VMSSVM.NewListPager(*s.Runtime.Cluster.Model.Properties.NodeResourceGroup, s.Runtime.VMSSName, nil) for pager.More() { page, err := pager.NextPage(ctx) if err != nil { @@ -717,7 +806,7 @@ func extractBootDiagnostics(ctx context.Context, s *Scenario) error { for _, vmInstance := range page.Value { // Get boot diagnostics data - bootDiagResp, err := config.Azure.VMSSVM.RetrieveBootDiagnosticsData(ctx, *s.Runtime.Cluster.Model.Properties.NodeResourceGroup, s.Runtime.VMSSName, *vmInstance.InstanceID, nil) + bootDiagResp, err := s.GetAzure().VMSSVM.RetrieveBootDiagnosticsData(ctx, *s.Runtime.Cluster.Model.Properties.NodeResourceGroup, s.Runtime.VMSSName, *vmInstance.InstanceID, nil) if err != nil { return fmt.Errorf("failed to get boot diagnostics for VM %s: %v", *vmInstance.InstanceID, err) } @@ -857,7 +946,7 @@ func extractLogsFromVMWindows(ctx context.Context, s *Scenario) { ctx, cancel := context.WithTimeout(ctx, 4*time.Minute) defer cancel() - pager := config.Azure.VMSSVM.NewListPager(*s.Runtime.Cluster.Model.Properties.NodeResourceGroup, s.Runtime.VMSSName, nil) + pager := s.GetAzure().VMSSVM.NewListPager(*s.Runtime.Cluster.Model.Properties.NodeResourceGroup, s.Runtime.VMSSName, nil) page, err := pager.NextPage(ctx) if err != nil { s.T.Logf("failed to list VMSS instances: %s", err) @@ -871,7 
+960,7 @@ func extractLogsFromVMWindows(ctx context.Context, s *Scenario) { blobPrefix := s.Runtime.VMSSName blobUrl := config.Config.BlobStorageAccountURL() + "/" + config.Config.BlobContainer + "/" + blobPrefix - client := config.Azure.VMSSVMRunCommands + client := s.GetAzure().VMSSVMRunCommands // Invoke the RunCommand on the VMSS instance s.T.Logf("uploading windows logs to blob storage at %s, may take a few minutes", blobUrl) @@ -970,7 +1059,7 @@ func deleteVMSS(ctx context.Context, s *Scenario) { } return } - _, err := config.Azure.VMSS.BeginDelete(ctx, *s.Runtime.Cluster.Model.Properties.NodeResourceGroup, s.Runtime.VMSSName, &armcompute.VirtualMachineScaleSetsClientBeginDeleteOptions{ + _, err := s.GetAzure().VMSS.BeginDelete(ctx, *s.Runtime.Cluster.Model.Properties.NodeResourceGroup, s.Runtime.VMSSName, &armcompute.VirtualMachineScaleSetsClientBeginDeleteOptions{ ForceDeletion: to.Ptr(true), }) if err != nil { @@ -1173,7 +1262,7 @@ func getBaseVMSSModel(s *Scenario, customData, cseCmd string) armcompute.Virtual ID: to.Ptr( fmt.Sprintf( loadBalancerBackendAddressPoolIDTemplate, - config.Config.SubscriptionID, + s.GetSubscriptionID(), *s.Runtime.Cluster.Model.Properties.NodeResourceGroup, ), ), diff --git a/parts/linux/cloud-init/artifacts/cse_cmd.sh b/parts/linux/cloud-init/artifacts/cse_cmd.sh index 550ac950c37..7a034efa98f 100644 --- a/parts/linux/cloud-init/artifacts/cse_cmd.sh +++ b/parts/linux/cloud-init/artifacts/cse_cmd.sh @@ -18,8 +18,9 @@ fi; {{end}} {{if IsAKSCustomCloud}} REPO_DEPOT_ENDPOINT="{{AKSCustomCloudRepoDepotEndpoint}}" -{{GetInitAKSCustomCloudFilepath}} >> /var/log/azure/cluster-provision.log 2>&1; {{end}} +LOCATION={{GetVariable "location"}} +{{GetInitAKSCustomCloudFilepath}} >> /var/log/azure/cluster-provision.log 2>&1; ADMINUSER={{GetParameter "linuxAdminUsername"}} MOBY_VERSION={{GetParameter "mobyVersion"}} TENANT_ID={{GetVariable "tenantID"}} @@ -32,7 +33,6 @@ KUBEPROXY_URL={{GetParameter "kubeProxySpec"}} 
APISERVER_PUBLIC_KEY={{GetParameter "apiServerCertificate"}} SUBSCRIPTION_ID={{GetVariable "subscriptionId"}} RESOURCE_GROUP={{GetVariable "resourceGroup"}} -LOCATION={{GetVariable "location"}} VM_TYPE={{GetVariable "vmType"}} SUBNET={{GetVariable "subnetName"}} NETWORK_SECURITY_GROUP={{GetVariable "nsgName"}} diff --git a/parts/linux/cloud-init/artifacts/init-aks-custom-cloud-mariner.sh b/parts/linux/cloud-init/artifacts/init-aks-custom-cloud-mariner.sh deleted file mode 100644 index 587da9ba270..00000000000 --- a/parts/linux/cloud-init/artifacts/init-aks-custom-cloud-mariner.sh +++ /dev/null @@ -1,186 +0,0 @@ -#!/bin/bash -set -x -mkdir -p /root/AzureCACertificates - -IS_MARINER=0 -IS_AZURELINUX=0 -# shellcheck disable=SC3010 -if [[ -f /etc/os-release ]]; then - . /etc/os-release - # shellcheck disable=SC3010 - if [[ $NAME == *"Mariner"* ]]; then - IS_MARINER=1 - elif [[ $NAME == *"Microsoft Azure Linux"* ]]; then - IS_AZURELINUX=1 - else - echo "Unknown Linux distribution" - exit 1 - fi -else - echo "Unsupported operating system" - exit 1 -fi - -echo "distribution is $distribution" -echo "Running on $NAME" - -# http://168.63.129.16 is a constant for the host's wireserver endpoint -certs=$(curl "http://168.63.129.16/machine?comp=acmspackage&type=cacertificates&ext=json") -IFS_backup=$IFS -IFS=$'\r\n' -certNames=($(echo $certs | grep -oP '(?<=Name\": \")[^\"]*')) -certBodies=($(echo $certs | grep -oP '(?<=CertBody\": \")[^\"]*')) -for i in ${!certBodies[@]}; do - echo ${certBodies[$i]} | sed 's/\\r\\n/\n/g' | sed 's/\\//g' > "/root/AzureCACertificates/$(echo ${certNames[$i]} | sed 's/.cer/.crt/g')" -done -IFS=$IFS_backup - -cp /root/AzureCACertificates/*.crt /etc/pki/ca-trust/source/anchors/ -/usr/bin/update-ca-trust - -# This section creates a cron job to poll for refreshed CA certs daily -# It can be removed if not needed or desired -action=${1:-init} -if [ "$action" = "ca-refresh" ]; then - exit -fi - -scriptPath=$0 -# Determine an absolute, canonical path to 
this script for use in cron. -if command -v readlink >/dev/null 2>&1; then - # Use readlink -f when available to resolve the canonical path; fall back to $0 on error. - scriptPath="$(readlink -f "$0" 2>/dev/null || printf '%s' "$0")" -fi - -if ! crontab -l 2>/dev/null | grep -q "\"$scriptPath\" ca-refresh"; then - # Quote the script path in the cron entry to avoid issues with spaces or special characters. - if ! (crontab -l 2>/dev/null ; printf '%s\n' "0 19 * * * \"$scriptPath\" ca-refresh") | crontab -; then - echo "Failed to install ca-refresh cron job via crontab" >&2 - fi -fi - -cloud-init status --wait - -function init_mariner_repo_depot { - local repodepot_endpoint=$1 - echo "Adding [extended] repo" - cp /etc/yum.repos.d/mariner-extras.repo /etc/yum.repos.d/mariner-extended.repo - sed -i -e "s|extras|extended|" /etc/yum.repos.d/mariner-extended.repo - sed -i -e "s|Extras|Extended|" /etc/yum.repos.d/mariner-extended.repo - - echo "Adding [nvidia] repo" - cp /etc/yum.repos.d/mariner-extras.repo /etc/yum.repos.d/mariner-nvidia.repo - sed -i -e "s|extras|nvidia|" /etc/yum.repos.d/mariner-nvidia.repo - sed -i -e "s|Extras|Nvidia|" /etc/yum.repos.d/mariner-nvidia.repo - - echo "Adding [cloud-native] repo" - cp /etc/yum.repos.d/mariner-extras.repo /etc/yum.repos.d/mariner-cloud-native.repo - sed -i -e "s|extras|cloud-native|" /etc/yum.repos.d/mariner-cloud-native.repo - sed -i -e "s|Extras|Cloud-Native|" /etc/yum.repos.d/mariner-cloud-native.repo - - echo "Pointing Mariner repos at RepoDepot..." - for f in /etc/yum.repos.d/*.repo - do - sed -i -e "s|https://packages.microsoft.com|${repodepot_endpoint}/mariner/packages.microsoft.com|" $f - echo "$f modified." - done - echo "Mariner repo setup complete." 
-} - -function init_azurelinux_repo_depot { - local repodepot_endpoint=$1 - repos=("amd" "base" "cloud-native" "extended" "ms-non-oss" "ms-oss" "nvidia") - - # tbd maybe we do this a bit nicer - rm -f /etc/yum.repos.d/azurelinux* - - for repo in "${repos[@]}"; do - output_file="/etc/yum.repos.d/azurelinux-${repo}.repo" - repo_content=( - "[azurelinux-official-$repo]" - "name=Azure Linux Official $repo \$releasever \$basearch" - "baseurl=$repodepot_endpoint/azurelinux/\$releasever/prod/$repo/\$basearch" - "gpgkey=file:///etc/pki/rpm-gpg/MICROSOFT-RPM-GPG-KEY" - "gpgcheck=1" - "repo_gpgcheck=1" - "enabled=1" - "skip_if_unavailable=True" - "sslverify=1" - ) - - rm -f "$output_file" - - for line in "${repo_content[@]}"; do - echo "$line" >> "$output_file" - done - - echo "File '$output_file' has been created." - done - echo "Azure Linux repo setup complete." -} - -dnf_makecache() { - local retries=10 - local dnf_makecache_output=/tmp/dnf-makecache.out - local i - for i in $(seq 1 $retries); do - ! (dnf makecache -y 2>&1 | tee $dnf_makecache_output | grep -E "^([WE]:.*)|([eE]rr.*)$") && \ - cat $dnf_makecache_output && break || \ - cat $dnf_makecache_output - if [ $i -eq $retries ]; then - return 1 - else sleep 5 - fi - done - echo "Executed dnf makecache -y $i times" -} - -marinerRepoDepotEndpoint="$(echo "${REPO_DEPOT_ENDPOINT}" | sed 's/\/ubuntu//')" -if [ -z "$marinerRepoDepotEndpoint" ]; then - >&2 echo "repo depot endpoint empty while running custom-cloud init script" -else - # logic taken from https://repodepot.azure.com/scripts/cloud-init/setup_repodepot.sh - if [ "$IS_MARINER" -eq 1 ]; then - echo "Initializing Mariner repo depot settings..." - init_mariner_repo_depot ${marinerRepoDepotEndpoint} - dnf_makecache || exit 1 - elif [ "$IS_AZURELINUX" -eq 1 ]; then - echo "Initializing Azure Linux repo depot settings..." 
- init_azurelinux_repo_depot ${marinerRepoDepotEndpoint} - dnf_makecache || exit 1 - else - echo "No customizations for distribution: $NAME" - fi -fi - -# Set the chrony config to use the PHC /dev/ptp0 clock -cat > /etc/chrony.conf < "/root/AzureCACertificates/$cert_filename" - echo "Successfully saved certificate: $cert_filename" - else - echo "Warning: Failed to retrieve certificate content for $cert_filename" - fi - done -} - -# Process root certificates -process_cert_operations "operationrequestsroot" - -# Process intermediate certificates -process_cert_operations "operationrequestsintermediate" - -# Copy all certificate files to the Mariner/AzureLinux system certificate directory -cp /root/AzureCACertificates/*.crt /etc/pki/ca-trust/source/anchors/ - -# Update the system certificate store using Mariner/AzureLinux command -/usr/bin/update-ca-trust - -# This section creates a cron job to poll for refreshed CA certs daily -# It can be removed if not needed or desired -action=${1:-init} -if [ "$action" = "ca-refresh" ]; then - exit -fi - -scriptPath=$0 -# Determine an absolute, canonical path to this script for use in cron. -if command -v readlink >/dev/null 2>&1; then - # Use readlink -f when available to resolve the canonical path; fall back to $0 on error. - scriptPath="$(readlink -f "$0" 2>/dev/null || printf '%s' "$0")" -fi - -if ! crontab -l 2>/dev/null | grep -q "\"$scriptPath\" ca-refresh"; then - # Quote the script path in the cron entry to avoid issues with spaces or special characters. - if ! 
(crontab -l 2>/dev/null ; printf '%s\n' "0 19 * * * \"$scriptPath\" ca-refresh") | crontab -; then - echo "Failed to install ca-refresh cron job via crontab" >&2 - fi -fi - -function init_mariner_repo_depot { - local repodepot_endpoint=$1 - echo "Adding [extended] repo" - cp /etc/yum.repos.d/mariner-extras.repo /etc/yum.repos.d/mariner-extended.repo - sed -i -e "s|extras|extended|" /etc/yum.repos.d/mariner-extended.repo - sed -i -e "s|Extras|Extended|" /etc/yum.repos.d/mariner-extended.repo - - echo "Adding [nvidia] repo" - cp /etc/yum.repos.d/mariner-extras.repo /etc/yum.repos.d/mariner-nvidia.repo - sed -i -e "s|extras|nvidia|" /etc/yum.repos.d/mariner-nvidia.repo - sed -i -e "s|Extras|Nvidia|" /etc/yum.repos.d/mariner-nvidia.repo - - echo "Adding [cloud-native] repo" - cp /etc/yum.repos.d/mariner-extras.repo /etc/yum.repos.d/mariner-cloud-native.repo - sed -i -e "s|extras|cloud-native|" /etc/yum.repos.d/mariner-cloud-native.repo - sed -i -e "s|Extras|Cloud-Native|" /etc/yum.repos.d/mariner-cloud-native.repo - - echo "Pointing Mariner repos at RepoDepot..." - for f in /etc/yum.repos.d/*.repo - do - sed -i -e "s|https://packages.microsoft.com|${repodepot_endpoint}/mariner/packages.microsoft.com|" $f - echo "$f modified." - done - echo "Mariner repo setup complete." 
-} - -function init_azurelinux_repo_depot { - local repodepot_endpoint=$1 - repos=("amd" "base" "cloud-native" "extended" "ms-non-oss" "ms-oss" "nvidia") - - # tbd maybe we do this a bit nicer - rm -f /etc/yum.repos.d/azurelinux* - - for repo in "${repos[@]}"; do - output_file="/etc/yum.repos.d/azurelinux-${repo}.repo" - repo_content=( - "[azurelinux-official-$repo]" - "name=Azure Linux Official $repo \$releasever \$basearch" - "baseurl=$repodepot_endpoint/azurelinux/\$releasever/prod/$repo/\$basearch" - "gpgkey=file:///etc/pki/rpm-gpg/MICROSOFT-RPM-GPG-KEY" - "gpgcheck=1" - "repo_gpgcheck=1" - "enabled=1" - "skip_if_unavailable=True" - "sslverify=1" - ) - - rm -f "$output_file" - - for line in "${repo_content[@]}"; do - echo "$line" >> "$output_file" - done - - echo "File '$output_file' has been created." - done -} - -cloud-init status --wait - -dnf_makecache() { - local retries=10 - local dnf_makecache_output=/tmp/dnf-makecache.out - local i - for i in $(seq 1 $retries); do - ! (dnf makecache -y 2>&1 | tee $dnf_makecache_output | grep -E "^([WE]:.*)|([eE]rr.*)$") && \ - cat $dnf_makecache_output && break || \ - cat $dnf_makecache_output - if [ $i -eq $retries ]; then - return 1 - else sleep 5 - fi - done - echo "Executed dnf makecache -y $i times" -} - -marinerRepoDepotEndpoint="$(echo "${REPO_DEPOT_ENDPOINT}" | sed 's/\/ubuntu//')" -if [ -z "$marinerRepoDepotEndpoint" ]; then - >&2 echo "repo depot endpoint empty while running custom-cloud init script" -else - # logic taken from https://repodepot.azure.com/scripts/cloud-init/setup_repodepot.sh - if [ "$IS_MARINER" -eq 1 ]; then - echo "Initializing Mariner repo depot settings..." - init_mariner_repo_depot ${marinerRepoDepotEndpoint} - dnf_makecache || exit 1 - elif [ "$IS_AZURELINUX" -eq 1 ]; then - echo "Initializing Azure Linux repo depot settings..." 
- init_azurelinux_repo_depot ${marinerRepoDepotEndpoint} - dnf_makecache || exit 1 - else - echo "No customizations for distribution: $NAME" - fi -fi - -#EOF diff --git a/parts/linux/cloud-init/artifacts/init-aks-custom-cloud-operation-requests.sh b/parts/linux/cloud-init/artifacts/init-aks-custom-cloud-operation-requests.sh deleted file mode 100644 index 99ae86d0242..00000000000 --- a/parts/linux/cloud-init/artifacts/init-aks-custom-cloud-operation-requests.sh +++ /dev/null @@ -1,346 +0,0 @@ -#!/bin/bash -set -x -mkdir -p /root/AzureCACertificates - -IS_FLATCAR=0 -IS_UBUNTU=0 -IS_ACL=0 -# shellcheck disable=SC3010 -if [[ -f /etc/os-release ]]; then - . /etc/os-release - # shellcheck disable=SC3010 - if [[ $NAME == *"Ubuntu"* ]]; then - IS_UBUNTU=1 - elif [[ $ID == *"flatcar"* ]]; then - IS_FLATCAR=1 - elif [[ $ID == "azurecontainerlinux" ]] || { [[ $ID == "azurelinux" ]] && [[ ${VARIANT_ID:-} == "azurecontainerlinux" ]]; }; then - IS_ACL=1 - else - echo "Unknown Linux distribution" - exit 1 - fi -else - echo "Unsupported operating system" - exit 1 -fi - -echo "distribution is $distribution" -echo "Running on $NAME" - -# http://168.63.129.16 is a constant for the host's wireserver endpoint -WIRESERVER_ENDPOINT="http://168.63.129.16" - -# Function to make HTTP request with retry logic for rate limiting -make_request_with_retry() { - local url="$1" - local max_retries=10 - local retry_delay=3 - local attempt=1 - - local response - while [ $attempt -le $max_retries ]; do - response=$(curl -f --no-progress-meter "$url") - local request_status=$? 
- - if echo "$response" | grep -q "RequestRateLimitExceeded"; then - sleep $retry_delay - retry_delay=$((retry_delay * 2)) - attempt=$((attempt + 1)) - elif [ $request_status -ne 0 ]; then - sleep $retry_delay - attempt=$((attempt + 1)) - else - echo "$response" - return 0 - fi - done - - echo "exhausted all retries, last response: $response" - return 1 -} - -# Function to process certificate operations from a given endpoint -process_cert_operations() { - local endpoint_type="$1" - local operation_response - - echo "Retrieving certificate operations for type: $endpoint_type" - operation_response=$(make_request_with_retry "${WIRESERVER_ENDPOINT}/machine?comp=acmspackage&type=$endpoint_type&ext=json") - local request_status=$? - if [ -z "$operation_response" ] || [ $request_status -ne 0 ]; then - echo "Warning: No response received or request failed for: ${WIRESERVER_ENDPOINT}/machine?comp=acmspackage&type=$endpoint_type&ext=json" - return - fi - - # Extract ResourceFileName values from the JSON response - local cert_filenames - mapfile -t cert_filenames < <(echo "$operation_response" | grep -oP '(?<="ResouceFileName": ")[^"]*') - - if [ ${#cert_filenames[@]} -eq 0 ]; then - echo "No certificate filenames found in response for $endpoint_type" - return - fi - - # Process each certificate file - for cert_filename in "${cert_filenames[@]}"; do - echo "Processing certificate file: $cert_filename" - - # Extract filename and extension - local filename="${cert_filename%.*}" - local extension="${cert_filename##*.}" - - echo "Downloading certificate: filename=$filename, extension=$extension" - - # Retrieve the actual certificate content with retry logic - local cert_content - cert_content=$(make_request_with_retry "${WIRESERVER_ENDPOINT}/machine?comp=acmspackage&type=$filename&ext=$extension") - local request_status=$? 
- if [ -z "$cert_content" ] || [ $request_status -ne 0 ]; then - echo "Warning: No response received or request failed for: ${WIRESERVER_ENDPOINT}/machine?comp=acmspackage&type=$filename&ext=$extension" - continue - fi - - if [ -n "$cert_content" ]; then - # Save the certificate to the appropriate location - echo "$cert_content" > "/root/AzureCACertificates/$cert_filename" - echo "Successfully saved certificate: $cert_filename" - else - echo "Warning: Failed to retrieve certificate content for $cert_filename" - fi - done -} - -# Process root certificates -process_cert_operations "operationrequestsroot" - -# Process intermediate certificates -process_cert_operations "operationrequestsintermediate" - -if [ "$IS_ACL" -eq 1 ]; then - cp /root/AzureCACertificates/*.crt /etc/pki/ca-trust/source/anchors/ - update-ca-trust -elif [ "${IS_FLATCAR}" -eq 0 ]; then - # Copy all certificate files to the system certificate directory - cp /root/AzureCACertificates/*.crt /usr/local/share/ca-certificates/ - - # Update the system certificate store - update-ca-certificates - - # This copies the updated bundle to the location used by OpenSSL which is commonly used - cp /etc/ssl/certs/ca-certificates.crt /usr/lib/ssl/cert.pem -else - for cert in /root/AzureCACertificates/*.crt; do - destcert="${cert##*/}" - destcert="${destcert%.*}.pem" - cp "$cert" /etc/ssl/certs/"$destcert" - done - update-ca-certificates -fi - - - -# This section creates a cron job to poll for refreshed CA certs daily -# It can be removed if not needed or desired -action=${1:-init} -if [ "$action" = "ca-refresh" ]; then - exit -fi - -function init_ubuntu_main_repo_depot { - local repodepot_endpoint="$1" - # Initialize directory for keys - mkdir -p /etc/apt/keyrings - - # This copies the updated bundle to the location used by OpenSSL which is commonly used - echo "Copying updated bundle to OpenSSL .pem file..." - cp /etc/ssl/certs/ca-certificates.crt /usr/lib/ssl/cert.pem - echo "Updated bundle copied." 
- - # Back up sources.list and sources.list.d contents - mkdir -p /etc/apt/backup/ - if [ -f "/etc/apt/sources.list" ]; then - mv /etc/apt/sources.list /etc/apt/backup/ - fi - for sources_file in /etc/apt/sources.list.d/*; do - if [ -f "$sources_file" ]; then - mv "$sources_file" /etc/apt/backup/ - fi - done - - # Set location of sources file - . /etc/os-release - aptSourceFile="/etc/apt/sources.list.d/ubuntu.sources" - - # Create main sources file - cat < /etc/apt/sources.list.d/ubuntu.sources - -Types: deb -URIs: ${repodepot_endpoint}/ubuntu -Suites: ${VERSION_CODENAME} ${VERSION_CODENAME}-updates ${VERSION_CODENAME}-backports ${VERSION_CODENAME}-security -Components: main universe restricted multiverse -Signed-By: /usr/share/keyrings/ubuntu-archive-keyring.gpg -EOF - - # Update the apt sources file using the RepoDepot Ubuntu URL for this cloud. Update it by replacing - # all urls with the RepoDepot Ubuntu url - ubuntuUrl=${repodepot_endpoint}/ubuntu - echo "Converting URLs in $aptSourceFile to RepoDepot URLs..." - sed -i "s,https\?://.[^ ]*,$ubuntuUrl,g" $aptSourceFile - echo "apt source URLs converted, see new file below:" - echo "" - echo "-----" - cat $aptSourceFile - echo "-----" - echo "" -} - -function check_url { - local url=$1 - echo "Checking url: $url" - - # Use curl to check the URL and capture both stdout and stderr - curl_exit_code=$(curl -s --head --request GET $url) - # Check the exit status of curl - # shellcheck disable=SC3010 - if [[ $? -ne 0 ]] || echo "$curl_exit_code" | grep -E "404 Not Found" > /dev/null; then - echo "ERROR: $url is not available. 
Please manually check if the url is valid before re-running script" - exit 1 - fi -} - -function write_to_sources_file { - local sources_list_d_file=$1 - local source_uri=$2 - shift 2 - local key_paths=("$@") - - sources_file_path="/etc/apt/sources.list.d/${sources_list_d_file}.sources" - ubuntuDist=$(lsb_release -c | awk '{print $2}') - - tee -a $sources_file_path < /dev/null - echo "$key_name key added to keyring." -} - -function derive_key_paths { - local key_names=("$@") - local key_paths=() - - for key_name in "${key_names[@]}"; do - key_paths+=("/etc/apt/keyrings/${key_name}.gpg") - done - - echo "${key_paths[*]}" -} - -function add_ms_keys { - # Add the Microsoft package server keys to keyring. - echo "Adding Microsoft keys to keyring..." - - add_key_ubuntu microsoft.asc - add_key_ubuntu msopentech.asc -} - -function aptget_update { - echo "apt-get updating..." - echo "note: depending on how many sources have been added this may take a couple minutes..." - if apt-get update | grep -q "404 Not Found"; then - echo "ERROR: apt-get update failed to find all sources. Please validate the sources or remove bad sources from your sources and try again." - exit 1 - else - echo "apt-get update complete!" - fi -} - -function init_ubuntu_pmc_repo_depot { - local repodepot_endpoint="$1" - # Add Microsoft packages source to the azure specific sources.list. - echo "Adding the packages.microsoft.com Ubuntu-$ubuntuRel repo..." - - microsoftPackageSource="$repodepot_endpoint/microsoft/ubuntu/$ubuntuRel/prod" - check_url $microsoftPackageSource - write_to_sources_file microsoft-prod $microsoftPackageSource $(derive_key_paths microsoft.asc msopentech.asc) - write_to_sources_file microsoft-prod-testing $microsoftPackageSource $(derive_key_paths microsoft.asc msopentech.asc) - echo "Ubuntu ($ubuntuRel) repo added." 
- echo "Adding packages.microsoft.com keys" - add_ms_keys $repodepot_endpoint -} - -if [ "$IS_UBUNTU" -eq 1 ]; then - scriptPath=$0 - # Determine an absolute, canonical path to this script for use in cron. - if command -v readlink >/dev/null 2>&1; then - # Use readlink -f when available to resolve the canonical path; fall back to $0 on error. - scriptPath="$(readlink -f "$0" 2>/dev/null || printf '%s' "$0")" - fi - - if ! crontab -l 2>/dev/null | grep -q "\"$scriptPath\" ca-refresh"; then - # Quote the script path in the cron entry to avoid issues with spaces or special characters. - if ! (crontab -l 2>/dev/null ; printf '%s\n' "0 19 * * * \"$scriptPath\" ca-refresh") | crontab -; then - echo "Failed to install ca-refresh cron job via crontab" >&2 - fi - fi - - cloud-init status --wait - rootRepoDepotEndpoint="$(echo "${REPO_DEPOT_ENDPOINT}" | sed 's/\/ubuntu//')" - # logic taken from https://repodepot.azure.com/scripts/cloud-init/setup_repodepot.sh - ubuntuRel=$(lsb_release --release | awk '{print $2}') - ubuntuDist=$(lsb_release -c | awk '{print $2}') - # initialize archive.ubuntu.com repo - init_ubuntu_main_repo_depot ${rootRepoDepotEndpoint} - init_ubuntu_pmc_repo_depot ${rootRepoDepotEndpoint} - # update apt list - echo "Running apt-get update" - aptget_update -elif [ "$IS_FLATCAR" -eq 1 ] || [ "$IS_ACL" -eq 1 ]; then - script_path="$(readlink -f "$0")" - svc="/etc/systemd/system/azure-ca-refresh.service" - tmr="/etc/systemd/system/azure-ca-refresh.timer" - - cat >"$svc" <"$tmr" < /etc/apt/sources.list.d/ubuntu.sources + +Types: deb +URIs: ${repodepot_endpoint}/ubuntu +Suites: ${VERSION_CODENAME} ${VERSION_CODENAME}-updates ${VERSION_CODENAME}-backports ${VERSION_CODENAME}-security +Components: main universe restricted multiverse +Signed-By: /usr/share/keyrings/ubuntu-archive-keyring.gpg +EOF + + # Update the apt sources file using the RepoDepot Ubuntu URL for this cloud. 
Update it by replacing + # all urls with the RepoDepot Ubuntu url + ubuntuUrl=${repodepot_endpoint}/ubuntu + echo "Converting URLs in $aptSourceFile to RepoDepot URLs..." + sed -i "s,https\?://.[^ ]*,$ubuntuUrl,g" $aptSourceFile + echo "apt source URLs converted, see new file below:" + echo "" + echo "-----" + cat $aptSourceFile + echo "-----" + echo "" +} + +function check_url { + local url=$1 + echo "Checking url: $url" + + # Use curl to check the URL and capture both stdout and stderr + curl_exit_code=$(curl -s --head --request GET $url) + # Check the exit status of curl + # shellcheck disable=SC3010 + if [[ $? -ne 0 ]] || echo "$curl_exit_code" | grep -E "404 Not Found" > /dev/null; then + echo "ERROR: $url is not available. Please manually check if the url is valid before re-running script" + exit 1 + fi +} + +function write_to_sources_file { + local sources_list_d_file=$1 + local source_uri=$2 + shift 2 + local key_paths=("$@") + + sources_file_path="/etc/apt/sources.list.d/${sources_list_d_file}.sources" + ubuntuDist=$(lsb_release -c | awk '{print $2}') + + tee -a $sources_file_path < /dev/null + echo "$key_name key added to keyring." +} + +function derive_key_paths { + local key_names=("$@") + local key_paths=() + + for key_name in "${key_names[@]}"; do + key_paths+=("/etc/apt/keyrings/${key_name}.gpg") + done + + echo "${key_paths[*]}" +} + +function add_ms_keys { + # Add the Microsoft package server keys to keyring. + echo "Adding Microsoft keys to keyring..." + + add_key_ubuntu microsoft.asc + add_key_ubuntu msopentech.asc +} + +function aptget_update { + echo "apt-get updating..." + echo "note: depending on how many sources have been added this may take a couple minutes..." + if apt-get update | grep -q "404 Not Found"; then + echo "ERROR: apt-get update failed to find all sources. Please validate the sources or remove bad sources from your sources and try again." + exit 1 + else + echo "apt-get update complete!" 
+ fi +} + +function init_ubuntu_pmc_repo_depot { + local repodepot_endpoint="$1" + # Add Microsoft packages source to the azure specific sources.list. + echo "Adding the packages.microsoft.com Ubuntu-$ubuntuRel repo..." + + microsoftPackageSource="$repodepot_endpoint/microsoft/ubuntu/$ubuntuRel/prod" + check_url $microsoftPackageSource + write_to_sources_file microsoft-prod $microsoftPackageSource $(derive_key_paths microsoft.asc msopentech.asc) + write_to_sources_file microsoft-prod-testing $microsoftPackageSource $(derive_key_paths microsoft.asc msopentech.asc) + echo "Ubuntu ($ubuntuRel) repo added." + echo "Adding packages.microsoft.com keys" + add_ms_keys $repodepot_endpoint +} + +function init_mariner_repo_depot { + local repodepot_endpoint=$1 + echo "Adding [extended] repo" + cp /etc/yum.repos.d/mariner-extras.repo /etc/yum.repos.d/mariner-extended.repo + sed -i -e "s|extras|extended|" /etc/yum.repos.d/mariner-extended.repo + sed -i -e "s|Extras|Extended|" /etc/yum.repos.d/mariner-extended.repo + + echo "Adding [nvidia] repo" + cp /etc/yum.repos.d/mariner-extras.repo /etc/yum.repos.d/mariner-nvidia.repo + sed -i -e "s|extras|nvidia|" /etc/yum.repos.d/mariner-nvidia.repo + sed -i -e "s|Extras|Nvidia|" /etc/yum.repos.d/mariner-nvidia.repo + + echo "Adding [cloud-native] repo" + cp /etc/yum.repos.d/mariner-extras.repo /etc/yum.repos.d/mariner-cloud-native.repo + sed -i -e "s|extras|cloud-native|" /etc/yum.repos.d/mariner-cloud-native.repo + sed -i -e "s|Extras|Cloud-Native|" /etc/yum.repos.d/mariner-cloud-native.repo + + echo "Pointing Mariner repos at RepoDepot..." + for f in /etc/yum.repos.d/*.repo; do + sed -i -e "s|https://packages.microsoft.com|${repodepot_endpoint}/mariner/packages.microsoft.com|" $f + echo "$f modified." + done + echo "Mariner repo setup complete." 
+} + +function init_azurelinux_repo_depot { + local repodepot_endpoint=$1 + local repos=("amd" "base" "cloud-native" "extended" "ms-non-oss" "ms-oss" "nvidia") + + rm -f /etc/yum.repos.d/azurelinux* + + for repo in "${repos[@]}"; do + output_file="/etc/yum.repos.d/azurelinux-${repo}.repo" + repo_content=( + "[azurelinux-official-$repo]" + "name=Azure Linux Official $repo \$releasever \$basearch" + "baseurl=$repodepot_endpoint/azurelinux/\$releasever/prod/$repo/\$basearch" + "gpgkey=file:///etc/pki/rpm-gpg/MICROSOFT-RPM-GPG-KEY" + "gpgcheck=1" + "repo_gpgcheck=1" + "enabled=1" + "skip_if_unavailable=True" + "sslverify=1" + ) + + rm -f "$output_file" + + for line in "${repo_content[@]}"; do + echo "$line" >> "$output_file" + done + + echo "File '$output_file' has been created." + done + echo "Azure Linux repo setup complete." +} + +function dnf_makecache { + local retries=10 + local dnf_makecache_output=/tmp/dnf-makecache.out + local i + for i in $(seq 1 $retries); do + ! (dnf makecache -y 2>&1 | tee $dnf_makecache_output | grep -E "^([WE]:.*)|([eE]rr.*)$") && \ + cat $dnf_makecache_output && break || \ + cat $dnf_makecache_output + if [ $i -eq $retries ]; then + return 1 + else + sleep 5 + fi + done + echo "Executed dnf makecache -y $i times" +} + +if [ "$IS_UBUNTU" -eq 1 ]; then + rootRepoDepotEndpoint="$(echo "${REPO_DEPOT_ENDPOINT}" | sed 's/\/ubuntu//')" + if [ -n "$rootRepoDepotEndpoint" ]; then + cloud-init status --wait + ubuntuRel=$(lsb_release --release | awk '{print $2}') + ubuntuDist=$(lsb_release -c | awk '{print $2}') + init_ubuntu_main_repo_depot ${rootRepoDepotEndpoint} + init_ubuntu_pmc_repo_depot ${rootRepoDepotEndpoint} + echo "Running apt-get update" + aptget_update + else + echo "REPO_DEPOT_ENDPOINT empty, skipping Ubuntu RepoDepot initialization" + fi +elif [ "$IS_MARINER" -eq 1 ] || [ "$IS_AZURELINUX" -eq 1 ]; then + cloud-init status --wait + + marinerRepoDepotEndpoint="$(echo "${REPO_DEPOT_ENDPOINT}" | sed 's/\/ubuntu//')" + if [ -z 
"$marinerRepoDepotEndpoint" ]; then + >&2 echo "repo depot endpoint empty while running custom-cloud init script" + else + if [ "$IS_MARINER" -eq 1 ]; then + echo "Initializing Mariner repo depot settings..." + init_mariner_repo_depot ${marinerRepoDepotEndpoint} + dnf_makecache || exit 1 + else + echo "Initializing Azure Linux repo depot settings..." + init_azurelinux_repo_depot ${marinerRepoDepotEndpoint} + dnf_makecache || exit 1 + fi + fi +fi + +# Disable systemd-timesyncd and install chrony and uses local time source +# ACL has PTP clock config compiled into chronyd with no config file or sourcedir directives, +# so it uses only the local PTP clock and has no DHCP-injectable NTP sources. +if [ "$IS_ACL" -eq 1 ]; then + echo "Skipping chrony configuration for ACL (PTP clock baked into chronyd, no external NTP sources)" +elif [ "$IS_MARINER" -eq 1 ] || [ "$IS_AZURELINUX" -eq 1 ]; then +cat > /etc/chrony.conf < $chrony_conf < "/root/AzureCACertificates/$(echo ${certNames[$i]} | sed "s/.cer/.${ext}/g")" -done -IFS=$IFS_backup - -if [ "$IS_ACL" -eq 1 ]; then - cp /root/AzureCACertificates/*.crt /etc/pki/ca-trust/source/anchors/ - update-ca-trust -elif [ "$IS_FLATCAR" -eq 1 ]; then - cp /root/AzureCACertificates/*.pem /etc/ssl/certs/ - update-ca-certificates -else - cp /root/AzureCACertificates/*.crt /usr/local/share/ca-certificates/ - update-ca-certificates - - # This copies the updated bundle to the location used by OpenSSL which is commonly used - cp /etc/ssl/certs/ca-certificates.crt /usr/lib/ssl/cert.pem -fi - -# This section creates a cron job to poll for refreshed CA certs daily -# It can be removed if not needed or desired -action=${1:-init} -if [ "$action" = "ca-refresh" ]; then - exit -fi +WIRESERVER_ENDPOINT="http://168.63.129.16" + +function make_request_with_retry { + local url="$1" + local max_retries=10 + local retry_delay=3 + local attempt=1 + + local response + while [ $attempt -le $max_retries ]; do + response=$(curl -f --no-progress-meter 
--connect-timeout 10 --max-time 30 "$url") + local request_status=$? + + if echo "$response" | grep -q "RequestRateLimitExceeded"; then + sleep $retry_delay + retry_delay=$((retry_delay * 2)) + attempt=$((attempt + 1)) + elif [ $request_status -ne 0 ]; then + sleep $retry_delay + attempt=$((attempt + 1)) + else + echo "$response" + return 0 + fi + done -function init_ubuntu_main_repo_depot { - local repodepot_endpoint="$1" - # Initialize directory for keys - mkdir -p /etc/apt/keyrings + echo "exhausted all retries, last response: $response" + return 1 +} - # This copies the updated bundle to the location used by OpenSSL which is commonly used - echo "Copying updated bundle to OpenSSL .pem file..." - cp /etc/ssl/certs/ca-certificates.crt /usr/lib/ssl/cert.pem - echo "Updated bundle copied." +function is_opted_in_for_root_certs { + local opt_in_response + local request_status + local poll_attempt=1 + local max_poll_attempts=30 + local poll_interval=10 + + # Poll wireserver for up to ~5 minutes to allow platform metadata to sync. + # The VM instance tag triggers a Fabric Controller goal state (CCF) update, + # which must propagate to the host agent before wireserver can reflect it. + # FC goal state propagation can take several minutes in practice. + while [ $poll_attempt -le $max_poll_attempts ]; do + echo "is_opted_in_for_root_certs: poll attempt ${poll_attempt}/${max_poll_attempts}" + + opt_in_response=$(make_request_with_retry "${WIRESERVER_ENDPOINT}/acms/isOptedInForRootCerts") + request_status=$? 
+ + echo "is_opted_in_for_root_certs: wireserver response (status=${request_status}): '${opt_in_response}'" + + if [ $request_status -ne 0 ] || [ -z "$opt_in_response" ]; then + echo "Warning: failed to determine IsOptedInForRootCerts state on attempt ${poll_attempt}" + elif echo "$opt_in_response" | grep -q "IsOptedInForRootCerts=true"; then + echo "IsOptedInForRootCerts=true (found on attempt ${poll_attempt})" + return 0 + fi - # Back up sources.list and sources.list.d contents - mkdir -p /etc/apt/backup/ - if [ -f "/etc/apt/sources.list" ]; then - mv /etc/apt/sources.list /etc/apt/backup/ - fi - for sources_file in /etc/apt/sources.list.d/*; do - if [ -f "$sources_file" ]; then - mv "$sources_file" /etc/apt/backup/ + if [ $poll_attempt -lt $max_poll_attempts ]; then + echo "is_opted_in_for_root_certs: not opted in yet, waiting ${poll_interval}s before retry..." + sleep $poll_interval fi + + poll_attempt=$((poll_attempt + 1)) done - # Set location of sources file - . /etc/os-release - aptSourceFile="/etc/apt/sources.list.d/ubuntu.sources" + echo "Skipping custom cloud root cert installation because IsOptedInForRootCerts is not true after ${max_poll_attempts} attempts" + echo "Last wireserver response: '${opt_in_response}'" + return 1 +} - # Create main sources file - cat < /etc/apt/sources.list.d/ubuntu.sources +function get_trust_store_dir { + if [ "$IS_ACL" -eq 1 ] || [ "$IS_MARINER" -eq 1 ] || [ "$IS_AZURELINUX" -eq 1 ]; then + echo "/etc/pki/ca-trust/source/anchors" + elif [ "$IS_FLATCAR" -eq 1 ]; then + echo "/etc/ssl/certs" + else + echo "/usr/local/share/ca-certificates" + fi +} -Types: deb -URIs: ${repodepot_endpoint}/ubuntu -Suites: ${VERSION_CODENAME} ${VERSION_CODENAME}-updates ${VERSION_CODENAME}-backports ${VERSION_CODENAME}-security -Components: main universe restricted multiverse -Signed-By: /usr/share/keyrings/ubuntu-archive-keyring.gpg -EOF +function debug_print_trust_store { + local stage="$1" + local trust_store_dir - # Update the apt sources 
file using the RepoDepot Ubuntu URL for this cloud. Update it by replacing - # all urls with the RepoDepot Ubuntu url - ubuntuUrl=${repodepot_endpoint}/ubuntu - echo "Converting URLs in $aptSourceFile to RepoDepot URLs..." - sed -i "s,https\?://.[^ ]*,$ubuntuUrl,g" $aptSourceFile - echo "apt source URLs converted, see new file below:" - echo "" - echo "-----" - cat $aptSourceFile - echo "-----" - echo "" + trust_store_dir=$(get_trust_store_dir) + echo "Trust store contents ${stage} cert copy: ${trust_store_dir}" + ls -al "$trust_store_dir" || true } -function check_url { - local url=$1 - echo "Checking url: $url" +function retrieve_legacy_certs { + local certs + local cert_names + local cert_bodies + local i - # Use curl to check the URL and capture both stdout and stderr - curl_exit_code=$(curl -s --head --request GET $url) - # Check the exit status of curl - # shellcheck disable=SC3010 - if [[ $? -ne 0 ]] || echo "$curl_exit_code" | grep -E "404 Not Found" > /dev/null; then - echo "ERROR: $url is not available. 
Please manually check if the url is valid before re-running script" - exit 1 + certs=$(make_request_with_retry "${WIRESERVER_ENDPOINT}/machine?comp=acmspackage&type=cacertificates&ext=json") + if [ -z "$certs" ]; then + echo "Warning: failed to retrieve legacy custom cloud certificates" + return 1 fi + + IFS_backup=$IFS + IFS=$'\r\n' + cert_names=($(echo $certs | grep -oP '(?<=Name\": \")[^\"]*')) + cert_bodies=($(echo $certs | grep -oP '(?<=CertBody\": \")[^\"]*')) + for i in ${!cert_bodies[@]}; do + echo ${cert_bodies[$i]} | sed 's/\\r\\n/\n/g' | sed 's/\\//g' > "/root/AzureCACertificates/$(echo ${cert_names[$i]} | sed 's/.cer/.crt/g')" + done + IFS=$IFS_backup } -function write_to_sources_file { - local sources_list_d_file=$1 - local source_uri=$2 - shift 2 - local key_paths=("$@") +function process_cert_operations { + local endpoint_type="$1" + local operation_response - sources_file_path="/etc/apt/sources.list.d/${sources_list_d_file}.sources" - ubuntuDist=$(lsb_release -c | awk '{print $2}') + echo "Retrieving certificate operations for type: $endpoint_type" + operation_response=$(make_request_with_retry "${WIRESERVER_ENDPOINT}/machine?comp=acmspackage&type=$endpoint_type&ext=json") + local request_status=$? + if [ -z "$operation_response" ] || [ $request_status -ne 0 ]; then + echo "Warning: No response received or request failed for: ${WIRESERVER_ENDPOINT}/machine?comp=acmspackage&type=$endpoint_type&ext=json" + return 1 + fi - tee -a $sources_file_path < /dev/null - echo "$key_name key added to keyring." -} + local filename="${cert_filename%.*}" + local extension="${cert_filename##*.}" + local cert_content -function derive_key_paths { - local key_names=("$@") - local key_paths=() + cert_content=$(make_request_with_retry "${WIRESERVER_ENDPOINT}/machine?comp=acmspackage&type=$filename&ext=$extension") + local request_status=$? 
+ if [ -z "$cert_content" ] || [ $request_status -ne 0 ]; then + echo "Warning: No response received or request failed for: ${WIRESERVER_ENDPOINT}/machine?comp=acmspackage&type=$filename&ext=$extension" + continue + fi - for key_name in "${key_names[@]}"; do - key_paths+=("/etc/apt/keyrings/${key_name}.gpg") + echo "$cert_content" > "/root/AzureCACertificates/$cert_filename" + echo "Successfully saved certificate: $cert_filename" done +} - echo "${key_paths[*]}" +function retrieve_rcv1p_certs { + process_cert_operations "operationrequestsroot" || return 1 + process_cert_operations "operationrequestsintermediate" || return 1 } -function add_ms_keys { - # Add the Microsoft package server keys to keyring. - echo "Adding Microsoft keys to keyring..." +function install_certs_to_trust_store { + mkdir -p /root/AzureCACertificates + + debug_print_trust_store "before" + + if [ "$IS_ACL" -eq 1 ] || [ "$IS_MARINER" -eq 1 ] || [ "$IS_AZURELINUX" -eq 1 ]; then + cp /root/AzureCACertificates/*.crt /etc/pki/ca-trust/source/anchors/ + update-ca-trust + elif [ "$IS_FLATCAR" -eq 1 ]; then + for cert in /root/AzureCACertificates/*.crt; do + destcert="${cert##*/}" + destcert="${destcert%.*}.pem" + cp "$cert" /etc/ssl/certs/"$destcert" + done + update-ca-certificates + else + cp /root/AzureCACertificates/*.crt /usr/local/share/ca-certificates/ + update-ca-certificates + + # This copies the updated bundle to the location used by OpenSSL which is commonly used + cp /etc/ssl/certs/ca-certificates.crt /usr/lib/ssl/cert.pem + fi - add_key_ubuntu microsoft.asc - add_key_ubuntu msopentech.asc + debug_print_trust_store "after" } -function aptget_update { - echo "apt-get updating..." - echo "note: depending on how many sources have been added this may take a couple minutes..." - if apt-get update | grep -q "404 Not Found"; then - echo "ERROR: apt-get update failed to find all sources. Please validate the sources or remove bad sources from your sources and try again." 
- exit 1 +# Certificate refresh behavior summary: +# - legacy mode directly attempts certificate download from wireserver and only in ussec and usnat regions. +# - rcv1p mode first checks IsOptedInForRootCerts, then downloads only when opted in. +# - Wireserver failures are treated as non-fatal, and cert trust-store updates are skipped gracefully. + +refresh_location="${2:-${LOCATION}}" + +location_normalized="${refresh_location,,}" +location_normalized="${location_normalized//[[:space:]]/}" +if [ -z "$location_normalized" ]; then + echo "Warning: LOCATION is empty; defaulting custom cloud certificate endpoint mode to rcv1p" +fi + +cert_endpoint_mode="rcv1p" +case "$location_normalized" in + ussec*|usnat*) cert_endpoint_mode="legacy" ;; +esac + +echo "Using custom cloud certificate endpoint mode: ${cert_endpoint_mode}" +install_ca_refresh_schedule=0 +mkdir -p /root/AzureCACertificates +rm -f /root/AzureCACertificates/* +if [ "$cert_endpoint_mode" = "legacy" ]; then + install_ca_refresh_schedule=1 + if retrieve_legacy_certs; then + install_certs_to_trust_store else - echo "apt-get update complete!" + echo "Warning: failed to retrieve legacy certificates from wireserver; continuing without trust store updates" fi -} +elif [ "$cert_endpoint_mode" = "rcv1p" ]; then + if is_opted_in_for_root_certs; then + install_ca_refresh_schedule=1 + if retrieve_rcv1p_certs; then + install_certs_to_trust_store + else + echo "Warning: failed to retrieve rcv1p certificates from wireserver; continuing without trust store updates" + fi + fi +fi -function init_ubuntu_pmc_repo_depot { - local repodepot_endpoint="$1" - # Add Microsoft packages source to the azure specific sources.list. - echo "Adding the packages.microsoft.com Ubuntu-$ubuntuRel repo..." 
- - microsoftPackageSource="$repodepot_endpoint/microsoft/ubuntu/$ubuntuRel/prod" - check_url $microsoftPackageSource - write_to_sources_file microsoft-prod $microsoftPackageSource $(derive_key_paths microsoft.asc msopentech.asc) - write_to_sources_file microsoft-prod-testing $microsoftPackageSource $(derive_key_paths microsoft.asc msopentech.asc) - echo "Ubuntu ($ubuntuRel) repo added." - echo "Adding packages.microsoft.com keys" - add_ms_keys $repodepot_endpoint -} +# In ca-refresh mode (invoked by the scheduled cron/systemd task with the location as arg), +# only the cert refresh above is needed; exit before running the full init path. +# Action values: +# - init (default): full provisioning path +# - ca-refresh : periodic refresh path; location is passed as arg to avoid env dependency +action=${1:-init} +if [ "$action" = "ca-refresh" ]; then + exit +fi -if [ "$IS_UBUNTU" -eq 1 ]; then +if [ "$IS_UBUNTU" -eq 1 ] || [ "$IS_MARINER" -eq 1 ] || [ "$IS_AZURELINUX" -eq 1 ]; then scriptPath=$0 # Determine an absolute, canonical path to this script for use in cron. if command -v readlink >/dev/null 2>&1; then @@ -209,30 +271,21 @@ if [ "$IS_UBUNTU" -eq 1 ]; then scriptPath="$(readlink -f "$0" 2>/dev/null || printf '%s' "$0")" fi - if ! crontab -l 2>/dev/null | grep -q "\"$scriptPath\" ca-refresh"; then - # Quote the script path in the cron entry to avoid issues with spaces or special characters. - if ! (crontab -l 2>/dev/null ; printf '%s\n' "0 19 * * * \"$scriptPath\" ca-refresh") | crontab -; then - echo "Failed to install ca-refresh cron job via crontab" >&2 + if [ "$install_ca_refresh_schedule" -eq 1 ]; then + if ! crontab -l 2>/dev/null | grep -q "\"$scriptPath\" ca-refresh"; then + # Quote the script path in the cron entry to avoid issues with spaces or special characters. + if ! 
(crontab -l 2>/dev/null ; printf '%s\n' "0 19 * * * \"$scriptPath\" ca-refresh \"$LOCATION\"") | crontab -; then + echo "Failed to install ca-refresh cron job via crontab" >&2 + fi fi fi - - cloud-init status --wait - rootRepoDepotEndpoint="$(echo "${REPO_DEPOT_ENDPOINT}" | sed 's/\/ubuntu//')" - # logic taken from https://repodepot.azure.com/scripts/cloud-init/setup_repodepot.sh - ubuntuRel=$(lsb_release --release | awk '{print $2}') - ubuntuDist=$(lsb_release -c | awk '{print $2}') - # initialize archive.ubuntu.com repo - init_ubuntu_main_repo_depot ${rootRepoDepotEndpoint} - init_ubuntu_pmc_repo_depot ${rootRepoDepotEndpoint} - # update apt list - echo "Running apt-get update" - aptget_update elif [ "$IS_FLATCAR" -eq 1 ] || [ "$IS_ACL" -eq 1 ]; then - script_path="$(readlink -f "$0")" - svc="/etc/systemd/system/azure-ca-refresh.service" - tmr="/etc/systemd/system/azure-ca-refresh.timer" + if [ "$install_ca_refresh_schedule" -eq 1 ]; then + script_path="$(readlink -f "$0")" + svc="/etc/systemd/system/azure-ca-refresh.service" + tmr="/etc/systemd/system/azure-ca-refresh.timer" - cat >"$svc" <"$svc" <"$tmr" <"$tmr" < $chrony_conf < $certFilePath + } + + return $true } - Write-Log "Convert CA certificates rawdata" - $caCerts=($rawData.Content) | ConvertFrom-Json - if ([string]::IsNullOrEmpty($caCerts)) { - Set-ExitCode -ExitCode $global:WINDOWS_CSE_ERROR_EMPTY_CA_CERTIFICATES -ErrorMessage "CA certificates rawdata is empty" + $optInUri = 'http://168.63.129.16/acms/isOptedInForRootCerts' + $optInResponse = Retry-Command -Command 'Invoke-WebRequest' -Args @{Uri=$optInUri; UseBasicParsing=$true} -Retries 5 -RetryDelaySeconds 10 + if (($optInResponse.Content -notmatch 'IsOptedInForRootCerts=true')) { + Write-Log "Skipping custom cloud root cert installation because IsOptedInForRootCerts is not true" + return $false + } + + $operationRequestTypes = @("operationrequestsroot", "operationrequestsintermediate") + $downloadedAny = $false + + foreach ($requestType in 
$operationRequestTypes) { + $operationRequestUri = "http://168.63.129.16/machine?comp=acmspackage&type=$requestType&ext=json" + $operationResponse = Retry-Command -Command 'Invoke-WebRequest' -Args @{Uri=$operationRequestUri; UseBasicParsing=$true} -Retries 5 -RetryDelaySeconds 10 + $operationJson = ($operationResponse.Content) | ConvertFrom-Json + + if ($null -eq $operationJson -or $null -eq $operationJson.OperationRequests) { + Write-Log "Warning: no operation requests found for $requestType" + continue + } + + foreach ($operation in $operationJson.OperationRequests) { + $resourceFileName = $operation.ResouceFileName + if ([string]::IsNullOrEmpty($resourceFileName)) { + continue + } + + $resourceType = [IO.Path]::GetFileNameWithoutExtension($resourceFileName) + $resourceExt = [IO.Path]::GetExtension($resourceFileName).TrimStart('.') + $resourceUri = "http://168.63.129.16/machine?comp=acmspackage&type=$resourceType&ext=$resourceExt" + + $certContentResponse = Retry-Command -Command 'Invoke-WebRequest' -Args @{Uri=$resourceUri; UseBasicParsing=$true} -Retries 5 -RetryDelaySeconds 10 + if ([string]::IsNullOrEmpty($certContentResponse.Content)) { + Write-Log "Warning: empty certificate content for $resourceFileName" + continue + } + + $certFilePath = Join-Path $caFolder $resourceFileName + Write-Log "Write certificate $resourceFileName to $certFilePath" + $certContentResponse.Content > $certFilePath + $downloadedAny = $true + } } - $certificates = $caCerts.Certificates - for ($index = 0; $index -lt $certificates.Length ; $index++) { - $name=$certificates[$index].Name - $certFilePath = Join-Path $caFolder $name - Write-Log "Write certificate $name to $certFilePath" - $certificates[$index].CertBody > $certFilePath + if (-not $downloadedAny) { + Write-Log "Warning: no CA certificates were downloaded in rcv1p mode" } + + return $downloadedAny } catch { - # Catch all exceptions in this function. NOTE: exit cannot be caught. 
- Set-ExitCode -ExitCode $global:WINDOWS_CSE_ERROR_GET_CA_CERTIFICATES -ErrorMessage $_ + if ($FailOnError) { + throw "Failed to retrieve CA certificates. Error: $_" + } + Write-Log "Warning: failed to retrieve CA certificates. Error: $_" + return $false } } diff --git a/staging/cse/windows/kubernetesfunc.tests.ps1 b/staging/cse/windows/kubernetesfunc.tests.ps1 new file mode 100644 index 00000000000..42accc39c51 --- /dev/null +++ b/staging/cse/windows/kubernetesfunc.tests.ps1 @@ -0,0 +1,260 @@ +BeforeAll { + if (-not (Get-PSDrive -Name C -ErrorAction SilentlyContinue)) { + New-PSDrive -Name C -PSProvider FileSystem -Root ([System.IO.Path]::GetTempPath()) | Out-Null + } + + function Write-Log { + param($Message) + Write-Host "$Message" + } + + function Logs-To-Event { + param($TaskName, $TaskMessage) + Write-Host "$TaskName $TaskMessage" + } + + function Set-ExitCode { + param($ExitCode, $ErrorMessage) + throw "Unexpected Set-ExitCode: $ExitCode $ErrorMessage" + } + + function Create-Directory { + param($FullPath, $DirectoryUsage) + if (-not (Test-Path $FullPath)) { + New-Item -Path $FullPath -ItemType Directory -Force | Out-Null + } + } + + function Get-ScheduledTask { + param($TaskName, $ErrorAction) + } + + function New-ScheduledTaskAction { + param($Execute, $Argument) + } + + function New-ScheduledTaskPrincipal { + param($UserId, $LogonType, $RunLevel) + } + + function New-JobTrigger { + param([switch]$Daily, $At, $DaysInterval) + } + + function New-ScheduledTask { + param($Action, $Principal, $Trigger, $Description) + } + + function Register-ScheduledTask { + param($TaskName, $InputObject) + } + + function Retry-Command { + param($Command, $Args, $Retries, $RetryDelaySeconds) + } + + $helperScriptPath = Join-Path $PSScriptRoot '..\..\..\parts\windows\windowscsehelper.ps1' + $scriptUnderTestPath = Join-Path $PSScriptRoot 'kubernetesfunc.ps1' + + . $helperScriptPath + . 
$scriptUnderTestPath +} + +Describe 'Get-CustomCloudCertEndpointModeFromLocation' { + It 'returns legacy for ussec regions' { + Get-CustomCloudCertEndpointModeFromLocation -Location 'ussecwest' | Should -Be 'legacy' + } + + It 'returns legacy for usnat regions' { + Get-CustomCloudCertEndpointModeFromLocation -Location 'usnatcentral' | Should -Be 'legacy' + } + + It 'returns rcv1p for public regions' { + Get-CustomCloudCertEndpointModeFromLocation -Location 'southcentralus' | Should -Be 'rcv1p' + } + + It 'handles mixed-case input' { + Get-CustomCloudCertEndpointModeFromLocation -Location 'UsSeCeast' | Should -Be 'legacy' + } +} + +Describe 'Register-CACertificatesRefreshTask' { + BeforeEach { + $script:lastScheduledTaskArgument = $null + + Mock Logs-To-Event -MockWith { } + Mock New-ScheduledTaskPrincipal -MockWith { return @{ Kind = 'principal' } } + Mock New-JobTrigger -MockWith { return @{ Kind = 'trigger' } } + Mock New-ScheduledTask -MockWith { return @{ Kind = 'definition' } } + Mock Register-ScheduledTask -MockWith { } + Mock New-ScheduledTaskAction -MockWith { + param($Execute, $Argument) + $script:lastScheduledTaskArgument = $Argument + return @{ Execute = $Execute; Argument = $Argument } + } + } + + It 'skips registration when the task already exists' { + Mock Get-ScheduledTask -MockWith { return @{ TaskName = 'aks-ca-certs-refresh-task' } } + + Register-CACertificatesRefreshTask -Location 'southcentralus' + + Assert-MockCalled -CommandName Register-ScheduledTask -Exactly -Times 0 + Assert-MockCalled -CommandName New-ScheduledTaskAction -Exactly -Times 0 + } + + It 'creates a scheduled task that passes location for cert endpoint mode derivation' { + Mock Get-ScheduledTask -MockWith { return $null } + + Register-CACertificatesRefreshTask -Location 'southcentralus' + + Assert-MockCalled -CommandName Register-ScheduledTask -Exactly -Times 1 + $script:lastScheduledTaskArgument | Should -Match ([regex]::Escape("Get-CACertificates -Location 'southcentralus'")) 
+ } +} + +Describe 'Should-InstallCACertificatesRefreshTask' { + BeforeEach { + Mock Retry-Command -MockWith { } + } + + It 'returns true for legacy regions without calling the opt-in endpoint' { + Mock Retry-Command + + $result = Should-InstallCACertificatesRefreshTask -Location 'ussecwest' + + $result | Should -Be $true + Assert-MockCalled -CommandName Retry-Command -Exactly -Times 0 + } + + It 'returns true for rcv1p regions when opt-in is enabled' { + $script:lastRetryUri = $null + Mock Retry-Command -MockWith { + param($Command, $Args, $Retries, $RetryDelaySeconds) + $script:lastRetryUri = $PSBoundParameters['Args'].Uri + return [PSCustomObject]@{ Content = 'IsOptedInForRootCerts=true' } + } + + $result = Should-InstallCACertificatesRefreshTask -Location 'southcentralus' + + $result | Should -Be $true + Assert-MockCalled -CommandName Retry-Command -Exactly -Times 1 + $script:lastRetryUri | Should -Be 'http://168.63.129.16/acms/isOptedInForRootCerts' + } + + It 'returns false for rcv1p regions when opt-in is disabled' { + Mock Retry-Command -MockWith { + return [PSCustomObject]@{ Content = 'IsOptedInForRootCerts=false' } + } + + $result = Should-InstallCACertificatesRefreshTask -Location 'southcentralus' + + $result | Should -Be $false + } +} + +Describe 'Get-CACertificates' { + BeforeEach { + if (Test-Path 'C:\ca') { + Remove-Item -Path 'C:\ca' -Recurse -Force + } + } + + It 'uses the legacy endpoint when location is a ussec/usnat region' { + $script:retryUris = @() + Mock Retry-Command -MockWith { + param($Command, $Args, $Retries, $RetryDelaySeconds) + $script:retryUris += $PSBoundParameters['Args'].Uri + return [PSCustomObject]@{ + Content = '{"Certificates":[{"Name":"legacy.crt","CertBody":"legacy-body"}]}' + } + } + + $result = Get-CACertificates -Location 'ussecwest' + + $result | Should -Be $true + Assert-MockCalled -CommandName Retry-Command -Exactly -Times 1 + $script:retryUris | Should -Contain 
'http://168.63.129.16/machine?comp=acmspackage&type=cacertificates&ext=json' + $script:retryUris | Should -Not -Contain 'http://168.63.129.16/acms/isOptedInForRootCerts' + } + + It 'returns false when certificate retrieval throws' { + Mock Retry-Command -MockWith { + throw 'simulated retrieval failure' + } + + $result = Get-CACertificates -Location 'southcentralus' + + $result | Should -Be $false + } + + It 'throws when certificate retrieval fails with -FailOnError' { + Mock Retry-Command -MockWith { + throw 'simulated retrieval failure' + } + + { Get-CACertificates -Location 'southcentralus' -FailOnError } | Should -Throw '*Failed to retrieve CA certificates*' + } + + It 'throws when legacy endpoint returns empty data with -FailOnError' { + Mock Retry-Command -MockWith { + return [PSCustomObject]@{ + Content = '{"Certificates":[]}' + } + } + + { Get-CACertificates -Location 'ussecwest' -FailOnError } | Should -Throw '*CA certificates rawdata is empty*' + } + + It 'falls back to legacy endpoint when called without -Location (backward compat)' { + $script:retryUris = @() + Mock Retry-Command -MockWith { + param($Command, $Args, $Retries, $RetryDelaySeconds) + $script:retryUris += $PSBoundParameters['Args'].Uri + return [PSCustomObject]@{ + Content = '{"Certificates":[{"Name":"compat.crt","CertBody":"compat-body"}]}' + } + } + + $result = Get-CACertificates + + $result | Should -Be $true + Assert-MockCalled -CommandName Retry-Command -Exactly -Times 1 + $script:retryUris | Should -Contain 'http://168.63.129.16/machine?comp=acmspackage&type=cacertificates&ext=json' + } +} + +Describe 'Should-InstallCACertificatesRefreshTask - backward compat' { + It 'returns true when called without -Location (backward compat)' { + $result = Should-InstallCACertificatesRefreshTask + + $result | Should -Be $true + } +} + +Describe 'Register-CACertificatesRefreshTask - backward compat' { + BeforeEach { + $script:lastScheduledTaskArgument = $null + + Mock Logs-To-Event -MockWith { } + 
Mock New-ScheduledTaskPrincipal -MockWith { return @{ Kind = 'principal' } } + Mock New-JobTrigger -MockWith { return @{ Kind = 'trigger' } } + Mock New-ScheduledTask -MockWith { return @{ Kind = 'definition' } } + Mock Register-ScheduledTask -MockWith { } + Mock New-ScheduledTaskAction -MockWith { + param($Execute, $Argument) + $script:lastScheduledTaskArgument = $Argument + return @{ Execute = $Execute; Argument = $Argument } + } + } + + It 'creates a scheduled task without -Location when called without it (backward compat)' { + Mock Get-ScheduledTask -MockWith { return $null } + + Register-CACertificatesRefreshTask + + Assert-MockCalled -CommandName Register-ScheduledTask -Exactly -Times 1 + $script:lastScheduledTaskArgument | Should -Match ([regex]::Escape("Get-CACertificates |")) + $script:lastScheduledTaskArgument | Should -Not -Match "Location" + } +}