diff --git a/.pipelines/.vsts-vhd-builder-pr-windows.yaml b/.pipelines/.vsts-vhd-builder-pr-windows.yaml
index be72f7cc4af..bf6b05417db 100644
--- a/.pipelines/.vsts-vhd-builder-pr-windows.yaml
+++ b/.pipelines/.vsts-vhd-builder-pr-windows.yaml
@@ -34,7 +34,7 @@ pr:
- parts/windows
- go.mod
- go.sum
- - e2e/
+ - e2e/scenario_win_test.go
- staging/cse/windows/
exclude:
diff --git a/e2e/README.md b/e2e/README.md
index 812439f1843..21c2eb201f0 100644
--- a/e2e/README.md
+++ b/e2e/README.md
@@ -19,11 +19,16 @@ From a high-level, for each scenario,
To write an E2E scenario,
-- choose a testing cluster. There are a few defined
- in [cluster.go](https://github.com/Azure/AgentBaker/blob/dev/e2e/cluster.go), e.g,
- - ClusterKubenetAirgap
- - ClusterAzureNetwork
+- choose a testing cluster. There are several defined
+ in [cache.go](cache.go), e.g,
- ClusterKubenet
+ - ClusterAzureNetwork
+ - ClusterAzureOverlayNetwork
+ - ClusterAzureOverlayNetworkDualStack
+ - ClusterCiliumNetwork
+ - ClusterLatestKubernetesVersion
+ - ClusterAzureBootstrapProfileCache (private ACR)
+ - ClusterAzureNetworkIsolated (no internet access)
- use `NodeBootstrappingConfiugration` (`nbc`) to setup your scenario. it is used to invoke the primary
node-bootstrapping
API [GetLatestNodeBootstrapping](https://github.com/Azure/AgentBaker/blob/2e730b5a498c5be9b082d912fd08ac9346582db9/pkg/agent/bakerapi.go#L14).
@@ -31,27 +36,153 @@ To write an E2E scenario,
as well as `nbc.agentPoolProfile`. It is because when RP invokes AgentBaker, it will set the properties in this way
and in e2e we follow the pattern.
- use `VMConfigMutator` to set VMSS properties such as SKU when needed.
- Check [vmss](https://github.com/Azure/AgentBaker/blob/dev/e2e/vmss.go) for other configs.
+ Check [vmss](vmss.go) for other configs.
it is necessary to set `nbc.agentPoolProfile.VMSize` to match the VMSS SKU if you choose to change.
- use `Validator` to include your own verification of the VM's live state, such as file existsnce, sysctl settings, etc.
+## Infrastructure Architecture
+
+All E2E clusters share a single VNet and Azure Bastion in the `abe2e-{location}` resource group. This
+avoids creating a per-cluster Bastion (~10 min each) and ensures all clusters are reachable from a
+single SSH entry point.
+
+```mermaid
+graph TB
+ subgraph RG["abe2e-{location} Resource Group"]
+ subgraph VNET["abe2e-shared-vnet (10.0.0.0/8)"]
+ BASTION_SUBNET["AzureBastionSubnet
10.0.0.0/26"]
+ FW_SUBNET["AzureFirewallSubnet
10.0.1.0/24"]
+ PE_SUBNET["abe2e-pe-subnet
10.0.2.0/24
(shared private endpoints)"]
+ KUBENET_SUBNET["aks-subnet-abe2e-kubenet-v5
10.x.x.0/20"]
+ AZNET_SUBNET["aks-subnet-abe2e-azure-network-v4
10.x.x.0/20"]
+ MORE_SUBNETS["... more cluster subnets"]
+ end
+ BASTION["abe2e-shared-bastion
(Standard SKU, Tunneling)"]
+ FIREWALL["abe2e-fw
(Azure Firewall)"]
+ IDENTITY["abe2e-cluster-identity
(User-Assigned MSI)"]
+ PE_ACR["PE-for-abe2eprivate{location}
PE-for-abe2eprivatenonanon{location}
(shared ACR private endpoints)"]
+ DNS_ZONE["privatelink.azurecr.io
(Private DNS Zone)"]
+ ACR_ANON["abe2eprivate{location}
(Private ACR)"]
+ ACR_NONANON["abe2eprivatenonanon{location}
(Non-anonymous Private ACR)"]
+ end
+
+ subgraph MC_KUBENET["MC_abe2e-kubenet-v5 Resource Group"]
+ VMSS_K["VMSS (system pool)"]
+ VMSS_K_TEST["VMSS (test VMs)"]
+ RT_K["Route Table
(pod routes + firewall)"]
+ end
+
+ subgraph MC_NI["MC_abe2e-azure-networkisolated-v2 Resource Group"]
+ VMSS_NI["VMSS (system pool)"]
+ NSG_NI["NSG
(blocks internet)"]
+ end
+
+ BASTION --> BASTION_SUBNET
+ FIREWALL --> FW_SUBNET
+ PE_ACR --> PE_SUBNET
+ DNS_ZONE -.->|VNet link| VNET
+ VMSS_K --> KUBENET_SUBNET
+ RT_K -.->|associated| KUBENET_SUBNET
+ VMSS_NI --> AZNET_SUBNET
+ NSG_NI -.->|associated| AZNET_SUBNET
+
+ DEV["Developer / CI"]
+ DEV -->|SSH via tunnel| BASTION
+ BASTION -->|"connects to any VM
in shared VNet"| VMSS_K_TEST
+```
+
+### Shared Infrastructure Setup
+
+The shared infrastructure is created **automatically** on first test run via cached idempotent
+functions — no separate setup script is needed.
+
+| Resource | Name | Details |
+|----------|------|---------|
+| VNet | `abe2e-shared-vnet` | `10.0.0.0/8` — supports ~4096 `/20` cluster subnets |
+| Bastion | `abe2e-shared-bastion` | Standard SKU with tunneling enabled for native SSH |
+| Bastion Subnet | `AzureBastionSubnet` | `10.0.0.0/26` (required by Azure Bastion) |
+| Firewall Subnet | `AzureFirewallSubnet` | `10.0.1.0/24` |
+| PE Subnet | `abe2e-pe-subnet` | `10.0.2.0/24` — hosts shared private endpoints for ACRs |
+| Identity | `abe2e-cluster-identity` | User-assigned MSI with Network Contributor on the VNet |
+| Private DNS Zone | `privatelink.azurecr.io` | Shared zone in `abe2e-{location}` RG, linked to the VNet |
+
+Each AKS cluster gets its own `/20` subnet (4091 usable IPs) in the shared VNet. The subnet is
+named `aks-subnet-{clusterName}`. CIDRs are auto-allocated from a hash of the cluster name to
+avoid collisions.
+
+### Cluster Types
+
+All clusters use BYOV (Bring Your Own VNet) with the shared VNet. They differ in networking
+plugin, isolation level, and whether private ACR is needed.
+
+| Cluster | Network Plugin | Special Features | Private ACR |
+|---------|---------------|-----------------|:-----------:|
+| `abe2e-kubenet-v5` | Kubenet | Basic pod routing via route table | ❌ |
+| `abe2e-azure-network-v4` | Azure CNI | Pods get IPs from subnet (MaxPods=30) | ❌ |
+| `abe2e-azure-overlay-network-v4` | Azure CNI Overlay | Pods in virtual overlay, not subnet | ❌ |
+| `abe2e-azure-overlay-dualstack-v4` | Azure CNI Overlay | IPv4+IPv6 dual-stack | ❌ |
+| `abe2e-cilium-network-v4` | Azure CNI + Cilium | eBPF dataplane, replaces kube-proxy | ❌ |
+| `abe2e-latest-kubernetes-version-v2` | Kubenet | Auto-discovers latest GA K8s version | ❌ |
+| `abe2e-azure-bootstrapprofile-cache-v2` | Azure CNI | Bootstrap artifact caching from private ACR | ✅ |
+| `abe2e-azure-networkisolated-v2` | Azure CNI | NSG blocks all internet except allowlist | ✅ |
+
+**Network-isolated cluster** adds an NSG to its subnet that blocks all outbound traffic except
+`management.azure.com`, the cluster FQDN, and `packages.aks.azure.com`. Private endpoints for
+the ACRs are in the shared PE subnet, with DNS records in the shared `privatelink.azurecr.io` zone.
+
+### How It Works
+
+1. **`CachedEnsureSharedInfra`** — runs once per location per test run. Creates/verifies the shared
+ VNet, Bastion, Firewall, PE subnet, and user-assigned identity.
+2. **`configureSharedVNet`** — tags the cluster model for BYOV. After the cluster name is hashed,
+ **`CachedEnsureClusterSubnet`** creates the cluster's dedicated `/20` subnet.
+3. **`prepareCluster`** — creates/gets the AKS cluster, then runs a DAG of parallel tasks:
+ - Bastion lookup (shared)
+ - Firewall route table (non-isolated clusters)
+ - NSG association (network-isolated cluster)
+ - Private DNS zone + VNet link (if ACR needed, runs once before ACR tasks)
+ - Private ACR + PE creation (bootstrapprofile-cache and network-isolated)
+ - VMSS garbage collection
+ - Debug daemonsets
+4. SSH to test VMs goes through the shared Bastion, which can reach any VM in the VNet.
+
+### Test Flow
+
```mermaid
sequenceDiagram
- E2E->>+ARM: Get or Create AKS Cluster
- ARM-->>-E2E: Cluster details
- E2E->>+AgentBakerCode: Fetch VM Configuration (include CSE)
- AgentBakerCode-->>-E2E: VM Configuration
- E2E->>+ARM: Create VM using fetched VM Config in cluster network
- ARM-->>-E2E: VM instance
- E2E->>+Bastion: Create SSH Tunnel
- Bastion->>+VM: Forward SSH Connection
- E2E->>VM: Healthcheck via SSH Tunnel
- VM-->>E2E: Healthcheck OK
- E2E->>+KubeAPI: Verify Node Ready
- KubeAPI-->>-E2E: Node Ready
- E2E->>VM: Execute test validators via SSH Tunnel
- VM-->>-E2E: Test results
- Bastion-->>-E2E: Close SSH Tunnel
+ participant CI as Developer / CI
+ participant Infra as Shared Infra (cached)
+ participant ARM as Azure Resource Manager
+ participant AB as AgentBaker API
+ participant Bastion as Shared Bastion
+ participant VM as Test VM
+ participant K8s as Kube API Server
+
+ CI->>Infra: Ensure shared VNet + Bastion
+ Infra-->>CI: Ready (cached after first run)
+
+ CI->>Infra: Ensure cluster subnet
+ Infra-->>CI: Subnet ID
+
+ CI->>ARM: Create/Get AKS cluster (BYOV subnet)
+ ARM-->>CI: Cluster details
+
+ CI->>AB: Generate CSE + CustomData
+ AB-->>CI: VM configuration
+
+ CI->>ARM: Create VMSS in cluster subnet
+ ARM-->>CI: VM instance
+
+ CI->>Bastion: SSH tunnel to VM private IP
+ Bastion->>VM: Forward SSH connection
+
+ CI->>VM: Run health checks + validators
+ VM-->>CI: Results
+
+ CI->>K8s: Verify node ready
+ K8s-->>CI: Node ready ✓
+
+ Bastion-->>CI: Close tunnel
```
## Running Locally
diff --git a/e2e/aks_model.go b/e2e/aks_model.go
index b618ff87524..832123b0d12 100644
--- a/e2e/aks_model.go
+++ b/e2e/aks_model.go
@@ -112,7 +112,7 @@ func getAzureOverlayNetworkDualStackClusterModel(name, location, k8sSystemPoolSK
networkProfile.PodCidr,
to.Ptr("fd12:3456:789a::/64 "),
}
- networkProfile.ServiceCidr = to.Ptr("10.0.0.0/16")
+ networkProfile.ServiceCidr = to.Ptr("172.16.0.0/16")
networkProfile.ServiceCidrs = []*string{
networkProfile.ServiceCidr,
to.Ptr("fd12:3456:789a:1::/108"),
@@ -173,6 +173,8 @@ func getBaseClusterModel(clusterName, location, k8sSystemPoolSKU string) *armcon
},
NetworkProfile: &armcontainerservice.NetworkProfile{
NetworkPlugin: to.Ptr(armcontainerservice.NetworkPluginKubenet),
+ ServiceCidr: to.Ptr("172.16.0.0/16"),
+ DNSServiceIP: to.Ptr("172.16.0.10"),
},
AddonProfiles: map[string]*armcontainerservice.ManagedClusterAddonProfile{
"omsagent": {
@@ -301,115 +303,31 @@ func getFirewall(ctx context.Context, location, firewallSubnetID, publicIPID str
}
func addFirewallRules(
- ctx context.Context, clusterModel *armcontainerservice.ManagedCluster,
+ ctx context.Context, infra *SharedInfra, clusterModel *armcontainerservice.ManagedCluster,
) error {
- location := *clusterModel.Location
defer toolkit.LogStepCtx(ctx, "adding firewall rules")()
- rg := *clusterModel.Properties.NodeResourceGroup
- vnet, err := getClusterVNet(ctx, rg)
+ nodeRG := *clusterModel.Properties.NodeResourceGroup
+ vnet, err := getClusterVNet(ctx, clusterModel)
if err != nil {
return err
}
+ firewallPrivateIP := infra.FirewallIP
+
// For kubenet, the AKS-managed route table must stay attached so that pod
// routes (managed by cloud-provider-azure) and firewall routes coexist.
// For Azure CNI variants, the subnet may not have any route table, so we
// create and associate a dedicated one before adding the firewall routes.
- aksSubnetResp, err := config.Azure.Subnet.Get(ctx, rg, vnet.name, "aks-subnet", nil)
+ aksSubnetResp, err := config.Azure.Subnet.Get(ctx, vnet.resourceGroup, vnet.name, vnet.subnetName, nil)
if err != nil {
return fmt.Errorf("failed to get AKS subnet: %w", err)
}
- aksRTName, err := ensureFirewallRouteTable(ctx, clusterModel, vnet.name, aksSubnetResp.Subnet)
+ aksRTName, err := ensureFirewallRouteTable(ctx, clusterModel, vnet, aksSubnetResp.Subnet)
if err != nil {
return err
}
- // Create AzureFirewallSubnet - this subnet name is required by Azure Firewall
- firewallSubnetName := "AzureFirewallSubnet"
- firewallSubnetParams := armnetwork.Subnet{
- Properties: &armnetwork.SubnetPropertiesFormat{
- AddressPrefix: to.Ptr("10.225.0.0/24"), // Use a different CIDR that doesn't overlap with 10.224.0.0/16
- },
- }
-
- toolkit.Logf(ctx, "Creating subnet %s in VNet %s", firewallSubnetName, vnet.name)
- subnetPoller, err := config.Azure.Subnet.BeginCreateOrUpdate(
- ctx,
- rg,
- vnet.name,
- firewallSubnetName,
- firewallSubnetParams,
- nil,
- )
- if err != nil {
- return fmt.Errorf("failed to start creating firewall subnet: %w", err)
- }
-
- subnetResp, err := subnetPoller.PollUntilDone(ctx, config.DefaultPollUntilDoneOptions)
- if err != nil {
- return fmt.Errorf("failed to create firewall subnet: %w", err)
- }
-
- firewallSubnetID := *subnetResp.ID
- toolkit.Logf(ctx, "Created firewall subnet with ID: %s", firewallSubnetID)
-
- // Create public IP for the firewall
- publicIPName := "abe2e-fw-pip"
- publicIPParams := armnetwork.PublicIPAddress{
- Location: to.Ptr(location),
- SKU: &armnetwork.PublicIPAddressSKU{
- Name: to.Ptr(armnetwork.PublicIPAddressSKUNameStandard),
- },
- Properties: &armnetwork.PublicIPAddressPropertiesFormat{
- PublicIPAllocationMethod: to.Ptr(armnetwork.IPAllocationMethodStatic),
- },
- }
-
- toolkit.Logf(ctx, "Creating public IP %s", publicIPName)
- pipPoller, err := config.Azure.PublicIPAddresses.BeginCreateOrUpdate(
- ctx,
- rg,
- publicIPName,
- publicIPParams,
- nil,
- )
- if err != nil {
- return fmt.Errorf("failed to start creating public IP: %w", err)
- }
-
- pipResp, err := pipPoller.PollUntilDone(ctx, config.DefaultPollUntilDoneOptions)
- if err != nil {
- return fmt.Errorf("failed to create public IP: %w", err)
- }
-
- publicIPID := *pipResp.ID
- toolkit.Logf(ctx, "Created public IP with ID: %s", publicIPID)
-
- firewallName := "abe2e-fw"
- firewall := getFirewall(ctx, location, firewallSubnetID, publicIPID)
- fwPoller, err := config.Azure.AzureFirewall.BeginCreateOrUpdate(ctx, rg, firewallName, *firewall, nil)
- if err != nil {
- return fmt.Errorf("failed to start Firewall creation: %w", err)
- }
- fwResp, err := fwPoller.PollUntilDone(ctx, nil)
- if err != nil {
- return fmt.Errorf("failed to create Firewall: %w", err)
- }
-
- // Get the firewall's private IP address
- var firewallPrivateIP string
- if fwResp.Properties != nil && fwResp.Properties.IPConfigurations != nil && len(fwResp.Properties.IPConfigurations) > 0 {
- if fwResp.Properties.IPConfigurations[0].Properties != nil && fwResp.Properties.IPConfigurations[0].Properties.PrivateIPAddress != nil {
- firewallPrivateIP = *fwResp.Properties.IPConfigurations[0].Properties.PrivateIPAddress
- toolkit.Logf(ctx, "Firewall private IP: %s", firewallPrivateIP)
- }
- }
-
- if firewallPrivateIP == "" {
- return fmt.Errorf("failed to get firewall private IP address")
- }
-
// Add firewall routes to the existing AKS route table using individual
// route operations. This avoids replacing the entire table (which would
// race with cloud-provider-azure pod route updates) and preserves the
@@ -418,7 +336,7 @@ func addFirewallRules(
{
Name: to.Ptr("vnet-local"),
Properties: &armnetwork.RoutePropertiesFormat{
- AddressPrefix: to.Ptr("10.224.0.0/16"),
+ AddressPrefix: to.Ptr(vnet.addressPrefix),
NextHopType: to.Ptr(armnetwork.RouteNextHopTypeVnetLocal),
},
},
@@ -434,7 +352,7 @@ func addFirewallRules(
for _, route := range firewallRoutes {
toolkit.Logf(ctx, "Adding route %q to AKS route table %q", *route.Name, aksRTName)
- poller, err := config.Azure.Routes.BeginCreateOrUpdate(ctx, rg, aksRTName, *route.Name, route, nil)
+ poller, err := config.Azure.Routes.BeginCreateOrUpdate(ctx, nodeRG, aksRTName, *route.Name, route, nil)
if err != nil {
return fmt.Errorf("failed to start adding route %q: %w", *route.Name, err)
}
@@ -451,7 +369,7 @@ func addFirewallRules(
func ensureFirewallRouteTable(
ctx context.Context,
clusterModel *armcontainerservice.ManagedCluster,
- vnetName string,
+ vnet VNet,
aksSubnet armnetwork.Subnet,
) (string, error) {
if aksSubnet.Properties == nil {
@@ -479,21 +397,30 @@ func ensureFirewallRouteTable(
rg := *clusterModel.Properties.NodeResourceGroup
routeTableName := "abe2e-fw-rt"
toolkit.Logf(ctx, "AKS subnet has no route table; creating dedicated firewall route table %q", routeTableName)
- poller, err := config.Azure.RouteTables.BeginCreateOrUpdate(ctx, rg, routeTableName, armnetwork.RouteTable{
- Location: clusterModel.Location,
- }, nil)
- if err != nil {
- return "", fmt.Errorf("failed to start creating firewall route table %q: %w", routeTableName, err)
- }
- routeTableResp, err := poller.PollUntilDone(ctx, config.DefaultPollUntilDoneOptions)
+
+ var routeTableID *string
+ err := retryOn409(ctx, fmt.Sprintf("creating route table %s", routeTableName), func() error {
+ poller, err := config.Azure.RouteTables.BeginCreateOrUpdate(ctx, rg, routeTableName, armnetwork.RouteTable{
+ Location: clusterModel.Location,
+ }, nil)
+ if err != nil {
+ return fmt.Errorf("failed to start creating firewall route table %q: %w", routeTableName, err)
+ }
+ routeTableResp, err := poller.PollUntilDone(ctx, config.DefaultPollUntilDoneOptions)
+ if err != nil {
+ return fmt.Errorf("failed to create firewall route table %q: %w", routeTableName, err)
+ }
+ routeTableID = routeTableResp.ID
+ return nil
+ })
if err != nil {
- return "", fmt.Errorf("failed to create firewall route table %q: %w", routeTableName, err)
+ return "", err
}
aksSubnet.Properties.RouteTable = &armnetwork.RouteTable{
- ID: routeTableResp.ID,
+ ID: routeTableID,
}
- if err := updateSubnet(ctx, clusterModel, aksSubnet, vnetName); err != nil {
+ if err := updateSubnet(ctx, clusterModel, aksSubnet, vnet); err != nil {
return "", fmt.Errorf("failed to associate firewall route table %q with AKS subnet: %w", routeTableName, err)
}
@@ -512,12 +439,12 @@ func addPrivateAzureContainerRegistry(ctx context.Context, cluster *armcontainer
if err := createPrivateAzureContainerRegistryPullSecret(ctx, cluster, kube, resourceGroupName, isNonAnonymousPull); err != nil {
return fmt.Errorf("create private acr pull secret: %w", err)
}
- vnet, err := getClusterVNet(ctx, *cluster.Properties.NodeResourceGroup)
+ vnet, err := getClusterVNet(ctx, cluster)
if err != nil {
return err
}
- err = addPrivateEndpointForACR(ctx, *cluster.Properties.NodeResourceGroup, config.GetPrivateACRName(isNonAnonymousPull, *cluster.Location), vnet, *cluster.Location)
+ err = addPrivateEndpointForACR(ctx, config.GetPrivateACRName(isNonAnonymousPull, *cluster.Location), vnet, *cluster.Location)
if err != nil {
return err
}
@@ -533,36 +460,55 @@ func addNetworkIsolatedSettings(ctx context.Context, clusterModel *armcontainers
location := *clusterModel.Location
defer toolkit.LogStepCtx(ctx, fmt.Sprintf("Adding network settings for network isolated cluster %s in rg %s", *clusterModel.Name, *clusterModel.Properties.NodeResourceGroup))
- vnet, err := getClusterVNet(ctx, *clusterModel.Properties.NodeResourceGroup)
+ vnet, err := getClusterVNet(ctx, clusterModel)
if err != nil {
return err
}
- subnetId := vnet.subnetId
+
+ // The subnet is long-lived and shared across test runs. Once the NSG is
+ // associated we never need to touch it again. Private endpoints from
+ // previous runs can leave lingering IP prefix allocations on the subnet
+ // that make any PUT fail with InUsePrefixCannotBeDeleted, so we skip
+ // the update entirely when the NSG is already in place.
+ currentSubnet, err := config.Azure.Subnet.Get(ctx, vnet.resourceGroup, vnet.name, vnet.subnetName, nil)
+ if err != nil {
+ return fmt.Errorf("getting subnet %s: %w", vnet.subnetName, err)
+ }
+ if currentSubnet.Properties != nil && currentSubnet.Properties.NetworkSecurityGroup != nil {
+ toolkit.Logf(ctx, "subnet %s already has NSG, skipping update", vnet.subnetName)
+ return nil
+ }
nsgParams, err := networkIsolatedSecurityGroup(location, *clusterModel.Properties.Fqdn)
if err != nil {
return err
}
-
nsg, err := createNetworkIsolatedSecurityGroup(ctx, clusterModel, nsgParams, nil)
if err != nil {
return err
}
- subnetParameters := armnetwork.Subnet{
- ID: to.Ptr(subnetId),
+ if err = updateSubnet(ctx, clusterModel, armnetwork.Subnet{
+ ID: to.Ptr(vnet.subnetId),
Properties: &armnetwork.SubnetPropertiesFormat{
- AddressPrefix: to.Ptr("10.224.0.0/16"),
+ AddressPrefix: currentSubnet.Properties.AddressPrefix,
NetworkSecurityGroup: &armnetwork.SecurityGroup{
ID: nsg.ID,
},
},
- }
- if err = updateSubnet(ctx, clusterModel, subnetParameters, vnet.name); err != nil {
+ }, vnet); err != nil {
+ // After a cluster is GC'd, private endpoint IP prefix allocations can
+ // linger on the subnet. The merge in updateSubnet preserves them, but
+ // if Azure still rejects the PUT we log and continue — the NSG will be
+ // associated on the next run once allocations clear.
+ if strings.Contains(err.Error(), "InUsePrefixCannotBeDeleted") {
+ toolkit.Logf(ctx, "warning: cannot update subnet %s (lingering IP allocations), will retry next run", vnet.subnetName)
+ return nil
+ }
return err
}
- toolkit.Logf(ctx, "updated cluster %s subnet with network isolated cluster settings", *clusterModel.Name)
+ toolkit.Logf(ctx, "updated cluster %s subnet with network isolated settings", *clusterModel.Name)
return nil
}
@@ -609,30 +555,125 @@ func networkIsolatedSecurityGroup(location, clusterFQDN string) (armnetwork.Secu
}, nil
}
-func addPrivateEndpointForACR(ctx context.Context, nodeResourceGroup, privateACRName string, vnet VNet, location string) error {
- toolkit.Logf(ctx, "Checking if private endpoint for private container registry is in rg %s", nodeResourceGroup)
- var err error
- var privateEndpoint *armnetwork.PrivateEndpoint
- privateEndpointName := fmt.Sprintf("PE-for-%s", privateACRName)
- if privateEndpoint, err = createPrivateEndpoint(ctx, nodeResourceGroup, privateEndpointName, privateACRName, vnet, location); err != nil {
- return err
+// ensurePrivateDNSZone creates the private DNS zone and VNet link in the
+// shared RG. This is a VNet-level resource (one per cluster, not per ACR)
+// so it must run once before any private endpoint setup.
+//
+// Old clusters may have created their own privatelink.azurecr.io zones in
+// MC_ RGs and linked our shared VNet to them. A VNet can only link to one
+// zone per namespace, so we clean up stale links first.
+func ensurePrivateDNSZone(ctx context.Context, vnet VNet) (*armprivatedns.PrivateZone, error) {
+ sharedRG := vnet.resourceGroup
+ privateZoneName := "privatelink.azurecr.io"
+
+ if err := cleanupConflictingDNSZoneLinks(ctx, vnet, sharedRG, privateZoneName); err != nil {
+ return nil, fmt.Errorf("cleaning up conflicting DNS zone links: %w", err)
}
- privateZoneName := "privatelink.azurecr.io"
- var privateZone *armprivatedns.PrivateZone
- if privateZone, err = createPrivateZone(ctx, nodeResourceGroup, privateZoneName); err != nil {
- return err
+ zone, err := createPrivateZone(ctx, sharedRG, privateZoneName)
+ if err != nil {
+ return nil, fmt.Errorf("creating private DNS zone: %w", err)
}
- if err = createPrivateDNSLink(ctx, vnet, nodeResourceGroup, privateZoneName); err != nil {
- return err
+ if err = createPrivateDNSLink(ctx, vnet, sharedRG, privateZoneName); err != nil {
+ return nil, fmt.Errorf("creating private DNS VNet link: %w", err)
+ }
+
+ return zone, nil
+}
+
+// cleanupConflictingDNSZoneLinks finds privatelink.azurecr.io zones in other
+// resource groups that have a VNet link pointing to our shared VNet, and
+// deletes those links. This prevents "overlapping namespaces" errors.
+func cleanupConflictingDNSZoneLinks(ctx context.Context, vnet VNet, sharedRG, privateZoneName string) error {
+ sharedVNetID := fmt.Sprintf("/subscriptions/%s/resourceGroups/%s/providers/Microsoft.Network/virtualNetworks/%s",
+ config.Config.SubscriptionID, vnet.resourceGroup, vnet.name)
+
+ subPager := config.Azure.PrivateZonesClient.NewListPager(nil)
+ for subPager.More() {
+ page, err := subPager.NextPage(ctx)
+ if err != nil {
+ return fmt.Errorf("listing private DNS zones: %w", err)
+ }
+ for _, zone := range page.Value {
+ if zone.Name == nil || *zone.Name != privateZoneName {
+ continue
+ }
+ zoneRG := resourceGroupFromID(*zone.ID)
+ if strings.EqualFold(zoneRG, sharedRG) {
+ continue
+ }
+ if err := deleteVNetLinkIfPointsToSharedVNet(ctx, zoneRG, privateZoneName, sharedVNetID); err != nil {
+ toolkit.Logf(ctx, "warning: failed to clean up DNS link in %s: %v", zoneRG, err)
+ }
+ }
}
+ return nil
+}
- if err = addRecordSetToPrivateDNSZone(ctx, privateEndpoint, nodeResourceGroup, privateZoneName); err != nil {
+// deleteVNetLinkIfPointsToSharedVNet checks all VNet links on a DNS zone and
+// deletes any that point to our shared VNet.
+func deleteVNetLinkIfPointsToSharedVNet(ctx context.Context, zoneRG, zoneName, sharedVNetID string) error {
+ linkPager := config.Azure.VirutalNetworkLinksClient.NewListPager(zoneRG, zoneName, nil)
+ for linkPager.More() {
+ page, err := linkPager.NextPage(ctx)
+ if err != nil {
+ return fmt.Errorf("listing VNet links in %s/%s: %w", zoneRG, zoneName, err)
+ }
+ for _, link := range page.Value {
+ if link.Properties == nil || link.Properties.VirtualNetwork == nil || link.Properties.VirtualNetwork.ID == nil {
+ continue
+ }
+ if !strings.EqualFold(*link.Properties.VirtualNetwork.ID, sharedVNetID) {
+ continue
+ }
+ toolkit.Logf(ctx, "deleting conflicting DNS zone link %s in %s/%s (points to shared VNet)", *link.Name, zoneRG, zoneName)
+ poller, err := config.Azure.VirutalNetworkLinksClient.BeginDelete(ctx, zoneRG, zoneName, *link.Name, nil)
+ if err != nil {
+ return fmt.Errorf("deleting VNet link %s: %w", *link.Name, err)
+ }
+ if _, err = poller.PollUntilDone(ctx, nil); err != nil {
+ return fmt.Errorf("waiting for VNet link %s deletion: %w", *link.Name, err)
+ }
+ }
+ }
+ return nil
+}
+
+// resourceGroupFromID extracts the resource group name from an Azure resource ID.
+func resourceGroupFromID(id string) string {
+ parts := strings.Split(id, "/")
+ for i, part := range parts {
+ if strings.EqualFold(part, "resourceGroups") && i+1 < len(parts) {
+ return parts[i+1]
+ }
+ }
+ return ""
+}
+
+func addPrivateEndpointForACR(ctx context.Context, privateACRName string, vnet VNet, location string) error {
+ sharedRG := vnet.resourceGroup
+ privateZoneName := "privatelink.azurecr.io"
+
+ // PEs live in the shared RG on the PE subnet so all clusters share one
+ // PE per ACR with a single IP address → single DNS A record.
+ peSubnetID := fmt.Sprintf("/subscriptions/%s/resourceGroups/%s/providers/Microsoft.Network/virtualNetworks/%s/subnets/%s",
+ config.Config.SubscriptionID, sharedRG, vnet.name, PESubnetName)
+ peVnet := VNet{
+ resourceGroup: sharedRG,
+ name: vnet.name,
+ subnetName: PESubnetName,
+ subnetId: peSubnetID,
+ }
+
+ privateEndpointName := fmt.Sprintf("PE-for-%s", privateACRName)
+ toolkit.Logf(ctx, "ensuring private endpoint %s in shared RG %s", privateEndpointName, sharedRG)
+ privateEndpoint, err := createPrivateEndpoint(ctx, sharedRG, privateEndpointName, privateACRName, peVnet, location)
+ if err != nil {
return err
}
- if err = addDNSZoneGroup(ctx, privateZone, nodeResourceGroup, privateZoneName, *privateEndpoint.Name); err != nil {
+ if err = addRecordSetToPrivateDNSZone(ctx, privateEndpoint, sharedRG, sharedRG, privateZoneName); err != nil {
return err
}
return nil
@@ -852,23 +893,32 @@ func createPrivateEndpoint(ctx context.Context, nodeResourceGroup, privateEndpoi
CustomDNSConfigs: []*armnetwork.CustomDNSConfigPropertiesFormat{},
},
}
- poller, err := config.Azure.PrivateEndpointClient.BeginCreateOrUpdate(
- ctx,
- nodeResourceGroup,
- privateEndpointName,
- peParams,
- nil,
- )
- if err != nil {
- return nil, fmt.Errorf("failed to create private endpoint in BeginCreateOrUpdate: %w", err)
- }
- resp, err := poller.PollUntilDone(ctx, nil)
+
+ var result armnetwork.PrivateEndpoint
+ err = retryOn409(ctx, fmt.Sprintf("creating private endpoint %s", privateEndpointName), func() error {
+ poller, err := config.Azure.PrivateEndpointClient.BeginCreateOrUpdate(
+ ctx,
+ nodeResourceGroup,
+ privateEndpointName,
+ peParams,
+ nil,
+ )
+ if err != nil {
+ return fmt.Errorf("failed to create private endpoint in BeginCreateOrUpdate: %w", err)
+ }
+ resp, err := poller.PollUntilDone(ctx, nil)
+ if err != nil {
+ return fmt.Errorf("failed to create private endpoint in polling: %w", err)
+ }
+ result = resp.PrivateEndpoint
+ return nil
+ })
if err != nil {
- return nil, fmt.Errorf("failed to create private endpoint in polling: %w", err)
+ return nil, err
}
- toolkit.Logf(ctx, "Private Endpoint created or updated with ID: %s", *resp.ID)
- return &resp.PrivateEndpoint, nil
+ toolkit.Logf(ctx, "Private Endpoint created or updated with ID: %s", *result.ID)
+ return &result, nil
}
func createPrivateZone(ctx context.Context, nodeResourceGroup, privateZoneName string) (*armprivatedns.PrivateZone, error) {
@@ -929,11 +979,11 @@ func waitForPrivateZone(ctx context.Context, nodeResourceGroup, privateZoneName
return zone, nil
}
-func createPrivateDNSLink(ctx context.Context, vnet VNet, nodeResourceGroup, privateZoneName string) error {
+func createPrivateDNSLink(ctx context.Context, vnet VNet, resourceGroup, privateZoneName string) error {
networkLinkName := "link-ABE2ETests"
_, err := config.Azure.VirutalNetworkLinksClient.Get(
ctx,
- nodeResourceGroup,
+ resourceGroup,
privateZoneName,
networkLinkName,
nil,
@@ -944,7 +994,11 @@ func createPrivateDNSLink(ctx context.Context, vnet VNet, nodeResourceGroup, pri
return nil
}
- vnetForId, err := config.Azure.VNet.Get(ctx, nodeResourceGroup, vnet.name, nil)
+ vnetRG := vnet.resourceGroup
+ if vnetRG == "" {
+ vnetRG = resourceGroup
+ }
+ vnetForId, err := config.Azure.VNet.Get(ctx, vnetRG, vnet.name, nil)
if err != nil {
return fmt.Errorf("failed to get vnet: %w", err)
}
@@ -959,7 +1013,7 @@ func createPrivateDNSLink(ctx context.Context, vnet VNet, nodeResourceGroup, pri
}
poller, err := config.Azure.VirutalNetworkLinksClient.BeginCreateOrUpdate(
ctx,
- nodeResourceGroup,
+ resourceGroup,
privateZoneName,
networkLinkName,
linkParams,
@@ -971,7 +1025,7 @@ func createPrivateDNSLink(ctx context.Context, vnet VNet, nodeResourceGroup, pri
if errors.As(err, &respErr) && respErr.StatusCode == 409 {
toolkit.Logf(ctx, "Virtual network link creation conflict (409), waiting for completion")
return wait.PollUntilContextTimeout(ctx, 5*time.Second, 2*time.Minute, true, func(ctx context.Context) (bool, error) {
- _, err := config.Azure.VirutalNetworkLinksClient.Get(ctx, nodeResourceGroup, privateZoneName, networkLinkName, nil)
+ _, err := config.Azure.VirutalNetworkLinksClient.Get(ctx, resourceGroup, privateZoneName, networkLinkName, nil)
if err != nil {
var respErr *azcore.ResponseError
if errors.As(err, &respErr) && respErr.StatusCode == 404 {
@@ -993,73 +1047,60 @@ func createPrivateDNSLink(ctx context.Context, vnet VNet, nodeResourceGroup, pri
return nil
}
-func addRecordSetToPrivateDNSZone(ctx context.Context, privateEndpoint *armnetwork.PrivateEndpoint, nodeResourceGroup, privateZoneName string) error {
- for i, dnsConfigPtr := range privateEndpoint.Properties.CustomDNSConfigs {
- var ipAddresses []string
- if dnsConfigPtr == nil {
- return fmt.Errorf("CustomDNSConfigs[%d] is nil", i)
- }
-
- // get the ip addresses
- dnsConfig := *dnsConfigPtr
- if len(dnsConfig.IPAddresses) == 0 {
- return fmt.Errorf("CustomDNSConfigs[%d].IPAddresses is nil or empty", i)
- }
- for _, ipPtr := range dnsConfig.IPAddresses {
- ipAddresses = append(ipAddresses, *ipPtr)
- }
- if len(ipAddresses) == 0 {
- return fmt.Errorf("IPAddresses is empty")
- }
+// addRecordSetToPrivateDNSZone creates A records in the private DNS zone for a
+// private endpoint. It reads the PE's NIC to get the actual private IPs (since
+// CustomDNSConfigs is unreliable when DNS zone groups have been used).
+func addRecordSetToPrivateDNSZone(ctx context.Context, privateEndpoint *armnetwork.PrivateEndpoint, peResourceGroup, dnsZoneResourceGroup, privateZoneName string) error {
+ if privateEndpoint.Properties == nil || len(privateEndpoint.Properties.NetworkInterfaces) == 0 {
+ return fmt.Errorf("private endpoint has no network interfaces")
+ }
- aRecords := make([]*armprivatedns.ARecord, len(ipAddresses))
- for i, ip := range ipAddresses {
- aRecords[i] = &armprivatedns.ARecord{IPv4Address: &ip}
- }
- ttl := int64(10)
- aRecordSet := armprivatedns.RecordSet{
- Properties: &armprivatedns.RecordSetProperties{
- TTL: &ttl,
- ARecords: aRecords,
- },
- }
- _, err := config.Azure.RecordSetClient.CreateOrUpdate(ctx, nodeResourceGroup, privateZoneName, armprivatedns.RecordTypeA, *dnsConfig.Fqdn, aRecordSet, nil)
- if err != nil {
- return fmt.Errorf("failed to create record set: %w", err)
- }
+ nicID := *privateEndpoint.Properties.NetworkInterfaces[0].ID
+ nicName := nicID[strings.LastIndex(nicID, "/")+1:]
+ nic, err := config.Azure.NetworkInterfaces.Get(ctx, peResourceGroup, nicName, nil)
+ if err != nil {
+ return fmt.Errorf("getting PE NIC %s: %w", nicName, err)
}
- toolkit.Logf(ctx, "Record Set created or updated")
- return nil
-}
+ // Each NIC IP config has a private IP and an associated FQDN from the
+ // PE's privateLinkServiceConnections. Create one A record per FQDN.
+ for _, ipConfig := range nic.Properties.IPConfigurations {
+ if ipConfig.Properties == nil || ipConfig.Properties.PrivateIPAddress == nil {
+ continue
+ }
+ ip := *ipConfig.Properties.PrivateIPAddress
-func addDNSZoneGroup(ctx context.Context, privateZone *armprivatedns.PrivateZone, nodeResourceGroup, privateZoneName, endpointName string) error {
- groupName := strings.Replace(privateZoneName, ".", "-", -1) // replace . with -
- _, err := config.Azure.PrivateDNSZoneGroup.Get(ctx, nodeResourceGroup, endpointName, groupName, nil)
- if err == nil {
- return nil
- }
- dnsZonegroup := armnetwork.PrivateDNSZoneGroup{
- Name: to.Ptr(fmt.Sprintf("%s/default", privateZoneName)),
- Properties: &armnetwork.PrivateDNSZoneGroupPropertiesFormat{
- PrivateDNSZoneConfigs: []*armnetwork.PrivateDNSZoneConfig{{
- Name: to.Ptr(groupName),
- Properties: &armnetwork.PrivateDNSZonePropertiesFormat{
- PrivateDNSZoneID: privateZone.ID,
+ // The PE's CustomDNSConfigs or PrivateLinkServiceConnections tell us the
+ // FQDN, but they may be empty. Use the IP config's
+ // PrivateLinkConnectionProperties for the FQDN list.
+ if ipConfig.Properties.PrivateLinkConnectionProperties == nil || len(ipConfig.Properties.PrivateLinkConnectionProperties.Fqdns) == 0 {
+ continue
+ }
+ for _, fqdn := range ipConfig.Properties.PrivateLinkConnectionProperties.Fqdns {
+ if fqdn == nil {
+ continue
+ }
+ // The NIC returns FQDNs like "myacr.azurecr.io" or
+ // "myacr.westus3.data.azurecr.io". The private DNS zone is
+ // "privatelink.azurecr.io", so the record name is everything
+ // before ".azurecr.io":
+ // "myacr.azurecr.io" → record "myacr"
+ // "myacr.westus3.data.azurecr.io" → record "myacr.westus3.data"
+ // Azure's CNAME chain maps X.azurecr.io → X.privatelink.azurecr.io
+ recordName := strings.TrimSuffix(*fqdn, ".azurecr.io")
+ aRecordSet := armprivatedns.RecordSet{
+ Properties: &armprivatedns.RecordSetProperties{
+ TTL: to.Ptr[int64](10),
+ ARecords: []*armprivatedns.ARecord{{IPv4Address: to.Ptr(ip)}},
},
- }},
- },
- }
- dnsZoneResp, err := config.Azure.PrivateDNSZoneGroup.BeginCreateOrUpdate(ctx, nodeResourceGroup, endpointName, groupName, dnsZonegroup, nil)
- if err != nil {
- return fmt.Errorf("failed to create private dns zone group in BeginCreateOrUpdate: %w", err)
- }
- _, err = dnsZoneResp.PollUntilDone(ctx, nil)
- if err != nil {
- return fmt.Errorf("failed to create private dns zone group in polling: %w", err)
+ }
+ _, err := config.Azure.RecordSetClient.CreateOrUpdate(ctx, dnsZoneResourceGroup, privateZoneName, armprivatedns.RecordTypeA, recordName, aRecordSet, nil)
+ if err != nil {
+ return fmt.Errorf("failed to create A record %s → %s: %w", recordName, ip, err)
+ }
+ toolkit.Logf(ctx, "DNS A record: %s.%s → %s", recordName, privateZoneName, ip)
+ }
}
-
- toolkit.Logf(ctx, "Private DNS Zone Group created or updated with ID")
return nil
}
@@ -1118,14 +1159,44 @@ func createNetworkIsolatedSecurityGroup(ctx context.Context, cluster *armcontain
return &nsg, nil
}
-func updateSubnet(ctx context.Context, cluster *armcontainerservice.ManagedCluster, subnetParameters armnetwork.Subnet, vnetName string) error {
- poller, err := config.Azure.Subnet.BeginCreateOrUpdate(ctx, *cluster.Properties.NodeResourceGroup, vnetName, config.Config.DefaultSubnetName, subnetParameters, nil)
- if err != nil {
+// updateSubnet reads the current subnet, merges in the desired fields, and PUTs
+// the result. This read-modify-write avoids stripping existing properties like
+// private endpoint allocations, service endpoints, or delegations — a bare PUT
+// with only the desired fields causes Azure to return InUsePrefixCannotBeDeleted.
+func updateSubnet(ctx context.Context, cluster *armcontainerservice.ManagedCluster, desired armnetwork.Subnet, vnet VNet) error {
+ return retryOn409(ctx, fmt.Sprintf("updating subnet %s", vnet.subnetName), func() error {
+ existing, err := config.Azure.Subnet.Get(ctx, vnet.resourceGroup, vnet.name, vnet.subnetName, nil)
+ if err != nil {
+ return fmt.Errorf("getting subnet %s: %w", vnet.subnetName, err)
+ }
+
+ merged := existing.Subnet
+ mergeSubnetProperties(merged.Properties, desired.Properties)
+
+ poller, err := config.Azure.Subnet.BeginCreateOrUpdate(ctx, vnet.resourceGroup, vnet.name, vnet.subnetName, merged, nil)
+ if err != nil {
+ return err
+ }
+ _, err = poller.PollUntilDone(ctx, config.DefaultPollUntilDoneOptions)
return err
+ })
+}
+
+// mergeSubnetProperties overwrites only the fields that are set in desired.
+func mergeSubnetProperties(dst, desired *armnetwork.SubnetPropertiesFormat) {
+ if dst == nil || desired == nil {
+ return
}
- _, err = poller.PollUntilDone(ctx, config.DefaultPollUntilDoneOptions)
- if err != nil {
- return err
+ if desired.AddressPrefix != nil {
+ dst.AddressPrefix = desired.AddressPrefix
+ }
+ if desired.NetworkSecurityGroup != nil {
+ dst.NetworkSecurityGroup = desired.NetworkSecurityGroup
+ }
+ if desired.RouteTable != nil {
+ dst.RouteTable = desired.RouteTable
+ }
+ if desired.ServiceEndpoints != nil {
+ dst.ServiceEndpoints = desired.ServiceEndpoints
}
- return nil
}
diff --git a/e2e/cache.go b/e2e/cache.go
index 1b07d383815..6f411e6c2d8 100644
--- a/e2e/cache.go
+++ b/e2e/cache.go
@@ -146,7 +146,7 @@ var ClusterLatestKubernetesVersion = cachedFunc(clusterLatestKubernetesVersion)
// clusterLatestKubernetesVersion creates a cluster with the latest available Kubernetes version
func clusterLatestKubernetesVersion(ctx context.Context, request ClusterRequest) (*Cluster, error) {
- model, err := getLatestKubernetesVersionClusterModel(ctx, "abe2e-latest-kubernetes-version", request.Location, request.K8sSystemPoolSKU)
+ model, err := getLatestKubernetesVersionClusterModel(ctx, "abe2e-latest-kubernetes-version-v2", request.Location, request.K8sSystemPoolSKU)
if err != nil {
return nil, fmt.Errorf("getting latest kubernetes version cluster model: %w", err)
}
@@ -155,51 +155,59 @@ func clusterLatestKubernetesVersion(ctx context.Context, request ClusterRequest)
var ClusterKubenet = cachedFunc(clusterKubenet)
-// clusterKubenet creates a basic cluster using kubenet networking
+// clusterKubenet creates a basic cluster using kubenet networking with shared VNet
func clusterKubenet(ctx context.Context, request ClusterRequest) (*Cluster, error) {
- return prepareCluster(ctx, getKubenetClusterModel("abe2e-kubenet-v4", request.Location, request.K8sSystemPoolSKU), false, false)
+ clusterName := "abe2e-kubenet-v5"
+ model := getKubenetClusterModel(clusterName, request.Location, request.K8sSystemPoolSKU)
+ return prepareCluster(ctx, model, false, false)
}
var ClusterAzureNetwork = cachedFunc(clusterAzureNetwork)
// clusterAzureNetwork creates a cluster with Azure CNI networking
func clusterAzureNetwork(ctx context.Context, request ClusterRequest) (*Cluster, error) {
- return prepareCluster(ctx, getAzureNetworkClusterModel("abe2e-azure-network-v3", request.Location, request.K8sSystemPoolSKU), false, false)
+ model := getAzureNetworkClusterModel("abe2e-azure-network-v4", request.Location, request.K8sSystemPoolSKU)
+ return prepareCluster(ctx, model, false, false)
}
var ClusterAzureBootstrapProfileCache = cachedFunc(clusterAzureBootstrapProfileCache)
// clusterAzureBootstrapProfileCache creates a cluster with bootstrap profile cache but without network isolation
func clusterAzureBootstrapProfileCache(ctx context.Context, request ClusterRequest) (*Cluster, error) {
- return prepareCluster(ctx, getAzureNetworkClusterModel("abe2e-azure-bootstrapprofile-cache-v1", request.Location, request.K8sSystemPoolSKU), false, true)
+ model := getAzureNetworkClusterModel("abe2e-azure-bootstrapprofile-cache-v2", request.Location, request.K8sSystemPoolSKU)
+ return prepareCluster(ctx, model, false, true)
}
var ClusterAzureNetworkIsolated = cachedFunc(clusterAzureNetworkIsolated)
// clusterAzureNetworkIsolated creates a networkisolated Azure network cluster (no internet access)
func clusterAzureNetworkIsolated(ctx context.Context, request ClusterRequest) (*Cluster, error) {
- return prepareCluster(ctx, getAzureNetworkClusterModel("abe2e-azure-networkisolated-v1", request.Location, request.K8sSystemPoolSKU), true, false)
+ model := getAzureNetworkClusterModel("abe2e-azure-networkisolated-v2", request.Location, request.K8sSystemPoolSKU)
+ return prepareCluster(ctx, model, true, false)
}
var ClusterAzureOverlayNetwork = cachedFunc(clusterAzureOverlayNetwork)
// clusterAzureOverlayNetwork creates a cluster with Azure CNI Overlay networking
func clusterAzureOverlayNetwork(ctx context.Context, request ClusterRequest) (*Cluster, error) {
- return prepareCluster(ctx, getAzureOverlayNetworkClusterModel("abe2e-azure-overlay-network-v3", request.Location, request.K8sSystemPoolSKU), false, false)
+ model := getAzureOverlayNetworkClusterModel("abe2e-azure-overlay-network-v4", request.Location, request.K8sSystemPoolSKU)
+ return prepareCluster(ctx, model, false, false)
}
var ClusterAzureOverlayNetworkDualStack = cachedFunc(clusterAzureOverlayNetworkDualStack)
// clusterAzureOverlayNetworkDualStack creates a dual-stack (IPv4+IPv6) Azure CNI Overlay cluster
func clusterAzureOverlayNetworkDualStack(ctx context.Context, request ClusterRequest) (*Cluster, error) {
- return prepareCluster(ctx, getAzureOverlayNetworkDualStackClusterModel("abe2e-azure-overlay-dualstack-v3", request.Location, request.K8sSystemPoolSKU), false, false)
+ model := getAzureOverlayNetworkDualStackClusterModel("abe2e-azure-overlay-dualstack-v4", request.Location, request.K8sSystemPoolSKU)
+ return prepareCluster(ctx, model, false, false)
}
var ClusterCiliumNetwork = cachedFunc(clusterCiliumNetwork)
// clusterCiliumNetwork creates a cluster with Cilium CNI networking
func clusterCiliumNetwork(ctx context.Context, request ClusterRequest) (*Cluster, error) {
- return prepareCluster(ctx, getCiliumNetworkClusterModel("abe2e-cilium-network-v3", request.Location, request.K8sSystemPoolSKU), false, false)
+ model := getCiliumNetworkClusterModel("abe2e-cilium-network-v4", request.Location, request.K8sSystemPoolSKU)
+ return prepareCluster(ctx, model, false, false)
}
// isNotFoundErr checks if an error represents a "not found" response from Azure API
diff --git a/e2e/cluster.go b/e2e/cluster.go
index 506a9923b8d..db4c9daaaca 100644
--- a/e2e/cluster.go
+++ b/e2e/cluster.go
@@ -9,7 +9,6 @@ import (
"fmt"
"net"
"net/http"
- "net/netip"
"strings"
"time"
@@ -21,7 +20,6 @@ import (
"github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/authorization/armauthorization/v3"
"github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute/v7"
"github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/containerservice/armcontainerservice/v8"
- "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/network/armnetwork/v7"
"github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/privatedns/armprivatedns"
"github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/resources/armresources/v3"
"github.com/google/uuid"
@@ -47,6 +45,7 @@ type Cluster struct {
ClusterParams *ClusterParams
Bastion *Bastion
ProxyURL string
+ TenantID string
}
// Returns true if the cluster is configured with Azure CNI
@@ -74,7 +73,24 @@ func prepareCluster(ctx context.Context, clusterModel *armcontainerservice.Manag
ctx, cancel := context.WithTimeout(ctx, config.Config.TestTimeoutCluster)
defer cancel()
+ infra, err := configureSharedVNet(ctx, clusterModel, *clusterModel.Location)
+ if err != nil {
+ return nil, err
+ }
+
clusterModel.Name = to.Ptr(fmt.Sprintf("%s-%s", *clusterModel.Name, hash(clusterModel)))
+ // If configureSharedVNet marked this model, create the per-cluster subnet
+ // now that we have the final hashed name, with an auto-allocated CIDR.
+ subnetID, err := CachedEnsureClusterSubnet(ctx, ClusterSubnetRequest{
+ Location: *clusterModel.Location,
+ ClusterName: *clusterModel.Name,
+ })
+ if err != nil {
+ return nil, fmt.Errorf("ensuring cluster subnet: %w", err)
+ }
+ for _, pool := range clusterModel.Properties.AgentPoolProfiles {
+ pool.VnetSubnetID = to.Ptr(subnetID)
+ }
cluster, err := getOrCreateCluster(ctx, clusterModel)
if err != nil {
@@ -92,7 +108,7 @@ func prepareCluster(ctx context.Context, clusterModel *armcontainerservice.Manag
dag.Run(g, func(ctx context.Context) error { return ensureMaintenanceConfiguration(ctx, cluster) })
subnet := dag.Go(g, func(ctx context.Context) (string, error) { return getClusterSubnetID(ctx, cluster) })
vNet := dag.Go(g, func(ctx context.Context) (VNet, error) {
- return getClusterVNet(ctx, *cluster.Properties.NodeResourceGroup)
+ return getClusterVNet(ctx, cluster)
})
kube := dag.Go(g, func(ctx context.Context) (*Kubeclient, error) { return getClusterKubeClient(ctx, cluster) })
identity := dag.Go(g, func(ctx context.Context) (*armcontainerservice.UserAssignedIdentity, error) {
@@ -106,16 +122,29 @@ func prepareCluster(ctx context.Context, clusterModel *armcontainerservice.Manag
// objects whose backing VMSS no longer exist.
var networkDeps []dag.Dep
if !isNetworkIsolated {
- networkDeps = append(networkDeps, dag.Run(g, func(ctx context.Context) error { return addFirewallRules(ctx, cluster) }, bastion))
+ networkDeps = append(networkDeps, dag.Run(g, func(ctx context.Context) error { return addFirewallRules(ctx, infra, cluster) }, bastion))
}
if isNetworkIsolated {
networkDeps = append(networkDeps, dag.Run(g, func(ctx context.Context) error { return addNetworkIsolatedSettings(ctx, cluster) }, bastion))
}
dag.Run1(g, kube, func(ctx context.Context, k *Kubeclient) error { return collectGarbageVMSS(ctx, cluster, k) }, networkDeps...)
needACR := isNetworkIsolated || attachPrivateAcr
- acrNonAnon := dag.Run2(g, kube, identity, addACR(cluster, needACR, true))
- acrAnon := dag.Run2(g, kube, identity, addACR(cluster, needACR, false))
- debugDeps := append([]dag.Dep{acrNonAnon, acrAnon}, networkDeps...)
+
+ // The private DNS zone and VNet link must exist before any PE is created.
+ // Create them once as a dependency for both ACR tasks.
+ var acrNonAnon, acrAnon dag.Dep
+ if needACR {
+ dnsReady := dag.Run1(g, vNet, func(ctx context.Context, v VNet) error {
+ _, err := ensurePrivateDNSZone(ctx, v)
+ return err
+ }, bastion)
+ acrNonAnon = dag.Run2(g, kube, identity, addACR(cluster, true), dnsReady)
+ acrAnon = dag.Run2(g, kube, identity, addACR(cluster, false), dnsReady)
+ }
+ debugDeps := append(networkDeps[:0:0], networkDeps...)
+ if acrNonAnon != nil {
+ debugDeps = append(debugDeps, acrNonAnon, acrAnon)
+ }
proxyURL := dag.Go1(g, kube, func(ctx context.Context, k *Kubeclient) (string, error) {
if err := k.EnsureDebugDaemonsets(ctx, isNetworkIsolated, config.GetPrivateACRName(true, *cluster.Location)); err != nil {
return "", err
@@ -125,11 +154,9 @@ func prepareCluster(ctx context.Context, clusterModel *armcontainerservice.Manag
}
return k.GetProxyURL(ctx)
}, debugDeps...)
- if !isNetworkIsolated {
- dag.Run(g, func(ctx context.Context) error {
- return setupPrivateDNSForAPIServer(ctx, cluster)
- })
- }
+ dag.Run(g, func(ctx context.Context) error {
+ return setupPrivateDNSForAPIServer(ctx, vNet.MustGet().resourceGroup, cluster)
+ })
extract := dag.Go1(g, kube, extractClusterParams(cluster))
if err := g.Wait(); err != nil {
@@ -144,14 +171,12 @@ func prepareCluster(ctx context.Context, clusterModel *armcontainerservice.Manag
ClusterParams: extract.MustGet(),
Bastion: bastion.MustGet(),
ProxyURL: proxyURL.MustGet(),
+ TenantID: infra.TenantID,
}, nil
}
-func addACR(cluster *armcontainerservice.ManagedCluster, needACR, isNonAnonymousPull bool) func(context.Context, *Kubeclient, *armcontainerservice.UserAssignedIdentity) error {
+func addACR(cluster *armcontainerservice.ManagedCluster, isNonAnonymousPull bool) func(context.Context, *Kubeclient, *armcontainerservice.UserAssignedIdentity) error {
return func(ctx context.Context, k *Kubeclient, id *armcontainerservice.UserAssignedIdentity) error {
- if !needACR {
- return nil
- }
return addPrivateAzureContainerRegistry(ctx, cluster, k, id, isNonAnonymousPull)
}
}
@@ -513,249 +538,41 @@ func createNewMaintenanceConfiguration(ctx context.Context, cluster *armcontaine
}
func getOrCreateBastion(ctx context.Context, cluster *armcontainerservice.ManagedCluster) (*Bastion, error) {
- nodeRG := *cluster.Properties.NodeResourceGroup
- bastionName := fmt.Sprintf("%s-bastion", *cluster.Name)
-
- existing, err := config.Azure.BastionHosts.Get(ctx, nodeRG, bastionName, nil)
- var azErr *azcore.ResponseError
- if errors.As(err, &azErr) && azErr.StatusCode == http.StatusNotFound {
- return createNewBastion(ctx, cluster)
- }
- if err != nil {
- return nil, fmt.Errorf("failed to get bastion %q in rg %q: %w", bastionName, nodeRG, err)
- }
-
- return NewBastion(config.Azure.Credential, config.Config.SubscriptionID, nodeRG, *existing.BastionHost.Properties.DNSName), nil
-}
-
-func createNewBastion(ctx context.Context, cluster *armcontainerservice.ManagedCluster) (*Bastion, error) {
- nodeRG := *cluster.Properties.NodeResourceGroup
location := *cluster.Location
- bastionName := fmt.Sprintf("%s-bastion", *cluster.Name)
- defer toolkit.LogStepCtxf(ctx, "creating bastion %s", bastionName)()
- publicIPName := fmt.Sprintf("%s-bastion-pip", *cluster.Name)
- publicIPName = sanitizeAzureResourceName(publicIPName)
-
- vnet, err := getClusterVNet(ctx, nodeRG)
+ sharedRG := config.ResourceGroupName(location)
+ sharedBastion, err := config.Azure.BastionHosts.Get(ctx, sharedRG, SharedBastionName, nil)
if err != nil {
- return nil, fmt.Errorf("get cluster vnet in rg %q: %w", nodeRG, err)
- }
-
- // Azure Bastion requires a dedicated subnet named AzureBastionSubnet. Standard SKU (required for
- // native client support/tunneling) requires at least a /26.
- bastionSubnetName := "AzureBastionSubnet"
- bastionSubnetPrefix := "10.226.0.0/26"
- if _, err := netip.ParsePrefix(bastionSubnetPrefix); err != nil {
- return nil, fmt.Errorf("invalid bastion subnet prefix %q: %w", bastionSubnetPrefix, err)
- }
-
- var bastionSubnetID string
- var bastionSubnet armnetwork.SubnetsClientGetResponse
- var subnetGetErr error
- // Retry the subnet GET with a per-call timeout to tolerate ARM hangs.
- // Without this, a single unresponsive GET consumes the entire 20-minute cluster prep budget.
- err = wait.PollUntilContextTimeout(ctx, 5*time.Second, 2*time.Minute, true, func(ctx context.Context) (bool, error) {
- callCtx, cancel := context.WithTimeout(ctx, 30*time.Second)
- defer cancel()
- bastionSubnet, subnetGetErr = config.Azure.Subnet.Get(callCtx, nodeRG, vnet.name, bastionSubnetName, nil)
- if subnetGetErr == nil {
- return true, nil
+ if !isNotFoundError(err) {
+ return nil, fmt.Errorf("checking shared bastion %s in %s: %w", SharedBastionName, sharedRG, err)
}
- var subnetAzErr *azcore.ResponseError
- if errors.As(subnetGetErr, &subnetAzErr) && subnetAzErr.StatusCode == http.StatusNotFound {
- return true, nil // 404 is expected — will create below
+ toolkit.Logf(ctx, "shared bastion not found, recreating")
+ dnsName, createErr := ensureSharedBastion(ctx, sharedRG, location)
+ if createErr != nil {
+ return nil, fmt.Errorf("recreating shared bastion: %w", createErr)
}
- toolkit.Logf(ctx, "transient error getting subnet %q (retrying): %v", bastionSubnetName, subnetGetErr)
- return false, nil
- })
- if err != nil {
- return nil, fmt.Errorf("get subnet %q in vnet %q rg %q: retries exhausted: %w (last subnet error: %v)", bastionSubnetName, vnet.name, nodeRG, err, subnetGetErr)
+ return NewBastion(config.Azure.Credential, config.Config.SubscriptionID, sharedRG, dnsName), nil
}
-
- if subnetGetErr != nil {
- // 404 — need to create
- toolkit.Logf(ctx, "creating subnet %s in VNet %s (rg %s)", bastionSubnetName, vnet.name, nodeRG)
- subnetParams := armnetwork.Subnet{
- Properties: &armnetwork.SubnetPropertiesFormat{
- AddressPrefix: to.Ptr(bastionSubnetPrefix),
- },
- }
- subnetPoller, err := config.Azure.Subnet.BeginCreateOrUpdate(ctx, nodeRG, vnet.name, bastionSubnetName, subnetParams, nil)
- if err != nil {
- return nil, fmt.Errorf("failed to start creating bastion subnet: %w", err)
- }
- bastionSubnet, err := subnetPoller.PollUntilDone(ctx, config.DefaultPollUntilDoneOptions)
- if err != nil {
- return nil, fmt.Errorf("failed to create bastion subnet: %w", err)
- }
- bastionSubnetID = *bastionSubnet.ID
- } else {
- bastionSubnetID = *bastionSubnet.ID
- }
-
- // Public IP for Bastion
- pipParams := armnetwork.PublicIPAddress{
- Location: to.Ptr(location),
- SKU: &armnetwork.PublicIPAddressSKU{
- Name: to.Ptr(armnetwork.PublicIPAddressSKUNameStandard),
- },
- Properties: &armnetwork.PublicIPAddressPropertiesFormat{
- PublicIPAllocationMethod: to.Ptr(armnetwork.IPAllocationMethodStatic),
- },
- }
-
- toolkit.Logf(ctx, "creating bastion public IP %s (rg %s)", publicIPName, nodeRG)
- pipPoller, err := config.Azure.PublicIPAddresses.BeginCreateOrUpdate(ctx, nodeRG, publicIPName, pipParams, nil)
- if err != nil {
- return nil, fmt.Errorf("failed to start creating bastion public IP: %w", err)
- }
- pipResp, err := pipPoller.PollUntilDone(ctx, config.DefaultPollUntilDoneOptions)
- if err != nil {
- return nil, fmt.Errorf("failed to create bastion public IP: %w", err)
- }
- if pipResp.ID == nil {
- return nil, fmt.Errorf("bastion public IP response missing ID")
- }
-
- bastionHost := armnetwork.BastionHost{
- Location: to.Ptr(location),
- SKU: &armnetwork.SKU{
- Name: to.Ptr(armnetwork.BastionHostSKUNameStandard),
- },
- Properties: &armnetwork.BastionHostPropertiesFormat{
- // Native client support is enabled via tunneling.
- EnableTunneling: to.Ptr(true),
- IPConfigurations: []*armnetwork.BastionHostIPConfiguration{
- {
- Name: to.Ptr("bastion-ipcfg"),
- Properties: &armnetwork.BastionHostIPConfigurationPropertiesFormat{
- Subnet: &armnetwork.SubResource{
- ID: to.Ptr(bastionSubnetID),
- },
- PublicIPAddress: &armnetwork.SubResource{
- ID: pipResp.ID,
- },
- },
- },
- },
- },
- }
-
- toolkit.Logf(ctx, "creating bastion %s (native client/tunneling enabled) in rg %s", bastionName, nodeRG)
- bastionPoller, err := config.Azure.BastionHosts.BeginCreateOrUpdate(ctx, nodeRG, bastionName, bastionHost, nil)
- if err != nil {
- return nil, fmt.Errorf("failed to start creating bastion: %w", err)
- }
- resp, err := bastionPoller.PollUntilDone(ctx, config.DefaultPollUntilDoneOptions)
- if err != nil {
- return nil, fmt.Errorf("failed to create bastion: %w", err)
- }
-
- bastion := NewBastion(config.Azure.Credential, config.Config.SubscriptionID, nodeRG, *resp.BastionHost.Properties.DNSName)
-
- if err := verifyBastion(ctx, cluster, bastion); err != nil {
- return nil, fmt.Errorf("failed to verify bastion: %w", err)
- }
- return bastion, nil
-}
-
-func verifyBastion(ctx context.Context, cluster *armcontainerservice.ManagedCluster, bastion *Bastion) error {
- nodeRG := *cluster.Properties.NodeResourceGroup
- vmssName, err := getSystemPoolVMSSName(ctx, cluster)
- if err != nil {
- return err
- }
-
- var vmssVM *armcompute.VirtualMachineScaleSetVM
- pager := config.Azure.VMSSVM.NewListPager(nodeRG, vmssName, nil)
- if pager.More() {
- page, err := pager.NextPage(ctx)
- if err != nil {
- return fmt.Errorf("list vmss vms for %q in rg %q: %w", vmssName, nodeRG, err)
- }
- if len(page.Value) > 0 {
- vmssVM = page.Value[0]
- }
- }
-
- vmPrivateIP, err := getPrivateIPFromVMSSVM(ctx, nodeRG, vmssName, *vmssVM.InstanceID)
-
- ctx, cancel := context.WithCancel(ctx)
- defer cancel()
-
- sshClient, err := DialSSHOverBastion(ctx, bastion, vmPrivateIP, config.SysSSHPrivateKey)
- if err != nil {
- return err
- }
-
- defer sshClient.Close()
-
- result, err := runSSHCommandWithPrivateKeyFile(ctx, sshClient, "uname -a", false)
- if err != nil {
- return err
- }
- if strings.Contains(result.stdout, vmssName) {
- return nil
- }
- return fmt.Errorf("Executed ssh on wrong VM, Expected %s: %s", vmssName, result.stdout)
-}
-
-func getSystemPoolVMSSName(ctx context.Context, cluster *armcontainerservice.ManagedCluster) (string, error) {
- nodeRG := *cluster.Properties.NodeResourceGroup
- var systemPoolName string
- for _, pool := range cluster.Properties.AgentPoolProfiles {
- if strings.EqualFold(string(*pool.Mode), "System") {
- systemPoolName = *pool.Name
- }
- }
- pager := config.Azure.VMSS.NewListPager(nodeRG, nil)
- if pager.More() {
- page, err := pager.NextPage(ctx)
- if err != nil {
- return "", fmt.Errorf("list vmss in rg %q: %w", nodeRG, err)
- }
- for _, vmss := range page.Value {
- if strings.Contains(strings.ToLower(*vmss.Name), strings.ToLower(systemPoolName)) {
- return *vmss.Name, nil
- }
- }
- }
- return "", fmt.Errorf("no matching VMSS found for system pool %q in rg %q", systemPoolName, nodeRG)
-}
-
-func sanitizeAzureResourceName(name string) string {
- // Azure resource name restrictions vary by type. For our usage here (Public IP name) we just
- // keep it simple and strip problematic characters.
- replacer := strings.NewReplacer("/", "-", "\\", "-", ":", "-", "_", "-", " ", "-")
- name = replacer.Replace(name)
- name = strings.Trim(name, "-")
- if len(name) > 80 {
- name = name[:80]
- }
- return name
+ toolkit.Logf(ctx, "using shared bastion %s in %s", SharedBastionName, sharedRG)
+ return NewBastion(config.Azure.Credential, config.Config.SubscriptionID, sharedRG, *sharedBastion.Properties.DNSName), nil
}
type VNet struct {
- name string
- subnetId string
- resourceGUID string
+ name string
+ resourceGroup string
+ subnetName string
+ subnetId string
+ resourceGUID string
+ addressPrefix string
}
-func getClusterVNet(ctx context.Context, mcResourceGroupName string) (VNet, error) {
- pager := config.Azure.VNet.NewListPager(mcResourceGroupName, nil)
- for pager.More() {
- nextResult, err := pager.NextPage(ctx)
- if err != nil {
- return VNet{}, fmt.Errorf("failed to advance page: %w", err)
- }
- for _, v := range nextResult.Value {
- if v == nil {
- return VNet{}, fmt.Errorf("aks vnet was empty")
- }
- return VNet{name: *v.Name, subnetId: fmt.Sprintf("%s/subnets/%s", *v.ID, "aks-subnet"), resourceGUID: *v.Properties.ResourceGUID}, nil
+// getClusterVNet returns VNet info for the cluster by parsing the VnetSubnetID from the agent pool.
+func getClusterVNet(ctx context.Context, cluster *armcontainerservice.ManagedCluster) (VNet, error) {
+ for _, pool := range cluster.Properties.AgentPoolProfiles {
+ if pool.VnetSubnetID != nil && *pool.VnetSubnetID != "" {
+ return vnetFromSubnetID(ctx, *pool.VnetSubnetID)
}
}
- return VNet{}, fmt.Errorf("failed to find aks vnet")
+ return VNet{}, fmt.Errorf("no VnetSubnetID found on any agent pool profile")
}
func collectGarbageVMSS(ctx context.Context, cluster *armcontainerservice.ManagedCluster, kube *Kubeclient) error {
@@ -875,49 +692,65 @@ func ensureResourceGroup(ctx context.Context, location string) (armresources.Res
return rg.ResourceGroup, nil
}
-// setupPrivateDNSForAPIServer creates a private DNS zone for the API server FQDN
-// linked to the cluster VNet with an A record pointing to the current public IP.
-// Simulates a customer environment with minimal private DNS entries.
-func setupPrivateDNSForAPIServer(ctx context.Context, cluster *armcontainerservice.ManagedCluster) error {
+// setupPrivateDNSForAPIServer adds an A record for the cluster's API server FQDN
+// to the shared private DNS zone. The zone and VNet link are created once by ensureSharedInfra.
+func setupPrivateDNSForAPIServer(ctx context.Context, resourceGroup string, cluster *armcontainerservice.ManagedCluster) error {
defer toolkit.LogStepCtx(ctx, "setting up private DNS for API server")()
fqdn := *cluster.Properties.Fqdn
- nodeRG := *cluster.Properties.NodeResourceGroup
+ zoneName := APIServerDNSZone(*cluster.Location)
+ recordName := strings.TrimSuffix(fqdn, "."+zoneName)
ips, err := net.LookupHost(fqdn)
if err != nil {
return fmt.Errorf("resolving API server FQDN %q: %w", fqdn, err)
}
- var aRecords []*armprivatedns.ARecord
+ var wantIPs []string
for _, ip := range ips {
if parsed := net.ParseIP(ip); parsed != nil && parsed.To4() != nil {
- aRecords = append(aRecords, &armprivatedns.ARecord{IPv4Address: to.Ptr(ip)})
+ wantIPs = append(wantIPs, ip)
}
}
- if len(aRecords) == 0 {
+ if len(wantIPs) == 0 {
return fmt.Errorf("no IPv4 addresses for %q", fqdn)
}
- // createPrivateZone and createPrivateDNSLink handle 409 conflicts internally
- if _, err := createPrivateZone(ctx, nodeRG, fqdn); err != nil {
- return fmt.Errorf("creating private zone %q: %w", fqdn, err)
+ // Check if the record already exists with the correct IPs
+ existing, err := config.Azure.RecordSetClient.Get(ctx, resourceGroup, zoneName, armprivatedns.RecordTypeA, recordName, nil)
+ if err == nil && existing.Properties != nil && existing.Properties.ARecords != nil {
+ existingIPs := map[string]bool{}
+ for _, r := range existing.Properties.ARecords {
+ if r.IPv4Address != nil {
+ existingIPs[*r.IPv4Address] = true
+ }
+ }
+ if len(existingIPs) == len(wantIPs) {
+ allMatch := true
+ for _, ip := range wantIPs {
+ if !existingIPs[ip] {
+ allMatch = false
+ break
+ }
+ }
+ if allMatch {
+ toolkit.Logf(ctx, "private DNS record %s already up to date", recordName)
+ return nil
+ }
+ }
}
- vnet, err := getClusterVNet(ctx, nodeRG)
- if err != nil {
- return fmt.Errorf("getting cluster VNet: %w", err)
- }
- if err := createPrivateDNSLink(ctx, vnet, nodeRG, fqdn); err != nil {
- return fmt.Errorf("linking private zone to VNet: %w", err)
+ var aRecords []*armprivatedns.ARecord
+ for _, ip := range wantIPs {
+ aRecords = append(aRecords, &armprivatedns.ARecord{IPv4Address: to.Ptr(ip)})
}
- _, err = config.Azure.RecordSetClient.CreateOrUpdate(ctx, nodeRG, fqdn, armprivatedns.RecordTypeA, "@",
+ _, err = config.Azure.RecordSetClient.CreateOrUpdate(ctx, resourceGroup, zoneName, armprivatedns.RecordTypeA, recordName,
armprivatedns.RecordSet{Properties: &armprivatedns.RecordSetProperties{TTL: to.Ptr[int64](300), ARecords: aRecords}}, nil)
if err != nil {
- return fmt.Errorf("creating A record in zone %q: %w", fqdn, err)
+ return fmt.Errorf("creating A record %q in zone %q: %w", recordName, zoneName, err)
}
- toolkit.Logf(ctx, "private DNS zone %q → %v", fqdn, ips)
+ toolkit.Logf(ctx, "private DNS: %s.%s → %v", recordName, zoneName, wantIPs)
return nil
}
diff --git a/e2e/kube.go b/e2e/kube.go
index 1f5ea29d5c2..dd21536ed05 100644
--- a/e2e/kube.go
+++ b/e2e/kube.go
@@ -637,21 +637,12 @@ func (k *Kubeclient) GetProxyURL(ctx context.Context) (string, error) {
}
func getClusterSubnetID(ctx context.Context, cluster *armcontainerservice.ManagedCluster) (string, error) {
- mcResourceGroupName := *cluster.Properties.NodeResourceGroup
- pager := config.Azure.VNet.NewListPager(mcResourceGroupName, nil)
- for pager.More() {
- nextResult, err := pager.NextPage(ctx)
- if err != nil {
- return "", fmt.Errorf("advance page: %w", err)
- }
- for _, v := range nextResult.Value {
- if v == nil {
- return "", fmt.Errorf("aks vnet was empty")
- }
- return fmt.Sprintf("%s/subnets/%s", *v.ID, "aks-subnet"), nil
+ for _, pool := range cluster.Properties.AgentPoolProfiles {
+ if pool.VnetSubnetID != nil && *pool.VnetSubnetID != "" {
+ return *pool.VnetSubnetID, nil
}
}
- return "", fmt.Errorf("failed to find aks vnet")
+ return "", fmt.Errorf("no VnetSubnetID found on any agent pool profile")
}
func podHTTPServerLinux(s *Scenario) *corev1.Pod {
diff --git a/e2e/node_config.go b/e2e/node_config.go
index 2f6f38b150b..92101d9824f 100644
--- a/e2e/node_config.go
+++ b/e2e/node_config.go
@@ -1,6 +1,7 @@
package e2e
import (
+ "context"
"encoding/base64"
"fmt"
"testing"
@@ -65,7 +66,7 @@ func baseKubeletConfig() *aksnodeconfigv1.KubeletConfig {
EventRecordQps: to.Ptr(int32(0)),
ClusterDomain: "cluster.local",
ClusterDns: []string{
- "10.0.0.10",
+ "172.16.0.10",
},
StreamingConnectionIdleTimeout: "4h",
NodeStatusUpdateFrequency: "10s",
@@ -98,7 +99,7 @@ func baseKubeletConfig() *aksnodeconfigv1.KubeletConfig {
}
}
-func getBaseNBC(t testing.TB, cluster *Cluster, vhd *config.Image) (*datamodel.NodeBootstrappingConfiguration, error) {
+func getBaseNBC(ctx context.Context, t testing.TB, cluster *Cluster, vhd *config.Image) (*datamodel.NodeBootstrappingConfiguration, error) {
var nbc *datamodel.NodeBootstrappingConfiguration
if vhd.Distro.IsWindowsDistro() {
@@ -133,8 +134,7 @@ func getBaseNBC(t testing.TB, cluster *Cluster, vhd *config.Image) (*datamodel.N
nbc.SecureTLSBootstrappingConfig = &datamodel.SecureTLSBootstrappingConfig{
Enabled: config.Config.EnableSecureTLSBootstrapping && !vhd.UnsupportedSecureTLSBootstrapping,
}
-
- nbc.TenantID = *cluster.Model.Identity.TenantID
+ nbc.TenantID = cluster.TenantID
nbc.ContainerService.Properties.CertificateProfile.CaCertificate = string(cluster.ClusterParams.CACert)
nbc.ContainerService.Properties.HostedMasterProfile.FQDN = cluster.ClusterParams.FQDN
nbc.ContainerService.Properties.AgentPoolProfiles[0].Distro = vhd.Distro
@@ -716,7 +716,7 @@ func baseTemplateLinux(t testing.TB, location string, k8sVersion string, arch st
"127.0.0.1",
"168.63.129.16",
"169.254.169.254",
- "10.0.0.0/16",
+ "172.16.0.0/16",
"agentbaker-agentbaker-e2e-t-8ecadf-c82d8251.hcp.eastus.azmk8s.io",
},
TrustedCA: nil,
@@ -730,7 +730,7 @@ func baseTemplateLinux(t testing.TB, location string, k8sVersion string, arch st
"--cgroups-per-qos": "true",
"--client-ca-file": "/etc/kubernetes/certs/ca.crt",
"--cloud-provider": "external",
- "--cluster-dns": "10.0.0.10",
+ "--cluster-dns": "172.16.0.10",
"--cluster-domain": "cluster.local",
"--dynamic-config-dir": "/var/lib/kubelet",
"--enforce-node-allocatable": "pods",
@@ -824,11 +824,11 @@ func baseTemplateWindows(t testing.TB, location string) *datamodel.NodeBootstrap
KubernetesConfig: &datamodel.KubernetesConfig{
AzureCNIURLWindows: "https://packages.aks.azure.com/azure-cni/v1.6.21/binaries/azure-vnet-cni-windows-amd64-v1.6.21.zip",
ClusterSubnet: "10.224.0.0/16",
- DNSServiceIP: "10.0.0.10",
+ DNSServiceIP: "172.16.0.10",
LoadBalancerSku: "Standard",
NetworkPlugin: "azure",
NetworkPluginMode: "overlay",
- ServiceCIDR: "10.0.0.0/16",
+ ServiceCIDR: "172.16.0.0/16",
UseInstanceMetadata: to.Ptr(true),
UseManagedIdentity: false,
WindowsContainerdURL: "https://packages.aks.azure.com/containerd/windows/",
@@ -962,7 +962,7 @@ DXRqvV7TWO2hndliQq3BW385ZkiephlrmpUVM= r2k1@arturs-mbp.lan`,
"--kubeconfig": "c:\\k\\config",
"--max-pods": "30",
"--resolv-conf": "\"\"\"\"",
- "--cluster-dns": "10.0.0.10",
+ "--cluster-dns": "172.16.0.10",
"--cluster-domain": "cluster.local",
"--rotate-certificates": "true",
"--rotate-server-certificates": "true",
diff --git a/e2e/shared_infra.go b/e2e/shared_infra.go
new file mode 100644
index 00000000000..6aef4382910
--- /dev/null
+++ b/e2e/shared_infra.go
@@ -0,0 +1,615 @@
+package e2e
+
+import (
+ "context"
+ "crypto/sha256"
+ "errors"
+ "fmt"
+ "math/rand"
+ "net/http"
+ "strings"
+ "time"
+
+ "github.com/Azure/agentbaker/e2e/config"
+ "github.com/Azure/agentbaker/e2e/toolkit"
+ "github.com/Azure/azure-sdk-for-go/sdk/azcore"
+ "github.com/Azure/azure-sdk-for-go/sdk/azcore/to"
+ "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/authorization/armauthorization/v3"
+ "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/containerservice/armcontainerservice/v8"
+ "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/msi/armmsi"
+ "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/network/armnetwork/v7"
+ "github.com/google/uuid"
+)
+
+const (
+ SharedVNetName = "abe2e-shared-vnet"
+ SharedVNetCIDR = "10.0.0.0/8"
+ SharedBastionName = "abe2e-shared-bastion"
+ SharedBastionPIPName = "abe2e-shared-bastion-pip"
+ SharedClusterIdentity = "abe2e-cluster-identity"
+ BastionSubnetCIDR = "10.0.0.0/26"
+ FirewallSubnetCIDR = "10.0.1.0/24"
+ PESubnetName = "abe2e-pe-subnet"
+ PESubnetCIDR = "10.0.2.0/24"
+ networkContributorRolID = "/providers/Microsoft.Authorization/roleDefinitions/4d97b98b-1d4f-4787-a291-c67834d212e7"
+)
+
+// APIServerDNSZone returns the private DNS zone name for API server FQDNs in a given location.
+func APIServerDNSZone(location string) string {
+ return fmt.Sprintf("hcp.%s.azmk8s.io", location)
+}
+
+type SharedInfra struct {
+ VNetName string
+ ResourceGroup string
+ BastionDNSName string
+ FirewallIP string
+ IdentityID string // resource ID of the user-assigned managed identity
+ TenantID string // tenant ID of the user-assigned managed identity
+}
+
+var CachedEnsureSharedInfra = cachedFunc(ensureSharedInfra)
+
+func ensureSharedInfra(ctx context.Context, location string) (*SharedInfra, error) {
+ defer toolkit.LogStepCtx(ctx, "ensuring shared infrastructure")()
+ rg := config.ResourceGroupName(location)
+
+ if err := ensureSharedVNet(ctx, rg, location); err != nil {
+ return nil, fmt.Errorf("ensuring shared VNet: %w", err)
+ }
+
+ if err := ensurePESubnet(ctx, rg); err != nil {
+ return nil, fmt.Errorf("ensuring PE subnet: %w", err)
+ }
+
+ if err := ensureAPIServerDNSZone(ctx, rg, location); err != nil {
+ return nil, fmt.Errorf("ensuring API server DNS zone: %w", err)
+ }
+
+ bastionDNS, err := ensureSharedBastion(ctx, rg, location)
+ if err != nil {
+ return nil, fmt.Errorf("ensuring shared bastion: %w", err)
+ }
+
+ firewallIP, err := ensureSharedFirewall(ctx, rg, location)
+ if err != nil {
+ return nil, fmt.Errorf("ensuring shared firewall: %w", err)
+ }
+
+ identityID, tenantID, err := ensureClusterIdentity(ctx, rg, location)
+ if err != nil {
+ return nil, fmt.Errorf("ensuring cluster identity: %w", err)
+ }
+
+ // Best-effort cleanup of orphaned cluster subnets
+ cleanupOrphanedSubnets(ctx, rg)
+
+ return &SharedInfra{
+ VNetName: SharedVNetName,
+ ResourceGroup: rg,
+ BastionDNSName: bastionDNS,
+ FirewallIP: firewallIP,
+ IdentityID: identityID,
+ TenantID: tenantID,
+ }, nil
+}
+
+func ensureSharedVNet(ctx context.Context, rg, location string) error {
+ _, err := config.Azure.VNet.Get(ctx, rg, SharedVNetName, nil)
+ if err == nil {
+ return nil
+ }
+ if !isNotFoundError(err) {
+ return fmt.Errorf("checking shared VNet: %w", err)
+ }
+
+ toolkit.Logf(ctx, "creating shared VNet %s in %s", SharedVNetName, rg)
+ poller, err := config.Azure.VNet.BeginCreateOrUpdate(ctx, rg, SharedVNetName, armnetwork.VirtualNetwork{
+ Location: to.Ptr(location),
+ Properties: &armnetwork.VirtualNetworkPropertiesFormat{
+ AddressSpace: &armnetwork.AddressSpace{
+ AddressPrefixes: []*string{to.Ptr(SharedVNetCIDR)},
+ },
+ },
+ }, nil)
+ if err != nil {
+ return fmt.Errorf("creating shared VNet: %w", err)
+ }
+ _, err = poller.PollUntilDone(ctx, config.DefaultPollUntilDoneOptions)
+ if err != nil {
+ return fmt.Errorf("waiting for shared VNet creation: %w", err)
+ }
+ return nil
+}
+
+// ensurePESubnet creates the dedicated subnet for shared private endpoints.
+func ensurePESubnet(ctx context.Context, rg string) error {
+ _, err := config.Azure.Subnet.Get(ctx, rg, SharedVNetName, PESubnetName, nil)
+ if err == nil {
+ return nil
+ }
+ if !isNotFoundError(err) {
+ return fmt.Errorf("checking PE subnet: %w", err)
+ }
+ toolkit.Logf(ctx, "creating PE subnet %s", PESubnetName)
+ poller, err := config.Azure.Subnet.BeginCreateOrUpdate(ctx, rg, SharedVNetName, PESubnetName, armnetwork.Subnet{
+ Properties: &armnetwork.SubnetPropertiesFormat{
+ AddressPrefix: to.Ptr(PESubnetCIDR),
+ },
+ }, nil)
+ if err != nil {
+ return fmt.Errorf("creating PE subnet: %w", err)
+ }
+ _, err = poller.PollUntilDone(ctx, config.DefaultPollUntilDoneOptions)
+ if err != nil {
+ return fmt.Errorf("waiting for PE subnet creation: %w", err)
+ }
+ return nil
+}
+
+// ensureAPIServerDNSZone creates the shared private DNS zone for API server FQDNs
+// and links it to the shared VNet. All clusters add their A records to this single zone.
+func ensureAPIServerDNSZone(ctx context.Context, rg, location string) error {
+ zoneName := APIServerDNSZone(location)
+ if _, err := createPrivateZone(ctx, rg, zoneName); err != nil {
+ return fmt.Errorf("creating API server DNS zone %s: %w", zoneName, err)
+ }
+ vnet := VNet{
+ name: SharedVNetName,
+ resourceGroup: rg,
+ }
+ if err := createPrivateDNSLink(ctx, vnet, rg, zoneName); err != nil {
+ return fmt.Errorf("linking API server DNS zone to VNet: %w", err)
+ }
+ return nil
+}
+
+// cleanupOrphanedSubnets removes cluster subnets whose corresponding AKS cluster
+// no longer exists and that have no active Azure resources attached.
+// Only considers subnets that are not recently provisioned to avoid racing with
+// cluster creation.
+func cleanupOrphanedSubnets(ctx context.Context, rg string) {
+ pager := config.Azure.Subnet.NewListPager(rg, SharedVNetName, nil)
+ for pager.More() {
+ page, err := pager.NextPage(ctx)
+ if err != nil {
+ toolkit.Logf(ctx, "warning: failed to list subnets for cleanup: %v", err)
+ return
+ }
+ for _, subnet := range page.Value {
+ name := *subnet.Name
+ if !strings.HasPrefix(name, "aks-subnet-") {
+ continue
+ }
+ if subnetHasActiveResources(subnet) {
+ continue
+ }
+ clusterName := strings.TrimPrefix(name, "aks-subnet-")
+ _, err := config.Azure.AKS.Get(ctx, rg, clusterName, nil)
+ if err == nil {
+ continue
+ }
+ if !isNotFoundError(err) {
+ toolkit.Logf(ctx, "warning: transient error checking cluster %s, skipping subnet cleanup: %v", clusterName, err)
+ continue
+ }
+ toolkit.Logf(ctx, "deleting orphaned subnet %s", name)
+ poller, err := config.Azure.Subnet.BeginDelete(ctx, rg, SharedVNetName, name, nil)
+ if err != nil {
+ toolkit.Logf(ctx, "warning: failed to start deleting subnet %s: %v", name, err)
+ continue
+ }
+ if _, err := poller.PollUntilDone(ctx, config.DefaultPollUntilDoneOptions); err != nil {
+ toolkit.Logf(ctx, "warning: failed to delete subnet %s: %v", name, err)
+ }
+ }
+ }
+}
+
+func subnetHasActiveResources(subnet *armnetwork.Subnet) bool {
+ if subnet.Properties == nil {
+ return false
+ }
+ return len(subnet.Properties.IPConfigurations) > 0 ||
+ len(subnet.Properties.ServiceAssociationLinks) > 0 ||
+ len(subnet.Properties.ResourceNavigationLinks) > 0
+}
+
+func ensureSubnet(ctx context.Context, rg, vnetName, subnetName, cidr string) error {
+ _, err := config.Azure.Subnet.Get(ctx, rg, vnetName, subnetName, nil)
+ if err == nil {
+ return nil
+ }
+ if !isNotFoundError(err) {
+ return fmt.Errorf("checking subnet %s: %w", subnetName, err)
+ }
+
+ return retryOn409(ctx, fmt.Sprintf("creating subnet %s", subnetName), func() error {
+ toolkit.Logf(ctx, "creating subnet %s (%s) in VNet %s", subnetName, cidr, vnetName)
+ poller, err := config.Azure.Subnet.BeginCreateOrUpdate(ctx, rg, vnetName, subnetName, armnetwork.Subnet{
+ Properties: &armnetwork.SubnetPropertiesFormat{
+ AddressPrefix: to.Ptr(cidr),
+ },
+ }, nil)
+ if err != nil {
+ return fmt.Errorf("creating subnet %s: %w", subnetName, err)
+ }
+ _, err = poller.PollUntilDone(ctx, config.DefaultPollUntilDoneOptions)
+ if err != nil {
+ return fmt.Errorf("waiting for subnet %s: %w", subnetName, err)
+ }
+ return nil
+ })
+}
+
+func ensureSharedBastion(ctx context.Context, rg, location string) (string, error) {
+ existing, err := config.Azure.BastionHosts.Get(ctx, rg, SharedBastionName, nil)
+ if err == nil {
+ if existing.Properties == nil || existing.Properties.DNSName == nil {
+ return "", fmt.Errorf("shared bastion %s exists but has no DNS name", SharedBastionName)
+ }
+ return *existing.Properties.DNSName, nil
+ }
+ if !isNotFoundError(err) {
+ return "", fmt.Errorf("checking shared bastion: %w", err)
+ }
+
+ if err := ensureSubnet(ctx, rg, SharedVNetName, "AzureBastionSubnet", BastionSubnetCIDR); err != nil {
+ return "", fmt.Errorf("ensuring bastion subnet: %w", err)
+ }
+
+ toolkit.Logf(ctx, "creating shared bastion public IP %s", SharedBastionPIPName)
+ pipPoller, err := config.Azure.PublicIPAddresses.BeginCreateOrUpdate(ctx, rg, SharedBastionPIPName, armnetwork.PublicIPAddress{
+ Location: to.Ptr(location),
+ SKU: &armnetwork.PublicIPAddressSKU{
+ Name: to.Ptr(armnetwork.PublicIPAddressSKUNameStandard),
+ },
+ Properties: &armnetwork.PublicIPAddressPropertiesFormat{
+ PublicIPAllocationMethod: to.Ptr(armnetwork.IPAllocationMethodStatic),
+ },
+ }, nil)
+ if err != nil {
+ return "", fmt.Errorf("creating bastion public IP: %w", err)
+ }
+ pipResp, err := pipPoller.PollUntilDone(ctx, config.DefaultPollUntilDoneOptions)
+ if err != nil {
+ return "", fmt.Errorf("waiting for bastion public IP: %w", err)
+ }
+
+ bastionSubnetID := fmt.Sprintf(
+ "/subscriptions/%s/resourceGroups/%s/providers/Microsoft.Network/virtualNetworks/%s/subnets/AzureBastionSubnet",
+ config.Config.SubscriptionID, rg, SharedVNetName,
+ )
+
+ toolkit.Logf(ctx, "creating shared bastion %s (Standard SKU, tunneling enabled)", SharedBastionName)
+ bastionPoller, err := config.Azure.BastionHosts.BeginCreateOrUpdate(ctx, rg, SharedBastionName, armnetwork.BastionHost{
+ Location: to.Ptr(location),
+ SKU: &armnetwork.SKU{
+ Name: to.Ptr(armnetwork.BastionHostSKUNameStandard),
+ },
+ Properties: &armnetwork.BastionHostPropertiesFormat{
+ EnableTunneling: to.Ptr(true),
+ IPConfigurations: []*armnetwork.BastionHostIPConfiguration{
+ {
+ Name: to.Ptr("bastion-ipcfg"),
+ Properties: &armnetwork.BastionHostIPConfigurationPropertiesFormat{
+ Subnet: &armnetwork.SubResource{
+ ID: to.Ptr(bastionSubnetID),
+ },
+ PublicIPAddress: &armnetwork.SubResource{
+ ID: pipResp.ID,
+ },
+ },
+ },
+ },
+ },
+ }, nil)
+ if err != nil {
+ return "", fmt.Errorf("creating shared bastion: %w", err)
+ }
+ resp, err := bastionPoller.PollUntilDone(ctx, config.DefaultPollUntilDoneOptions)
+ if err != nil {
+ return "", fmt.Errorf("waiting for shared bastion: %w", err)
+ }
+ return *resp.Properties.DNSName, nil
+}
+
+const (
+ SharedFirewallName = "abe2e-fw"
+ SharedFirewallPIPName = "abe2e-fw-pip"
+)
+
+// ensureSharedFirewall creates or retrieves the shared Azure Firewall and returns its private IP.
+func ensureSharedFirewall(ctx context.Context, rg, location string) (string, error) {
+ existing, err := config.Azure.AzureFirewall.Get(ctx, rg, SharedFirewallName, nil)
+ if err == nil {
+ return getFirewallPrivateIP(existing.AzureFirewall)
+ }
+ if !isNotFoundError(err) {
+ return "", fmt.Errorf("checking shared firewall: %w", err)
+ }
+
+ // Ensure firewall subnet exists
+ if err := ensureSubnet(ctx, rg, SharedVNetName, "AzureFirewallSubnet", FirewallSubnetCIDR); err != nil {
+ return "", fmt.Errorf("ensuring firewall subnet: %w", err)
+ }
+
+ firewallSubnetID := fmt.Sprintf(
+ "/subscriptions/%s/resourceGroups/%s/providers/Microsoft.Network/virtualNetworks/%s/subnets/AzureFirewallSubnet",
+ config.Config.SubscriptionID, rg, SharedVNetName,
+ )
+
+ // Create public IP for firewall
+ toolkit.Logf(ctx, "creating shared firewall public IP %s", SharedFirewallPIPName)
+ pipPoller, err := config.Azure.PublicIPAddresses.BeginCreateOrUpdate(ctx, rg, SharedFirewallPIPName, armnetwork.PublicIPAddress{
+ Location: to.Ptr(location),
+ SKU: &armnetwork.PublicIPAddressSKU{
+ Name: to.Ptr(armnetwork.PublicIPAddressSKUNameStandard),
+ },
+ Properties: &armnetwork.PublicIPAddressPropertiesFormat{
+ PublicIPAllocationMethod: to.Ptr(armnetwork.IPAllocationMethodStatic),
+ },
+ }, nil)
+ if err != nil {
+ return "", fmt.Errorf("creating firewall public IP: %w", err)
+ }
+ pipResp, err := pipPoller.PollUntilDone(ctx, config.DefaultPollUntilDoneOptions)
+ if err != nil {
+ return "", fmt.Errorf("waiting for firewall public IP: %w", err)
+ }
+
+ // Create firewall
+ toolkit.Logf(ctx, "creating shared firewall %s", SharedFirewallName)
+ firewall := getFirewall(ctx, location, firewallSubnetID, *pipResp.ID)
+ fwPoller, err := config.Azure.AzureFirewall.BeginCreateOrUpdate(ctx, rg, SharedFirewallName, *firewall, nil)
+ if err != nil {
+ return "", fmt.Errorf("creating shared firewall: %w", err)
+ }
+ fwResp, err := fwPoller.PollUntilDone(ctx, config.DefaultPollUntilDoneOptions)
+ if err != nil {
+ return "", fmt.Errorf("waiting for shared firewall: %w", err)
+ }
+
+ return getFirewallPrivateIP(fwResp.AzureFirewall)
+}
+
+func getFirewallPrivateIP(fw armnetwork.AzureFirewall) (string, error) {
+ if fw.Properties != nil && fw.Properties.IPConfigurations != nil && len(fw.Properties.IPConfigurations) > 0 {
+ if fw.Properties.IPConfigurations[0].Properties != nil && fw.Properties.IPConfigurations[0].Properties.PrivateIPAddress != nil {
+ return *fw.Properties.IPConfigurations[0].Properties.PrivateIPAddress, nil
+ }
+ }
+ return "", fmt.Errorf("firewall has no private IP address")
+}
+
+// ensureClusterIdentity creates a user-assigned managed identity for AKS clusters
+// and grants it Network Contributor on the subscription so it can manage route tables
+// in both the shared VNet and the MC_ resource groups.
+func ensureClusterIdentity(ctx context.Context, rg, location string) (string, string, error) {
+ existing, err := config.Azure.UserAssignedIdentities.Get(ctx, rg, SharedClusterIdentity, nil)
+ if err == nil {
+ return *existing.ID, *existing.Properties.TenantID, nil
+ }
+ if !isNotFoundError(err) {
+ return "", "", fmt.Errorf("checking cluster identity: %w", err)
+ }
+
+ toolkit.Logf(ctx, "creating shared cluster identity %s", SharedClusterIdentity)
+ resp, err := config.Azure.UserAssignedIdentities.CreateOrUpdate(ctx, rg, SharedClusterIdentity, armmsi.Identity{
+ Location: to.Ptr(location),
+ }, nil)
+ if err != nil {
+ return "", "", fmt.Errorf("creating cluster identity: %w", err)
+ }
+
+ // Grant Network Contributor on the shared VNet so the identity can manage
+ // subnets and route table associations. AKS automatically grants the identity
+ // Contributor on the MC_ resource group during cluster creation.
+ vnetScope := fmt.Sprintf(
+ "/subscriptions/%s/resourceGroups/%s/providers/Microsoft.Network/virtualNetworks/%s",
+ config.Config.SubscriptionID, rg, SharedVNetName,
+ )
+ toolkit.Logf(ctx, "assigning Network Contributor to %s on shared VNet", SharedClusterIdentity)
+ _, err = config.Azure.RoleAssignments.Create(ctx, vnetScope, uuid.New().String(), armauthorization.RoleAssignmentCreateParameters{
+ Properties: &armauthorization.RoleAssignmentProperties{
+ PrincipalID: resp.Properties.PrincipalID,
+ RoleDefinitionID: to.Ptr(networkContributorRolID),
+ PrincipalType: to.Ptr(armauthorization.PrincipalTypeServicePrincipal),
+ },
+ }, nil)
+ if err != nil {
+ var azErr *azcore.ResponseError
+ if errors.As(err, &azErr) && azErr.StatusCode == http.StatusConflict {
+ // role assignment already exists
+ } else {
+ return "", "", fmt.Errorf("assigning Network Contributor: %w", err)
+ }
+ }
+
+ return *resp.ID, *resp.Properties.TenantID, nil
+}
+
+type ClusterSubnetRequest struct {
+ Location string
+ ClusterName string
+}
+
+var CachedEnsureClusterSubnet = cachedFunc(ensureClusterSubnet)
+
+func ensureClusterSubnet(ctx context.Context, req ClusterSubnetRequest) (string, error) {
+ rg := config.ResourceGroupName(req.Location)
+ subnetName := clusterSubnetName(req.ClusterName)
+
+ // Check if this subnet already exists (idempotent)
+ existing, err := config.Azure.Subnet.Get(ctx, rg, SharedVNetName, subnetName, nil)
+ if err == nil {
+ return *existing.ID, nil
+ }
+ if !isNotFoundError(err) {
+ return "", fmt.Errorf("checking subnet %s: %w", subnetName, err)
+ }
+
+ // Collect CIDRs already in use
+ usedCIDRs := map[string]bool{}
+ pager := config.Azure.Subnet.NewListPager(rg, SharedVNetName, nil)
+ for pager.More() {
+ page, err := pager.NextPage(ctx)
+ if err != nil {
+ return "", fmt.Errorf("listing subnets: %w", err)
+ }
+ for _, s := range page.Value {
+ if s.Properties != nil && s.Properties.AddressPrefix != nil {
+ usedCIDRs[*s.Properties.AddressPrefix] = true
+ }
+ }
+ }
+
+ // Find a free CIDR starting from the hash-based slot
+ cidr := allocateSubnetCIDR(subnetName, usedCIDRs)
+ if cidr == "" {
+ return "", fmt.Errorf("no free /20 CIDR available in shared VNet")
+ }
+
+ if err := ensureSubnet(ctx, rg, SharedVNetName, subnetName, cidr); err != nil {
+ return "", err
+ }
+
+ return fmt.Sprintf(
+ "/subscriptions/%s/resourceGroups/%s/providers/Microsoft.Network/virtualNetworks/%s/subnets/%s",
+ config.Config.SubscriptionID, rg, SharedVNetName, subnetName,
+ ), nil
+}
+
+func clusterSubnetName(clusterName string) string {
+ return "aks-subnet-" + clusterName
+}
+
+const totalSubnetSlots = 4080
+
+// allocateSubnetCIDR finds a free /20 CIDR by starting at a hash-derived slot
+// and probing linearly until a free one is found.
+func allocateSubnetCIDR(name string, usedCIDRs map[string]bool) string {
+ h := sha256.Sum256([]byte(name))
+ startIdx := (int(h[0])<<8 | int(h[1])) % totalSubnetSlots
+ for i := 0; i < totalSubnetSlots; i++ {
+ idx := (startIdx + i) % totalSubnetSlots
+ cidr := cidrFromIndex(idx)
+ if !usedCIDRs[cidr] {
+ return cidr
+ }
+ }
+ return ""
+}
+
+func cidrFromIndex(idx int) string {
+ secondOctet := (idx / 16) + 1 // 1-255
+ thirdOctet := (idx % 16) * 16 // 0, 16, 32, ..., 240
+ return fmt.Sprintf("10.%d.%d.0/20", secondOctet, thirdOctet)
+}
+
+func isNotFoundError(err error) bool {
+ var azErr *azcore.ResponseError
+ if errors.As(err, &azErr) && azErr.StatusCode == http.StatusNotFound {
+ return true
+ }
+ return false
+}
+
+// vnetFromSubnetID parses VNet info from a subnet resource ID and fetches the VNet for metadata.
+func vnetFromSubnetID(ctx context.Context, subnetID string) (VNet, error) {
+ parts := strings.Split(subnetID, "/")
+ var rg, vnetName, subnetName string
+ for i, p := range parts {
+ if i+1 >= len(parts) {
+ continue
+ }
+ switch p {
+ case "resourceGroups":
+ rg = parts[i+1]
+ case "virtualNetworks":
+ vnetName = parts[i+1]
+ case "subnets":
+ subnetName = parts[i+1]
+ }
+ }
+ if rg == "" || vnetName == "" || subnetName == "" {
+ return VNet{}, fmt.Errorf("failed to parse VNet info from subnet ID: %s", subnetID)
+ }
+
+ vnetResp, err := config.Azure.VNet.Get(ctx, rg, vnetName, nil)
+ if err != nil {
+ return VNet{}, fmt.Errorf("getting VNet %s in RG %s: %w", vnetName, rg, err)
+ }
+
+ var addressPrefix string
+ if vnetResp.Properties.AddressSpace != nil && len(vnetResp.Properties.AddressSpace.AddressPrefixes) > 0 {
+ addressPrefix = *vnetResp.Properties.AddressSpace.AddressPrefixes[0]
+ }
+
+ var resourceGUID string
+ if vnetResp.Properties != nil && vnetResp.Properties.ResourceGUID != nil {
+ resourceGUID = *vnetResp.Properties.ResourceGUID
+ }
+
+ return VNet{
+ name: vnetName,
+ resourceGroup: rg,
+ subnetName: subnetName,
+ subnetId: subnetID,
+ resourceGUID: resourceGUID,
+ addressPrefix: addressPrefix,
+ }, nil
+}
+
+// retryOn409 retries an Azure operation that fails with 409 Conflict due to
+// concurrent writes on the same resource (e.g., VNet subnet creates).
+func retryOn409(ctx context.Context, operation string, fn func() error) error {
+ maxRetries := 10
+ for attempt := 0; attempt < maxRetries; attempt++ {
+ err := fn()
+ if err == nil {
+ return nil
+ }
+ var azErr *azcore.ResponseError
+ if !errors.As(err, &azErr) || azErr.StatusCode != http.StatusConflict {
+ return err
+ }
+ if attempt == maxRetries-1 {
+ return err
+ }
+ // jittered backoff: 2-8s
+ backoff := time.Duration(2+rand.Intn(6)) * time.Second
+ toolkit.Logf(ctx, "%s: 409 conflict (attempt %d/%d), retrying in %s...", operation, attempt+1, maxRetries, backoff)
+ select {
+ case <-time.After(backoff):
+ case <-ctx.Done():
+ return ctx.Err()
+ }
+ }
+ return fmt.Errorf("%s: exhausted retries", operation)
+}
+
+// configureSharedVNet sets up the cluster model to use the shared VNet and
+// user-assigned identity. The actual subnet is created later in prepareCluster
+// after the cluster name hash is computed, with an auto-allocated CIDR.
+func configureSharedVNet(ctx context.Context, model *armcontainerservice.ManagedCluster, location string) (*SharedInfra, error) {
+ infra, err := CachedEnsureSharedInfra(ctx, location)
+ if err != nil {
+ return nil, fmt.Errorf("ensuring shared infra: %w", err)
+ }
+
+ // Mark the model so prepareCluster knows to create a subnet
+ if model.Tags == nil {
+ model.Tags = map[string]*string{}
+ }
+
+ // Use the shared user-assigned identity
+ model.Identity = &armcontainerservice.ManagedClusterIdentity{
+ Type: to.Ptr(armcontainerservice.ResourceIdentityTypeUserAssigned),
+ UserAssignedIdentities: map[string]*armcontainerservice.ManagedServiceIdentityUserAssignedIdentitiesValue{
+ infra.IdentityID: {},
+ },
+ }
+
+ return infra, nil
+}
diff --git a/e2e/test_helpers.go b/e2e/test_helpers.go
index bf75818cb99..7c9bfc1c160 100644
--- a/e2e/test_helpers.go
+++ b/e2e/test_helpers.go
@@ -258,7 +258,7 @@ func prepareAKSNode(ctx context.Context, s *Scenario) (*ScenarioVM, error) {
defer toolkit.LogStep(s.T, "preparing AKS node")()
var err error
- nbc, err := getBaseNBC(s.T, s.Runtime.Cluster, s.VHD)
+ nbc, err := getBaseNBC(ctx, s.T, s.Runtime.Cluster, s.VHD)
require.NoError(s.T, err)
nbc.EnableScriptlessCSECmd = true
diff --git a/e2e/vmss.go b/e2e/vmss.go
index 4abf14f8931..5194c9a1218 100644
--- a/e2e/vmss.go
+++ b/e2e/vmss.go
@@ -509,7 +509,7 @@ func CreateVMSS(ctx context.Context, s *Scenario, resourceGroupName string) (*Sc
s.T.Logf("VM will be automatically deleted after the test finishes, to preserve it for debugging purposes set KEEP_VMSS=true or pause the test with a breakpoint before the test finishes or failed\n")
}
// We combine the az aks get credentials in the same line so we don't overwrite the user's kubeconfig.
- result += fmt.Sprintf(`az network bastion ssh --target-resource-id "%s" --name "%s-bastion" --resource-group %s --auth-type ssh-key --username azureuser --ssh-key %s`, *vm.VM.ID, *s.Runtime.Cluster.Model.Name, *s.Runtime.Cluster.Model.Properties.NodeResourceGroup, config.VMSSHPrivateKeyFileName) + "\n"
+ result += fmt.Sprintf(`az network bastion ssh --target-resource-id "%s" --name "%s" --resource-group %s --auth-type ssh-key --username azureuser --ssh-key %s`, *vm.VM.ID, SharedBastionName, config.ResourceGroupName(*s.Runtime.Cluster.Model.Location), config.VMSSHPrivateKeyFileName) + "\n"
s.T.Log(result)
vmssResp, err := operation.PollUntilDone(ctx, config.DefaultPollUntilDoneOptions)