fix: use one big vnet and attach AKS clusters to it to avoid creating bastion multiple times#8646
Conversation
awesomenix
commented
Jun 5, 2026
- Make everyone's life simple
- README has all the details, but idea is to use shared VNET and stop doing unnecessary activities multiple times.
There was a problem hiding this comment.
Pull request overview
This PR refactors the E2E harness to use a single shared per-location VNet and Bastion (in abe2e-{location}) and attaches all AKS clusters to per-cluster subnets inside that VNet, reducing repeated Bastion provisioning and centralizing shared infrastructure creation.
Changes:
- Add shared-infrastructure provisioning (
shared_infra.go) to create/ensure a shared VNet, Bastion, Firewall, and managed identity, plus auto-allocation of per-cluster subnets. - Update cluster creation and networking helpers to rely on
VnetSubnetID(BYO VNet/subnet) rather than discovering VNets in the MC_ resource group. - Update SSH-over-Bastion flow and documentation to reflect shared Bastion/VNet usage.
Reviewed changes
Copilot reviewed 8 out of 8 changed files in this pull request and generated 6 comments.
Show a summary per file
| File | Description |
|---|---|
| e2e/vmss.go | Switch Bastion SSH command to use shared Bastion name + shared RG. |
| e2e/shared_infra.go | Introduce shared VNet/Bastion/Firewall/identity creation and per-cluster subnet allocation/cleanup. |
| e2e/README.md | Document the new shared-infra architecture and updated scenario authoring pointers. |
| e2e/node_config.go | Adjust tenant ID derivation to handle user-assigned identity clusters. |
| e2e/kube.go | Read cluster subnet ID from agent pool VnetSubnetID instead of listing VNets. |
| e2e/cluster.go | Create per-cluster subnet after name hashing; use shared Bastion; derive VNet from subnet ID. |
| e2e/cache.go | Ensure all cached cluster creators configure the shared VNet before preparing clusters. |
| e2e/aks_model.go | Avoid hardcoded subnet CIDRs; use parsed VNet/subnet info when configuring firewall/isolated settings. |
| subnetAddressPrefix := vnet.addressPrefix | ||
|
|
||
| subnetParameters := armnetwork.Subnet{ | ||
| ID: to.Ptr(subnetId), | ||
| Properties: &armnetwork.SubnetPropertiesFormat{ |
2579b61 to
738f4ae
Compare
738f4ae to
69c05f8
Compare
69c05f8 to
f928610
Compare
f928610 to
be31cd4
Compare
be31cd4 to
e71ee5d
Compare
e71ee5d to
8315a13
Compare
8315a13 to
1bb4eab
Compare
| poller, err := config.Azure.Subnet.BeginCreateOrUpdate(ctx, rg, SharedVNetName, PESubnetName, armnetwork.Subnet{ | ||
| Properties: &armnetwork.SubnetPropertiesFormat{ | ||
| AddressPrefix: to.Ptr(PESubnetCIDR), | ||
| }, | ||
| }, nil) |
| for _, subnet := range page.Value { | ||
| name := *subnet.Name | ||
| if !strings.HasPrefix(name, "aks-subnet-") { |
| for _, s := range page.Value { | ||
| if s.Properties != nil && s.Properties.AddressPrefix != nil { | ||
| usedCIDRs[*s.Properties.AddressPrefix] = true | ||
| } |
| // ensureClusterIdentity creates a user-assigned managed identity for AKS clusters | ||
| // and grants it Network Contributor on the subscription so it can manage route tables | ||
| // in both the shared VNet and the MC_ resource groups. |
| toolkit.Logf(ctx, "using shared bastion %s in %s", SharedBastionName, sharedRG) | ||
| return NewBastion(config.Azure.Credential, config.Config.SubscriptionID, sharedRG, *sharedBastion.Properties.DNSName), nil |
| if zone.Name == nil || *zone.Name != privateZoneName { | ||
| continue | ||
| } | ||
| zoneRG := resourceGroupFromID(*zone.ID) | ||
| if strings.EqualFold(zoneRG, sharedRG) { | ||
| continue | ||
| } |
| toolkit.Logf(ctx, "deleting conflicting DNS zone link %s in %s/%s (points to shared VNet)", *link.Name, zoneRG, zoneName) | ||
| poller, err := config.Azure.VirutalNetworkLinksClient.BeginDelete(ctx, zoneRG, zoneName, *link.Name, nil) |
| if privateEndpoint.Properties == nil || len(privateEndpoint.Properties.NetworkInterfaces) == 0 { | ||
| return fmt.Errorf("private endpoint has no network interfaces") | ||
| } | ||
|
|
||
| aRecords := make([]*armprivatedns.ARecord, len(ipAddresses)) | ||
| for i, ip := range ipAddresses { | ||
| aRecords[i] = &armprivatedns.ARecord{IPv4Address: &ip} | ||
| } | ||
| ttl := int64(10) | ||
| aRecordSet := armprivatedns.RecordSet{ | ||
| Properties: &armprivatedns.RecordSetProperties{ | ||
| TTL: &ttl, | ||
| ARecords: aRecords, | ||
| }, | ||
| } | ||
| _, err := config.Azure.RecordSetClient.CreateOrUpdate(ctx, nodeResourceGroup, privateZoneName, armprivatedns.RecordTypeA, *dnsConfig.Fqdn, aRecordSet, nil) | ||
| if err != nil { | ||
| return fmt.Errorf("failed to create record set: %w", err) | ||
| } | ||
| nicID := *privateEndpoint.Properties.NetworkInterfaces[0].ID | ||
| nicName := nicID[strings.LastIndex(nicID, "/")+1:] |
| // Each NIC IP config has a private IP and an associated FQDN from the | ||
| // PE's privateLinkServiceConnections. Create one A record per FQDN. | ||
| for _, ipConfig := range nic.Properties.IPConfigurations { |
1bb4eab to
0648508
Compare
0648508 to
7f6b2a6
Compare
| networkProfile.ServiceCidr = to.Ptr("172.16.0.0/16") | ||
| networkProfile.ServiceCidrs = []*string{ | ||
| networkProfile.ServiceCidr, | ||
| to.Ptr("fd12:3456:789a:1::/108"), |
| poller, err := config.Azure.Subnet.BeginCreateOrUpdate(ctx, rg, SharedVNetName, PESubnetName, armnetwork.Subnet{ | ||
| Properties: &armnetwork.SubnetPropertiesFormat{ | ||
| AddressPrefix: to.Ptr(PESubnetCIDR), | ||
| }, |
| existing, err := config.Azure.UserAssignedIdentities.Get(ctx, rg, SharedClusterIdentity, nil) | ||
| if err == nil { | ||
| return *existing.ID, *existing.Properties.TenantID, nil | ||
| } |
| toolkit.Logf(ctx, "using shared bastion %s in %s", SharedBastionName, sharedRG) | ||
| return NewBastion(config.Azure.Credential, config.Config.SubscriptionID, sharedRG, *sharedBastion.Properties.DNSName), nil |
| // Check if the record already exists with the correct IPs | ||
| existing, err := config.Azure.RecordSetClient.Get(ctx, resourceGroup, zoneName, armprivatedns.RecordTypeA, recordName, nil) | ||
| if err == nil && existing.Properties != nil && existing.Properties.ARecords != nil { |
| for _, ipConfig := range nic.Properties.IPConfigurations { | ||
| if ipConfig.Properties == nil || ipConfig.Properties.PrivateIPAddress == nil { | ||
| continue | ||
| } | ||
| ip := *ipConfig.Properties.PrivateIPAddress | ||
|
|
||
| func addDNSZoneGroup(ctx context.Context, privateZone *armprivatedns.PrivateZone, nodeResourceGroup, privateZoneName, endpointName string) error { | ||
| groupName := strings.Replace(privateZoneName, ".", "-", -1) // replace . with - | ||
| _, err := config.Azure.PrivateDNSZoneGroup.Get(ctx, nodeResourceGroup, endpointName, groupName, nil) | ||
| if err == nil { | ||
| return nil | ||
| } | ||
| dnsZonegroup := armnetwork.PrivateDNSZoneGroup{ | ||
| Name: to.Ptr(fmt.Sprintf("%s/default", privateZoneName)), | ||
| Properties: &armnetwork.PrivateDNSZoneGroupPropertiesFormat{ | ||
| PrivateDNSZoneConfigs: []*armnetwork.PrivateDNSZoneConfig{{ | ||
| Name: to.Ptr(groupName), | ||
| Properties: &armnetwork.PrivateDNSZonePropertiesFormat{ | ||
| PrivateDNSZoneID: privateZone.ID, | ||
| // The PE's CustomDNSConfigs or PrivateLinkServiceConnections tell us the | ||
| // FQDN, but they may be empty. Use the IP config's | ||
| // PrivateLinkConnectionProperties for the FQDN list. | ||
| if ipConfig.Properties.PrivateLinkConnectionProperties == nil || len(ipConfig.Properties.PrivateLinkConnectionProperties.Fqdns) == 0 { | ||
| continue |
| - go.mod | ||
| - go.sum | ||
| - e2e/ | ||
| - e2e/scenario_win_test.go |
… bastion multiple times
7f6b2a6 to
9b2d401
Compare
| return fmt.Errorf("checking subnet %s: %w", subnetName, err) | ||
| } | ||
|
|
||
| return retryOn409(ctx, fmt.Sprintf("creating subnet %s", subnetName), func() error { |
There was a problem hiding this comment.
Condsider adding a retry on 400 too?
Azure returns HTTP 400 (Bad Request) for overlapping subnet address spaces, not 409 Conflict.
if two tests try to create the same subnet name(deterministic from cluster name), only one wins. But if two different clusters hash to the same starting slot and list subnets at the same time, they could both pick the same free CIDR and one will get aconflict on a different subnet name with the same address space. Azure will reject this with a "subnet address space overlaps" error, which is not a 400
There was a problem hiding this comment.
its impossible to have 2 different clusters to have same hash, because the hash is determined by the public SSH key. WE should be ok here
| } | ||
|
|
||
| // Best-effort cleanup of orphaned cluster subnets | ||
| cleanupOrphanedSubnets(ctx, rg) |
There was a problem hiding this comment.
async for unblocking every test?
There was a problem hiding this comment.
deferring to next PR