Skip to content

Commit 51afd0e

Browse files
andreaswachsclaude
andcommitted
fix(lb): prevent empty backend pools on IPAM failures
When IPAM queries fail with 5xx errors during node initialization, InternalIPs are never populated, causing SetBackendServers to be called with an empty list. This adds two layers of defense: - Fallback to direct IPAM query if node addresses are empty - Refuse to clear existing backends when no new IPs are found Signed-off-by: Andreas Wachs <awa@corti.ai> Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
1 parent ffe03fa commit 51afd0e

2 files changed

Lines changed: 185 additions & 1 deletion

File tree

scaleway/loadbalancers.go

Lines changed: 79 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -608,8 +608,28 @@ func (l *loadbalancers) updateLoadBalancer(ctx context.Context, loadbalancer *sc
608608
}
609609

610610
var targetIPs []string
611-
if getForceInternalIP(service) || l.pnID != "" {
611+
pnIDs := getPrivateNetworkIDs(service)
612+
if getForceInternalIP(service) || l.pnID != "" || len(pnIDs) > 0 {
612613
targetIPs = extractNodesInternalIps(nodes)
614+
615+
// Fallback: if no internal IPs and we're in private network mode, try IPAM directly
616+
if len(targetIPs) == 0 && (l.pnID != "" || len(pnIDs) > 0) {
617+
klog.Warningf("no internal IPs in node status for service %s/%s, attempting IPAM fallback",
618+
service.Namespace, service.Name)
619+
620+
fallbackIPs, fallbackErr := l.getNodeIPsFromIPAM(nodes, loadbalancer.Zone)
621+
if fallbackErr != nil {
622+
klog.Errorf("IPAM fallback failed for service %s/%s: %v", service.Namespace, service.Name, fallbackErr)
623+
} else {
624+
targetIPs = fallbackIPs
625+
}
626+
}
627+
628+
if len(targetIPs) == 0 {
629+
klog.Errorf("CRITICAL: no backend IPs found for service %s/%s with pnID %s",
630+
service.Namespace, service.Name, l.pnID)
631+
}
632+
613633
klog.V(3).Infof("using internal nodes ips: %s on loadbalancer %s", strings.Join(targetIPs, ","), loadbalancer.ID)
614634
} else {
615635
targetIPs = extractNodesExternalIps(nodes)
@@ -700,6 +720,13 @@ func (l *loadbalancers) updateLoadBalancer(ctx context.Context, loadbalancer *sc
700720

701721
// Update backend servers
702722
if !stringArrayEqual(backend.Pool, targetIPs) {
723+
// Safety: don't clear existing backends if no new IPs and pool isn't empty
724+
if len(targetIPs) == 0 && len(backend.Pool) > 0 {
725+
klog.Warningf("refusing to clear backend pool for backend %s on loadbalancer %s - keeping %d servers",
726+
backend.ID, loadbalancer.ID, len(backend.Pool))
727+
continue
728+
}
729+
703730
klog.V(3).Infof("update server list for backend: %s port: %d loadbalancer: %s", backend.ID, port.NodePort, loadbalancer.ID)
704731
if _, err := l.api.SetBackendServers(&scwlb.ZonedAPISetBackendServersRequest{
705732
Zone: loadbalancer.Zone,
@@ -882,6 +909,57 @@ func (l *loadbalancers) attachPrivateNetworks(loadbalancer *scwlb.LB, service *v
882909
return nil
883910
}
884911

912+
// getNodeIPsFromIPAM queries IPAM directly to get private network IPs for nodes.
913+
// This is used as a fallback when node.Status.Addresses doesn't contain InternalIPs.
914+
func (l *loadbalancers) getNodeIPsFromIPAM(nodes []*v1.Node, zone scw.Zone) ([]string, error) {
915+
if l.pnID == "" {
916+
return nil, fmt.Errorf("cannot query IPAM without private network ID")
917+
}
918+
919+
region, err := zone.Region()
920+
if err != nil {
921+
return nil, fmt.Errorf("unable to get region from zone %s: %v", zone, err)
922+
}
923+
924+
var ips []string
925+
for _, node := range nodes {
926+
if node.Spec.ProviderID == "" {
927+
klog.Warningf("node %s has no provider ID, skipping IPAM lookup", node.Name)
928+
continue
929+
}
930+
931+
_, _, serverID, err := ServerInfoFromProviderID(node.Spec.ProviderID)
932+
if err != nil {
933+
klog.Warningf("failed to parse provider ID for node %s: %v", node.Name, err)
934+
continue
935+
}
936+
937+
// Query IPAM for IPs on the configured private network for this server
938+
ipamRes, err := l.ipam.ListIPs(&scwipam.ListIPsRequest{
939+
ResourceType: scwipam.ResourceTypeInstancePrivateNic,
940+
IsIPv6: scw.BoolPtr(false),
941+
Region: region,
942+
PrivateNetworkID: &l.pnID,
943+
ResourceName: &serverID,
944+
})
945+
if err != nil {
946+
klog.Warningf("IPAM query failed for node %s (server %s): %v", node.Name, serverID, err)
947+
continue
948+
}
949+
950+
for _, ip := range ipamRes.IPs {
951+
ips = append(ips, ip.Address.IP.String())
952+
}
953+
}
954+
955+
if len(ips) == 0 {
956+
return nil, fmt.Errorf("no IPs found via IPAM for any nodes")
957+
}
958+
959+
klog.V(3).Infof("IPAM fallback found %d IPs: %v", len(ips), ips)
960+
return ips, nil
961+
}
962+
885963
// createPrivateServiceStatus creates a LoadBalancer status for services with private load balancers
886964
func (l *loadbalancers) createPrivateServiceStatus(service *v1.Service, lb *scwlb.LB, ipMode *v1.LoadBalancerIPMode) (*v1.LoadBalancerStatus, error) {
887965
if l.pnID == "" {

scaleway/loadbalancers_test.go

Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2561,3 +2561,109 @@ func TestServicePortToBackendWithHealthCheckFromService(t *testing.T) {
25612561
})
25622562
}
25632563
}
2564+
2565+
func TestBackendPreservation(t *testing.T) {
2566+
// Test that stringArrayEqual correctly identifies when pools differ
2567+
t.Run("EmptyTargetIPsVsNonEmptyPool", func(t *testing.T) {
2568+
existingPool := []string{"10.0.0.1", "10.0.0.2"}
2569+
targetIPs := []string{}
2570+
2571+
// These are not equal, so the update logic would trigger
2572+
equal := stringArrayEqual(existingPool, targetIPs)
2573+
if equal {
2574+
t.Error("Expected pools to be unequal")
2575+
}
2576+
2577+
// The safety check should prevent clearing: len(targetIPs) == 0 && len(existingPool) > 0
2578+
shouldPreserve := len(targetIPs) == 0 && len(existingPool) > 0
2579+
if !shouldPreserve {
2580+
t.Error("Expected preservation logic to trigger")
2581+
}
2582+
})
2583+
2584+
t.Run("BothEmpty", func(t *testing.T) {
2585+
existingPool := []string{}
2586+
targetIPs := []string{}
2587+
2588+
equal := stringArrayEqual(existingPool, targetIPs)
2589+
if !equal {
2590+
t.Error("Expected empty pools to be equal")
2591+
}
2592+
2593+
// No preservation needed - both are empty
2594+
shouldPreserve := len(targetIPs) == 0 && len(existingPool) > 0
2595+
if shouldPreserve {
2596+
t.Error("Should not preserve when both pools are empty")
2597+
}
2598+
})
2599+
2600+
t.Run("NonEmptyTargetIPs", func(t *testing.T) {
2601+
existingPool := []string{"10.0.0.1", "10.0.0.2"}
2602+
targetIPs := []string{"10.0.0.3"}
2603+
2604+
equal := stringArrayEqual(existingPool, targetIPs)
2605+
if equal {
2606+
t.Error("Expected pools to be unequal")
2607+
}
2608+
2609+
// Should NOT preserve - we have new IPs to set
2610+
shouldPreserve := len(targetIPs) == 0 && len(existingPool) > 0
2611+
if shouldPreserve {
2612+
t.Error("Should not preserve when we have new target IPs")
2613+
}
2614+
})
2615+
}
2616+
2617+
func TestExtractNodesInternalIps(t *testing.T) {
2618+
t.Run("NodesWithInternalIPs", func(t *testing.T) {
2619+
nodes := []*v1.Node{
2620+
{
2621+
Status: v1.NodeStatus{
2622+
Addresses: []v1.NodeAddress{
2623+
{Type: v1.NodeInternalIP, Address: "10.0.0.1"},
2624+
{Type: v1.NodeExternalIP, Address: "1.2.3.4"},
2625+
},
2626+
},
2627+
},
2628+
{
2629+
Status: v1.NodeStatus{
2630+
Addresses: []v1.NodeAddress{
2631+
{Type: v1.NodeInternalIP, Address: "10.0.0.2"},
2632+
},
2633+
},
2634+
},
2635+
}
2636+
2637+
ips := extractNodesInternalIps(nodes)
2638+
if len(ips) != 2 {
2639+
t.Errorf("Expected 2 IPs, got %d", len(ips))
2640+
}
2641+
})
2642+
2643+
t.Run("NodesWithoutInternalIPs", func(t *testing.T) {
2644+
nodes := []*v1.Node{
2645+
{
2646+
Status: v1.NodeStatus{
2647+
Addresses: []v1.NodeAddress{
2648+
{Type: v1.NodeExternalIP, Address: "1.2.3.4"},
2649+
{Type: v1.NodeHostName, Address: "node1"},
2650+
},
2651+
},
2652+
},
2653+
}
2654+
2655+
ips := extractNodesInternalIps(nodes)
2656+
if len(ips) != 0 {
2657+
t.Errorf("Expected 0 IPs when no internal IPs, got %d", len(ips))
2658+
}
2659+
})
2660+
2661+
t.Run("EmptyNodes", func(t *testing.T) {
2662+
nodes := []*v1.Node{}
2663+
2664+
ips := extractNodesInternalIps(nodes)
2665+
if len(ips) != 0 {
2666+
t.Errorf("Expected 0 IPs for empty nodes, got %d", len(ips))
2667+
}
2668+
})
2669+
}

0 commit comments

Comments
 (0)