Skip to content

Commit 1382b4c

Browse files
committed
fix: routes controller on node name drift
If a Kubernetes node got successfully initialized we have the Server ID stored in the node objets `providerID`. When renaming a Server, causing a drift between the node and Server name, we should still be able to identify the Server via its ID. This is currently not implemented in the routes' controller, where we rely purly on the Servers name. As the upstream library only provides us with the Kubernetes node name and not the full object, we have to first fetch it from the Kubernetes API. With the node object at hand we can identify and fetch the Server by its ID. Furthermore, we can use the Kubernetes node object to improve our handling of Kubernetes events. The event emitted on a CIDR mismatch is currently not associated with the affected node, because so far we did not have access to the nodes UUID.
1 parent 6ec988b commit 1382b4c

7 files changed

Lines changed: 384 additions & 105 deletions

File tree

hcloud/cloud.go

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ import (
2828
corev1 "k8s.io/api/core/v1"
2929
"k8s.io/client-go/kubernetes/scheme"
3030
v1 "k8s.io/client-go/kubernetes/typed/core/v1"
31+
corelisters "k8s.io/client-go/listers/core/v1"
3132
"k8s.io/client-go/tools/record"
3233
cloudprovider "k8s.io/cloud-provider"
3334
"k8s.io/klog/v2"
@@ -55,9 +56,10 @@ type cloud struct {
5556
recorder record.EventRecorder
5657
networkID int64
5758
cidr string
59+
nodeLister corelisters.NodeLister
5860
}
5961

60-
func NewCloud(cidr string) (cloudprovider.Interface, error) {
62+
func NewCloud(cidr string, nodeLister corelisters.NodeLister) (cloudprovider.Interface, error) {
6163
const op = "hcloud/newCloud"
6264
metrics.OperationCalled.WithLabelValues(op).Inc()
6365

@@ -147,6 +149,7 @@ func NewCloud(cidr string) (cloudprovider.Interface, error) {
147149
cfg: cfg,
148150
networkID: networkID,
149151
cidr: cidr,
152+
nodeLister: nodeLister,
150153
}, nil
151154
}
152155

@@ -213,6 +216,7 @@ func (c *cloud) Routes() (cloudprovider.Routes, bool) {
213216
c.networkID,
214217
c.cidr,
215218
c.recorder,
219+
c.nodeLister,
216220
)
217221
if err != nil {
218222
klog.ErrorS(err, "create routes provider", "networkID", c.networkID)

hcloud/cloud_test.go

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,7 @@ func TestNewCloud(t *testing.T) {
9595
json.NewEncoder(w).Encode(schema.LocationListResponse{Locations: []schema.Location{}})
9696
})
9797

98-
_, err := NewCloud(DefaultClusterCIDR)
98+
_, err := NewCloud(DefaultClusterCIDR, nil)
9999
assert.NoError(t, err)
100100
}
101101

@@ -107,7 +107,7 @@ func TestNewCloudConnectionNotPossible(t *testing.T) {
107107
)
108108
defer resetEnv()
109109

110-
_, err := NewCloud(DefaultClusterCIDR)
110+
_, err := NewCloud(DefaultClusterCIDR, nil)
111111
assert.EqualError(t, err,
112112
`hcloud/newCloud: Get "http://127.0.0.1:4711/v1/locations?": dial tcp 127.0.0.1:4711: connect: connection refused`)
113113
}
@@ -135,7 +135,7 @@ func TestNewCloudInvalidToken(t *testing.T) {
135135
)
136136
})
137137

138-
_, err := NewCloud(DefaultClusterCIDR)
138+
_, err := NewCloud(DefaultClusterCIDR, nil)
139139
assert.EqualError(t, err, "hcloud/newCloud: unable to authenticate (unauthorized)")
140140
}
141141

@@ -172,7 +172,7 @@ func TestCloud(t *testing.T) {
172172
)
173173
})
174174

175-
cloud, err := NewCloud(DefaultClusterCIDR)
175+
cloud, err := NewCloud(DefaultClusterCIDR, nil)
176176
if err != nil {
177177
t.Fatalf("Unexpected error: %v", err)
178178
}
@@ -229,7 +229,7 @@ func TestCloud(t *testing.T) {
229229
)
230230
defer resetEnv()
231231

232-
c, err := NewCloud(DefaultClusterCIDR)
232+
c, err := NewCloud(DefaultClusterCIDR, nil)
233233
if err != nil {
234234
t.Errorf("%s", err)
235235
}

hcloud/routes.go

Lines changed: 111 additions & 95 deletions
Original file line numberDiff line numberDiff line change
@@ -5,19 +5,21 @@ import (
55
"errors"
66
"fmt"
77
"net"
8+
"slices"
89
"time"
910

1011
"golang.org/x/time/rate"
1112
corev1 "k8s.io/api/core/v1"
1213
apierrors "k8s.io/apimachinery/pkg/api/errors"
13-
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
1414
"k8s.io/apimachinery/pkg/types"
15+
corelisters "k8s.io/client-go/listers/core/v1"
1516
"k8s.io/client-go/tools/record"
1617
cloudprovider "k8s.io/cloud-provider"
1718
"k8s.io/klog/v2"
1819

1920
"github.com/hetznercloud/hcloud-cloud-controller-manager/internal/hcops"
2021
"github.com/hetznercloud/hcloud-cloud-controller-manager/internal/metrics"
22+
"github.com/hetznercloud/hcloud-cloud-controller-manager/internal/providerid"
2123
"github.com/hetznercloud/hcloud-go/v2/hcloud"
2224
)
2325

@@ -29,9 +31,10 @@ type routes struct {
2931
serverCache *hcops.AllServersCache
3032
clusterCIDR *net.IPNet
3133
recorder record.EventRecorder
34+
nodeLister corelisters.NodeLister
3235
}
3336

34-
func newRoutes(client *hcloud.Client, networkID int64, clusterCIDR string, recorder record.EventRecorder) (*routes, error) {
37+
func newRoutes(client *hcloud.Client, networkID int64, clusterCIDR string, recorder record.EventRecorder, nodeLister corelisters.NodeLister) (*routes, error) {
3538
const op = "hcloud/newRoutes"
3639
metrics.OperationCalled.WithLabelValues(op).Inc()
3740

@@ -60,6 +63,7 @@ func newRoutes(client *hcloud.Client, networkID int64, clusterCIDR string, recor
6063
},
6164
clusterCIDR: cidr,
6265
recorder: recorder,
66+
nodeLister: nodeLister,
6367
}, nil
6468
}
6569

@@ -105,85 +109,132 @@ func (r *routes) CreateRoute(ctx context.Context, _ string, _ string, route *clo
105109
const op = "hcloud/CreateRoute"
106110
metrics.OperationCalled.WithLabelValues(op).Inc()
107111

108-
srv, err := r.serverCache.ByName(ctx, string(route.TargetNode))
112+
node, gateway, err := r.resolveRouteTarget(ctx, string(route.TargetNode))
109113
if err != nil {
110114
return fmt.Errorf("%s: %w", op, err)
111115
}
112116

113-
privNet, ok := findServerPrivateNetByID(srv, r.network.ID)
114-
if !ok {
115-
r.serverCache.InvalidateCache()
116-
srv, err = r.serverCache.ByName(ctx, string(route.TargetNode))
117-
if err != nil {
118-
return fmt.Errorf("%s: %w", op, err)
119-
}
120-
121-
privNet, ok = findServerPrivateNetByID(srv, r.network.ID)
122-
if !ok {
123-
return fmt.Errorf("%s: server %v: network with id %d not attached to this server", op, route.TargetNode, r.network.ID)
124-
}
117+
if !slices.ContainsFunc(route.TargetNodeAddresses, func(target corev1.NodeAddress) bool {
118+
return target.Type == corev1.NodeInternalIP && target.Address == gateway.String()
119+
}) {
120+
return fmt.Errorf("%s: IP %s not part of routes target addresses", op, gateway.String())
125121
}
126-
ip := privNet.IP
127122

128123
_, cidr, err := net.ParseCIDR(route.DestinationCIDR)
129124
if err != nil {
130125
return fmt.Errorf("%s: %w", op, err)
131126
}
132127

133-
clusterPrefixLen, _ := r.clusterCIDR.Mask.Size()
134-
destPrefixLen, _ := cidr.Mask.Size()
128+
r.warnCIDRMismatch(cidr, node)
135129

136-
if !r.clusterCIDR.Contains(cidr.IP) || destPrefixLen < clusterPrefixLen {
137-
node := &corev1.Node{
138-
ObjectMeta: metav1.ObjectMeta{
139-
Name: string(route.TargetNode),
140-
Namespace: "",
141-
},
142-
}
143-
// Event is only visible via `kubectl get events` and not `kubectl describe node`,
144-
// as we do not have the UID here and `kubectl describe node` filters by UID.
145-
// Because of this behavior we are also dispatching a log message.
146-
r.recorder.Eventf(
147-
node,
148-
corev1.EventTypeWarning,
149-
"ClusterCIDRMisconfigured",
150-
"route CIDR %s is not contained within cluster CIDR %s",
151-
route.DestinationCIDR,
152-
r.clusterCIDR.String(),
153-
)
154-
klog.Warningf(
155-
"route CIDR %s is not contained within cluster CIDR %s",
156-
route.DestinationCIDR,
157-
r.clusterCIDR.String(),
158-
)
130+
return r.upsertRoute(ctx, gateway, cidr, string(route.TargetNode))
131+
}
132+
133+
// resolveRouteTarget returns the k8s node and the hcloud server's private IP on the routes
134+
// network — everything needed to create a route for this node (gateway IP) and record events
135+
// against it (node).
136+
// Looks up the server by ProviderID to survive k8s node-name drift, with a ByName fallback for
137+
// ID changes (e.g. server recreated). Refreshes the cache once if the private-net attachment
138+
// isn't yet reflected.
139+
func (r *routes) resolveRouteTarget(ctx context.Context, nodeName string) (*corev1.Node, net.IP, error) {
140+
const op = "hcloud/resolveRouteTarget"
141+
metrics.OperationCalled.WithLabelValues(op).Inc()
142+
143+
node, err := r.nodeLister.Get(nodeName)
144+
if err != nil {
145+
return nil, nil, fmt.Errorf("%s: %w", op, err)
146+
}
147+
148+
if node.Spec.ProviderID == "" {
149+
return nil, nil, fmt.Errorf("%s: node %q not yet initialized", op, node.Name)
150+
}
151+
152+
id, isCloudServer, err := providerid.ToServerID(node.Spec.ProviderID)
153+
if err != nil {
154+
return nil, nil, fmt.Errorf("%s: %w", op, err)
155+
}
156+
if !isCloudServer {
157+
return nil, nil, fmt.Errorf("%s: node %q is not a Cloud server, routes are only supported for Cloud servers", op, node.Name)
159158
}
160159

161-
routeExists, err := r.checkIfRouteAlreadyExists(ctx, route)
160+
server, err := r.serverCache.ByID(ctx, id)
161+
if errors.Is(err, hcops.ErrNotFound) {
162+
server, err = r.serverCache.ByName(ctx, node.Name)
163+
}
162164
if err != nil {
165+
return nil, nil, fmt.Errorf("%s: %w", op, err)
166+
}
167+
168+
privNet, ok := findServerPrivateNetByID(server, r.network.ID)
169+
if !ok {
170+
r.serverCache.InvalidateCache()
171+
server, err = r.serverCache.ByID(ctx, server.ID)
172+
if err != nil {
173+
return nil, nil, fmt.Errorf("%s: %w", op, err)
174+
}
175+
privNet, ok = findServerPrivateNetByID(server, r.network.ID)
176+
if !ok {
177+
return nil, nil, fmt.Errorf("%s: server %q: network with id %d not attached to this server", op, node.Name, r.network.ID)
178+
}
179+
}
180+
181+
return node, privNet.IP, nil
182+
}
183+
184+
// upsertRoute ensures the hcloud network has a route for cidr pointing at gateway. A matching
185+
// route is a no-op; a stale route with a different gateway is replaced in place. nodeName is
186+
// used only for logging and for surfacing API conflicts against the right k8s object.
187+
func (r *routes) upsertRoute(ctx context.Context, gateway net.IP, cidr *net.IPNet, nodeName string) error {
188+
const op = "hcloud/upsertRoute"
189+
metrics.OperationCalled.WithLabelValues(op).Inc()
190+
191+
if err := r.reloadNetwork(ctx); err != nil {
163192
return fmt.Errorf("%s: %w", op, err)
164193
}
165194

166-
if routeExists {
195+
destination := cidr.String()
196+
existingIdx := slices.IndexFunc(r.network.Routes, func(nr hcloud.NetworkRoute) bool {
197+
return nr.Destination.String() == destination
198+
})
199+
if existingIdx >= 0 {
200+
existing := r.network.Routes[existingIdx]
201+
if existing.Gateway.Equal(gateway) {
202+
klog.InfoS(
203+
"route already exists: skipping creation",
204+
"target-node", nodeName,
205+
"destination-cidr", destination,
206+
)
207+
return nil
208+
}
209+
210+
action, _, err := r.client.Network.DeleteRoute(ctx, r.network, hcloud.NetworkDeleteRouteOpts{
211+
Route: existing,
212+
})
213+
if err != nil {
214+
return fmt.Errorf("%s: %w", op, err)
215+
}
216+
if err := r.client.Action.WaitFor(ctx, action); err != nil {
217+
return fmt.Errorf("%s: %w", op, err)
218+
}
167219
klog.InfoS(
168-
"route already exists: skipping creation",
169-
"target-node", route.TargetNode,
170-
"destination-cidr", route.DestinationCIDR,
220+
"deleted stale route with wrong gateway; recreating",
221+
"target-node", nodeName,
222+
"destination-cidr", destination,
171223
)
172-
return nil
173224
}
174225

175226
opts := hcloud.NetworkAddRouteOpts{
176227
Route: hcloud.NetworkRoute{
177228
Destination: cidr,
178-
Gateway: ip,
229+
Gateway: gateway,
179230
},
180231
}
181232
action, _, err := r.client.Network.AddRoute(ctx, r.network, opts)
182233
if err != nil {
183234
if hcloud.IsError(err, hcloud.ErrorCodeLocked, hcloud.ErrorCodeConflict) {
184235
return apierrors.NewConflict(
185236
corev1.Resource("nodes"),
186-
string(route.TargetNode),
237+
nodeName,
187238
err,
188239
)
189240
}
@@ -220,17 +271,6 @@ func (r *routes) DeleteRoute(ctx context.Context, _ string, route *cloudprovider
220271
return fmt.Errorf("%s: %w", op, err)
221272
}
222273

223-
err = r.deleteRouteFromHcloud(ctx, cidr, ip)
224-
if err != nil {
225-
return fmt.Errorf("%s: %w", op, err)
226-
}
227-
return nil
228-
}
229-
230-
func (r *routes) deleteRouteFromHcloud(ctx context.Context, cidr *net.IPNet, ip net.IP) error {
231-
const op = "hcloud/deleteRouteFromHcloud"
232-
metrics.OperationCalled.WithLabelValues(op).Inc()
233-
234274
opts := hcloud.NetworkDeleteRouteOpts{
235275
Route: hcloud.NetworkRoute{
236276
Destination: cidr,
@@ -272,43 +312,19 @@ func (r *routes) hcloudRouteToRoute(ctx context.Context, route hcloud.NetworkRou
272312
return cpRoute, nil
273313
}
274314

275-
func (r *routes) checkIfRouteAlreadyExists(ctx context.Context, route *cloudprovider.Route) (bool, error) {
276-
const op = "hcloud/checkIfRouteAlreadyExists"
277-
metrics.OperationCalled.WithLabelValues(op).Inc()
315+
func (r *routes) warnCIDRMismatch(cidr *net.IPNet, node *corev1.Node) {
316+
clusterPrefixLen, _ := r.clusterCIDR.Mask.Size()
317+
destPrefixLen, _ := cidr.Mask.Size()
278318

279-
if err := r.reloadNetwork(ctx); err != nil {
280-
return false, fmt.Errorf("%s: %w", op, err)
281-
}
282-
283-
for _, _route := range r.network.Routes {
284-
if _route.Destination.String() == route.DestinationCIDR {
285-
srv, err := r.serverCache.ByName(ctx, string(route.TargetNode))
286-
if err != nil {
287-
return false, fmt.Errorf("%s: %w", op, err)
288-
}
289-
privNet, ok := findServerPrivateNetByID(srv, r.network.ID)
290-
if !ok {
291-
return false, fmt.Errorf("%s: server %v: no network with id: %d", op, route.TargetNode, r.network.ID)
292-
}
293-
ip := privNet.IP
294-
295-
if !_route.Gateway.Equal(ip) {
296-
action, _, err := r.client.Network.DeleteRoute(ctx, r.network, hcloud.NetworkDeleteRouteOpts{
297-
Route: _route,
298-
})
299-
if err != nil {
300-
return false, fmt.Errorf("%s: %w", op, err)
301-
}
302-
303-
if err := r.client.Action.WaitFor(ctx, action); err != nil {
304-
return false, fmt.Errorf("%s: %w", op, err)
305-
}
306-
}
307-
308-
return true, nil
309-
}
319+
if !r.clusterCIDR.Contains(cidr.IP) || destPrefixLen < clusterPrefixLen {
320+
warnMsg := fmt.Sprintf(
321+
"route CIDR %s is not contained within cluster CIDR %s",
322+
cidr.String(),
323+
r.clusterCIDR.String(),
324+
)
325+
klog.Warning(warnMsg)
326+
r.recorder.Event(node, corev1.EventTypeWarning, "ClusterCIDRMisconfigured", warnMsg)
310327
}
311-
return false, nil
312328
}
313329

314330
func findServerPrivateNetByID(srv *hcloud.Server, id int64) (hcloud.ServerPrivateNet, bool) {

0 commit comments

Comments
 (0)