@@ -5,19 +5,21 @@ import (
55 "errors"
66 "fmt"
77 "net"
8+ "slices"
89 "time"
910
1011 "golang.org/x/time/rate"
1112 corev1 "k8s.io/api/core/v1"
1213 apierrors "k8s.io/apimachinery/pkg/api/errors"
13- metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
1414 "k8s.io/apimachinery/pkg/types"
15+ corelisters "k8s.io/client-go/listers/core/v1"
1516 "k8s.io/client-go/tools/record"
1617 cloudprovider "k8s.io/cloud-provider"
1718 "k8s.io/klog/v2"
1819
1920 "github.com/hetznercloud/hcloud-cloud-controller-manager/internal/hcops"
2021 "github.com/hetznercloud/hcloud-cloud-controller-manager/internal/metrics"
22+ "github.com/hetznercloud/hcloud-cloud-controller-manager/internal/providerid"
2123 "github.com/hetznercloud/hcloud-go/v2/hcloud"
2224)
2325
@@ -29,9 +31,10 @@ type routes struct {
2931 serverCache * hcops.AllServersCache
3032 clusterCIDR * net.IPNet
3133 recorder record.EventRecorder
34+ nodeLister corelisters.NodeLister
3235}
3336
34- func newRoutes (client * hcloud.Client , networkID int64 , clusterCIDR string , recorder record.EventRecorder ) (* routes , error ) {
37+ func newRoutes (client * hcloud.Client , networkID int64 , clusterCIDR string , recorder record.EventRecorder , nodeLister corelisters. NodeLister ) (* routes , error ) {
3538 const op = "hcloud/newRoutes"
3639 metrics .OperationCalled .WithLabelValues (op ).Inc ()
3740
@@ -60,6 +63,7 @@ func newRoutes(client *hcloud.Client, networkID int64, clusterCIDR string, recor
6063 },
6164 clusterCIDR : cidr ,
6265 recorder : recorder ,
66+ nodeLister : nodeLister ,
6367 }, nil
6468}
6569
@@ -105,85 +109,132 @@ func (r *routes) CreateRoute(ctx context.Context, _ string, _ string, route *clo
105109 const op = "hcloud/CreateRoute"
106110 metrics .OperationCalled .WithLabelValues (op ).Inc ()
107111
108- srv , err := r .serverCache . ByName (ctx , string (route .TargetNode ))
112+ node , gateway , err := r .resolveRouteTarget (ctx , string (route .TargetNode ))
109113 if err != nil {
110114 return fmt .Errorf ("%s: %w" , op , err )
111115 }
112116
113- privNet , ok := findServerPrivateNetByID (srv , r .network .ID )
114- if ! ok {
115- r .serverCache .InvalidateCache ()
116- srv , err = r .serverCache .ByName (ctx , string (route .TargetNode ))
117- if err != nil {
118- return fmt .Errorf ("%s: %w" , op , err )
119- }
120-
121- privNet , ok = findServerPrivateNetByID (srv , r .network .ID )
122- if ! ok {
123- return fmt .Errorf ("%s: server %v: network with id %d not attached to this server" , op , route .TargetNode , r .network .ID )
124- }
117+ if ! slices .ContainsFunc (route .TargetNodeAddresses , func (target corev1.NodeAddress ) bool {
118+ return target .Type == corev1 .NodeInternalIP && target .Address == gateway .String ()
119+ }) {
120+ return fmt .Errorf ("%s: IP %s not part of routes target addresses" , op , gateway .String ())
125121 }
126- ip := privNet .IP
127122
128123 _ , cidr , err := net .ParseCIDR (route .DestinationCIDR )
129124 if err != nil {
130125 return fmt .Errorf ("%s: %w" , op , err )
131126 }
132127
133- clusterPrefixLen , _ := r .clusterCIDR .Mask .Size ()
134- destPrefixLen , _ := cidr .Mask .Size ()
128+ r .warnCIDRMismatch (cidr , node )
135129
136- if ! r .clusterCIDR .Contains (cidr .IP ) || destPrefixLen < clusterPrefixLen {
137- node := & corev1.Node {
138- ObjectMeta : metav1.ObjectMeta {
139- Name : string (route .TargetNode ),
140- Namespace : "" ,
141- },
142- }
143- // Event is only visible via `kubectl get events` and not `kubectl describe node`,
144- // as we do not have the UID here and `kubectl describe node` filters by UID.
145- // Because of this behavior we are also dispatching a log message.
146- r .recorder .Eventf (
147- node ,
148- corev1 .EventTypeWarning ,
149- "ClusterCIDRMisconfigured" ,
150- "route CIDR %s is not contained within cluster CIDR %s" ,
151- route .DestinationCIDR ,
152- r .clusterCIDR .String (),
153- )
154- klog .Warningf (
155- "route CIDR %s is not contained within cluster CIDR %s" ,
156- route .DestinationCIDR ,
157- r .clusterCIDR .String (),
158- )
130+ return r .upsertRoute (ctx , gateway , cidr , string (route .TargetNode ))
131+ }
132+
133+ // resolveRouteTarget returns the k8s node and the hcloud server's private IP on the routes
134+ // network — everything needed to create a route for this node (gateway IP) and record events
135+ // against it (node).
136+ // Looks up the server by ProviderID to survive k8s node-name drift, with a ByName fallback for
137+ // ID changes (e.g. server recreated). Refreshes the cache once if the private-net attachment
138+ // isn't yet reflected.
139+ func (r * routes ) resolveRouteTarget (ctx context.Context , nodeName string ) (* corev1.Node , net.IP , error ) {
140+ const op = "hcloud/resolveRouteTarget"
141+ metrics .OperationCalled .WithLabelValues (op ).Inc ()
142+
143+ node , err := r .nodeLister .Get (nodeName )
144+ if err != nil {
145+ return nil , nil , fmt .Errorf ("%s: %w" , op , err )
146+ }
147+
148+ if node .Spec .ProviderID == "" {
149+ return nil , nil , fmt .Errorf ("%s: node %q not yet initialized" , op , node .Name )
150+ }
151+
152+ id , isCloudServer , err := providerid .ToServerID (node .Spec .ProviderID )
153+ if err != nil {
154+ return nil , nil , fmt .Errorf ("%s: %w" , op , err )
155+ }
156+ if ! isCloudServer {
157+ return nil , nil , fmt .Errorf ("%s: node %q is not a Cloud server, routes are only supported for Cloud servers" , op , node .Name )
159158 }
160159
161- routeExists , err := r .checkIfRouteAlreadyExists (ctx , route )
160+ server , err := r .serverCache .ByID (ctx , id )
161+ if errors .Is (err , hcops .ErrNotFound ) {
162+ server , err = r .serverCache .ByName (ctx , node .Name )
163+ }
162164 if err != nil {
165+ return nil , nil , fmt .Errorf ("%s: %w" , op , err )
166+ }
167+
168+ privNet , ok := findServerPrivateNetByID (server , r .network .ID )
169+ if ! ok {
170+ r .serverCache .InvalidateCache ()
171+ server , err = r .serverCache .ByID (ctx , server .ID )
172+ if err != nil {
173+ return nil , nil , fmt .Errorf ("%s: %w" , op , err )
174+ }
175+ privNet , ok = findServerPrivateNetByID (server , r .network .ID )
176+ if ! ok {
177+ return nil , nil , fmt .Errorf ("%s: server %q: network with id %d not attached to this server" , op , node .Name , r .network .ID )
178+ }
179+ }
180+
181+ return node , privNet .IP , nil
182+ }
183+
184+ // upsertRoute ensures the hcloud network has a route for cidr pointing at gateway. A matching
185+ // route is a no-op; a stale route with a different gateway is replaced in place. nodeName is
186+ // used only for logging and for surfacing API conflicts against the right k8s object.
187+ func (r * routes ) upsertRoute (ctx context.Context , gateway net.IP , cidr * net.IPNet , nodeName string ) error {
188+ const op = "hcloud/upsertRoute"
189+ metrics .OperationCalled .WithLabelValues (op ).Inc ()
190+
191+ if err := r .reloadNetwork (ctx ); err != nil {
163192 return fmt .Errorf ("%s: %w" , op , err )
164193 }
165194
166- if routeExists {
195+ destination := cidr .String ()
196+ existingIdx := slices .IndexFunc (r .network .Routes , func (nr hcloud.NetworkRoute ) bool {
197+ return nr .Destination .String () == destination
198+ })
199+ if existingIdx >= 0 {
200+ existing := r .network .Routes [existingIdx ]
201+ if existing .Gateway .Equal (gateway ) {
202+ klog .InfoS (
203+ "route already exists: skipping creation" ,
204+ "target-node" , nodeName ,
205+ "destination-cidr" , destination ,
206+ )
207+ return nil
208+ }
209+
210+ action , _ , err := r .client .Network .DeleteRoute (ctx , r .network , hcloud.NetworkDeleteRouteOpts {
211+ Route : existing ,
212+ })
213+ if err != nil {
214+ return fmt .Errorf ("%s: %w" , op , err )
215+ }
216+ if err := r .client .Action .WaitFor (ctx , action ); err != nil {
217+ return fmt .Errorf ("%s: %w" , op , err )
218+ }
167219 klog .InfoS (
168- "route already exists: skipping creation " ,
169- "target-node" , route . TargetNode ,
170- "destination-cidr" , route . DestinationCIDR ,
220+ "deleted stale route with wrong gateway; recreating " ,
221+ "target-node" , nodeName ,
222+ "destination-cidr" , destination ,
171223 )
172- return nil
173224 }
174225
175226 opts := hcloud.NetworkAddRouteOpts {
176227 Route : hcloud.NetworkRoute {
177228 Destination : cidr ,
178- Gateway : ip ,
229+ Gateway : gateway ,
179230 },
180231 }
181232 action , _ , err := r .client .Network .AddRoute (ctx , r .network , opts )
182233 if err != nil {
183234 if hcloud .IsError (err , hcloud .ErrorCodeLocked , hcloud .ErrorCodeConflict ) {
184235 return apierrors .NewConflict (
185236 corev1 .Resource ("nodes" ),
186- string ( route . TargetNode ) ,
237+ nodeName ,
187238 err ,
188239 )
189240 }
@@ -220,17 +271,6 @@ func (r *routes) DeleteRoute(ctx context.Context, _ string, route *cloudprovider
220271 return fmt .Errorf ("%s: %w" , op , err )
221272 }
222273
223- err = r .deleteRouteFromHcloud (ctx , cidr , ip )
224- if err != nil {
225- return fmt .Errorf ("%s: %w" , op , err )
226- }
227- return nil
228- }
229-
230- func (r * routes ) deleteRouteFromHcloud (ctx context.Context , cidr * net.IPNet , ip net.IP ) error {
231- const op = "hcloud/deleteRouteFromHcloud"
232- metrics .OperationCalled .WithLabelValues (op ).Inc ()
233-
234274 opts := hcloud.NetworkDeleteRouteOpts {
235275 Route : hcloud.NetworkRoute {
236276 Destination : cidr ,
@@ -272,43 +312,19 @@ func (r *routes) hcloudRouteToRoute(ctx context.Context, route hcloud.NetworkRou
272312 return cpRoute , nil
273313}
274314
275- func (r * routes ) checkIfRouteAlreadyExists ( ctx context. Context , route * cloudprovider. Route ) ( bool , error ) {
276- const op = "hcloud/checkIfRouteAlreadyExists"
277- metrics . OperationCalled . WithLabelValues ( op ). Inc ()
315+ func (r * routes ) warnCIDRMismatch ( cidr * net. IPNet , node * corev1. Node ) {
316+ clusterPrefixLen , _ := r . clusterCIDR . Mask . Size ()
317+ destPrefixLen , _ := cidr . Mask . Size ()
278318
279- if err := r .reloadNetwork (ctx ); err != nil {
280- return false , fmt .Errorf ("%s: %w" , op , err )
281- }
282-
283- for _ , _route := range r .network .Routes {
284- if _route .Destination .String () == route .DestinationCIDR {
285- srv , err := r .serverCache .ByName (ctx , string (route .TargetNode ))
286- if err != nil {
287- return false , fmt .Errorf ("%s: %w" , op , err )
288- }
289- privNet , ok := findServerPrivateNetByID (srv , r .network .ID )
290- if ! ok {
291- return false , fmt .Errorf ("%s: server %v: no network with id: %d" , op , route .TargetNode , r .network .ID )
292- }
293- ip := privNet .IP
294-
295- if ! _route .Gateway .Equal (ip ) {
296- action , _ , err := r .client .Network .DeleteRoute (ctx , r .network , hcloud.NetworkDeleteRouteOpts {
297- Route : _route ,
298- })
299- if err != nil {
300- return false , fmt .Errorf ("%s: %w" , op , err )
301- }
302-
303- if err := r .client .Action .WaitFor (ctx , action ); err != nil {
304- return false , fmt .Errorf ("%s: %w" , op , err )
305- }
306- }
307-
308- return true , nil
309- }
319+ if ! r .clusterCIDR .Contains (cidr .IP ) || destPrefixLen < clusterPrefixLen {
320+ warnMsg := fmt .Sprintf (
321+ "route CIDR %s is not contained within cluster CIDR %s" ,
322+ cidr .String (),
323+ r .clusterCIDR .String (),
324+ )
325+ klog .Warning (warnMsg )
326+ r .recorder .Event (node , corev1 .EventTypeWarning , "ClusterCIDRMisconfigured" , warnMsg )
310327 }
311- return false , nil
312328}
313329
314330func findServerPrivateNetByID (srv * hcloud.Server , id int64 ) (hcloud.ServerPrivateNet , bool ) {
0 commit comments