Skip to content

Commit 64cebc4

Browse files
authored
fix(route): error handling on locked networks (#1215)
If an error code `conflict` or `locked` is returned when adding a new route, the retry mechanism of HCCM sleeps the current goroutine for five seconds and calls the `CreateRoute` method recursively, without a max retry counter. This behavior is inefficient and prone to error. Additionally, the `k8s.io/cloud-provider` library already performs retry handling on errors. Doing this twice might result in a fast rate-limit exhaustion. I propose to remove retry handling from our side and let `k8s.io/cloud-provider` handle this.
1 parent 82fcf7f commit 64cebc4

2 files changed

Lines changed: 35 additions & 38 deletions

File tree

hcloud/routes.go

Lines changed: 34 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ import (
99

1010
"golang.org/x/time/rate"
1111
corev1 "k8s.io/api/core/v1"
12+
apierrors "k8s.io/apimachinery/pkg/api/errors"
1213
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
1314
"k8s.io/apimachinery/pkg/types"
1415
"k8s.io/client-go/tools/record"
@@ -20,9 +21,7 @@ import (
2021
"github.com/hetznercloud/hcloud-go/v2/hcloud"
2122
)
2223

23-
var (
24-
serversCacheMissRefreshRate = rate.Every(30 * time.Second)
25-
)
24+
var serversCacheMissRefreshRate = rate.Every(30 * time.Second)
2625

2726
type routes struct {
2827
client *hcloud.Client
@@ -102,7 +101,7 @@ func (r *routes) ListRoutes(ctx context.Context, _ string) ([]*cloudprovider.Rou
102101
// CreateRoute creates the described managed route
103102
// route.Name will be ignored, although the cloud-provider may use nameHint
104103
// to create a more user-meaningful name.
105-
func (r *routes) CreateRoute(ctx context.Context, clusterName string, nameHint string, route *cloudprovider.Route) error {
104+
func (r *routes) CreateRoute(ctx context.Context, _ string, _ string, route *cloudprovider.Route) error {
106105
const op = "hcloud/CreateRoute"
107106
metrics.OperationCalled.WithLabelValues(op).Inc()
108107

@@ -131,10 +130,10 @@ func (r *routes) CreateRoute(ctx context.Context, clusterName string, nameHint s
131130
return fmt.Errorf("%s: %w", op, err)
132131
}
133132

134-
clusterNetSize, _ := r.clusterCIDR.Mask.Size()
135-
destNetSize, _ := cidr.Mask.Size()
133+
clusterPrefixLen, _ := r.clusterCIDR.Mask.Size()
134+
destPrefixLen, _ := cidr.Mask.Size()
136135

137-
if !r.clusterCIDR.Contains(cidr.IP) || destNetSize < clusterNetSize {
136+
if !r.clusterCIDR.Contains(cidr.IP) || destPrefixLen < clusterPrefixLen {
138137
node := &corev1.Node{
139138
ObjectMeta: metav1.ObjectMeta{
140139
Name: string(route.TargetNode),
@@ -159,35 +158,42 @@ func (r *routes) CreateRoute(ctx context.Context, clusterName string, nameHint s
159158
)
160159
}
161160

162-
doesRouteAlreadyExist, err := r.checkIfRouteAlreadyExists(ctx, route)
161+
routeExists, err := r.checkIfRouteAlreadyExists(ctx, route)
163162
if err != nil {
164163
return fmt.Errorf("%s: %w", op, err)
165164
}
166165

167-
if !doesRouteAlreadyExist {
168-
opts := hcloud.NetworkAddRouteOpts{
169-
Route: hcloud.NetworkRoute{
170-
Destination: cidr,
171-
Gateway: ip,
172-
},
173-
}
174-
action, _, err := r.client.Network.AddRoute(ctx, r.network, opts)
175-
if err != nil {
176-
if hcloud.IsError(err, hcloud.ErrorCodeLocked, hcloud.ErrorCodeConflict) {
177-
retryDelay := time.Second * 5
178-
klog.InfoS("retry due to conflict or lock",
179-
"op", op, "delay", fmt.Sprintf("%v", retryDelay), "err", fmt.Sprintf("%v", err))
180-
time.Sleep(retryDelay)
166+
if routeExists {
167+
klog.InfoS(
168+
"route already exists: skipping creation",
169+
"target-node", route.TargetNode,
170+
"destination-cidr", route.DestinationCIDR,
171+
)
172+
return nil
173+
}
181174

182-
return r.CreateRoute(ctx, clusterName, nameHint, route)
183-
}
184-
return fmt.Errorf("%s: %w", op, err)
175+
opts := hcloud.NetworkAddRouteOpts{
176+
Route: hcloud.NetworkRoute{
177+
Destination: cidr,
178+
Gateway: ip,
179+
},
180+
}
181+
action, _, err := r.client.Network.AddRoute(ctx, r.network, opts)
182+
if err != nil {
183+
if hcloud.IsError(err, hcloud.ErrorCodeLocked, hcloud.ErrorCodeConflict) {
184+
return apierrors.NewConflict(
185+
corev1.Resource("nodes"),
186+
string(route.TargetNode),
187+
err,
188+
)
185189
}
190+
return fmt.Errorf("%s: %w", op, err)
191+
}
186192

187-
if err := r.client.Action.WaitFor(ctx, action); err != nil {
188-
return fmt.Errorf("%s: %w", op, err)
189-
}
193+
if err := r.client.Action.WaitFor(ctx, action); err != nil {
194+
return fmt.Errorf("%s: %w", op, err)
190195
}
196+
191197
return nil
192198
}
193199

@@ -234,14 +240,6 @@ func (r *routes) deleteRouteFromHcloud(ctx context.Context, cidr *net.IPNet, ip
234240

235241
action, _, err := r.client.Network.DeleteRoute(ctx, r.network, opts)
236242
if err != nil {
237-
if hcloud.IsError(err, hcloud.ErrorCodeLocked, hcloud.ErrorCodeConflict) {
238-
retryDelay := time.Second * 5
239-
klog.InfoS("retry due to conflict or lock",
240-
"op", op, "delay", fmt.Sprintf("%v", retryDelay), "err", fmt.Sprintf("%v", err))
241-
time.Sleep(retryDelay)
242-
243-
return r.deleteRouteFromHcloud(ctx, cidr, ip)
244-
}
245243
return fmt.Errorf("%s: %w", op, err)
246244
}
247245
if err := r.client.Action.WaitFor(ctx, action); err != nil {

internal/metrics/metrics.go

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,8 +22,7 @@ import (
2222

2323
"github.com/prometheus/client_golang/prometheus"
2424
"k8s.io/component-base/metrics/legacyregistry"
25-
// Initialize cloud-provider internal metrics (e.g. workqueue).
26-
_ "k8s.io/component-base/metrics/prometheus/clientgo"
25+
_ "k8s.io/component-base/metrics/prometheus/clientgo" // Initialize cloud-provider internal metrics (e.g. workqueue).
2726
"k8s.io/klog/v2"
2827
)
2928

0 commit comments

Comments
 (0)