Skip to content

Commit 7053463

Browse files
authored
Fix backpressure metric capturing for Kubernetes API calls
1 parent a4d5b79 commit 7053463

File tree

3 files changed

+240
-33
lines changed

3 files changed

+240
-33
lines changed

frontend/csi/interceptor.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -83,8 +83,8 @@ func incomingRequestMetricsInterceptor(
8383
return handler(ctx, req)
8484
}
8585

86-
// timeoutInterceptor applies a default request timeout for Node and AllInOne deployments.
87-
// All RPCs served by these roles receive the timeout; Controller deployments are unaffected.
86+
// timeoutInterceptor applies a default request timeout for Node deployments.
87+
// All RPCs served by the Node role receive the timeout; Controller and AIO deployments are unaffected.
8888
// It must be the second from outermost interceptor in the chain so that downstream interceptors
8989
// capture the timeout-aware context.
9090
func timeoutInterceptor(

logging/transport.go

Lines changed: 32 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -43,45 +43,56 @@ func NewMetricsTransport(
4343

4444
// RoundTrip captures the time on the wire of a request. It does not capture the time
4545
// an HTTP client may be waiting for a lock or token.
46-
//
47-
// err serves double duty: the deferred recorder observes it for metrics,
48-
// while the Kubernetes backpressure path returns nil to the caller without
49-
// clearing err, so backpressure is still recorded without violating
50-
// RoundTripper semantics.
51-
func (m *MetricsTransport) RoundTrip(req *http.Request) (res *http.Response, err error) {
46+
func (m *MetricsTransport) RoundTrip(req *http.Request) (*http.Response, error) {
5247
// target: "kubernetes", "ontap", etc.,
5348
// host: "10.0.0.1", "10.0.0.2", etc.,
5449
// method: "GET", "POST", etc.
50+
// recErr is a separate local variable used by the deferred recorder so that the
51+
// Kubernetes backpressure path can return nil to the caller (RoundTripper semantics)
52+
// while still recording the backpressure error in metrics.
53+
var recErr error
5554
ctx, rec := NewContextBuilder(req.Context()).
5655
WithOutgoingAPIMetrics(m.target, req.URL.Host, req.Method).
5756
BuildContextAndTelemetry()
58-
defer rec(&err)
57+
defer rec(&recErr)
5958

60-
res, err = m.base.RoundTrip(req.WithContext(ctx))
59+
res, err := m.base.RoundTrip(req.WithContext(ctx))
6160
if err != nil {
62-
// Handle the special case where ONTAP or any other API doesn't honor HTTP status codes.
63-
// This is a workaround for ONTAP's EOF handling when the API gives up.
61+
recErr = err
62+
// Handle the special ONTAP case where the API may return EOF instead of
63+
// an HTTP backpressure status when the server gives up.
6464
if errors.Is(err, io.EOF) && m.target == ContextRequestTargetONTAP {
65-
err = errors.WrapWithServerBackPressureError(err, "received EOF from server")
65+
recErr = errors.WrapWithServerBackPressureError(err, "received EOF from server")
6666
}
67-
return res, err
67+
return res, recErr
6868
}
6969

7070
// Check for HTTP-level backpressure.
7171
switch res.StatusCode {
7272
case http.StatusTooManyRequests, http.StatusServiceUnavailable, http.StatusGatewayTimeout:
73-
err = errors.ServerBackPressureError("received status: %d from the server", res.StatusCode)
73+
// Set the recorder error to a backpressure error.
74+
// This should be set for all outgoing API requests that experience backpressure.
75+
// Some outgoing API requests handle backpressure differently, so switch on the target
76+
// to decide what to return.
77+
recErr = errors.ServerBackPressureError("received status: %d from the server", res.StatusCode)
7478

75-
// ONTAP callers (LimitedRetryTransport, go-openapi, azgo) gate on err != nil
76-
// and do not inspect the response status code on the error path.
77-
if m.target == ContextRequestTargetONTAP {
78-
return res, err
79+
switch m.target {
80+
case ContextRequestTargetONTAP:
81+
// This breaks the standard RoundTripper semantics but is necessary to handle
82+
// the special case where ONTAP returns an EOF error when the API is too busy.
83+
// ONTAP callers gate on err != nil and may not inspect the HTTP status code,
84+
// so return a non-nil backpressure error.
85+
return res, recErr
86+
case ContextRequestTargetKubernetes:
87+
// Kubernetes client-go depends on the response status, not the error, but
88+
// the metrics telemetry still needs to record the backpressure error.
89+
// Therefore, handle this case explicitly by returning nil to the caller
90+
// and recording the backpressure error in metrics.
91+
return res, nil
7992
}
80-
81-
// Kubernetes client-go depends on the response status, not the error.
82-
// Return the response with a nil error so client-go can inspect the status code.
83-
return res, nil
8493
}
8594

95+
// Preserve standard RoundTripper semantics: return the response with nil error
96+
// and let callers inspect the status code. recErr is captured independently for metrics.
8697
return res, nil
8798
}

logging/transport_test.go

Lines changed: 206 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ import (
77
stdErrors "errors"
88
"io"
99
"net/http"
10+
"strings"
1011
"testing"
1112

1213
"github.com/stretchr/testify/assert"
@@ -33,12 +34,21 @@ func makeResponse(statusCode int) *http.Response {
3334
return &http.Response{StatusCode: statusCode}
3435
}
3536

36-
// ---- ONTAP EOF handling ----
37+
// roundTrip is a shorthand that constructs a MetricsTransport from a stub and calls RoundTrip.
38+
// It uses a unique host derived from the test name so Prometheus counters never collide across tests,
39+
// while preserving the real target value for code-path correctness.
40+
func roundTrip(
41+
t *testing.T, target ContextRequestTarget, base *stubRoundTripper, method string,
42+
) (*http.Response, error, string, string, string) {
43+
t.Helper()
44+
host := strings.ReplaceAll(t.Name(), "/", "_")
45+
mt := &MetricsTransport{base: base, target: target}
46+
req := makeRequest(context.Background(), method, "https://"+host+"/path")
47+
resp, err := mt.RoundTrip(req)
48+
return resp, err, string(target), host, method
49+
}
3750

3851
func TestMetricsTransport_ONTAP_EOF_WrapsAsServerBackPressure(t *testing.T) {
39-
// ONTAP signals overload with a bare EOF instead of an HTTP status code.
40-
// MetricsTransport must wrap it as a ServerBackPressureError so the telemeter
41-
// increments the backpressure counter correctly.
4252
base := &stubRoundTripper{resp: nil, err: io.EOF}
4353
mt := NewMetricsTransport(base, WithMetricsTransportTarget(ContextRequestTargetONTAP))
4454
req := makeRequest(context.Background(), http.MethodGet, "https://ontap.local/cluster")
@@ -51,8 +61,6 @@ func TestMetricsTransport_ONTAP_EOF_WrapsAsServerBackPressure(t *testing.T) {
5161
}
5262

5363
func TestMetricsTransport_ONTAP_EOF_PreservesEOFInChain(t *testing.T) {
54-
// LimitedRetryTransport uses errors.Is(err, io.EOF) to decide whether to retry.
55-
// The wrapping must preserve io.EOF in the chain so retries still trigger.
5664
base := &stubRoundTripper{resp: nil, err: io.EOF}
5765
mt := NewMetricsTransport(base, WithMetricsTransportTarget(ContextRequestTargetONTAP))
5866
req := makeRequest(context.Background(), http.MethodGet, "https://ontap.local/cluster")
@@ -64,7 +72,6 @@ func TestMetricsTransport_ONTAP_EOF_PreservesEOFInChain(t *testing.T) {
6472
}
6573

6674
func TestMetricsTransport_NonONTAP_EOF_PassesThrough(t *testing.T) {
67-
// Only ONTAP gets EOF-to-backpressure wrapping; other targets pass EOF through unchanged.
6875
base := &stubRoundTripper{resp: nil, err: io.EOF}
6976
mt := NewMetricsTransport(base, WithMetricsTransportTarget(ContextRequestTargetKubernetes))
7077
req := makeRequest(context.Background(), http.MethodGet, "https://k8s.local/api/v1/pods")
@@ -166,10 +173,7 @@ func TestMetricsTransport_HTTP200_Success(t *testing.T) {
166173
assert.NoError(t, gotErr, "HTTP 200 should return no error")
167174
}
168175

169-
// ---- Non-backpressure transport errors pass through unchanged ----
170-
171176
func TestMetricsTransport_TransportError_PassesThrough(t *testing.T) {
172-
// Connection-level errors that are not EOF pass through as-is for all targets.
173177
someErr := stdErrors.New("connection refused")
174178
base := &stubRoundTripper{resp: nil, err: someErr}
175179
mt := NewMetricsTransport(base, WithMetricsTransportTarget(ContextRequestTargetUnknown))
@@ -181,3 +185,195 @@ func TestMetricsTransport_TransportError_PassesThrough(t *testing.T) {
181185
"non-EOF transport errors should be returned unchanged")
182186
assert.False(t, utilsErrors.IsServerBackPressureError(gotErr))
183187
}
188+
189+
func TestMetricsTransport_ONTAP_HTTP429_RecordsBackpressureMetric(t *testing.T) {
190+
_, _, target, host, method := roundTrip(
191+
t, ContextRequestTargetONTAP,
192+
&stubRoundTripper{resp: makeResponse(http.StatusTooManyRequests)},
193+
http.MethodGet,
194+
)
195+
196+
assert.Equal(t, float64(1),
197+
readCounter(outgoingAPIRequestBackpressureTotal, target, host, method),
198+
"backpressure counter must increment for ONTAP 429",
199+
)
200+
}
201+
202+
func TestMetricsTransport_Kubernetes_HTTP429_RecordsBackpressureMetric(t *testing.T) {
203+
resp, err, target, host, method := roundTrip(
204+
t, ContextRequestTargetKubernetes,
205+
&stubRoundTripper{resp: makeResponse(http.StatusTooManyRequests)},
206+
http.MethodGet,
207+
)
208+
209+
assert.NoError(t, err, "caller must get nil error per RoundTripper semantics")
210+
assert.Equal(t, http.StatusTooManyRequests, resp.StatusCode)
211+
assert.Equal(t, float64(1),
212+
readCounter(outgoingAPIRequestBackpressureTotal, target, host, method),
213+
"backpressure counter must increment for Kubernetes 429 even though caller gets nil error",
214+
)
215+
}
216+
217+
func TestMetricsTransport_Kubernetes_HTTP503_RecordsBackpressureMetric(t *testing.T) {
218+
resp, err, target, host, method := roundTrip(
219+
t, ContextRequestTargetKubernetes,
220+
&stubRoundTripper{resp: makeResponse(http.StatusServiceUnavailable)},
221+
http.MethodGet,
222+
)
223+
224+
assert.NoError(t, err)
225+
assert.Equal(t, http.StatusServiceUnavailable, resp.StatusCode)
226+
assert.Equal(t, float64(1),
227+
readCounter(outgoingAPIRequestBackpressureTotal, target, host, method),
228+
"backpressure counter must increment for Kubernetes 503",
229+
)
230+
}
231+
232+
func TestMetricsTransport_Kubernetes_HTTP504_RecordsBackpressureMetric(t *testing.T) {
233+
resp, err, target, host, method := roundTrip(
234+
t, ContextRequestTargetKubernetes,
235+
&stubRoundTripper{resp: makeResponse(http.StatusGatewayTimeout)},
236+
http.MethodGet,
237+
)
238+
239+
assert.NoError(t, err)
240+
assert.Equal(t, http.StatusGatewayTimeout, resp.StatusCode)
241+
assert.Equal(t, float64(1),
242+
readCounter(outgoingAPIRequestBackpressureTotal, target, host, method),
243+
"backpressure counter must increment for Kubernetes 504",
244+
)
245+
}
246+
247+
func TestMetricsTransport_Unknown_HTTP429_ReturnsNilError(t *testing.T) {
248+
resp, err, target, host, method := roundTrip(
249+
t, ContextRequestTargetUnknown,
250+
&stubRoundTripper{resp: makeResponse(http.StatusTooManyRequests)},
251+
http.MethodGet,
252+
)
253+
254+
assert.NoError(t, err,
255+
"unknown targets must return nil error to preserve RoundTripper semantics")
256+
assert.NotNil(t, resp)
257+
assert.Equal(t, http.StatusTooManyRequests, resp.StatusCode)
258+
assert.Equal(t, float64(1),
259+
readCounter(outgoingAPIRequestBackpressureTotal, target, host, method),
260+
"backpressure counter must still increment for unknown target 429",
261+
)
262+
}
263+
264+
func TestMetricsTransport_ONTAP_EOF_RecordsBackpressureMetric(t *testing.T) {
265+
_, _, target, host, method := roundTrip(
266+
t, ContextRequestTargetONTAP,
267+
&stubRoundTripper{err: io.EOF},
268+
http.MethodGet,
269+
)
270+
271+
assert.Equal(t, float64(1),
272+
readCounter(outgoingAPIRequestBackpressureTotal, target, host, method),
273+
"backpressure counter must increment for ONTAP EOF",
274+
)
275+
}
276+
277+
func TestMetricsTransport_HTTP200_NoBackpressureMetric(t *testing.T) {
278+
_, _, target, host, method := roundTrip(
279+
t, ContextRequestTargetONTAP,
280+
&stubRoundTripper{resp: makeResponse(http.StatusOK)},
281+
http.MethodGet,
282+
)
283+
284+
assert.Equal(t, float64(0),
285+
readCounter(outgoingAPIRequestBackpressureTotal, target, host, method),
286+
"backpressure counter must not increment for 200 OK",
287+
)
288+
}
289+
290+
func TestMetricsTransport_HTTP200_RecordsDurationAsSuccess(t *testing.T) {
291+
_, _, target, host, method := roundTrip(
292+
t, ContextRequestTargetONTAP,
293+
&stubRoundTripper{resp: makeResponse(http.StatusOK)},
294+
http.MethodGet,
295+
)
296+
297+
assert.Equal(t, uint64(1),
298+
readHistogramCount(outgoingAPIRequestDurationSeconds, metricStatusSuccess, target, host, method),
299+
)
300+
}
301+
302+
func TestMetricsTransport_ONTAP_HTTP429_RecordsDurationAsFailure(t *testing.T) {
303+
_, _, target, host, method := roundTrip(
304+
t, ContextRequestTargetONTAP,
305+
&stubRoundTripper{resp: makeResponse(http.StatusTooManyRequests)},
306+
http.MethodGet,
307+
)
308+
309+
assert.Equal(t, uint64(1),
310+
readHistogramCount(outgoingAPIRequestDurationSeconds, metricStatusFailure, target, host, method),
311+
"ONTAP 429 must be recorded as failure in the duration histogram",
312+
)
313+
}
314+
315+
func TestMetricsTransport_Kubernetes_HTTP429_RecordsDurationAsFailure(t *testing.T) {
316+
_, err, target, host, method := roundTrip(
317+
t, ContextRequestTargetKubernetes,
318+
&stubRoundTripper{resp: makeResponse(http.StatusTooManyRequests)},
319+
http.MethodGet,
320+
)
321+
322+
assert.NoError(t, err, "caller gets nil")
323+
assert.Equal(t, uint64(1),
324+
readHistogramCount(outgoingAPIRequestDurationSeconds, metricStatusFailure, target, host, method),
325+
"Kubernetes 429 must be recorded as failure in the duration histogram even though caller gets nil",
326+
)
327+
}
328+
329+
func TestMetricsTransport_TransportError_RecordsDurationAsFailure(t *testing.T) {
330+
_, _, target, host, method := roundTrip(
331+
t, ContextRequestTargetUnknown,
332+
&stubRoundTripper{err: stdErrors.New("connection refused")},
333+
http.MethodGet,
334+
)
335+
336+
assert.Equal(t, uint64(1),
337+
readHistogramCount(outgoingAPIRequestDurationSeconds, metricStatusFailure, target, host, method),
338+
"transport errors must be recorded as failure",
339+
)
340+
}
341+
342+
func TestMetricsTransport_InFlight_ReturnsToZeroAfterSuccess(t *testing.T) {
343+
_, _, target, host, method := roundTrip(
344+
t, ContextRequestTargetONTAP,
345+
&stubRoundTripper{resp: makeResponse(http.StatusOK)},
346+
http.MethodGet,
347+
)
348+
349+
assert.Equal(t, float64(0),
350+
readGauge(outgoingAPIRequestsInFlight, target, host, method),
351+
"in-flight gauge must return to zero after a completed request",
352+
)
353+
}
354+
355+
func TestMetricsTransport_InFlight_ReturnsToZeroAfterError(t *testing.T) {
356+
_, _, target, host, method := roundTrip(
357+
t, ContextRequestTargetUnknown,
358+
&stubRoundTripper{err: stdErrors.New("fail")},
359+
http.MethodGet,
360+
)
361+
362+
assert.Equal(t, float64(0),
363+
readGauge(outgoingAPIRequestsInFlight, target, host, method),
364+
"in-flight gauge must return to zero after an error",
365+
)
366+
}
367+
368+
func TestMetricsTransport_InFlight_ReturnsToZeroAfterKubernetesBackpressure(t *testing.T) {
369+
_, _, target, host, method := roundTrip(
370+
t, ContextRequestTargetKubernetes,
371+
&stubRoundTripper{resp: makeResponse(http.StatusTooManyRequests)},
372+
http.MethodGet,
373+
)
374+
375+
assert.Equal(t, float64(0),
376+
readGauge(outgoingAPIRequestsInFlight, target, host, method),
377+
"in-flight gauge must return to zero after Kubernetes backpressure",
378+
)
379+
}

0 commit comments

Comments
 (0)