Skip to content

Commit 76947fd

Browse files
sandy2008claude
andauthored
[BUGFIX] Ingester: release TSDB appender on early-return paths in Push (#7528)
* [BUGFIX] Ingester: release TSDB appender on early-return paths in Push Push acquired a TSDB appender at the start of the function and, on early returns (e.g. out-of-order label set, hard append errors, Commit failure), could return without calling Commit or Rollback. This leaked TSDB head series references, mmap'd chunks, and pending appender state, observable as monotonic growth of cortex_ingester_tsdb_head_active_appenders. Defer Rollback right after the appender is acquired and skip it on the success path. `committed` is flipped to true before app.Commit() because Prometheus closes the appender even on Commit failure (internal self-rollback on WAL error), so the deferred Rollback must not run afterwards. The two now-redundant explicit Rollback calls inside the sample / histogram failure paths are removed and rely on the same defer. Add a regression test asserting cortex_ingester_tsdb_head_active_appenders stays at 0 after repeated out-of-order-label-set pushes. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> Signed-off-by: Sandy Chen <Yuxuan.Chen@morganstanley.com> * [BUGFIX] Ingester: modernize for-loop in regression test Address `make check-modernize` lint failure by using Go 1.22+ `for range N` syntax instead of `for n := 0; n < numLeakyPushes; n++`. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> Signed-off-by: Sandy Chen <Yuxuan.Chen@morganstanley.com> --------- Signed-off-by: Sandy Chen <Yuxuan.Chen@morganstanley.com> Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 85df243 commit 76947fd

3 files changed

Lines changed: 83 additions & 9 deletions

File tree

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@
3939
* [BUGFIX] Security: Fix stored XSS vulnerability in Alertmanager and Store Gateway status pages by replacing `text/template` with `html/template`. #7512
4040
* [BUGFIX] Security: Limit decompressed gzip output in `ParseProtoReader` and OTLP ingestion path. The decompressed body is now capped by `-distributor.otlp-max-recv-msg-size`. #7515
4141
* [BUGFIX] Tenant Federation: Fix regex resolver clearing known users list when user scan fails. #7534
42+
* [BUGFIX] Ingester: Release the TSDB appender on every early-return path in `Push` (e.g. out-of-order label set) by deferring `Rollback`. Previously such requests leaked TSDB head series references, mmap'd chunks and pending state per request, causing the `cortex_ingester_tsdb_head_active_appenders` gauge to grow unbounded. #7528
4243

4344
## 1.21.0 2026-04-24
4445

pkg/ingester/ingester.go

Lines changed: 24 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1437,6 +1437,22 @@ func (i *Ingester) Push(ctx context.Context, req *cortexpb.WriteRequest) (*corte
14371437
// Walk the samples, appending them to the users database
14381438
app := db.Appender(ctx).(extendedAppender)
14391439

1440+
// Ensure the appender is always released so that we don't leak TSDB head
1441+
// series references, mmap'd chunks and pending state on early returns.
1442+
// `committed` is flipped to true immediately before app.Commit() because
1443+
// Prometheus closes the appender even on Commit failure (it self-rolls
1444+
// back internally on WAL error), so the deferred Rollback must not run
1445+
// afterwards.
1446+
committed := false
1447+
defer func() {
1448+
if committed {
1449+
return
1450+
}
1451+
if rollbackErr := app.Rollback(); rollbackErr != nil {
1452+
level.Warn(logutil.WithContext(ctx, i.logger)).Log("msg", "failed to rollback appender on early return", "user", userID, "err", rollbackErr)
1453+
}
1454+
}()
1455+
14401456
// Even when OOO is enabled globally, we want to reject OOO samples in some cases.
14411457
// prometheus implementation: https://github.com/prometheus/prometheus/pull/14710
14421458
if req.DiscardOutOfOrder {
@@ -1505,11 +1521,8 @@ func (i *Ingester) Push(ctx context.Context, req *cortexpb.WriteRequest) (*corte
15051521
if rollback := handleAppendFailure(err, s.TimestampMs, ts.Labels, copiedLabels, matchedLabelSetLimits); !rollback {
15061522
continue
15071523
}
1508-
// The error looks an issue on our side, so we should rollback
1509-
if rollbackErr := app.Rollback(); rollbackErr != nil {
1510-
level.Warn(logutil.WithContext(ctx, i.logger)).Log("msg", "failed to rollback on error", "user", userID, "err", rollbackErr)
1511-
}
1512-
1524+
// The error looks an issue on our side, so we should rollback.
1525+
// The deferred rollback above will close the appender; nothing to do here.
15131526
return nil, wrapWithUser(err, userID)
15141527
}
15151528

@@ -1560,10 +1573,8 @@ func (i *Ingester) Push(ctx context.Context, req *cortexpb.WriteRequest) (*corte
15601573
if rollback := handleAppendFailure(err, hp.TimestampMs, ts.Labels, copiedLabels, matchedLabelSetLimits); !rollback {
15611574
continue
15621575
}
1563-
// The error looks an issue on our side, so we should rollback
1564-
if rollbackErr := app.Rollback(); rollbackErr != nil {
1565-
level.Warn(logutil.WithContext(ctx, i.logger)).Log("msg", "failed to rollback on error", "user", userID, "err", rollbackErr)
1566-
}
1576+
// The error looks an issue on our side, so we should rollback.
1577+
// The deferred rollback above will close the appender; nothing to do here.
15671578
return nil, wrapWithUser(err, userID)
15681579
}
15691580
} else {
@@ -1626,6 +1637,10 @@ func (i *Ingester) Push(ctx context.Context, req *cortexpb.WriteRequest) (*corte
16261637
}
16271638

16281639
startCommit := time.Now()
1640+
// Mark committed before calling Commit: Prometheus closes the appender on
1641+
// both success and failure of Commit (it self-rolls-back on WAL error), so
1642+
// the deferred Rollback must not fire afterwards.
1643+
committed = true
16291644
if err := app.Commit(); err != nil {
16301645
return nil, wrapWithUser(err, userID)
16311646
}

pkg/ingester/ingester_test.go

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2833,6 +2833,64 @@ func TestIngester_Push_OutOfOrderLabels(t *testing.T) {
28332833
require.NoError(t, err)
28342834
}
28352835

2836+
// TestIngester_Push_OutOfOrderLabels_AppenderNotLeaked verifies that when Push
2837+
// returns early because of an out-of-order label set (and on any other early
2838+
// return after the appender is acquired) the appender is released via
2839+
// Rollback. Otherwise the TSDB head leaks series references, mmap'd chunks and
2840+
// pending state on every such request; observable via the
2841+
// cortex_ingester_tsdb_head_active_appenders gauge.
2842+
func TestIngester_Push_OutOfOrderLabels_AppenderNotLeaked(t *testing.T) {
2843+
cfg := defaultIngesterTestConfig(t)
2844+
r := prometheus.NewRegistry()
2845+
i, err := prepareIngesterWithBlocksStorage(t, cfg, r)
2846+
require.NoError(t, err)
2847+
require.NoError(t, services.StartAndAwaitRunning(context.Background(), i))
2848+
defer services.StopAndAwaitTerminated(context.Background(), i) //nolint:errcheck
2849+
2850+
// Wait until it's ACTIVE.
2851+
test.Poll(t, time.Second, ring.ACTIVE, func() any {
2852+
return i.lifecycler.GetState()
2853+
})
2854+
2855+
ctx := user.InjectOrgID(context.Background(), "test-user")
2856+
2857+
// First push a valid sample to initialise the user TSDB head so that
2858+
// subsequent Push() calls take the headAppender path.
2859+
validLabels := labels.FromStrings(labels.MetricName, "test_metric", "a", "1", "b", "2")
2860+
validReq, _ := mockWriteRequest(t, validLabels, 1, 1)
2861+
_, err = i.Push(ctx, validReq)
2862+
require.NoError(t, err)
2863+
2864+
// Sanity check: no appenders are currently active.
2865+
const activeAppendersMetric = "cortex_ingester_tsdb_head_active_appenders"
2866+
expectedZero := `
2867+
# HELP cortex_ingester_tsdb_head_active_appenders Number of currently active TSDB appender transactions.
2868+
# TYPE cortex_ingester_tsdb_head_active_appenders gauge
2869+
cortex_ingester_tsdb_head_active_appenders 0
2870+
`
2871+
require.NoError(t, testutil.GatherAndCompare(r, bytes.NewBufferString(expectedZero), activeAppendersMetric))
2872+
2873+
// Now push a series of requests with an out-of-order label set. Each
2874+
// such request creates an appender that, without the leak fix, is never
2875+
// released, leaving the active-appenders gauge growing.
2876+
outOfOrderLabels := []cortexpb.LabelAdapter{
2877+
{Name: labels.MetricName, Value: "test_metric"},
2878+
{Name: "c", Value: "3"},
2879+
{Name: "a", Value: "1"},
2880+
}
2881+
const numLeakyPushes = 5
2882+
for n := range numLeakyPushes {
2883+
req, _ := mockWriteRequest(t, cortexpb.FromLabelAdaptersToLabels(outOfOrderLabels), 1, int64(2+n))
2884+
_, err = i.Push(ctx, req)
2885+
require.Error(t, err)
2886+
require.Contains(t, err.Error(), "out-of-order label set found")
2887+
}
2888+
2889+
// The active-appenders gauge must still be 0 — every appender created by
2890+
// the early-returning Push() must have been released.
2891+
require.NoError(t, testutil.GatherAndCompare(r, bytes.NewBufferString(expectedZero), activeAppendersMetric))
2892+
}
2893+
28362894
func BenchmarkIngesterPush(b *testing.B) {
28372895
limits := defaultLimitsTestConfig()
28382896
benchmarkIngesterPush(b, limits, false)

0 commit comments

Comments
 (0)