From d40ab0739eb678e9a3c662ade59385804685cc7e Mon Sep 17 00:00:00 2001 From: Jens Neuse Date: Wed, 6 May 2026 22:47:32 +0200 Subject: [PATCH] perf: forward-only cursor in monotonicArena.Alloc MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Alloc walked a.buffers from index 0 on every call, giving O(numBuffers) cost per Alloc and O(N²) total work over an arena's lifetime. On the Cosmo Router workload reported in #2 (~180MB JSON response, ~600-1200 buffers, ~29M Allocs per request), this dominated request time at ~40s of router-side merge. Track the index of the most recent successful Alloc and start subsequent walks there. Cursor advances on a later-buffer hit and on grow; Reset and Release rewind it to 0 so a reused arena can re-fill its early buffers from scratch. For roughly uniform-size allocations the per-call cost becomes O(1); for mixed sizes the walk is bounded by the number of buffers ahead of the cursor. Benchmarks (controlled prefix, isolated walk cost): prefix=10: 17.5 ns/op → 2.7 ns/op (6.5x) prefix=100: 149 ns/op → 2.6 ns/op (57x) prefix=1000: 1293 ns/op → 2.6 ns/op (497x) Realistic growth workload (AllocCosmoLike): prefix=10: 5125 ns/op → 3.4 ns/op (1500x) prefix=100: 5265 ns/op → 4.0 ns/op (1300x) prefix=1000: 2785 ns/op → 4.0 ns/op (700x) Closes #2. Credit: original analysis and patch by @thoec. Co-Authored-By: Claude Opus 4.7 (1M context) --- monotonic_arena.go | 15 +++- monotonic_arena_test.go | 176 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 190 insertions(+), 1 deletion(-) diff --git a/monotonic_arena.go b/monotonic_arena.go index c13d74f..0003cdb 100644 --- a/monotonic_arena.go +++ b/monotonic_arena.go @@ -12,6 +12,15 @@ type monotonicArena struct { peak uintptr // tracks peak allocated space minBufferSize uintptr // minimum size for new buffers initialBufferCount int // number of initial buffers to create + // cursor is the index of the buffer where the most recent Alloc found + // space. Subsequent Allocs start their walk at cursor instead of index 0, + // skipping buffers that earlier Allocs have already exhausted. The cursor + // only advances; once past, a buffer's remaining free space is no longer + // searched for the rest of the request. Reset and Release rewind cursor + // to 0 so a reused arena can re-fill its early buffers from scratch. For + // allocations of roughly uniform size that still fit at cursor, this + // reduces per-Alloc cost from O(len(buffers)) to O(1). + cursor int } type monotonicBuffer struct { @@ -134,9 +143,10 @@ func (a *monotonicArena) Alloc(size, alignment uintptr) unsafe.Pointer { if size == 0 { return nil } - for i := 0; i < len(a.buffers); i++ { + for i := a.cursor; i < len(a.buffers); i++ { ptr, consumed, ok := a.buffers[i].alloc(size, alignment) if ok { + a.cursor = i a.totalAlloc += consumed if a.totalAlloc > a.peak { a.peak = a.totalAlloc @@ -172,6 +182,7 @@ func (a *monotonicArena) Alloc(size, alignment uintptr) unsafe.Pointer { newBuffer := newMonotonicBuffer(int(newBufferSize)) a.buffers = append(a.buffers, newBuffer) + a.cursor = len(a.buffers) - 1 ptr, consumed, _ := newBuffer.alloc(size, alignment) @@ -189,6 +200,7 @@ func (a *monotonicArena) Reset() { s.reset() } a.totalAlloc = 0 + a.cursor = 0 } // Release satisfies the Arena interface. @@ -197,6 +209,7 @@ func (a *monotonicArena) Release() { s.release() } a.totalAlloc = 0 + a.cursor = 0 } // Len returns the total number of bytes currently allocated in the arena. diff --git a/monotonic_arena_test.go b/monotonic_arena_test.go index 42adc91..11ba92d 100644 --- a/monotonic_arena_test.go +++ b/monotonic_arena_test.go @@ -3,6 +3,7 @@ package arena import ( + "fmt" "testing" "unsafe" @@ -875,6 +876,181 @@ func TestMonotonicArenaInitialBufferCountAllocation(t *testing.T) { require.True(t, len(arena.(*monotonicArena).buffers) >= 3) } +// TestMonotonicArenaCursorStartsAtZero verifies the cursor is zero on a fresh +// arena, before any Alloc has succeeded. +func TestMonotonicArenaCursorStartsAtZero(t *testing.T) { + arena := NewMonotonicArena(WithInitialBufferCount(1), WithMinBufferSize(64)).(*monotonicArena) + require.Equal(t, 0, arena.cursor) +} + +// TestMonotonicArenaCursorStaysOnSameBufferHit verifies that successive allocs +// landing in the same buffer leave the cursor unchanged. +func TestMonotonicArenaCursorStaysOnSameBufferHit(t *testing.T) { + arena := NewMonotonicArena(WithInitialBufferCount(1), WithMinBufferSize(1024)).(*monotonicArena) + + require.NotNil(t, arena.Alloc(8, 1)) + require.Equal(t, 0, arena.cursor) + require.NotNil(t, arena.Alloc(8, 1)) + require.Equal(t, 0, arena.cursor) + require.NotNil(t, arena.Alloc(8, 1)) + require.Equal(t, 0, arena.cursor) +} + +// TestMonotonicArenaCursorAdvancesOnGrow verifies that growing the arena (when +// no existing buffer can fit the request) advances the cursor to the new +// trailing buffer. +func TestMonotonicArenaCursorAdvancesOnGrow(t *testing.T) { + arena := NewMonotonicArena(WithInitialBufferCount(1), WithMinBufferSize(64)).(*monotonicArena) + + require.NotNil(t, arena.Alloc(64, 1)) + require.Equal(t, 0, arena.cursor) + require.Equal(t, 1, len(arena.buffers)) + + require.NotNil(t, arena.Alloc(64, 1)) + require.Equal(t, 1, arena.cursor) + require.Equal(t, 2, len(arena.buffers)) + + require.NotNil(t, arena.Alloc(64, 1)) + require.Equal(t, 2, arena.cursor) + require.Equal(t, 3, len(arena.buffers)) +} + +// TestMonotonicArenaCursorAdvancesPastFullBuffersToLaterBuffer verifies that +// when the cursor's buffer is full but a later existing buffer has space, the +// cursor advances to the later buffer that satisfied the alloc. +func TestMonotonicArenaCursorAdvancesPastFullBuffersToLaterBuffer(t *testing.T) { + arena := NewMonotonicArena(WithInitialBufferCount(3), WithMinBufferSize(64)).(*monotonicArena) + require.Equal(t, 3, len(arena.buffers)) + + arena.buffers[0].alloc(64, 1) + arena.buffers[1].alloc(64, 1) + require.Equal(t, 0, arena.cursor) + + require.NotNil(t, arena.Alloc(8, 1)) + require.Equal(t, 2, arena.cursor) + require.Equal(t, 3, len(arena.buffers)) +} + +// TestMonotonicArenaCursorRewindsOnReset verifies that Reset returns the +// cursor to zero so subsequent allocs can reuse early buffers. +func TestMonotonicArenaCursorRewindsOnReset(t *testing.T) { + arena := NewMonotonicArena(WithInitialBufferCount(1), WithMinBufferSize(64)).(*monotonicArena) + + require.NotNil(t, arena.Alloc(64, 1)) + require.NotNil(t, arena.Alloc(64, 1)) + require.NotNil(t, arena.Alloc(64, 1)) + require.Equal(t, 2, arena.cursor) + + arena.Reset() + require.Equal(t, 0, arena.cursor) + + require.NotNil(t, arena.Alloc(8, 1)) + require.Equal(t, 0, arena.cursor) +} + +// TestMonotonicArenaCursorRewindsOnRelease verifies that Release returns the +// cursor to zero so the arena starts fresh after release. +func TestMonotonicArenaCursorRewindsOnRelease(t *testing.T) { + arena := NewMonotonicArena(WithInitialBufferCount(1), WithMinBufferSize(64)).(*monotonicArena) + + require.NotNil(t, arena.Alloc(64, 1)) + require.NotNil(t, arena.Alloc(64, 1)) + require.Equal(t, 1, arena.cursor) + + arena.Release() + require.Equal(t, 0, arena.cursor) +} + +// TestMonotonicArenaCursorDoesNotRescanFullBuffers verifies the regression +// bound: after the arena has grown, the cursor pins subsequent allocs to the +// trailing buffer instead of rescanning full buffers from index 0. This is +// the optimization that closes issue #2. +func TestMonotonicArenaCursorDoesNotRescanFullBuffers(t *testing.T) { + arena := NewMonotonicArena(WithInitialBufferCount(1), WithMinBufferSize(64)).(*monotonicArena) + + for range 10 { + require.NotNil(t, arena.Alloc(64, 1)) + } + require.Equal(t, 10, len(arena.buffers)) + require.Equal(t, 9, arena.cursor) + + // First small alloc forces a grow (buffer[9] is full from the 10th + // exact-fill alloc above). The new trailing buffer becomes the cursor. + require.NotNil(t, arena.Alloc(8, 1)) + require.Equal(t, 11, len(arena.buffers)) + require.Equal(t, 10, arena.cursor) + + // All subsequent small allocs land in buffer[10] without rescanning the + // 10 full buffers in front of it. The cursor stays put until that + // buffer fills. + for range 7 { + require.NotNil(t, arena.Alloc(8, 1)) + require.Equal(t, 10, arena.cursor) + require.Equal(t, 11, len(arena.buffers)) + } +} + +// BenchmarkMonotonicArenaAllocAfterFullBuffers reproduces the Cosmo Router +// hot path from issue #2: an arena with many full buffers, where every Alloc +// walks past them to find space in the trailing buffer. The unpatched code +// walks a.buffers from index 0 on every call, giving O(numBuffers) cost +// per alloc and O(N²) total work over the lifetime of an arena that grows to +// N buffers. The cursor optimization makes each alloc O(1) regardless of the +// prefix length. +// +// Setup: pre-fill numBuffers buffers exactly to capacity, then manually +// append a single large trailing buffer. The timed loop recycles only the +// trailing buffer's offset on overflow, so the total buffer count stays fixed +// at numBuffers+1 and the prefix-walk cost dominates the measurement instead +// of being diluted by buffer-count drift. +func BenchmarkMonotonicArenaAllocAfterFullBuffers(b *testing.B) { + const bufSize = 1024 + const allocSize = 8 + const trailingSize = 8 * 1024 * 1024 + for _, numBuffers := range []int{10, 100, 1000} { + b.Run(fmt.Sprintf("prefix=%d", numBuffers), func(b *testing.B) { + arena := NewMonotonicArena(WithInitialBufferCount(0), WithMinBufferSize(bufSize)).(*monotonicArena) + for range numBuffers { + _ = arena.Alloc(bufSize, 1) + } + arena.buffers = append(arena.buffers, newMonotonicBuffer(trailingSize)) + _ = arena.Alloc(allocSize, 1) + trailing := arena.buffers[len(arena.buffers)-1] + b.ResetTimer() + for b.Loop() { + if trailing.offset+allocSize > trailing.size { + b.StopTimer() + trailing.offset = 0 + b.StartTimer() + } + _ = arena.Alloc(allocSize, 1) + } + }) + } +} + +// BenchmarkMonotonicArenaAllocCosmoLike approximates the Cosmo Router workload +// described in issue #2: an arena that grows naturally during use, with many +// small allocations after the arena has already grown to many buffers. The +// arena grows during the timed loop (no manual offset tricks), so this is the +// closest representation of the real-world cost a caller pays. +func BenchmarkMonotonicArenaAllocCosmoLike(b *testing.B) { + const bufSize = 1024 + const allocSize = 8 + for _, numBuffers := range []int{10, 100, 1000} { + b.Run(fmt.Sprintf("prefix=%d", numBuffers), func(b *testing.B) { + arena := NewMonotonicArena(WithInitialBufferCount(0), WithMinBufferSize(bufSize)) + for range numBuffers { + _ = arena.Alloc(bufSize, 1) + } + b.ResetTimer() + for b.Loop() { + _ = arena.Alloc(allocSize, 1) + } + }) + } +} + func TestMonotonicArenaInitialBufferCountReset(t *testing.T) { // Test that reset works correctly with multiple initial buffers arena := NewMonotonicArena(WithInitialBufferCount(2), WithMinBufferSize(100))