Skip to content

Commit 7771c87

Browse files
bushidocodesclaude
andauthored
fix: size the work-stealing deque to its capacity and index it circularly (#393)
The deque (used by the FIFO global request scheduler) declared its backing store as a fixed inline array `type wrk[DEQUE_MAX_SZ]` (1<<23 entries — ~64 MB for struct sandbox*), and the `size` passed at init was only a soft occupancy cap. Worse, top/bottom were used as raw, monotonically increasing array indices with no modulo wrap; since the global scheduler only pushes (bottom++) and steals (top++), the index marches forward forever and runs off the end of the array after 1<<23 total requests. Allocate the backing buffer on the heap at init, sized to the requested capacity (rounded up to a power of two), and index it circularly via a mask. The occupancy cap (size - 1) keeps the producer from lapping the consumers, so masked indices of live elements never collide. This makes memory scale with the configured capacity (4096 entries instead of 1<<23 -> ~64 MB less virtual reservation) and removes the eventual out-of-bounds. The atomics and ordering are unchanged; only the index reads/writes gained `& mask`. Also add deque_free, have deque_init report allocation failure (checked by the caller), and make deque.h self-contained (stdbool/stdlib/errno/assert). Growing the buffer when full is intentionally left out: stealers consume it lock-free concurrently with the producer, so reallocating underneath an in-flight steal would be a use-after-free. The deque returns -ENOSPC when full and the caller applies backpressure. Verified: builds; FIFO serves resize requests correctly (20/20 byte- identical); a unit test drives 1,000,007 push/steal pairs through a size-8 ring (indices wrap ~125,000x, live window straddles the boundary 750,000 times) preserving strict FIFO order; virtual footprint drops ~64 MB. Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
1 parent 31c5780 commit 7771c87

2 files changed

Lines changed: 57 additions & 20 deletions

File tree

runtime/include/deque.h

Lines changed: 54 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,12 @@
77
#ifndef DEQUE_H
88
#define DEQUE_H
99

10+
#include <assert.h>
11+
#include <errno.h>
12+
#include <stdbool.h>
13+
#include <stddef.h>
14+
#include <stdlib.h>
15+
1016
/*
1117
* This was implemented by referring to:
1218
* https://github.com/cpp-taskflow/cpp-taskflow/blob/9c28ccec910346a9937c40db7bdb542262053f9c/taskflow/executor/workstealing.hpp
@@ -18,33 +24,63 @@
1824
*
1925
* PPoPP implementation paper, "Correct and Efficient Work-Stealing for Weak Memory Models"
2026
* https://www.di.ens.fr/~zappa/readings/ppopp13.pdf
27+
*
28+
* The backing buffer is heap-allocated at init to the requested capacity (rounded up to a power of two)
29+
* and indexed circularly: the top/bottom counters grow monotonically and are masked into the buffer. The
30+
* occupancy cap (size - 1) keeps the producer from lapping the consumers, so the masked indices of live
31+
* elements never collide. Memory therefore scales with the configured capacity rather than a fixed
32+
* compile-time maximum.
33+
*
34+
* Growing the buffer when full is intentionally not implemented: the deque is consumed lock-free by
35+
* stealers concurrently with the producer, so reallocating (and freeing) the backing buffer underneath
36+
* an in-flight steal would be a use-after-free. A safe grow would require synchronizing every stealer or
37+
* a lock-free resizable variant; until that is designed and perf-tested the deque returns -ENOSPC when
38+
* full and the caller applies backpressure.
2139
*/
2240

23-
/* TODO: Implement the ability to dynamically resize! Issue #89 */
41+
/* Upper bound on a deque's capacity. A sanity guard on the requested size, not an allocation size. */
2442
#define DEQUE_MAX_SZ (1 << 23)
2543

44+
static inline size_t
45+
deque_round_up_to_pow2(size_t v)
46+
{
47+
size_t p = 1;
48+
while (p < v) p <<= 1;
49+
return p;
50+
}
51+
2652
#define DEQUE_PROTOTYPE(name, type) \
2753
struct deque_##name { \
28-
type wrk[DEQUE_MAX_SZ]; \
29-
long size; \
54+
type *wrk; \
55+
long size; \
56+
long mask; \
3057
\
3158
volatile long top; \
3259
volatile long bottom; \
3360
}; \
3461
\
35-
static inline void deque_init_##name(struct deque_##name *q, size_t sz) \
62+
/* Allocates the backing buffer. Returns 0 on success, -ENOMEM on allocation failure. */ \
63+
static inline int deque_init_##name(struct deque_##name *q, size_t sz) \
3664
{ \
37-
memset(q, 0, sizeof(struct deque_##name)); \
38-
\
39-
if (sz) { \
40-
/* only for size with pow of 2 */ \
41-
/* assert((sz & (sz - 1)) == 0); */ \
42-
assert(sz <= DEQUE_MAX_SZ); \
43-
} else { \
44-
sz = DEQUE_MAX_SZ; \
45-
} \
65+
if (sz == 0) sz = DEQUE_MAX_SZ; \
66+
assert(sz <= DEQUE_MAX_SZ); \
67+
sz = deque_round_up_to_pow2(sz); \
68+
\
69+
q->wrk = (type *)calloc(sz, sizeof(type)); \
70+
if (q->wrk == NULL) return -ENOMEM; \
4671
\
47-
q->size = sz; \
72+
q->size = (long)sz; \
73+
q->mask = (long)sz - 1; \
74+
q->top = 0; \
75+
q->bottom = 0; \
76+
\
77+
return 0; \
78+
} \
79+
\
80+
static inline void deque_free_##name(struct deque_##name *q) \
81+
{ \
82+
free(q->wrk); \
83+
q->wrk = NULL; \
4884
} \
4985
\
5086
/* Use mutual exclusion locks around push/pop if multi-threaded. */ \
@@ -55,10 +91,10 @@
5591
ct = q->top; \
5692
cb = q->bottom; \
5793
\
58-
/* nope, fixed size only */ \
94+
/* Bounded by the configured capacity; caller applies backpressure on -ENOSPC. */ \
5995
if (q->size - 1 < (cb - ct)) return -ENOSPC; \
6096
\
61-
q->wrk[cb] = *w; \
97+
q->wrk[cb & q->mask] = *w; \
6298
__sync_synchronize(); \
6399
if (__sync_bool_compare_and_swap(&q->bottom, cb, cb + 1) == false) assert(0); \
64100
\
@@ -82,7 +118,7 @@
82118
return -ENOENT; \
83119
} \
84120
\
85-
*w = q->wrk[cb]; \
121+
*w = q->wrk[cb & q->mask]; \
86122
if (sz > 0) return 0; \
87123
\
88124
ret = __sync_bool_compare_and_swap(&q->top, ct, ct + 1); \
@@ -110,7 +146,7 @@
110146
/* Empty */ \
111147
if (ct >= cb) return -ENOENT; \
112148
\
113-
*w = deque->wrk[ct]; \
149+
*w = deque->wrk[ct & deque->mask]; \
114150
if (__sync_bool_compare_and_swap(&deque->top, ct, ct + 1) == false) return -EAGAIN; \
115151
\
116152
return 0; \

runtime/src/global_request_scheduler_deque.c

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -53,8 +53,9 @@ global_request_scheduler_deque_initialize()
5353
/* Allocate and Initialize the global deque */
5454
global_request_scheduler_deque = (struct deque_sandbox *)calloc(1, sizeof(struct deque_sandbox));
5555
assert(global_request_scheduler_deque);
56-
/* Note: Below is a Macro */
57-
deque_init_sandbox(global_request_scheduler_deque, GLOBAL_REQUEST_SCHEDULER_DEQUE_CAPACITY);
56+
/* Note: Below is a Macro. It heap-allocates the backing buffer sized to the requested capacity. */
57+
int rc = deque_init_sandbox(global_request_scheduler_deque, GLOBAL_REQUEST_SCHEDULER_DEQUE_CAPACITY);
58+
if (rc != 0) panic("Failed to allocate global request scheduler deque\n");
5859

5960
/* Register Function Pointers for Abstract Scheduling API */
6061
struct global_request_scheduler_config config = {.add_fn = global_request_scheduler_deque_add,

0 commit comments

Comments
 (0)