Skip to content

Commit c31008f

Browse files
author
Kristian Larsson
committed
Add RTS worker sync pause
Introduce a cooperative sync-pause primitive for RTS worker threads. A worker can request a pause, wake its peers, wait until they park at worker-loop safepoints, and release them when the synchronized operation completes. Pause requests fail before the worker pool is running or while another pause is active. Active pause waits time out briefly and re-check rts_exit, so graceful shutdown cannot leave the owner or parked workers asleep on the pause condition.
1 parent 836cce0 commit c31008f

5 files changed

Lines changed: 196 additions & 3 deletions

File tree

base/rts/common.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,9 @@ void acton_free(void* ptr);
3535
char *acton_strdup(const char *s);
3636
char *acton_strndup(const char *s, size_t n);
3737

38+
// Request a cooperative pause of the other RTS worker threads. Returns 0 once
// the calling worker owns the pause, or -1 if the pause could not be started
// (the threaded implementation rejects non-worker callers, calls made before
// the worker pool is running, calls during shutdown and calls made while
// another pause is active). Builds without ACTON_THREADS always return 0.
int acton_sync_pause_begin(void);

// Release a pause previously obtained with acton_sync_pause_begin(). Calls
// from any thread other than the current pause owner are ignored.
void acton_sync_pause_end(void);
40+
3841
void *acton_gc_malloc(size_t size);
3942
void *acton_gc_malloc_atomic(size_t size);
4043
void *acton_gc_realloc(void* ptr, size_t size);

base/rts/rts.c

Lines changed: 131 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -234,6 +234,132 @@ void pin_actor_affinity() { }
234234
void set_actor_affinity(int wthread_id) { }
235235
#endif // ACTON_THREADS
236236

237+
#ifdef ACTON_THREADS
238+
static pthread_mutex_t sync_pause_lock = PTHREAD_MUTEX_INITIALIZER;
239+
static pthread_cond_t sync_pause_cond = PTHREAD_COND_INITIALIZER;
240+
static int sync_pause_requested = 0;
241+
static int sync_pause_owner = -1;
242+
static int sync_pause_parked_count = 0;
243+
static int sync_pause_workers_are_started = 0;
244+
static int sync_pause_parked[MAX_WTHREADS];
245+
246+
static void sync_pause_clear_parked(void) {
247+
for (int i = 0; i <= num_wthreads; i++) {
248+
sync_pause_parked[i] = 0;
249+
}
250+
}
251+
252+
static void wake_all_wt(void) {
253+
for (int i = 0; i <= num_wthreads; i++) {
254+
uv_async_send(&wake_ev[i]);
255+
}
256+
}
257+
258+
static void sync_pause_wait(void) {
259+
struct timespec ts;
260+
clock_gettime(CLOCK_REALTIME, &ts);
261+
// Use a bounded wait so pause owners and parked workers re-check rts_exit
262+
// even if shutdown starts without a matching condition broadcast.
263+
ts.tv_nsec += 10 * 1000 * 1000;
264+
if (ts.tv_nsec >= 1000 * 1000 * 1000) {
265+
ts.tv_sec++;
266+
ts.tv_nsec -= 1000 * 1000 * 1000;
267+
}
268+
pthread_cond_timedwait(&sync_pause_cond, &sync_pause_lock, &ts);
269+
}
270+
271+
int acton_sync_pause_begin(void) {
272+
WorkerCtx wctx = GET_WCTX();
273+
if (wctx == NULL || wctx->id < 0 || wctx->id > num_wthreads) {
274+
return -1;
275+
}
276+
int owner = (int)wctx->id;
277+
278+
pthread_mutex_lock(&sync_pause_lock);
279+
if (!sync_pause_workers_are_started || sync_pause_requested) {
280+
pthread_mutex_unlock(&sync_pause_lock);
281+
return -1;
282+
}
283+
if (rts_exit) {
284+
pthread_mutex_unlock(&sync_pause_lock);
285+
return -1;
286+
}
287+
288+
sync_pause_requested = 1;
289+
sync_pause_owner = owner;
290+
sync_pause_parked_count = 0;
291+
sync_pause_clear_parked();
292+
293+
wake_all_wt();
294+
while (sync_pause_parked_count < num_wthreads && !rts_exit) {
295+
sync_pause_wait();
296+
}
297+
if (rts_exit) {
298+
sync_pause_requested = 0;
299+
sync_pause_owner = -1;
300+
sync_pause_parked_count = 0;
301+
pthread_cond_broadcast(&sync_pause_cond);
302+
pthread_mutex_unlock(&sync_pause_lock);
303+
return -1;
304+
}
305+
306+
pthread_mutex_unlock(&sync_pause_lock);
307+
return 0;
308+
}
309+
310+
void acton_sync_pause_end(void) {
311+
WorkerCtx wctx = GET_WCTX();
312+
if (wctx == NULL || wctx->id < 0 || wctx->id > num_wthreads) {
313+
return;
314+
}
315+
int owner = (int)wctx->id;
316+
317+
pthread_mutex_lock(&sync_pause_lock);
318+
if (!sync_pause_requested || sync_pause_owner != owner) {
319+
pthread_mutex_unlock(&sync_pause_lock);
320+
return;
321+
}
322+
323+
sync_pause_requested = 0;
324+
sync_pause_owner = -1;
325+
sync_pause_parked_count = 0;
326+
pthread_cond_broadcast(&sync_pause_cond);
327+
pthread_mutex_unlock(&sync_pause_lock);
328+
}
329+
330+
// Called from the worker loop between actor continuations. If a sync pause is
331+
// active, non-owner workers park here while the owner runs the synchronized op.
332+
static void maybe_sync_pause(void) {
333+
WorkerCtx wctx = GET_WCTX();
334+
if (wctx == NULL || wctx->id < 0 || wctx->id >= MAX_WTHREADS) {
335+
return;
336+
}
337+
int id = (int)wctx->id;
338+
339+
pthread_mutex_lock(&sync_pause_lock);
340+
while (sync_pause_requested && id != sync_pause_owner && !rts_exit) {
341+
if (!sync_pause_parked[id]) {
342+
sync_pause_parked[id] = 1;
343+
sync_pause_parked_count++;
344+
pthread_cond_broadcast(&sync_pause_cond);
345+
}
346+
sync_pause_wait();
347+
}
348+
pthread_mutex_unlock(&sync_pause_lock);
349+
}
350+
351+
// Mark the worker pool as started. Until this has been called,
// acton_sync_pause_begin() refuses every request.
static void sync_pause_workers_started(void) {
    pthread_mutex_lock(&sync_pause_lock);
    // Written under the lock because acton_sync_pause_begin() reads it there.
    sync_pause_workers_are_started = 1;
    pthread_mutex_unlock(&sync_pause_lock);
}
356+
#else
357+
// Builds without ACTON_THREADS: beginning a pause always succeeds and the
// remaining sync-pause functions do nothing.
int acton_sync_pause_begin(void) {
    return 0;
}

void acton_sync_pause_end(void) {
}

static void maybe_sync_pause(void) {
}

static void sync_pause_workers_started(void) {
}
361+
#endif
362+
237363
void wake_wt(int wtid) {
238364
// We are sometimes optimistically called, i.e. the caller sometimes does
239365
// not really know whether there is new work or not. We check and if there
@@ -1533,6 +1659,7 @@ void wt_work_cb(uv_check_t *ev) {
15331659

15341660
uv_clock_gettime(UV_CLOCK_MONOTONIC, &ts_start);
15351661
while (true) {
1662+
maybe_sync_pause();
15361663
if (rts_exit) {
15371664
return;
15381665
}
@@ -2605,9 +2732,9 @@ int main(int argc, char **argv) {
26052732
}
26062733

26072734
#ifdef ACTON_THREADS
2608-
if (num_wthreads > MAX_WTHREADS) {
2609-
fprintf(stderr, "ERROR: Maximum of %d worker threads supported.\n", MAX_WTHREADS);
2610-
fprintf(stderr, "HINT: Run this program with fewer worker threads: %s --rts-wthreads %d\n", argv[0], MAX_WTHREADS);
2735+
if (num_wthreads >= MAX_WTHREADS) {
2736+
fprintf(stderr, "ERROR: Maximum of %d worker threads supported.\n", MAX_WTHREADS - 1);
2737+
fprintf(stderr, "HINT: Run this program with fewer worker threads: %s --rts-wthreads %d\n", argv[0], MAX_WTHREADS - 1);
26112738
exit(1);
26122739
}
26132740
// Determine number of worker threads, normally 1:1 per CPU thread / core
@@ -2858,6 +2985,7 @@ int main(int argc, char **argv) {
28582985
//pthread_setaffinity_np(threads[idx-1], sizeof(cpu_set), &cpu_set);
28592986
}
28602987
}
2988+
sync_pause_workers_started();
28612989

28622990
pthread_attr_destroy(&ss_attr);
28632991
#endif

docs/acton-dev-guide/src/SUMMARY.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
- [FFI hooks](builtins/ffi.md)
2828
- [Runtime](runtime/index.md)
2929
- [Scheduler](runtime/scheduler.md)
30+
- [RTS sync pause](runtime/sync_pause.md)
3031
- [Actors](runtime/actors.md)
3132
- [Memory and GC](runtime/memory.md)
3233
- [Tooling](tooling/index.md)

docs/acton-dev-guide/src/runtime/index.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,3 +2,8 @@
22

33
This section covers the runtime system, scheduling model, and core services
44
that the compiler targets.
5+
6+
- [Scheduler](scheduler.md)
7+
- [RTS sync pause](sync_pause.md)
8+
- [Actors](actors.md)
9+
- [Memory and GC](memory.md)
docs/acton-dev-guide/src/runtime/sync_pause.md

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
# RTS Sync Pause
2+
3+
The RTS sync pause is a cooperative stop-the-world primitive for worker
4+
threads. It is intended for runtime operations that must publish process-wide
5+
state while no other worker is executing Acton continuations, for example
6+
dynamic library code reload. It is not a general blocking API for arbitrary
7+
threads; callers must be RTS workers.
8+
9+
The public C entry points are declared in `base/rts/common.h`:
10+
11+
```c
12+
int acton_sync_pause_begin(void);
13+
void acton_sync_pause_end(void);
14+
```
15+
16+
`acton_sync_pause_begin()` may only be called after the worker pool has started
17+
and before shutdown. It returns `0` for the worker that owns the pause and `-1`
18+
if the caller is not a worker, the worker pool has not started yet, shutdown has started, or another pause is already
19+
active. There is no queue of pause requests; callers that race with an active
20+
pause fail and must decide at the higher layer whether to retry or report an
21+
error. `acton_sync_pause_end()` releases the pause only when called by the
22+
worker that owns it.
23+
24+
## Protocol
25+
26+
The pause owner records its worker id, clears the per-worker parked flags, and
27+
wakes all worker-thread event loops with `wake_all_wt()`. The wake is just a
28+
libuv poke; sending it to the owner's own loop is harmless because async
29+
callbacks are not run inline and wake notifications may be coalesced.
30+
31+
Workers run `maybe_sync_pause()` at the top of each iteration of the worker loop in `wt_work_cb()`, before dequeuing
32+
the next actor continuation. A non-owner
33+
worker that sees an active pause marks itself parked exactly once, increments
34+
`sync_pause_parked_count`, signals the condition variable, and waits until the
35+
owner releases the pause. The owner waits until `sync_pause_parked_count`
36+
reaches `num_wthreads`, which is the number of workers other than the owner in
37+
the `0..num_wthreads` worker-id range.
38+
39+
The primitive is therefore cooperative. It does not interrupt a continuation
40+
that is already running; the pause is established only once every other worker
41+
has returned to the worker-loop pause check.
42+
43+
## Shutdown
44+
45+
Both the owner and parked workers use a short timed condition-variable wait
46+
rather than an unbounded `pthread_cond_wait()`. This keeps the pause responsive
47+
to `rts_exit` even if shutdown starts while a worker is waiting and no further
48+
condition broadcast arrives. If shutdown is observed while the owner is waiting,
49+
the owner clears the pause state, broadcasts to parked workers, and fails the
50+
pause request.
51+
52+
## Non-Threaded Builds
53+
54+
When Acton is built without RTS threads, `acton_sync_pause_begin()` returns
55+
success and `acton_sync_pause_end()` is a no-op. There are no peer workers to
56+
park in that configuration.

0 commit comments

Comments
 (0)