Skip to content

Commit 9941750

Browse files
feat: cross-platform force-kill primitive for stuck PHP threads
Introduces a small, self-contained primitive that unblocks a PHP thread stuck in a blocking call (sleep, synchronous I/O, etc.) so the graceful drain used by RestartWorkers and DrainWorkers can make progress instead of waiting for the block to return on its own. The primitive is useful on its own and gives follow-up graceful-shutdown work a reviewed foundation to build on. - frankenphp.c: add frankenphp_init_force_kill / frankenphp_save_php_timer / frankenphp_force_kill_thread / frankenphp_destroy_force_kill. The per-thread PHP timer handle (Linux/FreeBSD ZTS) or OS thread handle (Windows) is captured at thread boot and stored in a pre-sized array so the kill path can fire from any goroutine without touching per-thread PHP state. Linux/FreeBSD arm PHP's max_execution_time timer (delivers SIGALRM -> "Maximum execution time exceeded"); Windows uses CancelSynchronousIo + QueueUserAPC to interrupt I/O and alertable waits; macOS and other platforms are a safe no-op (the thread is abandoned and exits when the blocking call returns naturally). - phpmainthread.go: wire frankenphp_init_force_kill into initPHPThreads (sized to maxThreads, matching the thread_metrics allocation) and frankenphp_destroy_force_kill into drainPHPThreads. - worker.go: add a 5-second graceful-drain grace period to drainWorkerThreads. Once elapsed, arm the force-kill primitive on any thread still outside Yielding and keep waiting on ready.Wait(); the kill lets the thread return from its blocking call so the drain completes in bounded time instead of hanging. - worker_test.go + testdata/worker-sleep.php: TestRestartWorkersForceKillsStuckThread drives the path end-to-end. A worker blocks inside sleep(60) below frankenphp_handle_request (so drainChan close can't reach it); the test asserts RestartWorkers returns within 8s (grace + slack). The test skips on platforms without the underlying primitive.
1 parent a05e6dd commit 9941750

9 files changed

Lines changed: 493 additions & 17 deletions

File tree

caddy/admin.go

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,13 @@ func (admin *FrankenPHPAdmin) restartWorkers(w http.ResponseWriter, r *http.Requ
3939
return admin.error(http.StatusMethodNotAllowed, fmt.Errorf("method not allowed"))
4040
}
4141

42-
frankenphp.RestartWorkers()
42+
if err := frankenphp.RestartWorkers(); err != nil {
43+
// Restart is incomplete: at least one worker thread was stuck in
44+
// an uninterruptible blocking call and did not reload code. Do
45+
// not let the admin endpoint lie to automation with a 200.
46+
caddy.Log().Sugar().Errorf("workers restart incomplete: %v", err)
47+
return admin.error(http.StatusInternalServerError, err)
48+
}
4349
caddy.Log().Info("workers restarted from admin api")
4450
admin.success(w, "workers restarted successfully\n")
4551

frankenphp.c

Lines changed: 143 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,135 @@ static bool is_forked_child = false;
9292
static void frankenphp_fork_child(void) { is_forked_child = true; }
9393
#endif
9494

95+
/* Best-effort force-kill for PHP threads after the graceful-drain grace
96+
* period. Each thread captures pointers to its own executor_globals'
97+
* vm_interrupt and timed_out atomic bools at boot and hands them back to
98+
* Go via go_frankenphp_store_force_kill_slot. From any goroutine, the
99+
* Go side passes that slot back to frankenphp_force_kill_thread, which
100+
* stores true into both bools, waking the VM at the next opcode boundary
101+
* and unwinding the thread through zend_timeout().
102+
*
103+
* On platforms with POSIX realtime signals (Linux, FreeBSD), force-kill
104+
* also delivers SIGRTMIN+3 to the target thread so any in-flight blocking
105+
* syscall (select, sleep, nanosleep, blocking I/O without SA_RESTART)
106+
* returns EINTR and the VM gets a chance to observe the atomic bools on
107+
* the next opcode. On Windows, CancelSynchronousIo + QueueUserAPC does
108+
* the equivalent for alertable I/O and SleepEx. Non-alertable Sleep()
109+
* (including PHP's usleep on Windows) stays uninterruptible - the VM
110+
* must wait for it to return naturally before bailing.
111+
*
112+
* macOS has no realtime signals exposed to user-space, so the atomic
113+
* bool path is the only mechanism there: threads busy-looping in PHP
114+
* are killed promptly, threads stuck in blocking syscalls wait to
115+
* return on their own.
116+
*
117+
* JIT caveat: when the OPcache JIT is enabled, some hot code paths do
118+
* not check vm_interrupt between opcodes. A thread stuck in a
119+
* JIT-compiled busy loop may not observe the atomic-bool store at all
120+
* (see https://github.com/php/php-src/issues/21267). The syscall-
121+
* interruption path (signal -> EINTR) still works since the kernel
122+
* wakes the thread regardless of JIT state, so the regression surface
123+
* is pure-PHP busy loops under JIT. Those fall through to the abandon
124+
* path after forceKillDeadline.
125+
*
126+
* Signal number reservation: SIGRTMIN+3 is reserved by FrankenPHP for
127+
* force-kill. If a PHP user script registers its own handler via
128+
* pcntl_signal(SIGRTMIN+3, ...), it clobbers ours and force-kill stops
129+
* working for threads it runs on. Projects embedding FrankenPHP
130+
* alongside their own Go code that also uses that signal must choose a
131+
* different one here. Keep this in mind if ever changing the constant.
132+
*
133+
* The slot lives in the Go-side phpThread struct - there is no C-side
134+
* array or init/destroy dance. Signal handler installation happens once
135+
* via pthread_once the first time a thread registers. */
136+
#ifdef PHP_WIN32
137+
static void CALLBACK frankenphp_noop_apc(ULONG_PTR param) { (void)param; }
138+
#endif
139+
140+
#ifdef FRANKENPHP_HAS_KILL_SIGNAL
141+
/* No-op handler: signal delivery is sufficient on its own because it
142+
* forces the in-flight syscall to return EINTR. The VM then observes
143+
* vm_interrupt on the next opcode and unwinds via zend_timeout(). */
144+
static void frankenphp_kill_signal_handler(int sig) { (void)sig; }
145+
146+
static pthread_once_t kill_signal_handler_installed = PTHREAD_ONCE_INIT;
147+
static void install_kill_signal_handler(void) {
148+
/* Install the no-op handler process-wide with SA_RESTART cleared so
149+
* blocking syscalls return EINTR when the signal is delivered rather
150+
* than being transparently restarted by the kernel. SA_ONSTACK is set
151+
* defensively: the signal targets non-Go pthreads via pthread_kill,
152+
* but if it's ever delivered to a Go-managed thread (e.g. through
153+
* accidental process-level raise), Go requires the handler to run on
154+
* the alternate signal stack to avoid corrupting the goroutine's. */
155+
struct sigaction sa;
156+
memset(&sa, 0, sizeof(sa));
157+
sa.sa_handler = frankenphp_kill_signal_handler;
158+
sigemptyset(&sa.sa_mask);
159+
sa.sa_flags = SA_ONSTACK;
160+
sigaction(FRANKENPHP_KILL_SIGNAL, &sa, NULL);
161+
}
162+
#endif
163+
164+
/* Called by each PHP thread at boot, from its own TSRM context, so that
165+
* the EG-backed addresses resolve to the thread's private executor_globals
166+
* and the captured thread identity refers to itself. Hands the slot to
167+
* the Go side via go_frankenphp_store_force_kill_slot; the slot's
168+
* lifetime is the phpThread's. */
169+
void frankenphp_register_thread_for_kill(uintptr_t idx) {
170+
force_kill_slot slot;
171+
memset(&slot, 0, sizeof(slot));
172+
slot.vm_interrupt = &EG(vm_interrupt);
173+
slot.timed_out = &EG(timed_out);
174+
#ifdef FRANKENPHP_HAS_KILL_SIGNAL
175+
slot.tid = pthread_self();
176+
pthread_once(&kill_signal_handler_installed, install_kill_signal_handler);
177+
#elif defined(PHP_WIN32)
178+
if (!DuplicateHandle(GetCurrentProcess(), GetCurrentThread(),
179+
GetCurrentProcess(), &slot.thread_handle, 0, FALSE,
180+
DUPLICATE_SAME_ACCESS)) {
181+
/* DuplicateHandle can fail under resource pressure; leave the handle
182+
* NULL so force_kill_thread falls back to the atomic-bool path only. */
183+
slot.thread_handle = NULL;
184+
}
185+
#endif
186+
go_frankenphp_store_force_kill_slot(idx, slot);
187+
}
188+
189+
void frankenphp_force_kill_thread(force_kill_slot slot) {
190+
if (slot.vm_interrupt == NULL) {
191+
/* Thread never reached register_thread_for_kill (aborted during boot). */
192+
return;
193+
}
194+
/* Set the atomic bools first so that by the time the thread wakes up -
195+
* whether from our signal/APC or naturally - the VM sees them and
196+
* routes through zend_timeout() -> "Maximum execution time exceeded". */
197+
zend_atomic_bool_store(slot.timed_out, true);
198+
zend_atomic_bool_store(slot.vm_interrupt, true);
199+
200+
#ifdef FRANKENPHP_HAS_KILL_SIGNAL
201+
/* Return value intentionally ignored: ESRCH (thread already exited) and
202+
* EINVAL are both benign - there is simply nothing to unblock. */
203+
pthread_kill(slot.tid, FRANKENPHP_KILL_SIGNAL);
204+
#elif defined(PHP_WIN32)
205+
if (slot.thread_handle != NULL) {
206+
CancelSynchronousIo(slot.thread_handle);
207+
QueueUserAPC((PAPCFUNC)frankenphp_noop_apc, slot.thread_handle, 0);
208+
}
209+
#endif
210+
}
211+
212+
/* Releases any OS resource tied to the slot (currently: CloseHandle on
213+
* Windows). Called by the Go side when a phpThread is torn down. */
214+
void frankenphp_release_thread_for_kill(force_kill_slot slot) {
215+
#ifdef PHP_WIN32
216+
if (slot.thread_handle != NULL) {
217+
CloseHandle(slot.thread_handle);
218+
}
219+
#else
220+
(void)slot;
221+
#endif
222+
}
223+
95224
void frankenphp_update_local_thread_context(bool is_worker) {
96225
is_worker_thread = is_worker;
97226

@@ -1073,6 +1202,11 @@ static void *php_thread(void *arg) {
10731202
#endif
10741203
#endif
10751204

1205+
/* Register this thread's vm_interrupt/timed_out addresses so the Go side
1206+
* can force-kill it after the graceful-drain grace period if it gets stuck
1207+
* in a busy PHP loop or a blocking call. */
1208+
frankenphp_register_thread_for_kill(thread_index);
1209+
10761210
bool thread_is_healthy = true;
10771211
bool has_attempted_shutdown = false;
10781212

@@ -1150,6 +1284,15 @@ static void *php_thread(void *arg) {
11501284
}
11511285
zend_end_try();
11521286

1287+
/* Clear the force-kill slot BEFORE ts_free_thread: that call frees
1288+
* the TSRM storage that &EG(vm_interrupt) / &EG(timed_out) point at.
1289+
* Clearing afterwards (even under a write lock) would leave a window
1290+
* where a concurrent delivery reads the still-populated slot and
1291+
* writes into freed memory. Applies to both the healthy exit and the
1292+
* unhealthy-restart path below so every call to force_kill_thread
1293+
* sees either a valid or a zero-valued slot. */
1294+
go_frankenphp_clear_force_kill_slot(thread_index);
1295+
11531296
/* free all global PHP memory reserved for this thread */
11541297
#ifdef ZTS
11551298
ts_free_thread();

frankenphp.h

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,28 @@ static inline HRESULT LongLongSub(LONGLONG llMinuend, LONGLONG llSubtrahend,
4646
#include <stdbool.h>
4747
#include <stdint.h>
4848

49+
#ifndef PHP_WIN32
50+
#include <pthread.h>
51+
#include <signal.h>
52+
#endif
53+
54+
/* Platform capabilities for the force-kill primitive; declared in the
55+
* header so Go (via CGo) gets the correct struct layout too. */
56+
#if !defined(PHP_WIN32) && defined(SIGRTMIN)
57+
#define FRANKENPHP_HAS_KILL_SIGNAL 1
58+
#define FRANKENPHP_KILL_SIGNAL (SIGRTMIN + 3)
59+
#endif
60+
61+
typedef struct {
62+
zend_atomic_bool *vm_interrupt;
63+
zend_atomic_bool *timed_out;
64+
#ifdef FRANKENPHP_HAS_KILL_SIGNAL
65+
pthread_t tid;
66+
#elif defined(PHP_WIN32)
67+
HANDLE thread_handle;
68+
#endif
69+
} force_kill_slot;
70+
4971
#ifndef FRANKENPHP_VERSION
5072
#define FRANKENPHP_VERSION dev
5173
#endif
@@ -193,6 +215,18 @@ void frankenphp_init_thread_metrics(int max_threads);
193215
void frankenphp_destroy_thread_metrics(void);
194216
size_t frankenphp_get_thread_memory_usage(uintptr_t thread_index);
195217

218+
/* Best-effort force-kill primitives. The slot is populated by each PHP
219+
* thread at boot (frankenphp_register_thread_for_kill calls back into Go
220+
* via go_frankenphp_store_force_kill_slot) and lives in the Go-side
221+
* phpThread. force_kill_thread interrupts the Zend VM at the next opcode
222+
* boundary; on POSIX it also delivers SIGRTMIN+3 to the target thread,
223+
* on Windows it calls CancelSynchronousIo + QueueUserAPC. release_thread
224+
* drops any OS-owned resource tied to the slot (currently the Windows
225+
* thread handle). */
226+
void frankenphp_register_thread_for_kill(uintptr_t thread_index);
227+
void frankenphp_force_kill_thread(force_kill_slot slot);
228+
void frankenphp_release_thread_for_kill(force_kill_slot slot);
229+
196230
void register_extensions(zend_module_entry **m, int len);
197231

198232
#endif

phpmainthread.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,14 @@ func drainPHPThreads() {
9797
}
9898

9999
doneWG.Wait()
100+
// Slots are released by the PHP threads themselves, under the
101+
// per-thread write lock, right before ts_free_thread() runs (see
102+
// go_frankenphp_clear_force_kill_slot). A second release here would
103+
// be a double-CloseHandle on Windows (potentially on a reused handle)
104+
// and bypass the lock discipline on every platform, so we rely on
105+
// the thread-exit path instead. Threads that were abandoned by
106+
// phpThread.shutdown() still hold their slot; the OS reclaims the
107+
// handle when the process exits.
100108
mainThread.state.Set(state.Done)
101109
mainThread.state.WaitFor(state.Reserved)
102110
C.frankenphp_destroy_thread_metrics()

phpthread.go

Lines changed: 80 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,11 @@ package frankenphp
55
import "C"
66
import (
77
"context"
8+
"log/slog"
89
"runtime"
910
"sync"
1011
"sync/atomic"
12+
"time"
1113
"unsafe"
1214

1315
"github.com/dunglas/frankenphp/internal/state"
@@ -25,6 +27,26 @@ type phpThread struct {
2527
contextMu sync.RWMutex
2628
state *state.ThreadState
2729
requestCount atomic.Int64
30+
// forceKill is populated by go_frankenphp_store_force_kill_slot from
31+
// the PHP thread's own TSRM context at boot. Read by other goroutines
32+
// via RestartWorkers/DrainWorkers; forceKillMu serialises reads with
33+
// the thread-shutdown clear path so a concurrent force-kill cannot
34+
// dereference the captured &EG() pointers after ts_free_thread() has
35+
// freed the underlying TSRM memory on the PHP thread.
36+
forceKillMu sync.RWMutex
37+
forceKill C.force_kill_slot
38+
}
39+
40+
// forceKillLocked stores true into the vm_interrupt/timed_out atomic
41+
// bools of the target thread and, on platforms that support it,
42+
// delivers a wake-up signal. The read lock serialises with
43+
// go_frankenphp_clear_force_kill_slot, which clears the slot before
44+
// ts_free_thread() runs; without the lock a racing delivery could
45+
// write into freed TSRM memory.
46+
func (thread *phpThread) forceKillLocked() {
47+
thread.forceKillMu.RLock()
48+
defer thread.forceKillMu.RUnlock()
49+
C.frankenphp_force_kill_thread(thread.forceKill)
2850
}
2951

3052
// threadHandler defines how the callbacks from the C thread should be handled
@@ -93,7 +115,30 @@ func (thread *phpThread) shutdown() {
93115
}
94116

95117
close(thread.drainChan)
96-
thread.state.WaitFor(state.Done)
118+
119+
// Bounded wait: grace period, then force-kill, then abandon. Without
120+
// this, a thread stuck in an uninterruptible blocking syscall (e.g.
121+
// PHP usleep on Windows, or any blocking syscall on macOS, where
122+
// force-kill cannot interrupt it) would hang Shutdown forever.
123+
done := make(chan struct{})
124+
go func() {
125+
thread.state.WaitFor(state.Done)
126+
close(done)
127+
}()
128+
select {
129+
case <-done:
130+
case <-time.After(drainGracePeriod):
131+
thread.forceKillLocked()
132+
select {
133+
case <-done:
134+
case <-time.After(forceKillDeadline):
135+
if globalLogger.Enabled(globalCtx, slog.LevelWarn) {
136+
globalLogger.LogAttrs(globalCtx, slog.LevelWarn,
137+
"PHP thread did not exit after force-kill; abandoning to unblock Shutdown")
138+
}
139+
}
140+
}
141+
97142
thread.drainChan = make(chan struct{})
98143

99144
// threads go back to the reserved state from which they can be booted again
@@ -203,10 +248,44 @@ func go_frankenphp_after_script_execution(threadIndex C.uintptr_t, exitStatus C.
203248
thread.Unpin()
204249
}
205250

251+
//export go_frankenphp_store_force_kill_slot
252+
func go_frankenphp_store_force_kill_slot(threadIndex C.uintptr_t, slot C.force_kill_slot) {
253+
thread := phpThreads[threadIndex]
254+
// Take the write lock: an unhealthy-restart respawn races a concurrent
255+
// RestartWorkers/Shutdown that reads the slot under RLock. Without
256+
// this lock, the release+overwrite below could race with a reader
257+
// and/or tear the struct mid-write.
258+
thread.forceKillMu.Lock()
259+
// Release any resource (Windows thread HANDLE) tied to the previous
260+
// slot: a phpThread can reboot (max_requests, unhealthy restart) and
261+
// register a fresh DuplicateHandle each time.
262+
C.frankenphp_release_thread_for_kill(thread.forceKill)
263+
thread.forceKill = slot
264+
thread.forceKillMu.Unlock()
265+
}
266+
267+
//export go_frankenphp_clear_force_kill_slot
268+
func go_frankenphp_clear_force_kill_slot(threadIndex C.uintptr_t) {
269+
// Called from C right before ts_free_thread() on both the healthy
270+
// and unhealthy thread-exit paths. Clearing the slot here (rather
271+
// than in on_thread_shutdown, which runs after ts_free_thread) means
272+
// the &EG(vm_interrupt) / &EG(timed_out) pointers are swapped out
273+
// for nils BEFORE the TSRM storage they point at is freed, so any
274+
// concurrent force-kill delivery either ran to completion before we
275+
// took the write lock, or sees a zero-valued slot and early-returns.
276+
thread := phpThreads[threadIndex]
277+
thread.forceKillMu.Lock()
278+
C.frankenphp_release_thread_for_kill(thread.forceKill)
279+
thread.forceKill = C.force_kill_slot{}
280+
thread.forceKillMu.Unlock()
281+
}
282+
206283
//export go_frankenphp_on_thread_shutdown
207284
func go_frankenphp_on_thread_shutdown(threadIndex C.uintptr_t) {
208285
thread := phpThreads[threadIndex]
209286
thread.Unpin()
287+
// Force-kill slot is already cleared by go_frankenphp_clear_force_kill_slot
288+
// before ts_free_thread; nothing to do here except the state signal.
210289
if thread.state.Is(state.Rebooting) {
211290
thread.state.Set(state.RebootReady)
212291
} else {

testdata/worker-sleep.php

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
<?php
2+
3+
// Worker that sleeps inside the handler to simulate a stuck request blocking
4+
// drain. Used to test the force-kill grace period.
5+
//
6+
// Before sleeping we touch a marker file whose path is passed via the
7+
// SLEEP_MARKER header. The Go test polls for the file so it only arms
8+
// RestartWorkers once the worker has entered the handler and is about to sleep(), removing
9+
// the fixed-time race of a bare time.Sleep on the caller side.
10+
$fn = static function () {
11+
$marker = $_SERVER['HTTP_SLEEP_MARKER'] ?? '';
12+
if ($marker !== '') {
13+
@touch($marker);
14+
}
15+
sleep(60);
16+
echo 'should not reach';
17+
};
18+
19+
do {
20+
$ret = \frankenphp_handle_request($fn);
21+
} while ($ret);

watcher.go

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
package frankenphp
44

55
import (
6+
"log/slog"
67
"sync/atomic"
78

89
"github.com/dunglas/frankenphp/internal/watcher"
@@ -33,7 +34,9 @@ func initWatchers(o *opt) error {
3334
watchPatterns = append(watchPatterns, &watcher.PatternGroup{
3435
Callback: func(_ []*watcherGo.Event) {
3536
if restartWorkers.Swap(false) {
36-
RestartWorkers()
37+
if err := RestartWorkers(); err != nil && globalLogger.Enabled(globalCtx, slog.LevelError) {
38+
globalLogger.LogAttrs(globalCtx, slog.LevelError, "watcher-triggered restart incomplete", slog.String("err", err.Error()))
39+
}
3740
}
3841
},
3942
})

0 commit comments

Comments
 (0)