Skip to content

Commit fc32dc4

Browse files
feat: cross-platform force-kill primitive for stuck PHP threads
Introduces a small, self-contained primitive that unblocks a PHP thread stuck in a blocking call (sleep, synchronous I/O, etc.) or a busy PHP loop so the graceful drain used by RestartWorkers and DrainWorkers can make progress instead of waiting for the block to return on its own. The primitive is useful on its own and gives follow-up graceful-shutdown work a reviewed foundation to build on. - frankenphp.c: add frankenphp_init_force_kill / frankenphp_register_thread_for_kill / frankenphp_force_kill_thread / frankenphp_destroy_force_kill. At thread boot each PHP thread stores pointers to its own executor_globals' vm_interrupt and timed_out atomic bools, plus its thread identity (pthread_t on Linux/FreeBSD, a duplicated OS thread handle on Windows), in a pre-sized array so the kill path can fire from any goroutine without touching per-thread PHP state. The kill path sets both atomic bools, which makes the VM unwind through zend_timeout() ("Maximum execution time exceeded") at the next opcode boundary. Linux/FreeBSD additionally deliver SIGRTMIN+3 (no-op handler, SA_RESTART cleared) so an in-flight blocking syscall returns EINTR; Windows uses CancelSynchronousIo + QueueUserAPC to interrupt I/O and alertable waits; macOS and other platforms without realtime signals rely on the atomic bools alone (a thread stuck in a blocking call is only interrupted once that call returns naturally). - phpmainthread.go: wire frankenphp_init_force_kill into initPHPThreads (sized to maxThreads, matching the thread_metrics allocation) and frankenphp_destroy_force_kill into drainPHPThreads. - worker.go: add a 5-second graceful-drain grace period to drainWorkerThreads. Once elapsed, arm the force-kill primitive on any thread still outside Yielding and keep waiting on ready.Wait(); the kill lets the thread return from its blocking call so the drain completes in bounded time instead of hanging. - worker_test.go + testdata/worker-sleep.php: TestRestartWorkersForceKillsStuckThread drives the path end-to-end. A worker blocks inside sleep(60) below frankenphp_handle_request (so drainChan close can't reach it); the test asserts RestartWorkers returns within 8s (grace + slack). The test skips on platforms that cannot interrupt a blocking syscall.
1 parent a05e6dd commit fc32dc4

6 files changed

Lines changed: 255 additions & 1 deletion

File tree

frankenphp.c

Lines changed: 142 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,143 @@ static bool is_forked_child = false;
9292
static void frankenphp_fork_child(void) { is_forked_child = true; }
9393
#endif
9494

95+
/* Best-effort force-kill for PHP threads after the graceful-drain grace
96+
* period. Each thread registers pointers to its own executor_globals'
97+
* vm_interrupt and timed_out atomic bools at boot; force_kill_thread
98+
* stores true into both from any goroutine, which wakes the VM at the
99+
* next opcode boundary and unwinds the thread through zend_timeout().
100+
*
101+
* On platforms with POSIX realtime signals (Linux, FreeBSD), force-kill
102+
* also delivers SIGRTMIN+3 to the target thread so any in-flight blocking
103+
* syscall (select, sleep, nanosleep, blocking I/O without SA_RESTART)
104+
* returns EINTR and the VM gets a chance to observe the atomic bools on
105+
* the next opcode. On Windows, CancelSynchronousIo + QueueUserAPC does
106+
* the equivalent for alertable I/O and SleepEx. Non-alertable Sleep()
107+
* (including PHP's usleep on Windows) stays uninterruptible - the VM
108+
* must wait for it to return naturally before bailing.
109+
*
110+
* macOS has no realtime signals exposed to user-space, so the atomic
111+
* bool path is the only mechanism there: threads busy-looping in PHP
112+
* are killed promptly, threads stuck in blocking syscalls wait to
113+
* return on their own. */
114+
#if defined(SIGRTMIN) && !defined(PHP_WIN32)
/* Realtime signal used to knock a PHP thread out of a blocking syscall.
 * Only available where the platform exposes SIGRTMIN and we are not
 * building for Windows. */
#define FRANKENPHP_KILL_SIGNAL (SIGRTMIN + 3)
#define FRANKENPHP_HAS_KILL_SIGNAL 1

/* Deliberately empty handler. The signal's only job is to be delivered:
 * that makes the interrupted syscall fail with EINTR, after which the VM
 * notices vm_interrupt on its next opcode and unwinds via zend_timeout(). */
static void frankenphp_kill_signal_handler(int sig) {
  (void)sig; /* unused */
}
#endif
123+
124+
#if defined(PHP_WIN32)
/* Empty APC routine. It exists only so that something can be queued to
 * the target thread with QueueUserAPC; the routine itself does nothing. */
static void CALLBACK frankenphp_noop_apc(ULONG_PTR param) {
  (void)param; /* unused */
}
#endif
127+
128+
typedef struct {
129+
zend_atomic_bool *vm_interrupt;
130+
zend_atomic_bool *timed_out;
131+
#ifdef FRANKENPHP_HAS_KILL_SIGNAL
132+
pthread_t tid;
133+
#elif defined(PHP_WIN32)
134+
HANDLE thread_handle;
135+
#endif
136+
} force_kill_slot;
137+
138+
static int force_kill_num_threads = 0;
139+
static force_kill_slot *thread_slots = NULL;
140+
141+
void frankenphp_init_force_kill(int num_threads) {
142+
thread_slots = calloc((size_t)num_threads, sizeof(force_kill_slot));
143+
if (thread_slots == NULL) {
144+
/* Out of memory at startup: leave force-kill disabled rather than crash
145+
* later in register/kill. Graceful drain still works via the yielding
146+
* path. */
147+
force_kill_num_threads = 0;
148+
return;
149+
}
150+
151+
#ifdef FRANKENPHP_HAS_KILL_SIGNAL
152+
/* Install the no-op handler process-wide with SA_RESTART cleared so
153+
* blocking syscalls return EINTR when the signal is delivered rather
154+
* than being transparently restarted by libc. */
155+
struct sigaction sa;
156+
memset(&sa, 0, sizeof(sa));
157+
sa.sa_handler = frankenphp_kill_signal_handler;
158+
sigemptyset(&sa.sa_mask);
159+
sa.sa_flags = 0;
160+
sigaction(FRANKENPHP_KILL_SIGNAL, &sa, NULL);
161+
#endif
162+
163+
force_kill_num_threads = num_threads;
164+
}
165+
166+
/* Called by each PHP thread at boot, from its own TSRM context, so that
167+
* the EG-backed addresses resolve to the thread's private executor_globals
168+
* and the captured thread identity refers to itself. The pointers stay
169+
* valid for the thread's lifetime. */
170+
void frankenphp_register_thread_for_kill(uintptr_t idx) {
171+
if (thread_slots == NULL || idx >= (uintptr_t)force_kill_num_threads) {
172+
return;
173+
}
174+
thread_slots[idx].vm_interrupt = &EG(vm_interrupt);
175+
thread_slots[idx].timed_out = &EG(timed_out);
176+
#ifdef FRANKENPHP_HAS_KILL_SIGNAL
177+
thread_slots[idx].tid = pthread_self();
178+
#elif defined(PHP_WIN32)
179+
if (!DuplicateHandle(GetCurrentProcess(), GetCurrentThread(),
180+
GetCurrentProcess(), &thread_slots[idx].thread_handle, 0,
181+
FALSE, DUPLICATE_SAME_ACCESS)) {
182+
/* DuplicateHandle can fail under resource pressure; leave the slot's
183+
* handle NULL so force_kill_thread falls back to the atomic-bool path
184+
* only and does not dereference an uninitialized handle. */
185+
thread_slots[idx].thread_handle = NULL;
186+
}
187+
#endif
188+
}
189+
190+
void frankenphp_force_kill_thread(uintptr_t idx) {
191+
if (thread_slots == NULL || idx >= (uintptr_t)force_kill_num_threads) {
192+
return;
193+
}
194+
force_kill_slot slot = thread_slots[idx];
195+
if (slot.vm_interrupt == NULL) {
196+
/* Thread never reached register_thread_for_kill (aborted during boot). */
197+
return;
198+
}
199+
/* Set the atomic bools first so that by the time the thread wakes up -
200+
* whether from our signal/APC or naturally - the VM sees them and
201+
* routes through zend_timeout() -> "Maximum execution time exceeded". */
202+
zend_atomic_bool_store(slot.timed_out, true);
203+
zend_atomic_bool_store(slot.vm_interrupt, true);
204+
205+
#ifdef FRANKENPHP_HAS_KILL_SIGNAL
206+
/* Return value intentionally ignored: ESRCH (thread already exited) and
207+
* EINVAL are both benign - there is simply nothing to unblock. */
208+
pthread_kill(slot.tid, FRANKENPHP_KILL_SIGNAL);
209+
#elif defined(PHP_WIN32)
210+
if (slot.thread_handle != NULL) {
211+
CancelSynchronousIo(slot.thread_handle);
212+
QueueUserAPC((PAPCFUNC)frankenphp_noop_apc, slot.thread_handle, 0);
213+
}
214+
#endif
215+
}
216+
217+
/* Tears down force-kill state: closes any duplicated thread handles
 * (Windows only), releases the slot array and returns to the disabled
 * state. Safe to call when force-kill was never initialized. */
void frankenphp_destroy_force_kill(void) {
#ifdef PHP_WIN32
  /* Handles obtained via DuplicateHandle at registration are ours to
   * close. The loop is skipped entirely when there is no slot array. */
  for (int i = 0; thread_slots != NULL && i < force_kill_num_threads; i++) {
    HANDLE h = thread_slots[i].thread_handle;
    if (h != NULL) {
      CloseHandle(h);
    }
  }
#endif

  free(thread_slots); /* free(NULL) is a no-op */
  thread_slots = NULL;
  force_kill_num_threads = 0;
}
231+
95232
void frankenphp_update_local_thread_context(bool is_worker) {
96233
is_worker_thread = is_worker;
97234

@@ -1073,6 +1210,11 @@ static void *php_thread(void *arg) {
10731210
#endif
10741211
#endif
10751212

1213+
/* Register this thread's vm_interrupt/timed_out addresses so the Go side
1214+
* can force-kill it after the graceful-drain grace period if it gets stuck
1215+
* in a busy PHP loop. */
1216+
frankenphp_register_thread_for_kill(thread_index);
1217+
10761218
bool thread_is_healthy = true;
10771219
bool has_attempted_shutdown = false;
10781220

frankenphp.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -193,6 +193,13 @@ void frankenphp_init_thread_metrics(int max_threads);
193193
void frankenphp_destroy_thread_metrics(void);
194194
size_t frankenphp_get_thread_memory_usage(uintptr_t thread_index);
195195

196+
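/* Best-effort force-kill primitives. Interrupt the Zend VM at the next
197+
* opcode boundary; blocking calls are additionally woken where the platform allows it (realtime signal on Linux/FreeBSD, CancelSynchronousIo + QueueUserAPC on Windows). */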
198+
void frankenphp_init_force_kill(int num_threads);
199+
void frankenphp_register_thread_for_kill(uintptr_t thread_index);
200+
void frankenphp_force_kill_thread(uintptr_t thread_index);
201+
void frankenphp_destroy_force_kill(void);
202+
196203
void register_extensions(zend_module_entry **m, int len);
197204

198205
#endif

phpmainthread.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,12 @@ func initPHPThreads(numThreads int, numMaxThreads int, phpIni map[string]string)
5656

5757
C.frankenphp_init_thread_metrics(C.int(mainThread.maxThreads))
5858

59+
// initialize force-kill support: allocates per-thread slots that each PHP
60+
// thread fills in at boot (frankenphp_register_thread_for_kill) so a
61+
// thread stuck in a busy PHP loop can be interrupted after the
62+
// graceful-drain grace period.
63+
C.frankenphp_init_force_kill(C.int(mainThread.maxThreads))
64+
5965
// initialize all other threads
6066
phpThreads = make([]*phpThread, mainThread.maxThreads)
6167
phpThreads[0] = initialThread
@@ -97,6 +103,7 @@ func drainPHPThreads() {
97103
}
98104

99105
doneWG.Wait()
106+
C.frankenphp_destroy_force_kill()
100107
mainThread.state.Set(state.Done)
101108
mainThread.state.WaitFor(state.Reserved)
102109
C.frankenphp_destroy_thread_metrics()

testdata/worker-sleep.php

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
<?php

// Test fixture for the force-kill grace period: the request handler blocks
// in sleep() for a minute, simulating a stuck request that keeps the worker
// from draining.
$stuckHandler = static function () {
    sleep(60);
    echo 'should not reach';
};

// Keep serving requests until frankenphp_handle_request returns a falsy value.
while (\frankenphp_handle_request($stuckHandler)) {
    // nothing to do between requests
}

worker.go

Lines changed: 33 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ package frankenphp
44
import "C"
55
import (
66
"fmt"
7+
"log/slog"
78
"os"
89
"path/filepath"
910
"runtime"
@@ -165,6 +166,13 @@ func newWorker(o workerOpt) (*worker, error) {
165166
return w, nil
166167
}
167168

169+
// drainGracePeriod is the time a worker thread has to stop gracefully after
170+
// receiving the drain signal before the force-kill primitive is armed on it.
171+
// Well-behaved scripts return promptly on drainChan close; stuck ones (e.g.
172+
// blocking C calls inside the VM) would otherwise hang drainWorkerThreads
173+
// forever.
174+
const drainGracePeriod = 5 * time.Second
175+
168176
// EXPERIMENTAL: DrainWorkers finishes all worker scripts before a graceful shutdown
169177
func DrainWorkers() {
170178
_ = drainWorkerThreads()
@@ -201,7 +209,31 @@ func drainWorkerThreads() []*phpThread {
201209
worker.threadMutex.RUnlock()
202210
}
203211

204-
ready.Wait()
212+
// Wait for graceful drain, then arm the force-kill primitive on any
213+
// thread still stuck. It flags the thread's VM to bail out at the next
214+
// opcode; Linux/FreeBSD also signal the thread so a blocking syscall
215+
// returns EINTR, and Windows interrupts blocking I/O and alertable waits.
216+
// Elsewhere a thread in a blocking call only exits once that call returns.
217+
done := make(chan struct{})
218+
go func() {
219+
ready.Wait()
220+
close(done)
221+
}()
222+
223+
select {
224+
case <-done:
225+
// everyone yielded in time
226+
case <-time.After(drainGracePeriod):
227+
for _, thread := range drainedThreads {
228+
if !thread.state.Is(state.Yielding) {
229+
C.frankenphp_force_kill_thread(C.uintptr_t(thread.threadIndex))
230+
}
231+
}
232+
if globalLogger.Enabled(globalCtx, slog.LevelWarn) {
233+
globalLogger.LogAttrs(globalCtx, slog.LevelWarn, "worker threads did not yield within grace period, force-killing stuck threads")
234+
}
235+
<-done
236+
}
205237

206238
return drainedThreads
207239
}

worker_test.go

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,13 +9,17 @@ import (
99
"net/http"
1010
"net/http/httptest"
1111
"net/url"
12+
"os"
13+
"runtime"
1214
"strconv"
1315
"strings"
1416
"sync"
1517
"testing"
18+
"time"
1619

1720
"github.com/dunglas/frankenphp"
1821
"github.com/stretchr/testify/assert"
22+
"github.com/stretchr/testify/require"
1923
)
2024

2125
func TestWorker(t *testing.T) {
@@ -45,6 +49,56 @@ func TestWorker(t *testing.T) {
4549
}, &testOptions{workerScript: "worker.php", nbWorkers: 1, nbParallelRequests: 1})
4650
}
4751

52+
// TestRestartWorkersForceKillsStuckThread verifies that the drain path used
53+
// by RestartWorkers and DrainWorkers does not hang indefinitely when a
54+
// worker thread is stuck inside a blocking PHP call (sleep, synchronous
55+
// I/O, etc.). The force-kill primitive delivers a realtime signal to the
56+
// thread on Linux/FreeBSD (interrupts the syscall with EINTR) or calls
57+
// CancelSynchronousIo + QueueUserAPC on Windows. macOS has no realtime
58+
// signal exposed to user-space, so a thread stuck in sleep() cannot be
59+
// force-unblocked there; skip the test.
60+
func TestRestartWorkersForceKillsStuckThread(t *testing.T) {
61+
if runtime.GOOS != "linux" && runtime.GOOS != "freebsd" && runtime.GOOS != "windows" {
62+
t.Skipf("force-kill cannot interrupt blocking syscalls on %s", runtime.GOOS)
63+
}
64+
65+
cwd, _ := os.Getwd()
66+
testDataDir := cwd + "/testdata/"
67+
68+
require.NoError(t, frankenphp.Init(
69+
frankenphp.WithWorkers("sleep-worker", testDataDir+"worker-sleep.php", 1),
70+
frankenphp.WithNumThreads(2),
71+
))
72+
t.Cleanup(frankenphp.Shutdown)
73+
74+
// Fire a request the worker will handle and then block on (sleep 60s).
75+
// When the drain runs, the worker script is inside the handler callback,
76+
// below frankenphp_handle_request, so the drain signal on drainChan
77+
// can't be observed until the blocking sleep returns.
78+
go func() {
79+
req := httptest.NewRequest("GET", "http://example.com/worker-sleep.php", nil)
80+
fr, err := frankenphp.NewRequestWithContext(req, frankenphp.WithRequestDocumentRoot(testDataDir, false))
81+
if err != nil {
82+
return
83+
}
84+
_ = frankenphp.ServeHTTP(httptest.NewRecorder(), fr)
85+
}()
86+
87+
// Give the request time to reach the handler and enter sleep().
88+
time.Sleep(500 * time.Millisecond)
89+
90+
// RestartWorkers must complete within the grace period + a bit of slack.
91+
// Without force-kill, it would wait for the 60s sleep to return.
92+
start := time.Now()
93+
frankenphp.RestartWorkers()
94+
elapsed := time.Since(start)
95+
96+
// Grace period is 5s; allow margin for SIGALRM dispatch, PHP VM tick,
97+
// and the drain's final ready.Wait() plus the restart loop.
98+
const budget = 8 * time.Second
99+
assert.Less(t, elapsed, budget, "drain must force-kill the stuck thread within the grace period")
100+
}
101+
48102
func TestWorkerDie(t *testing.T) {
49103
runTest(t, func(handler func(http.ResponseWriter, *http.Request), _ *httptest.Server, i int) {
50104
req := httptest.NewRequest("GET", "http://example.com/die.php", nil)

0 commit comments

Comments
 (0)