diff --git a/bundles/org.eclipse.swt/Eclipse SWT Custom Widgets/common/org/eclipse/swt/custom/BusyIndicator.java b/bundles/org.eclipse.swt/Eclipse SWT Custom Widgets/common/org/eclipse/swt/custom/BusyIndicator.java index eb143853535..d49a288fb5d 100644 --- a/bundles/org.eclipse.swt/Eclipse SWT Custom Widgets/common/org/eclipse/swt/custom/BusyIndicator.java +++ b/bundles/org.eclipse.swt/Eclipse SWT Custom Widgets/common/org/eclipse/swt/custom/BusyIndicator.java @@ -107,11 +107,16 @@ public static void showWhile(Future future) { Integer busyId = setBusyCursor(display); try { if (future instanceof CompletionStage stage) { - // let us wake up from sleep once the future is done + // Wake the UI thread from sleep once the future is done. + // Using asyncExec instead of wake() because asyncExec adds + // a message to the synchronizer queue, which is checked by + // Display.sleep()'s loop condition. This is more reliable + // than the wake flag alone on GTK/Linux, where the wake + // flag can be subject to race conditions. stage.handle((nil1, nil2) -> { if (!display.isDisposed()) { try { - display.wake(); + display.asyncExec(() -> {}); } catch (SWTException e) { // ignore then, this can happen due to the async nature between our check for // disposed and the actual call to wake the display can be disposed diff --git a/bundles/org.eclipse.swt/Eclipse SWT/gtk/org/eclipse/swt/widgets/Display.java b/bundles/org.eclipse.swt/Eclipse SWT/gtk/org/eclipse/swt/widgets/Display.java index c94e0a0ce43..91d2d868b15 100644 --- a/bundles/org.eclipse.swt/Eclipse SWT/gtk/org/eclipse/swt/widgets/Display.java +++ b/bundles/org.eclipse.swt/Eclipse SWT/gtk/org/eclipse/swt/widgets/Display.java @@ -127,7 +127,7 @@ public class Display extends Device implements Executor { Event [] eventQueue; long fds; int allocated_nfds; - boolean wake; + volatile boolean wake; boolean windowSizeSet; int [] max_priority = new int [1], timeout = new int [1]; Callback eventCallback; @@ -5610,6 +5610,19 @@ public boolean sleep () { max_priority [0] = timeout [0] = 0; long context = OS.g_main_context_default (); boolean result = false; + /* + * Reset the wake flag once before entering the poll loop. This clears + * any stale wake signal from before sleep() was called. The flag must + * NOT be reset inside the loop, because doing so creates a race + * condition: another thread calling wakeThread() may set wake=true and + * signal the wakeup fd, but if the UI thread then resets wake=false at + * the start of the next iteration, the wakeup is lost. The eventfd + * signal gets consumed by g_main_context_check(), and with wake=false + * the loop continues sleeping, potentially indefinitely. + * + * See https://github.com/eclipse-platform/eclipse.platform.swt/issues/3044 + */ + wake = false; do { if (OS.g_main_context_acquire (context)) { result = OS.g_main_context_prepare (context, max_priority); @@ -5623,13 +5636,14 @@ public boolean sleep () { if (poll != 0) { if (nfds > 0 || timeout [0] != 0) { /* - * Bug in GTK. For some reason, g_main_context_wakeup() may - * fail to wake up the UI thread from the polling function. - * The fix is to sleep for a maximum of 50 milliseconds. - */ + * Cap the poll timeout to ensure the event loop remains + * responsive. The g_main_context_wakeup() mechanism is + * reliable on modern GLib (tested on GLib 2.56+), but this + * timeout acts as a safety net. A timeout of -1 would mean + * blocking indefinitely, which could cause hangs if a + * wakeup signal is missed for any reason. + */ if (timeout [0] < 0) timeout [0] = 50; - - wake = false; OS.Call (poll, fds, nfds, timeout [0]); } } @@ -5637,7 +5651,6 @@ public boolean sleep () { OS.g_main_context_release (context); } } while (!result && synchronizer.isMessagesEmpty() && !wake); - wake = false; if (!GTK.GTK4) GDK.gdk_threads_enter (); sendPostExternalEventDispatchEvent (); return true; diff --git a/docs/wake-sleep-race-condition-analysis.md b/docs/wake-sleep-race-condition-analysis.md new file mode 100644 index 00000000000..de96f78e9c5 --- /dev/null +++ b/docs/wake-sleep-race-condition-analysis.md @@ -0,0 +1,179 @@ +# Display.wake()/sleep() Race Condition Analysis + +## Summary + +The GTK implementation of `Display.sleep()` contained a race condition that could cause the +UI thread to miss wakeup signals, leading to the event loop becoming unresponsive until an +external event (mouse movement, timer, etc.) occurred. This manifested as intermittent hangs +in `BusyIndicator.showWhile()` tests on Linux CI environments, where there are no external +input events to accidentally break the deadlock. + +## Related Issues + +- [#3044](https://github.com/eclipse-platform/eclipse.platform.swt/issues/3044) - Test_org_eclipse_swt_custom_BusyIndicator fails on Linux +- [#3053](https://github.com/eclipse-platform/eclipse.platform.swt/issues/3053) - `g_main_context_wakeup` Is Maybe Not Buggy +- [#3059](https://github.com/eclipse-platform/eclipse.platform.swt/issues/3059) - sleep() does not react to thread interruption +- [#3060](https://github.com/eclipse-platform/eclipse.platform.swt/issues/3060) - Display#wake must likely be volatile + +## Root Cause Analysis + +### The `wake` Flag Race Condition + +The core issue was in `Display.sleep()` (GTK implementation) at the line `wake = false` inside +the `do-while` loop. This reset of the `wake` flag created a race condition with +`wakeThread()` called from other threads. + +#### Before Fix (Problematic Code) + +```java +// Display.sleep() - GTK +do { + if (OS.g_main_context_acquire(context)) { + result = OS.g_main_context_prepare(context, max_priority); + // ... query poll descriptors ... + if (poll != 0) { + if (nfds > 0 || timeout[0] != 0) { + if (timeout[0] < 0) timeout[0] = 50; + wake = false; // ← PROBLEM: Resets flag inside loop + OS.Call(poll, fds, nfds, timeout[0]); // ← Blocks in poll() + } + } + OS.g_main_context_check(context, ...); // ← Consumes wakeup fd signal + OS.g_main_context_release(context); + } +} while (!result && synchronizer.isMessagesEmpty() && !wake); +wake = false; +``` + +```java +// Display.wakeThread() - called from other threads +void wakeThread() { + OS.g_main_context_wakeup(0); // Signals the wakeup eventfd + wake = true; // Sets the flag +} +``` + +#### The Race Scenario + +``` +UI Thread (sleep loop) Background Thread +======================== ====================== + +Loop iteration N completes. +Poll returned (timeout/event). +g_main_context_check() runs. +Loop condition: !wake → true + (wake is still false, loop continues) + wakeThread() called: + g_main_context_wakeup(0) → signals eventfd + wake = true → sets flag + +wake = false ← OVERWRITES true! (flag is now lost!) +OS.Call(poll, ...) → returns immediately + (eventfd was still signaled) +g_main_context_check() → CONSUMES the eventfd signal +Loop condition: !wake → true + (wake was overwritten to false!) + +Next iteration: +wake = false +OS.Call(poll, ...) → blocks for 50ms + (no eventfd signal, no events) +... repeats every 50ms indefinitely ... +``` + +The wakeup is **permanently lost** because: +1. `wake = false` inside the loop overwrites the `true` set by `wakeThread()` +2. `g_main_context_check()` consumes the eventfd signal +3. With `wake` back to `false` and no eventfd signal, the loop has no exit condition + +The loop only breaks when: +- A GTK event arrives (mouse move, timer, etc.) causing `result` to be `true` +- A synchronizer message is added (causing `isMessagesEmpty()` to return `false`) +- Another call to `wakeThread()` happens to be timed correctly + +### The `volatile` Issue + +Additionally, the `wake` field was **not volatile**, meaning: +- The JVM could cache the value in a CPU register +- Writes from background threads might not be visible to the UI thread +- The compiler could reorder reads/writes around the field + +### Why This Only Manifested in CI + +In normal desktop usage, external events (mouse movements, window repaints, system timers) +frequently cause the poll to return and `g_main_context_prepare()` to return `true`, breaking +the loop naturally. In headless CI environments on Linux, there are no such external events, +making the race condition much more likely to cause an observable hang. + +## The "Bug in GTK" Comment + +The original code contained this comment (from October 2004, commit `400a4197`): + +```java +/* + * Bug in GTK. For some reason, g_main_context_wakeup() may + * fail to wake up the UI thread from the polling function. + * The fix is to sleep for a maximum of 50 milliseconds. + */ +if (timeout[0] < 0) timeout[0] = 50; +``` + +### Analysis + +As demonstrated in [issue #3053](https://github.com/eclipse-platform/eclipse.platform.swt/issues/3053), +`g_main_context_wakeup()` works correctly on modern GLib (tested on GLib 2.84). The function +uses an eventfd-based mechanism that reliably wakes polling threads. + +The original "bug" reported in 2004 was likely a symptom of the same race condition in SWT's +own code (`wake = false` overwriting `wake = true`), not a bug in GTK's `g_main_context_wakeup`. +When the wakeup appeared to "fail", it was actually SWT's flag reset that lost the signal. + +The 50ms timeout cap serves as a useful **safety net** to limit the maximum sleep duration, but +it was not a fix for a GTK bug — it was masking the race condition in SWT's own code. + +## Platform Comparison + +| Platform | `wake` field? | `sleep()` mechanism | `wakeThread()` mechanism | Race condition? | +|----------|--------------|---------------------|-------------------------|-----------------| +| **GTK** | `boolean wake` (was not volatile) | Manual poll loop with `g_main_context_*` | `g_main_context_wakeup(0)` + `wake = true` | **YES** (fixed) | +| **Win32** | None | `OS.WaitMessage()` | `OS.PostThreadMessage(threadId, WM_NULL, ...)` | No — uses message queue | +| **Cocoa** | None | `NSRunLoop.runMode()` | `performSelectorOnMainThread()` | No — uses run loop event | + +Win32 and Cocoa don't have this issue because they use native event queue mechanisms to both +block and wake, with no separate boolean flag. + +## Fixes Applied + +### Fix 1: Move `wake = false` Before the Loop (Display.sleep) + +The `wake = false` statement is moved from inside the `do-while` loop to before it. This +ensures the flag is reset once at the start of `sleep()`, clearing any stale signal, but +is never overwritten during the loop. Once `wakeThread()` sets `wake = true`, the flag +stays true and the loop exits. + +### Fix 2: Make `wake` Volatile (Display field) + +The `wake` field is now declared `volatile` to ensure cross-thread visibility. This +guarantees that writes from `wakeThread()` (called from background threads) are +immediately visible to the UI thread checking the flag in the loop condition. + +### Fix 3: Use `asyncExec` in BusyIndicator (Defense in Depth) + +`BusyIndicator.showWhile()` now uses `display.asyncExec(() -> {})` instead of +`display.wake()` to signal the UI thread when a `CompletionStage` completes. This +works through a different code path: `asyncExec` adds a message to the synchronizer +queue, and `Display.sleep()`'s loop condition checks `synchronizer.isMessagesEmpty()`. +This provides an additional, independent mechanism to break the sleep loop. + +## Historical Timeline + +| Date | Event | +|------|-------| +| 2004-10-08 | Original `wake = false` and 50ms timeout added (commit `400a4197`) | +| 2004-10-08 | "Bug in GTK" comment added — likely misattributed SWT race condition to GTK | +| 2026-02-02 | Issue #3044 opened — BusyIndicator test hang on Linux CI | +| 2026-02-04 | Issue #3053 opened — Analysis showing g_main_context_wakeup works correctly | +| 2026-02-05 | Issues #3059 and #3060 opened — sleep() interrupt support and volatile wake | +| 2026-04-01 | HeikoKlare confirms removing `wake = false` inside loop fixes the issue | +| 2026-04-02 | This fix applied — race condition eliminated, comment corrected |