Add crashing test: blocking_operation_wait exception bypasses cleanup

samuel-williams-shopify · cursoragent · samuel-williams-shopify · commit a77d1fbe555b · 2026-05-11T07:55:05.000+09:00
Co-authored-by: Cursor <cursoragent@cursor.com>
diff --git a/scheduler.c b/scheduler.c
@@ -1088,15 +1088,6 @@ rb_fiber_scheduler_address_resolve(VALUE scheduler, VALUE hostname)
  *       Thread.new { blocking_operation.call }.join
  *     end
  */
-// Helper for rb_protect: calls scheduler.blocking_operation_wait(blocking_operation).
-// args[0] = scheduler VALUE, args[1] = blocking_operation VALUE.
-static VALUE
-scheduler_blocking_operation_wait_call(VALUE _args)
-{
-    VALUE *args = (VALUE *)_args;
-    return rb_funcall(args[0], id_blocking_operation_wait, 1, args[1]);
-}
-
 VALUE rb_fiber_scheduler_blocking_operation_wait(VALUE scheduler, void* (*function)(void *), void *data, rb_unblock_function_t *unblock_function, void *data2, int flags, struct rb_fiber_scheduler_blocking_operation_state *state)
 {
     // Check if scheduler supports blocking_operation_wait before creating the object
@@ -1107,35 +1098,22 @@ VALUE rb_fiber_scheduler_blocking_operation_wait(VALUE scheduler, void* (*functi
     // Create a new BlockingOperation with the blocking operation
     VALUE blocking_operation = rb_fiber_scheduler_blocking_operation_new(function, data, unblock_function, data2, flags, state);
 
-    rb_fiber_scheduler_blocking_operation_t *operation = get_blocking_operation(blocking_operation);
-
-    // Use rb_protect so that cleanup runs even when the scheduler raises an exception
-    // (e.g. via rb_jump_tag from worker_pool_call). Without this, a longjmp from
-    // inside rb_funcall bypasses the cleanup below and RB_GC_GUARD, leaving
-    // operation with stale pointers and blocking_operation without a live GC root.
-    VALUE call_args[2] = {scheduler, blocking_operation};
-    int tag = 0;
-    VALUE result = rb_protect(scheduler_blocking_operation_wait_call, (VALUE)call_args, &tag);
-
-    operation = get_blocking_operation(blocking_operation);
+    VALUE result = rb_funcall(scheduler, id_blocking_operation_wait, 1, blocking_operation);
 
     // Get the operation data to check if it was executed
+    rb_fiber_scheduler_blocking_operation_t *operation = get_blocking_operation(blocking_operation);
     rb_atomic_t current_status = RUBY_ATOMIC_LOAD(operation->status);
 
-    // Invalidate the operation now that we're done with it — must happen even on
-    // exception paths, since operation->state may point to a caller's stack frame.
+    // Invalidate the operation now that we're done with it
     operation->function = NULL;
     operation->state = NULL;
     operation->data = NULL;
     operation->data2 = NULL;
     operation->unblock_function = NULL;
 
-    // Ensure that blocking_operation remains a live GC root through the cleanup above.
+    // Ensure that the blocking operation remains visible until this point:
     RB_GC_GUARD(blocking_operation);
 
-    // Re-raise any exception from the scheduler after cleanup.
-    if (tag) rb_jump_tag(tag);
-
     // If the blocking operation was never executed, return Qundef to signal the caller to use rb_nogvl instead
     if (current_status == RB_FIBER_SCHEDULER_BLOCKING_OPERATION_STATUS_QUEUED) {
         return Qundef;
diff --git a/test/fiber/test_blocking_operation_exception.rb b/test/fiber/test_blocking_operation_exception.rb
@@ -0,0 +1,61 @@
+# frozen_string_literal: true
+
+# Test that rb_fiber_scheduler_blocking_operation_wait correctly handles
+# exceptions raised by the scheduler's blocking_operation_wait method.
+#
+# When the scheduler raises (e.g. due to fiber interrupt), the exception
+# propagates through rb_funcall in the C function, bypassing the cleanup
+# code that nulls out operation->function, operation->state, etc.
+# operation->state points to a stack-allocated struct in the caller
+# (rb_nogvl), so after the exception unwinds that frame the pointer is
+# dangling. Any subsequent dereference — including from dfree during GC
+# or a re-execution of the operation — causes a segfault.
+
+require "test/unit"
+require_relative "scheduler"
+
+class TestBlockingOperationException < Test::Unit::TestCase
+  # Scheduler whose blocking_operation_wait raises after completing the work,
+  # simulating a fiber interrupt arriving after the operation finishes.
+  class InterruptingScheduler < Scheduler
+    def blocking_operation_wait(blocking_operation)
+      super
+      raise Interrupt, "simulated fiber interrupt"
+    end
+  end
+
+  def test_blocking_operation_exception_does_not_corrupt_state
+    skip "IO::Buffer not available" unless defined?(IO::Buffer)
+
+    # Use a buffer large enough to trigger the scheduler's blocking_operation_wait
+    # (rb_nogvl calls the scheduler hook for large copies).
+    size   = 2 * 1024 * 1024  # 2 MiB
+    source = IO::Buffer.new(size)
+    dest   = IO::Buffer.new(size)
+    source.clear(65) # fill with 'A'
+
+    caught = []
+
+    Thread.new do
+      Fiber.set_scheduler(InterruptingScheduler.new)
+
+      Fiber.schedule do
+        dest.copy(source, 0, size, 0)
+      rescue Interrupt => e
+        caught << e
+      end
+    end.join
+
+    assert_equal 1, caught.size, "Expected exactly one Interrupt to be rescued"
+
+    # Trigger GC to detect any use-after-free via the stale operation->state
+    # pointer. Without the fix, the dfree for the blocking_operation TypedData
+    # can dereference a now-invalid stack pointer, causing a segfault here or
+    # silently corrupting a subsequent allocation.
+    GC.start(full_mark: true, immediate_sweep: true)
+    GC.compact if GC.respond_to?(:compact)
+
+    # If we reach here without crashing, the fix is working.
+    assert_equal size, dest.size
+  end
+end