|
| 1 | +# frozen_string_literal: true |
| 2 | +require 'test/unit' |
| 3 | +require_relative 'scheduler' |
| 4 | + |
| 5 | +# Regression test for a GC bug in rb_fiber_scheduler_blocking_operation_wait. |
| 6 | +# |
| 7 | +# rb_fiber_scheduler_blocking_operation_wait creates `blocking_operation` as a |
| 8 | +# C-local VALUE, then calls rb_funcall(scheduler, :blocking_operation_wait, 1, |
| 9 | +# blocking_operation). If the scheduler's implementation causes a fiber switch |
| 10 | +# (via rb_fiber_scheduler_block), the calling fiber's C stack is suspended. |
| 11 | +# The conservative GC may not scan it correctly, allowing `blocking_operation` |
| 12 | +# to be collected. When Ruby reads it back via get_blocking_operation() at |
| 13 | +# scheduler.c:1104 after rb_funcall returns, it finds freed/reused memory -> |
| 14 | +# crash (rb_unexpected_object_type -> segfault). |
| 15 | +# |
| 16 | +# We can reliably trigger the same crash without a fiber switch by calling |
| 17 | +# GC.compact inside blocking_operation_wait: the compacting GC may move the |
| 18 | +# blocking_operation object without updating the C-local VALUE in the (now |
| 19 | +# inactive) scheduler.c frame, leaving a stale pointer. |
| 20 | +# |
| 21 | +# See: https://github.com/socketry/io-event/pull/166 |
| 22 | + |
# Test scheduler that provokes garbage-collector activity from inside
# #blocking_operation_wait, after the blocking operation itself has run.
#
# The intent (per the regression described at the top of this file) is to
# stress the window in which the C caller still holds its `blocking_operation`
# VALUE while Ruby-level scheduler code is executing.
class BlockingOperationGCScheduler < Scheduler
  # Supported ways of provoking the GC:
  #   :fiber_switch - run a full GC on a separate fiber (realistic: worker-pool-style)
  #   :compact      - call GC.compact directly in the hook
  TRIGGERS = %i[compact fiber_switch].freeze

  # Number of throwaway objects allocated before each GC run to add heap pressure.
  GARBAGE_OBJECTS = 1000

  attr_reader :trigger
  attr_reader :blocking_operation_wait_call_count

  # fiber   - forwarded unchanged to Scheduler#initialize.
  # trigger - one of TRIGGERS; selects how #blocking_operation_wait stresses the GC.
  #
  # Raises ArgumentError if trigger is not one of TRIGGERS.
  def initialize(fiber = Fiber.current, trigger: :compact)
    super(fiber)
    self.trigger = trigger
    @gc_fiber = nil
    @blocking_operation_wait_call_count = 0
  end

  # Sets the GC trigger. Unknown values are rejected up front: otherwise
  # #blocking_operation_wait would silently skip the GC stress and any test
  # built on this scheduler would pass without exercising anything.
  #
  # Raises ArgumentError if value is not one of TRIGGERS.
  def trigger=(value)
    unless TRIGGERS.include?(value)
      raise ArgumentError, "unknown trigger: #{value.inspect} (expected one of #{TRIGGERS.inspect})"
    end

    @trigger = value
  end

  # Scheduler hook: runs `work` to completion on a separate thread, then
  # provokes the GC according to the configured trigger.
  #
  # work - callable representing the blocking operation.
  def blocking_operation_wait(work)
    @blocking_operation_wait_call_count += 1

    # Execute the blocking operation so Ruby marks it done.
    run_blocking_operation(work)

    case @trigger
    when :compact
      compact_heap
    when :fiber_switch
      collect_on_other_fiber
    end
  end

  private

  # Runs `work` on a new thread and waits for it. If the wait is interrupted
  # by an exception, the worker is killed instead of being left running.
  def run_blocking_operation(work)
    worker = Thread.new { work.call }
    worker.join
    worker = nil # joined cleanly; nothing left for the ensure clause to kill
  ensure
    worker&.kill
  end

  # GC.compact moves objects. If the C-local `blocking_operation` VALUE in
  # rb_fiber_scheduler_blocking_operation_wait is not updated, the pointer
  # becomes stale and get_blocking_operation() then sees a moved/freed object.
  def compact_heap
    allocate_garbage
    GC.compact
  end

  # Resumes a dedicated fiber that performs a full, immediately-swept GC and
  # then yields back here. The fiber is created lazily and reused across calls.
  # The aim is for a GC to run while the calling fiber is not the one executing.
  def collect_on_other_fiber
    @gc_fiber ||= Fiber.new do
      loop do
        allocate_garbage
        GC.start(full_mark: true, immediate_sweep: true)
        Fiber.yield
      end
    end
    @gc_fiber.resume
  end

  # Allocates short-lived objects to increase pressure on the GC.
  def allocate_garbage
    GARBAGE_OBJECTS.times { Object.new }
  end
end
| 68 | + |
# Regression tests: closing an IO under a fiber scheduler must not raise (or
# crash) when the scheduler's blocking_operation_wait hook provokes the GC.
class TestBlockingOperationGC < Test::Unit::TestCase
  OMIT_NOT_ROUTED =
    "blocking_operation_wait was not called (Ruby version does not route IO#close through it)"

  # Omits the current test when a prerequisite is missing.
  #
  # compact - whether the test needs GC.compact. Only the :compact trigger
  #           calls it, so the :fiber_switch test passes false.
  def omit_unless_supported(compact: true)
    omit "GC.compact not supported" if compact && !GC.respond_to?(:compact)
    omit "TCPServer not available" unless defined?(TCPServer)
  end

  # Runs the scenario on a fresh thread with a BlockingOperationGCScheduler
  # installed: opens a local TCP server/client pair and closes the server
  # from inside a scheduled fiber.
  #
  # trigger - GC trigger passed to BlockingOperationGCScheduler.new.
  #
  # Returns the scheduler so callers can inspect its call count. Exceptions
  # raised inside the thread propagate to the caller through Thread#join.
  def run_with_scheduler(trigger:)
    scheduler = nil
    Thread.new do
      scheduler = BlockingOperationGCScheduler.new(trigger: trigger)
      Fiber.set_scheduler(scheduler)

      server = TCPServer.new("localhost", 0)
      client = TCPSocket.new("localhost", server.addr[1])
      peer = server.accept

      Fiber.schedule do
        # IO#close calls maygvl_close -> IO_WITHOUT_GVL -> rb_nogvl(RB_NOGVL_OFFLOAD_SAFE)
        # -> rb_fiber_scheduler_blocking_operation_wait when a fiber scheduler is active.
        server.close
      end
    ensure
      # `server` is included so it is not leaked when setup fails before the
      # scheduled close; close_quietly skips sockets that are already closed.
      [client, peer, server].each { |io| close_quietly(io) }
    end.join
    scheduler
  end

  # Trigger via GC.compact inside blocking_operation_wait.
  # GC.compact moves objects; if the C-local `blocking_operation` VALUE in
  # rb_fiber_scheduler_blocking_operation_wait is not updated,
  # get_blocking_operation() reads a stale pointer -> wrong-type crash.
  def test_blocking_operation_gc_compact
    omit_unless_supported
    assert_survives_blocking_operation(trigger: :compact)
  end

  # Trigger via a GC run on another fiber inside blocking_operation_wait.
  # If `blocking_operation` is not found by the conservative GC it gets
  # collected, and get_blocking_operation() crashes on return from rb_funcall.
  # This variant never calls GC.compact, so it does not require it.
  def test_blocking_operation_gc_fiber_switch
    omit_unless_supported(compact: false)
    assert_survives_blocking_operation(trigger: :fiber_switch)
  end

  private

  # Runs the scenario with the given trigger and asserts that nothing is raised.
  # The "hook not called" omission is issued after assert_nothing_raised has
  # returned, so it does not depend on that assertion letting the framework's
  # omission exception pass through.
  def assert_survives_blocking_operation(trigger:)
    scheduler = nil
    assert_nothing_raised do
      scheduler = run_with_scheduler(trigger: trigger)
    end
    omit OMIT_NOT_ROUTED if scheduler.blocking_operation_wait_call_count.zero?
  end

  # Best-effort close used during cleanup. Tolerates nil and already-closed
  # sockets, and ignores I/O-level close failures so they cannot mask the
  # scenario's real outcome.
  def close_quietly(io)
    io.close unless io.nil? || io.closed?
  rescue IOError, SystemCallError
    nil
  end
end
0 commit comments