Skip to content

Commit e65daa8

Browse files
vfs: fix MemFS cloneMu recursive RLock deadlock
Three MemFS methods deadlock against concurrent CrashClone due to recursive cloneMu.RLock() acquisition. Go's sync.RWMutex is writer-preferring: once CrashClone is waiting for the write lock, any new RLock call blocks behind it. When a method already holding RLock calls another method that also tries RLock, neither can proceed. The three affected methods and their recursive locking chains: - ReuseForWrite: takes RLock, then calls Rename which takes RLock - Link: takes RLock at entry, then takes RLock again inside inner walk callback (copy-paste of the locking boilerplate) - Lock: takes RLock, then calls Create which takes RLock The bug was introduced in a70d5b3 ("vfs: redesign MemFS strict mode", 2024-08-26), which replaced the old SetIgnoreSyncs/ResetToSyncedState approach with the CrashClone model. That commit systematically added cloneMu.RLock() to every mutation method, but did not trace call graphs to catch methods that compose other MemFS methods. The deadlock has existed for ~20 months as a sporadic CI flake (#4677, #5199, #5327). CrashClone fires only once per test iteration (when the error injector's write counter hits zero), so the contention window is extremely narrow on fast hardware. It surfaces reliably only on slow platforms: s390x QEMU (~10x slower), Windows CI, linux-race. Each occurrence was closed without a root cause until #5987 provided a full goroutine dump showing the RWMutex.Lock / RWMutex.RLock deadlock. Fix: extract lock-free internal methods (rename, create) that assume the caller already holds cloneMu. The public Rename/Create methods delegate to these after acquiring the lock. ReuseForWrite calls rename() and Lock calls create() to avoid the recursive acquisition. For Link, the inner RLock/RUnlock is simply removed since the outer lock at method entry already covers the entire body. Add a comment on the cloneMu field documenting the recursive-locking constraint to prevent recurrence. Add TestMemFSCrashCloneConcurrency which exercises concurrent CrashClone against ReuseForWrite, Link, and Lock. Without this fix, the test deadlocks within seconds. Fixes #5987. Informs #5959, #5972. Co-Authored-By: roachdev-claude <roachdev-claude-bot@cockroachlabs.com>
1 parent 52cdf9e commit e65daa8

2 files changed

Lines changed: 133 additions & 4 deletions

File tree

vfs/mem_fs.go

Lines changed: 19 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,11 @@ type MemFS struct {
8585

8686
// cloneMu is used to block all modification operations while we clone the
8787
// filesystem. Only used when crashable is true.
88+
//
89+
// Methods holding cloneMu must not call other public MemFS methods that also
90+
// acquire cloneMu; use the unexported variants (rename, create) instead.
91+
// Go's sync.RWMutex is writer-preferring, so recursive RLock deadlocks when
92+
// a writer (CrashClone) is waiting.
8893
cloneMu sync.RWMutex
8994

9095
// lockFiles holds a map of open file locks. Presence in this map indicates
@@ -224,6 +229,12 @@ func (y *MemFS) Create(fullname string, category DiskWriteCategory) (File, error
224229
y.cloneMu.RLock()
225230
defer y.cloneMu.RUnlock()
226231
}
232+
return y.create(fullname, category)
233+
}
234+
235+
// create is the lock-free internal implementation of Create. The caller must
236+
// hold cloneMu when crashable is true.
237+
func (y *MemFS) create(fullname string, category DiskWriteCategory) (File, error) {
227238
var ret *memFile
228239
err := y.walk(fullname, func(dir *memNode, frag string, final bool) error {
229240
if final {
@@ -281,8 +292,6 @@ func (y *MemFS) Link(oldname, newname string) error {
281292
if frag == "" {
282293
return errors.New("pebble/vfs: empty file name")
283294
}
284-
y.cloneMu.RLock()
285-
defer y.cloneMu.RUnlock()
286295
if _, ok := dir.children[frag]; ok {
287296
return &os.LinkError{
288297
Op: "link",
@@ -423,6 +432,12 @@ func (y *MemFS) Rename(oldname, newname string) error {
423432
y.cloneMu.RLock()
424433
defer y.cloneMu.RUnlock()
425434
}
435+
return y.rename(oldname, newname)
436+
}
437+
438+
// rename is the lock-free internal implementation of Rename. The caller must
439+
// hold cloneMu when crashable is true.
440+
func (y *MemFS) rename(oldname, newname string) error {
426441
var n *memNode
427442
err := y.walk(oldname, func(dir *memNode, frag string, final bool) error {
428443
if final {
@@ -461,7 +476,7 @@ func (y *MemFS) ReuseForWrite(oldname, newname string, category DiskWriteCategor
461476
y.cloneMu.RLock()
462477
defer y.cloneMu.RUnlock()
463478
}
464-
if err := y.Rename(oldname, newname); err != nil {
479+
if err := y.rename(oldname, newname); err != nil {
465480
return nil, err
466481
}
467482
f, err := y.Open(newname)
@@ -530,7 +545,7 @@ func (y *MemFS) Lock(fullname string) (io.Closer, error) {
530545
// directory. Create the path so that we have the normal detection of
531546
// non-existent directory paths, and make the lock visible when listing
532547
// directory entries.
533-
f, err := y.Create(fullname, WriteCategoryUnspecified)
548+
f, err := y.create(fullname, WriteCategoryUnspecified)
534549
if err != nil {
535550
// "Release" the lock since we failed.
536551
y.lockedFiles.Delete(fullname)

vfs/mem_fs_test.go

Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,16 @@
55
package vfs
66

77
import (
8+
"context"
89
"fmt"
910
"io"
1011
"math/rand/v2"
1112
"sort"
1213
"strconv"
1314
"strings"
15+
"sync"
1416
"testing"
17+
"time"
1518

1619
"github.com/cockroachdb/datadriven"
1720
"github.com/stretchr/testify/require"
@@ -140,6 +143,117 @@ func TestMemFile(t *testing.T) {
140143
}
141144
}
142145

146+
// TestMemFSCrashCloneConcurrency exercises concurrent CrashClone (write lock)
147+
// against ReuseForWrite, Link, and Lock (all of which take read locks and call
148+
// other methods that also take read locks). Before the fix in this package,
149+
// these methods would recursively acquire cloneMu.RLock(), deadlocking when
150+
// CrashClone was waiting for the write lock (Go's sync.RWMutex is
151+
// writer-preferring, blocking new RLocks behind a pending Lock).
152+
func TestMemFSCrashCloneConcurrency(t *testing.T) {
153+
const workers = 4
154+
const duration = 5 * time.Second
155+
156+
fs := NewCrashableMem()
157+
// Pre-create a directory and seed files for ReuseForWrite and Link.
158+
require.NoError(t, fs.MkdirAll("/d", 0755))
159+
for i := 0; i < workers; i++ {
160+
name := fmt.Sprintf("/d/src-%d", i)
161+
f, err := fs.Create(name, WriteCategoryUnspecified)
162+
require.NoError(t, err)
163+
require.NoError(t, f.Close())
164+
}
165+
166+
ctx, cancel := context.WithTimeout(context.Background(), duration)
167+
defer cancel()
168+
169+
var g sync.WaitGroup
170+
171+
// CrashClone goroutines (writer lock).
172+
for i := 0; i < workers; i++ {
173+
g.Add(1)
174+
go func() {
175+
defer g.Done()
176+
rng := rand.New(rand.NewPCG(0, uint64(i)))
177+
for ctx.Err() == nil {
178+
fs.CrashClone(CrashCloneCfg{UnsyncedDataPercent: 50, RNG: rng})
179+
}
180+
}()
181+
}
182+
183+
// ReuseForWrite goroutines.
184+
for i := 0; i < workers; i++ {
185+
g.Add(1)
186+
go func() {
187+
defer g.Done()
188+
a := fmt.Sprintf("/d/reuse-a-%d", i)
189+
b := fmt.Sprintf("/d/reuse-b-%d", i)
190+
// Seed initial file.
191+
f, err := fs.Create(a, WriteCategoryUnspecified)
192+
if err != nil {
193+
return
194+
}
195+
_ = f.Close()
196+
useA := true
197+
for ctx.Err() == nil {
198+
var from, to string
199+
if useA {
200+
from, to = a, b
201+
} else {
202+
from, to = b, a
203+
}
204+
f, err := fs.ReuseForWrite(from, to, WriteCategoryUnspecified)
205+
if err != nil {
206+
// File may have been removed by CrashClone side effects;
207+
// recreate and retry.
208+
f2, err2 := fs.Create(a, WriteCategoryUnspecified)
209+
if err2 == nil {
210+
_ = f2.Close()
211+
}
212+
useA = true
213+
continue
214+
}
215+
_ = f.Close()
216+
useA = !useA
217+
}
218+
}()
219+
}
220+
221+
// Link goroutines.
222+
for i := 0; i < workers; i++ {
223+
g.Add(1)
224+
go func() {
225+
defer g.Done()
226+
src := fmt.Sprintf("/d/src-%d", i)
227+
n := 0
228+
for ctx.Err() == nil {
229+
dst := fmt.Sprintf("/d/link-%d-%d", i, n)
230+
_ = fs.Link(src, dst)
231+
// Clean up to avoid unbounded growth.
232+
_ = fs.Remove(dst)
233+
n++
234+
}
235+
}()
236+
}
237+
238+
// Lock goroutines.
239+
for i := 0; i < workers; i++ {
240+
g.Add(1)
241+
go func() {
242+
defer g.Done()
243+
name := fmt.Sprintf("/d/lock-%d", i)
244+
for ctx.Err() == nil {
245+
closer, err := fs.Lock(name)
246+
if err != nil {
247+
continue
248+
}
249+
_ = closer.Close()
250+
}
251+
}()
252+
}
253+
254+
g.Wait()
255+
}
256+
143257
func TestMemFSLock(t *testing.T) {
144258
filesystems := map[string]FS{}
145259
fileLocks := map[string]io.Closer{}

0 commit comments

Comments
 (0)