Skip to content

Commit a1b2124

Browse files
e2bclaude authored and committed
fix: data races and test fixes exposed by ARM64 testing
ARM64's weaker memory model reliably triggers races that x86 papers over: - clean-nfs-cache: fd use-after-close between Scanner and Statter goroutines — pass directory path string instead of *os.File - nbd/path_direct: loop variable capture in goroutine closure - envd conversion_test: shared connect.Response across parallel subtests — use RunAndReturn to create fresh response per call - errorcollector_test: plain bool and ctx variable reuse in concurrent test — use atomic.Bool and distinct context variables - db/testutils: goose v3.26 SetDialect() races on package globals when parallel tests run migrations — serialize with sync.Mutex - uffd/page_mmap: graceful skip on hugepage ENOMEM in CI - async_wp_test: skip UFFD write-protection test on ARM64 (unsupported) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 090f084 commit a1b2124

10 files changed

Lines changed: 78 additions & 29 deletions

File tree

packages/db/pkg/testutils/db.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ import (
55
"os/exec"
66
"path/filepath"
77
"strings"
8+
"sync"
89
"testing"
910
"time"
1011

@@ -106,6 +107,12 @@ func SetupDatabase(t *testing.T) *Database {
106107
}
107108
}
108109

110+
// gooseMu serializes goose operations across parallel tests.
111+
// goose.OpenDBWithDriver calls goose.SetDialect which writes to package-level
112+
// globals (dialect, store) without synchronization. Concurrent test goroutines
113+
// race on these globals, triggering the race detector on ARM64.
114+
var gooseMu sync.Mutex
115+
109116
// runDatabaseMigrations executes all required database migrations
110117
func runDatabaseMigrations(t *testing.T, connStr string) {
111118
t.Helper()
@@ -115,6 +122,9 @@ func runDatabaseMigrations(t *testing.T, connStr string) {
115122
require.NoError(t, err, "Failed to find git root")
116123
repoRoot := strings.TrimSpace(string(output))
117124

125+
gooseMu.Lock()
126+
defer gooseMu.Unlock()
127+
118128
db, err := goose.OpenDBWithDriver("pgx", connStr)
119129
require.NoError(t, err)
120130
t.Cleanup(func() {

packages/envd/internal/services/legacy/conversion_test.go

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ package legacy
22

33
import (
44
"bytes"
5+
"context"
56
"io"
67
"net/http"
78
"net/http/httptest"
@@ -22,12 +23,19 @@ import (
2223
func TestFilesystemClient_FieldFormatter(t *testing.T) {
2324
t.Parallel()
2425
fsh := filesystemconnectmocks.NewMockFilesystemHandler(t)
25-
fsh.EXPECT().Move(mock.Anything, mock.Anything).Return(connect.NewResponse(&filesystem.MoveResponse{
26-
Entry: &filesystem.EntryInfo{
27-
Name: "test-name",
28-
Owner: "new-extra-field",
26+
// Use RunAndReturn to create a fresh response per call. Using Return()
27+
// shares one Response across parallel subtests, causing a data race on
28+
// the lazily-initialized header/trailer maps inside connect.Response.
29+
fsh.EXPECT().Move(mock.Anything, mock.Anything).RunAndReturn(
30+
func(_ context.Context, _ *connect.Request[filesystem.MoveRequest]) (*connect.Response[filesystem.MoveResponse], error) {
31+
return connect.NewResponse(&filesystem.MoveResponse{
32+
Entry: &filesystem.EntryInfo{
33+
Name: "test-name",
34+
Owner: "new-extra-field",
35+
},
36+
}), nil
2937
},
30-
}), nil)
38+
)
3139

3240
_, handler := filesystemconnect.NewFilesystemHandler(fsh,
3341
connect.WithInterceptors(

packages/orchestrator/cmd/clean-nfs-cache/cleaner/clean.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,7 @@ type Candidate struct {
9999
}
100100

101101
type statReq struct {
102-
df *os.File
102+
dirPath string
103103
name string
104104
response chan *statReq
105105
f *File

packages/orchestrator/cmd/clean-nfs-cache/cleaner/scan.go

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ func (c *Cleaner) Statter(ctx context.Context, done *sync.WaitGroup) {
6060
case <-ctx.Done():
6161
return
6262
case req := <-c.statRequestCh:
63-
f, err := c.statInDir(req.df, req.name)
63+
f, err := c.statInDir(req.dirPath, req.name)
6464
req.f = f
6565
req.err = err
6666
req.response <- req
@@ -201,13 +201,16 @@ func (c *Cleaner) scanDir(ctx context.Context, path []*Dir) (out *Dir, err error
201201
}
202202
}
203203

204-
// submit all stat requests
204+
// Submit stat requests using the directory path (not the *os.File).
205+
// The file descriptor df is closed when scanDir returns (defer above),
206+
// but Statter goroutines may still be processing requests concurrently.
207+
// Passing the path avoids a race between df.Close() and df.Fd().
205208
responseCh := make(chan *statReq, len(filenames))
206209
for _, name := range filenames {
207210
select {
208211
case <-ctx.Done():
209212
return nil, ctx.Err()
210-
case c.statRequestCh <- &statReq{df: df, name: name, response: responseCh}:
213+
case c.statRequestCh <- &statReq{dirPath: absPath, name: name, response: responseCh}:
211214
// submitted
212215
}
213216
}

packages/orchestrator/cmd/clean-nfs-cache/cleaner/stat_linux.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ package cleaner
44

55
import (
66
"fmt"
7-
"os"
7+
"path/filepath"
88

99
"golang.org/x/sys/unix"
1010
)
@@ -30,11 +30,11 @@ func (c *Cleaner) stat(fullPath string) (*Candidate, error) {
3030
}, nil
3131
}
3232

33-
func (c *Cleaner) statInDir(df *os.File, filename string) (*File, error) {
33+
func (c *Cleaner) statInDir(dirPath string, filename string) (*File, error) {
3434
c.StatxC.Add(1)
3535
c.StatxInDirC.Add(1)
3636
var statx unix.Statx_t
37-
err := unix.Statx(int(df.Fd()), filename,
37+
err := unix.Statx(unix.AT_FDCWD, filepath.Join(dirPath, filename),
3838
unix.AT_STATX_DONT_SYNC|unix.AT_SYMLINK_NOFOLLOW|unix.AT_NO_AUTOMOUNT,
3939
unix.STATX_ATIME|unix.STATX_SIZE,
4040
&statx,

packages/orchestrator/cmd/clean-nfs-cache/cleaner/stat_osx.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -31,10 +31,10 @@ func (c *Cleaner) stat(path string) (*Candidate, error) {
3131
}, nil
3232
}
3333

34-
func (c *Cleaner) statInDir(df *os.File, filename string) (*File, error) {
34+
func (c *Cleaner) statInDir(dirPath string, filename string) (*File, error) {
3535
c.StatxInDirC.Add(1)
36-
// performance on OS X doeas not matter, so just use the full stat
37-
cand, err := c.stat(filepath.Join(df.Name(), filename))
36+
// performance on OS X does not matter, so just use the full stat
37+
cand, err := c.stat(filepath.Join(dirPath, filename))
3838
if err != nil {
3939
return nil, err
4040
}

packages/orchestrator/pkg/sandbox/nbd/path_direct.go

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,7 @@ func (d *DirectPathMount) Open(ctx context.Context) (retDeviceIndex uint32, err
7878

7979
telemetry.ReportEvent(ctx, "got backend size")
8080

81-
deviceIndex := uint32(math.MaxUint32)
81+
var deviceIndex uint32
8282

8383
for {
8484
deviceIndex, err = d.devicePool.GetDevice(ctx)
@@ -119,14 +119,19 @@ func (d *DirectPathMount) Open(ctx context.Context) (retDeviceIndex uint32, err
119119
server.Close()
120120

121121
dispatch := NewDispatch(serverc, d.Backend)
122+
// Capture loop variables for the goroutine closure to avoid a data
123+
// race: deviceIndex is reassigned on each retry iteration of the
124+
// outer for-loop while the goroutine may still read it.
125+
devIdx := deviceIndex
126+
sockIdx := i
122127
// Start reading commands on the socket and dispatching them to our provider
123128
d.handlersWg.Go(func() {
124129
handleErr := dispatch.Handle(ctx)
125130
// The error is expected to happen if the nbd (socket connection) is closed
126131
logger.L().Info(ctx, "closing handler for NBD commands",
127132
zap.Error(handleErr),
128-
zap.Uint32("device_index", deviceIndex),
129-
zap.Int("socket_index", i),
133+
zap.Uint32("device_index", devIdx),
134+
zap.Int("socket_index", sockIdx),
130135
)
131136
})
132137

packages/orchestrator/pkg/sandbox/uffd/testutils/page_mmap.go

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
package testutils
22

33
import (
4+
"errors"
45
"fmt"
56
"math"
67
"syscall"
@@ -20,7 +21,16 @@ func NewPageMmap(t *testing.T, size, pagesize uint64) ([]byte, uintptr, error) {
2021
}
2122

2223
if pagesize == header.HugepageSize {
23-
return newMmap(t, size, header.HugepageSize, unix.MAP_HUGETLB|unix.MAP_HUGE_2MB)
24+
b, addr, err := newMmap(t, size, header.HugepageSize, unix.MAP_HUGETLB|unix.MAP_HUGE_2MB)
25+
// Hugepage allocation can fail with ENOMEM on CI runners that don't
26+
// have enough (or any) hugepages pre-allocated in /proc/sys/vm/nr_hugepages.
27+
// Skip gracefully rather than failing the test.
28+
if err != nil && errors.Is(err, syscall.ENOMEM) {
29+
pages := int(math.Ceil(float64(size) / float64(header.HugepageSize)))
30+
t.Skipf("skipping: hugepage mmap failed (need %d hugepages): %v", pages, err)
31+
}
32+
33+
return b, addr, err
2434
}
2535

2636
return nil, 0, fmt.Errorf("unsupported page size: %d", pagesize)

packages/orchestrator/pkg/sandbox/uffd/userfaultfd/async_wp_test.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ package userfaultfd
22

33
import (
44
"os"
5+
"runtime"
56
"testing"
67
"unsafe"
78

@@ -33,6 +34,9 @@ func TestAsyncWriteProtection(t *testing.T) {
3334
if os.Geteuid() != 0 {
3435
t.Skip("this test requires root privileges")
3536
}
37+
if runtime.GOARCH == "arm64" {
38+
t.Skip("ARM64 kernels do not support UFFD write protection")
39+
}
3640

3741
tests := []struct {
3842
name string

packages/shared/pkg/utils/errorcollector_test.go

Lines changed: 19 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ package utils
33
import (
44
"context"
55
"errors"
6+
"sync/atomic"
67
"testing"
78

89
"github.com/stretchr/testify/assert"
@@ -49,23 +50,31 @@ func TestErrorCollector(t *testing.T) {
4950

5051
ec := NewErrorCollector(1)
5152

52-
// Block the collector's only slot
53+
// Block the collector's only slot.
54+
// ctx1 and ctx2 must be distinct variables: the closure passed to ec.Go
55+
// captures the context variable by reference. If we reused a single "ctx"
56+
// variable, the first closure's <-ctx.Done() would race with the main
57+
// goroutine's reassignment of ctx on the second WithCancel call.
5358
started := make(chan struct{})
54-
ctx, cancel1 := context.WithCancel(t.Context())
55-
ec.Go(ctx, func() error {
59+
ctx1, cancel1 := context.WithCancel(t.Context())
60+
ec.Go(ctx1, func() error {
5661
close(started)
57-
<-ctx.Done()
62+
<-ctx1.Done()
5863

5964
return nil
6065
})
6166

6267
<-started
6368

64-
// This Go call should block on the semaphore
65-
var wasCalled bool
66-
ctx, cancel2 := context.WithCancel(t.Context())
67-
ec.Go(ctx, func() error {
68-
wasCalled = true
69+
// This Go call should block on the semaphore.
70+
// wasCalled must be atomic: the goroutine spawned by ec.Go may write it
71+
// concurrently with the main goroutine's read in assert.False below.
72+
// A plain bool causes a data race that the -race detector catches on ARM64
73+
// (weaker memory model) even though it appears safe on x86.
74+
var wasCalled atomic.Bool
75+
ctx2, cancel2 := context.WithCancel(t.Context())
76+
ec.Go(ctx2, func() error {
77+
wasCalled.Store(true)
6978

7079
return nil
7180
})
@@ -78,6 +87,6 @@ func TestErrorCollector(t *testing.T) {
7887

7988
err := ec.Wait()
8089
require.ErrorIs(t, err, context.Canceled)
81-
assert.False(t, wasCalled)
90+
assert.False(t, wasCalled.Load())
8291
})
8392
}

0 commit comments

Comments (0)