Skip to content

Commit 0e004df

Browse files
fix: (runner): eliminate data race in removeHook (fatal crash) (#250)
**Issue Description**: `removeHook` mutated `logger.Hooks` in place without holding logrus's internal mutex, while logrus `fireHooks` reads that map under the mutex. Because the deferred `removeHook` at the end of `RunInstance` can run while the container death-monitor and log-streaming goroutines (tracked on the global `r.wg`, joined only in `Stop()`) are still logging, this is a concurrent map read/write (an unrecoverable `fatal error: concurrent map read and map write` ) that kills the process and aborts the in-progress suite. **Fix**: rebuild the filtered hook map and install it atomically via `logger.ReplaceHooks`, which takes the logger lock. Adds `TestRemoveHookConcurrentWithLogging`, which fails under `-race` against the old code and passes against the fix.
1 parent 3a6f740 commit 0e004df

2 files changed

Lines changed: 103 additions & 2 deletions

File tree

pkg/runner/logging.go

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -272,8 +272,23 @@ func waitForLogDrain(
272272
}
273273

274274
// removeHook removes a hook from the logger.
275+
//
276+
// logrus protects logger.Hooks with the logger's own (unexported) mutex, which
277+
// both AddHook and the hook-firing path take before touching the map. We can't
278+
// take that mutex from here, so mutating logger.Hooks directly would race any
279+
// goroutine that is still logging on this logger. At teardown that includes the
280+
// container death monitor and the log-streaming goroutine, which run under
281+
// r.wg and may not have stopped yet. Such a race makes Go abort the whole
282+
// process with a "concurrent map read and map write" fatal error.
283+
//
284+
// ReplaceHooks swaps the map in under the logger's lock, so we build the
285+
// filtered copy first and then install it in one atomic step.
275286
func (r *runner) removeHook(hook logrus.Hook) {
276-
for level, hooks := range r.logger.Hooks {
287+
oldHooks := r.logger.ReplaceHooks(make(logrus.LevelHooks))
288+
289+
newHooks := make(logrus.LevelHooks, len(oldHooks))
290+
291+
for level, hooks := range oldHooks {
277292
filtered := make([]logrus.Hook, 0, len(hooks))
278293

279294
for _, h := range hooks {
@@ -282,6 +297,8 @@ func (r *runner) removeHook(hook logrus.Hook) {
282297
}
283298
}
284299

285-
r.logger.Hooks[level] = filtered
300+
newHooks[level] = filtered
286301
}
302+
303+
r.logger.ReplaceHooks(newHooks)
287304
}

pkg/runner/logging_race_test.go

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
package runner
2+
3+
import (
4+
"io"
5+
"sync"
6+
"testing"
7+
8+
"github.com/sirupsen/logrus"
9+
)
10+
11+
// TestRemoveHookConcurrentWithLogging guards against a data race in removeHook.
12+
//
13+
// In production, removeHook runs from a deferred call at the end of RunInstance
14+
// while the container death monitor and log-streaming goroutines may still be
15+
// logging on the same logger. Those goroutines run under the shared r.wg, which
16+
// is only waited on in Stop(), so they are not guaranteed to have stopped when
17+
// the defer fires.
18+
//
19+
// If removeHook mutates logger.Hooks in place without the logger's lock, that
20+
// write races the locked map read in logrus's hook-firing path. Go turns a
21+
// concurrent map read and write into an unrecoverable fatal error that takes
22+
// the whole process down.
23+
//
24+
// Run with "go test -race": this test reports a race against an in-place
25+
// implementation and passes against the ReplaceHooks-based one. It re-adds the
26+
// hook on every iteration so a regression keeps writing the map under load and
27+
// is caught reliably.
28+
func TestRemoveHookConcurrentWithLogging(t *testing.T) {
29+
logger := logrus.New()
30+
logger.SetOutput(io.Discard)
31+
logger.SetLevel(logrus.DebugLevel)
32+
33+
r := &runner{logger: logger}
34+
35+
hook := &fileHook{writer: io.Discard, formatter: logger.Formatter}
36+
r.logger.AddHook(hook)
37+
38+
var wg sync.WaitGroup
39+
stop := make(chan struct{})
40+
41+
// Stand in for the death monitor and log-streaming goroutines that are still
42+
// emitting log entries when removeHook runs. Each Warn makes logrus read
43+
// logger.Hooks under the logger's lock.
44+
for i := 0; i < 4; i++ {
45+
wg.Add(1)
46+
47+
go func() {
48+
defer wg.Done()
49+
50+
for {
51+
select {
52+
case <-stop:
53+
return
54+
default:
55+
logger.WithField("k", "v").Warn("container exited unexpectedly")
56+
}
57+
}
58+
}()
59+
}
60+
61+
// In production removeHook is called once via defer. Calling it many times
62+
// here, re-adding the hook each cycle, keeps an in-place implementation
63+
// writing the map at the same time as the readers above.
64+
for i := 0; i < 2000; i++ {
65+
r.removeHook(hook)
66+
r.logger.AddHook(hook)
67+
}
68+
69+
close(stop)
70+
wg.Wait()
71+
72+
// With no goroutines left logging, a final removeHook should leave the hook
73+
// gone. Reading logger.Hooks directly is safe here because every logger has
74+
// returned, so there is no concurrent access.
75+
r.removeHook(hook)
76+
77+
for level, hooks := range logger.Hooks {
78+
for _, h := range hooks {
79+
if h == hook {
80+
t.Fatalf("hook still present at level %v after removeHook", level)
81+
}
82+
}
83+
}
84+
}

0 commit comments

Comments
 (0)