Skip to content

Commit 9ee6410

Browse files
authored
[CWS] Fix panic on closed channel during APIServer shutdown (#47047)
### What does this PR do?

Wait for the `APIServer.start()` goroutine to finish before stopping pipeline providers, preventing a race where events are sent on a closed `logChan`.

### Motivation

During system-probe shutdown, `CWSConsumer.Stop()` calls `c.cancelFnc()` to cancel the Go context, and then `c.apiServer.Stop()`. `c.apiServer.Stop()` then calls `a.stopper.Stop()` to close the `logChan` channel. However, the `APIServer.start()` goroutine was not tracked by any synchronization primitive, so after the Go context was cancelled the goroutine could still be in the middle of a dequeue cycle while the `logChan` channel was already closed. This resulted in the following panic:

```
panic: send on closed channel

goroutine 781 [running]:
github.com/DataDog/datadog-agent/pkg/security/reporter.(*RuntimeReporter).ReportRaw(0x4000e84e20, {0x4003c26000, 0x3d43, 0x4000}, {0x0, 0x0}, {0x40009f7b48?, 0x1a15a98?, 0x0?}, {0x400209a880, ...})
	github.com/DataDog/datadog-agent/pkg/security/reporter/reporter.go:39 +0x1d0
github.com/DataDog/datadog-agent/pkg/security/module.(*DirectEventMsgSender).Send(0x400278a2a0, 0x400295d7c0, 0x3a?)
	github.com/DataDog/datadog-agent/pkg/security/module/msg_sender.go:100 +0x138
github.com/DataDog/datadog-agent/pkg/security/module.(*APIServer).start.func1(0x40016c3e00, 0x1)
	github.com/DataDog/datadog-agent/pkg/security/module/server.go:386 +0x400
github.com/DataDog/datadog-agent/pkg/security/module.(*APIServer).dequeue.func1(...)
	github.com/DataDog/datadog-agent/pkg/security/module/server.go:256
github.com/DataDog/datadog-agent/pkg/security/module.slicesDeleteUntilFalse(...)
	github.com/DataDog/datadog-agent/pkg/security/module/server.go:280
github.com/DataDog/datadog-agent/pkg/security/module.(*APIServer).dequeue(0x40019afa20, {0x4003e2bf20?, 0x48e7fd6f8e4?, 0x401af60?}, 0x40009f7f30)
	github.com/DataDog/datadog-agent/pkg/security/module/server.go:248 +0x1d8
github.com/DataDog/datadog-agent/pkg/security/module.(*APIServer).start(0x40019afa20, {0x2ba4608, 0x400225d0e0})
	github.com/DataDog/datadog-agent/pkg/security/module/server.go:337 +0x80
created by github.com/DataDog/datadog-agent/pkg/security/module.(*APIServer).Start in goroutine 1
	github.com/DataDog/datadog-agent/pkg/security/module/server.go:412 +0x1d0
```

### Describe how you validated your changes

### Additional Notes

Co-authored-by: yoann.ghigoff <yoann.ghigoff@datadoghq.com>
1 parent 3c1e0ca commit 9ee6410

1 file changed

Lines changed: 11 additions & 2 deletions

File tree

pkg/security/module/server.go

Lines changed: 11 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -176,6 +176,7 @@ type APIServer struct {
176176

177177
stopChan chan struct{}
178178
stopper startstop.Stopper
179+
wg sync.WaitGroup
179180

180181
securityAgentAPIClient *SecurityAgentAPIClient
181182
}
@@ -409,7 +410,11 @@ func (a *APIServer) Start(ctx context.Context) {
409410
})
410411
go a.securityAgentAPIClient.SendActivityDumps(ctx, a.activityDumps)
411412
}
412-
go a.start(ctx)
413+
a.wg.Add(1)
414+
go func() {
415+
defer a.wg.Done()
416+
a.start(ctx)
417+
}()
413418
}
414419

415420
// GetConfig returns config of the runtime security module required by the security agent
@@ -685,8 +690,12 @@ func (a *APIServer) GetSECLVariables() map[string]*api.SECLVariableState {
685690
return a.cwsConsumer.ruleEngine.GetSECLVariables()
686691
}
687692

688-
// Stop stops the API server
693+
// Stop stops the API server. The start goroutine must finish before the
694+
// stopper closes pipeline channels, otherwise sends to logChan will panic.
689695
func (a *APIServer) Stop() {
696+
// Wait for the start goroutine to exit (triggered by context cancellation)
697+
// before stopping pipeline providers which close the underlying channels.
698+
a.wg.Wait()
690699
a.stopper.Stop()
691700
}
692701

0 commit comments

Comments
 (0)