-
Notifications
You must be signed in to change notification settings - Fork 130
OCPBUGS-62269 - Fix race condition causing missing audit log entries for rapid commands #3052
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -204,7 +204,7 @@ func (e *JsonEnricher) Run(ctx context.Context, runErr chan<- error) { | |||||||||||||||||||
|
|
||||||||||||||||||||
| e.logger.V(config.VerboseLevel).Info("Emit audit log for process", | ||||||||||||||||||||
| "pid", logItem.Key()) | ||||||||||||||||||||
| e.dispatchSeccompLine(auditLogBucket, nodeName) | ||||||||||||||||||||
| e.dispatchSeccompLine(ctx, auditLogBucket, logItem.Key(), nodeName) | ||||||||||||||||||||
| }) | ||||||||||||||||||||
|
|
||||||||||||||||||||
| e.logger.Info(fmt.Sprintf("Setting up caches with expiry of %v", defaultCacheTimeout)) | ||||||||||||||||||||
|
|
@@ -411,8 +411,82 @@ func (e *JsonEnricher) fetchProcessInfo(processId int, executable string, uid, g | |||||||||||||||||||
| return processInfo | ||||||||||||||||||||
| } | ||||||||||||||||||||
|
|
||||||||||||||||||||
| // retryBpfCmdLine retries fetching cmdLine from BPF cache with progressive delays. | ||||||||||||||||||||
| // This handles the race condition where audit events arrive before BPF events are processed. | ||||||||||||||||||||
| func (e *JsonEnricher) retryBpfCmdLine(logBucket *types.LogBucket) { | ||||||||||||||||||||
| if e.bpfProcessCache == nil || logBucket.ProcessInfo.CmdLine != "" || logBucket.ProcessInfo.Pid == 0 { | ||||||||||||||||||||
| return | ||||||||||||||||||||
| } | ||||||||||||||||||||
|
|
||||||||||||||||||||
| maxRetries := 3 | ||||||||||||||||||||
| retryDelays := []time.Duration{0, 10 * time.Millisecond, 50 * time.Millisecond} | ||||||||||||||||||||
|
|
||||||||||||||||||||
| for attempt := 0; attempt < maxRetries && logBucket.ProcessInfo.CmdLine == ""; attempt++ { | ||||||||||||||||||||
| if attempt > 0 { | ||||||||||||||||||||
| time.Sleep(retryDelays[attempt]) | ||||||||||||||||||||
| } | ||||||||||||||||||||
|
|
||||||||||||||||||||
| cmdLine, errCmdLine := e.bpfProcessCache.GetCmdLine(logBucket.ProcessInfo.Pid) | ||||||||||||||||||||
| if errCmdLine == nil { | ||||||||||||||||||||
| logBucket.ProcessInfo.CmdLine = cmdLine | ||||||||||||||||||||
| e.logger.V(1).Info("cmdline found in eBPF on retry", | ||||||||||||||||||||
| "processId", logBucket.ProcessInfo.Pid, "cmdLine", cmdLine, "attempt", attempt+1) | ||||||||||||||||||||
|
|
||||||||||||||||||||
| break | ||||||||||||||||||||
|
Comment on lines
+434
to
+435
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The loop condition already checks Same applies to
Suggested change
|
||||||||||||||||||||
| } | ||||||||||||||||||||
| } | ||||||||||||||||||||
| } | ||||||||||||||||||||
|
|
||||||||||||||||||||
| // retryBpfRequestUID retries fetching requestUID from BPF environment with progressive delays. | ||||||||||||||||||||
| func (e *JsonEnricher) retryBpfRequestUID(logBucket *types.LogBucket) { | ||||||||||||||||||||
| if e.bpfProcessCache == nil || logBucket.ProcessInfo.ExecRequestId != nil { | ||||||||||||||||||||
| return | ||||||||||||||||||||
| } | ||||||||||||||||||||
|
|
||||||||||||||||||||
| maxRetries := 3 | ||||||||||||||||||||
| retryDelays := []time.Duration{0, 10 * time.Millisecond, 50 * time.Millisecond} | ||||||||||||||||||||
|
Comment on lines
+446
to
+447
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. These are identical to lines 421-422. Extract to a package-level variable to avoid the duplication: var bpfRetryDelays = []time.Duration{0, 10 * time.Millisecond, 50 * time.Millisecond} |
||||||||||||||||||||
|
|
||||||||||||||||||||
| for attempt := 0; attempt < maxRetries && logBucket.ProcessInfo.ExecRequestId == nil; attempt++ { | ||||||||||||||||||||
| if attempt > 0 { | ||||||||||||||||||||
| time.Sleep(retryDelays[attempt]) | ||||||||||||||||||||
| } | ||||||||||||||||||||
|
|
||||||||||||||||||||
| procEnv, errEnv := e.bpfProcessCache.GetEnv(logBucket.ProcessInfo.Pid) | ||||||||||||||||||||
| if errEnv == nil { | ||||||||||||||||||||
| reqId, ok := procEnv[requestIdEnv] | ||||||||||||||||||||
| if ok { | ||||||||||||||||||||
| logBucket.ProcessInfo.ExecRequestId = &reqId | ||||||||||||||||||||
| e.logger.V(1).Info("exec request id found in eBPF on retry", "reqId", reqId, | ||||||||||||||||||||
| "processId", logBucket.ProcessInfo.Pid, "attempt", attempt+1) | ||||||||||||||||||||
|
|
||||||||||||||||||||
| break | ||||||||||||||||||||
| } | ||||||||||||||||||||
| } | ||||||||||||||||||||
| } | ||||||||||||||||||||
| } | ||||||||||||||||||||
|
|
||||||||||||||||||||
| // retryContainerInfo retries fetching container info if it's missing. | ||||||||||||||||||||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Unlike the BPF retry functions, this makes a single call with no delay or loop. The name |
||||||||||||||||||||
| func (e *JsonEnricher) retryContainerInfo( | ||||||||||||||||||||
| ctx context.Context, logBucket *types.LogBucket, processID int, nodeName string, | ||||||||||||||||||||
| ) { | ||||||||||||||||||||
| if logBucket.ContainerInfo != nil { | ||||||||||||||||||||
| return | ||||||||||||||||||||
| } | ||||||||||||||||||||
|
|
||||||||||||||||||||
| e.logger.V(config.VerboseLevel).Info("Container info not found, retrying", | ||||||||||||||||||||
| "processID", processID) | ||||||||||||||||||||
|
|
||||||||||||||||||||
| logBucket.ContainerInfo = e.fetchContainerInfo(ctx, processID, nodeName) | ||||||||||||||||||||
| if logBucket.ContainerInfo != nil { | ||||||||||||||||||||
| e.logger.V(1).Info("Container info found on retry", | ||||||||||||||||||||
| "processID", processID, | ||||||||||||||||||||
| "pod", logBucket.ContainerInfo.PodName, | ||||||||||||||||||||
| "namespace", logBucket.ContainerInfo.Namespace) | ||||||||||||||||||||
| } | ||||||||||||||||||||
| } | ||||||||||||||||||||
|
|
||||||||||||||||||||
| func (e *JsonEnricher) dispatchSeccompLine( | ||||||||||||||||||||
| logBucket *types.LogBucket, nodeName string, | ||||||||||||||||||||
| ctx context.Context, logBucket *types.LogBucket, processID int, nodeName string, | ||||||||||||||||||||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||||||||||||||||||||
| ) { | ||||||||||||||||||||
| var syscallNames []string | ||||||||||||||||||||
|
|
||||||||||||||||||||
|
|
@@ -443,6 +517,13 @@ func (e *JsonEnricher) dispatchSeccompLine( | |||||||||||||||||||
| return | ||||||||||||||||||||
| } | ||||||||||||||||||||
|
|
||||||||||||||||||||
| // Retry BPF cache lookups with progressive delays to allow ring buffer polling to catch up | ||||||||||||||||||||
| e.retryBpfCmdLine(logBucket) | ||||||||||||||||||||
| e.retryBpfRequestUID(logBucket) | ||||||||||||||||||||
|
|
||||||||||||||||||||
| // Retry container info lookup if it's still missing | ||||||||||||||||||||
| e.retryContainerInfo(ctx, logBucket, processID, nodeName) | ||||||||||||||||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @BhargaviGudi Thanks for the PR. Can you try to test if
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I tested with only
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @ngopalak-redhat Could you please help me review the PR? Thanks
Comment on lines
+520
to
+525
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The function names already communicate intent. These comments restate what the code does.
Suggested change
|
||||||||||||||||||||
|
|
||||||||||||||||||||
| if logBucket.ContainerInfo == nil { | ||||||||||||||||||||
| e.logger.V(config.VerboseLevel).Info("Container info not found in cache") | ||||||||||||||||||||
| } | ||||||||||||||||||||
|
|
||||||||||||||||||||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
dispatchSeccompLineis called from theOnEvictioncallback, andDeleteExpired()(line 274) runs every 30s from theRun()loop. Evictions are processed synchronously, so sleeping here (up to 60ms per retry function, 120ms total per entry) blocks all subsequent evictions and the main audit line processing loop. For N expired entries, that's N * 120ms of stall.Consider dispatching into a goroutine so the callback returns immediately: