Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
85 changes: 83 additions & 2 deletions internal/pkg/daemon/enricher/jsonenricher.go
Original file line number Diff line number Diff line change
Expand Up @@ -204,7 +204,7 @@ func (e *JsonEnricher) Run(ctx context.Context, runErr chan<- error) {

e.logger.V(config.VerboseLevel).Info("Emit audit log for process",
"pid", logItem.Key())
e.dispatchSeccompLine(auditLogBucket, nodeName)
e.dispatchSeccompLine(ctx, auditLogBucket, logItem.Key(), nodeName)
})

e.logger.Info(fmt.Sprintf("Setting up caches with expiry of %v", defaultCacheTimeout))
Expand Down Expand Up @@ -411,8 +411,82 @@ func (e *JsonEnricher) fetchProcessInfo(processId int, executable string, uid, g
return processInfo
}

// retryBpfCmdLine retries fetching cmdLine from BPF cache with progressive delays.
// This handles the race condition where audit events arrive before BPF events are processed.
func (e *JsonEnricher) retryBpfCmdLine(logBucket *types.LogBucket) {
if e.bpfProcessCache == nil || logBucket.ProcessInfo.CmdLine != "" || logBucket.ProcessInfo.Pid == 0 {
return
}

maxRetries := 3
retryDelays := []time.Duration{0, 10 * time.Millisecond, 50 * time.Millisecond}

for attempt := 0; attempt < maxRetries && logBucket.ProcessInfo.CmdLine == ""; attempt++ {
if attempt > 0 {
time.Sleep(retryDelays[attempt])
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

dispatchSeccompLine is called from the OnEviction callback, and DeleteExpired() (line 274) runs every 30s from the Run() loop. Evictions are processed synchronously, so sleeping here (up to 60ms per retry function, 120ms total per entry) blocks all subsequent evictions and the main audit line processing loop. For N expired entries, that's N * 120ms of stall.

Consider dispatching into a goroutine so the callback returns immediately:

e.logLinesCache.OnEviction(
    func(ctx context.Context, reason ttlcache.EvictionReason, logItem *ttlcache.Item[int, *types.LogBucket]) {
        auditLogBucket := logItem.Value()
        go e.dispatchSeccompLine(ctx, auditLogBucket, logItem.Key(), nodeName)
    })

}

cmdLine, errCmdLine := e.bpfProcessCache.GetCmdLine(logBucket.ProcessInfo.Pid)
if errCmdLine == nil {
logBucket.ProcessInfo.CmdLine = cmdLine
e.logger.V(1).Info("cmdline found in eBPF on retry",
"processId", logBucket.ProcessInfo.Pid, "cmdLine", cmdLine, "attempt", attempt+1)

break
Comment on lines +434 to +435
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The loop condition already checks logBucket.ProcessInfo.CmdLine == "". After setting CmdLine to a non-empty value, the loop terminates on its own. The break is redundant.

Same applies to retryBpfRequestUID at line 462.

Suggested change
break

}
}
}

// retryBpfRequestUID retries fetching requestUID from BPF environment with progressive delays.
func (e *JsonEnricher) retryBpfRequestUID(logBucket *types.LogBucket) {
if e.bpfProcessCache == nil || logBucket.ProcessInfo.ExecRequestId != nil {
return
}

maxRetries := 3
retryDelays := []time.Duration{0, 10 * time.Millisecond, 50 * time.Millisecond}
Comment on lines +446 to +447
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These are identical to lines 421-422. Extract to a package-level variable to avoid the duplication:

var bpfRetryDelays = []time.Duration{0, 10 * time.Millisecond, 50 * time.Millisecond}


for attempt := 0; attempt < maxRetries && logBucket.ProcessInfo.ExecRequestId == nil; attempt++ {
if attempt > 0 {
time.Sleep(retryDelays[attempt])
}

procEnv, errEnv := e.bpfProcessCache.GetEnv(logBucket.ProcessInfo.Pid)
if errEnv == nil {
reqId, ok := procEnv[requestIdEnv]
if ok {
logBucket.ProcessInfo.ExecRequestId = &reqId
e.logger.V(1).Info("exec request id found in eBPF on retry", "reqId", reqId,
"processId", logBucket.ProcessInfo.Pid, "attempt", attempt+1)

break
}
}
}
}

// retryContainerInfo retries fetching container info if it's missing.
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Unlike the BPF retry functions, this makes a single call with no delay or loop. The name retryContainerInfo is misleading. Either rename to something like refetchContainerInfo, or add actual retry logic consistent with the other two functions.

func (e *JsonEnricher) retryContainerInfo(
ctx context.Context, logBucket *types.LogBucket, processID int, nodeName string,
) {
if logBucket.ContainerInfo != nil {
return
}

e.logger.V(config.VerboseLevel).Info("Container info not found, retrying",
"processID", processID)

logBucket.ContainerInfo = e.fetchContainerInfo(ctx, processID, nodeName)
if logBucket.ContainerInfo != nil {
e.logger.V(1).Info("Container info found on retry",
"processID", processID,
"pod", logBucket.ContainerInfo.PodName,
"namespace", logBucket.ContainerInfo.Namespace)
}
}

func (e *JsonEnricher) dispatchSeccompLine(
logBucket *types.LogBucket, nodeName string,
ctx context.Context, logBucket *types.LogBucket, processID int, nodeName string,
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

processID is redundant since logBucket.ProcessInfo.Pid holds the same value (both sourced from auditLine.ProcessID). The BPF retry functions already use logBucket.ProcessInfo.Pid directly. Only retryContainerInfo uses this parameter, creating an inconsistency.

) {
var syscallNames []string

Expand Down Expand Up @@ -443,6 +517,13 @@ func (e *JsonEnricher) dispatchSeccompLine(
return
}

// Retry BPF cache lookups with progressive delays to allow ring buffer polling to catch up
e.retryBpfCmdLine(logBucket)
e.retryBpfRequestUID(logBucket)

// Retry container info lookup if it's still missing
e.retryContainerInfo(ctx, logBucket, processID, nodeName)
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@BhargaviGudi Thanks for the PR. Can you try to test if e.retryContainerInfo(ctx, logBucket, processID, nodeName) alone can fix the issue without the retryBpfCmdLine and retryBpfRequestUID? I feel missing container info is the cause of the issue.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I tested with only retryContainerInfo as suggested, but it doesn't fix the issue for short-lived processes.

  • The short-lived processes exit so fast that by the time the LogBucket expires (30 seconds later), their /proc/ is gone and are missing from audit log

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@ngopalak-redhat Could you please help me review the PR? Thanks

Comment on lines +520 to +525
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The function names already communicate intent. These comments restate what the code does.

Suggested change
// Retry BPF cache lookups with progressive delays to allow ring buffer polling to catch up
e.retryBpfCmdLine(logBucket)
e.retryBpfRequestUID(logBucket)
// Retry container info lookup if it's still missing
e.retryContainerInfo(ctx, logBucket, processID, nodeName)
e.retryBpfCmdLine(logBucket)
e.retryBpfRequestUID(logBucket)
e.retryContainerInfo(ctx, logBucket, processID, nodeName)


if logBucket.ContainerInfo == nil {
e.logger.V(config.VerboseLevel).Info("Container info not found in cache")
}
Expand Down
Loading