Skip to content

Commit f8fe5cf

Browse files
committed
Recover Python frames when BPF fails to read PyCodeObject
When bpf_probe_read_user fails to read a PyCodeObject (e.g. because the page is swapped out), push the frame with codeobject_id=0 instead of aborting the unwind. This preserves the rest of the stack trace. On the agent side, handle ebpfChecksum=0 in getCodeObject by skipping both the LRU cache lookup (there is no checksum to validate against) and the staleness check (there is no BPF reference to compare with). The agent reads the code object via process_vm_readv, which supports page faults, so it can succeed where BPF could not. Store the calculated checksum in the cache so that subsequent frames with a real BPF checksum can match.
1 parent 7a56dbf commit f8fe5cf

4 files changed

Lines changed: 14 additions & 7 deletions

File tree

interpreter/python/python.go

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -454,10 +454,12 @@ func (p *pythonInstance) getCodeObject(addr libpf.Address,
454454
if addr == 0 {
455455
return nil, errors.New("failed to read code object: null pointer")
456456
}
457-
if value, ok := p.addrToCodeObject.Get(addr); ok {
458-
m := value
459-
if m.ebpfChecksum == ebpfChecksum {
460-
return m, nil
457+
if ebpfChecksum != 0 {
458+
if value, ok := p.addrToCodeObject.Get(addr); ok {
459+
m := value
460+
if m.ebpfChecksum == ebpfChecksum {
461+
return m, nil
462+
}
461463
}
462464
}
463465

@@ -512,7 +514,7 @@ func (p *pythonInstance) getCodeObject(addr libpf.Address,
512514

513515
ebpfChecksumCalculated := (argCount << 25) + (kwonlyArgCount << 18) +
514516
(flags << 10) + firstLineNo
515-
if ebpfChecksum != ebpfChecksumCalculated {
517+
if ebpfChecksum != 0 && ebpfChecksum != ebpfChecksumCalculated {
516518
return nil, fmt.Errorf("read code object was stale: %x != %x",
517519
ebpfChecksum, ebpfChecksumCalculated)
518520
}
@@ -533,7 +535,7 @@ func (p *pythonInstance) getCodeObject(addr libpf.Address,
533535
sourceFileName: libpf.Intern(sourceFileName),
534536
firstLineNo: firstLineNo,
535537
lineTable: lineTable,
536-
ebpfChecksum: ebpfChecksum,
538+
ebpfChecksum: ebpfChecksumCalculated,
537539
}
538540
p.addrToCodeObject.Add(addr, pco)
539541
return pco, nil

support/ebpf/python_tracer.ebpf.c

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -130,7 +130,12 @@ static EBPF_INLINE ErrorCode process_python_frame(
130130
if (bpf_probe_read_user(pss->code, sizeof(pss->code), py_codeobject)) {
131131
DEBUG_PRINT("Failed to read PyCodeObject at 0x%lx", (unsigned long)(py_codeobject));
132132
increment_metric(metricID_UnwindPythonErrBadCodeObjectArgCountAddr);
133-
return ERR_PYTHON_BAD_CODE_OBJECT_ADDR;
133+
// Push the frame with the code object address so the agent can try to
134+
// read it via /proc/pid/mem (which supports page faults unlike BPF).
135+
// codeobject_id=0 distinguishes this from a successful read.
136+
file_id = (u64)py_codeobject;
137+
lineno = py_encode_lineno(0, (u32)py_f_lasti);
138+
goto push_frame;
134139
}
135140

136141
int py_argcount = *(int *)(&pss->code[pyinfo->PyCodeObject_co_argcount]);

support/ebpf/tracer.ebpf.amd64

-583 KB
Binary file not shown.

support/ebpf/tracer.ebpf.arm64

-584 KB
Binary file not shown.

0 commit comments

Comments
 (0)