Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 9 additions & 4 deletions .github/workflows/unit-test-on-pull-request.yml
Original file line number Diff line number Diff line change
Expand Up @@ -199,8 +199,8 @@ jobs:
run: |
sudo apt-get update -y
case "${{ matrix.target_arch }}" in
amd64) sudo apt-get -y install qemu-system-x86;;
arm64) sudo apt-get -y install qemu-system-arm;;
amd64) sudo apt-get -y --no-install-recommends --no-install-suggests install qemu-system-x86 ;;
arm64) sudo apt-get -y --no-install-recommends --no-install-suggests install qemu-system-arm ipxe-qemu ;;
*) echo >&2 "bug: bad arch selected"; exit 1;;
esac
go install github.com/florianl/bluebox@v0.0.1
Expand Down Expand Up @@ -287,11 +287,16 @@ jobs:
run: |
sudo apt-get update -y
case "${{ matrix.target_arch }}" in
amd64) sudo apt-get -y install qemu-system-x86;;
arm64) sudo apt-get -y install qemu-system-arm;;
amd64) sudo apt-get -y --no-install-recommends --no-install-suggests install qemu-system-x86;;
arm64) sudo apt-get -y --no-install-recommends --no-install-suggests install qemu-system-arm ipxe-qemu;;
*) echo >&2 "bug: bad arch selected"; exit 1;;
esac
sudo apt-get install -y debootstrap systemtap-sdt-dev
- name: Cache parcagpu library
uses: actions/cache@0400d5f644dc74513175e3cd8d07132dd4860809 # v4.2.4
with:
path: test/distro-qemu/parcagpu-lib
key: parcagpu-${{ matrix.target_arch }}
- name: Download kernel
run: |
cd test/distro-qemu
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,12 @@ func Test_Golabels(t *testing.T) {

go func() {
if err := exec.CommandContext(ctx, exe.Name()).Run(); err != nil {
t.Log(err)
select {
case <-ctx.Done():
// Context is canceled, meaning the test is done.
default:
t.Log(err)
}
}
}()

Expand Down
14 changes: 8 additions & 6 deletions interpreter/python/python.go
Original file line number Diff line number Diff line change
Expand Up @@ -454,10 +454,12 @@ func (p *pythonInstance) getCodeObject(addr libpf.Address,
if addr == 0 {
return nil, errors.New("failed to read code object: null pointer")
}
if value, ok := p.addrToCodeObject.Get(addr); ok {
m := value
if m.ebpfChecksum == ebpfChecksum {
return m, nil
if ebpfChecksum != 0 {

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When/why can this checksum be 0

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When the bpf_read fails for the py code object.

if value, ok := p.addrToCodeObject.Get(addr); ok {
m := value
if m.ebpfChecksum == ebpfChecksum {
return m, nil
}
}
}
Comment on lines +457 to 464

Copilot AI Feb 23, 2026

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The new behavior for handling ebpfChecksum=0 (when BPF fails to read the code object) lacks test coverage. Consider adding a unit test that verifies:

  1. When ebpfChecksum=0, the cache lookup is skipped
  2. The code object is successfully read via RemoteMemory.Read
  3. The calculated checksum is stored in the cache
  4. Subsequent lookups with the calculated checksum result in cache hits

This would help prevent regressions and document the expected behavior.

Copilot uses AI. Check for mistakes.

Expand Down Expand Up @@ -512,7 +514,7 @@ func (p *pythonInstance) getCodeObject(addr libpf.Address,

ebpfChecksumCalculated := (argCount << 25) + (kwonlyArgCount << 18) +
(flags << 10) + firstLineNo
if ebpfChecksum != ebpfChecksumCalculated {
if ebpfChecksum != 0 && ebpfChecksum != ebpfChecksumCalculated {
return nil, fmt.Errorf("read code object was stale: %x != %x",
ebpfChecksum, ebpfChecksumCalculated)
}
Expand All @@ -533,7 +535,7 @@ func (p *pythonInstance) getCodeObject(addr libpf.Address,
sourceFileName: libpf.Intern(sourceFileName),
firstLineNo: firstLineNo,
lineTable: lineTable,
ebpfChecksum: ebpfChecksum,
ebpfChecksum: ebpfChecksumCalculated,
}
p.addrToCodeObject.Add(addr, pco)
return pco, nil
Expand Down
7 changes: 6 additions & 1 deletion support/ebpf/python_tracer.ebpf.c
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,12 @@ static EBPF_INLINE ErrorCode process_python_frame(
if (bpf_probe_read_user(pss->code, sizeof(pss->code), py_codeobject)) {
DEBUG_PRINT("Failed to read PyCodeObject at 0x%lx", (unsigned long)(py_codeobject));
increment_metric(metricID_UnwindPythonErrBadCodeObjectArgCountAddr);
return ERR_PYTHON_BAD_CODE_OBJECT_ADDR;
// Push the frame with the code object address so the agent can try to
// read it via /proc/pid/mem (which supports page faults unlike BPF).
// codeobject_id=0 distinguishes this from a successful read.
Comment on lines +133 to +135

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Where does the reading via /proc/pid/mem happen?

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

func (p *pythonInstance) getCodeObject(addr libpf.Address,

file_id = (u64)py_codeobject;
lineno = py_encode_lineno(0, (u32)py_f_lasti);
goto push_frame;
}

int py_argcount = *(int *)(&pss->code[pyinfo->PyCodeObject_co_argcount]);
Expand Down
Binary file modified support/ebpf/tracer.ebpf.amd64
Binary file not shown.
Binary file modified support/ebpf/tracer.ebpf.arm64
Binary file not shown.
4 changes: 0 additions & 4 deletions support/run-tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -57,10 +57,6 @@ case "$qemu_arch" in
;;
esac

if [ "$qemu_arch" = "aarch64" ]; then
additionalQemuArgs+=" -machine virt -cpu max"
fi

bluebox "${bb_args[@]}" || (echo "failed to generate initramfs"; exit 1)

echo Testing on "${kernel_version}"
Expand Down
13 changes: 10 additions & 3 deletions tools/coredump/coredump.go
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,7 @@ func (c *symbolizationCache) formatFrame(frame *libpf.Frame) (string, error) {
}

func ExtractTraces(ctx context.Context, pr process.Process, debug bool,
lwpFilter libpf.Set[libpf.PID]) ([]ThreadInfo, error) {
lwpFilter libpf.Set[libpf.PID], ebpfPr ...process.Process) ([]ThreadInfo, error) {
todo, cancel := context.WithCancel(ctx)
defer cancel()

Expand Down Expand Up @@ -153,8 +153,15 @@ func ExtractTraces(ctx context.Context, pr process.Process, debug bool,
return nil, fmt.Errorf("failed to get thread info for process %d: %v", pid, err)
}

// Interfaces for the managers
ebpfCtx := newEBPFContext(pr)
// Interfaces for the managers.
// An optional separate process can be provided for the eBPF context,
// allowing tests to inject a faulting memory reader for bpf_probe_read_user
// while the interpreter manager uses the real process memory.
ebpfProcess := pr
if len(ebpfPr) > 0 && ebpfPr[0] != nil {
ebpfProcess = ebpfPr[0]
}
ebpfCtx := newEBPFContext(ebpfProcess)
defer ebpfCtx.release()

coredumpEbpfMaps := ebpfMapsCoredump{ctx: ebpfCtx}
Expand Down
107 changes: 107 additions & 0 deletions tools/coredump/coredump_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,16 @@
package main

import (
"fmt"
"io"
"path/filepath"
"strings"
"testing"

"github.com/stretchr/testify/require"
"go.opentelemetry.io/ebpf-profiler/libpf/pfelf"
"go.opentelemetry.io/ebpf-profiler/process"
"go.opentelemetry.io/ebpf-profiler/remotememory"
"go.opentelemetry.io/ebpf-profiler/tools/coredump/cloudstore"
"go.opentelemetry.io/ebpf-profiler/tools/coredump/modulestore"
)
Expand Down Expand Up @@ -41,3 +48,103 @@ func TestCoreDumps(t *testing.T) {
})
}
}

// faultingReaderAt wraps an io.ReaderAt and returns an error for reads
// matching a predicate. This simulates bpf_probe_read_user failures where
// the kernel cannot read a page that is swapped out.
type faultingReaderAt struct {
inner io.ReaderAt
shouldFault func(off int64, size int) bool
}

func (f *faultingReaderAt) ReadAt(p []byte, off int64) (int, error) {
if f.shouldFault != nil && f.shouldFault(off, len(p)) {
return 0, fmt.Errorf("injected fault at offset 0x%x (size %d)", off, len(p))
}
return f.inner.ReadAt(p, off)
}

// faultingProcess wraps a process.Process and overrides GetRemoteMemory to
// return a RemoteMemory backed by a faultingReaderAt. This is passed as the
// eBPF process to ExtractTraces so that bpf_probe_read_user hits injected
// faults, while the interpreter manager uses the real process and can still
// read the code objects successfully.
type faultingProcess struct {
process.Process
faultingRM remotememory.RemoteMemory
}

func newFaultingProcess(pr process.Process, shouldFault func(off int64, size int) bool) *faultingProcess {
rm := pr.GetRemoteMemory()
return &faultingProcess{
Process: pr,
faultingRM: remotememory.RemoteMemory{
ReaderAt: &faultingReaderAt{
inner: rm.ReaderAt,
shouldFault: shouldFault,
},
Bias: rm.Bias,
},
}
}

func (fp *faultingProcess) GetRemoteMemory() remotememory.RemoteMemory {
return fp.faultingRM
}

func (fp *faultingProcess) OpenELF(path string) (*pfelf.File, error) {
return fp.Process.(pfelf.ELFOpener).OpenELF(path)
}

// TestPythonRecoverCodeObject tests that Python frames are recovered correctly
// when the eBPF bpf_probe_read_user fails to read a PyCodeObject (e.g. because
// the page was swapped out). The agent-side code should still read the code
// object via the coredump memory (simulating process_vm_readv which supports
// page faults) and produce the same symbolized output.
func TestPythonRecoverCodeObject(t *testing.T) {
cases, err := findTestCases(true)
require.NoError(t, err)

var pythonCases []string
for _, c := range cases {
base := filepath.Base(c)
if !strings.HasPrefix(base, "python") {
continue
}
tc, err := readTestCase(c)
if err != nil || tc.Skip != "" {
continue
}
pythonCases = append(pythonCases, c)
}
require.NotEmpty(t, pythonCases, "no Python test cases found")

cloudClient, err := cloudstore.Client()
require.NoError(t, err)
store, err := modulestore.New(cloudClient,
cloudstore.PublicReadURL(), cloudstore.ModulestoreS3Bucket(), "modulecache")
require.NoError(t, err)

for _, filename := range pythonCases {
t.Run(filename+"/recover", func(t *testing.T) {
testCase, err := readTestCase(filename)
require.NoError(t, err)

core, err := OpenStoreCoredump(store, testCase.CoredumpRef, testCase.Modules)
require.NoError(t, err)
defer core.Close()

// Wrap the process with a faulting reader that fails all
// PyCodeObject-sized reads. Pass it as the eBPF process
// so bpf_probe_read_user faults, while the real process
// is used by the interpreter manager for recovery reads.
faulting := newFaultingProcess(core, func(_ int64, size int) bool {
return size == pyCodeObjectBPFReadSize
})

data, err := ExtractTraces(t.Context(), core, false, nil, faulting)
require.NoError(t, err)
require.Equal(t, testCase.Threads, data)
})
}
}
7 changes: 7 additions & 0 deletions tools/coredump/ebpfcontext.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,13 @@ import (
*/
import "C"

// pyCodeObjectBPFReadSize is the size of PythonUnwindScratchSpace.code,
// the buffer bpf_probe_read_user uses to read a PyCodeObject.
var pyCodeObjectBPFReadSize = func() int {
var pss C.PythonUnwindScratchSpace
return int(unsafe.Sizeof(pss.code))
}()

// ebpfContext is the context for EBPF code regarding the process it's unwinding.
type ebpfContext struct {
// trace will contain the trace from the CGO executed eBPF unwinding code
Expand Down
Loading