Skip to content

Commit cf4738d

Browse files
committed
devices: drop cilium/ebpf{,link} deps
Replace the use of cilium/ebpf and cilium/ebpf/link with direct bpf(2) syscalls. Keep cilium/ebpf/asm for instruction assembly.

Notes:
- The eBPF device-filter programs are now tracked by raw file descriptors instead of *ebpf.Program handles;
- asm.Instructions.Marshal requires a concrete binary.LittleEndian or binary.BigEndian, thus endian_{le,be}.go are introduced as a workaround.

This reduces the runc binary size by about 1M.

Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
1 parent 011bbf1 commit cf4738d

4 files changed

Lines changed: 188 additions & 104 deletions

File tree

devices/ebpf_linux.go

Lines changed: 170 additions & 92 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,138 @@
11
package devices
22

33
import (
4+
"bytes"
45
"errors"
56
"fmt"
67
"os"
78
"runtime"
89
"sync"
910
"unsafe"
1011

11-
"github.com/cilium/ebpf"
1212
"github.com/cilium/ebpf/asm"
13-
"github.com/cilium/ebpf/link"
1413
"github.com/sirupsen/logrus"
1514
"golang.org/x/sys/unix"
1615
)
1716

18-
func findAttachedCgroupDeviceFilters(dirFd int) (_ []*ebpf.Program, retErr error) {
17+
func bpf(cmd uintptr, attr unsafe.Pointer, size uintptr) (uintptr, error) {
18+
r1, _, err := unix.Syscall(unix.SYS_BPF, cmd, uintptr(attr), size)
19+
runtime.KeepAlive(attr)
20+
if err != 0 {
21+
return r1, err
22+
}
23+
return r1, nil
24+
}
25+
26+
// bpfProgLoad loads a BPF_PROG_TYPE_CGROUP_DEVICE program and returns its fd.
27+
func bpfProgLoad(insns asm.Instructions, license string) (int, error) {
28+
buf := bytes.NewBuffer(make([]byte, 0, insns.Size()))
29+
if err := insns.Marshal(buf, nativeEndian); err != nil {
30+
return -1, err
31+
}
32+
insnsBytes := buf.Bytes()
33+
34+
licensePtr, err := unix.BytePtrFromString(license)
35+
if err != nil {
36+
return -1, err
37+
}
38+
39+
// Subset of struct bpf_attr for BPF_PROG_LOAD. Fields past the ones we set
40+
// are left zero; the kernel zero-fills any part of bpf_attr beyond the size
41+
// we pass.
42+
attr := struct {
43+
progType uint32
44+
insnCnt uint32
45+
insns uint64 // pointer
46+
license uint64 // pointer
47+
logLevel uint32
48+
logSize uint32
49+
logBuf uint64 // pointer
50+
}{
51+
progType: unix.BPF_PROG_TYPE_CGROUP_DEVICE,
52+
insnCnt: uint32(len(insnsBytes) / asm.InstructionSize),
53+
insns: uint64(uintptr(unsafe.Pointer(&insnsBytes[0]))),
54+
license: uint64(uintptr(unsafe.Pointer(licensePtr))),
55+
}
56+
57+
fd, err := bpf(unix.BPF_PROG_LOAD, unsafe.Pointer(&attr), unsafe.Sizeof(attr))
58+
// attr holds the pointers as integers, so the GC can't see them; keep the
59+
// referenced objects alive until the syscall returns.
60+
runtime.KeepAlive(insnsBytes)
61+
runtime.KeepAlive(licensePtr)
62+
if err == nil {
63+
return int(fd), nil
64+
}
65+
66+
// The load failed. Retry with the verifier log enabled so we can include
67+
// it in the error (the first attempt skips it, as it is the fast path).
68+
log := make([]byte, 64*1024)
69+
attr.logLevel = 1
70+
attr.logSize = uint32(len(log))
71+
attr.logBuf = uint64(uintptr(unsafe.Pointer(&log[0])))
72+
73+
fd, err = bpf(unix.BPF_PROG_LOAD, unsafe.Pointer(&attr), unsafe.Sizeof(attr))
74+
runtime.KeepAlive(insnsBytes)
75+
runtime.KeepAlive(licensePtr)
76+
runtime.KeepAlive(log)
77+
if err == nil {
78+
return int(fd), nil
79+
}
80+
if n := bytes.IndexByte(log, 0); n > 0 {
81+
return -1, fmt.Errorf("%w: %s", err, bytes.TrimRight(log[:n], "\n"))
82+
}
83+
return -1, err
84+
}
85+
86+
// bpfProgGetFdByID returns the fd for the BPF program with the given ID.
87+
func bpfProgGetFdByID(id uint32) (int, error) {
88+
// The kernel zero-fills the rest of bpf_attr beyond the size we pass.
89+
attr := struct{ id uint32 }{id}
90+
fd, err := bpf(unix.BPF_PROG_GET_FD_BY_ID, unsafe.Pointer(&attr), unsafe.Sizeof(attr))
91+
if err != nil {
92+
return -1, err
93+
}
94+
return int(fd), nil
95+
}
96+
97+
// bpfProgAttach attaches progFd to cgroupFd with the given flags. If replaceFd
98+
// is >= 0, its fd is set in replaceBpfFd (for BPF_F_REPLACE semantics).
99+
func bpfProgAttach(cgroupFd, progFd int, attachFlags uint32, replaceFd int) error {
100+
attr := struct {
101+
targetFd uint32
102+
attachBpfFd uint32
103+
attachType uint32
104+
attachFlags uint32
105+
replaceBpfFd uint32
106+
}{
107+
targetFd: uint32(cgroupFd),
108+
attachBpfFd: uint32(progFd),
109+
attachType: uint32(unix.BPF_CGROUP_DEVICE),
110+
attachFlags: attachFlags,
111+
}
112+
if replaceFd >= 0 {
113+
attr.replaceBpfFd = uint32(replaceFd)
114+
}
115+
_, err := bpf(unix.BPF_PROG_ATTACH, unsafe.Pointer(&attr), unsafe.Sizeof(attr))
116+
return err
117+
}
118+
119+
// bpfProgDetach detaches progFd from cgroupFd.
120+
func bpfProgDetach(cgroupFd, progFd int) error {
121+
// The kernel zero-fills the rest of bpf_attr beyond the size we pass.
122+
attr := struct {
123+
targetFd uint32
124+
attachBpfFd uint32
125+
attachType uint32
126+
}{
127+
targetFd: uint32(cgroupFd),
128+
attachBpfFd: uint32(progFd),
129+
attachType: uint32(unix.BPF_CGROUP_DEVICE),
130+
}
131+
_, err := bpf(unix.BPF_PROG_DETACH, unsafe.Pointer(&attr), unsafe.Sizeof(attr))
132+
return err
133+
}
134+
135+
func findAttachedCgroupDeviceFilters(dirFd int) (_ []int, retErr error) {
19136
type bpfAttrQuery struct {
20137
TargetFd uint32
21138
AttachType uint32
@@ -37,36 +154,33 @@ func findAttachedCgroupDeviceFilters(dirFd int) (_ []*ebpf.Program, retErr error
37154
ProgCnt: uint32(len(progIds)),
38155
}
39156

40-
// Fetch the list of program ids.
41-
_, _, errno := unix.Syscall(unix.SYS_BPF,
42-
uintptr(unix.BPF_PROG_QUERY),
43-
uintptr(unsafe.Pointer(&query)),
44-
unsafe.Sizeof(query))
157+
// Fetch the list of program ids. bpf() keeps &query alive for the
158+
// duration of the syscall, and query.ProgCnt is read right after.
159+
_, err := bpf(unix.BPF_PROG_QUERY, unsafe.Pointer(&query), unsafe.Sizeof(query))
45160
size = int(query.ProgCnt)
46-
runtime.KeepAlive(query)
47-
if errno != 0 {
161+
if err != nil {
48162
// On ENOSPC we get the correct number of programs.
49-
if errno == unix.ENOSPC {
163+
if errors.Is(err, unix.ENOSPC) {
50164
retries++
51165
continue
52166
}
53-
return nil, fmt.Errorf("bpf_prog_query(BPF_CGROUP_DEVICE) failed: %w", errno)
167+
return nil, fmt.Errorf("bpf_prog_query(BPF_CGROUP_DEVICE) failed: %w", err)
54168
}
55169

56-
// Convert the ids to program handles.
57-
// On error we don't return the programs slice, so close the fds stored there.
170+
// Convert the ids to program fds.
171+
// On error we don't return the fds slice, so close the fds stored there.
58172
progIds = progIds[:size]
59-
programs := make([]*ebpf.Program, 0, len(progIds))
173+
fds := make([]int, 0, len(progIds))
60174
defer func() {
61175
if retErr != nil {
62-
for _, p := range programs {
63-
p.Close()
176+
for _, fd := range fds {
177+
unix.Close(fd)
64178
}
65179
}
66180
}()
67181

68182
for _, progId := range progIds {
69-
program, err := ebpf.NewProgramFromID(ebpf.ProgramID(progId))
183+
fd, err := bpfProgGetFdByID(progId)
70184
if err != nil {
71185
// We skip over programs that give us -EACCES or -EPERM. This
72186
// is necessary because there may be BPF programs that have
@@ -83,10 +197,10 @@ func findAttachedCgroupDeviceFilters(dirFd int) (_ []*ebpf.Program, retErr error
83197
}
84198
return nil, fmt.Errorf("cannot fetch program from id: %w", err)
85199
}
86-
programs = append(programs, program)
200+
fds = append(fds, fd)
87201
}
88202
runtime.KeepAlive(progIds)
89-
return programs, nil
203+
return fds, nil
90204
}
91205

92206
return nil, errors.New("could not get complete list of CGROUP_DEVICE programs")
@@ -99,23 +213,17 @@ var (
99213

100214
// Loosely based on the BPF_F_REPLACE support check in
101215
// https://github.com/cilium/ebpf/blob/v0.6.0/link/syscalls.go.
102-
//
103-
// TODO: move this logic to cilium/ebpf
104216
func haveBpfProgReplace() bool {
105217
haveBpfProgReplaceOnce.Do(func() {
106-
prog, err := ebpf.NewProgram(&ebpf.ProgramSpec{
107-
Type: ebpf.CGroupDevice,
108-
License: "MIT",
109-
Instructions: asm.Instructions{
110-
asm.Mov.Imm(asm.R0, 0),
111-
asm.Return(),
112-
},
113-
})
218+
progFd, err := bpfProgLoad(asm.Instructions{
219+
asm.Mov.Imm(asm.R0, 0),
220+
asm.Return(),
221+
}, "MIT")
114222
if err != nil {
115-
logrus.Warnf("checking for BPF_F_REPLACE support: ebpf.NewProgram failed: %v", err)
223+
logrus.Warnf("checking for BPF_F_REPLACE support: bpfProgLoad failed: %v", err)
116224
return
117225
}
118-
defer prog.Close()
226+
defer unix.Close(progFd)
119227

120228
devnull, err := os.Open("/dev/null")
121229
if err != nil {
@@ -127,24 +235,19 @@ func haveBpfProgReplace() bool {
127235
// We know that we have BPF_PROG_ATTACH since we can load
128236
// BPF_CGROUP_DEVICE programs. If passing BPF_F_REPLACE gives us EINVAL
129237
// we know that the feature isn't present.
130-
err = link.RawAttachProgram(link.RawAttachProgramOptions{
131-
// We rely on this fd being checked after attachFlags in the kernel.
132-
Target: int(devnull.Fd()),
133-
// Attempt to "replace" our BPF program with itself. This will
134-
// always fail, but we should get -EINVAL if BPF_F_REPLACE is not
135-
// supported.
136-
Anchor: link.ReplaceProgram(prog),
137-
Program: prog,
138-
Attach: ebpf.AttachCGroupDevice,
139-
Flags: unix.BPF_F_ALLOW_MULTI,
140-
})
141-
if errors.Is(err, ebpf.ErrNotSupported) || errors.Is(err, unix.EINVAL) {
238+
//
239+
// We rely on the target fd being checked after attachFlags in the
240+
// kernel. Attempting to "replace" our BPF program with itself always
241+
// fails, but we should get -EINVAL if BPF_F_REPLACE is not supported,
242+
// and -EBADF (from the dummy target fd) if it is.
243+
err = bpfProgAttach(int(devnull.Fd()), progFd, unix.BPF_F_ALLOW_MULTI|unix.BPF_F_REPLACE, progFd)
244+
if errors.Is(err, unix.EINVAL) {
142245
// not supported
143246
return
144247
}
145248
if !errors.Is(err, unix.EBADF) {
146249
// If we see any new errors here, it's possible that there is a
147-
// regression due to a cilium/ebpf update and the above EINVAL
250+
// regression due to a kernel update and the above EINVAL
148251
// checks are not working. So, be loud about it so someone notices
149252
// and we can get the issue fixed quicker.
150253
logrus.Warnf("checking for BPF_F_REPLACE: got unexpected (not EBADF or EINVAL) error: %v", err)
@@ -169,83 +272,58 @@ func loadAttachCgroupDeviceFilter(insts asm.Instructions, license string, dirFd
169272
_ = unix.Setrlimit(unix.RLIMIT_MEMLOCK, memlockLimit)
170273

171274
// Get the list of existing programs.
172-
oldProgs, err := findAttachedCgroupDeviceFilters(dirFd)
275+
oldFds, err := findAttachedCgroupDeviceFilters(dirFd)
173276
if err != nil {
174277
return err
175278
}
176279
defer func() {
177-
for _, p := range oldProgs {
178-
p.Close()
280+
for _, fd := range oldFds {
281+
unix.Close(fd)
179282
}
180283
}()
181284

182-
useReplaceProg := haveBpfProgReplace() && len(oldProgs) == 1
285+
useReplaceProg := haveBpfProgReplace() && len(oldFds) == 1
183286

184287
// Generate new program.
185-
spec := &ebpf.ProgramSpec{
186-
Type: ebpf.CGroupDevice,
187-
Instructions: insts,
188-
License: license,
189-
}
190-
prog, err := ebpf.NewProgram(spec)
288+
progFd, err := bpfProgLoad(insts, license)
191289
if err != nil {
192290
return err
193291
}
194-
defer prog.Close()
292+
// Once the program is attached, the kernel keeps it alive via the cgroup
293+
// attachment, so we no longer need our own fd; we also don't need it if the
294+
// attach below fails. Either way, close it on return.
295+
defer unix.Close(progFd)
195296

196297
// If there is only one old program, we can just replace it directly.
197-
198-
attachProgramOptions := link.RawAttachProgramOptions{
199-
Target: dirFd,
200-
Program: prog,
201-
Attach: ebpf.AttachCGroupDevice,
202-
Flags: unix.BPF_F_ALLOW_MULTI,
203-
}
204-
298+
replaceFd := -1
299+
attachFlags := uint32(unix.BPF_F_ALLOW_MULTI)
205300
if useReplaceProg {
206-
attachProgramOptions.Anchor = link.ReplaceProgram(oldProgs[0])
301+
replaceFd = oldFds[0]
302+
attachFlags |= unix.BPF_F_REPLACE
207303
}
208-
err = link.RawAttachProgram(attachProgramOptions)
304+
err = bpfProgAttach(dirFd, progFd, attachFlags, replaceFd)
209305
if err != nil {
210306
return fmt.Errorf("failed to call BPF_PROG_ATTACH (BPF_CGROUP_DEVICE, BPF_F_ALLOW_MULTI): %w", err)
211307
}
308+
212309
if !useReplaceProg {
213310
logLevel := logrus.DebugLevel
214311
// If there was more than one old program, give a warning (since this
215312
// really shouldn't happen with runc-managed cgroups) and then detach
216313
// all the old programs.
217-
if len(oldProgs) > 1 {
314+
if len(oldFds) > 1 {
218315
// NOTE: Ideally this should be a warning but it turns out that
219316
// systemd-managed cgroups trigger this warning (apparently
220317
// systemd doesn't delete old non-systemd programs when
221318
// setting properties).
222-
logrus.Infof("found more than one filter (%d) attached to a cgroup -- removing extra filters!", len(oldProgs))
319+
logrus.Infof("found more than one filter (%d) attached to a cgroup -- removing extra filters!", len(oldFds))
223320
logLevel = logrus.InfoLevel
224321
}
225-
for idx, oldProg := range oldProgs {
226-
// Output some extra debug info.
227-
if info, err := oldProg.Info(); err == nil {
228-
fields := logrus.Fields{
229-
"type": info.Type.String(),
230-
"tag": info.Tag,
231-
"name": info.Name,
232-
}
233-
if id, ok := info.ID(); ok {
234-
fields["id"] = id
235-
}
236-
if runCount, ok := info.RunCount(); ok {
237-
fields["run_count"] = runCount
238-
}
239-
if runtime, ok := info.Runtime(); ok {
240-
fields["runtime"] = runtime.String()
241-
}
242-
logrus.WithFields(fields).Logf(logLevel, "removing old filter %d from cgroup", idx)
243-
}
244-
err = link.RawDetachProgram(link.RawDetachProgramOptions{
245-
Target: dirFd,
246-
Program: oldProg,
247-
Attach: ebpf.AttachCGroupDevice,
248-
})
322+
for idx, oldFd := range oldFds {
323+
logrus.WithFields(logrus.Fields{
324+
"fd": oldFd,
325+
}).Logf(logLevel, "removing old filter %d from cgroup", idx)
326+
err = bpfProgDetach(dirFd, oldFd)
249327
if err != nil {
250328
return fmt.Errorf("failed to call BPF_PROG_DETACH (BPF_CGROUP_DEVICE) on old filter program: %w", err)
251329
}

devices/endian_be.go

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
//go:build armbe || arm64be || mips || mips64 || mips64p32 || ppc64 || s390 || s390x || sparc || sparc64
2+
3+
package devices
4+
5+
import "encoding/binary"
6+
7+
// nativeEndian is used as a workaround for cilium/ebpf/asm
8+
// which does not accept binary.NativeEndian.
9+
var nativeEndian = binary.BigEndian

devices/endian_le.go

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
//go:build 386 || amd64 || amd64p32 || arm || arm64 || loong64 || mipsle || mips64le || mips64p32le || ppc64le || riscv64 || wasm
2+
3+
package devices
4+
5+
import "encoding/binary"
6+
7+
// nativeEndian is used as a workaround for cilium/ebpf/asm
8+
// which does not accept binary.NativeEndian.
9+
var nativeEndian = binary.LittleEndian

0 commit comments

Comments
 (0)