11package devices
22
33import (
4+ "bytes"
45 "errors"
56 "fmt"
67 "os"
78 "runtime"
89 "sync"
910 "unsafe"
1011
11- "github.com/cilium/ebpf"
1212 "github.com/cilium/ebpf/asm"
13- "github.com/cilium/ebpf/link"
1413 "github.com/sirupsen/logrus"
1514 "golang.org/x/sys/unix"
1615)
1716
18- func findAttachedCgroupDeviceFilters (dirFd int ) (_ []* ebpf.Program , retErr error ) {
17+ func bpf (cmd uintptr , attr unsafe.Pointer , size uintptr ) (uintptr , error ) {
18+ r1 , _ , err := unix .Syscall (unix .SYS_BPF , cmd , uintptr (attr ), size )
19+ runtime .KeepAlive (attr )
20+ if err != 0 {
21+ return r1 , err
22+ }
23+ return r1 , nil
24+ }
25+
26+ // bpfProgLoad loads a BPF_PROG_TYPE_CGROUP_DEVICE program and returns its fd.
27+ func bpfProgLoad (insns asm.Instructions , license string ) (int , error ) {
28+ buf := bytes .NewBuffer (make ([]byte , 0 , insns .Size ()))
29+ if err := insns .Marshal (buf , nativeEndian ); err != nil {
30+ return - 1 , err
31+ }
32+ insnsBytes := buf .Bytes ()
33+
34+ licensePtr , err := unix .BytePtrFromString (license )
35+ if err != nil {
36+ return - 1 , err
37+ }
38+
39+ // Subset of struct bpf_attr for BPF_PROG_LOAD. Fields past the ones we set
40+ // are left zero; the kernel zero-fills any part of bpf_attr beyond the size
41+ // we pass.
42+ attr := struct {
43+ progType uint32
44+ insnCnt uint32
45+ insns uint64 // pointer
46+ license uint64 // pointer
47+ logLevel uint32
48+ logSize uint32
49+ logBuf uint64 // pointer
50+ }{
51+ progType : unix .BPF_PROG_TYPE_CGROUP_DEVICE ,
52+ insnCnt : uint32 (len (insnsBytes ) / asm .InstructionSize ),
53+ insns : uint64 (uintptr (unsafe .Pointer (& insnsBytes [0 ]))),
54+ license : uint64 (uintptr (unsafe .Pointer (licensePtr ))),
55+ }
56+
57+ fd , err := bpf (unix .BPF_PROG_LOAD , unsafe .Pointer (& attr ), unsafe .Sizeof (attr ))
58+ // attr holds the pointers as integers, so the GC can't see them; keep the
59+ // referenced objects alive until the syscall returns.
60+ runtime .KeepAlive (insnsBytes )
61+ runtime .KeepAlive (licensePtr )
62+ if err == nil {
63+ return int (fd ), nil
64+ }
65+
66+ // The load failed. Retry with the verifier log enabled so we can include
67+ // it in the error (the first attempt skips it, as it is the fast path).
68+ log := make ([]byte , 64 * 1024 )
69+ attr .logLevel = 1
70+ attr .logSize = uint32 (len (log ))
71+ attr .logBuf = uint64 (uintptr (unsafe .Pointer (& log [0 ])))
72+
73+ fd , err = bpf (unix .BPF_PROG_LOAD , unsafe .Pointer (& attr ), unsafe .Sizeof (attr ))
74+ runtime .KeepAlive (insnsBytes )
75+ runtime .KeepAlive (licensePtr )
76+ runtime .KeepAlive (log )
77+ if err == nil {
78+ return int (fd ), nil
79+ }
80+ if n := bytes .IndexByte (log , 0 ); n > 0 {
81+ return - 1 , fmt .Errorf ("%w: %s" , err , bytes .TrimRight (log [:n ], "\n " ))
82+ }
83+ return - 1 , err
84+ }
85+
86+ // bpfProgGetFdByID returns the fd for the BPF program with the given ID.
87+ func bpfProgGetFdByID (id uint32 ) (int , error ) {
88+ // The kernel zero-fills the rest of bpf_attr beyond the size we pass.
89+ attr := struct { id uint32 }{id }
90+ fd , err := bpf (unix .BPF_PROG_GET_FD_BY_ID , unsafe .Pointer (& attr ), unsafe .Sizeof (attr ))
91+ if err != nil {
92+ return - 1 , err
93+ }
94+ return int (fd ), nil
95+ }
96+
97+ // bpfProgAttach attaches progFd to cgroupFd with the given flags. If replaceFd
98+ // is >= 0, its fd is set in replaceBpfFd (for BPF_F_REPLACE semantics).
99+ func bpfProgAttach (cgroupFd , progFd int , attachFlags uint32 , replaceFd int ) error {
100+ attr := struct {
101+ targetFd uint32
102+ attachBpfFd uint32
103+ attachType uint32
104+ attachFlags uint32
105+ replaceBpfFd uint32
106+ }{
107+ targetFd : uint32 (cgroupFd ),
108+ attachBpfFd : uint32 (progFd ),
109+ attachType : uint32 (unix .BPF_CGROUP_DEVICE ),
110+ attachFlags : attachFlags ,
111+ }
112+ if replaceFd >= 0 {
113+ attr .replaceBpfFd = uint32 (replaceFd )
114+ }
115+ _ , err := bpf (unix .BPF_PROG_ATTACH , unsafe .Pointer (& attr ), unsafe .Sizeof (attr ))
116+ return err
117+ }
118+
119+ // bpfProgDetach detaches progFd from cgroupFd.
120+ func bpfProgDetach (cgroupFd , progFd int ) error {
121+ // The kernel zero-fills the rest of bpf_attr beyond the size we pass.
122+ attr := struct {
123+ targetFd uint32
124+ attachBpfFd uint32
125+ attachType uint32
126+ }{
127+ targetFd : uint32 (cgroupFd ),
128+ attachBpfFd : uint32 (progFd ),
129+ attachType : uint32 (unix .BPF_CGROUP_DEVICE ),
130+ }
131+ _ , err := bpf (unix .BPF_PROG_DETACH , unsafe .Pointer (& attr ), unsafe .Sizeof (attr ))
132+ return err
133+ }
134+
135+ func findAttachedCgroupDeviceFilters (dirFd int ) (_ []int , retErr error ) {
19136 type bpfAttrQuery struct {
20137 TargetFd uint32
21138 AttachType uint32
@@ -37,36 +154,33 @@ func findAttachedCgroupDeviceFilters(dirFd int) (_ []*ebpf.Program, retErr error
37154 ProgCnt : uint32 (len (progIds )),
38155 }
39156
40- // Fetch the list of program ids.
41- _ , _ , errno := unix .Syscall (unix .SYS_BPF ,
42- uintptr (unix .BPF_PROG_QUERY ),
43- uintptr (unsafe .Pointer (& query )),
44- unsafe .Sizeof (query ))
157+ // Fetch the list of program ids. bpf() keeps &query alive for the
158+ // duration of the syscall, and query.ProgCnt is read right after.
159+ _ , err := bpf (unix .BPF_PROG_QUERY , unsafe .Pointer (& query ), unsafe .Sizeof (query ))
45160 size = int (query .ProgCnt )
46- runtime .KeepAlive (query )
47- if errno != 0 {
161+ if err != nil {
48162 // On ENOSPC we get the correct number of programs.
49- if errno == unix .ENOSPC {
163+ if errors . Is ( err , unix .ENOSPC ) {
50164 retries ++
51165 continue
52166 }
53- return nil , fmt .Errorf ("bpf_prog_query(BPF_CGROUP_DEVICE) failed: %w" , errno )
167+ return nil , fmt .Errorf ("bpf_prog_query(BPF_CGROUP_DEVICE) failed: %w" , err )
54168 }
55169
56- // Convert the ids to program handles .
57- // On error we don't return the programs slice, so close the fds stored there.
170+ // Convert the ids to program fds .
171+ // On error we don't return the fds slice, so close the fds stored there.
58172 progIds = progIds [:size ]
59- programs := make ([]* ebpf. Program , 0 , len (progIds ))
173+ fds := make ([]int , 0 , len (progIds ))
60174 defer func () {
61175 if retErr != nil {
62- for _ , p := range programs {
63- p .Close ()
176+ for _ , fd := range fds {
177+ unix .Close (fd )
64178 }
65179 }
66180 }()
67181
68182 for _ , progId := range progIds {
69- program , err := ebpf . NewProgramFromID ( ebpf . ProgramID ( progId ) )
183+ fd , err := bpfProgGetFdByID ( progId )
70184 if err != nil {
71185 // We skip over programs that give us -EACCES or -EPERM. This
72186 // is necessary because there may be BPF programs that have
@@ -83,10 +197,10 @@ func findAttachedCgroupDeviceFilters(dirFd int) (_ []*ebpf.Program, retErr error
83197 }
84198 return nil , fmt .Errorf ("cannot fetch program from id: %w" , err )
85199 }
86- programs = append (programs , program )
200+ fds = append (fds , fd )
87201 }
88202 runtime .KeepAlive (progIds )
89- return programs , nil
203+ return fds , nil
90204 }
91205
92206 return nil , errors .New ("could not get complete list of CGROUP_DEVICE programs" )
@@ -99,23 +213,17 @@ var (
99213
100214// Loosely based on the BPF_F_REPLACE support check in
101215// https://github.com/cilium/ebpf/blob/v0.6.0/link/syscalls.go.
102- //
103- // TODO: move this logic to cilium/ebpf
104216func haveBpfProgReplace () bool {
105217 haveBpfProgReplaceOnce .Do (func () {
106- prog , err := ebpf .NewProgram (& ebpf.ProgramSpec {
107- Type : ebpf .CGroupDevice ,
108- License : "MIT" ,
109- Instructions : asm.Instructions {
110- asm .Mov .Imm (asm .R0 , 0 ),
111- asm .Return (),
112- },
113- })
218+ progFd , err := bpfProgLoad (asm.Instructions {
219+ asm .Mov .Imm (asm .R0 , 0 ),
220+ asm .Return (),
221+ }, "MIT" )
114222 if err != nil {
115- logrus .Warnf ("checking for BPF_F_REPLACE support: ebpf.NewProgram failed: %v" , err )
223+ logrus .Warnf ("checking for BPF_F_REPLACE support: bpfProgLoad failed: %v" , err )
116224 return
117225 }
118- defer prog .Close ()
226+ defer unix .Close (progFd )
119227
120228 devnull , err := os .Open ("/dev/null" )
121229 if err != nil {
@@ -127,24 +235,19 @@ func haveBpfProgReplace() bool {
127235 // We know that we have BPF_PROG_ATTACH since we can load
128236 // BPF_CGROUP_DEVICE programs. If passing BPF_F_REPLACE gives us EINVAL
129237 // we know that the feature isn't present.
130- err = link .RawAttachProgram (link.RawAttachProgramOptions {
131- // We rely on this fd being checked after attachFlags in the kernel.
132- Target : int (devnull .Fd ()),
133- // Attempt to "replace" our BPF program with itself. This will
134- // always fail, but we should get -EINVAL if BPF_F_REPLACE is not
135- // supported.
136- Anchor : link .ReplaceProgram (prog ),
137- Program : prog ,
138- Attach : ebpf .AttachCGroupDevice ,
139- Flags : unix .BPF_F_ALLOW_MULTI ,
140- })
141- if errors .Is (err , ebpf .ErrNotSupported ) || errors .Is (err , unix .EINVAL ) {
238+ //
239+ // We rely on the target fd being checked after attachFlags in the
240+ // kernel. Attempting to "replace" our BPF program with itself always
241+ // fails, but we should get -EINVAL if BPF_F_REPLACE is not supported,
242+ // and -EBADF (from the dummy target fd) if it is.
243+ err = bpfProgAttach (int (devnull .Fd ()), progFd , unix .BPF_F_ALLOW_MULTI | unix .BPF_F_REPLACE , progFd )
244+ if errors .Is (err , unix .EINVAL ) {
142245 // not supported
143246 return
144247 }
145248 if ! errors .Is (err , unix .EBADF ) {
146249 // If we see any new errors here, it's possible that there is a
147- // regression due to a cilium/ebpf update and the above EINVAL
250+ // regression due to a kernel update and the above EINVAL
148251 // checks are not working. So, be loud about it so someone notices
149252 // and we can get the issue fixed quicker.
150253 logrus .Warnf ("checking for BPF_F_REPLACE: got unexpected (not EBADF or EINVAL) error: %v" , err )
@@ -169,83 +272,58 @@ func loadAttachCgroupDeviceFilter(insts asm.Instructions, license string, dirFd
169272 _ = unix .Setrlimit (unix .RLIMIT_MEMLOCK , memlockLimit )
170273
171274 // Get the list of existing programs.
172- oldProgs , err := findAttachedCgroupDeviceFilters (dirFd )
275+ oldFds , err := findAttachedCgroupDeviceFilters (dirFd )
173276 if err != nil {
174277 return err
175278 }
176279 defer func () {
177- for _ , p := range oldProgs {
178- p .Close ()
280+ for _ , fd := range oldFds {
281+ unix .Close (fd )
179282 }
180283 }()
181284
182- useReplaceProg := haveBpfProgReplace () && len (oldProgs ) == 1
285+ useReplaceProg := haveBpfProgReplace () && len (oldFds ) == 1
183286
184287 // Generate new program.
185- spec := & ebpf.ProgramSpec {
186- Type : ebpf .CGroupDevice ,
187- Instructions : insts ,
188- License : license ,
189- }
190- prog , err := ebpf .NewProgram (spec )
288+ progFd , err := bpfProgLoad (insts , license )
191289 if err != nil {
192290 return err
193291 }
194- defer prog .Close ()
292+ // Once the program is attached, the kernel keeps it alive via the cgroup
293+ // attachment, so we no longer need our own fd; we also don't need it if the
294+ // attach below fails. Either way, close it on return.
295+ defer unix .Close (progFd )
195296
196297 // If there is only one old program, we can just replace it directly.
197-
198- attachProgramOptions := link.RawAttachProgramOptions {
199- Target : dirFd ,
200- Program : prog ,
201- Attach : ebpf .AttachCGroupDevice ,
202- Flags : unix .BPF_F_ALLOW_MULTI ,
203- }
204-
298+ replaceFd := - 1
299+ attachFlags := uint32 (unix .BPF_F_ALLOW_MULTI )
205300 if useReplaceProg {
206- attachProgramOptions .Anchor = link .ReplaceProgram (oldProgs [0 ])
301+ replaceFd = oldFds [0 ]
302+ attachFlags |= unix .BPF_F_REPLACE
207303 }
208- err = link . RawAttachProgram ( attachProgramOptions )
304+ err = bpfProgAttach ( dirFd , progFd , attachFlags , replaceFd )
209305 if err != nil {
210306 return fmt .Errorf ("failed to call BPF_PROG_ATTACH (BPF_CGROUP_DEVICE, BPF_F_ALLOW_MULTI): %w" , err )
211307 }
308+
212309 if ! useReplaceProg {
213310 logLevel := logrus .DebugLevel
214311 // If there was more than one old program, give a warning (since this
215312 // really shouldn't happen with runc-managed cgroups) and then detach
216313 // all the old programs.
217- if len (oldProgs ) > 1 {
314+ if len (oldFds ) > 1 {
218315 // NOTE: Ideally this should be a warning but it turns out that
219316 // systemd-managed cgroups trigger this warning (apparently
220317 // systemd doesn't delete old non-systemd programs when
221318 // setting properties).
222- logrus .Infof ("found more than one filter (%d) attached to a cgroup -- removing extra filters!" , len (oldProgs ))
319+ logrus .Infof ("found more than one filter (%d) attached to a cgroup -- removing extra filters!" , len (oldFds ))
223320 logLevel = logrus .InfoLevel
224321 }
225- for idx , oldProg := range oldProgs {
226- // Output some extra debug info.
227- if info , err := oldProg .Info (); err == nil {
228- fields := logrus.Fields {
229- "type" : info .Type .String (),
230- "tag" : info .Tag ,
231- "name" : info .Name ,
232- }
233- if id , ok := info .ID (); ok {
234- fields ["id" ] = id
235- }
236- if runCount , ok := info .RunCount (); ok {
237- fields ["run_count" ] = runCount
238- }
239- if runtime , ok := info .Runtime (); ok {
240- fields ["runtime" ] = runtime .String ()
241- }
242- logrus .WithFields (fields ).Logf (logLevel , "removing old filter %d from cgroup" , idx )
243- }
244- err = link .RawDetachProgram (link.RawDetachProgramOptions {
245- Target : dirFd ,
246- Program : oldProg ,
247- Attach : ebpf .AttachCGroupDevice ,
248- })
322+ for idx , oldFd := range oldFds {
323+ logrus .WithFields (logrus.Fields {
324+ "fd" : oldFd ,
325+ }).Logf (logLevel , "removing old filter %d from cgroup" , idx )
326+ err = bpfProgDetach (dirFd , oldFd )
249327 if err != nil {
250328 return fmt .Errorf ("failed to call BPF_PROG_DETACH (BPF_CGROUP_DEVICE) on old filter program: %w" , err )
251329 }
0 commit comments