Skip to content

Commit 9f71a23

Browse files
committed
improve ebpf
1 parent ef3ee97 commit 9f71a23

1 file changed

Lines changed: 136 additions & 41 deletions

File tree

crates/runtime/src/hooks.rs

Lines changed: 136 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -32,33 +32,58 @@ mod imp {
3232
use core::sync::atomic::AtomicBool;
3333
use core::sync::atomic::Ordering;
3434

35-
const SECCOMP_SET_MODE_FILTER: libc::c_uint = 1;
36-
const PR_SET_NO_NEW_PRIVS: libc::c_int = 38;
37-
const AUDIT_ARCH_X86_64: u32 = 0xC000003E;
38-
const __NR_FUTEX: u32 = 202;
39-
const SECCOMP_RET_TRAP: u32 = 0x00030000;
40-
const SECCOMP_RET_ALLOW: u32 = 0x7FFF0000;
35+
// ── constants from kernel headers ──────────────────────────────────
36+
// Most come from `libc` directly; a few are defined here because
37+
// `libc` does not export them (e.g. `AUDIT_ARCH_X86_64`).
38+
const AUDIT_ARCH_X86_64: u32 = 0xC000003E; // <linux/audit.h> — EM_X86_64 (0x3E) | __AUDIT_ARCH_64BIT (0x8000_0000) | __AUDIT_ARCH_LE (0x4000_0000)
39+
40+
// ── BPF instruction builders ───────────────────────────────────────
41+
// Classic BPF instruction format used by seccomp.
42+
// Each instruction is a `sock_filter { code, jt, jf, k }`:
43+
// code — opcode (class | size | mode)
44+
// jt — jump offset if true
45+
// jf — jump offset if false
46+
// k — generic operand / immediate / offset
47+
//
48+
// Available opcode components from <linux/bpf_common.h>:
49+
// class: BPF_LD (0x00), BPF_LDX (0x01), BPF_ALU (0x04), BPF_JMP (0x05), BPF_RET (0x06)
50+
// size: BPF_W (0x00), BPF_H (0x08), BPF_B (0x10)
51+
// mode: BPF_ABS(0x20), BPF_IND(0x40), BPF_MEM(0x60), BPF_LEN(0x80)
52+
// jmp-op: BPF_JA (0x00), BPF_JEQ(0x10), BPF_JGT(0x20), BPF_JGE(0x30), BPF_JSET(0x40)
53+
// alu-op: BPF_ADD(0x00), BPF_SUB(0x10), BPF_MUL(0x20), BPF_AND(0x50)
54+
// src: BPF_K (0x00 — use k field), BPF_X (0x08 — use X register)
55+
56+
/// One BPF statement (no jump): reads data or returns a value.
///
/// `op` is the opcode, assembled by OR-ing the class / size / mode (or
/// class / alu-op / src) components listed above. It is narrowed to the
/// 16-bit `code` field with `as`, so any bits above bit 15 are silently
/// dropped. `k` is stored unchanged as the operand, and both jump offsets
/// (`jt`, `jf`) are set to 0.
57+
fn bpf_stmt(op: u32, k: u32) -> libc::sock_filter {
58+
libc::sock_filter { code: op as u16, jt: 0, jf: 0, k }
59+
}
60+
61+
/// One BPF jump: compares A against `k` and branches.
///
/// `jt` is the jump offset used when the comparison is true and `jf` the
/// offset used when it is false. The call sites in this file treat each
/// offset as the number of instructions to skip forward, with 0 meaning
/// "fall through to the next instruction". `op` is narrowed to the 16-bit
/// `code` field with `as`; `jt`, `jf` and `k` are stored unchanged.
62+
fn bpf_jmp(op: u32, jt: u8, jf: u8, k: u32) -> libc::sock_filter {
63+
libc::sock_filter { code: op as u16, jt, jf, k }
64+
}
4165

4266
/// Install a seccomp BPF filter that traps `futex(FUTEX_WAIT)` and
/// `futex(FUTEX_WAIT_BITSET)`.
4367
///
4468
/// Everything (prctl + sigaction + BPF) is done once per process via
45-
/// a `OnceLock`. The first thread to enter simulation performs the
69+
/// an `AtomicBool`. The first thread to enter simulation performs the
4670
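/// syscalls; threads created afterwards by a filtered thread inherit the
/// filter at creation time. Threads that already exist when the filter is
/// installed are not covered, because the `seccomp` call passes flags `0`
/// (no `SECCOMP_FILTER_FLAG_TSYNC`).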
4771
pub fn install() {
4872
static INSTALLED: AtomicBool = AtomicBool::new(false);
4973
if INSTALLED.swap(true, Ordering::Relaxed) {
5074
return;
5175
}
5276
unsafe {
53-
// `PR_SET_NO_NEW_PRIVS` lets unprivileged threads install a filter.
54-
let ret = libc::prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
77+
// ── step 1: PR_SET_NO_NEW_PRIVS ─────────────────────────────
78+
// Lets unprivileged threads install a seccomp filter.
79+
let ret = libc::prctl(libc::PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
5580
assert_eq!(ret, 0, "parking_detect: PR_SET_NO_NEW_PRIVS failed");
5681

57-
// Install the SIGSYS handler.
82+
// ── step 2: register SIGSYS handler ─────────────────────────
5883
// SA_NODEFER: allow re‑entering the handler if an abort‑time
5984
// syscall also hits the filter.
6085
let mut sa: libc::sigaction = core::mem::zeroed();
61-
sa.sa_flags = 0x0004 | 0x40000000; // SA_SIGINFO | SA_NODEFER
86+
sa.sa_flags = libc::SA_SIGINFO | libc::SA_NODEFER;
6287
let ptr = sigsys_handler as extern "C" fn(i32, *mut libc::siginfo_t, *mut libc::c_void);
6388
// The sa_handler / sa_sigaction field is a union; write via raw
6489
// bytes to avoid fighting the libc type definitions.
@@ -67,7 +92,7 @@ mod imp {
6792
let ret = libc::sigaction(libc::SIGSYS, &sa, core::ptr::null_mut());
6893
assert_eq!(ret, 0, "parking_detect: sigaction(SIGSYS) failed");
6994

70-
// ── BPF filter ──────────────────────────────────────────────────
95+
// ── step 3: install the BPF filter ──────────────────────────
7196
// Every syscall is checked by this 11-instruction seccomp
7297
// program. The kernel provides `struct seccomp_data`:
7398
//
@@ -76,43 +101,113 @@ mod imp {
76101
// 4 4 arch (AUDIT_ARCH_*)
77102
// 24 8 args[1] (futex op | flags)
78103
//
79-
// We check: arch, then nr, then args[1] masked to strip the
80-
// private flag. See each line for its instruction.
104+
// We verify the architecture, then the syscall number,
105+
// then the futex operation (after masking the PRIVATE flag).
81106
let bpf: [libc::sock_filter; 11] = [
82-
// ld [4] — load arch field into accumulator
83-
libc::sock_filter { code: 0x20, jt: 0, jf: 0, k: 4 },
84-
// jeq AUDIT_ARCH_X86_64 — x86_64? continue (jt:0), else jump to insn 10 (jf:8 → ret KILL)
85-
// x86 compat syscalls have a different data layout; they must be rejected.
86-
libc::sock_filter { code: 0x15, jt: 0, jf: 8, k: AUDIT_ARCH_X86_64 },
87-
// ld [0] — load syscall number
88-
libc::sock_filter { code: 0x20, jt: 0, jf: 0, k: 0 },
89-
// jeq __NR_FUTEX (202) — is it futex? continue (jt:0), else jump to insn 9 (jf:5 → ret ALLOW)
90-
libc::sock_filter { code: 0x15, jt: 0, jf: 5, k: __NR_FUTEX },
91-
// ld [24] — load args[1], the futex operation word (op | flags)
92-
// e.g. FUTEX_WAIT (0), FUTEX_WAIT_BITSET (9), FUTEX_PRIVATE_FLAG (0x80)
93-
libc::sock_filter { code: 0x20, jt: 0, jf: 0, k: 24 },
94-
// and 0x7F — strip bit 7 (FUTEX_PRIVATE_FLAG = 0x80)
95-
// After this: FUTEX_WAIT (0) and FUTEX_WAIT|PRIVATE (128) → 0
96-
// FUTEX_WAIT_BITSET (9) and FUTEX_WAIT_BITSET|PRIVATE (137) → 9
97-
libc::sock_filter { code: 0x54, jt: 0, jf: 0, k: 0x7F },
98-
// jeq 0 (FUTEX_WAIT) — if masked op == 0, jump to insn 8 (jt:1 → ret TRAP)
99-
libc::sock_filter { code: 0x15, jt: 1, jf: 0, k: 0 },
100-
// jeq 9 (FUTEX_WAIT_BITSET) — if masked op == 9, fall through to insn 8 (jf:1 → ret ALLOW)
101-
libc::sock_filter { code: 0x15, jt: 0, jf: 1, k: 9 },
102-
// ret SECCOMP_RET_TRAP — deliver SIGSYS, our handler inspects in_simulation()
103-
libc::sock_filter { code: 0x06, jt: 0, jf: 0, k: SECCOMP_RET_TRAP },
104-
// ret SECCOMP_RET_ALLOW — not a futex wait, let it through
105-
libc::sock_filter { code: 0x06, jt: 0, jf: 0, k: SECCOMP_RET_ALLOW },
106-
// ret SECCOMP_RET_KILL — arch mismatch, kill the process
107-
libc::sock_filter { code: 0x06, jt: 0, jf: 0, k: 0 },
107+
// ── insn 0: ld [4] ─────────────────────────────────
108+
// Load the `arch` field of `seccomp_data` into A.
109+
bpf_stmt(
110+
libc::BPF_LD | libc::BPF_W | libc::BPF_ABS,
111+
4,
112+
),
113+
114+
// ── insn 1: jeq AUDIT_ARCH_X86_64, 0, 8 ──────────
115+
// If arch == x86_64 → continue (jt:0 → insn 2).
116+
// Otherwise → jump forward 8 (jf:8 → insn 10, KILL).
117+
// x86 compat syscalls have a different data layout
118+
// and must be rejected outright.
119+
bpf_jmp(
120+
libc::BPF_JMP | libc::BPF_JEQ | libc::BPF_K,
121+
0, 8,
122+
AUDIT_ARCH_X86_64,
123+
),
124+
125+
// ── insn 2: ld [0] ─────────────────────────────────
126+
// Load the `nr` (syscall number) into A.
127+
bpf_stmt(
128+
libc::BPF_LD | libc::BPF_W | libc::BPF_ABS,
129+
0,
130+
),
131+
132+
// ── insn 3: jeq __NR_FUTEX, 0, 5 ─────────────────
133+
// If nr == FUTEX (202) → continue (jt:0 → insn 4).
134+
// Otherwise → jump forward 5 (jf:5 → insn 9, ALLOW).
135+
bpf_jmp(
136+
libc::BPF_JMP | libc::BPF_JEQ | libc::BPF_K,
137+
0, 5,
138+
libc::SYS_futex as u32,
139+
),
140+
141+
// ── insn 4: ld [24] ────────────────────────────────
142+
// Load `args[1]` — the futex operation word (op | flags).
143+
// e.g. FUTEX_WAIT (0), FUTEX_WAIT_BITSET (9),
144+
// FUTEX_PRIVATE_FLAG (0x80)
145+
bpf_stmt(
146+
libc::BPF_LD | libc::BPF_W | libc::BPF_ABS,
147+
24,
148+
),
149+
150+
// ── insn 5: and 0x7F ──────────────────────────────
151+
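// Keep only the low 7 bits of the op word. This strips
// FUTEX_PRIVATE_FLAG (0x80) and every higher flag bit as well,
// e.g. FUTEX_CLOCK_REALTIME (0x100).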
152+
// After masking:
153+
// FUTEX_WAIT (0), FUTEX_WAIT|PRIVATE (0x80) → 0
154+
// FUTEX_WAIT_BITSET (9), FUTEX_WAIT_BITSET|PRIVATE (0x89) → 9
155+
bpf_stmt(
156+
libc::BPF_ALU | libc::BPF_AND | libc::BPF_K,
157+
0x7F,
158+
),
159+
160+
// ── insn 6: jeq 0, 1, 0 ──────────────────────────
161+
// If masked op == FUTEX_WAIT (0) → jump forward 1
162+
// (jt:1 → insn 8, TRAP).
163+
// Otherwise → fall through (jf:0 → insn 7).
164+
bpf_jmp(
165+
libc::BPF_JMP | libc::BPF_JEQ | libc::BPF_K,
166+
1, 0,
167+
0, // FUTEX_WAIT
168+
),
169+
170+
// ── insn 7: jeq 9, 0, 1 ──────────────────────────
171+
// If masked op == FUTEX_WAIT_BITSET (9) → fall
172+
// through (jt:0 → insn 8, TRAP).
173+
// Otherwise → jump forward 1 (jf:1 → insn 9, ALLOW).
174+
bpf_jmp(
175+
libc::BPF_JMP | libc::BPF_JEQ | libc::BPF_K,
176+
0, 1,
177+
9, // FUTEX_WAIT_BITSET
178+
),
179+
180+
// ── insn 8: ret SECCOMP_RET_TRAP ────────────────
181+
// Deliver SIGSYS. Our handler checks
182+
// `sim_std::in_simulation()` and aborts if inside a
183+
// simulation, or skips the instruction otherwise.
184+
bpf_stmt(
185+
libc::BPF_RET | libc::BPF_K,
186+
libc::SECCOMP_RET_TRAP,
187+
),
188+
189+
// ── insn 9: ret SECCOMP_RET_ALLOW ──────────────
190+
// Not a futex wait — let the syscall through.
191+
bpf_stmt(
192+
libc::BPF_RET | libc::BPF_K,
193+
libc::SECCOMP_RET_ALLOW,
194+
),
195+
196+
// ── insn 10: ret SECCOMP_RET_KILL ─────────────
197+
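// Architecture mismatch — kill the calling thread.
// `SECCOMP_RET_KILL` is an alias for `SECCOMP_RET_KILL_THREAD`;
// terminating the whole process would need `SECCOMP_RET_KILL_PROCESS`.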
198+
bpf_stmt(
199+
libc::BPF_RET | libc::BPF_K,
200+
libc::SECCOMP_RET_KILL,
201+
),
108202
];
203+
109204
let prog = libc::sock_fprog {
110205
len: bpf.len() as u16,
111206
filter: &bpf as *const libc::sock_filter as *mut libc::sock_filter,
112207
};
113208
let ret = libc::syscall(
114209
libc::SYS_seccomp,
115-
SECCOMP_SET_MODE_FILTER,
210+
libc::SECCOMP_SET_MODE_FILTER,
116211
0,
117212
&prog,
118213
);

0 commit comments

Comments
 (0)