@@ -32,33 +32,58 @@ mod imp {
3232 use core:: sync:: atomic:: AtomicBool ;
3333 use core:: sync:: atomic:: Ordering ;
3434
35- const SECCOMP_SET_MODE_FILTER : libc:: c_uint = 1 ;
36- const PR_SET_NO_NEW_PRIVS : libc:: c_int = 38 ;
37- const AUDIT_ARCH_X86_64 : u32 = 0xC000003E ;
38- const __NR_FUTEX: u32 = 202 ;
39- const SECCOMP_RET_TRAP : u32 = 0x00030000 ;
40- const SECCOMP_RET_ALLOW : u32 = 0x7FFF0000 ;
35+ // ── constants from kernel headers ──────────────────────────────────
36+ // Most come from `libc` directly; a few are defined here because
37+ // `libc` does not export them (e.g. `AUDIT_ARCH_X86_64`).
38+ const AUDIT_ARCH_X86_64 : u32 = 0xC000003E ; // <linux/audit.h> — EM_X86_64 | __AUDIT_ARCH_64BIT | __AUDIT_ARCH_LE
39+
40+ // ── BPF instruction builders ───────────────────────────────────────
41+ // Classic BPF instruction format used by seccomp.
42+ // Each instruction is a `sock_filter { code, jt, jf, k }`:
43+ // code — opcode (class | size | mode)
44+ // jt — jump offset if true
45+ // jf — jump offset if false
46+ // k — generic operand / immediate / offset
47+ //
48+ // Available opcode components from <linux/bpf_common.h>:
49+ // class: BPF_LD (0x00), BPF_LDX (0x01), BPF_ALU (0x04), BPF_JMP (0x05), BPF_RET (0x06)
50+ // size: BPF_W (0x00), BPF_H (0x08), BPF_B (0x10)
51+ // mode: BPF_ABS(0x20), BPF_IND(0x40), BPF_MEM(0x60), BPF_LEN(0x80)
52+ // jmp-op: BPF_JA (0x00), BPF_JEQ(0x10), BPF_JGT(0x20), BPF_JGE(0x30), BPF_JSET(0x40)
53+ // alu-op: BPF_ADD(0x00), BPF_SUB(0x10), BPF_MUL(0x20), BPF_AND(0x50)
54+ // src: BPF_K (0x00 — use k field), BPF_X (0x08 — use X register)
55+
56+ /// One BPF statement (no jump): reads data or returns a value.
57+ fn bpf_stmt ( op : u32 , k : u32 ) -> libc:: sock_filter {
58+ libc:: sock_filter { code : op as u16 , jt : 0 , jf : 0 , k }
59+ }
60+
61+ /// One BPF jump: compares A against `k` and branches.
62+ fn bpf_jmp ( op : u32 , jt : u8 , jf : u8 , k : u32 ) -> libc:: sock_filter {
63+ libc:: sock_filter { code : op as u16 , jt, jf, k }
64+ }
4165
4266 /// Install a seccomp BPF filter that traps `futex(FUTEX_WAIT)`.
4367 ///
4468 /// Everything (prctl + sigaction + BPF) is done once per process via
45- /// a `OnceLock `. The first thread to enter simulation performs the
69+ /// an `AtomicBool `. The first thread to enter simulation performs the
4670 /// syscalls; subsequent threads inherit the filter at creation time.
4771 pub fn install ( ) {
4872 static INSTALLED : AtomicBool = AtomicBool :: new ( false ) ;
4973 if INSTALLED . swap ( true , Ordering :: Relaxed ) {
5074 return ;
5175 }
5276 unsafe {
53- // `PR_SET_NO_NEW_PRIVS` lets unprivileged threads install a filter.
54- let ret = libc:: prctl ( PR_SET_NO_NEW_PRIVS , 1 , 0 , 0 , 0 ) ;
77+ // ── step 1: PR_SET_NO_NEW_PRIVS ─────────────────────────────
78+ // Lets unprivileged threads install a seccomp filter.
79+ let ret = libc:: prctl ( libc:: PR_SET_NO_NEW_PRIVS , 1 , 0 , 0 , 0 ) ;
5580 assert_eq ! ( ret, 0 , "parking_detect: PR_SET_NO_NEW_PRIVS failed" ) ;
5681
57- // Install the SIGSYS handler.
82+ // ── step 2: register SIGSYS handler ─────────────────────────
5883 // SA_NODEFER: allow re‑entering the handler if an abort‑time
5984 // syscall also hits the filter.
6085 let mut sa: libc:: sigaction = core:: mem:: zeroed ( ) ;
61- sa. sa_flags = 0x0004 | 0x40000000 ; // SA_SIGINFO | SA_NODEFER
86+ sa. sa_flags = libc :: SA_SIGINFO | libc :: SA_NODEFER ;
6287 let ptr = sigsys_handler as extern "C" fn ( i32 , * mut libc:: siginfo_t , * mut libc:: c_void ) ;
6388 // The sa_handler / sa_sigaction field is a union; write via raw
6489 // bytes to avoid fighting the libc type definitions.
@@ -67,7 +92,7 @@ mod imp {
6792 let ret = libc:: sigaction ( libc:: SIGSYS , & sa, core:: ptr:: null_mut ( ) ) ;
6893 assert_eq ! ( ret, 0 , "parking_detect: sigaction(SIGSYS) failed" ) ;
6994
70- // ── BPF filter ──────────────────────── ──────────────────────────
95+ // ── step 3: install the BPF filter ──────────────────────────
7196 // Every syscall is checked by this 11-instruction seccomp
7297 // program. The kernel provides `struct seccomp_data`:
7398 //
@@ -76,43 +101,113 @@ mod imp {
76101 // 4 4 arch (AUDIT_ARCH_*)
77102 // 24 8 args[1] (futex op | flags)
78103 //
79- // We check: arch, then nr , then args[1] masked to strip the
80- // private flag. See each line for its instruction .
104+ // We verify the architecture , then the syscall number,
105+ // then the futex operation (after masking the PRIVATE flag) .
81106 let bpf: [ libc:: sock_filter ; 11 ] = [
82- // ld [4] — load arch field into accumulator
83- libc:: sock_filter { code : 0x20 , jt : 0 , jf : 0 , k : 4 } ,
84- // jeq AUDIT_ARCH_X86_64 — x86_64? continue (jt:0), else jump to insn 10 (jf:8 → ret KILL)
85- // x86 compat syscalls have a different data layout; they must be rejected.
86- libc:: sock_filter { code : 0x15 , jt : 0 , jf : 8 , k : AUDIT_ARCH_X86_64 } ,
87- // ld [0] — load syscall number
88- libc:: sock_filter { code : 0x20 , jt : 0 , jf : 0 , k : 0 } ,
89- // jeq __NR_FUTEX (202) — is it futex? continue (jt:0), else jump to insn 9 (jf:5 → ret ALLOW)
90- libc:: sock_filter { code : 0x15 , jt : 0 , jf : 5 , k : __NR_FUTEX } ,
91- // ld [24] — load args[1], the futex operation word (op | flags)
92- // e.g. FUTEX_WAIT (0), FUTEX_WAIT_BITSET (9), FUTEX_PRIVATE_FLAG (0x80)
93- libc:: sock_filter { code : 0x20 , jt : 0 , jf : 0 , k : 24 } ,
94- // and 0x7F — strip bit 7 (FUTEX_PRIVATE_FLAG = 0x80)
95- // After this: FUTEX_WAIT (0) and FUTEX_WAIT|PRIVATE (128) → 0
96- // FUTEX_WAIT_BITSET (9) and FUTEX_WAIT_BITSET|PRIVATE (137) → 9
97- libc:: sock_filter { code : 0x54 , jt : 0 , jf : 0 , k : 0x7F } ,
98- // jeq 0 (FUTEX_WAIT) — if masked op == 0, jump to insn 8 (jt:1 → ret TRAP)
99- libc:: sock_filter { code : 0x15 , jt : 1 , jf : 0 , k : 0 } ,
100- // jeq 9 (FUTEX_WAIT_BITSET) — if masked op == 9, fall through to insn 8 (jf:1 → ret ALLOW)
101- libc:: sock_filter { code : 0x15 , jt : 0 , jf : 1 , k : 9 } ,
102- // ret SECCOMP_RET_TRAP — deliver SIGSYS, our handler inspects in_simulation()
103- libc:: sock_filter { code : 0x06 , jt : 0 , jf : 0 , k : SECCOMP_RET_TRAP } ,
104- // ret SECCOMP_RET_ALLOW — not a futex wait, let it through
105- libc:: sock_filter { code : 0x06 , jt : 0 , jf : 0 , k : SECCOMP_RET_ALLOW } ,
106- // ret SECCOMP_RET_KILL — arch mismatch, kill the process
107- libc:: sock_filter { code : 0x06 , jt : 0 , jf : 0 , k : 0 } ,
107+ // ── insn 0: ld [4] ─────────────────────────────────
108+ // Load the `arch` field of `seccomp_data` into A.
109+ bpf_stmt (
110+ libc:: BPF_LD | libc:: BPF_W | libc:: BPF_ABS ,
111+ 4 ,
112+ ) ,
113+
114+ // ── insn 1: jeq AUDIT_ARCH_X86_64, 0, 8 ──────────
115+ // If arch == x86_64 → continue (jt:0 → insn 2).
116+ // Otherwise → jump forward 8 (jf:8 → insn 10, KILL).
117+ // x86 compat syscalls have a different data layout
118+ // and must be rejected outright.
119+ bpf_jmp (
120+ libc:: BPF_JMP | libc:: BPF_JEQ | libc:: BPF_K ,
121+ 0 , 8 ,
122+ AUDIT_ARCH_X86_64 ,
123+ ) ,
124+
125+ // ── insn 2: ld [0] ─────────────────────────────────
126+ // Load the `nr` (syscall number) into A.
127+ bpf_stmt (
128+ libc:: BPF_LD | libc:: BPF_W | libc:: BPF_ABS ,
129+ 0 ,
130+ ) ,
131+
132+ // ── insn 3: jeq __NR_FUTEX, 0, 5 ─────────────────
133+ // If nr == FUTEX (202) → continue (jt:0 → insn 4).
134+ // Otherwise → jump forward 5 (jf:5 → insn 9, ALLOW).
135+ bpf_jmp (
136+ libc:: BPF_JMP | libc:: BPF_JEQ | libc:: BPF_K ,
137+ 0 , 5 ,
138+ libc:: SYS_futex as u32 ,
139+ ) ,
140+
141+ // ── insn 4: ld [24] ────────────────────────────────
142+ // Load `args[1]` — the futex operation word (op | flags).
143+ // e.g. FUTEX_WAIT (0), FUTEX_WAIT_BITSET (9),
144+ // FUTEX_PRIVATE_FLAG (0x80)
145+ bpf_stmt (
146+ libc:: BPF_LD | libc:: BPF_W | libc:: BPF_ABS ,
147+ 24 ,
148+ ) ,
149+
150+ // ── insn 5: and 0x7F ──────────────────────────────
151+ // Strip the PRIVATE flag bit (0x80).
152+ // After masking:
153+ // FUTEX_WAIT (0), FUTEX_WAIT|PRIVATE (0x80) → 0
154+ // FUTEX_WAIT_BITSET (9), FUTEX_WAIT_BITSET|PRIVATE (0x89) → 9
155+ bpf_stmt (
156+ libc:: BPF_ALU | libc:: BPF_AND | libc:: BPF_K ,
157+ 0x7F ,
158+ ) ,
159+
160+ // ── insn 6: jeq 0, 1, 0 ──────────────────────────
161+ // If masked op == FUTEX_WAIT (0) → jump forward 1
162+ // (jt:1 → insn 8, TRAP).
163+ // Otherwise → fall through (jf:0 → insn 7).
164+ bpf_jmp (
165+ libc:: BPF_JMP | libc:: BPF_JEQ | libc:: BPF_K ,
166+ 1 , 0 ,
167+ 0 , // FUTEX_WAIT
168+ ) ,
169+
170+ // ── insn 7: jeq 9, 0, 1 ──────────────────────────
171+ // If masked op == FUTEX_WAIT_BITSET (9) → fall
172+ // through (jt:0 → insn 8, TRAP).
173+ // Otherwise → jump forward 1 (jf:1 → insn 9, ALLOW).
174+ bpf_jmp (
175+ libc:: BPF_JMP | libc:: BPF_JEQ | libc:: BPF_K ,
176+ 0 , 1 ,
177+ 9 , // FUTEX_WAIT_BITSET
178+ ) ,
179+
180+ // ── insn 8: ret SECCOMP_RET_TRAP ────────────────
181+ // Deliver SIGSYS. Our handler checks
182+ // `sim_std::in_simulation()` and aborts if inside a
183+ // simulation, or skips the instruction otherwise.
184+ bpf_stmt (
185+ libc:: BPF_RET | libc:: BPF_K ,
186+ libc:: SECCOMP_RET_TRAP ,
187+ ) ,
188+
189+ // ── insn 9: ret SECCOMP_RET_ALLOW ──────────────
190+ // Not a futex wait — let the syscall through.
191+ bpf_stmt (
192+ libc:: BPF_RET | libc:: BPF_K ,
193+ libc:: SECCOMP_RET_ALLOW ,
194+ ) ,
195+
196+ // ── insn 10: ret SECCOMP_RET_KILL ─────────────
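197+ // Architecture mismatch — kill the offending thread. `SECCOMP_RET_KILL` is an alias for `SECCOMP_RET_KILL_THREAD`; only `SECCOMP_RET_KILL_PROCESS` terminates the whole process.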
198+ bpf_stmt (
199+ libc:: BPF_RET | libc:: BPF_K ,
200+ libc:: SECCOMP_RET_KILL ,
201+ ) ,
108202 ] ;
203+
109204 let prog = libc:: sock_fprog {
110205 len : bpf. len ( ) as u16 ,
111206 filter : & bpf as * const libc:: sock_filter as * mut libc:: sock_filter ,
112207 } ;
113208 let ret = libc:: syscall (
114209 libc:: SYS_seccomp ,
115- SECCOMP_SET_MODE_FILTER ,
210+ libc :: SECCOMP_SET_MODE_FILTER ,
116211 0 ,
117212 & prog,
118213 ) ;
0 commit comments