Skip to content

Commit 75962e1

Browse files
authored
Merge pull request #53 from sysprog21/seccomp
Add opt-in seccomp BPF allowlist behind --seccomp
2 parents 737d00d + 7dcd9fa commit 75962e1

6 files changed

Lines changed: 352 additions & 7 deletions

File tree

.github/workflows/main.yml

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -185,6 +185,19 @@ jobs:
185185
- name: boot test
186186
run: .ci/autorun.sh
187187
timeout-minutes: 5
188+
# Smoke-test the --seccomp opt-in path on the same nested-KVM runner.
189+
# The smoke test only waits for the "Linux version " banner before
190+
# sending Ctrl-A x, but reaching the banner already exercises
191+
# prctl(PR_SET_NO_NEW_PRIVS), seccomp(2)+TSYNC install over the
192+
# already-running serial worker, and the early KVM_RUN dispatch
193+
# under the filter. A regression that drops a steady-state syscall
194+
# from src/seccomp.c's allowlist surfaces here as a SIGSYS before
195+
# the banner appears.
196+
- name: boot test (seccomp)
197+
run: .ci/autorun.sh
198+
env:
199+
KVM_HOST_FLAGS: --seccomp
200+
timeout-minutes: 5
188201

189202
# arm64 host build: configs/linux-arm64.config has no prebuilt path and
190203
# the GitHub-hosted ubuntu-24.04-arm runners (Cobalt 100) do NOT expose

Makefile

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ OBJS := \
3232
virtio-blk.o \
3333
virtio-net.o \
3434
diskimg.o \
35+
seccomp.o \
3536
main.o
3637

3738
ifeq ($(ARCH), x86_64)
@@ -66,9 +67,15 @@ $(OUT)/ext4.img:
6667
$(Q)dd if=/dev/zero of=$@ bs=4k count=600
6768
$(Q)mkfs.ext4 -F $@
6869

70+
# KVM_HOST_FLAGS forwards extra flags to the binary so CI and developers
71+
# can opt into --seccomp without duplicating the recipe. Empty by default
72+
# to keep `make check` matching the documented invocation.
73+
KVM_HOST_FLAGS ?=
74+
6975
check: $(BIN) $(LINUX_IMG) $(ROOTFS_IMG) $(OUT)/ext4.img
7076
$(VECHO) "\nOnce the message 'Kernel panic' appears, press Ctrl-C to exit\n\n"
71-
$(Q)sudo $(BIN) -k $(LINUX_IMG) -i $(ROOTFS_IMG) -d $(OUT)/ext4.img
77+
$(Q)sudo $(BIN) -k $(LINUX_IMG) -i $(ROOTFS_IMG) -d $(OUT)/ext4.img \
78+
$(KVM_HOST_FLAGS)
7279

7380
clean:
7481
$(VECHO) "Cleaning...\n"

README.md

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,14 +43,31 @@ make check
4343
### Start Emulator
4444

4545
```
46-
$ build/kvm-host -k bzImage [-i initrd] [-d disk-image]
46+
$ build/kvm-host -k bzImage [-i initrd] [-d disk-image] [--seccomp]
4747
```
4848

4949
`bzImage` is the path to the Linux kernel bzImage. The bzImage file is in a specific format,
5050
containing concatenated `bootsect.o + setup.o + misc.o + piggy.o`. `initrd` is the path to an
5151
initial RAM disk image, which is an optional argument.
5252
`disk-image` is the path to a disk image that can be mounted as a block device via virtio. For the reference Linux guest, an ext4 filesystem is used for the disk image.
5353

54+
`--seccomp` is an opt-in defense-in-depth flag that installs a seccomp BPF
55+
allowlist over the steady-state KVM_RUN loop. Once active, only the
56+
syscalls that the vcpu, virtio-blk, virtio-net, and serial workers need
57+
are permitted; anything else (including a memory-corruption RCE in
58+
device emulation pivoting to `execve`, `open`, or `socket`) terminates
59+
the process with `SIGSYS`. The filter is applied via `seccomp(2)` with
60+
`SECCOMP_FILTER_FLAG_TSYNC` so already-running worker threads inherit
61+
it. The flag is off by default so existing test and development
62+
workflows are unaffected. CI exercises both paths
63+
(`.github/workflows/main.yml`).
64+
65+
To run `make check` with the filter enabled:
66+
67+
```shell
68+
$ make KVM_HOST_FLAGS=--seccomp check
69+
```
70+
5471
### Exit Emulator
5572

5673
To exit kvm-host, press "Ctrl-A", release both keys, and then press "x".

src/main.c

Lines changed: 25 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,18 @@
44
#include <unistd.h>
55

66
#include "err.h"
7+
#include "seccomp.h"
78
#include "vm.h"
89

910
static char *kernel_file = NULL, *initrd_file = NULL, *diskimg_file = NULL;
11+
static int enable_seccomp = 0;
12+
13+
/* Long-only option ids start above the ASCII range so they can never collide
14+
* with a short-option char in the getopt_long return.
15+
*/
16+
enum {
17+
OPT_SECCOMP = 256,
18+
};
1019

1120
#define print_option(args, help_msg) printf(" %-30s%s", args, help_msg)
1221

@@ -19,6 +28,8 @@ static void usage(const char *execpath)
1928
print_option("-i, --initrd initrd", "Initial RAM disk image\n");
2029
print_option("-d, --disk disk-image",
2130
"Disk image for virtio-blk devices\n");
31+
print_option("--seccomp",
32+
"Install a seccomp BPF allowlist before vm_run.\n");
2233
}
2334

2435
static struct termios saved_attributes;
@@ -50,9 +61,8 @@ int main(int argc, char *argv[])
5061
{
5162
int option_index = 0;
5263
struct option opts[] = {
53-
{"kernel", 1, NULL, 'k'},
54-
{"initrd", 1, NULL, 'i'},
55-
{"disk", 1, NULL, 'd'},
64+
{"kernel", 1, NULL, 'k'}, {"initrd", 1, NULL, 'i'},
65+
{"disk", 1, NULL, 'd'}, {"seccomp", 0, NULL, OPT_SECCOMP},
5666
{"help", 0, NULL, 'h'},
5767
};
5868

@@ -69,6 +79,9 @@ int main(int argc, char *argv[])
6979
case 'd':
7080
diskimg_file = optarg;
7181
break;
82+
case OPT_SECCOMP:
83+
enable_seccomp = 1;
84+
break;
7285
case 'h':
7386
usage(argv[0]);
7487
exit(0);
@@ -97,8 +110,15 @@ int main(int argc, char *argv[])
97110
if (vm_late_init(&vm) < 0)
98111
return -1;
99112

100-
/* Switch the terminal to raw mode only once setup has succeeded so that
101-
* any error from the load/init paths above is rendered on a normal tty.
113+
/* Lock down the syscall surface before raw-mode and vm_run, so a
114+
* memory-corruption RCE in device emulation cannot escape to arbitrary host
115+
* syscalls. Off by default — opt in via --seccomp.
116+
*/
117+
if (enable_seccomp && seccomp_apply() < 0)
118+
return -1;
119+
120+
/* Switch the terminal to raw mode only once setup has succeeded so that any
121+
* error from the load/init paths above is rendered on a normal tty.
102122
*/
103123
set_input_mode();
104124

0 commit comments

Comments
 (0)