Skip to content

Commit f890905

Browse files
committed
libct/exeseal: add annotation to choose runc binary protection mechanism
Introduce the org.opencontainers.runc.clone-self-exe annotation to let users explicitly choose how runc protects the host runc binary against tampering by the container. Previously, runc attempted sealed overlayfs and silently fell back to the clone-binary path on failure, with no way for users to express a preference.

Recognized values:
- independent-data-copy: use the clone-binary path only (memfd, with an internal fallback to a classic unlinked tmpfile on older kernels).
- ro-shared-page: use sealed overlayfs only.

When the annotation is absent, runc's existing default behavior is preserved unchanged (sealed overlayfs, then clone-binary fallback).

The annotation is registered in PotentiallyUnsafeConfigAnnotations because it configures runc's own execution path.

Signed-off-by: Mohammed Aminu Futa <mohammedfuta2000@gmail.com>
1 parent 3e802d1 commit f890905

7 files changed

Lines changed: 176 additions & 16 deletions

File tree

features.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,7 @@ var featuresCommand = &cli.Command{
8080
"bundle",
8181
"org.systemd.property.", // prefix form
8282
"org.criu.config",
83+
"org.opencontainers.runc.clone-self-exe",
8384
},
8485
}
8586

libcontainer/configs/config.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ import (
1919
"github.com/opencontainers/cgroups"
2020
devices "github.com/opencontainers/cgroups/devices/config"
2121
"github.com/opencontainers/runtime-spec/specs-go"
22+
"github.com/opencontainers/runc/libcontainer/exeseal"
2223
)
2324

2425
type Rlimit struct {
@@ -206,6 +207,9 @@ type Config struct {
206207
// Labels are user defined metadata that is stored in the config and populated on the state
207208
Labels []string `json:"labels"`
208209

210+
// CloneSelfExe selects how runc protects the host runc binary against tampering by the container.
211+
CloneSelfExe exeseal.Mode `json:"clone_self_exe,omitempty"`
212+
209213
// NoNewKeyring will not allocate a new session keyring for the container. It will use the
210214
// caller's keyring in this case.
211215
NoNewKeyring bool `json:"no_new_keyring,omitempty"`

libcontainer/container_linux.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -538,7 +538,7 @@ func (c *Container) newParentProcess(p *Process) (parentProcess, error) {
538538
exePath = "/proc/self/exe"
539539
} else {
540540
var err error
541-
safeExe, err = exeseal.CloneSelfExe(c.stateDir)
541+
safeExe, err = exeseal.CloneSelfExe(c.stateDir, c.config.CloneSelfExe)
542542
if err != nil {
543543
return nil, fmt.Errorf("unable to create safe /proc/self/exe clone for runc init: %w", err)
544544
}

libcontainer/exeseal/cloned_binary_linux.go

Lines changed: 36 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -215,24 +215,45 @@ func IsCloned(exe *os.File) bool {
215215
// /proc/self/exe). This binary can then be used for "runc init" in order to
216216
// make sure the container process can never resolve the original runc binary.
217217
// For more details on why this is necessary, see CVE-2019-5736.
218-
func CloneSelfExe(tmpDir string) (*os.File, error) {
219-
// Try to create a temporary overlayfs to produce a readonly version of
220-
// /proc/self/exe that cannot be "unwrapped" by the container. In contrast
221-
// to CloneBinary, this technique does not require any extra memory usage
222-
// and does not have the (fairly noticeable) performance impact of copying
223-
// a large binary file into a memfd.
224-
//
225-
// Based on some basic performance testing, the overlayfs approach has
226-
// effectively no performance overhead (it is on par with both
227-
// MS_BIND+MS_RDONLY and no binary cloning at all) while memfd copying adds
228-
// around ~60% overhead during container startup.
229-
overlayFile, err := sealedOverlayfs("/proc/self/exe", tmpDir)
230-
if err == nil {
218+
func CloneSelfExe(tmpDir string, mode Mode) (*os.File, error) {
219+
switch mode {
220+
case ModeROSharedPage:
221+
overlayFile, err := sealedOverlayfs("/proc/self/exe", tmpDir)
222+
if err != nil {
223+
return nil, fmt.Errorf("%s=ro-shared-page requested but overlayfs unavailable: %w",
224+
AnnotationKey, err)
225+
}
231226
logrus.Debug("runc exeseal: using overlayfs for sealed /proc/self/exe") // used for tests
232227
return overlayFile, nil
228+
229+
case ModeIndependentDataCopy:
230+
return cloneSelfExeViaCloneBinary(tmpDir)
231+
232+
case ModeUnset:
233+
// Try to create a temporary overlayfs to produce a readonly version of
234+
// /proc/self/exe that cannot be "unwrapped" by the container. In contrast
235+
// to CloneBinary, this technique does not require any extra memory usage
236+
// and does not have the (fairly noticeable) performance impact of copying
237+
// a large binary file into a memfd.
238+
//
239+
// Based on some basic performance testing, the overlayfs approach has
240+
// effectively no performance overhead (it is on par with both
241+
// MS_BIND+MS_RDONLY and no binary cloning at all) while memfd copying adds
242+
// around ~60% overhead during container startup.
243+
overlayFile, err := sealedOverlayfs("/proc/self/exe", tmpDir)
244+
if err == nil {
245+
logrus.Debug("runc exeseal: using overlayfs for sealed /proc/self/exe") // used for tests
246+
return overlayFile, nil
247+
}
248+
logrus.WithError(err).Debugf("could not use overlayfs for /proc/self/exe sealing -- falling back to making a temporary copy")
249+
return cloneSelfExeViaCloneBinary(tmpDir)
250+
251+
default:
252+
return nil, fmt.Errorf("internal error: unhandled CloneSelfExe mode %v", mode)
233253
}
234-
logrus.WithError(err).Debugf("could not use overlayfs for /proc/self/exe sealing -- falling back to making a temporary copy")
254+
}
235255

256+
func cloneSelfExeViaCloneBinary(tmpDir string) (*os.File, error) {
236257
selfExe, err := os.Open("/proc/self/exe")
237258
if err != nil {
238259
return nil, fmt.Errorf("opening current binary: %w", err)
@@ -244,7 +265,7 @@ func CloneSelfExe(tmpDir string) (*os.File, error) {
244265
return nil, fmt.Errorf("checking /proc/self/exe size: %w", err)
245266
}
246267
size := stat.Size()
247-
268+
logrus.Debug("runc exeseal: using clone-binary path") // used for tests
248269
return CloneBinary(selfExe, size, "/proc/self/exe", tmpDir)
249270
}
250271

libcontainer/exeseal/mode.go

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
package exeseal
2+
3+
import "fmt"
4+
// AnnotationKey is the OCI annotation that selects how runc protects
// the host runc binary against tampering by the container. See
// ParseMode for the recognized values and behavior.
const AnnotationKey = "org.opencontainers.runc.clone-self-exe"

// Canonical annotation values. They are declared exactly once so that
// Mode.String, ParseMode, and ParseMode's error message cannot drift apart.
const (
	valueIndependentDataCopy = "independent-data-copy"
	valueROSharedPage        = "ro-shared-page"
)

// Mode is the mechanism selected to protect the host runc binary.
// See ParseMode for the recognized annotation values.
type Mode int

const (
	// ModeUnset means the annotation was not present in the config.
	// It is the zero value of Mode.
	ModeUnset Mode = iota
	// ModeIndependentDataCopy corresponds to the "independent-data-copy"
	// annotation value.
	ModeIndependentDataCopy
	// ModeROSharedPage corresponds to the "ro-shared-page" annotation value.
	ModeROSharedPage
)

// String returns the canonical annotation value for a Mode, or
// "<unset>" for ModeUnset (which has no annotation form). Values outside
// the declared constants are rendered as "unknown(N)".
func (m Mode) String() string {
	switch m {
	case ModeUnset:
		return "<unset>"
	case ModeIndependentDataCopy:
		return valueIndependentDataCopy
	case ModeROSharedPage:
		return valueROSharedPage
	default:
		return fmt.Sprintf("unknown(%d)", int(m))
	}
}

// ParseMode converts an annotation value string into a Mode.
//
// Recognized values:
//   - "independent-data-copy": use the clone-binary path (memfd, with
//     an internal fallback to a classic unlinked tmpfile on older
//     kernels). Sealed overlayfs is not attempted.
//   - "ro-shared-page": use sealed overlayfs only; fail
//     container creation if it is not available.
//
// If the annotation is absent, callers should use ModeUnset rather than
// calling ParseMode.
//
// Explicit values do not fall back to the other mechanism on failure; this
// is intentional. If a caller has expressed a preference, silently getting
// the other mechanism defeats the purpose of the annotation.
//
// Matching is exact and case-sensitive. Unrecognized values, including the
// empty string, return ModeUnset and an error.
func ParseMode(value string) (Mode, error) {
	switch value {
	case valueIndependentDataCopy:
		return ModeIndependentDataCopy, nil
	case valueROSharedPage:
		return ModeROSharedPage, nil
	default:
		return ModeUnset, fmt.Errorf(
			"invalid %s value %q (want %q or %q)",
			AnnotationKey, value,
			valueIndependentDataCopy, valueROSharedPage,
		)
	}
}

libcontainer/exeseal/mode_test.go

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
package exeseal
2+
3+
import "testing"
4+
5+
func TestParseMode(t *testing.T) {
6+
cases := []struct {
7+
in string
8+
want Mode
9+
wantErr bool
10+
}{
11+
{"independent-data-copy", ModeIndependentDataCopy, false},
12+
{"ro-shared-page", ModeROSharedPage, false},
13+
{"", ModeUnset, true}, // empty must error, not default
14+
{"auto", ModeUnset, true}, // not a valid value in this scheme
15+
{"independent_data_copy", ModeUnset, true}, // underscore typo
16+
{"INDEPENDENT-DATA-COPY", ModeUnset, true}, // case-sensitive
17+
{"ro-shared", ModeUnset, true}, // truncated
18+
{"memfd-clone", ModeUnset, true}, // an earlier name we considered
19+
{"clone-binary", ModeUnset, true}, // an even earlier name
20+
{"ro-overlayfs", ModeUnset, true}, // implementation-named alternative
21+
}
22+
for _, tc := range cases {
23+
t.Run(tc.in, func(t *testing.T) {
24+
got, err := ParseMode(tc.in)
25+
if (err != nil) != tc.wantErr {
26+
t.Fatalf("ParseMode(%q) err=%v, wantErr=%v", tc.in, err, tc.wantErr)
27+
}
28+
if !tc.wantErr && got != tc.want {
29+
t.Errorf("ParseMode(%q) = %v, want %v", tc.in, got, tc.want)
30+
}
31+
})
32+
}
33+
}
34+
35+
func TestModeStringRoundTrip(t *testing.T) {
36+
// For every Mode that has a parseable string form, String() should
37+
// produce something ParseMode accepts (and vice versa).
38+
for _, m := range []Mode{ModeIndependentDataCopy, ModeROSharedPage} {
39+
s := m.String()
40+
got, err := ParseMode(s)
41+
if err != nil {
42+
t.Errorf("ParseMode(%q) (from Mode(%d).String()) failed: %v", s, int(m), err)
43+
continue
44+
}
45+
if got != m {
46+
t.Errorf("round trip: Mode %v -> %q -> Mode %v", m, s, got)
47+
}
48+
}
49+
// ModeUnset deliberately has no parseable string form.
50+
if s := ModeUnset.String(); s == "independent-data-copy" || s == "ro-shared-page" {
51+
t.Errorf("ModeUnset.String() = %q collides with a valid annotation value", s)
52+
}
53+
}

libcontainer/specconv/spec_linux.go

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ import (
2727
"github.com/opencontainers/runc/libcontainer/configs"
2828
"github.com/opencontainers/runc/libcontainer/internal/userns"
2929
"github.com/opencontainers/runc/libcontainer/seccomp"
30+
"github.com/opencontainers/runc/libcontainer/exeseal"
3031
)
3132

3233
var (
@@ -432,6 +433,13 @@ func CreateLibcontainerConfig(opts *CreateOpts) (*configs.Config, error) {
432433
}
433434

434435
config.Cgroups = c
436+
437+
cloneSelfExe, err := initCloneSelfExeMode(spec)
438+
if err != nil {
439+
return nil, err
440+
}
441+
config.CloneSelfExe = cloneSelfExe
442+
435443
// set linux-specific config
436444
if spec.Linux != nil {
437445
initMaps()
@@ -775,6 +783,14 @@ func initSystemdProps(spec *specs.Spec) ([]systemdDbus.Property, error) {
775783
return sp, nil
776784
}
777785

786+
func initCloneSelfExeMode(spec *specs.Spec) (exeseal.Mode, error) {
787+
value, ok := spec.Annotations[exeseal.AnnotationKey]
788+
if !ok {
789+
return exeseal.ModeUnset, nil
790+
}
791+
return exeseal.ParseMode(value)
792+
}
793+
778794
func CreateCgroupConfig(opts *CreateOpts, defaultDevs []*devices.Device) (*cgroups.Cgroup, error) {
779795
var (
780796
myCgroupPath string

0 commit comments

Comments
 (0)