diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 7f6799f18f6..5a53f1de1eb 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -28,6 +28,7 @@ jobs: rootless: ["rootless", ""] race: ["-race", ""] criu: ["", "criu-dev"] + dmz: ["", "runc_nodmz"] exclude: - criu: criu-dev rootless: rootless @@ -35,6 +36,10 @@ jobs: go-version: 1.20.x - criu: criu-dev race: -race + - dmz: runc_nodmz + criu: criu-dev + - dmz: runc_nodmz + os: ubuntu-20.04 runs-on: ${{ matrix.os }} steps: @@ -71,6 +76,8 @@ jobs: go-version: ${{ matrix.go-version }} - name: build + env: + EXTRA_BUILDTAGS: ${{ matrix.dmz }} run: sudo -E PATH="$PATH" make EXTRA_FLAGS="${{ matrix.race }}" all - name: install bats @@ -80,6 +87,8 @@ jobs: - name: unit test if: matrix.rootless != 'rootless' + env: + EXTRA_BUILDTAGS: ${{ matrix.dmz }} run: sudo -E PATH="$PATH" -- make TESTFLAGS="${{ matrix.race }}" localunittest - name: add rootless user @@ -113,8 +122,12 @@ jobs: # However, we do not have 32-bit ARM CI, so we use i386 for testing 32bit stuff. # We are not interested in providing official support for i386. 
cross-i386: - runs-on: ubuntu-22.04 timeout-minutes: 15 + strategy: + fail-fast: false + matrix: + dmz: ["", "runc_nodmz"] + runs-on: ubuntu-22.04 steps: @@ -136,4 +149,6 @@ jobs: go-version: 1.x # Latest stable - name: unit test + env: + EXTRA_BUILDTAGS: ${{ matrix.dmz }} run: sudo -E PATH="$PATH" -- make GOARCH=386 localunittest diff --git a/.gitignore b/.gitignore index 4df0d6abfde..f022ed275cd 100644 --- a/.gitignore +++ b/.gitignore @@ -1,10 +1,11 @@ vendor/pkg /runc /runc-* -contrib/cmd/recvtty/recvtty -contrib/cmd/sd-helper/sd-helper -contrib/cmd/seccompagent/seccompagent -contrib/cmd/fs-idmap/fs-idmap +/contrib/cmd/recvtty/recvtty +/contrib/cmd/sd-helper/sd-helper +/contrib/cmd/seccompagent/seccompagent +/contrib/cmd/fs-idmap/fs-idmap +/contrib/cmd/memfd-bind/memfd-bind man/man8 release Vagrantfile diff --git a/.golangci-extra.yml b/.golangci-extra.yml index be33f90d7f9..23b57e040b6 100644 --- a/.golangci-extra.yml +++ b/.golangci-extra.yml @@ -7,6 +7,7 @@ run: build-tags: - seccomp + - runc_nodmz linters: disable-all: true diff --git a/.golangci.yml b/.golangci.yml index 96b321019e4..c088117d2ca 100644 --- a/.golangci.yml +++ b/.golangci.yml @@ -3,6 +3,7 @@ run: build-tags: - seccomp + - runc_nodmz linters: enable: diff --git a/Dockerfile b/Dockerfile index 9fd29a59371..6fa8752b5e3 100644 --- a/Dockerfile +++ b/Dockerfile @@ -9,19 +9,15 @@ ARG CRIU_REPO=https://download.opensuse.org/repositories/devel:/tools:/criu/Debi RUN KEYFILE=/usr/share/keyrings/criu-repo-keyring.gpg; \ wget -nv $CRIU_REPO/Release.key -O- | gpg --dearmor > "$KEYFILE" \ && echo "deb [signed-by=$KEYFILE] $CRIU_REPO/ /" > /etc/apt/sources.list.d/criu.list \ + && dpkg --add-architecture i386 \ && apt-get update \ && apt-get install -y --no-install-recommends \ build-essential \ criu \ - gcc-aarch64-linux-gnu libc-dev-arm64-cross \ - gcc-arm-linux-gnueabi libc-dev-armel-cross \ - gcc-arm-linux-gnueabihf libc-dev-armhf-cross \ - gcc-powerpc64le-linux-gnu libc-dev-ppc64el-cross \ - 
gcc-s390x-linux-gnu libc-dev-s390x-cross \ - gcc-riscv64-linux-gnu libc-dev-riscv64-cross \ + gcc \ + gcc-multilib \ curl \ gawk \ - gcc \ gperf \ iptables \ jq \ @@ -32,6 +28,14 @@ RUN KEYFILE=/usr/share/keyrings/criu-repo-keyring.gpg; \ sudo \ uidmap \ iproute2 \ + && apt-get install -y --no-install-recommends \ + libc-dev:i386 libgcc-s1:i386 \ + gcc-aarch64-linux-gnu libc-dev-arm64-cross \ + gcc-arm-linux-gnueabi libc-dev-armel-cross \ + gcc-arm-linux-gnueabihf libc-dev-armhf-cross \ + gcc-powerpc64le-linux-gnu libc-dev-ppc64el-cross \ + gcc-s390x-linux-gnu libc-dev-s390x-cross \ + gcc-riscv64-linux-gnu libc-dev-riscv64-cross \ && apt-get clean \ && rm -rf /var/cache/apt /var/lib/apt/lists/* /etc/apt/sources.list.d/*.list @@ -54,7 +58,7 @@ RUN cd /tmp \ ARG LIBSECCOMP_VERSION COPY script/seccomp.sh script/lib.sh /tmp/script/ RUN mkdir -p /opt/libseccomp \ - && /tmp/script/seccomp.sh "$LIBSECCOMP_VERSION" /opt/libseccomp arm64 armel armhf ppc64le riscv64 s390x + && /tmp/script/seccomp.sh "$LIBSECCOMP_VERSION" /opt/libseccomp 386 amd64 arm64 armel armhf ppc64le riscv64 s390x ENV LIBSECCOMP_VERSION=$LIBSECCOMP_VERSION ENV LD_LIBRARY_PATH=/opt/libseccomp/lib ENV PKG_CONFIG_PATH=/opt/libseccomp/lib/pkgconfig diff --git a/Makefile b/Makefile index 0d48fe8c521..d3c1c11cb86 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,11 @@ +SHELL = /bin/bash + CONTAINER_ENGINE := docker GO ?= go +# Get CC values for cross-compilation. +include cc_platform.mk + PREFIX ?= /usr/local BINDIR := $(PREFIX)/sbin MANDIR := $(PREFIX)/share/man @@ -10,6 +15,7 @@ GIT_BRANCH_CLEAN := $(shell echo $(GIT_BRANCH) | sed -e "s/[^[:alnum:]]/-/g") RUNC_IMAGE := runc_dev$(if $(GIT_BRANCH_CLEAN),:$(GIT_BRANCH_CLEAN)) PROJECT := github.com/opencontainers/runc BUILDTAGS ?= seccomp urfave_cli_no_docs +BUILDTAGS += $(EXTRA_BUILDTAGS) COMMIT ?= $(shell git describe --dirty --long --always) VERSION := $(shell cat ./VERSION) @@ -57,18 +63,25 @@ endif .DEFAULT: runc -runc: +runc: runc-dmz $(GO_BUILD) -o runc . 
+ $(MAKE) verify-dmz-arch -all: runc recvtty sd-helper seccompagent fs-idmap +all: runc recvtty sd-helper seccompagent fs-idmap memfd-bind -recvtty sd-helper seccompagent fs-idmap: +recvtty sd-helper seccompagent fs-idmap memfd-bind: $(GO_BUILD) -o contrib/cmd/$@/$@ ./contrib/cmd/$@ -static: +static: runc-dmz $(GO_BUILD_STATIC) -o runc . + $(MAKE) verify-dmz-arch + +.PHONY: runc-dmz +runc-dmz: + rm -f libcontainer/dmz/runc-dmz + $(GO) generate -tags "$(BUILDTAGS)" ./libcontainer/dmz -releaseall: RELEASE_ARGS := "-a arm64 -a armel -a armhf -a ppc64le -a riscv64 -a s390x" +releaseall: RELEASE_ARGS := "-a 386 -a amd64 -a arm64 -a armel -a armhf -a ppc64le -a riscv64 -a s390x" releaseall: release release: runcimage @@ -147,12 +160,13 @@ install-man: man install -D -m 644 man/man8/*.8 $(DESTDIR)$(MANDIR)/man8 clean: - rm -f runc runc-* + rm -f runc runc-* libcontainer/dmz/runc-dmz + rm -f contrib/cmd/fs-idmap/fs-idmap rm -f contrib/cmd/recvtty/recvtty rm -f contrib/cmd/sd-helper/sd-helper rm -f contrib/cmd/seccompagent/seccompagent - rm -f contrib/cmd/fs-idmap/fs-idmap - rm -rf release + rm -f contrib/cmd/memfd-bind/memfd-bind + sudo rm -rf release rm -rf man/man8 cfmt: C_SRC=$(shell git ls-files '*.c' | grep -v '^vendor/') @@ -188,6 +202,18 @@ verify-dependencies: vendor @test -z "$$(git status --porcelain -- go.mod go.sum vendor/)" \ || (echo -e "git status:\n $$(git status -- go.mod go.sum vendor/)\nerror: vendor/, go.mod and/or go.sum not up to date. Run \"make vendor\" to update"; exit 1) \ && echo "all vendor files are up to date."
+verify-dmz-arch: + @test -s libcontainer/dmz/runc-dmz || exit 0; \ + set -Eeuo pipefail; \ + export LC_ALL=C; \ + echo "readelf -h runc"; \ + readelf -h runc | grep -E "(Machine|Flags):"; \ + echo "readelf -h libcontainer/dmz/runc-dmz"; \ + readelf -h libcontainer/dmz/runc-dmz | grep -E "(Machine|Flags):"; \ + diff -u \ + <(readelf -h runc | grep -E "(Machine|Flags):") \ + <(readelf -h libcontainer/dmz/runc-dmz | grep -E "(Machine|Flags):") \ + && echo "runc-dmz architecture matches runc binary." validate-keyring: script/keyring_validate.sh @@ -197,4 +223,4 @@ validate-keyring: test localtest unittest localunittest integration localintegration \ rootlessintegration localrootlessintegration shell install install-bash \ install-man clean cfmt shfmt localshfmt shellcheck \ - vendor verify-changelog verify-dependencies validate-keyring + vendor verify-changelog verify-dependencies verify-dmz-arch validate-keyring diff --git a/README.md b/README.md index b209c7dcd55..827f837e06f 100644 --- a/README.md +++ b/README.md @@ -65,15 +65,18 @@ e.g. to disable seccomp: make BUILDTAGS="" ``` -| Build Tag | Feature | Enabled by default | Dependency | -|-----------|------------------------------------|--------------------|------------| -| seccomp | Syscall filtering | yes | libseccomp | +| Build Tag | Feature | Enabled by Default | Dependencies | +|---------------|---------------------------------------|--------------------|---------------------| +| `seccomp` | Syscall filtering using `libseccomp`. | yes | `libseccomp` | +| `!runc_nodmz` | Reduce memory usage for CVE-2019-5736 protection by using a small C binary, [see `memfd-bind` for more details][contrib-memfd-bind]. `runc_nodmz` disables this feature and causes runc to use a different protection mechanism which will further increase memory usage temporarily during container startup. This feature can also be disabled at runtime by setting the `RUNC_DMZ=legacy` environment variable.
| yes || The following build tags were used earlier, but are now obsoleted: - **nokmem** (since runc v1.0.0-rc94 kernel memory settings are ignored) - **apparmor** (since runc v1.0.0-rc93 the feature is always enabled) - **selinux** (since runc v1.0.0-rc93 the feature is always enabled) + [contrib-memfd-bind]: /contrib/cmd/memfd-bind/README.md + ### Running the test suite `runc` currently supports running its test suite via Docker. diff --git a/cc_platform.mk b/cc_platform.mk new file mode 100644 index 00000000000..6aa2b5ecb8b --- /dev/null +++ b/cc_platform.mk @@ -0,0 +1,61 @@ +# NOTE: Make sure you keep this file in sync with script/lib.sh. + +GO ?= go +GOARCH ?= $(shell $(GO) env GOARCH) + +ifneq ($(shell grep -i "ID_LIKE=.*suse" /etc/os-release),) + # openSUSE has a custom PLATFORM + PLATFORM ?= suse-linux + IS_SUSE := 1 +else + PLATFORM ?= linux-gnu +endif + +ifeq ($(GOARCH),$(shell GOARCH= $(GO) env GOARCH)) + # use the native CC and STRIP + HOST := +else ifeq ($(GOARCH),386) + # Always use the 64-bit compiler to build the 386 binary, which works for + # the more common cross-build method for x86 (namely, the equivalent of + # dpkg --add-architecture). + ifdef IS_SUSE + # There is no x86_64-suse-linux-gcc, so use the native one. + HOST := + CPU_TYPE := i586 + else + HOST := x86_64-$(PLATFORM)- + CPU_TYPE := i686 + endif + CFLAGS := -m32 -march=$(CPU_TYPE) $(CFLAGS) +else ifeq ($(GOARCH),amd64) + ifdef IS_SUSE + # There is no x86_64-suse-linux-gcc, so use the native one. + HOST := + else + HOST := x86_64-$(PLATFORM)- + endif +else ifeq ($(GOARCH),arm64) + HOST := aarch64-$(PLATFORM)- +else ifeq ($(GOARCH),arm) + # HOST already configured by release_build.sh in this case.
+else ifeq ($(GOARCH),armel) + HOST := arm-$(PLATFORM)eabi- +else ifeq ($(GOARCH),armhf) + HOST := arm-$(PLATFORM)eabihf- +else ifeq ($(GOARCH),ppc64le) + HOST := powerpc64le-$(PLATFORM)- +else ifeq ($(GOARCH),riscv64) + HOST := riscv64-$(PLATFORM)- +else ifeq ($(GOARCH),s390x) + HOST := s390x-$(PLATFORM)- +else +$(error Unsupported GOARCH $(GOARCH)) +endif + +ifeq ($(origin CC),$(filter $(origin CC),undefined default)) + # Override CC if it's undefined or just the default value set by Make. + CC := $(HOST)gcc + export CC +endif +STRIP ?= $(HOST)strip +export STRIP diff --git a/contrib/cmd/memfd-bind/README.md b/contrib/cmd/memfd-bind/README.md new file mode 100644 index 00000000000..f2ceae2fa78 --- /dev/null +++ b/contrib/cmd/memfd-bind/README.md @@ -0,0 +1,67 @@ +## memfd-bind ## + +`runc` normally has to make a binary copy of itself (or of a smaller helper +binary called `runc-dmz`) when constructing a container process in order to +defend against certain container runtime attacks such as CVE-2019-5736. + +This cloned binary only exists until the container process starts (this means +for `runc run` and `runc exec`, it only exists for a few hundred milliseconds +-- for `runc create` it exists until `runc start` is called). However, because +the clone is done using a memfd (or by creating files in directories that are +likely to be a `tmpfs`), this can lead to temporary increases in *host* memory +usage. Unless you are running on a cgroupv1 system with the cgroupv1 memory +controller enabled and the (deprecated) `memory.move_charge_at_immigrate` +enabled, there is no effect on the container's memory. + +However, for certain configurations this can still be undesirable. This daemon +allows you to create a sealed memfd copy of the `runc` binary, which will cause +`runc` to skip all binary copying, resulting in no additional memory usage for +each container process (instead there is a single in-memory copy of the +binary). 
It should be noted that (strictly speaking) this is slightly less +secure if you are concerned about Dirty Cow-like 0-day kernel vulnerabilities, +but for most users the security benefit is identical. + +The provided `memfd-bind@.service` file can be used to get systemd to manage +this daemon. You can supply the path like so: + +``` +% systemctl start memfd-bind@/usr/bin/runc +``` + +Thus, there are three ways of protecting against CVE-2019-5736, in order of how +much memory they can use: + +* `memfd-bind` only creates a single in-memory copy of the `runc` binary (about + 10MB), regardless of how many containers are running. + +* `runc-dmz` is (depending on which libc it was compiled with) between 10kB and + 1MB in size, and a copy is created once per process spawned inside a + container by runc (both the pid1 and every `runc exec`). There are + circumstances where using `runc-dmz` will fail in ways that runc cannot + predict ahead of time (such as restrictive LSMs applied to containers), in + which case users can disable it with the `RUNC_DMZ=legacy` setting. + `runc-dmz` also requires an additional `execve` over the other options, + though since the binary is so small the cost is probably not even noticeable. + +* The classic method of making a copy of the entire `runc` binary during + container process setup takes up about 10MB per process spawned inside the + container by runc (both pid1 and `runc exec`). + +### Caveats ### + +There are several downsides with using `memfd-bind` on the `runc` binary: + +* The `memfd-bind` process needs to continue to run indefinitely in order for + the memfd reference to stay alive. If the process is forcefully killed, the + bind-mount on top of the `runc` binary will become stale and nobody will be + able to execute it (you can use `memfd-bind --cleanup` to clean up the stale + mount). + +* Only root can execute the cloned binary due to permission restrictions on + accessing other processes' files.
More specifically, only users with ptrace + privileges over the memfd-bind daemon can access the file (but in practice + this is usually only root). + +* When updating `runc`, the daemon needs to be stopped before the update (so + the package manager can access the underlying file) and then restarted after + the update. diff --git a/contrib/cmd/memfd-bind/memfd-bind.go b/contrib/cmd/memfd-bind/memfd-bind.go new file mode 100644 index 00000000000..e73739f0c4d --- /dev/null +++ b/contrib/cmd/memfd-bind/memfd-bind.go @@ -0,0 +1,240 @@ +/* + * Copyright (c) 2023 SUSE LLC + * Copyright (c) 2023 Aleksa Sarai + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package main + +import ( + "errors" + "fmt" + "io" + "os" + "os/signal" + "runtime" + "strings" + "time" + + "github.com/opencontainers/runc/libcontainer/dmz" + + "github.com/sirupsen/logrus" + "github.com/urfave/cli" + "golang.org/x/sys/unix" +) + +// version will be populated by the Makefile, read from +// VERSION file of the source code. +var version = "" + +// gitCommit will be the hash that the binary was built from +// and will be populated by the Makefile. +var gitCommit = "" + +const ( + usage = `Open Container Initiative contrib/cmd/memfd-bind + +In order to protect against certain container attacks, every runc invocation +that involves creating or joining a container will cause runc to make a copy of +the runc binary in memory (usually to a memfd). 
While "runc init" is very +short-lived, this extra memory usage can cause problems for containers with +very small memory limits (or containers that have many "runc exec" invocations +applied to them at the same time). + +memfd-bind is a tool to create a persistent memfd-sealed-copy of the runc binary, +which will cause runc to not make its own copy. This means you can get the +benefits of using a sealed memfd as runc's binary (even in a container breakout +attack to get write access to the runc binary, neither the underlying binary +nor the memfd copy can be changed). + +To use memfd-bind, just specify the path of the runc binary you want to +protect with a sealed memfd copy: + + $ sudo memfd-bind /usr/bin/runc + +Note that (due to kernel restrictions on bind-mounts), this program must remain +running on the host in order for the binary to be readable (it is recommended +you use a systemd unit to keep this process around). + +If this program dies, there will be a leftover mountpoint that always returns +-EINVAL when attempting to access it. You need to use memfd-bind --cleanup on the +path in order to unmount the path (regular umount(8) will not work): + + $ sudo memfd-bind --cleanup /usr/bin/runc + +Note that (due to restrictions on /proc/$pid/fd/$fd magic-link resolution), +only privileged users (specifically, those that have ptrace privileges over the +memfd-bind daemon) can access the memfd bind-mount. This means that using this +tool to harden your /usr/bin/runc binary would result in unprivileged users +being unable to execute the binary. If this is an issue, you could make all +unprivileged processes use a different copy of runc (by making a copy somewhere +like /usr/sbin/runc) and only use memfd-bind for the version used by +privileged users.
+` +) + +func cleanup(path string) error { + file, err := os.OpenFile(path, unix.O_PATH|unix.O_NOFOLLOW|unix.O_CLOEXEC, 0) + if err != nil { + return fmt.Errorf("cleanup: failed to open runc binary path: %w", err) + } + defer file.Close() + fdPath := fmt.Sprintf("/proc/self/fd/%d", file.Fd()) + + // Keep umounting until we hit a umount error. + for unix.Unmount(fdPath, unix.MNT_DETACH) == nil { + // loop... + logrus.Debugf("memfd-bind: path %q unmount succeeded...", path) + } + logrus.Infof("memfd-bind: path %q has been cleared of all old bind-mounts", path) + return nil +} + +// memfdClone is a memfd-only implementation of dmz.CloneBinary. +func memfdClone(path string) (*os.File, error) { + binFile, err := os.Open(path) + if err != nil { + return nil, fmt.Errorf("failed to open runc binary path: %w", err) + } + defer binFile.Close() + stat, err := binFile.Stat() + if err != nil { + return nil, fmt.Errorf("checking %s size: %w", path, err) + } + size := stat.Size() + memfd, sealFn, err := dmz.Memfd("/proc/self/exe") + if err != nil { + return nil, fmt.Errorf("creating memfd failed: %w", err) + } + copied, err := io.Copy(memfd, binFile) + if err != nil { + return nil, fmt.Errorf("copy binary: %w", err) + } else if copied != size { + return nil, fmt.Errorf("copied binary size mismatch: %d != %d", copied, size) + } + if err := sealFn(&memfd); err != nil { + return nil, fmt.Errorf("could not seal fd: %w", err) + } + if !dmz.IsCloned(memfd) { + return nil, fmt.Errorf("cloned memfd is not properly sealed") + } + return memfd, nil +} + +func mount(path string) error { + memfdFile, err := memfdClone(path) + if err != nil { + return fmt.Errorf("memfd clone: %w", err) + } + defer memfdFile.Close() + memfdPath := fmt.Sprintf("/proc/self/fd/%d", memfdFile.Fd()) + + // We have to open an O_NOFOLLOW|O_PATH to the memfd magic-link because we + // cannot bind-mount the memfd itself (it's in the internal kernel mount + // namespace and cross-mount-namespace bind-mounts are not 
allowed). This + // also requires that this program stay alive continuously for the + // magic-link to stay alive... + memfdLink, err := os.OpenFile(memfdPath, unix.O_PATH|unix.O_NOFOLLOW|unix.O_CLOEXEC, 0) + if err != nil { + return fmt.Errorf("mount: failed to open /proc/self/fd magic-link for memfd: %w", err) + } + defer memfdLink.Close() + memfdLinkFdPath := fmt.Sprintf("/proc/self/fd/%d", memfdLink.Fd()) + + exeFile, err := os.OpenFile(path, unix.O_PATH|unix.O_NOFOLLOW|unix.O_CLOEXEC, 0) + if err != nil { + return fmt.Errorf("mount: failed to open target runc binary path: %w", err) + } + defer exeFile.Close() + exeFdPath := fmt.Sprintf("/proc/self/fd/%d", exeFile.Fd()) + + err = unix.Mount(memfdLinkFdPath, exeFdPath, "", unix.MS_BIND, "") + if err != nil { + return fmt.Errorf("mount: failed to mount memfd on top of runc binary path target: %w", err) + } + + // If there is a signal we want to do cleanup. + sigCh := make(chan os.Signal, 1) + signal.Notify(sigCh, unix.SIGTERM, unix.SIGINT) + go func() { + <-sigCh + logrus.Infof("memfd-bind: exit signal caught! cleaning up the bind-mount on %q...", path) + _ = cleanup(path) + os.Exit(0) + }() + + // Clean up things we don't need... + _ = exeFile.Close() + _ = memfdLink.Close() + + // We now have to stay alive to keep the magic-link alive... + logrus.Infof("memfd-bind: bind-mount of memfd over %q created -- looping forever!", path) + for { + // loop forever... + time.Sleep(time.Duration(1<<63 - 1)) + // make sure the memfd isn't gc'd + runtime.KeepAlive(memfdFile) + } +} + +func main() { + app := cli.NewApp() + app.Name = "memfd-bind" + app.Usage = usage + + // Set version to be the same as runC. + var v []string + if version != "" { + v = append(v, version) + } + if gitCommit != "" { + v = append(v, "commit: "+gitCommit) + } + app.Version = strings.Join(v, "\n") + + // Set the flags.
+ app.Flags = []cli.Flag{ + cli.BoolFlag{ + Name: "cleanup", + Usage: "Do not create a new memfd-sealed file, only clean up an existing one at the given path.", + }, + cli.BoolFlag{ + Name: "debug", + Usage: "Enable debug logging.", + }, + } + + app.Action = func(ctx *cli.Context) error { + args := ctx.Args() + if len(args) != 1 { + return errors.New("need to specify a single path to the runc binary") + } + path := args[0] + + if ctx.Bool("debug") { + logrus.SetLevel(logrus.DebugLevel) + } + + err := cleanup(path) + // We only care about cleanup errors when doing --cleanup. + if ctx.Bool("cleanup") { + return err + } + return mount(path) + } + if err := app.Run(os.Args); err != nil { + fmt.Fprintf(os.Stderr, "memfd-bind: %v\n", err) + os.Exit(1) + } +} diff --git a/contrib/cmd/memfd-bind/memfd-bind@.service b/contrib/cmd/memfd-bind/memfd-bind@.service new file mode 100644 index 00000000000..591548ea4d9 --- /dev/null +++ b/contrib/cmd/memfd-bind/memfd-bind@.service @@ -0,0 +1,11 @@ +[Unit] +Description=Manage memfd-bind of %I +Documentation=https://github.com/opencontainers/runc + +[Service] +Type=simple +ExecStart=memfd-bind "%I" +ExecStop=memfd-bind --cleanup "%I" + +[Install] +WantedBy=multi-user.target diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go index c941239b841..ae5d4fb46b4 100644 --- a/libcontainer/container_linux.go +++ b/libcontainer/container_linux.go @@ -24,8 +24,10 @@ import ( "github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/dmz" "github.com/opencontainers/runc/libcontainer/intelrdt" "github.com/opencontainers/runc/libcontainer/system" + "github.com/opencontainers/runc/libcontainer/system/kernelversion" "github.com/opencontainers/runc/libcontainer/utils" ) @@ -316,6 +318,8 @@ func (c *Container) start(process *Process) (retErr error) { if err != nil { return fmt.Errorf("unable to create new parent process: %w", err) } + //
We do not need the cloned binaries once the process is spawned. + defer process.closeClonedExes() logsDone := parent.forwardChildLogs() if logsDone != nil { @@ -441,6 +445,48 @@ func (c *Container) includeExecFifo(cmd *exec.Cmd) error { return nil } +// No longer needed in Go 1.21. +func slicesContains[S ~[]E, E comparable](slice S, needle E) bool { + for _, val := range slice { + if val == needle { + return true + } + } + return false +} + +func isDmzBinarySafe(c *configs.Config) bool { + // Because we set the dumpable flag in nsexec, the only time when it is + // unsafe to use runc-dmz is when the container process would be able to + // race against "runc init" and bypass the ptrace_may_access() checks. + // + // This is only the case if the container processes could have + // CAP_SYS_PTRACE somehow (i.e. the capability is present in the bounding, + // inheritable, or ambient sets). Luckily, most containers do not have this + // capability. + if c.Capabilities == nil || + (!slicesContains(c.Capabilities.Bounding, "CAP_SYS_PTRACE") && + !slicesContains(c.Capabilities.Inheritable, "CAP_SYS_PTRACE") && + !slicesContains(c.Capabilities.Ambient, "CAP_SYS_PTRACE")) { + return true + } + + // Since Linux 4.10 (see bfedb589252c0) user namespaced containers cannot + // access /proc/$pid/exe of runc after it joins the namespace (until it + // does an exec), regardless of the capability set. This has been + // backported to other distribution kernels, but there's no way of checking + // this cheaply -- better to be safe than sorry here. + linux410 := kernelversion.KernelVersion{Kernel: 4, Major: 10} + if ok, err := kernelversion.GreaterEqualThan(linux410); ok && err == nil { + if c.Namespaces.Contains(configs.NEWUSER) { + return true + } + } + + // Assume it's unsafe otherwise. 
+ return false +} + func (c *Container) newParentProcess(p *Process) (parentProcess, error) { parentInitPipe, childInitPipe, err := utils.NewSockPair("init") if err != nil { @@ -454,24 +500,59 @@ func (c *Container) newParentProcess(p *Process) (parentProcess, error) { } logFilePair := filePair{parentLogPipe, childLogPipe} - cmd := c.commandTemplate(p, childInitPipe, childLogPipe) - if !p.Init { - return c.newSetnsProcess(p, cmd, messageSockPair, logFilePair) - } - - // We only set up fifoFd if we're not doing a `runc exec`. The historic - // reason for this is that previously we would pass a dirfd that allowed - // for container rootfs escape (and not doing it in `runc exec` avoided - // that problem), but we no longer do that. However, there's no need to do - // this for `runc exec` so we just keep it this way to be safe. - if err := c.includeExecFifo(cmd); err != nil { - return nil, fmt.Errorf("unable to setup exec fifo: %w", err) + // Make sure we use a new safe copy of /proc/self/exe or the runc-dmz + // binary each time this is called, to make sure that if a container + // manages to overwrite the file it cannot affect other containers on the + // system. For runc, this code will only ever be called once, but + // libcontainer users might call this more than once. + p.closeClonedExes() + var ( + exePath string + // only one of dmzExe or safeExe are used at a time + dmzExe, safeExe *os.File + ) + if dmz.IsSelfExeCloned() { + // /proc/self/exe is already a cloned binary -- no need to do anything + logrus.Debug("skipping binary cloning -- /proc/self/exe is already cloned!") + exePath = "/proc/self/exe" + } else { + var err error + if isDmzBinarySafe(c.config) { + dmzExe, err = dmz.Binary(c.root) + if err == nil { + // We can use our own executable without cloning if we are using + // runc-dmz. 
+ exePath = "/proc/self/exe" + p.clonedExes = append(p.clonedExes, dmzExe) + logrus.Debug("runc-dmz: using runc-dmz") // used for tests + } else if errors.Is(err, dmz.ErrNoDmzBinary) { + logrus.Debug("runc-dmz binary not embedded in runc binary, falling back to /proc/self/exe clone") + } else if err != nil { + return nil, fmt.Errorf("failed to create runc-dmz binary clone: %w", err) + } + } else { + // If the configuration makes it unsafe to use runc-dmz, pretend we + // don't have it embedded so we do /proc/self/exe cloning. + logrus.Debug("container configuration unsafe for runc-dmz, falling back to /proc/self/exe clone") + err = dmz.ErrNoDmzBinary + } + if errors.Is(err, dmz.ErrNoDmzBinary) { + safeExe, err = dmz.CloneSelfExe(c.root) + if err != nil { + return nil, fmt.Errorf("unable to create safe /proc/self/exe clone for runc init: %w", err) + } + exePath = "/proc/self/fd/" + strconv.Itoa(int(safeExe.Fd())) + p.clonedExes = append(p.clonedExes, safeExe) + logrus.Debug("runc-dmz: using /proc/self/exe clone") // used for tests + } + // Just to make sure we don't run without protection. + if dmzExe == nil && safeExe == nil { + // This should never happen. 
+ return nil, fmt.Errorf("[internal error] attempted to spawn a container with no /proc/self/exe protection") + } } - return c.newInitProcess(p, cmd, messageSockPair, logFilePair) -} -func (c *Container) commandTemplate(p *Process, childInitPipe *os.File, childLogPipe *os.File) *exec.Cmd { - cmd := exec.Command("/proc/self/exe", "init") + cmd := exec.Command(exePath, "init") cmd.Args[0] = os.Args[0] cmd.Stdin = p.Stdin cmd.Stdout = p.Stdout @@ -494,6 +575,12 @@ func (c *Container) commandTemplate(p *Process, childInitPipe *os.File, childLog "_LIBCONTAINER_STATEDIR="+c.root, ) + if dmzExe != nil { + cmd.ExtraFiles = append(cmd.ExtraFiles, dmzExe) + cmd.Env = append(cmd.Env, + "_LIBCONTAINER_DMZEXEFD="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1)) + } + cmd.ExtraFiles = append(cmd.ExtraFiles, childLogPipe) cmd.Env = append(cmd.Env, "_LIBCONTAINER_LOGPIPE="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1)) @@ -501,13 +588,38 @@ func (c *Container) commandTemplate(p *Process, childInitPipe *os.File, childLog cmd.Env = append(cmd.Env, "_LIBCONTAINER_LOGLEVEL="+p.LogLevel) } - // NOTE: when running a container with no PID namespace and the parent process spawning the container is - // PID1 the pdeathsig is being delivered to the container's init process by the kernel for some reason - // even with the parent still running. + if safeExe != nil { + // Due to a Go stdlib bug, we need to add safeExe to the set of + // ExtraFiles otherwise it is possible for the stdlib to clobber the fd + // during forkAndExecInChild1 and replace it with some other file that + // might be malicious. This is less than ideal (because the descriptor + // will be non-O_CLOEXEC) however we have protections in "runc init" to + // stop us from leaking extra file descriptors. + // + // See . 
+ cmd.ExtraFiles = append(cmd.ExtraFiles, safeExe) + } + + // NOTE: when running a container with no PID namespace and the parent + // process spawning the container is PID1 the pdeathsig is being + // delivered to the container's init process by the kernel for some + // reason even with the parent still running. if c.config.ParentDeathSignal > 0 { cmd.SysProcAttr.Pdeathsig = unix.Signal(c.config.ParentDeathSignal) } - return cmd + + if p.Init { + // We only set up fifoFd if we're not doing a `runc exec`. The historic + // reason for this is that previously we would pass a dirfd that allowed + // for container rootfs escape (and not doing it in `runc exec` avoided + // that problem), but we no longer do that. However, there's no need to do + // this for `runc exec` so we just keep it this way to be safe. + if err := c.includeExecFifo(cmd); err != nil { + return nil, fmt.Errorf("unable to setup exec fifo: %w", err) + } + return c.newInitProcess(p, cmd, messageSockPair, logFilePair) + } + return c.newSetnsProcess(p, cmd, messageSockPair, logFilePair) } // shouldSendMountSources says whether the child process must setup bind mounts with diff --git a/libcontainer/dmz/.gitignore b/libcontainer/dmz/.gitignore new file mode 100644 index 00000000000..f163ef41c1f --- /dev/null +++ b/libcontainer/dmz/.gitignore @@ -0,0 +1 @@ +/runc-dmz diff --git a/libcontainer/dmz/Makefile b/libcontainer/dmz/Makefile new file mode 100644 index 00000000000..24e92db716b --- /dev/null +++ b/libcontainer/dmz/Makefile @@ -0,0 +1,6 @@ +# Get CC values for cross-compilation. 
+include ../../cc_platform.mk + +runc-dmz: _dmz.c + $(CC) $(CFLAGS) -static -o $@ $^ + $(STRIP) -gs $@ diff --git a/libcontainer/dmz/_dmz.c b/libcontainer/dmz/_dmz.c new file mode 100644 index 00000000000..6e91b0f90a9 --- /dev/null +++ b/libcontainer/dmz/_dmz.c @@ -0,0 +1,10 @@ +#include <unistd.h> + +extern char **environ; + +int main(int argc, char **argv) +{ + if (argc < 1) + return 127; + return execve(argv[0], argv, environ); +} diff --git a/libcontainer/dmz/cloned_binary_linux.go b/libcontainer/dmz/cloned_binary_linux.go new file mode 100644 index 00000000000..db5e18a3260 --- /dev/null +++ b/libcontainer/dmz/cloned_binary_linux.go @@ -0,0 +1,245 @@ +package dmz + +import ( + "errors" + "fmt" + "io" + "os" + "strconv" + + "github.com/sirupsen/logrus" + "golang.org/x/sys/unix" + + "github.com/opencontainers/runc/libcontainer/system" +) + +type SealFunc func(**os.File) error + +var ( + _ SealFunc = sealMemfd + _ SealFunc = sealFile +) + +func isExecutable(f *os.File) bool { + if err := unix.Faccessat(int(f.Fd()), "", unix.X_OK, unix.AT_EACCESS|unix.AT_EMPTY_PATH); err == nil { + return true + } else if err == unix.EACCES { + return false + } + path := "/proc/self/fd/" + strconv.Itoa(int(f.Fd())) + if err := unix.Access(path, unix.X_OK); err == nil { + return true + } else if err == unix.EACCES { + return false + } + // Cannot check -- assume it's executable (if not, exec will fail). + logrus.Debugf("cannot do X_OK check on binary %s -- assuming it's executable", f.Name()) + return true +} + +const baseMemfdSeals = unix.F_SEAL_SEAL | unix.F_SEAL_SHRINK | unix.F_SEAL_GROW | unix.F_SEAL_WRITE + +func sealMemfd(f **os.File) error { + if err := (*f).Chmod(0o511); err != nil { + return err + } + // Try to set the newer memfd sealing flags, but we ignore + // errors because they are not needed and we want to continue + // to work on older kernels.
+ fd := (*f).Fd() + // F_SEAL_FUTURE_WRITE -- Linux 5.1 + _, _ = unix.FcntlInt(fd, unix.F_ADD_SEALS, unix.F_SEAL_FUTURE_WRITE) + // F_SEAL_EXEC -- Linux 6.3 + const F_SEAL_EXEC = 0x20 //nolint:revive // this matches the unix.* name + _, _ = unix.FcntlInt(fd, unix.F_ADD_SEALS, F_SEAL_EXEC) + // Apply all original memfd seals. + _, err := unix.FcntlInt(fd, unix.F_ADD_SEALS, baseMemfdSeals) + return os.NewSyscallError("fcntl(F_ADD_SEALS)", err) +} + +// Memfd creates a sealable executable memfd (supported since Linux 3.17). +func Memfd(comment string) (*os.File, SealFunc, error) { + file, err := system.ExecutableMemfd("runc_cloned:"+comment, unix.MFD_ALLOW_SEALING|unix.MFD_CLOEXEC) + return file, sealMemfd, err +} + +func sealFile(f **os.File) error { + if err := (*f).Chmod(0o511); err != nil { + return err + } + // When sealing an O_TMPFILE-style descriptor we need to + // re-open the path as O_PATH to clear the existing write + // handle we have. + opath, err := os.OpenFile(fmt.Sprintf("/proc/self/fd/%d", (*f).Fd()), unix.O_PATH|unix.O_CLOEXEC, 0) + if err != nil { + return fmt.Errorf("reopen tmpfile: %w", err) + } + _ = (*f).Close() + *f = opath + return nil +} + +// otmpfile creates an open(O_TMPFILE) file in the given directory (supported +// since Linux 3.11). +func otmpfile(dir string) (*os.File, SealFunc, error) { + file, err := os.OpenFile(dir, unix.O_TMPFILE|unix.O_RDWR|unix.O_EXCL|unix.O_CLOEXEC, 0o700) + if err != nil { + return nil, nil, fmt.Errorf("O_TMPFILE creation failed: %w", err) + } + // Make sure we actually got an unlinked O_TMPFILE descriptor. + var stat unix.Stat_t + if err := unix.Fstat(int(file.Fd()), &stat); err != nil { + file.Close() + return nil, nil, fmt.Errorf("cannot fstat O_TMPFILE fd: %w", err) + } else if stat.Nlink != 0 { + file.Close() + return nil, nil, errors.New("O_TMPFILE has non-zero nlink") + } + return file, sealFile, err +} + +// mktemp creates a classic unlinked file in the given directory. On failure +// the descriptor is closed before the error is returned.
+func mktemp(dir string) (*os.File, SealFunc, error) { + file, err := os.CreateTemp(dir, "runc.") + if err != nil { + return nil, nil, err + } + // Unlink the file and verify it was unlinked. + if err := os.Remove(file.Name()); err != nil { + file.Close() + return nil, nil, fmt.Errorf("unlinking classic tmpfile: %w", err) + } + var stat unix.Stat_t + if err := unix.Fstat(int(file.Fd()), &stat); err != nil { + file.Close() + return nil, nil, fmt.Errorf("cannot fstat classic tmpfile: %w", err) + } else if stat.Nlink != 0 { + file.Close() + return nil, nil, fmt.Errorf("classic tmpfile %s has non-zero nlink after unlink", file.Name()) + } + return file, sealFile, err +} + +func getSealableFile(comment, tmpDir string) (file *os.File, sealFn SealFunc, err error) { + // First, try an executable memfd (supported since Linux 3.17). + file, sealFn, err = Memfd(comment) + if err == nil { + return + } + logrus.Debugf("memfd cloned binary failed, falling back to O_TMPFILE: %v", err) + + // The tmpDir here (c.root) might be mounted noexec, so we need a couple of + // fallbacks to try. It's possible that none of these are writable and + // executable, in which case there's nothing we can practically do (other + // than mounting our own executable tmpfs, which would have its own + // issues). + tmpDirs := []string{ + tmpDir, + os.TempDir(), + "/tmp", + ".", + "/bin", + "/", + } + + // Try to fallback to O_TMPFILE (supported since Linux 3.11). + for _, dir := range tmpDirs { + file, sealFn, err = otmpfile(dir) + if err != nil { + continue + } + if !isExecutable(file) { + logrus.Debugf("tmpdir %s is noexec -- trying a different tmpdir", dir) + file.Close() + continue + } + return + } + logrus.Debugf("O_TMPFILE cloned binary failed, falling back to mktemp(): %v", err) + // Finally, try a classic unlinked temporary file.
+ for _, dir := range tmpDirs { + file, sealFn, err = mktemp(dir) + if err != nil { + continue + } + if !isExecutable(file) { + logrus.Debugf("tmpdir %s is noexec -- trying a different tmpdir", dir) + file.Close() + continue + } + return + } + return nil, nil, fmt.Errorf("could not create sealable file for cloned binary: %w", err) +} + +// CloneBinary creates a "sealed" clone of a given binary, which can be used to +// thwart attempts by the container process to gain access to host binaries +// through procfs magic-link shenanigans. For more details on why this is +// necessary, see CVE-2019-5736. +func CloneBinary(src io.Reader, size int64, name, tmpDir string) (*os.File, error) { + logrus.Debugf("cloning %s binary (%d bytes)", name, size) + file, sealFn, err := getSealableFile(name, tmpDir) + if err != nil { + return nil, err + } + copied, err := system.Copy(file, src) + if err != nil { + file.Close() + return nil, fmt.Errorf("copy binary: %w", err) + } else if copied != size { + file.Close() + return nil, fmt.Errorf("copied binary size mismatch: %d != %d", copied, size) + } + if err := sealFn(&file); err != nil { + file.Close() + return nil, fmt.Errorf("could not seal fd: %w", err) + } + return file, nil +} + +// IsCloned returns whether the given file can be guaranteed to be a safe exe. +func IsCloned(exe *os.File) bool { + seals, err := unix.FcntlInt(exe.Fd(), unix.F_GET_SEALS, 0) + if err != nil { + // /proc/self/exe is probably not a memfd + logrus.Debugf("F_GET_SEALS on %s failed: %v", exe.Name(), err) + return false + } + // The memfd must have all of the base seals applied. + logrus.Debugf("checking %s memfd seals: 0x%x", exe.Name(), seals) + return seals&baseMemfdSeals == baseMemfdSeals +} + +// CloneSelfExe makes a clone of the current process's binary (through +// /proc/self/exe). This binary can then be used for "runc init" in order to +// make sure the container process can never resolve the original runc binary. 
+// For more details on why this is necessary, see CVE-2019-5736. +func CloneSelfExe(tmpDir string) (*os.File, error) { + selfExe, err := os.Open("/proc/self/exe") + if err != nil { + return nil, fmt.Errorf("opening current binary: %w", err) + } + defer selfExe.Close() + + stat, err := selfExe.Stat() + if err != nil { + return nil, fmt.Errorf("checking /proc/self/exe size: %w", err) + } + size := stat.Size() + + return CloneBinary(selfExe, size, "/proc/self/exe", tmpDir) +} + +// IsSelfExeCloned returns whether /proc/self/exe is a cloned binary that can +// be guaranteed to be safe. This means that it must be a sealed memfd. Other +// types of clones cannot be completely verified as safe. +func IsSelfExeCloned() bool { + selfExe, err := os.Open("/proc/self/exe") + if err != nil { + logrus.Debugf("open /proc/self/exe failed: %v", err) + return false + } + defer selfExe.Close() + return IsCloned(selfExe) +} diff --git a/libcontainer/dmz/dmz.go b/libcontainer/dmz/dmz.go new file mode 100644 index 00000000000..9b6b500807c --- /dev/null +++ b/libcontainer/dmz/dmz.go @@ -0,0 +1,9 @@ +package dmz + +import ( + "errors" +) + +// ErrNoDmzBinary is returned by Binary when there is no runc-dmz binary +// embedded in the runc program. +var ErrNoDmzBinary = errors.New("runc-dmz binary not embedded in this program") diff --git a/libcontainer/dmz/dmz_fallback_linux.go b/libcontainer/dmz/dmz_fallback_linux.go new file mode 100644 index 00000000000..4f624e048b9 --- /dev/null +++ b/libcontainer/dmz/dmz_fallback_linux.go @@ -0,0 +1 @@ +package dmz diff --git a/libcontainer/dmz/dmz_linux.go b/libcontainer/dmz/dmz_linux.go new file mode 100644 index 00000000000..12f9709a269 --- /dev/null +++ b/libcontainer/dmz/dmz_linux.go @@ -0,0 +1,48 @@ +//go:build !runc_nodmz +// +build !runc_nodmz + +package dmz + +import ( + "bytes" + "debug/elf" + _ "embed" + "os" + + "github.com/sirupsen/logrus" +) + +// Try to build the runc-dmz binary. 
If it fails, replace it with an empty file +// (this will trigger us to fall back to a clone of /proc/self/exe). Yeah, this +// is a bit ugly but it makes sure that weird cross-compilation setups don't +// break because of runc-dmz. +// +//go:generate sh -c "make -B runc-dmz || echo -n >runc-dmz" +//go:embed runc-dmz +var runcDmzBinary []byte + +// Binary returns a cloned copy (see CloneBinary) of a very minimal C program +// that just does an execve() of its arguments. This is used in the final +// execution step of the container execution as an intermediate process before +// the container process is execve'd. This allows for protection against +// CVE-2019-5736 without requiring a complete copy of the runc binary. Each +// call to Binary will return a new copy. +// +// If the runc-dmz binary is not embedded into the runc binary, Binary will +// return ErrNoDmzBinary as the error. +func Binary(tmpDir string) (*os.File, error) { + rdr := bytes.NewBuffer(runcDmzBinary) + // Verify that our embedded binary has a standard ELF header. + if !bytes.HasPrefix(rdr.Bytes(), []byte(elf.ELFMAG)) { + if rdr.Len() != 0 { + logrus.Infof("misconfigured build: embedded runc-dmz binary is non-empty but is missing a proper ELF header") + } + return nil, ErrNoDmzBinary + } + // Setting RUNC_DMZ=legacy disables this dmz method. 
+ if os.Getenv("RUNC_DMZ") == "legacy" { + logrus.Debugf("RUNC_DMZ=legacy set -- switching back to classic /proc/self/exe cloning") + return nil, ErrNoDmzBinary + } + return CloneBinary(rdr, int64(rdr.Len()), "runc-dmz", tmpDir) +} diff --git a/libcontainer/dmz/dmz_unsupported.go b/libcontainer/dmz/dmz_unsupported.go new file mode 100644 index 00000000000..2ba67270495 --- /dev/null +++ b/libcontainer/dmz/dmz_unsupported.go @@ -0,0 +1,12 @@ +//go:build !linux || runc_nodmz +// +build !linux runc_nodmz + +package dmz + +import ( + "os" +) + +func Binary(_ string) (*os.File, error) { + return nil, ErrNoDmzBinary +} diff --git a/libcontainer/init_linux.go b/libcontainer/init_linux.go index 732f64dc660..a24be276878 100644 --- a/libcontainer/init_linux.go +++ b/libcontainer/init_linux.go @@ -182,6 +182,17 @@ func startInitialization() (retErr error) { return err } + // Get runc-dmz fds. + var dmzExe *os.File + if dmzFdStr := os.Getenv("_LIBCONTAINER_DMZEXEFD"); dmzFdStr != "" { + dmzFd, err := strconv.Atoi(dmzFdStr) + if err != nil { + return fmt.Errorf("unable to convert _LIBCONTAINER_DMZEXEFD: %w", err) + } + unix.CloseOnExec(dmzFd) + dmzExe = os.NewFile(uintptr(dmzFd), "runc-dmz") + } + // clear the current process's environment to clean any libcontainer // specific env vars. os.Clearenv() @@ -197,10 +208,10 @@ func startInitialization() (retErr error) { }() // If init succeeds, it will not return, hence none of the defers will be called. 
- return containerInit(it, pipe, consoleSocket, fifofd, logFD, mountFds{sourceFds: mountSrcFds, idmapFds: idmapFds}) + return containerInit(it, pipe, consoleSocket, fifofd, logFD, dmzExe, mountFds{sourceFds: mountSrcFds, idmapFds: idmapFds}) } -func containerInit(t initType, pipe *os.File, consoleSocket *os.File, fifoFd, logFd int, mountFds mountFds) error { +func containerInit(t initType, pipe *os.File, consoleSocket *os.File, fifoFd, logFd int, dmzExe *os.File, mountFds mountFds) error { var config *initConfig if err := json.NewDecoder(pipe).Decode(&config); err != nil { return err @@ -208,6 +219,7 @@ func containerInit(t initType, pipe *os.File, consoleSocket *os.File, fifoFd, lo if err := populateProcessEnvironment(config.Env); err != nil { return err } + switch t { case initSetns: // mount and idmap fds must be nil in this case. We don't mount while doing runc exec. @@ -220,6 +232,7 @@ func containerInit(t initType, pipe *os.File, consoleSocket *os.File, fifoFd, lo consoleSocket: consoleSocket, config: config, logFd: logFd, + dmzExe: dmzExe, } return i.Init() case initStandard: @@ -230,6 +243,7 @@ func containerInit(t initType, pipe *os.File, consoleSocket *os.File, fifoFd, lo config: config, fifoFd: fifoFd, logFd: logFd, + dmzExe: dmzExe, mountFds: mountFds, } return i.Init() diff --git a/libcontainer/nsenter/cloned_binary.c b/libcontainer/nsenter/cloned_binary.c deleted file mode 100644 index a7f992fddd7..00000000000 --- a/libcontainer/nsenter/cloned_binary.c +++ /dev/null @@ -1,567 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 OR LGPL-2.1-or-later -/* - * Copyright (C) 2019 Aleksa Sarai - * Copyright (C) 2019 SUSE LLC - * - * This work is dual licensed under the following licenses. You may use, - * redistribute, and/or modify the work under the conditions of either (or - * both) licenses. - * - * === Apache-2.0 === - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * - * === LGPL-2.1-or-later === - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this library. If not, see - * . - * - */ - -#define _GNU_SOURCE -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "ipc.h" -#include "log.h" - -/* Use our own wrapper for memfd_create. */ -#ifndef SYS_memfd_create -# ifdef __NR_memfd_create -# define SYS_memfd_create __NR_memfd_create -# else -/* These values come from . 
*/ -# warning "libc is outdated -- using hard-coded SYS_memfd_create" -# if defined(__x86_64__) -# define SYS_memfd_create 319 -# elif defined(__i386__) -# define SYS_memfd_create 356 -# elif defined(__ia64__) -# define SYS_memfd_create 1340 -# elif defined(__arm__) -# define SYS_memfd_create 385 -# elif defined(__aarch64__) -# define SYS_memfd_create 279 -# elif defined(__ppc__) || defined(__PPC64__) || defined(__powerpc64__) -# define SYS_memfd_create 360 -# elif defined(__s390__) || defined(__s390x__) -# define SYS_memfd_create 350 -# else -# warning "unknown architecture -- cannot hard-code SYS_memfd_create" -# endif -# endif -#endif - -/* memfd_create(2) flags -- copied from . */ -#ifndef MFD_CLOEXEC -# define MFD_CLOEXEC 0x0001U -# define MFD_ALLOW_SEALING 0x0002U -#endif -#ifndef MFD_EXEC -# define MFD_EXEC 0x0010U -#endif - -int memfd_create(const char *name, unsigned int flags) -{ -#ifdef SYS_memfd_create - return syscall(SYS_memfd_create, name, flags); -#else - errno = ENOSYS; - return -1; -#endif -} - -/* This comes directly from . */ -#ifndef F_LINUX_SPECIFIC_BASE -# define F_LINUX_SPECIFIC_BASE 1024 -#endif -#ifndef F_ADD_SEALS -# define F_ADD_SEALS (F_LINUX_SPECIFIC_BASE + 9) -# define F_GET_SEALS (F_LINUX_SPECIFIC_BASE + 10) -#endif -#ifndef F_SEAL_SEAL -# define F_SEAL_SEAL 0x0001 /* prevent further seals from being set */ -# define F_SEAL_SHRINK 0x0002 /* prevent file from shrinking */ -# define F_SEAL_GROW 0x0004 /* prevent file from growing */ -# define F_SEAL_WRITE 0x0008 /* prevent writes */ -#endif -#ifndef F_SEAL_FUTURE_WRITE -# define F_SEAL_FUTURE_WRITE 0x0010 /* prevent future writes while mapped */ -#endif -#ifndef F_SEAL_EXEC -# define F_SEAL_EXEC 0x0020 /* prevent chmod modifying exec bits */ -#endif - -#define CLONED_BINARY_ENV "_LIBCONTAINER_CLONED_BINARY" -#define RUNC_MEMFD_COMMENT "runc_cloned:/proc/self/exe" -/* - * There are newer memfd seals (such as F_SEAL_FUTURE_WRITE and F_SEAL_EXEC), - * which we use opportunistically. 
However, this set is the original set of - * memfd seals, and we require them all to be set to trust our /proc/self/exe - * if it is a memfd. - */ -#define RUNC_MEMFD_MIN_SEALS \ - (F_SEAL_SEAL | F_SEAL_SHRINK | F_SEAL_GROW | F_SEAL_WRITE) - -static void *must_realloc(void *ptr, size_t size) -{ - void *old = ptr; - do { - ptr = realloc(old, size); - } while (!ptr); - return ptr; -} - -/* - * Verify whether we are currently in a self-cloned program (namely, is - * /proc/self/exe a memfd). F_GET_SEALS will only succeed for memfds (or rather - * for shmem files), and we want to be sure it's actually sealed. - */ -static int is_self_cloned(void) -{ - int fd, seals = 0, is_cloned = false; - struct stat statbuf = { }; - struct statfs fsbuf = { }; - - fd = open("/proc/self/exe", O_RDONLY | O_CLOEXEC); - if (fd < 0) { - write_log(ERROR, "cannot open runc binary for reading: open /proc/self/exe: %m"); - return -ENOTRECOVERABLE; - } - - /* - * Is the binary a fully-sealed memfd? We don't need CLONED_BINARY_ENV for - * this, because you cannot write to a sealed memfd no matter what. - */ - seals = fcntl(fd, F_GET_SEALS); - if (seals >= 0) { - write_log(DEBUG, "checking /proc/self/exe memfd seals: 0x%x", seals); - is_cloned = (seals & RUNC_MEMFD_MIN_SEALS) == RUNC_MEMFD_MIN_SEALS; - if (is_cloned) - goto out; - } - - /* - * All other forms require CLONED_BINARY_ENV, since they are potentially - * writeable (or we can't tell if they're fully safe) and thus we must - * check the environment as an extra layer of defence. - */ - if (!getenv(CLONED_BINARY_ENV)) { - is_cloned = false; - goto out; - } - - /* - * Is the binary on a read-only filesystem? We can't detect bind-mounts in - * particular (in-kernel they are identical to regular mounts) but we can - * at least be sure that it's read-only. In addition, to make sure that - * it's *our* bind-mount we check CLONED_BINARY_ENV. 
- */ - if (fstatfs(fd, &fsbuf) >= 0) - is_cloned |= (fsbuf.f_flags & MS_RDONLY); - - /* - * Okay, we're a tmpfile -- or we're currently running on RHEL <=7.6 - * which appears to have a borked backport of F_GET_SEALS. Either way, - * having a file which has no hardlinks indicates that we aren't using - * a host-side "runc" binary and this is something that a container - * cannot fake (because unlinking requires being able to resolve the - * path that you want to unlink). - */ - if (fstat(fd, &statbuf) >= 0) - is_cloned |= (statbuf.st_nlink == 0); - -out: - close(fd); - return is_cloned; -} - -/* Read a given file into a new buffer, and providing the length. */ -static char *read_file(char *path, size_t *length) -{ - int fd; - char buf[4096], *copy = NULL; - - if (!length) - return NULL; - - fd = open(path, O_RDONLY | O_CLOEXEC); - if (fd < 0) - return NULL; - - *length = 0; - for (;;) { - ssize_t n; - - n = read(fd, buf, sizeof(buf)); - if (n < 0) - goto error; - if (!n) - break; - - copy = must_realloc(copy, (*length + n) * sizeof(*copy)); - memcpy(copy + *length, buf, n); - *length += n; - } - close(fd); - return copy; - -error: - close(fd); - free(copy); - return NULL; -} - -/* - * A poor-man's version of "xargs -0". Basically parses a given block of - * NUL-delimited data, within the given length and adds a pointer to each entry - * to the array of pointers. - */ -static int parse_xargs(char *data, int data_length, char ***output) -{ - int num = 0; - char *cur = data; - - if (!data || *output != NULL) - return -1; - - while (cur < data + data_length) { - num++; - *output = must_realloc(*output, (num + 1) * sizeof(**output)); - (*output)[num - 1] = cur; - cur += strlen(cur) + 1; - } - (*output)[num] = NULL; - return num; -} - -/* - * "Parse" out argv from /proc/self/cmdline. - * This is necessary because we are running in a context where we don't have a - * main() that we can just get the arguments from. 
- */ -static int fetchve(char ***argv) -{ - char *cmdline = NULL; - size_t cmdline_size; - - cmdline = read_file("/proc/self/cmdline", &cmdline_size); - if (!cmdline) - goto error; - - if (parse_xargs(cmdline, cmdline_size, argv) <= 0) - goto error; - - return 0; - -error: - free(cmdline); - return -EINVAL; -} - -enum { - EFD_NONE = 0, - EFD_MEMFD, - EFD_FILE, -}; - -/* - * This comes from . We can't hard-code __O_TMPFILE because it - * changes depending on the architecture. If we don't have O_TMPFILE we always - * have the mkostemp(3) fallback. - */ -#ifndef O_TMPFILE -# if defined(__O_TMPFILE) && defined(O_DIRECTORY) -# define O_TMPFILE (__O_TMPFILE | O_DIRECTORY) -# endif -#endif - -static inline bool is_memfd_unsupported_error(int err) -{ - /* - * - ENOSYS is obviously an "unsupported" error. - * - * - EINVAL could be hit if MFD_EXEC is not supported (pre-6.3 kernel), - * but it can also be hit if vm.memfd_noexec=2 (in kernels without - * [1] applied) and the flags does not contain MFD_EXEC. However, - * there was a bug in the original 6.3 implementation of - * vm.memfd_noexec=2, which meant that MFD_EXEC would work even in - * the "strict" mode. Because we try MFD_EXEC first, we won't get - * EINVAL in the vm.memfd_noexec=2 case (which means we don't need to - * figure out whether to log the message about memfd_create). - * - * - EACCES is returned in kernels that contain [1] in the - * vm.memfd_noexec=2 case. - * - * At time of writing, [1] is not in Linus's tree and it't not clear if - * it will be backported to stable, so what exact versions apply here - * is unclear. But the bug is present in 6.3-6.5 at the very least. 
- * - * [1]: https://lore.kernel.org/all/20230705063315.3680666-2-jeffxu@google.com/ - */ - if (err == EACCES) - write_log(INFO, - "memfd_create(MFD_EXEC) failed, possibly due to vm.memfd_noexec=2 -- falling back to less secure O_TMPFILE"); - return err == ENOSYS || err == EINVAL || err == EACCES; -} - -static int make_execfd(int *fdtype) -{ - int fd = -1; - char template[PATH_MAX] = { 0 }; - char *prefix = getenv("_LIBCONTAINER_STATEDIR"); - - if (!prefix || *prefix != '/') - prefix = "/tmp"; - if (snprintf(template, sizeof(template), "%s/runc.XXXXXX", prefix) < 0) - return -1; - - /* - * Now try memfd, it's much nicer than actually creating a file in STATEDIR - * since it's easily detected thanks to sealing and also doesn't require - * assumptions about STATEDIR. - */ - *fdtype = EFD_MEMFD; - /* - * On newer kernels we should set MFD_EXEC to indicate we need +x - * permissions. Otherwise an admin with vm.memfd_noexec=1 would subtly - * break runc. vm.memfd_noexec=2 is a little bit more complicated, see the - * comment in is_memfd_unsupported_error() -- the upshot is that doing it - * this way works, but only because of two overlapping bugs in the sysctl - * implementation. - */ - fd = memfd_create(RUNC_MEMFD_COMMENT, MFD_EXEC | MFD_CLOEXEC | MFD_ALLOW_SEALING); - if (fd < 0 && is_memfd_unsupported_error(errno)) - fd = memfd_create(RUNC_MEMFD_COMMENT, MFD_CLOEXEC | MFD_ALLOW_SEALING); - if (fd >= 0) - return fd; - if (!is_memfd_unsupported_error(errno)) - goto error; - -#ifdef O_TMPFILE - /* - * Try O_TMPFILE to avoid races where someone might snatch our file. Note - * that O_EXCL isn't actually a security measure here (since you can just - * fd re-open it and clear O_EXCL). - */ - *fdtype = EFD_FILE; - fd = open(prefix, O_TMPFILE | O_EXCL | O_RDWR | O_CLOEXEC, 0700); - if (fd >= 0) { - struct stat statbuf = { }; - bool working_otmpfile = false; - - /* - * open(2) ignores unknown O_* flags -- yeah, I was surprised when I - * found this out too. 
As a result we can't check for EINVAL. However, - * if we get nlink != 0 (or EISDIR) then we know that this kernel - * doesn't support O_TMPFILE. - */ - if (fstat(fd, &statbuf) >= 0) - working_otmpfile = (statbuf.st_nlink == 0); - - if (working_otmpfile) - return fd; - - /* Pretend that we got EISDIR since O_TMPFILE failed. */ - close(fd); - errno = EISDIR; - } - if (errno != EISDIR) - goto error; -#endif /* defined(O_TMPFILE) */ - - /* - * Our final option is to create a temporary file the old-school way, and - * then unlink it so that nothing else sees it by accident. - */ - *fdtype = EFD_FILE; - fd = mkostemp(template, O_CLOEXEC); - if (fd >= 0) { - if (unlink(template) >= 0) - return fd; - close(fd); - } - -error: - *fdtype = EFD_NONE; - return -1; -} - -static int seal_execfd(int *fd, int fdtype) -{ - switch (fdtype) { - case EFD_MEMFD:{ - /* - * Try to seal with newer seals, but we ignore errors because older - * kernels don't support some of them. For container security only - * RUNC_MEMFD_MIN_SEALS are strictly required, but the rest are - * nice-to-haves. We apply RUNC_MEMFD_MIN_SEALS at the end because it - * contains F_SEAL_SEAL. - */ - int __attribute__((unused)) _err1 = fcntl(*fd, F_ADD_SEALS, F_SEAL_FUTURE_WRITE); // Linux 5.1 - int __attribute__((unused)) _err2 = fcntl(*fd, F_ADD_SEALS, F_SEAL_EXEC); // Linux 6.3 - return fcntl(*fd, F_ADD_SEALS, RUNC_MEMFD_MIN_SEALS); - } - case EFD_FILE:{ - /* Need to re-open our pseudo-memfd as an O_PATH to avoid execve(2) giving -ETXTBSY. 
*/ - int newfd; - char fdpath[PATH_MAX] = { 0 }; - - if (fchmod(*fd, 0100) < 0) - return -1; - - if (snprintf(fdpath, sizeof(fdpath), "/proc/self/fd/%d", *fd) < 0) - return -1; - - newfd = open(fdpath, O_PATH | O_CLOEXEC); - if (newfd < 0) - return -1; - - close(*fd); - *fd = newfd; - return 0; - } - default: - break; - } - return -1; -} - -static ssize_t fd_to_fd(int outfd, int infd) -{ - ssize_t total = 0; - char buffer[4096]; - - for (;;) { - ssize_t nread, nwritten = 0; - - nread = read(infd, buffer, sizeof(buffer)); - if (nread < 0) - return -1; - if (!nread) - break; - - do { - ssize_t n = write(outfd, buffer + nwritten, nread - nwritten); - if (n < 0) - return -1; - nwritten += n; - } while (nwritten < nread); - - total += nwritten; - } - - return total; -} - -static int clone_binary(void) -{ - int binfd, execfd; - struct stat statbuf = { }; - size_t sent = 0; - int fdtype = EFD_NONE; - - execfd = make_execfd(&fdtype); - if (execfd < 0 || fdtype == EFD_NONE) - return -ENOTRECOVERABLE; - - binfd = open("/proc/self/exe", O_RDONLY | O_CLOEXEC); - if (binfd < 0) - goto error; - - if (fstat(binfd, &statbuf) < 0) - goto error_binfd; - - while (sent < statbuf.st_size) { - int n = sendfile(execfd, binfd, NULL, statbuf.st_size - sent); - if (n < 0) { - /* sendfile can fail so we fallback to a dumb user-space copy. */ - n = fd_to_fd(execfd, binfd); - if (n < 0) - goto error_binfd; - } - sent += n; - } - close(binfd); - if (sent != statbuf.st_size) - goto error; - - if (seal_execfd(&execfd, fdtype) < 0) - goto error; - - return execfd; - -error_binfd: - close(binfd); -error: - close(execfd); - return -EIO; -} - -/* Get cheap access to the environment. */ -extern char **environ; - -int ensure_cloned_binary(void) -{ - int execfd; - char **argv = NULL; - - /* Check that we're not self-cloned, and if we are then bail. 
*/ - int cloned = is_self_cloned(); - if (cloned > 0 || cloned == -ENOTRECOVERABLE) - return cloned; - - if (fetchve(&argv) < 0) - return -EINVAL; - - execfd = clone_binary(); - if (execfd < 0) - return -EIO; - - if (putenv(CLONED_BINARY_ENV "=1")) - goto error; - - fexecve(execfd, argv, environ); -error: - close(execfd); - return -ENOEXEC; -} diff --git a/libcontainer/nsenter/nsexec.c b/libcontainer/nsenter/nsexec.c index 17e0468c6af..9b10b232528 100644 --- a/libcontainer/nsenter/nsexec.c +++ b/libcontainer/nsenter/nsexec.c @@ -536,9 +536,6 @@ void join_namespaces(char *nslist) free(namespaces); } -/* Defined in cloned_binary.c. */ -extern int ensure_cloned_binary(void); - static inline int sane_kill(pid_t pid, int signum) { if (pid > 0) @@ -791,14 +788,6 @@ void nsexec(void) return; } - /* - * We need to re-exec if we are not in a cloned binary. This is necessary - * to ensure that containers won't be able to access the host binary - * through /proc/self/exe. See CVE-2019-5736. - */ - if (ensure_cloned_binary() < 0) - bail("could not ensure we are a cloned binary"); - /* * Inform the parent we're past initial setup. * For the other side of this, see initWaiter. diff --git a/libcontainer/process.go b/libcontainer/process.go index 4de4a9e75c2..d2c7bfcda36 100644 --- a/libcontainer/process.go +++ b/libcontainer/process.go @@ -49,6 +49,9 @@ type Process struct { // ExtraFiles specifies additional open files to be inherited by the container ExtraFiles []*os.File + // open handles to cloned binaries -- see dmz.CloneBinary for more details + clonedExes []*os.File + // Initial sizings for the console ConsoleWidth uint16 ConsoleHeight uint16 @@ -121,6 +124,15 @@ func (p Process) Signal(sig os.Signal) error { return p.ops.signal(sig) } +// closeClonedExes cleans up any existing cloned binaries associated with the +// Process.
+func (p *Process) closeClonedExes() { + for _, exe := range p.clonedExes { + _ = exe.Close() + } + p.clonedExes = nil +} + // IO holds the process's STDIO type IO struct { Stdin io.WriteCloser diff --git a/libcontainer/setns_init_linux.go b/libcontainer/setns_init_linux.go index 40a47a2e95c..7709219300b 100644 --- a/libcontainer/setns_init_linux.go +++ b/libcontainer/setns_init_linux.go @@ -4,6 +4,7 @@ import ( "errors" "fmt" "os" + "os/exec" "strconv" "github.com/opencontainers/selinux/go-selinux" @@ -23,6 +24,7 @@ type linuxSetnsInit struct { consoleSocket *os.File config *initConfig logFd int + dmzExe *os.File } func (l *linuxSetnsInit) getSessionRingName() string { @@ -85,6 +87,18 @@ func (l *linuxSetnsInit) Init() error { if err := apparmor.ApplyProfile(l.config.AppArmorProfile); err != nil { return err } + // Check for the arg early to make sure it exists. + name, err := exec.LookPath(l.config.Args[0]) + if err != nil { + return err + } + // exec.LookPath in Go < 1.20 might return no error for an executable + // residing on a file system mounted with noexec flag, so perform this + // extra check now while we can still return a proper error. + // TODO: remove this once go < 1.20 is not supported. + if err := eaccess(name); err != nil { + return &os.PathError{Op: "eaccess", Path: name, Err: err} + } // Set seccomp as close to execve as possible, so as few syscalls take // place afterward (reducing the amount of syscalls that users need to // enable in their seccomp profiles). @@ -98,10 +112,15 @@ func (l *linuxSetnsInit) Init() error { } } logrus.Debugf("setns_init: about to exec") + // Close the log pipe fd so the parent's ForwardLogs can exit. 
if err := unix.Close(l.logFd); err != nil { return &os.PathError{Op: "close log pipe", Path: "fd " + strconv.Itoa(l.logFd), Err: err} } - return system.Execv(l.config.Args[0], l.config.Args[0:], os.Environ()) + if l.dmzExe != nil { + l.config.Args[0] = name + return system.Fexecve(l.dmzExe.Fd(), l.config.Args, os.Environ()) + } + return system.Exec(name, l.config.Args, os.Environ()) } diff --git a/libcontainer/standard_init_linux.go b/libcontainer/standard_init_linux.go index c64173ecfc3..4eb3d8db435 100644 --- a/libcontainer/standard_init_linux.go +++ b/libcontainer/standard_init_linux.go @@ -25,6 +25,7 @@ type linuxStandardInit struct { parentPid int fifoFd int logFd int + dmzExe *os.File mountFds mountFds config *initConfig } @@ -262,5 +263,9 @@ func (l *linuxStandardInit) Init() error { return err } - return system.Exec(name, l.config.Args[0:], os.Environ()) + if l.dmzExe != nil { + l.config.Args[0] = name + return system.Fexecve(l.dmzExe.Fd(), l.config.Args, os.Environ()) + } + return system.Exec(name, l.config.Args, os.Environ()) } diff --git a/libcontainer/system/kernelversion/kernel_linux.go b/libcontainer/system/kernelversion/kernel_linux.go new file mode 100644 index 00000000000..ca5d4130d0c --- /dev/null +++ b/libcontainer/system/kernelversion/kernel_linux.go @@ -0,0 +1,94 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+
+   File copied and customized based on
+   https://github.com/moby/moby/tree/v20.10.14/profiles/seccomp/kernel_linux.go
+
+   File copied from
+   https://github.com/containerd/containerd/blob/v1.7.5/contrib/seccomp/kernelversion/kernel_linux.go
+*/
+
+package kernelversion
+
+import (
+	"bytes"
+	"fmt"
+	"sync"
+
+	"golang.org/x/sys/unix"
+)
+
+// KernelVersion holds information about the kernel.
+type KernelVersion struct {
+	Kernel uint64 // Version of the Kernel (i.e., the "4" in "4.1.2-generic")
+	Major  uint64 // Major revision of the Kernel (i.e., the "1" in "4.1.2-generic")
+}
+
+func (k *KernelVersion) String() string {
+	if k.Kernel > 0 || k.Major > 0 {
+		return fmt.Sprintf("%d.%d", k.Kernel, k.Major)
+	}
+	return ""
+}
+
+var (
+	currentKernelVersion *KernelVersion
+	kernelVersionError   error
+	once                 sync.Once
+)
+
+// getKernelVersion returns the running kernel's version; uname(2) is queried once and the result (or error) is cached.
+func getKernelVersion() (*KernelVersion, error) {
+	once.Do(func() {
+		var uts unix.Utsname
+		if kernelVersionError = unix.Uname(&uts); kernelVersionError != nil {
+			return
+		}
+		// Cut the release at its first NUL byte so only the version text reaches parseRelease.
+		currentKernelVersion, kernelVersionError = parseRelease(string(uts.Release[:bytes.IndexByte(uts.Release[:], 0)]))
+	})
+	return currentKernelVersion, kernelVersionError
+}
+
+// parseRelease parses a string and creates a KernelVersion based on it.
+func parseRelease(release string) (*KernelVersion, error) {
+	var version KernelVersion
+
+	// We only make sure we get the "kernel" and "major revision". Sometimes we have
+	// 3.12.25-gentoo, but sometimes we just have 3.12-1-amd64.
+	_, err := fmt.Sscanf(release, "%d.%d", &version.Kernel, &version.Major)
+	if err != nil {
+		return nil, fmt.Errorf("failed to parse kernel version %q: %w", release, err)
+	}
+	return &version, nil
+}
+
+// GreaterEqualThan checks if the host's kernel version is greater than, or
+// equal to the given kernel version v.
Only "kernel version" and "major revision" +// can be specified (e.g., "3.12") and will be taken into account, which means +// that 3.12.25-gentoo and 3.12-1-amd64 are considered equal (kernel: 3, major: 12). +func GreaterEqualThan(minVersion KernelVersion) (bool, error) { + kv, err := getKernelVersion() + if err != nil { + return false, err + } + if kv.Kernel > minVersion.Kernel { + return true, nil + } + if kv.Kernel == minVersion.Kernel && kv.Major >= minVersion.Major { + return true, nil + } + return false, nil +} diff --git a/libcontainer/system/kernelversion/kernel_linux_test.go b/libcontainer/system/kernelversion/kernel_linux_test.go new file mode 100644 index 00000000000..a18f1f2226f --- /dev/null +++ b/libcontainer/system/kernelversion/kernel_linux_test.go @@ -0,0 +1,140 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+ + File copied and customized based on + https://github.com/moby/moby/tree/v20.10.14/profiles/seccomp/kernel_linux_test.go +*/ + +package kernelversion + +import ( + "fmt" + "testing" +) + +func TestGetKernelVersion(t *testing.T) { + version, err := getKernelVersion() + if err != nil { + t.Fatal(err) + } + if version == nil { + t.Fatal("version is nil") + } + if version.Kernel == 0 { + t.Fatal("no kernel version") + } +} + +func TestParseRelease(t *testing.T) { + tests := []struct { + in string + out KernelVersion + expectedErr error + }{ + {in: "3.8", out: KernelVersion{Kernel: 3, Major: 8}}, + {in: "3.8.0", out: KernelVersion{Kernel: 3, Major: 8}}, + {in: "3.8.0-19-generic", out: KernelVersion{Kernel: 3, Major: 8}}, + {in: "3.4.54.longterm-1", out: KernelVersion{Kernel: 3, Major: 4}}, + {in: "3.10.0-862.2.3.el7.x86_64", out: KernelVersion{Kernel: 3, Major: 10}}, + {in: "3.12.8tag", out: KernelVersion{Kernel: 3, Major: 12}}, + {in: "3.12-1-amd64", out: KernelVersion{Kernel: 3, Major: 12}}, + {in: "3.12foobar", out: KernelVersion{Kernel: 3, Major: 12}}, + {in: "99.999.999-19-generic", out: KernelVersion{Kernel: 99, Major: 999}}, + {in: "", expectedErr: fmt.Errorf(`failed to parse kernel version "": EOF`)}, + {in: "3", expectedErr: fmt.Errorf(`failed to parse kernel version "3": unexpected EOF`)}, + {in: "3.", expectedErr: fmt.Errorf(`failed to parse kernel version "3.": EOF`)}, + {in: "3a", expectedErr: fmt.Errorf(`failed to parse kernel version "3a": input does not match format`)}, + {in: "3.a", expectedErr: fmt.Errorf(`failed to parse kernel version "3.a": expected integer`)}, + {in: "a", expectedErr: fmt.Errorf(`failed to parse kernel version "a": expected integer`)}, + {in: "a.a", expectedErr: fmt.Errorf(`failed to parse kernel version "a.a": expected integer`)}, + {in: "a.a.a-a", expectedErr: fmt.Errorf(`failed to parse kernel version "a.a.a-a": expected integer`)}, + {in: "-3", expectedErr: fmt.Errorf(`failed to parse kernel version "-3": expected 
integer`)}, + {in: "-3.", expectedErr: fmt.Errorf(`failed to parse kernel version "-3.": expected integer`)}, + {in: "-3.8", expectedErr: fmt.Errorf(`failed to parse kernel version "-3.8": expected integer`)}, + {in: "-3.-8", expectedErr: fmt.Errorf(`failed to parse kernel version "-3.-8": expected integer`)}, + {in: "3.-8", expectedErr: fmt.Errorf(`failed to parse kernel version "3.-8": expected integer`)}, + } + for _, tc := range tests { + tc := tc + t.Run(tc.in, func(t *testing.T) { + version, err := parseRelease(tc.in) + if tc.expectedErr != nil { + if err == nil { + t.Fatal("expected an error") + } + if err.Error() != tc.expectedErr.Error() { + t.Fatalf("expected: %s, got: %s", tc.expectedErr, err) + } + return + } + if err != nil { + t.Fatal("unexpected error:", err) + } + if version == nil { + t.Fatal("version is nil") + } + if version.Kernel != tc.out.Kernel || version.Major != tc.out.Major { + t.Fatalf("expected: %d.%d, got: %d.%d", tc.out.Kernel, tc.out.Major, version.Kernel, version.Major) + } + }) + } +} + +func TestGreaterEqualThan(t *testing.T) { + // Get the current kernel version, so that we can make test relative to that + v, err := getKernelVersion() + if err != nil { + t.Fatal(err) + } + + tests := []struct { + doc string + in KernelVersion + expected bool + }{ + { + doc: "same version", + in: KernelVersion{v.Kernel, v.Major}, + expected: true, + }, + { + doc: "kernel minus one", + in: KernelVersion{v.Kernel - 1, v.Major}, + expected: true, + }, + { + doc: "kernel plus one", + in: KernelVersion{v.Kernel + 1, v.Major}, + expected: false, + }, + { + doc: "major plus one", + in: KernelVersion{v.Kernel, v.Major + 1}, + expected: false, + }, + } + for _, tc := range tests { + tc := tc + t.Run(tc.doc+": "+tc.in.String(), func(t *testing.T) { + ok, err := GreaterEqualThan(tc.in) + if err != nil { + t.Fatal("unexpected error:", err) + } + if ok != tc.expected { + t.Fatalf("expected: %v, got: %v", tc.expected, ok) + } + }) + } +} diff --git 
a/libcontainer/system/linux.go b/libcontainer/system/linux.go
index d2ad5cea229..318b6edfe81 100644
--- a/libcontainer/system/linux.go
+++ b/libcontainer/system/linux.go
@@ -4,10 +4,15 @@
 package system
 
 import (
+	"fmt"
+	"io"
 	"os"
 	"os/exec"
+	"strconv"
+	"syscall"
 	"unsafe"
 
+	"github.com/sirupsen/logrus"
 	"golang.org/x/sys/unix"
 )
 
@@ -36,7 +41,6 @@ func Execv(cmd string, args []string, env []string) error {
 	if err != nil {
 		return err
 	}
-
 	return Exec(name, args, env)
 }
 
@@ -49,6 +53,52 @@ func Exec(cmd string, args []string, env []string) error {
 	}
 }
 
+func execveat(fd uintptr, pathname string, args []string, env []string, flags int) error {
+	pathnamep, err := syscall.BytePtrFromString(pathname)
+	if err != nil {
+		return err
+	}
+
+	argvp, err := syscall.SlicePtrFromStrings(args)
+	if err != nil {
+		return err
+	}
+
+	envp, err := syscall.SlicePtrFromStrings(env)
+	if err != nil {
+		return err
+	}
+
+	_, _, errno := syscall.Syscall6(
+		unix.SYS_EXECVEAT,
+		fd,
+		uintptr(unsafe.Pointer(pathnamep)),
+		uintptr(unsafe.Pointer(&argvp[0])),
+		uintptr(unsafe.Pointer(&envp[0])),
+		uintptr(flags),
+		0,
+	)
+	return errno
+}
+
+// Fexecve executes the program referred to by the open file descriptor fd
+// using execveat(2) with AT_EMPTY_PATH, retrying on EINTR. If execveat fails
+// with ENOSYS, it falls back to calling Exec on /proc/self/fd/<fd>.
+func Fexecve(fd uintptr, args []string, env []string) error {
+	var err error
+	for {
+		err = execveat(fd, "", args, env, unix.AT_EMPTY_PATH)
+		if err != unix.EINTR { // nolint:errorlint // unix errors are bare
+			break
+		}
+	}
+	if err == unix.ENOSYS { // nolint:errorlint // unix errors are bare
+		// Fallback to classic /proc/self/fd/... exec.
+		return Exec("/proc/self/fd/"+strconv.Itoa(int(fd)), args, env)
+	}
+	return os.NewSyscallError("execveat", err)
+}
+
 func SetParentDeathSignal(sig uintptr) error {
 	if err := unix.Prctl(unix.PR_SET_PDEATHSIG, sig, 0, 0, 0); err != nil {
 		return err
@@ -102,3 +152,77 @@ func GetSubreaper() (int, error) {
 
 	return int(i), nil
 }
+
+// ExecutableMemfd creates a memfd with memfd_create(2), first with MFD_EXEC
+// added to flags and, if that fails with EINVAL, again with flags alone. The
+// returned file is named "/memfd:" followed by comment.
+func ExecutableMemfd(comment string, flags int) (*os.File, error) {
+	// Try to use MFD_EXEC first. On pre-6.3 kernels we get -EINVAL for this
+	// flag. On post-6.3 kernels, with vm.memfd_noexec=1 this ensures we get an
+	// executable memfd. For vm.memfd_noexec=2 this is a bit more complicated.
+	// The original vm.memfd_noexec=2 implementation incorrectly silently
+	// allowed MFD_EXEC[1] -- this should be fixed in 6.6. On 6.6 and newer
+	// kernels, we will get -EACCES if we try to use MFD_EXEC with
+	// vm.memfd_noexec=2 (for 6.3-6.5, -EINVAL was the intended return value).
+	//
+	// The upshot is we only need to retry without MFD_EXEC on -EINVAL because
+	// it just so happens that passing MFD_EXEC bypasses vm.memfd_noexec=2 on
+	// kernels where -EINVAL is actually a security denial.
+	memfd, err := unix.MemfdCreate(comment, flags|unix.MFD_EXEC)
+	if err == unix.EINVAL {
+		memfd, err = unix.MemfdCreate(comment, flags)
+	}
+	if err != nil {
+		if err == unix.EACCES {
+			logrus.Info("memfd_create(MFD_EXEC) failed, possibly due to vm.memfd_noexec=2 -- falling back to less secure O_TMPFILE")
+		}
+		err := os.NewSyscallError("memfd_create", err)
+		return nil, fmt.Errorf("failed to create executable memfd: %w", err)
+	}
+	return os.NewFile(uintptr(memfd), "/memfd:"+comment), nil
+}
+
+// Copy is like io.Copy except it uses sendfile(2) if the source and sink are
+// both (*os.File) as an optimisation to make copies faster.
+func Copy(dst io.Writer, src io.Reader) (copied int64, err error) {
+	dstFile, _ := dst.(*os.File)
+	srcFile, _ := src.(*os.File)
+
+	if dstFile != nil && srcFile != nil {
+		fi, err := srcFile.Stat()
+		if err != nil {
+			goto fallback
+		}
+		size := fi.Size()
+		for size > 0 {
+			n, err := unix.Sendfile(int(dstFile.Fd()), int(srcFile.Fd()), nil, int(size))
+			if n > 0 {
+				size -= int64(n)
+				copied += int64(n)
+			}
+			if err == unix.EINTR {
+				continue
+			}
+			if err != nil {
+				if copied == 0 {
+					// If we haven't copied anything so far, we can safely just
+					// fallback to io.Copy. We could always do the fallback but
+					// it's safer to error out in the case of a partial copy
+					// followed by an error (which should never happen).
+					goto fallback
+				}
+				return copied, fmt.Errorf("partial sendfile copy: %w", err)
+			}
+			if n == 0 {
+				// sendfile(2) returns 0 at end of input. Stop here rather
+				// than spin forever if src held fewer than size bytes
+				// (e.g. its offset was not 0, or it was truncated).
+				break
+			}
+		}
+		return copied, nil
+	}
+
+fallback:
+	return io.Copy(dst, src)
+}
diff --git a/script/lib.sh b/script/lib.sh
index 9fee8e29f38..f79dc3c2335 100644
--- a/script/lib.sh
+++ b/script/lib.sh
@@ -1,33 +1,65 @@
 #!/bin/bash
 
+# NOTE: Make sure you keep this file in sync with cc_platform.mk.
+
 # set_cross_vars sets a few environment variables used for cross-compiling,
 # based on the architecture specified in $1.
 function set_cross_vars() {
 	GOARCH="$1" # default, may be overridden below
 	unset GOARM
 
+	PLATFORM=linux-gnu
+	# openSUSE has a custom PLATFORM
+	if grep -iq "ID_LIKE=.*suse" /etc/os-release; then
+		PLATFORM=suse-linux
+		is_suse=1
+	fi
+
 	case $1 in
+	386)
+		# Always use the 64-bit compiler to build the 386 binary, which works
+		# for the more common cross-build method for x86 (namely, the
+		# equivalent of dpkg --add-architecture).
+		local cpu_type
+		if [ -v is_suse ]; then
+			# There is no x86_64-suse-linux-gcc, so use the native one.
+			HOST=
+			cpu_type=i586
+		else
+			HOST=x86_64-${PLATFORM}
+			cpu_type=i686
+		fi
+		CFLAGS="-m32 -march=$cpu_type ${CFLAGS[*]}"
+		;;
+	amd64)
+		if [ -n "${is_suse:-}" ]; then
+			# There is no x86_64-suse-linux-gcc, so use the native one.
+ HOST= + else + HOST=x86_64-${PLATFORM} + fi + ;; arm64) - HOST=aarch64-linux-gnu + HOST=aarch64-${PLATFORM} ;; armel) - HOST=arm-linux-gnueabi + HOST=arm-${PLATFORM}eabi GOARCH=arm GOARM=6 ;; armhf) - HOST=arm-linux-gnueabihf + HOST=arm-${PLATFORM}eabihf GOARCH=arm GOARM=7 ;; ppc64le) - HOST=powerpc64le-linux-gnu + HOST=powerpc64le-${PLATFORM} ;; riscv64) - HOST=riscv64-linux-gnu + HOST=riscv64-${PLATFORM} ;; s390x) - HOST=s390x-linux-gnu + HOST=s390x-${PLATFORM} ;; *) echo "set_cross_vars: unsupported architecture: $1" >&2 @@ -35,8 +67,8 @@ function set_cross_vars() { ;; esac - CC=$HOST-gcc - STRIP=$HOST-strip + CC="${HOST:+$HOST-}gcc" + STRIP="${HOST:+$HOST-}strip" - export HOST GOARM GOARCH CC STRIP + export HOST CFLAGS GOARM GOARCH CC STRIP } diff --git a/script/release_build.sh b/script/release_build.sh index af238628cbd..6c7aee88b23 100755 --- a/script/release_build.sh +++ b/script/release_build.sh @@ -60,24 +60,14 @@ function build_project() { # it can reuse cached pkg-config results). local make_args=(COMMIT_NO= EXTRA_FLAGS="-a" EXTRA_LDFLAGS="${ldflags}" static) - # Build natively. - make -C "$root" \ - PKG_CONFIG_PATH="$seccompdir/lib/pkgconfig" \ - "${make_args[@]}" - strip "$root/$project" - # Sanity check: make sure libseccomp version is as expected. - local ver - ver=$("$root/$project" --version | awk '$1 == "libseccomp:" {print $2}') - if [ "$ver" != "$LIBSECCOMP_VERSION" ]; then - echo >&2 "libseccomp version mismatch: want $LIBSECCOMP_VERSION, got $ver" - exit 1 - fi + # Save the original cflags. + local original_cflags="${CFLAGS:-}" - mv "$root/$project" "$builddir/$project.$native_arch" - - # Cross-build for for other architectures. + # Build for all requested architectures. local arch for arch in "${arches[@]}"; do + # Reset CFLAGS. 
+ CFLAGS="$original_cflags" set_cross_vars "$arch" make -C "$root" \ PKG_CONFIG_PATH="$seccompdir/$arch/lib/pkgconfig" \ @@ -86,6 +76,14 @@ function build_project() { mv "$root/$project" "$builddir/$project.$arch" done + # Sanity check: make sure libseccomp version is as expected. + local ver + ver=$("$builddir/$project.$native_arch" --version | awk '$1 == "libseccomp:" {print $2}') + if [ "$ver" != "$LIBSECCOMP_VERSION" ]; then + echo >&2 "libseccomp version mismatch: want $LIBSECCOMP_VERSION, got $ver" + exit 1 + fi + # Copy libseccomp source tarball. cp "$seccompdir"/src/* "$builddir" @@ -122,12 +120,17 @@ commit="HEAD" version="" releasedir="" hashcmd="" -declare -a add_arches +# Always build a native binary. +native_arch="$(go env GOARCH || echo "amd64")" +arches=("$native_arch") while getopts "a:c:H:hr:v:" opt; do case "$opt" in a) - add_arches+=("$OPTARG") + # Add architecture if not already present in arches. + if ! (printf "%s\0" "${arches[@]}" | grep -zqxF "$OPTARG"); then + arches+=("$OPTARG") + fi ;; c) commit="$OPTARG" @@ -158,9 +161,8 @@ done version="${version:-$(<"$root/VERSION")}" releasedir="${releasedir:-release/$version}" hashcmd="${hashcmd:-sha256sum}" -native_arch="$(go env GOARCH || echo "amd64")" # Suffixes of files to checksum/sign. -suffixes=("$native_arch" "${add_arches[@]}" tar.xz) +suffixes=("${arches[@]}" tar.xz) log "creating $project release in '$releasedir'" log " version: $version" @@ -174,7 +176,7 @@ set -x rm -rf "$releasedir" && mkdir -p "$releasedir" # Build project. -build_project "$releasedir/$project" "$native_arch" "${add_arches[@]}" +build_project "$releasedir/$project" "$native_arch" "${arches[@]}" # Generate new archive. 
git archive --format=tar --prefix="$project-$version/" "$commit" | xz >"$releasedir/$project.tar.xz" diff --git a/script/seccomp.sh b/script/seccomp.sh index beea612ac83..955437c2fb4 100755 --- a/script/seccomp.sh +++ b/script/seccomp.sh @@ -33,16 +33,21 @@ function build_libseccomp() { tar xf "$tar" -C "$srcdir" pushd "$srcdir/libseccomp-$ver" || return - # Build natively and install to /usr/local. + # Install native version for Dockerfile builds. ./configure \ --prefix="$dest" --libdir="$dest/lib" \ --enable-static --enable-shared make install make clean - # Build and install for additional architectures. + # Save the original cflags. + local original_cflags="${CFLAGS:-}" + + # Build and install for all requested architectures. local arch for arch in "${arches[@]}"; do + # Reset CFLAGS. + CFLAGS="$original_cflags" set_cross_vars "$arch" ./configure --host "$HOST" \ --prefix="$dest/$arch" --libdir="$dest/$arch/lib" \ diff --git a/tests/integration/helpers.bash b/tests/integration/helpers.bash index cd08fb2459f..7e6399a47b8 100755 --- a/tests/integration/helpers.bash +++ b/tests/integration/helpers.bash @@ -646,12 +646,16 @@ function teardown_bundle() { remove_parent } -function requires_kernel() { +function is_kernel_gte() { local major_required minor_required major_required=$(echo "$1" | cut -d. -f1) minor_required=$(echo "$1" | cut -d. -f2) - if [[ "$KERNEL_MAJOR" -lt $major_required || ("$KERNEL_MAJOR" -eq $major_required && "$KERNEL_MINOR" -lt $minor_required) ]]; then - skip "requires kernel $1" + [[ "$KERNEL_MAJOR" -gt $major_required || ("$KERNEL_MAJOR" -eq $major_required && "$KERNEL_MINOR" -ge $minor_required) ]] +} + +function requires_kernel() { + if ! 
is_kernel_gte "$@"; then + skip "requires kernel >= $1" fi } diff --git a/tests/integration/run.bats b/tests/integration/run.bats index 9f1f1d8bc74..baf91fb00cd 100644 --- a/tests/integration/run.bats +++ b/tests/integration/run.bats @@ -126,3 +126,37 @@ function teardown() { [ "$status" -eq 0 ] [ "$output" = "410" ] } + +@test "runc run [runc-dmz]" { + runc --debug run test_hello + [ "$status" -eq 0 ] + [[ "$output" = *"Hello World"* ]] + # We use runc-dmz if we can. + [[ "$output" = *"runc-dmz: using runc-dmz"* ]] +} + +@test "runc run [cap_sys_ptrace -> /proc/self/exe clone]" { + # Add CAP_SYS_PTRACE to the bounding set, the minimum needed to indicate a + # container process _could_ get CAP_SYS_PTRACE. + update_config '.process.capabilities.bounding += ["CAP_SYS_PTRACE"]' + + runc --debug run test_hello + [ "$status" -eq 0 ] + [[ "$output" = *"Hello World"* ]] + if [ "$EUID" -ne 0 ] && is_kernel_gte 4.10; then + # For Linux 4.10 and later, rootless containers will use runc-dmz + # because they are running in a user namespace. See isDmzBinarySafe(). + [[ "$output" = *"runc-dmz: using runc-dmz"* ]] + else + # If the container has CAP_SYS_PTRACE and is not rootless, we use + # /proc/self/exe cloning. 
+ [[ "$output" = *"runc-dmz: using /proc/self/exe clone"* ]] + fi +} + +@test "RUNC_DMZ=legacy runc run [/proc/self/exe clone]" { + RUNC_DMZ=legacy runc --debug run test_hello + [ "$status" -eq 0 ] + [[ "$output" = *"Hello World"* ]] + [[ "$output" = *"runc-dmz: using /proc/self/exe clone"* ]] +} diff --git a/tests/integration/seccomp-notify-compat.bats b/tests/integration/seccomp-notify-compat.bats index 8d663edda51..6ca3449bffa 100644 --- a/tests/integration/seccomp-notify-compat.bats +++ b/tests/integration/seccomp-notify-compat.bats @@ -3,8 +3,8 @@ load helpers function setup() { - if [[ "$KERNEL_MAJOR" -gt 5 || ("$KERNEL_MAJOR" -eq 5 && "$KERNEL_MINOR" -ge 6) ]]; then - skip "requires kernel less than 5.6" + if is_kernel_gte 5.6; then + skip "requires kernel < 5.6" fi requires arch_x86_64 diff --git a/tests/integration/start_hello.bats b/tests/integration/start_hello.bats index 87005484748..6fbb893e695 100644 --- a/tests/integration/start_hello.bats +++ b/tests/integration/start_hello.bats @@ -58,6 +58,8 @@ function teardown() { # Enable CAP_DAC_OVERRIDE. update_config ' .process.capabilities.bounding += ["CAP_DAC_OVERRIDE"] | .process.capabilities.effective += ["CAP_DAC_OVERRIDE"] + | .process.capabilities.inheritable += ["CAP_DAC_OVERRIDE"] + | .process.capabilities.ambient += ["CAP_DAC_OVERRIDE"] | .process.capabilities.permitted += ["CAP_DAC_OVERRIDE"]' runc run test_busybox