diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 7f6799f18f6..5a53f1de1eb 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -28,6 +28,7 @@ jobs:
         rootless: ["rootless", ""]
         race: ["-race", ""]
         criu: ["", "criu-dev"]
+        dmz: ["", "runc_nodmz"]
         exclude:
           - criu: criu-dev
             rootless: rootless
@@ -35,6 +36,10 @@ jobs:
             go-version: 1.20.x
           - criu: criu-dev
             race: -race
+          - dmz: runc_nodmz
+            criu: criu-dev
+          - dmz: runc_nodmz
+            os: ubuntu-20.04
     runs-on: ${{ matrix.os }}
 
     steps:
@@ -71,6 +76,8 @@ jobs:
         go-version: ${{ matrix.go-version }}
 
     - name: build
+      env:
+        EXTRA_BUILDTAGS: ${{ matrix.dmz }}
       run: sudo -E PATH="$PATH" make EXTRA_FLAGS="${{ matrix.race }}" all
 
     - name: install bats
@@ -80,6 +87,8 @@ jobs:
 
     - name: unit test
       if: matrix.rootless != 'rootless'
+      env:
+        EXTRA_BUILDTAGS: ${{ matrix.dmz }}
       run: sudo -E PATH="$PATH" -- make TESTFLAGS="${{ matrix.race }}" localunittest
 
     - name: add rootless user
@@ -113,8 +122,12 @@ jobs:
   # However, we do not have 32-bit ARM CI, so we use i386 for testing 32bit stuff.
   # We are not interested in providing official support for i386.
   cross-i386:
-    runs-on: ubuntu-22.04
     timeout-minutes: 15
+    strategy:
+      fail-fast: false
+      matrix:
+        dmz: ["", "runc_nodmz"]
+    runs-on: ubuntu-22.04
 
     steps:
 
@@ -136,4 +149,6 @@ jobs:
         go-version: 1.x # Latest stable
 
     - name: unit test
+      env:
+        EXTRA_BUILDTAGS: ${{ matrix.dmz }}
       run: sudo -E PATH="$PATH" -- make GOARCH=386 localunittest
diff --git a/.gitignore b/.gitignore
index 4df0d6abfde..f022ed275cd 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,10 +1,11 @@
 vendor/pkg
 /runc
 /runc-*
-contrib/cmd/recvtty/recvtty
-contrib/cmd/sd-helper/sd-helper
-contrib/cmd/seccompagent/seccompagent
-contrib/cmd/fs-idmap/fs-idmap
+/contrib/cmd/recvtty/recvtty
+/contrib/cmd/sd-helper/sd-helper
+/contrib/cmd/seccompagent/seccompagent
+/contrib/cmd/fs-idmap/fs-idmap
+/contrib/cmd/memfd-bind/memfd-bind
 man/man8
 release
 Vagrantfile
diff --git a/.golangci-extra.yml b/.golangci-extra.yml
index be33f90d7f9..23b57e040b6 100644
--- a/.golangci-extra.yml
+++ b/.golangci-extra.yml
@@ -7,6 +7,7 @@
 run:
   build-tags:
     - seccomp
+    - runc_nodmz
 
 linters:
   disable-all: true
diff --git a/.golangci.yml b/.golangci.yml
index 96b321019e4..c088117d2ca 100644
--- a/.golangci.yml
+++ b/.golangci.yml
@@ -3,6 +3,7 @@
 run:
   build-tags:
     - seccomp
+    - runc_nodmz
 
 linters:
   enable:
diff --git a/Dockerfile b/Dockerfile
index 9fd29a59371..6fa8752b5e3 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -9,19 +9,15 @@ ARG CRIU_REPO=https://download.opensuse.org/repositories/devel:/tools:/criu/Debi
 RUN KEYFILE=/usr/share/keyrings/criu-repo-keyring.gpg; \
     wget -nv $CRIU_REPO/Release.key -O- | gpg --dearmor > "$KEYFILE" \
     && echo "deb [signed-by=$KEYFILE] $CRIU_REPO/ /" > /etc/apt/sources.list.d/criu.list \
+    && dpkg --add-architecture i386 \
     && apt-get update \
     && apt-get install -y --no-install-recommends \
         build-essential \
         criu \
-        gcc-aarch64-linux-gnu libc-dev-arm64-cross \
-        gcc-arm-linux-gnueabi libc-dev-armel-cross \
-        gcc-arm-linux-gnueabihf libc-dev-armhf-cross \
-        gcc-powerpc64le-linux-gnu libc-dev-ppc64el-cross \
-        gcc-s390x-linux-gnu libc-dev-s390x-cross \
-        gcc-riscv64-linux-gnu libc-dev-riscv64-cross \
+        gcc \
+        gcc-multilib \
         curl \
         gawk \
-        gcc \
         gperf \
         iptables \
         jq \
@@ -32,6 +28,14 @@ RUN KEYFILE=/usr/share/keyrings/criu-repo-keyring.gpg; \
         sudo \
         uidmap \
         iproute2 \
+    && apt-get install -y --no-install-recommends \
+        libc-dev:i386 libgcc-s1:i386 \
+        gcc-aarch64-linux-gnu libc-dev-arm64-cross \
+        gcc-arm-linux-gnueabi libc-dev-armel-cross \
+        gcc-arm-linux-gnueabihf libc-dev-armhf-cross \
+        gcc-powerpc64le-linux-gnu libc-dev-ppc64el-cross \
+        gcc-s390x-linux-gnu libc-dev-s390x-cross \
+        gcc-riscv64-linux-gnu libc-dev-riscv64-cross \
     && apt-get clean \
     && rm -rf /var/cache/apt /var/lib/apt/lists/* /etc/apt/sources.list.d/*.list
 
@@ -54,7 +58,7 @@ RUN cd /tmp \
 ARG LIBSECCOMP_VERSION
 COPY script/seccomp.sh script/lib.sh /tmp/script/
 RUN mkdir -p /opt/libseccomp \
-    && /tmp/script/seccomp.sh "$LIBSECCOMP_VERSION" /opt/libseccomp arm64 armel armhf ppc64le riscv64 s390x
+    && /tmp/script/seccomp.sh "$LIBSECCOMP_VERSION" /opt/libseccomp 386 amd64 arm64 armel armhf ppc64le riscv64 s390x
 ENV LIBSECCOMP_VERSION=$LIBSECCOMP_VERSION
 ENV LD_LIBRARY_PATH=/opt/libseccomp/lib
 ENV PKG_CONFIG_PATH=/opt/libseccomp/lib/pkgconfig
diff --git a/Makefile b/Makefile
index 0d48fe8c521..d3c1c11cb86 100644
--- a/Makefile
+++ b/Makefile
@@ -1,6 +1,11 @@
+SHELL = /bin/bash
+
 CONTAINER_ENGINE := docker
 GO ?= go
 
+# Get CC values for cross-compilation.
+include cc_platform.mk
+
 PREFIX ?= /usr/local
 BINDIR := $(PREFIX)/sbin
 MANDIR := $(PREFIX)/share/man
@@ -10,6 +15,7 @@ GIT_BRANCH_CLEAN := $(shell echo $(GIT_BRANCH) | sed -e "s/[^[:alnum:]]/-/g")
 RUNC_IMAGE := runc_dev$(if $(GIT_BRANCH_CLEAN),:$(GIT_BRANCH_CLEAN))
 PROJECT := github.com/opencontainers/runc
 BUILDTAGS ?= seccomp urfave_cli_no_docs
+BUILDTAGS += $(EXTRA_BUILDTAGS)
 
 COMMIT ?= $(shell git describe --dirty --long --always)
 VERSION := $(shell cat ./VERSION)
@@ -57,18 +63,25 @@ endif
 
 .DEFAULT: runc
 
-runc:
+runc: runc-dmz
	$(GO_BUILD) -o runc .
+	$(MAKE) verify-dmz-arch
 
-all: runc recvtty sd-helper seccompagent fs-idmap
+all: runc recvtty sd-helper seccompagent fs-idmap memfd-bind
 
-recvtty sd-helper seccompagent fs-idmap:
+recvtty sd-helper seccompagent fs-idmap memfd-bind:
 	$(GO_BUILD) -o contrib/cmd/$@/$@ ./contrib/cmd/$@
 
-static:
+static: runc-dmz
	$(GO_BUILD_STATIC) -o runc .
+	$(MAKE) verify-dmz-arch
+
+.PHONY: runc-dmz
+runc-dmz:
+	rm -f libcontainer/dmz/runc-dmz
+	$(GO) generate -tags "$(BUILDTAGS)" ./libcontainer/dmz
 
-releaseall: RELEASE_ARGS := "-a arm64 -a armel -a armhf -a ppc64le -a riscv64 -a s390x"
+releaseall: RELEASE_ARGS := "-a 386 -a amd64 -a arm64 -a armel -a armhf -a ppc64le -a riscv64 -a s390x"
 releaseall: release
 
 release: runcimage
@@ -147,12 +160,13 @@ install-man: man
 	install -D -m 644 man/man8/*.8 $(DESTDIR)$(MANDIR)/man8
 
 clean:
-	rm -f runc runc-*
+	rm -f runc runc-* libcontainer/dmz/runc-dmz
+	rm -f contrib/cmd/fs-idmap/fs-idmap
 	rm -f contrib/cmd/recvtty/recvtty
 	rm -f contrib/cmd/sd-helper/sd-helper
 	rm -f contrib/cmd/seccompagent/seccompagent
-	rm -f contrib/cmd/fs-idmap/fs-idmap
-	rm -rf release
+	rm -f contrib/cmd/memfd-bind/memfd-bind
+	sudo rm -rf release
 	rm -rf man/man8
 
 cfmt: C_SRC=$(shell git ls-files '*.c' | grep -v '^vendor/')
@@ -188,6 +202,18 @@ verify-dependencies: vendor
 	@test -z "$$(git status --porcelain -- go.mod go.sum vendor/)" \
 		|| (echo -e "git status:\n $$(git status -- go.mod go.sum vendor/)\nerror: vendor/, go.mod and/or go.sum not up to date. Run \"make vendor\" to update"; exit 1) \
 		&& echo "all vendor files are up to date."
+verify-dmz-arch:
+	@test -s libcontainer/dmz/runc-dmz || exit 0; \
+		set -Eeuo pipefail; \
+		export LC_ALL=C; \
+		echo "readelf -h runc"; \
+		readelf -h runc | grep -E "(Machine|Flags):"; \
+		echo "readelf -h libcontainer/dmz/runc-dmz"; \
+		readelf -h libcontainer/dmz/runc-dmz | grep -E "(Machine|Flags):"; \
+		diff -u \
+			<(readelf -h runc | grep -E "(Machine|Flags):") \
+			<(readelf -h libcontainer/dmz/runc-dmz | grep -E "(Machine|Flags):") \
+		&& echo "runc-dmz architecture matches runc binary."
 
 validate-keyring:
 	script/keyring_validate.sh
@@ -197,4 +223,4 @@ validate-keyring:
 	test localtest unittest localunittest integration localintegration \
 	rootlessintegration localrootlessintegration shell install install-bash \
 	install-man clean cfmt shfmt localshfmt shellcheck \
-	vendor verify-changelog verify-dependencies validate-keyring
+	vendor verify-changelog verify-dependencies verify-dmz-arch validate-keyring
diff --git a/README.md b/README.md
index b209c7dcd55..827f837e06f 100644
--- a/README.md
+++ b/README.md
@@ -65,15 +65,18 @@ e.g. to disable seccomp:
 make BUILDTAGS=""
 ```
 
-| Build Tag | Feature                            | Enabled by default | Dependency |
-|-----------|------------------------------------|--------------------|------------|
-| seccomp   | Syscall filtering                  | yes                | libseccomp |
+| Build Tag     | Feature                               | Enabled by Default | Dependencies        |
+|---------------|---------------------------------------|--------------------|---------------------|
+| `seccomp`     | Syscall filtering using `libseccomp`. | yes                | `libseccomp`        |
+| `!runc_nodmz` | Reduce memory usage for CVE-2019-5736 protection by using a small C binary, [see `memfd-bind` for more details][contrib-memfd-bind]. `runc_nodmz` disables this feature and causes runc to use a different protection mechanism which will further increase memory usage temporarily during container startup. This feature can also be disabled at runtime by setting the `RUNC_DMZ=legacy` environment variable. | yes ||
 
 The following build tags were used earlier, but are now obsoleted:
  - **nokmem** (since runc v1.0.0-rc94 kernel memory settings are ignored)
  - **apparmor** (since runc v1.0.0-rc93 the feature is always enabled)
  - **selinux**  (since runc v1.0.0-rc93 the feature is always enabled)
 
+ [contrib-memfd-bind]: /contrib/cmd/memfd-bind/README.md
+
 ### Running the test suite
 
 `runc` currently supports running its test suite via Docker.
diff --git a/cc_platform.mk b/cc_platform.mk
new file mode 100644
index 00000000000..6aa2b5ecb8b
--- /dev/null
+++ b/cc_platform.mk
@@ -0,0 +1,61 @@
+# NOTE: Make sure you keep this file in sync with script/lib.sh.
+
+GO ?= go
+GOARCH ?= $(shell $(GO) env GOARCH)
+
+ifneq ($(shell grep -i "ID_LIKE=.*suse" /etc/os-release),)
+	# openSUSE has a custom PLATFORM
+	PLATFORM ?= suse-linux
+	IS_SUSE := 1
+else
+	PLATFORM ?= linux-gnu
+endif
+
+ifeq ($(GOARCH),$(shell GOARCH= $(GO) env GOARCH))
+	# use the native CC and STRIP
+	HOST :=
+else ifeq ($(GOARCH),386)
+	# Always use the 64-bit compiler to build the 386 binary, which works for
+	# the more common cross-build method for x86 (namely, the equivalent of
+	# dpkg --add-architecture).
+	ifdef IS_SUSE
+		# There is no x86_64-suse-linux-gcc, so use the native one.
+		HOST :=
+		CPU_TYPE := i586
+	else
+		HOST := x86_64-$(PLATFORM)-
+		CPU_TYPE := i686
+	endif
+	CFLAGS := -m32 -march=$(CPU_TYPE) $(CFLAGS)
+else ifeq ($(GOARCH),amd64)
+	ifdef IS_SUSE
+		# There is no x86_64-suse-linux-gcc, so use the native one.
+		HOST :=
+	else
+		HOST := x86_64-$(PLATFORM)-
+	endif
+else ifeq ($(GOARCH),arm64)
+	HOST := aarch64-$(PLATFORM)-
+else ifeq ($(GOARCH),arm)
+	# HOST already configured by release_build.sh in this case.
+else ifeq ($(GOARCH),armel)
+	HOST := arm-$(PLATFORM)eabi-
+else ifeq ($(GOARCH),armhf)
+	HOST := arm-$(PLATFORM)eabihf-
+else ifeq ($(GOARCH),ppc64le)
+	HOST := powerpc64le-$(PLATFORM)-
+else ifeq ($(GOARCH),riscv64)
+	HOST := riscv64-$(PLATFORM)-
+else ifeq ($(GOARCH),s390x)
+	HOST := s390x-$(PLATFORM)-
+else
+$(error Unsupported GOARCH $(GOARCH))
+endif
+
+ifeq ($(origin CC),$(filter $(origin CC),undefined default))
+	# Override CC if it's undefined or just the default value set by Make.
+	CC := $(HOST)gcc
+	export CC
+endif
+STRIP ?= $(HOST)strip
+export STRIP
diff --git a/contrib/cmd/memfd-bind/README.md b/contrib/cmd/memfd-bind/README.md
new file mode 100644
index 00000000000..f2ceae2fa78
--- /dev/null
+++ b/contrib/cmd/memfd-bind/README.md
@@ -0,0 +1,67 @@
+## memfd-bind ##
+
+`runc` normally has to make a binary copy of itself (or of a smaller helper
+binary called `runc-dmz`) when constructing a container process in order to
+defend against certain container runtime attacks such as CVE-2019-5736.
+
+This cloned binary only exists until the container process starts (this means
+for `runc run` and `runc exec`, it only exists for a few hundred milliseconds
+-- for `runc create` it exists until `runc start` is called). However, because
+the clone is done using a memfd (or by creating files in directories that are
+likely to be a `tmpfs`), this can lead to temporary increases in *host* memory
+usage. Unless you are running on a cgroupv1 system with the cgroupv1 memory
+controller enabled and the (deprecated) `memory.move_charge_at_immigrate`
+enabled, there is no effect on the container's memory.
+
+However, for certain configurations this can still be undesirable. This daemon
+allows you to create a sealed memfd copy of the `runc` binary, which will cause
+`runc` to skip all binary copying, resulting in no additional memory usage for
+each container process (instead there is a single in-memory copy of the
+binary). It should be noted that (strictly speaking) this is slightly less
+secure if you are concerned about Dirty Cow-like 0-day kernel vulnerabilities,
+but for most users the security benefit is identical.
+
+The provided `memfd-bind@.service` file can be used to get systemd to manage
+this daemon. You can supply the path like so:
+
+```
+% systemctl start memfd-bind@/usr/bin/runc
+```
+
+Thus, there are three ways of protecting against CVE-2019-5736, in order of how
+much memory they can use:
+
+* `memfd-bind` only creates a single in-memory copy of the `runc` binary (about
+  10MB), regardless of how many containers are running.
+
+* `runc-dmz` is (depending on which libc it was compiled with) between 10kB and
+  1MB in size, and a copy is created once per process spawned inside a
+  container by runc (both the pid1 and every `runc exec`). There are
+  circumstances where using `runc-dmz` will fail in ways that runc cannot
+  predict ahead of time (such as restrictive LSMs applied to containers), in
+  which case users can disable it with the `RUNC_DMZ=legacy` setting.
+  `runc-dmz` also requires an additional `execve` over the other options,
+  though since the binary is so small the cost is probably not even noticeable.
+
+* The classic method of making a copy of the entire `runc` binary during
+  container process setup takes up about 10MB per process spawned inside the
+  container by runc (both pid1 and `runc exec`).
+
+### Caveats ###
+
+There are several downsides with using `memfd-bind` on the `runc` binary:
+
+* The `memfd-bind` process needs to continue to run indefinitely in order for
+  the memfd reference to stay alive. If the process is forcefully killed, the
+  bind-mount on top of the `runc` binary will become stale and nobody will be
+  able to execute it (you can use `memfd-bind --cleanup` to clean up the stale
+  mount).
+
+* Only root can execute the cloned binary due to permission restrictions on
+  accessing other processes' files. More specifically, only users with ptrace
+  privileges over the memfd-bind daemon can access the file (but in practice
+  this is usually only root).
+
+* When updating `runc`, the daemon needs to be stopped before the update (so
+  the package manager can access the underlying file) and then restarted after
+  the update.
diff --git a/contrib/cmd/memfd-bind/memfd-bind.go b/contrib/cmd/memfd-bind/memfd-bind.go
new file mode 100644
index 00000000000..e73739f0c4d
--- /dev/null
+++ b/contrib/cmd/memfd-bind/memfd-bind.go
@@ -0,0 +1,240 @@
+/*
+ * Copyright (c) 2023 SUSE LLC
+ * Copyright (c) 2023 Aleksa Sarai <cyphar@cyphar.com>
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package main
+
+import (
+	"errors"
+	"fmt"
+	"io"
+	"os"
+	"os/signal"
+	"runtime"
+	"strings"
+	"time"
+
+	"github.com/opencontainers/runc/libcontainer/dmz"
+
+	"github.com/sirupsen/logrus"
+	"github.com/urfave/cli"
+	"golang.org/x/sys/unix"
+)
+
+// version will be populated by the Makefile, read from
+// VERSION file of the source code.
+var version = ""
+
+// gitCommit will be the hash that the binary was built from
+// and will be populated by the Makefile.
+var gitCommit = ""
+
+const (
+	usage = `Open Container Initiative contrib/cmd/memfd-bind
+
+In order to protect against certain container attacks, every runc invocation
+that involves creating or joining a container will cause runc to make a copy of
+the runc binary in memory (usually to a memfd). While "runc init" is very
+short-lived, this extra memory usage can cause problems for containers with
+very small memory limits (or containers that have many "runc exec" invocations
+applied to them at the same time).
+
+memfd-bind is a tool to create a persistent memfd-sealed-copy of the runc binary,
+which will cause runc to not make its own copy. This means you can get the
+benefits of using a sealed memfd as runc's binary (even in a container breakout
+attack to get write access to the runc binary, neither the underlying binary
+nor the memfd copy can be changed).
+
+To use memfd-bind, just specify the path of the runc binary you want to
+protect with a sealed memfd copy:
+
+    $ sudo memfd-bind /usr/bin/runc
+
+Note that (due to kernel restrictions on bind-mounts), this program must remain
+running on the host in order for the binary to be readable (it is recommended
+you use a systemd unit to keep this process around).
+
+If this program dies, there will be a leftover mountpoint that always returns
+-EINVAL when attempting to access it. You need to use memfd-bind --cleanup on the
+path in order to unmount the path (regular umount(8) will not work):
+
+    $ sudo memfd-bind --cleanup /usr/bin/runc
+
+Note that (due to restrictions on /proc/$pid/fd/$fd magic-link resolution),
+only privileged users (specifically, those that have ptrace privileges over the
+memfd-bind daemon) can access the memfd bind-mount. This means that using this
+tool to harden your /usr/bin/runc binary would result in unprivileged users
+being unable to execute the binary. If this is an issue, you could make all
+privileged process use a different copy of runc (by making a copy in somewhere
+like /usr/sbin/runc) and only using memfd-bind for the version used by
+privileged users.
+`
+)
+
+// cleanup repeatedly detaches (MNT_DETACH) the mount on path; only a failure to open path is returned as an error.
+func cleanup(path string) error {
+	file, err := os.OpenFile(path, unix.O_PATH|unix.O_NOFOLLOW|unix.O_CLOEXEC, 0)
+	if err != nil {
+		return fmt.Errorf("cleanup: failed to open runc binary path: %w", err)
+	}
+	defer file.Close()
+	fdPath := fmt.Sprintf("/proc/self/fd/%d", file.Fd())
+
+	// Keep umounting until we hit a umount error.
+	for unix.Unmount(fdPath, unix.MNT_DETACH) == nil {
+		logrus.Debugf("memfd-bind: path %q unmount succeeded...", path)
+	}
+	logrus.Infof("memfd-bind: path %q has been cleared of all old bind-mounts", path)
+	return nil
+}
+
+// memfdClone is a memfd-only implementation of dmz.CloneBinary: it copies the file at path into a memfd, seals it and returns it.
+func memfdClone(path string) (*os.File, error) {
+	binFile, err := os.Open(path)
+	if err != nil {
+		return nil, fmt.Errorf("failed to open runc binary path: %w", err)
+	}
+	defer binFile.Close()
+	stat, err := binFile.Stat()
+	if err != nil {
+		return nil, fmt.Errorf("checking %s size: %w", path, err)
+	}
+	size := stat.Size()
+	memfd, sealFn, err := dmz.Memfd("/proc/self/exe")
+	if err != nil {
+		return nil, fmt.Errorf("creating memfd failed: %w", err)
+	}
+	copied, err := io.Copy(memfd, binFile)
+	if err != nil {
+		return nil, fmt.Errorf("copy binary: %w", err)
+	} else if copied != size {
+		return nil, fmt.Errorf("copied binary size mismatch: %d != %d", copied, size)
+	}
+	if err := sealFn(&memfd); err != nil {
+		return nil, fmt.Errorf("could not seal fd: %w", err)
+	}
+	if !dmz.IsCloned(memfd) {
+		return nil, fmt.Errorf("cloned memfd is not properly sealed")
+	}
+	return memfd, nil
+}
+
+// mount bind-mounts a sealed memfd copy of path over path, then blocks forever; it only returns on setup errors.
+func mount(path string) error {
+	memfdFile, err := memfdClone(path)
+	if err != nil {
+		return fmt.Errorf("memfd clone: %w", err)
+	}
+	defer memfdFile.Close()
+	memfdPath := fmt.Sprintf("/proc/self/fd/%d", memfdFile.Fd())
+
+	// We have to open an O_NOFOLLOW|O_PATH to the memfd magic-link because we
+	// cannot bind-mount the memfd itself (it's in the internal kernel mount
+	// namespace and cross-mount-namespace bind-mounts are not allowed). This
+	// also requires that this program stay alive continuously for the
+	// magic-link to stay alive...
+	memfdLink, err := os.OpenFile(memfdPath, unix.O_PATH|unix.O_NOFOLLOW|unix.O_CLOEXEC, 0)
+	if err != nil {
+		return fmt.Errorf("mount: failed to open /proc/self/fd magic-link for memfd: %w", err)
+	}
+	defer memfdLink.Close()
+	memfdLinkFdPath := fmt.Sprintf("/proc/self/fd/%d", memfdLink.Fd())
+
+	exeFile, err := os.OpenFile(path, unix.O_PATH|unix.O_NOFOLLOW|unix.O_CLOEXEC, 0)
+	if err != nil {
+		return fmt.Errorf("mount: failed to open target runc binary path: %w", err)
+	}
+	defer exeFile.Close()
+	exeFdPath := fmt.Sprintf("/proc/self/fd/%d", exeFile.Fd())
+
+	err = unix.Mount(memfdLinkFdPath, exeFdPath, "", unix.MS_BIND, "")
+	if err != nil {
+		return fmt.Errorf("mount: failed to mount memfd on top of runc binary path target: %w", err)
+	}
+
+	// If there is a signal we want to do cleanup.
+	sigCh := make(chan os.Signal, 1)
+	signal.Notify(sigCh, os.Interrupt, unix.SIGTERM)
+	go func() {
+		<-sigCh
+		logrus.Infof("memfd-bind: exit signal caught! cleaning up the bind-mount on %q...", path)
+		_ = cleanup(path)
+		os.Exit(0)
+	}()
+
+	// Clean up things we don't need...
+	_ = exeFile.Close()
+	_ = memfdLink.Close()
+
+	// We now have to stay alive to keep the magic-link alive...
+	logrus.Infof("memfd-bind: bind-mount of memfd over %q created -- looping forever!", path)
+	for {
+		time.Sleep(time.Duration(1<<63 - 1))
+		// make sure the memfd isn't gc'd
+		runtime.KeepAlive(memfdFile)
+	}
+}
+
+func main() {
+	app := cli.NewApp()
+	app.Name = "memfd-bind"
+	app.Usage = usage
+
+	// Set version to be the same as runC.
+	var v []string
+	if version != "" {
+		v = append(v, version)
+	}
+	if gitCommit != "" {
+		v = append(v, "commit: "+gitCommit)
+	}
+	app.Version = strings.Join(v, "\n")
+
+	// Set the flags.
+	app.Flags = []cli.Flag{
+		cli.BoolFlag{
+			Name:  "cleanup",
+			Usage: "Do not create a new memfd-sealed file, only clean up an existing one at <path>.",
+		},
+		cli.BoolFlag{
+			Name:  "debug",
+			Usage: "Enable debug logging.",
+		},
+	}
+
+	app.Action = func(ctx *cli.Context) error {
+		args := ctx.Args()
+		if len(args) != 1 {
+			return errors.New("need to specify a single path to the runc binary")
+		}
+		path := ctx.Args()[0]
+
+		if ctx.Bool("debug") {
+			logrus.SetLevel(logrus.DebugLevel)
+		}
+
+		err := cleanup(path)
+		// We only care about cleanup errors when doing --cleanup.
+		if ctx.Bool("cleanup") {
+			return err
+		}
+		return mount(path)
+	}
+	if err := app.Run(os.Args); err != nil {
+		fmt.Fprintf(os.Stderr, "memfd-bind: %v\n", err)
+		os.Exit(1)
+	}
+}
diff --git a/contrib/cmd/memfd-bind/memfd-bind@.service b/contrib/cmd/memfd-bind/memfd-bind@.service
new file mode 100644
index 00000000000..591548ea4d9
--- /dev/null
+++ b/contrib/cmd/memfd-bind/memfd-bind@.service
@@ -0,0 +1,11 @@
+[Unit]
+Description=Manage memfd-bind of %I
+Documentation=https://github.com/opencontainers/runc
+
+[Service]
+Type=simple
+ExecStart=memfd-bind "%I"
+ExecStop=memfd-bind --cleanup "%I"
+
+[Install]
+WantedBy=multi-user.target
diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go
index c941239b841..ae5d4fb46b4 100644
--- a/libcontainer/container_linux.go
+++ b/libcontainer/container_linux.go
@@ -24,8 +24,10 @@ import (
 
 	"github.com/opencontainers/runc/libcontainer/cgroups"
 	"github.com/opencontainers/runc/libcontainer/configs"
+	"github.com/opencontainers/runc/libcontainer/dmz"
 	"github.com/opencontainers/runc/libcontainer/intelrdt"
 	"github.com/opencontainers/runc/libcontainer/system"
+	"github.com/opencontainers/runc/libcontainer/system/kernelversion"
 	"github.com/opencontainers/runc/libcontainer/utils"
 )
 
@@ -316,6 +318,8 @@ func (c *Container) start(process *Process) (retErr error) {
 	if err != nil {
 		return fmt.Errorf("unable to create new parent process: %w", err)
 	}
+	// We do not need the cloned binaries once the process is spawned.
+	defer process.closeClonedExes()
 
 	logsDone := parent.forwardChildLogs()
 	if logsDone != nil {
@@ -441,6 +445,48 @@ func (c *Container) includeExecFifo(cmd *exec.Cmd) error {
 	return nil
 }
 
+// slicesContains reports whether needle is in slice. No longer needed in Go 1.21 (use slices.Contains).
+func slicesContains[S ~[]E, E comparable](slice S, needle E) bool {
+	for _, val := range slice {
+		if val == needle {
+			return true
+		}
+	}
+	return false
+}
+
+// isDmzBinarySafe reports whether runc-dmz may be used for a container with config c.
+func isDmzBinarySafe(c *configs.Config) bool {
+	// Because we set the dumpable flag in nsexec, the only time when it is
+	// unsafe to use runc-dmz is when the container process would be able to
+	// race against "runc init" and bypass the ptrace_may_access() checks.
+	//
+	// This is only the case if the container processes could have
+	// CAP_SYS_PTRACE somehow (i.e. the capability is present in the bounding,
+	// inheritable, or ambient sets). Luckily, most containers do not have it.
+	if c.Capabilities == nil ||
+		(!slicesContains(c.Capabilities.Bounding, "CAP_SYS_PTRACE") &&
+			!slicesContains(c.Capabilities.Inheritable, "CAP_SYS_PTRACE") &&
+			!slicesContains(c.Capabilities.Ambient, "CAP_SYS_PTRACE")) {
+		return true
+	}
+
+	// Since Linux 4.10 (see bfedb589252c0) user namespaced containers cannot
+	// access /proc/$pid/exe of runc after it joins the namespace (until it
+	// does an exec), regardless of the capability set. This has been
+	// backported to other distribution kernels, but there's no way of checking
+	// this cheaply -- better to be safe than sorry here.
+	linux410 := kernelversion.KernelVersion{Kernel: 4, Major: 10}
+	if ok, err := kernelversion.GreaterEqualThan(linux410); ok && err == nil {
+		if c.Namespaces.Contains(configs.NEWUSER) {
+			return true
+		}
+	}
+
+	// Assume it's unsafe otherwise.
+	return false
+}
+
 func (c *Container) newParentProcess(p *Process) (parentProcess, error) {
 	parentInitPipe, childInitPipe, err := utils.NewSockPair("init")
 	if err != nil {
@@ -454,24 +500,59 @@ func (c *Container) newParentProcess(p *Process) (parentProcess, error) {
 	}
 	logFilePair := filePair{parentLogPipe, childLogPipe}
 
-	cmd := c.commandTemplate(p, childInitPipe, childLogPipe)
-	if !p.Init {
-		return c.newSetnsProcess(p, cmd, messageSockPair, logFilePair)
-	}
-
-	// We only set up fifoFd if we're not doing a `runc exec`. The historic
-	// reason for this is that previously we would pass a dirfd that allowed
-	// for container rootfs escape (and not doing it in `runc exec` avoided
-	// that problem), but we no longer do that. However, there's no need to do
-	// this for `runc exec` so we just keep it this way to be safe.
-	if err := c.includeExecFifo(cmd); err != nil {
-		return nil, fmt.Errorf("unable to setup exec fifo: %w", err)
+	// Make sure we use a new safe copy of /proc/self/exe or the runc-dmz
+	// binary each time this is called, to make sure that if a container
+	// manages to overwrite the file it cannot affect other containers on the
+	// system. For runc, this code will only ever be called once, but
+	// libcontainer users might call this more than once.
+	p.closeClonedExes()
+	var (
+		exePath string
+		// only one of dmzExe or safeExe are used at a time
+		dmzExe, safeExe *os.File
+	)
+	if dmz.IsSelfExeCloned() {
+		// /proc/self/exe is already a cloned binary -- no need to do anything
+		logrus.Debug("skipping binary cloning -- /proc/self/exe is already cloned!")
+		exePath = "/proc/self/exe"
+	} else {
+		var err error
+		if isDmzBinarySafe(c.config) {
+			dmzExe, err = dmz.Binary(c.root)
+			if err == nil {
+				// We can use our own executable without cloning if we are using
+				// runc-dmz.
+				exePath = "/proc/self/exe"
+				p.clonedExes = append(p.clonedExes, dmzExe)
+				logrus.Debug("runc-dmz: using runc-dmz") // used for tests
+			} else if errors.Is(err, dmz.ErrNoDmzBinary) {
+				logrus.Debug("runc-dmz binary not embedded in runc binary, falling back to /proc/self/exe clone")
+			} else if err != nil {
+				return nil, fmt.Errorf("failed to create runc-dmz binary clone: %w", err)
+			}
+		} else {
+			// If the configuration makes it unsafe to use runc-dmz, pretend we
+			// don't have it embedded so we do /proc/self/exe cloning.
+			logrus.Debug("container configuration unsafe for runc-dmz, falling back to /proc/self/exe clone")
+			err = dmz.ErrNoDmzBinary
+		}
+		if errors.Is(err, dmz.ErrNoDmzBinary) {
+			safeExe, err = dmz.CloneSelfExe(c.root)
+			if err != nil {
+				return nil, fmt.Errorf("unable to create safe /proc/self/exe clone for runc init: %w", err)
+			}
+			exePath = "/proc/self/fd/" + strconv.Itoa(int(safeExe.Fd()))
+			p.clonedExes = append(p.clonedExes, safeExe)
+			logrus.Debug("runc-dmz: using /proc/self/exe clone") // used for tests
+		}
+		// Just to make sure we don't run without protection.
+		if dmzExe == nil && safeExe == nil {
+			// This should never happen.
+			return nil, fmt.Errorf("[internal error] attempted to spawn a container with no /proc/self/exe protection")
+		}
 	}
-	return c.newInitProcess(p, cmd, messageSockPair, logFilePair)
-}
 
-func (c *Container) commandTemplate(p *Process, childInitPipe *os.File, childLogPipe *os.File) *exec.Cmd {
-	cmd := exec.Command("/proc/self/exe", "init")
+	cmd := exec.Command(exePath, "init")
 	cmd.Args[0] = os.Args[0]
 	cmd.Stdin = p.Stdin
 	cmd.Stdout = p.Stdout
@@ -494,6 +575,12 @@ func (c *Container) commandTemplate(p *Process, childInitPipe *os.File, childLog
 		"_LIBCONTAINER_STATEDIR="+c.root,
 	)
 
+	if dmzExe != nil {
+		cmd.ExtraFiles = append(cmd.ExtraFiles, dmzExe)
+		cmd.Env = append(cmd.Env,
+			"_LIBCONTAINER_DMZEXEFD="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1))
+	}
+
 	cmd.ExtraFiles = append(cmd.ExtraFiles, childLogPipe)
 	cmd.Env = append(cmd.Env,
 		"_LIBCONTAINER_LOGPIPE="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1))
@@ -501,13 +588,38 @@ func (c *Container) commandTemplate(p *Process, childInitPipe *os.File, childLog
 		cmd.Env = append(cmd.Env, "_LIBCONTAINER_LOGLEVEL="+p.LogLevel)
 	}
 
-	// NOTE: when running a container with no PID namespace and the parent process spawning the container is
-	// PID1 the pdeathsig is being delivered to the container's init process by the kernel for some reason
-	// even with the parent still running.
+	if safeExe != nil {
+		// Due to a Go stdlib bug, we need to add safeExe to the set of
+		// ExtraFiles otherwise it is possible for the stdlib to clobber the fd
+		// during forkAndExecInChild1 and replace it with some other file that
+		// might be malicious. This is less than ideal (because the descriptor
+		// will be non-O_CLOEXEC) however we have protections in "runc init" to
+		// stop us from leaking extra file descriptors.
+		//
+		// See <https://github.com/golang/go/issues/61751>.
+		cmd.ExtraFiles = append(cmd.ExtraFiles, safeExe)
+	}
+
+	// NOTE: when running a container with no PID namespace and the parent
+	//       process spawning the container is PID1 the pdeathsig is being
+	//       delivered to the container's init process by the kernel for some
+	//       reason even with the parent still running.
 	if c.config.ParentDeathSignal > 0 {
 		cmd.SysProcAttr.Pdeathsig = unix.Signal(c.config.ParentDeathSignal)
 	}
-	return cmd
+
+	if p.Init {
+		// We only set up fifoFd if we're not doing a `runc exec`. The historic
+		// reason for this is that previously we would pass a dirfd that allowed
+		// for container rootfs escape (and not doing it in `runc exec` avoided
+		// that problem), but we no longer do that. However, there's no need to do
+		// this for `runc exec` so we just keep it this way to be safe.
+		if err := c.includeExecFifo(cmd); err != nil {
+			return nil, fmt.Errorf("unable to setup exec fifo: %w", err)
+		}
+		return c.newInitProcess(p, cmd, messageSockPair, logFilePair)
+	}
+	return c.newSetnsProcess(p, cmd, messageSockPair, logFilePair)
 }
 
 // shouldSendMountSources says whether the child process must setup bind mounts with
diff --git a/libcontainer/dmz/.gitignore b/libcontainer/dmz/.gitignore
new file mode 100644
index 00000000000..f163ef41c1f
--- /dev/null
+++ b/libcontainer/dmz/.gitignore
@@ -0,0 +1 @@
+/runc-dmz
diff --git a/libcontainer/dmz/Makefile b/libcontainer/dmz/Makefile
new file mode 100644
index 00000000000..24e92db716b
--- /dev/null
+++ b/libcontainer/dmz/Makefile
@@ -0,0 +1,6 @@
+# Get CC values for cross-compilation.
+include ../../cc_platform.mk
+
+# Build the runc-dmz helper from _dmz.c as a statically linked executable,
+# then run $(STRIP) -gs on the result.
+runc-dmz: _dmz.c
+	$(CC) $(CFLAGS) -static -o $@ $^
+	$(STRIP) -gs $@
diff --git a/libcontainer/dmz/_dmz.c b/libcontainer/dmz/_dmz.c
new file mode 100644
index 00000000000..6e91b0f90a9
--- /dev/null
+++ b/libcontainer/dmz/_dmz.c
@@ -0,0 +1,10 @@
+#include <unistd.h>
+
+extern char **environ;
+
+/*
+ * runc-dmz entry point: replace this process with the program named by
+ * argv[0], passing the argument vector and the environment through unchanged.
+ * execve(2) only returns on failure, in which case its return value becomes
+ * the exit status.
+ */
+int main(int argc, char **argv)
+{
+	/* Without an argv[0] there is nothing to execute. */
+	if (argc >= 1)
+		return execve(argv[0], argv, environ);
+	return 127;
+}
diff --git a/libcontainer/dmz/cloned_binary_linux.go b/libcontainer/dmz/cloned_binary_linux.go
new file mode 100644
index 00000000000..db5e18a3260
--- /dev/null
+++ b/libcontainer/dmz/cloned_binary_linux.go
@@ -0,0 +1,241 @@
+package dmz
+
+import (
+	"errors"
+	"fmt"
+	"io"
+	"os"
+	"strconv"
+
+	"github.com/sirupsen/logrus"
+	"golang.org/x/sys/unix"
+
+	"github.com/opencontainers/runc/libcontainer/system"
+)
+
+type SealFunc func(**os.File) error
+
+var (
+	_ SealFunc = sealMemfd
+	_ SealFunc = sealFile
+)
+
+// isExecutable reports whether the file behind f passes an X_OK access check.
+// It first asks faccessat(2) about the descriptor itself (AT_EMPTY_PATH) and,
+// if that yields neither success nor EACCES, repeats the check through the
+// /proc/self/fd path of the descriptor. If neither check is conclusive the
+// file is assumed to be executable, because a later exec will fail anyway if
+// it is not.
+func isExecutable(f *os.File) bool {
+	fd := int(f.Fd())
+	switch unix.Faccessat(fd, "", unix.X_OK, unix.AT_EACCESS|unix.AT_EMPTY_PATH) {
+	case nil:
+		return true
+	case unix.EACCES:
+		return false
+	}
+	switch unix.Access("/proc/self/fd/"+strconv.Itoa(fd), unix.X_OK) {
+	case nil:
+		return true
+	case unix.EACCES:
+		return false
+	}
+	// Cannot check -- assume it's executable (if not, exec will fail).
+	logrus.Debugf("cannot do X_OK check on binary %s -- assuming it's executable", f.Name())
+	return true
+}
+
+// baseMemfdSeals is the original set of memfd seals. sealMemfd always applies
+// all of them, and IsCloned only trusts a file that carries every one.
+const baseMemfdSeals = unix.F_SEAL_SEAL | unix.F_SEAL_SHRINK | unix.F_SEAL_GROW | unix.F_SEAL_WRITE
+
+// sealMemfd is the SealFunc for files created by Memfd. It changes the mode of
+// *f to 0o511 and then adds the memfd seals to its descriptor. Unlike
+// sealFile, it never replaces *f.
+//
+// It returns the Chmod error, or the error from adding baseMemfdSeals wrapped
+// by os.NewSyscallError (which yields nil when the fcntl succeeded). Failures
+// to add the newer, optional seals are ignored.
+func sealMemfd(f **os.File) error {
+	if err := (*f).Chmod(0o511); err != nil {
+		return err
+	}
+	// Try to set the newer memfd sealing flags, but we ignore
+	// errors because they are not needed and we want to continue
+	// to work on older kernels.
+	fd := (*f).Fd()
+	// F_SEAL_FUTURE_WRITE -- Linux 5.1
+	_, _ = unix.FcntlInt(fd, unix.F_ADD_SEALS, unix.F_SEAL_FUTURE_WRITE)
+	// F_SEAL_EXEC -- Linux 6.3
+	const F_SEAL_EXEC = 0x20 //nolint:revive // this matches the unix.* name
+	_, _ = unix.FcntlInt(fd, unix.F_ADD_SEALS, F_SEAL_EXEC)
+	// Apply all original memfd seals. This must come last: baseMemfdSeals
+	// contains F_SEAL_SEAL, which prevents any further seals from being set.
+	_, err := unix.FcntlInt(fd, unix.F_ADD_SEALS, baseMemfdSeals)
+	return os.NewSyscallError("fcntl(F_ADD_SEALS)", err)
+}
+
+// Memfd creates a sealable executable memfd (supported since Linux 3.17).
+func Memfd(comment string) (*os.File, SealFunc, error) {
+	file, err := system.ExecutableMemfd("runc_cloned:"+comment, unix.MFD_ALLOW_SEALING|unix.MFD_CLOEXEC)
+	return file, sealMemfd, err
+}
+
+// sealFile is the SealFunc for the file-backed fallbacks (otmpfile, mktemp).
+// It changes the mode of *f to 0o511, re-opens the file through
+// /proc/self/fd as an O_PATH|O_CLOEXEC descriptor, closes the original
+// descriptor and stores the new one in *f. On error *f is left untouched.
+func sealFile(f **os.File) error {
+	writable := *f
+	if err := writable.Chmod(0o511); err != nil {
+		return err
+	}
+	// The descriptor we hold was opened for writing. Re-opening the same
+	// file as O_PATH gives us a handle without write access, after which
+	// the existing write handle can be dropped.
+	procPath := "/proc/self/fd/" + strconv.Itoa(int(writable.Fd()))
+	sealed, err := os.OpenFile(procPath, unix.O_PATH|unix.O_CLOEXEC, 0)
+	if err != nil {
+		return fmt.Errorf("reopen tmpfile: %w", err)
+	}
+	_ = writable.Close()
+	*f = sealed
+	return nil
+}
+
+// otmpfile opens an anonymous open(O_TMPFILE) file inside dir (supported since
+// Linux 3.11) and returns it together with sealFile, the SealFunc that must be
+// used to seal it. The descriptor is closed again if it cannot be verified as
+// an unlinked file.
+func otmpfile(dir string) (*os.File, SealFunc, error) {
+	const flags = unix.O_TMPFILE | unix.O_RDWR | unix.O_EXCL | unix.O_CLOEXEC
+	tmp, err := os.OpenFile(dir, flags, 0o700)
+	if err != nil {
+		return nil, nil, fmt.Errorf("O_TMPFILE creation failed: %w", err)
+	}
+	// Make sure we actually got an unlinked O_TMPFILE descriptor: a real
+	// O_TMPFILE file has no hard links.
+	var st unix.Stat_t
+	if err := unix.Fstat(int(tmp.Fd()), &st); err != nil {
+		tmp.Close()
+		return nil, nil, fmt.Errorf("cannot fstat O_TMPFILE fd: %w", err)
+	}
+	if st.Nlink != 0 {
+		tmp.Close()
+		return nil, nil, errors.New("O_TMPFILE has non-zero nlink")
+	}
+	return tmp, sealFile, nil
+}
+
+// mktemp creates a classic unlinked file in the given directory.
+//
+// The file is created with os.CreateTemp(dir, "runc.") and immediately
+// removed again; fstat(2) is then used to confirm that no hard links remain.
+// On success it returns the still-open file together with sealFile, the
+// SealFunc that must be used to seal it. If unlinking or verification fails,
+// the descriptor is closed before the error is returned so that it is not
+// leaked (callers retry this function for several directories).
+func mktemp(dir string) (*os.File, SealFunc, error) {
+	file, err := os.CreateTemp(dir, "runc.")
+	if err != nil {
+		return nil, nil, err
+	}
+	// Unlink the file and verify it was unlinked.
+	if err := os.Remove(file.Name()); err != nil {
+		file.Close()
+		return nil, nil, fmt.Errorf("unlinking classic tmpfile: %w", err)
+	}
+	var stat unix.Stat_t
+	if err := unix.Fstat(int(file.Fd()), &stat); err != nil {
+		file.Close()
+		return nil, nil, fmt.Errorf("cannot fstat classic tmpfile: %w", err)
+	} else if stat.Nlink != 0 {
+		file.Close()
+		return nil, nil, fmt.Errorf("classic tmpfile %s has non-zero nlink after unlink", file.Name())
+	}
+	return file, sealFile, nil
+}
+
+// getSealableFile returns a new, empty file that can hold a cloned binary,
+// together with the SealFunc that must be applied once the contents have been
+// written. The strategies are tried in order of preference:
+//
+//  1. an executable memfd (Memfd);
+//  2. an O_TMPFILE file (otmpfile) in the first usable candidate directory;
+//  3. a classic unlinked temporary file (mktemp) in the first usable
+//     candidate directory.
+//
+// comment labels the memfd and tmpDir is the preferred directory for the
+// file-based fallbacks. If every strategy fails, the returned error wraps the
+// reason the last attempt failed.
+func getSealableFile(comment, tmpDir string) (file *os.File, sealFn SealFunc, err error) {
+	// First, try an executable memfd (supported since Linux 3.17).
+	file, sealFn, err = Memfd(comment)
+	if err == nil {
+		return
+	}
+	logrus.Debugf("memfd cloned binary failed, falling back to O_TMPFILE: %v", err)
+
+	// The tmpDir here (c.root) might be mounted noexec, so we need a couple of
+	// fallbacks to try. It's possible that none of these are writable and
+	// executable, in which case there's nothing we can practically do (other
+	// than mounting our own executable tmpfs, which would have its own
+	// issues).
+	tmpDirs := []string{
+		tmpDir,
+		os.TempDir(),
+		"/tmp",
+		".",
+		"/bin",
+		"/",
+	}
+
+	// Try to fallback to O_TMPFILE (supported since Linux 3.11).
+	file, sealFn, err = firstExecutableTmpfile(tmpDirs, otmpfile)
+	if err == nil {
+		return
+	}
+	logrus.Debugf("O_TMPFILE cloned binary failed, falling back to mktemp(): %v", err)
+
+	// Finally, try a classic unlinked temporary file.
+	file, sealFn, err = firstExecutableTmpfile(tmpDirs, mktemp)
+	if err == nil {
+		return
+	}
+	return nil, nil, fmt.Errorf("could not create sealable file for cloned binary: %w", err)
+}
+
+// firstExecutableTmpfile calls create for each directory in dirs, in order,
+// and returns the first file that both could be created and passes
+// isExecutable. Files created in a noexec directory are closed and skipped.
+// If no directory works, the returned error describes why the last candidate
+// was rejected; it is never nil in that case, so callers can safely log or
+// wrap it.
+func firstExecutableTmpfile(dirs []string, create func(string) (*os.File, SealFunc, error)) (*os.File, SealFunc, error) {
+	lastErr := errors.New("no candidate directories")
+	for _, dir := range dirs {
+		file, sealFn, err := create(dir)
+		if err != nil {
+			lastErr = err
+			continue
+		}
+		if !isExecutable(file) {
+			logrus.Debugf("tmpdir %s is noexec -- trying a different tmpdir", dir)
+			file.Close()
+			// Record the rejection so a noexec directory does not leave
+			// the caller with a nil error.
+			lastErr = fmt.Errorf("tmpdir %s is noexec", dir)
+			continue
+		}
+		return file, sealFn, nil
+	}
+	return nil, nil, lastErr
+}
+
+// CloneBinary creates a "sealed" clone of a given binary, which can be used to
+// thwart attempts by the container process to gain access to host binaries
+// through procfs magic-link shenanigans. For more details on why this is
+// necessary, see CVE-2019-5736.
+//
+// The contents are read from src and must amount to exactly size bytes; name
+// labels the clone and tmpDir is the preferred directory if a file-backed
+// clone is needed. On any failure after the destination file has been
+// created, that file is closed and a nil file is returned.
+func CloneBinary(src io.Reader, size int64, name, tmpDir string) (*os.File, error) {
+	logrus.Debugf("cloning %s binary (%d bytes)", name, size)
+	file, sealFn, err := getSealableFile(name, tmpDir)
+	if err != nil {
+		return nil, err
+	}
+	copied, err := system.Copy(file, src)
+	switch {
+	case err != nil:
+		err = fmt.Errorf("copy binary: %w", err)
+	case copied != size:
+		err = fmt.Errorf("copied binary size mismatch: %d != %d", copied, size)
+	default:
+		// sealFn may swap file for a re-opened descriptor.
+		if err = sealFn(&file); err != nil {
+			err = fmt.Errorf("could not seal fd: %w", err)
+		}
+	}
+	if err != nil {
+		file.Close()
+		return nil, err
+	}
+	return file, nil
+}
+
+// IsCloned reports whether exe can be guaranteed to be a safe clone, which
+// here means that F_GET_SEALS succeeds on it and every seal in baseMemfdSeals
+// is set. A file on which F_GET_SEALS fails is never considered cloned.
+func IsCloned(exe *os.File) bool {
+	seals, err := unix.FcntlInt(exe.Fd(), unix.F_GET_SEALS, 0)
+	if err != nil {
+		// /proc/self/exe is probably not a memfd
+		logrus.Debugf("F_GET_SEALS on %s failed: %v", exe.Name(), err)
+		return false
+	}
+	logrus.Debugf("checking %s memfd seals: 0x%x", exe.Name(), seals)
+	// The memfd must have all of the base seals applied, i.e. none may be
+	// missing from the reported set.
+	missing := baseMemfdSeals &^ seals
+	return missing == 0
+}
+
+// CloneSelfExe makes a clone of the current process's binary (through
+// /proc/self/exe). This binary can then be used for "runc init" in order to
+// make sure the container process can never resolve the original runc binary.
+// For more details on why this is necessary, see CVE-2019-5736.
+func CloneSelfExe(tmpDir string) (*os.File, error) {
+	selfExe, err := os.Open("/proc/self/exe")
+	if err != nil {
+		return nil, fmt.Errorf("opening current binary: %w", err)
+	}
+	defer selfExe.Close()
+
+	stat, err := selfExe.Stat()
+	if err != nil {
+		return nil, fmt.Errorf("checking /proc/self/exe size: %w", err)
+	}
+	size := stat.Size()
+
+	return CloneBinary(selfExe, size, "/proc/self/exe", tmpDir)
+}
+
+// IsSelfExeCloned reports whether the running binary (/proc/self/exe) is a
+// clone that can be guaranteed to be safe, as judged by IsCloned: it must be a
+// sealed memfd. Other types of clones cannot be completely verified as safe.
+// If /proc/self/exe cannot be opened the answer is false.
+func IsSelfExeCloned() bool {
+	self, openErr := os.Open("/proc/self/exe")
+	if openErr != nil {
+		logrus.Debugf("open /proc/self/exe failed: %v", openErr)
+		return false
+	}
+	defer self.Close()
+
+	cloned := IsCloned(self)
+	return cloned
+}
diff --git a/libcontainer/dmz/dmz.go b/libcontainer/dmz/dmz.go
new file mode 100644
index 00000000000..9b6b500807c
--- /dev/null
+++ b/libcontainer/dmz/dmz.go
@@ -0,0 +1,9 @@
+package dmz
+
+import (
+	"errors"
+)
+
+// ErrNoDmzBinary is returned by Binary when no runc-dmz binary can be used:
+// none is embedded in the runc program, runc-dmz support was compiled out, or
+// it was disabled with RUNC_DMZ=legacy. Callers should test for it with
+// errors.Is.
+var ErrNoDmzBinary = errors.New("runc-dmz binary not embedded in this program")
diff --git a/libcontainer/dmz/dmz_fallback_linux.go b/libcontainer/dmz/dmz_fallback_linux.go
new file mode 100644
index 00000000000..4f624e048b9
--- /dev/null
+++ b/libcontainer/dmz/dmz_fallback_linux.go
@@ -0,0 +1 @@
+package dmz
diff --git a/libcontainer/dmz/dmz_linux.go b/libcontainer/dmz/dmz_linux.go
new file mode 100644
index 00000000000..12f9709a269
--- /dev/null
+++ b/libcontainer/dmz/dmz_linux.go
@@ -0,0 +1,48 @@
+//go:build !runc_nodmz
+// +build !runc_nodmz
+
+package dmz
+
+import (
+	"bytes"
+	"debug/elf"
+	_ "embed"
+	"os"
+
+	"github.com/sirupsen/logrus"
+)
+
+// Try to build the runc-dmz binary. If it fails, replace it with an empty file
+// (this will trigger us to fall back to a clone of /proc/self/exe). Yeah, this
+// is a bit ugly but it makes sure that weird cross-compilation setups don't
+// break because of runc-dmz.
+//
+//go:generate sh -c "make -B runc-dmz || echo -n >runc-dmz"
+//go:embed runc-dmz
+var runcDmzBinary []byte
+
+// Binary returns a cloned copy (see CloneBinary) of a very minimal C program
+// that just does an execve() of its arguments. This is used in the final
+// execution step of the container execution as an intermediate process before
+// the container process is execve'd. This allows for protection against
+// CVE-2019-5736 without requiring a complete copy of the runc binary. Each
+// call to Binary will return a new copy.
+//
+// tmpDir is handed to CloneBinary as the preferred directory for a
+// file-backed clone.
+//
+// Binary returns ErrNoDmzBinary as the error if the embedded runc-dmz data
+// does not start with the ELF magic (for example because it is empty), or if
+// the RUNC_DMZ environment variable is set to "legacy".
+func Binary(tmpDir string) (*os.File, error) {
+	// Verify that our embedded binary has a standard ELF header.
+	if !bytes.HasPrefix(runcDmzBinary, []byte(elf.ELFMAG)) {
+		if len(runcDmzBinary) != 0 {
+			logrus.Infof("misconfigured build: embedded runc-dmz binary is non-empty but is missing a proper ELF header")
+		}
+		return nil, ErrNoDmzBinary
+	}
+	// Setting RUNC_DMZ=legacy disables this dmz method.
+	if os.Getenv("RUNC_DMZ") == "legacy" {
+		logrus.Debugf("RUNC_DMZ=legacy set -- switching back to classic /proc/self/exe cloning")
+		return nil, ErrNoDmzBinary
+	}
+	payload := bytes.NewBuffer(runcDmzBinary)
+	return CloneBinary(payload, int64(payload.Len()), "runc-dmz", tmpDir)
+}
diff --git a/libcontainer/dmz/dmz_unsupported.go b/libcontainer/dmz/dmz_unsupported.go
new file mode 100644
index 00000000000..2ba67270495
--- /dev/null
+++ b/libcontainer/dmz/dmz_unsupported.go
@@ -0,0 +1,12 @@
+//go:build !linux || runc_nodmz
+// +build !linux runc_nodmz
+
+package dmz
+
+import (
+	"os"
+)
+
+// Binary is the stub used when runc-dmz support is compiled out (non-Linux
+// builds, or builds with the runc_nodmz tag). It ignores its argument and
+// always returns ErrNoDmzBinary.
+func Binary(_ string) (*os.File, error) {
+	return nil, ErrNoDmzBinary
+}
diff --git a/libcontainer/init_linux.go b/libcontainer/init_linux.go
index 732f64dc660..a24be276878 100644
--- a/libcontainer/init_linux.go
+++ b/libcontainer/init_linux.go
@@ -182,6 +182,17 @@ func startInitialization() (retErr error) {
 		return err
 	}
 
+	// Get runc-dmz fds.
+	var dmzExe *os.File
+	if dmzFdStr := os.Getenv("_LIBCONTAINER_DMZEXEFD"); dmzFdStr != "" {
+		dmzFd, err := strconv.Atoi(dmzFdStr)
+		if err != nil {
+			return fmt.Errorf("unable to convert _LIBCONTAINER_DMZEXEFD: %w", err)
+		}
+		unix.CloseOnExec(dmzFd)
+		dmzExe = os.NewFile(uintptr(dmzFd), "runc-dmz")
+	}
+
 	// clear the current process's environment to clean any libcontainer
 	// specific env vars.
 	os.Clearenv()
@@ -197,10 +208,10 @@ func startInitialization() (retErr error) {
 	}()
 
 	// If init succeeds, it will not return, hence none of the defers will be called.
-	return containerInit(it, pipe, consoleSocket, fifofd, logFD, mountFds{sourceFds: mountSrcFds, idmapFds: idmapFds})
+	return containerInit(it, pipe, consoleSocket, fifofd, logFD, dmzExe, mountFds{sourceFds: mountSrcFds, idmapFds: idmapFds})
 }
 
-func containerInit(t initType, pipe *os.File, consoleSocket *os.File, fifoFd, logFd int, mountFds mountFds) error {
+func containerInit(t initType, pipe *os.File, consoleSocket *os.File, fifoFd, logFd int, dmzExe *os.File, mountFds mountFds) error {
 	var config *initConfig
 	if err := json.NewDecoder(pipe).Decode(&config); err != nil {
 		return err
@@ -208,6 +219,7 @@ func containerInit(t initType, pipe *os.File, consoleSocket *os.File, fifoFd, lo
 	if err := populateProcessEnvironment(config.Env); err != nil {
 		return err
 	}
+
 	switch t {
 	case initSetns:
 		// mount and idmap fds must be nil in this case. We don't mount while doing runc exec.
@@ -220,6 +232,7 @@ func containerInit(t initType, pipe *os.File, consoleSocket *os.File, fifoFd, lo
 			consoleSocket: consoleSocket,
 			config:        config,
 			logFd:         logFd,
+			dmzExe:        dmzExe,
 		}
 		return i.Init()
 	case initStandard:
@@ -230,6 +243,7 @@ func containerInit(t initType, pipe *os.File, consoleSocket *os.File, fifoFd, lo
 			config:        config,
 			fifoFd:        fifoFd,
 			logFd:         logFd,
+			dmzExe:        dmzExe,
 			mountFds:      mountFds,
 		}
 		return i.Init()
diff --git a/libcontainer/nsenter/cloned_binary.c b/libcontainer/nsenter/cloned_binary.c
deleted file mode 100644
index a7f992fddd7..00000000000
--- a/libcontainer/nsenter/cloned_binary.c
+++ /dev/null
@@ -1,567 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0 OR LGPL-2.1-or-later
-/*
- * Copyright (C) 2019 Aleksa Sarai <cyphar@cyphar.com>
- * Copyright (C) 2019 SUSE LLC
- *
- * This work is dual licensed under the following licenses. You may use,
- * redistribute, and/or modify the work under the conditions of either (or
- * both) licenses.
- *
- * === Apache-2.0 ===
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- *
- * === LGPL-2.1-or-later ===
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library. If not, see
- * <https://www.gnu.org/licenses/>.
- *
- */
-
-#define _GNU_SOURCE
-#include <unistd.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <stdbool.h>
-#include <string.h>
-#include <limits.h>
-#include <fcntl.h>
-#include <errno.h>
-
-#include <sched.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <sys/statfs.h>
-#include <sys/vfs.h>
-#include <sys/mman.h>
-#include <sys/mount.h>
-#include <sys/sendfile.h>
-#include <sys/socket.h>
-#include <sys/syscall.h>
-#include <sys/wait.h>
-
-#include "ipc.h"
-#include "log.h"
-
-/* Use our own wrapper for memfd_create. */
-#ifndef SYS_memfd_create
-#  ifdef __NR_memfd_create
-#    define SYS_memfd_create __NR_memfd_create
-#  else
-/* These values come from <https://fedora.juszkiewicz.com.pl/syscalls.html>. */
-#    warning "libc is outdated -- using hard-coded SYS_memfd_create"
-#    if defined(__x86_64__)
-#      define SYS_memfd_create 319
-#    elif defined(__i386__)
-#      define SYS_memfd_create 356
-#    elif defined(__ia64__)
-#      define SYS_memfd_create 1340
-#    elif defined(__arm__)
-#      define SYS_memfd_create 385
-#    elif defined(__aarch64__)
-#      define SYS_memfd_create 279
-#    elif defined(__ppc__) || defined(__PPC64__) || defined(__powerpc64__)
-#      define SYS_memfd_create 360
-#    elif defined(__s390__) || defined(__s390x__)
-#      define SYS_memfd_create 350
-#    else
-#      warning "unknown architecture -- cannot hard-code SYS_memfd_create"
-#    endif
-#  endif
-#endif
-
-/* memfd_create(2) flags -- copied from <linux/memfd.h>. */
-#ifndef MFD_CLOEXEC
-#  define MFD_CLOEXEC       0x0001U
-#  define MFD_ALLOW_SEALING 0x0002U
-#endif
-#ifndef MFD_EXEC
-#  define MFD_EXEC          0x0010U
-#endif
-
-int memfd_create(const char *name, unsigned int flags)
-{
-#ifdef SYS_memfd_create
-	return syscall(SYS_memfd_create, name, flags);
-#else
-	errno = ENOSYS;
-	return -1;
-#endif
-}
-
-/* This comes directly from <linux/fcntl.h>. */
-#ifndef F_LINUX_SPECIFIC_BASE
-#  define F_LINUX_SPECIFIC_BASE 1024
-#endif
-#ifndef F_ADD_SEALS
-#  define F_ADD_SEALS (F_LINUX_SPECIFIC_BASE + 9)
-#  define F_GET_SEALS (F_LINUX_SPECIFIC_BASE + 10)
-#endif
-#ifndef F_SEAL_SEAL
-#  define F_SEAL_SEAL          0x0001	/* prevent further seals from being set */
-#  define F_SEAL_SHRINK        0x0002	/* prevent file from shrinking */
-#  define F_SEAL_GROW          0x0004	/* prevent file from growing */
-#  define F_SEAL_WRITE         0x0008	/* prevent writes */
-#endif
-#ifndef F_SEAL_FUTURE_WRITE
-#  define F_SEAL_FUTURE_WRITE  0x0010	/* prevent future writes while mapped */
-#endif
-#ifndef F_SEAL_EXEC
-#  define F_SEAL_EXEC          0x0020	/* prevent chmod modifying exec bits */
-#endif
-
-#define CLONED_BINARY_ENV "_LIBCONTAINER_CLONED_BINARY"
-#define RUNC_MEMFD_COMMENT "runc_cloned:/proc/self/exe"
-/*
- * There are newer memfd seals (such as F_SEAL_FUTURE_WRITE and F_SEAL_EXEC),
- * which we use opportunistically. However, this set is the original set of
- * memfd seals, and we require them all to be set to trust our /proc/self/exe
- * if it is a memfd.
- */
-#define RUNC_MEMFD_MIN_SEALS \
-	(F_SEAL_SEAL | F_SEAL_SHRINK | F_SEAL_GROW | F_SEAL_WRITE)
-
-static void *must_realloc(void *ptr, size_t size)
-{
-	void *old = ptr;
-	do {
-		ptr = realloc(old, size);
-	} while (!ptr);
-	return ptr;
-}
-
-/*
- * Verify whether we are currently in a self-cloned program (namely, is
- * /proc/self/exe a memfd). F_GET_SEALS will only succeed for memfds (or rather
- * for shmem files), and we want to be sure it's actually sealed.
- */
-static int is_self_cloned(void)
-{
-	int fd, seals = 0, is_cloned = false;
-	struct stat statbuf = { };
-	struct statfs fsbuf = { };
-
-	fd = open("/proc/self/exe", O_RDONLY | O_CLOEXEC);
-	if (fd < 0) {
-		write_log(ERROR, "cannot open runc binary for reading: open /proc/self/exe: %m");
-		return -ENOTRECOVERABLE;
-	}
-
-	/*
-	 * Is the binary a fully-sealed memfd? We don't need CLONED_BINARY_ENV for
-	 * this, because you cannot write to a sealed memfd no matter what.
-	 */
-	seals = fcntl(fd, F_GET_SEALS);
-	if (seals >= 0) {
-		write_log(DEBUG, "checking /proc/self/exe memfd seals: 0x%x", seals);
-		is_cloned = (seals & RUNC_MEMFD_MIN_SEALS) == RUNC_MEMFD_MIN_SEALS;
-		if (is_cloned)
-			goto out;
-	}
-
-	/*
-	 * All other forms require CLONED_BINARY_ENV, since they are potentially
-	 * writeable (or we can't tell if they're fully safe) and thus we must
-	 * check the environment as an extra layer of defence.
-	 */
-	if (!getenv(CLONED_BINARY_ENV)) {
-		is_cloned = false;
-		goto out;
-	}
-
-	/*
-	 * Is the binary on a read-only filesystem? We can't detect bind-mounts in
-	 * particular (in-kernel they are identical to regular mounts) but we can
-	 * at least be sure that it's read-only. In addition, to make sure that
-	 * it's *our* bind-mount we check CLONED_BINARY_ENV.
-	 */
-	if (fstatfs(fd, &fsbuf) >= 0)
-		is_cloned |= (fsbuf.f_flags & MS_RDONLY);
-
-	/*
-	 * Okay, we're a tmpfile -- or we're currently running on RHEL <=7.6
-	 * which appears to have a borked backport of F_GET_SEALS. Either way,
-	 * having a file which has no hardlinks indicates that we aren't using
-	 * a host-side "runc" binary and this is something that a container
-	 * cannot fake (because unlinking requires being able to resolve the
-	 * path that you want to unlink).
-	 */
-	if (fstat(fd, &statbuf) >= 0)
-		is_cloned |= (statbuf.st_nlink == 0);
-
-out:
-	close(fd);
-	return is_cloned;
-}
-
-/* Read a given file into a new buffer, and providing the length. */
-static char *read_file(char *path, size_t *length)
-{
-	int fd;
-	char buf[4096], *copy = NULL;
-
-	if (!length)
-		return NULL;
-
-	fd = open(path, O_RDONLY | O_CLOEXEC);
-	if (fd < 0)
-		return NULL;
-
-	*length = 0;
-	for (;;) {
-		ssize_t n;
-
-		n = read(fd, buf, sizeof(buf));
-		if (n < 0)
-			goto error;
-		if (!n)
-			break;
-
-		copy = must_realloc(copy, (*length + n) * sizeof(*copy));
-		memcpy(copy + *length, buf, n);
-		*length += n;
-	}
-	close(fd);
-	return copy;
-
-error:
-	close(fd);
-	free(copy);
-	return NULL;
-}
-
-/*
- * A poor-man's version of "xargs -0". Basically parses a given block of
- * NUL-delimited data, within the given length and adds a pointer to each entry
- * to the array of pointers.
- */
-static int parse_xargs(char *data, int data_length, char ***output)
-{
-	int num = 0;
-	char *cur = data;
-
-	if (!data || *output != NULL)
-		return -1;
-
-	while (cur < data + data_length) {
-		num++;
-		*output = must_realloc(*output, (num + 1) * sizeof(**output));
-		(*output)[num - 1] = cur;
-		cur += strlen(cur) + 1;
-	}
-	(*output)[num] = NULL;
-	return num;
-}
-
-/*
- * "Parse" out argv from /proc/self/cmdline.
- * This is necessary because we are running in a context where we don't have a
- * main() that we can just get the arguments from.
- */
-static int fetchve(char ***argv)
-{
-	char *cmdline = NULL;
-	size_t cmdline_size;
-
-	cmdline = read_file("/proc/self/cmdline", &cmdline_size);
-	if (!cmdline)
-		goto error;
-
-	if (parse_xargs(cmdline, cmdline_size, argv) <= 0)
-		goto error;
-
-	return 0;
-
-error:
-	free(cmdline);
-	return -EINVAL;
-}
-
-enum {
-	EFD_NONE = 0,
-	EFD_MEMFD,
-	EFD_FILE,
-};
-
-/*
- * This comes from <linux/fcntl.h>. We can't hard-code __O_TMPFILE because it
- * changes depending on the architecture. If we don't have O_TMPFILE we always
- * have the mkostemp(3) fallback.
- */
-#ifndef O_TMPFILE
-#  if defined(__O_TMPFILE) && defined(O_DIRECTORY)
-#    define O_TMPFILE (__O_TMPFILE | O_DIRECTORY)
-#  endif
-#endif
-
-static inline bool is_memfd_unsupported_error(int err)
-{
-	/*
-	 * - ENOSYS is obviously an "unsupported" error.
-	 *
-	 * - EINVAL could be hit if MFD_EXEC is not supported (pre-6.3 kernel),
-	 *   but it can also be hit if vm.memfd_noexec=2 (in kernels without
-	 *   [1] applied) and the flags does not contain MFD_EXEC. However,
-	 *   there was a bug in the original 6.3 implementation of
-	 *   vm.memfd_noexec=2, which meant that MFD_EXEC would work even in
-	 *   the "strict" mode. Because we try MFD_EXEC first, we won't get
-	 *   EINVAL in the vm.memfd_noexec=2 case (which means we don't need to
-	 *   figure out whether to log the message about memfd_create).
-	 *
-	 * - EACCES is returned in kernels that contain [1] in the
-	 *   vm.memfd_noexec=2 case.
-	 *
-	 * At time of writing, [1] is not in Linus's tree and it't not clear if
-	 * it will be backported to stable, so what exact versions apply here
-	 * is unclear. But the bug is present in 6.3-6.5 at the very least.
-	 *
-	 * [1]: https://lore.kernel.org/all/20230705063315.3680666-2-jeffxu@google.com/
-	 */
-	if (err == EACCES)
-		write_log(INFO,
-			  "memfd_create(MFD_EXEC) failed, possibly due to vm.memfd_noexec=2 -- falling back to less secure O_TMPFILE");
-	return err == ENOSYS || err == EINVAL || err == EACCES;
-}
-
-static int make_execfd(int *fdtype)
-{
-	int fd = -1;
-	char template[PATH_MAX] = { 0 };
-	char *prefix = getenv("_LIBCONTAINER_STATEDIR");
-
-	if (!prefix || *prefix != '/')
-		prefix = "/tmp";
-	if (snprintf(template, sizeof(template), "%s/runc.XXXXXX", prefix) < 0)
-		return -1;
-
-	/*
-	 * Now try memfd, it's much nicer than actually creating a file in STATEDIR
-	 * since it's easily detected thanks to sealing and also doesn't require
-	 * assumptions about STATEDIR.
-	 */
-	*fdtype = EFD_MEMFD;
-	/*
-	 * On newer kernels we should set MFD_EXEC to indicate we need +x
-	 * permissions. Otherwise an admin with vm.memfd_noexec=1 would subtly
-	 * break runc. vm.memfd_noexec=2 is a little bit more complicated, see the
-	 * comment in is_memfd_unsupported_error() -- the upshot is that doing it
-	 * this way works, but only because of two overlapping bugs in the sysctl
-	 * implementation.
-	 */
-	fd = memfd_create(RUNC_MEMFD_COMMENT, MFD_EXEC | MFD_CLOEXEC | MFD_ALLOW_SEALING);
-	if (fd < 0 && is_memfd_unsupported_error(errno))
-		fd = memfd_create(RUNC_MEMFD_COMMENT, MFD_CLOEXEC | MFD_ALLOW_SEALING);
-	if (fd >= 0)
-		return fd;
-	if (!is_memfd_unsupported_error(errno))
-		goto error;
-
-#ifdef O_TMPFILE
-	/*
-	 * Try O_TMPFILE to avoid races where someone might snatch our file. Note
-	 * that O_EXCL isn't actually a security measure here (since you can just
-	 * fd re-open it and clear O_EXCL).
-	 */
-	*fdtype = EFD_FILE;
-	fd = open(prefix, O_TMPFILE | O_EXCL | O_RDWR | O_CLOEXEC, 0700);
-	if (fd >= 0) {
-		struct stat statbuf = { };
-		bool working_otmpfile = false;
-
-		/*
-		 * open(2) ignores unknown O_* flags -- yeah, I was surprised when I
-		 * found this out too. As a result we can't check for EINVAL. However,
-		 * if we get nlink != 0 (or EISDIR) then we know that this kernel
-		 * doesn't support O_TMPFILE.
-		 */
-		if (fstat(fd, &statbuf) >= 0)
-			working_otmpfile = (statbuf.st_nlink == 0);
-
-		if (working_otmpfile)
-			return fd;
-
-		/* Pretend that we got EISDIR since O_TMPFILE failed. */
-		close(fd);
-		errno = EISDIR;
-	}
-	if (errno != EISDIR)
-		goto error;
-#endif /* defined(O_TMPFILE) */
-
-	/*
-	 * Our final option is to create a temporary file the old-school way, and
-	 * then unlink it so that nothing else sees it by accident.
-	 */
-	*fdtype = EFD_FILE;
-	fd = mkostemp(template, O_CLOEXEC);
-	if (fd >= 0) {
-		if (unlink(template) >= 0)
-			return fd;
-		close(fd);
-	}
-
-error:
-	*fdtype = EFD_NONE;
-	return -1;
-}
-
-static int seal_execfd(int *fd, int fdtype)
-{
-	switch (fdtype) {
-	case EFD_MEMFD:{
-			/*
-			 * Try to seal with newer seals, but we ignore errors because older
-			 * kernels don't support some of them. For container security only
-			 * RUNC_MEMFD_MIN_SEALS are strictly required, but the rest are
-			 * nice-to-haves. We apply RUNC_MEMFD_MIN_SEALS at the end because it
-			 * contains F_SEAL_SEAL.
-			 */
-			int __attribute__((unused)) _err1 = fcntl(*fd, F_ADD_SEALS, F_SEAL_FUTURE_WRITE);	// Linux 5.1
-			int __attribute__((unused)) _err2 = fcntl(*fd, F_ADD_SEALS, F_SEAL_EXEC);	// Linux 6.3
-			return fcntl(*fd, F_ADD_SEALS, RUNC_MEMFD_MIN_SEALS);
-		}
-	case EFD_FILE:{
-			/* Need to re-open our pseudo-memfd as an O_PATH to avoid execve(2) giving -ETXTBSY. */
-			int newfd;
-			char fdpath[PATH_MAX] = { 0 };
-
-			if (fchmod(*fd, 0100) < 0)
-				return -1;
-
-			if (snprintf(fdpath, sizeof(fdpath), "/proc/self/fd/%d", *fd) < 0)
-				return -1;
-
-			newfd = open(fdpath, O_PATH | O_CLOEXEC);
-			if (newfd < 0)
-				return -1;
-
-			close(*fd);
-			*fd = newfd;
-			return 0;
-		}
-	default:
-		break;
-	}
-	return -1;
-}
-
-static ssize_t fd_to_fd(int outfd, int infd)
-{
-	ssize_t total = 0;
-	char buffer[4096];
-
-	for (;;) {
-		ssize_t nread, nwritten = 0;
-
-		nread = read(infd, buffer, sizeof(buffer));
-		if (nread < 0)
-			return -1;
-		if (!nread)
-			break;
-
-		do {
-			ssize_t n = write(outfd, buffer + nwritten, nread - nwritten);
-			if (n < 0)
-				return -1;
-			nwritten += n;
-		} while (nwritten < nread);
-
-		total += nwritten;
-	}
-
-	return total;
-}
-
-static int clone_binary(void)
-{
-	int binfd, execfd;
-	struct stat statbuf = { };
-	size_t sent = 0;
-	int fdtype = EFD_NONE;
-
-	execfd = make_execfd(&fdtype);
-	if (execfd < 0 || fdtype == EFD_NONE)
-		return -ENOTRECOVERABLE;
-
-	binfd = open("/proc/self/exe", O_RDONLY | O_CLOEXEC);
-	if (binfd < 0)
-		goto error;
-
-	if (fstat(binfd, &statbuf) < 0)
-		goto error_binfd;
-
-	while (sent < statbuf.st_size) {
-		int n = sendfile(execfd, binfd, NULL, statbuf.st_size - sent);
-		if (n < 0) {
-			/* sendfile can fail so we fallback to a dumb user-space copy. */
-			n = fd_to_fd(execfd, binfd);
-			if (n < 0)
-				goto error_binfd;
-		}
-		sent += n;
-	}
-	close(binfd);
-	if (sent != statbuf.st_size)
-		goto error;
-
-	if (seal_execfd(&execfd, fdtype) < 0)
-		goto error;
-
-	return execfd;
-
-error_binfd:
-	close(binfd);
-error:
-	close(execfd);
-	return -EIO;
-}
-
-/* Get cheap access to the environment. */
-extern char **environ;
-
-int ensure_cloned_binary(void)
-{
-	int execfd;
-	char **argv = NULL;
-
-	/* Check that we're not self-cloned, and if we are then bail. */
-	int cloned = is_self_cloned();
-	if (cloned > 0 || cloned == -ENOTRECOVERABLE)
-		return cloned;
-
-	if (fetchve(&argv) < 0)
-		return -EINVAL;
-
-	execfd = clone_binary();
-	if (execfd < 0)
-		return -EIO;
-
-	if (putenv(CLONED_BINARY_ENV "=1"))
-		goto error;
-
-	fexecve(execfd, argv, environ);
-error:
-	close(execfd);
-	return -ENOEXEC;
-}
diff --git a/libcontainer/nsenter/nsexec.c b/libcontainer/nsenter/nsexec.c
index 17e0468c6af..9b10b232528 100644
--- a/libcontainer/nsenter/nsexec.c
+++ b/libcontainer/nsenter/nsexec.c
@@ -536,9 +536,6 @@ void join_namespaces(char *nslist)
 	free(namespaces);
 }
 
-/* Defined in cloned_binary.c. */
-extern int ensure_cloned_binary(void);
-
 static inline int sane_kill(pid_t pid, int signum)
 {
 	if (pid > 0)
@@ -791,14 +788,6 @@ void nsexec(void)
 		return;
 	}
 
-	/*
-	 * We need to re-exec if we are not in a cloned binary. This is necessary
-	 * to ensure that containers won't be able to access the host binary
-	 * through /proc/self/exe. See CVE-2019-5736.
-	 */
-	if (ensure_cloned_binary() < 0)
-		bail("could not ensure we are a cloned binary");
-
 	/*
 	 * Inform the parent we're past initial setup.
 	 * For the other side of this, see initWaiter.
diff --git a/libcontainer/process.go b/libcontainer/process.go
index 4de4a9e75c2..d2c7bfcda36 100644
--- a/libcontainer/process.go
+++ b/libcontainer/process.go
@@ -49,6 +49,9 @@ type Process struct {
 	// ExtraFiles specifies additional open files to be inherited by the container
 	ExtraFiles []*os.File
 
+	// open handles to cloned binaries -- see dmz.ClonedBinary for more details
+	clonedExes []*os.File
+
 	// Initial sizings for the console
 	ConsoleWidth  uint16
 	ConsoleHeight uint16
@@ -121,6 +124,15 @@ func (p Process) Signal(sig os.Signal) error {
 	return p.ops.signal(sig)
 }
 
+// closeClonedExes closes every cloned-binary handle held by the Process and
+// then drops the references. Errors from Close are deliberately ignored.
+func (p *Process) closeClonedExes() {
+	for i := range p.clonedExes {
+		_ = p.clonedExes[i].Close()
+	}
+	p.clonedExes = nil
+}
+
 // IO holds the process's STDIO
 type IO struct {
 	Stdin  io.WriteCloser
diff --git a/libcontainer/setns_init_linux.go b/libcontainer/setns_init_linux.go
index 40a47a2e95c..7709219300b 100644
--- a/libcontainer/setns_init_linux.go
+++ b/libcontainer/setns_init_linux.go
@@ -4,6 +4,7 @@ import (
 	"errors"
 	"fmt"
 	"os"
+	"os/exec"
 	"strconv"
 
 	"github.com/opencontainers/selinux/go-selinux"
@@ -23,6 +24,7 @@ type linuxSetnsInit struct {
 	consoleSocket *os.File
 	config        *initConfig
 	logFd         int
+	dmzExe        *os.File
 }
 
 func (l *linuxSetnsInit) getSessionRingName() string {
@@ -85,6 +87,18 @@ func (l *linuxSetnsInit) Init() error {
 	if err := apparmor.ApplyProfile(l.config.AppArmorProfile); err != nil {
 		return err
 	}
+	// Check for the arg early to make sure it exists.
+	name, err := exec.LookPath(l.config.Args[0])
+	if err != nil {
+		return err
+	}
+	// exec.LookPath in Go < 1.20 might return no error for an executable
+	// residing on a file system mounted with noexec flag, so perform this
+	// extra check now while we can still return a proper error.
+	// TODO: remove this once go < 1.20 is not supported.
+	if err := eaccess(name); err != nil {
+		return &os.PathError{Op: "eaccess", Path: name, Err: err}
+	}
 	// Set seccomp as close to execve as possible, so as few syscalls take
 	// place afterward (reducing the amount of syscalls that users need to
 	// enable in their seccomp profiles).
@@ -98,10 +112,15 @@ func (l *linuxSetnsInit) Init() error {
 		}
 	}
 	logrus.Debugf("setns_init: about to exec")
+
 	// Close the log pipe fd so the parent's ForwardLogs can exit.
 	if err := unix.Close(l.logFd); err != nil {
 		return &os.PathError{Op: "close log pipe", Path: "fd " + strconv.Itoa(l.logFd), Err: err}
 	}
 
-	return system.Execv(l.config.Args[0], l.config.Args[0:], os.Environ())
+	if l.dmzExe != nil {
+		l.config.Args[0] = name
+		return system.Fexecve(l.dmzExe.Fd(), l.config.Args, os.Environ())
+	}
+	return system.Exec(name, l.config.Args, os.Environ())
 }
diff --git a/libcontainer/standard_init_linux.go b/libcontainer/standard_init_linux.go
index c64173ecfc3..4eb3d8db435 100644
--- a/libcontainer/standard_init_linux.go
+++ b/libcontainer/standard_init_linux.go
@@ -25,6 +25,7 @@ type linuxStandardInit struct {
 	parentPid     int
 	fifoFd        int
 	logFd         int
+	dmzExe        *os.File
 	mountFds      mountFds
 	config        *initConfig
 }
@@ -262,5 +263,9 @@ func (l *linuxStandardInit) Init() error {
 		return err
 	}
 
-	return system.Exec(name, l.config.Args[0:], os.Environ())
+	if l.dmzExe != nil {
+		l.config.Args[0] = name
+		return system.Fexecve(l.dmzExe.Fd(), l.config.Args, os.Environ())
+	}
+	return system.Exec(name, l.config.Args, os.Environ())
 }
diff --git a/libcontainer/system/kernelversion/kernel_linux.go b/libcontainer/system/kernelversion/kernel_linux.go
new file mode 100644
index 00000000000..ca5d4130d0c
--- /dev/null
+++ b/libcontainer/system/kernelversion/kernel_linux.go
@@ -0,0 +1,94 @@
+/*
+   Copyright The containerd Authors.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+
+   File copied and customized based on
+   https://github.com/moby/moby/tree/v20.10.14/profiles/seccomp/kernel_linux.go
+
+   File copied from
+   https://github.com/containerd/containerd/blob/v1.7.5/contrib/seccomp/kernelversion/kernel_linux.go
+*/
+
+package kernelversion
+
+import (
+	"bytes"
+	"fmt"
+	"sync"
+
+	"golang.org/x/sys/unix"
+)
+
+// KernelVersion holds the two leading numeric components of a kernel release.
+type KernelVersion struct {
+	Kernel uint64 // Version of the Kernel (i.e., the "4" in "4.1.2-generic")
+	Major  uint64 // Major revision of the Kernel (i.e., the "1" in "4.1.2-generic")
+}
+
+func (k *KernelVersion) String() string {
+	if k.Kernel == 0 && k.Major == 0 {
+		return "" // an all-zero version has no textual form
+	}
+	return fmt.Sprintf("%d.%d", k.Kernel, k.Major)
+}
+
+var (
+	currentKernelVersion *KernelVersion // result cached by the one-time lookup
+	kernelVersionError   error          // error cached by the one-time lookup
+	once                 sync.Once      // ensures the lookup runs only once
+)
+
+// getKernelVersion returns the running kernel's version, caching the result or error of the first lookup.
+func getKernelVersion() (*KernelVersion, error) {
+	once.Do(func() {
+		var uts unix.Utsname
+		if kernelVersionError = unix.Uname(&uts); kernelVersionError != nil {
+			return // record the failure; otherwise callers would receive (nil, nil)
+		}
+		// Cut the release at its first NUL byte so only the version text is parsed.
+		currentKernelVersion, kernelVersionError = parseRelease(string(uts.Release[:bytes.IndexByte(uts.Release[:], 0)]))
+	})
+	return currentKernelVersion, kernelVersionError
+}
+
+// parseRelease extracts the leading "<kernel>.<major>" pair from a release string.
+func parseRelease(release string) (*KernelVersion, error) {
+	version := &KernelVersion{}
+
+	// Only the first two numeric fields are read; whatever follows them is
+	// ignored, so both "3.12.25-gentoo" and "3.12-1-amd64" yield 3.12.
+	if _, err := fmt.Sscanf(release, "%d.%d", &version.Kernel, &version.Major); err != nil {
+		return nil, fmt.Errorf("failed to parse kernel version %q: %w", release, err)
+	}
+
+	return version, nil
+}
+
+// GreaterEqualThan reports whether the host's kernel version is greater than
+// or equal to minVersion. Only the "kernel version" and "major revision" are
+// compared, so 3.12.25-gentoo and 3.12-1-amd64 both count as 3.12. An error
+// is returned when the host's version cannot be determined.
+func GreaterEqualThan(minVersion KernelVersion) (bool, error) {
+	host, err := getKernelVersion()
+	if err != nil {
+		return false, err
+	}
+	switch {
+	case host.Kernel != minVersion.Kernel:
+		// The leading component differs, so it alone decides.
+		return host.Kernel > minVersion.Kernel, nil
+	default:
+		return host.Major >= minVersion.Major, nil
+	}
+}
diff --git a/libcontainer/system/kernelversion/kernel_linux_test.go b/libcontainer/system/kernelversion/kernel_linux_test.go
new file mode 100644
index 00000000000..a18f1f2226f
--- /dev/null
+++ b/libcontainer/system/kernelversion/kernel_linux_test.go
@@ -0,0 +1,140 @@
+/*
+   Copyright The containerd Authors.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+
+   File copied and customized based on
+   https://github.com/moby/moby/tree/v20.10.14/profiles/seccomp/kernel_linux_test.go
+*/
+
+package kernelversion
+
+import (
+	"fmt"
+	"testing"
+)
+
+// TestGetKernelVersion checks that the host lookup yields a non-zero kernel version.
+func TestGetKernelVersion(t *testing.T) {
+	version, err := getKernelVersion()
+	switch {
+	case err != nil:
+		t.Fatal(err)
+	case version == nil:
+		t.Fatal("version is nil")
+	case version.Kernel == 0:
+		t.Fatal("no kernel version")
+	}
+}
+
+// TestParseRelease checks accepted release strings and the exact error text of rejected ones.
+func TestParseRelease(t *testing.T) {
+	tests := []struct {
+		in          string
+		out         KernelVersion
+		expectedErr error
+	}{
+		{in: "3.8", out: KernelVersion{Kernel: 3, Major: 8}},
+		{in: "3.8.0", out: KernelVersion{Kernel: 3, Major: 8}},
+		{in: "3.8.0-19-generic", out: KernelVersion{Kernel: 3, Major: 8}},
+		{in: "3.4.54.longterm-1", out: KernelVersion{Kernel: 3, Major: 4}},
+		{in: "3.10.0-862.2.3.el7.x86_64", out: KernelVersion{Kernel: 3, Major: 10}},
+		{in: "3.12.8tag", out: KernelVersion{Kernel: 3, Major: 12}},
+		{in: "3.12-1-amd64", out: KernelVersion{Kernel: 3, Major: 12}},
+		{in: "3.12foobar", out: KernelVersion{Kernel: 3, Major: 12}},
+		{in: "99.999.999-19-generic", out: KernelVersion{Kernel: 99, Major: 999}},
+		{in: "", expectedErr: fmt.Errorf(`failed to parse kernel version "": EOF`)},
+		{in: "3", expectedErr: fmt.Errorf(`failed to parse kernel version "3": unexpected EOF`)},
+		{in: "3.", expectedErr: fmt.Errorf(`failed to parse kernel version "3.": EOF`)},
+		{in: "3a", expectedErr: fmt.Errorf(`failed to parse kernel version "3a": input does not match format`)},
+		{in: "3.a", expectedErr: fmt.Errorf(`failed to parse kernel version "3.a": expected integer`)},
+		{in: "a", expectedErr: fmt.Errorf(`failed to parse kernel version "a": expected integer`)},
+		{in: "a.a", expectedErr: fmt.Errorf(`failed to parse kernel version "a.a": expected integer`)},
+		{in: "a.a.a-a", expectedErr: fmt.Errorf(`failed to parse kernel version "a.a.a-a": expected integer`)},
+		{in: "-3", expectedErr: fmt.Errorf(`failed to parse kernel version "-3": expected integer`)},
+		{in: "-3.", expectedErr: fmt.Errorf(`failed to parse kernel version "-3.": expected integer`)},
+		{in: "-3.8", expectedErr: fmt.Errorf(`failed to parse kernel version "-3.8": expected integer`)},
+		{in: "-3.-8", expectedErr: fmt.Errorf(`failed to parse kernel version "-3.-8": expected integer`)},
+		{in: "3.-8", expectedErr: fmt.Errorf(`failed to parse kernel version "3.-8": expected integer`)},
+	}
+	for _, tc := range tests {
+		tc := tc
+		t.Run(tc.in, func(t *testing.T) {
+			got, err := parseRelease(tc.in)
+			if want := tc.expectedErr; want != nil {
+				switch {
+				case err == nil:
+					t.Fatal("expected an error")
+				case err.Error() != want.Error():
+					t.Fatalf("expected: %s, got: %s", want, err)
+				}
+				return
+			}
+			switch {
+			case err != nil:
+				t.Fatal("unexpected error:", err)
+			case got == nil:
+				t.Fatal("version is nil")
+			case *got != tc.out:
+				t.Fatalf("expected: %d.%d, got: %d.%d", tc.out.Kernel, tc.out.Major, got.Kernel, got.Major)
+			}
+		})
+	}
+}
+
+// TestGreaterEqualThan builds its cases from the host's own kernel version,
+// so every expectation is relative to the machine running the test.
+func TestGreaterEqualThan(t *testing.T) {
+	host, err := getKernelVersion()
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	tests := []struct {
+		doc      string
+		in       KernelVersion
+		expected bool
+	}{
+		{
+			doc:      "same version",
+			in:       KernelVersion{Kernel: host.Kernel, Major: host.Major},
+			expected: true,
+		},
+		{
+			doc:      "kernel minus one",
+			in:       KernelVersion{Kernel: host.Kernel - 1, Major: host.Major},
+			expected: true,
+		},
+		{
+			doc:      "kernel plus one",
+			in:       KernelVersion{Kernel: host.Kernel + 1, Major: host.Major},
+			expected: false,
+		},
+		{
+			doc:      "major plus one",
+			in:       KernelVersion{Kernel: host.Kernel, Major: host.Major + 1},
+			expected: false,
+		},
+	}
+	for _, tc := range tests {
+		tc := tc
+		t.Run(tc.doc+": "+tc.in.String(), func(t *testing.T) {
+			switch ok, err := GreaterEqualThan(tc.in); {
+			case err != nil:
+				t.Fatal("unexpected error:", err)
+			case ok != tc.expected:
+				t.Fatalf("expected: %v, got: %v", tc.expected, ok)
+			}
+		})
+	}
+}
diff --git a/libcontainer/system/linux.go b/libcontainer/system/linux.go
index d2ad5cea229..318b6edfe81 100644
--- a/libcontainer/system/linux.go
+++ b/libcontainer/system/linux.go
@@ -4,10 +4,15 @@
 package system
 
 import (
+	"fmt"
+	"io"
 	"os"
 	"os/exec"
+	"strconv"
+	"syscall"
 	"unsafe"
 
+	"github.com/sirupsen/logrus"
 	"golang.org/x/sys/unix"
 )
 
@@ -36,7 +41,6 @@ func Execv(cmd string, args []string, env []string) error {
 	if err != nil {
 		return err
 	}
-
 	return Exec(name, args, env)
 }
 
@@ -49,6 +53,49 @@ func Exec(cmd string, args []string, env []string) error {
 	}
 }
 
+// execveat is a thin wrapper around the execveat(2) system call. String
+// conversion errors and the errno of the call are returned unchanged.
+func execveat(fd uintptr, pathname string, args []string, env []string, flags int) error {
+	pathPtr, err := syscall.BytePtrFromString(pathname)
+	if err != nil {
+		return err
+	}
+	argv, err := syscall.SlicePtrFromStrings(args)
+	if err != nil {
+		return err
+	}
+	envv, err := syscall.SlicePtrFromStrings(env)
+	if err != nil {
+		return err
+	}
+	// Per the unsafe.Pointer rules, the uintptr conversions must appear in the call itself.
+	_, _, errno := syscall.Syscall6(
+		unix.SYS_EXECVEAT,
+		fd,
+		uintptr(unsafe.Pointer(pathPtr)),
+		uintptr(unsafe.Pointer(&argv[0])),
+		uintptr(unsafe.Pointer(&envv[0])),
+		uintptr(flags),
+		0,
+	)
+	return errno
+}
+
+// Fexecve executes the file referred to by the open descriptor fd, using
+// execveat(2) with AT_EMPTY_PATH and retrying for as long as it fails with
+// EINTR. If the call reports ENOSYS, /proc/self/fd/<fd> is exec'd instead.
+func Fexecve(fd uintptr, args []string, env []string) error {
+	err := execveat(fd, "", args, env, unix.AT_EMPTY_PATH)
+	for err == unix.EINTR { // nolint:errorlint // unix errors are bare
+		err = execveat(fd, "", args, env, unix.AT_EMPTY_PATH)
+	}
+	if err == unix.ENOSYS { // nolint:errorlint // unix errors are bare
+		// Fallback to classic /proc/self/fd/... exec.
+		return Exec("/proc/self/fd/"+strconv.Itoa(int(fd)), args, env)
+	}
+	return os.NewSyscallError("execveat", err)
+}
+
 func SetParentDeathSignal(sig uintptr) error {
 	if err := unix.Prctl(unix.PR_SET_PDEATHSIG, sig, 0, 0, 0); err != nil {
 		return err
@@ -102,3 +149,68 @@ func GetSubreaper() (int, error) {
 
 	return int(i), nil
 }
+
+// ExecutableMemfd creates a memfd named comment, requesting MFD_EXEC where the
+// kernel accepts it, and returns it wrapped in an *os.File.
+func ExecutableMemfd(comment string, flags int) (*os.File, error) {
+	// Try MFD_EXEC first. Pre-6.3 kernels reject the flag with -EINVAL. On
+	// 6.3 and later with vm.memfd_noexec=1, the flag is what makes the memfd
+	// executable. vm.memfd_noexec=2 is more complicated: its original
+	// implementation incorrectly (and silently) allowed MFD_EXEC, which
+	// should be fixed in 6.6. From 6.6 on, MFD_EXEC with vm.memfd_noexec=2
+	// fails with -EACCES (for 6.3-6.5, -EINVAL was the intended result).
+	//
+	// The upshot: retrying without MFD_EXEC is only needed on -EINVAL,
+	// because passing MFD_EXEC happens to bypass vm.memfd_noexec=2 on the
+	// kernels where -EINVAL would actually be a security denial.
+	fd, err := unix.MemfdCreate(comment, flags|unix.MFD_EXEC)
+	if err == unix.EINVAL {
+		fd, err = unix.MemfdCreate(comment, flags)
+	}
+	if err == nil {
+		return os.NewFile(uintptr(fd), "/memfd:"+comment), nil
+	}
+	if err == unix.EACCES {
+		logrus.Info("memfd_create(MFD_EXEC) failed, possibly due to vm.memfd_noexec=2 -- falling back to less secure O_TMPFILE")
+	}
+	return nil, fmt.Errorf("failed to create executable memfd: %w", os.NewSyscallError("memfd_create", err))
+}
+
+// Copy is like io.Copy except it uses sendfile(2) if the source and sink are
+// both (*os.File) and the source is a regular file, to make copies faster.
+func Copy(dst io.Writer, src io.Reader) (copied int64, err error) {
+	dstFile, _ := dst.(*os.File)
+	srcFile, _ := src.(*os.File)
+	if dstFile != nil && srcFile != nil {
+		// The loop below trusts the stat'd size; other sources (e.g. pipes) may report 0.
+		fi, err := srcFile.Stat()
+		if err != nil || !fi.Mode().IsRegular() {
+			goto fallback
+		}
+		size := fi.Size()
+		for size > 0 {
+			n, err := unix.Sendfile(int(dstFile.Fd()), int(srcFile.Fd()), nil, int(size))
+			if n > 0 {
+				size -= int64(n)
+				copied += int64(n)
+			}
+			if err == unix.EINTR {
+				continue
+			}
+			if err != nil {
+				if copied == 0 {
+					// Nothing copied yet: io.Copy can safely take over. After a partial copy we error out.
+					goto fallback
+				}
+				return copied, fmt.Errorf("partial sendfile copy: %w", err)
+			}
+			if n == 0 {
+				break // EOF before the stat'd size was reached; stop instead of spinning.
+			}
+		}
+		return copied, nil
+	}
+
+fallback:
+	return io.Copy(dst, src)
+}
diff --git a/script/lib.sh b/script/lib.sh
index 9fee8e29f38..f79dc3c2335 100644
--- a/script/lib.sh
+++ b/script/lib.sh
@@ -1,33 +1,65 @@
 #!/bin/bash
 
+# NOTE: Make sure you keep this file in sync with cc_platform.mk.
+
 # set_cross_vars sets a few environment variables used for cross-compiling,
 # based on the architecture specified in $1.
 function set_cross_vars() {
 	GOARCH="$1" # default, may be overridden below
 	unset GOARM
 
+	PLATFORM=linux-gnu
+	# openSUSE has a custom PLATFORM
+	if grep -iq "ID_LIKE=.*suse" /etc/os-release; then
+		PLATFORM=suse-linux
+		is_suse=1
+	fi
+
 	case $1 in
+	386)
+		# Always use the 64-bit compiler to build the 386 binary, which works
+		# for the more common cross-build method for x86 (namely, the
+		# equivalent of dpkg --add-architecture).
+		local cpu_type
+		if [ -v is_suse ]; then
+			# There is no x86_64-suse-linux-gcc, so use the native one.
+			HOST=
+			cpu_type=i586
+		else
+			HOST=x86_64-${PLATFORM}
+			cpu_type=i686
+		fi
+		CFLAGS="-m32 -march=$cpu_type ${CFLAGS[*]}"
+		;;
+	amd64)
+		if [ -n "${is_suse:-}" ]; then
+			# There is no x86_64-suse-linux-gcc, so use the native one.
+			HOST=
+		else
+			HOST=x86_64-${PLATFORM}
+		fi
+		;;
 	arm64)
-		HOST=aarch64-linux-gnu
+		HOST=aarch64-${PLATFORM}
 		;;
 	armel)
-		HOST=arm-linux-gnueabi
+		HOST=arm-${PLATFORM}eabi
 		GOARCH=arm
 		GOARM=6
 		;;
 	armhf)
-		HOST=arm-linux-gnueabihf
+		HOST=arm-${PLATFORM}eabihf
 		GOARCH=arm
 		GOARM=7
 		;;
 	ppc64le)
-		HOST=powerpc64le-linux-gnu
+		HOST=powerpc64le-${PLATFORM}
 		;;
 	riscv64)
-		HOST=riscv64-linux-gnu
+		HOST=riscv64-${PLATFORM}
 		;;
 	s390x)
-		HOST=s390x-linux-gnu
+		HOST=s390x-${PLATFORM}
 		;;
 	*)
 		echo "set_cross_vars: unsupported architecture: $1" >&2
@@ -35,8 +67,8 @@ function set_cross_vars() {
 		;;
 	esac
 
-	CC=$HOST-gcc
-	STRIP=$HOST-strip
+	CC="${HOST:+$HOST-}gcc"
+	STRIP="${HOST:+$HOST-}strip"
 
-	export HOST GOARM GOARCH CC STRIP
+	export HOST CFLAGS GOARM GOARCH CC STRIP
 }
diff --git a/script/release_build.sh b/script/release_build.sh
index af238628cbd..6c7aee88b23 100755
--- a/script/release_build.sh
+++ b/script/release_build.sh
@@ -60,24 +60,14 @@ function build_project() {
 	# it can reuse cached pkg-config results).
 	local make_args=(COMMIT_NO= EXTRA_FLAGS="-a" EXTRA_LDFLAGS="${ldflags}" static)
 
-	# Build natively.
-	make -C "$root" \
-		PKG_CONFIG_PATH="$seccompdir/lib/pkgconfig" \
-		"${make_args[@]}"
-	strip "$root/$project"
-	# Sanity check: make sure libseccomp version is as expected.
-	local ver
-	ver=$("$root/$project" --version | awk '$1 == "libseccomp:" {print $2}')
-	if [ "$ver" != "$LIBSECCOMP_VERSION" ]; then
-		echo >&2 "libseccomp version mismatch: want $LIBSECCOMP_VERSION, got $ver"
-		exit 1
-	fi
+	# Save the original cflags.
+	local original_cflags="${CFLAGS:-}"
 
-	mv "$root/$project" "$builddir/$project.$native_arch"
-
-	# Cross-build for for other architectures.
+	# Build for all requested architectures.
 	local arch
 	for arch in "${arches[@]}"; do
+		# Reset CFLAGS.
+		CFLAGS="$original_cflags"
 		set_cross_vars "$arch"
 		make -C "$root" \
 			PKG_CONFIG_PATH="$seccompdir/$arch/lib/pkgconfig" \
@@ -86,6 +76,14 @@ function build_project() {
 		mv "$root/$project" "$builddir/$project.$arch"
 	done
 
+	# Sanity check: make sure libseccomp version is as expected.
+	local ver
+	ver=$("$builddir/$project.$native_arch" --version | awk '$1 == "libseccomp:" {print $2}')
+	if [ "$ver" != "$LIBSECCOMP_VERSION" ]; then
+		echo >&2 "libseccomp version mismatch: want $LIBSECCOMP_VERSION, got $ver"
+		exit 1
+	fi
+
 	# Copy libseccomp source tarball.
 	cp "$seccompdir"/src/* "$builddir"
 
@@ -122,12 +120,17 @@ commit="HEAD"
 version=""
 releasedir=""
 hashcmd=""
-declare -a add_arches
+# Always build a native binary.
+native_arch="$(go env GOARCH || echo "amd64")"
+arches=("$native_arch")
 
 while getopts "a:c:H:hr:v:" opt; do
 	case "$opt" in
 	a)
-		add_arches+=("$OPTARG")
+		# Add architecture if not already present in arches.
+		if ! (printf "%s\0" "${arches[@]}" | grep -zqxF "$OPTARG"); then
+			arches+=("$OPTARG")
+		fi
 		;;
 	c)
 		commit="$OPTARG"
@@ -158,9 +161,8 @@ done
 version="${version:-$(<"$root/VERSION")}"
 releasedir="${releasedir:-release/$version}"
 hashcmd="${hashcmd:-sha256sum}"
-native_arch="$(go env GOARCH || echo "amd64")"
 # Suffixes of files to checksum/sign.
-suffixes=("$native_arch" "${add_arches[@]}" tar.xz)
+suffixes=("${arches[@]}" tar.xz)
 
 log "creating $project release in '$releasedir'"
 log "  version: $version"
@@ -174,7 +176,7 @@ set -x
 rm -rf "$releasedir" && mkdir -p "$releasedir"
 
 # Build project.
-build_project "$releasedir/$project" "$native_arch" "${add_arches[@]}"
+build_project "$releasedir/$project" "$native_arch" "${arches[@]}"
 
 # Generate new archive.
 git archive --format=tar --prefix="$project-$version/" "$commit" | xz >"$releasedir/$project.tar.xz"
diff --git a/script/seccomp.sh b/script/seccomp.sh
index beea612ac83..955437c2fb4 100755
--- a/script/seccomp.sh
+++ b/script/seccomp.sh
@@ -33,16 +33,21 @@ function build_libseccomp() {
 	tar xf "$tar" -C "$srcdir"
 	pushd "$srcdir/libseccomp-$ver" || return
 
-	# Build natively and install to /usr/local.
+	# Install native version for Dockerfile builds.
 	./configure \
 		--prefix="$dest" --libdir="$dest/lib" \
 		--enable-static --enable-shared
 	make install
 	make clean
 
-	# Build and install for additional architectures.
+	# Save the original cflags.
+	local original_cflags="${CFLAGS:-}"
+
+	# Build and install for all requested architectures.
 	local arch
 	for arch in "${arches[@]}"; do
+		# Reset CFLAGS.
+		CFLAGS="$original_cflags"
 		set_cross_vars "$arch"
 		./configure --host "$HOST" \
 			--prefix="$dest/$arch" --libdir="$dest/$arch/lib" \
diff --git a/tests/integration/helpers.bash b/tests/integration/helpers.bash
index cd08fb2459f..7e6399a47b8 100755
--- a/tests/integration/helpers.bash
+++ b/tests/integration/helpers.bash
@@ -646,12 +646,16 @@ function teardown_bundle() {
 	remove_parent
 }
 
-function requires_kernel() {
+function is_kernel_gte() {
 	local major_required minor_required
 	major_required=$(echo "$1" | cut -d. -f1)
 	minor_required=$(echo "$1" | cut -d. -f2)
-	if [[ "$KERNEL_MAJOR" -lt $major_required || ("$KERNEL_MAJOR" -eq $major_required && "$KERNEL_MINOR" -lt $minor_required) ]]; then
-		skip "requires kernel $1"
+	[[ "$KERNEL_MAJOR" -gt $major_required || ("$KERNEL_MAJOR" -eq $major_required && "$KERNEL_MINOR" -ge $minor_required) ]]
+}
+
+function requires_kernel() {
+	if ! is_kernel_gte "$@"; then
+		skip "requires kernel >= $1"
 	fi
 }
 
diff --git a/tests/integration/run.bats b/tests/integration/run.bats
index 9f1f1d8bc74..baf91fb00cd 100644
--- a/tests/integration/run.bats
+++ b/tests/integration/run.bats
@@ -126,3 +126,37 @@ function teardown() {
 	[ "$status" -eq 0 ]
 	[ "$output" = "410" ]
 }
+
+@test "runc run [runc-dmz]" {
+	runc --debug run test_hello
+	[ "$status" -eq 0 ]
+	[[ "$output" = *"Hello World"* ]]
+	# No config change or RUNC_DMZ override here: the debug log must show runc-dmz in use.
+	[[ "$output" = *"runc-dmz: using runc-dmz"* ]]
+}
+
+@test "runc run [cap_sys_ptrace -> /proc/self/exe clone]" {
+	# Putting CAP_SYS_PTRACE in the bounding set is the minimum needed to
+	# indicate that a container process _could_ get CAP_SYS_PTRACE.
+	update_config '.process.capabilities.bounding += ["CAP_SYS_PTRACE"]'
+
+	runc --debug run test_hello
+	[ "$status" -eq 0 ]
+	[[ "$output" = *"Hello World"* ]]
+	if [ "$EUID" -eq 0 ] || ! is_kernel_gte 4.10; then
+		# If the container has CAP_SYS_PTRACE and is not rootless (or the
+		# kernel is older than 4.10), we use /proc/self/exe cloning.
+		[[ "$output" = *"runc-dmz: using /proc/self/exe clone"* ]]
+	else
+		# For Linux 4.10 and later, rootless containers will use runc-dmz
+		# because they are running in a user namespace. See isDmzBinarySafe().
+		[[ "$output" = *"runc-dmz: using runc-dmz"* ]]
+	fi
+}
+
+@test "RUNC_DMZ=legacy runc run [/proc/self/exe clone]" {
+	# With RUNC_DMZ=legacy the debug log must report the /proc/self/exe clone path.
+	RUNC_DMZ=legacy runc --debug run test_hello
+	[ "$status" -eq 0 ]
+	[[ "$output" = *"Hello World"* && "$output" = *"runc-dmz: using /proc/self/exe clone"* ]]
+}
diff --git a/tests/integration/seccomp-notify-compat.bats b/tests/integration/seccomp-notify-compat.bats
index 8d663edda51..6ca3449bffa 100644
--- a/tests/integration/seccomp-notify-compat.bats
+++ b/tests/integration/seccomp-notify-compat.bats
@@ -3,8 +3,8 @@
 load helpers
 
 function setup() {
-	if [[ "$KERNEL_MAJOR" -gt 5 || ("$KERNEL_MAJOR" -eq 5 && "$KERNEL_MINOR" -ge 6) ]]; then
-		skip "requires kernel less than 5.6"
+	if is_kernel_gte 5.6; then
+		skip "requires kernel < 5.6"
 	fi
 
 	requires arch_x86_64
diff --git a/tests/integration/start_hello.bats b/tests/integration/start_hello.bats
index 87005484748..6fbb893e695 100644
--- a/tests/integration/start_hello.bats
+++ b/tests/integration/start_hello.bats
@@ -58,6 +58,8 @@ function teardown() {
 	# Enable CAP_DAC_OVERRIDE.
 	update_config '	  .process.capabilities.bounding += ["CAP_DAC_OVERRIDE"]
 			| .process.capabilities.effective += ["CAP_DAC_OVERRIDE"]
+			| .process.capabilities.inheritable += ["CAP_DAC_OVERRIDE"]
+			| .process.capabilities.ambient += ["CAP_DAC_OVERRIDE"]
 			| .process.capabilities.permitted += ["CAP_DAC_OVERRIDE"]'
 
 	runc run test_busybox