Skip to content

Commit 16612d7

Browse files
committed
nsenter: cloned_binary: try to ro-bind /proc/self/exe before copying
The usage of memfd_create(2) and other copying techniques is quite wasteful, despite attempts to minimise it with _LIBCONTAINER_STATEDIR. memfd_create(2) added ~10M of memory usage to the cgroup associated with the container, which can result in some setups getting OOM'd (or just hogging the host's memory when you have lots of created-but-not-started containers sticking around). The easiest way of solving this is by creating a read-only bind-mount of the binary, opening that read-only bind-mount, and then umounting it to ensure that the host won't accidentally be re-mounted read-write. This avoids all copying and cleans up naturally like the other techniques used. Unfortunately, like the O_TMPFILE fallback, this requires being able to create a file inside _LIBCONTAINER_STATEDIR (since bind-mounting over the most obvious path -- /proc/self/exe -- is a *very bad idea*). Unfortunately detecting this isn't fool-proof -- on a system with a read-only root filesystem (that might become read-write during "runc init" execution), we cannot tell whether we have already done an ro remount. As a partial mitigation, we store a _LIBCONTAINER_CLONED_BINARY environment variable which is checked *alongside* the protection being present. Signed-off-by: Aleksa Sarai <[email protected]>
1 parent af9da0a commit 16612d7

File tree

1 file changed

+131
-26
lines changed

1 file changed

+131
-26
lines changed

libcontainer/nsenter/cloned_binary.c

Lines changed: 131 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -27,8 +27,10 @@
2727

2828
#include <sys/types.h>
2929
#include <sys/stat.h>
30+
#include <sys/statfs.h>
3031
#include <sys/vfs.h>
3132
#include <sys/mman.h>
33+
#include <sys/mount.h>
3234
#include <sys/sendfile.h>
3335
#include <sys/syscall.h>
3436

@@ -67,6 +69,7 @@ int memfd_create(const char *name, unsigned int flags)
6769
# define F_SEAL_WRITE 0x0008 /* prevent writes */
6870
#endif
6971

72+
#define CLONED_BINARY_ENV "_LIBCONTAINER_CLONED_BINARY"
7073
#define RUNC_MEMFD_COMMENT "runc_cloned:/proc/self/exe"
7174
#define RUNC_MEMFD_SEALS \
7275
(F_SEAL_SEAL | F_SEAL_SHRINK | F_SEAL_GROW | F_SEAL_WRITE)
@@ -88,29 +91,56 @@ static void *must_realloc(void *ptr, size_t size)
8891
static int is_self_cloned(void)
8992
{
9093
int fd, ret, is_cloned = 0;
94+
struct stat statbuf = {};
95+
struct statfs fsbuf = {};
9196

9297
fd = open("/proc/self/exe", O_RDONLY|O_CLOEXEC);
9398
if (fd < 0)
9499
return -ENOTRECOVERABLE;
95100

96-
/* First check memfd. */
101+
/*
102+
* Is the binary a fully-sealed memfd? We don't need CLONED_BINARY_ENV for
103+
* this, because you cannot write to a sealed memfd no matter what (so
104+
* sharing it isn't a bad thing -- and an admin could bind-mount a sealed
105+
* memfd to /usr/bin/runc to allow re-use).
106+
*/
97107
ret = fcntl(fd, F_GET_SEALS);
98108
if (ret >= 0) {
99109
is_cloned = (ret == RUNC_MEMFD_SEALS);
100-
} else {
101-
/*
102-
* Okay, we're a tmpfile -- or we're currently running on RHEL <=7.6
103-
* which appears to have a borked backport of F_GET_SEALS. Either way,
104-
* having a file which has no hardlinks indicates that we aren't using
105-
* a host-side "runc" binary and this is something that a container
106-
* cannot fake (because unlinking requires being able to resolve the
107-
* path that you want to unlink).
108-
*/
109-
struct stat statbuf = {};
110-
if (fstat(fd, &statbuf) >= 0)
111-
is_cloned = (statbuf.st_nlink == 0);
110+
goto out;
112111
}
113112

113+
/*
114+
* All other forms require CLONED_BINARY_ENV, since they are potentially
115+
* writeable (or we can't tell if they're fully safe) and thus we must
116+
* check the environment as an extra layer of defence.
117+
*/
118+
if (!getenv(CLONED_BINARY_ENV)) {
119+
is_cloned = false;
120+
goto out;
121+
}
122+
123+
/*
124+
* Is the binary on a read-only filesystem? We can't detect bind-mounts in
125+
* particular (in-kernel they are identical to regular mounts) but we can
126+
* at least be sure that it's read-only. In addition, to make sure that
127+
* it's *our* bind-mount we check CLONED_BINARY_ENV.
128+
*/
129+
if (fstatfs(fd, &fsbuf) >= 0)
130+
is_cloned |= (fsbuf.f_flags & MS_RDONLY);
131+
132+
/*
133+
* Okay, we're a tmpfile -- or we're currently running on RHEL <=7.6
134+
* which appears to have a borked backport of F_GET_SEALS. Either way,
135+
* having a file which has no hardlinks indicates that we aren't using
136+
* a host-side "runc" binary and this is something that a container
137+
* cannot fake (because unlinking requires being able to resolve the
138+
* path that you want to unlink).
139+
*/
140+
if (fstat(fd, &statbuf) >= 0)
141+
is_cloned |= (statbuf.st_nlink == 0);
142+
143+
out:
114144
close(fd);
115145
return is_cloned;
116146
}
@@ -227,15 +257,16 @@ static int make_execfd(int *fdtype)
227257
return -1;
228258

229259
/*
230-
* Try memfd first, it's much nicer since it's easily detected thanks to
231-
* sealing and also doesn't require assumptions like /tmp.
260+
* Now try memfd, it's much nicer than actually creating a file in STATEDIR
261+
* since it's easily detected thanks to sealing and also doesn't require
262+
* assumptions about STATEDIR.
232263
*/
233264
*fdtype = EFD_MEMFD;
234265
fd = memfd_create(RUNC_MEMFD_COMMENT, MFD_CLOEXEC | MFD_ALLOW_SEALING);
235266
if (fd >= 0)
236267
return fd;
237-
if (errno != ENOSYS)
238-
goto err;
268+
if (errno != ENOSYS && errno != EINVAL)
269+
goto error;
239270

240271
#ifdef O_TMPFILE
241272
/*
@@ -266,7 +297,7 @@ static int make_execfd(int *fdtype)
266297
errno = EISDIR;
267298
}
268299
if (errno != EISDIR)
269-
goto err;
300+
goto error;
270301
#endif /* defined(O_TMPFILE) */
271302

272303
/*
@@ -281,7 +312,7 @@ static int make_execfd(int *fdtype)
281312
close(fd);
282313
}
283314

284-
err:
315+
error:
285316
*fdtype = EFD_NONE;
286317
return -1;
287318
}
@@ -316,15 +347,83 @@ static int seal_execfd(int *fd, int fdtype)
316347
return -1;
317348
}
318349

350+
/*
 * Try to get a handle to the running binary through a temporary read-only
 * bind-mount of /proc/self/exe, so that no copy of the binary is needed.
 *
 * A scratch file is created under _LIBCONTAINER_STATEDIR (falling back to
 * /tmp when the variable is unset or is not an absolute path) and used as
 * the mount point. /proc/self/exe is bind-mounted onto it, the bind-mount is
 * remounted read-only, an O_PATH handle is opened on it, and finally the
 * mount is lazily detached and the scratch file unlinked.
 *
 * Returns the new file descriptor (>= 0) on success, or a negative value:
 *   -1               the mount point path did not fit in PATH_MAX, the
 *                    scratch file could not be created, or the open failed;
 *   -EPERM           one of the mount(2) calls failed;
 *   -ENOTRECOVERABLE the mount could not be detached afterwards.
 */
static int try_bindfd(void)
{
	int fd, len, ret = -1;
	char template[PATH_MAX] = {0};
	char *prefix = secure_getenv("_LIBCONTAINER_STATEDIR");

	if (!prefix || *prefix != '/')
		prefix = "/tmp";

	/*
	 * snprintf() reports truncation by returning the length it *would* have
	 * written. A truncated template has lost (part of) its "XXXXXX" suffix
	 * and must not be used, so reject it explicitly rather than relying on
	 * mkstemp() to notice.
	 */
	len = snprintf(template, sizeof(template), "%s/runc.XXXXXX", prefix);
	if (len < 0 || (size_t)len >= sizeof(template))
		return ret;

	/*
	 * We need somewhere to mount it, mounting anything over /proc/self is a
	 * BAD idea on the host -- even if we do it temporarily.
	 */
	fd = mkstemp(template);
	if (fd < 0)
		return ret;
	close(fd);

	/*
	 * For obvious reasons this won't work in rootless mode because we haven't
	 * created a userns+mntns -- but getting that to work will be a bit
	 * complicated and it's only worth doing if someone actually needs it.
	 */
	ret = -EPERM;
	if (mount("/proc/self/exe", template, "", MS_BIND, "") < 0)
		goto out;
	if (mount("", template, "", MS_REMOUNT | MS_BIND | MS_RDONLY, "") < 0)
		goto out_umount;

	/* Get read-only handle that we're sure can't be made read-write. */
	ret = open(template, O_PATH | O_CLOEXEC);

out_umount:
	/*
	 * Make sure the MNT_DETACH works, otherwise we could get remounted
	 * read-write and that would be quite bad (the fd would be made read-write
	 * too, invalidating the protection).
	 */
	if (umount2(template, MNT_DETACH) < 0) {
		if (ret >= 0)
			close(ret);
		ret = -ENOTRECOVERABLE;
	}

out:
	/*
	 * We don't care about unlink errors, the worst that happens is that
	 * there's an empty file left around in STATEDIR.
	 */
	unlink(template);
	return ret;
}
405+
319406
static int clone_binary(void)
320407
{
321-
int binfd, memfd;
408+
int binfd, execfd;
322409
struct stat statbuf = {};
323410
size_t sent = 0;
324411
int fdtype = EFD_NONE;
325412

326-
memfd = make_execfd(&fdtype);
327-
if (memfd < 0 || fdtype == EFD_NONE)
413+
/*
414+
* Before we resort to copying, let's try creating an ro-binfd in one shot
415+
* by getting a handle for a read-only bind-mount of the execfd.
416+
*/
417+
execfd = try_bindfd();
418+
if (execfd >= 0)
419+
return execfd;
420+
421+
/*
422+
* Dammit, that didn't work -- time to copy the binary to a safe place we
423+
* can seal the contents.
424+
*/
425+
execfd = make_execfd(&fdtype);
426+
if (execfd < 0 || fdtype == EFD_NONE)
328427
return -ENOTRECOVERABLE;
329428

330429
binfd = open("/proc/self/exe", O_RDONLY | O_CLOEXEC);
@@ -335,7 +434,7 @@ static int clone_binary(void)
335434
goto error_binfd;
336435

337436
while (sent < statbuf.st_size) {
338-
int n = sendfile(memfd, binfd, NULL, statbuf.st_size - sent);
437+
int n = sendfile(execfd, binfd, NULL, statbuf.st_size - sent);
339438
if (n < 0)
340439
goto error_binfd;
341440
sent += n;
@@ -344,14 +443,15 @@ static int clone_binary(void)
344443
if (sent != statbuf.st_size)
345444
goto error;
346445

347-
if (seal_execfd(&memfd, fdtype) < 0)
446+
if (seal_execfd(&execfd, fdtype) < 0)
348447
goto error;
349-
return memfd;
448+
449+
return execfd;
350450

351451
error_binfd:
352452
close(binfd);
353453
error:
354-
close(memfd);
454+
close(execfd);
355455
return -EIO;
356456
}
357457

@@ -375,6 +475,11 @@ int ensure_cloned_binary(void)
375475
if (execfd < 0)
376476
return -EIO;
377477

478+
if (putenv(CLONED_BINARY_ENV "=1"))
479+
goto error;
480+
378481
fexecve(execfd, argv, environ);
482+
error:
483+
close(execfd);
379484
return -ENOEXEC;
380485
}

0 commit comments

Comments
 (0)