Skip to content

Commit acab6f6

Browse files
authored
Merge pull request #3931 from cyphar/remove-bindfd
nsexec: cloned_binary: remove bindfd logic entirely
2 parents 7b1fc98 + 0d890ad commit acab6f6

File tree

1 file changed

+83
-222
lines changed

1 file changed

+83
-222
lines changed

libcontainer/nsenter/cloned_binary.c

Lines changed: 83 additions & 222 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,9 @@
9696
# define MFD_CLOEXEC 0x0001U
9797
# define MFD_ALLOW_SEALING 0x0002U
9898
#endif
99+
#ifndef MFD_EXEC
100+
# define MFD_EXEC 0x0010U
101+
#endif
99102

100103
int memfd_create(const char *name, unsigned int flags)
101104
{
@@ -116,15 +119,27 @@ int memfd_create(const char *name, unsigned int flags)
116119
# define F_GET_SEALS (F_LINUX_SPECIFIC_BASE + 10)
117120
#endif
118121
#ifndef F_SEAL_SEAL
119-
# define F_SEAL_SEAL 0x0001 /* prevent further seals from being set */
120-
# define F_SEAL_SHRINK 0x0002 /* prevent file from shrinking */
121-
# define F_SEAL_GROW 0x0004 /* prevent file from growing */
122-
# define F_SEAL_WRITE 0x0008 /* prevent writes */
122+
# define F_SEAL_SEAL 0x0001 /* prevent further seals from being set */
123+
# define F_SEAL_SHRINK 0x0002 /* prevent file from shrinking */
124+
# define F_SEAL_GROW 0x0004 /* prevent file from growing */
125+
# define F_SEAL_WRITE 0x0008 /* prevent writes */
126+
#endif
127+
#ifndef F_SEAL_FUTURE_WRITE
128+
# define F_SEAL_FUTURE_WRITE 0x0010 /* prevent future writes while mapped */
129+
#endif
130+
#ifndef F_SEAL_EXEC
131+
# define F_SEAL_EXEC 0x0020 /* prevent chmod modifying exec bits */
123132
#endif
124133

125134
#define CLONED_BINARY_ENV "_LIBCONTAINER_CLONED_BINARY"
126135
#define RUNC_MEMFD_COMMENT "runc_cloned:/proc/self/exe"
127-
#define RUNC_MEMFD_SEALS \
136+
/*
137+
* There are newer memfd seals (such as F_SEAL_FUTURE_WRITE and F_SEAL_EXEC),
138+
* which we use opportunistically. However, this set is the original set of
139+
* memfd seals, and we require them all to be set to trust our /proc/self/exe
140+
* if it is a memfd.
141+
*/
142+
#define RUNC_MEMFD_MIN_SEALS \
128143
(F_SEAL_SEAL | F_SEAL_SHRINK | F_SEAL_GROW | F_SEAL_WRITE)
129144

130145
static void *must_realloc(void *ptr, size_t size)
@@ -143,25 +158,27 @@ static void *must_realloc(void *ptr, size_t size)
143158
*/
144159
static int is_self_cloned(void)
145160
{
146-
int fd, is_cloned = 0;
161+
int fd, seals = 0, is_cloned = false;
147162
struct stat statbuf = { };
148163
struct statfs fsbuf = { };
149164

150165
fd = open("/proc/self/exe", O_RDONLY | O_CLOEXEC);
151166
if (fd < 0) {
152-
fprintf(stderr, "you have no read access to runc binary file\n");
167+
write_log(ERROR, "cannot open runc binary for reading: open /proc/self/exe: %m");
153168
return -ENOTRECOVERABLE;
154169
}
155170

156171
/*
157172
* Is the binary a fully-sealed memfd? We don't need CLONED_BINARY_ENV for
158-
* this, because you cannot write to a sealed memfd no matter what (so
159-
* sharing it isn't a bad thing -- and an admin could bind-mount a sealed
160-
* memfd to /usr/bin/runc to allow re-use).
173+
* this, because you cannot write to a sealed memfd no matter what.
161174
*/
162-
is_cloned = (fcntl(fd, F_GET_SEALS) == RUNC_MEMFD_SEALS);
163-
if (is_cloned)
164-
goto out;
175+
seals = fcntl(fd, F_GET_SEALS);
176+
if (seals >= 0) {
177+
write_log(DEBUG, "checking /proc/self/exe memfd seals: 0x%x", seals);
178+
is_cloned = (seals & RUNC_MEMFD_MIN_SEALS) == RUNC_MEMFD_MIN_SEALS;
179+
if (is_cloned)
180+
goto out;
181+
}
165182

166183
/*
167184
* All other forms require CLONED_BINARY_ENV, since they are potentially
@@ -298,6 +315,35 @@ enum {
298315
# endif
299316
#endif
300317

318+
static inline bool is_memfd_unsupported_error(int err)
319+
{
320+
/*
321+
* - ENOSYS is obviously an "unsupported" error.
322+
*
323+
* - EINVAL could be hit if MFD_EXEC is not supported (pre-6.3 kernel),
324+
* but it can also be hit if vm.memfd_noexec=2 (in kernels without
325+
* [1] applied) and the flags does not contain MFD_EXEC. However,
326+
* there was a bug in the original 6.3 implementation of
327+
* vm.memfd_noexec=2, which meant that MFD_EXEC would work even in
328+
* the "strict" mode. Because we try MFD_EXEC first, we won't get
329+
* EINVAL in the vm.memfd_noexec=2 case (which means we don't need to
330+
* figure out whether to log the message about memfd_create).
331+
*
332+
* - EACCES is returned in kernels that contain [1] in the
333+
* vm.memfd_noexec=2 case.
334+
*
335+
* At time of writing, [1] is not in Linus's tree and it't not clear if
336+
* it will be backported to stable, so what exact versions apply here
337+
* is unclear. But the bug is present in 6.3-6.5 at the very least.
338+
*
339+
* [1]: https://lore.kernel.org/all/[email protected]/
340+
*/
341+
if (err == EACCES)
342+
write_log(INFO,
343+
"memfd_create(MFD_EXEC) failed, possibly due to vm.memfd_noexec=2 -- falling back to less secure O_TMPFILE");
344+
return err == ENOSYS || err == EINVAL || err == EACCES;
345+
}
346+
301347
static int make_execfd(int *fdtype)
302348
{
303349
int fd = -1;
@@ -315,10 +361,20 @@ static int make_execfd(int *fdtype)
315361
* assumptions about STATEDIR.
316362
*/
317363
*fdtype = EFD_MEMFD;
318-
fd = memfd_create(RUNC_MEMFD_COMMENT, MFD_CLOEXEC | MFD_ALLOW_SEALING);
364+
/*
365+
* On newer kernels we should set MFD_EXEC to indicate we need +x
366+
* permissions. Otherwise an admin with vm.memfd_noexec=1 would subtly
367+
* break runc. vm.memfd_noexec=2 is a little bit more complicated, see the
368+
* comment in is_memfd_unsupported_error() -- the upshot is that doing it
369+
* this way works, but only because of two overlapping bugs in the sysctl
370+
* implementation.
371+
*/
372+
fd = memfd_create(RUNC_MEMFD_COMMENT, MFD_EXEC | MFD_CLOEXEC | MFD_ALLOW_SEALING);
373+
if (fd < 0 && is_memfd_unsupported_error(errno))
374+
fd = memfd_create(RUNC_MEMFD_COMMENT, MFD_CLOEXEC | MFD_ALLOW_SEALING);
319375
if (fd >= 0)
320376
return fd;
321-
if (errno != ENOSYS && errno != EINVAL)
377+
if (!is_memfd_unsupported_error(errno))
322378
goto error;
323379

324380
#ifdef O_TMPFILE
@@ -373,8 +429,18 @@ static int make_execfd(int *fdtype)
373429
static int seal_execfd(int *fd, int fdtype)
374430
{
375431
switch (fdtype) {
376-
case EFD_MEMFD:
377-
return fcntl(*fd, F_ADD_SEALS, RUNC_MEMFD_SEALS);
432+
case EFD_MEMFD:{
433+
/*
434+
* Try to seal with newer seals, but we ignore errors because older
435+
* kernels don't support some of them. For container security only
436+
* RUNC_MEMFD_MIN_SEALS are strictly required, but the rest are
437+
* nice-to-haves. We apply RUNC_MEMFD_MIN_SEALS at the end because it
438+
* contains F_SEAL_SEAL.
439+
*/
440+
int __attribute__((unused)) _err1 = fcntl(*fd, F_ADD_SEALS, F_SEAL_FUTURE_WRITE); // Linux 5.1
441+
int __attribute__((unused)) _err2 = fcntl(*fd, F_ADD_SEALS, F_SEAL_EXEC); // Linux 6.3
442+
return fcntl(*fd, F_ADD_SEALS, RUNC_MEMFD_MIN_SEALS);
443+
}
378444
case EFD_FILE:{
379445
/* Need to re-open our pseudo-memfd as an O_PATH to avoid execve(2) giving -ETXTBSY. */
380446
int newfd;
@@ -400,199 +466,6 @@ static int seal_execfd(int *fd, int fdtype)
400466
return -1;
401467
}
402468

403-
struct bindfd_child_args {
404-
int sockfd;
405-
const char *mount_target;
406-
};
407-
408-
static int bindfd_in_subprocess(void *arg)
409-
{
410-
/*
411-
* In the interests of efficiency (read: minimizing the syscall count)
412-
* and conciseness, no attempt is made to release resources which would
413-
* be cleaned up automatically on process exit, i.e. when this function
414-
* returns. This includes filesystem mounts, as this function is
415-
* executed in a dedicated mount namespace.
416-
*/
417-
418-
/*
419-
* For obvious reasons this won't work in rootless mode because we
420-
* haven't created a userns -- but getting that to work will be a bit
421-
* complicated and it's only worth doing if someone actually needs it.
422-
*/
423-
if (mount("none", "/", NULL, MS_SLAVE | MS_REC, NULL) < 0)
424-
return errno;
425-
/*
426-
* The kernel resolves the magic symlink /proc/self/exe to the real file
427-
* _in the original mount namespace_. Cross-namespace bind mounts are
428-
* not allowed, so we must locate the file inside the current mount
429-
* namespace to be able to bind-mount it. (The mount(8) command resolves
430-
* symlinks, which is why it appears to work at first glance.)
431-
*/
432-
char linkbuf[PATH_MAX + 1] = { 0 };
433-
ssize_t linkpathlen = readlink("/proc/self/exe", linkbuf, sizeof(linkbuf));
434-
if (linkpathlen < 0)
435-
return errno;
436-
if (linkpathlen == sizeof(linkbuf)) {
437-
/*
438-
* The link path is longer than PATH_MAX, and the contents of
439-
* linkbuf might have been truncated. A truncated path could
440-
* happen to be a valid path to a different file, which could
441-
* allow for local privilege escalation if we were to exec it.
442-
* The mount syscall doesn't accept paths longer than PATH_MAX,
443-
* anyway.
444-
*/
445-
return ENAMETOOLONG;
446-
}
447-
448-
int srcfd = open(linkbuf, O_PATH | O_CLOEXEC);
449-
if (srcfd < 0)
450-
return errno;
451-
/*
452-
* linkbuf holds the path to the binary which the parent process was
453-
* launched from. Someone could have moved a different file to that path
454-
* in the interim, in which case srcfd is not the file we want to
455-
* bind-mount. Guard against this situation by verifying srcfd is the
456-
* same file as /proc/self/exe.
457-
*/
458-
struct stat realexe = { 0 };
459-
if (stat("/proc/self/exe", &realexe) < 0)
460-
return errno;
461-
struct stat resolved = { 0 };
462-
if (fstat(srcfd, &resolved) < 0)
463-
return errno;
464-
if (resolved.st_dev != realexe.st_dev || resolved.st_ino != realexe.st_ino)
465-
return ENOENT;
466-
if (snprintf(linkbuf, sizeof(linkbuf), "/proc/self/fd/%d", srcfd) == sizeof(linkbuf))
467-
return ENAMETOOLONG;
468-
469-
const struct bindfd_child_args *args = arg;
470-
if (mount(linkbuf, args->mount_target, "", MS_BIND, "") < 0)
471-
return errno;
472-
if (mount("", args->mount_target, "", MS_REMOUNT | MS_BIND | MS_RDONLY, "") < 0)
473-
return errno;
474-
475-
int fd = open(args->mount_target, O_PATH | O_CLOEXEC);
476-
if (fd < 0)
477-
return errno;
478-
479-
/*
480-
* Make sure the MNT_DETACH works, otherwise we could get remounted
481-
* read-write and that would be quite bad.
482-
*/
483-
if (umount2(args->mount_target, MNT_DETACH) < 0)
484-
return errno;
485-
486-
if (send_fd(args->sockfd, fd) < 0)
487-
return errno;
488-
return 0;
489-
}
490-
491-
static int spawn_bindfd_child(const struct bindfd_child_args *args) __attribute__((noinline));
492-
static int spawn_bindfd_child(const struct bindfd_child_args *args)
493-
{
494-
/*
495-
* Carve out a chunk of our call stack for the child process to use as
496-
* we can be sure it is correctly mapped for use as stack. (Technically
497-
* only the libc clone() wrapper writes to this buffer. The child
498-
* process operates on a copy of the parent's virtual memory space and
499-
* so can safely overflow into the rest of the stack memory region
500-
* without consequence.)
501-
*/
502-
char stack[4 * 1024] __attribute__((aligned(16)));
503-
int tid = clone(bindfd_in_subprocess,
504-
/*
505-
* Assume stack grows down, as HP-PA, the only Linux
506-
* platform where stack grows up, is obsolete.
507-
*/
508-
stack + sizeof(stack),
509-
/*
510-
* Suspend the parent process until the child has exited to
511-
* save an unnecessary context switch as we'd just be
512-
* waiting for the child process to exit anyway.
513-
*/
514-
CLONE_NEWNS | CLONE_VFORK, (void *)args);
515-
if (tid < 0)
516-
return -errno;
517-
return tid;
518-
}
519-
520-
static int try_bindfd(void)
521-
{
522-
int fd, ret = -1;
523-
char template[PATH_MAX] = { 0 };
524-
char *prefix = getenv("_LIBCONTAINER_STATEDIR");
525-
526-
if (!prefix || *prefix != '/')
527-
prefix = "/tmp";
528-
if (snprintf(template, sizeof(template), "%s/runc.XXXXXX", prefix) < 0)
529-
return ret;
530-
531-
/*
532-
* We need somewhere to mount it, mounting anything over /proc/self is a
533-
* BAD idea on the host -- even if we do it temporarily.
534-
*/
535-
fd = mkstemp(template);
536-
if (fd < 0)
537-
return ret;
538-
close(fd);
539-
540-
/*
541-
* Daemons such as systemd and udisks2 watch /proc/self/mountinfo and
542-
* re-parse it on every change, which gets expensive when the mount table
543-
* is large and/or changes frequently. Perform the mount operations in a
544-
* new, private mount namespace so as not to wake up those processes
545-
* every time we nsexec into a container. We clone a child process into
546-
* a new mount namespace to do the dirty work so the side effects of
547-
* unsharing the mount namespace do not leak into the current process.
548-
*/
549-
int sock[2];
550-
if (socketpair(AF_LOCAL, SOCK_STREAM, 0, sock) < 0) {
551-
ret = -errno;
552-
goto cleanup_unlink;
553-
}
554-
555-
struct bindfd_child_args args = {
556-
.sockfd = sock[0],
557-
.mount_target = template,
558-
};
559-
int cpid = spawn_bindfd_child(&args);
560-
close(sock[0]);
561-
if (cpid < 0) {
562-
ret = cpid;
563-
goto cleanup_socketpair;
564-
}
565-
566-
int wstatus = 0;
567-
if (waitpid(cpid, &wstatus, __WCLONE) < 0)
568-
bail("error waiting for bindfd child process to exit");
569-
if (WIFEXITED(wstatus)) {
570-
if (WEXITSTATUS(wstatus)) {
571-
ret = -WEXITSTATUS(wstatus);
572-
goto cleanup_socketpair;
573-
}
574-
} else if (WIFSIGNALED(wstatus)) {
575-
int sig = WTERMSIG(wstatus);
576-
bail("bindfd child process terminated by signal %d (%s)", sig, strsignal(sig));
577-
} else {
578-
/* Should never happen... */
579-
bail("unexpected waitpid() status for bindfd child process: 0x%x", wstatus);
580-
}
581-
582-
ret = receive_fd(sock[1]);
583-
584-
cleanup_socketpair:
585-
close(sock[1]);
586-
587-
cleanup_unlink:
588-
/*
589-
* We don't care about unlink errors, the worst that happens is that
590-
* there's an empty file left around in STATEDIR.
591-
*/
592-
unlink(template);
593-
return ret;
594-
}
595-
596469
static ssize_t fd_to_fd(int outfd, int infd)
597470
{
598471
ssize_t total = 0;
@@ -627,18 +500,6 @@ static int clone_binary(void)
627500
size_t sent = 0;
628501
int fdtype = EFD_NONE;
629502

630-
/*
631-
* Before we resort to copying, let's try creating an ro-binfd in one shot
632-
* by getting a handle for a read-only bind-mount of the execfd.
633-
*/
634-
execfd = try_bindfd();
635-
if (execfd >= 0)
636-
return execfd;
637-
638-
/*
639-
* Dammit, that didn't work -- time to copy the binary to a safe place we
640-
* can seal the contents.
641-
*/
642503
execfd = make_execfd(&fdtype);
643504
if (execfd < 0 || fdtype == EFD_NONE)
644505
return -ENOTRECOVERABLE;

0 commit comments

Comments
 (0)