96
96
# define MFD_CLOEXEC 0x0001U
97
97
# define MFD_ALLOW_SEALING 0x0002U
98
98
#endif
99
+ #ifndef MFD_EXEC
100
+ # define MFD_EXEC 0x0010U
101
+ #endif
99
102
100
103
int memfd_create (const char * name , unsigned int flags )
101
104
{
@@ -116,15 +119,27 @@ int memfd_create(const char *name, unsigned int flags)
116
119
# define F_GET_SEALS (F_LINUX_SPECIFIC_BASE + 10)
117
120
#endif
118
121
#ifndef F_SEAL_SEAL
119
- # define F_SEAL_SEAL 0x0001 /* prevent further seals from being set */
120
- # define F_SEAL_SHRINK 0x0002 /* prevent file from shrinking */
121
- # define F_SEAL_GROW 0x0004 /* prevent file from growing */
122
- # define F_SEAL_WRITE 0x0008 /* prevent writes */
122
+ # define F_SEAL_SEAL 0x0001 /* prevent further seals from being set */
123
+ # define F_SEAL_SHRINK 0x0002 /* prevent file from shrinking */
124
+ # define F_SEAL_GROW 0x0004 /* prevent file from growing */
125
+ # define F_SEAL_WRITE 0x0008 /* prevent writes */
126
+ #endif
127
+ #ifndef F_SEAL_FUTURE_WRITE
128
+ # define F_SEAL_FUTURE_WRITE 0x0010 /* prevent future writes while mapped */
129
+ #endif
130
+ #ifndef F_SEAL_EXEC
131
+ # define F_SEAL_EXEC 0x0020 /* prevent chmod modifying exec bits */
123
132
#endif
124
133
125
134
#define CLONED_BINARY_ENV "_LIBCONTAINER_CLONED_BINARY"
126
135
#define RUNC_MEMFD_COMMENT "runc_cloned:/proc/self/exe"
127
- #define RUNC_MEMFD_SEALS \
136
+ /*
137
+ * There are newer memfd seals (such as F_SEAL_FUTURE_WRITE and F_SEAL_EXEC),
138
+ * which we use opportunistically. However, this set is the original set of
139
+ * memfd seals, and we require them all to be set to trust our /proc/self/exe
140
+ * if it is a memfd.
141
+ */
142
+ #define RUNC_MEMFD_MIN_SEALS \
128
143
(F_SEAL_SEAL | F_SEAL_SHRINK | F_SEAL_GROW | F_SEAL_WRITE)
129
144
130
145
static void * must_realloc (void * ptr , size_t size )
@@ -143,25 +158,27 @@ static void *must_realloc(void *ptr, size_t size)
143
158
*/
144
159
static int is_self_cloned (void )
145
160
{
146
- int fd , is_cloned = 0 ;
161
+ int fd , seals = 0 , is_cloned = false ;
147
162
struct stat statbuf = { };
148
163
struct statfs fsbuf = { };
149
164
150
165
fd = open ("/proc/self/exe" , O_RDONLY | O_CLOEXEC );
151
166
if (fd < 0 ) {
152
- fprintf ( stderr , "you have no read access to runc binary file\n " );
167
+ write_log ( ERROR , "cannot open runc binary for reading: open /proc/self/exe: %m " );
153
168
return - ENOTRECOVERABLE ;
154
169
}
155
170
156
171
/*
157
172
* Is the binary a fully-sealed memfd? We don't need CLONED_BINARY_ENV for
158
- * this, because you cannot write to a sealed memfd no matter what (so
159
- * sharing it isn't a bad thing -- and an admin could bind-mount a sealed
160
- * memfd to /usr/bin/runc to allow re-use).
173
+ * this, because you cannot write to a sealed memfd no matter what.
161
174
*/
162
- is_cloned = (fcntl (fd , F_GET_SEALS ) == RUNC_MEMFD_SEALS );
163
- if (is_cloned )
164
- goto out ;
175
+ seals = fcntl (fd , F_GET_SEALS );
176
+ if (seals >= 0 ) {
177
+ write_log (DEBUG , "checking /proc/self/exe memfd seals: 0x%x" , seals );
178
+ is_cloned = (seals & RUNC_MEMFD_MIN_SEALS ) == RUNC_MEMFD_MIN_SEALS ;
179
+ if (is_cloned )
180
+ goto out ;
181
+ }
165
182
166
183
/*
167
184
* All other forms require CLONED_BINARY_ENV, since they are potentially
@@ -298,6 +315,35 @@ enum {
298
315
# endif
299
316
#endif
300
317
318
+ static inline bool is_memfd_unsupported_error (int err )
319
+ {
320
+ /*
321
+ * - ENOSYS is obviously an "unsupported" error.
322
+ *
323
+ * - EINVAL could be hit if MFD_EXEC is not supported (pre-6.3 kernel),
324
+ * but it can also be hit if vm.memfd_noexec=2 (in kernels without
325
+ * [1] applied) and the flags does not contain MFD_EXEC. However,
326
+ * there was a bug in the original 6.3 implementation of
327
+ * vm.memfd_noexec=2, which meant that MFD_EXEC would work even in
328
+ * the "strict" mode. Because we try MFD_EXEC first, we won't get
329
+ * EINVAL in the vm.memfd_noexec=2 case (which means we don't need to
330
+ * figure out whether to log the message about memfd_create).
331
+ *
332
+ * - EACCES is returned in kernels that contain [1] in the
333
+ * vm.memfd_noexec=2 case.
334
+ *
335
+ * At time of writing, [1] is not in Linus's tree and it't not clear if
336
+ * it will be backported to stable, so what exact versions apply here
337
+ * is unclear. But the bug is present in 6.3-6.5 at the very least.
338
+ *
339
+ * [1]: https://lore.kernel.org/all/[email protected] /
340
+ */
341
+ if (err == EACCES )
342
+ write_log (INFO ,
343
+ "memfd_create(MFD_EXEC) failed, possibly due to vm.memfd_noexec=2 -- falling back to less secure O_TMPFILE" );
344
+ return err == ENOSYS || err == EINVAL || err == EACCES ;
345
+ }
346
+
301
347
static int make_execfd (int * fdtype )
302
348
{
303
349
int fd = -1 ;
@@ -315,10 +361,20 @@ static int make_execfd(int *fdtype)
315
361
* assumptions about STATEDIR.
316
362
*/
317
363
* fdtype = EFD_MEMFD ;
318
- fd = memfd_create (RUNC_MEMFD_COMMENT , MFD_CLOEXEC | MFD_ALLOW_SEALING );
364
+ /*
365
+ * On newer kernels we should set MFD_EXEC to indicate we need +x
366
+ * permissions. Otherwise an admin with vm.memfd_noexec=1 would subtly
367
+ * break runc. vm.memfd_noexec=2 is a little bit more complicated, see the
368
+ * comment in is_memfd_unsupported_error() -- the upshot is that doing it
369
+ * this way works, but only because of two overlapping bugs in the sysctl
370
+ * implementation.
371
+ */
372
+ fd = memfd_create (RUNC_MEMFD_COMMENT , MFD_EXEC | MFD_CLOEXEC | MFD_ALLOW_SEALING );
373
+ if (fd < 0 && is_memfd_unsupported_error (errno ))
374
+ fd = memfd_create (RUNC_MEMFD_COMMENT , MFD_CLOEXEC | MFD_ALLOW_SEALING );
319
375
if (fd >= 0 )
320
376
return fd ;
321
- if (errno != ENOSYS && errno != EINVAL )
377
+ if (! is_memfd_unsupported_error ( errno ) )
322
378
goto error ;
323
379
324
380
#ifdef O_TMPFILE
@@ -373,8 +429,18 @@ static int make_execfd(int *fdtype)
373
429
static int seal_execfd (int * fd , int fdtype )
374
430
{
375
431
switch (fdtype ) {
376
- case EFD_MEMFD :
377
- return fcntl (* fd , F_ADD_SEALS , RUNC_MEMFD_SEALS );
432
+ case EFD_MEMFD :{
433
+ /*
434
+ * Try to seal with newer seals, but we ignore errors because older
435
+ * kernels don't support some of them. For container security only
436
+ * RUNC_MEMFD_MIN_SEALS are strictly required, but the rest are
437
+ * nice-to-haves. We apply RUNC_MEMFD_MIN_SEALS at the end because it
438
+ * contains F_SEAL_SEAL.
439
+ */
440
+ int __attribute__((unused )) _err1 = fcntl (* fd , F_ADD_SEALS , F_SEAL_FUTURE_WRITE ); // Linux 5.1
441
+ int __attribute__((unused )) _err2 = fcntl (* fd , F_ADD_SEALS , F_SEAL_EXEC ); // Linux 6.3
442
+ return fcntl (* fd , F_ADD_SEALS , RUNC_MEMFD_MIN_SEALS );
443
+ }
378
444
case EFD_FILE :{
379
445
/* Need to re-open our pseudo-memfd as an O_PATH to avoid execve(2) giving -ETXTBSY. */
380
446
int newfd ;
@@ -400,199 +466,6 @@ static int seal_execfd(int *fd, int fdtype)
400
466
return -1 ;
401
467
}
402
468
403
- struct bindfd_child_args {
404
- int sockfd ;
405
- const char * mount_target ;
406
- };
407
-
408
- static int bindfd_in_subprocess (void * arg )
409
- {
410
- /*
411
- * In the interests of efficiency (read: minimizing the syscall count)
412
- * and conciseness, no attempt is made to release resources which would
413
- * be cleaned up automatically on process exit, i.e. when this function
414
- * returns. This includes filesystem mounts, as this function is
415
- * executed in a dedicated mount namespace.
416
- */
417
-
418
- /*
419
- * For obvious reasons this won't work in rootless mode because we
420
- * haven't created a userns -- but getting that to work will be a bit
421
- * complicated and it's only worth doing if someone actually needs it.
422
- */
423
- if (mount ("none" , "/" , NULL , MS_SLAVE | MS_REC , NULL ) < 0 )
424
- return errno ;
425
- /*
426
- * The kernel resolves the magic symlink /proc/self/exe to the real file
427
- * _in the original mount namespace_. Cross-namespace bind mounts are
428
- * not allowed, so we must locate the file inside the current mount
429
- * namespace to be able to bind-mount it. (The mount(8) command resolves
430
- * symlinks, which is why it appears to work at first glance.)
431
- */
432
- char linkbuf [PATH_MAX + 1 ] = { 0 };
433
- ssize_t linkpathlen = readlink ("/proc/self/exe" , linkbuf , sizeof (linkbuf ));
434
- if (linkpathlen < 0 )
435
- return errno ;
436
- if (linkpathlen == sizeof (linkbuf )) {
437
- /*
438
- * The link path is longer than PATH_MAX, and the contents of
439
- * linkbuf might have been truncated. A truncated path could
440
- * happen to be a valid path to a different file, which could
441
- * allow for local privilege escalation if we were to exec it.
442
- * The mount syscall doesn't accept paths longer than PATH_MAX,
443
- * anyway.
444
- */
445
- return ENAMETOOLONG ;
446
- }
447
-
448
- int srcfd = open (linkbuf , O_PATH | O_CLOEXEC );
449
- if (srcfd < 0 )
450
- return errno ;
451
- /*
452
- * linkbuf holds the path to the binary which the parent process was
453
- * launched from. Someone could have moved a different file to that path
454
- * in the interim, in which case srcfd is not the file we want to
455
- * bind-mount. Guard against this situation by verifying srcfd is the
456
- * same file as /proc/self/exe.
457
- */
458
- struct stat realexe = { 0 };
459
- if (stat ("/proc/self/exe" , & realexe ) < 0 )
460
- return errno ;
461
- struct stat resolved = { 0 };
462
- if (fstat (srcfd , & resolved ) < 0 )
463
- return errno ;
464
- if (resolved .st_dev != realexe .st_dev || resolved .st_ino != realexe .st_ino )
465
- return ENOENT ;
466
- if (snprintf (linkbuf , sizeof (linkbuf ), "/proc/self/fd/%d" , srcfd ) == sizeof (linkbuf ))
467
- return ENAMETOOLONG ;
468
-
469
- const struct bindfd_child_args * args = arg ;
470
- if (mount (linkbuf , args -> mount_target , "" , MS_BIND , "" ) < 0 )
471
- return errno ;
472
- if (mount ("" , args -> mount_target , "" , MS_REMOUNT | MS_BIND | MS_RDONLY , "" ) < 0 )
473
- return errno ;
474
-
475
- int fd = open (args -> mount_target , O_PATH | O_CLOEXEC );
476
- if (fd < 0 )
477
- return errno ;
478
-
479
- /*
480
- * Make sure the MNT_DETACH works, otherwise we could get remounted
481
- * read-write and that would be quite bad.
482
- */
483
- if (umount2 (args -> mount_target , MNT_DETACH ) < 0 )
484
- return errno ;
485
-
486
- if (send_fd (args -> sockfd , fd ) < 0 )
487
- return errno ;
488
- return 0 ;
489
- }
490
-
491
- static int spawn_bindfd_child (const struct bindfd_child_args * args ) __attribute__((noinline ));
492
- static int spawn_bindfd_child (const struct bindfd_child_args * args )
493
- {
494
- /*
495
- * Carve out a chunk of our call stack for the child process to use as
496
- * we can be sure it is correctly mapped for use as stack. (Technically
497
- * only the libc clone() wrapper writes to this buffer. The child
498
- * process operates on a copy of the parent's virtual memory space and
499
- * so can safely overflow into the rest of the stack memory region
500
- * without consequence.)
501
- */
502
- char stack [4 * 1024 ] __attribute__((aligned (16 )));
503
- int tid = clone (bindfd_in_subprocess ,
504
- /*
505
- * Assume stack grows down, as HP-PA, the only Linux
506
- * platform where stack grows up, is obsolete.
507
- */
508
- stack + sizeof (stack ),
509
- /*
510
- * Suspend the parent process until the child has exited to
511
- * save an unnecessary context switch as we'd just be
512
- * waiting for the child process to exit anyway.
513
- */
514
- CLONE_NEWNS | CLONE_VFORK , (void * )args );
515
- if (tid < 0 )
516
- return - errno ;
517
- return tid ;
518
- }
519
-
520
- static int try_bindfd (void )
521
- {
522
- int fd , ret = -1 ;
523
- char template [PATH_MAX ] = { 0 };
524
- char * prefix = getenv ("_LIBCONTAINER_STATEDIR" );
525
-
526
- if (!prefix || * prefix != '/' )
527
- prefix = "/tmp" ;
528
- if (snprintf (template , sizeof (template ), "%s/runc.XXXXXX" , prefix ) < 0 )
529
- return ret ;
530
-
531
- /*
532
- * We need somewhere to mount it, mounting anything over /proc/self is a
533
- * BAD idea on the host -- even if we do it temporarily.
534
- */
535
- fd = mkstemp (template );
536
- if (fd < 0 )
537
- return ret ;
538
- close (fd );
539
-
540
- /*
541
- * Daemons such as systemd and udisks2 watch /proc/self/mountinfo and
542
- * re-parse it on every change, which gets expensive when the mount table
543
- * is large and/or changes frequently. Perform the mount operations in a
544
- * new, private mount namespace so as not to wake up those processes
545
- * every time we nsexec into a container. We clone a child process into
546
- * a new mount namespace to do the dirty work so the side effects of
547
- * unsharing the mount namespace do not leak into the current process.
548
- */
549
- int sock [2 ];
550
- if (socketpair (AF_LOCAL , SOCK_STREAM , 0 , sock ) < 0 ) {
551
- ret = - errno ;
552
- goto cleanup_unlink ;
553
- }
554
-
555
- struct bindfd_child_args args = {
556
- .sockfd = sock [0 ],
557
- .mount_target = template ,
558
- };
559
- int cpid = spawn_bindfd_child (& args );
560
- close (sock [0 ]);
561
- if (cpid < 0 ) {
562
- ret = cpid ;
563
- goto cleanup_socketpair ;
564
- }
565
-
566
- int wstatus = 0 ;
567
- if (waitpid (cpid , & wstatus , __WCLONE ) < 0 )
568
- bail ("error waiting for bindfd child process to exit" );
569
- if (WIFEXITED (wstatus )) {
570
- if (WEXITSTATUS (wstatus )) {
571
- ret = - WEXITSTATUS (wstatus );
572
- goto cleanup_socketpair ;
573
- }
574
- } else if (WIFSIGNALED (wstatus )) {
575
- int sig = WTERMSIG (wstatus );
576
- bail ("bindfd child process terminated by signal %d (%s)" , sig , strsignal (sig ));
577
- } else {
578
- /* Should never happen... */
579
- bail ("unexpected waitpid() status for bindfd child process: 0x%x" , wstatus );
580
- }
581
-
582
- ret = receive_fd (sock [1 ]);
583
-
584
- cleanup_socketpair :
585
- close (sock [1 ]);
586
-
587
- cleanup_unlink :
588
- /*
589
- * We don't care about unlink errors, the worst that happens is that
590
- * there's an empty file left around in STATEDIR.
591
- */
592
- unlink (template );
593
- return ret ;
594
- }
595
-
596
469
static ssize_t fd_to_fd (int outfd , int infd )
597
470
{
598
471
ssize_t total = 0 ;
@@ -627,18 +500,6 @@ static int clone_binary(void)
627
500
size_t sent = 0 ;
628
501
int fdtype = EFD_NONE ;
629
502
630
- /*
631
- * Before we resort to copying, let's try creating an ro-binfd in one shot
632
- * by getting a handle for a read-only bind-mount of the execfd.
633
- */
634
- execfd = try_bindfd ();
635
- if (execfd >= 0 )
636
- return execfd ;
637
-
638
- /*
639
- * Dammit, that didn't work -- time to copy the binary to a safe place we
640
- * can seal the contents.
641
- */
642
503
execfd = make_execfd (& fdtype );
643
504
if (execfd < 0 || fdtype == EFD_NONE )
644
505
return - ENOTRECOVERABLE ;
0 commit comments