27
27
28
28
#include <sys/types.h>
29
29
#include <sys/stat.h>
30
+ #include <sys/statfs.h>
30
31
#include <sys/vfs.h>
31
32
#include <sys/mman.h>
33
+ #include <sys/mount.h>
32
34
#include <sys/sendfile.h>
33
35
#include <sys/syscall.h>
34
36
@@ -67,6 +69,7 @@ int memfd_create(const char *name, unsigned int flags)
67
69
# define F_SEAL_WRITE 0x0008 /* prevent writes */
68
70
#endif
69
71
72
+ #define CLONED_BINARY_ENV "_LIBCONTAINER_CLONED_BINARY"
70
73
#define RUNC_MEMFD_COMMENT "runc_cloned:/proc/self/exe"
71
74
#define RUNC_MEMFD_SEALS \
72
75
(F_SEAL_SEAL | F_SEAL_SHRINK | F_SEAL_GROW | F_SEAL_WRITE)
@@ -88,29 +91,56 @@ static void *must_realloc(void *ptr, size_t size)
88
91
static int is_self_cloned (void )
89
92
{
90
93
int fd , ret , is_cloned = 0 ;
94
+ struct stat statbuf = {};
95
+ struct statfs fsbuf = {};
91
96
92
97
fd = open ("/proc/self/exe" , O_RDONLY |O_CLOEXEC );
93
98
if (fd < 0 )
94
99
return - ENOTRECOVERABLE ;
95
100
96
- /* First check memfd. */
101
+ /*
102
+ * Is the binary a fully-sealed memfd? We don't need CLONED_BINARY_ENV for
103
+ * this, because you cannot write to a sealed memfd no matter what (so
104
+ * sharing it isn't a bad thing -- and an admin could bind-mount a sealed
105
+ * memfd to /usr/bin/runc to allow re-use).
106
+ */
97
107
ret = fcntl (fd , F_GET_SEALS );
98
108
if (ret >= 0 ) {
99
109
is_cloned = (ret == RUNC_MEMFD_SEALS );
100
- } else {
101
- /*
102
- * Okay, we're a tmpfile -- or we're currently running on RHEL <=7.6
103
- * which appears to have a borked backport of F_GET_SEALS. Either way,
104
- * having a file which has no hardlinks indicates that we aren't using
105
- * a host-side "runc" binary and this is something that a container
106
- * cannot fake (because unlinking requires being able to resolve the
107
- * path that you want to unlink).
108
- */
109
- struct stat statbuf = {};
110
- if (fstat (fd , & statbuf ) >= 0 )
111
- is_cloned = (statbuf .st_nlink == 0 );
110
+ goto out ;
112
111
}
113
112
113
+ /*
114
+ * All other forms require CLONED_BINARY_ENV, since they are potentially
115
+ * writeable (or we can't tell if they're fully safe) and thus we must
116
+ * check the environment as an extra layer of defence.
117
+ */
118
+ if (!getenv (CLONED_BINARY_ENV )) {
119
+ is_cloned = false;
120
+ goto out ;
121
+ }
122
+
123
+ /*
124
+ * Is the binary on a read-only filesystem? We can't detect bind-mounts in
125
+ * particular (in-kernel they are identical to regular mounts) but we can
126
+ * at least be sure that it's read-only. In addition, to make sure that
127
+ * it's *our* bind-mount we check CLONED_BINARY_ENV.
128
+ */
129
+ if (fstatfs (fd , & fsbuf ) >= 0 )
130
+ is_cloned |= (fsbuf .f_flags & MS_RDONLY );
131
+
132
+ /*
133
+ * Okay, we're a tmpfile -- or we're currently running on RHEL <=7.6
134
+ * which appears to have a borked backport of F_GET_SEALS. Either way,
135
+ * having a file which has no hardlinks indicates that we aren't using
136
+ * a host-side "runc" binary and this is something that a container
137
+ * cannot fake (because unlinking requires being able to resolve the
138
+ * path that you want to unlink).
139
+ */
140
+ if (fstat (fd , & statbuf ) >= 0 )
141
+ is_cloned |= (statbuf .st_nlink == 0 );
142
+
143
+ out :
114
144
close (fd );
115
145
return is_cloned ;
116
146
}
@@ -227,15 +257,16 @@ static int make_execfd(int *fdtype)
227
257
return -1 ;
228
258
229
259
/*
230
- * Try memfd first, it's much nicer since it's easily detected thanks to
231
- * sealing and also doesn't require assumptions like /tmp.
260
+ * Now try memfd, it's much nicer than actually creating a file in STATEDIR
261
+ * since it's easily detected thanks to sealing and also doesn't require
262
+ * assumptions about STATEDIR.
232
263
*/
233
264
* fdtype = EFD_MEMFD ;
234
265
fd = memfd_create (RUNC_MEMFD_COMMENT , MFD_CLOEXEC | MFD_ALLOW_SEALING );
235
266
if (fd >= 0 )
236
267
return fd ;
237
- if (errno != ENOSYS )
238
- goto err ;
268
+ if (errno != ENOSYS && errno != EINVAL )
269
+ goto error ;
239
270
240
271
#ifdef O_TMPFILE
241
272
/*
@@ -266,7 +297,7 @@ static int make_execfd(int *fdtype)
266
297
errno = EISDIR ;
267
298
}
268
299
if (errno != EISDIR )
269
- goto err ;
300
+ goto error ;
270
301
#endif /* defined(O_TMPFILE) */
271
302
272
303
/*
@@ -281,7 +312,7 @@ static int make_execfd(int *fdtype)
281
312
close (fd );
282
313
}
283
314
284
- err :
315
+ error :
285
316
* fdtype = EFD_NONE ;
286
317
return -1 ;
287
318
}
@@ -316,15 +347,83 @@ static int seal_execfd(int *fd, int fdtype)
316
347
return -1 ;
317
348
}
318
349
350
/*
 * Try to obtain a handle to our own binary through a temporary read-only
 * bind-mount of /proc/self/exe, so that no copy of the binary is needed.
 *
 * The mount point is a scratch file created under _LIBCONTAINER_STATEDIR
 * (or /tmp when that variable is unset or not an absolute path). The mount
 * is lazily detached and the scratch file unlinked before returning, so only
 * the returned descriptor keeps the bind-mount alive.
 *
 * Returns an O_PATH|O_CLOEXEC file descriptor (>= 0) on success, or a negative
 * value on failure:
 *   -1               the template path could not be built (including when it
 *                    would not fit in PATH_MAX), mkstemp() failed, or the
 *                    final open() failed;
 *   -EPERM           either mount() call failed;
 *   -ENOTRECOVERABLE the bind-mount could not be detached.
 */
static int try_bindfd(void)
{
	int fd, len, ret = -1;
	char template[PATH_MAX] = {0};
	const char *prefix = secure_getenv("_LIBCONTAINER_STATEDIR");

	if (!prefix || *prefix != '/')
		prefix = "/tmp";

	/*
	 * snprintf() returns the length it *wanted* to write, so a value that
	 * does not fit means the template was truncated and no longer ends in
	 * the "XXXXXX" suffix that mkstemp() needs. Treat that as a failure
	 * instead of passing a mangled path along.
	 */
	len = snprintf(template, sizeof(template), "%s/runc.XXXXXX", prefix);
	if (len < 0 || (size_t)len >= sizeof(template))
		return ret;

	/*
	 * We need somewhere to mount it, mounting anything over /proc/self is a
	 * BAD idea on the host -- even if we do it temporarily.
	 */
	fd = mkstemp(template);
	if (fd < 0)
		return ret;
	close(fd);

	/*
	 * For obvious reasons this won't work in rootless mode because we haven't
	 * created a userns+mntns -- but getting that to work will be a bit
	 * complicated and it's only worth doing if someone actually needs it.
	 */
	ret = -EPERM;
	if (mount("/proc/self/exe", template, "", MS_BIND, "") < 0)
		goto out;
	if (mount("", template, "", MS_REMOUNT | MS_BIND | MS_RDONLY, "") < 0)
		goto out_umount;

	/* Get read-only handle that we're sure can't be made read-write. */
	ret = open(template, O_PATH | O_CLOEXEC);

out_umount:
	/*
	 * Make sure the MNT_DETACH works, otherwise we could get remounted
	 * read-write and that would be quite bad (the fd would be made read-write
	 * too, invalidating the protection).
	 */
	if (umount2(template, MNT_DETACH) < 0) {
		if (ret >= 0)
			close(ret);
		ret = -ENOTRECOVERABLE;
	}

out:
	/*
	 * We don't care about unlink errors, the worst that happens is that
	 * there's an empty file left around in STATEDIR.
	 */
	unlink(template);
	return ret;
}
405
+
319
406
static int clone_binary (void )
320
407
{
321
- int binfd , memfd ;
408
+ int binfd , execfd ;
322
409
struct stat statbuf = {};
323
410
size_t sent = 0 ;
324
411
int fdtype = EFD_NONE ;
325
412
326
- memfd = make_execfd (& fdtype );
327
- if (memfd < 0 || fdtype == EFD_NONE )
413
+ /*
414
+ * Before we resort to copying, let's try creating an ro-binfd in one shot
415
+ * by getting a handle for a read-only bind-mount of the execfd.
416
+ */
417
+ execfd = try_bindfd ();
418
+ if (execfd >= 0 )
419
+ return execfd ;
420
+
421
+ /*
422
+ * Dammit, that didn't work -- time to copy the binary to a safe place we
423
+ * can seal the contents.
424
+ */
425
+ execfd = make_execfd (& fdtype );
426
+ if (execfd < 0 || fdtype == EFD_NONE )
328
427
return - ENOTRECOVERABLE ;
329
428
330
429
binfd = open ("/proc/self/exe" , O_RDONLY | O_CLOEXEC );
@@ -335,7 +434,7 @@ static int clone_binary(void)
335
434
goto error_binfd ;
336
435
337
436
while (sent < statbuf .st_size ) {
338
- int n = sendfile (memfd , binfd , NULL , statbuf .st_size - sent );
437
+ int n = sendfile (execfd , binfd , NULL , statbuf .st_size - sent );
339
438
if (n < 0 )
340
439
goto error_binfd ;
341
440
sent += n ;
@@ -344,14 +443,15 @@ static int clone_binary(void)
344
443
if (sent != statbuf .st_size )
345
444
goto error ;
346
445
347
- if (seal_execfd (& memfd , fdtype ) < 0 )
446
+ if (seal_execfd (& execfd , fdtype ) < 0 )
348
447
goto error ;
349
- return memfd ;
448
+
449
+ return execfd ;
350
450
351
451
error_binfd :
352
452
close (binfd );
353
453
error :
354
- close (memfd );
454
+ close (execfd );
355
455
return - EIO ;
356
456
}
357
457
@@ -375,6 +475,11 @@ int ensure_cloned_binary(void)
375
475
if (execfd < 0 )
376
476
return - EIO ;
377
477
478
+ if (putenv (CLONED_BINARY_ENV "=1" ))
479
+ goto error ;
480
+
378
481
fexecve (execfd , argv , environ );
482
+ error :
483
+ close (execfd );
379
484
return - ENOEXEC ;
380
485
}
0 commit comments