Skip to content

Commit 7cf97b1

Browse files
sargunkees
authored and committed
seccomp: Introduce addfd ioctl to seccomp user notifier
The current SECCOMP_RET_USER_NOTIF API allows for syscall supervision over an fd. It is often used in settings where a supervising task emulates syscalls on behalf of a supervised task in userspace, either to further restrict the supervisee's syscall abilities or to circumvent kernel-enforced restrictions the supervisor deems safe to lift (e.g. actually performing a mount(2) for an unprivileged container). While SECCOMP_RET_USER_NOTIF allows for the interception of any syscall, only a certain subset of syscalls could be correctly emulated. Over the last few development cycles, the set of syscalls which can't be emulated has been reduced due to the addition of pidfd_getfd(2). With this we are now able to, for example, intercept syscalls that require the supervisor to operate on file descriptors of the supervisee such as connect(2). However, syscalls that cause new file descriptors to be installed cannot currently be correctly emulated since there is no way for the supervisor to inject file descriptors into the supervisee. This patch adds a new addfd ioctl to remove this restriction by allowing the supervisor to install file descriptors into the intercepted task. By implementing this feature via seccomp the supervisor effectively instructs the supervisee to install a set of file descriptors into its own file descriptor table during the intercepted syscall. This way it is possible to intercept syscalls such as open() or accept(), and install (or replace, like dup2(2)) the supervisor's resulting fd into the supervisee. One replacement use-case would be to redirect the stdout and stderr of a supervisee into log file descriptors opened by the supervisor. The ioctl handling is based on the discussions[1] of how Extensible Arguments should interact with ioctls. Instead of building size into the addfd structure, make it a function of the ioctl command (which is how sizes are normally passed to ioctls).
To support forward and backward compatibility, just mask out the direction and size, and match everything. The size (and any future direction) checks are done along with copy_struct_from_user() logic. As a note, the seccomp_notif_addfd structure is laid out based on 8-byte alignment without requiring packing as there have been packing issues with uapi highlighted before[2][3]. Although we could overload the newfd field and use -1 to indicate that it is not to be used, doing so requires changing the size of the fd field, and introduces struct packing complexity. [1]: https://lore.kernel.org/lkml/[email protected]/ [2]: https://lore.kernel.org/lkml/[email protected]/ [3]: https://lore.kernel.org/lkml/[email protected] Cc: Christoph Hellwig <[email protected]> Cc: Christian Brauner <[email protected]> Cc: Tycho Andersen <[email protected]> Cc: Jann Horn <[email protected]> Cc: Robert Sesek <[email protected]> Cc: Chris Palmer <[email protected]> Cc: Al Viro <[email protected]> Cc: [email protected] Cc: [email protected] Cc: [email protected] Suggested-by: Matt Denton <[email protected]> Link: https://lore.kernel.org/r/[email protected] Signed-off-by: Sargun Dhillon <[email protected]> Reviewed-by: Will Drewry <[email protected]> Co-developed-by: Kees Cook <[email protected]> Signed-off-by: Kees Cook <[email protected]>
1 parent 1738171 commit 7cf97b1

File tree

3 files changed

+199
-2
lines changed

3 files changed

+199
-2
lines changed

include/linux/seccomp.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,10 @@
1010
SECCOMP_FILTER_FLAG_NEW_LISTENER | \
1111
SECCOMP_FILTER_FLAG_TSYNC_ESRCH)
1212

13+
/* sizeof() the first published struct seccomp_notif_addfd */
14+
#define SECCOMP_NOTIFY_ADDFD_SIZE_VER0 24
15+
#define SECCOMP_NOTIFY_ADDFD_SIZE_LATEST SECCOMP_NOTIFY_ADDFD_SIZE_VER0
16+
1317
#ifdef CONFIG_SECCOMP
1418

1519
#include <linux/thread_info.h>

include/uapi/linux/seccomp.h

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,25 @@ struct seccomp_notif_resp {
113113
__u32 flags;
114114
};
115115

116+
/* valid flags for seccomp_notif_addfd */
117+
#define SECCOMP_ADDFD_FLAG_SETFD (1UL << 0) /* Specify remote fd */
118+
119+
/**
120+
* struct seccomp_notif_addfd
121+
* @id: The ID of the seccomp notification
122+
* @flags: SECCOMP_ADDFD_FLAG_*
123+
* @srcfd: The local fd number
124+
* @newfd: Optional remote FD number if SETFD option is set, otherwise 0.
125+
* @newfd_flags: The O_* flags the remote FD should have applied
126+
*/
127+
struct seccomp_notif_addfd {
128+
/* looked up among the filter's notifications; no match yields -ENOENT */
__u64 id;
129+
/* only SECCOMP_ADDFD_FLAG_SETFD is accepted; any other bit yields -EINVAL */
__u32 flags;
130+
/* handed to fget() by the ioctl handler; -EBADF if that returns no file */
__u32 srcfd;
131+
/* must be 0 unless SECCOMP_ADDFD_FLAG_SETFD is set, otherwise -EINVAL */
__u32 newfd;
132+
/* only O_CLOEXEC is accepted; any other bit yields -EINVAL */
__u32 newfd_flags;
133+
};
134+
116135
#define SECCOMP_IOC_MAGIC '!'
117136
#define SECCOMP_IO(nr) _IO(SECCOMP_IOC_MAGIC, nr)
118137
#define SECCOMP_IOR(nr, type) _IOR(SECCOMP_IOC_MAGIC, nr, type)
@@ -124,5 +143,8 @@ struct seccomp_notif_resp {
124143
#define SECCOMP_IOCTL_NOTIF_SEND SECCOMP_IOWR(1, \
125144
struct seccomp_notif_resp)
126145
#define SECCOMP_IOCTL_NOTIF_ID_VALID SECCOMP_IOW(2, __u64)
146+
/* On success, the return value is the remote process's added fd number */
147+
#define SECCOMP_IOCTL_NOTIF_ADDFD SECCOMP_IOW(3, \
148+
struct seccomp_notif_addfd)
127149

128150
#endif /* _UAPI_LINUX_SECCOMP_H */

kernel/seccomp.c

Lines changed: 173 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -87,10 +87,42 @@ struct seccomp_knotif {
8787
long val;
8888
u32 flags;
8989

90-
/* Signals when this has entered SECCOMP_NOTIFY_REPLIED */
90+
/*
91+
* Signals when this has changed states, such as the listener
92+
* dying, a new seccomp addfd message, or changing to REPLIED
93+
*/
9194
struct completion ready;
9295

9396
struct list_head list;
97+
98+
/* outstanding addfd requests */
99+
struct list_head addfd;
100+
};
101+
102+
/**
103+
* struct seccomp_kaddfd - container for seccomp_addfd ioctl messages
104+
*
105+
* @file: A reference to the file to install in the other task
106+
* @fd: The fd number to install it at. If the fd number is -1, it means the
107+
* installing process should allocate the fd as normal.
108+
* @flags: The flags for the new file descriptor. At the moment, only O_CLOEXEC
109+
* is allowed.
110+
* @ret: The return value of the installing process. It is set to the fd num
111+
* upon success (>= 0).
112+
* @completion: Indicates that the installing process has completed fd
113+
* installation, or gone away (either due to successful
114+
* reply, or signal)
115+
*
116+
*/
117+
struct seccomp_kaddfd {
118+
/* obtained with fget() and released with fput() in seccomp_notify_addfd() */
struct file *file;
119+
/* requested fd number, or -1 when SECCOMP_ADDFD_FLAG_SETFD was not set */
int fd;
120+
/* copied from the request's newfd_flags */
unsigned int flags;
121+
122+
/* To only be set on reply */
123+
int ret;
124+
/* the requester waits on this; completed once ret has been written */
struct completion completion;
125+
/*
 * Links the request into seccomp_knotif.addfd. The handling side unlinks
 * it with list_del_init(), so an interrupted requester can use
 * list_empty() to tell whether the request was already handled.
 */
struct list_head list;
94126
};
95127

96128
/**
@@ -793,6 +825,17 @@ static u64 seccomp_next_notify_id(struct seccomp_filter *filter)
793825
return filter->notif->next_id++;
794826
}
795827

828+
/*
 * seccomp_handle_addfd - service one queued addfd request
 * @addfd: the request taken from the notification's addfd list
 *
 * Passes the queued fd number, file and flags to receive_fd_replace(),
 * records its return value in addfd->ret and wakes the requester.
 * seccomp_do_user_notification() calls this with the filter's notify_lock
 * held.
 */
static void seccomp_handle_addfd(struct seccomp_kaddfd *addfd)
829+
{
830+
/*
831+
* Remove the notification, and reset the list pointers, indicating
832+
* that it has been handled.
833+
*/
834+
list_del_init(&addfd->list);
835+
addfd->ret = receive_fd_replace(addfd->fd, addfd->file, addfd->flags);
836+
/* ret is written before the wake-up so the requester reads the final value */
complete(&addfd->completion);
837+
}
838+
796839
static int seccomp_do_user_notification(int this_syscall,
797840
struct seccomp_filter *match,
798841
const struct seccomp_data *sd)
@@ -801,6 +844,7 @@ static int seccomp_do_user_notification(int this_syscall,
801844
u32 flags = 0;
802845
long ret = 0;
803846
struct seccomp_knotif n = {};
847+
struct seccomp_kaddfd *addfd, *tmp;
804848

805849
mutex_lock(&match->notify_lock);
806850
err = -ENOSYS;
@@ -813,6 +857,7 @@ static int seccomp_do_user_notification(int this_syscall,
813857
n.id = seccomp_next_notify_id(match);
814858
init_completion(&n.ready);
815859
list_add(&n.list, &match->notif->notifications);
860+
INIT_LIST_HEAD(&n.addfd);
816861

817862
up(&match->notif->request);
818863
wake_up_poll(&match->wqh, EPOLLIN | EPOLLRDNORM);
@@ -821,17 +866,34 @@ static int seccomp_do_user_notification(int this_syscall,
821866
/*
822867
* This is where we wait for a reply from userspace.
823868
*/
869+
wait:
824870
err = wait_for_completion_interruptible(&n.ready);
825871
mutex_lock(&match->notify_lock);
826872
if (err == 0) {
873+
/* Check if we were woken up by an addfd message */
874+
addfd = list_first_entry_or_null(&n.addfd,
875+
struct seccomp_kaddfd, list);
876+
if (addfd && n.state != SECCOMP_NOTIFY_REPLIED) {
877+
seccomp_handle_addfd(addfd);
878+
mutex_unlock(&match->notify_lock);
879+
goto wait;
880+
}
827881
ret = n.val;
828882
err = n.error;
829883
flags = n.flags;
830884
}
831885

886+
/* If there were any pending addfd calls, clear them out */
887+
list_for_each_entry_safe(addfd, tmp, &n.addfd, list) {
888+
/* The process went away before we got a chance to handle it */
889+
addfd->ret = -ESRCH;
890+
list_del_init(&addfd->list);
891+
complete(&addfd->completion);
892+
}
893+
832894
/*
833895
* Note that it's possible the listener died in between the time when
834-
* we were notified of a respons (or a signal) and when we were able to
896+
* we were notified of a response (or a signal) and when we were able to
835897
* re-acquire the lock, so only delete from the list if the
836898
* notification actually exists.
837899
*
@@ -1069,6 +1131,11 @@ static int seccomp_notify_release(struct inode *inode, struct file *file)
10691131
knotif->error = -ENOSYS;
10701132
knotif->val = 0;
10711133

1134+
/*
1135+
* We do not need to wake up any pending addfd messages, as
1136+
* the notifier will do that for us, as this just looks
1137+
* like a standard reply.
1138+
*/
10721139
complete(&knotif->ready);
10731140
}
10741141

@@ -1233,12 +1300,109 @@ static long seccomp_notify_id_valid(struct seccomp_filter *filter,
12331300
return ret;
12341301
}
12351302

1303+
/*
 * seccomp_notify_addfd - handle SECCOMP_IOCTL_NOTIF_ADDFD on a listener fd
 * @filter: filter whose pending notifications are searched for the request id
 * @uaddfd: userspace struct seccomp_notif_addfd describing the request
 * @size: size of the userspace struct, taken from the ioctl command number
 *
 * Validates the request, queues it on the matching notification, wakes the
 * task waiting on that notification and then waits (interruptibly) for the
 * request to be handled.
 *
 * Returns the value the handling side stored in kaddfd.ret once the request
 * was handled. Otherwise returns a negative error:
 *   -EINVAL       bad @size, unsupported flags, or newfd set without SETFD
 *   -EBADF        fget() found no file for srcfd
 *   -ENOENT       no notification with the given id
 *   -EINPROGRESS  notification is not in the SECCOMP_NOTIFY_SENT state
 * or whatever copy_struct_from_user(), mutex_lock_interruptible() or
 * wait_for_completion_interruptible() reported.
 */
static long seccomp_notify_addfd(struct seccomp_filter *filter,
1304+
struct seccomp_notif_addfd __user *uaddfd,
1305+
unsigned int size)
1306+
{
1307+
struct seccomp_notif_addfd addfd;
1308+
struct seccomp_knotif *knotif;
1309+
struct seccomp_kaddfd kaddfd;
1310+
int ret;
1311+
1312+
BUILD_BUG_ON(sizeof(addfd) < SECCOMP_NOTIFY_ADDFD_SIZE_VER0);
1313+
BUILD_BUG_ON(sizeof(addfd) != SECCOMP_NOTIFY_ADDFD_SIZE_LATEST);
1314+
1315+
/* Anything smaller than the first published layout, or a page or more, is rejected */
if (size < SECCOMP_NOTIFY_ADDFD_SIZE_VER0 || size >= PAGE_SIZE)
1316+
return -EINVAL;
1317+
1318+
ret = copy_struct_from_user(&addfd, sizeof(addfd), uaddfd, size);
1319+
if (ret)
1320+
return ret;
1321+
1322+
if (addfd.newfd_flags & ~O_CLOEXEC)
1323+
return -EINVAL;
1324+
1325+
if (addfd.flags & ~SECCOMP_ADDFD_FLAG_SETFD)
1326+
return -EINVAL;
1327+
1328+
/* newfd is only meaningful together with SETFD; it must be 0 otherwise */
if (addfd.newfd && !(addfd.flags & SECCOMP_ADDFD_FLAG_SETFD))
1329+
return -EINVAL;
1330+
1331+
/* From here on every exit must go through out: so the file is fput() */
kaddfd.file = fget(addfd.srcfd);
1332+
if (!kaddfd.file)
1333+
return -EBADF;
1334+
1335+
kaddfd.flags = addfd.newfd_flags;
1336+
kaddfd.fd = (addfd.flags & SECCOMP_ADDFD_FLAG_SETFD) ?
1337+
addfd.newfd : -1;
1338+
init_completion(&kaddfd.completion);
1339+
1340+
ret = mutex_lock_interruptible(&filter->notify_lock);
1341+
if (ret < 0)
1342+
goto out;
1343+
1344+
knotif = find_notification(filter, addfd.id);
1345+
if (!knotif) {
1346+
ret = -ENOENT;
1347+
goto out_unlock;
1348+
}
1349+
1350+
/*
1351+
* We do not want to allow for FD injection to occur before the
1352+
* notification has been picked up by a userspace handler, or after
1353+
* the notification has been replied to.
1354+
*/
1355+
if (knotif->state != SECCOMP_NOTIFY_SENT) {
1356+
ret = -EINPROGRESS;
1357+
goto out_unlock;
1358+
}
1359+
1360+
/* Queue the on-stack request, then wake the task waiting on knotif->ready */
list_add(&kaddfd.list, &knotif->addfd);
1361+
complete(&knotif->ready);
1362+
mutex_unlock(&filter->notify_lock);
1363+
1364+
/* Now we wait for it to be processed or be interrupted */
1365+
ret = wait_for_completion_interruptible(&kaddfd.completion);
1366+
if (ret == 0) {
1367+
/*
1368+
* We had a successful completion. The other side has already
1369+
* removed us from the addfd queue, and
1370+
* wait_for_completion_interruptible has a memory barrier upon
1371+
* success that lets us read this value directly without
1372+
* locking.
1373+
*/
1374+
ret = kaddfd.ret;
1375+
goto out;
1376+
}
1377+
1378+
mutex_lock(&filter->notify_lock);
1379+
/*
1380+
* Even though we were woken up by a signal and not a successful
1381+
* completion, a completion may have happened in the mean time.
1382+
*
1383+
* We need to check again if the addfd request has been handled,
1384+
* and if not, we will remove it from the queue.
1385+
*/
1386+
if (list_empty(&kaddfd.list))
1387+
ret = kaddfd.ret;
1388+
else
1389+
/* still queued: unlink the on-stack entry before returning the wait error */
list_del(&kaddfd.list);
1390+
1391+
out_unlock:
1392+
mutex_unlock(&filter->notify_lock);
1393+
out:
1394+
fput(kaddfd.file);
1395+
1396+
return ret;
1397+
}
1398+
12361399
static long seccomp_notify_ioctl(struct file *file, unsigned int cmd,
12371400
unsigned long arg)
12381401
{
12391402
struct seccomp_filter *filter = file->private_data;
12401403
void __user *buf = (void __user *)arg;
12411404

1405+
/* Fixed-size ioctls */
12421406
switch (cmd) {
12431407
case SECCOMP_IOCTL_NOTIF_RECV:
12441408
return seccomp_notify_recv(filter, buf);
@@ -1247,6 +1411,13 @@ static long seccomp_notify_ioctl(struct file *file, unsigned int cmd,
12471411
case SECCOMP_IOCTL_NOTIF_ID_VALID_WRONG_DIR:
12481412
case SECCOMP_IOCTL_NOTIF_ID_VALID:
12491413
return seccomp_notify_id_valid(filter, buf);
1414+
}
1415+
1416+
/* Extensible Argument ioctls */
1417+
#define EA_IOCTL(cmd) ((cmd) & ~(IOC_INOUT | IOCSIZE_MASK))
1418+
switch (EA_IOCTL(cmd)) {
1419+
case EA_IOCTL(SECCOMP_IOCTL_NOTIF_ADDFD):
1420+
return seccomp_notify_addfd(filter, buf, _IOC_SIZE(cmd));
12501421
default:
12511422
return -EINVAL;
12521423
}

0 commit comments

Comments
 (0)