Skip to content

Commit ef2c41c

Browse files
Christian Brauner, htejun
Christian Brauner authored and htejun committed
clone3: allow spawning processes into cgroups
This adds support for creating a process in a different cgroup than its parent. Callers can limit and account processes and threads right from the moment they are spawned: - A service manager can directly spawn new services into dedicated cgroups. - A process can be directly created in a frozen cgroup and will be frozen as well. - The initial accounting jitter experienced by process supervisors and daemons is eliminated with this. - Threaded applications or even thread implementations can choose to create a specific cgroup layout where each thread is spawned directly into a dedicated cgroup. This feature is limited to the unified hierarchy. Callers need to pass a directory file descriptor for the target cgroup. The caller can choose to pass an O_PATH file descriptor. All usual migration restrictions apply, i.e. there can be no processes in inner nodes. In general, creating a process directly in a target cgroup adheres to all migration restrictions. One of the biggest advantages of this feature is that CLONE_INTO_CGROUP does not need to grab the write side of the cgroup_threadgroup_rwsem. This global lock makes moving tasks/threads around super expensive. With clone3() this lock is avoided. Cc: Tejun Heo <[email protected]> Cc: Ingo Molnar <[email protected]> Cc: Oleg Nesterov <[email protected]> Cc: Johannes Weiner <[email protected]> Cc: Li Zefan <[email protected]> Cc: Peter Zijlstra <[email protected]> Cc: [email protected] Signed-off-by: Christian Brauner <[email protected]> Signed-off-by: Tejun Heo <[email protected]>
1 parent f355322 commit ef2c41c

File tree

7 files changed

+214
-39
lines changed

7 files changed

+214
-39
lines changed

include/linux/cgroup-defs.h

+3-2
Original file line numberDiff line numberDiff line change
@@ -628,8 +628,9 @@ struct cgroup_subsys {
628628
void (*cancel_attach)(struct cgroup_taskset *tset);
629629
void (*attach)(struct cgroup_taskset *tset);
630630
void (*post_attach)(void);
631-
int (*can_fork)(struct task_struct *task);
632-
void (*cancel_fork)(struct task_struct *task);
631+
int (*can_fork)(struct task_struct *task,
632+
struct css_set *cset);
633+
void (*cancel_fork)(struct task_struct *task, struct css_set *cset);
633634
void (*fork)(struct task_struct *task);
634635
void (*exit)(struct task_struct *task);
635636
void (*release)(struct task_struct *task);

include/linux/cgroup.h

+14-6
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,8 @@
2727

2828
#include <linux/cgroup-defs.h>
2929

30+
struct kernel_clone_args;
31+
3032
#ifdef CONFIG_CGROUPS
3133

3234
/*
@@ -119,9 +121,12 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
119121
struct pid *pid, struct task_struct *tsk);
120122

121123
void cgroup_fork(struct task_struct *p);
122-
extern int cgroup_can_fork(struct task_struct *p);
123-
extern void cgroup_cancel_fork(struct task_struct *p);
124-
extern void cgroup_post_fork(struct task_struct *p);
124+
extern int cgroup_can_fork(struct task_struct *p,
125+
struct kernel_clone_args *kargs);
126+
extern void cgroup_cancel_fork(struct task_struct *p,
127+
struct kernel_clone_args *kargs);
128+
extern void cgroup_post_fork(struct task_struct *p,
129+
struct kernel_clone_args *kargs);
125130
void cgroup_exit(struct task_struct *p);
126131
void cgroup_release(struct task_struct *p);
127132
void cgroup_free(struct task_struct *p);
@@ -705,9 +710,12 @@ static inline int cgroupstats_build(struct cgroupstats *stats,
705710
struct dentry *dentry) { return -EINVAL; }
706711

707712
static inline void cgroup_fork(struct task_struct *p) {}
708-
static inline int cgroup_can_fork(struct task_struct *p) { return 0; }
709-
static inline void cgroup_cancel_fork(struct task_struct *p) {}
710-
static inline void cgroup_post_fork(struct task_struct *p) {}
713+
static inline int cgroup_can_fork(struct task_struct *p,
714+
struct kernel_clone_args *kargs) { return 0; }
715+
static inline void cgroup_cancel_fork(struct task_struct *p,
716+
struct kernel_clone_args *kargs) {}
717+
static inline void cgroup_post_fork(struct task_struct *p,
718+
struct kernel_clone_args *kargs) {}
711719
static inline void cgroup_exit(struct task_struct *p) {}
712720
static inline void cgroup_release(struct task_struct *p) {}
713721
static inline void cgroup_free(struct task_struct *p) {}

include/linux/sched/task.h

+4
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
struct task_struct;
1414
struct rusage;
1515
union thread_union;
16+
struct css_set;
1617

1718
/* All the bits taken by the old clone syscall. */
1819
#define CLONE_LEGACY_FLAGS 0xffffffffULL
@@ -29,6 +30,9 @@ struct kernel_clone_args {
2930
pid_t *set_tid;
3031
/* Number of elements in *set_tid */
3132
size_t set_tid_size;
33+
int cgroup;
34+
struct cgroup *cgrp;
35+
struct css_set *cset;
3236
};
3337

3438
/*

include/uapi/linux/sched.h

+5
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@
3535

3636
/* Flags for the clone3() syscall. */
3737
#define CLONE_CLEAR_SIGHAND 0x100000000ULL /* Clear any signal handler and reset to SIG_DFL. */
38+
#define CLONE_INTO_CGROUP 0x200000000ULL /* Clone into a specific cgroup given the right permissions. */
3839

3940
/*
4041
* cloning flags intersect with CSIGNAL so can be used with unshare and clone3
@@ -81,6 +82,8 @@
8182
* @set_tid_size: This defines the size of the array referenced
8283
* in @set_tid. This cannot be larger than the
8384
* kernel's limit of nested PID namespaces.
85+
* @cgroup: If CLONE_INTO_CGROUP is specified set this to
86+
* a file descriptor for the cgroup.
8487
*
8588
* The structure is versioned by size and thus extensible.
8689
* New struct members must go at the end of the struct and
@@ -97,11 +100,13 @@ struct clone_args {
97100
__aligned_u64 tls;
98101
__aligned_u64 set_tid;
99102
__aligned_u64 set_tid_size;
103+
__aligned_u64 cgroup;
100104
};
101105
#endif
102106

103107
#define CLONE_ARGS_SIZE_VER0 64 /* sizeof first published struct */
104108
#define CLONE_ARGS_SIZE_VER1 80 /* sizeof second published struct */
109+
#define CLONE_ARGS_SIZE_VER2 88 /* sizeof third published struct */
105110

106111
/*
107112
* Scheduling policies

kernel/cgroup/cgroup.c

+168-23
Original file line numberDiff line numberDiff line change
@@ -5881,8 +5881,7 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
58815881
* @child: pointer to task_struct of the child process being forked.
58825882
*
58835883
* A task is associated with the init_css_set until cgroup_post_fork()
5884-
* attaches it to the parent's css_set. Empty cg_list indicates that
5885-
* @child isn't holding reference to its css_set.
5884+
* attaches it to the target css_set.
58865885
*/
58875886
void cgroup_fork(struct task_struct *child)
58885887
{
@@ -5908,24 +5907,154 @@ static struct cgroup *cgroup_get_from_file(struct file *f)
59085907
return cgrp;
59095908
}
59105909

5910+
/**
5911+
* cgroup_css_set_fork - find or create a css_set for a child process
5912+
* @kargs: the arguments passed to create the child process
5913+
*
5914+
* This function finds or creates a new css_set which the child
5915+
* process will be attached to in cgroup_post_fork(). By default,
5916+
* the child process will be given the same css_set as its parent.
5917+
*
5918+
* If CLONE_INTO_CGROUP is specified this function will try to find an
5919+
* existing css_set which includes the requested cgroup and if not create
5920+
* a new css_set that the child will be attached to later. If this function
5921+
* succeeds it will hold cgroup_threadgroup_rwsem on return. If
5922+
* CLONE_INTO_CGROUP is requested this function will grab cgroup_mutex
5923+
* before grabbing cgroup_threadgroup_rwsem and will hold a reference
5924+
* to the target cgroup.
5925+
*/
5926+
static int cgroup_css_set_fork(struct kernel_clone_args *kargs)
5927+
__acquires(&cgroup_mutex) __acquires(&cgroup_threadgroup_rwsem)
5928+
{
5929+
int ret;
5930+
struct cgroup *dst_cgrp = NULL;
5931+
struct css_set *cset;
5932+
struct super_block *sb;
5933+
struct file *f;
5934+
5935+
if (kargs->flags & CLONE_INTO_CGROUP)
5936+
mutex_lock(&cgroup_mutex);
5937+
5938+
cgroup_threadgroup_change_begin(current);
5939+
5940+
spin_lock_irq(&css_set_lock);
5941+
cset = task_css_set(current);
5942+
get_css_set(cset);
5943+
spin_unlock_irq(&css_set_lock);
5944+
5945+
if (!(kargs->flags & CLONE_INTO_CGROUP)) {
5946+
kargs->cset = cset;
5947+
return 0;
5948+
}
5949+
5950+
f = fget_raw(kargs->cgroup);
5951+
if (!f) {
5952+
ret = -EBADF;
5953+
goto err;
5954+
}
5955+
sb = f->f_path.dentry->d_sb;
5956+
5957+
dst_cgrp = cgroup_get_from_file(f);
5958+
if (IS_ERR(dst_cgrp)) {
5959+
ret = PTR_ERR(dst_cgrp);
5960+
dst_cgrp = NULL;
5961+
goto err;
5962+
}
5963+
5964+
if (cgroup_is_dead(dst_cgrp)) {
5965+
ret = -ENODEV;
5966+
goto err;
5967+
}
5968+
5969+
/*
5970+
* Verify that the target cgroup is writable for us. This is
5971+
* usually done by the vfs layer but since we're not going through
5972+
* the vfs layer here we need to do it "manually".
5973+
*/
5974+
ret = cgroup_may_write(dst_cgrp, sb);
5975+
if (ret)
5976+
goto err;
5977+
5978+
ret = cgroup_attach_permissions(cset->dfl_cgrp, dst_cgrp, sb,
5979+
!(kargs->flags & CLONE_THREAD));
5980+
if (ret)
5981+
goto err;
5982+
5983+
kargs->cset = find_css_set(cset, dst_cgrp);
5984+
if (!kargs->cset) {
5985+
ret = -ENOMEM;
5986+
goto err;
5987+
}
5988+
5989+
put_css_set(cset);
5990+
fput(f);
5991+
kargs->cgrp = dst_cgrp;
5992+
return ret;
5993+
5994+
err:
5995+
cgroup_threadgroup_change_end(current);
5996+
mutex_unlock(&cgroup_mutex);
5997+
if (f)
5998+
fput(f);
5999+
if (dst_cgrp)
6000+
cgroup_put(dst_cgrp);
6001+
put_css_set(cset);
6002+
if (kargs->cset)
6003+
put_css_set(kargs->cset);
6004+
return ret;
6005+
}
6006+
6007+
/**
6008+
* cgroup_css_set_put_fork - drop references we took during fork
6009+
* @kargs: the arguments passed to create the child process
6010+
*
6011+
* Drop references to the prepared css_set and target cgroup if
6012+
* CLONE_INTO_CGROUP was requested.
6013+
*/
6014+
static void cgroup_css_set_put_fork(struct kernel_clone_args *kargs)
6015+
__releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex)
6016+
{
6017+
cgroup_threadgroup_change_end(current);
6018+
6019+
if (kargs->flags & CLONE_INTO_CGROUP) {
6020+
struct cgroup *cgrp = kargs->cgrp;
6021+
struct css_set *cset = kargs->cset;
6022+
6023+
mutex_unlock(&cgroup_mutex);
6024+
6025+
if (cset) {
6026+
put_css_set(cset);
6027+
kargs->cset = NULL;
6028+
}
6029+
6030+
if (cgrp) {
6031+
cgroup_put(cgrp);
6032+
kargs->cgrp = NULL;
6033+
}
6034+
}
6035+
}
6036+
59116037
/**
59126038
* cgroup_can_fork - called on a new task before the process is exposed
59136039
* @child: the child process
59146040
*
6041+
* This prepares a new css_set for the child process which the child will
6042+
* be attached to in cgroup_post_fork().
59156043
* This calls the subsystem can_fork() callbacks. If a subsystem's can_fork()
59166044
* callback returns an error, the fork aborts with that error code. This
59176045
* allows for a cgroup subsystem to conditionally allow or deny new forks.
59186046
*/
5919-
int cgroup_can_fork(struct task_struct *child)
5920-
__acquires(&cgroup_threadgroup_rwsem) __releases(&cgroup_threadgroup_rwsem)
6047+
int cgroup_can_fork(struct task_struct *child, struct kernel_clone_args *kargs)
59216048
{
59226049
struct cgroup_subsys *ss;
59236050
int i, j, ret;
59246051

5925-
cgroup_threadgroup_change_begin(current);
6052+
ret = cgroup_css_set_fork(kargs);
6053+
if (ret)
6054+
return ret;
59266055

59276056
do_each_subsys_mask(ss, i, have_canfork_callback) {
5928-
ret = ss->can_fork(child);
6057+
ret = ss->can_fork(child, kargs->cset);
59296058
if (ret)
59306059
goto out_revert;
59316060
} while_each_subsys_mask();
@@ -5937,32 +6066,34 @@ int cgroup_can_fork(struct task_struct *child)
59376066
if (j >= i)
59386067
break;
59396068
if (ss->cancel_fork)
5940-
ss->cancel_fork(child);
6069+
ss->cancel_fork(child, kargs->cset);
59416070
}
59426071

5943-
cgroup_threadgroup_change_end(current);
6072+
cgroup_css_set_put_fork(kargs);
59446073

59456074
return ret;
59466075
}
59476076

59486077
/**
5949-
* cgroup_cancel_fork - called if a fork failed after cgroup_can_fork()
5950-
* @child: the child process
5951-
*
5952-
* This calls the cancel_fork() callbacks if a fork failed *after*
5953-
* cgroup_can_fork() succeded.
5954-
*/
5955-
void cgroup_cancel_fork(struct task_struct *child)
5956-
__releases(&cgroup_threadgroup_rwsem)
6078+
* cgroup_cancel_fork - called if a fork failed after cgroup_can_fork()
6079+
* @child: the child process
6080+
* @kargs: the arguments passed to create the child process
6081+
*
6082+
* This calls the cancel_fork() callbacks if a fork failed *after*
6083+
* cgroup_can_fork() succeeded and cleans up references we took to
6084+
* prepare a new css_set for the child process in cgroup_can_fork().
6085+
*/
6086+
void cgroup_cancel_fork(struct task_struct *child,
6087+
struct kernel_clone_args *kargs)
59576088
{
59586089
struct cgroup_subsys *ss;
59596090
int i;
59606091

59616092
for_each_subsys(ss, i)
59626093
if (ss->cancel_fork)
5963-
ss->cancel_fork(child);
6094+
ss->cancel_fork(child, kargs->cset);
59646095

5965-
cgroup_threadgroup_change_end(current);
6096+
cgroup_css_set_put_fork(kargs);
59666097
}
59676098

59686099
/**
@@ -5972,22 +6103,27 @@ void cgroup_cancel_fork(struct task_struct *child)
59726103
* Attach the child process to its css_set calling the subsystem fork()
59736104
* callbacks.
59746105
*/
5975-
void cgroup_post_fork(struct task_struct *child)
5976-
__releases(&cgroup_threadgroup_rwsem)
6106+
void cgroup_post_fork(struct task_struct *child,
6107+
struct kernel_clone_args *kargs)
6108+
__releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex)
59776109
{
59786110
struct cgroup_subsys *ss;
59796111
struct css_set *cset;
59806112
int i;
59816113

6114+
cset = kargs->cset;
6115+
kargs->cset = NULL;
6116+
59826117
spin_lock_irq(&css_set_lock);
59836118

59846119
/* init tasks are special, only link regular threads */
59856120
if (likely(child->pid)) {
59866121
WARN_ON_ONCE(!list_empty(&child->cg_list));
5987-
cset = task_css_set(current); /* current is @child's parent */
5988-
get_css_set(cset);
59896122
cset->nr_tasks++;
59906123
css_set_move_task(child, NULL, cset, false);
6124+
} else {
6125+
put_css_set(cset);
6126+
cset = NULL;
59916127
}
59926128

59936129
/*
@@ -6020,7 +6156,16 @@ void cgroup_post_fork(struct task_struct *child)
60206156
ss->fork(child);
60216157
} while_each_subsys_mask();
60226158

6023-
cgroup_threadgroup_change_end(current);
6159+
/* Make the new cset the root_cset of the new cgroup namespace. */
6160+
if (kargs->flags & CLONE_NEWCGROUP) {
6161+
struct css_set *rcset = child->nsproxy->cgroup_ns->root_cset;
6162+
6163+
get_css_set(cset);
6164+
child->nsproxy->cgroup_ns->root_cset = cset;
6165+
put_css_set(rcset);
6166+
}
6167+
6168+
cgroup_css_set_put_fork(kargs);
60246169
}
60256170

60266171
/**

0 commit comments

Comments
 (0)