Skip to content

Commit 1f7dd3e

Browse files
committed
cgroup: fix handling of multi-destination migration from subtree_control enabling
Consider the following v2 hierarchy. P0 (+memory) --- P1 (-memory) --- A \- B P0 has memory enabled in its subtree_control while P1 doesn't. If both A and B contain processes, they would belong to the memory css of P1. Now if memory is enabled on P1's subtree_control, memory csses should be created on both A and B and A's processes should be moved to the former and B's processes the latter. IOW, enabling controllers can cause atomic migrations into different csses. The core cgroup migration logic has been updated accordingly but the controller migration methods haven't and still assume that all tasks migrate to a single target css; furthermore, the methods were fed the css in which subtree_control was updated which is the parent of the target csses. pids controller depends on the migration methods to move charges and this made the controller attribute charges to the wrong csses often triggering the following warning by driving a counter negative. WARNING: CPU: 1 PID: 1 at kernel/cgroup_pids.c:97 pids_cancel.constprop.6+0x31/0x40() Modules linked in: CPU: 1 PID: 1 Comm: systemd Not tainted 4.4.0-rc1+ #29 ... ffffffff81f65382 ffff88007c043b90 ffffffff81551ffc 0000000000000000 ffff88007c043bc8 ffffffff810de202 ffff88007a752000 ffff88007a29ab00 ffff88007c043c80 ffff88007a1d8400 0000000000000001 ffff88007c043bd8 Call Trace: [<ffffffff81551ffc>] dump_stack+0x4e/0x82 [<ffffffff810de202>] warn_slowpath_common+0x82/0xc0 [<ffffffff810de2fa>] warn_slowpath_null+0x1a/0x20 [<ffffffff8118e031>] pids_cancel.constprop.6+0x31/0x40 [<ffffffff8118e0fd>] pids_can_attach+0x6d/0xf0 [<ffffffff81188a4c>] cgroup_taskset_migrate+0x6c/0x330 [<ffffffff81188e05>] cgroup_migrate+0xf5/0x190 [<ffffffff81189016>] cgroup_attach_task+0x176/0x200 [<ffffffff8118949d>] __cgroup_procs_write+0x2ad/0x460 [<ffffffff81189684>] cgroup_procs_write+0x14/0x20 [<ffffffff811854e5>] cgroup_file_write+0x35/0x1c0 [<ffffffff812e26f1>] kernfs_fop_write+0x141/0x190 [<ffffffff81265f88>] __vfs_write+0x28/0xe0 [<ffffffff812666fc>] vfs_write+0xac/0x1a0 [<ffffffff81267019>] SyS_write+0x49/0xb0 [<ffffffff81bcef32>] entry_SYSCALL_64_fastpath+0x12/0x76 This patch fixes the bug by removing @css parameter from the three migration methods, ->can_attach, ->cancel_attach() and ->attach() and updating cgroup_taskset iteration helpers also return the destination css in addition to the task being migrated. All controllers are updated accordingly. * Controllers which don't care whether there are one or multiple target csses can be converted trivially. cpu, io, freezer, perf, netclassid and netprio fall in this category. * cpuset's current implementation assumes that there's single source and destination and thus doesn't support v2 hierarchy already. The only change made by this patchset is how that single destination css is obtained. * memory migration path already doesn't do anything on v2. How the single destination css is obtained is updated and the prep stage of mem_cgroup_can_attach() is reordered to accomodate the change. * pids is the only controller which was affected by this bug. It now correctly handles multi-destination migrations and no longer causes counter underflow from incorrect accounting. Signed-off-by: Tejun Heo <[email protected]> Reported-and-tested-by: Daniel Wagner <[email protected]> Cc: Aleksa Sarai <[email protected]>
1 parent 599c963 commit 1f7dd3e

File tree

12 files changed

+137
-92
lines changed

12 files changed

+137
-92
lines changed

block/blk-cgroup.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1127,15 +1127,15 @@ void blkcg_exit_queue(struct request_queue *q)
11271127
* of the main cic data structures. For now we allow a task to change
11281128
* its cgroup only if it's the only owner of its ioc.
11291129
*/
1130-
static int blkcg_can_attach(struct cgroup_subsys_state *css,
1131-
struct cgroup_taskset *tset)
1130+
static int blkcg_can_attach(struct cgroup_taskset *tset)
11321131
{
11331132
struct task_struct *task;
1133+
struct cgroup_subsys_state *dst_css;
11341134
struct io_context *ioc;
11351135
int ret = 0;
11361136

11371137
/* task_lock() is needed to avoid races with exit_io_context() */
1138-
cgroup_taskset_for_each(task, tset) {
1138+
cgroup_taskset_for_each(task, dst_css, tset) {
11391139
task_lock(task);
11401140
ioc = task->io_context;
11411141
if (ioc && atomic_read(&ioc->nr_tasks) > 1)

include/linux/cgroup-defs.h

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -422,12 +422,9 @@ struct cgroup_subsys {
422422
void (*css_reset)(struct cgroup_subsys_state *css);
423423
void (*css_e_css_changed)(struct cgroup_subsys_state *css);
424424

425-
int (*can_attach)(struct cgroup_subsys_state *css,
426-
struct cgroup_taskset *tset);
427-
void (*cancel_attach)(struct cgroup_subsys_state *css,
428-
struct cgroup_taskset *tset);
429-
void (*attach)(struct cgroup_subsys_state *css,
430-
struct cgroup_taskset *tset);
425+
int (*can_attach)(struct cgroup_taskset *tset);
426+
void (*cancel_attach)(struct cgroup_taskset *tset);
427+
void (*attach)(struct cgroup_taskset *tset);
431428
int (*can_fork)(struct task_struct *task, void **priv_p);
432429
void (*cancel_fork)(struct task_struct *task, void *priv);
433430
void (*fork)(struct task_struct *task, void *priv);

include/linux/cgroup.h

Lines changed: 22 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -120,8 +120,10 @@ struct cgroup_subsys_state *css_rightmost_descendant(struct cgroup_subsys_state
120120
struct cgroup_subsys_state *css_next_descendant_post(struct cgroup_subsys_state *pos,
121121
struct cgroup_subsys_state *css);
122122

123-
struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset);
124-
struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset);
123+
struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset,
124+
struct cgroup_subsys_state **dst_cssp);
125+
struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset,
126+
struct cgroup_subsys_state **dst_cssp);
125127

126128
void css_task_iter_start(struct cgroup_subsys_state *css,
127129
struct css_task_iter *it);
@@ -236,30 +238,39 @@ void css_task_iter_end(struct css_task_iter *it);
236238
/**
237239
* cgroup_taskset_for_each - iterate cgroup_taskset
238240
* @task: the loop cursor
241+
* @dst_css: the destination css
239242
* @tset: taskset to iterate
240243
*
241244
* @tset may contain multiple tasks and they may belong to multiple
242-
* processes. When there are multiple tasks in @tset, if a task of a
243-
* process is in @tset, all tasks of the process are in @tset. Also, all
244-
* are guaranteed to share the same source and destination csses.
245+
* processes.
246+
*
247+
* On the v2 hierarchy, there may be tasks from multiple processes and they
248+
* may not share the source or destination csses.
249+
*
250+
* On traditional hierarchies, when there are multiple tasks in @tset, if a
251+
* task of a process is in @tset, all tasks of the process are in @tset.
252+
* Also, all are guaranteed to share the same source and destination csses.
245253
*
246254
* Iteration is not in any specific order.
247255
*/
248-
#define cgroup_taskset_for_each(task, tset) \
249-
for ((task) = cgroup_taskset_first((tset)); (task); \
250-
(task) = cgroup_taskset_next((tset)))
256+
#define cgroup_taskset_for_each(task, dst_css, tset) \
257+
for ((task) = cgroup_taskset_first((tset), &(dst_css)); \
258+
(task); \
259+
(task) = cgroup_taskset_next((tset), &(dst_css)))
251260

252261
/**
253262
* cgroup_taskset_for_each_leader - iterate group leaders in a cgroup_taskset
254263
* @leader: the loop cursor
264+
* @dst_css: the destination css
255265
* @tset: takset to iterate
256266
*
257267
* Iterate threadgroup leaders of @tset. For single-task migrations, @tset
258268
* may not contain any.
259269
*/
260-
#define cgroup_taskset_for_each_leader(leader, tset) \
261-
for ((leader) = cgroup_taskset_first((tset)); (leader); \
262-
(leader) = cgroup_taskset_next((tset))) \
270+
#define cgroup_taskset_for_each_leader(leader, dst_css, tset) \
271+
for ((leader) = cgroup_taskset_first((tset), &(dst_css)); \
272+
(leader); \
273+
(leader) = cgroup_taskset_next((tset), &(dst_css))) \
263274
if ((leader) != (leader)->group_leader) \
264275
; \
265276
else

kernel/cgroup.c

Lines changed: 34 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -2237,6 +2237,9 @@ struct cgroup_taskset {
22372237
struct list_head src_csets;
22382238
struct list_head dst_csets;
22392239

2240+
/* the subsys currently being processed */
2241+
int ssid;
2242+
22402243
/*
22412244
* Fields for cgroup_taskset_*() iteration.
22422245
*
@@ -2299,25 +2302,29 @@ static void cgroup_taskset_add(struct task_struct *task,
22992302
/**
23002303
* cgroup_taskset_first - reset taskset and return the first task
23012304
* @tset: taskset of interest
2305+
* @dst_cssp: output variable for the destination css
23022306
*
23032307
* @tset iteration is initialized and the first task is returned.
23042308
*/
2305-
struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset)
2309+
struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset,
2310+
struct cgroup_subsys_state **dst_cssp)
23062311
{
23072312
tset->cur_cset = list_first_entry(tset->csets, struct css_set, mg_node);
23082313
tset->cur_task = NULL;
23092314

2310-
return cgroup_taskset_next(tset);
2315+
return cgroup_taskset_next(tset, dst_cssp);
23112316
}
23122317

23132318
/**
23142319
* cgroup_taskset_next - iterate to the next task in taskset
23152320
* @tset: taskset of interest
2321+
* @dst_cssp: output variable for the destination css
23162322
*
23172323
* Return the next task in @tset. Iteration must have been initialized
23182324
* with cgroup_taskset_first().
23192325
*/
2320-
struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset)
2326+
struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset,
2327+
struct cgroup_subsys_state **dst_cssp)
23212328
{
23222329
struct css_set *cset = tset->cur_cset;
23232330
struct task_struct *task = tset->cur_task;
@@ -2332,6 +2339,18 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset)
23322339
if (&task->cg_list != &cset->mg_tasks) {
23332340
tset->cur_cset = cset;
23342341
tset->cur_task = task;
2342+
2343+
/*
2344+
* This function may be called both before and
2345+
* after cgroup_taskset_migrate(). The two cases
2346+
* can be distinguished by looking at whether @cset
2347+
* has its ->mg_dst_cset set.
2348+
*/
2349+
if (cset->mg_dst_cset)
2350+
*dst_cssp = cset->mg_dst_cset->subsys[tset->ssid];
2351+
else
2352+
*dst_cssp = cset->subsys[tset->ssid];
2353+
23352354
return task;
23362355
}
23372356

@@ -2367,7 +2386,8 @@ static int cgroup_taskset_migrate(struct cgroup_taskset *tset,
23672386
/* check that we can legitimately attach to the cgroup */
23682387
for_each_e_css(css, i, dst_cgrp) {
23692388
if (css->ss->can_attach) {
2370-
ret = css->ss->can_attach(css, tset);
2389+
tset->ssid = i;
2390+
ret = css->ss->can_attach(tset);
23712391
if (ret) {
23722392
failed_css = css;
23732393
goto out_cancel_attach;
@@ -2400,9 +2420,12 @@ static int cgroup_taskset_migrate(struct cgroup_taskset *tset,
24002420
*/
24012421
tset->csets = &tset->dst_csets;
24022422

2403-
for_each_e_css(css, i, dst_cgrp)
2404-
if (css->ss->attach)
2405-
css->ss->attach(css, tset);
2423+
for_each_e_css(css, i, dst_cgrp) {
2424+
if (css->ss->attach) {
2425+
tset->ssid = i;
2426+
css->ss->attach(tset);
2427+
}
2428+
}
24062429

24072430
ret = 0;
24082431
goto out_release_tset;
@@ -2411,8 +2434,10 @@ static int cgroup_taskset_migrate(struct cgroup_taskset *tset,
24112434
for_each_e_css(css, i, dst_cgrp) {
24122435
if (css == failed_css)
24132436
break;
2414-
if (css->ss->cancel_attach)
2415-
css->ss->cancel_attach(css, tset);
2437+
if (css->ss->cancel_attach) {
2438+
tset->ssid = i;
2439+
css->ss->cancel_attach(tset);
2440+
}
24162441
}
24172442
out_release_tset:
24182443
spin_lock_bh(&css_set_lock);

kernel/cgroup_freezer.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -155,10 +155,10 @@ static void freezer_css_free(struct cgroup_subsys_state *css)
155155
* @freezer->lock. freezer_attach() makes the new tasks conform to the
156156
* current state and all following state changes can see the new tasks.
157157
*/
158-
static void freezer_attach(struct cgroup_subsys_state *new_css,
159-
struct cgroup_taskset *tset)
158+
static void freezer_attach(struct cgroup_taskset *tset)
160159
{
161160
struct task_struct *task;
161+
struct cgroup_subsys_state *new_css;
162162

163163
mutex_lock(&freezer_mutex);
164164

@@ -172,7 +172,7 @@ static void freezer_attach(struct cgroup_subsys_state *new_css,
172172
* current state before executing the following - !frozen tasks may
173173
* be visible in a FROZEN cgroup and frozen tasks in a THAWED one.
174174
*/
175-
cgroup_taskset_for_each(task, tset) {
175+
cgroup_taskset_for_each(task, new_css, tset) {
176176
struct freezer *freezer = css_freezer(new_css);
177177

178178
if (!(freezer->state & CGROUP_FREEZING)) {

kernel/cgroup_pids.c

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -162,13 +162,13 @@ static int pids_try_charge(struct pids_cgroup *pids, int num)
162162
return -EAGAIN;
163163
}
164164

165-
static int pids_can_attach(struct cgroup_subsys_state *css,
166-
struct cgroup_taskset *tset)
165+
static int pids_can_attach(struct cgroup_taskset *tset)
167166
{
168-
struct pids_cgroup *pids = css_pids(css);
169167
struct task_struct *task;
168+
struct cgroup_subsys_state *dst_css;
170169

171-
cgroup_taskset_for_each(task, tset) {
170+
cgroup_taskset_for_each(task, dst_css, tset) {
171+
struct pids_cgroup *pids = css_pids(dst_css);
172172
struct cgroup_subsys_state *old_css;
173173
struct pids_cgroup *old_pids;
174174

@@ -187,13 +187,13 @@ static int pids_can_attach(struct cgroup_subsys_state *css,
187187
return 0;
188188
}
189189

190-
static void pids_cancel_attach(struct cgroup_subsys_state *css,
191-
struct cgroup_taskset *tset)
190+
static void pids_cancel_attach(struct cgroup_taskset *tset)
192191
{
193-
struct pids_cgroup *pids = css_pids(css);
194192
struct task_struct *task;
193+
struct cgroup_subsys_state *dst_css;
195194

196-
cgroup_taskset_for_each(task, tset) {
195+
cgroup_taskset_for_each(task, dst_css, tset) {
196+
struct pids_cgroup *pids = css_pids(dst_css);
197197
struct cgroup_subsys_state *old_css;
198198
struct pids_cgroup *old_pids;
199199

kernel/cpuset.c

Lines changed: 21 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1429,15 +1429,16 @@ static int fmeter_getrate(struct fmeter *fmp)
14291429
static struct cpuset *cpuset_attach_old_cs;
14301430

14311431
/* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */
1432-
static int cpuset_can_attach(struct cgroup_subsys_state *css,
1433-
struct cgroup_taskset *tset)
1432+
static int cpuset_can_attach(struct cgroup_taskset *tset)
14341433
{
1435-
struct cpuset *cs = css_cs(css);
1434+
struct cgroup_subsys_state *css;
1435+
struct cpuset *cs;
14361436
struct task_struct *task;
14371437
int ret;
14381438

14391439
/* used later by cpuset_attach() */
1440-
cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset));
1440+
cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset, &css));
1441+
cs = css_cs(css);
14411442

14421443
mutex_lock(&cpuset_mutex);
14431444

@@ -1447,7 +1448,7 @@ static int cpuset_can_attach(struct cgroup_subsys_state *css,
14471448
(cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)))
14481449
goto out_unlock;
14491450

1450-
cgroup_taskset_for_each(task, tset) {
1451+
cgroup_taskset_for_each(task, css, tset) {
14511452
ret = task_can_attach(task, cs->cpus_allowed);
14521453
if (ret)
14531454
goto out_unlock;
@@ -1467,9 +1468,14 @@ static int cpuset_can_attach(struct cgroup_subsys_state *css,
14671468
return ret;
14681469
}
14691470

1470-
static void cpuset_cancel_attach(struct cgroup_subsys_state *css,
1471-
struct cgroup_taskset *tset)
1471+
static void cpuset_cancel_attach(struct cgroup_taskset *tset)
14721472
{
1473+
struct cgroup_subsys_state *css;
1474+
struct cpuset *cs;
1475+
1476+
cgroup_taskset_first(tset, &css);
1477+
cs = css_cs(css);
1478+
14731479
mutex_lock(&cpuset_mutex);
14741480
css_cs(css)->attach_in_progress--;
14751481
mutex_unlock(&cpuset_mutex);
@@ -1482,16 +1488,19 @@ static void cpuset_cancel_attach(struct cgroup_subsys_state *css,
14821488
*/
14831489
static cpumask_var_t cpus_attach;
14841490

1485-
static void cpuset_attach(struct cgroup_subsys_state *css,
1486-
struct cgroup_taskset *tset)
1491+
static void cpuset_attach(struct cgroup_taskset *tset)
14871492
{
14881493
/* static buf protected by cpuset_mutex */
14891494
static nodemask_t cpuset_attach_nodemask_to;
14901495
struct task_struct *task;
14911496
struct task_struct *leader;
1492-
struct cpuset *cs = css_cs(css);
1497+
struct cgroup_subsys_state *css;
1498+
struct cpuset *cs;
14931499
struct cpuset *oldcs = cpuset_attach_old_cs;
14941500

1501+
cgroup_taskset_first(tset, &css);
1502+
cs = css_cs(css);
1503+
14951504
mutex_lock(&cpuset_mutex);
14961505

14971506
/* prepare for attach */
@@ -1502,7 +1511,7 @@ static void cpuset_attach(struct cgroup_subsys_state *css,
15021511

15031512
guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
15041513

1505-
cgroup_taskset_for_each(task, tset) {
1514+
cgroup_taskset_for_each(task, css, tset) {
15061515
/*
15071516
* can_attach beforehand should guarantee that this doesn't
15081517
* fail. TODO: have a better way to handle failure here
@@ -1518,7 +1527,7 @@ static void cpuset_attach(struct cgroup_subsys_state *css,
15181527
* sleep and should be moved outside migration path proper.
15191528
*/
15201529
cpuset_attach_nodemask_to = cs->effective_mems;
1521-
cgroup_taskset_for_each_leader(leader, tset) {
1530+
cgroup_taskset_for_each_leader(leader, css, tset) {
15221531
struct mm_struct *mm = get_task_mm(leader);
15231532

15241533
if (mm) {

kernel/events/core.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9456,12 +9456,12 @@ static int __perf_cgroup_move(void *info)
94569456
return 0;
94579457
}
94589458

9459-
static void perf_cgroup_attach(struct cgroup_subsys_state *css,
9460-
struct cgroup_taskset *tset)
9459+
static void perf_cgroup_attach(struct cgroup_taskset *tset)
94619460
{
94629461
struct task_struct *task;
9462+
struct cgroup_subsys_state *css;
94639463

9464-
cgroup_taskset_for_each(task, tset)
9464+
cgroup_taskset_for_each(task, css, tset)
94659465
task_function_call(task, __perf_cgroup_move, task);
94669466
}
94679467

kernel/sched/core.c

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -8217,12 +8217,12 @@ static void cpu_cgroup_fork(struct task_struct *task, void *private)
82178217
sched_move_task(task);
82188218
}
82198219

8220-
static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css,
8221-
struct cgroup_taskset *tset)
8220+
static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
82228221
{
82238222
struct task_struct *task;
8223+
struct cgroup_subsys_state *css;
82248224

8225-
cgroup_taskset_for_each(task, tset) {
8225+
cgroup_taskset_for_each(task, css, tset) {
82268226
#ifdef CONFIG_RT_GROUP_SCHED
82278227
if (!sched_rt_can_attach(css_tg(css), task))
82288228
return -EINVAL;
@@ -8235,12 +8235,12 @@ static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css,
82358235
return 0;
82368236
}
82378237

8238-
static void cpu_cgroup_attach(struct cgroup_subsys_state *css,
8239-
struct cgroup_taskset *tset)
8238+
static void cpu_cgroup_attach(struct cgroup_taskset *tset)
82408239
{
82418240
struct task_struct *task;
8241+
struct cgroup_subsys_state *css;
82428242

8243-
cgroup_taskset_for_each(task, tset)
8243+
cgroup_taskset_for_each(task, css, tset)
82448244
sched_move_task(task);
82458245
}
82468246

0 commit comments

Comments
 (0)