Skip to content

Commit 05f7658

Browse files
htejungregkh
authored andcommitted
cgroup: Use separate src/dst nodes when preloading css_sets for migration
commit 07fd5b6cdf3cc30bfde8fe0f644771688be04447 upstream. Each cset (css_set) is pinned by its tasks. When we're moving tasks around across csets for a migration, we need to hold the source and destination csets to ensure that they don't go away while we're moving tasks about. This is done by linking cset->mg_preload_node on either the mgctx->preloaded_src_csets or mgctx->preloaded_dst_csets list. Using the same cset->mg_preload_node for both the src and dst lists was deemed okay as a cset can't be both the source and destination at the same time. Unfortunately, this overloading becomes problematic when multiple tasks are involved in a migration and some of them are identity noop migrations while others are actually moving across cgroups. For example, this can happen with the following sequence on cgroup1: PeterCxy#1> mkdir -p /sys/fs/cgroup/misc/a/b OnePlusOSS#2> echo $$ > /sys/fs/cgroup/misc/a/cgroup.procs OnePlusOSS#3> RUN_A_COMMAND_WHICH_CREATES_MULTIPLE_THREADS & OnePlusOSS#4> PID=$! OnePlusOSS#5> echo $PID > /sys/fs/cgroup/misc/a/b/tasks OnePlusOSS#6> echo $PID > /sys/fs/cgroup/misc/a/cgroup.procs the process including the group leader back into a. In this final migration, non-leader threads would be doing identity migration while the group leader is doing an actual one. After OnePlusOSS#3, let's say the whole process was in cset A, and that after OnePlusOSS#4, the leader moves to cset B. Then, during OnePlusOSS#6, the following happens: 1. cgroup_migrate_add_src() is called on B for the leader. 2. cgroup_migrate_add_src() is called on A for the other threads. 3. cgroup_migrate_prepare_dst() is called. It scans the src list. 4. It notices that B wants to migrate to A, so it tries to A to the dst list but realizes that its ->mg_preload_node is already busy. 5. and then it notices A wants to migrate to A as it's an identity migration, it culls it by list_del_init()'ing its ->mg_preload_node and putting references accordingly. 6. The rest of migration takes place with B on the src list but nothing on the dst list. This means that A isn't held while migration is in progress. If all tasks leave A before the migration finishes and the incoming task pins it, the cset will be destroyed leading to use-after-free. This is caused by overloading cset->mg_preload_node for both src and dst preload lists. We wanted to exclude the cset from the src list but ended up inadvertently excluding it from the dst list too. This patch fixes the issue by separating out cset->mg_preload_node into ->mg_src_preload_node and ->mg_dst_preload_node, so that the src and dst preloadings don't interfere with each other. Signed-off-by: Tejun Heo <tj@kernel.org> Reported-by: Mukesh Ojha <quic_mojha@quicinc.com> Reported-by: shisiyuan <shisiyuan19870131@gmail.com> Link: http://lkml.kernel.org/r/1654187688-27411-1-git-send-email-shisiyuan@xiaomi.com Link: https://www.spinics.net/lists/cgroups/msg33313.html Fixes: f817de9 ("cgroup: prepare migration path for unified hierarchy") Cc: stable@vger.kernel.org # v3.16+ Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
1 parent 9b2e184 commit 05f7658

2 files changed

Lines changed: 25 additions & 15 deletions

File tree

include/linux/cgroup-defs.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -235,7 +235,8 @@ struct css_set {
235235
* List of csets participating in the on-going migration either as
236236
* source or destination. Protected by cgroup_mutex.
237237
*/
238-
struct list_head mg_preload_node;
238+
struct list_head mg_src_preload_node;
239+
struct list_head mg_dst_preload_node;
239240
struct list_head mg_node;
240241

241242
/*

kernel/cgroup/cgroup.c

Lines changed: 23 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -647,7 +647,8 @@ struct css_set init_css_set = {
647647
.task_iters = LIST_HEAD_INIT(init_css_set.task_iters),
648648
.threaded_csets = LIST_HEAD_INIT(init_css_set.threaded_csets),
649649
.cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links),
650-
.mg_preload_node = LIST_HEAD_INIT(init_css_set.mg_preload_node),
650+
.mg_src_preload_node = LIST_HEAD_INIT(init_css_set.mg_src_preload_node),
651+
.mg_dst_preload_node = LIST_HEAD_INIT(init_css_set.mg_dst_preload_node),
651652
.mg_node = LIST_HEAD_INIT(init_css_set.mg_node),
652653
};
653654

@@ -1113,7 +1114,8 @@ static struct css_set *find_css_set(struct css_set *old_cset,
11131114
INIT_LIST_HEAD(&cset->threaded_csets);
11141115
INIT_HLIST_NODE(&cset->hlist);
11151116
INIT_LIST_HEAD(&cset->cgrp_links);
1116-
INIT_LIST_HEAD(&cset->mg_preload_node);
1117+
INIT_LIST_HEAD(&cset->mg_src_preload_node);
1118+
INIT_LIST_HEAD(&cset->mg_dst_preload_node);
11171119
INIT_LIST_HEAD(&cset->mg_node);
11181120

11191121
/* Copy the set of subsystem state objects generated in
@@ -2399,21 +2401,27 @@ int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp)
23992401
*/
24002402
void cgroup_migrate_finish(struct cgroup_mgctx *mgctx)
24012403
{
2402-
LIST_HEAD(preloaded);
24032404
struct css_set *cset, *tmp_cset;
24042405

24052406
lockdep_assert_held(&cgroup_mutex);
24062407

24072408
spin_lock_irq(&css_set_lock);
24082409

2409-
list_splice_tail_init(&mgctx->preloaded_src_csets, &preloaded);
2410-
list_splice_tail_init(&mgctx->preloaded_dst_csets, &preloaded);
2410+
list_for_each_entry_safe(cset, tmp_cset, &mgctx->preloaded_src_csets,
2411+
mg_src_preload_node) {
2412+
cset->mg_src_cgrp = NULL;
2413+
cset->mg_dst_cgrp = NULL;
2414+
cset->mg_dst_cset = NULL;
2415+
list_del_init(&cset->mg_src_preload_node);
2416+
put_css_set_locked(cset);
2417+
}
24112418

2412-
list_for_each_entry_safe(cset, tmp_cset, &preloaded, mg_preload_node) {
2419+
list_for_each_entry_safe(cset, tmp_cset, &mgctx->preloaded_dst_csets,
2420+
mg_dst_preload_node) {
24132421
cset->mg_src_cgrp = NULL;
24142422
cset->mg_dst_cgrp = NULL;
24152423
cset->mg_dst_cset = NULL;
2416-
list_del_init(&cset->mg_preload_node);
2424+
list_del_init(&cset->mg_dst_preload_node);
24172425
put_css_set_locked(cset);
24182426
}
24192427

@@ -2455,7 +2463,7 @@ void cgroup_migrate_add_src(struct css_set *src_cset,
24552463

24562464
src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);
24572465

2458-
if (!list_empty(&src_cset->mg_preload_node))
2466+
if (!list_empty(&src_cset->mg_src_preload_node))
24592467
return;
24602468

24612469
WARN_ON(src_cset->mg_src_cgrp);
@@ -2466,7 +2474,7 @@ void cgroup_migrate_add_src(struct css_set *src_cset,
24662474
src_cset->mg_src_cgrp = src_cgrp;
24672475
src_cset->mg_dst_cgrp = dst_cgrp;
24682476
get_css_set(src_cset);
2469-
list_add_tail(&src_cset->mg_preload_node, &mgctx->preloaded_src_csets);
2477+
list_add_tail(&src_cset->mg_src_preload_node, &mgctx->preloaded_src_csets);
24702478
}
24712479

24722480
/**
@@ -2491,7 +2499,7 @@ int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx)
24912499

24922500
/* look up the dst cset for each src cset and link it to src */
24932501
list_for_each_entry_safe(src_cset, tmp_cset, &mgctx->preloaded_src_csets,
2494-
mg_preload_node) {
2502+
mg_src_preload_node) {
24952503
struct css_set *dst_cset;
24962504
struct cgroup_subsys *ss;
24972505
int ssid;
@@ -2510,16 +2518,16 @@ int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx)
25102518
if (src_cset == dst_cset) {
25112519
src_cset->mg_src_cgrp = NULL;
25122520
src_cset->mg_dst_cgrp = NULL;
2513-
list_del_init(&src_cset->mg_preload_node);
2521+
list_del_init(&src_cset->mg_src_preload_node);
25142522
put_css_set(src_cset);
25152523
put_css_set(dst_cset);
25162524
continue;
25172525
}
25182526

25192527
src_cset->mg_dst_cset = dst_cset;
25202528

2521-
if (list_empty(&dst_cset->mg_preload_node))
2522-
list_add_tail(&dst_cset->mg_preload_node,
2529+
if (list_empty(&dst_cset->mg_dst_preload_node))
2530+
list_add_tail(&dst_cset->mg_dst_preload_node,
25232531
&mgctx->preloaded_dst_csets);
25242532
else
25252533
put_css_set(dst_cset);
@@ -2753,7 +2761,8 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
27532761
goto out_finish;
27542762

27552763
spin_lock_irq(&css_set_lock);
2756-
list_for_each_entry(src_cset, &mgctx.preloaded_src_csets, mg_preload_node) {
2764+
list_for_each_entry(src_cset, &mgctx.preloaded_src_csets,
2765+
mg_src_preload_node) {
27572766
struct task_struct *task, *ntask;
27582767

27592768
/* all tasks in src_csets need to be migrated */

0 commit comments

Comments
 (0)