From: Juri Lelli juri.lelli@redhat.com
mainline inclusion from mainline-v5.4-rc1 commit d74b27d63a8bebe2fe634944e4ebdc7b10db7a39 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I6L46J CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
cpuset_rwsem is going to be acquired from sched_setscheduler() with a following patch. There are however paths (e.g., spawn_ksoftirqd) in which sched_setscheduler() is eventually called while holding hotplug lock; this creates a dependency between hotplug lock (to be always acquired first) and cpuset_rwsem (to be always acquired after hotplug lock).
Fix paths which currently take the two locks in the wrong order (after a following patch is applied).
Tested-by: Dietmar Eggemann dietmar.eggemann@arm.com Signed-off-by: Juri Lelli juri.lelli@redhat.com Signed-off-by: Peter Zijlstra (Intel) peterz@infradead.org Cc: Linus Torvalds torvalds@linux-foundation.org Cc: Peter Zijlstra peterz@infradead.org Cc: Thomas Gleixner tglx@linutronix.de Cc: bristot@redhat.com Cc: claudio@evidence.eu.com Cc: lizefan@huawei.com Cc: longman@redhat.com Cc: luca.abeni@santannapisa.it Cc: mathieu.poirier@linaro.org Cc: rostedt@goodmis.org Cc: tj@kernel.org Cc: tommaso.cucinotta@santannapisa.it Link: https://lkml.kernel.org/r/20190719140000.31694-7-juri.lelli@redhat.com Signed-off-by: Ingo Molnar mingo@kernel.org
conflicts: kernel/cgroup/cpuset.c
Signed-off-by: Cai Xinchen caixinchen1@huawei.com Reviewed-by: Wang Weiyang wangweiyang2@huawei.com Reviewed-by: Xiu Jianfeng xiujianfeng@huawei.com Signed-off-by: Yongqiang Liu liuyongqiang13@huawei.com --- include/linux/cpuset.h | 8 ++++---- kernel/cgroup/cpuset.c | 24 +++++++++++++++++------- 2 files changed, 21 insertions(+), 11 deletions(-)
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index 934633a05d20..7f1478c26a33 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -40,14 +40,14 @@ static inline bool cpusets_enabled(void)
static inline void cpuset_inc(void) { - static_branch_inc(&cpusets_pre_enable_key); - static_branch_inc(&cpusets_enabled_key); + static_branch_inc_cpuslocked(&cpusets_pre_enable_key); + static_branch_inc_cpuslocked(&cpusets_enabled_key); }
static inline void cpuset_dec(void) { - static_branch_dec(&cpusets_enabled_key); - static_branch_dec(&cpusets_pre_enable_key); + static_branch_dec_cpuslocked(&cpusets_enabled_key); + static_branch_dec_cpuslocked(&cpusets_pre_enable_key); }
extern int cpuset_init(void); diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 55bfbc4cdb16..def36c3fc524 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -914,8 +914,8 @@ static void rebuild_sched_domains_locked(void) cpumask_var_t *doms; int ndoms;
+ lockdep_assert_cpus_held(); lockdep_assert_held(&cpuset_mutex); - get_online_cpus();
/* * We have raced with CPU hotplug. Don't do anything to avoid @@ -923,15 +923,13 @@ static void rebuild_sched_domains_locked(void) * Anyways, hotplug work item will rebuild sched domains. */ if (!cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask)) - goto out; + return;
/* Generate domain masks and attrs */ ndoms = generate_sched_domains(&doms, &attr);
/* Have scheduler rebuild the domains */ partition_sched_domains(ndoms, doms, attr); -out: - put_online_cpus(); } #else /* !CONFIG_SMP */ static void rebuild_sched_domains_locked(void) @@ -941,9 +939,11 @@ static void rebuild_sched_domains_locked(void)
void rebuild_sched_domains(void) { + get_online_cpus(); mutex_lock(&cpuset_mutex); rebuild_sched_domains_locked(); mutex_unlock(&cpuset_mutex); + put_online_cpus(); }
/** @@ -1612,13 +1612,13 @@ static void cpuset_attach(struct cgroup_taskset *tset) cgroup_taskset_first(tset, &css); cs = css_cs(css);
- mutex_lock(&cpuset_mutex); - /* * It should hold cpus lock because a cpu offline event can * cause set_cpus_allowed_ptr() failed. */ get_online_cpus(); + mutex_lock(&cpuset_mutex); + /* prepare for attach */ if (cs == &top_cpuset) cpumask_copy(cpus_attach, cpu_possible_mask); @@ -1644,7 +1644,6 @@ static void cpuset_attach(struct cgroup_taskset *tset) cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to); cpuset_update_task_spread_flag(cs, task); } - put_online_cpus();
/* * Change mm for all threadgroup leaders. This is expensive and may @@ -1680,6 +1679,7 @@ static void cpuset_attach(struct cgroup_taskset *tset) wake_up(&cpuset_attach_wq);
mutex_unlock(&cpuset_mutex); + put_online_cpus(); }
/* The various types of files and directories in a cpuset file system */ @@ -1711,6 +1711,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, cpuset_filetype_t type = cft->private; int retval = 0;
+ get_online_cpus(); mutex_lock(&cpuset_mutex); if (!is_cpuset_online(cs)) { retval = -ENODEV; @@ -1748,6 +1749,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, } out_unlock: mutex_unlock(&cpuset_mutex); + put_online_cpus(); return retval; }
@@ -1758,6 +1760,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, cpuset_filetype_t type = cft->private; int retval = -ENODEV;
+ get_online_cpus(); mutex_lock(&cpuset_mutex); if (!is_cpuset_online(cs)) goto out_unlock; @@ -1772,6 +1775,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, } out_unlock: mutex_unlock(&cpuset_mutex); + put_online_cpus(); return retval; }
@@ -1810,6 +1814,7 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of, kernfs_break_active_protection(of->kn); flush_work(&cpuset_hotplug_work);
+ get_online_cpus(); mutex_lock(&cpuset_mutex); if (!is_cpuset_online(cs)) goto out_unlock; @@ -1840,6 +1845,7 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of, free_trial_cpuset(trialcs); out_unlock: mutex_unlock(&cpuset_mutex); + put_online_cpus(); kernfs_unbreak_active_protection(of->kn); css_put(&cs->css); flush_workqueue(cpuset_migrate_mm_wq); @@ -2108,6 +2114,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) if (!parent) return 0;
+ get_online_cpus(); mutex_lock(&cpuset_mutex);
set_bit(CS_ONLINE, &cs->flags); @@ -2161,6 +2168,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) spin_unlock_irq(&callback_lock); out_unlock: mutex_unlock(&cpuset_mutex); + put_online_cpus(); return 0; }
@@ -2174,6 +2182,7 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css) { struct cpuset *cs = css_cs(css);
+ get_online_cpus(); mutex_lock(&cpuset_mutex);
if (is_sched_load_balance(cs)) @@ -2183,6 +2192,7 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css) clear_bit(CS_ONLINE, &cs->flags);
mutex_unlock(&cpuset_mutex); + put_online_cpus(); }
static void cpuset_css_free(struct cgroup_subsys_state *css)
From: Tejun Heo tj@kernel.org
mainline inclusion from mainline-v6.0-rc3 commit 4f7e7236435ca0abe005c674ebd6892c6e83aeb3 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I6L46J CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
Bringing up a CPU may involve creating and destroying tasks which requires read-locking threadgroup_rwsem, so threadgroup_rwsem nests inside cpus_read_lock(). However, cpuset's ->attach(), which may be called with threadgroup_rwsem write-locked, also wants to disable CPU hotplug and acquires cpus_read_lock(), leading to a deadlock.
Fix it by guaranteeing that ->attach() is always called with CPU hotplug disabled and removing cpus_read_lock() call from cpuset_attach().
Signed-off-by: Tejun Heo tj@kernel.org Reviewed-and-tested-by: Imran Khan imran.f.khan@oracle.com Reported-and-tested-by: Xuewen Yan xuewen.yan@unisoc.com Fixes: 05c7b7a92cc8 ("cgroup/cpuset: Fix a race between cpuset_attach() and cpu hotplug") Cc: stable@vger.kernel.org # v5.17+
conflict: kernel/cgroup/cgroup.c kernel/cgroup/cpuset.c
Signed-off-by: Cai Xinchen caixinchen1@huawei.com Reviewed-by: Wang Weiyang wangweiyang2@huawei.com Reviewed-by: Xiu Jianfeng xiujianfeng@huawei.com Signed-off-by: Yongqiang Liu liuyongqiang13@huawei.com --- kernel/cgroup/cgroup.c | 50 +++++++++++++++++++++++++++++++++++++----- kernel/cgroup/cpuset.c | 7 +----- 2 files changed, 46 insertions(+), 11 deletions(-)
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index b01490b71f32..6487df9a6be0 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -56,6 +56,7 @@ #include <linux/file.h> #include <linux/sched/cputime.h> #include <net/sock.h> +#include <linux/cpu.h>
#define CREATE_TRACE_POINTS #include <trace/events/cgroup.h> @@ -2212,6 +2213,45 @@ int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) } EXPORT_SYMBOL_GPL(task_cgroup_path);
+/** + * cgroup_attach_lock - Lock for ->attach() + * @lock_threadgroup: whether to down_write cgroup_threadgroup_rwsem + * + * cgroup migration sometimes needs to stabilize threadgroups against forks and + * exits by write-locking cgroup_threadgroup_rwsem. However, some ->attach() + * implementations (e.g. cpuset), also need to disable CPU hotplug. + * Unfortunately, letting ->attach() operations acquire cpus_read_lock() can + * lead to deadlocks. + * + * Bringing up a CPU may involve creating and destroying tasks which requires + * read-locking threadgroup_rwsem, so threadgroup_rwsem nests inside + * cpus_read_lock(). If we call an ->attach() which acquires the cpus lock while + * write-locking threadgroup_rwsem, the locking order is reversed and we end up + * waiting for an on-going CPU hotplug operation which in turn is waiting for + * the threadgroup_rwsem to be released to create new tasks. For more details: + * + * http://lkml.kernel.org/r/20220711174629.uehfmqegcwn2lqzu@wubuntu + * + * Resolve the situation by always acquiring cpus_read_lock() before optionally + * write-locking cgroup_threadgroup_rwsem. This allows ->attach() to assume that + * CPU hotplug is disabled on entry. + */ +static void cgroup_attach_lock(void) +{ + cpus_read_lock(); + percpu_down_write(&cgroup_threadgroup_rwsem); +} + +/** + * cgroup_attach_unlock - Undo cgroup_attach_lock() + * @lock_threadgroup: whether to up_write cgroup_threadgroup_rwsem + */ +static void cgroup_attach_unlock(void) +{ + percpu_up_write(&cgroup_threadgroup_rwsem); + cpus_read_unlock(); +} + /** * cgroup_migrate_add_task - add a migration target task to a migration context * @task: target task @@ -2691,7 +2731,7 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup) if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0) return ERR_PTR(-EINVAL);
- percpu_down_write(&cgroup_threadgroup_rwsem); + cgroup_attach_lock();
rcu_read_lock(); if (pid) { @@ -2722,7 +2762,7 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup) goto out_unlock_rcu;
out_unlock_threadgroup: - percpu_up_write(&cgroup_threadgroup_rwsem); + cgroup_attach_unlock(); out_unlock_rcu: rcu_read_unlock(); return tsk; @@ -2737,7 +2777,7 @@ void cgroup_procs_write_finish(struct task_struct *task) /* release reference from cgroup_procs_write_start() */ put_task_struct(task);
- percpu_up_write(&cgroup_threadgroup_rwsem); + cgroup_attach_unlock(); for_each_subsys(ss, ssid) if (ss->post_attach) ss->post_attach(); @@ -2818,7 +2858,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
lockdep_assert_held(&cgroup_mutex);
- percpu_down_write(&cgroup_threadgroup_rwsem); + cgroup_attach_lock();
/* look up all csses currently attached to @cgrp's subtree */ spin_lock_irq(&css_set_lock); @@ -2848,7 +2888,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) ret = cgroup_migrate_execute(&mgctx); out_finish: cgroup_migrate_finish(&mgctx); - percpu_up_write(&cgroup_threadgroup_rwsem); + cgroup_attach_unlock(); return ret; }
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index def36c3fc524..1d13d64108a0 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -1612,11 +1612,7 @@ static void cpuset_attach(struct cgroup_taskset *tset) cgroup_taskset_first(tset, &css); cs = css_cs(css);
- /* - * It should hold cpus lock because a cpu offline event can - * cause set_cpus_allowed_ptr() failed. - */ - get_online_cpus(); + lockdep_assert_cpus_held(); /* see cgroup_attach_lock() */ mutex_lock(&cpuset_mutex);
/* prepare for attach */ @@ -1679,7 +1675,6 @@ static void cpuset_attach(struct cgroup_taskset *tset) wake_up(&cpuset_attach_wq);
mutex_unlock(&cpuset_mutex); - put_online_cpus(); }
/* The various types of files and directories in a cpuset file system */
From: Tetsuo Handa penguin-kernel@I-love.SAKURA.ne.jp
mainline inclusion from mainline-v6.0-rc3 commit 43626dade36fa74d3329046f4ae2d7fdefe401c6 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I6L46J CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
syzbot is hitting percpu_rwsem_assert_held(&cpu_hotplug_lock) warning at cpuset_attach() [1], for commit 4f7e7236435ca0ab ("cgroup: Fix threadgroup_rwsem <-> cpus_read_lock() deadlock") missed that cpuset_attach() is also called from cgroup_attach_task_all(). Add cpus_read_lock() like what cgroup_procs_write_start() does.
Link: https://syzkaller.appspot.com/bug?extid=29d3a3b4d86c8136ad9e [1] Reported-by: syzbot syzbot+29d3a3b4d86c8136ad9e@syzkaller.appspotmail.com Signed-off-by: Tetsuo Handa penguin-kernel@I-love.SAKURA.ne.jp Fixes: 4f7e7236435ca0ab ("cgroup: Fix threadgroup_rwsem <-> cpus_read_lock() deadlock") Signed-off-by: Tejun Heo tj@kernel.org
conflicts: kernel/cgroup/cgroup-internal.h kernel/cgroup/cgroup-v1.c kernel/cgroup/cgroup.c
Signed-off-by: Cai Xinchen caixinchen1@huawei.com Reviewed-by: Wang Weiyang wangweiyang2@huawei.com Reviewed-by: Xiu Jianfeng xiujianfeng@huawei.com Reviewed-by: Wang Weiyang wangweiyang2@huawei.com Signed-off-by: Yongqiang Liu liuyongqiang13@huawei.com --- kernel/cgroup/cgroup-internal.h | 2 ++ kernel/cgroup/cgroup-v1.c | 4 ++-- kernel/cgroup/cgroup.c | 4 ++-- 3 files changed, 6 insertions(+), 4 deletions(-)
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index 2e65e4c4d6e7..edb45e2f7f54 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -226,6 +226,8 @@ int cgroup_migrate(struct task_struct *leader, bool threadgroup,
int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, bool threadgroup); +void cgroup_attach_lock(void); +void cgroup_attach_unlock(void); struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup) __acquires(&cgroup_threadgroup_rwsem); void cgroup_procs_write_finish(struct task_struct *task) diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index c4cc6c1ddacd..8bd36f2143eb 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -55,7 +55,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) int retval = 0;
mutex_lock(&cgroup_mutex); - percpu_down_write(&cgroup_threadgroup_rwsem); + cgroup_attach_lock(); for_each_root(root) { struct cgroup *from_cgrp;
@@ -70,7 +70,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) if (retval) break; } - percpu_up_write(&cgroup_threadgroup_rwsem); + cgroup_attach_unlock(); mutex_unlock(&cgroup_mutex);
return retval; diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 6487df9a6be0..4a4d8a3f06ab 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -2236,7 +2236,7 @@ EXPORT_SYMBOL_GPL(task_cgroup_path); * write-locking cgroup_threadgroup_rwsem. This allows ->attach() to assume that * CPU hotplug is disabled on entry. */ -static void cgroup_attach_lock(void) +void cgroup_attach_lock(void) { cpus_read_lock(); percpu_down_write(&cgroup_threadgroup_rwsem); @@ -2246,7 +2246,7 @@ static void cgroup_attach_lock(void) * cgroup_attach_unlock - Undo cgroup_attach_lock() * @lock_threadgroup: whether to up_write cgroup_threadgroup_rwsem */ -static void cgroup_attach_unlock(void) +void cgroup_attach_unlock(void) { percpu_up_write(&cgroup_threadgroup_rwsem); cpus_read_unlock();