From: Cai Xinchen <caixinchen1@huawei.com>
hulk inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I6TI3Y
CVE: NA
--------------------------------
This reverts commit c2d8355618485dd9108ee9077799a227771af307.
Signed-off-by: Cai Xinchen <caixinchen1@huawei.com>
Reviewed-by: Wang Weiyang <wangweiyang2@huawei.com>
Signed-off-by: Yongqiang Liu <liuyongqiang13@huawei.com>
---
 kernel/cgroup/cgroup-internal.h | 2 --
 kernel/cgroup/cgroup-v1.c       | 4 ++--
 kernel/cgroup/cgroup.c          | 4 ++--
 3 files changed, 4 insertions(+), 6 deletions(-)
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index edb45e2f7f54..2e65e4c4d6e7 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -226,8 +226,6 @@ int cgroup_migrate(struct task_struct *leader, bool threadgroup,
 
 int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
 		       bool threadgroup);
-void cgroup_attach_lock(void);
-void cgroup_attach_unlock(void);
 struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup)
 	__acquires(&cgroup_threadgroup_rwsem);
 void cgroup_procs_write_finish(struct task_struct *task)
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 8bd36f2143eb..c4cc6c1ddacd 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -55,7 +55,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
 	int retval = 0;
 
 	mutex_lock(&cgroup_mutex);
-	cgroup_attach_lock();
+	percpu_down_write(&cgroup_threadgroup_rwsem);
 	for_each_root(root) {
 		struct cgroup *from_cgrp;
 
@@ -70,7 +70,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
 		if (retval)
 			break;
 	}
-	cgroup_attach_unlock();
+	percpu_up_write(&cgroup_threadgroup_rwsem);
 	mutex_unlock(&cgroup_mutex);
 
 	return retval;
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 4a4d8a3f06ab..6487df9a6be0 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -2236,7 +2236,7 @@ EXPORT_SYMBOL_GPL(task_cgroup_path);
  * write-locking cgroup_threadgroup_rwsem. This allows ->attach() to assume that
  * CPU hotplug is disabled on entry.
  */
-void cgroup_attach_lock(void)
+static void cgroup_attach_lock(void)
 {
 	cpus_read_lock();
 	percpu_down_write(&cgroup_threadgroup_rwsem);
@@ -2246,7 +2246,7 @@ void cgroup_attach_lock(void)
  * cgroup_attach_unlock - Undo cgroup_attach_lock()
  * @lock_threadgroup: whether to up_write cgroup_threadgroup_rwsem
  */
-void cgroup_attach_unlock(void)
+static void cgroup_attach_unlock(void)
 {
 	percpu_up_write(&cgroup_threadgroup_rwsem);
 	cpus_read_unlock();
From: Cai Xinchen <caixinchen1@huawei.com>
hulk inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I6TI3Y
CVE: NA
--------------------------------
This reverts commit 4924308a1ca9cc2f791398836a8744c22078ffbd.
Signed-off-by: Cai Xinchen <caixinchen1@huawei.com>
Reviewed-by: Wang Weiyang <wangweiyang2@huawei.com>
Signed-off-by: Yongqiang Liu <liuyongqiang13@huawei.com>
---
 kernel/cgroup/cgroup.c | 50 +++++-------------------------------------
 kernel/cgroup/cpuset.c |  7 +++++-
 2 files changed, 11 insertions(+), 46 deletions(-)
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 6487df9a6be0..b01490b71f32 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -56,7 +56,6 @@
 #include <linux/file.h>
 #include <linux/sched/cputime.h>
 #include <net/sock.h>
-#include <linux/cpu.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/cgroup.h>
@@ -2213,45 +2212,6 @@ int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
 }
 EXPORT_SYMBOL_GPL(task_cgroup_path);
 
-/**
- * cgroup_attach_lock - Lock for ->attach()
- * @lock_threadgroup: whether to down_write cgroup_threadgroup_rwsem
- *
- * cgroup migration sometimes needs to stabilize threadgroups against forks and
- * exits by write-locking cgroup_threadgroup_rwsem. However, some ->attach()
- * implementations (e.g. cpuset), also need to disable CPU hotplug.
- * Unfortunately, letting ->attach() operations acquire cpus_read_lock() can
- * lead to deadlocks.
- *
- * Bringing up a CPU may involve creating and destroying tasks which requires
- * read-locking threadgroup_rwsem, so threadgroup_rwsem nests inside
- * cpus_read_lock(). If we call an ->attach() which acquires the cpus lock while
- * write-locking threadgroup_rwsem, the locking order is reversed and we end up
- * waiting for an on-going CPU hotplug operation which in turn is waiting for
- * the threadgroup_rwsem to be released to create new tasks. For more details:
- *
- * http://lkml.kernel.org/r/20220711174629.uehfmqegcwn2lqzu@wubuntu
- *
- * Resolve the situation by always acquiring cpus_read_lock() before optionally
- * write-locking cgroup_threadgroup_rwsem. This allows ->attach() to assume that
- * CPU hotplug is disabled on entry.
- */
-static void cgroup_attach_lock(void)
-{
-	cpus_read_lock();
-	percpu_down_write(&cgroup_threadgroup_rwsem);
-}
-
-/**
- * cgroup_attach_unlock - Undo cgroup_attach_lock()
- * @lock_threadgroup: whether to up_write cgroup_threadgroup_rwsem
- */
-static void cgroup_attach_unlock(void)
-{
-	percpu_up_write(&cgroup_threadgroup_rwsem);
-	cpus_read_unlock();
-}
-
 /**
  * cgroup_migrate_add_task - add a migration target task to a migration context
  * @task: target task
@@ -2731,7 +2691,7 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup)
 	if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)
 		return ERR_PTR(-EINVAL);
 
-	cgroup_attach_lock();
+	percpu_down_write(&cgroup_threadgroup_rwsem);
 
 	rcu_read_lock();
 	if (pid) {
@@ -2762,7 +2722,7 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup)
 		goto out_unlock_rcu;
 
 out_unlock_threadgroup:
-	cgroup_attach_unlock();
+	percpu_up_write(&cgroup_threadgroup_rwsem);
 out_unlock_rcu:
 	rcu_read_unlock();
 	return tsk;
@@ -2777,7 +2737,7 @@ void cgroup_procs_write_finish(struct task_struct *task)
 	/* release reference from cgroup_procs_write_start() */
 	put_task_struct(task);
 
-	cgroup_attach_unlock();
+	percpu_up_write(&cgroup_threadgroup_rwsem);
 	for_each_subsys(ss, ssid)
 		if (ss->post_attach)
 			ss->post_attach();
@@ -2858,7 +2818,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
 
 	lockdep_assert_held(&cgroup_mutex);
 
-	cgroup_attach_lock();
+	percpu_down_write(&cgroup_threadgroup_rwsem);
 
 	/* look up all csses currently attached to @cgrp's subtree */
 	spin_lock_irq(&css_set_lock);
@@ -2888,7 +2848,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
 	ret = cgroup_migrate_execute(&mgctx);
 out_finish:
 	cgroup_migrate_finish(&mgctx);
-	cgroup_attach_unlock();
+	percpu_up_write(&cgroup_threadgroup_rwsem);
 	return ret;
 }
 
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 1d13d64108a0..def36c3fc524 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -1612,7 +1612,11 @@ static void cpuset_attach(struct cgroup_taskset *tset)
 	cgroup_taskset_first(tset, &css);
 	cs = css_cs(css);
 
-	lockdep_assert_cpus_held();	/* see cgroup_attach_lock() */
+	/*
+	 * It should hold cpus lock because a cpu offline event can
+	 * cause set_cpus_allowed_ptr() failed.
+	 */
+	get_online_cpus();
 	mutex_lock(&cpuset_mutex);
 
 	/* prepare for attach */
@@ -1675,6 +1679,7 @@ static void cpuset_attach(struct cgroup_taskset *tset)
 		wake_up(&cpuset_attach_wq);
 
 	mutex_unlock(&cpuset_mutex);
+	put_online_cpus();
 }
 
 /* The various types of files and directories in a cpuset file system */
From: Cai Xinchen <caixinchen1@huawei.com>
hulk inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I6TI3Y
CVE: NA
--------------------------------
This reverts commit c831178aad59f60fa7a53c709c2856b24efa6651.
Signed-off-by: Cai Xinchen <caixinchen1@huawei.com>
Reviewed-by: Wang Weiyang <wangweiyang2@huawei.com>
Signed-off-by: Yongqiang Liu <liuyongqiang13@huawei.com>
---
 include/linux/cpuset.h |  8 ++++----
 kernel/cgroup/cpuset.c | 24 +++++++-----------------
 2 files changed, 11 insertions(+), 21 deletions(-)
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 7f1478c26a33..934633a05d20 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -40,14 +40,14 @@ static inline bool cpusets_enabled(void)
 
 static inline void cpuset_inc(void)
 {
-	static_branch_inc_cpuslocked(&cpusets_pre_enable_key);
-	static_branch_inc_cpuslocked(&cpusets_enabled_key);
+	static_branch_inc(&cpusets_pre_enable_key);
+	static_branch_inc(&cpusets_enabled_key);
 }
 
 static inline void cpuset_dec(void)
 {
-	static_branch_dec_cpuslocked(&cpusets_enabled_key);
-	static_branch_dec_cpuslocked(&cpusets_pre_enable_key);
+	static_branch_dec(&cpusets_enabled_key);
+	static_branch_dec(&cpusets_pre_enable_key);
 }
 
 extern int cpuset_init(void);
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index def36c3fc524..55bfbc4cdb16 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -914,8 +914,8 @@ static void rebuild_sched_domains_locked(void)
 	cpumask_var_t *doms;
 	int ndoms;
 
-	lockdep_assert_cpus_held();
 	lockdep_assert_held(&cpuset_mutex);
+	get_online_cpus();
 
 	/*
 	 * We have raced with CPU hotplug. Don't do anything to avoid
@@ -923,13 +923,15 @@ static void rebuild_sched_domains_locked(void)
 	 * Anyways, hotplug work item will rebuild sched domains.
 	 */
 	if (!cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
-		return;
+		goto out;
 
 	/* Generate domain masks and attrs */
 	ndoms = generate_sched_domains(&doms, &attr);
 
 	/* Have scheduler rebuild the domains */
 	partition_sched_domains(ndoms, doms, attr);
+out:
+	put_online_cpus();
 }
 #else /* !CONFIG_SMP */
 static void rebuild_sched_domains_locked(void)
@@ -939,11 +941,9 @@ static void rebuild_sched_domains_locked(void)
 
 void rebuild_sched_domains(void)
 {
-	get_online_cpus();
 	mutex_lock(&cpuset_mutex);
 	rebuild_sched_domains_locked();
 	mutex_unlock(&cpuset_mutex);
-	put_online_cpus();
 }
 
 /**
@@ -1612,13 +1612,13 @@ static void cpuset_attach(struct cgroup_taskset *tset)
 	cgroup_taskset_first(tset, &css);
 	cs = css_cs(css);
 
+	mutex_lock(&cpuset_mutex);
+
 	/*
 	 * It should hold cpus lock because a cpu offline event can
 	 * cause set_cpus_allowed_ptr() failed.
 	 */
 	get_online_cpus();
-	mutex_lock(&cpuset_mutex);
-
 	/* prepare for attach */
 	if (cs == &top_cpuset)
 		cpumask_copy(cpus_attach, cpu_possible_mask);
@@ -1644,6 +1644,7 @@ static void cpuset_attach(struct cgroup_taskset *tset)
 		cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to);
 		cpuset_update_task_spread_flag(cs, task);
 	}
+	put_online_cpus();
 
 	/*
 	 * Change mm for all threadgroup leaders. This is expensive and may
@@ -1679,7 +1680,6 @@ static void cpuset_attach(struct cgroup_taskset *tset)
 		wake_up(&cpuset_attach_wq);
 
 	mutex_unlock(&cpuset_mutex);
-	put_online_cpus();
 }
 
 /* The various types of files and directories in a cpuset file system */
@@ -1711,7 +1711,6 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
 	cpuset_filetype_t type = cft->private;
 	int retval = 0;
 
-	get_online_cpus();
 	mutex_lock(&cpuset_mutex);
 	if (!is_cpuset_online(cs)) {
 		retval = -ENODEV;
@@ -1749,7 +1748,6 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
 	}
 out_unlock:
 	mutex_unlock(&cpuset_mutex);
-	put_online_cpus();
 	return retval;
 }
 
@@ -1760,7 +1758,6 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
 	cpuset_filetype_t type = cft->private;
 	int retval = -ENODEV;
 
-	get_online_cpus();
 	mutex_lock(&cpuset_mutex);
 	if (!is_cpuset_online(cs))
 		goto out_unlock;
@@ -1775,7 +1772,6 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
 	}
 out_unlock:
 	mutex_unlock(&cpuset_mutex);
-	put_online_cpus();
 	return retval;
 }
 
@@ -1814,7 +1810,6 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
 	kernfs_break_active_protection(of->kn);
 	flush_work(&cpuset_hotplug_work);
 
-	get_online_cpus();
 	mutex_lock(&cpuset_mutex);
 	if (!is_cpuset_online(cs))
 		goto out_unlock;
@@ -1845,7 +1840,6 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
 	free_trial_cpuset(trialcs);
 out_unlock:
 	mutex_unlock(&cpuset_mutex);
-	put_online_cpus();
 	kernfs_unbreak_active_protection(of->kn);
 	css_put(&cs->css);
 	flush_workqueue(cpuset_migrate_mm_wq);
@@ -2114,7 +2108,6 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
 	if (!parent)
 		return 0;
 
-	get_online_cpus();
 	mutex_lock(&cpuset_mutex);
 
 	set_bit(CS_ONLINE, &cs->flags);
@@ -2168,7 +2161,6 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
 	spin_unlock_irq(&callback_lock);
 out_unlock:
 	mutex_unlock(&cpuset_mutex);
-	put_online_cpus();
 	return 0;
 }
 
@@ -2182,7 +2174,6 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css)
 {
 	struct cpuset *cs = css_cs(css);
 
-	get_online_cpus();
 	mutex_lock(&cpuset_mutex);
 
 	if (is_sched_load_balance(cs))
@@ -2192,7 +2183,6 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css)
 	clear_bit(CS_ONLINE, &cs->flags);
 
 	mutex_unlock(&cpuset_mutex);
-	put_online_cpus();
 }
 
 static void cpuset_css_free(struct cgroup_subsys_state *css)
From: Juri Lelli <juri.lelli@redhat.com>
stable inclusion
from stable-v4.19.280
commit 224262583fabf3b6bf2a29d033cf9a8f28fde843
category: bugfix
bugzilla: https://gitee.com/src-openeuler/kernel/issues/I6TI3Y
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=...
--------------------------------
commit d74b27d63a8bebe2fe634944e4ebdc7b10db7a39 upstream.
Commit 1243dc518c9da ("cgroup/cpuset: Convert cpuset_mutex to percpu_rwsem") is a performance patch which is not backported, so percpu_rwsem is converted to cpuset_mutex in this backport.
Commit aa44002e7db25 ("cpuset: Fix unsafe lock order between cpuset lock and cpuslock") keeps the lock order as cpuset_mutex -> cpu_hotplug_lock, so the lock order in cpuset_attach() has to be changed accordingly.
original commit message:
cpuset_rwsem is going to be acquired from sched_setscheduler() with a following patch. There are however paths (e.g., spawn_ksoftirqd) in which sched_setscheduler() is eventually called while holding the hotplug lock; this creates a dependency between the hotplug lock (to be always acquired first) and cpuset_rwsem (to be always acquired after the hotplug lock).
Fix paths which currently take the two locks in the wrong order (after a following patch is applied).
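To make the intended nesting concrete, here is a minimal sketch using the 4.19 names from this backport (cpuset_mutex stands in for the upstream cpuset_rwsem); the two helper functions are purely illustrative, only the lock calls mirror the real code:

  /* Old order in the cpuset file handlers: cpuset lock first, then the
   * hotplug lock taken inside rebuild_sched_domains_locked() -- the
   * inversion this series removes. */
  static void cpuset_write_old_order(void)	/* hypothetical helper */
  {
  	mutex_lock(&cpuset_mutex);
  	get_online_cpus();	/* cpu_hotplug_lock taken second: wrong */
  	put_online_cpus();
  	mutex_unlock(&cpuset_mutex);
  }

  /* New order after this patch: the hotplug lock is always the outer lock. */
  static void cpuset_write_new_order(void)	/* hypothetical helper */
  {
  	get_online_cpus();		/* cpu_hotplug_lock first */
  	mutex_lock(&cpuset_mutex);	/* cpuset lock second */
  	mutex_unlock(&cpuset_mutex);
  	put_online_cpus();
  }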
Tested-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Signed-off-by: Juri Lelli <juri.lelli@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: bristot@redhat.com
Cc: claudio@evidence.eu.com
Cc: lizefan@huawei.com
Cc: longman@redhat.com
Cc: luca.abeni@santannapisa.it
Cc: mathieu.poirier@linaro.org
Cc: rostedt@goodmis.org
Cc: tj@kernel.org
Cc: tommaso.cucinotta@santannapisa.it
Link: https://lkml.kernel.org/r/20190719140000.31694-7-juri.lelli@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Cai Xinchen <caixinchen1@huawei.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Cai Xinchen <caixinchen1@huawei.com>
Reviewed-by: Wang Weiyang <wangweiyang2@huawei.com>
Signed-off-by: Yongqiang Liu <liuyongqiang13@huawei.com>
---
 include/linux/cpuset.h |  8 ++++----
 kernel/cgroup/cpuset.c | 24 +++++++++++++++++-------
 2 files changed, 21 insertions(+), 11 deletions(-)
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 934633a05d20..7f1478c26a33 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -40,14 +40,14 @@ static inline bool cpusets_enabled(void)
 
 static inline void cpuset_inc(void)
 {
-	static_branch_inc(&cpusets_pre_enable_key);
-	static_branch_inc(&cpusets_enabled_key);
+	static_branch_inc_cpuslocked(&cpusets_pre_enable_key);
+	static_branch_inc_cpuslocked(&cpusets_enabled_key);
 }
 
 static inline void cpuset_dec(void)
 {
-	static_branch_dec(&cpusets_enabled_key);
-	static_branch_dec(&cpusets_pre_enable_key);
+	static_branch_dec_cpuslocked(&cpusets_enabled_key);
+	static_branch_dec_cpuslocked(&cpusets_pre_enable_key);
 }
 
 extern int cpuset_init(void);
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 55bfbc4cdb16..def36c3fc524 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -914,8 +914,8 @@ static void rebuild_sched_domains_locked(void)
 	cpumask_var_t *doms;
 	int ndoms;
 
+	lockdep_assert_cpus_held();
 	lockdep_assert_held(&cpuset_mutex);
-	get_online_cpus();
 
 	/*
 	 * We have raced with CPU hotplug. Don't do anything to avoid
@@ -923,15 +923,13 @@ static void rebuild_sched_domains_locked(void)
 	 * Anyways, hotplug work item will rebuild sched domains.
 	 */
 	if (!cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
-		goto out;
+		return;
 
 	/* Generate domain masks and attrs */
 	ndoms = generate_sched_domains(&doms, &attr);
 
 	/* Have scheduler rebuild the domains */
 	partition_sched_domains(ndoms, doms, attr);
-out:
-	put_online_cpus();
 }
 #else /* !CONFIG_SMP */
 static void rebuild_sched_domains_locked(void)
@@ -941,9 +939,11 @@ static void rebuild_sched_domains_locked(void)
 
 void rebuild_sched_domains(void)
 {
+	get_online_cpus();
 	mutex_lock(&cpuset_mutex);
 	rebuild_sched_domains_locked();
 	mutex_unlock(&cpuset_mutex);
+	put_online_cpus();
 }
 
 /**
@@ -1612,13 +1612,13 @@ static void cpuset_attach(struct cgroup_taskset *tset)
 	cgroup_taskset_first(tset, &css);
 	cs = css_cs(css);
 
-	mutex_lock(&cpuset_mutex);
-
 	/*
 	 * It should hold cpus lock because a cpu offline event can
 	 * cause set_cpus_allowed_ptr() failed.
 	 */
 	get_online_cpus();
+	mutex_lock(&cpuset_mutex);
+
 	/* prepare for attach */
 	if (cs == &top_cpuset)
 		cpumask_copy(cpus_attach, cpu_possible_mask);
@@ -1644,7 +1644,6 @@ static void cpuset_attach(struct cgroup_taskset *tset)
 		cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to);
 		cpuset_update_task_spread_flag(cs, task);
 	}
-	put_online_cpus();
 
 	/*
 	 * Change mm for all threadgroup leaders. This is expensive and may
@@ -1680,6 +1679,7 @@ static void cpuset_attach(struct cgroup_taskset *tset)
 		wake_up(&cpuset_attach_wq);
 
 	mutex_unlock(&cpuset_mutex);
+	put_online_cpus();
 }
 
 /* The various types of files and directories in a cpuset file system */
@@ -1711,6 +1711,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
 	cpuset_filetype_t type = cft->private;
 	int retval = 0;
 
+	get_online_cpus();
 	mutex_lock(&cpuset_mutex);
 	if (!is_cpuset_online(cs)) {
 		retval = -ENODEV;
@@ -1748,6 +1749,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
 	}
 out_unlock:
 	mutex_unlock(&cpuset_mutex);
+	put_online_cpus();
 	return retval;
 }
 
@@ -1758,6 +1760,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
 	cpuset_filetype_t type = cft->private;
 	int retval = -ENODEV;
 
+	get_online_cpus();
 	mutex_lock(&cpuset_mutex);
 	if (!is_cpuset_online(cs))
 		goto out_unlock;
@@ -1772,6 +1775,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
 	}
 out_unlock:
 	mutex_unlock(&cpuset_mutex);
+	put_online_cpus();
 	return retval;
 }
 
@@ -1810,6 +1814,7 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
 	kernfs_break_active_protection(of->kn);
 	flush_work(&cpuset_hotplug_work);
 
+	get_online_cpus();
 	mutex_lock(&cpuset_mutex);
 	if (!is_cpuset_online(cs))
 		goto out_unlock;
@@ -1840,6 +1845,7 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
 	free_trial_cpuset(trialcs);
 out_unlock:
 	mutex_unlock(&cpuset_mutex);
+	put_online_cpus();
 	kernfs_unbreak_active_protection(of->kn);
 	css_put(&cs->css);
 	flush_workqueue(cpuset_migrate_mm_wq);
@@ -2108,6 +2114,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
 	if (!parent)
 		return 0;
 
+	get_online_cpus();
 	mutex_lock(&cpuset_mutex);
 
 	set_bit(CS_ONLINE, &cs->flags);
@@ -2161,6 +2168,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
 	spin_unlock_irq(&callback_lock);
 out_unlock:
 	mutex_unlock(&cpuset_mutex);
+	put_online_cpus();
 	return 0;
 }
 
@@ -2174,6 +2182,7 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css)
 {
 	struct cpuset *cs = css_cs(css);
 
+	get_online_cpus();
 	mutex_lock(&cpuset_mutex);
 
 	if (is_sched_load_balance(cs))
@@ -2183,6 +2192,7 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css)
 	clear_bit(CS_ONLINE, &cs->flags);
 
 	mutex_unlock(&cpuset_mutex);
+	put_online_cpus();
 }
 
 static void cpuset_css_free(struct cgroup_subsys_state *css)
From: Tejun Heo <tj@kernel.org>
stable inclusion
from stable-v4.19.280
commit e446300968c6bd25d9cd6c33b9600780a39b3975
category: bugfix
bugzilla: https://gitee.com/src-openeuler/kernel/issues/I6TI3Y
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=...
--------------------------------
commit 4f7e7236435ca0abe005c674ebd6892c6e83aeb3 upstream.
Add #include <linux/cpu.h> to avoid a compile error on some architectures.
Commit 9a3284fad42f6 ("cgroup: Optimize single thread migration") and commit 671c11f0619e5 ("cgroup: Elide write-locking threadgroup_rwsem when updating csses on an empty subtree") are not backported, so the input parameter of cgroup_attach_lock()/cgroup_attach_unlock() is dropped here.
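The dropped parameter is the only deviation from upstream. As an illustration (the upstream prototypes below are recalled from the two commits named above and should be treated as an assumption):

  /* Upstream, where the two optimization commits are present: */
  void cgroup_attach_lock(bool lock_threadgroup);
  void cgroup_attach_unlock(bool lock_threadgroup);

  /* This backport always write-locks cgroup_threadgroup_rwsem,
   * so the flag is dropped: */
  static void cgroup_attach_lock(void);
  static void cgroup_attach_unlock(void);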
original commit message:
Bringing up a CPU may involve creating and destroying tasks which requires read-locking threadgroup_rwsem, so threadgroup_rwsem nests inside cpus_read_lock(). However, cpuset's ->attach(), which may be called with threadgroup_rwsem write-locked, also wants to disable CPU hotplug and acquires cpus_read_lock(), leading to a deadlock.
Fix it by guaranteeing that ->attach() is always called with CPU hotplug disabled and removing the cpus_read_lock() call from cpuset_attach().
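As a rough sketch of the two sides of the deadlock and of the ordering this patch enforces (the helper below matches the hunk added in this backport; the scenario in the comment is illustrative):

  /*
   * Deadlock being fixed:
   *
   *   CPU hotplug (cpus lock held)          cgroup migration
   *   ----------------------------          -------------------------------
   *   creates/destroys tasks, which         percpu_down_write(
   *   needs percpu_down_read(                   &cgroup_threadgroup_rwsem)
   *       &cgroup_threadgroup_rwsem)        cpuset_attach()
   *     -> blocks on the writer               get_online_cpus()
   *                                             -> blocks on hotplug
   *
   * After this patch the migration side takes the cpus lock first, so
   * ->attach() never has to acquire it itself:
   */
  static void cgroup_attach_lock(void)
  {
  	get_online_cpus();				/* outer: CPU hotplug */
  	percpu_down_write(&cgroup_threadgroup_rwsem);	/* inner: threadgroup */
  }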
Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-and-tested-by: Imran Khan <imran.f.khan@oracle.com>
Reported-and-tested-by: Xuewen Yan <xuewen.yan@unisoc.com>
Fixes: 05c7b7a92cc8 ("cgroup/cpuset: Fix a race between cpuset_attach() and cpu hotplug")
Cc: stable@vger.kernel.org # v5.17+
Signed-off-by: Cai Xinchen <caixinchen1@huawei.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Cai Xinchen <caixinchen1@huawei.com>
Reviewed-by: Wang Weiyang <wangweiyang2@huawei.com>
Signed-off-by: Yongqiang Liu <liuyongqiang13@huawei.com>
---
 kernel/cgroup/cgroup.c | 50 +++++++++++++++++++++++++++++++++++++-----
 kernel/cgroup/cpuset.c |  7 +-----
 2 files changed, 46 insertions(+), 11 deletions(-)
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index b01490b71f32..170cc5bc7da2 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -55,6 +55,7 @@
 #include <linux/nsproxy.h>
 #include <linux/file.h>
 #include <linux/sched/cputime.h>
+#include <linux/cpu.h>
 #include <net/sock.h>
 
 #define CREATE_TRACE_POINTS
@@ -2212,6 +2213,45 @@ int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
 }
 EXPORT_SYMBOL_GPL(task_cgroup_path);
 
+/**
+ * cgroup_attach_lock - Lock for ->attach()
+ * @lock_threadgroup: whether to down_write cgroup_threadgroup_rwsem
+ *
+ * cgroup migration sometimes needs to stabilize threadgroups against forks and
+ * exits by write-locking cgroup_threadgroup_rwsem. However, some ->attach()
+ * implementations (e.g. cpuset), also need to disable CPU hotplug.
+ * Unfortunately, letting ->attach() operations acquire cpus_read_lock() can
+ * lead to deadlocks.
+ *
+ * Bringing up a CPU may involve creating and destroying tasks which requires
+ * read-locking threadgroup_rwsem, so threadgroup_rwsem nests inside
+ * cpus_read_lock(). If we call an ->attach() which acquires the cpus lock while
+ * write-locking threadgroup_rwsem, the locking order is reversed and we end up
+ * waiting for an on-going CPU hotplug operation which in turn is waiting for
+ * the threadgroup_rwsem to be released to create new tasks. For more details:
+ *
+ * http://lkml.kernel.org/r/20220711174629.uehfmqegcwn2lqzu@wubuntu
+ *
+ * Resolve the situation by always acquiring cpus_read_lock() before optionally
+ * write-locking cgroup_threadgroup_rwsem. This allows ->attach() to assume that
+ * CPU hotplug is disabled on entry.
+ */
+static void cgroup_attach_lock(void)
+{
+	get_online_cpus();
+	percpu_down_write(&cgroup_threadgroup_rwsem);
+}
+
+/**
+ * cgroup_attach_unlock - Undo cgroup_attach_lock()
+ * @lock_threadgroup: whether to up_write cgroup_threadgroup_rwsem
+ */
+static void cgroup_attach_unlock(void)
+{
+	percpu_up_write(&cgroup_threadgroup_rwsem);
+	put_online_cpus();
+}
+
 /**
  * cgroup_migrate_add_task - add a migration target task to a migration context
  * @task: target task
@@ -2691,7 +2731,7 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup)
 	if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)
 		return ERR_PTR(-EINVAL);
 
-	percpu_down_write(&cgroup_threadgroup_rwsem);
+	cgroup_attach_lock();
 
 	rcu_read_lock();
 	if (pid) {
@@ -2722,7 +2762,7 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup)
 		goto out_unlock_rcu;
 
 out_unlock_threadgroup:
-	percpu_up_write(&cgroup_threadgroup_rwsem);
+	cgroup_attach_unlock();
 out_unlock_rcu:
 	rcu_read_unlock();
 	return tsk;
@@ -2737,7 +2777,7 @@ void cgroup_procs_write_finish(struct task_struct *task)
 	/* release reference from cgroup_procs_write_start() */
 	put_task_struct(task);
 
-	percpu_up_write(&cgroup_threadgroup_rwsem);
+	cgroup_attach_unlock();
 	for_each_subsys(ss, ssid)
 		if (ss->post_attach)
 			ss->post_attach();
@@ -2818,7 +2858,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
 
 	lockdep_assert_held(&cgroup_mutex);
 
-	percpu_down_write(&cgroup_threadgroup_rwsem);
+	cgroup_attach_lock();
 
 	/* look up all csses currently attached to @cgrp's subtree */
 	spin_lock_irq(&css_set_lock);
@@ -2848,7 +2888,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
 	ret = cgroup_migrate_execute(&mgctx);
 out_finish:
 	cgroup_migrate_finish(&mgctx);
-	percpu_up_write(&cgroup_threadgroup_rwsem);
+	cgroup_attach_unlock();
 	return ret;
 }
 
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index def36c3fc524..1d13d64108a0 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -1612,11 +1612,7 @@ static void cpuset_attach(struct cgroup_taskset *tset)
 	cgroup_taskset_first(tset, &css);
 	cs = css_cs(css);
 
-	/*
-	 * It should hold cpus lock because a cpu offline event can
-	 * cause set_cpus_allowed_ptr() failed.
-	 */
-	get_online_cpus();
+	lockdep_assert_cpus_held();	/* see cgroup_attach_lock() */
 	mutex_lock(&cpuset_mutex);
 
 	/* prepare for attach */
@@ -1679,7 +1675,6 @@ static void cpuset_attach(struct cgroup_taskset *tset)
 		wake_up(&cpuset_attach_wq);
 
 	mutex_unlock(&cpuset_mutex);
-	put_online_cpus();
 }
 
 /* The various types of files and directories in a cpuset file system */
From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
stable inclusion
from stable-v4.19.280
commit 321488cfac7d0eb6d97de467015ff754f85813ff
category: bugfix
bugzilla: https://gitee.com/src-openeuler/kernel/issues/I6TI3Y
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=...
--------------------------------
commit 43626dade36fa74d3329046f4ae2d7fdefe401c6 upstream.
syzbot is hitting the percpu_rwsem_assert_held(&cpu_hotplug_lock) warning at cpuset_attach() [1], because commit 4f7e7236435ca0ab ("cgroup: Fix threadgroup_rwsem <-> cpus_read_lock() deadlock") missed that cpuset_attach() is also called from cgroup_attach_task_all(). Add cpus_read_lock() like cgroup_procs_write_start() does.
Link: https://syzkaller.appspot.com/bug?extid=29d3a3b4d86c8136ad9e [1]
Reported-by: syzbot <syzbot+29d3a3b4d86c8136ad9e@syzkaller.appspotmail.com>
Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Fixes: 4f7e7236435ca0ab ("cgroup: Fix threadgroup_rwsem <-> cpus_read_lock() deadlock")
Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Cai Xinchen <caixinchen1@huawei.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
conflicts:
	kernel/cgroup/cgroup-internal.h
	kernel/cgroup/cgroup-v1.c
	kernel/cgroup/cgroup.c
Signed-off-by: Cai Xinchen <caixinchen1@huawei.com>
Reviewed-by: Wang Weiyang <wangweiyang2@huawei.com>
Signed-off-by: Yongqiang Liu <liuyongqiang13@huawei.com>
---
 kernel/cgroup/cgroup-internal.h | 2 ++
 kernel/cgroup/cgroup-v1.c       | 4 ++--
 kernel/cgroup/cgroup.c          | 4 ++--
 3 files changed, 6 insertions(+), 4 deletions(-)
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index 2e65e4c4d6e7..edb45e2f7f54 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -226,6 +226,8 @@ int cgroup_migrate(struct task_struct *leader, bool threadgroup,
 
 int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
 		       bool threadgroup);
+void cgroup_attach_lock(void);
+void cgroup_attach_unlock(void);
 struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup)
 	__acquires(&cgroup_threadgroup_rwsem);
 void cgroup_procs_write_finish(struct task_struct *task)
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index c4cc6c1ddacd..8bd36f2143eb 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -55,7 +55,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
 	int retval = 0;
 
 	mutex_lock(&cgroup_mutex);
-	percpu_down_write(&cgroup_threadgroup_rwsem);
+	cgroup_attach_lock();
 	for_each_root(root) {
 		struct cgroup *from_cgrp;
 
@@ -70,7 +70,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
 		if (retval)
 			break;
 	}
-	percpu_up_write(&cgroup_threadgroup_rwsem);
+	cgroup_attach_unlock();
 	mutex_unlock(&cgroup_mutex);
 
 	return retval;
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 170cc5bc7da2..4d01867f7b85 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -2236,7 +2236,7 @@ EXPORT_SYMBOL_GPL(task_cgroup_path);
  * write-locking cgroup_threadgroup_rwsem. This allows ->attach() to assume that
  * CPU hotplug is disabled on entry.
  */
-static void cgroup_attach_lock(void)
+void cgroup_attach_lock(void)
 {
 	get_online_cpus();
 	percpu_down_write(&cgroup_threadgroup_rwsem);
@@ -2246,7 +2246,7 @@ static void cgroup_attach_lock(void)
  * cgroup_attach_unlock - Undo cgroup_attach_lock()
  * @lock_threadgroup: whether to up_write cgroup_threadgroup_rwsem
  */
-static void cgroup_attach_unlock(void)
+void cgroup_attach_unlock(void)
 {
 	percpu_up_write(&cgroup_threadgroup_rwsem);
 	put_online_cpus();
From: Zheng Yejian <zhengyejian1@huawei.com>
mainline inclusion
from mainline-v6.3-rc6
commit 6455b6163d8c680366663cdb8c679514d55fc30c
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I6TJ97
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
When a user reads the 'trace_pipe' file, the kernel keeps printing the following logs, warning at "cpu_buffer->reader_page->read > rb_page_size(reader)" in rb_get_reader_page(). It just looks like there is an infinite loop in tracing_read_pipe(). This problem has occurred several times on the arm64 platform when testing v5.10 and below.
 Call trace:
  rb_get_reader_page+0x248/0x1300
  rb_buffer_peek+0x34/0x160
  ring_buffer_peek+0xbc/0x224
  peek_next_entry+0x98/0xbc
  __find_next_entry+0xc4/0x1c0
  trace_find_next_entry_inc+0x30/0x94
  tracing_read_pipe+0x198/0x304
  vfs_read+0xb4/0x1e0
  ksys_read+0x74/0x100
  __arm64_sys_read+0x24/0x30
  el0_svc_common.constprop.0+0x7c/0x1bc
  do_el0_svc+0x2c/0x94
  el0_svc+0x20/0x30
  el0_sync_handler+0xb0/0xb4
  el0_sync+0x160/0x180
Then I dumped the vmcore and looked into the problematic per_cpu ring_buffer, and found that tail_page/commit_page/reader_page are all on the same page while reader_page->read is obviously abnormal:
  tail_page == commit_page == reader_page == {
       .write = 0x100d20,
       .read = 0x8f9f4805,  // Far greater than 0xd20, obviously abnormal!!!
       .entries = 0x10004c,
       .real_end = 0x0,
       .page = {
           .time_stamp = 0x857257416af0,
           .commit = 0xd20,  // This page hasn't been fully filled.
           // .data[0...0xd20] seems normal.
       }
  }
The root cause is most likely a race in which the reader and the writer are on the same page while the reader sees an event that has not been fully committed by the writer.
To fix this, add memory barriers to make sure the reader can see the content of what is committed. Since commit a0fcaaed0c46 ("ring-buffer: Fix race between reset page and reading page") has added the read barrier in rb_get_reader_page(), here we just need to add the write barrier.
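A minimal sketch of the pairing this relies on, with illustrative names (the real code is in rb_set_commit_to_write() on the writer side and rb_get_reader_page() on the reader side):

  static unsigned long page_data;	/* stands in for the event contents */
  static unsigned long page_commit;	/* stands in for bpage->commit */

  static void writer_side(unsigned long val, unsigned long new_commit)
  {
  	page_data = val;			/* write the event */
  	/* Make sure the readers see the content of what is committed. */
  	smp_wmb();
  	WRITE_ONCE(page_commit, new_commit);
  }

  static unsigned long reader_side(void)
  {
  	unsigned long commit = READ_ONCE(page_commit);

  	smp_rmb();	/* pairs with the writer's smp_wmb() */
  	return commit ? page_data : 0;	/* data behind 'commit' is now visible */
  }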
Link: https://lore.kernel.org/linux-trace-kernel/20230325021247.2923907-1-zhengyej...
Cc: stable@vger.kernel.org
Fixes: 77ae365eca89 ("ring-buffer: make lockless")
Suggested-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Signed-off-by: Zheng Yejian <zhengyejian1@huawei.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Signed-off-by: Zheng Yejian <zhengyejian1@huawei.com>
Reviewed-by: Yang Jihong <yangjihong1@huawei.com>
Signed-off-by: Yongqiang Liu <liuyongqiang13@huawei.com>
---
 kernel/trace/ring_buffer.c | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index a9b32f620fb0..4610265ad6ca 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2462,6 +2462,10 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
 		if (RB_WARN_ON(cpu_buffer,
 			       rb_is_reader_page(cpu_buffer->tail_page)))
 			return;
+		/*
+		 * No need for a memory barrier here, as the update
+		 * of the tail_page did it for this page.
+		 */
 		local_set(&cpu_buffer->commit_page->page->commit,
 			  rb_page_write(cpu_buffer->commit_page));
 		rb_inc_page(cpu_buffer, &cpu_buffer->commit_page);
@@ -2475,6 +2479,8 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
 	while (rb_commit_index(cpu_buffer) !=
 	       rb_page_write(cpu_buffer->commit_page)) {
 
+		/* Make sure the readers see the content of what is committed. */
+		smp_wmb();
 		local_set(&cpu_buffer->commit_page->page->commit,
 			  rb_page_write(cpu_buffer->commit_page));
 		RB_WARN_ON(cpu_buffer,
@@ -3840,7 +3846,12 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
 
 	/*
 	 * Make sure we see any padding after the write update
-	 * (see rb_reset_tail())
+	 * (see rb_reset_tail()).
+	 *
+	 * In addition, a writer may be writing on the reader page
+	 * if the page has not been fully filled, so the read barrier
+	 * is also needed to make sure we see the content of what is
+	 * committed by the writer (see rb_set_commit_to_write()).
 	 */
 	smp_rmb();