From: Peter Zijlstra <peterz@infradead.org>
mainline inclusion
from mainline-v5.12-rc1
commit 2d24dd5798d0474d9bf705bfca8725e7d20f9d54
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5OOWG
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------------------------------------------------
I've always been bothered by the endless (fragile) boilerplate for rbtree, and I recently wrote some rbtree helpers for objtool and figured I should lift them into the kernel and use them more widely.
Provide:
partial order; less() based:
 - rb_add(): add a new entry to the rbtree
 - rb_add_cached(): like rb_add(), but for a rb_root_cached

total order; cmp() based:
 - rb_find(): find an entry in an rbtree
 - rb_find_add(): find an entry, and add if not found

 - rb_find_first(): find the first (leftmost) matching entry
 - rb_next_match(): continue from rb_find_first()
 - rb_for_each(): iterate a sub-tree using the previous two
Inlining and constant propagation should see the compiler inline the whole thing, including the various compare functions.
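For illustration only (this is not part of the patch; "struct item" and the item_*() helpers below are made-up names), a caller keyed on an unsigned long could use the helpers like this:

  /* Hypothetical user of the new helpers; not part of this patch. */
  struct item {
          struct rb_node node;
          unsigned long key;
  };

  /* less() operator for rb_add(): orders items by ->key */
  static bool item_less(struct rb_node *a, const struct rb_node *b)
  {
          return rb_entry(a, struct item, node)->key <
                 rb_entry(b, struct item, node)->key;
  }

  /* cmp() operator for rb_find(): compares a bare key against a node */
  static int item_cmp(const void *key, const struct rb_node *node)
  {
          unsigned long k = *(const unsigned long *)key;
          const struct item *it = rb_entry(node, struct item, node);

          if (k < it->key)
                  return -1;
          if (k > it->key)
                  return 1;
          return 0;
  }

  static void item_insert(struct rb_root *tree, struct item *it)
  {
          rb_add(&it->node, tree, item_less);
  }

  static struct item *item_lookup(struct rb_root *tree, unsigned long key)
  {
          struct rb_node *node = rb_find(&key, tree, item_cmp);

          return node ? rb_entry(node, struct item, node) : NULL;
  }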
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Michel Lespinasse <walken@google.com>
Acked-by: Davidlohr Bueso <dbueso@suse.de>
Conflicts:
	tools/objtool/elf.c
	[Feature 3690914ed35e("objtool: Extract elf_symbol_add()")]
Signed-off-by: Lin Shengwang <linshengwang1@huawei.com>
Reviewed-by: lihua <hucool.lihua@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 include/linux/rbtree.h       | 190 ++++++++++++++++++++++++++++++++++
 tools/include/linux/rbtree.h | 192 ++++++++++++++++++++++++++++++++++-
 tools/objtool/elf.c          |  73 ++-----------
 3 files changed, 392 insertions(+), 63 deletions(-)
diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h index d7db17996322..e0b300de8f3f 100644 --- a/include/linux/rbtree.h +++ b/include/linux/rbtree.h @@ -158,4 +158,194 @@ static inline void rb_replace_node_cached(struct rb_node *victim, rb_replace_node(victim, new, &root->rb_root); }
+/* + * The below helper functions use 2 operators with 3 different + * calling conventions. The operators are related like: + * + * comp(a->key,b) < 0 := less(a,b) + * comp(a->key,b) > 0 := less(b,a) + * comp(a->key,b) == 0 := !less(a,b) && !less(b,a) + * + * If these operators define a partial order on the elements we make no + * guarantee on which of the elements matching the key is found. See + * rb_find(). + * + * The reason for this is to allow the find() interface without requiring an + * on-stack dummy object, which might not be feasible due to object size. + */ + +/** + * rb_add_cached() - insert @node into the leftmost cached tree @tree + * @node: node to insert + * @tree: leftmost cached tree to insert @node into + * @less: operator defining the (partial) node order + */ +static __always_inline void +rb_add_cached(struct rb_node *node, struct rb_root_cached *tree, + bool (*less)(struct rb_node *, const struct rb_node *)) +{ + struct rb_node **link = &tree->rb_root.rb_node; + struct rb_node *parent = NULL; + bool leftmost = true; + + while (*link) { + parent = *link; + if (less(node, parent)) { + link = &parent->rb_left; + } else { + link = &parent->rb_right; + leftmost = false; + } + } + + rb_link_node(node, parent, link); + rb_insert_color_cached(node, tree, leftmost); +} + +/** + * rb_add() - insert @node into @tree + * @node: node to insert + * @tree: tree to insert @node into + * @less: operator defining the (partial) node order + */ +static __always_inline void +rb_add(struct rb_node *node, struct rb_root *tree, + bool (*less)(struct rb_node *, const struct rb_node *)) +{ + struct rb_node **link = &tree->rb_node; + struct rb_node *parent = NULL; + + while (*link) { + parent = *link; + if (less(node, parent)) + link = &parent->rb_left; + else + link = &parent->rb_right; + } + + rb_link_node(node, parent, link); + rb_insert_color(node, tree); +} + +/** + * rb_find_add() - find equivalent @node in @tree, or add @node + * @node: node to look-for / insert + * @tree: tree to search / modify + * @cmp: operator defining the node order + * + * Returns the rb_node matching @node, or NULL when no match is found and @node + * is inserted. + */ +static __always_inline struct rb_node * +rb_find_add(struct rb_node *node, struct rb_root *tree, + int (*cmp)(struct rb_node *, const struct rb_node *)) +{ + struct rb_node **link = &tree->rb_node; + struct rb_node *parent = NULL; + int c; + + while (*link) { + parent = *link; + c = cmp(node, parent); + + if (c < 0) + link = &parent->rb_left; + else if (c > 0) + link = &parent->rb_right; + else + return parent; + } + + rb_link_node(node, parent, link); + rb_insert_color(node, tree); + return NULL; +} + +/** + * rb_find() - find @key in tree @tree + * @key: key to match + * @tree: tree to search + * @cmp: operator defining the node order + * + * Returns the rb_node matching @key or NULL. + */ +static __always_inline struct rb_node * +rb_find(const void *key, const struct rb_root *tree, + int (*cmp)(const void *key, const struct rb_node *)) +{ + struct rb_node *node = tree->rb_node; + + while (node) { + int c = cmp(key, node); + + if (c < 0) + node = node->rb_left; + else if (c > 0) + node = node->rb_right; + else + return node; + } + + return NULL; +} + +/** + * rb_find_first() - find the first @key in @tree + * @key: key to match + * @tree: tree to search + * @cmp: operator defining node order + * + * Returns the leftmost node matching @key, or NULL. 
+ */ +static __always_inline struct rb_node * +rb_find_first(const void *key, const struct rb_root *tree, + int (*cmp)(const void *key, const struct rb_node *)) +{ + struct rb_node *node = tree->rb_node; + struct rb_node *match = NULL; + + while (node) { + int c = cmp(key, node); + + if (c <= 0) { + if (!c) + match = node; + node = node->rb_left; + } else if (c > 0) { + node = node->rb_right; + } + } + + return match; +} + +/** + * rb_next_match() - find the next @key in @tree + * @key: key to match + * @tree: tree to search + * @cmp: operator defining node order + * + * Returns the next node matching @key, or NULL. + */ +static __always_inline struct rb_node * +rb_next_match(const void *key, struct rb_node *node, + int (*cmp)(const void *key, const struct rb_node *)) +{ + node = rb_next(node); + if (node && cmp(key, node)) + node = NULL; + return node; +} + +/** + * rb_for_each() - iterates a subtree matching @key + * @node: iterator + * @key: key to match + * @tree: tree to search + * @cmp: operator defining node order + */ +#define rb_for_each(node, key, tree, cmp) \ + for ((node) = rb_find_first((key), (tree), (cmp)); \ + (node); (node) = rb_next_match((key), (node), (cmp))) + #endif /* _LINUX_RBTREE_H */ diff --git a/tools/include/linux/rbtree.h b/tools/include/linux/rbtree.h index 30dd21f976c3..2680f2edb837 100644 --- a/tools/include/linux/rbtree.h +++ b/tools/include/linux/rbtree.h @@ -152,4 +152,194 @@ static inline void rb_replace_node_cached(struct rb_node *victim, rb_replace_node(victim, new, &root->rb_root); }
-#endif /* __TOOLS_LINUX_PERF_RBTREE_H */ +/* + * The below helper functions use 2 operators with 3 different + * calling conventions. The operators are related like: + * + * comp(a->key,b) < 0 := less(a,b) + * comp(a->key,b) > 0 := less(b,a) + * comp(a->key,b) == 0 := !less(a,b) && !less(b,a) + * + * If these operators define a partial order on the elements we make no + * guarantee on which of the elements matching the key is found. See + * rb_find(). + * + * The reason for this is to allow the find() interface without requiring an + * on-stack dummy object, which might not be feasible due to object size. + */ + +/** + * rb_add_cached() - insert @node into the leftmost cached tree @tree + * @node: node to insert + * @tree: leftmost cached tree to insert @node into + * @less: operator defining the (partial) node order + */ +static __always_inline void +rb_add_cached(struct rb_node *node, struct rb_root_cached *tree, + bool (*less)(struct rb_node *, const struct rb_node *)) +{ + struct rb_node **link = &tree->rb_root.rb_node; + struct rb_node *parent = NULL; + bool leftmost = true; + + while (*link) { + parent = *link; + if (less(node, parent)) { + link = &parent->rb_left; + } else { + link = &parent->rb_right; + leftmost = false; + } + } + + rb_link_node(node, parent, link); + rb_insert_color_cached(node, tree, leftmost); +} + +/** + * rb_add() - insert @node into @tree + * @node: node to insert + * @tree: tree to insert @node into + * @less: operator defining the (partial) node order + */ +static __always_inline void +rb_add(struct rb_node *node, struct rb_root *tree, + bool (*less)(struct rb_node *, const struct rb_node *)) +{ + struct rb_node **link = &tree->rb_node; + struct rb_node *parent = NULL; + + while (*link) { + parent = *link; + if (less(node, parent)) + link = &parent->rb_left; + else + link = &parent->rb_right; + } + + rb_link_node(node, parent, link); + rb_insert_color(node, tree); +} + +/** + * rb_find_add() - find equivalent @node in @tree, or add @node + * @node: node to look-for / insert + * @tree: tree to search / modify + * @cmp: operator defining the node order + * + * Returns the rb_node matching @node, or NULL when no match is found and @node + * is inserted. + */ +static __always_inline struct rb_node * +rb_find_add(struct rb_node *node, struct rb_root *tree, + int (*cmp)(struct rb_node *, const struct rb_node *)) +{ + struct rb_node **link = &tree->rb_node; + struct rb_node *parent = NULL; + int c; + + while (*link) { + parent = *link; + c = cmp(node, parent); + + if (c < 0) + link = &parent->rb_left; + else if (c > 0) + link = &parent->rb_right; + else + return parent; + } + + rb_link_node(node, parent, link); + rb_insert_color(node, tree); + return NULL; +} + +/** + * rb_find() - find @key in tree @tree + * @key: key to match + * @tree: tree to search + * @cmp: operator defining the node order + * + * Returns the rb_node matching @key or NULL. + */ +static __always_inline struct rb_node * +rb_find(const void *key, const struct rb_root *tree, + int (*cmp)(const void *key, const struct rb_node *)) +{ + struct rb_node *node = tree->rb_node; + + while (node) { + int c = cmp(key, node); + + if (c < 0) + node = node->rb_left; + else if (c > 0) + node = node->rb_right; + else + return node; + } + + return NULL; +} + +/** + * rb_find_first() - find the first @key in @tree + * @key: key to match + * @tree: tree to search + * @cmp: operator defining node order + * + * Returns the leftmost node matching @key, or NULL. 
+ */ +static __always_inline struct rb_node * +rb_find_first(const void *key, const struct rb_root *tree, + int (*cmp)(const void *key, const struct rb_node *)) +{ + struct rb_node *node = tree->rb_node; + struct rb_node *match = NULL; + + while (node) { + int c = cmp(key, node); + + if (c <= 0) { + if (!c) + match = node; + node = node->rb_left; + } else if (c > 0) { + node = node->rb_right; + } + } + + return match; +} + +/** + * rb_next_match() - find the next @key in @tree + * @key: key to match + * @tree: tree to search + * @cmp: operator defining node order + * + * Returns the next node matching @key, or NULL. + */ +static __always_inline struct rb_node * +rb_next_match(const void *key, struct rb_node *node, + int (*cmp)(const void *key, const struct rb_node *)) +{ + node = rb_next(node); + if (node && cmp(key, node)) + node = NULL; + return node; +} + +/** + * rb_for_each() - iterates a subtree matching @key + * @node: iterator + * @key: key to match + * @tree: tree to search + * @cmp: operator defining node order + */ +#define rb_for_each(node, key, tree, cmp) \ + for ((node) = rb_find_first((key), (tree), (cmp)); \ + (node); (node) = rb_next_match((key), (node), (cmp))) + +#endif /* __TOOLS_LINUX_PERF_RBTREE_H */ diff --git a/tools/objtool/elf.c b/tools/objtool/elf.c index de6e42040d75..a0468ead1ea1 100644 --- a/tools/objtool/elf.c +++ b/tools/objtool/elf.c @@ -43,75 +43,24 @@ static void elf_hash_init(struct hlist_head *table) #define elf_hash_for_each_possible(name, obj, member, key) \ hlist_for_each_entry(obj, &name[hash_min(key, elf_hash_bits())], member)
-static void rb_add(struct rb_root *tree, struct rb_node *node, - int (*cmp)(struct rb_node *, const struct rb_node *)) -{ - struct rb_node **link = &tree->rb_node; - struct rb_node *parent = NULL; - - while (*link) { - parent = *link; - if (cmp(node, parent) < 0) - link = &parent->rb_left; - else - link = &parent->rb_right; - } - - rb_link_node(node, parent, link); - rb_insert_color(node, tree); -} - -static struct rb_node *rb_find_first(const struct rb_root *tree, const void *key, - int (*cmp)(const void *key, const struct rb_node *)) -{ - struct rb_node *node = tree->rb_node; - struct rb_node *match = NULL; - - while (node) { - int c = cmp(key, node); - if (c <= 0) { - if (!c) - match = node; - node = node->rb_left; - } else if (c > 0) { - node = node->rb_right; - } - } - - return match; -} - -static struct rb_node *rb_next_match(struct rb_node *node, const void *key, - int (*cmp)(const void *key, const struct rb_node *)) -{ - node = rb_next(node); - if (node && cmp(key, node)) - node = NULL; - return node; -} - -#define rb_for_each(tree, node, key, cmp) \ - for ((node) = rb_find_first((tree), (key), (cmp)); \ - (node); (node) = rb_next_match((node), (key), (cmp))) - -static int symbol_to_offset(struct rb_node *a, const struct rb_node *b) +static bool symbol_to_offset(struct rb_node *a, const struct rb_node *b) { struct symbol *sa = rb_entry(a, struct symbol, node); struct symbol *sb = rb_entry(b, struct symbol, node);
if (sa->offset < sb->offset) - return -1; + return true; if (sa->offset > sb->offset) - return 1; + return false;
if (sa->len < sb->len) - return -1; + return true; if (sa->len > sb->len) - return 1; + return false;
sa->alias = sb;
- return 0; + return false; }
static int symbol_by_offset(const void *key, const struct rb_node *node) @@ -165,7 +114,7 @@ struct symbol *find_symbol_by_offset(struct section *sec, unsigned long offset) { struct rb_node *node;
- rb_for_each(&sec->symbol_tree, node, &offset, symbol_by_offset) { + rb_for_each(node, &offset, &sec->symbol_tree, symbol_by_offset) { struct symbol *s = rb_entry(node, struct symbol, node);
if (s->offset == offset && s->type != STT_SECTION) @@ -179,7 +128,7 @@ struct symbol *find_func_by_offset(struct section *sec, unsigned long offset) { struct rb_node *node;
- rb_for_each(&sec->symbol_tree, node, &offset, symbol_by_offset) { + rb_for_each(node, &offset, &sec->symbol_tree, symbol_by_offset) { struct symbol *s = rb_entry(node, struct symbol, node);
if (s->offset == offset && s->type == STT_FUNC) @@ -193,7 +142,7 @@ struct symbol *find_symbol_containing(const struct section *sec, unsigned long o { struct rb_node *node;
- rb_for_each(&sec->symbol_tree, node, &offset, symbol_by_offset) { + rb_for_each(node, &offset, &sec->symbol_tree, symbol_by_offset) { struct symbol *s = rb_entry(node, struct symbol, node);
if (s->type != STT_SECTION) @@ -207,7 +156,7 @@ struct symbol *find_func_containing(struct section *sec, unsigned long offset) { struct rb_node *node;
- rb_for_each(&sec->symbol_tree, node, &offset, symbol_by_offset) { + rb_for_each(node, &offset, &sec->symbol_tree, symbol_by_offset) { struct symbol *s = rb_entry(node, struct symbol, node);
if (s->type == STT_FUNC) @@ -352,7 +301,7 @@ static void elf_add_symbol(struct elf *elf, struct symbol *sym) sym->offset = sym->sym.st_value; sym->len = sym->sym.st_size;
- rb_add(&sym->sec->symbol_tree, &sym->node, symbol_to_offset); + rb_add(&sym->node, &sym->sec->symbol_tree, symbol_to_offset); pnode = rb_prev(&sym->node); if (pnode) entry = &rb_entry(pnode, struct symbol, node)->list;
From: Peter Zijlstra <peterz@infradead.org>
mainline inclusion
from mainline-v5.14-rc1
commit 9099a14708ce1dfecb6002605594a0daa319b555
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5OOWG
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------------------------------------------------
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Don Hiatt <dhiatt@digitalocean.com>
Tested-by: Hongyu Ning <hongyu.ning@linux.intel.com>
Tested-by: Vincent Guittot <vincent.guittot@linaro.org>
Link: https://lkml.kernel.org/r/20210422123308.015639083@infradead.org
Signed-off-by: Lin Shengwang <linshengwang1@huawei.com>
Reviewed-by: lihua <hucool.lihua@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 kernel/sched/fair.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 42a9c9557800..49aeab435237 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6412,6 +6412,11 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) task_util = uclamp_task_util(p); }
+ /* + * per-cpu select_idle_mask usage + */ + lockdep_assert_irqs_disabled(); + if ((available_idle_cpu(target) || sched_idle_cpu(target)) && asym_fits_capacity(task_util, target)) { SET_STAT(found_idle_cpu_easy); @@ -6883,8 +6888,6 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) * certain conditions an idle sibling CPU if the domain has SD_WAKE_AFFINE set. * * Returns the target CPU number. - * - * preempt must be disabled. */ static int select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags) @@ -6898,6 +6901,10 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
time = schedstat_start_time();
+ /* + * required for stable ->cpus_allowed + */ + lockdep_assert_held(&p->pi_lock); if (sd_flag & SD_BALANCE_WAKE) { record_wakee(p);
From: Peter Zijlstra <peterz@infradead.org>
mainline inclusion
from mainline-v5.14-rc1
commit 39d371b7c0c299d489041884d005aacc4bba8c15
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5OOWG
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------------------------------------------------
In preparation for playing games with rq->lock, add some rq_lock wrappers.
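As a rough usage sketch (illustrative only; the function below is a made-up example, not part of the patch), call sites can then take the rq lock without naming the field directly:

  /* Illustrative caller; not part of the patch. */
  static void example_poke_rq(struct rq *rq)
  {
          unsigned long flags;

          raw_spin_rq_lock_irqsave(rq, flags);    /* rather than &rq->lock directly */
          lockdep_assert_rq_held(rq);             /* rather than lockdep_assert_held(&rq->lock) */
          update_rq_clock(rq);
          raw_spin_rq_unlock_irqrestore(rq, flags);
  }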
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Don Hiatt <dhiatt@digitalocean.com>
Tested-by: Hongyu Ning <hongyu.ning@linux.intel.com>
Tested-by: Vincent Guittot <vincent.guittot@linaro.org>
Link: https://lkml.kernel.org/r/20210422123308.075967879@infradead.org
Signed-off-by: Lin Shengwang <linshengwang1@huawei.com>
Reviewed-by: lihua <hucool.lihua@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 kernel/sched/core.c  | 15 +++++++++++++
 kernel/sched/sched.h | 50 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 65 insertions(+)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e00b39d4e2e2..32da42297ded 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -175,6 +175,21 @@ int sysctl_sched_rt_runtime = 950000; * */
+void raw_spin_rq_lock_nested(struct rq *rq, int subclass) +{ + raw_spin_lock_nested(rq_lockp(rq), subclass); +} + +bool raw_spin_rq_trylock(struct rq *rq) +{ + return raw_spin_trylock(rq_lockp(rq)); +} + +void raw_spin_rq_unlock(struct rq *rq) +{ + raw_spin_unlock(rq_lockp(rq)); +} + /* * __task_rq_lock - lock the rq @p resides on. */ diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index f8127a43b6a0..b71627aacc30 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1151,6 +1151,56 @@ enum task_qos_level { void init_qos_hrtimer(int cpu); #endif
+static inline raw_spinlock_t *rq_lockp(struct rq *rq) +{ + return &rq->lock; +} + +static inline void lockdep_assert_rq_held(struct rq *rq) +{ + lockdep_assert_held(rq_lockp(rq)); +} + +extern void raw_spin_rq_lock_nested(struct rq *rq, int subclass); +extern bool raw_spin_rq_trylock(struct rq *rq); +extern void raw_spin_rq_unlock(struct rq *rq); + +static inline void raw_spin_rq_lock(struct rq *rq) +{ + raw_spin_rq_lock_nested(rq, 0); +} + +static inline void raw_spin_rq_lock_irq(struct rq *rq) +{ + local_irq_disable(); + raw_spin_rq_lock(rq); +} + +static inline void raw_spin_rq_unlock_irq(struct rq *rq) +{ + raw_spin_rq_unlock(rq); + local_irq_enable(); +} + +static inline unsigned long _raw_spin_rq_lock_irqsave(struct rq *rq) +{ + unsigned long flags; + local_irq_save(flags); + raw_spin_rq_lock(rq); + return flags; +} + +static inline void raw_spin_rq_unlock_irqrestore(struct rq *rq, unsigned long flags) +{ + raw_spin_rq_unlock(rq); + local_irq_restore(flags); +} + +#define raw_spin_rq_lock_irqsave(rq, flags) \ +do { \ + flags = _raw_spin_rq_lock_irqsave(rq); \ +} while (0) + #ifdef CONFIG_SCHED_SMT extern void __update_idle_core(struct rq *rq);
From: Peter Zijlstra <peterz@infradead.org>
mainline inclusion
from mainline-v5.14-rc1
commit 5cb9eaa3d274f75539077a28cf01e3563195fa53
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5OOWG
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------------------------------------------------
In preparation for playing games with rq->lock, abstract the thing using an accessor.
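One consequence worth noting (the helper below is an illustrative, made-up sketch, not part of the patch): once every lock operation goes through rq_lockp(), "same lock" checks compare lock pointers rather than rq pointers, which keeps working if two rqs later share a single core-wide lock.

  /* Illustrative only; mirrors the rq_lockp() comparisons in the diff below. */
  static inline bool example_rqs_share_lock(struct rq *rq1, struct rq *rq2)
  {
          return rq_lockp(rq1) == rq_lockp(rq2);  /* today this is simply rq1 == rq2 */
  }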
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Don Hiatt <dhiatt@digitalocean.com>
Tested-by: Hongyu Ning <hongyu.ning@linux.intel.com>
Tested-by: Vincent Guittot <vincent.guittot@linaro.org>
Link: https://lkml.kernel.org/r/20210422123308.136465446@infradead.org
Conflicts:
	kernel/sched/core.c
	[Bugfix a7c81556ec4d3("sched: Fix migrate_disable() vs rt/dl balancing") is not applied.
	 Bugfix 565790d28b1e3("sched: Fix balance_callback()") is not applied.
	 Bugfix ae7927023243d("sched: Optimize finish_lock_switch()") is not applied.
	 Bugfix 36c6e17bf1692("sched/core: Print out straggler tasks in sched_cpu_dying()") is not applied.
	 Feature 2558aacff8586("sched/hotplug: Ensure only per-cpu kthreads run during hotplug") is not applied.
	 Feature f2469a1fb43f8("sched/core: Wait for tasks being pushed away on hotplug") is not applied.]
	kernel/sched/deadline.c
	[Bugfix a7c81556ec4d3("sched: Fix migrate_disable() vs rt/dl balancing") is not applied.]
	kernel/sched/fair.c
	[Feature acf66d7048e08("sched/fair: Provide can_migrate_task_llc")
	 Feature 0826530de3cbd("sched/fair: Remove update of blocked load from newidle_balance") is not applied.
	 Feature 6864cf0161bad("sched/fair: Steal work from an overloaded CPU when CPU goes idle")]
	kernel/sched/rt.c
	[Bugfix a7c81556ec4d3("sched: Fix migrate_disable() vs rt/dl balancing") is not applied.]
	kernel/sched/sched.h
	[Bugfix a7c81556ec4d3("sched: Fix migrate_disable() vs rt/dl balancing") is not applied.]
Signed-off-by: Lin Shengwang <linshengwang1@huawei.com>
Reviewed-by: lihua <hucool.lihua@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 kernel/sched/core.c     |  46 +++++++++---------
 kernel/sched/cpuacct.c  |  12 ++---
 kernel/sched/deadline.c |  18 +++----
 kernel/sched/debug.c    |   4 +-
 kernel/sched/fair.c     |  49 ++++++++++---------
 kernel/sched/idle.c     |   4 +-
 kernel/sched/pelt.h     |   2 +-
 kernel/sched/rt.c       |   8 ++--
 kernel/sched/sched.h    | 103 ++++++++++++++++++++--------------------
 kernel/sched/topology.c |   4 +-
 10 files changed, 124 insertions(+), 126 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 32da42297ded..4d69360a3f06 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -202,12 +202,12 @@ struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
for (;;) { rq = task_rq(p); - raw_spin_lock(&rq->lock); + raw_spin_rq_lock(rq); if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) { rq_pin_lock(rq, rf); return rq; } - raw_spin_unlock(&rq->lock); + raw_spin_rq_unlock(rq);
while (unlikely(task_on_rq_migrating(p))) cpu_relax(); @@ -226,7 +226,7 @@ struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) for (;;) { raw_spin_lock_irqsave(&p->pi_lock, rf->flags); rq = task_rq(p); - raw_spin_lock(&rq->lock); + raw_spin_rq_lock(rq); /* * move_queued_task() task_rq_lock() * @@ -248,7 +248,7 @@ struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) rq_pin_lock(rq, rf); return rq; } - raw_spin_unlock(&rq->lock); + raw_spin_rq_unlock(rq); raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
while (unlikely(task_on_rq_migrating(p))) @@ -318,7 +318,7 @@ void update_rq_clock(struct rq *rq) { s64 delta;
- lockdep_assert_held(&rq->lock); + lockdep_assert_rq_held(rq);
if (rq->clock_update_flags & RQCF_ACT_SKIP) return; @@ -617,7 +617,7 @@ void resched_curr(struct rq *rq) struct task_struct *curr = rq->curr; int cpu;
- lockdep_assert_held(&rq->lock); + lockdep_assert_rq_held(rq);
if (test_tsk_need_resched(curr)) return; @@ -641,10 +641,10 @@ void resched_cpu(int cpu) struct rq *rq = cpu_rq(cpu); unsigned long flags;
- raw_spin_lock_irqsave(&rq->lock, flags); + raw_spin_rq_lock_irqsave(rq, flags); if (cpu_online(cpu) || cpu == smp_processor_id()) resched_curr(rq); - raw_spin_unlock_irqrestore(&rq->lock, flags); + raw_spin_rq_unlock_irqrestore(rq, flags); }
#ifdef CONFIG_SMP @@ -1146,7 +1146,7 @@ static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p, struct uclamp_se *uc_se = &p->uclamp[clamp_id]; struct uclamp_bucket *bucket;
- lockdep_assert_held(&rq->lock); + lockdep_assert_rq_held(rq);
/* Update task effective clamp */ p->uclamp[clamp_id] = uclamp_eff_get(p, clamp_id); @@ -1186,7 +1186,7 @@ static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p, unsigned int bkt_clamp; unsigned int rq_clamp;
- lockdep_assert_held(&rq->lock); + lockdep_assert_rq_held(rq);
/* * If sched_uclamp_used was enabled after task @p was enqueued, @@ -1755,7 +1755,7 @@ static inline bool is_cpu_allowed(struct task_struct *p, int cpu) static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf, struct task_struct *p, int new_cpu) { - lockdep_assert_held(&rq->lock); + lockdep_assert_rq_held(rq);
deactivate_task(rq, p, DEQUEUE_NOCLOCK); set_task_cpu(p, new_cpu); @@ -1867,7 +1867,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) * Because __kthread_bind() calls this on blocked tasks without * holding rq->lock. */ - lockdep_assert_held(&rq->lock); + lockdep_assert_rq_held(rq); dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK); } if (running) @@ -2004,7 +2004,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) * task_rq_lock(). */ WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) || - lockdep_is_held(&task_rq(p)->lock))); + lockdep_is_held(rq_lockp(task_rq(p))))); #endif /* * Clearly, migrating tasks to offline CPUs is a fairly daft thing. @@ -2515,7 +2515,7 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, { int en_flags = ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK;
- lockdep_assert_held(&rq->lock); + lockdep_assert_rq_held(rq);
if (p->sched_contributes_to_load) rq->nr_uninterruptible--; @@ -3565,10 +3565,10 @@ prepare_lock_switch(struct rq *rq, struct task_struct *next, struct rq_flags *rf * do an early lockdep release here: */ rq_unpin_lock(rq, rf); - spin_release(&rq->lock.dep_map, _THIS_IP_); + spin_release(&rq_lockp(rq)->dep_map, _THIS_IP_); #ifdef CONFIG_DEBUG_SPINLOCK /* this is a valid case when another task releases the spinlock */ - rq->lock.owner = next; + rq_lockp(rq)->owner = next; #endif }
@@ -3579,8 +3579,8 @@ static inline void finish_lock_switch(struct rq *rq) * fix up the runqueue lock - which gets 'carried over' from * prev into current: */ - spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); - raw_spin_unlock_irq(&rq->lock); + spin_acquire(&rq_lockp(rq)->dep_map, 0, 0, _THIS_IP_); + raw_spin_rq_unlock_irq(rq); }
/* @@ -3730,7 +3730,7 @@ static void __balance_callback(struct rq *rq) void (*func)(struct rq *rq); unsigned long flags;
- raw_spin_lock_irqsave(&rq->lock, flags); + raw_spin_rq_lock_irqsave(rq, flags); head = rq->balance_callback; rq->balance_callback = NULL; while (head) { @@ -3741,7 +3741,7 @@ static void __balance_callback(struct rq *rq)
func(rq); } - raw_spin_unlock_irqrestore(&rq->lock, flags); + raw_spin_rq_unlock_irqrestore(rq, flags); }
static inline void balance_callback(struct rq *rq) @@ -6786,7 +6786,7 @@ void __init init_idle(struct task_struct *idle, int cpu) __sched_fork(0, idle);
raw_spin_lock_irqsave(&idle->pi_lock, flags); - raw_spin_lock(&rq->lock); + raw_spin_rq_lock(rq);
idle->state = TASK_RUNNING; idle->se.exec_start = sched_clock(); @@ -6821,7 +6821,7 @@ void __init init_idle(struct task_struct *idle, int cpu) #ifdef CONFIG_SMP idle->on_cpu = 1; #endif - raw_spin_unlock(&rq->lock); + raw_spin_rq_unlock(rq); raw_spin_unlock_irqrestore(&idle->pi_lock, flags);
/* Set the preempt count _outside_ the spinlocks! */ @@ -7393,7 +7393,7 @@ void __init sched_init(void) struct rq *rq;
rq = cpu_rq(i); - raw_spin_lock_init(&rq->lock); + raw_spin_lock_init(&rq->__lock); rq->nr_running = 0; rq->calc_load_active = 0; rq->calc_load_update = jiffies + LOAD_FREQ; diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index e51c1f524b8c..5c00d3dcee8d 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -112,7 +112,7 @@ static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu, /* * Take rq->lock to make 64-bit read safe on 32-bit platforms. */ - raw_spin_lock_irq(&cpu_rq(cpu)->lock); + raw_spin_rq_lock_irq(cpu_rq(cpu)); #endif
if (index == CPUACCT_STAT_NSTATS) { @@ -126,7 +126,7 @@ static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu, }
#ifndef CONFIG_64BIT - raw_spin_unlock_irq(&cpu_rq(cpu)->lock); + raw_spin_rq_unlock_irq(cpu_rq(cpu)); #endif
return data; @@ -141,14 +141,14 @@ static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) /* * Take rq->lock to make 64-bit write safe on 32-bit platforms. */ - raw_spin_lock_irq(&cpu_rq(cpu)->lock); + raw_spin_rq_lock_irq(cpu_rq(cpu)); #endif
for (i = 0; i < CPUACCT_STAT_NSTATS; i++) cpuusage->usages[i] = val;
#ifndef CONFIG_64BIT - raw_spin_unlock_irq(&cpu_rq(cpu)->lock); + raw_spin_rq_unlock_irq(cpu_rq(cpu)); #endif }
@@ -253,13 +253,13 @@ static int cpuacct_all_seq_show(struct seq_file *m, void *V) * Take rq->lock to make 64-bit read safe on 32-bit * platforms. */ - raw_spin_lock_irq(&cpu_rq(cpu)->lock); + raw_spin_rq_lock_irq(cpu_rq(cpu)); #endif
seq_printf(m, " %llu", cpuusage->usages[index]);
#ifndef CONFIG_64BIT - raw_spin_unlock_irq(&cpu_rq(cpu)->lock); + raw_spin_rq_unlock_irq(cpu_rq(cpu)); #endif } seq_puts(m, "\n"); diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 8255267ce323..990bffb1443d 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -141,7 +141,7 @@ void __add_running_bw(u64 dl_bw, struct dl_rq *dl_rq) { u64 old = dl_rq->running_bw;
- lockdep_assert_held(&(rq_of_dl_rq(dl_rq))->lock); + lockdep_assert_rq_held(rq_of_dl_rq(dl_rq)); dl_rq->running_bw += dl_bw; SCHED_WARN_ON(dl_rq->running_bw < old); /* overflow */ SCHED_WARN_ON(dl_rq->running_bw > dl_rq->this_bw); @@ -154,7 +154,7 @@ void __sub_running_bw(u64 dl_bw, struct dl_rq *dl_rq) { u64 old = dl_rq->running_bw;
- lockdep_assert_held(&(rq_of_dl_rq(dl_rq))->lock); + lockdep_assert_rq_held(rq_of_dl_rq(dl_rq)); dl_rq->running_bw -= dl_bw; SCHED_WARN_ON(dl_rq->running_bw > old); /* underflow */ if (dl_rq->running_bw > old) @@ -168,7 +168,7 @@ void __add_rq_bw(u64 dl_bw, struct dl_rq *dl_rq) { u64 old = dl_rq->this_bw;
- lockdep_assert_held(&(rq_of_dl_rq(dl_rq))->lock); + lockdep_assert_rq_held(rq_of_dl_rq(dl_rq)); dl_rq->this_bw += dl_bw; SCHED_WARN_ON(dl_rq->this_bw < old); /* overflow */ } @@ -178,7 +178,7 @@ void __sub_rq_bw(u64 dl_bw, struct dl_rq *dl_rq) { u64 old = dl_rq->this_bw;
- lockdep_assert_held(&(rq_of_dl_rq(dl_rq))->lock); + lockdep_assert_rq_held(rq_of_dl_rq(dl_rq)); dl_rq->this_bw -= dl_bw; SCHED_WARN_ON(dl_rq->this_bw > old); /* underflow */ if (dl_rq->this_bw > old) @@ -985,7 +985,7 @@ static int start_dl_timer(struct task_struct *p) ktime_t now, act; s64 delta;
- lockdep_assert_held(&rq->lock); + lockdep_assert_rq_held(rq);
/* * We want the timer to fire at the deadline, but considering @@ -1095,9 +1095,9 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) * If the runqueue is no longer available, migrate the * task elsewhere. This necessarily changes rq. */ - lockdep_unpin_lock(&rq->lock, rf.cookie); + lockdep_unpin_lock(rq_lockp(rq), rf.cookie); rq = dl_task_offline_migration(rq, p); - rf.cookie = lockdep_pin_lock(&rq->lock); + rf.cookie = lockdep_pin_lock(rq_lockp(rq)); update_rq_clock(rq);
/* @@ -1733,7 +1733,7 @@ static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused * from try_to_wake_up(). Hence, p->pi_lock is locked, but * rq->lock is not... So, lock it */ - raw_spin_lock(&rq->lock); + raw_spin_rq_lock(rq); if (p->dl.dl_non_contending) { update_rq_clock(rq); sub_running_bw(&p->dl, &rq->dl); @@ -1749,7 +1749,7 @@ static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused put_task_struct(p); } sub_rq_bw(&p->dl, &rq->dl); - raw_spin_unlock(&rq->lock); + raw_spin_rq_unlock(rq); }
static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p) diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index a260ff7800db..dda6e77accc2 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -570,7 +570,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock", SPLIT_NS(cfs_rq->exec_clock));
- raw_spin_lock_irqsave(&rq->lock, flags); + raw_spin_rq_lock_irqsave(rq, flags); if (rb_first_cached(&cfs_rq->tasks_timeline)) MIN_vruntime = (__pick_first_entity(cfs_rq))->vruntime; last = __pick_last_entity(cfs_rq); @@ -578,7 +578,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) max_vruntime = last->vruntime; min_vruntime = cfs_rq->min_vruntime; rq0_min_vruntime = cpu_rq(0)->cfs.min_vruntime; - raw_spin_unlock_irqrestore(&rq->lock, flags); + raw_spin_rq_unlock_irqrestore(rq, flags); SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime", SPLIT_NS(MIN_vruntime)); SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "min_vruntime", diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 49aeab435237..5aa269fbb068 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1131,7 +1131,7 @@ struct numa_group { static struct numa_group *deref_task_numa_group(struct task_struct *p) { return rcu_dereference_check(p->numa_group, p == current || - (lockdep_is_held(&task_rq(p)->lock) && !READ_ONCE(p->on_cpu))); + (lockdep_is_held(rq_lockp(task_rq(p))) && !READ_ONCE(p->on_cpu))); }
static struct numa_group *deref_curr_numa_group(struct task_struct *p) @@ -5454,7 +5454,7 @@ static void __maybe_unused update_runtime_enabled(struct rq *rq) { struct task_group *tg;
- lockdep_assert_held(&rq->lock); + lockdep_assert_rq_held(rq);
rcu_read_lock(); list_for_each_entry_rcu(tg, &task_groups, list) { @@ -5473,7 +5473,7 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) { struct task_group *tg;
- lockdep_assert_held(&rq->lock); + lockdep_assert_rq_held(rq);
#ifdef CONFIG_QOS_SCHED unthrottle_qos_cfs_rqs(cpu_of(rq)); @@ -6996,7 +6996,7 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu) * In case of TASK_ON_RQ_MIGRATING we in fact hold the 'old' * rq->lock and can modify state directly. */ - lockdep_assert_held(&task_rq(p)->lock); + lockdep_assert_rq_held(task_rq(p)); detach_entity_cfs_rq(&p->se);
} else { @@ -8093,7 +8093,7 @@ static int task_hot(struct task_struct *p, struct lb_env *env) { s64 delta;
- lockdep_assert_held(&env->src_rq->lock); + lockdep_assert_rq_held(env->src_rq);
if (p->sched_class != &fair_sched_class) return 0; @@ -8191,7 +8191,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) { int tsk_cache_hot;
- lockdep_assert_held(&env->src_rq->lock); + lockdep_assert_rq_held(env->src_rq);
/* * We do not migrate tasks that are: @@ -8279,7 +8279,7 @@ can_migrate_task_llc(struct task_struct *p, struct rq *rq, struct rq *dst_rq) { int dst_cpu = dst_rq->cpu;
- lockdep_assert_held(&rq->lock); + lockdep_assert_rq_held(rq);
if (throttled_lb_pair(task_group(p), cpu_of(rq), dst_cpu)) return false; @@ -8303,7 +8303,7 @@ can_migrate_task_llc(struct task_struct *p, struct rq *rq, struct rq *dst_rq) */ static void detach_task(struct task_struct *p, struct rq *src_rq, int dst_cpu) { - lockdep_assert_held(&src_rq->lock); + lockdep_assert_rq_held(src_rq);
deactivate_task(src_rq, p, DEQUEUE_NOCLOCK); set_task_cpu(p, dst_cpu); @@ -8319,7 +8319,7 @@ static struct task_struct *detach_one_task(struct lb_env *env) { struct task_struct *p;
- lockdep_assert_held(&env->src_rq->lock); + lockdep_assert_rq_held(env->src_rq);
list_for_each_entry_reverse(p, &env->src_rq->cfs_tasks, se.group_node) { @@ -8355,7 +8355,7 @@ static int detach_tasks(struct lb_env *env) struct task_struct *p; int detached = 0;
- lockdep_assert_held(&env->src_rq->lock); + lockdep_assert_rq_held(env->src_rq);
if (env->imbalance <= 0) return 0; @@ -8476,7 +8476,7 @@ static int detach_tasks(struct lb_env *env) */ static void attach_task(struct rq *rq, struct task_struct *p) { - lockdep_assert_held(&rq->lock); + lockdep_assert_rq_held(rq);
BUG_ON(task_rq(p) != rq); activate_task(rq, p, ENQUEUE_NOCLOCK); @@ -10427,7 +10427,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, if (need_active_balance(&env)) { unsigned long flags;
- raw_spin_lock_irqsave(&busiest->lock, flags); + raw_spin_rq_lock_irqsave(busiest, flags);
/* * Don't kick the active_load_balance_cpu_stop, @@ -10435,8 +10435,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, * moved to this_cpu: */ if (!cpumask_test_cpu(this_cpu, busiest->curr->cpus_ptr)) { - raw_spin_unlock_irqrestore(&busiest->lock, - flags); + raw_spin_rq_unlock_irqrestore(busiest, flags); env.flags |= LBF_ALL_PINNED; goto out_one_pinned; } @@ -10451,7 +10450,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, busiest->push_cpu = this_cpu; active_balance = 1; } - raw_spin_unlock_irqrestore(&busiest->lock, flags); + raw_spin_rq_unlock_irqrestore(busiest, flags);
if (active_balance) { stop_one_cpu_nowait(cpu_of(busiest), @@ -11203,7 +11202,7 @@ static void nohz_newidle_balance(struct rq *this_rq) time_before(jiffies, READ_ONCE(nohz.next_blocked))) return;
- raw_spin_unlock(&this_rq->lock); + raw_spin_rq_unlock(this_rq); /* * This CPU is going to be idle and blocked load of idle CPUs * need to be updated. Run the ilb locally as it is a good @@ -11212,7 +11211,7 @@ static void nohz_newidle_balance(struct rq *this_rq) */ if (!_nohz_idle_balance(this_rq, NOHZ_STATS_KICK, CPU_NEWLY_IDLE)) kick_ilb(NOHZ_STATS_KICK); - raw_spin_lock(&this_rq->lock); + raw_spin_rq_lock(this_rq); }
#else /* !CONFIG_NO_HZ_COMMON */ @@ -11273,7 +11272,7 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf) goto out; }
- raw_spin_unlock(&this_rq->lock); + raw_spin_rq_unlock(this_rq);
update_blocked_averages(this_cpu); rcu_read_lock(); @@ -11311,7 +11310,7 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf) } rcu_read_unlock();
- raw_spin_lock(&this_rq->lock); + raw_spin_rq_lock(this_rq);
if (curr_cost > this_rq->max_idle_balance_cost) this_rq->max_idle_balance_cost = curr_cost; @@ -11392,7 +11391,7 @@ detach_next_task(struct cfs_rq *cfs_rq, struct rq *dst_rq) struct task_struct *p; struct rq *rq = rq_of(cfs_rq);
- lockdep_assert_held(&rq_of(cfs_rq)->lock); + lockdep_assert_rq_held(rq_of(cfs_rq));
list_for_each_entry_reverse(p, &rq->cfs_tasks, se.group_node) { if (can_migrate_task_llc(p, rq, dst_rq)) { @@ -11424,7 +11423,7 @@ static int steal_from(struct rq *dst_rq, struct rq_flags *dst_rf, bool *locked,
if (*locked) { rq_unpin_lock(dst_rq, dst_rf); - raw_spin_unlock(&dst_rq->lock); + raw_spin_rq_unlock(dst_rq); *locked = false; } rq_lock_irqsave(src_rq, &rf); @@ -11438,7 +11437,7 @@ static int steal_from(struct rq *dst_rq, struct rq_flags *dst_rf, bool *locked, rq_unlock(src_rq, &rf);
if (p) { - raw_spin_lock(&dst_rq->lock); + raw_spin_rq_lock(dst_rq); rq_repin_lock(dst_rq, dst_rf); *locked = true; update_rq_clock(dst_rq); @@ -11518,7 +11517,7 @@ static int try_steal(struct rq *dst_rq, struct rq_flags *dst_rf) out: rcu_read_unlock(); if (!locked) { - raw_spin_lock(&dst_rq->lock); + raw_spin_rq_lock(dst_rq); rq_repin_lock(dst_rq, dst_rf); } stolen |= (dst_rq->cfs.h_nr_running > 0); @@ -11945,9 +11944,9 @@ void unregister_fair_sched_group(struct task_group *tg)
rq = cpu_rq(cpu);
- raw_spin_lock_irqsave(&rq->lock, flags); + raw_spin_rq_lock_irqsave(rq, flags); list_del_leaf_cfs_rq(tg->cfs_rq[cpu]); - raw_spin_unlock_irqrestore(&rq->lock, flags); + raw_spin_rq_unlock_irqrestore(rq, flags); } }
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 2593a733c084..c781dc136a5b 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -448,10 +448,10 @@ struct task_struct *pick_next_task_idle(struct rq *rq) static void dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags) { - raw_spin_unlock_irq(&rq->lock); + raw_spin_rq_unlock_irq(rq); printk(KERN_ERR "bad: scheduling from the idle thread!\n"); dump_stack(); - raw_spin_lock_irq(&rq->lock); + raw_spin_rq_lock_irq(rq); }
/* diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h index 89150ced09cf..dc3a730e0d04 100644 --- a/kernel/sched/pelt.h +++ b/kernel/sched/pelt.h @@ -134,7 +134,7 @@ static inline void update_idle_rq_clock_pelt(struct rq *rq)
static inline u64 rq_clock_pelt(struct rq *rq) { - lockdep_assert_held(&rq->lock); + lockdep_assert_rq_held(rq); assert_clock_updated(rq);
return rq->clock_pelt - rq->lost_idle_time; diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 59c3e20943ac..4e971b5cea19 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -890,7 +890,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) if (skip) continue;
- raw_spin_lock(&rq->lock); + raw_spin_rq_lock(rq); update_rq_clock(rq);
if (rt_rq->rt_time) { @@ -928,7 +928,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
if (enqueue) sched_rt_rq_enqueue(rt_rq); - raw_spin_unlock(&rq->lock); + raw_spin_rq_unlock(rq); }
if (!throttled && (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)) @@ -2097,9 +2097,9 @@ void rto_push_irq_work_func(struct irq_work *work) * When it gets updated, a check is made if a push is possible. */ if (has_pushable_tasks(rq)) { - raw_spin_lock(&rq->lock); + raw_spin_rq_lock(rq); push_rt_tasks(rq); - raw_spin_unlock(&rq->lock); + raw_spin_rq_unlock(rq); }
raw_spin_lock(&rd->rto_lock); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index b71627aacc30..4aed3a7923a2 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -937,7 +937,7 @@ DECLARE_STATIC_KEY_FALSE(sched_uclamp_used); */ struct rq { /* runqueue lock: */ - raw_spinlock_t lock; + raw_spinlock_t __lock;
/* * nr_running and cpu_load should be in the same cacheline because @@ -1153,7 +1153,7 @@ void init_qos_hrtimer(int cpu);
static inline raw_spinlock_t *rq_lockp(struct rq *rq) { - return &rq->lock; + return &rq->__lock; }
static inline void lockdep_assert_rq_held(struct rq *rq) @@ -1267,7 +1267,7 @@ static inline void assert_clock_updated(struct rq *rq)
static inline u64 rq_clock(struct rq *rq) { - lockdep_assert_held(&rq->lock); + lockdep_assert_rq_held(rq); assert_clock_updated(rq);
return rq->clock; @@ -1275,7 +1275,7 @@ static inline u64 rq_clock(struct rq *rq)
static inline u64 rq_clock_task(struct rq *rq) { - lockdep_assert_held(&rq->lock); + lockdep_assert_rq_held(rq); assert_clock_updated(rq);
return rq->clock_task; @@ -1301,7 +1301,7 @@ static inline u64 rq_clock_thermal(struct rq *rq)
static inline void rq_clock_skip_update(struct rq *rq) { - lockdep_assert_held(&rq->lock); + lockdep_assert_rq_held(rq); rq->clock_update_flags |= RQCF_REQ_SKIP; }
@@ -1311,7 +1311,7 @@ static inline void rq_clock_skip_update(struct rq *rq) */ static inline void rq_clock_cancel_skipupdate(struct rq *rq) { - lockdep_assert_held(&rq->lock); + lockdep_assert_rq_held(rq); rq->clock_update_flags &= ~RQCF_REQ_SKIP; }
@@ -1340,7 +1340,7 @@ struct rq_flags { */ static inline void rq_pin_lock(struct rq *rq, struct rq_flags *rf) { - rf->cookie = lockdep_pin_lock(&rq->lock); + rf->cookie = lockdep_pin_lock(rq_lockp(rq));
#ifdef CONFIG_SCHED_DEBUG rq->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP); @@ -1355,12 +1355,12 @@ static inline void rq_unpin_lock(struct rq *rq, struct rq_flags *rf) rf->clock_update_flags = RQCF_UPDATED; #endif
- lockdep_unpin_lock(&rq->lock, rf->cookie); + lockdep_unpin_lock(rq_lockp(rq), rf->cookie); }
static inline void rq_repin_lock(struct rq *rq, struct rq_flags *rf) { - lockdep_repin_lock(&rq->lock, rf->cookie); + lockdep_repin_lock(rq_lockp(rq), rf->cookie);
#ifdef CONFIG_SCHED_DEBUG /* @@ -1381,7 +1381,7 @@ static inline void __task_rq_unlock(struct rq *rq, struct rq_flags *rf) __releases(rq->lock) { rq_unpin_lock(rq, rf); - raw_spin_unlock(&rq->lock); + raw_spin_rq_unlock(rq); }
static inline void @@ -1390,7 +1390,7 @@ task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf) __releases(p->pi_lock) { rq_unpin_lock(rq, rf); - raw_spin_unlock(&rq->lock); + raw_spin_rq_unlock(rq); raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); }
@@ -1398,7 +1398,7 @@ static inline void rq_lock_irqsave(struct rq *rq, struct rq_flags *rf) __acquires(rq->lock) { - raw_spin_lock_irqsave(&rq->lock, rf->flags); + raw_spin_rq_lock_irqsave(rq, rf->flags); rq_pin_lock(rq, rf); }
@@ -1406,7 +1406,7 @@ static inline void rq_lock_irq(struct rq *rq, struct rq_flags *rf) __acquires(rq->lock) { - raw_spin_lock_irq(&rq->lock); + raw_spin_rq_lock_irq(rq); rq_pin_lock(rq, rf); }
@@ -1414,7 +1414,7 @@ static inline void rq_lock(struct rq *rq, struct rq_flags *rf) __acquires(rq->lock) { - raw_spin_lock(&rq->lock); + raw_spin_rq_lock(rq); rq_pin_lock(rq, rf); }
@@ -1422,7 +1422,7 @@ static inline void rq_relock(struct rq *rq, struct rq_flags *rf) __acquires(rq->lock) { - raw_spin_lock(&rq->lock); + raw_spin_rq_lock(rq); rq_repin_lock(rq, rf); }
@@ -1431,7 +1431,7 @@ rq_unlock_irqrestore(struct rq *rq, struct rq_flags *rf) __releases(rq->lock) { rq_unpin_lock(rq, rf); - raw_spin_unlock_irqrestore(&rq->lock, rf->flags); + raw_spin_rq_unlock_irqrestore(rq, rf->flags); }
static inline void @@ -1439,7 +1439,7 @@ rq_unlock_irq(struct rq *rq, struct rq_flags *rf) __releases(rq->lock) { rq_unpin_lock(rq, rf); - raw_spin_unlock_irq(&rq->lock); + raw_spin_rq_unlock_irq(rq); }
static inline void @@ -1447,7 +1447,7 @@ rq_unlock(struct rq *rq, struct rq_flags *rf) __releases(rq->lock) { rq_unpin_lock(rq, rf); - raw_spin_unlock(&rq->lock); + raw_spin_rq_unlock(rq); }
static inline struct rq * @@ -1516,7 +1516,7 @@ queue_balance_callback(struct rq *rq, struct callback_head *head, void (*func)(struct rq *rq)) { - lockdep_assert_held(&rq->lock); + lockdep_assert_rq_held(rq);
if (unlikely(head->next)) return; @@ -2221,7 +2221,7 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) __acquires(busiest->lock) __acquires(this_rq->lock) { - raw_spin_unlock(&this_rq->lock); + raw_spin_rq_unlock(this_rq); double_rq_lock(this_rq, busiest);
return 1; @@ -2240,20 +2240,22 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) __acquires(busiest->lock) __acquires(this_rq->lock) { - int ret = 0; - - if (unlikely(!raw_spin_trylock(&busiest->lock))) { - if (busiest < this_rq) { - raw_spin_unlock(&this_rq->lock); - raw_spin_lock(&busiest->lock); - raw_spin_lock_nested(&this_rq->lock, - SINGLE_DEPTH_NESTING); - ret = 1; - } else - raw_spin_lock_nested(&busiest->lock, - SINGLE_DEPTH_NESTING); + if (rq_lockp(this_rq) == rq_lockp(busiest)) + return 0; + + if (likely(raw_spin_rq_trylock(busiest))) + return 0; + + if (rq_lockp(busiest) >= rq_lockp(this_rq)) { + raw_spin_rq_lock_nested(busiest, SINGLE_DEPTH_NESTING); + return 0; } - return ret; + + raw_spin_rq_unlock(this_rq); + raw_spin_rq_lock(busiest); + raw_spin_rq_lock_nested(this_rq, SINGLE_DEPTH_NESTING); + + return 1; }
#endif /* CONFIG_PREEMPTION */ @@ -2263,11 +2265,7 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) */ static inline int double_lock_balance(struct rq *this_rq, struct rq *busiest) { - if (unlikely(!irqs_disabled())) { - /* printk() doesn't work well under rq->lock */ - raw_spin_unlock(&this_rq->lock); - BUG_ON(1); - } + lockdep_assert_irqs_disabled();
return _double_lock_balance(this_rq, busiest); } @@ -2275,8 +2273,9 @@ static inline int double_lock_balance(struct rq *this_rq, struct rq *busiest) static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest) __releases(busiest->lock) { - raw_spin_unlock(&busiest->lock); - lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); + if (rq_lockp(this_rq) != rq_lockp(busiest)) + raw_spin_rq_unlock(busiest); + lock_set_subclass(&rq_lockp(this_rq)->dep_map, 0, _RET_IP_); }
static inline void double_lock(spinlock_t *l1, spinlock_t *l2) @@ -2317,16 +2316,16 @@ static inline void double_rq_lock(struct rq *rq1, struct rq *rq2) __acquires(rq2->lock) { BUG_ON(!irqs_disabled()); - if (rq1 == rq2) { - raw_spin_lock(&rq1->lock); + if (rq_lockp(rq1) == rq_lockp(rq2)) { + raw_spin_rq_lock(rq1); __acquire(rq2->lock); /* Fake it out ;) */ } else { - if (rq1 < rq2) { - raw_spin_lock(&rq1->lock); - raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING); + if (rq_lockp(rq1) < rq_lockp(rq2)) { + raw_spin_rq_lock(rq1); + raw_spin_rq_lock_nested(rq2, SINGLE_DEPTH_NESTING); } else { - raw_spin_lock(&rq2->lock); - raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING); + raw_spin_rq_lock(rq2); + raw_spin_rq_lock_nested(rq1, SINGLE_DEPTH_NESTING); } } } @@ -2341,9 +2340,9 @@ static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2) __releases(rq1->lock) __releases(rq2->lock) { - raw_spin_unlock(&rq1->lock); - if (rq1 != rq2) - raw_spin_unlock(&rq2->lock); + raw_spin_rq_unlock(rq1); + if (rq_lockp(rq1) != rq_lockp(rq2)) + raw_spin_rq_unlock(rq2); else __release(rq2->lock); } @@ -2366,7 +2365,7 @@ static inline void double_rq_lock(struct rq *rq1, struct rq *rq2) { BUG_ON(!irqs_disabled()); BUG_ON(rq1 != rq2); - raw_spin_lock(&rq1->lock); + raw_spin_rq_lock(rq1); __acquire(rq2->lock); /* Fake it out ;) */ }
@@ -2381,7 +2380,7 @@ static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2) __releases(rq2->lock) { BUG_ON(rq1 != rq2); - raw_spin_unlock(&rq1->lock); + raw_spin_rq_unlock(rq1); __release(rq2->lock); }
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 9b4e3b25ddff..2678e7590cfc 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -469,7 +469,7 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd) struct root_domain *old_rd = NULL; unsigned long flags;
- raw_spin_lock_irqsave(&rq->lock, flags); + raw_spin_rq_lock_irqsave(rq, flags);
if (rq->rd) { old_rd = rq->rd; @@ -495,7 +495,7 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd) if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) set_rq_online(rq);
- raw_spin_unlock_irqrestore(&rq->lock, flags); + raw_spin_rq_unlock_irqrestore(rq, flags);
if (old_rd) call_rcu(&old_rd->rcu, free_rootdomain);
From: Peter Zijlstra <peterz@infradead.org>
mainline inclusion
from mainline-v5.14-rc1
commit d66f1b06b5b438cd20ba3664b8eef1f9c79e84bf
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5OOWG
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------------------------------------------------
When switching on core-sched, CPUs need to agree which lock to use for their RQ.
The new rule will be that rq->core_enabled will be toggled while holding all rq->__locks that belong to a core. This means we need to double check the rq->core_enabled value after each lock acquire and retry if it changed.
This also has implications for those sites that take multiple RQ locks; they need to be careful that the second lock doesn't end up being the first lock.
Verify the lock pointer after acquiring the first lock, because if they're on the same core, holding any of the rq->__lock instances will pin the core state.
While there, change the rq->__lock order to CPU number, instead of rq address, this greatly simplifies the next patch.
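The resulting acquire/retry rule looks roughly like the sketch below (illustrative only; the real implementation is in the diff that follows):

  /* Illustrative sketch of the acquire/retry rule described above. */
  static void example_lock_rq(struct rq *rq)
  {
          for (;;) {
                  raw_spinlock_t *lock = rq_lockp(rq);

                  raw_spin_lock(lock);
                  if (likely(lock == rq_lockp(rq)))
                          return;                 /* still the lock this rq maps to */
                  raw_spin_unlock(lock);          /* raced with a core_enabled flip; retry */
          }
  }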
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Don Hiatt <dhiatt@digitalocean.com>
Tested-by: Hongyu Ning <hongyu.ning@linux.intel.com>
Tested-by: Vincent Guittot <vincent.guittot@linaro.org>
Link: https://lkml.kernel.org/r/YJUNY0dmrJMD/BIm@hirez.programming.kicks-ass.net
Signed-off-by: Lin Shengwang <linshengwang1@huawei.com>
Reviewed-by: lihua <hucool.lihua@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 kernel/sched/core.c  | 48 ++++++++++++++++++++++++++++++++++++++++++--
 kernel/sched/sched.h | 48 ++++++++++++++++----------------------------
 2 files changed, 63 insertions(+), 33 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 4d69360a3f06..0a2138a3c8f1 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -177,12 +177,37 @@ int sysctl_sched_rt_runtime = 950000;
void raw_spin_rq_lock_nested(struct rq *rq, int subclass) { - raw_spin_lock_nested(rq_lockp(rq), subclass); + raw_spinlock_t *lock; + + if (sched_core_disabled()) { + raw_spin_lock_nested(&rq->__lock, subclass); + return; + } + + for (;;) { + lock = rq_lockp(rq); + raw_spin_lock_nested(lock, subclass); + if (likely(lock == rq_lockp(rq))) + return; + raw_spin_unlock(lock); + } }
bool raw_spin_rq_trylock(struct rq *rq) { - return raw_spin_trylock(rq_lockp(rq)); + raw_spinlock_t *lock; + bool ret; + + if (sched_core_disabled()) + return raw_spin_trylock(&rq->__lock); + + for (;;) { + lock = rq_lockp(rq); + ret = raw_spin_trylock(lock); + if (!ret || (likely(lock == rq_lockp(rq)))) + return ret; + raw_spin_unlock(lock); + } }
void raw_spin_rq_unlock(struct rq *rq) @@ -190,6 +215,25 @@ void raw_spin_rq_unlock(struct rq *rq) raw_spin_unlock(rq_lockp(rq)); }
+#ifdef CONFIG_SMP +/* + * double_rq_lock - safely lock two runqueues + */ +void double_rq_lock(struct rq *rq1, struct rq *rq2) +{ + lockdep_assert_irqs_disabled(); + + if (rq_order_less(rq2, rq1)) + swap(rq1, rq2); + + raw_spin_rq_lock(rq1); + if (rq_lockp(rq1) == rq_lockp(rq2)) + return; + + raw_spin_rq_lock_nested(rq2, SINGLE_DEPTH_NESTING); +} +#endif + /* * __task_rq_lock - lock the rq @p resides on. */ diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 4aed3a7923a2..ec6be3e11d60 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1151,6 +1151,11 @@ enum task_qos_level { void init_qos_hrtimer(int cpu); #endif
+static inline bool sched_core_disabled(void) +{ + return true; +} + static inline raw_spinlock_t *rq_lockp(struct rq *rq) { return &rq->__lock; @@ -2203,10 +2208,17 @@ unsigned long arch_scale_freq_capacity(int cpu) } #endif
+ #ifdef CONFIG_SMP -#ifdef CONFIG_PREEMPTION
-static inline void double_rq_lock(struct rq *rq1, struct rq *rq2); +static inline bool rq_order_less(struct rq *rq1, struct rq *rq2) +{ + return rq1->cpu < rq2->cpu; +} + +extern void double_rq_lock(struct rq *rq1, struct rq *rq2); + +#ifdef CONFIG_PREEMPTION
/* * fair double_lock_balance: Safely acquires both rq->locks in a fair @@ -2246,14 +2258,13 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) if (likely(raw_spin_rq_trylock(busiest))) return 0;
- if (rq_lockp(busiest) >= rq_lockp(this_rq)) { + if (rq_order_less(this_rq, busiest)) { raw_spin_rq_lock_nested(busiest, SINGLE_DEPTH_NESTING); return 0; }
raw_spin_rq_unlock(this_rq); - raw_spin_rq_lock(busiest); - raw_spin_rq_lock_nested(this_rq, SINGLE_DEPTH_NESTING); + double_rq_lock(this_rq, busiest);
return 1; } @@ -2305,31 +2316,6 @@ static inline void double_raw_lock(raw_spinlock_t *l1, raw_spinlock_t *l2) raw_spin_lock_nested(l2, SINGLE_DEPTH_NESTING); }
-/* - * double_rq_lock - safely lock two runqueues - * - * Note this does not disable interrupts like task_rq_lock, - * you need to do so manually before calling. - */ -static inline void double_rq_lock(struct rq *rq1, struct rq *rq2) - __acquires(rq1->lock) - __acquires(rq2->lock) -{ - BUG_ON(!irqs_disabled()); - if (rq_lockp(rq1) == rq_lockp(rq2)) { - raw_spin_rq_lock(rq1); - __acquire(rq2->lock); /* Fake it out ;) */ - } else { - if (rq_lockp(rq1) < rq_lockp(rq2)) { - raw_spin_rq_lock(rq1); - raw_spin_rq_lock_nested(rq2, SINGLE_DEPTH_NESTING); - } else { - raw_spin_rq_lock(rq2); - raw_spin_rq_lock_nested(rq1, SINGLE_DEPTH_NESTING); - } - } -} - /* * double_rq_unlock - safely unlock two runqueues * @@ -2340,11 +2326,11 @@ static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2) __releases(rq1->lock) __releases(rq2->lock) { - raw_spin_rq_unlock(rq1); if (rq_lockp(rq1) != rq_lockp(rq2)) raw_spin_rq_unlock(rq2); else __release(rq2->lock); + raw_spin_rq_unlock(rq1); }
extern void set_rq_online (struct rq *rq);
From: Peter Zijlstra <peterz@infradead.org>
mainline inclusion from mainline-v5.14-rc1 commit 9edeaea1bc452372718837ed2ba775811baf1ba1 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5OOWG CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------------------------------------------------
Introduce the basic infrastructure to have a core wide rq->lock.
This relies on the rq->__lock order being in increasing CPU number (inside a core). It is also constrained to SMT8 per lockdep (and SMT256 per preempt_count).
Luckily SMT8 is the max supported SMT count for Linux (MIPS, SPARC and Power are known to have this).
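The SMT8 limit comes from the way a whole core is flipped on/off: __sched_core_flip() (in the diff below) holds every sibling's rq->__lock at once, taking them in increasing CPU order with one lockdep subclass each, and lockdep only provides 8 subclasses. A simplified sketch of that inner step, using the names from the diff (illustration only, not the complete function):

	int i = 0;

	local_irq_disable();
	/* take all sibling locks, in increasing CPU order, one subclass each */
	for_each_cpu(t, smt_mask)
		raw_spin_lock_nested(&cpu_rq(t)->__lock, i++);

	for_each_cpu(t, smt_mask)
		cpu_rq(t)->core_enabled = enabled;

	for_each_cpu(t, smt_mask)
		raw_spin_unlock(&cpu_rq(t)->__lock);
	local_irq_enable();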
Signed-off-by: Peter Zijlstra (Intel) peterz@infradead.org Tested-by: Don Hiatt dhiatt@digitalocean.com Tested-by: Hongyu Ning hongyu.ning@linux.intel.com Tested-by: Vincent Guittot vincent.guittot@linaro.org Link: https://lkml.kernel.org/r/YJUNfzSgptjX7tG6@hirez.programming.kicks-ass.net Signed-off-by: Lin Shengwang linshengwang1@huawei.com Reviewed-by: lihua hucool.lihua@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- kernel/Kconfig.preempt | 6 ++ kernel/sched/core.c | 164 ++++++++++++++++++++++++++++++++++++++++- kernel/sched/sched.h | 58 +++++++++++++++ 3 files changed, 224 insertions(+), 4 deletions(-)
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt index 416017301660..ea1e3331c0ba 100644 --- a/kernel/Kconfig.preempt +++ b/kernel/Kconfig.preempt @@ -99,3 +99,9 @@ config PREEMPT_DYNAMIC
Interesting if you want the same pre-built kernel should be used for both Server and Desktop workloads. + +config SCHED_CORE + bool "Core Scheduling for SMT" + default y + depends on SCHED_SMT + diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 0a2138a3c8f1..c549e584aea5 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -75,6 +75,108 @@ unsigned int sysctl_sched_rt_period = 1000000;
__read_mostly int scheduler_running;
+#ifdef CONFIG_SCHED_CORE + +DEFINE_STATIC_KEY_FALSE(__sched_core_enabled); + +/* + * Magic required such that: + * + * raw_spin_rq_lock(rq); + * ... + * raw_spin_rq_unlock(rq); + * + * ends up locking and unlocking the _same_ lock, and all CPUs + * always agree on what rq has what lock. + * + * XXX entirely possible to selectively enable cores, don't bother for now. + */ + +static DEFINE_MUTEX(sched_core_mutex); +static int sched_core_count; +static struct cpumask sched_core_mask; + +static void __sched_core_flip(bool enabled) +{ + int cpu, t, i; + + cpus_read_lock(); + + /* + * Toggle the online cores, one by one. + */ + cpumask_copy(&sched_core_mask, cpu_online_mask); + for_each_cpu(cpu, &sched_core_mask) { + const struct cpumask *smt_mask = cpu_smt_mask(cpu); + + i = 0; + local_irq_disable(); + for_each_cpu(t, smt_mask) { + /* supports up to SMT8 */ + raw_spin_lock_nested(&cpu_rq(t)->__lock, i++); + } + + for_each_cpu(t, smt_mask) + cpu_rq(t)->core_enabled = enabled; + + for_each_cpu(t, smt_mask) + raw_spin_unlock(&cpu_rq(t)->__lock); + local_irq_enable(); + + cpumask_andnot(&sched_core_mask, &sched_core_mask, smt_mask); + } + + /* + * Toggle the offline CPUs. + */ + cpumask_copy(&sched_core_mask, cpu_possible_mask); + cpumask_andnot(&sched_core_mask, &sched_core_mask, cpu_online_mask); + + for_each_cpu(cpu, &sched_core_mask) + cpu_rq(cpu)->core_enabled = enabled; + + cpus_read_unlock(); +} + +static void __sched_core_enable(void) +{ + // XXX verify there are no cookie tasks (yet) + + static_branch_enable(&__sched_core_enabled); + /* + * Ensure all previous instances of raw_spin_rq_*lock() have finished + * and future ones will observe !sched_core_disabled(). + */ + synchronize_rcu(); + __sched_core_flip(true); +} + +static void __sched_core_disable(void) +{ + // XXX verify there are no cookie tasks (left) + + __sched_core_flip(false); + static_branch_disable(&__sched_core_enabled); +} + +void sched_core_get(void) +{ + mutex_lock(&sched_core_mutex); + if (!sched_core_count++) + __sched_core_enable(); + mutex_unlock(&sched_core_mutex); +} + +void sched_core_put(void) +{ + mutex_lock(&sched_core_mutex); + if (!--sched_core_count) + __sched_core_disable(); + mutex_unlock(&sched_core_mutex); +} + +#endif /* CONFIG_SCHED_CORE */ + /* * part of the period that we allow rt tasks to run in us. * default: 0.95s @@ -179,16 +281,23 @@ void raw_spin_rq_lock_nested(struct rq *rq, int subclass) { raw_spinlock_t *lock;
+ /* Matches synchronize_rcu() in __sched_core_enable() */ + preempt_disable(); if (sched_core_disabled()) { raw_spin_lock_nested(&rq->__lock, subclass); + /* preempt_count *MUST* be > 1 */ + preempt_enable_no_resched(); return; }
for (;;) { lock = rq_lockp(rq); raw_spin_lock_nested(lock, subclass); - if (likely(lock == rq_lockp(rq))) + if (likely(lock == rq_lockp(rq))) { + /* preempt_count *MUST* be > 1 */ + preempt_enable_no_resched(); return; + } raw_spin_unlock(lock); } } @@ -198,14 +307,21 @@ bool raw_spin_rq_trylock(struct rq *rq) raw_spinlock_t *lock; bool ret;
- if (sched_core_disabled()) - return raw_spin_trylock(&rq->__lock); + /* Matches synchronize_rcu() in __sched_core_enable() */ + preempt_disable(); + if (sched_core_disabled()) { + ret = raw_spin_trylock(&rq->__lock); + preempt_enable(); + return ret; + }
for (;;) { lock = rq_lockp(rq); ret = raw_spin_trylock(lock); - if (!ret || (likely(lock == rq_lockp(rq)))) + if (!ret || (likely(lock == rq_lockp(rq)))) { + preempt_enable(); return ret; + } raw_spin_unlock(lock); } } @@ -4476,6 +4592,40 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) BUG(); }
+#ifdef CONFIG_SCHED_CORE + +static inline void sched_core_cpu_starting(unsigned int cpu) +{ + const struct cpumask *smt_mask = cpu_smt_mask(cpu); + struct rq *rq, *core_rq = NULL; + int i; + + core_rq = cpu_rq(cpu)->core; + + if (!core_rq) { + for_each_cpu(i, smt_mask) { + rq = cpu_rq(i); + if (rq->core && rq->core == rq) + core_rq = rq; + } + + if (!core_rq) + core_rq = cpu_rq(cpu); + + for_each_cpu(i, smt_mask) { + rq = cpu_rq(i); + + WARN_ON_ONCE(rq->core && rq->core != core_rq); + rq->core = core_rq; + } + } +} +#else /* !CONFIG_SCHED_CORE */ + +static inline void sched_core_cpu_starting(unsigned int cpu) {} + +#endif /* CONFIG_SCHED_CORE */ + /* * __schedule() is the main scheduler function. * @@ -7269,6 +7419,7 @@ static void sched_rq_cpu_starting(unsigned int cpu)
int sched_cpu_starting(unsigned int cpu) { + sched_core_cpu_starting(cpu); sched_rq_cpu_starting(cpu); sched_tick_start(cpu); return 0; @@ -7502,6 +7653,11 @@ void __init sched_init(void) #endif /* CONFIG_SMP */ hrtick_rq_init(rq); atomic_set(&rq->nr_iowait, 0); + +#ifdef CONFIG_SCHED_CORE + rq->core = NULL; + rq->core_enabled = 0; +#endif }
set_load_weight(&init_task, false); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index ec6be3e11d60..6b56c90f5d10 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1107,6 +1107,12 @@ struct rq { struct cpuidle_state *idle_state; #endif
+#ifdef CONFIG_SCHED_CORE + /* per rq */ + struct rq *core; + unsigned int core_enabled; +#endif + KABI_RESERVE(1) KABI_RESERVE(2) KABI_RESERVE(3) @@ -1151,6 +1157,35 @@ enum task_qos_level { void init_qos_hrtimer(int cpu); #endif
+#ifdef CONFIG_SCHED_CORE + +DECLARE_STATIC_KEY_FALSE(__sched_core_enabled); + +static inline bool sched_core_enabled(struct rq *rq) +{ + return static_branch_unlikely(&__sched_core_enabled) && rq->core_enabled; +} + +static inline bool sched_core_disabled(void) +{ + return !static_branch_unlikely(&__sched_core_enabled); +} + +static inline raw_spinlock_t *rq_lockp(struct rq *rq) +{ + if (sched_core_enabled(rq)) + return &rq->core->__lock; + + return &rq->__lock; +} + +#else /* !CONFIG_SCHED_CORE */ + +static inline bool sched_core_enabled(struct rq *rq) +{ + return false; +} + static inline bool sched_core_disabled(void) { return true; @@ -1161,6 +1196,8 @@ static inline raw_spinlock_t *rq_lockp(struct rq *rq) return &rq->__lock; }
+#endif /* CONFIG_SCHED_CORE */ + static inline void lockdep_assert_rq_held(struct rq *rq) { lockdep_assert_held(rq_lockp(rq)); @@ -2213,6 +2250,27 @@ unsigned long arch_scale_freq_capacity(int cpu)
static inline bool rq_order_less(struct rq *rq1, struct rq *rq2) { +#ifdef CONFIG_SCHED_CORE + /* + * In order to not have {0,2},{1,3} turn into into an AB-BA, + * order by core-id first and cpu-id second. + * + * Notably: + * + * double_rq_lock(0,3); will take core-0, core-1 lock + * double_rq_lock(1,2); will take core-1, core-0 lock + * + * when only cpu-id is considered. + */ + if (rq1->core->cpu < rq2->core->cpu) + return true; + if (rq1->core->cpu > rq2->core->cpu) + return false; + + /* + * __sched_core_flip() relies on SMT having cpu-id lock order. + */ +#endif return rq1->cpu < rq2->cpu; }
From: Peter Zijlstra peterz@infradead.org
mainline inclusion from mainline-v5.14-rc1 commit 9ef7e7e33bcdb57be1afb28884053c28b5f05240 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5OOWG CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------------------------------------------------
rq_lockp() includes a static_branch(), which is asm-goto, which is asm volatile, which defeats regular CSE. This means that:
	if (!static_branch(&foo))
		return simple;

	if (static_branch(&foo) && cond)
		return complex;
Doesn't fold and we get horrible code. Introduce __rq_lockp() without the static_branch() on.
Signed-off-by: Peter Zijlstra (Intel) peterz@infradead.org Tested-by: Don Hiatt dhiatt@digitalocean.com Tested-by: Hongyu Ning hongyu.ning@linux.intel.com Tested-by: Vincent Guittot vincent.guittot@linaro.org Link: https://lkml.kernel.org/r/20210422123308.316696988@infradead.org Signed-off-by: Lin Shengwang linshengwang1@huawei.com Reviewed-by: lihua hucool.lihua@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- kernel/sched/core.c | 16 ++++++++-------- kernel/sched/deadline.c | 4 ++-- kernel/sched/fair.c | 2 +- kernel/sched/sched.h | 33 +++++++++++++++++++++++++-------- 4 files changed, 36 insertions(+), 19 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c549e584aea5..89f34366fd74 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -291,9 +291,9 @@ void raw_spin_rq_lock_nested(struct rq *rq, int subclass) }
for (;;) { - lock = rq_lockp(rq); + lock = __rq_lockp(rq); raw_spin_lock_nested(lock, subclass); - if (likely(lock == rq_lockp(rq))) { + if (likely(lock == __rq_lockp(rq))) { /* preempt_count *MUST* be > 1 */ preempt_enable_no_resched(); return; @@ -316,9 +316,9 @@ bool raw_spin_rq_trylock(struct rq *rq) }
for (;;) { - lock = rq_lockp(rq); + lock = __rq_lockp(rq); ret = raw_spin_trylock(lock); - if (!ret || (likely(lock == rq_lockp(rq)))) { + if (!ret || (likely(lock == __rq_lockp(rq)))) { preempt_enable(); return ret; } @@ -343,7 +343,7 @@ void double_rq_lock(struct rq *rq1, struct rq *rq2) swap(rq1, rq2);
raw_spin_rq_lock(rq1); - if (rq_lockp(rq1) == rq_lockp(rq2)) + if (__rq_lockp(rq1) == __rq_lockp(rq2)) return;
raw_spin_rq_lock_nested(rq2, SINGLE_DEPTH_NESTING); @@ -2164,7 +2164,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) * task_rq_lock(). */ WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) || - lockdep_is_held(rq_lockp(task_rq(p))))); + lockdep_is_held(__rq_lockp(task_rq(p))))); #endif /* * Clearly, migrating tasks to offline CPUs is a fairly daft thing. @@ -3725,7 +3725,7 @@ prepare_lock_switch(struct rq *rq, struct task_struct *next, struct rq_flags *rf * do an early lockdep release here: */ rq_unpin_lock(rq, rf); - spin_release(&rq_lockp(rq)->dep_map, _THIS_IP_); + spin_release(&__rq_lockp(rq)->dep_map, _THIS_IP_); #ifdef CONFIG_DEBUG_SPINLOCK /* this is a valid case when another task releases the spinlock */ rq_lockp(rq)->owner = next; @@ -3739,7 +3739,7 @@ static inline void finish_lock_switch(struct rq *rq) * fix up the runqueue lock - which gets 'carried over' from * prev into current: */ - spin_acquire(&rq_lockp(rq)->dep_map, 0, 0, _THIS_IP_); + spin_acquire(&__rq_lockp(rq)->dep_map, 0, 0, _THIS_IP_); raw_spin_rq_unlock_irq(rq); }
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 990bffb1443d..6453bf1ddf7e 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1095,9 +1095,9 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) * If the runqueue is no longer available, migrate the * task elsewhere. This necessarily changes rq. */ - lockdep_unpin_lock(rq_lockp(rq), rf.cookie); + lockdep_unpin_lock(__rq_lockp(rq), rf.cookie); rq = dl_task_offline_migration(rq, p); - rf.cookie = lockdep_pin_lock(rq_lockp(rq)); + rf.cookie = lockdep_pin_lock(__rq_lockp(rq)); update_rq_clock(rq);
/* diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 5aa269fbb068..a596d87a39de 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1131,7 +1131,7 @@ struct numa_group { static struct numa_group *deref_task_numa_group(struct task_struct *p) { return rcu_dereference_check(p->numa_group, p == current || - (lockdep_is_held(rq_lockp(task_rq(p))) && !READ_ONCE(p->on_cpu))); + (lockdep_is_held(__rq_lockp(task_rq(p))) && !READ_ONCE(p->on_cpu))); }
static struct numa_group *deref_curr_numa_group(struct task_struct *p) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 6b56c90f5d10..3a9296fbc458 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1171,6 +1171,10 @@ static inline bool sched_core_disabled(void) return !static_branch_unlikely(&__sched_core_enabled); }
+/* + * Be careful with this function; not for general use. The return value isn't + * stable unless you actually hold a relevant rq->__lock. + */ static inline raw_spinlock_t *rq_lockp(struct rq *rq) { if (sched_core_enabled(rq)) @@ -1179,6 +1183,14 @@ static inline raw_spinlock_t *rq_lockp(struct rq *rq) return &rq->__lock; }
+static inline raw_spinlock_t *__rq_lockp(struct rq *rq) +{ + if (rq->core_enabled) + return &rq->core->__lock; + + return &rq->__lock; +} + #else /* !CONFIG_SCHED_CORE */
static inline bool sched_core_enabled(struct rq *rq) @@ -1196,11 +1208,16 @@ static inline raw_spinlock_t *rq_lockp(struct rq *rq) return &rq->__lock; }
+static inline raw_spinlock_t *__rq_lockp(struct rq *rq) +{ + return &rq->__lock; +} + #endif /* CONFIG_SCHED_CORE */
static inline void lockdep_assert_rq_held(struct rq *rq) { - lockdep_assert_held(rq_lockp(rq)); + lockdep_assert_held(__rq_lockp(rq)); }
extern void raw_spin_rq_lock_nested(struct rq *rq, int subclass); @@ -1382,7 +1399,7 @@ struct rq_flags { */ static inline void rq_pin_lock(struct rq *rq, struct rq_flags *rf) { - rf->cookie = lockdep_pin_lock(rq_lockp(rq)); + rf->cookie = lockdep_pin_lock(__rq_lockp(rq));
#ifdef CONFIG_SCHED_DEBUG rq->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP); @@ -1397,12 +1414,12 @@ static inline void rq_unpin_lock(struct rq *rq, struct rq_flags *rf) rf->clock_update_flags = RQCF_UPDATED; #endif
- lockdep_unpin_lock(rq_lockp(rq), rf->cookie); + lockdep_unpin_lock(__rq_lockp(rq), rf->cookie); }
static inline void rq_repin_lock(struct rq *rq, struct rq_flags *rf) { - lockdep_repin_lock(rq_lockp(rq), rf->cookie); + lockdep_repin_lock(__rq_lockp(rq), rf->cookie);
#ifdef CONFIG_SCHED_DEBUG /* @@ -2310,7 +2327,7 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) __acquires(busiest->lock) __acquires(this_rq->lock) { - if (rq_lockp(this_rq) == rq_lockp(busiest)) + if (__rq_lockp(this_rq) == __rq_lockp(busiest)) return 0;
if (likely(raw_spin_rq_trylock(busiest))) @@ -2342,9 +2359,9 @@ static inline int double_lock_balance(struct rq *this_rq, struct rq *busiest) static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest) __releases(busiest->lock) { - if (rq_lockp(this_rq) != rq_lockp(busiest)) + if (__rq_lockp(this_rq) != __rq_lockp(busiest)) raw_spin_rq_unlock(busiest); - lock_set_subclass(&rq_lockp(this_rq)->dep_map, 0, _RET_IP_); + lock_set_subclass(&__rq_lockp(this_rq)->dep_map, 0, _RET_IP_); }
static inline void double_lock(spinlock_t *l1, spinlock_t *l2) @@ -2384,7 +2401,7 @@ static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2) __releases(rq1->lock) __releases(rq2->lock) { - if (rq_lockp(rq1) != rq_lockp(rq2)) + if (__rq_lockp(rq1) != __rq_lockp(rq2)) raw_spin_rq_unlock(rq2); else __release(rq2->lock);
From: Peter Zijlstra peterz@infradead.org
mainline inclusion from mainline-v5.14-rc1 commit 875feb41fd20f6bd6054c9e79a5bcd9da6d8d2b2 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5OOWG CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------------------------------------------------
Stuff the meat of sched_core_put() into a work item such that we can use sched_core_put() from atomic context.
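The generic shape of the trick, as a sketch with hypothetical obj_* names (obj_count, obj_mutex and obj_teardown are placeholders, not the scheduler symbols): the fast path is a lockless atomic decrement, and only the drop of the final reference, which may need to sleep on a mutex, is punted to a work item.

	static atomic_t obj_count;
	static DEFINE_MUTEX(obj_mutex);

	static void obj_put_work(struct work_struct *work)
	{
		if (atomic_dec_and_mutex_lock(&obj_count, &obj_mutex)) {
			obj_teardown();			/* hypothetical; may sleep */
			mutex_unlock(&obj_mutex);
		}
	}

	void obj_put(void)				/* safe from atomic context */
	{
		static DECLARE_WORK(put_work, obj_put_work);

		/* drop a reference unless that would be the last one */
		if (!atomic_add_unless(&obj_count, -1, 1))
			schedule_work(&put_work);
	}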
Signed-off-by: Peter Zijlstra (Intel) peterz@infradead.org Tested-by: Don Hiatt dhiatt@digitalocean.com Tested-by: Hongyu Ning hongyu.ning@linux.intel.com Tested-by: Vincent Guittot vincent.guittot@linaro.org Link: https://lkml.kernel.org/r/20210422123308.377455632@infradead.org Signed-off-by: Lin Shengwang linshengwang1@huawei.com Reviewed-by: lihua hucool.lihua@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- kernel/sched/core.c | 33 +++++++++++++++++++++++++++------ 1 file changed, 27 insertions(+), 6 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 89f34366fd74..86a256b490fa 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -93,7 +93,7 @@ DEFINE_STATIC_KEY_FALSE(__sched_core_enabled); */
static DEFINE_MUTEX(sched_core_mutex); -static int sched_core_count; +static atomic_t sched_core_count; static struct cpumask sched_core_mask;
static void __sched_core_flip(bool enabled) @@ -161,18 +161,39 @@ static void __sched_core_disable(void)
void sched_core_get(void) { + if (atomic_inc_not_zero(&sched_core_count)) + return; + mutex_lock(&sched_core_mutex); - if (!sched_core_count++) + if (!atomic_read(&sched_core_count)) __sched_core_enable(); + + smp_mb__before_atomic(); + atomic_inc(&sched_core_count); mutex_unlock(&sched_core_mutex); }
-void sched_core_put(void) +static void __sched_core_put(struct work_struct *work) { - mutex_lock(&sched_core_mutex); - if (!--sched_core_count) + if (atomic_dec_and_mutex_lock(&sched_core_count, &sched_core_mutex)) { __sched_core_disable(); - mutex_unlock(&sched_core_mutex); + mutex_unlock(&sched_core_mutex); + } +} + +void sched_core_put(void) +{ + static DECLARE_WORK(_work, __sched_core_put); + + /* + * "There can be only one" + * + * Either this is the last one, or we don't actually need to do any + * 'work'. If it is the last *again*, we rely on + * WORK_STRUCT_PENDING_BIT. + */ + if (!atomic_add_unless(&sched_core_count, -1, 1)) + schedule_work(&_work); }
#endif /* CONFIG_SCHED_CORE */
From: Peter Zijlstra peterz@infradead.org
mainline inclusion from mainline-v5.14-rc1 commit 21f56ffe4482e501b9e83737612493eeaac21f5a category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5OOWG CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------------------------------------------------
Because sched_class::pick_next_task() also implies sched_class::set_next_task() (and possibly put_prev_task() and newidle_balance), it is not state invariant. This makes it unsuitable for remote task selection.
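The shape of the split is the same for every class; a sketch with a hypothetical class "foo" (foo_highest_prio_runnable() and set_next_task_foo() are placeholders, the real per-class versions are in the diff below):

	static struct task_struct *pick_task_foo(struct rq *rq)
	{
		/* pure computation: report the best runnable task, change nothing */
		return foo_highest_prio_runnable(rq);
	}

	static struct task_struct *pick_next_task_foo(struct rq *rq)
	{
		struct task_struct *p = pick_task_foo(rq);

		if (p)
			set_next_task_foo(rq, p, true);	/* all side effects live here */

		return p;
	}

Because pick_task_foo() touches no state, it can safely be called against a remote runqueue during core-wide selection.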
Signed-off-by: Peter Zijlstra (Intel) peterz@infradead.org [Vineeth: folded fixes] Signed-off-by: Vineeth Remanan Pillai viremana@linux.microsoft.com Signed-off-by: Peter Zijlstra (Intel) peterz@infradead.org Tested-by: Don Hiatt dhiatt@digitalocean.com Tested-by: Hongyu Ning hongyu.ning@linux.intel.com Tested-by: Vincent Guittot vincent.guittot@linaro.org Link: https://lkml.kernel.org/r/20210422123308.437092775@infradead.org Signed-off-by: Lin Shengwang linshengwang1@huawei.com Reviewed-by: lihua hucool.lihua@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- kernel/sched/deadline.c | 16 ++++++++++++++-- kernel/sched/fair.c | 40 +++++++++++++++++++++++++++++++++++++--- kernel/sched/idle.c | 8 ++++++++ kernel/sched/rt.c | 15 +++++++++++++-- kernel/sched/sched.h | 3 +++ kernel/sched/stop_task.c | 14 ++++++++++++-- 6 files changed, 87 insertions(+), 9 deletions(-)
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 6453bf1ddf7e..0a4d217097ae 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1855,7 +1855,7 @@ static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq, return rb_entry(left, struct sched_dl_entity, rb_node); }
-static struct task_struct *pick_next_task_dl(struct rq *rq) +static struct task_struct *pick_task_dl(struct rq *rq) { struct sched_dl_entity *dl_se; struct dl_rq *dl_rq = &rq->dl; @@ -1867,7 +1867,18 @@ static struct task_struct *pick_next_task_dl(struct rq *rq) dl_se = pick_next_dl_entity(rq, dl_rq); BUG_ON(!dl_se); p = dl_task_of(dl_se); - set_next_task_dl(rq, p, true); + + return p; +} + +static struct task_struct *pick_next_task_dl(struct rq *rq) +{ + struct task_struct *p; + + p = pick_task_dl(rq); + if (p) + set_next_task_dl(rq, p, true); + return p; }
@@ -2531,6 +2542,7 @@ const struct sched_class dl_sched_class
#ifdef CONFIG_SMP .balance = balance_dl, + .pick_task = pick_task_dl, .select_task_rq = select_task_rq_dl, .migrate_task_rq = migrate_task_rq_dl, .set_cpus_allowed = set_cpus_allowed_dl, diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a596d87a39de..55043987fc76 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4529,6 +4529,8 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) static void set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { + clear_buddies(cfs_rq, se); + /* 'current' is not kept within the tree. */ if (se->on_rq) { /* @@ -4588,7 +4590,7 @@ pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr) * Avoid running the skip buddy, if running something else can * be done without getting too unfair. */ - if (cfs_rq->skip == se) { + if (cfs_rq->skip && cfs_rq->skip == se) { struct sched_entity *second;
if (se == curr) { @@ -4615,8 +4617,6 @@ pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr) se = cfs_rq->last; }
- clear_buddies(cfs_rq, se); - return se; }
@@ -7576,6 +7576,39 @@ void qos_smt_check_need_resched(void) } #endif
+#ifdef CONFIG_SMP +static struct task_struct *pick_task_fair(struct rq *rq) +{ + struct sched_entity *se; + struct cfs_rq *cfs_rq; + +again: + cfs_rq = &rq->cfs; + if (!cfs_rq->nr_running) + return NULL; + + do { + struct sched_entity *curr = cfs_rq->curr; + + /* When we pick for a remote RQ, we'll not have done put_prev_entity() */ + if (curr) { + if (curr->on_rq) + update_curr(cfs_rq); + else + curr = NULL; + + if (unlikely(check_cfs_rq_runtime(cfs_rq))) + goto again; + } + + se = pick_next_entity(cfs_rq, curr); + cfs_rq = group_cfs_rq(se); + } while (cfs_rq); + + return task_of(se); +} +#endif + struct task_struct * pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { @@ -12068,6 +12101,7 @@ const struct sched_class fair_sched_class
#ifdef CONFIG_SMP .balance = balance_fair, + .pick_task = pick_task_fair, .select_task_rq = select_task_rq_fair, .migrate_task_rq = migrate_task_rq_fair,
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index c781dc136a5b..3c6396d61a04 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -432,6 +432,13 @@ static void set_next_task_idle(struct rq *rq, struct task_struct *next, bool fir schedstat_inc(rq->sched_goidle); }
+#ifdef CONFIG_SMP +static struct task_struct *pick_task_idle(struct rq *rq) +{ + return rq->idle; +} +#endif + struct task_struct *pick_next_task_idle(struct rq *rq) { struct task_struct *next = rq->idle; @@ -499,6 +506,7 @@ const struct sched_class idle_sched_class
#ifdef CONFIG_SMP .balance = balance_idle, + .pick_task = pick_task_idle, .select_task_rq = select_task_rq_idle, .set_cpus_allowed = set_cpus_allowed_common, #endif diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 4e971b5cea19..11df3f66af90 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1631,7 +1631,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq) return rt_task_of(rt_se); }
-static struct task_struct *pick_next_task_rt(struct rq *rq) +static struct task_struct *pick_task_rt(struct rq *rq) { struct task_struct *p;
@@ -1639,7 +1639,17 @@ static struct task_struct *pick_next_task_rt(struct rq *rq) return NULL;
p = _pick_next_task_rt(rq); - set_next_task_rt(rq, p, true); + + return p; +} + +static struct task_struct *pick_next_task_rt(struct rq *rq) +{ + struct task_struct *p = pick_task_rt(rq); + + if (p) + set_next_task_rt(rq, p, true); + return p; }
@@ -2453,6 +2463,7 @@ const struct sched_class rt_sched_class
#ifdef CONFIG_SMP .balance = balance_rt, + .pick_task = pick_task_rt, .select_task_rq = select_task_rq_rt, .set_cpus_allowed = set_cpus_allowed_common, .rq_online = rq_online_rt, diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 3a9296fbc458..9331ba4ef976 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1983,6 +1983,9 @@ struct sched_class { #ifdef CONFIG_SMP int (*balance)(struct rq *rq, struct task_struct *prev, struct rq_flags *rf); int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags); + + struct task_struct * (*pick_task)(struct rq *rq); + void (*migrate_task_rq)(struct task_struct *p, int new_cpu);
void (*task_woken)(struct rq *this_rq, struct task_struct *task); diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index ceb5b6b12561..eb34768cd4e1 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -34,15 +34,24 @@ static void set_next_task_stop(struct rq *rq, struct task_struct *stop, bool fir stop->se.exec_start = rq_clock_task(rq); }
-static struct task_struct *pick_next_task_stop(struct rq *rq) +static struct task_struct *pick_task_stop(struct rq *rq) { if (!sched_stop_runnable(rq)) return NULL;
- set_next_task_stop(rq, rq->stop, true); return rq->stop; }
+static struct task_struct *pick_next_task_stop(struct rq *rq) +{ + struct task_struct *p = pick_task_stop(rq); + + if (p) + set_next_task_stop(rq, p, true); + + return p; +} + static void enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags) { @@ -124,6 +133,7 @@ const struct sched_class stop_sched_class
#ifdef CONFIG_SMP .balance = balance_stop, + .pick_task = pick_task_stop, .select_task_rq = select_task_rq_stop, .set_cpus_allowed = set_cpus_allowed_common, #endif
From: Peter Zijlstra peterz@infradead.org
mainline inclusion from mainline-v5.14-rc1 commit 8a311c740b53324ec584e0e3bb7077d56b123c28 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5OOWG CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------------------------------------------------
Introduce task_struct::core_cookie as an opaque identifier for core scheduling. When enabled, core scheduling will only allow tasks with matching cookies to be on a core together, where idle matches everything.
When task_struct::core_cookie is set (and core scheduling is enabled) these tasks are indexed in a second RB-tree, first on cookie value then on scheduling function, such that matching task selection always finds the most eligible match.
NOTE: *shudder* at the overhead...
NOTE: *sigh*, a 3rd copy of the scheduling function; the alternative is per class tracking of cookies and that just duplicates a lot of stuff for no raisin (the 2nd copy lives in the rt-mutex PI code).
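The two-level ordering in one place; this is essentially the __sched_core_less() the diff adds, shown in isolation:

	static bool core_tree_less(struct task_struct *a, struct task_struct *b)
	{
		/* group by cookie first */
		if (a->core_cookie != b->core_cookie)
			return a->core_cookie < b->core_cookie;

		/*
		 * Within one cookie, flip the comparison so the highest
		 * priority task sorts leftmost and rb_find_first() on the
		 * cookie returns it directly.
		 */
		return prio_less(b, a);
	}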
[Joel: folded fixes] Signed-off-by: Peter Zijlstra (Intel) peterz@infradead.org Signed-off-by: Joel Fernandes (Google) joel@joelfernandes.org Signed-off-by: Peter Zijlstra (Intel) peterz@infradead.org Tested-by: Don Hiatt dhiatt@digitalocean.com Tested-by: Hongyu Ning hongyu.ning@linux.intel.com Tested-by: Vincent Guittot vincent.guittot@linaro.org Link: https://lkml.kernel.org/r/20210422123308.496975854@infradead.org Signed-off-by: Lin Shengwang linshengwang1@huawei.com Reviewed-by: lihua hucool.lihua@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/sched.h | 8 ++- kernel/sched/core.c | 152 ++++++++++++++++++++++++++++++++++++++++-- kernel/sched/fair.c | 46 ------------- kernel/sched/sched.h | 55 +++++++++++++++ 4 files changed, 210 insertions(+), 51 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h index 47f462040f4d..d2a399e0cd9b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -729,10 +729,16 @@ struct task_struct { const struct sched_class *sched_class; struct sched_entity se; struct sched_rt_entity rt; + struct sched_dl_entity dl; + +#ifdef CONFIG_SCHED_CORE + struct rb_node core_node; + unsigned long core_cookie; +#endif + #ifdef CONFIG_CGROUP_SCHED struct task_group *sched_task_group; #endif - struct sched_dl_entity dl;
#ifdef CONFIG_UCLAMP_TASK /* diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 86a256b490fa..29935c1e2905 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -79,6 +79,133 @@ __read_mostly int scheduler_running;
DEFINE_STATIC_KEY_FALSE(__sched_core_enabled);
+/* kernel prio, less is more */ +static inline int __task_prio(struct task_struct *p) +{ + if (p->sched_class == &stop_sched_class) /* trumps deadline */ + return -2; + + if (rt_prio(p->prio)) /* includes deadline */ + return p->prio; /* [-1, 99] */ + + if (p->sched_class == &idle_sched_class) + return MAX_RT_PRIO + NICE_WIDTH; /* 140 */ + + return MAX_RT_PRIO + MAX_NICE; /* 120, squash fair */ +} + +/* + * l(a,b) + * le(a,b) := !l(b,a) + * g(a,b) := l(b,a) + * ge(a,b) := !l(a,b) + */ + +/* real prio, less is less */ +static inline bool prio_less(struct task_struct *a, struct task_struct *b) +{ + + int pa = __task_prio(a), pb = __task_prio(b); + + if (-pa < -pb) + return true; + + if (-pb < -pa) + return false; + + if (pa == -1) /* dl_prio() doesn't work because of stop_class above */ + return !dl_time_before(a->dl.deadline, b->dl.deadline); + + if (pa == MAX_RT_PRIO + MAX_NICE) { /* fair */ + u64 vruntime = b->se.vruntime; + + /* + * Normalize the vruntime if tasks are in different cpus. + */ + if (task_cpu(a) != task_cpu(b)) { + vruntime -= task_cfs_rq(b)->min_vruntime; + vruntime += task_cfs_rq(a)->min_vruntime; + } + + return !((s64)(a->se.vruntime - vruntime) <= 0); + } + + return false; +} + +static inline bool __sched_core_less(struct task_struct *a, struct task_struct *b) +{ + if (a->core_cookie < b->core_cookie) + return true; + + if (a->core_cookie > b->core_cookie) + return false; + + /* flip prio, so high prio is leftmost */ + if (prio_less(b, a)) + return true; + + return false; +} + +#define __node_2_sc(node) rb_entry((node), struct task_struct, core_node) + +static inline bool rb_sched_core_less(struct rb_node *a, const struct rb_node *b) +{ + return __sched_core_less(__node_2_sc(a), __node_2_sc(b)); +} + +static inline int rb_sched_core_cmp(const void *key, const struct rb_node *node) +{ + const struct task_struct *p = __node_2_sc(node); + unsigned long cookie = (unsigned long)key; + + if (cookie < p->core_cookie) + return -1; + + if (cookie > p->core_cookie) + return 1; + + return 0; +} + +static void sched_core_enqueue(struct rq *rq, struct task_struct *p) +{ + rq->core->core_task_seq++; + + if (!p->core_cookie) + return; + + rb_add(&p->core_node, &rq->core_tree, rb_sched_core_less); +} + +static void sched_core_dequeue(struct rq *rq, struct task_struct *p) +{ + rq->core->core_task_seq++; + + if (!p->core_cookie) + return; + + rb_erase(&p->core_node, &rq->core_tree); +} + +/* + * Find left-most (aka, highest priority) task matching @cookie. + */ +static struct task_struct *sched_core_find(struct rq *rq, unsigned long cookie) +{ + struct rb_node *node; + + node = rb_find_first((void *)cookie, &rq->core_tree, rb_sched_core_cmp); + /* + * The idle task always matches any cookie! + */ + if (!node) + return idle_sched_class.pick_task(rq); + + return __node_2_sc(node); +} + /* * Magic required such that: * @@ -138,10 +265,16 @@ static void __sched_core_flip(bool enabled) cpus_read_unlock(); }
-static void __sched_core_enable(void) +static void sched_core_assert_empty(void) { - // XXX verify there are no cookie tasks (yet) + int cpu;
+ for_each_possible_cpu(cpu) + WARN_ON_ONCE(!RB_EMPTY_ROOT(&cpu_rq(cpu)->core_tree)); +} + +static void __sched_core_enable(void) +{ static_branch_enable(&__sched_core_enabled); /* * Ensure all previous instances of raw_spin_rq_*lock() have finished @@ -149,12 +282,12 @@ static void __sched_core_enable(void) */ synchronize_rcu(); __sched_core_flip(true); + sched_core_assert_empty(); }
static void __sched_core_disable(void) { - // XXX verify there are no cookie tasks (left) - + sched_core_assert_empty(); __sched_core_flip(false); static_branch_disable(&__sched_core_enabled); } @@ -196,6 +329,11 @@ void sched_core_put(void) schedule_work(&_work); }
+#else /* !CONFIG_SCHED_CORE */ + +static inline void sched_core_enqueue(struct rq *rq, struct task_struct *p) { } +static inline void sched_core_dequeue(struct rq *rq, struct task_struct *p) { } + #endif /* CONFIG_SCHED_CORE */
/* @@ -1775,10 +1913,16 @@ static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
uclamp_rq_inc(rq, p); p->sched_class->enqueue_task(rq, p, flags); + + if (sched_core_enabled(rq)) + sched_core_enqueue(rq, p); }
static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags) { + if (sched_core_enabled(rq)) + sched_core_dequeue(rq, p); + if (!(flags & DEQUEUE_NOCLOCK)) update_rq_clock(rq);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 55043987fc76..21ef241894e2 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -278,33 +278,11 @@ const struct sched_class fair_sched_class; */
#ifdef CONFIG_FAIR_GROUP_SCHED -static inline struct task_struct *task_of(struct sched_entity *se) -{ - SCHED_WARN_ON(!entity_is_task(se)); - return container_of(se, struct task_struct, se); -}
/* Walk up scheduling entities hierarchy */ #define for_each_sched_entity(se) \ for (; se; se = se->parent)
-static inline struct cfs_rq *task_cfs_rq(struct task_struct *p) -{ - return p->se.cfs_rq; -} - -/* runqueue on which this entity is (to be) queued */ -static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se) -{ - return se->cfs_rq; -} - -/* runqueue "owned" by this group */ -static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) -{ - return grp->my_q; -} - static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len) { if (!path) @@ -465,33 +443,9 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
#else /* !CONFIG_FAIR_GROUP_SCHED */
-static inline struct task_struct *task_of(struct sched_entity *se) -{ - return container_of(se, struct task_struct, se); -} - #define for_each_sched_entity(se) \ for (; se; se = NULL)
-static inline struct cfs_rq *task_cfs_rq(struct task_struct *p) -{ - return &task_rq(p)->cfs; -} - -static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se) -{ - struct task_struct *p = task_of(se); - struct rq *rq = task_rq(p); - - return &rq->cfs; -} - -/* runqueue "owned" by this group */ -static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) -{ - return NULL; -} - static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len) { if (path) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 9331ba4ef976..cb5a62b051cc 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1111,6 +1111,10 @@ struct rq { /* per rq */ struct rq *core; unsigned int core_enabled; + struct rb_root core_tree; + + /* shared state */ + unsigned int core_task_seq; #endif
KABI_RESERVE(1) @@ -1281,6 +1285,57 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); #define cpu_curr(cpu) (cpu_rq(cpu)->curr) #define raw_rq() raw_cpu_ptr(&runqueues)
+#ifdef CONFIG_FAIR_GROUP_SCHED +static inline struct task_struct *task_of(struct sched_entity *se) +{ + SCHED_WARN_ON(!entity_is_task(se)); + return container_of(se, struct task_struct, se); +} + +static inline struct cfs_rq *task_cfs_rq(struct task_struct *p) +{ + return p->se.cfs_rq; +} + +/* runqueue on which this entity is (to be) queued */ +static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se) +{ + return se->cfs_rq; +} + +/* runqueue "owned" by this group */ +static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) +{ + return grp->my_q; +} + +#else + +static inline struct task_struct *task_of(struct sched_entity *se) +{ + return container_of(se, struct task_struct, se); +} + +static inline struct cfs_rq *task_cfs_rq(struct task_struct *p) +{ + return &task_rq(p)->cfs; +} + +static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se) +{ + struct task_struct *p = task_of(se); + struct rq *rq = task_rq(p); + + return &rq->cfs; +} + +/* runqueue "owned" by this group */ +static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) +{ + return NULL; +} +#endif + extern void update_rq_clock(struct rq *rq);
static inline u64 __rq_clock_broken(struct rq *rq)
From: Peter Zijlstra peterz@infradead.org
mainline inclusion from mainline-v5.14-rc1 commit 539f65125d20aacab54d02d77f10a839f45b09dc category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5OOWG CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------------------------------------------------
Instead of only selecting a local task, select a task for all SMT siblings for every reschedule on the core (irrespective of which logical CPU does the reschedule).
Signed-off-by: Peter Zijlstra (Intel) peterz@infradead.org Tested-by: Don Hiatt dhiatt@digitalocean.com Tested-by: Hongyu Ning hongyu.ning@linux.intel.com Tested-by: Vincent Guittot vincent.guittot@linaro.org Link: https://lkml.kernel.org/r/20210422123308.557559654@infradead.org Signed-off-by: Lin Shengwang linshengwang1@huawei.com Reviewed-by: lihua hucool.lihua@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- kernel/sched/core.c | 301 ++++++++++++++++++++++++++++++++++++++++++- kernel/sched/sched.h | 6 +- 2 files changed, 305 insertions(+), 2 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 29935c1e2905..a010117eb540 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4717,7 +4717,7 @@ static void put_prev_task_balance(struct rq *rq, struct task_struct *prev, * Pick up the highest-prio task: */ static inline struct task_struct * -pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) +__pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { const struct sched_class *class; struct task_struct *p; @@ -4758,6 +4758,294 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) }
#ifdef CONFIG_SCHED_CORE +static inline bool is_task_rq_idle(struct task_struct *t) +{ + return (task_rq(t)->idle == t); +} + +static inline bool cookie_equals(struct task_struct *a, unsigned long cookie) +{ + return is_task_rq_idle(a) || (a->core_cookie == cookie); +} + +static inline bool cookie_match(struct task_struct *a, struct task_struct *b) +{ + if (is_task_rq_idle(a) || is_task_rq_idle(b)) + return true; + + return a->core_cookie == b->core_cookie; +} + +// XXX fairness/fwd progress conditions +/* + * Returns + * - NULL if there is no runnable task for this class. + * - the highest priority task for this runqueue if it matches + * rq->core->core_cookie or its priority is greater than max. + * - Else returns idle_task. + */ +static struct task_struct * +pick_task(struct rq *rq, const struct sched_class *class, struct task_struct *max) +{ + struct task_struct *class_pick, *cookie_pick; + unsigned long cookie = rq->core->core_cookie; + + class_pick = class->pick_task(rq); + if (!class_pick) + return NULL; + + if (!cookie) { + /* + * If class_pick is tagged, return it only if it has + * higher priority than max. + */ + if (max && class_pick->core_cookie && + prio_less(class_pick, max)) + return idle_sched_class.pick_task(rq); + + return class_pick; + } + + /* + * If class_pick is idle or matches cookie, return early. + */ + if (cookie_equals(class_pick, cookie)) + return class_pick; + + cookie_pick = sched_core_find(rq, cookie); + + /* + * If class > max && class > cookie, it is the highest priority task on + * the core (so far) and it must be selected, otherwise we must go with + * the cookie pick in order to satisfy the constraint. + */ + if (prio_less(cookie_pick, class_pick) && + (!max || prio_less(max, class_pick))) + return class_pick; + + return cookie_pick; +} + +static struct task_struct * +pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) +{ + struct task_struct *next, *max = NULL; + const struct sched_class *class; + const struct cpumask *smt_mask; + bool need_sync; + int i, j, cpu; + + if (!sched_core_enabled(rq)) + return __pick_next_task(rq, prev, rf); + + cpu = cpu_of(rq); + + /* Stopper task is switching into idle, no need core-wide selection. */ + if (cpu_is_offline(cpu)) { + /* + * Reset core_pick so that we don't enter the fastpath when + * coming online. core_pick would already be migrated to + * another cpu during offline. + */ + rq->core_pick = NULL; + return __pick_next_task(rq, prev, rf); + } + + /* + * If there were no {en,de}queues since we picked (IOW, the task + * pointers are all still valid), and we haven't scheduled the last + * pick yet, do so now. + * + * rq->core_pick can be NULL if no selection was made for a CPU because + * it was either offline or went offline during a sibling's core-wide + * selection. In this case, do a core-wide selection. 
+ */ + if (rq->core->core_pick_seq == rq->core->core_task_seq && + rq->core->core_pick_seq != rq->core_sched_seq && + rq->core_pick) { + WRITE_ONCE(rq->core_sched_seq, rq->core->core_pick_seq); + + next = rq->core_pick; + if (next != prev) { + put_prev_task(rq, prev); + set_next_task(rq, next); + } + + rq->core_pick = NULL; + return next; + } + + put_prev_task_balance(rq, prev, rf); + + smt_mask = cpu_smt_mask(cpu); + + /* + * core->core_task_seq, core->core_pick_seq, rq->core_sched_seq + * + * @task_seq guards the task state ({en,de}queues) + * @pick_seq is the @task_seq we did a selection on + * @sched_seq is the @pick_seq we scheduled + * + * However, preemptions can cause multiple picks on the same task set. + * 'Fix' this by also increasing @task_seq for every pick. + */ + rq->core->core_task_seq++; + need_sync = !!rq->core->core_cookie; + + /* reset state */ + rq->core->core_cookie = 0UL; + for_each_cpu(i, smt_mask) { + struct rq *rq_i = cpu_rq(i); + + rq_i->core_pick = NULL; + + if (rq_i->core_forceidle) { + need_sync = true; + rq_i->core_forceidle = false; + } + + if (i != cpu) + update_rq_clock(rq_i); + } + + /* + * Try and select tasks for each sibling in decending sched_class + * order. + */ + for_each_class(class) { +again: + for_each_cpu_wrap(i, smt_mask, cpu) { + struct rq *rq_i = cpu_rq(i); + struct task_struct *p; + + if (rq_i->core_pick) + continue; + + /* + * If this sibling doesn't yet have a suitable task to + * run; ask for the most elegible task, given the + * highest priority task already selected for this + * core. + */ + p = pick_task(rq_i, class, max); + if (!p) { + /* + * If there weren't no cookies; we don't need to + * bother with the other siblings. + * If the rest of the core is not running a tagged + * task, i.e. need_sync == 0, and the current CPU + * which called into the schedule() loop does not + * have any tasks for this class, skip selecting for + * other siblings since there's no point. We don't skip + * for RT/DL because that could make CFS force-idle RT. + */ + if (i == cpu && !need_sync && class == &fair_sched_class) + goto next_class; + + continue; + } + + /* + * Optimize the 'normal' case where there aren't any + * cookies and we don't need to sync up. + */ + if (i == cpu && !need_sync && !p->core_cookie) { + next = p; + goto done; + } + + rq_i->core_pick = p; + + /* + * If this new candidate is of higher priority than the + * previous; and they're incompatible; we need to wipe + * the slate and start over. pick_task makes sure that + * p's priority is more than max if it doesn't match + * max's cookie. + * + * NOTE: this is a linear max-filter and is thus bounded + * in execution time. + */ + if (!max || !cookie_match(max, p)) { + struct task_struct *old_max = max; + + rq->core->core_cookie = p->core_cookie; + max = p; + + if (old_max) { + for_each_cpu(j, smt_mask) { + if (j == i) + continue; + + cpu_rq(j)->core_pick = NULL; + } + goto again; + } else { + /* + * Once we select a task for a cpu, we + * should not be doing an unconstrained + * pick because it might starve a task + * on a forced idle cpu. 
+ */ + need_sync = true; + } + + } + } +next_class:; + } + + rq->core->core_pick_seq = rq->core->core_task_seq; + next = rq->core_pick; + rq->core_sched_seq = rq->core->core_pick_seq; + + /* Something should have been selected for current CPU */ + WARN_ON_ONCE(!next); + + /* + * Reschedule siblings + * + * NOTE: L1TF -- at this point we're no longer running the old task and + * sending an IPI (below) ensures the sibling will no longer be running + * their task. This ensures there is no inter-sibling overlap between + * non-matching user state. + */ + for_each_cpu(i, smt_mask) { + struct rq *rq_i = cpu_rq(i); + + /* + * An online sibling might have gone offline before a task + * could be picked for it, or it might be offline but later + * happen to come online, but its too late and nothing was + * picked for it. That's Ok - it will pick tasks for itself, + * so ignore it. + */ + if (!rq_i->core_pick) + continue; + + if (is_task_rq_idle(rq_i->core_pick) && rq_i->nr_running) + rq_i->core_forceidle = true; + + if (i == cpu) { + rq_i->core_pick = NULL; + continue; + } + + /* Did we break L1TF mitigation requirements? */ + WARN_ON_ONCE(!cookie_match(next, rq_i->core_pick)); + + if (rq_i->curr == rq_i->core_pick) { + rq_i->core_pick = NULL; + continue; + } + + resched_curr(rq_i); + } + +done: + set_next_task(rq, next); + return next; +}
static inline void sched_core_cpu_starting(unsigned int cpu) { @@ -4789,6 +5077,12 @@ static inline void sched_core_cpu_starting(unsigned int cpu)
static inline void sched_core_cpu_starting(unsigned int cpu) {}
+static struct task_struct * +pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) +{ + return __pick_next_task(rq, prev, rf); +} + #endif /* CONFIG_SCHED_CORE */
/* @@ -7821,7 +8115,12 @@ void __init sched_init(void)
#ifdef CONFIG_SCHED_CORE rq->core = NULL; + rq->core_pick = NULL; rq->core_enabled = 0; + rq->core_tree = RB_ROOT; + rq->core_forceidle = false; + + rq->core_cookie = 0UL; #endif }
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index cb5a62b051cc..1bbe70210e5d 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1110,11 +1110,16 @@ struct rq { #ifdef CONFIG_SCHED_CORE /* per rq */ struct rq *core; + struct task_struct *core_pick; unsigned int core_enabled; + unsigned int core_sched_seq; struct rb_root core_tree; + unsigned char core_forceidle;
/* shared state */ unsigned int core_task_seq; + unsigned int core_pick_seq; + unsigned long core_cookie; #endif
KABI_RESERVE(1) @@ -2090,7 +2095,6 @@ static inline void put_prev_task(struct rq *rq, struct task_struct *prev)
static inline void set_next_task(struct rq *rq, struct task_struct *next) { - WARN_ON_ONCE(rq->curr != next); next->sched_class->set_next_task(rq, next, false); }
From: Vineeth Pillai viremana@linux.microsoft.com
mainline inclusion from mainline-v5.14-rc1 commit 8039e96fcc1de30d5bcaf05da9ca2de46a800826 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5OOWG CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------------------------------------------------
If there is only one long-running local task and the sibling is forced idle, the sibling might not get a chance to run until a schedule event happens on any CPU in the core.
So we check for this condition during a tick to see if a sibling is starved and then give it a chance to schedule.
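The heart of the tick check, in isolation (simplified sketch; the real helper takes a sched_entity and computes slice and rtime itself): with min_nr_tasks == 2 this reads as "resched once more than half of the slice has been used", i.e. behave as if the starved sibling's task were sharing this runqueue.

	static bool entity_slice_used(u64 slice, u64 rtime, int min_nr_tasks)
	{
		return rtime * min_nr_tasks > slice;
	}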
Signed-off-by: Vineeth Pillai viremana@linux.microsoft.com Signed-off-by: Peter Zijlstra (Intel) peterz@infradead.org Tested-by: Don Hiatt dhiatt@digitalocean.com Tested-by: Hongyu Ning hongyu.ning@linux.intel.com Tested-by: Vincent Guittot vincent.guittot@linaro.org Link: https://lkml.kernel.org/r/20210422123308.617407840@infradead.org Signed-off-by: Lin Shengwang linshengwang1@huawei.com Reviewed-by: lihua hucool.lihua@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- kernel/sched/core.c | 15 ++++++++------- kernel/sched/fair.c | 40 ++++++++++++++++++++++++++++++++++++++++ kernel/sched/sched.h | 2 +- 3 files changed, 49 insertions(+), 8 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a010117eb540..1f9e156fe6ab 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4894,16 +4894,15 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
/* reset state */ rq->core->core_cookie = 0UL; + if (rq->core->core_forceidle) { + need_sync = true; + rq->core->core_forceidle = false; + } for_each_cpu(i, smt_mask) { struct rq *rq_i = cpu_rq(i);
rq_i->core_pick = NULL;
- if (rq_i->core_forceidle) { - need_sync = true; - rq_i->core_forceidle = false; - } - if (i != cpu) update_rq_clock(rq_i); } @@ -5023,8 +5022,10 @@ next_class:; if (!rq_i->core_pick) continue;
- if (is_task_rq_idle(rq_i->core_pick) && rq_i->nr_running) - rq_i->core_forceidle = true; + if (is_task_rq_idle(rq_i->core_pick) && rq_i->nr_running && + !rq_i->core->core_forceidle) { + rq_i->core->core_forceidle = true; + }
if (i == cpu) { rq_i->core_pick = NULL; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 21ef241894e2..a2651e159c23 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -11533,6 +11533,44 @@ static void rq_offline_fair(struct rq *rq)
#endif /* CONFIG_SMP */
+#ifdef CONFIG_SCHED_CORE +static inline bool +__entity_slice_used(struct sched_entity *se, int min_nr_tasks) +{ + u64 slice = sched_slice(cfs_rq_of(se), se); + u64 rtime = se->sum_exec_runtime - se->prev_sum_exec_runtime; + + return (rtime * min_nr_tasks > slice); +} + +#define MIN_NR_TASKS_DURING_FORCEIDLE 2 +static inline void task_tick_core(struct rq *rq, struct task_struct *curr) +{ + if (!sched_core_enabled(rq)) + return; + + /* + * If runqueue has only one task which used up its slice and + * if the sibling is forced idle, then trigger schedule to + * give forced idle task a chance. + * + * sched_slice() considers only this active rq and it gets the + * whole slice. But during force idle, we have siblings acting + * like a single runqueue and hence we need to consider runnable + * tasks on this cpu and the forced idle cpu. Ideally, we should + * go through the forced idle rq, but that would be a perf hit. + * We can assume that the forced idle cpu has atleast + * MIN_NR_TASKS_DURING_FORCEIDLE - 1 tasks and use that to check + * if we need to give up the cpu. + */ + if (rq->core->core_forceidle && rq->cfs.nr_running == 1 && + __entity_slice_used(&curr->se, MIN_NR_TASKS_DURING_FORCEIDLE)) + resched_curr(rq); +} +#else +static inline void task_tick_core(struct rq *rq, struct task_struct *curr) {} +#endif + /* * scheduler tick hitting a task of our scheduling class. * @@ -11556,6 +11594,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
update_misfit_status(curr, rq); update_overutilized_status(task_rq(curr)); + + task_tick_core(rq, curr); }
/* diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 1bbe70210e5d..044aded8314c 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1114,12 +1114,12 @@ struct rq { unsigned int core_enabled; unsigned int core_sched_seq; struct rb_root core_tree; - unsigned char core_forceidle;
/* shared state */ unsigned int core_task_seq; unsigned int core_pick_seq; unsigned long core_cookie; + unsigned char core_forceidle; #endif
KABI_RESERVE(1)
From: "Joel Fernandes (Google)" joel@joelfernandes.org
mainline inclusion from mainline-v5.14-rc1 commit 7afbba119f0da09824d723f8081608ea1f74ff57 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5OOWG CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------------------------------------------------
The rationale is as follows. In the core-wide pick logic, even if need_sync == false, we need to go look at other CPUs (non-local CPUs) to see if they could be running RT.
Say the RQs in a particular core look like this:
Let CFS1 and CFS2 be two tagged CFS tasks. Let RT1 be an untagged RT task.
	rq0		rq1
	CFS1 (tagged)	RT1 (no tag)
	CFS2 (tagged)
Say schedule() runs on rq0. It will enter the selection loop, and pick_task(RT) will return NULL for 'p'. It will then enter the if() block, see that need_sync == false, and skip RT entirely.
The end result of the selection will be (say prio(CFS1) > prio(CFS2)):
	rq0		rq1
	CFS1		IDLE
When it should have selected:
	rq0		rq1
	IDLE		RT
Suggested-by: Peter Zijlstra (Intel) peterz@infradead.org Signed-off-by: Joel Fernandes (Google) joel@joelfernandes.org Signed-off-by: Peter Zijlstra (Intel) peterz@infradead.org Tested-by: Don Hiatt dhiatt@digitalocean.com Tested-by: Hongyu Ning hongyu.ning@linux.intel.com Tested-by: Vincent Guittot vincent.guittot@linaro.org Link: https://lkml.kernel.org/r/20210422123308.678425748@infradead.org Signed-off-by: Lin Shengwang linshengwang1@huawei.com Reviewed-by: lihua hucool.lihua@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- kernel/sched/core.c | 65 ++++++++++++++++++--------------------------- 1 file changed, 26 insertions(+), 39 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 1f9e156fe6ab..1240480adcab 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4878,6 +4878,15 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) put_prev_task_balance(rq, prev, rf);
smt_mask = cpu_smt_mask(cpu); + need_sync = !!rq->core->core_cookie; + + /* reset state */ + rq->core->core_cookie = 0UL; + if (rq->core->core_forceidle) { + need_sync = true; + fi_before = true; + rq->core->core_forceidle = false; + }
/* * core->core_task_seq, core->core_pick_seq, rq->core_sched_seq @@ -4890,14 +4899,25 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) * 'Fix' this by also increasing @task_seq for every pick. */ rq->core->core_task_seq++; - need_sync = !!rq->core->core_cookie;
- /* reset state */ - rq->core->core_cookie = 0UL; - if (rq->core->core_forceidle) { + /* + * Optimize for common case where this CPU has no cookies + * and there are no cookied tasks running on siblings. + */ + if (!need_sync) { + for_each_class(class) { + next = class->pick_task(rq); + if (next) + break; + } + + if (!next->core_cookie) { + rq->core_pick = NULL; + goto done; + } need_sync = true; - rq->core->core_forceidle = false; } + for_each_cpu(i, smt_mask) { struct rq *rq_i = cpu_rq(i);
@@ -4927,31 +4947,8 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) * core. */ p = pick_task(rq_i, class, max); - if (!p) { - /* - * If there weren't no cookies; we don't need to - * bother with the other siblings. - * If the rest of the core is not running a tagged - * task, i.e. need_sync == 0, and the current CPU - * which called into the schedule() loop does not - * have any tasks for this class, skip selecting for - * other siblings since there's no point. We don't skip - * for RT/DL because that could make CFS force-idle RT. - */ - if (i == cpu && !need_sync && class == &fair_sched_class) - goto next_class; - + if (!p) continue; - } - - /* - * Optimize the 'normal' case where there aren't any - * cookies and we don't need to sync up. - */ - if (i == cpu && !need_sync && !p->core_cookie) { - next = p; - goto done; - }
rq_i->core_pick = p;
@@ -4979,19 +4976,9 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) cpu_rq(j)->core_pick = NULL; } goto again; - } else { - /* - * Once we select a task for a cpu, we - * should not be doing an unconstrained - * pick because it might starve a task - * on a forced idle cpu. - */ - need_sync = true; } - } } -next_class:; }
rq->core->core_pick_seq = rq->core->core_task_seq;
From: "Joel Fernandes (Google)" joel@joelfernandes.org
mainline inclusion from mainline-v5.14-rc1 commit c6047c2e3af68dae23ad884249e0d42ff28d2d1b category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5OOWG CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------------------------------------------------
During force-idle, we end up doing cross-CPU comparison of vruntimes during pick_next_task. If we simply compare (vruntime-min_vruntime) across CPUs, and if the CPUs only have 1 task each, we will always end up comparing 0 with 0 and pick just one of the tasks all the time. This starves the task that was not picked. To fix this, take a snapshot of the min_vruntime when entering force idle and use it for comparison. This min_vruntime snapshot will only be used for cross-CPU vruntime comparison, and nothing else.
A note about the min_vruntime snapshot and force idling:
During selection:
When we're not fi, we need to update the snapshot. When we're fi and we were not fi, we must update the snapshot. When we're fi and we were already fi, we must not update the snapshot.
Which gives:
fib     fi      update
0       0       1
0       1       1
1       0       1
1       1       0
Where:
fi: force-idled now
fib: force-idled before
So the min_vruntime snapshot needs to be updated when: !(fib && fi).
Also, the cfs_prio_less() function needs to be aware of whether the core is in force idle or not, since it will use this information to know whether to advance a cfs_rq's min_vruntime_fi in the hierarchy. So pass this information along via pick_task() -> prio_less().
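As a reading aid for the diff below, the resulting comparison can be condensed into a small sketch (illustrative only; it reuses the field names this patch introduces, with the cgroup hierarchy walking omitted):

  /*
   * Cross-CPU fair-priority comparison: normalize each task's vruntime
   * against its cfs_rq's min_vruntime snapshot (min_vruntime_fi), which
   * is refreshed per the table above, i.e. whenever !(fib && fi).
   */
  static bool fair_prio_less_sketch(struct sched_entity *sea, struct cfs_rq *cfs_rqa,
                                    struct sched_entity *seb, struct cfs_rq *cfs_rqb)
  {
          s64 delta = (s64)(sea->vruntime - seb->vruntime) +
                      (s64)(cfs_rqb->min_vruntime_fi - cfs_rqa->min_vruntime_fi);

          /* positive delta: a has consumed more runtime, so a is lower priority */
          return delta > 0;
  }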
Suggested-by: Peter Zijlstra (Intel) peterz@infradead.org Signed-off-by: Joel Fernandes (Google) joel@joelfernandes.org Signed-off-by: Peter Zijlstra (Intel) peterz@infradead.org Tested-by: Don Hiatt dhiatt@digitalocean.com Tested-by: Hongyu Ning hongyu.ning@linux.intel.com Tested-by: Vincent Guittot vincent.guittot@linaro.org Link: https://lkml.kernel.org/r/20210422123308.738542617@infradead.org Signed-off-by: Lin Shengwang linshengwang1@huawei.com Reviewed-by: lihua hucool.lihua@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- kernel/sched/core.c | 59 +++++++++++++++++++--------------- kernel/sched/fair.c | 75 ++++++++++++++++++++++++++++++++++++++++++++ kernel/sched/sched.h | 8 +++++ 3 files changed, 117 insertions(+), 25 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 1240480adcab..78401da6e597 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -102,7 +102,7 @@ static inline int __task_prio(struct task_struct *p) */
/* real prio, less is less */ -static inline bool prio_less(struct task_struct *a, struct task_struct *b) +static inline bool prio_less(struct task_struct *a, struct task_struct *b, bool in_fi) {
int pa = __task_prio(a), pb = __task_prio(b); @@ -116,19 +116,8 @@ static inline bool prio_less(struct task_struct *a, struct task_struct *b) if (pa == -1) /* dl_prio() doesn't work because of stop_class above */ return !dl_time_before(a->dl.deadline, b->dl.deadline);
- if (pa == MAX_RT_PRIO + MAX_NICE) { /* fair */ - u64 vruntime = b->se.vruntime; - - /* - * Normalize the vruntime if tasks are in different cpus. - */ - if (task_cpu(a) != task_cpu(b)) { - vruntime -= task_cfs_rq(b)->min_vruntime; - vruntime += task_cfs_rq(a)->min_vruntime; - } - - return !((s64)(a->se.vruntime - vruntime) <= 0); - } + if (pa == MAX_RT_PRIO + MAX_NICE) /* fair */ + return cfs_prio_less(a, b, in_fi);
return false; } @@ -142,7 +131,7 @@ static inline bool __sched_core_less(struct task_struct *a, struct task_struct * return false;
/* flip prio, so high prio is leftmost */ - if (prio_less(b, a)) + if (prio_less(b, a, task_rq(a)->core->core_forceidle)) return true;
return false; @@ -4785,7 +4774,7 @@ static inline bool cookie_match(struct task_struct *a, struct task_struct *b) * - Else returns idle_task. */ static struct task_struct * -pick_task(struct rq *rq, const struct sched_class *class, struct task_struct *max) +pick_task(struct rq *rq, const struct sched_class *class, struct task_struct *max, bool in_fi) { struct task_struct *class_pick, *cookie_pick; unsigned long cookie = rq->core->core_cookie; @@ -4800,7 +4789,7 @@ pick_task(struct rq *rq, const struct sched_class *class, struct task_struct *ma * higher priority than max. */ if (max && class_pick->core_cookie && - prio_less(class_pick, max)) + prio_less(class_pick, max, in_fi)) return idle_sched_class.pick_task(rq);
return class_pick; @@ -4819,19 +4808,22 @@ pick_task(struct rq *rq, const struct sched_class *class, struct task_struct *ma * the core (so far) and it must be selected, otherwise we must go with * the cookie pick in order to satisfy the constraint. */ - if (prio_less(cookie_pick, class_pick) && - (!max || prio_less(max, class_pick))) + if (prio_less(cookie_pick, class_pick, in_fi) && + (!max || prio_less(max, class_pick, in_fi))) return class_pick;
return cookie_pick; }
+extern void task_vruntime_update(struct rq *rq, struct task_struct *p, bool in_fi); + static struct task_struct * pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { struct task_struct *next, *max = NULL; const struct sched_class *class; const struct cpumask *smt_mask; + bool fi_before = false; bool need_sync; int i, j, cpu;
@@ -4913,9 +4905,14 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
if (!next->core_cookie) { rq->core_pick = NULL; + /* + * For robustness, update the min_vruntime_fi for + * unconstrained picks as well. + */ + WARN_ON_ONCE(fi_before); + task_vruntime_update(rq, next, false); goto done; } - need_sync = true; }
for_each_cpu(i, smt_mask) { @@ -4946,11 +4943,16 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) * highest priority task already selected for this * core. */ - p = pick_task(rq_i, class, max); + p = pick_task(rq_i, class, max, fi_before); if (!p) continue;
rq_i->core_pick = p; + if (rq_i->idle == p && rq_i->nr_running) { + rq->core->core_forceidle = true; + if (!fi_before) + rq->core->core_forceidle_seq++; + }
/* * If this new candidate is of higher priority than the @@ -4969,6 +4971,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) max = p;
if (old_max) { + rq->core->core_forceidle = false; for_each_cpu(j, smt_mask) { if (j == i) continue; @@ -5009,10 +5012,16 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) if (!rq_i->core_pick) continue;
- if (is_task_rq_idle(rq_i->core_pick) && rq_i->nr_running && - !rq_i->core->core_forceidle) { - rq_i->core->core_forceidle = true; - } + /* + * Update for new !FI->FI transitions, or if continuing to be in !FI: + * fi_before fi update? + * 0 0 1 + * 0 1 1 + * 1 0 1 + * 1 1 0 + */ + if (!(fi_before && rq->core->core_forceidle)) + task_vruntime_update(rq_i, rq_i->core_pick, rq->core->core_forceidle);
if (i == cpu) { rq_i->core_pick = NULL; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a2651e159c23..3969b7644f97 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -11567,6 +11567,81 @@ static inline void task_tick_core(struct rq *rq, struct task_struct *curr) __entity_slice_used(&curr->se, MIN_NR_TASKS_DURING_FORCEIDLE)) resched_curr(rq); } + +/* + * se_fi_update - Update the cfs_rq->min_vruntime_fi in a CFS hierarchy if needed. + */ +static void se_fi_update(struct sched_entity *se, unsigned int fi_seq, bool forceidle) +{ + for_each_sched_entity(se) { + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + if (forceidle) { + if (cfs_rq->forceidle_seq == fi_seq) + break; + cfs_rq->forceidle_seq = fi_seq; + } + + cfs_rq->min_vruntime_fi = cfs_rq->min_vruntime; + } +} + +void task_vruntime_update(struct rq *rq, struct task_struct *p, bool in_fi) +{ + struct sched_entity *se = &p->se; + + if (p->sched_class != &fair_sched_class) + return; + + se_fi_update(se, rq->core->core_forceidle_seq, in_fi); +} + +bool cfs_prio_less(struct task_struct *a, struct task_struct *b, bool in_fi) +{ + struct rq *rq = task_rq(a); + struct sched_entity *sea = &a->se; + struct sched_entity *seb = &b->se; + struct cfs_rq *cfs_rqa; + struct cfs_rq *cfs_rqb; + s64 delta; + + SCHED_WARN_ON(task_rq(b)->core != rq->core); + +#ifdef CONFIG_FAIR_GROUP_SCHED + /* + * Find an se in the hierarchy for tasks a and b, such that the se's + * are immediate siblings. + */ + while (sea->cfs_rq->tg != seb->cfs_rq->tg) { + int sea_depth = sea->depth; + int seb_depth = seb->depth; + + if (sea_depth >= seb_depth) + sea = parent_entity(sea); + if (sea_depth <= seb_depth) + seb = parent_entity(seb); + } + + se_fi_update(sea, rq->core->core_forceidle_seq, in_fi); + se_fi_update(seb, rq->core->core_forceidle_seq, in_fi); + + cfs_rqa = sea->cfs_rq; + cfs_rqb = seb->cfs_rq; +#else + cfs_rqa = &task_rq(a)->cfs; + cfs_rqb = &task_rq(b)->cfs; +#endif + + /* + * Find delta after normalizing se's vruntime with its cfs_rq's + * min_vruntime_fi, which would have been updated in prior calls + * to se_fi_update(). + */ + delta = (s64)(sea->vruntime - seb->vruntime) + + (s64)(cfs_rqb->min_vruntime_fi - cfs_rqa->min_vruntime_fi); + + return delta > 0; +} #else static inline void task_tick_core(struct rq *rq, struct task_struct *curr) {} #endif diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 044aded8314c..ff709a805815 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -553,6 +553,11 @@ struct cfs_rq {
u64 exec_clock; u64 min_vruntime; +#ifdef CONFIG_SCHED_CORE + unsigned int forceidle_seq; + u64 min_vruntime_fi; +#endif + #ifndef CONFIG_64BIT u64 min_vruntime_copy; #endif @@ -1120,6 +1125,7 @@ struct rq { unsigned int core_pick_seq; unsigned long core_cookie; unsigned char core_forceidle; + unsigned int core_forceidle_seq; #endif
KABI_RESERVE(1) @@ -1200,6 +1206,8 @@ static inline raw_spinlock_t *__rq_lockp(struct rq *rq) return &rq->__lock; }
+bool cfs_prio_less(struct task_struct *a, struct task_struct *b, bool fi); + #else /* !CONFIG_SCHED_CORE */
static inline bool sched_core_enabled(struct rq *rq)
From: Peter Zijlstra peterz@infradead.org
mainline inclusion from mainline-v5.14-rc1 commit d2dfa17bc7de67e99685c4d6557837bf801a102c category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5OOWG CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------------------------------------------------
When a sibling is forced-idle to match the core-cookie, search for matching tasks to fill the core.
rcu_read_unlock() can incur an infrequent deadlock in sched_core_balance(). Fix this by using the RCU-sched flavor instead.
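In short, the new balance path queues a callback when a CPU picks its idle task; the callback walks the sched domains and tries to pull a task carrying the core's cookie from another CPU. The eligibility checks in try_steal_cookie() below boil down to roughly this (an illustrative condensation, not additional kernel code):

  static bool can_steal_sketch(struct rq *dst, struct rq *src,
                               struct task_struct *p, int dst_cpu)
  {
          if (p->core_cookie != dst->core->core_cookie)
                  return false;   /* must match the cookie the destination core runs */
          if (p == src->core_pick || p == src->curr)
                  return false;   /* src is (about to be) running it */
          if (!cpumask_test_cpu(dst_cpu, &p->cpus_mask))
                  return false;   /* affinity must allow the destination */
          if (p->core_occupation > dst->idle->core_occupation)
                  return false;   /* don't crowd the destination core further */
          return true;
  }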
Signed-off-by: Peter Zijlstra (Intel) peterz@infradead.org Tested-by: Don Hiatt dhiatt@digitalocean.com Tested-by: Hongyu Ning hongyu.ning@linux.intel.com Tested-by: Vincent Guittot vincent.guittot@linaro.org Link: https://lkml.kernel.org/r/20210422123308.800048269@infradead.org Signed-off-by: Lin Shengwang linshengwang1@huawei.com Reviewed-by: lihua hucool.lihua@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/sched.h | 1 + kernel/sched/core.c | 130 +++++++++++++++++++++++++++++++++++++++++- kernel/sched/idle.c | 1 + kernel/sched/sched.h | 6 ++ 4 files changed, 137 insertions(+), 1 deletion(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h index d2a399e0cd9b..dd1feff73c7d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -734,6 +734,7 @@ struct task_struct { #ifdef CONFIG_SCHED_CORE struct rb_node core_node; unsigned long core_cookie; + unsigned int core_occupation; #endif
#ifdef CONFIG_CGROUP_SCHED diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 78401da6e597..d4598582240b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -195,6 +195,21 @@ static struct task_struct *sched_core_find(struct rq *rq, unsigned long cookie) return __node_2_sc(node); }
+static struct task_struct *sched_core_next(struct task_struct *p, unsigned long cookie) +{ + struct rb_node *node = &p->core_node; + + node = rb_next(node); + if (!node) + return NULL; + + p = container_of(node, struct task_struct, core_node); + if (p->core_cookie != cookie) + return NULL; + + return p; +} + /* * Magic required such that: * @@ -4824,8 +4839,8 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) const struct sched_class *class; const struct cpumask *smt_mask; bool fi_before = false; + int i, j, cpu, occ = 0; bool need_sync; - int i, j, cpu;
if (!sched_core_enabled(rq)) return __pick_next_task(rq, prev, rf); @@ -4947,6 +4962,9 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) if (!p) continue;
+ if (!is_task_rq_idle(p)) + occ++; + rq_i->core_pick = p; if (rq_i->idle == p && rq_i->nr_running) { rq->core->core_forceidle = true; @@ -4978,6 +4996,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
cpu_rq(j)->core_pick = NULL; } + occ = 1; goto again; } } @@ -5023,6 +5042,8 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) if (!(fi_before && rq->core->core_forceidle)) task_vruntime_update(rq_i, rq_i->core_pick, rq->core->core_forceidle);
+ rq_i->core_pick->core_occupation = occ; + if (i == cpu) { rq_i->core_pick = NULL; continue; @@ -5044,6 +5065,113 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) return next; }
+static bool try_steal_cookie(int this, int that) +{ + struct rq *dst = cpu_rq(this), *src = cpu_rq(that); + struct task_struct *p; + unsigned long cookie; + bool success = false; + + local_irq_disable(); + double_rq_lock(dst, src); + + cookie = dst->core->core_cookie; + if (!cookie) + goto unlock; + + if (dst->curr != dst->idle) + goto unlock; + + p = sched_core_find(src, cookie); + if (p == src->idle) + goto unlock; + + do { + if (p == src->core_pick || p == src->curr) + goto next; + + if (!cpumask_test_cpu(this, &p->cpus_mask)) + goto next; + + if (p->core_occupation > dst->idle->core_occupation) + goto next; + + p->on_rq = TASK_ON_RQ_MIGRATING; + deactivate_task(src, p, 0); + set_task_cpu(p, this); + activate_task(dst, p, 0); + p->on_rq = TASK_ON_RQ_QUEUED; + + resched_curr(dst); + + success = true; + break; + +next: + p = sched_core_next(p, cookie); + } while (p); + +unlock: + double_rq_unlock(dst, src); + local_irq_enable(); + + return success; +} + +static bool steal_cookie_task(int cpu, struct sched_domain *sd) +{ + int i; + + for_each_cpu_wrap(i, sched_domain_span(sd), cpu) { + if (i == cpu) + continue; + + if (need_resched()) + break; + + if (try_steal_cookie(cpu, i)) + return true; + } + + return false; +} + +static void sched_core_balance(struct rq *rq) +{ + struct sched_domain *sd; + int cpu = cpu_of(rq); + + preempt_disable(); + rcu_read_lock(); + raw_spin_rq_unlock_irq(rq); + for_each_domain(cpu, sd) { + if (need_resched()) + break; + + if (steal_cookie_task(cpu, sd)) + break; + } + raw_spin_rq_lock_irq(rq); + rcu_read_unlock(); + preempt_enable(); +} + +static DEFINE_PER_CPU(struct callback_head, core_balance_head); + +void queue_core_balance(struct rq *rq) +{ + if (!sched_core_enabled(rq)) + return; + + if (!rq->core->core_cookie) + return; + + if (!rq->nr_running) /* not forced idle */ + return; + + queue_balance_callback(rq, &per_cpu(core_balance_head, rq->cpu), sched_core_balance); +} + static inline void sched_core_cpu_starting(unsigned int cpu) { const struct cpumask *smt_mask = cpu_smt_mask(cpu); diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 3c6396d61a04..58bcb9517dfe 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -430,6 +430,7 @@ static void set_next_task_idle(struct rq *rq, struct task_struct *next, bool fir { update_idle_core(rq); schedstat_inc(rq->sched_goidle); + queue_core_balance(rq); }
#ifdef CONFIG_SMP diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index ff709a805815..4fe48700e926 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1208,6 +1208,8 @@ static inline raw_spinlock_t *__rq_lockp(struct rq *rq)
bool cfs_prio_less(struct task_struct *a, struct task_struct *b, bool fi);
+extern void queue_core_balance(struct rq *rq); + #else /* !CONFIG_SCHED_CORE */
static inline bool sched_core_enabled(struct rq *rq) @@ -1230,6 +1232,10 @@ static inline raw_spinlock_t *__rq_lockp(struct rq *rq) return &rq->__lock; }
+static inline void queue_core_balance(struct rq *rq) +{ +} + #endif /* CONFIG_SCHED_CORE */
static inline void lockdep_assert_rq_held(struct rq *rq)
From: Aubrey Li aubrey.li@linux.intel.com
mainline inclusion from mainline-v5.14-rc1 commit 97886d9dcd86820bdbc1fa73b455982809cbc8c2 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5OOWG CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------------------------------------------------
- Don't migrate if there is a cookie mismatch
  Load balance tries to move a task from the busiest CPU to the destination CPU. When core scheduling is enabled, if the task's cookie does not match the destination CPU's core cookie, this task may be skipped by this CPU. This mitigates the forced idle time on the destination CPU.

- Select cookie matched idle CPU
  In the fast path of task wakeup, select the first cookie-matched idle CPU instead of the first idle CPU.

- Find cookie matched idlest CPU
  In the slow path of task wakeup, find the idlest CPU whose core cookie matches the task's cookie.
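All three changes lean on the sched_cpu_cookie_match()/sched_core_cookie_match() helpers added to kernel/sched/sched.h below. The per-CPU rule used by __select_idle_cpu(), restated as a minimal sketch (illustrative only):

  /*
   * A wakeup candidate CPU is usable for @p only if it is idle (or runs
   * only SCHED_IDLE tasks) and its core's cookie matches @p's cookie;
   * the idle-core special case lives in sched_core_cookie_match().
   */
  static inline bool idle_cookie_cpu_sketch(struct task_struct *p, int cpu)
  {
          return (available_idle_cpu(cpu) || sched_idle_cpu(cpu)) &&
                 sched_cpu_cookie_match(cpu_rq(cpu), p);
  }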
Signed-off-by: Aubrey Li aubrey.li@linux.intel.com Signed-off-by: Peter Zijlstra (Intel) peterz@infradead.org Tested-by: Don Hiatt dhiatt@digitalocean.com Tested-by: Hongyu Ning hongyu.ning@linux.intel.com Tested-by: Vincent Guittot vincent.guittot@linaro.org Link: https://lkml.kernel.org/r/20210422123308.860083871@infradead.org Signed-off-by: Lin Shengwang linshengwang1@huawei.com Reviewed-by: lihua hucool.lihua@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- kernel/sched/fair.c | 29 ++++++++++++++---- kernel/sched/sched.h | 73 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 96 insertions(+), 6 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3969b7644f97..6324e991f8f4 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6022,11 +6022,15 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this
/* Traverse only the allowed CPUs */ for_each_cpu_and(i, sched_group_span(group), p->cpus_ptr) { + struct rq *rq = cpu_rq(i); + + if (!sched_core_cookie_match(rq, p)) + continue; + if (sched_idle_cpu(i)) return i;
if (available_idle_cpu(i)) { - struct rq *rq = cpu_rq(i); struct cpuidle_state *idle = idle_get_state(rq); if (idle && idle->exit_latency < min_exit_latency) { /* @@ -6112,9 +6116,10 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p return new_cpu; }
-static inline int __select_idle_cpu(int cpu) +static inline int __select_idle_cpu(int cpu, struct task_struct *p) { - if (available_idle_cpu(cpu) || sched_idle_cpu(cpu)) + if ((available_idle_cpu(cpu) || sched_idle_cpu(cpu)) && + sched_cpu_cookie_match(cpu_rq(cpu), p)) return cpu;
return -1; @@ -6184,7 +6189,7 @@ static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpu int cpu;
if (!static_branch_likely(&sched_smt_present)) - return __select_idle_cpu(core); + return __select_idle_cpu(core, p);
for_each_cpu(cpu, cpu_smt_mask(core)) { if (!available_idle_cpu(cpu)) { @@ -6222,7 +6227,7 @@ static inline bool test_idle_cores(int cpu, bool def)
static inline int select_idle_core(struct task_struct *p, int core, struct cpumask *cpus, int *idle_cpu) { - return __select_idle_cpu(core); + return __select_idle_cpu(core, p); }
#endif /* CONFIG_SCHED_SMT */ @@ -6275,7 +6280,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t } else { if (!--nr) return -1; - idle_cpu = __select_idle_cpu(cpu); + idle_cpu = __select_idle_cpu(cpu, p); if ((unsigned int)idle_cpu < nr_cpumask_bits) break; } @@ -8102,6 +8107,14 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
if (sysctl_sched_migration_cost == -1) return 1; + + /* + * Don't migrate task if the task's cookie does not match + * with the destination CPU's core cookie. + */ + if (!sched_core_cookie_match(cpu_rq(env->dst_cpu), p)) + return 1; + if (sysctl_sched_migration_cost == 0) return 0;
@@ -9486,6 +9499,10 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) p->cpus_ptr)) continue;
+ /* Skip over this group if no cookie matched */ + if (!sched_group_cookie_match(cpu_rq(this_cpu), p, group)) + continue; + local_group = cpumask_test_cpu(this_cpu, sched_group_span(group));
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 4fe48700e926..05bcd51b25da 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1172,7 +1172,9 @@ enum task_qos_level { void init_qos_hrtimer(int cpu); #endif
+struct sched_group; #ifdef CONFIG_SCHED_CORE +static inline struct cpumask *sched_group_span(struct sched_group *sg);
DECLARE_STATIC_KEY_FALSE(__sched_core_enabled);
@@ -1208,6 +1210,61 @@ static inline raw_spinlock_t *__rq_lockp(struct rq *rq)
bool cfs_prio_less(struct task_struct *a, struct task_struct *b, bool fi);
+/* + * Helpers to check if the CPU's core cookie matches with the task's cookie + * when core scheduling is enabled. + * A special case is that the task's cookie always matches with CPU's core + * cookie if the CPU is in an idle core. + */ +static inline bool sched_cpu_cookie_match(struct rq *rq, struct task_struct *p) +{ + /* Ignore cookie match if core scheduler is not enabled on the CPU. */ + if (!sched_core_enabled(rq)) + return true; + + return rq->core->core_cookie == p->core_cookie; +} + +static inline bool sched_core_cookie_match(struct rq *rq, struct task_struct *p) +{ + bool idle_core = true; + int cpu; + + /* Ignore cookie match if core scheduler is not enabled on the CPU. */ + if (!sched_core_enabled(rq)) + return true; + + for_each_cpu(cpu, cpu_smt_mask(cpu_of(rq))) { + if (!available_idle_cpu(cpu)) { + idle_core = false; + break; + } + } + + /* + * A CPU in an idle core is always the best choice for tasks with + * cookies. + */ + return idle_core || rq->core->core_cookie == p->core_cookie; +} + +static inline bool sched_group_cookie_match(struct rq *rq, + struct task_struct *p, + struct sched_group *group) +{ + int cpu; + + /* Ignore cookie match if core scheduler is not enabled on the CPU. */ + if (!sched_core_enabled(rq)) + return true; + + for_each_cpu_and(cpu, sched_group_span(group), p->cpus_ptr) { + if (sched_core_cookie_match(rq, p)) + return true; + } + return false; +} + extern void queue_core_balance(struct rq *rq);
#else /* !CONFIG_SCHED_CORE */ @@ -1236,6 +1293,22 @@ static inline void queue_core_balance(struct rq *rq) { }
+static inline bool sched_cpu_cookie_match(struct rq *rq, struct task_struct *p) +{ + return true; +} + +static inline bool sched_core_cookie_match(struct rq *rq, struct task_struct *p) +{ + return true; +} + +static inline bool sched_group_cookie_match(struct rq *rq, + struct task_struct *p, + struct sched_group *group) +{ + return true; +} #endif /* CONFIG_SCHED_CORE */
static inline void lockdep_assert_rq_held(struct rq *rq)
From: Peter Zijlstra peterz@infradead.org
mainline inclusion from mainline-v5.14-rc1 commit 6e33cad0af49336952e5541464bd02f5b5fd433e category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5OOWG CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------------------------------------------------
In order to not have to use pid_struct, create a new, smaller, structure to manage task cookies for core scheduling.
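Boiled down, the structure is just a refcount whose kernel address doubles as the cookie value (an illustrative reduction of the kernel/sched/core_sched.c code added below, which additionally pairs allocation with sched_core_get()/sched_core_put()):

  struct sched_core_cookie {
          refcount_t refcnt;
  };

  static unsigned long cookie_alloc_sketch(void)
  {
          struct sched_core_cookie *ck = kmalloc(sizeof(*ck), GFP_KERNEL);

          if (!ck)
                  return 0;
          refcount_set(&ck->refcnt, 1);
          return (unsigned long)ck;       /* the address is the cookie */
  }

  static void cookie_put_sketch(unsigned long cookie)
  {
          struct sched_core_cookie *ck = (void *)cookie;

          if (ck && refcount_dec_and_test(&ck->refcnt))
                  kfree(ck);
  }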
Signed-off-by: Peter Zijlstra (Intel) peterz@infradead.org Tested-by: Don Hiatt dhiatt@digitalocean.com Tested-by: Hongyu Ning hongyu.ning@linux.intel.com Tested-by: Vincent Guittot vincent.guittot@linaro.org Link: https://lkml.kernel.org/r/20210422123308.919768100@infradead.org Signed-off-by: Lin Shengwang linshengwang1@huawei.com Reviewed-by: lihua hucool.lihua@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/sched.h | 6 +++ kernel/fork.c | 1 + kernel/sched/Makefile | 1 + kernel/sched/core.c | 7 +-- kernel/sched/core_sched.c | 109 ++++++++++++++++++++++++++++++++++++++ kernel/sched/sched.h | 16 ++++++ 6 files changed, 137 insertions(+), 3 deletions(-) create mode 100644 kernel/sched/core_sched.c
diff --git a/include/linux/sched.h b/include/linux/sched.h index dd1feff73c7d..6f8e0cb8b55b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2173,6 +2173,12 @@ int sched_trace_rq_nr_running(struct rq *rq);
const struct cpumask *sched_trace_rd_span(struct root_domain *rd);
+#ifdef CONFIG_SCHED_CORE +extern void sched_core_free(struct task_struct *tsk); +#else +static inline void sched_core_free(struct task_struct *tsk) { } +#endif + #ifdef CONFIG_QOS_SCHED void sched_move_offline_task(struct task_struct *p); void sched_qos_offline_wait(void); diff --git a/kernel/fork.c b/kernel/fork.c index 0fb86b65ae60..628c2b3fafed 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -739,6 +739,7 @@ void __put_task_struct(struct task_struct *tsk) exit_creds(tsk); delayacct_tsk_free(tsk); put_signal_struct(tsk->signal); + sched_core_free(tsk);
if (!profile_handoff_task(tsk)) free_task(tsk); diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 5fc9c9b70862..978fcfca5871 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -36,3 +36,4 @@ obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o obj-$(CONFIG_MEMBARRIER) += membarrier.o obj-$(CONFIG_CPU_ISOLATION) += isolation.o obj-$(CONFIG_PSI) += psi.o +obj-$(CONFIG_SCHED_CORE) += core_sched.o diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d4598582240b..a485c8729f7c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -158,7 +158,7 @@ static inline int rb_sched_core_cmp(const void *key, const struct rb_node *node) return 0; }
-static void sched_core_enqueue(struct rq *rq, struct task_struct *p) +void sched_core_enqueue(struct rq *rq, struct task_struct *p) { rq->core->core_task_seq++;
@@ -168,14 +168,15 @@ static void sched_core_enqueue(struct rq *rq, struct task_struct *p) rb_add(&p->core_node, &rq->core_tree, rb_sched_core_less); }
-static void sched_core_dequeue(struct rq *rq, struct task_struct *p) +void sched_core_dequeue(struct rq *rq, struct task_struct *p) { rq->core->core_task_seq++;
- if (!p->core_cookie) + if (!sched_core_enqueued(p)) return;
rb_erase(&p->core_node, &rq->core_tree); + RB_CLEAR_NODE(&p->core_node); }
/* diff --git a/kernel/sched/core_sched.c b/kernel/sched/core_sched.c new file mode 100644 index 000000000000..8d0869a9eb8c --- /dev/null +++ b/kernel/sched/core_sched.c @@ -0,0 +1,109 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include "sched.h" + +/* + * A simple wrapper around refcount. An allocated sched_core_cookie's + * address is used to compute the cookie of the task. + */ +struct sched_core_cookie { + refcount_t refcnt; +}; + +unsigned long sched_core_alloc_cookie(void) +{ + struct sched_core_cookie *ck = kmalloc(sizeof(*ck), GFP_KERNEL); + if (!ck) + return 0; + + refcount_set(&ck->refcnt, 1); + sched_core_get(); + + return (unsigned long)ck; +} + +void sched_core_put_cookie(unsigned long cookie) +{ + struct sched_core_cookie *ptr = (void *)cookie; + + if (ptr && refcount_dec_and_test(&ptr->refcnt)) { + kfree(ptr); + sched_core_put(); + } +} + +unsigned long sched_core_get_cookie(unsigned long cookie) +{ + struct sched_core_cookie *ptr = (void *)cookie; + + if (ptr) + refcount_inc(&ptr->refcnt); + + return cookie; +} + +/* + * sched_core_update_cookie - replace the cookie on a task + * @p: the task to update + * @cookie: the new cookie + * + * Effectively exchange the task cookie; caller is responsible for lifetimes on + * both ends. + * + * Returns: the old cookie + */ +unsigned long sched_core_update_cookie(struct task_struct *p, unsigned long cookie) +{ + unsigned long old_cookie; + struct rq_flags rf; + struct rq *rq; + bool enqueued; + + rq = task_rq_lock(p, &rf); + + /* + * Since creating a cookie implies sched_core_get(), and we cannot set + * a cookie until after we've created it, similarly, we cannot destroy + * a cookie until after we've removed it, we must have core scheduling + * enabled here. + */ + SCHED_WARN_ON((p->core_cookie || cookie) && !sched_core_enabled(rq)); + + enqueued = sched_core_enqueued(p); + if (enqueued) + sched_core_dequeue(rq, p); + + old_cookie = p->core_cookie; + p->core_cookie = cookie; + + if (enqueued) + sched_core_enqueue(rq, p); + + /* + * If task is currently running, it may not be compatible anymore after + * the cookie change, so enter the scheduler on its CPU to schedule it + * away. + */ + if (task_running(rq, p)) + resched_curr(rq); + + task_rq_unlock(rq, p, &rf); + + return old_cookie; +} + +static unsigned long sched_core_clone_cookie(struct task_struct *p) +{ + unsigned long cookie, flags; + + raw_spin_lock_irqsave(&p->pi_lock, flags); + cookie = sched_core_get_cookie(p->core_cookie); + raw_spin_unlock_irqrestore(&p->pi_lock, flags); + + return cookie; +} + +void sched_core_free(struct task_struct *p) +{ + sched_core_put_cookie(p->core_cookie); +} diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 05bcd51b25da..91f8795f0693 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1267,6 +1267,22 @@ static inline bool sched_group_cookie_match(struct rq *rq,
extern void queue_core_balance(struct rq *rq);
+static inline bool sched_core_enqueued(struct task_struct *p) +{ + return !RB_EMPTY_NODE(&p->core_node); +} + +extern void sched_core_enqueue(struct rq *rq, struct task_struct *p); +extern void sched_core_dequeue(struct rq *rq, struct task_struct *p); + +extern void sched_core_get(void); +extern void sched_core_put(void); + +extern unsigned long sched_core_alloc_cookie(void); +extern void sched_core_put_cookie(unsigned long cookie); +extern unsigned long sched_core_get_cookie(unsigned long cookie); +extern unsigned long sched_core_update_cookie(struct task_struct *p, unsigned long cookie); + #else /* !CONFIG_SCHED_CORE */
static inline bool sched_core_enabled(struct rq *rq)
From: Peter Zijlstra peterz@infradead.org
mainline inclusion from mainline-v5.14-rc1 commit 85dd3f61203c5cfa72b308ff327b5fbf3fc1ce5e category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5OOWG CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------------------------------------------------
Note that sched_core_fork() is called from under tasklist_lock, and not from sched_fork() earlier. This avoids a few races later.
Signed-off-by: Peter Zijlstra (Intel) peterz@infradead.org Tested-by: Don Hiatt dhiatt@digitalocean.com Tested-by: Hongyu Ning hongyu.ning@linux.intel.com Tested-by: Vincent Guittot vincent.guittot@linaro.org Link: https://lkml.kernel.org/r/20210422123308.980003687@infradead.org Signed-off-by: Lin Shengwang linshengwang1@huawei.com Reviewed-by: lihua hucool.lihua@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/sched.h | 2 ++ kernel/fork.c | 3 +++ kernel/sched/core_sched.c | 6 ++++++ 3 files changed, 11 insertions(+)
diff --git a/include/linux/sched.h b/include/linux/sched.h index 6f8e0cb8b55b..a133a67a1b86 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2175,8 +2175,10 @@ const struct cpumask *sched_trace_rd_span(struct root_domain *rd);
#ifdef CONFIG_SCHED_CORE extern void sched_core_free(struct task_struct *tsk); +extern void sched_core_fork(struct task_struct *p); #else static inline void sched_core_free(struct task_struct *tsk) { } +static inline void sched_core_fork(struct task_struct *p) { } #endif
#ifdef CONFIG_QOS_SCHED diff --git a/kernel/fork.c b/kernel/fork.c index 628c2b3fafed..8a2e827815b6 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2278,6 +2278,8 @@ static __latent_entropy struct task_struct *copy_process(
klp_copy_process(p);
+ sched_core_fork(p); + spin_lock(¤t->sighand->siglock);
/* @@ -2364,6 +2366,7 @@ static __latent_entropy struct task_struct *copy_process( return p;
bad_fork_cancel_cgroup: + sched_core_free(p); spin_unlock(¤t->sighand->siglock); write_unlock_irq(&tasklist_lock); cgroup_cancel_fork(p, args); diff --git a/kernel/sched/core_sched.c b/kernel/sched/core_sched.c index 8d0869a9eb8c..dcbbeaefaaa3 100644 --- a/kernel/sched/core_sched.c +++ b/kernel/sched/core_sched.c @@ -103,6 +103,12 @@ static unsigned long sched_core_clone_cookie(struct task_struct *p) return cookie; }
+void sched_core_fork(struct task_struct *p) +{ + RB_CLEAR_NODE(&p->core_node); + p->core_cookie = sched_core_clone_cookie(current); +} + void sched_core_free(struct task_struct *p) { sched_core_put_cookie(p->core_cookie);
From: Chris Hyser chris.hyser@oracle.com
mainline inclusion from mainline-v5.14-rc1 commit 7ac592aa35a684ff1858fb9ec282886b9e3575ac category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5OOWG CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------------------------------------------------
This patch provides support for setting and copying core scheduling 'task cookies' between threads (PID), processes (TGID), and process groups (PGID).
The value of core scheduling isn't that tasks don't share a core; 'nosmt' can do that. The value lies in exploiting all the sharing opportunities that exist to recover possible lost performance, and that requires a degree of flexibility in the API.
From a security perspective (and there are others), the thread, process and process group distinction is an existing hierarchical categorization of tasks that reflects many of the security concerns about 'data sharing'. For example, protecting against cache-snooping by a thread that can just read the memory directly isn't all that useful.
With this in mind, subcommands to CREATE/SHARE (TO/FROM) provide a mechanism to create and share cookies. CREATE/SHARE_TO specify a target pid with enum pidtype used to specify the scope of the targeted tasks. For example, PIDTYPE_TGID will share the cookie with the process and all of its threads, as typically desired in a security scenario.
API:
prctl(PR_SCHED_CORE, PR_SCHED_CORE_GET,        tgtpid, pidtype, &cookie)
prctl(PR_SCHED_CORE, PR_SCHED_CORE_CREATE,     tgtpid, pidtype, NULL)
prctl(PR_SCHED_CORE, PR_SCHED_CORE_SHARE_TO,   tgtpid, pidtype, NULL)
prctl(PR_SCHED_CORE, PR_SCHED_CORE_SHARE_FROM, srcpid, pidtype, NULL)
where 'tgtpid/srcpid == 0' implies the current process and pidtype is kernel enum pid_type {PIDTYPE_PID, PIDTYPE_TGID, PIDTYPE_PGID, ...}.
For return values, EINVAL and ENOMEM are what they say. ESRCH means the tgtpid/srcpid was not found. EPERM indicates lack of PTRACE permission access to tgtpid/srcpid. ENODEV indicates your machine lacks SMT.
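For illustration only (not part of this patch set; error handling and the SMT/ENODEV case are omitted), a process could create a cookie for its whole thread group and let forked children inherit it. The PIDTYPE_* values below are the kernel's enum pid_type values (PIDTYPE_PID = 0, PIDTYPE_TGID = 1), as also used by the selftest added later in this series:

  #include <sys/prctl.h>
  #include <unistd.h>

  #ifndef PR_SCHED_CORE
  #define PR_SCHED_CORE            62
  #define PR_SCHED_CORE_CREATE      1
  #define PR_SCHED_CORE_SHARE_TO    2
  #endif

  #define PIDTYPE_PID   0
  #define PIDTYPE_TGID  1

  int main(void)
  {
          /* Give the calling process and all of its threads a new unique cookie. */
          prctl(PR_SCHED_CORE, PR_SCHED_CORE_CREATE, 0, PIDTYPE_TGID, 0);

          /* Children forked from now on inherit the cookie across fork() ... */
          if (fork() == 0) {
                  execlp("true", "true", (char *)NULL);
                  _exit(1);
          }

          /* ... or push it explicitly to an existing task:
           * prctl(PR_SCHED_CORE, PR_SCHED_CORE_SHARE_TO, some_pid, PIDTYPE_PID, 0);
           */
          return 0;
  }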
[peterz: complete rewrite] Signed-off-by: Chris Hyser chris.hyser@oracle.com Signed-off-by: Peter Zijlstra (Intel) peterz@infradead.org Tested-by: Don Hiatt dhiatt@digitalocean.com Tested-by: Hongyu Ning hongyu.ning@linux.intel.com Tested-by: Vincent Guittot vincent.guittot@linaro.org Link: https://lkml.kernel.org/r/20210422123309.039845339@infradead.org Signed-off-by: Lin Shengwang linshengwang1@huawei.com Reviewed-by: lihua hucool.lihua@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/sched.h | 2 + include/uapi/linux/prctl.h | 8 +++ kernel/sched/core_sched.c | 114 +++++++++++++++++++++++++++++++ kernel/sys.c | 5 ++ tools/include/uapi/linux/prctl.h | 8 +++ 5 files changed, 137 insertions(+)
diff --git a/include/linux/sched.h b/include/linux/sched.h index a133a67a1b86..ea82fb089cd1 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2176,6 +2176,8 @@ const struct cpumask *sched_trace_rd_span(struct root_domain *rd); #ifdef CONFIG_SCHED_CORE extern void sched_core_free(struct task_struct *tsk); extern void sched_core_fork(struct task_struct *p); +extern int sched_core_share_pid(unsigned int cmd, pid_t pid, enum pid_type type, + unsigned long uaddr); #else static inline void sched_core_free(struct task_struct *tsk) { } static inline void sched_core_fork(struct task_struct *p) { } diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h index 7f0827705c9a..892da2795b59 100644 --- a/include/uapi/linux/prctl.h +++ b/include/uapi/linux/prctl.h @@ -247,4 +247,12 @@ struct prctl_mm_map { #define PR_SET_IO_FLUSHER 57 #define PR_GET_IO_FLUSHER 58
+/* Request the scheduler to share a core */ +#define PR_SCHED_CORE 62 +# define PR_SCHED_CORE_GET 0 +# define PR_SCHED_CORE_CREATE 1 /* create unique core_sched cookie */ +# define PR_SCHED_CORE_SHARE_TO 2 /* push core_sched cookie to pid */ +# define PR_SCHED_CORE_SHARE_FROM 3 /* pull core_sched cookie to pid */ +# define PR_SCHED_CORE_MAX 4 + #endif /* _LINUX_PRCTL_H */ diff --git a/kernel/sched/core_sched.c b/kernel/sched/core_sched.c index dcbbeaefaaa3..9a80e9a474c0 100644 --- a/kernel/sched/core_sched.c +++ b/kernel/sched/core_sched.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only
+#include <linux/prctl.h> #include "sched.h"
/* @@ -113,3 +114,116 @@ void sched_core_free(struct task_struct *p) { sched_core_put_cookie(p->core_cookie); } + +static void __sched_core_set(struct task_struct *p, unsigned long cookie) +{ + cookie = sched_core_get_cookie(cookie); + cookie = sched_core_update_cookie(p, cookie); + sched_core_put_cookie(cookie); +} + +/* Called from prctl interface: PR_SCHED_CORE */ +int sched_core_share_pid(unsigned int cmd, pid_t pid, enum pid_type type, + unsigned long uaddr) +{ + unsigned long cookie = 0, id = 0; + struct task_struct *task, *p; + struct pid *grp; + int err = 0; + + if (!static_branch_likely(&sched_smt_present)) + return -ENODEV; + + if (type > PIDTYPE_PGID || cmd >= PR_SCHED_CORE_MAX || pid < 0 || + (cmd != PR_SCHED_CORE_GET && uaddr)) + return -EINVAL; + + rcu_read_lock(); + if (pid == 0) { + task = current; + } else { + task = find_task_by_vpid(pid); + if (!task) { + rcu_read_unlock(); + return -ESRCH; + } + } + get_task_struct(task); + rcu_read_unlock(); + + /* + * Check if this process has the right to modify the specified + * process. Use the regular "ptrace_may_access()" checks. + */ + if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) { + err = -EPERM; + goto out; + } + + switch (cmd) { + case PR_SCHED_CORE_GET: + if (type != PIDTYPE_PID || uaddr & 7) { + err = -EINVAL; + goto out; + } + cookie = sched_core_clone_cookie(task); + if (cookie) { + /* XXX improve ? */ + ptr_to_hashval((void *)cookie, &id); + } + err = put_user(id, (u64 __user *)uaddr); + goto out; + + case PR_SCHED_CORE_CREATE: + cookie = sched_core_alloc_cookie(); + if (!cookie) { + err = -ENOMEM; + goto out; + } + break; + + case PR_SCHED_CORE_SHARE_TO: + cookie = sched_core_clone_cookie(current); + break; + + case PR_SCHED_CORE_SHARE_FROM: + if (type != PIDTYPE_PID) { + err = -EINVAL; + goto out; + } + cookie = sched_core_clone_cookie(task); + __sched_core_set(current, cookie); + goto out; + + default: + err = -EINVAL; + goto out; + }; + + if (type == PIDTYPE_PID) { + __sched_core_set(task, cookie); + goto out; + } + + read_lock(&tasklist_lock); + grp = task_pid_type(task, type); + + do_each_pid_thread(grp, type, p) { + if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS)) { + err = -EPERM; + goto out_tasklist; + } + } while_each_pid_thread(grp, type, p); + + do_each_pid_thread(grp, type, p) { + __sched_core_set(p, cookie); + } while_each_pid_thread(grp, type, p); +out_tasklist: + read_unlock(&tasklist_lock); + +out: + sched_core_put_cookie(cookie); + put_task_struct(task); + return err; +} + diff --git a/kernel/sys.c b/kernel/sys.c index 24a3a28ae228..c8a31e1037be 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2523,6 +2523,11 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
error = (current->flags & PR_IO_FLUSHER) == PR_IO_FLUSHER; break; +#ifdef CONFIG_SCHED_CORE + case PR_SCHED_CORE: + error = sched_core_share_pid(arg2, arg3, arg4, arg5); + break; +#endif default: error = -EINVAL; break; diff --git a/tools/include/uapi/linux/prctl.h b/tools/include/uapi/linux/prctl.h index 7f0827705c9a..892da2795b59 100644 --- a/tools/include/uapi/linux/prctl.h +++ b/tools/include/uapi/linux/prctl.h @@ -247,4 +247,12 @@ struct prctl_mm_map { #define PR_SET_IO_FLUSHER 57 #define PR_GET_IO_FLUSHER 58
+/* Request the scheduler to share a core */ +#define PR_SCHED_CORE 62 +# define PR_SCHED_CORE_GET 0 +# define PR_SCHED_CORE_CREATE 1 /* create unique core_sched cookie */ +# define PR_SCHED_CORE_SHARE_TO 2 /* push core_sched cookie to pid */ +# define PR_SCHED_CORE_SHARE_FROM 3 /* pull core_sched cookie to pid */ +# define PR_SCHED_CORE_MAX 4 + #endif /* _LINUX_PRCTL_H */
From: Chris Hyser chris.hyser@oracle.com
mainline inclusion from mainline-v5.14-rc1 commit 9f26990074931bbf797373e53104216059b300b1 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5OOWG CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------------------------------------------------
Provides a selftest and examples of using the interface.
[peterz: updated to not use sched_debug] Signed-off-by: Chris Hyser chris.hyser@oracle.com Signed-off-by: Peter Zijlstra (Intel) peterz@infradead.org Tested-by: Don Hiatt dhiatt@digitalocean.com Tested-by: Hongyu Ning hongyu.ning@linux.intel.com Tested-by: Vincent Guittot vincent.guittot@linaro.org Link: https://lkml.kernel.org/r/20210422123309.100860030@infradead.org Signed-off-by: Lin Shengwang linshengwang1@huawei.com Reviewed-by: lihua hucool.lihua@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- tools/testing/selftests/sched/.gitignore | 1 + tools/testing/selftests/sched/Makefile | 14 + tools/testing/selftests/sched/config | 1 + tools/testing/selftests/sched/cs_prctl_test.c | 338 ++++++++++++++++++ 4 files changed, 354 insertions(+) create mode 100644 tools/testing/selftests/sched/.gitignore create mode 100644 tools/testing/selftests/sched/Makefile create mode 100644 tools/testing/selftests/sched/config create mode 100644 tools/testing/selftests/sched/cs_prctl_test.c
diff --git a/tools/testing/selftests/sched/.gitignore b/tools/testing/selftests/sched/.gitignore new file mode 100644 index 000000000000..6996d4654d92 --- /dev/null +++ b/tools/testing/selftests/sched/.gitignore @@ -0,0 +1 @@ +cs_prctl_test diff --git a/tools/testing/selftests/sched/Makefile b/tools/testing/selftests/sched/Makefile new file mode 100644 index 000000000000..10c72f14fea9 --- /dev/null +++ b/tools/testing/selftests/sched/Makefile @@ -0,0 +1,14 @@ +# SPDX-License-Identifier: GPL-2.0+ + +ifneq ($(shell $(CC) --version 2>&1 | head -n 1 | grep clang),) +CLANG_FLAGS += -no-integrated-as +endif + +CFLAGS += -O2 -Wall -g -I./ -I../../../../usr/include/ -Wl,-rpath=./ \ + $(CLANG_FLAGS) +LDLIBS += -lpthread + +TEST_GEN_FILES := cs_prctl_test +TEST_PROGS := cs_prctl_test + +include ../lib.mk diff --git a/tools/testing/selftests/sched/config b/tools/testing/selftests/sched/config new file mode 100644 index 000000000000..e8b09aa7c0c4 --- /dev/null +++ b/tools/testing/selftests/sched/config @@ -0,0 +1 @@ +CONFIG_SCHED_DEBUG=y diff --git a/tools/testing/selftests/sched/cs_prctl_test.c b/tools/testing/selftests/sched/cs_prctl_test.c new file mode 100644 index 000000000000..63fe6521c56d --- /dev/null +++ b/tools/testing/selftests/sched/cs_prctl_test.c @@ -0,0 +1,338 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Use the core scheduling prctl() to test core scheduling cookies control. + * + * Copyright (c) 2021 Oracle and/or its affiliates. + * Author: Chris Hyser chris.hyser@oracle.com + * + * + * This library is free software; you can redistribute it and/or modify it + * under the terms of version 2.1 of the GNU Lesser General Public License as + * published by the Free Software Foundation. + * + * This library is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License + * for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, see http://www.gnu.org/licenses. 
+ */ + +#define _GNU_SOURCE +#include <sys/eventfd.h> +#include <sys/wait.h> +#include <sys/types.h> +#include <sched.h> +#include <sys/prctl.h> +#include <sys/types.h> +#include <sys/wait.h> +#include <unistd.h> +#include <time.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#if __GLIBC_PREREQ(2, 30) == 0 +#include <sys/syscall.h> +static pid_t gettid(void) +{ + return syscall(SYS_gettid); +} +#endif + +#ifndef PR_SCHED_CORE +#define PR_SCHED_CORE 62 +# define PR_SCHED_CORE_GET 0 +# define PR_SCHED_CORE_CREATE 1 /* create unique core_sched cookie */ +# define PR_SCHED_CORE_SHARE_TO 2 /* push core_sched cookie to pid */ +# define PR_SCHED_CORE_SHARE_FROM 3 /* pull core_sched cookie to pid */ +# define PR_SCHED_CORE_MAX 4 +#endif + +#define MAX_PROCESSES 128 +#define MAX_THREADS 128 + +static const char USAGE[] = "cs_prctl_test [options]\n" +" options:\n" +" -P : number of processes to create.\n" +" -T : number of threads per process to create.\n" +" -d : delay time to keep tasks alive.\n" +" -k : keep tasks alive until keypress.\n"; + +enum pid_type {PIDTYPE_PID = 0, PIDTYPE_TGID, PIDTYPE_PGID}; + +const int THREAD_CLONE_FLAGS = CLONE_THREAD | CLONE_SIGHAND | CLONE_FS | CLONE_VM | CLONE_FILES; + +static int _prctl(int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, + unsigned long arg5) +{ + int res; + + res = prctl(option, arg2, arg3, arg4, arg5); + printf("%d = prctl(%d, %ld, %ld, %ld, %lx)\n", res, option, (long)arg2, (long)arg3, + (long)arg4, arg5); + return res; +} + +#define STACK_SIZE (1024 * 1024) + +#define handle_error(msg) __handle_error(__FILE__, __LINE__, msg) +static void __handle_error(char *fn, int ln, char *msg) +{ + printf("(%s:%d) - ", fn, ln); + perror(msg); + exit(EXIT_FAILURE); +} + +static void handle_usage(int rc, char *msg) +{ + puts(USAGE); + puts(msg); + putchar('\n'); + exit(rc); +} + +static unsigned long get_cs_cookie(int pid) +{ + unsigned long long cookie; + int ret; + + ret = prctl(PR_SCHED_CORE, PR_SCHED_CORE_GET, pid, PIDTYPE_PID, + (unsigned long)&cookie); + if (ret) { + printf("Not a core sched system\n"); + return -1UL; + } + + return cookie; +} + +struct child_args { + int num_threads; + int pfd[2]; + int cpid; + int thr_tids[MAX_THREADS]; +}; + +static int child_func_thread(void __attribute__((unused))*arg) +{ + while (1) + usleep(20000); + return 0; +} + +static void create_threads(int num_threads, int thr_tids[]) +{ + void *child_stack; + pid_t tid; + int i; + + for (i = 0; i < num_threads; ++i) { + child_stack = malloc(STACK_SIZE); + if (!child_stack) + handle_error("child stack allocate"); + + tid = clone(child_func_thread, child_stack + STACK_SIZE, THREAD_CLONE_FLAGS, NULL); + if (tid == -1) + handle_error("clone thread"); + thr_tids[i] = tid; + } +} + +static int child_func_process(void *arg) +{ + struct child_args *ca = (struct child_args *)arg; + + close(ca->pfd[0]); + + create_threads(ca->num_threads, ca->thr_tids); + + write(ca->pfd[1], &ca->thr_tids, sizeof(int) * ca->num_threads); + close(ca->pfd[1]); + + while (1) + usleep(20000); + return 0; +} + +static unsigned char child_func_process_stack[STACK_SIZE]; + +void create_processes(int num_processes, int num_threads, struct child_args proc[]) +{ + pid_t cpid; + int i; + + for (i = 0; i < num_processes; ++i) { + proc[i].num_threads = num_threads; + + if (pipe(proc[i].pfd) == -1) + handle_error("pipe() failed"); + + cpid = clone(child_func_process, child_func_process_stack + STACK_SIZE, + SIGCHLD, &proc[i]); + proc[i].cpid = cpid; + 
close(proc[i].pfd[1]); + } + + for (i = 0; i < num_processes; ++i) { + read(proc[i].pfd[0], &proc[i].thr_tids, sizeof(int) * proc[i].num_threads); + close(proc[i].pfd[0]); + } +} + +void disp_processes(int num_processes, struct child_args proc[]) +{ + int i, j; + + printf("tid=%d, / tgid=%d / pgid=%d: %lx\n", gettid(), getpid(), getpgid(0), + get_cs_cookie(getpid())); + + for (i = 0; i < num_processes; ++i) { + printf(" tid=%d, / tgid=%d / pgid=%d: %lx\n", proc[i].cpid, proc[i].cpid, + getpgid(proc[i].cpid), get_cs_cookie(proc[i].cpid)); + for (j = 0; j < proc[i].num_threads; ++j) { + printf(" tid=%d, / tgid=%d / pgid=%d: %lx\n", proc[i].thr_tids[j], + proc[i].cpid, getpgid(0), get_cs_cookie(proc[i].thr_tids[j])); + } + } + puts("\n"); +} + +static int errors; + +#define validate(v) _validate(__LINE__, v, #v) +void _validate(int line, int val, char *msg) +{ + if (!val) { + ++errors; + printf("(%d) FAILED: %s\n", line, msg); + } else { + printf("(%d) PASSED: %s\n", line, msg); + } +} + +int main(int argc, char *argv[]) +{ + struct child_args procs[MAX_PROCESSES]; + + int keypress = 0; + int num_processes = 2; + int num_threads = 3; + int delay = 0; + int res = 0; + int pidx; + int pid; + int opt; + + while ((opt = getopt(argc, argv, ":hkT:P:d:")) != -1) { + switch (opt) { + case 'P': + num_processes = (int)strtol(optarg, NULL, 10); + break; + case 'T': + num_threads = (int)strtoul(optarg, NULL, 10); + break; + case 'd': + delay = (int)strtol(optarg, NULL, 10); + break; + case 'k': + keypress = 1; + break; + case 'h': + printf(USAGE); + exit(EXIT_SUCCESS); + default: + handle_usage(20, "unknown option"); + } + } + + if (num_processes < 1 || num_processes > MAX_PROCESSES) + handle_usage(1, "Bad processes value"); + + if (num_threads < 1 || num_threads > MAX_THREADS) + handle_usage(2, "Bad thread value"); + + if (keypress) + delay = -1; + + srand(time(NULL)); + + /* put into separate process group */ + if (setpgid(0, 0) != 0) + handle_error("process group"); + + printf("\n## Create a thread/process/process group hiearchy\n"); + create_processes(num_processes, num_threads, procs); + disp_processes(num_processes, procs); + validate(get_cs_cookie(0) == 0); + + printf("\n## Set a cookie on entire process group\n"); + if (_prctl(PR_SCHED_CORE, PR_SCHED_CORE_CREATE, 0, PIDTYPE_PGID, 0) < 0) + handle_error("core_sched create failed -- PGID"); + disp_processes(num_processes, procs); + + validate(get_cs_cookie(0) != 0); + + /* get a random process pid */ + pidx = rand() % num_processes; + pid = procs[pidx].cpid; + + validate(get_cs_cookie(0) == get_cs_cookie(pid)); + validate(get_cs_cookie(0) == get_cs_cookie(procs[pidx].thr_tids[0])); + + printf("\n## Set a new cookie on entire process/TGID [%d]\n", pid); + if (_prctl(PR_SCHED_CORE, PR_SCHED_CORE_CREATE, pid, PIDTYPE_TGID, 0) < 0) + handle_error("core_sched create failed -- TGID"); + disp_processes(num_processes, procs); + + validate(get_cs_cookie(0) != get_cs_cookie(pid)); + validate(get_cs_cookie(pid) != 0); + validate(get_cs_cookie(pid) == get_cs_cookie(procs[pidx].thr_tids[0])); + + printf("\n## Copy the cookie of current/PGID[%d], to pid [%d] as PIDTYPE_PID\n", + getpid(), pid); + if (_prctl(PR_SCHED_CORE, PR_SCHED_CORE_SHARE_TO, pid, PIDTYPE_PID, 0) < 0) + handle_error("core_sched share to itself failed -- PID"); + disp_processes(num_processes, procs); + + validate(get_cs_cookie(0) == get_cs_cookie(pid)); + validate(get_cs_cookie(pid) != 0); + validate(get_cs_cookie(pid) != get_cs_cookie(procs[pidx].thr_tids[0])); + + printf("\n## Copy cookie 
from a thread [%d] to current/PGID [%d] as PIDTYPE_PID\n", + procs[pidx].thr_tids[0], getpid()); + if (_prctl(PR_SCHED_CORE, PR_SCHED_CORE_SHARE_FROM, procs[pidx].thr_tids[0], + PIDTYPE_PID, 0) < 0) + handle_error("core_sched share from thread failed -- PID"); + disp_processes(num_processes, procs); + + validate(get_cs_cookie(0) == get_cs_cookie(procs[pidx].thr_tids[0])); + validate(get_cs_cookie(pid) != get_cs_cookie(procs[pidx].thr_tids[0])); + + printf("\n## Copy cookie from current [%d] to current as pidtype PGID\n", getpid()); + if (_prctl(PR_SCHED_CORE, PR_SCHED_CORE_SHARE_TO, 0, PIDTYPE_PGID, 0) < 0) + handle_error("core_sched share to self failed -- PGID"); + disp_processes(num_processes, procs); + + validate(get_cs_cookie(0) == get_cs_cookie(pid)); + validate(get_cs_cookie(pid) != 0); + validate(get_cs_cookie(pid) == get_cs_cookie(procs[pidx].thr_tids[0])); + + if (errors) { + printf("TESTS FAILED. errors: %d\n", errors); + res = 10; + } else { + printf("SUCCESS !!!\n"); + } + + if (keypress) + getchar(); + else + sleep(delay); + + for (pidx = 0; pidx < num_processes; ++pidx) + kill(procs[pidx].cpid, 15); + + return res; +}
From: Arnaldo Carvalho de Melo acme@redhat.com
mainline inclusion from mainline-v5.16-rc1 commit 49024204322cbfff892a28a67ad813cd41b6be81 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5OOWG CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------------------------------------------------
To pick the changes in:
61bc346ce64a3864 ("uapi/linux/prctl: provide macro definitions for the PR_SCHED_CORE type argument")
That don't result in any changes in tooling:
$ tools/perf/trace/beauty/prctl_option.sh > before
$ cp include/uapi/linux/prctl.h tools/include/uapi/linux/prctl.h
$ tools/perf/trace/beauty/prctl_option.sh > after
$ diff -u before after
$
Just silences this perf tools build warning:
Warning: Kernel ABI header at 'tools/include/uapi/linux/prctl.h' differs from latest version at 'include/uapi/linux/prctl.h'
  diff -u tools/include/uapi/linux/prctl.h include/uapi/linux/prctl.h
Cc: Christian Brauner christian.brauner@ubuntu.com Cc: Eugene Syromiatnikov esyr@redhat.com Signed-off-by: Arnaldo Carvalho de Melo acme@redhat.com Signed-off-by: Lin Shengwang linshengwang1@huawei.com Reviewed-by: lihua hucool.lihua@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- tools/include/uapi/linux/prctl.h | 3 +++ 1 file changed, 3 insertions(+)
diff --git a/tools/include/uapi/linux/prctl.h b/tools/include/uapi/linux/prctl.h index 892da2795b59..334b256900c7 100644 --- a/tools/include/uapi/linux/prctl.h +++ b/tools/include/uapi/linux/prctl.h @@ -254,5 +254,8 @@ struct prctl_mm_map { # define PR_SCHED_CORE_SHARE_TO 2 /* push core_sched cookie to pid */ # define PR_SCHED_CORE_SHARE_FROM 3 /* pull core_sched cookie to pid */ # define PR_SCHED_CORE_MAX 4 +# define PR_SCHED_CORE_SCOPE_THREAD 0 +# define PR_SCHED_CORE_SCOPE_THREAD_GROUP 1 +# define PR_SCHED_CORE_SCOPE_PROCESS_GROUP 2
#endif /* _LINUX_PRCTL_H */
From: Ingo Molnar mingo@kernel.org
mainline inclusion from mainline-v5.14-rc1 commit cc00c1988801dc71f63bb7bad019e85046865095 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5OOWG CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------------------------------------------------
A few more snuck in. Also capitalize 'CPU' while at it.
Signed-off-by: Ingo Molnar mingo@kernel.org Signed-off-by: Lin Shengwang linshengwang1@huawei.com Reviewed-by: lihua hucool.lihua@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/sched_clock.h | 2 +- kernel/sched/core.c | 4 ++-- kernel/sched/fair.c | 6 +++--- 3 files changed, 6 insertions(+), 6 deletions(-)
diff --git a/include/linux/sched_clock.h b/include/linux/sched_clock.h index 528718e4ed52..835ee87ed792 100644 --- a/include/linux/sched_clock.h +++ b/include/linux/sched_clock.h @@ -14,7 +14,7 @@ * @sched_clock_mask: Bitmask for two's complement subtraction of non 64bit * clocks. * @read_sched_clock: Current clock source (or dummy source when suspended). - * @mult: Multipler for scaled math conversion. + * @mult: Multiplier for scaled math conversion. * @shift: Shift value for scaled math conversion. * * Care must be taken when updating this structure; it is read by diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a485c8729f7c..fb1195f295e1 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4941,7 +4941,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) }
/* - * Try and select tasks for each sibling in decending sched_class + * Try and select tasks for each sibling in descending sched_class * order. */ for_each_class(class) { @@ -4955,7 +4955,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
/* * If this sibling doesn't yet have a suitable task to - * run; ask for the most elegible task, given the + * run; ask for the most eligible task, given the * highest priority task already selected for this * core. */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 6324e991f8f4..20b482688e3a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -11574,11 +11574,11 @@ static inline void task_tick_core(struct rq *rq, struct task_struct *curr) * sched_slice() considers only this active rq and it gets the * whole slice. But during force idle, we have siblings acting * like a single runqueue and hence we need to consider runnable - * tasks on this cpu and the forced idle cpu. Ideally, we should + * tasks on this CPU and the forced idle CPU. Ideally, we should * go through the forced idle rq, but that would be a perf hit. - * We can assume that the forced idle cpu has atleast + * We can assume that the forced idle CPU has at least * MIN_NR_TASKS_DURING_FORCEIDLE - 1 tasks and use that to check - * if we need to give up the cpu. + * if we need to give up the CPU. */ if (rq->core->core_forceidle && rq->cfs.nr_running == 1 && __entity_slice_used(&curr->se, MIN_NR_TASKS_DURING_FORCEIDLE))
From: Peter Zijlstra peterz@infradead.org
mainline inclusion from mainline-v5.14-rc1 commit 7b419f47facd286c6723daca6ad69ec355473f78 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5OOWG CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------------------------------------------------
Hugh noted that the SCHED_CORE Kconfig option could do with a help text.
Requested-by: Hugh Dickins hughd@google.com Signed-off-by: Peter Zijlstra (Intel) peterz@infradead.org Reviewed-by: Randy Dunlap rdunlap@infradead.org Acked-by: Hugh Dickins hughd@google.com Link: https://lkml.kernel.org/r/YKyhtwhEgvtUDOyl@hirez.programming.kicks-ass.net Signed-off-by: Lin Shengwang linshengwang1@huawei.com Reviewed-by: lihua hucool.lihua@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- kernel/Kconfig.preempt | 14 ++++++++++++++ 1 file changed, 14 insertions(+)
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt index ea1e3331c0ba..bd7c4147b9a8 100644 --- a/kernel/Kconfig.preempt +++ b/kernel/Kconfig.preempt @@ -104,4 +104,18 @@ config SCHED_CORE bool "Core Scheduling for SMT" default y depends on SCHED_SMT + help + This option permits Core Scheduling, a means of coordinated task + selection across SMT siblings. When enabled -- see + prctl(PR_SCHED_CORE) -- task selection ensures that all SMT siblings + will execute a task from the same 'core group', forcing idle when no + matching task is found. + + Use of this feature includes: + - mitigation of some (not all) SMT side channels; + - limiting SMT interference to improve determinism and/or performance. + + SCHED_CORE is default enabled when SCHED_SMT is enabled -- when + unused there should be no impact on performance. +
From: "Joel Fernandes (Google)" joel@joelfernandes.org
mainline inclusion from mainline-v5.14-rc1 commit 0159bb020ca9a43b17aa9149f1199643c1d49426 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5OOWG CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------------------------------------------------
Now that core scheduling is merged, update the documentation.
Co-developed-by: Chris Hyser chris.hyser@oracle.com Signed-off-by: Chris Hyser chris.hyser@oracle.com Co-developed-by: Josh Don joshdon@google.com Signed-off-by: Josh Don joshdon@google.com Signed-off-by: Joel Fernandes (Google) joel@joelfernandes.org Signed-off-by: Peter Zijlstra (Intel) peterz@infradead.org Link: https://lkml.kernel.org/r/20210603013136.370918-1-joel@joelfernandes.org Signed-off-by: Lin Shengwang linshengwang1@huawei.com Reviewed-by: lihua hucool.lihua@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- .../admin-guide/hw-vuln/core-scheduling.rst | 223 ++++++++++++++++++ Documentation/admin-guide/hw-vuln/index.rst | 1 + 2 files changed, 224 insertions(+) create mode 100644 Documentation/admin-guide/hw-vuln/core-scheduling.rst
diff --git a/Documentation/admin-guide/hw-vuln/core-scheduling.rst b/Documentation/admin-guide/hw-vuln/core-scheduling.rst new file mode 100644 index 000000000000..7b410aef9c5c --- /dev/null +++ b/Documentation/admin-guide/hw-vuln/core-scheduling.rst @@ -0,0 +1,223 @@ +.. SPDX-License-Identifier: GPL-2.0 + +=============== +Core Scheduling +=============== +Core scheduling support allows userspace to define groups of tasks that can +share a core. These groups can be specified either for security usecases (one +group of tasks don't trust another), or for performance usecases (some +workloads may benefit from running on the same core as they don't need the same +hardware resources of the shared core, or may prefer different cores if they +do share hardware resource needs). This document only describes the security +usecase. + +Security usecase +---------------- +A cross-HT attack involves the attacker and victim running on different Hyper +Threads of the same core. MDS and L1TF are examples of such attacks. The only +full mitigation of cross-HT attacks is to disable Hyper Threading (HT). Core +scheduling is a scheduler feature that can mitigate some (not all) cross-HT +attacks. It allows HT to be turned on safely by ensuring that only tasks in a +user-designated trusted group can share a core. This increase in core sharing +can also improve performance, however it is not guaranteed that performance +will always improve, though that is seen to be the case with a number of real +world workloads. In theory, core scheduling aims to perform at least as good as +when Hyper Threading is disabled. In practice, this is mostly the case though +not always: as synchronizing scheduling decisions across 2 or more CPUs in a +core involves additional overhead - especially when the system is lightly +loaded. When ``total_threads <= N_CPUS/2``, the extra overhead may cause core +scheduling to perform more poorly compared to SMT-disabled, where N_CPUS is the +total number of CPUs. Please measure the performance of your workloads always. + +Usage +----- +Core scheduling support is enabled via the ``CONFIG_SCHED_CORE`` config option. +Using this feature, userspace defines groups of tasks that can be co-scheduled +on the same core. The core scheduler uses this information to make sure that +tasks that are not in the same group never run simultaneously on a core, while +doing its best to satisfy the system's scheduling requirements. + +Core scheduling can be enabled via the ``PR_SCHED_CORE`` prctl interface. +This interface provides support for the creation of core scheduling groups, as +well as admission and removal of tasks from created groups:: + + #include <sys/prctl.h> + + int prctl(int option, unsigned long arg2, unsigned long arg3, + unsigned long arg4, unsigned long arg5); + +option: + ``PR_SCHED_CORE`` + +arg2: + Command for operation, must be one off: + + - ``PR_SCHED_CORE_GET`` -- get core_sched cookie of ``pid``. + - ``PR_SCHED_CORE_CREATE`` -- create a new unique cookie for ``pid``. + - ``PR_SCHED_CORE_SHARE_TO`` -- push core_sched cookie to ``pid``. + - ``PR_SCHED_CORE_SHARE_FROM`` -- pull core_sched cookie from ``pid``. + +arg3: + ``pid`` of the task for which the operation applies. + +arg4: + ``pid_type`` for which the operation applies. It is of type ``enum pid_type``. + For example, if arg4 is ``PIDTYPE_TGID``, then the operation of this command + will be performed for all tasks in the task group of ``pid``. 
+ +arg5: + userspace pointer to an unsigned long for storing the cookie returned by + ``PR_SCHED_CORE_GET`` command. Should be 0 for all other commands. + +In order for a process to push a cookie to, or pull a cookie from a process, it +is required to have the ptrace access mode: `PTRACE_MODE_READ_REALCREDS` to the +process. + +Building hierarchies of tasks +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The simplest way to build hierarchies of threads/processes which share a +cookie and thus a core is to rely on the fact that the core-sched cookie is +inherited across forks/clones and execs, thus setting a cookie for the +'initial' script/executable/daemon will place every spawned child in the +same core-sched group. + +Cookie Transferral +~~~~~~~~~~~~~~~~~~ +Transferring a cookie between the current and other tasks is possible using +PR_SCHED_CORE_SHARE_FROM and PR_SCHED_CORE_SHARE_TO to inherit a cookie from a +specified task or a share a cookie with a task. In combination this allows a +simple helper program to pull a cookie from a task in an existing core +scheduling group and share it with already running tasks. + +Design/Implementation +--------------------- +Each task that is tagged is assigned a cookie internally in the kernel. As +mentioned in `Usage`_, tasks with the same cookie value are assumed to trust +each other and share a core. + +The basic idea is that, every schedule event tries to select tasks for all the +siblings of a core such that all the selected tasks running on a core are +trusted (same cookie) at any point in time. Kernel threads are assumed trusted. +The idle task is considered special, as it trusts everything and everything +trusts it. + +During a schedule() event on any sibling of a core, the highest priority task on +the sibling's core is picked and assigned to the sibling calling schedule(), if +the sibling has the task enqueued. For rest of the siblings in the core, +highest priority task with the same cookie is selected if there is one runnable +in their individual run queues. If a task with same cookie is not available, +the idle task is selected. Idle task is globally trusted. + +Once a task has been selected for all the siblings in the core, an IPI is sent to +siblings for whom a new task was selected. Siblings on receiving the IPI will +switch to the new task immediately. If an idle task is selected for a sibling, +then the sibling is considered to be in a `forced idle` state. I.e., it may +have tasks on its on runqueue to run, however it will still have to run idle. +More on this in the next section. + +Forced-idling of hyperthreads +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The scheduler tries its best to find tasks that trust each other such that all +tasks selected to be scheduled are of the highest priority in a core. However, +it is possible that some runqueues had tasks that were incompatible with the +highest priority ones in the core. Favoring security over fairness, one or more +siblings could be forced to select a lower priority task if the highest +priority task is not trusted with respect to the core wide highest priority +task. If a sibling does not have a trusted task to run, it will be forced idle +by the scheduler (idle thread is scheduled to run). + +When the highest priority task is selected to run, a reschedule-IPI is sent to +the sibling to force it into idle. 
This results in 4 cases which need to be +considered depending on whether a VM or a regular usermode process was running +on either HT:: + + HT1 (attack) HT2 (victim) + A idle -> user space user space -> idle + B idle -> user space guest -> idle + C idle -> guest user space -> idle + D idle -> guest guest -> idle + +Note that for better performance, we do not wait for the destination CPU +(victim) to enter idle mode. This is because the sending of the IPI would bring +the destination CPU immediately into kernel mode from user space, or VMEXIT +in the case of guests. At best, this would only leak some scheduler metadata +which may not be worth protecting. It is also possible that the IPI is received +too late on some architectures, but this has not been observed in the case of +x86. + +Trust model +~~~~~~~~~~~ +Core scheduling maintains trust relationships amongst groups of tasks by +assigning them a tag that is the same cookie value. +When a system with core scheduling boots, all tasks are considered to trust +each other. This is because the core scheduler does not have information about +trust relationships until userspace uses the above mentioned interfaces, to +communicate them. In other words, all tasks have a default cookie value of 0. +and are considered system-wide trusted. The forced-idling of siblings running +cookie-0 tasks is also avoided. + +Once userspace uses the above mentioned interfaces to group sets of tasks, tasks +within such groups are considered to trust each other, but do not trust those +outside. Tasks outside the group also don't trust tasks within. + +Limitations of core-scheduling +------------------------------ +Core scheduling tries to guarantee that only trusted tasks run concurrently on a +core. But there could be small window of time during which untrusted tasks run +concurrently or kernel could be running concurrently with a task not trusted by +kernel. + +IPI processing delays +~~~~~~~~~~~~~~~~~~~~~ +Core scheduling selects only trusted tasks to run together. IPI is used to notify +the siblings to switch to the new task. But there could be hardware delays in +receiving of the IPI on some arch (on x86, this has not been observed). This may +cause an attacker task to start running on a CPU before its siblings receive the +IPI. Even though cache is flushed on entry to user mode, victim tasks on siblings +may populate data in the cache and micro architectural buffers after the attacker +starts to run and this is a possibility for data leak. + +Open cross-HT issues that core scheduling does not solve +-------------------------------------------------------- +1. For MDS +~~~~~~~~~~ +Core scheduling cannot protect against MDS attacks between an HT running in +user mode and another running in kernel mode. Even though both HTs run tasks +which trust each other, kernel memory is still considered untrusted. Such +attacks are possible for any combination of sibling CPU modes (host or guest mode). + +2. For L1TF +~~~~~~~~~~~ +Core scheduling cannot protect against an L1TF guest attacker exploiting a +guest or host victim. This is because the guest attacker can craft invalid +PTEs which are not inverted due to a vulnerable guest kernel. The only +solution is to disable EPT (Extended Page Tables). + +For both MDS and L1TF, if the guest vCPU is configured to not trust each +other (by tagging separately), then the guest to guest attacks would go away. +Or it could be a system admin policy which considers guest to guest attacks as +a guest problem. 
+ +Another approach to resolve these would be to make every untrusted task on the +system to not trust every other untrusted task. While this could reduce +parallelism of the untrusted tasks, it would still solve the above issues while +allowing system processes (trusted tasks) to share a core. + +3. Protecting the kernel (IRQ, syscall, VMEXIT) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Unfortunately, core scheduling does not protect kernel contexts running on +sibling hyperthreads from one another. Prototypes of mitigations have been posted +to LKML to solve this, but it is debatable whether such windows are practically +exploitable, and whether the performance overhead of the prototypes are worth +it (not to mention, the added code complexity). + +Other Use cases +--------------- +The main use case for Core scheduling is mitigating the cross-HT vulnerabilities +with SMT enabled. There are other use cases where this feature could be used: + +- Isolating tasks that needs a whole core: Examples include realtime tasks, tasks + that uses SIMD instructions etc. +- Gang scheduling: Requirements for a group of tasks that needs to be scheduled + together could also be realized using core scheduling. One example is vCPUs of + a VM. diff --git a/Documentation/admin-guide/hw-vuln/index.rst b/Documentation/admin-guide/hw-vuln/index.rst index 2adec1e6520a..9b688b1990c6 100644 --- a/Documentation/admin-guide/hw-vuln/index.rst +++ b/Documentation/admin-guide/hw-vuln/index.rst @@ -15,4 +15,5 @@ are configurable at compile, boot or run time. tsx_async_abort multihit.rst special-register-buffer-data-sampling.rst + core-scheduling.rst processor_mmio_stale_data.rst
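To make the interface documented above concrete, a minimal userspace sketch (illustrative only, not part of the patch) creates a cookie for the calling task and reads it back; the fallback #defines are only needed if the installed <linux/prctl.h> predates this series, and their values match those used elsewhere in this series:

    #include <stdio.h>
    #include <sys/prctl.h>
    #include <linux/prctl.h>

    #ifndef PR_SCHED_CORE
    # define PR_SCHED_CORE          62
    # define PR_SCHED_CORE_GET      0
    # define PR_SCHED_CORE_CREATE   1
    #endif

    int main(void)
    {
            unsigned long cookie = 0;

            /* arg3 == 0 targets the calling task; arg4 == 0 is the per-thread scope */
            if (prctl(PR_SCHED_CORE, PR_SCHED_CORE_CREATE, 0, 0, 0))
                    perror("PR_SCHED_CORE_CREATE");

            /* PR_SCHED_CORE_GET stores the cookie through the arg5 pointer */
            if (prctl(PR_SCHED_CORE, PR_SCHED_CORE_GET, 0, 0,
                      (unsigned long)&cookie))
                    perror("PR_SCHED_CORE_GET");

            printf("core-sched cookie: %#lx\n", cookie);
            return 0;
    }

As the 'Building hierarchies of tasks' section notes, children forked after the PR_SCHED_CORE_CREATE call inherit the cookie and therefore land in the same core-scheduling group.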
From: Ingo Molnar mingo@kernel.org
mainline inclusion from mainline-v5.14-rc1 commit d2343cb8d154fe20c4499711bb3a9af2095b2b4b category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5OOWG CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------------------------------------------------
This option at minimum adds extra code to the scheduler - even if it's unused by default - and most users wouldn't want it.
Reported-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Ingo Molnar mingo@kernel.org Signed-off-by: Lin Shengwang linshengwang1@huawei.com Reviewed-by: lihua hucool.lihua@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- kernel/Kconfig.preempt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt index bd7c4147b9a8..5876e30c5740 100644 --- a/kernel/Kconfig.preempt +++ b/kernel/Kconfig.preempt @@ -102,7 +102,6 @@ config PREEMPT_DYNAMIC
config SCHED_CORE bool "Core Scheduling for SMT" - default y depends on SCHED_SMT help This option permits Core Scheduling, a means of coordinated task @@ -115,7 +114,8 @@ config SCHED_CORE - mitigation of some (not all) SMT side channels; - limiting SMT interference to improve determinism and/or performance.
- SCHED_CORE is default enabled when SCHED_SMT is enabled -- when - unused there should be no impact on performance. + SCHED_CORE is default disabled. When it is enabled and unused, + which is the likely usage by Linux distributions, there should + be no measurable impact on performance.
From: "Fabio M. De Francesco" fmdefrancesco@gmail.com
mainline inclusion from mainline-v5.15-rc1 commit ce48ee81a1930b2218bea23490adb6673c88bf70 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5OOWG CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------------------------------------------------
Rephrase the "For MDS" section in core-scheduling.rst for the purpose of making it clearer what is meant by "kernel memory is still considered untrusted".
Suggested-by: Vineeth Pillai Vineeth.Pillai@microsoft.com Signed-off-by: Fabio M. De Francesco fmdefrancesco@gmail.com Reviewed-by: Joel Fernandes (Google) joelaf@google.com Link: https://lore.kernel.org/r/20210721190250.26095-1-fmdefrancesco@gmail.com Signed-off-by: Jonathan Corbet corbet@lwn.net Signed-off-by: Lin Shengwang linshengwang1@huawei.com Reviewed-by: lihua hucool.lihua@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- Documentation/admin-guide/hw-vuln/core-scheduling.rst | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-)
diff --git a/Documentation/admin-guide/hw-vuln/core-scheduling.rst b/Documentation/admin-guide/hw-vuln/core-scheduling.rst index 7b410aef9c5c..0febe458597c 100644 --- a/Documentation/admin-guide/hw-vuln/core-scheduling.rst +++ b/Documentation/admin-guide/hw-vuln/core-scheduling.rst @@ -181,10 +181,12 @@ Open cross-HT issues that core scheduling does not solve -------------------------------------------------------- 1. For MDS ~~~~~~~~~~ -Core scheduling cannot protect against MDS attacks between an HT running in -user mode and another running in kernel mode. Even though both HTs run tasks -which trust each other, kernel memory is still considered untrusted. Such -attacks are possible for any combination of sibling CPU modes (host or guest mode). +Core scheduling cannot protect against MDS attacks between the siblings +running in user mode and the others running in kernel mode. Even though all +siblings run tasks which trust each other, when the kernel is executing +code on behalf of a task, it cannot trust the code running in the +sibling. Such attacks are possible for any combination of sibling CPU modes +(host or guest mode).
2. For L1TF ~~~~~~~~~~~
From: Peter Zijlstra peterz@infradead.org
mainline inclusion from mainline-v5.14 commit 3c474b3239f12fe0b00d7e82481f36a1f31e79ab category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5OOWG CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------------------------------------------------
Eugene tripped over the case where rq_lock(), as called in a for_each_possible_cpu() loop, came apart because rq->core hadn't been set up yet.
This is a somewhat unusual, but valid case.
Rework things such that rq->core is initialized to point at itself. IOW initialize each CPU as a single threaded Core. CPU online will then join the new CPU (thread) to an existing Core where needed.
For completeness sake, have CPU offline fully undo the state so as to not presume the topology will match the next time it comes online.
Fixes: 9edeaea1bc45 ("sched: Core-wide rq->lock") Reported-by: Eugene Syromiatnikov esyr@redhat.com Signed-off-by: Peter Zijlstra (Intel) peterz@infradead.org Reviewed-by: Josh Don joshdon@google.com Tested-by: Eugene Syromiatnikov esyr@redhat.com Link: https://lkml.kernel.org/r/YR473ZGeKqMs6kw+@hirez.programming.kicks-ass.net Conflicts: kernel/sched/core.c [Bugfix ed3cd45f8ca87("Merge tag 'v5.11' into sched/core, to pick up fixes & refresh the branch") is not applied.] Signed-off-by: Lin Shengwang linshengwang1@huawei.com Reviewed-by: lihua hucool.lihua@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- kernel/sched/core.c | 143 +++++++++++++++++++++++++++++++++++-------- kernel/sched/sched.h | 2 +- 2 files changed, 118 insertions(+), 27 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index fb1195f295e1..4e82b38eb778 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -228,9 +228,30 @@ static DEFINE_MUTEX(sched_core_mutex); static atomic_t sched_core_count; static struct cpumask sched_core_mask;
+static void sched_core_lock(int cpu, unsigned long *flags) +{ + const struct cpumask *smt_mask = cpu_smt_mask(cpu); + int t, i = 0; + + local_irq_save(*flags); + for_each_cpu(t, smt_mask) + raw_spin_lock_nested(&cpu_rq(t)->__lock, i++); +} + +static void sched_core_unlock(int cpu, unsigned long *flags) +{ + const struct cpumask *smt_mask = cpu_smt_mask(cpu); + int t; + + for_each_cpu(t, smt_mask) + raw_spin_unlock(&cpu_rq(t)->__lock); + local_irq_restore(*flags); +} + static void __sched_core_flip(bool enabled) { - int cpu, t, i; + unsigned long flags; + int cpu, t;
cpus_read_lock();
@@ -241,19 +262,12 @@ static void __sched_core_flip(bool enabled) for_each_cpu(cpu, &sched_core_mask) { const struct cpumask *smt_mask = cpu_smt_mask(cpu);
- i = 0; - local_irq_disable(); - for_each_cpu(t, smt_mask) { - /* supports up to SMT8 */ - raw_spin_lock_nested(&cpu_rq(t)->__lock, i++); - } + sched_core_lock(cpu, &flags);
for_each_cpu(t, smt_mask) cpu_rq(t)->core_enabled = enabled;
- for_each_cpu(t, smt_mask) - raw_spin_unlock(&cpu_rq(t)->__lock); - local_irq_enable(); + sched_core_unlock(cpu, &flags);
cpumask_andnot(&sched_core_mask, &sched_core_mask, smt_mask); } @@ -5173,35 +5187,109 @@ void queue_core_balance(struct rq *rq) queue_balance_callback(rq, &per_cpu(core_balance_head, rq->cpu), sched_core_balance); }
-static inline void sched_core_cpu_starting(unsigned int cpu) +static void sched_core_cpu_starting(unsigned int cpu) { const struct cpumask *smt_mask = cpu_smt_mask(cpu); - struct rq *rq, *core_rq = NULL; - int i; + struct rq *rq = cpu_rq(cpu), *core_rq = NULL; + unsigned long flags; + int t;
- core_rq = cpu_rq(cpu)->core; + sched_core_lock(cpu, &flags);
- if (!core_rq) { - for_each_cpu(i, smt_mask) { - rq = cpu_rq(i); - if (rq->core && rq->core == rq) - core_rq = rq; + WARN_ON_ONCE(rq->core != rq); + + /* if we're the first, we'll be our own leader */ + if (cpumask_weight(smt_mask) == 1) + goto unlock; + + /* find the leader */ + for_each_cpu(t, smt_mask) { + if (t == cpu) + continue; + rq = cpu_rq(t); + if (rq->core == rq) { + core_rq = rq; + break; } + }
- if (!core_rq) - core_rq = cpu_rq(cpu); + if (WARN_ON_ONCE(!core_rq)) /* whoopsie */ + goto unlock;
- for_each_cpu(i, smt_mask) { - rq = cpu_rq(i); + /* install and validate core_rq */ + for_each_cpu(t, smt_mask) { + rq = cpu_rq(t);
- WARN_ON_ONCE(rq->core && rq->core != core_rq); + if (t == cpu) rq->core = core_rq; - } + + WARN_ON_ONCE(rq->core != core_rq); } + +unlock: + sched_core_unlock(cpu, &flags); } + +static void sched_core_cpu_deactivate(unsigned int cpu) +{ + const struct cpumask *smt_mask = cpu_smt_mask(cpu); + struct rq *rq = cpu_rq(cpu), *core_rq = NULL; + unsigned long flags; + int t; + + sched_core_lock(cpu, &flags); + + /* if we're the last man standing, nothing to do */ + if (cpumask_weight(smt_mask) == 1) { + WARN_ON_ONCE(rq->core != rq); + goto unlock; + } + + /* if we're not the leader, nothing to do */ + if (rq->core != rq) + goto unlock; + + /* find a new leader */ + for_each_cpu(t, smt_mask) { + if (t == cpu) + continue; + core_rq = cpu_rq(t); + break; + } + + if (WARN_ON_ONCE(!core_rq)) /* impossible */ + goto unlock; + + /* copy the shared state to the new leader */ + core_rq->core_task_seq = rq->core_task_seq; + core_rq->core_pick_seq = rq->core_pick_seq; + core_rq->core_cookie = rq->core_cookie; + core_rq->core_forceidle = rq->core_forceidle; + core_rq->core_forceidle_seq = rq->core_forceidle_seq; + + /* install new leader */ + for_each_cpu(t, smt_mask) { + rq = cpu_rq(t); + rq->core = core_rq; + } + +unlock: + sched_core_unlock(cpu, &flags); +} + +static inline void sched_core_cpu_dying(unsigned int cpu) +{ + struct rq *rq = cpu_rq(cpu); + + if (rq->core != rq) + rq->core = rq; +} + #else /* !CONFIG_SCHED_CORE */
static inline void sched_core_cpu_starting(unsigned int cpu) {} +static inline void sched_core_cpu_deactivate(unsigned int cpu) {} +static inline void sched_core_cpu_dying(unsigned int cpu) {}
static struct task_struct * pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) @@ -7980,6 +8068,8 @@ int sched_cpu_deactivate(unsigned int cpu) */ if (cpumask_weight(cpu_smt_mask(cpu)) == 2) static_branch_dec_cpuslocked(&sched_smt_present); + + sched_core_cpu_deactivate(cpu); #endif
if (!sched_smp_initialized) @@ -8032,6 +8122,7 @@ int sched_cpu_dying(unsigned int cpu) update_max_interval(); nohz_balance_exit_idle(rq); hrtick_clear(rq); + sched_core_cpu_dying(cpu); return 0; } #endif @@ -8240,7 +8331,7 @@ void __init sched_init(void) atomic_set(&rq->nr_iowait, 0);
#ifdef CONFIG_SCHED_CORE - rq->core = NULL; + rq->core = rq; rq->core_pick = NULL; rq->core_enabled = 0; rq->core_tree = RB_ROOT; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 91f8795f0693..e317f02ce2e5 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1120,7 +1120,7 @@ struct rq { unsigned int core_sched_seq; struct rb_root core_tree;
- /* shared state */ + /* shared state -- careful with sched_core_cpu_deactivate() */ unsigned int core_task_seq; unsigned int core_pick_seq; unsigned long core_cookie;
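Distilled from the hunks above, the lifecycle of the per-CPU core pointer now looks like this (pseudo-sequence, not literal kernel code; 'leader_rq' stands for the sibling whose rq->core already points at itself):

    /* sched_init(): every CPU starts as its own single-threaded core */
    rq->core = rq;

    /* CPU online: sched_core_cpu_starting() adopts an existing leader
     * among the SMT siblings, if one is present */
    rq->core = leader_rq;

    /* Leader deactivated: sched_core_cpu_deactivate() copies core_task_seq,
     * core_pick_seq, core_cookie and core_forceidle{,_seq} to a surviving
     * sibling and repoints all siblings at it */

    /* CPU fully offline: sched_core_cpu_dying() reverts to self */
    rq->core = rq;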
From: Peter Zijlstra peterz@infradead.org
mainline inclusion from mainline-v5.16-rc1 commit bc9ffef31bf59819c9fc032178534ff9ed7c4981 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5OOWG CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------------------------------------------------
Tao suggested a two-pass task selection to avoid the retry loop.
Not only does it avoid the retry loop, it results in *much* simpler code.
This also fixes an issue spotted by Josh Don where, for SMT3+, we can forget to update max on the first pass and get to do an extra round.
Suggested-by: Tao Zhou tao.zhou@linux.dev Signed-off-by: Peter Zijlstra (Intel) peterz@infradead.org Reviewed-by: Josh Don joshdon@google.com Reviewed-by: Vineeth Pillai (Microsoft) vineethrp@gmail.com Link: https://lkml.kernel.org/r/YSS9+k1teA9oPEKl@hirez.programming.kicks-ass.net Signed-off-by: Lin Shengwang linshengwang1@huawei.com Reviewed-by: lihua hucool.lihua@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- kernel/sched/core.c | 156 +++++++++++++------------------------------- 1 file changed, 45 insertions(+), 111 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 4e82b38eb778..058ad14cc162 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4772,8 +4772,7 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) return p; }
- /* The idle class should always have a runnable task: */ - BUG(); + BUG(); /* The idle class should always have a runnable task. */ }
#ifdef CONFIG_SCHED_CORE @@ -4795,54 +4794,18 @@ static inline bool cookie_match(struct task_struct *a, struct task_struct *b) return a->core_cookie == b->core_cookie; }
-// XXX fairness/fwd progress conditions -/* - * Returns - * - NULL if there is no runnable task for this class. - * - the highest priority task for this runqueue if it matches - * rq->core->core_cookie or its priority is greater than max. - * - Else returns idle_task. - */ -static struct task_struct * -pick_task(struct rq *rq, const struct sched_class *class, struct task_struct *max, bool in_fi) +static inline struct task_struct *pick_task(struct rq *rq) { - struct task_struct *class_pick, *cookie_pick; - unsigned long cookie = rq->core->core_cookie; - - class_pick = class->pick_task(rq); - if (!class_pick) - return NULL; - - if (!cookie) { - /* - * If class_pick is tagged, return it only if it has - * higher priority than max. - */ - if (max && class_pick->core_cookie && - prio_less(class_pick, max, in_fi)) - return idle_sched_class.pick_task(rq); + const struct sched_class *class; + struct task_struct *p;
- return class_pick; + for_each_class(class) { + p = class->pick_task(rq); + if (p) + return p; }
- /* - * If class_pick is idle or matches cookie, return early. - */ - if (cookie_equals(class_pick, cookie)) - return class_pick; - - cookie_pick = sched_core_find(rq, cookie); - - /* - * If class > max && class > cookie, it is the highest priority task on - * the core (so far) and it must be selected, otherwise we must go with - * the cookie pick in order to satisfy the constraint. - */ - if (prio_less(cookie_pick, class_pick, in_fi) && - (!max || prio_less(max, class_pick, in_fi))) - return class_pick; - - return cookie_pick; + BUG(); /* The idle class should always have a runnable task. */ }
extern void task_vruntime_update(struct rq *rq, struct task_struct *p, bool in_fi); @@ -4850,11 +4813,12 @@ extern void task_vruntime_update(struct rq *rq, struct task_struct *p, bool in_f static struct task_struct * pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { - struct task_struct *next, *max = NULL; - const struct sched_class *class; + struct task_struct *next, *p, *max = NULL; const struct cpumask *smt_mask; bool fi_before = false; - int i, j, cpu, occ = 0; + unsigned long cookie; + int i, cpu, occ = 0; + struct rq *rq_i; bool need_sync;
if (!sched_core_enabled(rq)) @@ -4927,12 +4891,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) * and there are no cookied tasks running on siblings. */ if (!need_sync) { - for_each_class(class) { - next = class->pick_task(rq); - if (next) - break; - } - + next = pick_task(rq); if (!next->core_cookie) { rq->core_pick = NULL; /* @@ -4945,76 +4904,51 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) } }
- for_each_cpu(i, smt_mask) { - struct rq *rq_i = cpu_rq(i); - - rq_i->core_pick = NULL; + /* + * For each thread: do the regular task pick and find the max prio task + * amongst them. + * + * Tie-break prio towards the current CPU + */ + for_each_cpu_wrap(i, smt_mask, cpu) { + rq_i = cpu_rq(i);
if (i != cpu) update_rq_clock(rq_i); + + p = rq_i->core_pick = pick_task(rq_i); + if (!max || prio_less(max, p, fi_before)) + max = p; }
+ cookie = rq->core->core_cookie = max->core_cookie; + /* - * Try and select tasks for each sibling in descending sched_class - * order. + * For each thread: try and find a runnable task that matches @max or + * force idle. */ - for_each_class(class) { -again: - for_each_cpu_wrap(i, smt_mask, cpu) { - struct rq *rq_i = cpu_rq(i); - struct task_struct *p; - - if (rq_i->core_pick) - continue; + for_each_cpu(i, smt_mask) { + rq_i = cpu_rq(i); + p = rq_i->core_pick;
- /* - * If this sibling doesn't yet have a suitable task to - * run; ask for the most eligible task, given the - * highest priority task already selected for this - * core. - */ - p = pick_task(rq_i, class, max, fi_before); + if (!cookie_equals(p, cookie)) { + p = NULL; + if (cookie) + p = sched_core_find(rq_i, cookie); if (!p) - continue; + p = idle_sched_class.pick_task(rq_i); + }
- if (!is_task_rq_idle(p)) - occ++; + rq_i->core_pick = p;
- rq_i->core_pick = p; - if (rq_i->idle == p && rq_i->nr_running) { + if (p == rq_i->idle) { + if (rq_i->nr_running) { rq->core->core_forceidle = true; if (!fi_before) rq->core->core_forceidle_seq++; } - - /* - * If this new candidate is of higher priority than the - * previous; and they're incompatible; we need to wipe - * the slate and start over. pick_task makes sure that - * p's priority is more than max if it doesn't match - * max's cookie. - * - * NOTE: this is a linear max-filter and is thus bounded - * in execution time. - */ - if (!max || !cookie_match(max, p)) { - struct task_struct *old_max = max; - - rq->core->core_cookie = p->core_cookie; - max = p; - - if (old_max) { - rq->core->core_forceidle = false; - for_each_cpu(j, smt_mask) { - if (j == i) - continue; - - cpu_rq(j)->core_pick = NULL; - } - occ = 1; - goto again; - } - } + } else { + occ++; } }
@@ -5034,7 +4968,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) * non-matching user state. */ for_each_cpu(i, smt_mask) { - struct rq *rq_i = cpu_rq(i); + rq_i = cpu_rq(i);
/* * An online sibling might have gone offline before a task
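Condensed, the selection logic that the diff above introduces in pick_next_task() boils down to two passes (illustrative excerpt of the patch; occupancy and force-idle accounting omitted):

    /* Pass 1: regular per-CPU pick; remember the highest-priority task. */
    for_each_cpu_wrap(i, smt_mask, cpu) {
            rq_i = cpu_rq(i);
            p = rq_i->core_pick = pick_task(rq_i);
            if (!max || prio_less(max, p, fi_before))
                    max = p;
    }

    cookie = rq->core->core_cookie = max->core_cookie;

    /* Pass 2: keep a pick only if it matches max's cookie; otherwise find a
     * cookie-matching task or force the sibling idle. */
    for_each_cpu(i, smt_mask) {
            rq_i = cpu_rq(i);
            p = rq_i->core_pick;

            if (!cookie_equals(p, cookie)) {
                    p = cookie ? sched_core_find(rq_i, cookie) : NULL;
                    if (!p)
                            p = idle_sched_class.pick_task(rq_i);
                    rq_i->core_pick = p;
            }
    }

Because pass 1 always feeds every pick through the max filter, the SMT3+ issue mentioned above (forgetting to update max on the first pass) can no longer occur.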
From: Eugene Syromiatnikov esyr@redhat.com
mainline inclusion from mainline-v5.16-rc1 commit 61bc346ce64a3864ac55f5d18bdc1572cda4fb18 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5OOWG CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------------------------------------------------
Commit 7ac592aa35a684ff ("sched: prctl() core-scheduling interface") made use of enum pid_type in prctl's arg4; this type and the associated enumeration definitions are not exposed to userspace. Christian suggested providing additional macro definitions that convey the meaning of the type argument more closely in line with its actual usage, and this patch does exactly that.
Link: https://lore.kernel.org/r/20210825170613.GA3884@asgard.redhat.com Suggested-by: Christian Brauner christian.brauner@ubuntu.com Acked-by: Christian Brauner christian.brauner@ubuntu.com Signed-off-by: Eugene Syromiatnikov esyr@redhat.com Complements: 7ac592aa35a684ff ("sched: prctl() core-scheduling interface") Signed-off-by: Christian Brauner christian.brauner@ubuntu.com Signed-off-by: Lin Shengwang linshengwang1@huawei.com Reviewed-by: lihua hucool.lihua@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- Documentation/admin-guide/hw-vuln/core-scheduling.rst | 5 +++-- include/uapi/linux/prctl.h | 3 +++ kernel/sched/core_sched.c | 4 ++++ 3 files changed, 10 insertions(+), 2 deletions(-)
diff --git a/Documentation/admin-guide/hw-vuln/core-scheduling.rst b/Documentation/admin-guide/hw-vuln/core-scheduling.rst index 0febe458597c..cf1eeefdfc32 100644 --- a/Documentation/admin-guide/hw-vuln/core-scheduling.rst +++ b/Documentation/admin-guide/hw-vuln/core-scheduling.rst @@ -61,8 +61,9 @@ arg3: ``pid`` of the task for which the operation applies.
arg4: - ``pid_type`` for which the operation applies. It is of type ``enum pid_type``. - For example, if arg4 is ``PIDTYPE_TGID``, then the operation of this command + ``pid_type`` for which the operation applies. It is one of + ``PR_SCHED_CORE_SCOPE_``-prefixed macro constants. For example, if arg4 + is ``PR_SCHED_CORE_SCOPE_THREAD_GROUP``, then the operation of this command will be performed for all tasks in the task group of ``pid``.
arg5: diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h index 892da2795b59..334b256900c7 100644 --- a/include/uapi/linux/prctl.h +++ b/include/uapi/linux/prctl.h @@ -254,5 +254,8 @@ struct prctl_mm_map { # define PR_SCHED_CORE_SHARE_TO 2 /* push core_sched cookie to pid */ # define PR_SCHED_CORE_SHARE_FROM 3 /* pull core_sched cookie to pid */ # define PR_SCHED_CORE_MAX 4 +# define PR_SCHED_CORE_SCOPE_THREAD 0 +# define PR_SCHED_CORE_SCOPE_THREAD_GROUP 1 +# define PR_SCHED_CORE_SCOPE_PROCESS_GROUP 2
#endif /* _LINUX_PRCTL_H */ diff --git a/kernel/sched/core_sched.c b/kernel/sched/core_sched.c index 9a80e9a474c0..20f640949450 100644 --- a/kernel/sched/core_sched.c +++ b/kernel/sched/core_sched.c @@ -134,6 +134,10 @@ int sched_core_share_pid(unsigned int cmd, pid_t pid, enum pid_type type, if (!static_branch_likely(&sched_smt_present)) return -ENODEV;
+ BUILD_BUG_ON(PR_SCHED_CORE_SCOPE_THREAD != PIDTYPE_PID); + BUILD_BUG_ON(PR_SCHED_CORE_SCOPE_THREAD_GROUP != PIDTYPE_TGID); + BUILD_BUG_ON(PR_SCHED_CORE_SCOPE_PROCESS_GROUP != PIDTYPE_PGID); + if (type > PIDTYPE_PGID || cmd >= PR_SCHED_CORE_MAX || pid < 0 || (cmd != PR_SCHED_CORE_GET && uaddr)) return -EINVAL;
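As a usage sketch for the new constants (hypothetical helper, not part of the patch): a task can pull the cookie of an existing core-scheduling group and then propagate it to its own thread group, mirroring the documentation's 'Cookie Transferral' section. The pull uses the per-thread scope, while the push targets the whole thread group:

    #include <sys/types.h>
    #include <sys/prctl.h>
    #include <linux/prctl.h>

    /* 'donor' names a task already carrying the desired cookie (illustrative) */
    static int join_core_group(pid_t donor)
    {
            /* Pull the donor's cookie onto the calling thread ... */
            if (prctl(PR_SCHED_CORE, PR_SCHED_CORE_SHARE_FROM, donor,
                      PR_SCHED_CORE_SCOPE_THREAD, 0))
                    return -1;

            /* ... then push it to every thread in the caller's thread group. */
            return prctl(PR_SCHED_CORE, PR_SCHED_CORE_SHARE_TO, 0,
                         PR_SCHED_CORE_SCOPE_THREAD_GROUP, 0);
    }

Note that pulling a cookie requires PTRACE_MODE_READ_REALCREDS access to the donor task, as documented earlier in this series.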
From: Li Zhijian lizhijian@cn.fujitsu.com
mainline inclusion from mainline-v5.16-rc1 commit 1c36432b278cecf1499f21fae19836e614954309 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5OOWG CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------------------------------------------------
Previously, 'make -C sched run_tests' would block forever when something went wrong, because the *selftests framework* keeps waiting for its child processes to exit.
[root@iaas-rpma sched]# ./cs_prctl_test
## Create a thread/process/process group hiearchy
Not a core sched system
tid=74985, / tgid=74985 / pgid=74985: ffffffffffffffff
Not a core sched system
tid=74986, / tgid=74986 / pgid=74985: ffffffffffffffff
Not a core sched system
tid=74988, / tgid=74986 / pgid=74985: ffffffffffffffff
Not a core sched system
tid=74989, / tgid=74986 / pgid=74985: ffffffffffffffff
Not a core sched system
tid=74990, / tgid=74986 / pgid=74985: ffffffffffffffff
Not a core sched system
tid=74987, / tgid=74987 / pgid=74985: ffffffffffffffff
Not a core sched system
tid=74991, / tgid=74987 / pgid=74985: ffffffffffffffff
Not a core sched system
tid=74992, / tgid=74987 / pgid=74985: ffffffffffffffff
Not a core sched system
tid=74993, / tgid=74987 / pgid=74985: ffffffffffffffff

Not a core sched system
(268) FAILED: get_cs_cookie(0) == 0

## Set a cookie on entire process group
-1 = prctl(62, 1, 0, 2, 0)
core_sched create failed -- PGID: Invalid argument
(cs_prctl_test.c:272) -
[root@iaas-rpma sched]# ps
  PID TTY          TIME CMD
 4605 pts/2    00:00:00 bash
74986 pts/2    00:00:00 cs_prctl_test
74987 pts/2    00:00:00 cs_prctl_test
74999 pts/2    00:00:00 ps
Reported-by: kernel test robot lkp@intel.com Signed-off-by: Li Zhijian lizhijian@cn.fujitsu.com Signed-off-by: Peter Zijlstra (Intel) peterz@infradead.org Reviewed-by: Chris Hyser chris.hyser@oracle.com Link: https://lore.kernel.org/r/20210902024333.75983-1-lizhijian@cn.fujitsu.com Signed-off-by: Lin Shengwang linshengwang1@huawei.com Reviewed-by: lihua hucool.lihua@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- tools/testing/selftests/sched/cs_prctl_test.c | 28 ++++++++++++------- 1 file changed, 18 insertions(+), 10 deletions(-)
diff --git a/tools/testing/selftests/sched/cs_prctl_test.c b/tools/testing/selftests/sched/cs_prctl_test.c index 63fe6521c56d..1829383715c6 100644 --- a/tools/testing/selftests/sched/cs_prctl_test.c +++ b/tools/testing/selftests/sched/cs_prctl_test.c @@ -64,6 +64,17 @@ enum pid_type {PIDTYPE_PID = 0, PIDTYPE_TGID, PIDTYPE_PGID};
const int THREAD_CLONE_FLAGS = CLONE_THREAD | CLONE_SIGHAND | CLONE_FS | CLONE_VM | CLONE_FILES;
+struct child_args { + int num_threads; + int pfd[2]; + int cpid; + int thr_tids[MAX_THREADS]; +}; + +static struct child_args procs[MAX_PROCESSES]; +static int num_processes = 2; +static int need_cleanup = 0; + static int _prctl(int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5) { @@ -80,8 +91,14 @@ static int _prctl(int option, unsigned long arg2, unsigned long arg3, unsigned l #define handle_error(msg) __handle_error(__FILE__, __LINE__, msg) static void __handle_error(char *fn, int ln, char *msg) { + int pidx; printf("(%s:%d) - ", fn, ln); perror(msg); + if (need_cleanup) { + for (pidx = 0; pidx < num_processes; ++pidx) + kill(procs[pidx].cpid, 15); + need_cleanup = 0; + } exit(EXIT_FAILURE); }
@@ -108,13 +125,6 @@ static unsigned long get_cs_cookie(int pid) return cookie; }
-struct child_args { - int num_threads; - int pfd[2]; - int cpid; - int thr_tids[MAX_THREADS]; -}; - static int child_func_thread(void __attribute__((unused))*arg) { while (1) @@ -214,10 +224,7 @@ void _validate(int line, int val, char *msg)
int main(int argc, char *argv[]) { - struct child_args procs[MAX_PROCESSES]; - int keypress = 0; - int num_processes = 2; int num_threads = 3; int delay = 0; int res = 0; @@ -264,6 +271,7 @@ int main(int argc, char *argv[])
printf("\n## Create a thread/process/process group hiearchy\n"); create_processes(num_processes, num_threads, procs); + need_cleanup = 1; disp_processes(num_processes, procs); validate(get_cs_cookie(0) == 0);
From: Shaokun Zhang zhangshaokun@hisilicon.com
mainline inclusion from mainline-v5.16-rc1 commit d07b2eee4501c393cbf5bfcad36143310cfd72f9 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5OOWG CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------------------------------------------------
Make cookie functions static as these are no longer invoked directly by other code.
No functional change intended.
Signed-off-by: Shaokun Zhang zhangshaokun@hisilicon.com Signed-off-by: Peter Zijlstra (Intel) peterz@infradead.org Link: https://lkml.kernel.org/r/20210922085735.52812-1-zhangshaokun@hisilicon.com Signed-off-by: Lin Shengwang linshengwang1@huawei.com Reviewed-by: lihua hucool.lihua@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- kernel/sched/core_sched.c | 9 +++++---- kernel/sched/sched.h | 5 ----- 2 files changed, 5 insertions(+), 9 deletions(-)
diff --git a/kernel/sched/core_sched.c b/kernel/sched/core_sched.c index 20f640949450..517f72b008f5 100644 --- a/kernel/sched/core_sched.c +++ b/kernel/sched/core_sched.c @@ -11,7 +11,7 @@ struct sched_core_cookie { refcount_t refcnt; };
-unsigned long sched_core_alloc_cookie(void) +static unsigned long sched_core_alloc_cookie(void) { struct sched_core_cookie *ck = kmalloc(sizeof(*ck), GFP_KERNEL); if (!ck) @@ -23,7 +23,7 @@ unsigned long sched_core_alloc_cookie(void) return (unsigned long)ck; }
-void sched_core_put_cookie(unsigned long cookie) +static void sched_core_put_cookie(unsigned long cookie) { struct sched_core_cookie *ptr = (void *)cookie;
@@ -33,7 +33,7 @@ void sched_core_put_cookie(unsigned long cookie) } }
-unsigned long sched_core_get_cookie(unsigned long cookie) +static unsigned long sched_core_get_cookie(unsigned long cookie) { struct sched_core_cookie *ptr = (void *)cookie;
@@ -53,7 +53,8 @@ unsigned long sched_core_get_cookie(unsigned long cookie) * * Returns: the old cookie */ -unsigned long sched_core_update_cookie(struct task_struct *p, unsigned long cookie) +static unsigned long sched_core_update_cookie(struct task_struct *p, + unsigned long cookie) { unsigned long old_cookie; struct rq_flags rf; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index e317f02ce2e5..610d126a7f44 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1278,11 +1278,6 @@ extern void sched_core_dequeue(struct rq *rq, struct task_struct *p); extern void sched_core_get(void); extern void sched_core_put(void);
-extern unsigned long sched_core_alloc_cookie(void); -extern void sched_core_put_cookie(unsigned long cookie); -extern unsigned long sched_core_get_cookie(unsigned long cookie); -extern unsigned long sched_core_update_cookie(struct task_struct *p, unsigned long cookie); - #else /* !CONFIG_SCHED_CORE */
static inline bool sched_core_enabled(struct rq *rq)
From: Peter Zijlstra peterz@infradead.org
mainline inclusion from mainline-v5.18-rc2 commit 5b6547ed97f4f5dfc23f8e3970af6d11d7b7ed7e category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5OOWG CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------------------------------------------------
Steve reported that ChromeOS encounters the forceidle balancer being run from rt_mutex_setprio()'s balance_callback() invocation and explodes.
Now, the forceidle balancer gets queued every time the idle task gets selected (via set_next_task()), which is strictly too often. rt_mutex_setprio() also uses set_next_task() in the 'change' pattern:
	queued = task_on_rq_queued(p); /* p->on_rq == TASK_ON_RQ_QUEUED */
	running = task_current(rq, p); /* rq->curr == p */

	if (queued)
		dequeue_task(...);
	if (running)
		put_prev_task(...);

	/* change task properties */

	if (queued)
		enqueue_task(...);
	if (running)
		set_next_task(...);
However, rt_mutex_setprio() will explicitly not run this pattern on the idle task (since priority boosting the idle task is quite insane). Most other 'change' pattern users are pidhash based and would also not apply to idle.
Also, the change pattern doesn't contain a __balance_callback() invocation and hence we could have an out-of-band balance-callback, which *should* trigger the WARN in rq_pin_lock() (which guards against this exact anti-pattern).
So while none of that explains how this happens, it does indicate that having it in set_next_task() might not be the most robust option.
Instead, explicitly queue the forceidle balancer from pick_next_task() when it does indeed result in forceidle selection. Having it here ensures it can only be triggered under the __schedule() rq->lock instance, and hence must be run from that context.
This also happens to clean up the code a little, so win-win.
Fixes: d2dfa17bc7de ("sched: Trivial forced-newidle balancer") Reported-by: Steven Rostedt rostedt@goodmis.org Signed-off-by: Peter Zijlstra (Intel) peterz@infradead.org Tested-by: T.J. Alumbaugh talumbau@chromium.org Link: https://lkml.kernel.org/r/20220330160535.GN8939@worktop.programming.kicks-as... Signed-off-by: Lin Shengwang linshengwang1@huawei.com Reviewed-by: lihua hucool.lihua@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- kernel/sched/core.c | 14 ++++++++++---- kernel/sched/idle.c | 1 - kernel/sched/sched.h | 6 ------ 3 files changed, 10 insertions(+), 11 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 058ad14cc162..1341087361ac 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4810,6 +4810,8 @@ static inline struct task_struct *pick_task(struct rq *rq)
extern void task_vruntime_update(struct rq *rq, struct task_struct *p, bool in_fi);
+static void queue_core_balance(struct rq *rq); + static struct task_struct * pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { @@ -4858,7 +4860,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) }
rq->core_pick = NULL; - return next; + goto out; }
put_prev_task_balance(rq, prev, rf); @@ -4900,7 +4902,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) */ WARN_ON_ONCE(fi_before); task_vruntime_update(rq, next, false); - goto done; + goto out_set_next; } }
@@ -5009,8 +5011,12 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) resched_curr(rq_i); }
-done: +out_set_next: set_next_task(rq, next); +out: + if (rq->core->core_forceidle && next == rq->idle) + queue_core_balance(rq); + return next; }
@@ -5107,7 +5113,7 @@ static void sched_core_balance(struct rq *rq)
static DEFINE_PER_CPU(struct callback_head, core_balance_head);
-void queue_core_balance(struct rq *rq) +static void queue_core_balance(struct rq *rq) { if (!sched_core_enabled(rq)) return; diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 58bcb9517dfe..3c6396d61a04 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -430,7 +430,6 @@ static void set_next_task_idle(struct rq *rq, struct task_struct *next, bool fir { update_idle_core(rq); schedstat_inc(rq->sched_goidle); - queue_core_balance(rq); }
#ifdef CONFIG_SMP diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 610d126a7f44..4f057b00be4c 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1265,8 +1265,6 @@ static inline bool sched_group_cookie_match(struct rq *rq, return false; }
-extern void queue_core_balance(struct rq *rq); - static inline bool sched_core_enqueued(struct task_struct *p) { return !RB_EMPTY_NODE(&p->core_node); @@ -1300,10 +1298,6 @@ static inline raw_spinlock_t *__rq_lockp(struct rq *rq) return &rq->__lock; }
-static inline void queue_core_balance(struct rq *rq) -{ -} - static inline bool sched_cpu_cookie_match(struct rq *rq, struct task_struct *p) { return true;
From: Sebastian Andrzej Siewior bigeasy@linutronix.de
mainline inclusion from mainline-v5.18-rc2 commit 386ef214c3c6ab111d05e1790e79475363abaa05 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5OOWG CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------------------------------------------------
try_steal_cookie() looks at task_struct::cpus_mask to decide if the task could be moved to `this' CPU. It ignores that the task might be in a migration-disabled section while not on the CPU. In this case the task must not be moved, otherwise per-CPU assumptions are broken.
Use is_cpu_allowed(), as suggested by Peter Zijlstra, to decide if a task can be moved.
Fixes: d2dfa17bc7de6 ("sched: Trivial forced-newidle balancer") Signed-off-by: Sebastian Andrzej Siewior bigeasy@linutronix.de Signed-off-by: Peter Zijlstra (Intel) peterz@infradead.org Link: https://lkml.kernel.org/r/YjNK9El+3fzGmswf@linutronix.de Signed-off-by: Lin Shengwang linshengwang1@huawei.com Reviewed-by: lihua hucool.lihua@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 1341087361ac..b310748225ed 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5045,7 +5045,7 @@ static bool try_steal_cookie(int this, int that) if (p == src->core_pick || p == src->curr) goto next;
- if (!cpumask_test_cpu(this, &p->cpus_mask)) + if (!is_cpu_allowed(p, this)) goto next;
if (p->core_occupation > dst->idle->core_occupation)
From: Phil Auld pauld@redhat.com
mainline inclusion from mainline-v5.18-rc2 commit 5524cbb1bfcdff0cad0aaa9f94e6092002a07259 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5OOWG CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------------------------------------------------
Arm64 systems rely on store_cpu_topology() to call update_siblings_masks() to transfer the topology to the various cpu masks. This needs to be done before the call to notify_cpu_starting(), which tells the scheduler about each cpu found; otherwise the core scheduling data structures are set up in a way that does not match the actual topology.
With smt_mask not set up correctly we bail on `cpumask_weight(smt_mask) == 1` for !leaders in:

  notify_cpu_starting()
    cpuhp_invoke_callback_range()
      sched_cpu_starting()
        sched_core_cpu_starting()
which leads to rq->core not being correctly set for !leader-rq's.
Without this change stress-ng (which enables core scheduling in its prctl tests in newer versions -- i.e. with PR_SCHED_CORE support) causes a warning and then a crash (trimmed for legibility):
[ 1853.805168] ------------[ cut here ]------------
[ 1853.809784] task_rq(b)->core != rq->core
[ 1853.809792] WARNING: CPU: 117 PID: 0 at kernel/sched/fair.c:11102 cfs_prio_less+0x1b4/0x1c4
...
[ 1854.015210] Unable to handle kernel NULL pointer dereference at virtual address 0000000000000010
...
[ 1854.231256] Call trace:
[ 1854.233689]  pick_next_task+0x3dc/0x81c
[ 1854.237512]  __schedule+0x10c/0x4cc
[ 1854.240988]  schedule_idle+0x34/0x54
Fixes: 9edeaea1bc45 ("sched: Core-wide rq->lock") Signed-off-by: Phil Auld pauld@redhat.com Reviewed-by: Dietmar Eggemann dietmar.eggemann@arm.com Tested-by: Dietmar Eggemann dietmar.eggemann@arm.com Link: https://lore.kernel.org/r/20220331153926.25742-1-pauld@redhat.com Signed-off-by: Will Deacon will@kernel.org Signed-off-by: Lin Shengwang linshengwang1@huawei.com Reviewed-by: lihua hucool.lihua@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/arm64/kernel/smp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index dd4c76ed8ca6..08328ec025a8 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -250,6 +250,7 @@ asmlinkage notrace void secondary_start_kernel(void) * Log the CPU info before it is marked online and might get read. */ cpuinfo_store_cpu(); + store_cpu_topology(cpu);
/* * Enable GIC and timers. @@ -258,7 +259,6 @@ asmlinkage notrace void secondary_start_kernel(void)
ipi_setup(cpu);
- store_cpu_topology(cpu); numa_add_cpu(cpu);
/*
From: Hao Jia jiahao.os@bytedance.com
mainline inclusion from mainline-v5.19-rc1 commit 2679a83731d51a744657f718fc02c3b077e47562 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5OOWG CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------------------------------------------------
When we use raw_spin_rq_lock() to acquire the rq lock and have to update the rq clock while holding the lock, the kernel may issue a WARN_DOUBLE_CLOCK warning.
Since we directly use raw_spin_rq_lock() to acquire the rq lock instead of rq_lock(), there is no corresponding update to rq->clock_update_flags. In particular, when we have obtained the rq lock of another CPU, that rq's clock_update_flags may already be RQCF_UPDATED, and calling update_rq_clock() then triggers the WARN_DOUBLE_CLOCK warning.
So we need to clear RQCF_UPDATED of rq->clock_update_flags to avoid the WARN_DOUBLE_CLOCK warning.
For the sched_rt_period_timer() and migrate_task_rq_dl() cases we simply replace raw_spin_rq_lock()/raw_spin_rq_unlock() with rq_lock()/rq_unlock().
For the {pull,push}_{rt,dl}_task() cases, we add the double_rq_clock_clear_update() function to clear RQCF_UPDATED of rq->clock_update_flags, and call double_rq_clock_clear_update() before double_lock_balance()/double_rq_lock() returns to avoid the WARN_DOUBLE_CLOCK warning.
Some call trace reports:

Call Trace 1:
 <IRQ>
 sched_rt_period_timer+0x10f/0x3a0
 ? enqueue_top_rt_rq+0x110/0x110
 __hrtimer_run_queues+0x1a9/0x490
 hrtimer_interrupt+0x10b/0x240
 __sysvec_apic_timer_interrupt+0x8a/0x250
 sysvec_apic_timer_interrupt+0x9a/0xd0
 </IRQ>
 <TASK>
 asm_sysvec_apic_timer_interrupt+0x12/0x20

Call Trace 2:
 <TASK>
 activate_task+0x8b/0x110
 push_rt_task.part.108+0x241/0x2c0
 push_rt_tasks+0x15/0x30
 finish_task_switch+0xaa/0x2e0
 ? __switch_to+0x134/0x420
 __schedule+0x343/0x8e0
 ? hrtimer_start_range_ns+0x101/0x340
 schedule+0x4e/0xb0
 do_nanosleep+0x8e/0x160
 hrtimer_nanosleep+0x89/0x120
 ? hrtimer_init_sleeper+0x90/0x90
 __x64_sys_nanosleep+0x96/0xd0
 do_syscall_64+0x34/0x90
 entry_SYSCALL_64_after_hwframe+0x44/0xae

Call Trace 3:
 <TASK>
 deactivate_task+0x93/0xe0
 pull_rt_task+0x33e/0x400
 balance_rt+0x7e/0x90
 __schedule+0x62f/0x8e0
 do_task_dead+0x3f/0x50
 do_exit+0x7b8/0xbb0
 do_group_exit+0x2d/0x90
 get_signal+0x9df/0x9e0
 ? preempt_count_add+0x56/0xa0
 ? __remove_hrtimer+0x35/0x70
 arch_do_signal_or_restart+0x36/0x720
 ? nanosleep_copyout+0x39/0x50
 ? do_nanosleep+0x131/0x160
 ? audit_filter_inodes+0xf5/0x120
 exit_to_user_mode_prepare+0x10f/0x1e0
 syscall_exit_to_user_mode+0x17/0x30
 do_syscall_64+0x40/0x90
 entry_SYSCALL_64_after_hwframe+0x44/0xae

Call Trace 4:
 update_rq_clock+0x128/0x1a0
 migrate_task_rq_dl+0xec/0x310
 set_task_cpu+0x84/0x1e4
 try_to_wake_up+0x1d8/0x5c0
 wake_up_process+0x1c/0x30
 hrtimer_wakeup+0x24/0x3c
 __hrtimer_run_queues+0x114/0x270
 hrtimer_interrupt+0xe8/0x244
 arch_timer_handler_phys+0x30/0x50
 handle_percpu_devid_irq+0x88/0x140
 generic_handle_domain_irq+0x40/0x60
 gic_handle_irq+0x48/0xe0
 call_on_irq_stack+0x2c/0x60
 do_interrupt_handler+0x80/0x84
Steps to reproduce:
1. Enable CONFIG_SCHED_DEBUG when compiling the kernel
2. echo 1 > /sys/kernel/debug/clear_warn_once
   echo "WARN_DOUBLE_CLOCK" > /sys/kernel/debug/sched/features
   echo "NO_RT_PUSH_IPI" > /sys/kernel/debug/sched/features
3. Run some rt/dl tasks that periodically work and sleep, e.g.
   Create 2*n rt or dl (90% running) tasks via rt-app (on a system with n CPUs),
   and Dietmar Eggemann reports Call Trace 4 when running on PREEMPT_RT kernel.
Signed-off-by: Hao Jia jiahao.os@bytedance.com Signed-off-by: Peter Zijlstra (Intel) peterz@infradead.org Reviewed-by: Dietmar Eggemann dietmar.eggemann@arm.com Link: https://lore.kernel.org/r/20220430085843.62939-2-jiahao.os@bytedance.com Signed-off-by: Lin Shengwang linshengwang1@huawei.com Reviewed-by: lihua hucool.lihua@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- kernel/sched/core.c | 6 +++--- kernel/sched/deadline.c | 5 +++-- kernel/sched/rt.c | 5 +++-- kernel/sched/sched.h | 28 ++++++++++++++++++++++++---- 4 files changed, 33 insertions(+), 11 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index b310748225ed..c936c042297b 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -521,10 +521,10 @@ void double_rq_lock(struct rq *rq1, struct rq *rq2)
 		swap(rq1, rq2);
 
 	raw_spin_rq_lock(rq1);
-	if (__rq_lockp(rq1) == __rq_lockp(rq2))
-		return;
+	if (__rq_lockp(rq1) != __rq_lockp(rq2))
+		raw_spin_rq_lock_nested(rq2, SINGLE_DEPTH_NESTING);
 
-	raw_spin_rq_lock_nested(rq2, SINGLE_DEPTH_NESTING);
+	double_rq_clock_clear_update(rq1, rq2);
 }
 #endif
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 0a4d217097ae..cb487d7d33e1 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1722,6 +1722,7 @@ select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags)
 
 static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused)
 {
+	struct rq_flags rf;
 	struct rq *rq;
 
 	if (p->state != TASK_WAKING)
@@ -1733,7 +1734,7 @@ static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused
 	 * from try_to_wake_up(). Hence, p->pi_lock is locked, but
 	 * rq->lock is not... So, lock it
 	 */
-	raw_spin_rq_lock(rq);
+	rq_lock(rq, &rf);
 	if (p->dl.dl_non_contending) {
 		update_rq_clock(rq);
 		sub_running_bw(&p->dl, &rq->dl);
@@ -1749,7 +1750,7 @@ static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused
 		put_task_struct(p);
 	}
 	sub_rq_bw(&p->dl, &rq->dl);
-	raw_spin_rq_unlock(rq);
+	rq_unlock(rq, &rf);
 }
 static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 11df3f66af90..5dbf51ebdaed 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -876,6 +876,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
 		int enqueue = 0;
 		struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
 		struct rq *rq = rq_of_rt_rq(rt_rq);
+		struct rq_flags rf;
 		int skip;
 
 		/*
@@ -890,7 +891,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
 		if (skip)
 			continue;
 
-		raw_spin_rq_lock(rq);
+		rq_lock(rq, &rf);
 		update_rq_clock(rq);
 
 		if (rt_rq->rt_time) {
@@ -928,7 +929,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
 
 		if (enqueue)
 			sched_rt_rq_enqueue(rt_rq);
-		raw_spin_rq_unlock(rq);
+		rq_unlock(rq, &rf);
 	}
 	if (!throttled && (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF))
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 4f057b00be4c..1a575df63f71 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2416,6 +2416,24 @@ unsigned long arch_scale_freq_capacity(int cpu)
 }
 #endif
 
+#ifdef CONFIG_SCHED_DEBUG
+/*
+ * In double_lock_balance()/double_rq_lock(), we use raw_spin_rq_lock() to
+ * acquire rq lock instead of rq_lock(). So at the end of these two functions
+ * we need to call double_rq_clock_clear_update() to clear RQCF_UPDATED of
+ * rq->clock_update_flags to avoid the WARN_DOUBLE_CLOCK warning.
+ */
+static inline void double_rq_clock_clear_update(struct rq *rq1, struct rq *rq2)
+{
+	rq1->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP);
+	/* rq1 == rq2 for !CONFIG_SMP, so just clear RQCF_UPDATED once. */
+#ifdef CONFIG_SMP
+	rq2->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP);
+#endif
+}
+#else
+static inline void double_rq_clock_clear_update(struct rq *rq1, struct rq *rq2) {}
+#endif
 
 #ifdef CONFIG_SMP
@@ -2481,14 +2499,15 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
 	__acquires(busiest->lock)
 	__acquires(this_rq->lock)
 {
-	if (__rq_lockp(this_rq) == __rq_lockp(busiest))
-		return 0;
-
-	if (likely(raw_spin_rq_trylock(busiest)))
+	if (__rq_lockp(this_rq) == __rq_lockp(busiest) ||
+	    likely(raw_spin_rq_trylock(busiest))) {
+		double_rq_clock_clear_update(this_rq, busiest);
 		return 0;
+	}
 
 	if (rq_order_less(this_rq, busiest)) {
 		raw_spin_rq_lock_nested(busiest, SINGLE_DEPTH_NESTING);
+		double_rq_clock_clear_update(this_rq, busiest);
 		return 0;
 	}
 
@@ -2582,6 +2601,7 @@ static inline void double_rq_lock(struct rq *rq1, struct rq *rq2)
 	BUG_ON(rq1 != rq2);
 	raw_spin_rq_lock(rq1);
 	__acquire(rq2->lock);	/* Fake it out ;) */
+	double_rq_clock_clear_update(rq1, rq2);
 }
/*
From: Cruz Zhao CruzZhao@linux.alibaba.com
mainline inclusion from mainline-v6.0-rc1 commit 91caa5ae242465c3ab9fd473e50170faa7e944f4 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5OOWG CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------------------------------------------------
In function sched_core_update_cookie(), a task is enqueued into the core tree only if it was enqueued there before; that is, if an uncookied task is given a cookie, it will not be enqueued into the core tree until it is enqueued again, which results in unnecessary forced idle.
Here follows the scenario: CPU x and CPU y are a pair of SMT siblings.
1. Start task a running on CPU x without sleeping, and task b and task c running on CPU y without sleeping.
2. Create a cookie and share it with task a and task b, then create another cookie and share it with task c.
3. Sample core_forceidle_sum of task a and b from /proc/PID/sched.
We find that core_forceidle_sum of task a takes about 30% of the sampling period, which shouldn't happen as task a and task b have the same cookie.
Then we migrate task a to CPU x' and tasks b and c to CPU y', where CPU x' and CPU y' are a pair of SMT siblings. Sampling again, we find that core_forceidle_sum of tasks a and b is almost zero.
To solve this problem, enqueue the task into the core tree if it is queued on the rq.
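For reference, the distinction the fix relies on, roughly as the two helpers read in kernel/sched/sched.h (trimmed sketch; exact code may differ by kernel version):

/* "has the task already been put into the per-rq core tree?" */
static inline bool sched_core_enqueued(struct task_struct *p)
{
	return !RB_EMPTY_NODE(&p->core_node);
}

/* "is the task currently runnable on its rq?" */
static inline int task_on_rq_queued(struct task_struct *p)
{
	return p->on_rq == TASK_ON_RQ_QUEUED;
}

An uncookied task is runnable but not in the core tree, so keying the re-enqueue on task_on_rq_queued() (plus a non-zero new cookie) instead of the old sched_core_enqueued() state lets a freshly cookied, already-running task join the core tree immediately.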
Fixes: 6e33cad0af49("sched: Trivial core scheduling cookie management") Signed-off-by: Cruz Zhao CruzZhao@linux.alibaba.com Signed-off-by: Peter Zijlstra (Intel) peterz@infradead.org Link: https://lkml.kernel.org/r/1656403045-100840-2-git-send-email-CruzZhao@linux.... Conflicts: kernel/sched/core_sched.c [Feature 4feee7d12603d("sched/core: Forced idle accounting") is not applied.] Signed-off-by: Lin Shengwang linshengwang1@huawei.com Reviewed-by: lihua hucool.lihua@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- kernel/sched/core_sched.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-)
diff --git a/kernel/sched/core_sched.c b/kernel/sched/core_sched.c
index 517f72b008f5..447e708e22d2 100644
--- a/kernel/sched/core_sched.c
+++ b/kernel/sched/core_sched.c
@@ -59,7 +59,6 @@ static unsigned long sched_core_update_cookie(struct task_struct *p,
 	unsigned long old_cookie;
 	struct rq_flags rf;
 	struct rq *rq;
-	bool enqueued;
 
 	rq = task_rq_lock(p, &rf);
 
@@ -71,14 +70,16 @@ static unsigned long sched_core_update_cookie(struct task_struct *p,
 	 */
 	SCHED_WARN_ON((p->core_cookie || cookie) && !sched_core_enabled(rq));
 
-	enqueued = sched_core_enqueued(p);
-	if (enqueued)
+	if (sched_core_enqueued(p))
 		sched_core_dequeue(rq, p);
 
 	old_cookie = p->core_cookie;
 	p->core_cookie = cookie;
 
-	if (enqueued)
+	/*
+	 * Consider the cases: !prev_cookie and !cookie.
+	 */
+	if (cookie && task_on_rq_queued(p))
 		sched_core_enqueue(rq, p);
/*
From: Lin Shengwang linshengwang1@huawei.com
hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5OOWG CVE: NA
--------------------------------------------------------------------------
Due to the conflict between Core Scheduling and the SMT expeller, SCHED_CORE and QOS_SCHED_SMT_EXPELLER are made mutually exclusive.
Signed-off-by: Lin Shengwang linshengwang1@huawei.com Reviewed-by: lihua hucool.lihua@huawei.com Reviewed-by: Chen Hui judy.chenhui@huawei.com
Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- kernel/Kconfig.preempt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index 5876e30c5740..e62a623031ea 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -102,7 +102,7 @@ config PREEMPT_DYNAMIC
 
 config SCHED_CORE
 	bool "Core Scheduling for SMT"
-	depends on SCHED_SMT
+	depends on SCHED_SMT && !QOS_SCHED_SMT_EXPELLER
 	help
 	  This option permits Core Scheduling, a means of coordinated task
 	  selection across SMT siblings. When enabled -- see
From: Lin Shengwang linshengwang1@huawei.com
hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5OOWG CVE: NA
--------------------------------------------------------------------------
Signed-off-by: Lin Shengwang linshengwang1@huawei.com Reviewed-by: Chen Hui judy.chenhui@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/sched.h | 15 ++++++++------- kernel/sched/sched.h | 22 ++++++++++++---------- 2 files changed, 20 insertions(+), 17 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index ea82fb089cd1..d8c974338ce2 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -729,17 +729,11 @@ struct task_struct {
 	const struct sched_class	*sched_class;
 	struct sched_entity		se;
 	struct sched_rt_entity		rt;
-	struct sched_dl_entity		dl;
-
-#ifdef CONFIG_SCHED_CORE
-	struct rb_node			core_node;
-	unsigned long			core_cookie;
-	unsigned int			core_occupation;
-#endif
 
 #ifdef CONFIG_CGROUP_SCHED
 	struct task_group		*sched_task_group;
 #endif
+	struct sched_dl_entity		dl;
 
 #ifdef CONFIG_UCLAMP_TASK
 	/*
@@ -1404,11 +1398,18 @@ struct task_struct {
 	 */
 	randomized_struct_fields_end
 
+#if defined CONFIG_SCHED_CORE && !defined(__GENKSYMS__)
+	struct rb_node			core_node;
+	unsigned long			core_cookie;
+	unsigned int			core_occupation;
+	KABI_FILL_HOLE(unsigned int kabi_hole)
+#else
 	KABI_RESERVE(1)
 	KABI_RESERVE(2)
 	KABI_RESERVE(3)
 	KABI_RESERVE(4)
 	KABI_RESERVE(5)
+#endif
 	KABI_RESERVE(6)
 	KABI_RESERVE(7)
 	KABI_RESERVE(8)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 1a575df63f71..3bd6c988652a 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -553,11 +553,6 @@ struct cfs_rq {
 
 	u64			exec_clock;
 	u64			min_vruntime;
-#ifdef CONFIG_SCHED_CORE
-	unsigned int		forceidle_seq;
-	u64			min_vruntime_fi;
-#endif
-
 #ifndef CONFIG_64BIT
 	u64			min_vruntime_copy;
 #endif
@@ -644,8 +639,14 @@ struct cfs_rq {
 	KABI_RESERVE(1)
 	KABI_RESERVE(2)
 #endif
+#if defined CONFIG_SCHED_CORE && !defined(__GENKSYMS__)
+	unsigned int		forceidle_seq;
+	KABI_FILL_HOLE(unsigned int kabi_hole)
+	u64			min_vruntime_fi;
+#else
 	KABI_RESERVE(3)
 	KABI_RESERVE(4)
+#endif
 };
 static inline int rt_bandwidth_enabled(void)
@@ -942,7 +943,7 @@ DECLARE_STATIC_KEY_FALSE(sched_uclamp_used);
  */
 struct rq {
 	/* runqueue lock: */
-	raw_spinlock_t		__lock;
+	raw_spinlock_t		KABI_RENAME(lock, __lock);
 
 	/*
 	 * nr_running and cpu_load should be in the same cacheline because
@@ -1112,7 +1113,7 @@ struct rq {
 	struct cpuidle_state	*idle_state;
 #endif
 
-#ifdef CONFIG_SCHED_CORE
+#if defined(CONFIG_SCHED_CORE) && !defined(__GENKSYMS__)
 	/* per rq */
 	struct rq		*core;
 	struct task_struct	*core_pick;
@@ -2135,9 +2136,6 @@ struct sched_class {
 #ifdef CONFIG_SMP
 	int (*balance)(struct rq *rq, struct task_struct *prev, struct rq_flags *rf);
 	int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);
-
-	struct task_struct * (*pick_task)(struct rq *rq);
-
 	void (*migrate_task_rq)(struct task_struct *p, int new_cpu);
 
 	void (*task_woken)(struct rq *this_rq, struct task_struct *task);
@@ -2175,7 +2173,11 @@
 	void (*task_change_group)(struct task_struct *p, int type);
 #endif
 
+#if !defined(__GENKSYMS__) && defined(CONFIG_SMP)
+	struct task_struct * (*pick_task)(struct rq *rq);
+#else
 	KABI_RESERVE(1)
+#endif
 	KABI_RESERVE(2)
 } __aligned(STRUCT_ALIGNMENT); /* STRUCT_ALIGN(), vmlinux.lds.h */