sched: steal tasks to improve CPU utilization
Cheng Jian (3):
  disable stealing by default
  sched/fair: introduce SCHED_STEAL
  config: enable CONFIG_SCHED_STEAL by default
Steve Sistare (10):
  sched: Provide sparsemask, a reduced contention bitmap
  sched/topology: Provide hooks to allocate data shared per LLC
  sched/topology: Provide cfs_overload_cpus bitmap
  sched/fair: Dynamically update cfs_overload_cpus
  sched/fair: Hoist idle_stamp up from idle_balance
  sched/fair: Generalize the detach_task interface
  sched/fair: Provide can_migrate_task_llc
  sched/fair: Steal work from an overloaded CPU when CPU goes idle
  sched/fair: disable stealing if too many NUMA nodes
  sched/fair: Provide idle search schedstats
 arch/arm64/configs/openeuler_defconfig |   1 +
 arch/x86/configs/openeuler_defconfig   |   1 +
 include/linux/sched/topology.h         |   3 +
 init/Kconfig                           |  15 ++
 kernel/sched/core.c                    |  35 ++-
 kernel/sched/fair.c                    | 360 +++++++++++++++++++++++--
 kernel/sched/features.h                |   8 +
 kernel/sched/sched.h                   |  21 ++
 kernel/sched/sparsemask.h              | 210 +++++++++++++++
 kernel/sched/stats.c                   |  15 ++
 kernel/sched/stats.h                   |  20 ++
 kernel/sched/topology.c                | 141 +++++++++-
 12 files changed, 804 insertions(+), 26 deletions(-)
 create mode 100644 kernel/sched/sparsemask.h
From: Steve Sistare <steven.sistare@oracle.com>
maillist inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I8PIYZ
CVE: NA
Reference: https://lore.kernel.org/lkml/1541767840-93588-2-git-send-email-steven.sistar...
---------------------------
Provide struct sparsemask and functions to manipulate it. A sparsemask is a sparse bitmap. It reduces cache contention vs the usual bitmap when many threads concurrently set, clear, and visit elements, by reducing the number of significant bits per cacheline. For each cacheline chunk of the mask, only the first K bits of the first word are used, and the remaining bits are ignored, where K is a creation time parameter. Thus a sparsemask that can represent a set of N elements is approximately (N/K * CACHELINE) bytes in size.
This type is simpler and more efficient than the struct sbitmap used by block drivers.
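As a quick illustration of the API added below, here is a minimal usage sketch (illustrative only; the density value 3 and the allocation flags are example choices, not requirements of this patch):

    #include "sparsemask.h"

    static struct sparsemask *mask;

    /* 2^3 = 8 significant bits per cacheline chunk, placed on @node. */
    static int example_init(int node)
    {
            mask = sparsemask_alloc_node(nr_cpu_ids, 3,
                                         GFP_KERNEL | __GFP_ZERO, node);
            return mask ? 0 : -ENOMEM;
    }

    static void example_use(int this_cpu)
    {
            int cpu;

            sparsemask_set_elem(mask, this_cpu);

            /* Visit set elements, starting at this_cpu and wrapping. */
            sparsemask_for_each(mask, this_cpu, cpu)
                    pr_info("element %d is set\n", cpu);

            sparsemask_clear_elem(mask, this_cpu);
            sparsemask_free(mask);
    }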
Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
Signed-off-by: Cheng Yu <serein.chengyu@huawei.com>
---
 kernel/sched/sparsemask.h | 210 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 210 insertions(+)
 create mode 100644 kernel/sched/sparsemask.h
diff --git a/kernel/sched/sparsemask.h b/kernel/sched/sparsemask.h new file mode 100644 index 000000000000..11948620a1a2 --- /dev/null +++ b/kernel/sched/sparsemask.h @@ -0,0 +1,210 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * sparsemask.h - sparse bitmap operations + * + * Copyright (c) 2018 Oracle Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#ifndef __LINUX_SPARSEMASK_H +#define __LINUX_SPARSEMASK_H + +#include <linux/kernel.h> +#include <linux/bitmap.h> +#include <linux/bug.h> + +/* + * A sparsemask is a sparse bitmap. It reduces cache contention vs the usual + * bitmap when many threads concurrently set, clear, and visit elements. For + * each cacheline chunk of the mask, only the first K bits of the first word are + * used, and the remaining bits are ignored, where K is a creation time + * parameter. Thus a sparsemask that can represent a set of N elements is + * approximately (N/K * CACHELINE) bytes in size. + * + * Clients pass and receive element numbers in the public API, and the + * implementation translates them to bit numbers to perform the bitmap + * operations. + */ + +struct sparsemask_chunk { + unsigned long word; /* the significant bits */ +} ____cacheline_aligned_in_smp; + +struct sparsemask { + short nelems; /* current number of elements */ + short density; /* store 2^density elements per chunk */ + struct sparsemask_chunk chunks[0]; /* embedded array of chunks */ +}; + +#define _SMASK_INDEX(density, elem) ((elem) >> (density)) +#define _SMASK_BIT(density, elem) ((elem) & ((1U << (density)) - 1U)) +#define SMASK_INDEX(mask, elem) _SMASK_INDEX((mask)->density, elem) +#define SMASK_BIT(mask, elem) _SMASK_BIT((mask)->density, elem) +#define SMASK_WORD(mask, elem) \ + (&(mask)->chunks[SMASK_INDEX((mask), (elem))].word) + +/* + * sparsemask_next() - Return the next one bit in a bitmap, starting at a + * specified position and wrapping from the last bit to the first, up to but + * not including a specified origin. This is a helper, so do not call it + * directly. + * + * @mask: Bitmap to search. + * @origin: Origin. + * @prev: Previous bit. Start search after this bit number. + * If -1, start search at @origin. + * + * Return: the bit number, else mask->nelems if no bits are set in the range. + */ +static inline int +sparsemask_next(const struct sparsemask *mask, int origin, int prev) +{ + int density = mask->density; + int bits_per_word = 1U << density; + const struct sparsemask_chunk *chunk; + int nelems = mask->nelems; + int next, bit, nbits; + unsigned long word; + + /* Calculate number of bits to be searched. */ + if (prev == -1) { + nbits = nelems; + next = origin; + } else if (prev < origin) { + nbits = origin - prev; + next = prev + 1; + } else { + nbits = nelems - prev + origin - 1; + next = prev + 1; + } + + if (unlikely(next >= nelems)) + return nelems; + + /* + * Fetch and adjust first word. Clear word bits below @next, and round + * @next down to @bits_per_word boundary because later ffs will add + * those bits back. 
+ */ + chunk = &mask->chunks[_SMASK_INDEX(density, next)]; + bit = _SMASK_BIT(density, next); + word = chunk->word & (~0UL << bit); + next -= bit; + nbits += bit; + + while (!word) { + next += bits_per_word; + nbits -= bits_per_word; + if (nbits <= 0) + return nelems; + + if (next >= nelems) { + chunk = mask->chunks; + nbits -= (next - nelems); + next = 0; + } else { + chunk++; + } + word = chunk->word; + } + + next += __ffs(word); + if (next >= origin && prev != -1) + return nelems; + return next; +} + +/****************** The public API ********************/ + +/* + * Max value for the density parameter, limited by 64 bits in the chunk word. + */ +#define SMASK_DENSITY_MAX 6 + +/* + * Return bytes to allocate for a sparsemask, for custom allocators. + */ +static inline size_t sparsemask_size(int nelems, int density) +{ + int index = _SMASK_INDEX(density, nelems) + 1; + + return offsetof(struct sparsemask, chunks[index]); +} + +/* + * Initialize an allocated sparsemask, for custom allocators. + */ +static inline void +sparsemask_init(struct sparsemask *mask, int nelems, int density) +{ + WARN_ON(density < 0 || density > SMASK_DENSITY_MAX || nelems < 0); + mask->nelems = nelems; + mask->density = density; +} + +/* + * sparsemask_alloc_node() - Allocate, initialize, and return a sparsemask. + * + * @nelems - maximum number of elements. + * @density - store 2^density elements per cacheline chunk. + * values from 0 to SMASK_DENSITY_MAX inclusive. + * @flags - kmalloc allocation flags + * @node - numa node + */ +static inline struct sparsemask * +sparsemask_alloc_node(int nelems, int density, gfp_t flags, int node) +{ + int nbytes = sparsemask_size(nelems, density); + struct sparsemask *mask = kmalloc_node(nbytes, flags, node); + + if (mask) + sparsemask_init(mask, nelems, density); + return mask; +} + +static inline void sparsemask_free(struct sparsemask *mask) +{ + kfree(mask); +} + +static inline void sparsemask_set_elem(struct sparsemask *dst, int elem) +{ + set_bit(SMASK_BIT(dst, elem), SMASK_WORD(dst, elem)); +} + +static inline void sparsemask_clear_elem(struct sparsemask *dst, int elem) +{ + clear_bit(SMASK_BIT(dst, elem), SMASK_WORD(dst, elem)); +} + +static inline int sparsemask_test_elem(const struct sparsemask *mask, int elem) +{ + return test_bit(SMASK_BIT(mask, elem), SMASK_WORD(mask, elem)); +} + +/* + * sparsemask_for_each() - iterate over each set bit in a bitmap, starting at a + * specified position, and wrapping from the last bit to the first. + * + * @mask: Bitmap to iterate over. + * @origin: Bit number at which to start searching. + * @elem: Iterator. Can be signed or unsigned integer. + * + * The implementation does not assume any bit in @mask is set, including + * @origin. After the loop, @elem = @mask->nelems. + */ +#define sparsemask_for_each(mask, origin, elem) \ + for ((elem) = -1; \ + (elem) = sparsemask_next((mask), (origin), (elem)), \ + (elem) < (mask)->nelems;) + +#endif /* __LINUX_SPARSEMASK_H */
From: Steve Sistare <steven.sistare@oracle.com>
maillist inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I8PIYZ
CVE: NA
Reference: https://lore.kernel.org/lkml/1541767840-93588-3-git-send-email-steven.sistar...
---------------------------
Add functions sd_llc_alloc_all() and sd_llc_free_all() to allocate and free data pointed to by struct sched_domain_shared at the last-level-cache domain. sd_llc_alloc_all() is called after the SD hierarchy is known, to eliminate the unnecessary allocations that would occur if we instead allocated in __sdt_alloc() and then figured out which shared nodes are redundant.
Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
Signed-off-by: Cheng Yu <serein.chengyu@huawei.com>
---
 kernel/sched/topology.c | 75 ++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 74 insertions(+), 1 deletion(-)
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 423d08947962..3346362db697 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -11,6 +11,12 @@ DEFINE_MUTEX(sched_domains_mutex); static cpumask_var_t sched_domains_tmpmask; static cpumask_var_t sched_domains_tmpmask2;
+struct s_data; +static int sd_llc_alloc(struct sched_domain *sd); +static void sd_llc_free(struct sched_domain *sd); +static int sd_llc_alloc_all(const struct cpumask *cpu_map, struct s_data *d); +static void sd_llc_free_all(const struct cpumask *cpu_map); + #ifdef CONFIG_SCHED_DEBUG
static int __init sched_debug_setup(char *str) @@ -632,8 +638,10 @@ static void destroy_sched_domain(struct sched_domain *sd) */ free_sched_groups(sd->groups, 1);
- if (sd->shared && atomic_dec_and_test(&sd->shared->ref)) + if (sd->shared && atomic_dec_and_test(&sd->shared->ref)) { + sd_llc_free(sd); kfree(sd->shared); + } kfree(sd); }
@@ -1473,6 +1481,7 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what, free_percpu(d->sd); fallthrough; case sa_sd_storage: + sd_llc_free_all(cpu_map); __sdt_free(cpu_map); fallthrough; case sa_none: @@ -2287,6 +2296,62 @@ static void __sdt_free(const struct cpumask *cpu_map) } }
+static int sd_llc_alloc(struct sched_domain *sd) +{ + /* Allocate sd->shared data here. Empty for now. */ + + return 0; +} + +static void sd_llc_free(struct sched_domain *sd) +{ + struct sched_domain_shared *sds = sd->shared; + + if (!sds) + return; + + /* Free data here. Empty for now. */ +} + +static int sd_llc_alloc_all(const struct cpumask *cpu_map, struct s_data *d) +{ + struct sched_domain *sd, *hsd; + int i; + + for_each_cpu(i, cpu_map) { + /* Find highest domain that shares resources */ + hsd = NULL; + for (sd = *per_cpu_ptr(d->sd, i); sd; sd = sd->parent) { + if (!(sd->flags & SD_SHARE_PKG_RESOURCES)) + break; + hsd = sd; + } + if (hsd && sd_llc_alloc(hsd)) + return 1; + } + + return 0; +} + +static void sd_llc_free_all(const struct cpumask *cpu_map) +{ + struct sched_domain_topology_level *tl; + struct sched_domain *sd; + struct sd_data *sdd; + int j; + + for_each_sd_topology(tl) { + sdd = &tl->data; + if (!sdd || !sdd->sd) + continue; + for_each_cpu(j, cpu_map) { + sd = *per_cpu_ptr(sdd->sd, j); + if (sd) + sd_llc_free(sd); + } + } +} + static struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, const struct cpumask *cpu_map, struct sched_domain_attr *attr, struct sched_domain *child, int cpu) @@ -2480,6 +2545,14 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att } }
+ /* + * Allocate shared sd data at last level cache. Must be done after + * domains are built above, but before the data is used in + * cpu_attach_domain and descendants below. + */ + if (sd_llc_alloc_all(cpu_map, &d)) + goto error; + /* Attach the domains */ rcu_read_lock(); for_each_cpu(i, cpu_map) {
From: Steve Sistare <steven.sistare@oracle.com>
maillist inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I8PIYZ
CVE: NA
Reference: https://lore.kernel.org/lkml/1541767840-93588-4-git-send-email-steven.sistar...
---------------------------
Define and initialize a sparse bitmap of overloaded CPUs, per last-level-cache scheduling domain, for use by the CFS scheduling class. Save a pointer to cfs_overload_cpus in the rq for efficient access.
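The pointer published with rcu_assign_pointer() below is intended to be read under RCU from the fast path; a sketch of the expected reader side, mirroring the overload_set()/overload_clear() helpers added by the next patch in the series:

    static void mark_overloaded(struct rq *rq)
    {
            struct sparsemask *overload_cpus;

            rcu_read_lock();
            overload_cpus = rcu_dereference(rq->cfs_overload_cpus);
            if (overload_cpus)
                    sparsemask_set_elem(overload_cpus, rq->cpu);
            rcu_read_unlock();
    }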
Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
Signed-off-by: Cheng Yu <serein.chengyu@huawei.com>
---
 include/linux/sched/topology.h |  1 +
 kernel/sched/sched.h           |  2 ++
 kernel/sched/topology.c        | 26 ++++++++++++++++++++++++--
 3 files changed, 27 insertions(+), 2 deletions(-)
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 67b573d5bf28..308daac94de0 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -82,6 +82,7 @@ struct sched_domain_shared { atomic_t nr_busy_cpus; int has_idle_cores; int nr_idle_scan; + struct sparsemask *cfs_overload_cpus; };
struct sched_domain { diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 3de84e95baf1..15466a81f56b 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -99,6 +99,7 @@
struct rq; struct cpuidle_state; +struct sparsemask;
/* task_struct::on_rq states: */ #define TASK_ON_RQ_QUEUED 1 @@ -1006,6 +1007,7 @@ struct rq { struct cfs_rq cfs; struct rt_rq rt; struct dl_rq dl; + struct sparsemask *cfs_overload_cpus;
#ifdef CONFIG_FAIR_GROUP_SCHED /* list of leaf cfs_rq on this CPU: */ diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 3346362db697..99c9b05b88ec 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -4,6 +4,8 @@ */
#include <linux/bsearch.h> +#include "sched.h" +#include "sparsemask.h"
DEFINE_MUTEX(sched_domains_mutex);
@@ -682,7 +684,9 @@ DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity);
static void update_top_cache_domain(int cpu) { + struct sparsemask *cfs_overload_cpus = NULL; struct sched_domain_shared *sds = NULL; + struct rq *rq = cpu_rq(cpu); struct sched_domain *sd; int id = cpu; int size = 1; @@ -692,8 +696,10 @@ static void update_top_cache_domain(int cpu) id = cpumask_first(sched_domain_span(sd)); size = cpumask_weight(sched_domain_span(sd)); sds = sd->shared; + cfs_overload_cpus = sds->cfs_overload_cpus; }
+ rcu_assign_pointer(rq->cfs_overload_cpus, cfs_overload_cpus); rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); per_cpu(sd_llc_size, cpu) = size; per_cpu(sd_llc_id, cpu) = id; @@ -2298,7 +2304,22 @@ static void __sdt_free(const struct cpumask *cpu_map)
static int sd_llc_alloc(struct sched_domain *sd) { - /* Allocate sd->shared data here. Empty for now. */ + struct sched_domain_shared *sds = sd->shared; + struct cpumask *span = sched_domain_span(sd); + int nid = cpu_to_node(cpumask_first(span)); + int flags = __GFP_ZERO | GFP_KERNEL; + struct sparsemask *mask; + + /* + * Allocate the bitmap if not already allocated. This is called for + * every CPU in the LLC but only allocates once per sd_llc_shared. + */ + if (!sds->cfs_overload_cpus) { + mask = sparsemask_alloc_node(nr_cpu_ids, 3, flags, nid); + if (!mask) + return 1; + sds->cfs_overload_cpus = mask; + }
return 0; } @@ -2310,7 +2331,8 @@ static void sd_llc_free(struct sched_domain *sd) if (!sds) return;
- /* Free data here. Empty for now. */ + sparsemask_free(sds->cfs_overload_cpus); + sds->cfs_overload_cpus = NULL; }
static int sd_llc_alloc_all(const struct cpumask *cpu_map, struct s_data *d)
From: Steve Sistare <steven.sistare@oracle.com>
maillist inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I8PIYZ
CVE: NA
Reference: https://lore.kernel.org/lkml/1541767840-93588-5-git-send-email-steven.sistar...
---------------------------
An overloaded CPU has more than 1 runnable task. When a CFS task wakes on a CPU, if h_nr_running transitions from 1 to more than 1, then set the CPU in the cfs_overload_cpus bitmap. When a CFS task sleeps, if h_nr_running transitions from 2 to less than 2, then clear the CPU in cfs_overload_cpus.
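The same rule applies to the throttle and unthrottle paths, where task_delta tasks are added or removed at once. A sketch of the condition applied at each site (hypothetical helper; the patch open-codes the checks):

    static inline void overload_update(struct rq *rq, long prev_nr, long delta)
    {
            long new_nr = prev_nr + delta;      /* delta < 0 on dequeue */

            if (prev_nr < 2 && new_nr >= 2)
                    overload_set(rq);           /* CPU became overloaded */
            else if (prev_nr >= 2 && new_nr < 2)
                    overload_clear(rq);         /* CPU no longer overloaded */
    }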
Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
Signed-off-by: Cheng Yu <serein.chengyu@huawei.com>
---
 kernel/sched/fair.c | 44 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 44 insertions(+)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 640c0a73e73a..2c106a223b73 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -62,6 +62,7 @@ #include <linux/resume_user_mode.h> #endif
+#include "sparsemask.h" /* * The initial- and re-scaling of tunables is configurable * @@ -5080,6 +5081,28 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) rq->misfit_task_load = max_t(unsigned long, task_h_load(p), 1); }
+static void overload_clear(struct rq *rq) +{ + struct sparsemask *overload_cpus; + + rcu_read_lock(); + overload_cpus = rcu_dereference(rq->cfs_overload_cpus); + if (overload_cpus) + sparsemask_clear_elem(overload_cpus, rq->cpu); + rcu_read_unlock(); +} + +static void overload_set(struct rq *rq) +{ + struct sparsemask *overload_cpus; + + rcu_read_lock(); + overload_cpus = rcu_dereference(rq->cfs_overload_cpus); + if (overload_cpus) + sparsemask_set_elem(overload_cpus, rq->cpu); + rcu_read_unlock(); +} + #else /* CONFIG_SMP */
static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) @@ -5109,6 +5132,9 @@ static inline int newidle_balance(struct rq *rq, struct rq_flags *rf) return 0; }
+static inline void overload_clear(struct rq *rq) {} +static inline void overload_set(struct rq *rq) {} + static inline void util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) {}
@@ -5712,6 +5738,7 @@ static int tg_throttle_down(struct task_group *tg, void *data) static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) { struct rq *rq = rq_of(cfs_rq); + unsigned int prev_nr = rq->cfs.h_nr_running; struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); struct sched_entity *se; long task_delta, idle_task_delta, dequeue = 1; @@ -5785,6 +5812,8 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
/* At this point se is NULL and we are at root level*/ sub_nr_running(rq, task_delta); + if (prev_nr >= 2 && prev_nr - task_delta < 2) + overload_clear(rq);
done: /* @@ -5801,6 +5830,7 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) { struct rq *rq = rq_of(cfs_rq); + unsigned int prev_nr = rq->cfs.h_nr_running; struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); struct sched_entity *se; long task_delta, idle_task_delta; @@ -5883,6 +5913,8 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
/* At this point se is NULL and we are at root level*/ add_nr_running(rq, task_delta); + if (prev_nr < 2 && prev_nr + task_delta >= 2) + overload_set(rq);
unthrottle_throttle: assert_list_leaf_cfs_rq(rq); @@ -6695,6 +6727,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) struct sched_entity *se = &p->se; int idle_h_nr_running = task_has_idle_policy(p); int task_new = !(flags & ENQUEUE_WAKEUP); + unsigned int prev_nr = rq->cfs.h_nr_running;
/* * The code below (indirectly) updates schedutil which looks at @@ -6751,6 +6784,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
/* At this point se is NULL and we are at root level*/ add_nr_running(rq, 1); + if (prev_nr == 1) + overload_set(rq);
/* * Since new tasks are assigned an initial util_avg equal to @@ -6788,6 +6823,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) struct sched_entity *se = &p->se; int task_sleep = flags & DEQUEUE_SLEEP; int idle_h_nr_running = task_has_idle_policy(p); + unsigned int prev_nr = rq->cfs.h_nr_running; bool was_sched_idle = sched_idle_rq(rq);
util_est_dequeue(&rq->cfs, p); @@ -6842,6 +6878,8 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
/* At this point se is NULL and we are at root level*/ sub_nr_running(rq, 1); + if (prev_nr == 2) + overload_clear(rq);
/* balance early to pull high priority tasks */ if (unlikely(!was_sched_idle && sched_idle_rq(rq))) @@ -8475,6 +8513,7 @@ static void throttle_qos_cfs_rq(struct cfs_rq *cfs_rq) struct rq *rq = rq_of(cfs_rq); struct sched_entity *se; long task_delta, idle_task_delta; + unsigned int prev_nr = cfs_rq->h_nr_running;
se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
@@ -8521,6 +8560,8 @@ static void throttle_qos_cfs_rq(struct cfs_rq *cfs_rq)
/* At this point se is NULL and we are at root level*/ sub_nr_running(rq, task_delta); + if (prev_nr >= 2 && prev_nr - task_delta < 2) + overload_clear(rq);
done: if (list_empty(&per_cpu(qos_throttled_cfs_rq, cpu_of(rq)))) @@ -8536,6 +8577,7 @@ static void unthrottle_qos_cfs_rq(struct cfs_rq *cfs_rq) { struct rq *rq = rq_of(cfs_rq); struct sched_entity *se; + unsigned int prev_nr = cfs_rq->h_nr_running; long task_delta, idle_task_delta;
se = cfs_rq->tg->se[cpu_of(rq)]; @@ -8598,6 +8640,8 @@ static void unthrottle_qos_cfs_rq(struct cfs_rq *cfs_rq) }
add_nr_running(rq, task_delta); + if (prev_nr < 2 && prev_nr + task_delta >= 2) + overload_set(rq);
unthrottle_throttle:
From: Steve Sistare <steven.sistare@oracle.com>
maillist inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I8PIYZ
CVE: NA
Reference: https://lore.kernel.org/lkml/1541767840-93588-6-git-send-email-steven.sistar...
---------------------------
Move the update of idle_stamp from idle_balance to the call site in pick_next_task_fair, to prepare for a future patch that adds work to pick_next_task_fair which must be included in the idle_stamp interval. No functional change.
Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
Signed-off-by: Cheng Yu <serein.chengyu@huawei.com>
---
 kernel/sched/fair.c | 32 +++++++++++++++++++++++---------
 1 file changed, 23 insertions(+), 9 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 2c106a223b73..b4d65c7ef0a7 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5081,6 +5081,16 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) rq->misfit_task_load = max_t(unsigned long, task_h_load(p), 1); }
+static inline void rq_idle_stamp_update(struct rq *rq) +{ + rq->idle_stamp = rq_clock(rq); +} + +static inline void rq_idle_stamp_clear(struct rq *rq) +{ + rq->idle_stamp = 0; +} + static void overload_clear(struct rq *rq) { struct sparsemask *overload_cpus; @@ -5132,6 +5142,8 @@ static inline int newidle_balance(struct rq *rq, struct rq_flags *rf) return 0; }
+static inline void rq_idle_stamp_update(struct rq *rq) {} +static inline void rq_idle_stamp_clear(struct rq *rq) {} static inline void overload_clear(struct rq *rq) {} static inline void overload_set(struct rq *rq) {}
@@ -8948,8 +8960,18 @@ done: __maybe_unused; if (!rf) return NULL;
+ /* + * We must set idle_stamp _before_ calling idle_balance(), such that we + * measure the duration of idle_balance() as idle time. + */ + rq_idle_stamp_update(rq); + new_tasks = newidle_balance(rq, rf);
+ if (new_tasks) + rq_idle_stamp_clear(rq); + + /* * Because newidle_balance() releases (and re-acquires) rq->lock, it is * possible for any higher priority task to appear. In that case we @@ -12770,12 +12792,6 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf) if (this_rq->ttwu_pending) return 0;
- /* - * We must set idle_stamp _before_ calling idle_balance(), such that we - * measure the duration of idle_balance() as idle time. - */ - this_rq->idle_stamp = rq_clock(this_rq); - /* * Do not pull tasks towards !active CPUs... */ @@ -12865,9 +12881,7 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf) if (time_after(this_rq->next_balance, next_balance)) this_rq->next_balance = next_balance;
- if (pulled_task) - this_rq->idle_stamp = 0; - else + if (!pulled_task) nohz_newidle_balance(this_rq);
rq_repin_lock(this_rq, rf);
From: Steve Sistare <steven.sistare@oracle.com>
maillist inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I8PIYZ
CVE: NA
Reference: https://lore.kernel.org/lkml/1541767840-93588-7-git-send-email-steven.sistar...
---------------------------
The detach_task function takes a struct lb_env argument, but only needs a few of its members. Pass the rq and cpu arguments explicitly so the function may be called from code that is not based on lb_env. No functional change.
Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
Signed-off-by: Cheng Yu <serein.chengyu@huawei.com>
---
 kernel/sched/fair.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b4d65c7ef0a7..3c4aa33fd270 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9479,14 +9479,14 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) }
/* - * detach_task() -- detach the task for the migration specified in env + * detach_task() -- detach the task for the migration from @src_rq to @dst_cpu. */ -static void detach_task(struct task_struct *p, struct lb_env *env) +static void detach_task(struct task_struct *p, struct rq *src_rq, int dst_cpu) { - lockdep_assert_rq_held(env->src_rq); + lockdep_assert_rq_held(src_rq);
- deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK); - set_task_cpu(p, env->dst_cpu); + deactivate_task(src_rq, p, DEQUEUE_NOCLOCK); + set_task_cpu(p, dst_cpu); }
/* @@ -9506,7 +9506,7 @@ static struct task_struct *detach_one_task(struct lb_env *env) if (!can_migrate_task(p, env)) continue;
- detach_task(p, env); + detach_task(p, env->src_rq, env->dst_cpu);
/* * Right now, this is only the second place where @@ -9625,7 +9625,7 @@ static int detach_tasks(struct lb_env *env) break; }
- detach_task(p, env); + detach_task(p, env->src_rq, env->dst_cpu); list_add(&p->se.group_node, &env->tasks);
detached++;
From: Steve Sistare <steven.sistare@oracle.com>
maillist inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I8PIYZ
CVE: NA
Reference: https://lore.kernel.org/lkml/1541767840-93588-8-git-send-email-steven.sistar...
---------------------------
Define a simpler version of can_migrate_task called can_migrate_task_llc which does not require a struct lb_env argument, and judges whether a migration from one CPU to another within the same LLC should be allowed.
Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
Signed-off-by: Cheng Yu <serein.chengyu@huawei.com>
---
 kernel/sched/fair.c | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3c4aa33fd270..549bdfa9649b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9478,6 +9478,34 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) return 0; }
+/* + * Return true if task @p can migrate from @rq to @dst_rq in the same LLC. + * No need to test for co-locality, and no need to test task_hot(), as sharing + * LLC provides cache warmth at that level. + */ +static bool +can_migrate_task_llc(struct task_struct *p, struct rq *rq, struct rq *dst_rq) +{ + int dst_cpu = dst_rq->cpu; + + lockdep_assert_rq_held(rq); + + if (throttled_lb_pair(task_group(p), cpu_of(rq), dst_cpu)) + return false; + + if (!cpumask_test_cpu(dst_cpu, p->cpus_ptr)) { + schedstat_inc(p->stats.nr_failed_migrations_affine); + return false; + } + + if (task_on_cpu(rq, p)) { + schedstat_inc(p->stats.nr_failed_migrations_running); + return false; + } + + return true; +} + /* * detach_task() -- detach the task for the migration from @src_rq to @dst_cpu. */
From: Steve Sistare <steven.sistare@oracle.com>
maillist inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I8PIYZ
CVE: NA
Reference: https://lore.kernel.org/lkml/1541767840-93588-9-git-send-email-steven.sistar...
---------------------------
When a CPU has no more CFS tasks to run, and idle_balance() fails to find a task, then attempt to steal a task from an overloaded CPU in the same LLC, using the cfs_overload_cpus bitmap to efficiently identify candidates. To minimize search time, steal the first migratable task that is found when the bitmap is traversed. For fairness, search for migratable tasks on an overloaded CPU in order of next to run.
This simple stealing yields a higher CPU utilization than idle_balance() alone, because the search is cheap, so it may be called every time the CPU is about to go idle. idle_balance() does more work because it searches widely for the busiest queue, so to limit its CPU consumption, it declines to search if the system is too busy. Simple stealing does not offload the globally busiest queue, but it is much better than running nothing at all.
Stealing is controlled by the sched feature SCHED_STEAL, which is enabled by default.
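Condensed, the new idle path in pick_next_task_fair() behaves as in the sketch below (simplified from the patch; locking, schedstats, and error handling omitted):

    rq_idle_stamp_update(rq);

    new_tasks = newidle_balance(rq, rf);
    if (new_tasks == 0)
            new_tasks = try_steal(rq, rf);  /* scan cfs_overload_cpus:
                                               SMT siblings first, then
                                               the rest of the LLC */
    if (new_tasks)
            rq_idle_stamp_clear(rq);
    if (new_tasks < 0)
            return RETRY_TASK;              /* higher-prio task appeared */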
Stealing improves utilization with only a modest CPU overhead in scheduler code. In the following experiment, hackbench is run with varying numbers of groups (40 tasks per group), and the delta in /proc/schedstat is shown for each run, averaged per CPU, augmented with these non-standard stats:
%find - percent of time spent in old and new functions that search for idle CPUs and tasks to steal and set the overloaded CPUs bitmap.
steal - number of times a task is stolen from another CPU.
X6-2: 1 socket * 10 cores * 2 hyperthreads = 20 CPUs
      Intel(R) Xeon(R) CPU E5-2630 v4 @ 2.20GHz
hackbench <grps> process 100000
sched_wakeup_granularity_ns=15000000
baseline
grps  time    %busy  slice   sched    idle    wake  %find  steal
   1   8.084  75.02   0.10  105476   46291   59183   0.31      0
   2  13.892  85.33   0.10  190225   70958  119264   0.45      0
   3  19.668  89.04   0.10  263896   87047  176850   0.49      0
   4  25.279  91.28   0.10  322171   94691  227474   0.51      0
   8  47.832  94.86   0.09  630636  144141  486322   0.56      0
new
grps  time    %busy  slice   sched    idle    wake  %find  steal  %speedup
   1   5.938  96.80   0.24   31255    7190   24061   0.63   7433      36.1
   2  11.491  99.23   0.16   74097    4578   69512   0.84  19463      20.9
   3  16.987  99.66   0.15  115824    1985  113826   0.77  24707      15.8
   4  22.504  99.80   0.14  167188    2385  164786   0.75  29353      12.3
   8  44.441  99.86   0.11  389153    1616  387401   0.67  38190       7.6
Elapsed time improves by 8 to 36%, and CPU busy utilization is up by 5 to 22%, hitting 99% for 2 or more groups (80 or more tasks). The cost is at most 0.4% more find time.
Additional performance results follow. A negative "speedup" is a regression. Note: for all hackbench runs, sched_wakeup_granularity_ns is set to 15 msec. Otherwise, preemptions increase at higher loads and distort the comparison between baseline and new.
------------------ 1 Socket Results ------------------
X6-2: 1 socket * 10 cores * 2 hyperthreads = 20 CPUs
      Intel(R) Xeon(R) CPU E5-2630 v4 @ 2.20GHz
Average of 10 runs of: hackbench <groups> process 100000
            --- base --    --- new ---
  groups    time %stdev    time %stdev  %speedup
       1   8.008    0.1   5.905    0.2      35.6
       2  13.814    0.2  11.438    0.1      20.7
       3  19.488    0.2  16.919    0.1      15.1
       4  25.059    0.1  22.409    0.1      11.8
       8  47.478    0.1  44.221    0.1       7.3
X6-2: 1 socket * 22 cores * 2 hyperthreads = 44 CPUs
      Intel(R) Xeon(R) CPU E5-2699 v4 @ 2.20GHz
Average of 10 runs of: hackbench <groups> process 100000
            --- base --    --- new ---
  groups    time %stdev    time %stdev  %speedup
       1   4.586    0.8   4.596    0.6      -0.3
       2   7.693    0.2   5.775    1.3      33.2
       3  10.442    0.3   8.288    0.3      25.9
       4  13.087    0.2  11.057    0.1      18.3
       8  24.145    0.2  22.076    0.3       9.3
      16  43.779    0.1  41.741    0.2       4.8
KVM 4-cpu
      Intel(R) Xeon(R) CPU E5-2699 v3 @ 2.30GHz
tbench, average of 11 runs
  clients    %speedup
        1        16.2
        2        11.7
        4         9.9
        8        12.8
       16        13.7
KVM 2-cpu Intel(R) Xeon(R) CPU E5-2699 v3 @ 2.30GHz
  Benchmark                     %speedup
  specjbb2015_critical_jops          5.7
  mysql_sysb1.0.14_mutex_2          40.6
  mysql_sysb1.0.14_oltp_2            3.9
------------------ 2 Socket Results ------------------
X6-2: 2 sockets * 10 cores * 2 hyperthreads = 40 CPUs
      Intel(R) Xeon(R) CPU E5-2630 v4 @ 2.20GHz
Average of 10 runs of: hackbench <groups> process 100000
            --- base --    --- new ---
  groups    time %stdev    time %stdev  %speedup
       1   7.945    0.2   7.219    8.7      10.0
       2   8.444    0.4   6.689    1.5      26.2
       3  12.100    1.1   9.962    2.0      21.4
       4  15.001    0.4  13.109    1.1      14.4
       8  27.960    0.2  26.127    0.3       7.0
X6-2: 2 sockets * 22 cores * 2 hyperthreads = 88 CPUs
      Intel(R) Xeon(R) CPU E5-2699 v4 @ 2.20GHz
Average of 10 runs of: hackbench <groups> process 100000
            --- base --    --- new ---
  groups    time %stdev    time %stdev  %speedup
       1   5.826    5.4   5.840    5.0      -0.3
       2   5.041    5.3   6.171   23.4     -18.4
       3   6.839    2.1   6.324    3.8       8.1
       4   8.177    0.6   7.318    3.6      11.7
       8  14.429    0.7  13.966    1.3       3.3
      16  26.401    0.3  25.149    1.5       4.9
X6-2: 2 sockets * 22 cores * 2 hyperthreads = 88 CPUs
      Intel(R) Xeon(R) CPU E5-2699 v4 @ 2.20GHz
Oracle database OLTP, logging disabled, NVRAM storage
  Customers   Users   %speedup
    1200000      40       -1.2
    2400000      80        2.7
    3600000     120        8.9
    4800000     160        4.4
    6000000     200        3.0
X6-2: 2 sockets * 14 cores * 2 hyperthreads = 56 CPUs
      Intel(R) Xeon(R) CPU E5-2690 v4 @ 2.60GHz
Results from the Oracle "Performance PIT".
  Benchmark                                                  %speedup

  mysql_sysb1.0.14_fileio_56_rndrd                               19.6
  mysql_sysb1.0.14_fileio_56_seqrd                               12.1
  mysql_sysb1.0.14_fileio_56_rndwr                                0.4
  mysql_sysb1.0.14_fileio_56_seqrewr                             -0.3

  pgsql_sysb1.0.14_fileio_56_rndrd                               19.5
  pgsql_sysb1.0.14_fileio_56_seqrd                                8.6
  pgsql_sysb1.0.14_fileio_56_rndwr                                1.0
  pgsql_sysb1.0.14_fileio_56_seqrewr                              0.5

  opatch_time_ASM_12.2.0.1.0_HP2M                                 7.5
  select-1_users-warm_asmm_ASM_12.2.0.1.0_HP2M                    5.1
  select-1_users_asmm_ASM_12.2.0.1.0_HP2M                         4.4
  swingbenchv3_asmm_soebench_ASM_12.2.0.1.0_HP2M                  5.8

  lm3_memlat_L2                                                   4.8
  lm3_memlat_L1                                                   0.0

  ub_gcc_56CPUs-56copies_Pipe-based_Context_Switching            60.1
  ub_gcc_56CPUs-56copies_Shell_Scripts_1_concurrent               5.2
  ub_gcc_56CPUs-56copies_Shell_Scripts_8_concurrent              -3.0
  ub_gcc_56CPUs-56copies_File_Copy_1024_bufsize_2000_maxblocks    2.4
X5-2: 2 sockets * 18 cores * 2 hyperthreads = 72 CPUs
      Intel(R) Xeon(R) CPU E5-2699 v3 @ 2.30GHz
  NAS_OMP
  bench   class   ncpu   %improved(Mops)
  dc          B     72               1.3
  is          C     72               0.9
  is          D     72               0.7
  sysbench mysql, average of 24 runs
           --- base ---     --- new ---
  nthr     events %stdev    events %stdev  %speedup
     1      331.0   0.25     331.0   0.24      -0.1
     2      661.3   0.22     661.8   0.22       0.0
     4     1297.0   0.88    1300.5   0.82       0.2
     8     2420.8   0.04    2420.5   0.04      -0.1
    16     4826.3   0.07    4825.4   0.05      -0.1
    32     8815.3   0.27    8830.2   0.18       0.1
    64    12823.0   0.24   12823.6   0.26       0.0
-------------------------------------------------------------
Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
Signed-off-by: Cheng Yu <serein.chengyu@huawei.com>
---
 kernel/sched/fair.c     | 165 ++++++++++++++++++++++++++++++++++++++--
 kernel/sched/features.h |   6 ++
 2 files changed, 166 insertions(+), 5 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 549bdfa9649b..6d268682030a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5095,6 +5095,9 @@ static void overload_clear(struct rq *rq) { struct sparsemask *overload_cpus;
+ if (!sched_feat(STEAL)) + return; + rcu_read_lock(); overload_cpus = rcu_dereference(rq->cfs_overload_cpus); if (overload_cpus) @@ -5106,6 +5109,9 @@ static void overload_set(struct rq *rq) { struct sparsemask *overload_cpus;
+ if (!sched_feat(STEAL)) + return; + rcu_read_lock(); overload_cpus = rcu_dereference(rq->cfs_overload_cpus); if (overload_cpus) @@ -5113,6 +5119,8 @@ static void overload_set(struct rq *rq) rcu_read_unlock(); }
+static int try_steal(struct rq *this_rq, struct rq_flags *rf); + #else /* CONFIG_SMP */
static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) @@ -5144,6 +5152,7 @@ static inline int newidle_balance(struct rq *rq, struct rq_flags *rf)
static inline void rq_idle_stamp_update(struct rq *rq) {} static inline void rq_idle_stamp_clear(struct rq *rq) {} +static inline int try_steal(struct rq *this_rq, struct rq_flags *rf) { return 0; } static inline void overload_clear(struct rq *rq) {} static inline void overload_set(struct rq *rq) {}
@@ -8961,21 +8970,23 @@ done: __maybe_unused; return NULL;
/* - * We must set idle_stamp _before_ calling idle_balance(), such that we - * measure the duration of idle_balance() as idle time. + * We must set idle_stamp _before_ calling try_steal() or + * idle_balance(), such that we measure the duration as idle time. */ rq_idle_stamp_update(rq);
new_tasks = newidle_balance(rq, rf); + if (new_tasks == 0) + new_tasks = try_steal(rq, rf);
if (new_tasks) rq_idle_stamp_clear(rq);
/* - * Because newidle_balance() releases (and re-acquires) rq->lock, it is - * possible for any higher priority task to appear. In that case we - * must re-start the pick_next_entity() loop. + * Because try_steal() and idle_balance() release (and re-acquire) + * rq->lock, it is possible for any higher priority task to appear. + * In that case we must re-start the pick_next_entity() loop. */ if (new_tasks < 0) return RETRY_TASK; @@ -13105,6 +13116,150 @@ static int task_is_throttled_fair(struct task_struct *p, int cpu) static inline void task_tick_core(struct rq *rq, struct task_struct *curr) {} #endif
+/* + * Search the runnable tasks in @cfs_rq in order of next to run, and find + * the first one that can be migrated to @dst_rq. @cfs_rq is locked on entry. + * On success, dequeue the task from @cfs_rq and return it, else return NULL. + */ +static struct task_struct * +detach_next_task(struct cfs_rq *cfs_rq, struct rq *dst_rq) +{ + int dst_cpu = dst_rq->cpu; + struct task_struct *p; + struct rq *rq = rq_of(cfs_rq); + + lockdep_assert_rq_held(rq); + + list_for_each_entry_reverse(p, &rq->cfs_tasks, se.group_node) { + if (can_migrate_task_llc(p, rq, dst_rq)) { + detach_task(p, rq, dst_cpu); + return p; + } + } + return NULL; +} + +/* + * Attempt to migrate a CFS task from @src_cpu to @dst_rq. @locked indicates + * whether @dst_rq is already locked on entry. This function may lock or + * unlock @dst_rq, and updates @locked to indicate the locked state on return. + * The locking protocol is based on idle_balance(). + * Returns 1 on success and 0 on failure. + */ +static int steal_from(struct rq *dst_rq, struct rq_flags *dst_rf, bool *locked, + int src_cpu) +{ + struct task_struct *p; + struct rq_flags rf; + int stolen = 0; + int dst_cpu = dst_rq->cpu; + struct rq *src_rq = cpu_rq(src_cpu); + + if (dst_cpu == src_cpu || src_rq->cfs.h_nr_running < 2) + return 0; + + if (*locked) { + rq_unpin_lock(dst_rq, dst_rf); + raw_spin_rq_unlock(dst_rq); + *locked = false; + } + rq_lock_irqsave(src_rq, &rf); + update_rq_clock(src_rq); + + if (src_rq->cfs.h_nr_running < 2 || !cpu_active(src_cpu)) + p = NULL; + else + p = detach_next_task(&src_rq->cfs, dst_rq); + + rq_unlock(src_rq, &rf); + + if (p) { + raw_spin_rq_lock(dst_rq); + rq_repin_lock(dst_rq, dst_rf); + *locked = true; + update_rq_clock(dst_rq); + attach_task(dst_rq, p); + stolen = 1; + } + local_irq_restore(rf.flags); + + return stolen; +} + +/* + * Conservative upper bound on the max cost of a steal, in nsecs (the typical + * cost is 1-2 microsec). Do not steal if average idle time is less. + */ +#define SCHED_STEAL_COST 10000 + +/* + * Try to steal a runnable CFS task from a CPU in the same LLC as @dst_rq, + * and migrate it to @dst_rq. rq_lock is held on entry and return, but + * may be dropped in between. Return 1 on success, 0 on failure, and -1 + * if a task in a different scheduling class has become runnable on @dst_rq. + */ +static int try_steal(struct rq *dst_rq, struct rq_flags *dst_rf) +{ + int src_cpu; + int dst_cpu = dst_rq->cpu; + bool locked = true; + int stolen = 0; + struct sparsemask *overload_cpus; + + if (!sched_feat(STEAL)) + return 0; + + if (!cpu_active(dst_cpu)) + return 0; + + if (dst_rq->avg_idle < SCHED_STEAL_COST) + return 0; + + /* Get bitmap of overloaded CPUs in the same LLC as @dst_rq */ + + rcu_read_lock(); + overload_cpus = rcu_dereference(dst_rq->cfs_overload_cpus); + if (!overload_cpus) { + rcu_read_unlock(); + return 0; + } + +#ifdef CONFIG_SCHED_SMT + /* + * First try overloaded CPUs on the same core to preserve cache warmth. 
+ */ + if (static_branch_likely(&sched_smt_present)) { + for_each_cpu(src_cpu, cpu_smt_mask(dst_cpu)) { + if (sparsemask_test_elem(overload_cpus, src_cpu) && + steal_from(dst_rq, dst_rf, &locked, src_cpu)) { + stolen = 1; + goto out; + } + } + } +#endif /* CONFIG_SCHED_SMT */ + + /* Accept any suitable task in the LLC */ + + sparsemask_for_each(overload_cpus, dst_cpu, src_cpu) { + if (steal_from(dst_rq, dst_rf, &locked, src_cpu)) { + stolen = 1; + goto out; + } + } + +out: + rcu_read_unlock(); + if (!locked) { + raw_spin_rq_lock(dst_rq); + rq_repin_lock(dst_rq, dst_rf); + } + stolen |= (dst_rq->cfs.h_nr_running > 0); + if (dst_rq->nr_running != dst_rq->cfs.h_nr_running) + stolen = -1; + return stolen; +} + /* * scheduler tick hitting a task of our scheduling class. * diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 4dd46de2f827..139baca09fea 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -52,6 +52,12 @@ SCHED_FEAT(TTWU_QUEUE, true) SCHED_FEAT(SIS_PROP, false) SCHED_FEAT(SIS_UTIL, true)
+/* + * Steal a CFS task from another CPU when going idle. + * Improves CPU utilization. + */ +SCHED_FEAT(STEAL, true) + /* * Issue a WARN when we do multiple update_rq_clock() calls * in a single rq->lock section. Default disabled because the
From: Steve Sistare <steven.sistare@oracle.com>
maillist inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I8PIYZ
CVE: NA
Reference: https://lore.kernel.org/lkml/1541767840-93588-10-git-send-email-steven.sista...
---------------------------
The STEAL feature causes regressions on hackbench on larger NUMA systems, so disable it on systems with more than sched_steal_node_limit nodes (default 2). Note that the feature remains enabled as seen in features.h and /sys/kernel/debug/sched_features, but stealing is only performed if nodes <= sched_steal_node_limit. This arrangement allows users to activate stealing on reboot by setting the kernel parameter sched_steal_node_limit on kernels built without CONFIG_SCHED_DEBUG. The parameter is temporary and will be deleted when the regression is fixed.
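For example, on an 8-node system where the default limit suppresses stealing, booting with a kernel command line containing (illustrative value):

    sched_steal_node_limit=8

re-activates stealing while leaving the SCHED_STEAL feature bit itself untouched.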
Details of the regression follow. With the STEAL feature set, hackbench is slower on many-node systems:
X5-8: 8 sockets * 18 cores * 2 hyperthreads = 288 CPUs
      Intel(R) Xeon(R) CPU E7-8895 v3 @ 2.60GHz
Average of 10 runs of: hackbench <groups> processes 50000
            --- base --    --- new ---
  groups    time %stdev    time %stdev  %speedup
       1   3.627   15.8   3.876    7.3      -6.5
       2   4.545   24.7   5.583   16.7     -18.6
       3   5.716   25.0   7.367   14.2     -22.5
       4   6.901   32.9   7.718   14.5     -10.6
       8   8.604   38.5   9.111   16.0      -5.6
      16   7.734    6.8  11.007    8.2     -29.8
Total CPU time increases. Profiling shows that CPU time increases uniformly across all functions, suggesting a systemic increase in cache or memory latency. This may be due to NUMA migrations, as they cause loss of LLC cache footprint and remote memory latencies.
The domains for this system and their flags are:
domain0 (SMT) : 1 core
 SD_LOAD_BALANCE SD_BALANCE_NEWIDLE SD_BALANCE_EXEC SD_BALANCE_FORK
 SD_SHARE_PKG_RESOURCES SD_PREFER_SIBLING SD_SHARE_CPUCAPACITY
 SD_WAKE_AFFINE

domain1 (MC) : 1 socket
 SD_LOAD_BALANCE SD_BALANCE_NEWIDLE SD_BALANCE_EXEC SD_BALANCE_FORK
 SD_SHARE_PKG_RESOURCES SD_PREFER_SIBLING SD_WAKE_AFFINE

domain2 (NUMA) : 4 sockets
 SD_LOAD_BALANCE SD_BALANCE_NEWIDLE SD_BALANCE_EXEC SD_BALANCE_FORK
 SD_SERIALIZE SD_OVERLAP SD_NUMA SD_WAKE_AFFINE

domain3 (NUMA) : 8 sockets
 SD_LOAD_BALANCE SD_BALANCE_NEWIDLE SD_SERIALIZE SD_OVERLAP SD_NUMA
Schedstats point to the root cause of the regression. hackbench is run 10 times per group and the average schedstat accumulation per-run and per-cpu is shown below. Note that domain3 moves are zero because SD_WAKE_AFFINE is not set there.
NO_STEAL
                                           --- domain2 ---     --- domain3 ---
grp  time %busy   sched    idle    wake  steal remote  move pull remote move pull
  1  20.3  10.3   28710   14346   14366      0    490  3378    0   4039    0    0
  2  26.4  18.8   56721   28258   28469      0    792  7026   12   9229    0    7
  3  29.9  28.3   90191   44933   45272      0   5380  7204   19  16481    0    3
  4  30.2  35.8  121324   60409   60933      0   7012  9372   27  21438    0    5
  8  27.7  64.2  229174  111917  117272      0  11991  1837  168  44006    0   32
 16  32.6  74.0  334615  146784  188043      0   3404  1468   49  61405    0    8
STEAL
                                           --- domain2 ---     --- domain3 ---
grp  time %busy   sched    idle    wake  steal remote  move pull remote move pull
  1  20.6  10.2   28490   14232   14261     18      3  3525    0   4254    0    0
  2  27.9  18.8   56757   28203   28562    303   1675  7839    5   9690    0    2
  3  35.3  27.7   87337   43274   44085    698    741 12785   14  15689    0    3
  4  36.8  36.0  118630   58437   60216   1579   2973 14101   28  18732    0    7
  8  48.1  73.8  289374  133681  155600  18646  35340 10179  171  65889    0   34
 16  41.4  82.5  268925   91908  177172  47498  17206  6940  176  71776    0   20
Cross-numa-node migrations are caused by load balancing pulls and wake_affine moves. Pulls are small and similar for no_steal and steal. However, moves are significantly higher for steal, and rows above with the highest moves have the worst regressions for time; see for example grp=8.
Moves increase for steal due to the following logic in wake_affine_idle() for synchronous wakeup:
    if (sync && cpu_rq(this_cpu)->nr_running == 1)
            return this_cpu;        // move the task
The steal feature does a better job of smoothing the load between idle and busy CPUs, so nr_running is 1 more often, and moves are performed more often. For hackbench, cross-node affine moves early in the run are good because they colocate wakers and wakees from the same group on the same node, but continued moves later in the run are bad, because the wakee is moved away from peers on its previous node. Note that even no_steal is far from optimal; binding an instance of "hackbench 2" to each of the 8 NUMA nodes runs much faster than running "hackbench 16" with no binding.
Clearing SD_WAKE_AFFINE for domain2 eliminates the affine cross-node migrations and eliminates the difference between no_steal and steal performance. However, overall performance is lower than WA_IDLE because some migrations are helpful as explained above.
I have tried many heuristics in an attempt to optimize the number of cross-node moves in all conditions, with limited success. The fundamental problem is that the scheduler does not track which groups of tasks talk to each other. Parts of several groups become entrenched on the same node, filling it to capacity, leaving no room for either group to pull its peers over, and there is neither data nor mechanism for the scheduler to evict one group to make room for the other.
For now, disable STEAL on such systems until we can do better, or it is shown that hackbench is atypical and most workloads benefit from stealing.
Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
Signed-off-by: Cheng Yu <serein.chengyu@huawei.com>
---
 kernel/sched/fair.c     | 16 +++++++++++++---
 kernel/sched/sched.h    |  2 ++
 kernel/sched/topology.c | 24 ++++++++++++++++++++++++
 3 files changed, 39 insertions(+), 3 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 6d268682030a..94ca3da75f93 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5091,11 +5091,21 @@ static inline void rq_idle_stamp_clear(struct rq *rq) rq->idle_stamp = 0; }
+static inline bool steal_enabled(void) +{ +#ifdef CONFIG_NUMA + bool allow = static_branch_likely(&sched_steal_allow); +#else + bool allow = true; +#endif + return sched_feat(STEAL) && allow; +} + static void overload_clear(struct rq *rq) { struct sparsemask *overload_cpus;
- if (!sched_feat(STEAL)) + if (!steal_enabled()) return;
rcu_read_lock(); @@ -5109,7 +5119,7 @@ static void overload_set(struct rq *rq) { struct sparsemask *overload_cpus;
- if (!sched_feat(STEAL)) + if (!steal_enabled()) return;
rcu_read_lock(); @@ -13206,7 +13216,7 @@ static int try_steal(struct rq *dst_rq, struct rq_flags *dst_rf) int stolen = 0; struct sparsemask *overload_cpus;
- if (!sched_feat(STEAL)) + if (!steal_enabled()) return 0;
if (!cpu_active(dst_cpu)) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 15466a81f56b..696de660c50a 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1748,6 +1748,8 @@ this_rq_lock_irq(struct rq_flags *rf) }
#ifdef CONFIG_NUMA +extern struct static_key_true sched_steal_allow; + enum numa_topology_type { NUMA_DIRECT, NUMA_GLUELESS_MESH, diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 99c9b05b88ec..61b0e90b13ab 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1831,6 +1831,29 @@ static void init_numa_topology_type(int offline_node) sched_numa_topology_type = NUMA_DIRECT; }
+DEFINE_STATIC_KEY_TRUE(sched_steal_allow); +static int sched_steal_node_limit; +#define SCHED_STEAL_NODE_LIMIT_DEFAULT 2 + +static int __init steal_node_limit_setup(char *buf) +{ + get_option(&buf, &sched_steal_node_limit); + return 0; +} + +early_param("sched_steal_node_limit", steal_node_limit_setup); + +static void check_node_limit(void) +{ + int n = num_possible_nodes(); + + if (sched_steal_node_limit == 0) + sched_steal_node_limit = SCHED_STEAL_NODE_LIMIT_DEFAULT; + if (n > sched_steal_node_limit) { + static_branch_disable(&sched_steal_allow); + pr_debug("Suppressing sched STEAL. To enable, reboot with sched_steal_node_limit=%d", n); + } +}
#define NR_DISTANCE_VALUES (1 << DISTANCE_BITS)
@@ -1981,6 +2004,7 @@ void sched_init_numa(int offline_node) WRITE_ONCE(sched_max_numa_distance, sched_domains_numa_distance[nr_levels - 1]);
init_numa_topology_type(offline_node); + check_node_limit(); }
From: Steve Sistare <steven.sistare@oracle.com>
maillist inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I8PIYZ
CVE: NA
Reference: https://lore.kernel.org/lkml/1541767840-93588-11-git-send-email-steven.sista...
---------------------------
Add schedstats to measure the effectiveness of searching for idle CPUs and stealing tasks. This is a temporary patch intended for use during development only. SCHEDSTAT_VERSION is bumped to 16, and the following fields are added to the per-CPU statistics of /proc/schedstat:
field 10: # of times select_idle_sibling "easily" found an idle CPU --
          prev or target is idle.
field 11: # of times select_idle_sibling searched and found an idle cpu.
field 12: # of times select_idle_sibling searched and found an idle core.
field 13: # of times select_idle_sibling failed to find anything idle.
field 14: time in nanoseconds spent in functions that search for idle
          CPUs and search for tasks to steal.
field 15: # of times an idle CPU steals a task from another CPU.
field 16: # of times try_steal finds overloaded CPUs but no task is
          migratable.
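Field 14 (find_time) is accumulated with the schedstat_start_time()/schedstat_end_time() pair added below; the usage pattern in fair.c is (sketch):

    unsigned long time;

    time = schedstat_start_time();
    /* ... search for an idle CPU, or for a task to steal ... */
    schedstat_end_time(rq->find_time, time);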
Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
Signed-off-by: Cheng Yu <serein.chengyu@huawei.com>
---
 kernel/sched/core.c  | 31 +++++++++++++++++++++++--
 kernel/sched/fair.c  | 54 ++++++++++++++++++++++++++++++++++++++++----
 kernel/sched/sched.h |  9 ++++++++
 kernel/sched/stats.c | 11 ++++++++-
 kernel/sched/stats.h | 13 +++++++++++
 5 files changed, 111 insertions(+), 7 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 58c274b655ab..2ba0fb8e460b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4609,17 +4609,44 @@ static int sysctl_numa_balancing(struct ctl_table *table, int write,
DEFINE_STATIC_KEY_FALSE(sched_schedstats);
+unsigned long schedstat_skid; + +static void compute_skid(void) +{ + int i, n = 0; + s64 t; + int skid = 0; + + for (i = 0; i < 100; i++) { + t = local_clock(); + t = local_clock() - t; + if (t > 0 && t < 1000) { /* only use sane samples */ + skid += (int) t; + n++; + } + } + + if (n > 0) + schedstat_skid = skid / n; + else + schedstat_skid = 0; + pr_info("schedstat_skid = %lu\n", schedstat_skid); +} + static void set_schedstats(bool enabled) { - if (enabled) + if (enabled) { + compute_skid(); static_branch_enable(&sched_schedstats); - else + } else { static_branch_disable(&sched_schedstats); + } }
void force_schedstat_enabled(void) { if (!schedstat_enabled()) { + compute_skid(); pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n"); static_branch_enable(&sched_schedstats); } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 94ca3da75f93..b7cc9e3d1751 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5104,29 +5104,35 @@ static inline bool steal_enabled(void) static void overload_clear(struct rq *rq) { struct sparsemask *overload_cpus; + unsigned long time;
if (!steal_enabled()) return;
+ time = schedstat_start_time(); rcu_read_lock(); overload_cpus = rcu_dereference(rq->cfs_overload_cpus); if (overload_cpus) sparsemask_clear_elem(overload_cpus, rq->cpu); rcu_read_unlock(); + schedstat_end_time(rq->find_time, time); }
static void overload_set(struct rq *rq) { struct sparsemask *overload_cpus; + unsigned long time;
if (!steal_enabled()) return;
+ time = schedstat_start_time(); rcu_read_lock(); overload_cpus = rcu_dereference(rq->cfs_overload_cpus); if (overload_cpus) sparsemask_set_elem(overload_cpus, rq->cpu); rcu_read_unlock(); + schedstat_end_time(rq->find_time, time); }
static int try_steal(struct rq *this_rq, struct rq_flags *rf); @@ -7586,6 +7592,16 @@ static inline bool asym_fits_cpu(unsigned long util, return true; }
+#define SET_STAT(STAT) \ + do { \ + if (schedstat_enabled()) { \ + struct rq *rq = this_rq(); \ + \ + if (rq) \ + __schedstat_inc(rq->STAT); \ + } \ + } while (0) + /* * Try and locate an idle core/thread in the LLC cache domain. */ @@ -7616,8 +7632,10 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) #ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY cpumask_test_cpu(target, p->select_cpus) && #endif - asym_fits_cpu(task_util, util_min, util_max, target)) + asym_fits_cpu(task_util, util_min, util_max, target)) { + SET_STAT(found_idle_cpu_easy); return target; + }
/* * If the previous CPU is cache affine and idle, don't be stupid: @@ -7627,8 +7645,10 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) #ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY cpumask_test_cpu(prev, p->select_cpus) && #endif - asym_fits_cpu(task_util, util_min, util_max, prev)) + asym_fits_cpu(task_util, util_min, util_max, prev)) { + SET_STAT(found_idle_cpu_easy); return prev; + }
/* * Allow a per-cpu kthread to stack with the wakee if the @@ -7643,6 +7663,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) prev == smp_processor_id() && this_rq()->nr_running <= 1 && asym_fits_cpu(task_util, util_min, util_max, prev)) { + SET_STAT(found_idle_cpu_easy); return prev; }
@@ -7659,6 +7680,11 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) cpumask_test_cpu(recent_used_cpu, p->cpus_ptr) && #endif asym_fits_cpu(task_util, util_min, util_max, recent_used_cpu)) { + /* + * Replace recent_used_cpu with prev as it is a potential + * candidate for the next wake: + */ + SET_STAT(found_idle_cpu_easy); return recent_used_cpu; }
@@ -7678,13 +7704,16 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) */ if (sd) { i = select_idle_capacity(p, sd, target); + SET_STAT(found_idle_cpu_capacity); return ((unsigned)i < nr_cpumask_bits) ? i : target; } }
sd = rcu_dereference(per_cpu(sd_llc, target)); - if (!sd) + if (!sd) { + SET_STAT(nofound_idle_cpu); return target; + }
if (sched_smt_active()) { has_idle_core = test_idle_cores(target); @@ -7697,9 +7726,12 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) }
i = select_idle_cpu(p, sd, has_idle_core, target); - if ((unsigned)i < nr_cpumask_bits) + if ((unsigned)i < nr_cpumask_bits) { + SET_STAT(found_idle_cpu); return i; + }
+ SET_STAT(nofound_idle_cpu); return target; }
@@ -8301,6 +8333,7 @@ static int select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) { int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING); + unsigned long time; struct sched_domain *tmp, *sd = NULL; int cpu = smp_processor_id(); int new_cpu = prev_cpu; @@ -8311,6 +8344,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) int idlest_cpu = -1; #endif
+ time = schedstat_start_time(); + /* * required for stable ->cpus_allowed */ @@ -8381,6 +8416,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) new_cpu = select_idle_sibling(p, prev_cpu, new_cpu); } rcu_read_unlock(); + schedstat_end_time(cpu_rq(cpu)->find_time, time);
#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY if (idlest_cpu != -1 && !cpumask_test_cpu(new_cpu, p->select_cpus)) { @@ -8849,6 +8885,7 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf struct sched_entity *se; struct task_struct *p; int new_tasks; + unsigned long time;
again: if (!sched_fair_runnable(rq)) @@ -8979,6 +9016,8 @@ done: __maybe_unused; if (!rf) return NULL;
+ time = schedstat_start_time(); + /* * We must set idle_stamp _before_ calling try_steal() or * idle_balance(), such that we measure the duration as idle time. @@ -8992,6 +9031,8 @@ done: __maybe_unused; if (new_tasks) rq_idle_stamp_clear(rq);
+ schedstat_end_time(rq->find_time, time); +
/* * Because try_steal() and idle_balance() release (and re-acquire) @@ -13190,6 +13231,7 @@ static int steal_from(struct rq *dst_rq, struct rq_flags *dst_rf, bool *locked, update_rq_clock(dst_rq); attach_task(dst_rq, p); stolen = 1; + schedstat_inc(dst_rq->steal); } local_irq_restore(rf.flags);
@@ -13214,6 +13256,7 @@ static int try_steal(struct rq *dst_rq, struct rq_flags *dst_rf) int dst_cpu = dst_rq->cpu; bool locked = true; int stolen = 0; + bool any_overload = false; struct sparsemask *overload_cpus;
if (!steal_enabled()) @@ -13256,6 +13299,7 @@ static int try_steal(struct rq *dst_rq, struct rq_flags *dst_rf) stolen = 1; goto out; } + any_overload = true; }
out: @@ -13267,6 +13311,8 @@ static int try_steal(struct rq *dst_rq, struct rq_flags *dst_rf) stolen |= (dst_rq->cfs.h_nr_running > 0); if (dst_rq->nr_running != dst_rq->cfs.h_nr_running) stolen = -1; + if (!stolen && any_overload) + schedstat_inc(dst_rq->steal_fail); return stolen; }
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 696de660c50a..d0aa3dbba60a 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1138,6 +1138,15 @@ struct rq { /* try_to_wake_up() stats */ unsigned int ttwu_count; unsigned int ttwu_local; + + /* Idle search stats */ + unsigned int found_idle_cpu_capacity; + unsigned int found_idle_cpu; + unsigned int found_idle_cpu_easy; + unsigned int nofound_idle_cpu; + unsigned long find_time; + unsigned int steal; + unsigned int steal_fail; #endif
#ifdef CONFIG_CPU_IDLE diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c index 857f837f52cb..ee43764a563e 100644 --- a/kernel/sched/stats.c +++ b/kernel/sched/stats.c @@ -113,7 +113,7 @@ void __update_stats_enqueue_sleeper(struct rq *rq, struct task_struct *p, * Bump this up when changing the output format or the meaning of an existing * format, so that tools can adapt (or abort) */ -#define SCHEDSTAT_VERSION 15 +#define SCHEDSTAT_VERSION 16
static int show_schedstat(struct seq_file *seq, void *v) { @@ -140,6 +140,15 @@ static int show_schedstat(struct seq_file *seq, void *v) rq->rq_cpu_time, rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount);
+ seq_printf(seq, " %u %u %u %u %lu %u %u", + rq->found_idle_cpu_easy, + rq->found_idle_cpu_capacity, + rq->found_idle_cpu, + rq->nofound_idle_cpu, + rq->find_time, + rq->steal, + rq->steal_fail); + seq_printf(seq, "\n");
#ifdef CONFIG_SMP diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 38f3698f5e5b..e08a0bc77b3f 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -43,6 +43,17 @@ rq_sched_info_dequeue(struct rq *rq, unsigned long long delta) #define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0) #define schedstat_val(var) (var) #define schedstat_val_or_zero(var) ((schedstat_enabled()) ? (var) : 0) +#define schedstat_start_time() schedstat_val_or_zero(local_clock()) +#define schedstat_end_time(stat, time) \ + do { \ + unsigned long endtime; \ + \ + if (schedstat_enabled() && (time)) { \ + endtime = local_clock() - (time) - schedstat_skid; \ + schedstat_add((stat), endtime); \ + } \ + } while (0) +extern unsigned long schedstat_skid;
void __update_stats_wait_start(struct rq *rq, struct task_struct *p, struct sched_statistics *stats); @@ -87,6 +98,8 @@ static inline void rq_sched_info_depart (struct rq *rq, unsigned long long delt # define __update_stats_enqueue_sleeper(rq, p, stats) do { } while (0) # define check_schedstat_required() do { } while (0)
+# define schedstat_start_time() 0 +# define schedstat_end_time(stat, t) do { } while (0) #endif /* CONFIG_SCHEDSTATS */
#ifdef CONFIG_FAIR_GROUP_SCHED
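The counters added to /proc/schedstat above can be inspected from user space. Below is a minimal sketch, not part of the patch: the file name is made up, and it assumes the SCHEDSTAT_VERSION 16 layout in which the last seven fields of each per-CPU line are found_idle_cpu_easy, found_idle_cpu_capacity, found_idle_cpu, nofound_idle_cpu, find_time, steal and steal_fail, in the order printed by the seq_printf() above. Because it only looks at the last seven fields, it does not depend on the exact number of pre-existing fields.

/* schedstat_steal.c - illustrative reader for the counters added above.
 * Assumes the version-16 layout: the last seven fields of each "cpuN"
 * line are found_idle_cpu_easy, found_idle_cpu_capacity, found_idle_cpu,
 * nofound_idle_cpu, find_time, steal and steal_fail.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
	char line[1024];
	FILE *fp = fopen("/proc/schedstat", "r");

	if (!fp) {
		perror("/proc/schedstat");
		return 1;
	}

	while (fgets(line, sizeof(line), fp)) {
		unsigned long v[32];
		char *tok, *save;
		int cpu, n = 0;

		if (sscanf(line, "cpu%d", &cpu) != 1)
			continue;	/* skip version, timestamp, domain lines */

		tok = strtok_r(line, " \t\n", &save);	/* consume "cpuN" */
		while ((tok = strtok_r(NULL, " \t\n", &save)) && n < 32)
			v[n++] = strtoul(tok, NULL, 10);

		if (n < 7)
			continue;

		/* find_time, steal, steal_fail are the last three fields. */
		printf("cpu%d: find_time=%lu steal=%lu steal_fail=%lu\n",
		       cpu, v[n - 3], v[n - 2], v[n - 1]);
	}

	fclose(fp);
	return 0;
}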
From: Cheng Jian cj.chengjian@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I8PIYZ CVE: NA
---------------------------
Stealing tasks to improve CPU utilization solves some performance problems, for example with mysql, but it does not help every scenario; hackbench is one workload that does not benefit.

So turn stealing off by default.
Signed-off-by: Cheng Jian cj.chengjian@huawei.com Signed-off-by: Cheng Yu serein.chengyu@huawei.com --- kernel/sched/features.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 139baca09fea..9895b17d82f0 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -56,7 +56,7 @@ SCHED_FEAT(SIS_UTIL, true) * Steal a CFS task from another CPU when going idle. * Improves CPU utilization. */ -SCHED_FEAT(STEAL, true) +SCHED_FEAT(STEAL, false)
/* * Issue a WARN when we do multiple update_rq_clock() calls
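Note that the diff above only changes the compile-time default. With CONFIG_SCHED_DEBUG, scheduler features can still be flipped at runtime through the sched features debugfs file, typically by writing "STEAL" to /sys/kernel/debug/sched/features (older kernels expose it as /sys/kernel/debug/sched_features). A small illustrative C equivalent follows; the debugfs paths are assumptions that depend on kernel version and on debugfs being mounted, and the write only succeeds if the STEAL feature is compiled in.

/* steal_on.c - illustrative runtime enable of the STEAL sched feature.
 * Equivalent to: echo STEAL > /sys/kernel/debug/sched/features
 * The paths below are assumptions and differ between kernel versions.
 */
#include <stdio.h>

int main(void)
{
	const char *paths[] = {
		"/sys/kernel/debug/sched/features",	/* newer layout */
		"/sys/kernel/debug/sched_features",	/* older layout */
	};
	unsigned int i;

	for (i = 0; i < sizeof(paths) / sizeof(paths[0]); i++) {
		FILE *fp = fopen(paths[i], "w");

		if (!fp)
			continue;

		/* "STEAL" sets the feature; "NO_STEAL" clears it again. */
		fputs("STEAL\n", fp);
		if (fclose(fp) == 0)
			return 0;
	}

	fprintf(stderr, "could not write sched features file\n");
	return 1;
}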
From: Cheng Jian cj.chengjian@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I8PIYZ CVE: NA
---------------------------
Introduce CONFIG_SCHED_STEAL to limit the impact of task stealing.
1) If CONFIG_SCHED_STEAL is turned off, none of the changes take effect: the stealing hooks become empty functions, and we rely on compiler optimization to remove the call sites.

2) If CONFIG_SCHED_STEAL is enabled but the STEAL feature and schedstats are disabled, the remaining overhead is the schedstat checks, which have little effect on performance. This will be our default choice.
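As an illustration of point 1), the patch relies on the usual kernel pattern of providing empty static-inline stubs when the option is off, so the call sites compile away. Below is a self-contained, simplified sketch of that pattern; the file name, the SCHED_STEAL macro and the try_steal() body are stand-ins for illustration, not the kernel code itself.

/* stub_pattern.c - simplified illustration of the stub approach used by
 * CONFIG_SCHED_STEAL: compile with -DSCHED_STEAL for the real hook, or
 * without it to get an empty static inline stub that optimizes away.
 */
#include <stdio.h>

#ifdef SCHED_STEAL
static int try_steal(int cpu)
{
	/* A real implementation would search overloaded CPUs here. */
	printf("cpu%d: trying to steal a task\n", cpu);
	return 1;
}
#else
/* Option off: the hook is an empty stub, so callers pay no cost. */
static inline int try_steal(int cpu) { (void)cpu; return 0; }
#endif

int main(void)
{
	printf("stolen=%d\n", try_steal(0));
	return 0;
}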
Signed-off-by: Cheng Jian cj.chengjian@huawei.com Signed-off-by: Cheng Yu serein.chengyu@huawei.com --- include/linux/sched/topology.h | 2 ++ init/Kconfig | 15 +++++++++++++++ kernel/sched/core.c | 4 ++++ kernel/sched/fair.c | 27 ++++++++++++++++++++++----- kernel/sched/features.h | 2 ++ kernel/sched/sched.h | 8 ++++++++ kernel/sched/stats.c | 6 ++++++ kernel/sched/stats.h | 11 +++++++++-- kernel/sched/topology.c | 22 +++++++++++++++++++++- 9 files changed, 89 insertions(+), 8 deletions(-)
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 308daac94de0..9981f661189e 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -82,7 +82,9 @@ struct sched_domain_shared { atomic_t nr_busy_cpus; int has_idle_cores; int nr_idle_scan; +#ifdef CONFIG_SCHED_STEAL struct sparsemask *cfs_overload_cpus; +#endif };
struct sched_domain { diff --git a/init/Kconfig b/init/Kconfig index c8909ca8bb48..46ded522c787 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1315,6 +1315,21 @@ config NET_NS
endif # NAMESPACES
+config SCHED_STEAL + bool "Steal tasks to improve CPU utilization" + depends on SMP + default n + help + When a CPU has no more CFS tasks to run, and idle_balance() fails + to find a task, then attempt to steal a task from an overloaded + CPU in the same LLC. Maintain and use a bitmap of overloaded CPUs + to efficiently identify candidates. To minimize search time, steal + the first migratable task that is found when the bitmap is traversed. + For fairness, search for migratable tasks on an overloaded CPU in + order of next to run. + + If unsure, say N here. + config CHECKPOINT_RESTORE bool "Checkpoint/restore support" depends on PROC_FS diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2ba0fb8e460b..5dd2694da33a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4609,6 +4609,7 @@ static int sysctl_numa_balancing(struct ctl_table *table, int write,
DEFINE_STATIC_KEY_FALSE(sched_schedstats);
+#ifdef CONFIG_SCHED_STEAL unsigned long schedstat_skid;
static void compute_skid(void) @@ -4632,6 +4633,9 @@ static void compute_skid(void) schedstat_skid = 0; pr_info("schedstat_skid = %lu\n", schedstat_skid); } +#else +static inline void compute_skid(void) {} +#endif
static void set_schedstats(bool enabled) { diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b7cc9e3d1751..88d9db760366 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -62,7 +62,10 @@ #include <linux/resume_user_mode.h> #endif
+#ifdef CONFIG_SCHED_STEAL #include "sparsemask.h" +#endif + /* * The initial- and re-scaling of tunables is configurable * @@ -5091,6 +5094,8 @@ static inline void rq_idle_stamp_clear(struct rq *rq) rq->idle_stamp = 0; }
+#ifdef CONFIG_SCHED_STEAL + static inline bool steal_enabled(void) { #ifdef CONFIG_NUMA @@ -5115,7 +5120,7 @@ static void overload_clear(struct rq *rq) if (overload_cpus) sparsemask_clear_elem(overload_cpus, rq->cpu); rcu_read_unlock(); - schedstat_end_time(rq->find_time, time); + schedstat_end_time(rq, time); }
static void overload_set(struct rq *rq) @@ -5132,10 +5137,15 @@ static void overload_set(struct rq *rq) if (overload_cpus) sparsemask_set_elem(overload_cpus, rq->cpu); rcu_read_unlock(); - schedstat_end_time(rq->find_time, time); + schedstat_end_time(rq, time); }
static int try_steal(struct rq *this_rq, struct rq_flags *rf); +#else +static inline int try_steal(struct rq *this_rq, struct rq_flags *rf) { return 0; } +static inline void overload_clear(struct rq *rq) {} +static inline void overload_set(struct rq *rq) {} +#endif
#else /* CONFIG_SMP */
@@ -7592,6 +7602,7 @@ static inline bool asym_fits_cpu(unsigned long util, return true; }
+#ifdef CONFIG_SCHED_STEAL #define SET_STAT(STAT) \ do { \ if (schedstat_enabled()) { \ @@ -7601,6 +7612,9 @@ static inline bool asym_fits_cpu(unsigned long util, __schedstat_inc(rq->STAT); \ } \ } while (0) +#else +#define SET_STAT(STAT) +#endif
/* * Try and locate an idle core/thread in the LLC cache domain. @@ -8416,7 +8430,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) new_cpu = select_idle_sibling(p, prev_cpu, new_cpu); } rcu_read_unlock(); - schedstat_end_time(cpu_rq(cpu)->find_time, time); + schedstat_end_time(cpu_rq(cpu), time);
#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY if (idlest_cpu != -1 && !cpumask_test_cpu(new_cpu, p->select_cpus)) { @@ -9027,12 +9041,11 @@ done: __maybe_unused; new_tasks = newidle_balance(rq, rf); if (new_tasks == 0) new_tasks = try_steal(rq, rf); + schedstat_end_time(rq, time);
if (new_tasks) rq_idle_stamp_clear(rq);
- schedstat_end_time(rq->find_time, time); -
/* * Because try_steal() and idle_balance() release (and re-acquire) @@ -9540,6 +9553,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) return 0; }
+#ifdef CONFIG_SCHED_STEAL /* * Return true if task @p can migrate from @rq to @dst_rq in the same LLC. * No need to test for co-locality, and no need to test task_hot(), as sharing @@ -9567,6 +9581,7 @@ can_migrate_task_llc(struct task_struct *p, struct rq *rq, struct rq *dst_rq)
return true; } +#endif
/* * detach_task() -- detach the task for the migration from @src_rq to @dst_cpu. @@ -13167,6 +13182,7 @@ static int task_is_throttled_fair(struct task_struct *p, int cpu) static inline void task_tick_core(struct rq *rq, struct task_struct *curr) {} #endif
+#ifdef CONFIG_SCHED_STEAL /* * Search the runnable tasks in @cfs_rq in order of next to run, and find * the first one that can be migrated to @dst_rq. @cfs_rq is locked on entry. @@ -13315,6 +13331,7 @@ static int try_steal(struct rq *dst_rq, struct rq_flags *dst_rf) schedstat_inc(dst_rq->steal_fail); return stolen; } +#endif
/* * scheduler tick hitting a task of our scheduling class. diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 9895b17d82f0..e4789d09f58e 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -52,11 +52,13 @@ SCHED_FEAT(TTWU_QUEUE, true) SCHED_FEAT(SIS_PROP, false) SCHED_FEAT(SIS_UTIL, true)
+#ifdef CONFIG_SCHED_STEAL /* * Steal a CFS task from another CPU when going idle. * Improves CPU utilization. */ SCHED_FEAT(STEAL, false) +#endif
/* * Issue a WARN when we do multiple update_rq_clock() calls diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index d0aa3dbba60a..ba528b17f501 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -99,7 +99,9 @@
struct rq; struct cpuidle_state; +#ifdef CONFIG_SCHED_STEAL struct sparsemask; +#endif
/* task_struct::on_rq states: */ #define TASK_ON_RQ_QUEUED 1 @@ -1007,7 +1009,9 @@ struct rq { struct cfs_rq cfs; struct rt_rq rt; struct dl_rq dl; +#ifdef CONFIG_SCHED_STEAL struct sparsemask *cfs_overload_cpus; +#endif
#ifdef CONFIG_FAIR_GROUP_SCHED /* list of leaf cfs_rq on this CPU: */ @@ -1139,6 +1143,7 @@ struct rq { unsigned int ttwu_count; unsigned int ttwu_local;
+#ifdef CONFIG_SCHED_STEAL /* Idle search stats */ unsigned int found_idle_cpu_capacity; unsigned int found_idle_cpu; @@ -1147,6 +1152,7 @@ struct rq { unsigned long find_time; unsigned int steal; unsigned int steal_fail; +#endif /* CONFIG_SCHED_STEAL */ #endif
#ifdef CONFIG_CPU_IDLE @@ -1757,7 +1763,9 @@ this_rq_lock_irq(struct rq_flags *rf) }
#ifdef CONFIG_NUMA +#ifdef CONFIG_SCHED_STEAL extern struct static_key_true sched_steal_allow; +#endif
enum numa_topology_type { NUMA_DIRECT, diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c index ee43764a563e..306f26fde69a 100644 --- a/kernel/sched/stats.c +++ b/kernel/sched/stats.c @@ -113,7 +113,11 @@ void __update_stats_enqueue_sleeper(struct rq *rq, struct task_struct *p, * Bump this up when changing the output format or the meaning of an existing * format, so that tools can adapt (or abort) */ +#ifdef CONFIG_SCHED_STEAL #define SCHEDSTAT_VERSION 16 +#else +#define SCHEDSTAT_VERSION 15 +#endif
static int show_schedstat(struct seq_file *seq, void *v) { @@ -140,6 +144,7 @@ static int show_schedstat(struct seq_file *seq, void *v) rq->rq_cpu_time, rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount);
+#ifdef CONFIG_SCHED_STEAL seq_printf(seq, " %u %u %u %u %lu %u %u", rq->found_idle_cpu_easy, rq->found_idle_cpu_capacity, @@ -148,6 +153,7 @@ static int show_schedstat(struct seq_file *seq, void *v) rq->find_time, rq->steal, rq->steal_fail); +#endif /* CONFIG_SCHED_STEAL */
seq_printf(seq, "\n");
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index e08a0bc77b3f..4ccc1f120d67 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -43,8 +43,9 @@ rq_sched_info_dequeue(struct rq *rq, unsigned long long delta) #define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0) #define schedstat_val(var) (var) #define schedstat_val_or_zero(var) ((schedstat_enabled()) ? (var) : 0) +#ifdef CONFIG_SCHED_STEAL #define schedstat_start_time() schedstat_val_or_zero(local_clock()) -#define schedstat_end_time(stat, time) \ +#define __schedstat_end_time(stat, time) \ do { \ unsigned long endtime; \ \ @@ -53,7 +54,13 @@ rq_sched_info_dequeue(struct rq *rq, unsigned long long delta) schedstat_add((stat), endtime); \ } \ } while (0) +#define schedstat_end_time(rq, time) \ + __schedstat_end_time(((rq)->find_time), time) extern unsigned long schedstat_skid; +#else /* !CONFIG_SCHED_STEAL */ +# define schedstat_start_time() 0 +# define schedstat_end_time(rq, t) do { } while (0) +#endif /* CONFIG_SCHED_STEAL */
void __update_stats_wait_start(struct rq *rq, struct task_struct *p, struct sched_statistics *stats); @@ -99,7 +106,7 @@ static inline void rq_sched_info_depart (struct rq *rq, unsigned long long delt # define check_schedstat_required() do { } while (0)
# define schedstat_start_time() 0 -# define schedstat_end_time(stat, t) do { } while (0) +# define schedstat_end_time(rq, t) do { } while (0) #endif /* CONFIG_SCHEDSTATS */
#ifdef CONFIG_FAIR_GROUP_SCHED diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 61b0e90b13ab..9dd172be1d6b 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -5,7 +5,9 @@
#include <linux/bsearch.h> #include "sched.h" +#ifdef CONFIG_SCHED_STEAL #include "sparsemask.h" +#endif
DEFINE_MUTEX(sched_domains_mutex);
@@ -14,10 +16,16 @@ static cpumask_var_t sched_domains_tmpmask; static cpumask_var_t sched_domains_tmpmask2;
struct s_data; +#ifdef CONFIG_SCHED_STEAL static int sd_llc_alloc(struct sched_domain *sd); static void sd_llc_free(struct sched_domain *sd); static int sd_llc_alloc_all(const struct cpumask *cpu_map, struct s_data *d); static void sd_llc_free_all(const struct cpumask *cpu_map); +#else +static inline void sd_llc_free(struct sched_domain *sd) {} +static inline int sd_llc_alloc_all(const struct cpumask *cpu_map, struct s_data *d) { return 0; } +static inline void sd_llc_free_all(const struct cpumask *cpu_map) {} +#endif
#ifdef CONFIG_SCHED_DEBUG
@@ -684,9 +692,11 @@ DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity);
static void update_top_cache_domain(int cpu) { +#ifdef CONFIG_SCHED_STEAL + struct rq *rq = cpu_rq(cpu); struct sparsemask *cfs_overload_cpus = NULL; +#endif struct sched_domain_shared *sds = NULL; - struct rq *rq = cpu_rq(cpu); struct sched_domain *sd; int id = cpu; int size = 1; @@ -696,10 +706,14 @@ static void update_top_cache_domain(int cpu) id = cpumask_first(sched_domain_span(sd)); size = cpumask_weight(sched_domain_span(sd)); sds = sd->shared; +#ifdef CONFIG_SCHED_STEAL cfs_overload_cpus = sds->cfs_overload_cpus; +#endif }
+#ifdef CONFIG_SCHED_STEAL rcu_assign_pointer(rq->cfs_overload_cpus, cfs_overload_cpus); +#endif rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); per_cpu(sd_llc_size, cpu) = size; per_cpu(sd_llc_id, cpu) = id; @@ -1831,6 +1845,7 @@ static void init_numa_topology_type(int offline_node) sched_numa_topology_type = NUMA_DIRECT; }
+#ifdef CONFIG_SCHED_STEAL DEFINE_STATIC_KEY_TRUE(sched_steal_allow); static int sched_steal_node_limit; #define SCHED_STEAL_NODE_LIMIT_DEFAULT 2 @@ -1854,6 +1869,9 @@ static void check_node_limit(void) pr_debug("Suppressing sched STEAL. To enable, reboot with sched_steal_node_limit=%d", n); } } +#else +static inline void check_node_limit(void) { } +#endif /* CONFIG_SCHED_STEAL */
#define NR_DISTANCE_VALUES (1 << DISTANCE_BITS)
@@ -2326,6 +2344,7 @@ static void __sdt_free(const struct cpumask *cpu_map) } }
+#ifdef CONFIG_SCHED_STEAL static int sd_llc_alloc(struct sched_domain *sd) { struct sched_domain_shared *sds = sd->shared; @@ -2397,6 +2416,7 @@ static void sd_llc_free_all(const struct cpumask *cpu_map) } } } +#endif
static struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, const struct cpumask *cpu_map, struct sched_domain_attr *attr,
From: Cheng Jian cj.chengjian@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I8PIYZ CVE: NA
-------------------------------------------------
Enable CONFIG_SCHED_STEAL by default in the openEuler defconfigs to improve CPU utilization.
Signed-off-by: Cheng Jian cj.chengjian@huawei.com Signed-off-by: Cheng Yu serein.chengyu@huawei.com --- arch/arm64/configs/openeuler_defconfig | 1 + arch/x86/configs/openeuler_defconfig | 1 + 2 files changed, 2 insertions(+)
diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig index c5bb420feb86..93c2ad057e3f 100644 --- a/arch/arm64/configs/openeuler_defconfig +++ b/arch/arm64/configs/openeuler_defconfig @@ -193,6 +193,7 @@ CONFIG_IPC_NS=y CONFIG_USER_NS=y CONFIG_PID_NS=y CONFIG_NET_NS=y +CONFIG_SCHED_STEAL=y CONFIG_CHECKPOINT_RESTORE=y CONFIG_SCHED_AUTOGROUP=y CONFIG_RELAY=y diff --git a/arch/x86/configs/openeuler_defconfig b/arch/x86/configs/openeuler_defconfig index 15479398d9a8..cf197b5c986d 100644 --- a/arch/x86/configs/openeuler_defconfig +++ b/arch/x86/configs/openeuler_defconfig @@ -215,6 +215,7 @@ CONFIG_IPC_NS=y CONFIG_USER_NS=y CONFIG_PID_NS=y CONFIG_NET_NS=y +CONFIG_SCHED_STEAL=y CONFIG_CHECKPOINT_RESTORE=y CONFIG_SCHED_AUTOGROUP=y CONFIG_RELAY=y
Feedback: The patch(es) you sent to the kernel@openeuler.org mailing list have been converted to a pull request successfully! Pull request link: https://gitee.com/openeuler/kernel/pulls/3712 Mailing list address: https://mailweb.openeuler.org/hyperkitty/list/kernel@openeuler.org/message/E...