From: Cheng Jian <cj.chengjian@huawei.com>
hulk inclusion
category: feature
bugzilla: 38261
CVE: NA
---------------------------
Introduce CONFIG_SCHED_STEAL to limit the impact of the task stealing feature.

1) With CONFIG_SCHED_STEAL disabled, none of the changes take effect: the steal hooks are replaced by empty inline functions, so we rely on compiler optimization to remove the call sites entirely (see the sketch below).

2) With CONFIG_SCHED_STEAL enabled but the STEAL feature and schedstats disabled, the schedstat checks add a small amount of overhead, but the effect on performance is negligible. This will be our default configuration.
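To illustrate point 1), here is a minimal userspace sketch of the stub pattern (the file name, the tiny struct rq stand-in and the printf output are purely illustrative; only the try_steal() name mirrors the patch). When the config switch is off, the hook collapses to an empty static inline and the compiler drops the call site, which is why a build with the feature compiled out carries no runtime cost:

  /* steal_stub_sketch.c - hypothetical illustration, not part of the patch.
   * Build with -DCONFIG_SCHED_STEAL to compile the "real" hook; without it
   * the empty static inline below is optimized away at its call site.
   */
  #include <stdio.h>

  struct rq { int cpu; };		/* stand-in for the kernel's struct rq */

  #ifdef CONFIG_SCHED_STEAL
  static int try_steal(struct rq *rq)
  {
  	printf("cpu %d: trying to steal a task\n", rq->cpu);
  	return 1;			/* pretend a task was stolen */
  }
  #else
  /* Feature compiled out: empty stub, the call below costs nothing. */
  static inline int try_steal(struct rq *rq) { return 0; }
  #endif

  int main(void)
  {
  	struct rq rq = { .cpu = 0 };

  	if (try_steal(&rq) == 0)
  		printf("cpu %d: nothing stolen, go idle\n", rq.cpu);
  	return 0;
  }

The real hooks below (try_steal(), overload_set(), overload_clear(), sd_llc_alloc_all(), ...) follow the same pattern, each gaining an empty #else stub when CONFIG_SCHED_STEAL is off.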
Signed-off-by: Cheng Jian <cj.chengjian@huawei.com>
Reviewed-by: Hanjun Guo <guohanjun@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
Signed-off-by: Wei Li <liwei391@huawei.com>
---
 include/linux/sched/topology.h |  2 ++
 init/Kconfig                   | 15 +++++++++++++++
 kernel/sched/core.c            |  4 ++++
 kernel/sched/fair.c            | 31 +++++++++++++++++++++++++------
 kernel/sched/features.h        |  2 ++
 kernel/sched/sched.h           |  9 +++++++++
 kernel/sched/stats.c           |  6 ++++++
 kernel/sched/stats.h           | 11 +++++++++--
 kernel/sched/topology.c        | 22 +++++++++++++++++++++-
 9 files changed, 93 insertions(+), 9 deletions(-)
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 3d04d4505fdc..936dfbf0e87f 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -72,7 +72,9 @@ struct sched_domain_shared {
 	atomic_t	ref;
 	atomic_t	nr_busy_cpus;
 	int		has_idle_cores;
+#ifdef CONFIG_SCHED_STEAL
 	struct sparsemask *cfs_overload_cpus;
+#endif
 };
 
 struct sched_domain {
diff --git a/init/Kconfig b/init/Kconfig
index 121ad5dbc1c0..b731d4f6a29d 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -996,6 +996,21 @@ config NET_NS
 
 endif # NAMESPACES
 
+config SCHED_STEAL
+	bool "Steal tasks to improve CPU utilization"
+	depends on SMP
+	default n
+	help
+	  When a CPU has no more CFS tasks to run, and idle_balance() fails
+	  to find a task, then attempt to steal a task from an overloaded
+	  CPU in the same LLC. Maintain and use a bitmap of overloaded CPUs
+	  to efficiently identify candidates. To minimize search time, steal
+	  the first migratable task that is found when the bitmap is traversed.
+	  For fairness, search for migratable tasks on an overloaded CPU in
+	  order of next to run.
+
+	  If unsure, say N here.
+
 config CHECKPOINT_RESTORE
 	bool "Checkpoint/restore support"
 	select PROC_CHILDREN
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index ad8e38f3b351..9196c6b3b8cb 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2240,6 +2240,7 @@ int sysctl_numa_balancing(struct ctl_table *table, int write,
 
 DEFINE_STATIC_KEY_FALSE(sched_schedstats);
 static bool __initdata __sched_schedstats = false;
+#ifdef CONFIG_SCHED_STEAL
 unsigned long schedstat_skid;
 
 static void compute_skid(void)
@@ -2263,6 +2264,9 @@ static void compute_skid(void)
 		schedstat_skid = 0;
 	pr_info("schedstat_skid = %lu\n", schedstat_skid);
 }
+#else
+static inline void compute_skid(void) {}
+#endif
 
 static void set_schedstats(bool enabled)
 {
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c38be5749c07..b71da6794866 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -21,7 +21,9 @@
  *  Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
  */
 #include "sched.h"
+#ifdef CONFIG_SCHED_STEAL
 #include "sparsemask.h"
+#endif
 
 #include <trace/events/sched.h>
 
@@ -3821,6 +3823,8 @@ static inline void rq_idle_stamp_clear(struct rq *rq)
 	rq->idle_stamp = 0;
 }
 
+#ifdef CONFIG_SCHED_STEAL
+
 static inline bool steal_enabled(void)
 {
 #ifdef CONFIG_NUMA
@@ -3845,7 +3849,7 @@ static void overload_clear(struct rq *rq)
 	if (overload_cpus)
 		sparsemask_clear_elem(overload_cpus, rq->cpu);
 	rcu_read_unlock();
-	schedstat_end_time(rq->find_time, time);
+	schedstat_end_time(rq, time);
 }
 
 static void overload_set(struct rq *rq)
@@ -3862,10 +3866,15 @@ static void overload_set(struct rq *rq)
 	if (overload_cpus)
 		sparsemask_set_elem(overload_cpus, rq->cpu);
 	rcu_read_unlock();
-	schedstat_end_time(rq->find_time, time);
+	schedstat_end_time(rq, time);
 }
 
 static int try_steal(struct rq *this_rq, struct rq_flags *rf);
+#else
+static inline int try_steal(struct rq *this_rq, struct rq_flags *rf) { return 0; }
+static inline void overload_clear(struct rq *rq) {}
+static inline void overload_set(struct rq *rq) {}
+#endif
 
 #else /* CONFIG_SMP */
 
@@ -6283,6 +6292,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target)
 	return cpu;
 }
 
+#ifdef CONFIG_SCHED_STEAL
 #define SET_STAT(STAT)							\
 	do {								\
 		if (schedstat_enabled()) {				\
@@ -6292,6 +6302,9 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target)
 				__schedstat_inc(rq->STAT);		\
 		}							\
 	} while (0)
+#else
+#define SET_STAT(STAT)
+#endif
 
 /*
  * Try and locate an idle core/thread in the LLC cache domain.
@@ -6539,13 +6552,15 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
 static int
 select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)
 {
-	unsigned long time = schedstat_start_time();
+	unsigned long time;
 	struct sched_domain *tmp, *sd = NULL;
 	int cpu = smp_processor_id();
 	int new_cpu = prev_cpu;
 	int want_affine = 0;
 	int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING);
 
+	time = schedstat_start_time();
+
 	if (sd_flag & SD_BALANCE_WAKE) {
 		record_wakee(p);
 		want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu)
@@ -6588,7 +6603,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)
 			current->recent_used_cpu = cpu;
 	}
 	rcu_read_unlock();
-	schedstat_end_time(cpu_rq(cpu)->find_time, time);
+	schedstat_end_time(cpu_rq(cpu), time);
 
 	return new_cpu;
 }
@@ -6956,14 +6971,14 @@ done: __maybe_unused;
 	rq_idle_stamp_update(rq);
 
 	new_tasks = idle_balance(rq, rf);
+
 	if (new_tasks == 0)
 		new_tasks = try_steal(rq, rf);
+	schedstat_end_time(rq, time);
 
 	if (new_tasks)
 		rq_idle_stamp_clear(rq);
 
-	schedstat_end_time(rq->find_time, time);
-
 	/*
 	 * Because try_steal() and idle_balance() release (and re-acquire)
 	 * rq->lock, it is possible for any higher priority task to appear.
@@ -7374,6 +7389,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
 	return 0;
 }
 
+#ifdef CONFIG_SCHED_STEAL
 /*
  * Return true if task @p can migrate from @rq to @dst_rq in the same LLC.
  * No need to test for co-locality, and no need to test task_hot(), as sharing
@@ -7401,6 +7417,7 @@ can_migrate_task_llc(struct task_struct *p, struct rq *rq, struct rq *dst_rq)
 
 	return true;
 }
+#endif
 
 /*
  * detach_task() -- detach the task for the migration from @src_rq to @dst_cpu.
@@ -9913,6 +9930,7 @@ void trigger_load_balance(struct rq *rq)
 		nohz_balancer_kick(rq);
 }
 
+#ifdef CONFIG_SCHED_STEAL
 /*
  * Search the runnable tasks in @cfs_rq in order of next to run, and find
  * the first one that can be migrated to @dst_rq. @cfs_rq is locked on entry.
@@ -10061,6 +10079,7 @@ static int try_steal(struct rq *dst_rq, struct rq_flags *dst_rf)
 		schedstat_inc(dst_rq->steal_fail);
 	return stolen;
 }
+#endif
 
 static void rq_online_fair(struct rq *rq)
 {
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index c8afe702d600..515bfbcc6c99 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -58,11 +58,13 @@ SCHED_FEAT(TTWU_QUEUE, true)
 SCHED_FEAT(SIS_AVG_CPU, false)
 SCHED_FEAT(SIS_PROP, true)
 
+#ifdef CONFIG_SCHED_STEAL
 /*
  * Steal a CFS task from another CPU when going idle.
  * Improves CPU utilization.
 */
 SCHED_FEAT(STEAL, false)
+#endif
 
 /*
  * Issue a WARN when we do multiple update_rq_clock() calls
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 02e79a2657fe..136222d2b425 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -83,7 +83,9 @@
 
 struct rq;
 struct cpuidle_state;
+#ifdef CONFIG_SCHED_STEAL
 struct sparsemask;
+#endif
 
 /* task_struct::on_rq states: */
 #define TASK_ON_RQ_QUEUED	1
@@ -823,7 +825,9 @@ struct rq {
 	struct cfs_rq		cfs;
 	struct rt_rq		rt;
 	struct dl_rq		dl;
+#ifdef CONFIG_SCHED_STEAL
 	struct sparsemask	*cfs_overload_cpus;
+#endif
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	/* list of leaf cfs_rq on this CPU: */
@@ -928,6 +932,7 @@ struct rq {
 	unsigned int		ttwu_count;
 	unsigned int		ttwu_local;
 
+#ifdef CONFIG_SCHED_STEAL
 	/* Idle search stats */
 	unsigned int		found_idle_core;
 	unsigned int		found_idle_cpu;
@@ -936,6 +941,7 @@ struct rq {
 	unsigned long		find_time;
 	unsigned int		steal;
 	unsigned int		steal_fail;
+#endif /* CONFIG_SCHED_STEAL */
 #endif
 
 #ifdef CONFIG_SMP
@@ -975,6 +981,7 @@ static inline int cpu_of(struct rq *rq)
 #endif
 }
 
+
 #ifdef CONFIG_SCHED_SMT
 extern void __update_idle_core(struct rq *rq);
 
@@ -1126,7 +1133,9 @@ extern bool find_numa_distance(int distance);
 #endif
 
 #ifdef CONFIG_NUMA
+#ifdef CONFIG_SCHED_STEAL
 extern struct static_key_true sched_steal_allow;
+#endif
 extern void sched_init_numa(void);
 extern void sched_domains_numa_masks_set(unsigned int cpu);
 extern void sched_domains_numa_masks_clear(unsigned int cpu);
diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c
index 00b3de5cd338..46f57644f5df 100644
--- a/kernel/sched/stats.c
+++ b/kernel/sched/stats.c
@@ -10,7 +10,11 @@
  * Bump this up when changing the output format or the meaning of an existing
  * format, so that tools can adapt (or abort)
 */
+#ifdef CONFIG_SCHED_STEAL
 #define SCHEDSTAT_VERSION 16
+#else
+#define SCHEDSTAT_VERSION 15
+#endif
 
 static int show_schedstat(struct seq_file *seq, void *v)
 {
@@ -37,6 +41,7 @@ static int show_schedstat(struct seq_file *seq, void *v)
 		    rq->rq_cpu_time,
 		    rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount);
 
+#ifdef CONFIG_SCHED_STEAL
 		seq_printf(seq, " %u %u %u %u %lu %u %u",
 			rq->found_idle_cpu_easy,
 			rq->found_idle_cpu,
@@ -45,6 +50,7 @@ static int show_schedstat(struct seq_file *seq, void *v)
 			rq->find_time,
 			rq->steal,
 			rq->steal_fail);
+#endif /* CONFIG_SCHED_STEAL */
 
 		seq_printf(seq, "\n");
 
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index b5f927e278eb..f6a7d0b04f70 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -40,8 +40,9 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
 #define   schedstat_set(var, val)	do { if (schedstat_enabled()) { var = (val); } } while (0)
 #define   schedstat_val(var)		(var)
 #define   schedstat_val_or_zero(var)	((schedstat_enabled()) ? (var) : 0)
+#ifdef CONFIG_SCHED_STEAL
 #define   schedstat_start_time()	schedstat_val_or_zero(local_clock())
-#define   schedstat_end_time(stat, time)			\
+#define   __schedstat_end_time(stat, time)			\
 	do {							\
 		unsigned long endtime;				\
 								\
@@ -50,7 +51,13 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
 			schedstat_add((stat), endtime);		\
 		}						\
 	} while (0)
+#define   schedstat_end_time(rq, time)				\
+	__schedstat_end_time(((rq)->find_time), time)
 extern unsigned long schedstat_skid;
+#else /* !CONFIG_SCHED_STEAL */
+# define   schedstat_start_time()	0
+# define   schedstat_end_time(rq, t)	do { } while (0)
+#endif /* CONFIG_SCHED_STEAL */
 
 #else /* !CONFIG_SCHEDSTATS: */
 static inline void rq_sched_info_arrive  (struct rq *rq, unsigned long long delta) { }
@@ -66,7 +73,7 @@ static inline void rq_sched_info_depart  (struct rq *rq, unsigned long long delta)
 # define   schedstat_val(var)		0
 # define   schedstat_val_or_zero(var)	0
 # define   schedstat_start_time()	0
-# define   schedstat_end_time(stat, t)	do { } while (0)
+# define   schedstat_end_time(rq, t)	do { } while (0)
 #endif /* CONFIG_SCHEDSTATS */
 
 #ifdef CONFIG_SCHED_INFO
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 20c20dc890a7..0002b269ed3d 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -3,7 +3,9 @@
  * Scheduler topology setup/handling methods
 */
 #include "sched.h"
+#ifdef CONFIG_SCHED_STEAL
 #include "sparsemask.h"
+#endif
 
 DEFINE_MUTEX(sched_domains_mutex);
 
@@ -12,10 +14,16 @@ cpumask_var_t sched_domains_tmpmask;
 cpumask_var_t sched_domains_tmpmask2;
 
 struct s_data;
+#ifdef CONFIG_SCHED_STEAL
 static int sd_llc_alloc(struct sched_domain *sd);
 static void sd_llc_free(struct sched_domain *sd);
 static int sd_llc_alloc_all(const struct cpumask *cpu_map, struct s_data *d);
 static void sd_llc_free_all(const struct cpumask *cpu_map);
+#else
+static inline void sd_llc_free(struct sched_domain *sd) {}
+static inline int sd_llc_alloc_all(const struct cpumask *cpu_map, struct s_data *d) { return 0; }
+static inline void sd_llc_free_all(const struct cpumask *cpu_map) {}
+#endif
 
 #ifdef CONFIG_SCHED_DEBUG
 
@@ -410,9 +418,11 @@ DEFINE_PER_CPU(struct sched_domain *, sd_asym);
 
 static void update_top_cache_domain(int cpu)
 {
+#ifdef CONFIG_SCHED_STEAL
+	struct rq *rq = cpu_rq(cpu);
 	struct sparsemask *cfs_overload_cpus = NULL;
+#endif
 	struct sched_domain_shared *sds = NULL;
-	struct rq *rq = cpu_rq(cpu);
 	struct sched_domain *sd;
 	int id = cpu;
 	int size = 1;
@@ -422,10 +432,14 @@ static void update_top_cache_domain(int cpu)
 		id = cpumask_first(sched_domain_span(sd));
 		size = cpumask_weight(sched_domain_span(sd));
 		sds = sd->shared;
+#ifdef CONFIG_SCHED_STEAL
 		cfs_overload_cpus = sds->cfs_overload_cpus;
+#endif
 	}
 
+#ifdef CONFIG_SCHED_STEAL
 	rcu_assign_pointer(rq->cfs_overload_cpus, cfs_overload_cpus);
+#endif
 	rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
 	per_cpu(sd_llc_size, cpu) = size;
 	per_cpu(sd_llc_id, cpu) = id;
@@ -1336,6 +1350,7 @@ static void init_numa_topology_type(void)
 	}
 }
 
+#ifdef CONFIG_SCHED_STEAL
 DEFINE_STATIC_KEY_TRUE(sched_steal_allow);
 static int sched_steal_node_limit;
 #define SCHED_STEAL_NODE_LIMIT_DEFAULT	2
@@ -1359,6 +1374,9 @@ static void check_node_limit(void)
 		pr_debug("Suppressing sched STEAL. To enable, reboot with sched_steal_node_limit=%d", n);
 	}
 }
+#else
+static inline void check_node_limit(void) { }
+#endif /* CONFIG_SCHED_STEAL */
 
 void sched_init_numa(void)
 {
@@ -1641,6 +1659,7 @@ static void __sdt_free(const struct cpumask *cpu_map)
 	}
 }
 
+#ifdef CONFIG_SCHED_STEAL
 static int sd_llc_alloc(struct sched_domain *sd)
 {
 	struct sched_domain_shared *sds = sd->shared;
@@ -1712,6 +1731,7 @@ static void sd_llc_free_all(const struct cpumask *cpu_map)
 		}
 	}
 }
+#endif
 
 static struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
 		const struct cpumask *cpu_map, struct sched_domain_attr *attr,