From: Cheng Jian <cj.chengjian@huawei.com>
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I8PIYZ
CVE: NA
---------------------------
Introduce CONFIG_SCHED_STEAL to limit the impact of task stealing.
1) With CONFIG_SCHED_STEAL disabled, none of the changes take effect: the steal hooks become empty static inline functions, and the compiler optimizes the calls away.
2) With CONFIG_SCHED_STEAL enabled but the STEAL feature and schedstats disabled, the schedstat checks add a small amount of overhead, but the effect on performance is negligible. This will be our default choice.
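To make case 1) concrete: the disabled build relies on the usual empty-stub pattern (see the try_steal()/overload_set()/overload_clear() stubs in the fair.c hunk). When the option is off, each hook collapses to an empty static inline and the compiler drops the call site. Below is a minimal userspace sketch of that pattern, not kernel code; struct rq and overload_set() here are simplified stand-ins, and CONFIG_SCHED_STEAL is reused only as a -D define.

/*
 * Sketch of the empty-stub pattern used when CONFIG_SCHED_STEAL=n.
 * Illustration only; the real hooks live in kernel/sched/fair.c.
 */
#include <stdio.h>

struct rq { int cpu; };

#ifdef CONFIG_SCHED_STEAL
static void overload_set(struct rq *rq)
{
	printf("cpu %d marked overloaded\n", rq->cpu);
}
#else
/* Empty stub: at -O2 the call below compiles to nothing. */
static inline void overload_set(struct rq *rq) { }
#endif

int main(void)
{
	struct rq rq = { .cpu = 0 };

	overload_set(&rq);
	return 0;
}

Building it both ways (gcc -O2 -DCONFIG_SCHED_STEAL stub.c versus gcc -O2 stub.c) shows the call either doing work or disappearing entirely, which is the whole point of case 1).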
Signed-off-by: Cheng Jian <cj.chengjian@huawei.com>
Signed-off-by: Cheng Yu <serein.chengyu@huawei.com>
---
 include/linux/sched/topology.h |  2 ++
 init/Kconfig                   | 15 +++++++++++++++
 kernel/sched/core.c            |  4 ++++
 kernel/sched/fair.c            | 27 ++++++++++++++++++++++-----
 kernel/sched/features.h        |  2 ++
 kernel/sched/sched.h           |  8 ++++++++
 kernel/sched/stats.c           |  6 ++++++
 kernel/sched/stats.h           | 11 +++++++++--
 kernel/sched/topology.c        | 22 +++++++++++++++++++++-
 9 files changed, 89 insertions(+), 8 deletions(-)
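One note on the schedstat changes below: schedstat_end_time() now takes the runqueue instead of the raw counter (the counter selection moves into __schedstat_end_time()), and with CONFIG_SCHED_STEAL=n both schedstat_start_time() and schedstat_end_time() become no-ops. The body of __schedstat_end_time() is only partly visible in the stats.h hunk, so the following is a hedged userspace sketch of the timing pattern rather than a copy of the macros; schedstats_enabled, schedstat_skid, find_time and local_clock_ns() stand in for schedstat_enabled(), the kernel's schedstat_skid, rq->find_time and local_clock().

/*
 * Hedged sketch of the schedstat timing pattern; not the kernel macros.
 */
#include <stdio.h>
#include <time.h>

static int schedstats_enabled = 1;
static unsigned long schedstat_skid;	/* measured once, like compute_skid() */
static unsigned long find_time;		/* accumulated search time in ns */

static unsigned long local_clock_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (unsigned long)ts.tv_sec * 1000000000UL + ts.tv_nsec;
}

/* Returns 0 when stats are off, so the end side can skip all work. */
static unsigned long schedstat_start_time(void)
{
	return schedstats_enabled ? local_clock_ns() : 0;
}

static void schedstat_end_time(unsigned long *stat, unsigned long time)
{
	if (schedstats_enabled && time)
		*stat += local_clock_ns() - time - schedstat_skid;
}

int main(void)
{
	unsigned long t = schedstat_start_time();

	/* ...the timed section, e.g. searching for an idle CPU... */
	schedstat_end_time(&find_time, t);
	printf("accumulated find_time: %lu ns\n", find_time);
	return 0;
}

The skid subtraction mirrors the intent of compute_skid() in the core.c hunk, which measures the cost of back-to-back clock reads at boot so that the accumulated find_time reflects only the search itself.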
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 308daac94de0..9981f661189e 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -82,7 +82,9 @@ struct sched_domain_shared { atomic_t nr_busy_cpus; int has_idle_cores; int nr_idle_scan; +#ifdef CONFIG_SCHED_STEAL struct sparsemask *cfs_overload_cpus; +#endif };
struct sched_domain { diff --git a/init/Kconfig b/init/Kconfig index c8909ca8bb48..46ded522c787 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1315,6 +1315,21 @@ config NET_NS
endif # NAMESPACES
+config SCHED_STEAL + bool "Steal tasks to improve CPU utilization" + depends on SMP + default n + help + When a CPU has no more CFS tasks to run, and idle_balance() fails + to find a task, then attempt to steal a task from an overloaded + CPU in the same LLC. Maintain and use a bitmap of overloaded CPUs + to efficiently identify candidates. To minimize search time, steal + the first migratable task that is found when the bitmap is traversed. + For fairness, search for migratable tasks on an overloaded CPU in + order of next to run. + + If unsure, say N here. + config CHECKPOINT_RESTORE bool "Checkpoint/restore support" depends on PROC_FS diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2ba0fb8e460b..5dd2694da33a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4609,6 +4609,7 @@ static int sysctl_numa_balancing(struct ctl_table *table, int write,
DEFINE_STATIC_KEY_FALSE(sched_schedstats);
+#ifdef CONFIG_SCHED_STEAL unsigned long schedstat_skid;
static void compute_skid(void) @@ -4632,6 +4633,9 @@ static void compute_skid(void) schedstat_skid = 0; pr_info("schedstat_skid = %lu\n", schedstat_skid); } +#else +static inline void compute_skid(void) {} +#endif
static void set_schedstats(bool enabled) { diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b7cc9e3d1751..88d9db760366 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -62,7 +62,10 @@ #include <linux/resume_user_mode.h> #endif
+#ifdef CONFIG_SCHED_STEAL #include "sparsemask.h" +#endif + /* * The initial- and re-scaling of tunables is configurable * @@ -5091,6 +5094,8 @@ static inline void rq_idle_stamp_clear(struct rq *rq) rq->idle_stamp = 0; }
+#ifdef CONFIG_SCHED_STEAL + static inline bool steal_enabled(void) { #ifdef CONFIG_NUMA @@ -5115,7 +5120,7 @@ static void overload_clear(struct rq *rq) if (overload_cpus) sparsemask_clear_elem(overload_cpus, rq->cpu); rcu_read_unlock(); - schedstat_end_time(rq->find_time, time); + schedstat_end_time(rq, time); }
static void overload_set(struct rq *rq) @@ -5132,10 +5137,15 @@ static void overload_set(struct rq *rq) if (overload_cpus) sparsemask_set_elem(overload_cpus, rq->cpu); rcu_read_unlock(); - schedstat_end_time(rq->find_time, time); + schedstat_end_time(rq, time); }
static int try_steal(struct rq *this_rq, struct rq_flags *rf); +#else +static inline int try_steal(struct rq *this_rq, struct rq_flags *rf) { return 0; } +static inline void overload_clear(struct rq *rq) {} +static inline void overload_set(struct rq *rq) {} +#endif
#else /* CONFIG_SMP */
@@ -7592,6 +7602,7 @@ static inline bool asym_fits_cpu(unsigned long util, return true; }
+#ifdef CONFIG_SCHED_STEAL #define SET_STAT(STAT) \ do { \ if (schedstat_enabled()) { \ @@ -7601,6 +7612,9 @@ static inline bool asym_fits_cpu(unsigned long util, __schedstat_inc(rq->STAT); \ } \ } while (0) +#else +#define SET_STAT(STAT) +#endif
/* * Try and locate an idle core/thread in the LLC cache domain. @@ -8416,7 +8430,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) new_cpu = select_idle_sibling(p, prev_cpu, new_cpu); } rcu_read_unlock(); - schedstat_end_time(cpu_rq(cpu)->find_time, time); + schedstat_end_time(cpu_rq(cpu), time);
#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY if (idlest_cpu != -1 && !cpumask_test_cpu(new_cpu, p->select_cpus)) { @@ -9027,12 +9041,11 @@ done: __maybe_unused; new_tasks = newidle_balance(rq, rf); if (new_tasks == 0) new_tasks = try_steal(rq, rf); + schedstat_end_time(rq, time);
if (new_tasks) rq_idle_stamp_clear(rq);
- schedstat_end_time(rq->find_time, time); -
/* * Because try_steal() and idle_balance() release (and re-acquire) @@ -9540,6 +9553,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) return 0; }
+#ifdef CONFIG_SCHED_STEAL /* * Return true if task @p can migrate from @rq to @dst_rq in the same LLC. * No need to test for co-locality, and no need to test task_hot(), as sharing @@ -9567,6 +9581,7 @@ can_migrate_task_llc(struct task_struct *p, struct rq *rq, struct rq *dst_rq)
return true; } +#endif
/* * detach_task() -- detach the task for the migration from @src_rq to @dst_cpu. @@ -13167,6 +13182,7 @@ static int task_is_throttled_fair(struct task_struct *p, int cpu) static inline void task_tick_core(struct rq *rq, struct task_struct *curr) {} #endif
+#ifdef CONFIG_SCHED_STEAL /* * Search the runnable tasks in @cfs_rq in order of next to run, and find * the first one that can be migrated to @dst_rq. @cfs_rq is locked on entry. @@ -13315,6 +13331,7 @@ static int try_steal(struct rq *dst_rq, struct rq_flags *dst_rf) schedstat_inc(dst_rq->steal_fail); return stolen; } +#endif
/* * scheduler tick hitting a task of our scheduling class. diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 9895b17d82f0..e4789d09f58e 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -52,11 +52,13 @@ SCHED_FEAT(TTWU_QUEUE, true) SCHED_FEAT(SIS_PROP, false) SCHED_FEAT(SIS_UTIL, true)
+#ifdef CONFIG_SCHED_STEAL /* * Steal a CFS task from another CPU when going idle. * Improves CPU utilization. */ SCHED_FEAT(STEAL, false) +#endif
/* * Issue a WARN when we do multiple update_rq_clock() calls diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index d0aa3dbba60a..ba528b17f501 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -99,7 +99,9 @@
struct rq; struct cpuidle_state; +#ifdef CONFIG_SCHED_STEAL struct sparsemask; +#endif
/* task_struct::on_rq states: */ #define TASK_ON_RQ_QUEUED 1 @@ -1007,7 +1009,9 @@ struct rq { struct cfs_rq cfs; struct rt_rq rt; struct dl_rq dl; +#ifdef CONFIG_SCHED_STEAL struct sparsemask *cfs_overload_cpus; +#endif
#ifdef CONFIG_FAIR_GROUP_SCHED /* list of leaf cfs_rq on this CPU: */ @@ -1139,6 +1143,7 @@ struct rq { unsigned int ttwu_count; unsigned int ttwu_local;
+#ifdef CONFIG_SCHED_STEAL /* Idle search stats */ unsigned int found_idle_cpu_capacity; unsigned int found_idle_cpu; @@ -1147,6 +1152,7 @@ struct rq { unsigned long find_time; unsigned int steal; unsigned int steal_fail; +#endif /* CONFIG_SCHED_STEAL */ #endif
#ifdef CONFIG_CPU_IDLE @@ -1757,7 +1763,9 @@ this_rq_lock_irq(struct rq_flags *rf) }
#ifdef CONFIG_NUMA +#ifdef CONFIG_SCHED_STEAL extern struct static_key_true sched_steal_allow; +#endif
enum numa_topology_type { NUMA_DIRECT, diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c index ee43764a563e..306f26fde69a 100644 --- a/kernel/sched/stats.c +++ b/kernel/sched/stats.c @@ -113,7 +113,11 @@ void __update_stats_enqueue_sleeper(struct rq *rq, struct task_struct *p, * Bump this up when changing the output format or the meaning of an existing * format, so that tools can adapt (or abort) */ +#ifdef CONFIG_SCHED_STEAL #define SCHEDSTAT_VERSION 16 +#else +#define SCHEDSTAT_VERSION 15 +#endif
static int show_schedstat(struct seq_file *seq, void *v) { @@ -140,6 +144,7 @@ static int show_schedstat(struct seq_file *seq, void *v) rq->rq_cpu_time, rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount);
+#ifdef CONFIG_SCHED_STEAL seq_printf(seq, " %u %u %u %u %lu %u %u", rq->found_idle_cpu_easy, rq->found_idle_cpu_capacity, @@ -148,6 +153,7 @@ static int show_schedstat(struct seq_file *seq, void *v) rq->find_time, rq->steal, rq->steal_fail); +#endif /* CONFIG_SCHED_STEAL */
seq_printf(seq, "\n");
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index e08a0bc77b3f..4ccc1f120d67 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -43,8 +43,9 @@ rq_sched_info_dequeue(struct rq *rq, unsigned long long delta) #define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0) #define schedstat_val(var) (var) #define schedstat_val_or_zero(var) ((schedstat_enabled()) ? (var) : 0) +#ifdef CONFIG_SCHED_STEAL #define schedstat_start_time() schedstat_val_or_zero(local_clock()) -#define schedstat_end_time(stat, time) \ +#define __schedstat_end_time(stat, time) \ do { \ unsigned long endtime; \ \ @@ -53,7 +54,13 @@ rq_sched_info_dequeue(struct rq *rq, unsigned long long delta) schedstat_add((stat), endtime); \ } \ } while (0) +#define schedstat_end_time(rq, time) \ + __schedstat_end_time(((rq)->find_time), time) extern unsigned long schedstat_skid; +#else /* !CONFIG_SCHED_STEAL */ +# define schedstat_start_time() 0 +# define schedstat_end_time(rq, t) do { } while (0) +#endif /* CONFIG_SCHED_STEAL */
void __update_stats_wait_start(struct rq *rq, struct task_struct *p, struct sched_statistics *stats); @@ -99,7 +106,7 @@ static inline void rq_sched_info_depart (struct rq *rq, unsigned long long delt # define check_schedstat_required() do { } while (0)
# define schedstat_start_time() 0 -# define schedstat_end_time(stat, t) do { } while (0) +# define schedstat_end_time(rq, t) do { } while (0) #endif /* CONFIG_SCHEDSTATS */
#ifdef CONFIG_FAIR_GROUP_SCHED diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 61b0e90b13ab..9dd172be1d6b 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -5,7 +5,9 @@
#include <linux/bsearch.h> #include "sched.h" +#ifdef CONFIG_SCHED_STEAL #include "sparsemask.h" +#endif
DEFINE_MUTEX(sched_domains_mutex);
@@ -14,10 +16,16 @@ static cpumask_var_t sched_domains_tmpmask; static cpumask_var_t sched_domains_tmpmask2;
struct s_data; +#ifdef CONFIG_SCHED_STEAL static int sd_llc_alloc(struct sched_domain *sd); static void sd_llc_free(struct sched_domain *sd); static int sd_llc_alloc_all(const struct cpumask *cpu_map, struct s_data *d); static void sd_llc_free_all(const struct cpumask *cpu_map); +#else +static inline void sd_llc_free(struct sched_domain *sd) {} +static inline int sd_llc_alloc_all(const struct cpumask *cpu_map, struct s_data *d) { return 0; } +static inline void sd_llc_free_all(const struct cpumask *cpu_map) {} +#endif
#ifdef CONFIG_SCHED_DEBUG
@@ -684,9 +692,11 @@ DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity);
static void update_top_cache_domain(int cpu) { +#ifdef CONFIG_SCHED_STEAL + struct rq *rq = cpu_rq(cpu); struct sparsemask *cfs_overload_cpus = NULL; +#endif struct sched_domain_shared *sds = NULL; - struct rq *rq = cpu_rq(cpu); struct sched_domain *sd; int id = cpu; int size = 1; @@ -696,10 +706,14 @@ static void update_top_cache_domain(int cpu) id = cpumask_first(sched_domain_span(sd)); size = cpumask_weight(sched_domain_span(sd)); sds = sd->shared; +#ifdef CONFIG_SCHED_STEAL cfs_overload_cpus = sds->cfs_overload_cpus; +#endif }
+#ifdef CONFIG_SCHED_STEAL rcu_assign_pointer(rq->cfs_overload_cpus, cfs_overload_cpus); +#endif rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); per_cpu(sd_llc_size, cpu) = size; per_cpu(sd_llc_id, cpu) = id; @@ -1831,6 +1845,7 @@ static void init_numa_topology_type(int offline_node) sched_numa_topology_type = NUMA_DIRECT; }
+#ifdef CONFIG_SCHED_STEAL DEFINE_STATIC_KEY_TRUE(sched_steal_allow); static int sched_steal_node_limit; #define SCHED_STEAL_NODE_LIMIT_DEFAULT 2 @@ -1854,6 +1869,9 @@ static void check_node_limit(void) pr_debug("Suppressing sched STEAL. To enable, reboot with sched_steal_node_limit=%d", n); } } +#else +static inline void check_node_limit(void) { } +#endif /* CONFIG_SCHED_STEAL */
#define NR_DISTANCE_VALUES (1 << DISTANCE_BITS)
@@ -2326,6 +2344,7 @@ static void __sdt_free(const struct cpumask *cpu_map) } }
+#ifdef CONFIG_SCHED_STEAL static int sd_llc_alloc(struct sched_domain *sd) { struct sched_domain_shared *sds = sd->shared; @@ -2397,6 +2416,7 @@ static void sd_llc_free_all(const struct cpumask *cpu_map) } } } +#endif
static struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, const struct cpumask *cpu_map, struct sched_domain_attr *attr,