From: Steve Sistare <steven.sistare@oracle.com>
maillist inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I8PIYZ
CVE: NA
Reference: https://lore.kernel.org/lkml/1541767840-93588-11-git-send-email-steven.sista...
---------------------------
Add schedstats to measure the effectiveness of searching for idle CPUs and stealing tasks. This is a temporary patch intended for use during development only. SCHEDSTAT_VERSION is bumped to 16, and the following fields are added to the per-CPU statistics of /proc/schedstat:
field 10: # of times select_idle_sibling "easily" found an idle CPU -- target, prev, or recent_used_cpu is idle, or a per-cpu kthread wakeup stacks on prev (found_idle_cpu_easy).
field 11: # of times select_idle_sibling returned through the select_idle_capacity() search, counted whether or not that search found an idle CPU (found_idle_cpu_capacity).
field 12: # of times select_idle_sibling searched with select_idle_cpu() and found an idle CPU (found_idle_cpu).
field 13: # of times select_idle_sibling failed to find anything idle (nofound_idle_cpu).
field 14: time in nanoseconds spent in functions that search for idle CPUs and search for tasks to steal (find_time).
field 15: # of times an idle CPU steals a task from another CPU (steal).
field 16: # of times try_steal finds overloaded CPUs but no task is migratable (steal_fail).
Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
Signed-off-by: Cheng Yu <serein.chengyu@huawei.com>
---
 kernel/sched/core.c  | 31 +++++++++++++++++++++++--
 kernel/sched/fair.c  | 54 ++++++++++++++++++++++++++++++++++++++++----
 kernel/sched/sched.h |  9 ++++++++
 kernel/sched/stats.c | 11 ++++++++-
 kernel/sched/stats.h | 13 +++++++++++
 5 files changed, 111 insertions(+), 7 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a1c73dea1f77..6ebc650b5bcc 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4609,17 +4609,44 @@ static int sysctl_numa_balancing(struct ctl_table *table, int write,
DEFINE_STATIC_KEY_FALSE(sched_schedstats);
+unsigned long schedstat_skid; + +static void compute_skid(void) +{ + int i, n = 0; + s64 t; + int skid = 0; + + for (i = 0; i < 100; i++) { + t = local_clock(); + t = local_clock() - t; + if (t > 0 && t < 1000) { /* only use sane samples */ + skid += (int) t; + n++; + } + } + + if (n > 0) + schedstat_skid = skid / n; + else + schedstat_skid = 0; + pr_info("schedstat_skid = %lu\n", schedstat_skid); +} + static void set_schedstats(bool enabled) { - if (enabled) + if (enabled) { + compute_skid(); static_branch_enable(&sched_schedstats); - else + } else { static_branch_disable(&sched_schedstats); + } }
void force_schedstat_enabled(void) { if (!schedstat_enabled()) { + compute_skid(); pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n"); static_branch_enable(&sched_schedstats); } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 949c494d00a5..f0516f1cf60e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5084,29 +5084,35 @@ static inline bool steal_enabled(void) static void overload_clear(struct rq *rq) { struct sparsemask *overload_cpus; + unsigned long time;
if (!steal_enabled()) return;
+ time = schedstat_start_time(); rcu_read_lock(); overload_cpus = rcu_dereference(rq->cfs_overload_cpus); if (overload_cpus) sparsemask_clear_elem(overload_cpus, rq->cpu); rcu_read_unlock(); + schedstat_end_time(rq->find_time, time); }
static void overload_set(struct rq *rq) { struct sparsemask *overload_cpus; + unsigned long time;
if (!steal_enabled()) return;
+ time = schedstat_start_time(); rcu_read_lock(); overload_cpus = rcu_dereference(rq->cfs_overload_cpus); if (overload_cpus) sparsemask_set_elem(overload_cpus, rq->cpu); rcu_read_unlock(); + schedstat_end_time(rq->find_time, time); }
static int try_steal(struct rq *this_rq, struct rq_flags *rf); @@ -7546,6 +7552,16 @@ static inline bool asym_fits_cpu(unsigned long util, return true; }
+#define SET_STAT(STAT) \ + do { \ + if (schedstat_enabled()) { \ + struct rq *rq = this_rq(); \ + \ + if (rq) \ + __schedstat_inc(rq->STAT); \ + } \ + } while (0) + /* * Try and locate an idle core/thread in the LLC cache domain. */ @@ -7573,16 +7589,20 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) lockdep_assert_irqs_disabled();
if ((available_idle_cpu(target) || sched_idle_cpu(target)) && - asym_fits_cpu(task_util, util_min, util_max, target)) + asym_fits_cpu(task_util, util_min, util_max, target)) { + SET_STAT(found_idle_cpu_easy); return target; + }
/* * If the previous CPU is cache affine and idle, don't be stupid: */ if (prev != target && cpus_share_cache(prev, target) && (available_idle_cpu(prev) || sched_idle_cpu(prev)) && - asym_fits_cpu(task_util, util_min, util_max, prev)) + asym_fits_cpu(task_util, util_min, util_max, prev)) { + SET_STAT(found_idle_cpu_easy); return prev; + }
/* * Allow a per-cpu kthread to stack with the wakee if the @@ -7597,6 +7617,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) prev == smp_processor_id() && this_rq()->nr_running <= 1 && asym_fits_cpu(task_util, util_min, util_max, prev)) { + SET_STAT(found_idle_cpu_easy); return prev; }
@@ -7609,6 +7630,11 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) && cpumask_test_cpu(recent_used_cpu, p->cpus_ptr) && asym_fits_cpu(task_util, util_min, util_max, recent_used_cpu)) { + /* + * Replace recent_used_cpu with prev as it is a potential + * candidate for the next wake: + */ + SET_STAT(found_idle_cpu_easy); return recent_used_cpu; }
@@ -7628,13 +7654,16 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) */ if (sd) { i = select_idle_capacity(p, sd, target); + SET_STAT(found_idle_cpu_capacity); return ((unsigned)i < nr_cpumask_bits) ? i : target; } }
sd = rcu_dereference(per_cpu(sd_llc, target)); - if (!sd) + if (!sd) { + SET_STAT(nofound_idle_cpu); return target; + }
if (sched_smt_active()) { has_idle_core = test_idle_cores(target); @@ -7647,9 +7676,12 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) }
i = select_idle_cpu(p, sd, has_idle_core, target); - if ((unsigned)i < nr_cpumask_bits) + if ((unsigned)i < nr_cpumask_bits) { + SET_STAT(found_idle_cpu); return i; + }
+ SET_STAT(nofound_idle_cpu); return target; }
@@ -8156,6 +8188,7 @@ static int select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) { int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING); + unsigned long time; struct sched_domain *tmp, *sd = NULL; int cpu = smp_processor_id(); int new_cpu = prev_cpu; @@ -8163,6 +8196,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) /* SD_flags and WF_flags share the first nibble */ int sd_flag = wake_flags & 0xF;
+ time = schedstat_start_time(); + /* * required for stable ->cpus_allowed */ @@ -8218,6 +8253,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) new_cpu = select_idle_sibling(p, prev_cpu, new_cpu); } rcu_read_unlock(); + schedstat_end_time(cpu_rq(cpu)->find_time, time);
return new_cpu; } @@ -8680,6 +8716,7 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf struct sched_entity *se; struct task_struct *p; int new_tasks; + unsigned long time;
again: if (!sched_fair_runnable(rq)) @@ -8810,6 +8847,8 @@ done: __maybe_unused; if (!rf) return NULL;
+ time = schedstat_start_time(); + /* * We must set idle_stamp _before_ calling try_steal() or * idle_balance(), such that we measure the duration as idle time. @@ -8823,6 +8862,8 @@ done: __maybe_unused; if (new_tasks) rq_idle_stamp_clear(rq);
+ schedstat_end_time(rq->find_time, time); +
/* * Because try_steal() and idle_balance() release (and re-acquire) @@ -13007,6 +13048,7 @@ static int steal_from(struct rq *dst_rq, struct rq_flags *dst_rf, bool *locked, update_rq_clock(dst_rq); attach_task(dst_rq, p); stolen = 1; + schedstat_inc(dst_rq->steal); } local_irq_restore(rf.flags);
@@ -13031,6 +13073,7 @@ static int try_steal(struct rq *dst_rq, struct rq_flags *dst_rf) int dst_cpu = dst_rq->cpu; bool locked = true; int stolen = 0; + bool any_overload = false; struct sparsemask *overload_cpus;
if (!steal_enabled()) @@ -13073,6 +13116,7 @@ static int try_steal(struct rq *dst_rq, struct rq_flags *dst_rf) stolen = 1; goto out; } + any_overload = true; }
out: @@ -13084,6 +13128,8 @@ static int try_steal(struct rq *dst_rq, struct rq_flags *dst_rf) stolen |= (dst_rq->cfs.h_nr_running > 0); if (dst_rq->nr_running != dst_rq->cfs.h_nr_running) stolen = -1; + if (!stolen && any_overload) + schedstat_inc(dst_rq->steal_fail); return stolen; }
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 696de660c50a..d0aa3dbba60a 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1138,6 +1138,15 @@ struct rq { /* try_to_wake_up() stats */ unsigned int ttwu_count; unsigned int ttwu_local; + + /* Idle search stats */ + unsigned int found_idle_cpu_capacity; + unsigned int found_idle_cpu; + unsigned int found_idle_cpu_easy; + unsigned int nofound_idle_cpu; + unsigned long find_time; + unsigned int steal; + unsigned int steal_fail; #endif
#ifdef CONFIG_CPU_IDLE diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c index 857f837f52cb..ee43764a563e 100644 --- a/kernel/sched/stats.c +++ b/kernel/sched/stats.c @@ -113,7 +113,7 @@ void __update_stats_enqueue_sleeper(struct rq *rq, struct task_struct *p, * Bump this up when changing the output format or the meaning of an existing * format, so that tools can adapt (or abort) */ -#define SCHEDSTAT_VERSION 15 +#define SCHEDSTAT_VERSION 16
static int show_schedstat(struct seq_file *seq, void *v) { @@ -140,6 +140,15 @@ static int show_schedstat(struct seq_file *seq, void *v) rq->rq_cpu_time, rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount);
+ seq_printf(seq, " %u %u %u %u %lu %u %u", + rq->found_idle_cpu_easy, + rq->found_idle_cpu_capacity, + rq->found_idle_cpu, + rq->nofound_idle_cpu, + rq->find_time, + rq->steal, + rq->steal_fail); + seq_printf(seq, "\n");
#ifdef CONFIG_SMP diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 38f3698f5e5b..e08a0bc77b3f 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -43,6 +43,17 @@ rq_sched_info_dequeue(struct rq *rq, unsigned long long delta) #define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0) #define schedstat_val(var) (var) #define schedstat_val_or_zero(var) ((schedstat_enabled()) ? (var) : 0) +#define schedstat_start_time() schedstat_val_or_zero(local_clock()) +#define schedstat_end_time(stat, time) \ + do { \ + unsigned long endtime; \ + \ + if (schedstat_enabled() && (time)) { \ + endtime = local_clock() - (time) - schedstat_skid; \ + schedstat_add((stat), endtime); \ + } \ + } while (0) +extern unsigned long schedstat_skid;
void __update_stats_wait_start(struct rq *rq, struct task_struct *p, struct sched_statistics *stats); @@ -87,6 +98,8 @@ static inline void rq_sched_info_depart (struct rq *rq, unsigned long long delt # define __update_stats_enqueue_sleeper(rq, p, stats) do { } while (0) # define check_schedstat_required() do { } while (0)
+# define schedstat_start_time() 0 +# define schedstat_end_time(stat, t) do { } while (0) #endif /* CONFIG_SCHEDSTATS */
#ifdef CONFIG_FAIR_GROUP_SCHED