Introduce smart grid scheduler.
v5: The scheduling part has passed review and the code changes are done; the escape part still uses PELT. The memory-policy adjustment module has been modified according to the review comments.
v4: fix KABI; add 'cpu.affinity_domain_mask'; add some statistics.
v3: fixed the patchwork errors; one KABI issue remains to be resolved.
v2: first formal version.
Hui Tang (1): sched: Introduce smart grid scheduling strategy for cfs
Wang ShaoBo (1): sched: smart grid: init sched_grid_qos structure on QOS purpose
 fs/proc/array.c                |  13 +
 include/linux/sched.h          |  18 ++
 include/linux/sched/grid_qos.h |  84 +++++++
 include/linux/sched/sysctl.h   |   4 +
 init/Kconfig                   |  13 +
 kernel/fork.c                  |   9 +
 kernel/sched/Makefile          |   1 +
 kernel/sched/core.c            | 147 ++++++++++++
 kernel/sched/fair.c            | 425 ++++++++++++++++++++++++++++++++-
 kernel/sched/grid/Makefile     |   2 +
 kernel/sched/grid/internal.h   |   6 +
 kernel/sched/grid/power.c      |  10 +
 kernel/sched/grid/qos.c        | 125 ++++++++++
 kernel/sched/grid/stat.c       |  15 ++
 kernel/sched/sched.h           |  50 ++++
 kernel/sysctl.c                |   9 +
 mm/mempolicy.c                 |  12 +-
 17 files changed, 940 insertions(+), 3 deletions(-)
 create mode 100644 include/linux/sched/grid_qos.h
 create mode 100644 kernel/sched/grid/Makefile
 create mode 100644 kernel/sched/grid/internal.h
 create mode 100644 kernel/sched/grid/power.c
 create mode 100644 kernel/sched/grid/qos.c
 create mode 100644 kernel/sched/grid/stat.c
From: Hui Tang <tanghui20@huawei.com>
We want to dynamically expand or shrink the affinity range of tasks based on the CPU topology level, while still meeting their minimum resource requirements.
We divide affinity domains into several levels according to the sched domains:
level4 * SOCKET  [                                    ]
level3 * DIE     [                             ]
level2 * MC      [             ] [             ]
level1 * SMT     [     ] [     ] [     ] [     ]
level0 * CPU      0   1   2   3   4   5   6   7
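For illustration, the per-level cpumasks are obtained by walking a CPU's sched-domain hierarchy, which is what init_affinity_domains() in this series does. The following is a minimal sketch of that idea, with locking, error handling and the AD_LEVEL_MAX bound check omitted:

/*
 * Sketch (not part of the patch): build one cpumask per affinity level
 * by walking the sched-domain hierarchy of a CPU, as
 * init_affinity_domains() does below.  Caller holds rcu_read_lock();
 * 'domains[]' is assumed to hold enough preallocated masks.
 */
static int collect_affinity_levels(int cpu, struct cpumask **domains)
{
	struct sched_domain *sd;
	int level = 0;

	for_each_domain(cpu, sd) {		/* SMT -> MC -> DIE -> SOCKET */
		cpumask_copy(domains[level], sched_domain_span(sd));
		level++;
	}

	return level;				/* number of levels actually present */
}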
Whether users prefer power saving or performance affects the strategy for adjusting affinity. When the power-saving mode is selected, we choose a more appropriate affinity based on the energy model to reduce power consumption, while still considering the QOS of resources such as CPU and memory. For instance, if the current CPU load of a task group is lower than required, smart grid consults the energy model to decide whether to aggregate the tasks into a smaller range.
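Concretely, the periodic decision in this series (sched_auto_affi_period_timer() below) boils down to a utilization check over the current level's CPU span. A simplified sketch, with the hrtimer plumbing and locking omitted (sysctl_sched_util_low_pct comes from the dynamic-affinity feature this depends on):

/*
 * Sketch of the periodic expand/shrink decision: compare the summed
 * utilization of the current affinity level against a percentage of
 * the level's capacity.
 */
static void affinity_decide(struct task_group *tg, const struct cpumask *span)
{
	unsigned long util_sum = 0, cap_sum = 0;
	int cpu;

	for_each_cpu(cpu, span) {
		util_sum += cpu_util(cpu);
		cap_sum  += capacity_of(cpu);
	}

	if (util_sum * 100 > cap_sum * sysctl_sched_util_low_pct)
		affinity_domain_up(tg);		/* busy: expand to the next level */
	else if (util_sum * 100 < cap_sum * sysctl_sched_util_low_pct / 2)
		affinity_domain_down(tg);	/* idle enough: shrink a level */
}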
The main difference from EAS is that we pay more attention to the power impact of mechanisms such as cpuidle and DVFS, and we classify tasks to reduce interference and guarantee resource QOS within each divided unit, which is better suited to general-purpose workloads on non-heterogeneous CPUs.
    --------        --------        --------
   | group0 |      | group1 |      | group2 |
    --------        --------        --------
       |                |                |
       v                |                v
 ----------------------+----        -----------------
|                    ---v--  |     |                 |
|       DIE0        | MC1  | |     |      DIE1       |
|                    ------  |     |                 |
 ----------------------------       -----------------
We periodically measure each group's resource satisfaction and adjust its affinity accordingly; scheduling balance and memory migration are then considered based on memory location to better meet resource requirements.
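When the level changes, the new cpumask is propagated to every task in the group. The following sketch follows tg_update_task_prefer_cpus() from the patch, with the root-task-group special case omitted:

/*
 * Sketch: propagate the cpumask of the current affinity level to every
 * task in the group; the dynamic-affinity code then uses it during
 * wakeup CPU selection.
 */
static void propagate_affinity_level(struct task_group *tg)
{
	struct affinity_domain *ad = &tg->auto_affinity->ad;
	struct css_task_iter it;
	struct task_struct *task;

	css_task_iter_start(&tg->css, 0, &it);
	while ((task = css_task_iter_next(&it)))
		set_prefer_cpus_ptr(task, ad->domains[ad->curr_level]);
	css_task_iter_end(&it);
}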
Signed-off-by: Hui Tang <tanghui20@huawei.com>
Signed-off-by: Wang ShaoBo <bobo.shaobowang@huawei.com>
---
 fs/proc/array.c              |  13 ++
 include/linux/sched.h        |   9 +
 include/linux/sched/sysctl.h |   4 +
 init/Kconfig                 |  13 ++
 kernel/sched/core.c          | 147 ++++++++++++
 kernel/sched/fair.c          | 420 ++++++++++++++++++++++++++++++++++-
 kernel/sched/sched.h         |  50 +++++
 kernel/sysctl.c              |   9 +
 8 files changed, 663 insertions(+), 2 deletions(-)
diff --git a/fs/proc/array.c b/fs/proc/array.c index 9eb99a43f849a..fd56e15b3463f 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -386,6 +386,16 @@ static void task_cpus_allowed(struct seq_file *m, struct task_struct *task) cpumask_pr_args(&task->cpus_allowed)); }
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY +static void task_cpus_preferred(struct seq_file *m, struct task_struct *task) +{ + seq_printf(m, "Cpus_preferred:\t%*pb\n", + cpumask_pr_args(task->prefer_cpus)); + seq_printf(m, "Cpus_preferred_list:\t%*pbl\n", + cpumask_pr_args(task->prefer_cpus)); +} +#endif + static inline void task_core_dumping(struct seq_file *m, struct mm_struct *mm) { seq_put_decimal_ull(m, "CoreDumping:\t", !!mm->core_state); @@ -414,6 +424,9 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, task_cpus_allowed(m, task); cpuset_task_status_allowed(m, task); task_context_switch_counts(m, task); +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + task_cpus_preferred(m, task); +#endif return 0; }
diff --git a/include/linux/sched.h b/include/linux/sched.h index 928186f161000..8346294d25838 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2000,4 +2000,13 @@ int sched_prefer_cpus_fork(struct task_struct *p, struct task_struct *orig); void sched_prefer_cpus_free(struct task_struct *p); #endif
+#ifdef CONFIG_QOS_SCHED_SMART_GRID +bool smart_grid_used(void); +#else +static inline bool smart_grid_used(void) +{ + return false; +} +#endif + #endif diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 04eb5b127867b..ad472760e97d7 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -36,6 +36,10 @@ extern unsigned int sysctl_sched_child_runs_first; extern int sysctl_sched_util_low_pct; #endif
+#ifdef CONFIG_QOS_SCHED_SMART_GRID +extern int sysctl_affinity_adjust_delay_ms; +#endif + enum sched_tunable_scaling { SCHED_TUNABLESCALING_NONE, SCHED_TUNABLESCALING_LOG, diff --git a/init/Kconfig b/init/Kconfig index a2a733080fe11..ef1768d36a050 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -834,6 +834,19 @@ config QOS_SCHED_DYNAMIC_AFFINITY of taskgroup is below threshold setted, otherwise make taskgroup to use cpus allowed.
+config QOS_SCHED_SMART_GRID + bool "qos smart grid scheduler" + depends on FAIR_GROUP_SCHED && QOS_SCHED_DYNAMIC_AFFINITY + default n + help + This feature is used for power consumption tuning in server scenario. + This can be divided into the following aspects: + 1. User interface, manage user needs. + 2. Collect tasks' features to ensure key tasks' QOS. + 3. Weaken the influence the impact of CPU frequency and cpuidle + adjustment on tasks. + 4. Docking EAS (Energy Aware Scheduling) model. + config CGROUP_PIDS bool "PIDs controller" help diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 168e98c6d51a7..00f5bae016f8f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5842,6 +5842,7 @@ int sched_cpu_activate(unsigned int cpu) static_branch_inc_cpuslocked(&sched_smt_present); #endif set_cpu_active(cpu, true); + tg_update_affinity_domains(cpu, 1);
if (sched_smp_initialized) { sched_domains_numa_masks_set(cpu); @@ -5900,6 +5901,7 @@ int sched_cpu_deactivate(unsigned int cpu) return ret; } sched_domains_numa_masks_clear(cpu); + tg_update_affinity_domains(cpu, 0); return 0; }
@@ -5970,6 +5972,8 @@ void __init sched_init_smp(void) init_sched_dl_class();
sched_smp_initialized = true; + + init_auto_affinity(&root_task_group); }
static int __init migration_init(void) @@ -6530,6 +6534,9 @@ void sched_move_task(struct task_struct *tsk) DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; struct rq_flags rf; struct rq *rq; +#ifdef CONFIG_QOS_SCHED_SMART_GRID + struct affinity_domain *ad; +#endif
rq = task_rq_lock(tsk, &rf); update_rq_clock(rq); @@ -6550,6 +6557,14 @@ void sched_move_task(struct task_struct *tsk) set_curr_task(rq, tsk);
task_rq_unlock(rq, tsk, &rf); + +#ifdef CONFIG_QOS_SCHED_SMART_GRID + if (task_group(tsk) != &root_task_group) { + ad = &task_group(tsk)->auto_affinity->ad; + set_prefer_cpus_ptr(tsk, ad->domains[ad->curr_level]); + } +#endif + }
static inline struct task_group *css_tg(struct cgroup_subsys_state *css) @@ -6969,6 +6984,117 @@ static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css, } #endif /* CONFIG_RT_GROUP_SCHED */
+#ifdef CONFIG_QOS_SCHED_SMART_GRID +int tg_set_dynamic_affinity_mode(struct task_group *tg, u64 mode) +{ + struct auto_affinity *auto_affi = tg->auto_affinity; + int ret = 0; + + raw_spin_lock_irq(&auto_affi->lock); + + /* auto mode*/ + if (mode == 1) { + start_auto_affinity(auto_affi); + } else if (mode == 0) { + stop_auto_affinity(auto_affi); + } else { + raw_spin_unlock_irq(&auto_affi->lock); + return -EINVAL; + } + + auto_affi->mode = mode; + raw_spin_unlock_irq(&auto_affi->lock); + + return ret; +} + +static u64 cpu_affinity_mode_read_u64(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + struct task_group *tg = css_tg(css); + + return tg->auto_affinity->mode; +} + +static int cpu_affinity_mode_write_u64(struct cgroup_subsys_state *css, + struct cftype *cftype, u64 mode) +{ + return tg_set_dynamic_affinity_mode(css_tg(css), mode); +} + +int tg_set_affinity_period(struct task_group *tg, u64 period_ms) +{ + if (period_ms > U64_MAX / NSEC_PER_MSEC) + return -EINVAL; + + raw_spin_lock_irq(&tg->auto_affinity->lock); + tg->auto_affinity->period = ms_to_ktime(period_ms); + raw_spin_unlock_irq(&tg->auto_affinity->lock); + return 0; +} + +u64 tg_get_affinity_period(struct task_group *tg) +{ + return ktime_to_ms(tg->auto_affinity->period); +} + +static int cpu_affinity_period_write_uint(struct cgroup_subsys_state *css, + struct cftype *cftype, u64 period) +{ + return tg_set_affinity_period(css_tg(css), period); +} + +static u64 cpu_affinity_period_read_uint(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return tg_get_affinity_period(css_tg(css)); +} + +static int cpu_affinity_domain_mask_write_u64(struct cgroup_subsys_state *css, + struct cftype *cftype, + u64 mask) +{ + struct task_group *tg = css_tg(css); + struct affinity_domain *ad = &tg->auto_affinity->ad; + u16 full = (1 << ad->dcount) - 1; + + if (mask > full) + return -EINVAL; + + raw_spin_lock_irq(&tg->auto_affinity->lock); + ad->domain_mask = mask; + raw_spin_unlock_irq(&tg->auto_affinity->lock); + return 0; +} + +static u64 cpu_affinity_domain_mask_read_u64(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + struct task_group *tg = css_tg(css); + + return tg->auto_affinity->ad.domain_mask; +} + +static int cpu_affinity_stat_show(struct seq_file *sf, void *v) +{ + struct task_group *tg = css_tg(seq_css(sf)); + struct auto_affinity *auto_affi = tg->auto_affinity; + struct affinity_domain *ad = &auto_affi->ad; + int i; + + seq_printf(sf, "period_active %d\n", auto_affi->period_active); + seq_printf(sf, "dcount %d\n", ad->dcount); + seq_printf(sf, "domain_mask 0x%x\n", ad->domain_mask); + seq_printf(sf, "curr_level %d\n", ad->curr_level); + for (i = 0; i < ad->dcount; i++) + seq_printf(sf, "sd_level %d, cpu list %*pbl, stay_cnt %llu\n", + i, cpumask_pr_args(ad->domains[i]), + schedstat_val(ad->stay_cnt[i])); + + return 0; +} +#endif /* CONFIG_QOS_SCHED_SMART_GRID */ + #ifdef CONFIG_QOS_SCHED static int tg_change_scheduler(struct task_group *tg, void *data) { @@ -7073,6 +7199,27 @@ static struct cftype cpu_legacy_files[] = { .read_s64 = cpu_qos_read, .write_s64 = cpu_qos_write, }, +#endif +#ifdef CONFIG_QOS_SCHED_SMART_GRID + { + .name = "dynamic_affinity_mode", + .read_u64 = cpu_affinity_mode_read_u64, + .write_u64 = cpu_affinity_mode_write_u64, + }, + { + .name = "affinity_period_ms", + .read_u64 = cpu_affinity_period_read_uint, + .write_u64 = cpu_affinity_period_write_uint, + }, + { + .name = "affinity_domain_mask", + .read_u64 = cpu_affinity_domain_mask_read_u64, + .write_u64 = 
cpu_affinity_domain_mask_write_u64, + }, + { + .name = "affinity_stat", + .seq_show = cpu_affinity_stat_show, + }, #endif { } /* Terminate */ }; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 77a49eae2cddc..4ad4206d36ac7 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5281,6 +5281,417 @@ static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
#endif /* CONFIG_CFS_BANDWIDTH */
+#ifdef CONFIG_QOS_SCHED_SMART_GRID +#define AUTO_AFFINITY_DEFAULT_PERIOD_MS 5000 +#define IS_DOMAIN_SET(level, mask) ((1 << (level)) & (mask)) + +static inline unsigned long cpu_util(int cpu); +static unsigned long target_load(int cpu, int type); +static unsigned long capacity_of(int cpu); +static int sched_idle_cpu(int cpu); +static unsigned long weighted_cpuload(struct rq *rq); + +int sysctl_affinity_adjust_delay_ms = 5000; + +static struct static_key __smart_grid_used; + +bool smart_grid_used(void) +{ + return static_key_false(&__smart_grid_used); +} + +void smart_grid_usage_inc(void) +{ + static_key_slow_inc_cpuslocked(&__smart_grid_used); +} + +void smart_grid_usage_dec(void) +{ + static_key_slow_dec_cpuslocked(&__smart_grid_used); +} + +static inline void tg_update_task_prefer_cpus(struct task_group *tg) +{ + struct affinity_domain *ad = &tg->auto_affinity->ad; + struct task_struct *task; + struct css_task_iter it; + + css_task_iter_start(&tg->css, 0, &it); + while ((task = css_task_iter_next(&it))) { + if (tg == &root_task_group && !task->mm) + continue; + + set_prefer_cpus_ptr(task, ad->domains[ad->curr_level]); + } + css_task_iter_end(&it); +} + +static void affinity_domain_up(struct task_group *tg) +{ + struct affinity_domain *ad = &tg->auto_affinity->ad; + u16 level = ad->curr_level; + + if (ad->curr_level >= ad->dcount - 1) + return; + + while (level < ad->dcount) { + if (IS_DOMAIN_SET(level + 1, ad->domain_mask)) { + ad->curr_level++; + break; + } + level++; + } + + if (level == ad->dcount) + return; + + tg_update_task_prefer_cpus(tg); +} + +static void affinity_domain_down(struct task_group *tg) +{ + struct affinity_domain *ad = &tg->auto_affinity->ad; + u16 level = ad->curr_level; + + if (ad->curr_level <= 0) + return; + + while (level > 0) { + if (IS_DOMAIN_SET(level - 1, ad->domain_mask)) { + ad->curr_level--; + break; + } + level--; + } + + if (!level) + return; + + tg_update_task_prefer_cpus(tg); +} + +static enum hrtimer_restart sched_auto_affi_period_timer(struct hrtimer *timer) +{ + struct auto_affinity *auto_affi = + container_of(timer, struct auto_affinity, period_timer); + struct task_group *tg = auto_affi->tg; + struct affinity_domain *ad = &auto_affi->ad; + struct cpumask *span = ad->domains[ad->curr_level]; + unsigned long util_avg_sum = 0; + unsigned long tg_capacity = 0; + unsigned long flags; + int cpu; + + for_each_cpu(cpu, span) { + util_avg_sum += cpu_util(cpu); + tg_capacity += capacity_of(cpu); + } + + if (unlikely(!tg_capacity)) + return HRTIMER_RESTART; + + raw_spin_lock_irqsave(&auto_affi->lock, flags); + if (util_avg_sum * 100 > tg_capacity * sysctl_sched_util_low_pct) { + affinity_domain_up(tg); + } else if (util_avg_sum * 100 < tg_capacity * + sysctl_sched_util_low_pct / 2) { + affinity_domain_down(tg); + } + + schedstat_inc(ad->stay_cnt[ad->curr_level]); + hrtimer_forward_now(timer, auto_affi->period); + raw_spin_unlock_irqrestore(&auto_affi->lock, flags); + return HRTIMER_RESTART; +} + +static int tg_update_affinity_domain_down(struct task_group *tg, void *data) +{ + struct auto_affinity *auto_affi = tg->auto_affinity; + struct affinity_domain *ad; + int *cpu_state = data; + unsigned long flags; + int i; + + if (!auto_affi) + return 0; + + ad = &tg->auto_affinity->ad; + raw_spin_lock_irqsave(&auto_affi->lock, flags); + + for (i = 0; i < ad->dcount; i++) { + if (!cpumask_test_cpu(cpu_state[0], ad->domains_orig[i])) + continue; + + /* online */ + if (cpu_state[1]) + cpumask_set_cpu(cpu_state[0], ad->domains[i]); + else + 
cpumask_clear_cpu(cpu_state[0], ad->domains[i]); + } + raw_spin_unlock_irqrestore(&auto_affi->lock, flags); + + tg_update_task_prefer_cpus(tg); + return 0; +} + +void tg_update_affinity_domains(int cpu, int online) +{ + int cpu_state[2]; + + cpu_state[0] = cpu; + cpu_state[1] = online; + + rcu_read_lock(); + walk_tg_tree(tg_update_affinity_domain_down, tg_nop, cpu_state); + rcu_read_unlock(); +} + +void start_auto_affinity(struct auto_affinity *auto_affi) +{ + struct task_group *tg = auto_affi->tg; + ktime_t delay_ms; + + if (auto_affi->period_active == 1) + return; + + tg_update_task_prefer_cpus(tg); + + auto_affi->period_active = 1; + delay_ms = ms_to_ktime(sysctl_affinity_adjust_delay_ms); + hrtimer_forward_now(&auto_affi->period_timer, delay_ms); + hrtimer_start_expires(&auto_affi->period_timer, + HRTIMER_MODE_ABS_PINNED); + smart_grid_usage_inc(); +} + +void stop_auto_affinity(struct auto_affinity *auto_affi) +{ + struct task_group *tg = auto_affi->tg; + struct affinity_domain *ad = &auto_affi->ad; + + if (auto_affi->period_active == 0) + return; + + hrtimer_cancel(&auto_affi->period_timer); + auto_affi->period_active = 0; + ad->curr_level = ad->dcount > 0 ? ad->dcount - 1 : 0; + + tg_update_task_prefer_cpus(tg); + smart_grid_usage_dec(); +} + +static struct sched_group *sd_find_idlest_group(struct sched_domain *sd) +{ + struct sched_group *idlest = NULL, *group = sd->groups; + unsigned long min_runnable_load = ULONG_MAX; + unsigned long min_avg_load = ULONG_MAX; + int imbalance_scale = 100 + (sd->imbalance_pct-100)/2; + unsigned long imbalance = scale_load_down(NICE_0_LOAD) * + (sd->imbalance_pct-100) / 100; + + do { + unsigned long load, avg_load, runnable_load; + int i; + + avg_load = 0; + runnable_load = 0; + + for_each_cpu(i, sched_group_span(group)) { + load = target_load(i, 0); + runnable_load += load; + avg_load += cfs_rq_load_avg(&cpu_rq(i)->cfs); + } + + avg_load = (avg_load * SCHED_CAPACITY_SCALE) / + group->sgc->capacity; + runnable_load = (runnable_load * SCHED_CAPACITY_SCALE) / + group->sgc->capacity; + + if (min_runnable_load > (runnable_load + imbalance)) { + min_runnable_load = runnable_load; + min_avg_load = avg_load; + idlest = group; + } else if ((runnable_load < (min_runnable_load + imbalance)) && + (100*min_avg_load > imbalance_scale*avg_load)) { + min_avg_load = avg_load; + idlest = group; + } + } while (group = group->next, group != sd->groups); + + return idlest ? 
idlest : group; +} + +static int group_find_idlest_cpu(struct sched_group *group) +{ + int least_loaded_cpu = cpumask_first(sched_group_span(group)); + unsigned long load, min_load = ULONG_MAX; + unsigned int min_exit_latency = UINT_MAX; + u64 latest_idle_timestamp = 0; + int shallowest_idle_cpu = -1; + int i; + + if (group->group_weight == 1) + return cpumask_first(sched_group_span(group)); + + for_each_cpu(i, sched_group_span(group)) { + if (sched_idle_cpu(i)) + return i; + + if (available_idle_cpu(i)) { + struct rq *rq = cpu_rq(i); + struct cpuidle_state *idle = idle_get_state(rq); + + if (idle && idle->exit_latency < min_exit_latency) { + min_exit_latency = idle->exit_latency; + latest_idle_timestamp = rq->idle_stamp; + shallowest_idle_cpu = i; + } else if ((!idle || + idle->exit_latency == min_exit_latency) && + rq->idle_stamp > latest_idle_timestamp) { + latest_idle_timestamp = rq->idle_stamp; + shallowest_idle_cpu = i; + } + } else if (shallowest_idle_cpu == -1) { + load = weighted_cpuload(cpu_rq(i)); + if (load < min_load) { + min_load = load; + least_loaded_cpu = i; + } + } + } + + return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : + least_loaded_cpu; +} + +void free_affinity_domains(struct affinity_domain *ad) +{ + int i; + + for (i = 0; i < ad->dcount; i++) { + kfree(ad->domains[i]); + ad->domains[i] = NULL; + } + ad->dcount = 0; +} + +static int init_affinity_domains_orig(struct affinity_domain *ad) +{ + int i, j; + + for (i = 0; i < ad->dcount; i++) { + ad->domains_orig[i] = kmalloc(sizeof(cpumask_t), GFP_KERNEL); + if (!ad->domains_orig[i]) + goto err; + + cpumask_copy(ad->domains_orig[i], ad->domains[i]); + } + + return 0; +err: + for (j = 0; j < i; j++) { + kfree(ad->domains_orig[j]); + ad->domains_orig[i] = NULL; + } + return -ENOMEM; +} + +static int init_affinity_domains(struct affinity_domain *ad) +{ + struct sched_domain *sd = NULL, *tmp; + struct sched_group *idlest = NULL; + int ret = -ENOMEM; + int dcount = 0; + int i = 0; + int cpu; + + rcu_read_lock(); + cpu = cpumask_first_and(cpu_active_mask, + housekeeping_cpumask(HK_FLAG_DOMAIN)); + for_each_domain(cpu, tmp) { + sd = tmp; + dcount++; + } + + if (!sd) { + ad->dcount = 0; + rcu_read_unlock(); + return -EINVAL; + } + rcu_read_unlock(); + + for (i = 0; i < dcount; i++) { + ad->domains[i] = kmalloc(sizeof(cpumask_t), GFP_KERNEL); + if (!ad->domains[i]) + goto err; + } + + rcu_read_lock(); + idlest = sd_find_idlest_group(sd); + cpu = group_find_idlest_cpu(idlest); + i = 0; + for_each_domain(cpu, tmp) { + cpumask_copy(ad->domains[i], sched_domain_span(tmp)); + __schedstat_set(ad->stay_cnt[i], 0); + } + rcu_read_unlock(); + + ad->dcount = dcount; + ad->curr_level = ad->dcount > 0 ? 
ad->dcount - 1 : 0; + ad->domain_mask = (1 << ad->dcount) - 1; + + ret = init_affinity_domains_orig(ad); + if (ret) + goto err; + + return 0; +err: + free_affinity_domains(ad); + return ret; +} + +int init_auto_affinity(struct task_group *tg) +{ + struct auto_affinity *auto_affi; + int ret; + + auto_affi = kmalloc(sizeof(*auto_affi), GFP_KERNEL); + if (!auto_affi) + return -ENOMEM; + + raw_spin_lock_init(&auto_affi->lock); + auto_affi->mode = 0; + auto_affi->period_active = 0; + auto_affi->period = ms_to_ktime(AUTO_AFFINITY_DEFAULT_PERIOD_MS); + hrtimer_init(&auto_affi->period_timer, CLOCK_MONOTONIC, + HRTIMER_MODE_ABS_PINNED); + auto_affi->period_timer.function = sched_auto_affi_period_timer; + + ret = init_affinity_domains(&auto_affi->ad); + if (ret) { + kfree(auto_affi); + return ret; + } + + auto_affi->tg = tg; + tg->auto_affinity = auto_affi; + return 0; +} + +static void destroy_auto_affinity(struct task_group *tg) +{ + struct auto_affinity *auto_affi = tg->auto_affinity; + + hrtimer_cancel(&auto_affi->period_timer); + free_affinity_domains(&auto_affi->ad); + + kfree(tg->auto_affinity); + tg->auto_affinity = NULL; +} +#else +static void destroy_auto_affinity(struct task_group *tg) {} +#endif + /************************************************** * CFS operations on tasks: */ @@ -6733,7 +7144,7 @@ static inline bool prefer_cpus_valid(struct task_struct *p) /* * set_task_select_cpus: select the cpu range for task * @p: the task whose available cpu range will to set - * @idlest_cpu: the cpu which is the idlest in prefer cpus + *uto_affinity_used @idlest_cpu: the cpu which is the idlest in prefer cpus * * If sum of 'util_avg' among 'preferred_cpus' lower than the percentage * 'sysctl_sched_util_low_pct' of 'preferred_cpus' capacity, select @@ -10928,6 +11339,7 @@ void free_fair_sched_group(struct task_group *tg) int i;
destroy_cfs_bandwidth(tg_cfs_bandwidth(tg)); + destroy_auto_affinity(tg);
for_each_possible_cpu(i) { #ifdef CONFIG_QOS_SCHED @@ -10948,7 +11360,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) { struct sched_entity *se; struct cfs_rq *cfs_rq; - int i; + int i, ret;
tg->cfs_rq = kcalloc(nr_cpu_ids, sizeof(cfs_rq), GFP_KERNEL); if (!tg->cfs_rq) @@ -10960,6 +11372,9 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) tg->shares = NICE_0_LOAD;
init_cfs_bandwidth(tg_cfs_bandwidth(tg)); + ret = init_auto_affinity(tg); + if (ret) + goto err;
for_each_possible_cpu(i) { cfs_rq = kzalloc_node(sizeof(struct cfs_rq), @@ -10982,6 +11397,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) err_free_rq: kfree(cfs_rq); err: + destroy_auto_affinity(tg); return 0; }
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index ae30681530938..02fe692a9e60e 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -361,6 +361,34 @@ struct cfs_bandwidth { #endif };
+ +#ifdef CONFIG_QOS_SCHED_SMART_GRID +#define AD_LEVEL_MAX 8 + +struct affinity_domain { + int dcount; + int curr_level; + u32 domain_mask; +#ifdef CONFIG_SCHEDSTATS + u64 stay_cnt[AD_LEVEL_MAX]; +#endif + struct cpumask *domains[AD_LEVEL_MAX]; + struct cpumask *domains_orig[AD_LEVEL_MAX]; +}; +#endif + +struct auto_affinity { +#ifdef CONFIG_QOS_SCHED_SMART_GRID + raw_spinlock_t lock; + u64 mode; + ktime_t period; + struct hrtimer period_timer; + int period_active; + struct affinity_domain ad; + struct task_group *tg; +#endif +}; + /* Task group related information */ struct task_group { struct cgroup_subsys_state css; @@ -407,7 +435,12 @@ struct task_group { #else KABI_RESERVE(1) #endif + +#if defined(CONFIG_QOS_SCHED_SMART_GRID) && !defined(__GENKSYMS__) + struct auto_affinity *auto_affinity; +#else KABI_RESERVE(2) +#endif };
#ifdef CONFIG_FAIR_GROUP_SCHED @@ -475,6 +508,23 @@ extern void sched_offline_group(struct task_group *tg);
extern void sched_move_task(struct task_struct *tsk);
+#ifdef CONFIG_QOS_SCHED_SMART_GRID +extern void start_auto_affinity(struct auto_affinity *auto_affi); +extern void stop_auto_affinity(struct auto_affinity *auto_affi); +extern void smart_grid_usage_inc(void); +extern void smart_grid_usage_dec(void); +extern int init_auto_affinity(struct task_group *tg); +extern void tg_update_affinity_domains(int cpu, int online); + +#else +static inline int init_auto_affinity(struct task_group *tg) +{ + return 0; +} + +static inline void tg_update_affinity_domains(int cpu, int online) {} +#endif + #ifdef CONFIG_FAIR_GROUP_SCHED extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index ad62ea156afd9..592440a2d5db0 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1336,6 +1336,15 @@ static struct ctl_table kern_table[] = { .extra1 = &zero, .extra2 = &one_hundred, }, +#endif +#ifdef CONFIG_QOS_SCHED_SMART_GRID + { + .procname = "affinity_adjust_delay_ms", + .data = &sysctl_affinity_adjust_delay_ms, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, #endif { } };
As the smart grid scheduler (SGS) may shrink resources and affect task QOS, we provide methods for evaluating task QOS within each divided grid. We mainly focus on the following two aspects:
1. Evaluate whether resources (such as CPU or memory) meet our demand.
2. Ensure the least impact when working with the cpufreq and cpuidle governors.
To tackle these questions, we have summarized several sampling methods that obtain tasks' characteristics while reducing scheduling noise as much as possible:
1. We detect how sensitive a process is to cpufreq or cpuidle adjustment, and use these key factors to guide the cpufreq/cpuidle governor.
2. We dynamically monitor process memory bandwidth and adjust memory allocation to minimize cross-node (remote) memory access (the node-mask seeding is sketched after the diagram below).
3. We provide a variety of load tracking mechanisms to adapt to different types of task load changes (a hypothetical sampling helper is sketched right after this list).
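For the sampling in items 1 and 3, each metric is kept in a small ring buffer (struct sched_grid_qos_ring_buffer in include/linux/sched/grid_qos.h below). This series only declares the push callback; the helper below is a hypothetical example of what such a callback could look like, not code from this patch:

/*
 * Hypothetical example (not in this series): one possible 'push'
 * implementation for struct sched_grid_qos_ring_buffer, appending
 * 'stepsize' samples and wrapping at SCHED_GRID_QOS_RING_BUFFER_MAXLEN.
 */
static void qos_sample_push(u64 *data, int stepsize,
			    struct sched_grid_qos_ring_buffer *rb)
{
	int i;

	for (i = 0; i < stepsize; i++) {
		rb->vecs[rb->head] = data[i];
		rb->head = (rb->head + 1) % SCHED_GRID_QOS_RING_BUFFER_MAXLEN;
	}
}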
 ---------------------------------      -----------------
|             class A             |    |     class B     |
|   --------        --------      |    |    --------     |
|  | group0 |      | group1 |     |----|   | group2 |    |----------+
|   --------        --------      |    |    --------     |          |
|    CPU/memory sensitive type    |    |   balance type  |          |
 ----------------+----------------      --------+--------           |
                 v                               v                  | (target cpufreq)
 -------------------------------------------------------            | (sensitivity)
|               Not satisfied with QOS?                  |           |
 --------------------------+-----------------------------           |
                           v                                         v
 -------------------------------------------------------       ----------------
|              expand or shrink resource                 |<--  |  energy model  |
 ---------------------------+----------------------------       ----------------
                            v                                         |
     -----------    -----------    ------------                       v
    |           |  |           |  |            |               ---------------
    |   GRID0   +--+   GRID1   +--+   GRID2    |<--           |   governor    |
    |           |  |           |  |            |               ---------------
     -----------    -----------    ------------
              \           |           /
               \   -------------------   /
                  |  pages migration  |
                   -------------------
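For item 2 and the 'pages migration' step above, the preferred NUMA node set is initially seeded from the task's preferred CPUs, so that allocations and later page migration stay close to where the task runs. This is a simplified sketch of qos_affinity_set() from kernel/sched/grid/qos.c in this series:

/*
 * Simplified from qos_affinity_set(): the preferred memory nodes are the
 * NUMA nodes that intersect the task's preferred CPUs; the mask is
 * adjusted later once memory bandwidth usage has been sampled.
 */
static void seed_preferred_nodes(struct task_struct *p, nodemask_t *mask)
{
	int n;

	nodes_clear(*mask);
	for (n = 0; n < nr_node_ids; n++)
		if (cpumask_intersects(cpumask_of_node(n), p->prefer_cpus))
			node_set(n, *mask);
}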
We will introduce the energy model in a follow-up implementation, and will consider dynamic affinity adjustment between the divided grids at runtime.
Signed-off-by: Wang ShaoBo <bobo.shaobowang@huawei.com>
---
 include/linux/sched.h          |   9 +++
 include/linux/sched/grid_qos.h |  84 ++++++++++++++++++++++
 kernel/fork.c                  |   9 +++
 kernel/sched/Makefile          |   1 +
 kernel/sched/fair.c            |   5 ++
 kernel/sched/grid/Makefile     |   2 +
 kernel/sched/grid/internal.h   |   6 ++
 kernel/sched/grid/power.c      |  10 +++
 kernel/sched/grid/qos.c        | 125 +++++++++++++++++++++++++++++++++
 kernel/sched/grid/stat.c       |  15 ++++
 mm/mempolicy.c                 |  12 +++-
 11 files changed, 277 insertions(+), 1 deletion(-)
 create mode 100644 include/linux/sched/grid_qos.h
 create mode 100644 kernel/sched/grid/Makefile
 create mode 100644 kernel/sched/grid/internal.h
 create mode 100644 kernel/sched/grid/power.c
 create mode 100644 kernel/sched/grid/qos.c
 create mode 100644 kernel/sched/grid/stat.c
diff --git a/include/linux/sched.h b/include/linux/sched.h index 8346294d25838..f561defa325e4 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1277,7 +1277,16 @@ struct task_struct { KABI_RESERVE(6) KABI_RESERVE(7) #endif + +#if !defined(__GENKSYMS__) +#if defined(CONFIG_QOS_SCHED_SMART_GRID) + struct sched_grid_qos *grid_qos; +#else KABI_RESERVE(8) +#endif +#else + KABI_RESERVE(8) +#endif
/* CPU-specific state of this task: */ struct thread_struct thread; diff --git a/include/linux/sched/grid_qos.h b/include/linux/sched/grid_qos.h new file mode 100644 index 0000000000000..7b859108720d5 --- /dev/null +++ b/include/linux/sched/grid_qos.h @@ -0,0 +1,84 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_SCHED_GRID_QOS_H +#define _LINUX_SCHED_GRID_QOS_H +#include <linux/nodemask.h> + +#ifdef CONFIG_QOS_SCHED_SMART_GRID +enum sched_grid_qos_class { + SCHED_GRID_QOS_CLASS_LEVEL_1 = 0, + SCHED_GRID_QOS_CLASS_LEVEL_2 = 1, + SCHED_GRID_QOS_CLASS_LEVEL_3 = 2, + SCHED_GRID_QOS_CLASS_LEVEL_4 = 3, + SCHED_GRID_QOS_CLASS_LEVEL_5 = 4, + SCHED_GRID_QOS_CLASS_LEVEL_6 = 5, + SCHED_GRID_QOS_CLASS_LEVEL_7 = 6, + SCHED_GRID_QOS_CLASS_LEVEL_8 = 7, + SCHED_GRID_QOS_CLASS_LEVEL_NR +}; + +enum { + SCHED_GRID_QOS_IPS_INDEX = 0, + SCHED_GRID_QOS_MEMBOUND_RATIO_INDEX = 1, + SCHED_GRID_QOS_MEMBANDWIDTH_INDEX = 2, + SCHED_GRID_QOS_SAMPLE_NR +}; + +#define SCHED_GRID_QOS_RING_BUFFER_MAXLEN 100 + +struct sched_grid_qos_ring_buffer { + u64 vecs[SCHED_GRID_QOS_RING_BUFFER_MAXLEN]; + unsigned int head; + void (*push)(u64 *data, int stepsize, + struct sched_grid_qos_ring_buffer *ring_buffer); +}; + +struct sched_grid_qos_sample { + const char *name; + int index; + int sample_bypass; + int sample_times; + struct sched_grid_qos_ring_buffer ring_buffer; + u64 pred_target[MAX_NUMNODES]; + void (*cal_target)(int stepsize, + struct sched_grid_qos_ring_buffer *ring_buffer); + + int account_ready; + int (*start)(void *arg); + int (*account)(void *arg); +}; + +struct sched_grid_qos_stat { + enum sched_grid_qos_class class_lvl; + int (*set_class_lvl)(struct sched_grid_qos_stat *qos_stat); + struct sched_grid_qos_sample sample[SCHED_GRID_QOS_SAMPLE_NR]; +}; + +struct sched_grid_qos_power { + int cpufreq_sense_ratio; + int target_cpufreq; + int cstate_sense_ratio; +}; + +struct sched_grid_qos_affinity { + nodemask_t mem_preferred_node_mask; +}; + +struct task_struct; +struct sched_grid_qos { + struct sched_grid_qos_stat stat; + struct sched_grid_qos_power power; + struct sched_grid_qos_affinity affinity; + + int (*affinity_set)(struct task_struct *p); +}; + +int sched_grid_qos_fork(struct task_struct *p, struct task_struct *orig); +void sched_grid_qos_free(struct task_struct *p); + +int sched_grid_preferred_interleave_nid(struct mempolicy *policy); +int sched_grid_preferred_nid(int preferred_nid, nodemask_t *nodemask); +#else +static inline int sched_grid_preferred_interleave_nid(struct mempolicy *policy) {return -1;} +static inline int sched_grid_preferred_nid(int preferred_nid, nodemask_t *nodemask) {return preferred_nid;} +#endif +#endif diff --git a/kernel/fork.c b/kernel/fork.c index c256525d4ce5e..db95586d745ba 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -21,6 +21,7 @@ #include <linux/sched/task.h> #include <linux/sched/task_stack.h> #include <linux/sched/cputime.h> +#include <linux/sched/grid_qos.h> #include <linux/rtmutex.h> #include <linux/init.h> #include <linux/unistd.h> @@ -461,6 +462,9 @@ void free_task(struct task_struct *tsk) free_kthread_struct(tsk); #ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY sched_prefer_cpus_free(tsk); +#endif +#ifdef CONFIG_QOS_SCHED_SMART_GRID + sched_grid_qos_free(tsk); #endif free_task_struct(tsk); } @@ -1876,6 +1880,11 @@ static __latent_entropy struct task_struct *copy_process( if (retval) goto bad_fork_free; #endif +#ifdef CONFIG_QOS_SCHED_SMART_GRID + retval = sched_grid_qos_fork(p, current); + if (retval) + goto bad_fork_free; +#endif
/* * If multiple threads are within copy_process(), then this check diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 7fe183404c383..0612af002ae57 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -29,3 +29,4 @@ obj-$(CONFIG_CPU_FREQ) += cpufreq.o obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o obj-$(CONFIG_MEMBARRIER) += membarrier.o obj-$(CONFIG_CPU_ISOLATION) += isolation.o +obj-$(CONFIG_QOS_SCHED_SMART_GRID) += grid/ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 4ad4206d36ac7..a2f9f025471d4 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -28,6 +28,9 @@ #include <linux/delay.h> #include <linux/tracehook.h> #endif +#ifdef CONFIG_QOS_SCHED_SMART_GRID +#include <linux/sched/grid_qos.h> +#endif #include <trace/events/sched.h>
/* @@ -5322,6 +5325,8 @@ static inline void tg_update_task_prefer_cpus(struct task_group *tg) continue;
set_prefer_cpus_ptr(task, ad->domains[ad->curr_level]); + /* grid_qos must not be NULL */ + task->grid_qos->affinity_set(task); } css_task_iter_end(&it); } diff --git a/kernel/sched/grid/Makefile b/kernel/sched/grid/Makefile new file mode 100644 index 0000000000000..82f2a09c3c309 --- /dev/null +++ b/kernel/sched/grid/Makefile @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0 +obj-$(CONFIG_QOS_SCHED_SMART_GRID) += qos.o power.o stat.o diff --git a/kernel/sched/grid/internal.h b/kernel/sched/grid/internal.h new file mode 100644 index 0000000000000..743f72aaffbfc --- /dev/null +++ b/kernel/sched/grid/internal.h @@ -0,0 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_SCHED_SMART_GRID_INTERNAL_H +#define _LINUX_SCHED_SMART_GRID_INTERNAL_H +void qos_power_init(struct sched_grid_qos_power *power); +void qos_stat_init(struct sched_grid_qos_stat *stat); +#endif diff --git a/kernel/sched/grid/power.c b/kernel/sched/grid/power.c new file mode 100644 index 0000000000000..849e06d22a8fa --- /dev/null +++ b/kernel/sched/grid/power.c @@ -0,0 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include <linux/sched/grid_qos.h> +#include "internal.h" + +void qos_power_init(struct sched_grid_qos_power *power) +{ + power->cpufreq_sense_ratio = 0; + power->target_cpufreq = 0; + power->cstate_sense_ratio = 0; +} diff --git a/kernel/sched/grid/qos.c b/kernel/sched/grid/qos.c new file mode 100644 index 0000000000000..93d3fbfc267c1 --- /dev/null +++ b/kernel/sched/grid/qos.c @@ -0,0 +1,125 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include <linux/nodemask.h> +#include <linux/mempolicy.h> +#include <linux/slab.h> +#include <linux/sched.h> +#include <linux/sched/grid_qos.h> +#include "internal.h" + +static int qos_affinity_set(struct task_struct *p) +{ + int n; + struct sched_grid_qos_affinity *affinity = &p->grid_qos->affinity; + + nodes_clear(affinity->mem_preferred_node_mask); + /* + * We want the memory allocation to be as close to the CPU + * as possible, and adjust after getting memory bandwidth usage. + */ + for (n = 0; n < nr_node_ids; n++) + if (cpumask_intersects(cpumask_of_node(n), p->prefer_cpus)) + node_set(n, affinity->mem_preferred_node_mask); + + return 0; +} + +int sched_grid_qos_fork(struct task_struct *p, struct task_struct *orig) +{ + struct sched_grid_qos *qos; + + qos = kzalloc(sizeof(*qos), GFP_KERNEL); + if (!qos) + return -ENOMEM; + + qos_power_init(&qos->power); + qos_stat_init(&qos->stat); + + nodes_clear(qos->affinity.mem_preferred_node_mask); + if (likely(orig->grid_qos)) + qos->affinity = orig->grid_qos->affinity; + qos->affinity_set = qos_affinity_set; + p->grid_qos = qos; + + return 0; +} + +void sched_grid_qos_free(struct task_struct *p) +{ + kfree(p->grid_qos); +} + +/* Do dynamic select a more appropriate preferred interleave nid for a process */ +int sched_grid_preferred_interleave_nid(struct mempolicy *policy) +{ + nodemask_t nmask; + unsigned int next; + struct task_struct *me = current; + nodemask_t *preferred_nmask = NULL; + + if (likely(current->grid_qos)) + preferred_nmask = + ¤t->grid_qos->affinity.mem_preferred_node_mask; + + if (!preferred_nmask || !policy) + return -1; + + if (nodes_equal(policy->v.nodes, *preferred_nmask)) + return -1; + /* + * We perceive the actual consumption of memory bandwidth + * in each node and post a preferred interleave nid in + * more appropriate range. 
+ */ + nodes_and(nmask, policy->v.nodes, *preferred_nmask); + if (nodes_empty(nmask)) + return -1; + + next = next_node_in(me->il_prev, nmask); + if (next < MAX_NUMNODES) + me->il_prev = next; + return next; +} + +/* Do dynamic select a more appropriate preferred nid for a process */ +int sched_grid_preferred_nid(int preferred_nid, nodemask_t *nodemask) +{ + int nd = preferred_nid; + nodemask_t nmask, ndmask; + nodemask_t *preferred_nmask = NULL; + + if (likely(current->grid_qos)) + preferred_nmask = ¤t->grid_qos->affinity.mem_preferred_node_mask; + + if (!preferred_nmask) + return preferred_nid; + + /* + * We perceive the actual consumption of memory bandwidth + * in each node and post a preferred nid in more appropriate + * range. + */ + nmask = *preferred_nmask; + if (nodemask) { + if (nodes_equal(*nodemask, nmask)) + return preferred_nid; + + nodes_and(nmask, nmask, *nodemask); + } + + if (node_isset(preferred_nid, nmask)) + return preferred_nid; + + /* + * We prefer the numa node we're running, if there is no limit + * to nodemask, we select preferred nid in preferred range or + * in restriced range if not. + */ + init_nodemask_of_node(&ndmask, numa_node_id()); + nodes_and(ndmask, nmask, ndmask); + if (!nodes_empty(ndmask)) + nd = first_node(ndmask); + else if (!nodes_empty(nmask)) + nd = first_node(nmask); + + return nd; +} diff --git a/kernel/sched/grid/stat.c b/kernel/sched/grid/stat.c new file mode 100644 index 0000000000000..4bde8b542b2b4 --- /dev/null +++ b/kernel/sched/grid/stat.c @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include <linux/sched/grid_qos.h> +#include "internal.h" + +void qos_stat_init(struct sched_grid_qos_stat *stat) +{ + stat->sample[SCHED_GRID_QOS_IPS_INDEX].name = "ips"; + stat->sample[SCHED_GRID_QOS_IPS_INDEX].index = SCHED_GRID_QOS_IPS_INDEX; + stat->sample[SCHED_GRID_QOS_MEMBOUND_RATIO_INDEX].name = "membound_ratio"; + stat->sample[SCHED_GRID_QOS_MEMBOUND_RATIO_INDEX].index = + SCHED_GRID_QOS_MEMBOUND_RATIO_INDEX; + stat->sample[SCHED_GRID_QOS_MEMBANDWIDTH_INDEX].name = "memband_width"; + stat->sample[SCHED_GRID_QOS_MEMBANDWIDTH_INDEX].index = + SCHED_GRID_QOS_MEMBANDWIDTH_INDEX; +} diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 4769ed2ed7f38..1790446ca5af2 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -76,6 +76,7 @@ #include <linux/sched/mm.h> #include <linux/sched/numa_balancing.h> #include <linux/sched/task.h> +#include <linux/sched/grid_qos.h> #include <linux/nodemask.h> #include <linux/cpuset.h> #include <linux/slab.h> @@ -2172,7 +2173,14 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, if (pol->mode == MPOL_INTERLEAVE) { unsigned nid;
- nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order); + if (smart_grid_used()) { + nid = sched_grid_preferred_interleave_nid(pol); + nid = (nid == -1) ? + interleave_nid(pol, vma, addr, PAGE_SHIFT + order) : nid; + } else { + nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order); + } + mpol_cond_put(pol); page = alloc_page_interleave(gfp, order, nid); goto out; @@ -2234,6 +2242,8 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
nmask = policy_nodemask(gfp, pol); preferred_nid = policy_node(gfp, pol, node); + if (smart_grid_used()) + preferred_nid = sched_grid_preferred_nid(preferred_nid, nmask); page = __alloc_pages_nodemask(gfp, order, preferred_nid, nmask); mark_vma_cdm(nmask, page, vma); mpol_cond_put(pol);