Some optimizations, features and bugfixes for dynamic affinity, as follows:
Hui Tang (6):
  sched: Reorganize the code of dynamic affinity and smart grid
  sched: Add 'affinity_preferred_nodes' for smart grid
  sched: Add 'affinity_util_low_pct' for task group
  sched: fix ping-pong for domain level adjust
  sched: Check preferred_nmask is valid
  sched: fix mem_preferred_node_mask not update
 fs/proc/array.c                        |  10 -
 include/linux/sched.h                  |  34 +-
 include/linux/sched/dynamic_affinity.h |  38 ++
 kernel/sched/core.c                    | 166 +-----
 kernel/sched/dynamic_affinity.c        | 775 +++++++++++++++++++++++++
 kernel/sched/dynamic_affinity.h        |  53 ++
 kernel/sched/fair.c                    | 584 +------------------
 kernel/sched/grid/qos.c                |   6 +-
 kernel/sched/sched.h                   |  47 +-
 kernel/sysctl.c                        |   7 +
 mm/mempolicy.c                         |   6 +
 11 files changed, 915 insertions(+), 811 deletions(-)
 create mode 100644 include/linux/sched/dynamic_affinity.h
 create mode 100644 kernel/sched/dynamic_affinity.c
 create mode 100644 kernel/sched/dynamic_affinity.h
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I94PBH
CVE: NA
----------------------------------------
Reorganize the code of dynamic affinity and smart grid without altering the code logic. The main changes are as follows:
1. Move the related code to new files.
2. Wrap the related functions with macro definitions at the callers (see the sketch below).
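As an illustration of point 2, callers in the core scheduler now select the moved code at build time. For example, the sched_cpu_activate() hunk in the diff below wraps the call like this:

    #ifdef CONFIG_QOS_SCHED_SMART_GRID
    	tg_update_affinity_domains(cpu, 1);
    #endif

so that core.c and fair.c no longer reference smart-grid or dynamic-affinity symbols when the corresponding config options are disabled.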
Signed-off-by: Hui Tang <tanghui20@huawei.com>
---
 fs/proc/array.c                        |  10 -
 include/linux/sched.h                  |  34 +-
 include/linux/sched/dynamic_affinity.h |  33 ++
 kernel/sched/core.c                    | 161 +-----
 kernel/sched/dynamic_affinity.c        | 740 +++++++++++++++++++++++++
 kernel/sched/dynamic_affinity.h        |  48 ++
 kernel/sched/fair.c                    | 584 +------------------
 kernel/sched/sched.h                   |  47 +-
 mm/mempolicy.c                         |   6 +
 9 files changed, 854 insertions(+), 809 deletions(-)
 create mode 100644 include/linux/sched/dynamic_affinity.h
 create mode 100644 kernel/sched/dynamic_affinity.c
 create mode 100644 kernel/sched/dynamic_affinity.h
diff --git a/fs/proc/array.c b/fs/proc/array.c index fd56e15b3463..71cbd5b392da 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -386,16 +386,6 @@ static void task_cpus_allowed(struct seq_file *m, struct task_struct *task) cpumask_pr_args(&task->cpus_allowed)); }
-#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY -static void task_cpus_preferred(struct seq_file *m, struct task_struct *task) -{ - seq_printf(m, "Cpus_preferred:\t%*pb\n", - cpumask_pr_args(task->prefer_cpus)); - seq_printf(m, "Cpus_preferred_list:\t%*pbl\n", - cpumask_pr_args(task->prefer_cpus)); -} -#endif - static inline void task_core_dumping(struct seq_file *m, struct mm_struct *mm) { seq_put_decimal_ull(m, "CoreDumping:\t", !!mm->core_state); diff --git a/include/linux/sched.h b/include/linux/sched.h index 8fd8c5b7cdc6..95377c7cf3d8 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -23,6 +23,9 @@ #include <linux/rcupdate.h> #include <linux/resource.h> #include <linux/latencytop.h> +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY +#include <linux/sched/dynamic_affinity.h> +#endif #include <linux/sched/prio.h> #include <linux/signal_types.h> #include <linux/mm_types_task.h> @@ -444,15 +447,6 @@ struct sched_statistics { #endif };
-#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY -struct dyn_affinity_stats { -#ifdef CONFIG_SCHEDSTATS - u64 nr_wakeups_preferred_cpus; - u64 nr_wakeups_force_preferred_cpus; -#endif -}; -#endif - struct sched_entity { /* For load-balancing: */ struct load_weight load; @@ -2001,26 +1995,4 @@ static inline int sched_qos_cpu_overload(void) } #endif
-#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY -int dynamic_affinity_enabled(void); -int set_prefer_cpus_ptr(struct task_struct *p, - const struct cpumask *new_mask); -int sched_prefer_cpus_fork(struct task_struct *p, struct task_struct *orig); -void sched_prefer_cpus_free(struct task_struct *p); -void dynamic_affinity_enable(void); -#endif - -#ifdef CONFIG_QOS_SCHED_SMART_GRID -extern struct static_key __smart_grid_used; -static inline bool smart_grid_used(void) -{ - return static_key_false(&__smart_grid_used); -} -#else -static inline bool smart_grid_used(void) -{ - return false; -} -#endif - #endif diff --git a/include/linux/sched/dynamic_affinity.h b/include/linux/sched/dynamic_affinity.h new file mode 100644 index 000000000000..f9a22a29c0b7 --- /dev/null +++ b/include/linux/sched/dynamic_affinity.h @@ -0,0 +1,33 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_SCHED_DYNAMIC_AFFINITY_H +#define _LINUX_SCHED_DYNAMIC_AFFINITY_H + +struct dyn_affinity_stats { +#ifdef CONFIG_SCHEDSTATS + u64 nr_wakeups_preferred_cpus; + u64 nr_wakeups_force_preferred_cpus; +#endif +}; + +extern void dynamic_affinity_enable(void); +extern int sched_prefer_cpus_fork(struct task_struct *p, + struct task_struct *orig); +extern void sched_prefer_cpus_free(struct task_struct *p); +extern void task_cpus_preferred(struct seq_file *m, struct task_struct *task); +extern int set_prefer_cpus_ptr(struct task_struct *p, + const struct cpumask *new_mask); + +#ifdef CONFIG_QOS_SCHED_SMART_GRID +extern struct static_key __smart_grid_used; +static inline bool smart_grid_used(void) +{ + return static_key_false(&__smart_grid_used); +} +#else +static inline bool smart_grid_used(void) +{ + return false; +} +#endif + +#endif diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7825ceaae0c4..c7c1f3125c9f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5842,7 +5842,9 @@ int sched_cpu_activate(unsigned int cpu) static_branch_inc_cpuslocked(&sched_smt_present); #endif set_cpu_active(cpu, true); +#ifdef CONFIG_QOS_SCHED_SMART_GRID tg_update_affinity_domains(cpu, 1); +#endif
if (sched_smp_initialized) { sched_domains_numa_masks_set(cpu); @@ -5905,7 +5907,9 @@ int sched_cpu_deactivate(unsigned int cpu) return ret; } sched_domains_numa_masks_clear(cpu); +#ifdef CONFIG_QOS_SCHED_SMART_GRID tg_update_affinity_domains(cpu, 0); +#endif return 0; }
@@ -5977,7 +5981,9 @@ void __init sched_init_smp(void)
sched_smp_initialized = true;
+#ifdef CONFIG_QOS_SCHED_SMART_GRID init_auto_affinity(&root_task_group); +#endif }
static int __init migration_init(void) @@ -6977,134 +6983,6 @@ static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css, } #endif /* CONFIG_RT_GROUP_SCHED */
-#ifdef CONFIG_QOS_SCHED_SMART_GRID -int tg_set_dynamic_affinity_mode(struct task_group *tg, u64 mode) -{ - struct auto_affinity *auto_affi = tg->auto_affinity; - - if (unlikely(!auto_affi)) - return -EPERM; - - /* auto mode*/ - if (mode == 1) { - start_auto_affinity(auto_affi); - } else if (mode == 0) { - stop_auto_affinity(auto_affi); - } else { - return -EINVAL; - } - - return 0; -} - -static u64 cpu_affinity_mode_read_u64(struct cgroup_subsys_state *css, - struct cftype *cft) -{ - struct task_group *tg = css_tg(css); - - if (unlikely(!tg->auto_affinity)) - return -EPERM; - - return tg->auto_affinity->mode; -} - -static int cpu_affinity_mode_write_u64(struct cgroup_subsys_state *css, - struct cftype *cftype, u64 mode) -{ - return tg_set_dynamic_affinity_mode(css_tg(css), mode); -} - -int tg_set_affinity_period(struct task_group *tg, u64 period_ms) -{ - if (unlikely(!tg->auto_affinity)) - return -EPERM; - - if (!period_ms || period_ms > U64_MAX / NSEC_PER_MSEC) - return -EINVAL; - - raw_spin_lock_irq(&tg->auto_affinity->lock); - tg->auto_affinity->period = ms_to_ktime(period_ms); - raw_spin_unlock_irq(&tg->auto_affinity->lock); - return 0; -} - -u64 tg_get_affinity_period(struct task_group *tg) -{ - if (unlikely(!tg->auto_affinity)) - return -EPERM; - - return ktime_to_ms(tg->auto_affinity->period); -} - -static int cpu_affinity_period_write_uint(struct cgroup_subsys_state *css, - struct cftype *cftype, u64 period) -{ - return tg_set_affinity_period(css_tg(css), period); -} - -static u64 cpu_affinity_period_read_uint(struct cgroup_subsys_state *css, - struct cftype *cft) -{ - return tg_get_affinity_period(css_tg(css)); -} - -static int cpu_affinity_domain_mask_write_u64(struct cgroup_subsys_state *css, - struct cftype *cftype, - u64 mask) -{ - struct task_group *tg = css_tg(css); - struct affinity_domain *ad; - u16 full; - - if (unlikely(!tg->auto_affinity)) - return -EPERM; - - ad = &tg->auto_affinity->ad; - full = (1 << ad->dcount) - 1; - if (mask > full) - return -EINVAL; - - raw_spin_lock_irq(&tg->auto_affinity->lock); - ad->domain_mask = mask; - raw_spin_unlock_irq(&tg->auto_affinity->lock); - return 0; -} - -static u64 cpu_affinity_domain_mask_read_u64(struct cgroup_subsys_state *css, - struct cftype *cft) -{ - struct task_group *tg = css_tg(css); - - if (unlikely(!tg->auto_affinity)) - return -EPERM; - - return tg->auto_affinity->ad.domain_mask; -} - -static int cpu_affinity_stat_show(struct seq_file *sf, void *v) -{ - struct task_group *tg = css_tg(seq_css(sf)); - struct auto_affinity *auto_affi = tg->auto_affinity; - struct affinity_domain *ad; - int i; - - if (unlikely(!auto_affi)) - return -EPERM; - - ad = &auto_affi->ad; - seq_printf(sf, "period_active %d\n", auto_affi->period_active); - seq_printf(sf, "dcount %d\n", ad->dcount); - seq_printf(sf, "domain_mask 0x%x\n", ad->domain_mask); - seq_printf(sf, "curr_level %d\n", ad->curr_level); - for (i = 0; i < ad->dcount; i++) - seq_printf(sf, "sd_level %d, cpu list %*pbl, stay_cnt %llu\n", - i, cpumask_pr_args(ad->domains[i]), - schedstat_val(ad->stay_cnt[i])); - - return 0; -} -#endif /* CONFIG_QOS_SCHED_SMART_GRID */ - #ifdef CONFIG_QOS_SCHED static int tg_change_scheduler(struct task_group *tg, void *data) { @@ -7352,33 +7230,6 @@ static int __maybe_unused cpu_period_quota_parse(char *buf, }
#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY -int sched_prefer_cpus_fork(struct task_struct *p, struct task_struct *orig) -{ - p->prefer_cpus = kmalloc(sizeof(cpumask_t), GFP_KERNEL); - if (!p->prefer_cpus) - return -ENOMEM; - - if (orig->prefer_cpus) - cpumask_copy(p->prefer_cpus, orig->prefer_cpus); - else - cpumask_clear(p->prefer_cpus); - - p->se.dyn_affi_stats = kzalloc(sizeof(struct dyn_affinity_stats), - GFP_KERNEL); - if (!p->se.dyn_affi_stats) { - kfree(p->prefer_cpus); - p->prefer_cpus = NULL; - return -ENOMEM; - } - return 0; -} - -void sched_prefer_cpus_free(struct task_struct *p) -{ - kfree(p->prefer_cpus); - kfree(p->se.dyn_affi_stats); -} - static void do_set_prefer_cpus(struct task_struct *p, const struct cpumask *new_mask) { diff --git a/kernel/sched/dynamic_affinity.c b/kernel/sched/dynamic_affinity.c new file mode 100644 index 000000000000..9bead27c90ee --- /dev/null +++ b/kernel/sched/dynamic_affinity.c @@ -0,0 +1,740 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Common code for CPU Dynamic Affinity Scheduling + * + * Copyright (C) 2023-2024 Huawei Technologies Co., Ltd + * + * Author: Hui Tang tanghui20@huawei.com + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + */ +#include "dynamic_affinity.h" + +static inline struct cpumask *task_prefer_cpus(struct task_struct *p); +static inline int dynamic_affinity_mode(struct task_struct *p); +static unsigned long capacity_of(int cpu); +static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq); + +static DEFINE_STATIC_KEY_FALSE(__dynamic_affinity_used); + +void dynamic_affinity_enable(void) +{ + static_branch_enable_cpuslocked(&__dynamic_affinity_used); +} + +static inline bool dynamic_affinity_used(void) +{ + return static_branch_unlikely(&__dynamic_affinity_used); +} + +/* + * Low utilization threshold for CPU + * + * (default: 85%), units: percentage of CPU utilization) + */ +int sysctl_sched_util_low_pct = 85; + +static inline bool prefer_cpus_valid(struct task_struct *p) +{ + struct cpumask *prefer_cpus = task_prefer_cpus(p); + + return !cpumask_empty(prefer_cpus) && + !cpumask_equal(prefer_cpus, &p->cpus_allowed) && + cpumask_subset(prefer_cpus, &p->cpus_allowed); +} + +/* + * set_task_select_cpus: select the cpu range for task + * @p: the task whose available cpu range will to set + *uto_affinity_used @idlest_cpu: the cpu which is the idlest in prefer cpus + * + * If sum of 'util_avg' among 'preferred_cpus' lower than the percentage + * 'sysctl_sched_util_low_pct' of 'preferred_cpus' capacity, select + * 'preferred_cpus' range for task, otherwise select 'preferred_cpus' for task. + * + * The available cpu range set to p->select_cpus. Idlest cpu in preferred cpus + * set to @idlest_cpu, which is set to wakeup cpu when fast path wakeup cpu + * without p->select_cpus. 
+ */ +void set_task_select_cpus(struct task_struct *p, int *idlest_cpu, int sd_flag) +{ + unsigned long util_avg_sum = 0; + unsigned long tg_capacity = 0; + long min_util = INT_MIN; + struct task_group *tg; + long spare; + int cpu, mode; + + rcu_read_lock(); + mode = dynamic_affinity_mode(p); + if (mode == -1) { + rcu_read_unlock(); + return; + } else if (mode == 1) { + p->select_cpus = task_prefer_cpus(p); + if (idlest_cpu) + *idlest_cpu = cpumask_first(p->select_cpus); + sched_qos_affinity_set(p); + rcu_read_unlock(); + return; + } + + /* manual mode */ + tg = task_group(p); + for_each_cpu(cpu, p->prefer_cpus) { + if (unlikely(!tg->se[cpu])) + continue; + + if (idlest_cpu && available_idle_cpu(cpu)) { + *idlest_cpu = cpu; + } else if (idlest_cpu) { + spare = (long)(capacity_of(cpu) - tg->se[cpu]->avg.util_avg); + if (spare > min_util) { + min_util = spare; + *idlest_cpu = cpu; + } + } + + if (available_idle_cpu(cpu)) { + rcu_read_unlock(); + p->select_cpus = p->prefer_cpus; + if (sd_flag & SD_BALANCE_WAKE) + schedstat_inc(p->se.dyn_affi_stats->nr_wakeups_preferred_cpus); + return; + } + + util_avg_sum += tg->se[cpu]->avg.util_avg; + tg_capacity += capacity_of(cpu); + } + rcu_read_unlock(); + + if (tg_capacity > cpumask_weight(p->prefer_cpus) && + util_avg_sum * 100 <= tg_capacity * sysctl_sched_util_low_pct) { + p->select_cpus = p->prefer_cpus; + if (sd_flag & SD_BALANCE_WAKE) + schedstat_inc(p->se.dyn_affi_stats->nr_wakeups_preferred_cpus); + } +} + +int sched_prefer_cpus_fork(struct task_struct *p, struct task_struct *orig) +{ + p->prefer_cpus = kmalloc(sizeof(cpumask_t), GFP_KERNEL); + if (!p->prefer_cpus) + return -ENOMEM; + + if (orig->prefer_cpus) + cpumask_copy(p->prefer_cpus, orig->prefer_cpus); + else + cpumask_clear(p->prefer_cpus); + + p->se.dyn_affi_stats = kzalloc(sizeof(struct dyn_affinity_stats), + GFP_KERNEL); + if (!p->se.dyn_affi_stats) { + kfree(p->prefer_cpus); + p->prefer_cpus = NULL; + return -ENOMEM; + } + return 0; +} + +void sched_prefer_cpus_free(struct task_struct *p) +{ + kfree(p->prefer_cpus); + kfree(p->se.dyn_affi_stats); +} + +void task_cpus_preferred(struct seq_file *m, struct task_struct *task) +{ + seq_printf(m, "Cpus_preferred:\t%*pb\n", + cpumask_pr_args(task->prefer_cpus)); + seq_printf(m, "Cpus_preferred_list:\t%*pbl\n", + cpumask_pr_args(task->prefer_cpus)); +} + +#ifdef CONFIG_QOS_SCHED_SMART_GRID + +#define AUTO_AFFINITY_DEFAULT_PERIOD_MS 2000 +#define IS_DOMAIN_SET(level, mask) ((1 << (level)) & (mask)) + +static DEFINE_MUTEX(smart_grid_used_mutex); + +static inline unsigned long cpu_util(int cpu); +static unsigned long target_load(int cpu, int type); +static unsigned long capacity_of(int cpu); +static int sched_idle_cpu(int cpu); +static unsigned long weighted_cpuload(struct rq *rq); + +int sysctl_affinity_adjust_delay_ms = 5000; + +struct static_key __smart_grid_used; + +static void smart_grid_usage_inc(void) +{ + static_key_slow_inc(&__smart_grid_used); +} + +static void smart_grid_usage_dec(void) +{ + static_key_slow_dec(&__smart_grid_used); +} + +static inline struct cpumask *task_prefer_cpus(struct task_struct *p) +{ + struct affinity_domain *ad; + + if (!smart_grid_used()) + return p->prefer_cpus; + + if (task_group(p)->auto_affinity->mode == 0) + return &p->cpus_allowed; + + ad = &task_group(p)->auto_affinity->ad; + return ad->domains[ad->curr_level]; +} + +static inline int dynamic_affinity_mode(struct task_struct *p) +{ + if (!prefer_cpus_valid(p)) + return -1; + + if (smart_grid_used()) + return task_group(p)->auto_affinity->mode == 0 
? -1 : 1; + + return 0; +} + +static void affinity_domain_up(struct task_group *tg) +{ + struct affinity_domain *ad = &tg->auto_affinity->ad; + u16 level = ad->curr_level; + + if (ad->curr_level >= ad->dcount - 1) + return; + + while (level < ad->dcount) { + if (IS_DOMAIN_SET(level + 1, ad->domain_mask) && + cpumask_weight(ad->domains[level + 1]) > 0) { + ad->curr_level = level + 1; + return; + } + level++; + } +} + +static void affinity_domain_down(struct task_group *tg) +{ + struct affinity_domain *ad = &tg->auto_affinity->ad; + u16 level = ad->curr_level; + + if (ad->curr_level <= 0) + return; + + while (level > 0) { + if (!cpumask_weight(ad->domains[level - 1])) + return; + + if (IS_DOMAIN_SET(level - 1, ad->domain_mask)) { + ad->curr_level = level - 1; + return; + } + level--; + } +} + +static enum hrtimer_restart sched_auto_affi_period_timer(struct hrtimer *timer) +{ + struct auto_affinity *auto_affi = + container_of(timer, struct auto_affinity, period_timer); + struct task_group *tg = auto_affi->tg; + struct affinity_domain *ad = &auto_affi->ad; + struct cpumask *span = ad->domains[ad->curr_level]; + unsigned long util_avg_sum = 0; + unsigned long tg_capacity = 0; + unsigned long flags; + int cpu; + + for_each_cpu(cpu, span) { + util_avg_sum += cpu_util(cpu); + tg_capacity += capacity_of(cpu); + } + + raw_spin_lock_irqsave(&auto_affi->lock, flags); + /* May be re-entrant by stop_auto_affinity, So check again. */ + if (auto_affi->period_active == 0) { + raw_spin_unlock_irqrestore(&auto_affi->lock, flags); + return HRTIMER_NORESTART; + } + + if (util_avg_sum * 100 >= tg_capacity * sysctl_sched_util_low_pct) { + affinity_domain_up(tg); + } else if (util_avg_sum * 100 < tg_capacity * + sysctl_sched_util_low_pct / 2) { + affinity_domain_down(tg); + } + + schedstat_inc(ad->stay_cnt[ad->curr_level]); + hrtimer_forward_now(timer, auto_affi->period); + raw_spin_unlock_irqrestore(&auto_affi->lock, flags); + return HRTIMER_RESTART; +} + +static int tg_update_affinity_domain_down(struct task_group *tg, void *data) +{ + struct auto_affinity *auto_affi = tg->auto_affinity; + struct affinity_domain *ad; + int *cpu_state = data; + unsigned long flags; + int i; + + if (!auto_affi) + return 0; + + ad = &tg->auto_affinity->ad; + raw_spin_lock_irqsave(&auto_affi->lock, flags); + + for (i = 0; i < ad->dcount; i++) { + if (!cpumask_test_cpu(cpu_state[0], ad->domains_orig[i])) + continue; + + /* online */ + if (cpu_state[1]) { + cpumask_set_cpu(cpu_state[0], ad->domains[i]); + } else { + cpumask_clear_cpu(cpu_state[0], ad->domains[i]); + if (!cpumask_weight(ad->domains[i])) + affinity_domain_up(tg); + } + + } + raw_spin_unlock_irqrestore(&auto_affi->lock, flags); + + return 0; +} + +void tg_update_affinity_domains(int cpu, int online) +{ + int cpu_state[2]; + + cpu_state[0] = cpu; + cpu_state[1] = online; + + rcu_read_lock(); + walk_tg_tree(tg_update_affinity_domain_down, tg_nop, cpu_state); + rcu_read_unlock(); +} + +void start_auto_affinity(struct auto_affinity *auto_affi) +{ + ktime_t delay_ms; + + mutex_lock(&smart_grid_used_mutex); + raw_spin_lock_irq(&auto_affi->lock); + if (auto_affi->period_active == 1) { + raw_spin_unlock_irq(&auto_affi->lock); + mutex_unlock(&smart_grid_used_mutex); + return; + } + + auto_affi->period_active = 1; + auto_affi->mode = 1; + delay_ms = ms_to_ktime(sysctl_affinity_adjust_delay_ms); + hrtimer_forward_now(&auto_affi->period_timer, delay_ms); + hrtimer_start_expires(&auto_affi->period_timer, + HRTIMER_MODE_ABS_PINNED); + raw_spin_unlock_irq(&auto_affi->lock); + + 
smart_grid_usage_inc(); + mutex_unlock(&smart_grid_used_mutex); +} + +void stop_auto_affinity(struct auto_affinity *auto_affi) +{ + struct affinity_domain *ad = &auto_affi->ad; + + mutex_lock(&smart_grid_used_mutex); + raw_spin_lock_irq(&auto_affi->lock); + if (auto_affi->period_active == 0) { + raw_spin_unlock_irq(&auto_affi->lock); + mutex_unlock(&smart_grid_used_mutex); + return; + } + auto_affi->period_active = 0; + auto_affi->mode = 0; + ad->curr_level = ad->dcount > 0 ? ad->dcount - 1 : 0; + raw_spin_unlock_irq(&auto_affi->lock); + + smart_grid_usage_dec(); + mutex_unlock(&smart_grid_used_mutex); +} + +static struct sched_group *sd_find_idlest_group(struct sched_domain *sd) +{ + struct sched_group *idlest = NULL, *group = sd->groups; + unsigned long min_runnable_load = ULONG_MAX; + unsigned long min_avg_load = ULONG_MAX; + int imbalance_scale = 100 + (sd->imbalance_pct-100)/2; + unsigned long imbalance = scale_load_down(NICE_0_LOAD) * + (sd->imbalance_pct-100) / 100; + + do { + unsigned long load, avg_load, runnable_load; + int i; + + avg_load = 0; + runnable_load = 0; + + for_each_cpu(i, sched_group_span(group)) { + load = target_load(i, 0); + runnable_load += load; + avg_load += cfs_rq_load_avg(&cpu_rq(i)->cfs); + } + + avg_load = (avg_load * SCHED_CAPACITY_SCALE) / + group->sgc->capacity; + runnable_load = (runnable_load * SCHED_CAPACITY_SCALE) / + group->sgc->capacity; + + if (min_runnable_load > (runnable_load + imbalance)) { + min_runnable_load = runnable_load; + min_avg_load = avg_load; + idlest = group; + } else if ((runnable_load < (min_runnable_load + imbalance)) && + (100*min_avg_load > imbalance_scale*avg_load)) { + min_avg_load = avg_load; + idlest = group; + } + } while (group = group->next, group != sd->groups); + + return idlest ? idlest : group; +} + +static int group_find_idlest_cpu(struct sched_group *group) +{ + int least_loaded_cpu = cpumask_first(sched_group_span(group)); + unsigned long load, min_load = ULONG_MAX; + unsigned int min_exit_latency = UINT_MAX; + u64 latest_idle_timestamp = 0; + int shallowest_idle_cpu = -1; + int i; + + if (group->group_weight == 1) + return least_loaded_cpu; + + for_each_cpu(i, sched_group_span(group)) { + if (sched_idle_cpu(i)) + return i; + + if (available_idle_cpu(i)) { + struct rq *rq = cpu_rq(i); + struct cpuidle_state *idle = idle_get_state(rq); + + if (idle && idle->exit_latency < min_exit_latency) { + min_exit_latency = idle->exit_latency; + latest_idle_timestamp = rq->idle_stamp; + shallowest_idle_cpu = i; + } else if ((!idle || + idle->exit_latency == min_exit_latency) && + rq->idle_stamp > latest_idle_timestamp) { + latest_idle_timestamp = rq->idle_stamp; + shallowest_idle_cpu = i; + } + } else if (shallowest_idle_cpu == -1) { + load = weighted_cpuload(cpu_rq(i)); + if (load < min_load) { + min_load = load; + least_loaded_cpu = i; + } + } + } + + return shallowest_idle_cpu != -1 ? 
shallowest_idle_cpu : + least_loaded_cpu; +} + +void free_affinity_domains(struct affinity_domain *ad) +{ + int i; + + for (i = 0; i < AD_LEVEL_MAX; i++) { + kfree(ad->domains[i]); + kfree(ad->domains_orig[i]); + ad->domains[i] = NULL; + ad->domains_orig[i] = NULL; + } + ad->dcount = 0; +} + +static int init_affinity_domains_orig(struct affinity_domain *ad) +{ + int i, j; + + for (i = 0; i < ad->dcount; i++) { + ad->domains_orig[i] = kmalloc(sizeof(cpumask_t), GFP_KERNEL); + if (!ad->domains_orig[i]) + goto err; + + cpumask_copy(ad->domains_orig[i], ad->domains[i]); + } + + return 0; +err: + for (j = 0; j < i; j++) { + kfree(ad->domains_orig[j]); + ad->domains_orig[j] = NULL; + } + return -ENOMEM; +} + +static int init_affinity_domains(struct affinity_domain *ad) +{ + struct sched_domain *sd = NULL, *tmp; + struct sched_group *idlest = NULL; + int ret = -ENOMEM; + int dcount = 0; + int i = 0; + int cpu; + + for (i = 0; i < AD_LEVEL_MAX; i++) { + ad->domains[i] = kmalloc(sizeof(cpumask_t), GFP_KERNEL); + if (!ad->domains[i]) + goto err; + } + + rcu_read_lock(); + cpu = cpumask_first_and(cpu_active_mask, + housekeeping_cpumask(HK_FLAG_DOMAIN)); + for_each_domain(cpu, tmp) { + sd = tmp; + dcount++; + } + + if (!sd || dcount > AD_LEVEL_MAX) { + rcu_read_unlock(); + ret = -EINVAL; + goto err; + } + + idlest = sd_find_idlest_group(sd); + cpu = group_find_idlest_cpu(idlest); + i = 0; + for_each_domain(cpu, tmp) { + cpumask_copy(ad->domains[i], sched_domain_span(tmp)); + __schedstat_set(ad->stay_cnt[i], 0); + i++; + } + rcu_read_unlock(); + + ad->dcount = dcount; + ad->curr_level = ad->dcount > 0 ? ad->dcount - 1 : 0; + ad->domain_mask = (1 << ad->dcount) - 1; + + ret = init_affinity_domains_orig(ad); + if (ret) + goto err; + + return 0; +err: + free_affinity_domains(ad); + return ret; +} + +int init_auto_affinity(struct task_group *tg) +{ + struct auto_affinity *auto_affi; + int ret; + + auto_affi = kzalloc(sizeof(*auto_affi), GFP_KERNEL); + if (!auto_affi) + return -ENOMEM; + + raw_spin_lock_init(&auto_affi->lock); + auto_affi->mode = 0; + auto_affi->period_active = 0; + auto_affi->period = ms_to_ktime(AUTO_AFFINITY_DEFAULT_PERIOD_MS); + hrtimer_init(&auto_affi->period_timer, CLOCK_MONOTONIC, + HRTIMER_MODE_ABS_PINNED); + auto_affi->period_timer.function = sched_auto_affi_period_timer; + + ret = init_affinity_domains(&auto_affi->ad); + if (ret) { + kfree(auto_affi); + if (ret == -EINVAL) + ret = 0; + return ret; + } + + auto_affi->tg = tg; + tg->auto_affinity = auto_affi; + return 0; +} + +void destroy_auto_affinity(struct task_group *tg) +{ + struct auto_affinity *auto_affi = tg->auto_affinity; + + if (unlikely(!auto_affi)) + return; + + if (auto_affi->period_active) + smart_grid_usage_dec(); + + hrtimer_cancel(&auto_affi->period_timer); + free_affinity_domains(&auto_affi->ad); + + kfree(tg->auto_affinity); + tg->auto_affinity = NULL; +} + +int tg_set_dynamic_affinity_mode(struct task_group *tg, u64 mode) +{ + struct auto_affinity *auto_affi = tg->auto_affinity; + + if (unlikely(!auto_affi)) + return -EPERM; + + /* auto mode */ + if (mode == 1) { + start_auto_affinity(auto_affi); + } else if (mode == 0) { + stop_auto_affinity(auto_affi); + } else { + return -EINVAL; + } + + return 0; +} + +static inline struct task_group *css_tg(struct cgroup_subsys_state *css) +{ + return css ? 
container_of(css, struct task_group, css) : NULL; +} + +u64 cpu_affinity_mode_read_u64(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + struct task_group *tg = css_tg(css); + + if (unlikely(!tg->auto_affinity)) + return -EPERM; + + return tg->auto_affinity->mode; +} + +int cpu_affinity_mode_write_u64(struct cgroup_subsys_state *css, + struct cftype *cftype, u64 mode) +{ + return tg_set_dynamic_affinity_mode(css_tg(css), mode); +} + +int tg_set_affinity_period(struct task_group *tg, u64 period_ms) +{ + if (unlikely(!tg->auto_affinity)) + return -EPERM; + + if (!period_ms || period_ms > U64_MAX / NSEC_PER_MSEC) + return -EINVAL; + + raw_spin_lock_irq(&tg->auto_affinity->lock); + tg->auto_affinity->period = ms_to_ktime(period_ms); + raw_spin_unlock_irq(&tg->auto_affinity->lock); + return 0; +} + +u64 tg_get_affinity_period(struct task_group *tg) +{ + if (unlikely(!tg->auto_affinity)) + return -EPERM; + + return ktime_to_ms(tg->auto_affinity->period); +} + +int cpu_affinity_period_write_uint(struct cgroup_subsys_state *css, + struct cftype *cftype, u64 period) +{ + return tg_set_affinity_period(css_tg(css), period); +} + +u64 cpu_affinity_period_read_uint(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return tg_get_affinity_period(css_tg(css)); +} + +int cpu_affinity_domain_mask_write_u64(struct cgroup_subsys_state *css, + struct cftype *cftype, u64 mask) +{ + struct task_group *tg = css_tg(css); + struct affinity_domain *ad; + u16 full; + + if (unlikely(!tg->auto_affinity)) + return -EPERM; + + ad = &tg->auto_affinity->ad; + full = (1 << ad->dcount) - 1; + if (mask > full) + return -EINVAL; + + raw_spin_lock_irq(&tg->auto_affinity->lock); + ad->domain_mask = mask; + raw_spin_unlock_irq(&tg->auto_affinity->lock); + return 0; +} + +u64 cpu_affinity_domain_mask_read_u64(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + struct task_group *tg = css_tg(css); + + if (unlikely(!tg->auto_affinity)) + return -EPERM; + + return tg->auto_affinity->ad.domain_mask; +} + +int cpu_affinity_stat_show(struct seq_file *sf, void *v) +{ + struct task_group *tg = css_tg(seq_css(sf)); + struct auto_affinity *auto_affi = tg->auto_affinity; + struct affinity_domain *ad; + int i; + + if (unlikely(!auto_affi)) + return -EPERM; + + ad = &auto_affi->ad; + seq_printf(sf, "period_active %d\n", auto_affi->period_active); + seq_printf(sf, "dcount %d\n", ad->dcount); + seq_printf(sf, "domain_mask 0x%x\n", ad->domain_mask); + seq_printf(sf, "curr_level %d\n", ad->curr_level); + for (i = 0; i < ad->dcount; i++) + seq_printf(sf, "sd_level %d, cpu list %*pbl, stay_cnt %llu\n", + i, cpumask_pr_args(ad->domains[i]), + schedstat_val(ad->stay_cnt[i])); + + return 0; +} +#else +static inline bool prefer_cpus_valid(struct task_struct *p); + +static inline struct cpumask *task_prefer_cpus(struct task_struct *p) +{ + return p->prefer_cpus; +} + +static inline int dynamic_affinity_mode(struct task_struct *p) +{ + if (!prefer_cpus_valid(p)) + return -1; + + return 0; +} +#endif diff --git a/kernel/sched/dynamic_affinity.h b/kernel/sched/dynamic_affinity.h new file mode 100644 index 000000000000..b58b7f98fde7 --- /dev/null +++ b/kernel/sched/dynamic_affinity.h @@ -0,0 +1,48 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_SCHED_DYNAMIC_AFFINITY_INTERNAL_H +#define _LINUX_SCHED_DYNAMIC_AFFINITY_INTERNAL_H + +#ifdef CONFIG_QOS_SCHED_SMART_GRID +#define AD_LEVEL_MAX 8 + +struct affinity_domain { + int dcount; + int curr_level; + u32 domain_mask; +#ifdef CONFIG_SCHEDSTATS + u64 stay_cnt[AD_LEVEL_MAX]; 
+#endif + struct cpumask *domains[AD_LEVEL_MAX]; + struct cpumask *domains_orig[AD_LEVEL_MAX]; +}; + +struct auto_affinity { + raw_spinlock_t lock; + u64 mode; + ktime_t period; + struct hrtimer period_timer; + int period_active; + struct affinity_domain ad; + struct task_group *tg; +}; + +extern void start_auto_affinity(struct auto_affinity *auto_affi); +extern void stop_auto_affinity(struct auto_affinity *auto_affi); +extern int init_auto_affinity(struct task_group *tg); +extern void destroy_auto_affinity(struct task_group *tg); +extern void tg_update_affinity_domains(int cpu, int online); +extern u64 cpu_affinity_mode_read_u64(struct cgroup_subsys_state *css, + struct cftype *cft); +extern int cpu_affinity_mode_write_u64(struct cgroup_subsys_state *css, + struct cftype *cftype, u64 mode); +extern int cpu_affinity_period_write_uint(struct cgroup_subsys_state *css, + struct cftype *cftype, u64 period); +extern u64 cpu_affinity_period_read_uint(struct cgroup_subsys_state *css, + struct cftype *cft); +extern int cpu_affinity_domain_mask_write_u64(struct cgroup_subsys_state *css, + struct cftype *cftype, u64 mask); +extern u64 cpu_affinity_domain_mask_read_u64(struct cgroup_subsys_state *css, + struct cftype *cft); +extern int cpu_affinity_stat_show(struct seq_file *sf, void *v); +#endif +#endif diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 6d0ec315f7be..ad000f0514f3 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -31,6 +31,10 @@ #include <linux/sched/grid_qos.h> #include <trace/events/sched.h>
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY +#include "dynamic_affinity.c" +#endif + /* * Targeted preemption latency for CPU-bound tasks: * @@ -5295,460 +5299,6 @@ static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
#endif /* CONFIG_CFS_BANDWIDTH */
-#ifdef CONFIG_QOS_SCHED_SMART_GRID -#define AUTO_AFFINITY_DEFAULT_PERIOD_MS 2000 -#define IS_DOMAIN_SET(level, mask) ((1 << (level)) & (mask)) - -static DEFINE_MUTEX(smart_grid_used_mutex); - -static inline unsigned long cpu_util(int cpu); -static unsigned long target_load(int cpu, int type); -static unsigned long capacity_of(int cpu); -static int sched_idle_cpu(int cpu); -static unsigned long weighted_cpuload(struct rq *rq); -static inline bool prefer_cpus_valid(struct task_struct *p); - -int sysctl_affinity_adjust_delay_ms = 5000; - -struct static_key __smart_grid_used; - -static void smart_grid_usage_inc(void) -{ - static_key_slow_inc(&__smart_grid_used); -} - -static void smart_grid_usage_dec(void) -{ - static_key_slow_dec(&__smart_grid_used); -} - -static inline struct cpumask *task_prefer_cpus(struct task_struct *p) -{ - struct affinity_domain *ad; - - if (!smart_grid_used()) - return p->prefer_cpus; - - if (task_group(p)->auto_affinity->mode == 0) - return &p->cpus_allowed; - - ad = &task_group(p)->auto_affinity->ad; - return ad->domains[ad->curr_level]; -} - -static inline int dynamic_affinity_mode(struct task_struct *p) -{ - if (!prefer_cpus_valid(p)) - return -1; - - if (smart_grid_used()) - return task_group(p)->auto_affinity->mode == 0 ? -1 : 1; - - return 0; -} - -static void affinity_domain_up(struct task_group *tg) -{ - struct affinity_domain *ad = &tg->auto_affinity->ad; - u16 level = ad->curr_level; - - if (ad->curr_level >= ad->dcount - 1) - return; - - while (level < ad->dcount) { - if (IS_DOMAIN_SET(level + 1, ad->domain_mask) && - cpumask_weight(ad->domains[level + 1]) > 0) { - ad->curr_level = level + 1; - return; - } - level++; - } -} - -static void affinity_domain_down(struct task_group *tg) -{ - struct affinity_domain *ad = &tg->auto_affinity->ad; - u16 level = ad->curr_level; - - if (ad->curr_level <= 0) - return; - - while (level > 0) { - if (!cpumask_weight(ad->domains[level - 1])) - return; - - if (IS_DOMAIN_SET(level - 1, ad->domain_mask)) { - ad->curr_level = level - 1; - return; - } - level--; - } -} - -static enum hrtimer_restart sched_auto_affi_period_timer(struct hrtimer *timer) -{ - struct auto_affinity *auto_affi = - container_of(timer, struct auto_affinity, period_timer); - struct task_group *tg = auto_affi->tg; - struct affinity_domain *ad = &auto_affi->ad; - struct cpumask *span = ad->domains[ad->curr_level]; - unsigned long util_avg_sum = 0; - unsigned long tg_capacity = 0; - unsigned long flags; - int cpu; - - for_each_cpu(cpu, span) { - util_avg_sum += cpu_util(cpu); - tg_capacity += capacity_of(cpu); - } - - raw_spin_lock_irqsave(&auto_affi->lock, flags); - /* May be re-entrant by stop_auto_affinity, So check again. 
*/ - if (auto_affi->period_active == 0) { - raw_spin_unlock_irqrestore(&auto_affi->lock, flags); - return HRTIMER_NORESTART; - } - - if (util_avg_sum * 100 >= tg_capacity * sysctl_sched_util_low_pct) { - affinity_domain_up(tg); - } else if (util_avg_sum * 100 < tg_capacity * - sysctl_sched_util_low_pct / 2) { - affinity_domain_down(tg); - } - - schedstat_inc(ad->stay_cnt[ad->curr_level]); - hrtimer_forward_now(timer, auto_affi->period); - raw_spin_unlock_irqrestore(&auto_affi->lock, flags); - return HRTIMER_RESTART; -} - -static int tg_update_affinity_domain_down(struct task_group *tg, void *data) -{ - struct auto_affinity *auto_affi = tg->auto_affinity; - struct affinity_domain *ad; - int *cpu_state = data; - unsigned long flags; - int i; - - if (!auto_affi) - return 0; - - ad = &tg->auto_affinity->ad; - raw_spin_lock_irqsave(&auto_affi->lock, flags); - - for (i = 0; i < ad->dcount; i++) { - if (!cpumask_test_cpu(cpu_state[0], ad->domains_orig[i])) - continue; - - /* online */ - if (cpu_state[1]) { - cpumask_set_cpu(cpu_state[0], ad->domains[i]); - } else { - cpumask_clear_cpu(cpu_state[0], ad->domains[i]); - if (!cpumask_weight(ad->domains[i])) - affinity_domain_up(tg); - } - - } - raw_spin_unlock_irqrestore(&auto_affi->lock, flags); - - return 0; -} - -void tg_update_affinity_domains(int cpu, int online) -{ - int cpu_state[2]; - - cpu_state[0] = cpu; - cpu_state[1] = online; - - rcu_read_lock(); - walk_tg_tree(tg_update_affinity_domain_down, tg_nop, cpu_state); - rcu_read_unlock(); -} - -void start_auto_affinity(struct auto_affinity *auto_affi) -{ - ktime_t delay_ms; - - mutex_lock(&smart_grid_used_mutex); - raw_spin_lock_irq(&auto_affi->lock); - if (auto_affi->period_active == 1) { - raw_spin_unlock_irq(&auto_affi->lock); - mutex_unlock(&smart_grid_used_mutex); - return; - } - - auto_affi->period_active = 1; - auto_affi->mode = 1; - delay_ms = ms_to_ktime(sysctl_affinity_adjust_delay_ms); - hrtimer_forward_now(&auto_affi->period_timer, delay_ms); - hrtimer_start_expires(&auto_affi->period_timer, - HRTIMER_MODE_ABS_PINNED); - raw_spin_unlock_irq(&auto_affi->lock); - - smart_grid_usage_inc(); - mutex_unlock(&smart_grid_used_mutex); -} - -void stop_auto_affinity(struct auto_affinity *auto_affi) -{ - struct affinity_domain *ad = &auto_affi->ad; - - mutex_lock(&smart_grid_used_mutex); - raw_spin_lock_irq(&auto_affi->lock); - if (auto_affi->period_active == 0) { - raw_spin_unlock_irq(&auto_affi->lock); - mutex_unlock(&smart_grid_used_mutex); - return; - } - auto_affi->period_active = 0; - auto_affi->mode = 0; - ad->curr_level = ad->dcount > 0 ? 
ad->dcount - 1 : 0; - raw_spin_unlock_irq(&auto_affi->lock); - - smart_grid_usage_dec(); - mutex_unlock(&smart_grid_used_mutex); -} - -static struct sched_group *sd_find_idlest_group(struct sched_domain *sd) -{ - struct sched_group *idlest = NULL, *group = sd->groups; - unsigned long min_runnable_load = ULONG_MAX; - unsigned long min_avg_load = ULONG_MAX; - int imbalance_scale = 100 + (sd->imbalance_pct-100)/2; - unsigned long imbalance = scale_load_down(NICE_0_LOAD) * - (sd->imbalance_pct-100) / 100; - - do { - unsigned long load, avg_load, runnable_load; - int i; - - avg_load = 0; - runnable_load = 0; - - for_each_cpu(i, sched_group_span(group)) { - load = target_load(i, 0); - runnable_load += load; - avg_load += cfs_rq_load_avg(&cpu_rq(i)->cfs); - } - - avg_load = (avg_load * SCHED_CAPACITY_SCALE) / - group->sgc->capacity; - runnable_load = (runnable_load * SCHED_CAPACITY_SCALE) / - group->sgc->capacity; - - if (min_runnable_load > (runnable_load + imbalance)) { - min_runnable_load = runnable_load; - min_avg_load = avg_load; - idlest = group; - } else if ((runnable_load < (min_runnable_load + imbalance)) && - (100*min_avg_load > imbalance_scale*avg_load)) { - min_avg_load = avg_load; - idlest = group; - } - } while (group = group->next, group != sd->groups); - - return idlest ? idlest : group; -} - -static int group_find_idlest_cpu(struct sched_group *group) -{ - int least_loaded_cpu = cpumask_first(sched_group_span(group)); - unsigned long load, min_load = ULONG_MAX; - unsigned int min_exit_latency = UINT_MAX; - u64 latest_idle_timestamp = 0; - int shallowest_idle_cpu = -1; - int i; - - if (group->group_weight == 1) - return least_loaded_cpu; - - for_each_cpu(i, sched_group_span(group)) { - if (sched_idle_cpu(i)) - return i; - - if (available_idle_cpu(i)) { - struct rq *rq = cpu_rq(i); - struct cpuidle_state *idle = idle_get_state(rq); - - if (idle && idle->exit_latency < min_exit_latency) { - min_exit_latency = idle->exit_latency; - latest_idle_timestamp = rq->idle_stamp; - shallowest_idle_cpu = i; - } else if ((!idle || - idle->exit_latency == min_exit_latency) && - rq->idle_stamp > latest_idle_timestamp) { - latest_idle_timestamp = rq->idle_stamp; - shallowest_idle_cpu = i; - } - } else if (shallowest_idle_cpu == -1) { - load = weighted_cpuload(cpu_rq(i)); - if (load < min_load) { - min_load = load; - least_loaded_cpu = i; - } - } - } - - return shallowest_idle_cpu != -1 ? 
shallowest_idle_cpu : - least_loaded_cpu; -} - -void free_affinity_domains(struct affinity_domain *ad) -{ - int i; - - for (i = 0; i < AD_LEVEL_MAX; i++) { - kfree(ad->domains[i]); - kfree(ad->domains_orig[i]); - ad->domains[i] = NULL; - ad->domains_orig[i] = NULL; - } - ad->dcount = 0; -} - -static int init_affinity_domains_orig(struct affinity_domain *ad) -{ - int i, j; - - for (i = 0; i < ad->dcount; i++) { - ad->domains_orig[i] = kmalloc(sizeof(cpumask_t), GFP_KERNEL); - if (!ad->domains_orig[i]) - goto err; - - cpumask_copy(ad->domains_orig[i], ad->domains[i]); - } - - return 0; -err: - for (j = 0; j < i; j++) { - kfree(ad->domains_orig[j]); - ad->domains_orig[j] = NULL; - } - return -ENOMEM; -} - -static int init_affinity_domains(struct affinity_domain *ad) -{ - struct sched_domain *sd = NULL, *tmp; - struct sched_group *idlest = NULL; - int ret = -ENOMEM; - int dcount = 0; - int i = 0; - int cpu; - - for (i = 0; i < AD_LEVEL_MAX; i++) { - ad->domains[i] = kmalloc(sizeof(cpumask_t), GFP_KERNEL); - if (!ad->domains[i]) - goto err; - } - - rcu_read_lock(); - cpu = cpumask_first_and(cpu_active_mask, - housekeeping_cpumask(HK_FLAG_DOMAIN)); - for_each_domain(cpu, tmp) { - sd = tmp; - dcount++; - } - - if (!sd || dcount > AD_LEVEL_MAX) { - rcu_read_unlock(); - ret = -EINVAL; - goto err; - } - - idlest = sd_find_idlest_group(sd); - cpu = group_find_idlest_cpu(idlest); - i = 0; - for_each_domain(cpu, tmp) { - cpumask_copy(ad->domains[i], sched_domain_span(tmp)); - __schedstat_set(ad->stay_cnt[i], 0); - i++; - } - rcu_read_unlock(); - - ad->dcount = dcount; - ad->curr_level = ad->dcount > 0 ? ad->dcount - 1 : 0; - ad->domain_mask = (1 << ad->dcount) - 1; - - ret = init_affinity_domains_orig(ad); - if (ret) - goto err; - - return 0; -err: - free_affinity_domains(ad); - return ret; -} - -int init_auto_affinity(struct task_group *tg) -{ - struct auto_affinity *auto_affi; - int ret; - - auto_affi = kzalloc(sizeof(*auto_affi), GFP_KERNEL); - if (!auto_affi) - return -ENOMEM; - - raw_spin_lock_init(&auto_affi->lock); - auto_affi->mode = 0; - auto_affi->period_active = 0; - auto_affi->period = ms_to_ktime(AUTO_AFFINITY_DEFAULT_PERIOD_MS); - hrtimer_init(&auto_affi->period_timer, CLOCK_MONOTONIC, - HRTIMER_MODE_ABS_PINNED); - auto_affi->period_timer.function = sched_auto_affi_period_timer; - - ret = init_affinity_domains(&auto_affi->ad); - if (ret) { - kfree(auto_affi); - if (ret == -EINVAL) - ret = 0; - return ret; - } - - auto_affi->tg = tg; - tg->auto_affinity = auto_affi; - return 0; -} - -static void destroy_auto_affinity(struct task_group *tg) -{ - struct auto_affinity *auto_affi = tg->auto_affinity; - - if (unlikely(!auto_affi)) - return; - - if (auto_affi->period_active) - smart_grid_usage_dec(); - - hrtimer_cancel(&auto_affi->period_timer); - free_affinity_domains(&auto_affi->ad); - - kfree(tg->auto_affinity); - tg->auto_affinity = NULL; -} -#else -static void destroy_auto_affinity(struct task_group *tg) {} - -#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY -static inline bool prefer_cpus_valid(struct task_struct *p); - -static inline struct cpumask *task_prefer_cpus(struct task_struct *p) -{ - return p->prefer_cpus; -} - -static inline int dynamic_affinity_mode(struct task_struct *p) -{ - if (!prefer_cpus_valid(p)) - return -1; - - return 0; -} -#endif -#endif - /************************************************** * CFS operations on tasks: */ @@ -7182,121 +6732,6 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
}
-#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY - -#ifdef CONFIG_JUMP_LABEL -static DEFINE_STATIC_KEY_FALSE(__dynamic_affinity_used); - -static inline bool dynamic_affinity_used(void) -{ - return static_branch_unlikely(&__dynamic_affinity_used); -} - -void dynamic_affinity_enable(void) -{ - static_branch_enable_cpuslocked(&__dynamic_affinity_used); -} - -#else /* CONFIG_JUMP_LABEL */ -static bool dynamic_affinity_used(void) -{ - return true; -} - -void dynamic_affinity_enable(void) {} -#endif - -/* - * Low utilization threshold for CPU - * - * (default: 85%), units: percentage of CPU utilization) - */ -int sysctl_sched_util_low_pct = 85; - -static inline bool prefer_cpus_valid(struct task_struct *p) -{ - struct cpumask *prefer_cpus = task_prefer_cpus(p); - - return !cpumask_empty(prefer_cpus) && - !cpumask_equal(prefer_cpus, &p->cpus_allowed) && - cpumask_subset(prefer_cpus, &p->cpus_allowed); -} - -/* - * set_task_select_cpus: select the cpu range for task - * @p: the task whose available cpu range will to set - *uto_affinity_used @idlest_cpu: the cpu which is the idlest in prefer cpus - * - * If sum of 'util_avg' among 'preferred_cpus' lower than the percentage - * 'sysctl_sched_util_low_pct' of 'preferred_cpus' capacity, select - * 'preferred_cpus' range for task, otherwise select 'preferred_cpus' for task. - * - * The available cpu range set to p->select_cpus. Idlest cpu in preferred cpus - * set to @idlest_cpu, which is set to wakeup cpu when fast path wakeup cpu - * without p->select_cpus. - */ -static void set_task_select_cpus(struct task_struct *p, int *idlest_cpu, - int sd_flag) -{ - unsigned long util_avg_sum = 0; - unsigned long tg_capacity = 0; - long min_util = INT_MIN; - struct task_group *tg; - long spare; - int cpu, mode; - - rcu_read_lock(); - mode = dynamic_affinity_mode(p); - if (mode == -1) { - rcu_read_unlock(); - return; - } else if (mode == 1) { - p->select_cpus = task_prefer_cpus(p); - if (idlest_cpu) - *idlest_cpu = cpumask_first(p->select_cpus); - sched_qos_affinity_set(p); - rcu_read_unlock(); - return; - } - - /* manual mode */ - tg = task_group(p); - for_each_cpu(cpu, p->prefer_cpus) { - if (unlikely(!tg->se[cpu])) - continue; - - if (idlest_cpu && available_idle_cpu(cpu)) { - *idlest_cpu = cpu; - } else if (idlest_cpu) { - spare = (long)(capacity_of(cpu) - tg->se[cpu]->avg.util_avg); - if (spare > min_util) { - min_util = spare; - *idlest_cpu = cpu; - } - } - - if (available_idle_cpu(cpu)) { - rcu_read_unlock(); - p->select_cpus = p->prefer_cpus; - if (sd_flag & SD_BALANCE_WAKE) - schedstat_inc(p->se.dyn_affi_stats->nr_wakeups_preferred_cpus); - return; - } - - util_avg_sum += tg->se[cpu]->avg.util_avg; - tg_capacity += capacity_of(cpu); - } - rcu_read_unlock(); - - if (tg_capacity > cpumask_weight(p->prefer_cpus) && - util_avg_sum * 100 <= tg_capacity * sysctl_sched_util_low_pct) { - p->select_cpus = p->prefer_cpus; - if (sd_flag & SD_BALANCE_WAKE) - schedstat_inc(p->se.dyn_affi_stats->nr_wakeups_preferred_cpus); - } -} -#endif - /* * select_task_rq_fair: Select target runqueue for the waking task in domains * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE, @@ -11439,7 +10874,9 @@ void free_fair_sched_group(struct task_group *tg) int i;
destroy_cfs_bandwidth(tg_cfs_bandwidth(tg)); +#ifdef CONFIG_QOS_SCHED_SMART_GRID destroy_auto_affinity(tg); +#endif
for_each_possible_cpu(i) { #ifdef CONFIG_QOS_SCHED @@ -11460,7 +10897,10 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) { struct sched_entity *se; struct cfs_rq *cfs_rq; - int i, ret; + int i; +#ifdef CONFIG_QOS_SCHED_SMART_GRID + int ret; +#endif
tg->cfs_rq = kcalloc(nr_cpu_ids, sizeof(cfs_rq), GFP_KERNEL); if (!tg->cfs_rq) @@ -11472,9 +10912,11 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) tg->shares = NICE_0_LOAD;
init_cfs_bandwidth(tg_cfs_bandwidth(tg)); +#ifdef CONFIG_QOS_SCHED_SMART_GRID ret = init_auto_affinity(tg); if (ret) goto err; +#endif
for_each_possible_cpu(i) { cfs_rq = kzalloc_node(sizeof(struct cfs_rq), @@ -11497,7 +10939,9 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) err_free_rq: kfree(cfs_rq); err: +#ifdef CONFIG_QOS_SCHED_SMART_GRID destroy_auto_affinity(tg); +#endif return 0; }
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 1d882a2b8d5f..5be8aa80600d 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -75,6 +75,10 @@ #include "cpupri.h" #include "cpudeadline.h"
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY +#include "dynamic_affinity.h" +#endif + #ifdef CONFIG_SCHED_DEBUG # define SCHED_WARN_ON(x) WARN_ONCE(x, #x) #else @@ -361,34 +365,6 @@ struct cfs_bandwidth { #endif };
- -#ifdef CONFIG_QOS_SCHED_SMART_GRID -#define AD_LEVEL_MAX 8 - -struct affinity_domain { - int dcount; - int curr_level; - u32 domain_mask; -#ifdef CONFIG_SCHEDSTATS - u64 stay_cnt[AD_LEVEL_MAX]; -#endif - struct cpumask *domains[AD_LEVEL_MAX]; - struct cpumask *domains_orig[AD_LEVEL_MAX]; -}; -#endif - -struct auto_affinity { -#ifdef CONFIG_QOS_SCHED_SMART_GRID - raw_spinlock_t lock; - u64 mode; - ktime_t period; - struct hrtimer period_timer; - int period_active; - struct affinity_domain ad; - struct task_group *tg; -#endif -}; - /* Task group related information */ struct task_group { struct cgroup_subsys_state css; @@ -508,21 +484,6 @@ extern void sched_offline_group(struct task_group *tg);
extern void sched_move_task(struct task_struct *tsk);
-#ifdef CONFIG_QOS_SCHED_SMART_GRID -extern void start_auto_affinity(struct auto_affinity *auto_affi); -extern void stop_auto_affinity(struct auto_affinity *auto_affi); -extern int init_auto_affinity(struct task_group *tg); -extern void tg_update_affinity_domains(int cpu, int online); - -#else -static inline int init_auto_affinity(struct task_group *tg) -{ - return 0; -} - -static inline void tg_update_affinity_domains(int cpu, int online) {} -#endif - #ifdef CONFIG_FAIR_GROUP_SCHED extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index e8c82f3235e2..d845e6bb33b9 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2179,6 +2179,7 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, if (pol->mode == MPOL_INTERLEAVE) { unsigned nid;
+#ifdef CONFIG_QOS_SCHED_SMART_GRID if (smart_grid_used()) { nid = sched_grid_preferred_interleave_nid(pol); nid = (nid == NUMA_NO_NODE) ? @@ -2186,6 +2187,9 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, } else { nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order); } +#else + nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order); +#endif
mpol_cond_put(pol); page = alloc_page_interleave(gfp, order, nid); @@ -2248,8 +2252,10 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
nmask = policy_nodemask(gfp, pol); preferred_nid = policy_node(gfp, pol, node); +#ifdef CONFIG_QOS_SCHED_SMART_GRID if (smart_grid_used()) preferred_nid = sched_grid_preferred_nid(preferred_nid, nmask); +#endif page = __alloc_pages_nodemask(gfp, order, preferred_nid, nmask); mark_vma_cdm(nmask, page, vma); mpol_cond_put(pol);
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I94PBH
CVE: NA
-------------------------------
Add sysctl 'affinity_preferred_nodes' to set preferred nodes.
For example: echo 0,3 > /proc/sys/kernel/affinity_preferred_nodes
A preferred node is selected first when initializing the affinity domain if its utilization is lower than 85%. If its utilization is higher than 85% but less than about 15% higher than that of the non-preferred node, the preferred node is still selected.
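As a rough worked example of these two rules, based on auto_affinity_find_idlest_cpu() in the diff below (numbers are illustrative): a preferred node whose utilization is 70% of its capacity is picked immediately, since 70 < 85. If every preferred node is above 85%, the comparison "util * 100 < util_min * imbalance_pct" with imbalance_pct = 117 still lets a preferred node replace a non-preferred current best as long as its utilization stays below roughly 1.17x of the current best's scaled utilization; this is where the "less than ~15% higher" rule in the description comes from.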
Signed-off-by: Hui Tang <tanghui20@huawei.com>
---
 include/linux/sched/dynamic_affinity.h |   5 +
 kernel/sched/dynamic_affinity.c        | 186 +++++++++++++------
 kernel/sysctl.c                        |   7 +
 3 files changed, 108 insertions(+), 90 deletions(-)
diff --git a/include/linux/sched/dynamic_affinity.h b/include/linux/sched/dynamic_affinity.h index f9a22a29c0b7..93fc9e40469e 100644 --- a/include/linux/sched/dynamic_affinity.h +++ b/include/linux/sched/dynamic_affinity.h @@ -18,6 +18,11 @@ extern int set_prefer_cpus_ptr(struct task_struct *p, const struct cpumask *new_mask);
#ifdef CONFIG_QOS_SCHED_SMART_GRID +extern unsigned long *smart_grid_preferred_nodemask_bits; +extern int proc_cpu_affinity_domain_nodemask(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos); + extern struct static_key __smart_grid_used; static inline bool smart_grid_used(void) { diff --git a/kernel/sched/dynamic_affinity.c b/kernel/sched/dynamic_affinity.c index 9bead27c90ee..882617af51e1 100644 --- a/kernel/sched/dynamic_affinity.c +++ b/kernel/sched/dynamic_affinity.c @@ -174,6 +174,10 @@ static unsigned long weighted_cpuload(struct rq *rq);
int sysctl_affinity_adjust_delay_ms = 5000;
+ nodemask_t smart_grid_preferred_nodemask; +unsigned long *smart_grid_preferred_nodemask_bits = + nodes_addr(smart_grid_preferred_nodemask); + struct static_key __smart_grid_used;
static void smart_grid_usage_inc(void) @@ -375,90 +379,6 @@ void stop_auto_affinity(struct auto_affinity *auto_affi) mutex_unlock(&smart_grid_used_mutex); }
-static struct sched_group *sd_find_idlest_group(struct sched_domain *sd) -{ - struct sched_group *idlest = NULL, *group = sd->groups; - unsigned long min_runnable_load = ULONG_MAX; - unsigned long min_avg_load = ULONG_MAX; - int imbalance_scale = 100 + (sd->imbalance_pct-100)/2; - unsigned long imbalance = scale_load_down(NICE_0_LOAD) * - (sd->imbalance_pct-100) / 100; - - do { - unsigned long load, avg_load, runnable_load; - int i; - - avg_load = 0; - runnable_load = 0; - - for_each_cpu(i, sched_group_span(group)) { - load = target_load(i, 0); - runnable_load += load; - avg_load += cfs_rq_load_avg(&cpu_rq(i)->cfs); - } - - avg_load = (avg_load * SCHED_CAPACITY_SCALE) / - group->sgc->capacity; - runnable_load = (runnable_load * SCHED_CAPACITY_SCALE) / - group->sgc->capacity; - - if (min_runnable_load > (runnable_load + imbalance)) { - min_runnable_load = runnable_load; - min_avg_load = avg_load; - idlest = group; - } else if ((runnable_load < (min_runnable_load + imbalance)) && - (100*min_avg_load > imbalance_scale*avg_load)) { - min_avg_load = avg_load; - idlest = group; - } - } while (group = group->next, group != sd->groups); - - return idlest ? idlest : group; -} - -static int group_find_idlest_cpu(struct sched_group *group) -{ - int least_loaded_cpu = cpumask_first(sched_group_span(group)); - unsigned long load, min_load = ULONG_MAX; - unsigned int min_exit_latency = UINT_MAX; - u64 latest_idle_timestamp = 0; - int shallowest_idle_cpu = -1; - int i; - - if (group->group_weight == 1) - return least_loaded_cpu; - - for_each_cpu(i, sched_group_span(group)) { - if (sched_idle_cpu(i)) - return i; - - if (available_idle_cpu(i)) { - struct rq *rq = cpu_rq(i); - struct cpuidle_state *idle = idle_get_state(rq); - - if (idle && idle->exit_latency < min_exit_latency) { - min_exit_latency = idle->exit_latency; - latest_idle_timestamp = rq->idle_stamp; - shallowest_idle_cpu = i; - } else if ((!idle || - idle->exit_latency == min_exit_latency) && - rq->idle_stamp > latest_idle_timestamp) { - latest_idle_timestamp = rq->idle_stamp; - shallowest_idle_cpu = i; - } - } else if (shallowest_idle_cpu == -1) { - load = weighted_cpuload(cpu_rq(i)); - if (load < min_load) { - min_load = load; - least_loaded_cpu = i; - } - } - } - - return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : - least_loaded_cpu; -} - void free_affinity_domains(struct affinity_domain *ad) { int i; @@ -493,10 +413,80 @@ static int init_affinity_domains_orig(struct affinity_domain *ad) return -ENOMEM; }
+struct nid_stats { + unsigned long util; + unsigned long compute_capacity; + int idlest_cpu; +}; + +static inline void update_nid_stats(struct nid_stats *ns, int nid) +{ + int min_util = INT_MAX; + int cpu, avg_util; + + memset(ns, 0, sizeof(*ns)); + for_each_cpu(cpu, cpumask_of_node(nid)) { + ns->compute_capacity += capacity_of(cpu); + ns->util += cpu_util(cpu); + avg_util = cpu_util(cpu) * SCHED_CAPACITY_SCALE / + capacity_of(cpu); + if (avg_util < min_util) { + ns->idlest_cpu = cpu; + min_util = avg_util; + } + } +} + +static int auto_affinity_find_idlest_cpu(void) +{ + int nid, imbalance_pct, is_prefer; + unsigned long long util_min = UINT_MAX; + int idlest_is_prefer = 0; + struct nid_stats ns; + int idlest_nid = 0; + int idlest_cpu = 0; + + for_each_online_node(nid) { + if (!cpumask_intersects(cpumask_of_node(nid), + housekeeping_cpumask(HK_FLAG_DOMAIN))) + continue; + + update_nid_stats(&ns, nid); + + is_prefer = 0; + if (node_isset(nid, smart_grid_preferred_nodemask)) { + if (ns.util * 100 < + ns.compute_capacity * sysctl_sched_util_low_pct) { + idlest_nid = nid; + idlest_cpu = ns.idlest_cpu; + break; + } + is_prefer = 1; + } + + if (is_prefer && !idlest_is_prefer) + /* higher ~15% */ + imbalance_pct = 117; + else if (!is_prefer && idlest_is_prefer) + /* lower ~15% */ + imbalance_pct = 85; + else + imbalance_pct = 100; + + if (ns.util * 100 < util_min * imbalance_pct) { + util_min = ns.util * 100 / imbalance_pct; + idlest_nid = nid; + idlest_cpu = ns.idlest_cpu; + idlest_is_prefer = is_prefer; + } + } + + return idlest_cpu; +} + static int init_affinity_domains(struct affinity_domain *ad) { - struct sched_domain *sd = NULL, *tmp; - struct sched_group *idlest = NULL; + struct sched_domain *tmp; int ret = -ENOMEM; int dcount = 0; int i = 0; @@ -512,19 +502,17 @@ static int init_affinity_domains(struct affinity_domain *ad) cpu = cpumask_first_and(cpu_active_mask, housekeeping_cpumask(HK_FLAG_DOMAIN)); for_each_domain(cpu, tmp) { - sd = tmp; dcount++; }
- if (!sd || dcount > AD_LEVEL_MAX) { + if (dcount > AD_LEVEL_MAX) { rcu_read_unlock(); ret = -EINVAL; goto err; }
- idlest = sd_find_idlest_group(sd); - cpu = group_find_idlest_cpu(idlest); i = 0; + cpu = auto_affinity_find_idlest_cpu(); for_each_domain(cpu, tmp) { cpumask_copy(ad->domains[i], sched_domain_span(tmp)); __schedstat_set(ad->stay_cnt[i], 0); @@ -722,6 +710,24 @@ int cpu_affinity_stat_show(struct seq_file *sf, void *v)
return 0; } + +int proc_cpu_affinity_domain_nodemask(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int err; + + mutex_lock(&smart_grid_used_mutex); + + err = proc_do_large_bitmap(table, write, buffer, lenp, ppos); + if (!err && write) + nodes_and(smart_grid_preferred_nodemask, + smart_grid_preferred_nodemask, + node_online_map); + + mutex_unlock(&smart_grid_used_mutex); + return err; +} #else static inline bool prefer_cpus_valid(struct task_struct *p);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c7064f67f4a5..b0656ef92a2d 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1347,6 +1347,13 @@ static struct ctl_table kern_table[] = { .extra1 = &zero, .extra2 = &hundred_thousand, }, + { + .procname = "affinity_preferred_nodes", + .data = &smart_grid_preferred_nodemask_bits, + .maxlen = MAX_NUMNODES, + .mode = 0644, + .proc_handler = proc_cpu_affinity_domain_nodemask, + }, #endif { } };
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I94PBH CVE: NA
-------------------------------
Allow a task group to set/get util_low_pct.
If /sys/fs/cgroup/cpu/xx/cpu.affinity_util_low_pct is set, use it to adjust the domain level; otherwise use the value from /proc/sys/kernel/sched_util_low_pct.
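A minimal sketch of the precedence rule described above, assuming only the util_low_pct field and sysctl_sched_util_low_pct variable shown in the diff below; the helper name is illustrative, and the patch itself open-codes this in sched_auto_affi_period_timer():

  /* Sketch only: a non-negative per-group value wins; -1 means "unset"
   * and falls back to the global sysctl.
   */
  static inline int effective_util_low_pct(struct auto_affinity *auto_affi)
  {
          return auto_affi->util_low_pct >= 0 ?
                 auto_affi->util_low_pct : sysctl_sched_util_low_pct;
  }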
Signed-off-by: Hui Tang tanghui20@huawei.com --- kernel/sched/core.c | 5 +++++ kernel/sched/dynamic_affinity.c | 38 +++++++++++++++++++++++++++++---- kernel/sched/dynamic_affinity.h | 5 +++++ 3 files changed, 44 insertions(+), 4 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c7c1f3125c9f..2017481b3ff4 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7108,6 +7108,11 @@ static struct cftype cpu_legacy_files[] = { .name = "affinity_stat", .seq_show = cpu_affinity_stat_show, }, + { + .name = "affinity_util_low_pct", + .read_s64 = cpu_affinity_util_low_pct_read, + .write_s64 = cpu_affinity_util_low_pct_write, + }, #endif { } /* Terminate */ }; diff --git a/kernel/sched/dynamic_affinity.c b/kernel/sched/dynamic_affinity.c index 882617af51e1..9c9ab7703ca9 100644 --- a/kernel/sched/dynamic_affinity.c +++ b/kernel/sched/dynamic_affinity.c @@ -262,8 +262,8 @@ static enum hrtimer_restart sched_auto_affi_period_timer(struct hrtimer *timer) struct cpumask *span = ad->domains[ad->curr_level]; unsigned long util_avg_sum = 0; unsigned long tg_capacity = 0; + int cpu, util_low_pct; unsigned long flags; - int cpu;
for_each_cpu(cpu, span) { util_avg_sum += cpu_util(cpu); @@ -277,10 +277,11 @@ static enum hrtimer_restart sched_auto_affi_period_timer(struct hrtimer *timer) return HRTIMER_NORESTART; }
- if (util_avg_sum * 100 >= tg_capacity * sysctl_sched_util_low_pct) { + util_low_pct = auto_affi->util_low_pct >= 0 ? auto_affi->util_low_pct : + sysctl_sched_util_low_pct; + if (util_avg_sum * 100 >= tg_capacity * util_low_pct) { affinity_domain_up(tg); - } else if (util_avg_sum * 100 < tg_capacity * - sysctl_sched_util_low_pct / 2) { + } else if (util_avg_sum * 100 < tg_capacity * util_low_pct / 2) { affinity_domain_down(tg); }
@@ -546,6 +547,7 @@ int init_auto_affinity(struct task_group *tg) raw_spin_lock_init(&auto_affi->lock); auto_affi->mode = 0; auto_affi->period_active = 0; + auto_affi->util_low_pct = -1; auto_affi->period = ms_to_ktime(AUTO_AFFINITY_DEFAULT_PERIOD_MS); hrtimer_init(&auto_affi->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED); @@ -688,6 +690,34 @@ u64 cpu_affinity_domain_mask_read_u64(struct cgroup_subsys_state *css, return tg->auto_affinity->ad.domain_mask; }
+int cpu_affinity_util_low_pct_write(struct cgroup_subsys_state *css, + struct cftype *cftype, s64 util_pct) +{ + struct task_group *tg = css_tg(css); + + if (unlikely(!tg->auto_affinity)) + return -EPERM; + + if ((util_pct < 0 && util_pct != -1) || util_pct > 100) + return -EINVAL; + + raw_spin_lock_irq(&tg->auto_affinity->lock); + tg->auto_affinity->util_low_pct = util_pct; + raw_spin_unlock_irq(&tg->auto_affinity->lock); + return 0; +} + +s64 cpu_affinity_util_low_pct_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + struct task_group *tg = css_tg(css); + + if (unlikely(!tg->auto_affinity)) + return -EPERM; + + return tg->auto_affinity->util_low_pct; +} + int cpu_affinity_stat_show(struct seq_file *sf, void *v) { struct task_group *tg = css_tg(seq_css(sf)); diff --git a/kernel/sched/dynamic_affinity.h b/kernel/sched/dynamic_affinity.h index b58b7f98fde7..5860f7c472da 100644 --- a/kernel/sched/dynamic_affinity.h +++ b/kernel/sched/dynamic_affinity.h @@ -22,6 +22,7 @@ struct auto_affinity { ktime_t period; struct hrtimer period_timer; int period_active; + int util_low_pct; struct affinity_domain ad; struct task_group *tg; }; @@ -43,6 +44,10 @@ extern int cpu_affinity_domain_mask_write_u64(struct cgroup_subsys_state *css, struct cftype *cftype, u64 mask); extern u64 cpu_affinity_domain_mask_read_u64(struct cgroup_subsys_state *css, struct cftype *cft); +int cpu_affinity_util_low_pct_write(struct cgroup_subsys_state *css, + struct cftype *cftype, s64 util_pct); +s64 cpu_affinity_util_low_pct_read(struct cgroup_subsys_state *css, + struct cftype *cft); extern int cpu_affinity_stat_show(struct seq_file *sf, void *v); #endif #endif
hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I963C2 CVE: NA
----------------------------------------
When an SMT or cluster sched domain is built on a NUMA system, the adjustment of the domain level causes ping-pong between levels.
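The fix below scales the shrink threshold by the CPU-count ratio between the current level and the level the group would drop to. A sketch with illustrative numbers (the helper name is not in the patch):

  /* Sketch only, mirroring the ratio check added to
   * sched_auto_affi_period_timer().  Example: current span 8 CPUs,
   * lower span 2 CPUs, util_low_pct 85:
   *   ratio = 8 / 2 + 1 = 5, so the domain shrinks only while
   *   utilization is under 85% / 5 = 17% of the 8-CPU capacity, i.e.
   *   68% of the 2-CPU capacity -- still below the 85% up threshold,
   *   so the next period will not bounce straight back up.
   */
  static bool should_shrink(unsigned long util_sum, unsigned long capacity,
                            unsigned int curr_weight, unsigned int down_weight,
                            int util_low_pct)
  {
          unsigned int ratio = curr_weight / down_weight + 1;

          return util_sum * 100 * ratio < capacity * util_low_pct;
  }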
Signed-off-by: Hui Tang tanghui20@huawei.com --- kernel/sched/dynamic_affinity.c | 41 ++++++++++++++++++++++----------- 1 file changed, 27 insertions(+), 14 deletions(-)
diff --git a/kernel/sched/dynamic_affinity.c b/kernel/sched/dynamic_affinity.c index 9c9ab7703ca9..b5bbf319767f 100644 --- a/kernel/sched/dynamic_affinity.c +++ b/kernel/sched/dynamic_affinity.c @@ -233,24 +233,32 @@ static void affinity_domain_up(struct task_group *tg) } }
-static void affinity_domain_down(struct task_group *tg) +static inline int down_level_to(struct affinity_domain *ad) { - struct affinity_domain *ad = &tg->auto_affinity->ad; - u16 level = ad->curr_level; + int level = ad->curr_level;
- if (ad->curr_level <= 0) - return; + if (level <= 0) + return -1;
while (level > 0) { if (!cpumask_weight(ad->domains[level - 1])) - return; + return -1; + + if (IS_DOMAIN_SET(level - 1, ad->domain_mask)) + return level - 1;
- if (IS_DOMAIN_SET(level - 1, ad->domain_mask)) { - ad->curr_level = level - 1; - return; - } level--; } + + return -1; +} + +static void affinity_domain_down(struct task_group *tg) +{ + int down_level = down_level_to(&tg->auto_affinity->ad); + + if (down_level >= 0) + tg->auto_affinity->ad.curr_level = down_level; }
static enum hrtimer_restart sched_auto_affi_period_timer(struct hrtimer *timer) @@ -260,10 +268,11 @@ static enum hrtimer_restart sched_auto_affi_period_timer(struct hrtimer *timer) struct task_group *tg = auto_affi->tg; struct affinity_domain *ad = &auto_affi->ad; struct cpumask *span = ad->domains[ad->curr_level]; + int cpu, util_low_pct, down_level; unsigned long util_avg_sum = 0; unsigned long tg_capacity = 0; - int cpu, util_low_pct; unsigned long flags; + int ratio = 2;
for_each_cpu(cpu, span) { util_avg_sum += cpu_util(cpu); @@ -277,13 +286,17 @@ static enum hrtimer_restart sched_auto_affi_period_timer(struct hrtimer *timer) return HRTIMER_NORESTART; }
+ down_level = down_level_to(ad); + if (down_level >= 0) + ratio = cpumask_weight(ad->domains[ad->curr_level]) / + cpumask_weight(ad->domains[down_level]) + 1; + util_low_pct = auto_affi->util_low_pct >= 0 ? auto_affi->util_low_pct : sysctl_sched_util_low_pct; - if (util_avg_sum * 100 >= tg_capacity * util_low_pct) { + if (util_avg_sum * 100 >= tg_capacity * util_low_pct) affinity_domain_up(tg); - } else if (util_avg_sum * 100 < tg_capacity * util_low_pct / 2) { + else if (util_avg_sum * 100 * ratio < tg_capacity * util_low_pct) affinity_domain_down(tg); - }
schedstat_inc(ad->stay_cnt[ad->curr_level]); hrtimer_forward_now(timer, auto_affi->period);
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I963VJ CVE: NA
-------------------------------
If preferred_nmask is invalid (empty, or equal to the set of all online nodes), the memory policy for the task should not be modified.
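A sketch of the validity test both call sites apply after this change; the helper name is illustrative, and the patch open-codes the check in kernel/sched/grid/qos.c:

  /* Sketch only: an unset, empty, or all-online preferred node mask
   * carries no real preference, so the memory policy is left alone.
   */
  static inline bool preferred_nmask_valid(const nodemask_t *preferred_nmask)
  {
          return preferred_nmask && !nodes_empty(*preferred_nmask) &&
                 !nodes_equal(*preferred_nmask, node_online_map);
  }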
Signed-off-by: Hui Tang tanghui20@huawei.com --- kernel/sched/grid/qos.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/kernel/sched/grid/qos.c b/kernel/sched/grid/qos.c index f0f10dfb9fd4..730ca90c8f85 100644 --- a/kernel/sched/grid/qos.c +++ b/kernel/sched/grid/qos.c @@ -84,7 +84,8 @@ int sched_grid_preferred_interleave_nid(struct mempolicy *policy) preferred_nmask = &me->grid_qos->affinity.mem_preferred_node_mask;
- if (!preferred_nmask || !policy) + if (!preferred_nmask || !policy || nodes_empty(*preferred_nmask) || + nodes_equal(*preferred_nmask, node_online_map)) return NUMA_NO_NODE;
if (nodes_equal(policy->v.nodes, *preferred_nmask)) @@ -115,7 +116,8 @@ int sched_grid_preferred_nid(int preferred_nid, nodemask_t *nodemask) preferred_nmask = ¤t->grid_qos->affinity.mem_preferred_node_mask;
- if (!preferred_nmask) + if (!preferred_nmask || nodes_empty(*preferred_nmask) || + nodes_equal(*preferred_nmask, node_online_map)) return preferred_nid;
/*
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I9648D CVE: NA
-------------------------------
If ad->curr_level has already reached the highest level, sched_qos_affinity_set() is not called: the domain span at that level equals p->cpus_allowed, so the old prefer_cpus_valid() rejects it and dynamic_affinity_mode() returns -1.
set_task_select_cpus():
  -> dynamic_affinity_mode()   // return -1
     -> prefer_cpus_valid()    // cpumask_equal(prefer_cpus, &p->cpus_allowed)
        -> task_prefer_cpus()  // ad->curr_level is the highest level
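A sketch of the reworked validity decision (the helper name is illustrative; the patch folds this into dynamic_affinity_mode() as shown below). The point is that the "equal to cpus_allowed" rejection only applies to the per-task prefer_cpus case, so a top-level smart-grid domain spanning all allowed CPUs stays valid:

  /* Sketch only: returns the effective mode, or -1 if the mask gives
   * nothing to act on.  mode 0 = per-task prefer_cpus, mode 1 = smart
   * grid domain.
   */
  static int affinity_mask_valid(const struct cpumask *prefer_cpus,
                                 const struct cpumask *allowed, int mode)
  {
          if (cpumask_empty(prefer_cpus) ||
              !cpumask_subset(prefer_cpus, allowed))
                  return -1;

          if (mode == 0 && cpumask_equal(prefer_cpus, allowed))
                  return -1;

          return mode;
  }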
Signed-off-by: Hui Tang tanghui20@huawei.com --- kernel/sched/dynamic_affinity.c | 80 ++++++++++++++------------------- 1 file changed, 33 insertions(+), 47 deletions(-)
diff --git a/kernel/sched/dynamic_affinity.c b/kernel/sched/dynamic_affinity.c index b5bbf319767f..9c5317fe7497 100644 --- a/kernel/sched/dynamic_affinity.c +++ b/kernel/sched/dynamic_affinity.c @@ -18,7 +18,6 @@ */ #include "dynamic_affinity.h"
-static inline struct cpumask *task_prefer_cpus(struct task_struct *p); static inline int dynamic_affinity_mode(struct task_struct *p); static unsigned long capacity_of(int cpu); static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq); @@ -42,13 +41,40 @@ static inline bool dynamic_affinity_used(void) */ int sysctl_sched_util_low_pct = 85;
-static inline bool prefer_cpus_valid(struct task_struct *p) +static inline struct cpumask *task_prefer_cpus(struct task_struct *p, int mode) { - struct cpumask *prefer_cpus = task_prefer_cpus(p); +#ifdef CONFIG_QOS_SCHED_SMART_GRID + struct affinity_domain *ad = &task_group(p)->auto_affinity->ad; + + if (mode == 1) + return ad->domains[ad->curr_level]; +#endif + + return p->prefer_cpus; +} + +static inline int dynamic_affinity_mode(struct task_struct *p) +{ + struct cpumask *prefer_cpus; + int mode = 0; + +#ifdef CONFIG_QOS_SCHED_SMART_GRID + if (smart_grid_used()) { + mode = task_group(p)->auto_affinity->mode == 0 ? -1 : 1; + if (mode == -1) + return -1; + } +#endif
- return !cpumask_empty(prefer_cpus) && - !cpumask_equal(prefer_cpus, &p->cpus_allowed) && - cpumask_subset(prefer_cpus, &p->cpus_allowed); + prefer_cpus = task_prefer_cpus(p, mode); + if (cpumask_empty(prefer_cpus) || + !cpumask_subset(prefer_cpus, &p->cpus_allowed)) + return -1; + + if (mode == 0 && cpumask_equal(prefer_cpus, &p->cpus_allowed)) + return -1; + + return mode; }
/* @@ -79,7 +105,7 @@ void set_task_select_cpus(struct task_struct *p, int *idlest_cpu, int sd_flag) rcu_read_unlock(); return; } else if (mode == 1) { - p->select_cpus = task_prefer_cpus(p); + p->select_cpus = task_prefer_cpus(p, mode); if (idlest_cpu) *idlest_cpu = cpumask_first(p->select_cpus); sched_qos_affinity_set(p); @@ -190,31 +216,6 @@ static void smart_grid_usage_dec(void) static_key_slow_dec(&__smart_grid_used); }
-static inline struct cpumask *task_prefer_cpus(struct task_struct *p) -{ - struct affinity_domain *ad; - - if (!smart_grid_used()) - return p->prefer_cpus; - - if (task_group(p)->auto_affinity->mode == 0) - return &p->cpus_allowed; - - ad = &task_group(p)->auto_affinity->ad; - return ad->domains[ad->curr_level]; -} - -static inline int dynamic_affinity_mode(struct task_struct *p) -{ - if (!prefer_cpus_valid(p)) - return -1; - - if (smart_grid_used()) - return task_group(p)->auto_affinity->mode == 0 ? -1 : 1; - - return 0; -} - static void affinity_domain_up(struct task_group *tg) { struct affinity_domain *ad = &tg->auto_affinity->ad; @@ -771,19 +772,4 @@ int proc_cpu_affinity_domain_nodemask(struct ctl_table *table, int write, mutex_unlock(&smart_grid_used_mutex); return err; } -#else -static inline bool prefer_cpus_valid(struct task_struct *p); - -static inline struct cpumask *task_prefer_cpus(struct task_struct *p) -{ - return p->prefer_cpus; -} - -static inline int dynamic_affinity_mode(struct task_struct *p) -{ - if (!prefer_cpus_valid(p)) - return -1; - - return 0; -} #endif
FeedBack: The patch(es) you have sent to the kernel@openeuler.org mailing list have been converted to a pull request successfully!
Pull request link: https://gitee.com/openeuler/kernel/pulls/4977
Mailing list address: https://mailweb.openeuler.org/hyperkitty/list/kernel@openeuler.org/message/H...