hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I94PBH
CVE: NA
-------------------------------
Add sysctl 'affinity_preferred_nodes' to set preferred nodes.
For example: echo 0,3 > /proc/sys/kernel/affinity_preferred_nodes
The preferred nodes are selected first when initializing the affinity domain if their utilization is below 85%. If a preferred node's utilization is above 85% but less than about 15% higher than that of a non-preferred node, the preferred node is still selected first.
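An illustrative sketch of the interface (the read-back format below is an assumption based on proc_do_large_bitmap's node-range output; node IDs are examples only):

  # prefer NUMA nodes 0 and 3 when the affinity domain is initialized
  echo 0,3 > /proc/sys/kernel/affinity_preferred_nodes
  # read the current mask back; offline nodes are masked out on write
  cat /proc/sys/kernel/affinity_preferred_nodes
  0,3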
Signed-off-by: Hui Tang <tanghui20@huawei.com>
---
 include/linux/sched/dynamic_affinity.h |   5 +
 kernel/sched/dynamic_affinity.c        | 186 +++++++++++++------------
 kernel/sysctl.c                        |   7 +
 3 files changed, 108 insertions(+), 90 deletions(-)
diff --git a/include/linux/sched/dynamic_affinity.h b/include/linux/sched/dynamic_affinity.h
index f9a22a29c0b7..93fc9e40469e 100644
--- a/include/linux/sched/dynamic_affinity.h
+++ b/include/linux/sched/dynamic_affinity.h
@@ -18,6 +18,11 @@ extern int set_prefer_cpus_ptr(struct task_struct *p,
 			       const struct cpumask *new_mask);
 
 #ifdef CONFIG_QOS_SCHED_SMART_GRID
+extern unsigned long *smart_grid_preferred_nodemask_bits;
+extern int proc_cpu_affinity_domain_nodemask(struct ctl_table *table, int write,
+					     void __user *buffer, size_t *lenp,
+					     loff_t *ppos);
+
 extern struct static_key __smart_grid_used;
 static inline bool smart_grid_used(void)
 {
diff --git a/kernel/sched/dynamic_affinity.c b/kernel/sched/dynamic_affinity.c
index 9bead27c90ee..882617af51e1 100644
--- a/kernel/sched/dynamic_affinity.c
+++ b/kernel/sched/dynamic_affinity.c
@@ -174,6 +174,10 @@ static unsigned long weighted_cpuload(struct rq *rq);
 
 int sysctl_affinity_adjust_delay_ms = 5000;
 
+nodemask_t smart_grid_preferred_nodemask;
+unsigned long *smart_grid_preferred_nodemask_bits =
+	nodes_addr(smart_grid_preferred_nodemask);
+
 struct static_key __smart_grid_used;
 
 static void smart_grid_usage_inc(void)
@@ -375,90 +379,6 @@ void stop_auto_affinity(struct auto_affinity *auto_affi)
 	mutex_unlock(&smart_grid_used_mutex);
 }
 
-static struct sched_group *sd_find_idlest_group(struct sched_domain *sd)
-{
-	struct sched_group *idlest = NULL, *group = sd->groups;
-	unsigned long min_runnable_load = ULONG_MAX;
-	unsigned long min_avg_load = ULONG_MAX;
-	int imbalance_scale = 100 + (sd->imbalance_pct-100)/2;
-	unsigned long imbalance = scale_load_down(NICE_0_LOAD) *
-				(sd->imbalance_pct-100) / 100;
-
-	do {
-		unsigned long load, avg_load, runnable_load;
-		int i;
-
-		avg_load = 0;
-		runnable_load = 0;
-
-		for_each_cpu(i, sched_group_span(group)) {
-			load = target_load(i, 0);
-			runnable_load += load;
-			avg_load += cfs_rq_load_avg(&cpu_rq(i)->cfs);
-		}
-
-		avg_load = (avg_load * SCHED_CAPACITY_SCALE) /
-				group->sgc->capacity;
-		runnable_load = (runnable_load * SCHED_CAPACITY_SCALE) /
-				group->sgc->capacity;
-
-		if (min_runnable_load > (runnable_load + imbalance)) {
-			min_runnable_load = runnable_load;
-			min_avg_load = avg_load;
-			idlest = group;
-		} else if ((runnable_load < (min_runnable_load + imbalance)) &&
-			   (100*min_avg_load > imbalance_scale*avg_load)) {
-			min_avg_load = avg_load;
-			idlest = group;
-		}
-	} while (group = group->next, group != sd->groups);
-
-	return idlest ? idlest : group;
-}
-
-static int group_find_idlest_cpu(struct sched_group *group)
-{
-	int least_loaded_cpu = cpumask_first(sched_group_span(group));
-	unsigned long load, min_load = ULONG_MAX;
-	unsigned int min_exit_latency = UINT_MAX;
-	u64 latest_idle_timestamp = 0;
-	int shallowest_idle_cpu = -1;
-	int i;
-
-	if (group->group_weight == 1)
-		return least_loaded_cpu;
-
-	for_each_cpu(i, sched_group_span(group)) {
-		if (sched_idle_cpu(i))
-			return i;
-
-		if (available_idle_cpu(i)) {
-			struct rq *rq = cpu_rq(i);
-			struct cpuidle_state *idle = idle_get_state(rq);
-
-			if (idle && idle->exit_latency < min_exit_latency) {
-				min_exit_latency = idle->exit_latency;
-				latest_idle_timestamp = rq->idle_stamp;
-				shallowest_idle_cpu = i;
-			} else if ((!idle ||
-				    idle->exit_latency == min_exit_latency) &&
-				   rq->idle_stamp > latest_idle_timestamp) {
-				latest_idle_timestamp = rq->idle_stamp;
-				shallowest_idle_cpu = i;
-			}
-		} else if (shallowest_idle_cpu == -1) {
-			load = weighted_cpuload(cpu_rq(i));
-			if (load < min_load) {
-				min_load = load;
-				least_loaded_cpu = i;
-			}
-		}
-	}
-
-	return shallowest_idle_cpu != -1 ? shallowest_idle_cpu :
-	       least_loaded_cpu;
-}
-
 void free_affinity_domains(struct affinity_domain *ad)
 {
 	int i;
@@ -493,10 +413,80 @@ static int init_affinity_domains_orig(struct affinity_domain *ad)
 	return -ENOMEM;
 }
 
+struct nid_stats {
+	unsigned long util;
+	unsigned long compute_capacity;
+	int idlest_cpu;
+};
+
+static inline void update_nid_stats(struct nid_stats *ns, int nid)
+{
+	int min_util = INT_MAX;
+	int cpu, avg_util;
+
+	memset(ns, 0, sizeof(*ns));
+	for_each_cpu(cpu, cpumask_of_node(nid)) {
+		ns->compute_capacity += capacity_of(cpu);
+		ns->util += cpu_util(cpu);
+		avg_util = cpu_util(cpu) * SCHED_CAPACITY_SCALE /
+			   capacity_of(cpu);
+		if (avg_util < min_util) {
+			ns->idlest_cpu = cpu;
+			min_util = avg_util;
+		}
+	}
+}
+
+static int auto_affinity_find_idlest_cpu(void)
+{
+	int nid, imbalance_pct, is_prefer;
+	unsigned long long util_min = UINT_MAX;
+	int idlest_is_prefer = 0;
+	struct nid_stats ns;
+	int idlest_nid = 0;
+	int idlest_cpu = 0;
+
+	for_each_online_node(nid) {
+		if (!cpumask_intersects(cpumask_of_node(nid),
+					housekeeping_cpumask(HK_FLAG_DOMAIN)))
+			continue;
+
+		update_nid_stats(&ns, nid);
+
+		is_prefer = 0;
+		if (node_isset(nid, smart_grid_preferred_nodemask)) {
+			if (ns.util * 100 <
+			    ns.compute_capacity * sysctl_sched_util_low_pct) {
+				idlest_nid = nid;
+				idlest_cpu = ns.idlest_cpu;
+				break;
+			}
+			is_prefer = 1;
+		}
+
+		if (is_prefer && !idlest_is_prefer)
+			/* higher ~15% */
+			imbalance_pct = 117;
+		else if (!is_prefer && idlest_is_prefer)
+			/* lower ~15% */
+			imbalance_pct = 85;
+		else
+			imbalance_pct = 100;
+
+		if (ns.util * 100 < util_min * imbalance_pct) {
+			util_min = ns.util * 100 / imbalance_pct;
+			idlest_nid = nid;
+			idlest_cpu = ns.idlest_cpu;
+			idlest_is_prefer = is_prefer;
+		}
+	}
+
+	return idlest_cpu;
+}
+
 static int init_affinity_domains(struct affinity_domain *ad)
 {
-	struct sched_domain *sd = NULL, *tmp;
-	struct sched_group *idlest = NULL;
+	struct sched_domain *tmp;
 	int ret = -ENOMEM;
 	int dcount = 0;
 	int i = 0;
@@ -512,19 +502,17 @@ static int init_affinity_domains(struct affinity_domain *ad)
 	cpu = cpumask_first_and(cpu_active_mask,
 				housekeeping_cpumask(HK_FLAG_DOMAIN));
 	for_each_domain(cpu, tmp) {
-		sd = tmp;
 		dcount++;
 	}
 
-	if (!sd || dcount > AD_LEVEL_MAX) {
+	if (dcount > AD_LEVEL_MAX) {
 		rcu_read_unlock();
 		ret = -EINVAL;
 		goto err;
 	}
 
-	idlest = sd_find_idlest_group(sd);
-	cpu = group_find_idlest_cpu(idlest);
 	i = 0;
+	cpu = auto_affinity_find_idlest_cpu();
 	for_each_domain(cpu, tmp) {
 		cpumask_copy(ad->domains[i], sched_domain_span(tmp));
 		__schedstat_set(ad->stay_cnt[i], 0);
@@ -722,6 +710,24 @@ int cpu_affinity_stat_show(struct seq_file *sf, void *v)
 
 	return 0;
 }
+
+int proc_cpu_affinity_domain_nodemask(struct ctl_table *table, int write,
+				      void __user *buffer, size_t *lenp,
+				      loff_t *ppos)
+{
+	int err;
+
+	mutex_lock(&smart_grid_used_mutex);
+
+	err = proc_do_large_bitmap(table, write, buffer, lenp, ppos);
+	if (!err && write)
+		nodes_and(smart_grid_preferred_nodemask,
+			  smart_grid_preferred_nodemask,
+			  node_online_map);
+
+	mutex_unlock(&smart_grid_used_mutex);
+	return err;
+}
 #else
 static inline bool prefer_cpus_valid(struct task_struct *p);
 
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index c7064f67f4a5..b0656ef92a2d 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1347,6 +1347,13 @@ static struct ctl_table kern_table[] = {
 		.extra1		= &zero,
 		.extra2		= &hundred_thousand,
 	},
+	{
+		.procname	= "affinity_preferred_nodes",
+		.data		= &smart_grid_preferred_nodemask_bits,
+		.maxlen		= MAX_NUMNODES,
+		.mode		= 0644,
+		.proc_handler	= proc_cpu_affinity_domain_nodemask,
+	},
 #endif
 	{ }
 };