hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I7G6SW
CVE: NA
-------------------------------
Add sysctl 'affinity_preferred_nodes' to set preferred nodes.
For example: echo 0,3 > /proc/sys/kernel/affinity_preferred_nodes
The preferred nodes are selected first when initializing the affinity domain if their utilization is below 85%. If a preferred node's utilization is above 85% but less than 15% higher than that of a non-preferred node, the preferred node is still selected first.
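
Illustrative example (not part of the patch; assumes sysctl_sched_util_low_pct is 85 and two nodes of equal capacity): a preferred node running at 60% of its capacity is chosen immediately, since 60 < 85. A preferred node at 90% is still chosen over a non-preferred node at 80%, because the comparison grants the preferred node roughly 15% headroom (imbalance_pct = 117, and 90 < 80 * 1.17). Conversely, once a preferred node is the current candidate, a non-preferred node must be roughly 15% less utilized to displace it (imbalance_pct = 85).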
Signed-off-by: Hui Tang <tanghui20@huawei.com>
---
 include/linux/sched/sysctl.h |   4 +
 kernel/sched/fair.c          | 177 +++++++++++++++++------------------
 kernel/sched/sched.h         |   1 +
 kernel/sysctl.c              |   7 ++
 4 files changed, 98 insertions(+), 91 deletions(-)
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index ad472760e97d7..8475ee46727ac 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -38,6 +38,10 @@ extern int sysctl_sched_util_low_pct;
 
 #ifdef CONFIG_QOS_SCHED_SMART_GRID
 extern int sysctl_affinity_adjust_delay_ms;
+extern unsigned long *smart_grid_preferred_nodemask_bits;
+extern int proc_cpu_affinity_domain_nodemask(struct ctl_table *table, int write,
+					     void __user *buffer, size_t *lenp,
+					     loff_t *ppos);
 #endif
 
 enum sched_tunable_scaling {
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e9afb1e6ca4ca..571123e2a23a6 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5296,6 +5296,9 @@ static unsigned long weighted_cpuload(struct rq *rq);
 static inline bool prefer_cpus_valid(struct task_struct *p);
 
 int sysctl_affinity_adjust_delay_ms = 5000;
+nodemask_t smart_grid_preferred_nodemask;
+unsigned long *smart_grid_preferred_nodemask_bits =
+	nodes_addr(smart_grid_preferred_nodemask);
 
 struct static_key __smart_grid_used;
 
@@ -5494,90 +5497,6 @@ void stop_auto_affinity(struct auto_affinity *auto_affi)
 	mutex_unlock(&smart_grid_used_mutex);
 }
 
-static struct sched_group *sd_find_idlest_group(struct sched_domain *sd)
-{
-	struct sched_group *idlest = NULL, *group = sd->groups;
-	unsigned long min_runnable_load = ULONG_MAX;
-	unsigned long min_avg_load = ULONG_MAX;
-	int imbalance_scale = 100 + (sd->imbalance_pct-100)/2;
-	unsigned long imbalance = scale_load_down(NICE_0_LOAD) *
-				(sd->imbalance_pct-100) / 100;
-
-	do {
-		unsigned long load, avg_load, runnable_load;
-		int i;
-
-		avg_load = 0;
-		runnable_load = 0;
-
-		for_each_cpu(i, sched_group_span(group)) {
-			load = target_load(i, 0);
-			runnable_load += load;
-			avg_load += cfs_rq_load_avg(&cpu_rq(i)->cfs);
-		}
-
-		avg_load = (avg_load * SCHED_CAPACITY_SCALE) /
-			group->sgc->capacity;
-		runnable_load = (runnable_load * SCHED_CAPACITY_SCALE) /
-			group->sgc->capacity;
-
-		if (min_runnable_load > (runnable_load + imbalance)) {
-			min_runnable_load = runnable_load;
-			min_avg_load = avg_load;
-			idlest = group;
-		} else if ((runnable_load < (min_runnable_load + imbalance)) &&
-			   (100*min_avg_load > imbalance_scale*avg_load)) {
-			min_avg_load = avg_load;
-			idlest = group;
-		}
-	} while (group = group->next, group != sd->groups);
-
-	return idlest ? idlest : group;
-}
-
-static int group_find_idlest_cpu(struct sched_group *group)
-{
-	int least_loaded_cpu = cpumask_first(sched_group_span(group));
-	unsigned long load, min_load = ULONG_MAX;
-	unsigned int min_exit_latency = UINT_MAX;
-	u64 latest_idle_timestamp = 0;
-	int shallowest_idle_cpu = -1;
-	int i;
-
-	if (group->group_weight == 1)
-		return least_loaded_cpu;
-
-	for_each_cpu(i, sched_group_span(group)) {
-		if (sched_idle_cpu(i))
-			return i;
-
-		if (available_idle_cpu(i)) {
-			struct rq *rq = cpu_rq(i);
-			struct cpuidle_state *idle = idle_get_state(rq);
-
-			if (idle && idle->exit_latency < min_exit_latency) {
-				min_exit_latency = idle->exit_latency;
-				latest_idle_timestamp = rq->idle_stamp;
-				shallowest_idle_cpu = i;
-			} else if ((!idle ||
-				    idle->exit_latency == min_exit_latency) &&
-				   rq->idle_stamp > latest_idle_timestamp) {
-				latest_idle_timestamp = rq->idle_stamp;
-				shallowest_idle_cpu = i;
-			}
-		} else if (shallowest_idle_cpu == -1) {
-			load = weighted_cpuload(cpu_rq(i));
-			if (load < min_load) {
-				min_load = load;
-				least_loaded_cpu = i;
-			}
-		}
-	}
-
-	return shallowest_idle_cpu != -1 ? shallowest_idle_cpu :
-	       least_loaded_cpu;
-}
-
 void free_affinity_domains(struct affinity_domain *ad)
 {
 	int i;
@@ -5612,14 +5531,72 @@ static int init_affinity_domains_orig(struct affinity_domain *ad)
 	return -ENOMEM;
 }
 
+struct nid_stats {
+	unsigned long util;
+	unsigned long compute_capacity;
+};
+
+static inline void update_nid_stats(struct nid_stats *ns, int nid)
+{
+	int cpu;
+
+	memset(ns, 0, sizeof(*ns));
+	for_each_cpu(cpu, cpumask_of_node(nid)) {
+		ns->compute_capacity += capacity_of(cpu);
+		ns->util += cpu_util(cpu);
+	}
+}
+
+static int find_idlest_nid(void)
+{
+	int nid, imbalance_pct, is_prefer;
+	unsigned long long util_min = UINT_MAX;
+	int idlest_is_prefer = 0;
+	struct nid_stats ns;
+	int idlest_nid = 0;
+
+	for_each_online_node(nid) {
+		if (!cpumask_intersects(cpumask_of_node(nid),
+					housekeeping_cpumask(HK_FLAG_DOMAIN)))
+			continue;
+
+		update_nid_stats(&ns, nid);
+
+		if (node_isset(nid, smart_grid_preferred_nodemask)) {
+			if (ns.util * 100 <
+			    ns.compute_capacity * sysctl_sched_util_low_pct) {
+				idlest_nid = nid;
+				break;
+			}
+			is_prefer = 1;
+		} else {
+			is_prefer = 0;
+		}
+
+		if (is_prefer && !idlest_is_prefer)
+			imbalance_pct = 117; // higher 15%
+		else if (!is_prefer && idlest_is_prefer)
+			imbalance_pct = 85; // lower 15%
+		else
+			imbalance_pct = 100;
+
+		if (ns.util * 100 < util_min * imbalance_pct) {
+			util_min = ns.util * 100 / imbalance_pct;
+			idlest_nid = nid;
+			idlest_is_prefer = is_prefer;
+		}
+	}
+
+	return idlest_nid;
+}
+
 static int init_affinity_domains(struct affinity_domain *ad)
 {
-	struct sched_domain *sd = NULL, *tmp;
-	struct sched_group *idlest = NULL;
+	struct sched_domain *tmp;
+	int cpu, idlest_nid;
 	int ret = -ENOMEM;
 	int dcount = 0;
 	int i = 0;
-	int cpu;
 
 	for (i = 0; i < AD_LEVEL_MAX; i++) {
 		ad->domains[i] = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
@@ -5631,18 +5608,18 @@ static int init_affinity_domains(struct affinity_domain *ad)
 	cpu = cpumask_first_and(cpu_active_mask,
 				housekeeping_cpumask(HK_FLAG_DOMAIN));
 	for_each_domain(cpu, tmp) {
-		sd = tmp;
 		dcount++;
 	}
 
-	if (!sd || dcount > AD_LEVEL_MAX) {
+	if (dcount > AD_LEVEL_MAX) {
 		rcu_read_unlock();
 		ret = -EINVAL;
 		goto err;
 	}
 
-	idlest = sd_find_idlest_group(sd);
-	cpu = group_find_idlest_cpu(idlest);
+	idlest_nid = find_idlest_nid();
+	cpu = cpumask_first_and(cpumask_of_node(idlest_nid),
+				housekeeping_cpumask(HK_FLAG_DOMAIN));
 	i = 0;
 	for_each_domain(cpu, tmp) {
 		cpumask_copy(ad->domains[i], sched_domain_span(tmp));
@@ -5711,6 +5688,24 @@ static void destroy_auto_affinity(struct task_group *tg)
 	kfree(tg->auto_affinity);
 	tg->auto_affinity = NULL;
 }
+
+int proc_cpu_affinity_domain_nodemask(struct ctl_table *table, int write,
+				      void __user *buffer, size_t *lenp,
+				      loff_t *ppos)
+{
+	int err;
+
+	mutex_lock(&smart_grid_used_mutex);
+
+	err = proc_do_large_bitmap(table, write, buffer, lenp, ppos);
+	if (!err && write)
+		nodes_and(smart_grid_preferred_nodemask,
+			  smart_grid_preferred_nodemask,
+			  node_online_map);
+
+	mutex_unlock(&smart_grid_used_mutex);
+	return err;
+}
 #else
 static void destroy_auto_affinity(struct task_group *tg) {}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 1d882a2b8d5fe..ef3f5c84e8b95 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -509,6 +509,7 @@ extern void sched_offline_group(struct task_group *tg);
 extern void sched_move_task(struct task_struct *tsk);
 
 #ifdef CONFIG_QOS_SCHED_SMART_GRID
+extern nodemask_t smart_grid_preferred_nodemask;
 extern void start_auto_affinity(struct auto_affinity *auto_affi);
 extern void stop_auto_affinity(struct auto_affinity *auto_affi);
 extern int init_auto_affinity(struct task_group *tg);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index c7064f67f4a53..b0656ef92a2df 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1347,6 +1347,13 @@ static struct ctl_table kern_table[] = {
 		.extra1		= &zero,
 		.extra2		= &hundred_thousand,
 	},
+	{
+		.procname	= "affinity_preferred_nodes",
+		.data		= &smart_grid_preferred_nodemask_bits,
+		.maxlen		= MAX_NUMNODES,
+		.mode		= 0644,
+		.proc_handler	= proc_cpu_affinity_domain_nodemask,
+	},
 #endif
 	{ }
 };
FeedBack: The patch(es) which you have sent to kernel@openeuler.org mailing list has been converted to a pull request successfully! Pull request link: https://gitee.com/openeuler/kernel/pulls/1948 Mailing list address: https://mailweb.openeuler.org/hyperkitty/list/kernel@openeuler.org/message/Q...