From: Hui Tang <tanghui20@huawei.com>
hulk inclusion
category: feature
bugzilla: 187173, https://gitee.com/openeuler/kernel/issues/I5G4IH
CVE: NA
--------------------------------
Compare the task group's 'util_avg' on the preferred CPUs against the
capacity of those CPUs, and dynamically adjust the CPU range used by the
task wakeup path.
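For example, with four preferred CPUs of capacity 1024 each and the
default threshold of 85%, the wakeup path stays restricted to the
preferred CPUs while their summed 'util_avg' is at most
4 * 1024 * 85 / 100 = 3481; above that, it falls back to the full
'cpus_allowed' mask.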
Signed-off-by: Hui Tang <tanghui20@huawei.com>
Reviewed-by: Chen Hui <judy.chenhui@huawei.com>
Reviewed-by: Zhang Qiao <zhangqiao22@huawei.com>
Signed-off-by: Yongqiang Liu <liuyongqiang13@huawei.com>
---
 include/linux/sched.h        |   4 +-
 include/linux/sched/sysctl.h |   4 +
 kernel/sched/fair.c          | 144 +++++++++++++++++++++++++++++++++++
 kernel/sysctl.c              |  11 +++
 4 files changed, 162 insertions(+), 1 deletion(-)
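The core decision can be modelled in userspace as follows. This is a
minimal sketch of the rule implemented by set_task_select_cpus() below,
not kernel code; the array-based inputs, the util-zero idle test, and
the sample numbers are illustrative assumptions:

#include <stdbool.h>
#include <stdio.h>

/*
 * Model of the wakeup-range rule: stay on the preferred CPUs if any of
 * them is idle, or if their summed utilization is no more than
 * util_low_pct percent of their summed capacity.
 */
static bool use_prefer_cpus(const unsigned long *util_avg,
			    const unsigned long *capacity,
			    int nr_prefer, int util_low_pct)
{
	unsigned long util_avg_sum = 0, tg_capacity = 0;
	int i;

	for (i = 0; i < nr_prefer; i++) {
		/* Stands in for available_idle_cpu() in the patch. */
		if (util_avg[i] == 0)
			return true;
		util_avg_sum += util_avg[i];
		tg_capacity += capacity[i];
	}

	/* Same integer comparison as the patch. */
	return tg_capacity > (unsigned long)nr_prefer &&
	       util_avg_sum * 100 <= tg_capacity * util_low_pct;
}

int main(void)
{
	unsigned long util[] = { 900, 870, 880, 850 };		/* sum = 3500 */
	unsigned long cap[]  = { 1024, 1024, 1024, 1024 };	/* sum = 4096 */

	/* 3500 * 100 > 4096 * 85, so this prints "no". */
	printf("stay on preferred cpus: %s\n",
	       use_prefer_cpus(util, cap, 4, 85) ? "yes" : "no");
	return 0;
}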
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 718ec0289d83..8f27fa3e5622 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1251,13 +1251,15 @@ struct task_struct {
 #if !defined(__GENKSYMS__)
 #if defined(CONFIG_QOS_SCHED_DYNAMIC_AFFINITY)
 	cpumask_t		*prefer_cpus;
+	const cpumask_t		*select_cpus;
 #else
 	KABI_RESERVE(6)
+	KABI_RESERVE(7)
 #endif
 #else
 	KABI_RESERVE(6)
-#endif
 	KABI_RESERVE(7)
+#endif
 	KABI_RESERVE(8)
 
 	/* CPU-specific state of this task: */
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index b277fbc807ec..04eb5b127867 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -32,6 +32,10 @@ extern unsigned int sysctl_sched_min_granularity;
 extern unsigned int sysctl_sched_wakeup_granularity;
 extern unsigned int sysctl_sched_child_runs_first;
 
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+extern int sysctl_sched_util_low_pct;
+#endif
+
 enum sched_tunable_scaling {
 	SCHED_TUNABLESCALING_NONE,
 	SCHED_TUNABLESCALING_LOG,
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7d553a4c5120..407bceee1126 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1775,6 +1775,9 @@ static void task_numa_compare(struct task_numa_env *env,
 	 * can be used from IRQ context.
 	 */
 	local_irq_disable();
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+	env->p->select_cpus = &env->p->cpus_allowed;
+#endif
 	env->dst_cpu = select_idle_sibling(env->p, env->src_cpu,
 					   env->dst_cpu);
 	local_irq_enable();
 
@@ -5955,8 +5958,13 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
 		int i;
 
 		/* Skip over this group if it has no CPUs allowed */
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+		if (!cpumask_intersects(sched_group_span(group),
+					p->select_cpus))
+#else
 		if (!cpumask_intersects(sched_group_span(group),
 					&p->cpus_allowed))
+#endif
 			continue;
 
 		local_group = cpumask_test_cpu(this_cpu,
@@ -6088,7 +6096,11 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this
 		return cpumask_first(sched_group_span(group));
 
 	/* Traverse only the allowed CPUs */
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+	for_each_cpu_and(i, sched_group_span(group), p->select_cpus) {
+#else
 	for_each_cpu_and(i, sched_group_span(group), &p->cpus_allowed) {
+#endif
 		if (sched_idle_cpu(i))
 			return i;
 
@@ -6131,7 +6143,11 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p
 {
 	int new_cpu = cpu;
 
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+	if (!cpumask_intersects(sched_domain_span(sd), p->select_cpus))
+#else
 	if (!cpumask_intersects(sched_domain_span(sd), &p->cpus_allowed))
+#endif
 		return prev_cpu;
 
 	/*
@@ -6248,7 +6264,11 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int
 	if (!test_idle_cores(target, false))
 		return -1;
 
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+	cpumask_and(cpus, sched_domain_span(sd), p->select_cpus);
+#else
 	cpumask_and(cpus, sched_domain_span(sd), &p->cpus_allowed);
+#endif
 
 	for_each_cpu_wrap(core, cpus, target) {
 		bool idle = true;
@@ -6282,8 +6302,13 @@ static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int t
 		return -1;
 
 	for_each_cpu(cpu, cpu_smt_mask(target)) {
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+		if (!cpumask_test_cpu(cpu, p->select_cpus) ||
+		    !cpumask_test_cpu(cpu, sched_domain_span(sd)))
+#else
 		if (!cpumask_test_cpu(cpu, &p->cpus_allowed) ||
 		    !cpumask_test_cpu(cpu, sched_domain_span(sd)))
+#endif
 			continue;
 		if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
 			return cpu;
@@ -6344,7 +6369,11 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
 	time = local_clock();
 
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+	cpumask_and(cpus, sched_domain_span(sd), p->select_cpus);
+#else
 	cpumask_and(cpus, sched_domain_span(sd), &p->cpus_allowed);
+#endif
 
 	for_each_cpu_wrap(cpu, cpus, target) {
 		if (!--nr)
@@ -6383,7 +6412,12 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
 	struct sched_domain *sd;
 	int i, recent_used_cpu;
 
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+	if ((available_idle_cpu(target) || sched_idle_cpu(target)) &&
+	    cpumask_test_cpu(target, p->select_cpus)) {
+#else
 	if (available_idle_cpu(target) || sched_idle_cpu(target)) {
+#endif
 		SET_STAT(found_idle_cpu_easy);
 		return target;
 	}
@@ -6391,8 +6425,14 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
 	/*
 	 * If the previous CPU is cache affine and idle, don't be stupid:
 	 */
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
 	if (prev != target && cpus_share_cache(prev, target) &&
+	    cpumask_test_cpu(prev, p->select_cpus) &&
 	    (available_idle_cpu(prev) || sched_idle_cpu(prev))) {
+#else
+	if (prev != target && cpus_share_cache(prev, target) &&
+	    (available_idle_cpu(prev) || sched_idle_cpu(prev))) {
+#endif
 		SET_STAT(found_idle_cpu_easy);
 		return prev;
 	}
@@ -6403,7 +6443,11 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
 	    recent_used_cpu != target &&
 	    cpus_share_cache(recent_used_cpu, target) &&
 	    (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) &&
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+	    cpumask_test_cpu(p->recent_used_cpu, p->select_cpus)) {
+#else
 	    cpumask_test_cpu(p->recent_used_cpu, &p->cpus_allowed)) {
+#endif
 		/*
 		 * Replace recent_used_cpu with prev as it is a potential
 		 * candidate for the next wake:
@@ -6605,7 +6649,85 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
 	sync_entity_load_avg(&p->se);
 
 	return min_cap * 1024 < task_util(p) * capacity_margin;
+
+}
+
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+/*
+ * Low utilization threshold for CPU
+ *
+ * (default: 85%, units: percentage of CPU capacity)
+ */
+int sysctl_sched_util_low_pct = 85;
+
+static inline bool prefer_cpus_valid(struct task_struct *p)
+{
+	return p->prefer_cpus &&
+	       !cpumask_empty(p->prefer_cpus) &&
+	       !cpumask_equal(p->prefer_cpus, &p->cpus_allowed) &&
+	       cpumask_subset(p->prefer_cpus, &p->cpus_allowed);
+}
+
+/*
+ * set_task_select_cpus: select the cpu range for task
+ * @p: the task whose available cpu range will be set
+ * @idlest_cpu: the cpu which is the idlest in prefer cpus
+ *
+ * If the sum of 'util_avg' across 'prefer_cpus' is lower than the percentage
+ * 'sysctl_sched_util_low_pct' of the 'prefer_cpus' capacity, select the
+ * 'prefer_cpus' range for the task, otherwise fall back to 'cpus_allowed'.
+ *
+ * The available cpu range is written to p->select_cpus. The idlest cpu in
+ * 'prefer_cpus' is written to @idlest_cpu, which is used as the wakeup cpu
+ * when the fast path picks a cpu outside p->select_cpus.
+ */
+static void set_task_select_cpus(struct task_struct *p, int *idlest_cpu,
+				 int sd_flag)
+{
+	unsigned long util_avg_sum = 0;
+	unsigned long tg_capacity = 0;
+	long min_util = INT_MIN;
+	struct task_group *tg;
+	long spare;
+	int cpu;
+
+	p->select_cpus = &p->cpus_allowed;
+	if (!prefer_cpus_valid(p))
+		return;
+
+	rcu_read_lock();
+	tg = task_group(p);
+	for_each_cpu(cpu, p->prefer_cpus) {
+		if (unlikely(!tg->se[cpu]))
+			continue;
+
+		if (idlest_cpu && available_idle_cpu(cpu)) {
+			*idlest_cpu = cpu;
+		} else if (idlest_cpu) {
+			spare = (long)(capacity_of(cpu) - tg->se[cpu]->avg.util_avg);
+			if (spare > min_util) {
+				min_util = spare;
+				*idlest_cpu = cpu;
+			}
+		}
+
+		if (available_idle_cpu(cpu)) {
+			rcu_read_unlock();
+			p->select_cpus = p->prefer_cpus;
+			return;
+		}
+
+		util_avg_sum += tg->se[cpu]->avg.util_avg;
+		tg_capacity += capacity_of(cpu);
+	}
+	rcu_read_unlock();
+
+	if (tg_capacity > cpumask_weight(p->prefer_cpus) &&
+	    util_avg_sum * 100 <= tg_capacity * sysctl_sched_util_low_pct) {
+		p->select_cpus = p->prefer_cpus;
+	}
 }
+#endif
 
 /*
  * select_task_rq_fair: Select target runqueue for the waking task in domains
@@ -6628,13 +6750,24 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
 	int new_cpu = prev_cpu;
 	int want_affine = 0;
 	int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING);
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+	int idlest_cpu = 0;
+#endif
 	time = schedstat_start_time();
 
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+	set_task_select_cpus(p, &idlest_cpu, sd_flag);
+#endif
+
 	if (sd_flag & SD_BALANCE_WAKE) {
 		record_wakee(p);
 		want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu)
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+			      && cpumask_test_cpu(cpu, p->select_cpus);
+#else
 			      && cpumask_test_cpu(cpu, &p->cpus_allowed);
+#endif
 	}
 
 	rcu_read_lock();
@@ -6648,7 +6781,13 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
 		 */
 		if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
 		    cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+			new_cpu = cpu;
+			if (cpu != prev_cpu &&
+			    cpumask_test_cpu(prev_cpu, p->select_cpus))
+#else
 			if (cpu != prev_cpu)
+#endif
 				new_cpu = wake_affine(tmp, p, cpu, prev_cpu, sync);
 
 			sd = NULL; /* Prefer wake_affine over balance flags */
@@ -6673,6 +6812,11 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
 			current->recent_used_cpu = cpu;
 	}
 	rcu_read_unlock();
+
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+	if (!cpumask_test_cpu(new_cpu, p->select_cpus))
+		new_cpu = idlest_cpu;
+#endif
 	schedstat_end_time(cpu_rq(cpu), time);
 
 	return new_cpu;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 345d4a14ce6d..ad62ea156afd 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1325,6 +1325,17 @@ static struct ctl_table kern_table[] = {
 		.extra1		= &one_hundred,
 		.extra2		= &one_thousand,
 	},
+#endif
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+	{
+		.procname	= "sched_util_low_pct",
+		.data		= &sysctl_sched_util_low_pct,
+		.maxlen		= sizeof(sysctl_sched_util_low_pct),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &one_hundred,
+	},
 #endif
 	{ }
 };
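
With the patch applied, the threshold can be tuned at runtime through
/proc/sys/kernel/sched_util_low_pct; proc_dointvec_minmax clamps writes
to the 0..100 range. A minimal sketch of raising it from a userspace
program (assumes root and a kernel built with
CONFIG_QOS_SCHED_DYNAMIC_AFFINITY):

#include <stdio.h>

int main(void)
{
	/* Equivalent to: echo 90 > /proc/sys/kernel/sched_util_low_pct */
	FILE *f = fopen("/proc/sys/kernel/sched_util_low_pct", "w");

	if (!f) {
		perror("sched_util_low_pct");
		return 1;
	}
	fprintf(f, "90\n");
	return fclose(f) ? 1 : 0;
}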