From: Hui Tang <tanghui20@huawei.com>
We want to be able to dynamically expand or shrink a task's CPU affinity,
provided that the resources it is given still meet its requirements. To do
this, we divide the system into several levels of affinity domains according
to the sched_domains:
level4 * SOCKET  [                              ]
level3 * DIE     [                              ]
level2 * MC      [              ][              ]
level1 * SMT     [      ][      ][      ][      ]
level0 * CPU       0   1   2   3   4   5   6   7
Whether users prefer power saving or performance affects the strategy for
adjusting affinity. When the power-saving mode is selected, we choose a more
appropriate affinity based on the energy model to reduce power consumption,
while still taking the QoS of resources such as CPU and memory into account.
For instance, if the CPU load of the current tasks is lower than required,
smart grid decides, according to the energy model, whether to aggregate the
tasks into a smaller range of CPUs, as sketched below.
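As a rough illustration only, the periodic decision can be sketched like
this (simplified from sched_auto_affi_period_timer() in the patch below;
the sketch name is hypothetical, and locking and timer rearming are omitted):

/*
 * Sketch of the periodic affinity adjustment implemented in
 * kernel/sched/fair.c below; thresholds come from sysctl_sched_util_low_pct.
 */
static void auto_affinity_adjust_sketch(struct task_group *tg)
{
	struct affinity_domain *ad = &tg->auto_affinity.ad;
	unsigned long util = 0, capacity = 0;
	int cpu;

	/* sum utilization and capacity over the currently selected level */
	for_each_cpu(cpu, ad->domains[ad->curr_level]) {
		util += taskgroup_cpu_util(tg, cpu);
		capacity += capacity_of(cpu);
	}

	if (util * 100 > capacity * sysctl_sched_util_low_pct)
		affinity_domain_up(tg);		/* expand to a wider level */
	else if (util * 100 < capacity * sysctl_sched_util_low_pct / 2)
		affinity_domain_down(tg);	/* shrink to a smaller level */
}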
The main difference from EAS is that we pay more attention to the
power-consumption impact of mechanisms such as cpuidle and DVFS, and we
classify tasks to reduce interference and to guarantee resource QoS within
each divided unit, which makes this approach better suited to general-purpose,
non-heterogeneous CPUs.
 --------        --------        --------
| group0 |      | group1 |      | group2 |
 --------        --------        --------
    |               |               |
    v               |               v
 -------------------+--------    ----------------
|                 --v---     |  |                |
|      DIE0      | MC1  |    |  |      DIE1      |
|                 ------     |  |                |
 ----------------------------    ----------------
We periodically measure how well each group's resource requirements are
satisfied and adjust its affinity accordingly; load balancing and memory
migration are also considered based on memory location, so that resource
requirements are met better.
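For reference, the interfaces added by this patch could be exercised roughly
as follows (the cgroup v1 mount point and group name are only examples):

  # pick a mode for a task group: 0 manual, 1 performance, 2 power saving;
  # a non-zero mode starts the periodic affinity adjustment
  echo 2 > /sys/fs/cgroup/cpu/example/cpu.dynamic_affinity_mode

  # set the adjustment period to 1000ms
  echo 1000 > /sys/fs/cgroup/cpu/example/cpu.affinity_period_ms

  # inspect the affinity domains and the currently selected level
  cat /sys/fs/cgroup/cpu/example/cpu.affinity_stat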
Signed-off-by: Hui Tang <tanghui20@huawei.com>
Signed-off-by: Wang ShaoBo <bobo.shaobowang@huawei.com>
---
 fs/proc/array.c               |  13 +
 fs/proc/stat.c                |   4 +
 include/linux/sched.h         |   2 +
 include/linux/sched/cputime.h |   3 +
 include/linux/sched/sysctl.h  |   8 +
 init/Kconfig                  |  10 +
 kernel/sched/core.c           | 178 ++++++++++++-
 kernel/sched/fair.c           | 482 +++++++++++++++++++++++++++++++++-
 kernel/sched/sched.h          |  39 +++
 kernel/sched/topology.c       |   3 +
 kernel/sysctl.c               |  27 ++
 11 files changed, 761 insertions(+), 8 deletions(-)
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 9eb99a43f849a..3c8cad18c6d66 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -386,6 +386,16 @@ static void task_cpus_allowed(struct seq_file *m, struct task_struct *task)
 		   cpumask_pr_args(&task->cpus_allowed));
 }
 
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+static void task_cpus_preferred(struct seq_file *m, struct task_struct *task)
+{
+	seq_printf(m, "Cpus_preferred:\t%*pb\n",
+		   cpumask_pr_args(task->prefer_cpus));
+	seq_printf(m, "Cpus_preferred_list:\t%*pbl\n",
+		   cpumask_pr_args(task->prefer_cpus));
+}
+#endif
+
 static inline void task_core_dumping(struct seq_file *m, struct mm_struct *mm)
 {
 	seq_put_decimal_ull(m, "CoreDumping:\t", !!mm->core_state);
@@ -412,6 +422,9 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
 	task_cap(m, task);
 	task_seccomp(m, task);
 	task_cpus_allowed(m, task);
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+	task_cpus_preferred(m, task);
+#endif
 	cpuset_task_status_allowed(m, task);
 	task_context_switch_counts(m, task);
 	return 0;
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index 7e832b24847dd..3fe60a77b0b4d 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -63,7 +63,11 @@ u64 get_idle_time(int cpu)
 	return idle;
 }
 
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+u64 get_iowait_time(int cpu)
+#else
 static u64 get_iowait_time(int cpu)
+#endif
 {
 	u64 iowait, iowait_usecs = -1ULL;
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 928186f161000..8be102dc9a193 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1996,6 +1996,8 @@ static inline int sched_qos_cpu_overload(void)
 int dynamic_affinity_enabled(void);
 int set_prefer_cpus_ptr(struct task_struct *p,
			const struct cpumask *new_mask);
+int set_prefer_cpus_ptr_nolock(struct task_struct *p,
+			       const struct cpumask *new_mask);
 int sched_prefer_cpus_fork(struct task_struct *p, struct task_struct *orig);
 void sched_prefer_cpus_free(struct task_struct *p);
 #endif
diff --git a/include/linux/sched/cputime.h b/include/linux/sched/cputime.h
index 6b1793606fc95..4a092e006f5b2 100644
--- a/include/linux/sched/cputime.h
+++ b/include/linux/sched/cputime.h
@@ -189,6 +189,9 @@ task_sched_runtime(struct task_struct *task);
 extern int use_sched_idle_time;
 extern int sched_idle_time_adjust(int cpu, u64 *utime, u64 *stime);
 extern unsigned long long sched_get_idle_time(int cpu);
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+extern u64 get_iowait_time(int cpu);
+#endif
 
 #ifdef CONFIG_PROC_FS
 extern u64 get_idle_time(int cpu);
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index 04eb5b127867b..8e3bd01efd224 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -31,11 +31,19 @@ extern unsigned int sysctl_sched_latency;
 extern unsigned int sysctl_sched_min_granularity;
 extern unsigned int sysctl_sched_wakeup_granularity;
 extern unsigned int sysctl_sched_child_runs_first;
+extern int sysctl_sched_util_update_interval;
+extern unsigned long sysctl_sched_util_update_interval_max;
 
 #ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
 extern int sysctl_sched_util_low_pct;
 #endif
 
+#ifdef CONFIG_QOS_SCHED_SMART_GRID
+extern int sched_affinity_sd_level_max;
+extern int sysctl_affinity_init_sd_level;
+extern int sysctl_affinity_adjust_delay_ms;
+#endif
+
 enum sched_tunable_scaling {
	SCHED_TUNABLESCALING_NONE,
	SCHED_TUNABLESCALING_LOG,
diff --git a/init/Kconfig b/init/Kconfig
index a2a733080fe11..037cde5dcf4be 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -834,6 +834,16 @@ config QOS_SCHED_DYNAMIC_AFFINITY
	  of taskgroup is below threshold setted, otherwise make taskgroup
	  to use cpus allowed.
 
+config QOS_SCHED_SMART_GRID
+	bool "qos smart grid scheduler"
+	depends on FAIR_GROUP_SCHED && QOS_SCHED_DYNAMIC_AFFINITY
+	default n
+	help
+	  This will dynamically adjust the workload's core selection range according
+	  to the performance or power consumption needs of the workload and users, and
+	  tell cpufreq governor how can they adjust cpu frequency, and even dynamically
+	  adjust the location of memory allocation based on QOS considerations.
+
 config CGROUP_PIDS
	bool "PIDs controller"
	help
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 168e98c6d51a7..ba9ddcfea7d9f 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5842,6 +5842,7 @@ int sched_cpu_activate(unsigned int cpu)
 		static_branch_inc_cpuslocked(&sched_smt_present);
 #endif
 	set_cpu_active(cpu, true);
+	tg_update_affinity_domains(cpu, 1);
 
 	if (sched_smp_initialized) {
 		sched_domains_numa_masks_set(cpu);
@@ -5900,6 +5901,7 @@ int sched_cpu_deactivate(unsigned int cpu)
 		return ret;
 	}
 	sched_domains_numa_masks_clear(cpu);
+	tg_update_affinity_domains(cpu, 0);
 	return 0;
 }
 
@@ -6117,6 +6119,8 @@ void __init sched_init(void)
 		 * directly in rq->cfs (i.e root_task_group->se[] = NULL).
 		 */
 		init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
+
+		init_auto_affinity(&root_task_group.auto_affinity);
 #ifdef CONFIG_QOS_SCHED
 		init_qos_hrtimer(i);
 #endif
@@ -6530,6 +6534,9 @@ void sched_move_task(struct task_struct *tsk)
 			  DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
 	struct rq_flags rf;
 	struct rq *rq;
+#ifdef CONFIG_QOS_SCHED_SMART_GRID
+	struct affinity_domain *ad;
+#endif
 
 	rq = task_rq_lock(tsk, &rf);
 	update_rq_clock(rq);
@@ -6550,6 +6557,14 @@ void sched_move_task(struct task_struct *tsk)
 		set_curr_task(rq, tsk);
 
 	task_rq_unlock(rq, tsk, &rf);
+
+#ifdef CONFIG_QOS_SCHED_SMART_GRID
+	if (task_group(tsk) != &root_task_group) {
+		ad = &task_group(tsk)->auto_affinity.ad;
+		set_prefer_cpus_ptr(tsk, ad->domains[ad->curr_level]);
+	}
+#endif
+
 }
 
 static inline struct task_group *css_tg(struct cgroup_subsys_state *css)
@@ -6969,6 +6984,124 @@ static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
 }
 #endif /* CONFIG_RT_GROUP_SCHED */
 
+#ifdef CONFIG_QOS_SCHED_SMART_GRID
+const u64 max_affinity_period = 60 * NSEC_PER_SEC; /* 1min */
+const u64 min_affinity_period = 1 * NSEC_PER_MSEC; /* 1ms */
+
+int tg_set_dynamic_affinity_mode(struct task_group *tg, u64 mode)
+{
+	struct auto_affinity *auto_affi = &tg->auto_affinity;
+
+	if (tg == &root_task_group)
+		return -EINVAL;
+
+	raw_spin_lock_irq(&auto_affi->lock);
+	auto_affi->mode = mode;
+
+	if (mode == 0 && auto_affi->period_active == 1) {
+		shutdown_auto_affinity(auto_affi);
+	} else if (mode != 0 && auto_affi->period_active == 0) {
+		start_auto_affinity(auto_affi);
+	}
+	raw_spin_unlock_irq(&auto_affi->lock);
+
+	return 0;
+}
+
+static u64 cpu_affinity_mode_read_u64(struct cgroup_subsys_state *css,
+				      struct cftype *cft)
+{
+	struct task_group *tg = css_tg(css);
+
+	return tg->auto_affinity.mode;
+}
+
+static int cpu_affinity_mode_write_u64(struct cgroup_subsys_state *css,
+				       struct cftype *cftype, u64 mode)
+{
+	struct task_group *tg = css_tg(css);
+
+	if (tg == &root_task_group)
+		return -EINVAL;
+
+	return tg_set_dynamic_affinity_mode(tg, mode);
+}
+
+int tg_set_affinity_period(struct task_group *tg, u64 period_ms)
+{
+	if (tg == &root_task_group)
+		return -EINVAL;
+
+	if (period_ms > U64_MAX / NSEC_PER_MSEC)
+		return -EINVAL;
+
+	raw_spin_lock_irq(&tg->auto_affinity.lock);
+	tg->auto_affinity.period = ms_to_ktime(period_ms);
+	raw_spin_unlock_irq(&tg->auto_affinity.lock);
+	return 0;
+}
+
+u64 tg_get_affinity_period(struct task_group *tg)
+{
+	return ktime_to_ms(tg->auto_affinity.period);
+}
+
+static int cpu_affinity_period_write_uint(struct cgroup_subsys_state *css,
+					  struct cftype *cftype, u64 period)
+{
+	return tg_set_affinity_period(css_tg(css), period);
+}
+
+static u64 cpu_affinity_period_read_uint(struct cgroup_subsys_state *css,
+					 struct cftype *cft)
+{
+	return tg_get_affinity_period(css_tg(css));
+}
+
+static int cpu_affinity_sd_mask_write_u64(struct cgroup_subsys_state *css,
+					  struct cftype *cftype,
+					  u64 mask)
+{
+	struct task_group *tg = css_tg(css);
+	struct affinity_domain *ad = &tg->auto_affinity.ad;
+	u16 full = (1 << ad->dcount) - 1;
+
+	if (tg == &root_task_group)
+		return -EINVAL;
+
+	if (mask > full)
+		return -EINVAL;
+
+	ad->sd_mask = mask;
+	return 0;
+}
+
+static u64 cpu_affinity_sd_mask_read_u64(struct cgroup_subsys_state *css,
+					 struct cftype *cft)
+{
+	struct task_group *tg = css_tg(css);
+
+	return tg->auto_affinity.ad.sd_mask;
+}
+
+static int cpu_affinity_stat_show(struct seq_file *sf, void *v)
+{
+	struct task_group *tg = css_tg(seq_css(sf));
+	struct auto_affinity *auto_affi = &tg->auto_affinity;
+	struct affinity_domain *ad = &auto_affi->ad;
+	int i;
+
+	seq_printf(sf, "period_active %d\n", auto_affi->period_active);
+	seq_printf(sf, "dcount %d\n", ad->dcount);
+	seq_printf(sf, "curr_level %d\n", ad->curr_level);
+	for (i = 0; i < ad->dcount; i++)
+		seq_printf(sf, "sd_level %d, cpu list %*pbl\n",
+			   i, cpumask_pr_args(ad->domains[i]));
+
+	return 0;
+}
+#endif /* CONFIG_QOS_SCHED_SMART_GRID */
+
 #ifdef CONFIG_QOS_SCHED
 static int tg_change_scheduler(struct task_group *tg, void *data)
 {
@@ -7073,6 +7206,27 @@ static struct cftype cpu_legacy_files[] = {
 		.read_s64 = cpu_qos_read,
 		.write_s64 = cpu_qos_write,
 	},
+#endif
+#ifdef CONFIG_QOS_SCHED_SMART_GRID
+	{
+		.name = "dynamic_affinity_mode",
+		.read_u64 = cpu_affinity_mode_read_u64,
+		.write_u64 = cpu_affinity_mode_write_u64,
+	},
+	{
+		.name = "affinity_period_ms",
+		.read_u64 = cpu_affinity_period_read_uint,
+		.write_u64 = cpu_affinity_period_write_uint,
+	},
+	{
+		.name = "affinity_sd_mask",
+		.read_u64 = cpu_affinity_sd_mask_read_u64,
+		.write_u64 = cpu_affinity_sd_mask_write_u64,
+	},
+	{
+		.name = "affinity_stat",
+		.seq_show = cpu_affinity_stat_show,
+	},
 #endif
 	{ } /* Terminate */
 };
@@ -7263,16 +7417,11 @@ static void do_set_prefer_cpus(struct task_struct *p,
 static int __set_prefer_cpus_ptr(struct task_struct *p,
 				 const struct cpumask *new_mask, bool check)
 {
-	struct rq_flags rf;
-	struct rq *rq;
 	int ret = 0;
 
 	if (unlikely(!p->prefer_cpus))
 		return -EINVAL;
 
-	rq = task_rq_lock(p, &rf);
-	update_rq_clock(rq);
-
 	if (cpumask_equal(p->prefer_cpus, new_mask))
 		goto out;
 
@@ -7283,12 +7432,29 @@ static int __set_prefer_cpus_ptr(struct task_struct *p,
 
 	do_set_prefer_cpus(p, new_mask);
 out:
-	task_rq_unlock(rq, p, &rf);
 
 	return ret;
 }
 
 int set_prefer_cpus_ptr(struct task_struct *p, const struct cpumask *new_mask)
+{
+	struct rq_flags rf;
+	struct rq *rq;
+	int ret;
+
+	if (p->sched_class != &fair_sched_class)
+		return 0;
+
+	rq = task_rq_lock(p, &rf);
+	update_rq_clock(rq);
+	ret = __set_prefer_cpus_ptr(p, new_mask, false);
+	task_rq_unlock(rq, p, &rf);
+
+	return ret;
+}
+
+int set_prefer_cpus_ptr_nolock(struct task_struct *p,
+			       const struct cpumask *new_mask)
 {
 	if (p->sched_class != &fair_sched_class)
 		return 0;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 77a49eae2cddc..093dc714090c6 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5281,6 +5281,397 @@ static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
 
 #endif /* CONFIG_CFS_BANDWIDTH */
 
+#ifdef CONFIG_QOS_SCHED_SMART_GRID
+static inline unsigned long cpu_util(int cpu);
+static unsigned long target_load(int cpu, int type);
+static unsigned long capacity_of(int cpu);
+static int sched_idle_cpu(int cpu);
+static unsigned long weighted_cpuload(struct rq *rq);
+
+int sched_affinity_sd_level_max;
+int sysctl_affinity_init_sd_level = 100;
+int sysctl_affinity_adjust_delay_ms = 5000;
+
+static inline u64 default_auto_affi_period(void)
+{
+	return 5000ULL;
+}
+
+static inline unsigned long taskgroup_cpu_util(struct task_group *tg,
+					       int cpu)
+{
+	return cpu_util(cpu);
+}
+
+static void affinity_domain_up(struct task_group *tg)
+{
+	struct affinity_domain *ad = &tg->auto_affinity.ad;
+	struct task_struct *task;
+	struct css_task_iter it;
+
+	if (ad->curr_level >= ad->dcount - 1)
+		return;
+
+	ad->curr_level++;
+	css_task_iter_start(&tg->css, 0, &it);
+	while ((task = css_task_iter_next(&it))) {
+		set_prefer_cpus_ptr(task, ad->domains[ad->curr_level]);
+	}
+	css_task_iter_end(&it);
+}
+
+static void affinity_domain_down(struct task_group *tg)
+{
+	struct affinity_domain *ad = &tg->auto_affinity.ad;
+	struct task_struct *task;
+	struct css_task_iter it;
+
+	if (ad->curr_level <= 0)
+		return;
+
+	ad->curr_level--;
+	css_task_iter_start(&tg->css, 0, &it);
+	while ((task = css_task_iter_next(&it))) {
+		set_prefer_cpus_ptr(task, ad->domains[ad->curr_level]);
+	}
+	css_task_iter_end(&it);
+}
+
+static enum hrtimer_restart sched_auto_affi_period_timer(struct hrtimer *timer)
+{
+	struct auto_affinity *auto_affi =
+		container_of(timer, struct auto_affinity, period_timer);
+	struct task_group *tg =
+		container_of(auto_affi, struct task_group, auto_affinity);
+	struct affinity_domain *ad = &auto_affi->ad;
+	struct cpumask *span = ad->domains[ad->curr_level];
+	unsigned long util_avg_sum = 0;
+	unsigned long tg_capacity = 0;
+	unsigned long flags;
+	int cpu;
+
+	raw_spin_lock_irqsave(&auto_affi->lock, flags);
+	for_each_cpu(cpu, span) {
+		util_avg_sum += taskgroup_cpu_util(tg, cpu);
+		tg_capacity += capacity_of(cpu);
+	}
+
+	if (!tg_capacity)
+		goto exit;
+
+	if (util_avg_sum * 100 > tg_capacity * sysctl_sched_util_low_pct) {
+		affinity_domain_up(tg);
+	} else if (util_avg_sum * 100 < tg_capacity *
+		   sysctl_sched_util_low_pct / 2) {
+		affinity_domain_down(tg);
+	}
+
+exit:
+	hrtimer_forward_now(timer, auto_affi->period);
+	raw_spin_unlock_irqrestore(&auto_affi->lock, flags);
+	return HRTIMER_RESTART;
+}
+
+static inline struct auto_affinity *tg_auto_affinity(struct task_group *tg)
+{
+	return &tg->auto_affinity;
+}
+
+static int tg_update_affinity_domain_down(struct task_group *tg, void *data)
+{
+	struct auto_affinity *auto_affi = &tg->auto_affinity;
+	struct affinity_domain *ad = &tg->auto_affinity.ad;
+	struct task_struct *task;
+	struct css_task_iter it;
+	int *cpu_state = data;
+	unsigned long flags;
+	int i;
+
+	if (!ad->dcount)
+		return 0;
+
+	raw_spin_lock_irqsave(&auto_affi->lock, flags);
+
+	for (i = 0; i < ad->dcount; i++) {
+		if (cpu_state[1])
+			cpumask_set_cpu(cpu_state[0], ad->domains[i]);
+		else
+			cpumask_clear_cpu(cpu_state[0], ad->domains[i]);
+	}
+	raw_spin_unlock_irqrestore(&auto_affi->lock, flags);
+
+	css_task_iter_start(&tg->css, 0, &it);
+	while ((task = css_task_iter_next(&it))) {
+		set_prefer_cpus_ptr(task, ad->domains[ad->curr_level]);
+	}
+	css_task_iter_end(&it);
+	return 0;
+}
+
+void tg_update_affinity_domains(int cpu, int online)
+{
+	int cpu_state[2];
+
+	cpu_state[0] = cpu;
+	cpu_state[1] = online;
+
+	rcu_read_lock();
+	walk_tg_tree(tg_update_affinity_domain_down, tg_nop, cpu_state);
+	rcu_read_unlock();
+}
+
+void start_auto_affinity(struct auto_affinity *auto_affi)
+{
+	ktime_t delay_ms;
+
+	if (auto_affi->period_active)
+		return;
+
+	auto_affi->period_active = 1;
+	delay_ms = ms_to_ktime(sysctl_affinity_adjust_delay_ms);
+	hrtimer_forward_now(&auto_affi->period_timer, delay_ms);
+	hrtimer_start_expires(&auto_affi->period_timer, HRTIMER_MODE_ABS_PINNED);
+}
+
+void shutdown_auto_affinity(struct auto_affinity *auto_affi)
+{
+	struct task_struct *task;
+	struct css_task_iter it;
+	struct task_group *tg =
+		container_of(auto_affi, struct task_group, auto_affinity);
+	struct affinity_domain *ad = &auto_affi->ad;
+
+	if (auto_affi->period_active == 0)
+		return;
+
+	hrtimer_cancel(&auto_affi->period_timer);
+	auto_affi->period_active = 0;
+	if (ad->dcount > 0)
+		ad->curr_level = ad->dcount - 1;
+
+	css_task_iter_start(&tg->css, 0, &it);
+	while ((task = css_task_iter_next(&it))) {
+		set_prefer_cpus_ptr(task, ad->domains[ad->curr_level]);
+		if (likely(task->qos))
+			task->qos->affinity_set(task);
+	}
+	css_task_iter_end(&it);
+}
+
+static struct sched_group *sd_find_idlest_group(struct sched_domain *sd)
+{
+	struct sched_group *idlest = NULL, *group = sd->groups;
+	unsigned long min_runnable_load = ULONG_MAX;
+	unsigned long min_avg_load = ULONG_MAX;
+	int imbalance_scale = 100 + (sd->imbalance_pct-100)/2;
+	unsigned long imbalance = scale_load_down(NICE_0_LOAD) *
+				  (sd->imbalance_pct-100) / 100;
+
+	do {
+		unsigned long load, avg_load, runnable_load;
+		int i;
+
+		/*
+		 * Tally up the load of all CPUs in the group and find
+		 * the group containing the CPU with most spare capacity.
+		 */
+		avg_load = 0;
+		runnable_load = 0;
+
+		for_each_cpu(i, sched_group_span(group)) {
+			load = target_load(i, 0);
+			runnable_load += load;
+			avg_load += cfs_rq_load_avg(&cpu_rq(i)->cfs);
+		}
+
+		/* Adjust by relative CPU capacity of the group */
+		avg_load = (avg_load * SCHED_CAPACITY_SCALE) /
+			   group->sgc->capacity;
+		runnable_load = (runnable_load * SCHED_CAPACITY_SCALE) /
+				group->sgc->capacity;
+
+		if (min_runnable_load > (runnable_load + imbalance)) {
+			/*
+			 * The runnable load is significantly smaller
+			 * so we can pick this new CPU:
+			 */
+			min_runnable_load = runnable_load;
+			min_avg_load = avg_load;
+			idlest = group;
+		} else if ((runnable_load < (min_runnable_load + imbalance)) &&
+			   (100*min_avg_load > imbalance_scale*avg_load)) {
+			/*
+			 * The runnable loads are close so take the
+			 * blocked load into account through avg_load:
+			 */
+			min_avg_load = avg_load;
+			idlest = group;
+		}
+	} while (group = group->next, group != sd->groups);
+
+	return idlest ? idlest : group;
+}
+
+static int group_find_idlest_cpu(struct sched_group *group)
+{
+	int least_loaded_cpu = cpumask_first(sched_group_span(group));
+	unsigned long load, min_load = ULONG_MAX;
+	unsigned int min_exit_latency = UINT_MAX;
+	u64 latest_idle_timestamp = 0;
+	int shallowest_idle_cpu = -1;
+	int i;
+
+	/* Check if we have any choice: */
+	if (group->group_weight == 1)
+		return cpumask_first(sched_group_span(group));
+
+	for_each_cpu(i, sched_group_span(group)) {
+		if (sched_idle_cpu(i))
+			return i;
+
+		if (available_idle_cpu(i)) {
+			struct rq *rq = cpu_rq(i);
+			struct cpuidle_state *idle = idle_get_state(rq);
+			if (idle && idle->exit_latency < min_exit_latency) {
+				/*
+				 * We give priority to a CPU whose idle state
+				 * has the smallest exit latency irrespective
+				 * of any idle timestamp.
+				 */
+				min_exit_latency = idle->exit_latency;
+				latest_idle_timestamp = rq->idle_stamp;
+				shallowest_idle_cpu = i;
+			} else if ((!idle || idle->exit_latency == min_exit_latency) &&
+				   rq->idle_stamp > latest_idle_timestamp) {
+				/*
+				 * If equal or no active idle state, then
+				 * the most recently idled CPU might have
+				 * a warmer cache.
+				 */
+				latest_idle_timestamp = rq->idle_stamp;
+				shallowest_idle_cpu = i;
+			}
+		} else if (shallowest_idle_cpu == -1) {
+			load = weighted_cpuload(cpu_rq(i));
+			if (load < min_load) {
+				min_load = load;
+				least_loaded_cpu = i;
+			}
+		}
+	}
+
+	return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
+}
+void dump_affinity_domains(struct affinity_domain *ad)
+{
+	int i = 0;
+
+	if (!ad->domains[0])
+		return;
+
+	trace_printk("cpu %d, domains[0]: %*pbl\n",
+		     ad->cpu, cpumask_pr_args(ad->domains[ad->curr_level]));
+
+	for (i = 0; i < ad->dcount; i++) {
+		trace_printk("sd: %*pbl\n",
+			     cpumask_pr_args(ad->domains[i]));
+	}
+}
+
+void free_affinity_domains(struct affinity_domain *ad)
+{
+	int i;
+
+	for (i = 0; i < ad->dcount; i++) {
+		kfree(ad->domains[i]);
+		ad->domains[i] = NULL;
+	}
+	ad->dcount = 0;
+}
+
+static int init_affinity_domains(struct affinity_domain *ad)
+{
+	struct sched_domain *sd = NULL, *tmp;
+	struct sched_group *idlest = NULL;
+	int cpu;
+	int i = 0;
+
+	rcu_read_lock();
+	cpu = cpumask_first_and(cpu_active_mask,
+				housekeeping_cpumask(HK_FLAG_DOMAIN));
+	for_each_domain(cpu, tmp)
+		sd = tmp;
+
+	ad->dcount = 0;
+	if (!sd) {
+		rcu_read_unlock();
+		return -EINVAL;
+	}
+
+	idlest = sd_find_idlest_group(sd);
+	cpu = group_find_idlest_cpu(idlest);
+	for_each_domain(cpu, tmp) {
+		ad->domains[i] = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
+		if (!ad->domains[i])
+			goto err;
+
+		cpumask_copy(ad->domains[i], sched_domain_span(tmp));
+
+		ad->dcount = ++i;
+	}
+
+	ad->curr_level = 0;
+	if (ad->dcount > 0)
+		ad->curr_level = ad->dcount - 1;
+
+	dump_affinity_domains(ad);
+	rcu_read_unlock();
+	return 0;
+err:
+	free_affinity_domains(ad);
+	rcu_read_unlock();
+	return -ENOMEM;
+}
+
+int init_auto_affinity(struct auto_affinity *auto_affi)
+{
+	raw_spin_lock_init(&auto_affi->lock);
+	auto_affi->mode = 0;
+	auto_affi->period = ms_to_ktime(default_auto_affi_period());
+
+	hrtimer_init(&auto_affi->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
+	auto_affi->period_timer.function = sched_auto_affi_period_timer;
+
+	return init_affinity_domains(&auto_affi->ad);
+}
+
+static void destroy_auto_affinity(struct auto_affinity *auto_affi)
+{
+	if (!auto_affi->period_active)
+		return;
+
+	hrtimer_cancel(&auto_affi->period_timer);
+	free_affinity_domains(&auto_affi->ad);
+}
+#else
+int init_auto_affinity(struct auto_affinity *auto_affi)
+{
+	return 0;
+}
+static void destroy_auto_affinity(struct auto_affinity *auto_affi) {}
+
+static inline struct auto_affinity *tg_auto_affinity(struct task_group *tg)
+{
+	return NULL;
+}
+
+void start_auto_affinity(struct auto_affinity *auto_affi) {}
+void shutdown_auto_affinity(struct auto_affinity *auto_affi) {}
+
+void tg_update_affinity_domains(int cpu, int online) {}
+
+#endif
+
 /**************************************************
  * CFS operations on tasks:
  */
@@ -6033,6 +6424,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
 					       sched_group_span(group));
 
 		/*
+skip_spare:
 		 * Tally up the load of all CPUs in the group and find
 		 * the group containing the CPU with most spare capacity.
 		 */
@@ -6722,6 +7114,73 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
  */
 int sysctl_sched_util_low_pct = 85;
 
+struct cpu_timeinfo {
+	u64 systime;
+	u64 idletime;
+	unsigned long next_update;
+	int vutil;
+};
+
+/*
+ * The time interval to update CPU utilization
+ * (default 1ms, max 10min)
+ */
+int sysctl_sched_util_update_interval = 1;
+unsigned long sysctl_sched_util_update_interval_max = 600000;
+
+static DEFINE_PER_CPU(struct cpu_timeinfo, qos_cputime);
+
+static inline u64 cpu_systime(int cpu)
+{
+	u64 user, nice, system, idle, iowait, irq, softirq, steal;
+
+	user = kcpustat_cpu(cpu).cpustat[CPUTIME_USER];
+	system = kcpustat_cpu(cpu).cpustat[CPUTIME_SYSTEM];
+	iowait = get_iowait_time(cpu);
+	irq = kcpustat_cpu(cpu).cpustat[CPUTIME_IRQ];
+	softirq = kcpustat_cpu(cpu).cpustat[CPUTIME_SOFTIRQ];
+	nice = kcpustat_cpu(cpu).cpustat[CPUTIME_NICE];
+	steal = kcpustat_cpu(cpu).cpustat[CPUTIME_STEAL];
+	idle = get_idle_time(cpu);
+
+	return user + system + iowait + irq + softirq + nice + idle + steal;
+}
+
+static inline u64 cpu_idletime(int cpu)
+{
+	return get_idle_time(cpu) + get_iowait_time(cpu);
+}
+
+static inline void update_cpu_vutil(void)
+{
+	struct cpu_timeinfo *cputime = per_cpu_ptr(&qos_cputime, smp_processor_id());
+	u64 delta_systime, delta_idle, systime, idletime;
+	int cpu = smp_processor_id();
+	unsigned long interval;
+
+	if (time_after(jiffies, cputime->next_update)) {
+		interval = msecs_to_jiffies(sysctl_sched_util_update_interval);
+		cputime->next_update = jiffies + interval;
+		systime = cpu_systime(cpu);
+		idletime = cpu_idletime(cpu);
+		delta_systime = systime - cputime->systime;
+		delta_idle = idletime - cputime->idletime;
+		if (!delta_systime)
+			return;
+
+		cputime->systime = systime;
+		cputime->idletime = idletime;
+		cputime->vutil = (delta_systime - delta_idle) * 100 / delta_systime;
+	}
+}
+
+static inline int cpu_vutil_of(int cpu)
+{
+	struct cpu_timeinfo *cputime = per_cpu_ptr(&qos_cputime, cpu);
+
+	return cputime->vutil;
+}
+
 static inline bool prefer_cpus_valid(struct task_struct *p)
 {
 	return p->prefer_cpus &&
@@ -6747,7 +7206,6 @@ static void set_task_select_cpus(struct task_struct *p, int *idlest_cpu,
 				 int sd_flag)
 {
 	unsigned long util_avg_sum = 0;
-	unsigned long tg_capacity = 0;
 	long min_util = INT_MIN;
 	struct task_group *tg;
 	long spare;
@@ -6781,17 +7239,29 @@ static void set_task_select_cpus(struct task_struct *p, int *idlest_cpu,
 			return;
 		}
 
+#if 0
 		util_avg_sum += tg->se[cpu]->avg.util_avg;
 		tg_capacity += capacity_of(cpu);
+#endif
+		util_avg_sum += cpu_vutil_of(cpu);
 	}
 	rcu_read_unlock();
 
+#if 0
 	if (tg_capacity > cpumask_weight(p->prefer_cpus) &&
 	    util_avg_sum * 100 <= tg_capacity * sysctl_sched_util_low_pct) {
 		p->select_cpus = p->prefer_cpus;
 		if (sd_flag & SD_BALANCE_WAKE)
 			schedstat_inc(p->se.dyn_affi_stats->nr_wakeups_preferred_cpus);
 	}
+#endif
+
+	if (util_avg_sum < sysctl_sched_util_low_pct *
+	    cpumask_weight(p->prefer_cpus)) {
+		p->select_cpus = p->prefer_cpus;
+		if (sd_flag & SD_BALANCE_WAKE)
+			schedstat_inc(p->se.dyn_affi_stats->nr_wakeups_preferred_cpus);
+	}
 }
 #endif
 
@@ -10647,6 +11117,10 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
 
 	if (static_branch_unlikely(&sched_numa_balancing))
 		task_tick_numa(rq, curr);
+
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+	update_cpu_vutil();
+#endif
 }
 
 /*
@@ -10928,6 +11402,7 @@ void free_fair_sched_group(struct task_group *tg)
 	int i;
 
 	destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
+	destroy_auto_affinity(tg_auto_affinity(tg));
 
 	for_each_possible_cpu(i) {
 #ifdef CONFIG_QOS_SCHED
@@ -10948,7 +11423,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
 {
 	struct sched_entity *se;
 	struct cfs_rq *cfs_rq;
-	int i;
+	int i, ret;
 
 	tg->cfs_rq = kcalloc(nr_cpu_ids, sizeof(cfs_rq), GFP_KERNEL);
 	if (!tg->cfs_rq)
@@ -10960,6 +11435,9 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
 	tg->shares = NICE_0_LOAD;
 
 	init_cfs_bandwidth(tg_cfs_bandwidth(tg));
+	ret = init_auto_affinity(tg_auto_affinity(tg));
+	if (ret)
+		goto err;
 
 	for_each_possible_cpu(i) {
 		cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index ae30681530938..9193aa57b7888 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -163,6 +163,12 @@ static inline void cpu_load_update_active(struct rq *this_rq) { }
  */
 #define RUNTIME_INF ((u64)~0ULL)
 
+#ifdef CONFIG_QOS_SCHED_SMART_GRID
+#define DA_MODE_MANUAL 0
+#define DA_MODE_PERF 1
+#define DA_MODE_POWER 2
+#endif
+
 static inline int idle_policy(int policy)
 {
 	return policy == SCHED_IDLE;
@@ -361,6 +367,33 @@ struct cfs_bandwidth {
 #endif
 };
 
+
+#ifdef CONFIG_QOS_SCHED_SMART_GRID
+#define AD_LEVEL_MAX 10
+
+struct affinity_domain {
+	int cpu;
+	int dcount;
+	u16 sd_mask;
+	int curr_level;
+	struct cpumask *domains[AD_LEVEL_MAX];
+	u64 util_avg;
+	u64 cfs_capacity;
+};
+
+#endif
+
+struct auto_affinity {
+#ifdef CONFIG_QOS_SCHED_SMART_GRID
+	raw_spinlock_t lock;
+	u64 mode;
+	ktime_t period;
+	struct hrtimer period_timer;
+	int period_active;
+	struct affinity_domain ad;
+#endif
+};
+
 /* Task group related information */
 struct task_group {
 	struct cgroup_subsys_state css;
@@ -401,6 +434,7 @@ struct task_group {
 #endif
 
 	struct cfs_bandwidth cfs_bandwidth;
+	struct auto_affinity auto_affinity;
 
 #if defined(CONFIG_QOS_SCHED) && !defined(__GENKSYMS__)
 	long qos_level;
@@ -475,6 +509,11 @@ extern void sched_offline_group(struct task_group *tg);
 
 extern void sched_move_task(struct task_struct *tsk);
 
+extern int init_auto_affinity(struct auto_affinity *auto_affi);
+extern void start_auto_affinity(struct auto_affinity *auto_affi);
+extern void shutdown_auto_affinity(struct auto_affinity *auto_affi);
+extern void tg_update_affinity_domains(int cpu, int online);
+
 #ifdef CONFIG_FAIR_GROUP_SCHED
 extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
 
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index ad5591520c99a..a5296219f25c7 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1821,6 +1821,8 @@ static struct sched_domain *build_sched_domain(struct sched_domain_topology_leve
 	}
 	set_domain_attribute(sd, attr);
 
+	trace_printk("cpu %d: level %d, name %s, %*pbl\n",
+		     cpu, sd->level, sd->name, cpumask_pr_args(sched_domain_span(sd)));
 	return sd;
 }
 
@@ -1902,6 +1904,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
 		cpu_attach_domain(sd, d.rd, i);
 	}
+	rcu_read_unlock();
 
 	if (rq && sched_debug_enabled) {
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index ad62ea156afd9..984287982a20d 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -352,6 +352,15 @@ static struct ctl_table kern_table[] = {
 		.mode = 0644,
 		.proc_handler = proc_dointvec,
 	},
+	{
+		.procname = "sched_util_update_interval_ms",
+		.data = &sysctl_sched_util_update_interval,
+		.maxlen = sizeof(sysctl_sched_util_update_interval),
+		.mode = 0644,
+		.proc_handler = proc_dointvec_minmax,
+		.extra1 = &one,
+		.extra2 = &sysctl_sched_util_update_interval_max,
+	},
 #ifdef CONFIG_SCHED_DEBUG
 	{
 		.procname = "sched_min_granularity_ns",
@@ -1336,6 +1345,24 @@ static struct ctl_table kern_table[] = {
 		.extra1 = &zero,
 		.extra2 = &one_hundred,
 	},
+#endif
+#ifdef CONFIG_QOS_SCHED_SMART_GRID
+	{
+		.procname = "affinity_init_sd_level",
+		.data = &sysctl_affinity_init_sd_level,
+		.maxlen = sizeof(unsigned int),
+		.mode = 0644,
+		.proc_handler = proc_dointvec_minmax,
+		.extra1 = &zero,
+		.extra2 = &sched_affinity_sd_level_max,
+	},
+	{
+		.procname = "affinity_adjust_delay_ms",
+		.data = &sysctl_affinity_adjust_delay_ms,
+		.maxlen = sizeof(unsigned int),
+		.mode = 0644,
+		.proc_handler = proc_dointvec,
+	},
 #endif
 	{ }
 };