[PATCH OLK-6.6 v2 1/7] sched: Introduce smart grid scheduling strategy for cfs

3 Jan 2024

From: Hui Tang <tanghui20@huawei.com>

hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I7BQZ0
CVE: NA

----------------------------------------

We want to dynamically expand or shrink the affinity range of tasks
based on the CPU topology level while meeting the minimum resource
requirements of tasks.

We divide affinity domains into several levels according to the sched domains:

level4   * SOCKET  [                                                  ]
level3   * DIE     [                             ]
level2   * MC      [             ] [             ]
level1   * SMT     [     ] [     ] [     ] [     ]
level0   * CPU      0   1   2   3   4   5   6   7

Whether the user prefers power saving or performance affects the strategy
for adjusting affinity. When the power saving mode is selected, we choose
a more appropriate affinity based on the energy model to reduce power
consumption, while still considering the QOS of resources such as CPU and
memory consumption. For instance, if the current CPU load of a task is
lower than required, smart grid uses the energy model to judge whether or
not to aggregate tasks into a smaller range.

The main difference from EAS is that we pay more attention to the power
consumption impact of mechanisms such as cpuidle and DVFS, and classify
tasks to reduce interference and ensure resource QOS in each divided unit,
which is more suitable for general-purpose workloads on non-heterogeneous
CPUs.

        --------        --------        --------
       | group0 |      | group1 |      | group2 |
        --------        --------        --------
	   |                |              |
	   v                |              v
       ---------------------+-----     -----------------
      |                  ---v--   |   |
      |       DIE0      |  MC1 |  |   |   DIE1
      |                  ------   |   |
       ---------------------------     -----------------

We regularly measure how well the resource requirements of each group are
satisfied and adjust the affinity accordingly. Scheduling balance and
memory migration will be considered based on memory location to better
meet resource requirements.

Signed-off-by: Hui Tang <tanghui20@huawei.com>
Signed-off-by: Wang ShaoBo <bobo.shaobowang@huawei.com>
Reviewed-by: Chen Hui <judy.chenhui@huawei.com>
Reviewed-by: Zhang Qiao <zhangqiao22@huawei.com>
Signed-off-by: Zhang Changzhong <zhangchangzhong@huawei.com>
Signed-off-by: Yipeng Zou <zouyipeng@huawei.com>
---
 fs/proc/array.c              |  16 ++
 include/linux/sched.h        |  13 +
 include/linux/sched/sysctl.h |   4 +
 init/Kconfig                 |  13 +
 kernel/sched/core.c          | 174 ++++++++++++
 kernel/sched/fair.c          | 501 ++++++++++++++++++++++++++++++++++-
 kernel/sched/sched.h         |  47 +++-
 kernel/sysctl.c              |  14 +
 8 files changed, 772 insertions(+), 10 deletions(-)

diff --git a/fs/proc/array.c b/fs/proc/array.c
index 2c2efbe685d8..0aeaeb9d2b48 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -436,6 +436,19 @@ __weak void arch_proc_pid_thread_features(struct seq_file *m,
 {
 }
 
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+static void task_cpus_preferred(struct seq_file *m, struct task_struct *task)
+{
+	if (!dynamic_affinity_enabled())
+		return;
+
+	seq_printf(m, "Cpus_preferred:\t%*pb\n",
+		   cpumask_pr_args(task->prefer_cpus));
+	seq_printf(m, "Cpus_preferred_list:\t%*pbl\n",
+		   cpumask_pr_args(task->prefer_cpus));
+}
+#endif
+
 int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
 			struct pid *pid, struct task_struct *task)
 {
@@ -461,6 +474,9 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
 	cpuset_task_status_allowed(m, task);
 	task_context_switch_counts(m, task);
 	arch_proc_pid_thread_features(m, task);
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+	task_cpus_preferred(m, task);
+#endif
 	return 0;
 }
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 9fdd08aa9626..6e7e3940a97f 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2503,4 +2503,17 @@ static inline bool dynamic_affinity_enabled(void)
 	return static_branch_unlikely(&__dynamic_affinity_switch);
 }
 #endif
+
+#ifdef CONFIG_QOS_SCHED_SMART_GRID
+extern struct static_key __smart_grid_used;
+static inline bool smart_grid_used(void)
+{
+	return static_key_false(&__smart_grid_used);
+}
+#else
+static inline bool smart_grid_used(void)
+{
+	return false;
+}
+#endif
 #endif
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index 5a64582b086b..feaa9067f9fc 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -12,6 +12,10 @@ extern unsigned long sysctl_hung_task_timeout_secs;
 enum { sysctl_hung_task_timeout_secs = 0 };
 #endif
 
+#ifdef CONFIG_QOS_SCHED_SMART_GRID
+extern int sysctl_affinity_adjust_delay_ms;
+#endif
+
 enum sched_tunable_scaling {
 	SCHED_TUNABLESCALING_NONE,
 	SCHED_TUNABLESCALING_LOG,
diff --git a/init/Kconfig b/init/Kconfig
index 869eea4108d0..66f6cb9f4029 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1114,6 +1114,19 @@ config UCLAMP_TASK_GROUP
 
 	  If in doubt, say N.
 
+config QOS_SCHED_SMART_GRID
+	bool "qos smart grid scheduler"
+	depends on FAIR_GROUP_SCHED && QOS_SCHED_DYNAMIC_AFFINITY
+	default n
+	help
+	 This feature is used for power consumption tuning in server scenarios.
+	 It can be divided into the following aspects:
+	  1. User interface, manage user needs.
+	  2. Collect tasks' features to ensure key tasks' QOS.
+	  3. Weaken the impact of CPU frequency and cpuidle adjustment on
+	     tasks.
+	  4. Dock with the EAS (Energy Aware Scheduling) model.
+
 config CGROUP_PIDS
 	bool "PIDs controller"
 	help
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 7a0997e7e136..fc837f6992ab 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -9708,6 +9708,7 @@ int sched_cpu_activate(unsigned int cpu)
 		static_branch_inc_cpuslocked(&sched_smt_present);
 #endif
 	set_cpu_active(cpu, true);
+	tg_update_affinity_domains(cpu, 1);
 
 	if (sched_smp_initialized) {
 		sched_update_numa(cpu, true);
@@ -9797,6 +9798,7 @@ int sched_cpu_deactivate(unsigned int cpu)
 		return ret;
 	}
 	sched_domains_numa_masks_clear(cpu);
+	tg_update_affinity_domains(cpu, 0);
 	return 0;
 }
 
@@ -9917,6 +9919,8 @@ void __init sched_init_smp(void)
 	init_sched_dl_class();
 
 	sched_smp_initialized = true;
+
+	init_auto_affinity(&root_task_group);
 }
 
 static int __init migration_init(void)
@@ -11328,6 +11332,155 @@ static int cpu_idle_write_s64(struct cgroup_subsys_state *css,
 }
 #endif
 
+#ifdef CONFIG_QOS_SCHED_SMART_GRID
+int tg_set_dynamic_affinity_mode(struct task_group *tg, u64 mode)
+{
+	struct auto_affinity *auto_affi = tg->auto_affinity;
+
+	if (unlikely(!auto_affi))
+		return -EPERM;
+
+	/* auto mode */
+	if (mode == 1)
+		start_auto_affinity(auto_affi);
+	else if (mode == 0)
+		stop_auto_affinity(auto_affi);
+	else
+		return -EINVAL;
+
+	return 0;
+}
+
+static u64 cpu_affinity_mode_read_u64(struct cgroup_subsys_state *css,
+					      struct cftype *cft)
+{
+	struct task_group *tg = css_tg(css);
+
+	if (!dynamic_affinity_enabled())
+		return -EPERM;
+
+	if (unlikely(!tg->auto_affinity))
+		return -EPERM;
+
+	return tg->auto_affinity->mode;
+}
+
+static int cpu_affinity_mode_write_u64(struct cgroup_subsys_state *css,
+				   struct cftype *cftype, u64 mode)
+{
+	if (!dynamic_affinity_enabled())
+		return -EPERM;
+
+	return tg_set_dynamic_affinity_mode(css_tg(css), mode);
+}
+
+int tg_set_affinity_period(struct task_group *tg, u64 period_ms)
+{
+	if (unlikely(!tg->auto_affinity))
+		return -EPERM;
+	/* period is a signed ktime_t in ns: reject 0 and overflowing values */
+	if (!period_ms || period_ms > KTIME_MAX / NSEC_PER_MSEC)
+		return -EINVAL;
+
+	raw_spin_lock_irq(&tg->auto_affinity->lock);
+	tg->auto_affinity->period = ms_to_ktime(period_ms);
+	raw_spin_unlock_irq(&tg->auto_affinity->lock);
+	return 0;
+}
+
+u64 tg_get_affinity_period(struct task_group *tg)
+{
+	if (unlikely(!tg->auto_affinity))
+		return -EPERM;
+
+	return ktime_to_ms(tg->auto_affinity->period);
+}
+
+static int cpu_affinity_period_write_uint(struct cgroup_subsys_state *css,
+					  struct cftype *cftype, u64 period)
+{
+	if (!dynamic_affinity_enabled())
+		return -EPERM;
+
+	return tg_set_affinity_period(css_tg(css), period);
+}
+
+static u64 cpu_affinity_period_read_uint(struct cgroup_subsys_state *css,
+					 struct cftype *cft)
+{
+	if (!dynamic_affinity_enabled())
+		return -EPERM;
+
+	return tg_get_affinity_period(css_tg(css));
+}
+
+static int cpu_affinity_domain_mask_write_u64(struct cgroup_subsys_state *css,
+					      struct cftype *cftype,
+					      u64 mask)
+{
+	struct task_group *tg = css_tg(css);
+	struct affinity_domain *ad;
+	u16 full;
+
+	if (!dynamic_affinity_enabled())
+		return -EPERM;
+
+	if (unlikely(!tg->auto_affinity))
+		return -EPERM;
+
+	ad = &tg->auto_affinity->ad;
+	full = (1 << ad->dcount) - 1;
+	if (mask > full)
+		return -EINVAL;
+
+	raw_spin_lock_irq(&tg->auto_affinity->lock);
+	ad->domain_mask = mask;
+	raw_spin_unlock_irq(&tg->auto_affinity->lock);
+	return 0;
+}
+
+static u64 cpu_affinity_domain_mask_read_u64(struct cgroup_subsys_state *css,
+					     struct cftype *cft)
+{
+	struct task_group *tg = css_tg(css);
+
+	if (!dynamic_affinity_enabled())
+		return -EPERM;
+
+	if (unlikely(!tg->auto_affinity))
+		return -EPERM;
+
+	return tg->auto_affinity->ad.domain_mask;
+}
+
+static int cpu_affinity_stat_show(struct seq_file *sf, void *v)
+{
+	struct task_group *tg = css_tg(seq_css(sf));
+	struct auto_affinity *auto_affi = tg->auto_affinity;
+	struct affinity_domain *ad;
+	int i;
+
+	/* No stats when dynamic affinity is disabled or tg has no state */
+	if (!dynamic_affinity_enabled() || unlikely(!auto_affi))
+		return -EPERM;
+
+	ad = &auto_affi->ad;
+	seq_printf(sf, "period_active %d\n", auto_affi->period_active);
+	seq_printf(sf, "dcount %d\n", ad->dcount);
+	seq_printf(sf, "domain_mask 0x%x\n", ad->domain_mask);
+	seq_printf(sf, "curr_level %d\n", ad->curr_level);
+	/*
+	 * stay_cnt exists only with CONFIG_SCHEDSTATS; cast for %llu.
+	 */
+	for (i = 0; i < ad->dcount; i++)
+		seq_printf(sf, "sd_level %d, cpu list %*pbl, stay_cnt %llu\n",
+			i, cpumask_pr_args(ad->domains[i]),
+			(u64)schedstat_val(ad->stay_cnt[i]));
+
+	return 0;
+}
+#endif /* CONFIG_QOS_SCHED_SMART_GRID */
+
 #ifdef CONFIG_QOS_SCHED
 static int tg_change_scheduler(struct task_group *tg, void *data)
 {
@@ -11404,6 +11557,27 @@ static struct cftype cpu_legacy_files[] = {
 		.write_s64 = cpu_idle_write_s64,
 	},
 #endif
+#ifdef CONFIG_QOS_SCHED_SMART_GRID
+	{
+		.name = "dynamic_affinity_mode",
+		.read_u64 = cpu_affinity_mode_read_u64,
+		.write_u64 = cpu_affinity_mode_write_u64,
+	},
+	{
+		.name = "affinity_period_ms",
+		.read_u64 = cpu_affinity_period_read_uint,
+		.write_u64 = cpu_affinity_period_write_uint,
+	},
+	{
+		.name = "affinity_domain_mask",
+		.read_u64 = cpu_affinity_domain_mask_read_u64,
+		.write_u64 = cpu_affinity_domain_mask_write_u64,
+	},
+	{
+		.name = "affinity_stat",
+		.seq_show = cpu_affinity_stat_show,
+	},
+#endif
 #ifdef CONFIG_CFS_BANDWIDTH
 	{
 		.name = "cfs_quota_us",
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 318258ea011e..04006d6cc2cd 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -69,6 +69,7 @@
 #ifdef CONFIG_SCHED_STEAL
 #include "sparsemask.h"
 #endif
+#include <linux/sched/grid_qos.h>
 
 /*
  * The initial- and re-scaling of tunables is configurable
@@ -6745,6 +6746,469 @@ bool cfs_task_bw_constrained(struct task_struct *p)
 static inline void sched_fair_update_stop_tick(struct rq *rq, struct task_struct *p) {}
 #endif
 
+#ifdef CONFIG_QOS_SCHED_SMART_GRID
+#define AUTO_AFFINITY_DEFAULT_PERIOD_MS 2000
+#define IS_DOMAIN_SET(level, mask)	((1 << (level)) & (mask))
+
+static DEFINE_MUTEX(smart_grid_used_mutex);
+
+static unsigned long capacity_of(int cpu);
+static int sched_idle_cpu(int cpu);
+static unsigned long cpu_runnable(struct rq *rq);
+static inline bool prefer_cpus_valid(struct task_struct *p);
+
+int sysctl_affinity_adjust_delay_ms = 5000;
+
+struct static_key __smart_grid_used;
+
+static void smart_grid_usage_inc(void)
+{
+	static_key_slow_inc(&__smart_grid_used);
+}
+
+static void smart_grid_usage_dec(void)
+{
+	static_key_slow_dec(&__smart_grid_used);
+}
+
+static inline struct cpumask *task_prefer_cpus(struct task_struct *p)
+{
+	struct auto_affinity *auto_affi = task_group(p)->auto_affinity;
+
+	if (!smart_grid_used())
+		return p->prefer_cpus;
+
+	/* auto_affinity is NULL when init_auto_affinity() set nothing up. */
+	if (!auto_affi || auto_affi->mode == 0)
+		return (void *)p->cpus_ptr;
+
+	return auto_affi->ad.domains[auto_affi->ad.curr_level];
+}
+
+static inline int dynamic_affinity_mode(struct task_struct *p)
+{
+	struct auto_affinity *auto_affi = task_group(p)->auto_affinity;
+
+	if (!prefer_cpus_valid(p))
+		return -1;	/* no usable preferred mask */
+	if (smart_grid_used())	/* 1: group in auto mode, -1: it is not */
+		return (!auto_affi || auto_affi->mode == 0) ? -1 : 1;
+	return 0;		/* smart grid unused */
+}
+
+static void affinity_domain_up(struct task_group *tg)
+{
+	struct affinity_domain *ad = &tg->auto_affinity->ad;
+	int level;
+
+	/*
+	 * Widen to the nearest enabled, non-empty level above curr_level.
+	 * Stop at dcount - 1 so domains[level] never indexes past the top.
+	 */
+	for (level = ad->curr_level + 1; level < ad->dcount; level++) {
+		if (IS_DOMAIN_SET(level, ad->domain_mask) &&
+		    cpumask_weight(ad->domains[level]) > 0) {
+			ad->curr_level = level;
+			return;
+		}
+	}
+}
+
+static void affinity_domain_down(struct task_group *tg)
+{
+	struct affinity_domain *ad = &tg->auto_affinity->ad;
+	u16 level = ad->curr_level;
+
+	if (ad->curr_level <= 0)
+		return;
+
+	while (level > 0) {
+		if (!cpumask_weight(ad->domains[level - 1]))
+			return;
+
+		if (IS_DOMAIN_SET(level - 1, ad->domain_mask)) {
+			ad->curr_level = level - 1;
+			return;
+		}
+		level--;
+	}
+}
+
+static enum hrtimer_restart sched_auto_affi_period_timer(struct hrtimer *timer)
+{
+	struct auto_affinity *auto_affi =
+		container_of(timer, struct auto_affinity, period_timer);
+	struct task_group *tg = auto_affi->tg;
+	struct affinity_domain *ad = &auto_affi->ad;
+	struct cpumask *span = ad->domains[ad->curr_level];
+	unsigned long util_avg_sum = 0;
+	unsigned long tg_capacity = 0;
+	unsigned long flags;
+	int cpu;
+
+	for_each_cpu(cpu, span) {
+		util_avg_sum += cpu_util_cfs(cpu);
+		tg_capacity += capacity_of(cpu);
+	}
+
+	raw_spin_lock_irqsave(&auto_affi->lock, flags);
+	/* stop_auto_affinity() may have run meanwhile, so check again. */
+	if (auto_affi->period_active == 0) {
+		raw_spin_unlock_irqrestore(&auto_affi->lock, flags);
+		return HRTIMER_NORESTART;
+	}
+
+	if (util_avg_sum * 100 >= tg_capacity * sysctl_sched_util_low_pct) {
+		affinity_domain_up(tg);
+	} else if (util_avg_sum * 100 < tg_capacity *
+		   sysctl_sched_util_low_pct / 2) {
+		affinity_domain_down(tg);
+	}
+
+	schedstat_inc(ad->stay_cnt[ad->curr_level]);
+	hrtimer_forward_now(timer, auto_affi->period);
+	raw_spin_unlock_irqrestore(&auto_affi->lock, flags);
+	return HRTIMER_RESTART;
+}
+
+static int tg_update_affinity_domain_down(struct task_group *tg, void *data)
+{
+	struct auto_affinity *auto_affi = tg->auto_affinity;
+	struct affinity_domain *ad;
+	int *cpu_state = data;
+	unsigned long flags;
+	int i;
+
+	if (!auto_affi)
+		return 0;
+
+	ad = &tg->auto_affinity->ad;
+	raw_spin_lock_irqsave(&auto_affi->lock, flags);
+
+	for (i = 0; i < ad->dcount; i++) {
+		if (!cpumask_test_cpu(cpu_state[0], ad->domains_orig[i]))
+			continue;
+
+		/* online */
+		if (cpu_state[1]) {
+			cpumask_set_cpu(cpu_state[0], ad->domains[i]);
+		} else {
+			cpumask_clear_cpu(cpu_state[0], ad->domains[i]);
+			if (!cpumask_weight(ad->domains[i]))
+				affinity_domain_up(tg);
+		}
+
+	}
+	raw_spin_unlock_irqrestore(&auto_affi->lock, flags);
+
+	return 0;
+}
+
+void tg_update_affinity_domains(int cpu, int online)
+{
+	int cpu_state[2];
+
+	/* No need to update when dynamic affinity is disabled */
+	if (!dynamic_affinity_enabled())
+		return;
+
+	cpu_state[0] = cpu;
+	cpu_state[1] = online;
+
+	rcu_read_lock();
+	walk_tg_tree(tg_update_affinity_domain_down, tg_nop, cpu_state);
+	rcu_read_unlock();
+}
+
+void start_auto_affinity(struct auto_affinity *auto_affi)
+{
+	ktime_t delay_ms;
+
+	mutex_lock(&smart_grid_used_mutex);
+	raw_spin_lock_irq(&auto_affi->lock);
+	if (auto_affi->period_active == 1) {
+		raw_spin_unlock_irq(&auto_affi->lock);
+		mutex_unlock(&smart_grid_used_mutex);
+		return;
+	}
+
+	auto_affi->period_active = 1;
+	auto_affi->mode = 1;
+	delay_ms = ms_to_ktime(sysctl_affinity_adjust_delay_ms);
+	hrtimer_forward_now(&auto_affi->period_timer, delay_ms);
+	hrtimer_start_expires(&auto_affi->period_timer,
+				HRTIMER_MODE_ABS_PINNED);
+	raw_spin_unlock_irq(&auto_affi->lock);
+
+	smart_grid_usage_inc();
+	mutex_unlock(&smart_grid_used_mutex);
+}
+
+void stop_auto_affinity(struct auto_affinity *auto_affi)
+{
+	struct affinity_domain *ad = &auto_affi->ad;
+
+	mutex_lock(&smart_grid_used_mutex);
+	raw_spin_lock_irq(&auto_affi->lock);
+	if (auto_affi->period_active == 0) {
+		raw_spin_unlock_irq(&auto_affi->lock);
+		mutex_unlock(&smart_grid_used_mutex);
+		return;
+	}
+	auto_affi->period_active = 0;
+	auto_affi->mode = 0;
+	ad->curr_level = ad->dcount > 0 ? ad->dcount - 1 : 0;
+	raw_spin_unlock_irq(&auto_affi->lock);
+
+	smart_grid_usage_dec();
+	mutex_unlock(&smart_grid_used_mutex);
+}
+
+static struct sched_group *sd_find_idlest_group(struct sched_domain *sd)
+{
+	struct sched_group *idlest = NULL, *group = sd->groups;
+	unsigned long min_runnable_load = ULONG_MAX;
+	unsigned long min_avg_load = ULONG_MAX;
+	int imbalance_scale = 100 + (sd->imbalance_pct-100)/2;
+	unsigned long imbalance = scale_load_down(NICE_0_LOAD) *
+				(sd->imbalance_pct-100) / 100;
+
+	do {
+		unsigned long load, avg_load, runnable_load;
+		int i;
+
+		avg_load = 0;
+		runnable_load = 0;
+
+		for_each_cpu(i, sched_group_span(group)) {
+			load = cpu_runnable(cpu_rq(i));
+			runnable_load += load;
+			avg_load += cfs_rq_load_avg(&cpu_rq(i)->cfs);
+		}
+
+		avg_load = (avg_load * SCHED_CAPACITY_SCALE) /
+					group->sgc->capacity;
+		runnable_load = (runnable_load * SCHED_CAPACITY_SCALE) /
+					group->sgc->capacity;
+
+		if (min_runnable_load > (runnable_load + imbalance)) {
+			min_runnable_load = runnable_load;
+			min_avg_load = avg_load;
+			idlest = group;
+		} else if ((runnable_load < (min_runnable_load + imbalance)) &&
+			   (100*min_avg_load > imbalance_scale*avg_load)) {
+			min_avg_load = avg_load;
+			idlest = group;
+		}
+	} while (group = group->next, group != sd->groups);
+
+	return idlest ? idlest : group;
+}
+
+static int group_find_idlest_cpu(struct sched_group *group)
+{
+	int least_loaded_cpu = cpumask_first(sched_group_span(group));
+	unsigned long load, min_load = ULONG_MAX;
+	unsigned int min_exit_latency = UINT_MAX;
+	u64 latest_idle_timestamp = 0;
+	int shallowest_idle_cpu = -1;
+	int i;
+
+	if (group->group_weight == 1)
+		return least_loaded_cpu;
+
+	for_each_cpu(i, sched_group_span(group)) {
+		if (sched_idle_cpu(i))
+			return i;
+
+		if (available_idle_cpu(i)) {
+			struct rq *rq = cpu_rq(i);
+			struct cpuidle_state *idle = idle_get_state(rq);
+
+			if (idle && idle->exit_latency < min_exit_latency) {
+				min_exit_latency = idle->exit_latency;
+				latest_idle_timestamp = rq->idle_stamp;
+				shallowest_idle_cpu = i;
+			} else if ((!idle ||
+				   idle->exit_latency == min_exit_latency) &&
+				   rq->idle_stamp > latest_idle_timestamp) {
+				latest_idle_timestamp = rq->idle_stamp;
+				shallowest_idle_cpu = i;
+			}
+		} else if (shallowest_idle_cpu == -1) {
+			load = cpu_runnable(cpu_rq(i));
+			if (load < min_load) {
+				min_load = load;
+				least_loaded_cpu = i;
+			}
+		}
+	}
+
+	return shallowest_idle_cpu != -1 ? shallowest_idle_cpu :
+			least_loaded_cpu;
+}
+
+void free_affinity_domains(struct affinity_domain *ad)
+{
+	int i;
+
+	for (i = 0; i < AD_LEVEL_MAX; i++) {
+		kfree(ad->domains[i]);
+		kfree(ad->domains_orig[i]);
+		ad->domains[i] = NULL;
+		ad->domains_orig[i] = NULL;
+	}
+	ad->dcount = 0;
+}
+
+static int init_affinity_domains_orig(struct affinity_domain *ad)
+{
+	int i, j;
+
+	for (i = 0; i < ad->dcount; i++) {
+		ad->domains_orig[i] = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
+		if (!ad->domains_orig[i])
+			goto err;
+
+		cpumask_copy(ad->domains_orig[i], ad->domains[i]);
+	}
+
+	return 0;
+err:
+	for (j = 0; j < i; j++) {
+		kfree(ad->domains_orig[j]);
+		ad->domains_orig[j] = NULL;
+	}
+	return -ENOMEM;
+}
+
+static int init_affinity_domains(struct affinity_domain *ad)
+{
+	struct sched_domain *sd = NULL, *tmp;
+	struct sched_group *idlest = NULL;
+	int ret = -ENOMEM;
+	int dcount = 0;
+	int i, cpu;
+
+	for (i = 0; i < AD_LEVEL_MAX; i++) {
+		ad->domains[i] = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
+		if (!ad->domains[i])
+			goto err;
+	}
+
+	rcu_read_lock();
+	cpu = cpumask_first_and(cpu_active_mask,
+				housekeeping_cpumask(HK_TYPE_DOMAIN));
+	for_each_domain(cpu, tmp) {
+		sd = tmp;
+		dcount++;
+	}
+
+	if (!sd || dcount > AD_LEVEL_MAX) {
+		rcu_read_unlock();
+		ret = -EINVAL;
+		goto err;
+	}
+
+	idlest = sd_find_idlest_group(sd);
+	cpu = group_find_idlest_cpu(idlest);
+	i = 0;
+	for_each_domain(cpu, tmp) {
+		cpumask_copy(ad->domains[i], sched_domain_span(tmp));
+		__schedstat_set(ad->stay_cnt[i], 0);
+		if (++i >= dcount)	/* depth was counted on another CPU */
+			break;
+	}
+	rcu_read_unlock();
+
+	ad->dcount = i;			/* levels actually filled in */
+	ad->curr_level = ad->dcount > 0 ? ad->dcount - 1 : 0;
+	ad->domain_mask = (1 << ad->dcount) - 1;
+
+	ret = init_affinity_domains_orig(ad);
+	if (ret)
+		goto err;
+
+	return 0;
+err:
+	free_affinity_domains(ad);
+	return ret;
+}
+
+int init_auto_affinity(struct task_group *tg)
+{
+	struct auto_affinity *auto_affi;
+	int ret;
+
+	/* No need to init auto affinity when dynamic affinity is disabled */
+	if (!dynamic_affinity_enabled())
+		return 0;
+
+	auto_affi = kzalloc(sizeof(*auto_affi), GFP_KERNEL);
+	if (!auto_affi)
+		return -ENOMEM;
+
+	raw_spin_lock_init(&auto_affi->lock);
+	auto_affi->mode = 0;
+	auto_affi->period_active = 0;
+	auto_affi->period = ms_to_ktime(AUTO_AFFINITY_DEFAULT_PERIOD_MS);
+	hrtimer_init(&auto_affi->period_timer, CLOCK_MONOTONIC,
+		HRTIMER_MODE_ABS_PINNED);
+	auto_affi->period_timer.function = sched_auto_affi_period_timer;
+
+	ret = init_affinity_domains(&auto_affi->ad);
+	if (ret) {
+		kfree(auto_affi);
+		if (ret == -EINVAL)
+			ret = 0;
+		return ret;
+	}
+
+	auto_affi->tg = tg;
+	tg->auto_affinity = auto_affi;
+	return 0;
+}
+
+static void destroy_auto_affinity(struct task_group *tg)
+{
+	struct auto_affinity *auto_affi = tg->auto_affinity;
+
+	if (!dynamic_affinity_enabled())
+		return;
+
+	if (unlikely(!auto_affi))
+		return;
+
+	if (auto_affi->period_active)
+		smart_grid_usage_dec();
+
+	hrtimer_cancel(&auto_affi->period_timer);
+	free_affinity_domains(&auto_affi->ad);
+
+	kfree(tg->auto_affinity);
+	tg->auto_affinity = NULL;
+}
+#else
+static void destroy_auto_affinity(struct task_group *tg) {}
+
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+static inline bool prefer_cpus_valid(struct task_struct *p);
+
+static inline struct cpumask *task_prefer_cpus(struct task_struct *p)
+{
+	return p->prefer_cpus;
+}
+
+static inline int dynamic_affinity_mode(struct task_struct *p)
+{
+	if (!prefer_cpus_valid(p))
+		return -1;
+
+	return 0;
+}
+#endif
+#endif
+
 /**************************************************
  * CFS operations on tasks:
  */
@@ -8335,13 +8799,16 @@ __setup("dynamic_affinity", dynamic_affinity_switch_setup);
 
 static inline bool prefer_cpus_valid(struct task_struct *p)
 {
+	struct cpumask *prefer_cpus;
+
 	if (!dynamic_affinity_enabled())
 		return false;
 
-	return p->prefer_cpus &&
-	       !cpumask_empty(p->prefer_cpus) &&
-	       !cpumask_equal(p->prefer_cpus, p->cpus_ptr) &&
-	       cpumask_subset(p->prefer_cpus, p->cpus_ptr);
+	prefer_cpus = task_prefer_cpus(p);
+
+	return !cpumask_empty(prefer_cpus) &&
+	       !cpumask_equal(prefer_cpus, p->cpus_ptr) &&
+	       cpumask_subset(prefer_cpus, p->cpus_ptr);
 }
 
 static inline unsigned long taskgroup_cpu_util(struct task_group *tg,
@@ -8375,13 +8842,24 @@ static void set_task_select_cpus(struct task_struct *p, int *idlest_cpu,
 	long min_util = INT_MIN;
 	struct task_group *tg;
 	long spare;
-	int cpu;
+	int cpu, mode;
 
 	p->select_cpus = p->cpus_ptr;
-	if (!prefer_cpus_valid(p))
+	rcu_read_lock();
+	mode = dynamic_affinity_mode(p);
+	if (mode == -1) {
+		rcu_read_unlock();
+		return;
+	} else if (mode == 1) {
+		p->select_cpus = task_prefer_cpus(p);
+		if (idlest_cpu)
+			*idlest_cpu = cpumask_first(p->select_cpus);
+		sched_qos_affinity_set(p);
+		rcu_read_unlock();
 		return;
+	}
 
-	rcu_read_lock();
+	/* manual mode */
 	tg = task_group(p);
 	for_each_cpu(cpu, p->prefer_cpus) {
 		if (idlest_cpu && (available_idle_cpu(cpu) || sched_idle_cpu(cpu))) {
@@ -8445,7 +8923,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
 	time = schedstat_start_time();
 
 	/*
-	 * required for stable ->cpus_allowed
+	 * required for stable ->cpus_ptr
 	 */
 	lockdep_assert_held(&p->pi_lock);
 
@@ -13986,7 +14464,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
 {
 	struct sched_entity *se;
 	struct cfs_rq *cfs_rq;
-	int i;
+	int i, ret;
 
 	tg->cfs_rq = kcalloc(nr_cpu_ids, sizeof(cfs_rq), GFP_KERNEL);
 	if (!tg->cfs_rq)
@@ -13998,6 +14476,9 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
 	tg->shares = NICE_0_LOAD;
 
 	init_cfs_bandwidth(tg_cfs_bandwidth(tg), tg_cfs_bandwidth(parent));
+	ret = init_auto_affinity(tg);
+	if (ret)
+		goto err;
 
 	for_each_possible_cpu(i) {
 		cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
@@ -14020,6 +14501,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
 err_free_rq:
 	kfree(cfs_rq);
 err:
+	destroy_auto_affinity(tg);
 	return 0;
 }
 
@@ -14048,6 +14530,7 @@ void unregister_fair_sched_group(struct task_group *tg)
 	int cpu;
 
 	destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
+	destroy_auto_affinity(tg);
 
 	for_each_possible_cpu(cpu) {
 		if (tg->se[cpu])
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 4b679122d26f..87d9f705645e 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -362,6 +362,34 @@ struct cfs_bandwidth {
 #endif
 };
 
+
+#ifdef CONFIG_QOS_SCHED_SMART_GRID
+#define AD_LEVEL_MAX		8
+
+struct affinity_domain {
+	int			dcount;
+	int			curr_level;
+	u32			domain_mask;
+#ifdef CONFIG_SCHEDSTATS
+	u64			stay_cnt[AD_LEVEL_MAX];
+#endif
+	struct cpumask		*domains[AD_LEVEL_MAX];
+	struct cpumask		*domains_orig[AD_LEVEL_MAX];
+};
+#endif
+
+struct auto_affinity {
+#ifdef CONFIG_QOS_SCHED_SMART_GRID
+	raw_spinlock_t		lock;
+	u64			mode;
+	ktime_t			period;
+	struct hrtimer		period_timer;
+	int			period_active;
+	struct affinity_domain	ad;
+	struct task_group	*tg;
+#endif
+};
+
 /* Task group related information */
 struct task_group {
 	struct cgroup_subsys_state css;
@@ -418,7 +446,9 @@ struct task_group {
 	/* Effective clamp values used for a task group */
 	struct uclamp_se	uclamp[UCLAMP_CNT];
 #endif
-
+#if defined(CONFIG_QOS_SCHED_SMART_GRID) && !defined(__GENKSYMS__)
+	struct auto_affinity *auto_affinity;
+#endif
 };
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -485,6 +515,21 @@ extern void sched_release_group(struct task_group *tg);
 
 extern void sched_move_task(struct task_struct *tsk);
 
+#ifdef CONFIG_QOS_SCHED_SMART_GRID
+extern void start_auto_affinity(struct auto_affinity *auto_affi);
+extern void stop_auto_affinity(struct auto_affinity *auto_affi);
+extern int init_auto_affinity(struct task_group *tg);
+extern void tg_update_affinity_domains(int cpu, int online);
+
+#else
+static inline int init_auto_affinity(struct task_group *tg)
+{
+	return 0;
+}
+
+static inline void tg_update_affinity_domains(int cpu, int online) {}
+#endif
+
 #ifdef CONFIG_FAIR_GROUP_SCHED
 extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
 
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 354a2d294f52..fe5d617f699a 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -91,6 +91,9 @@ EXPORT_SYMBOL_GPL(sysctl_long_vals);
 #if defined(CONFIG_SYSCTL)
 
 /* Constants used for minimum and maximum */
+#ifdef CONFIG_QOS_SCHED_SMART_GRID
+static int hundred_thousand = 100000;
+#endif
 
 #ifdef CONFIG_PERF_EVENTS
 static const int six_hundred_forty_kb = 640 * 1024;
@@ -2042,6 +2045,17 @@ static struct ctl_table kern_table[] = {
 		.extra1		= SYSCTL_ONE,
 		.extra2		= SYSCTL_INT_MAX,
 	},
+#endif
+#ifdef CONFIG_QOS_SCHED_SMART_GRID
+	{
+		.procname	= "affinity_adjust_delay_ms",
+		.data		= &sysctl_affinity_adjust_delay_ms,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler   = proc_dointvec_minmax,
+		.extra1         = SYSCTL_ZERO,
+		.extra2		= &hundred_thousand,
+	},
 #endif
 	{ }
 };
-- 
2.34.1