A task is migrated off a physical core only when the load on that core
exceeds the configured threshold; otherwise the task stays on the
physical core.
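As a worked example of the threshold check added in core_has_spare()
below: assuming a core capacity of 1024, the default value of 100 keeps
"util_avg * 100 < capacity * 100" true until the first SMT thread of the
core is fully utilized, so tasks only spill to the sibling thread at
that point; a threshold of 60 would instead allow spilling once that
thread's utilization exceeds roughly 614 (60% of 1024).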
External interfaces (example usage below):
1) default config on arm64 and x86: CONFIG_SCHED_KEEP_ON_CORE=y
2) sysctl: /proc/sys/kernel/sched_util_keep_on_core
3) sched feature: KEEP_ON_CORE (default NO_KEEP_ON_CORE)
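A possible way to exercise these knobs at run time (a sketch assuming
CONFIG_SCHED_DEBUG and a mounted debugfs for the sched_features file;
the value 60 is only an example threshold):

  # enable the feature (boots disabled, i.e. NO_KEEP_ON_CORE)
  echo KEEP_ON_CORE > /sys/kernel/debug/sched_features
  # spill tasks to the SMT sibling only once the core is over 60% utilized
  echo 60 > /proc/sys/kernel/sched_util_keep_on_core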
Signed-off-by: Cheng Yu <serein.chengyu@huawei.com>
---
arch/arm64/configs/openeuler_defconfig | 1 +
arch/x86/configs/openeuler_defconfig | 1 +
include/linux/sched/sysctl.h | 4 +++
init/Kconfig | 9 +++++++
kernel/sched/fair.c | 36 ++++++++++++++++++++++++++
kernel/sched/features.h | 4 +++
kernel/sysctl.c | 10 +++++++
7 files changed, 65 insertions(+)
diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig
index 69ff0b64ba59..8c95a5332b40 100644
--- a/arch/arm64/configs/openeuler_defconfig
+++ b/arch/arm64/configs/openeuler_defconfig
@@ -189,6 +189,7 @@ CONFIG_USER_NS=y
CONFIG_PID_NS=y
CONFIG_NET_NS=y
CONFIG_SCHED_STEAL=y
+CONFIG_SCHED_KEEP_ON_CORE=y
CONFIG_CHECKPOINT_RESTORE=y
CONFIG_SCHED_AUTOGROUP=y
# CONFIG_SYSFS_DEPRECATED is not set
diff --git a/arch/x86/configs/openeuler_defconfig b/arch/x86/configs/openeuler_defconfig
index f3b810d0cf47..65ef2f183b02 100644
--- a/arch/x86/configs/openeuler_defconfig
+++ b/arch/x86/configs/openeuler_defconfig
@@ -194,6 +194,7 @@ CONFIG_USER_NS=y
CONFIG_PID_NS=y
CONFIG_NET_NS=y
CONFIG_SCHED_STEAL=y
+CONFIG_SCHED_KEEP_ON_CORE=y
CONFIG_CHECKPOINT_RESTORE=y
CONFIG_SCHED_AUTOGROUP=y
# CONFIG_SYSFS_DEPRECATED is not set
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index 5cd5b3c579d3..fb1436286994 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -35,6 +35,10 @@ extern unsigned int sysctl_sched_child_runs_first;
extern int sysctl_sched_util_low_pct;
#endif
+#ifdef CONFIG_SCHED_KEEP_ON_CORE
+extern int sysctl_sched_util_keep_on_core;
+#endif
+
#ifdef CONFIG_QOS_SCHED_SMART_GRID
extern unsigned int sysctl_smart_grid_strategy_ctrl;
extern int sysctl_affinity_adjust_delay_ms;
diff --git a/init/Kconfig b/init/Kconfig
index e552194efbea..ee94515b2f04 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1370,6 +1370,15 @@ config SCHED_STEAL
If unsure, say N here.
+config SCHED_KEEP_ON_CORE
+ bool "Prefer physical cores when migrating tasks"
+ depends on SMP
+ default n
+ help
+ A task is migrated off a physical core only when the load on that
+ core exceeds the configured threshold; otherwise the task stays on
+ the physical core.
+
config CHECKPOINT_RESTORE
bool "Checkpoint/restore support"
select PROC_CHILDREN
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 95d1841f8a20..4f3d81537bab 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4497,6 +4497,12 @@ static inline void overload_clear(struct rq *rq) {}
static inline void overload_set(struct rq *rq) {}
#endif
+#ifdef CONFIG_SCHED_KEEP_ON_CORE
+static int core_has_spare(int cpu);
+#else
+static inline int core_has_spare(int cpu) { return 0; }
+#endif
+
#else /* CONFIG_SMP */
#define UPDATE_TG 0x0
@@ -4523,6 +4529,7 @@ static inline int newidle_balance(struct rq *rq, struct rq_flags *rf)
static inline void rq_idle_stamp_update(struct rq *rq) {}
static inline void rq_idle_stamp_clear(struct rq *rq) {}
static inline int try_steal(struct rq *this_rq, struct rq_flags *rf) { return 0; }
+static inline int core_has_spare(int cpu) { return 0; }
static inline void overload_clear(struct rq *rq) {}
static inline void overload_set(struct rq *rq) {}
@@ -8210,6 +8217,13 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
}
#endif
+#ifdef CONFIG_SCHED_KEEP_ON_CORE
+ if (static_branch_likely(&sched_smt_present) &&
+ sched_feat(KEEP_ON_CORE))
+ if (core_has_spare(new_cpu))
+ new_cpu = cpumask_first(cpu_smt_mask((new_cpu)));
+#endif
+
rcu_read_unlock();
#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
@@ -9701,6 +9715,14 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
}
#endif
+#ifdef CONFIG_SCHED_KEEP_ON_CORE
+ if (static_branch_likely(&sched_smt_present) &&
+ sched_feat(KEEP_ON_CORE))
+ if (core_has_spare(env->dst_cpu) &&
+ cpumask_first(cpu_smt_mask((env->dst_cpu))) != env->dst_cpu)
+ return 0;
+#endif
+
/*
* We do not migrate tasks that are:
* 1) throttled_lb_pair, or
@@ -13189,6 +13211,20 @@ static int try_steal(struct rq *dst_rq, struct rq_flags *dst_rf)
}
#endif
+#ifdef CONFIG_SCHED_KEEP_ON_CORE
+int sysctl_sched_util_keep_on_core = 100;
+
+static int core_has_spare(int cpu)
+{
+ int core_id = cpumask_first(cpu_smt_mask(cpu));
+ struct rq *rq = cpu_rq(core_id);
+ unsigned long util = rq->cfs.avg.util_avg;
+ unsigned long capacity = rq->cpu_capacity;
+
+ return util * 100 < capacity * sysctl_sched_util_keep_on_core;
+}
+#endif
+
static void rq_online_fair(struct rq *rq)
{
update_sysctl();
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 76fade025c4b..fb885b20ba34 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -70,6 +70,10 @@ SCHED_FEAT(SIS_UTIL, false)
SCHED_FEAT(STEAL, false)
#endif
+#ifdef CONFIG_SCHED_KEEP_ON_CORE
+SCHED_FEAT(KEEP_ON_CORE, false)
+#endif
+
/*
* Issue a WARN when we do multiple update_rq_clock() calls
* in a single rq->lock section. Default disabled because the
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 3941856c19d1..9abc01982645 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -2817,6 +2817,16 @@ static struct ctl_table kern_table[] = {
.extra2 = &one_hundred,
},
#endif
+#ifdef CONFIG_SCHED_KEEP_ON_CORE
+ {
+ .procname = "sched_util_keep_on_core",
+ .data = &sysctl_sched_util_keep_on_core,
+ .maxlen = sizeof(sysctl_sched_util_keep_on_core),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ },
+#endif
#ifdef CONFIG_QOS_SCHED_SMART_GRID
{
.procname = "smart_grid_strategy_ctrl",
--
2.25.1