add steal_task for cgroup
Cheng Yu (6):
  Revert "sched: add mutex lock to protect qos_level"
  sched/fair: Add group_steal in cmdline to enable STEAL for cgroup
  sched/core: Add cpu.steal_task in cgroup v1 cpu subsystem
  sched/topology: Remove SCHED_STEAL_NODE_LIMIT_DEFAULT
  sched/fair: Count the number of tasks marked as steal_task on cfs_rq
  sched/fair: Set the maximum number of steal attempts
Zheng Zucheng (2):
  sched/debug: Add h_nr_running/steal_h_nr_running in sched_debug
  sched/core: Add mutex lock to protect steal_task
 include/linux/sched.h        |   4 +
 include/linux/sched/sysctl.h |   4 +
 kernel/sched/core.c          | 123 +++++++++++++++++++++++++--
 kernel/sched/debug.c         |   5 ++
 kernel/sched/fair.c          | 160 +++++++++++++++++++++++++++++------
 kernel/sched/sched.h         |  33 ++++++--
 kernel/sched/topology.c      |   3 +-
 kernel/sysctl.c              |  14 +++
 8 files changed, 304 insertions(+), 42 deletions(-)
hulk inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/IAS45L
-----------------------------------------
This reverts commit 995f0e60b7ef251174dfb2a5c89391a9f230403a.
The qos_level update path runs inside an RCU read-side critical section, which is atomic context. A mutex is a sleeping lock and must not be taken there, so drop the qos_level mutex.
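For illustration, a minimal sketch of the reverted pattern, assuming the write path takes rcu_read_lock() around walk_tg_tree_from() in the same way the cpu.steal_task patch later in this series does:

  /* visitor run for every task_group in the subtree */
  static int tg_change_scheduler(struct task_group *tg, void *data)
  {
          mutex_lock(tg->qos_level_mutex);     /* sleeping lock ...          */
          /* ... update tg->qos_level and reschedule its tasks ... */
          mutex_unlock(tg->qos_level_mutex);
          return 0;
  }

  rcu_read_lock();                             /* ... taken inside an RCU    */
  walk_tg_tree_from(tg, tg_change_scheduler, tg_nop, &qos_level);
  rcu_read_unlock();                           /* read-side critical section */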
Fixes: 995f0e60b7ef ("sched: add mutex lock to protect qos_level")
Signed-off-by: Cheng Yu <serein.chengyu@huawei.com>
Signed-off-by: Zucheng Zheng <zhengzucheng@huawei.com>
---
 kernel/sched/core.c  | 9 ---------
 kernel/sched/fair.c  | 3 ---
 kernel/sched/sched.h | 4 ----
 3 files changed, 16 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 72abf7459829..7595a3fef28f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -8642,13 +8642,6 @@ static inline int alloc_qos_sched_group(struct task_group *tg, #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER tg->smt_expell = parent->smt_expell; #endif - tg->qos_level_mutex = kzalloc(sizeof(struct mutex), GFP_KERNEL); - - if (!tg->qos_level_mutex) - return 0; - - mutex_init(tg->qos_level_mutex); - return 1; }
@@ -9724,7 +9717,6 @@ static int tg_change_scheduler(struct task_group *tg, void *data) s64 qos_level = *(s64 *)data; struct cgroup_subsys_state *css = &tg->css;
- mutex_lock(tg->qos_level_mutex); tg->qos_level = qos_level; if (is_offline_level(qos_level)) policy = SCHED_IDLE; @@ -9742,7 +9734,6 @@ static int tg_change_scheduler(struct task_group *tg, void *data) sched_setscheduler(tsk, policy, ¶m); } css_task_iter_end(&it); - mutex_unlock(tg->qos_level_mutex);
return 0; } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d742793567b2..45b8e13943db 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -13645,9 +13645,6 @@ void free_fair_sched_group(struct task_group *tg) kfree(tg->se[i]); }
-#ifdef CONFIG_QOS_SCHED - kfree(tg->qos_level_mutex); -#endif kfree(tg->cfs_rq); kfree(tg->se); } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index dbd264f595a0..a6d7febf789d 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -497,11 +497,7 @@ struct task_group { #else KABI_RESERVE(2) #endif -#ifdef CONFIG_QOS_SCHED - KABI_USE(3, struct mutex *qos_level_mutex) -#else KABI_RESERVE(3) -#endif #if defined(CONFIG_QOS_SCHED_SMART_GRID) && !defined(__GENKSYMS__) KABI_USE(4, struct auto_affinity *auto_affinity) #else
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/IAQWPQ
-----------------------------------------
Add a new kernel command-line parameter, group_steal, that switches the steal-task feature into per-cgroup mode. When group_steal is not given on the command line, the feature continues to apply globally as before; once it is given, stealing can be enabled selectively for individual cgroups.
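For example, booting with group_steal appended to the kernel command line flips the static key once at boot. A minimal sketch of how a later patch in this series combines that switch with the per-cgroup flag (group_steal_enabled() and is_tg_steal() are introduced later in the series):

  /* Per-cgroup mode applies only when both the command-line switch and the
   * cgroup's cpu.steal_task flag are set; otherwise stealing keeps its
   * existing global behaviour. */
  static inline bool group_steal_enabled(int steal_task)
  {
          return group_steal_used() && is_tg_steal(steal_task);
  }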
Signed-off-by: Cheng Yu <serein.chengyu@huawei.com>
---
 kernel/sched/fair.c  | 8 ++++++++
 kernel/sched/sched.h | 9 +++++++++
 2 files changed, 17 insertions(+)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 45b8e13943db..5b737961e327 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4446,6 +4446,14 @@ static inline void rq_idle_stamp_clear(struct rq *rq) }
#ifdef CONFIG_SCHED_STEAL +DEFINE_STATIC_KEY_FALSE(group_steal); + +static int __init group_steal_setup(char *__unused) +{ + static_branch_enable(&group_steal); + return 1; +} +__setup("group_steal", group_steal_setup);
static inline bool steal_enabled(void) { diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index a6d7febf789d..87c63fa5625e 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1778,6 +1778,15 @@ extern void set_sched_cluster(void); static inline void set_sched_cluster(void) { } #endif
+#ifdef CONFIG_SCHED_STEAL +DECLARE_STATIC_KEY_FALSE(group_steal); + +static inline bool group_steal_used(void) +{ + return static_branch_unlikely(&group_steal); +} +#endif + #ifdef CONFIG_NUMA #ifdef CONFIG_SCHED_STEAL extern struct static_key_true sched_steal_allow;
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/IAQWPQ
-----------------------------------------
Add a new cpu.steal_task interface to the cgroup v1 cpu subsystem. The default value is 0, meaning the steal-task feature is disabled for the cgroup; writing 1 enables it.

To enable the steal-task feature for a cgroup, three steps are required: add group_steal to the kernel command line, enable STEAL in sched_features, and finally write 1 to the cgroup's cpu.steal_task, as illustrated below.
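For example (paths are illustrative; they depend on where debugfs and the cgroup v1 cpu controller are mounted and on the kernel version):

  # 1. boot with "group_steal" appended to the kernel command line
  # 2. enable the STEAL scheduler feature in debugfs
  echo STEAL > /sys/kernel/debug/sched/features
  # 3. enable stealing for the target cgroup
  echo 1 > /sys/fs/cgroup/cpu/<cgroup>/cpu.steal_task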
Signed-off-by: Cheng Yu <serein.chengyu@huawei.com>
---
 include/linux/sched.h |   4 ++
 kernel/sched/core.c   | 108 ++++++++++++++++++++++++++++++++++++++++++
 kernel/sched/sched.h  |  19 +++++++-
 3 files changed, 130 insertions(+), 1 deletion(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h index 97c216bfb0fc..57de624f17a5 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -522,7 +522,11 @@ struct sched_entity { #else KABI_RESERVE(1) #endif +#ifdef CONFIG_SCHED_STEAL + KABI_USE(2, int steal_task) +#else KABI_RESERVE(2) +#endif KABI_RESERVE(3) KABI_RESERVE(4) }; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7595a3fef28f..900637a6ac09 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -8278,6 +8278,9 @@ void __init sched_init(void) #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER root_task_group.smt_expell = TG_SMT_EXPELL; #endif +#ifdef CONFIG_SCHED_STEAL + root_task_group.steal_task = TG_STEAL_NO; +#endif #ifdef CONFIG_RT_GROUP_SCHED root_task_group.rt_se = (struct sched_rt_entity **)ptr; ptr += nr_cpu_ids * sizeof(void **); @@ -8718,6 +8721,20 @@ static void sched_free_group(struct task_group *tg) kmem_cache_free(task_group_cache, tg); }
+#ifdef CONFIG_SCHED_STEAL +static void sched_change_steal_group(struct task_struct *tsk, struct task_group *tg) +{ + struct sched_entity *se = &tsk->se; + + se->steal_task = tg->steal_task; +} + +static inline void tg_init_steal(struct task_group *tg, struct task_group *ptg) +{ + tg->steal_task = ptg->steal_task; +} +#endif + #ifdef CONFIG_BPF_SCHED static inline void tg_init_tag(struct task_group *tg, struct task_group *ptg) { @@ -8745,6 +8762,10 @@ struct task_group *sched_create_group(struct task_group *parent) if (!alloc_rt_sched_group(tg, parent)) goto err;
+#ifdef CONFIG_SCHED_STEAL + tg_init_steal(tg, parent); +#endif + #ifdef CONFIG_BPF_SCHED tg_init_tag(tg, parent); #endif @@ -8820,6 +8841,10 @@ static void sched_change_group(struct task_struct *tsk, int type) sched_change_qos_group(tsk, tg); #endif
+#ifdef CONFIG_SCHED_STEAL + sched_change_steal_group(tsk, tg); +#endif + #ifdef CONFIG_BPF_SCHED /* * This function has cleared and restored the task status, @@ -9784,6 +9809,81 @@ static inline s64 cpu_qos_read(struct cgroup_subsys_state *css, } #endif
+#ifdef CONFIG_SCHED_STEAL +static inline s64 cpu_steal_task_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return css_tg(css)->steal_task; +} + +void sched_setsteal(struct task_struct *tsk, s64 steal_task) +{ + struct sched_entity *se = &tsk->se; + int queued, running, queue_flags = + DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; + struct rq_flags rf; + struct rq *rq; + + if (se->steal_task == steal_task) + return; + + rq = task_rq_lock(tsk, &rf); + + running = task_current(rq, tsk); + queued = task_on_rq_queued(tsk); + + update_rq_clock(rq); + if (queued) + dequeue_task(rq, tsk, queue_flags); + if (running) + put_prev_task(rq, tsk); + + se->steal_task = steal_task; + + if (queued) + enqueue_task(rq, tsk, queue_flags); + if (running) + set_next_task(rq, tsk); + + task_rq_unlock(rq, tsk, &rf); +} + +int tg_change_steal(struct task_group *tg, void *data) +{ + struct css_task_iter it; + struct task_struct *tsk; + s64 steal_task = *(s64 *)data; + struct cgroup_subsys_state *css = &tg->css; + + tg->steal_task = steal_task; + + css_task_iter_start(css, 0, &it); + while ((tsk = css_task_iter_next(&it))) + sched_setsteal(tsk, steal_task); + css_task_iter_end(&it); + + return 0; +} + +static int cpu_steal_task_write(struct cgroup_subsys_state *css, + struct cftype *cftype, s64 steal_task) +{ + struct task_group *tg = css_tg(css); + + if (!group_steal_used()) + return -EPERM; + + if (steal_task < TG_STEAL_NO || steal_task > TG_STEAL) + return -EINVAL; + + rcu_read_lock(); + walk_tg_tree_from(tg, tg_change_steal, tg_nop, (void *)(&steal_task)); + rcu_read_unlock(); + + return 0; +} +#endif + #ifdef CONFIG_BPF_SCHED void sched_settag(struct task_struct *tsk, s64 tag) { @@ -9950,6 +10050,14 @@ static struct cftype cpu_legacy_files[] = { .write_s64 = cpu_smt_expell_write, }, #endif +#ifdef CONFIG_SCHED_STEAL + { + .name = "steal_task", + .flags = CFTYPE_NOT_ON_ROOT, + .read_s64 = cpu_steal_task_read, + .write_s64 = cpu_steal_task_write, + }, +#endif #ifdef CONFIG_BPF_SCHED { .name = "tag", diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 87c63fa5625e..ceea107a1dc8 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -402,7 +402,6 @@ struct cfs_bandwidth { #endif };
- #ifdef CONFIG_QOS_SCHED_SMART_GRID #define AD_LEVEL_MAX 8
@@ -497,7 +496,13 @@ struct task_group { #else KABI_RESERVE(2) #endif + +#ifdef CONFIG_SCHED_STEAL + KABI_USE(3, int steal_task) +#else KABI_RESERVE(3) +#endif + #if defined(CONFIG_QOS_SCHED_SMART_GRID) && !defined(__GENKSYMS__) KABI_USE(4, struct auto_affinity *auto_affinity) #else @@ -505,6 +510,18 @@ struct task_group { #endif };
+#ifdef CONFIG_SCHED_STEAL +enum tg_steal_task { + TG_STEAL_NO = 0, + TG_STEAL = 1, +}; + +static inline bool is_tg_steal(int steal_task) +{ + return steal_task == TG_STEAL; +} +#endif + #ifdef CONFIG_FAIR_GROUP_SCHED #define ROOT_TASK_GROUP_LOAD NICE_0_LOAD
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/IAQWPQ
-----------------------------------------
The sched_steal_node_limit command-line parameter configures the maximum number of NUMA nodes for which the steal-task feature may be enabled: if the actual node count exceeds the configured value, the feature is suppressed. With the old hard-coded default of 2, only 2-node systems worked without setting the parameter; larger systems had to pass it explicitly. Remove the default value 2 and, when the parameter is not given, use the actual number of NUMA nodes in the system instead.
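For example, on a machine with 4 NUMA nodes booted without sched_steal_node_limit, the limit now defaults to 4 and STEAL stays allowed; with the old default of 2 it would have been suppressed unless sched_steal_node_limit=4 was passed explicitly.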
Signed-off-by: Cheng Yu <serein.chengyu@huawei.com>
---
 kernel/sched/topology.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 1df0f8e345bd..84ae8708eda4 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1894,7 +1894,6 @@ static void init_numa_topology_type(void) #ifdef CONFIG_SCHED_STEAL DEFINE_STATIC_KEY_TRUE(sched_steal_allow); static int sched_steal_node_limit; -#define SCHED_STEAL_NODE_LIMIT_DEFAULT 2
static int __init steal_node_limit_setup(char *buf) { @@ -1909,7 +1908,7 @@ static void check_node_limit(void) int n = num_possible_nodes();
if (sched_steal_node_limit == 0) - sched_steal_node_limit = SCHED_STEAL_NODE_LIMIT_DEFAULT; + sched_steal_node_limit = n; if (n > sched_steal_node_limit) { static_branch_disable(&sched_steal_allow); pr_debug("Suppressing sched STEAL. To enable, reboot with sched_steal_node_limit=%d", n);
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/IAQWPQ
-----------------------------------------
Assume we have a cpu cgroup named test with cpu.steal_task set to 1; the tasks in the test cgroup are then said to be marked with steal_task.

When a CPU's rq has at least 2 cfs tasks and at least 1 of them is marked with steal_task, the CPU is considered overloaded.

Before a CPU enters idle, it tries to pull tasks from busy CPUs through idle balance. If that fails to pull a task, task stealing is triggered and the idle CPU steals a task from an overloaded CPU.
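A minimal sketch of the overload condition this patch implements (rq_is_overloaded() is a hypothetical helper used only for illustration; the real checks are open-coded in overload_set() and overload_clear() in the diff below):

  static bool rq_is_overloaded(struct rq *rq)
  {
          /* stealing needs at least one queued task besides the running one */
          if (rq->cfs.h_nr_running < 2)
                  return false;
          /* in per-cgroup mode, at least one of them must be stealable */
          if (group_steal_used() && rq->cfs.steal_h_nr_running < 1)
                  return false;
          return true;
  }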
Signed-off-by: Cheng Yu <serein.chengyu@huawei.com>
---
 kernel/sched/fair.c  | 142 ++++++++++++++++++++++++++++++++++++-------
 kernel/sched/sched.h |   5 +-
 2 files changed, 122 insertions(+), 25 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 5b737961e327..836f1a7feb74 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4465,14 +4465,30 @@ static inline bool steal_enabled(void) return sched_feat(STEAL) && allow; }
+static inline bool group_steal_enabled(int steal_task) +{ + return group_steal_used() && is_tg_steal(steal_task); +} + static void overload_clear(struct rq *rq) { struct sparsemask *overload_cpus; unsigned long time; + bool need_clear = false;
if (!steal_enabled()) return;
+ if (!group_steal_used() && rq->cfs.h_nr_running >= 2) + return; + + if (group_steal_used() && + (rq->cfs.h_nr_running < 2 || rq->cfs.steal_h_nr_running == 0)) + need_clear = true; + + if (!need_clear) + return; + time = schedstat_start_time(); rcu_read_lock(); overload_cpus = rcu_dereference(rq->cfs_overload_cpus); @@ -4490,6 +4506,12 @@ static void overload_set(struct rq *rq) if (!steal_enabled()) return;
+ if (rq->cfs.h_nr_running < 2) + return; + + if (group_steal_used() && rq->cfs.steal_h_nr_running < 1) + return; + time = schedstat_start_time(); rcu_read_lock(); overload_cpus = rcu_dereference(rq->cfs_overload_cpus); @@ -5273,13 +5295,15 @@ static int tg_throttle_down(struct task_group *tg, void *data) static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) { struct rq *rq = rq_of(cfs_rq); - unsigned int prev_nr = rq->cfs.h_nr_running; struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); struct sched_entity *se; long task_delta, idle_task_delta, dequeue = 1; #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER long qos_idle_delta; #endif +#ifdef CONFIG_SCHED_STEAL + long steal_delta; +#endif
raw_spin_lock(&cfs_b->lock); /* This will start the period timer if necessary */ @@ -5314,6 +5338,9 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER qos_idle_delta = cfs_rq->qos_idle_h_nr_running; #endif +#ifdef CONFIG_SCHED_STEAL + steal_delta = cfs_rq->steal_h_nr_running; +#endif
for_each_sched_entity(se) { struct cfs_rq *qcfs_rq = cfs_rq_of(se); @@ -5333,6 +5360,9 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER qcfs_rq->qos_idle_h_nr_running -= qos_idle_delta; #endif +#ifdef CONFIG_SCHED_STEAL + qcfs_rq->steal_h_nr_running -= steal_delta; +#endif
if (qcfs_rq->load.weight) dequeue = 0; @@ -5340,8 +5370,9 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
if (!se) { sub_nr_running(rq, task_delta); - if (prev_nr >= 2 && prev_nr - task_delta < 2) - overload_clear(rq); +#ifdef CONFIG_SCHED_STEAL + overload_clear(rq); +#endif }
/* @@ -5356,13 +5387,15 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) { struct rq *rq = rq_of(cfs_rq); - unsigned int prev_nr = rq->cfs.h_nr_running; struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); struct sched_entity *se; long task_delta, idle_task_delta; #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER long qos_idle_delta; #endif +#ifdef CONFIG_SCHED_STEAL + long steal_delta; +#endif
se = cfs_rq->tg->se[cpu_of(rq)];
@@ -5394,6 +5427,10 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER qos_idle_delta = cfs_rq->qos_idle_h_nr_running; #endif +#ifdef CONFIG_SCHED_STEAL + steal_delta = cfs_rq->steal_h_nr_running; +#endif + for_each_sched_entity(se) { if (se->on_rq) break; @@ -5405,6 +5442,9 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER cfs_rq->qos_idle_h_nr_running += qos_idle_delta; #endif +#ifdef CONFIG_SCHED_STEAL + cfs_rq->steal_h_nr_running += steal_delta; +#endif
/* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(cfs_rq)) @@ -5422,6 +5462,9 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER cfs_rq->qos_idle_h_nr_running += qos_idle_delta; #endif +#ifdef CONFIG_SCHED_STEAL + cfs_rq->steal_h_nr_running += steal_delta; +#endif
/* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(cfs_rq)) @@ -5437,8 +5480,9 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
/* At this point se is NULL and we are at root level*/ add_nr_running(rq, task_delta); - if (prev_nr < 2 && prev_nr + task_delta >= 2) - overload_set(rq); +#ifdef CONFIG_SCHED_STEAL + overload_set(rq); +#endif
unthrottle_throttle: /* @@ -6527,8 +6571,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) int idle_h_nr_running = task_has_idle_policy(p);
int task_new = !(flags & ENQUEUE_WAKEUP); - unsigned int prev_nr = rq->cfs.h_nr_running; - +#ifdef CONFIG_SCHED_STEAL + bool tg_steal_enabled = group_steal_enabled(se->steal_task); +#endif #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER int qos_idle_h_nr_running;
@@ -6563,6 +6608,10 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER cfs_rq->qos_idle_h_nr_running += qos_idle_h_nr_running; #endif +#ifdef CONFIG_SCHED_STEAL + if (tg_steal_enabled) + cfs_rq->steal_h_nr_running++; +#endif
/* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(cfs_rq)) @@ -6583,6 +6632,10 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER cfs_rq->qos_idle_h_nr_running += qos_idle_h_nr_running; #endif +#ifdef CONFIG_SCHED_STEAL + if (tg_steal_enabled) + cfs_rq->steal_h_nr_running++; +#endif
/* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(cfs_rq)) @@ -6598,8 +6651,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
/* At this point se is NULL and we are at root level*/ add_nr_running(rq, 1); - if (prev_nr == 1) - overload_set(rq); +#ifdef CONFIG_SCHED_STEAL + overload_set(rq); +#endif
/* * Since new tasks are assigned an initial util_avg equal to @@ -6658,9 +6712,10 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) int task_sleep = flags & DEQUEUE_SLEEP; int idle_h_nr_running = task_has_idle_policy(p);
- unsigned int prev_nr = rq->cfs.h_nr_running; bool was_sched_idle = sched_idle_rq(rq); - +#ifdef CONFIG_SCHED_STEAL + bool tg_steal_enabled = group_steal_enabled(se->steal_task); +#endif #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER int qos_idle_h_nr_running = se->qos_idle ? 1 : 0;
@@ -6678,6 +6733,10 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER cfs_rq->qos_idle_h_nr_running -= qos_idle_h_nr_running; #endif +#ifdef CONFIG_SCHED_STEAL + if (tg_steal_enabled) + cfs_rq->steal_h_nr_running--; +#endif
/* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(cfs_rq)) @@ -6710,6 +6769,10 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER cfs_rq->qos_idle_h_nr_running -= qos_idle_h_nr_running; #endif +#ifdef CONFIG_SCHED_STEAL + if (tg_steal_enabled) + cfs_rq->steal_h_nr_running--; +#endif
/* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(cfs_rq)) @@ -6719,8 +6782,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
/* At this point se is NULL and we are at root level*/ sub_nr_running(rq, 1); - if (prev_nr == 2) - overload_clear(rq); +#ifdef CONFIG_SCHED_STEAL + overload_clear(rq); +#endif
/* balance early to pull high priority tasks */ if (unlikely(!was_sched_idle && sched_idle_rq(rq))) @@ -8494,10 +8558,12 @@ static void throttle_qos_cfs_rq(struct cfs_rq *cfs_rq) { struct rq *rq = rq_of(cfs_rq); struct sched_entity *se; - unsigned int prev_nr = cfs_rq->h_nr_running; long task_delta, idle_task_delta, dequeue = 1; #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER long qos_idle_delta; +#endif +#ifdef CONFIG_SCHED_STEAL + long steal_delta; #endif se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
@@ -8511,6 +8577,10 @@ static void throttle_qos_cfs_rq(struct cfs_rq *cfs_rq) #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER qos_idle_delta = cfs_rq->qos_idle_h_nr_running; #endif +#ifdef CONFIG_SCHED_STEAL + steal_delta = cfs_rq->steal_h_nr_running; +#endif + for_each_sched_entity(se) { struct cfs_rq *qcfs_rq = cfs_rq_of(se); /* throttled entity or throttle-on-deactivate */ @@ -8529,6 +8599,9 @@ static void throttle_qos_cfs_rq(struct cfs_rq *cfs_rq) #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER qcfs_rq->qos_idle_h_nr_running -= qos_idle_delta; #endif +#ifdef CONFIG_SCHED_STEAL + qcfs_rq->steal_h_nr_running -= steal_delta; +#endif
if (qcfs_rq->load.weight) dequeue = 0; @@ -8536,9 +8609,9 @@ static void throttle_qos_cfs_rq(struct cfs_rq *cfs_rq)
if (!se) { sub_nr_running(rq, task_delta); - if (prev_nr >= 2 && prev_nr - task_delta < 2) - overload_clear(rq); - +#ifdef CONFIG_SCHED_STEAL + overload_clear(rq); +#endif }
if (!qos_timer_is_activated(cpu_of(rq))) @@ -8554,11 +8627,13 @@ static void unthrottle_qos_cfs_rq(struct cfs_rq *cfs_rq) { struct rq *rq = rq_of(cfs_rq); struct sched_entity *se; - unsigned int prev_nr = cfs_rq->h_nr_running; long task_delta, idle_task_delta; #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER long qos_idle_delta; #endif +#ifdef CONFIG_SCHED_STEAL + long steal_delta; +#endif
se = cfs_rq->tg->se[cpu_of(rq)];
@@ -8583,6 +8658,10 @@ static void unthrottle_qos_cfs_rq(struct cfs_rq *cfs_rq) #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER qos_idle_delta = cfs_rq->qos_idle_h_nr_running; #endif +#ifdef CONFIG_SCHED_STEAL + steal_delta = cfs_rq->steal_h_nr_running; +#endif + for_each_sched_entity(se) { if (se->on_rq) break; @@ -8595,6 +8674,9 @@ static void unthrottle_qos_cfs_rq(struct cfs_rq *cfs_rq) #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER cfs_rq->qos_idle_h_nr_running += qos_idle_delta; #endif +#ifdef CONFIG_SCHED_STEAL + cfs_rq->steal_h_nr_running += steal_delta; +#endif
if (cfs_rq_throttled(cfs_rq)) goto unthrottle_throttle; @@ -8611,6 +8693,10 @@ static void unthrottle_qos_cfs_rq(struct cfs_rq *cfs_rq) #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER cfs_rq->qos_idle_h_nr_running += qos_idle_delta; #endif +#ifdef CONFIG_SCHED_STEAL + cfs_rq->steal_h_nr_running += steal_delta; +#endif + /* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(cfs_rq)) goto unthrottle_throttle; @@ -8624,8 +8710,9 @@ static void unthrottle_qos_cfs_rq(struct cfs_rq *cfs_rq) }
add_nr_running(rq, task_delta); - if (prev_nr < 2 && prev_nr + task_delta >= 2) - overload_set(rq); +#ifdef CONFIG_SCHED_STEAL + overload_set(rq); +#endif
unthrottle_throttle: /* @@ -9793,10 +9880,14 @@ static bool can_migrate_task_llc(struct task_struct *p, struct rq *rq, struct rq *dst_rq) { int dst_cpu = dst_rq->cpu; + struct task_group *tg = task_group(p);
lockdep_assert_rq_held(rq);
- if (throttled_lb_pair(task_group(p), cpu_of(rq), dst_cpu)) + if (group_steal_used() && !is_tg_steal(tg->steal_task)) + return false; + + if (throttled_lb_pair(tg, cpu_of(rq), dst_cpu)) return false;
if (!cpumask_test_cpu(dst_cpu, p->cpus_ptr)) { @@ -13071,10 +13162,14 @@ static int steal_from(struct rq *dst_rq, struct rq_flags *dst_rf, bool *locked, int stolen = 0; int dst_cpu = dst_rq->cpu; struct rq *src_rq = cpu_rq(src_cpu); + bool tg_used = group_steal_used();
if (dst_cpu == src_cpu || src_rq->cfs.h_nr_running < 2) return 0;
+ if (tg_used && src_rq->cfs.steal_h_nr_running < 1) + return 0; + if (*locked) { rq_unpin_lock(dst_rq, dst_rf); raw_spin_rq_unlock(dst_rq); @@ -13083,7 +13178,8 @@ static int steal_from(struct rq *dst_rq, struct rq_flags *dst_rf, bool *locked, rq_lock_irqsave(src_rq, &rf); update_rq_clock(src_rq);
- if (src_rq->cfs.h_nr_running < 2 || !cpu_active(src_cpu)) + if (!cpu_active(src_cpu) || src_rq->cfs.h_nr_running < 2 || + (tg_used && src_rq->cfs.steal_h_nr_running < 1)) p = NULL; else p = detach_next_task(&src_rq->cfs, dst_rq); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index ceea107a1dc8..65c1dac9cca1 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -722,12 +722,13 @@ struct cfs_rq { unsigned int forceidle_seq; KABI_FILL_HOLE(unsigned int kabi_hole) u64 min_vruntime_fi; -#elif defined CONFIG_QOS_SCHED_SMT_EXPELLER && !defined(__GENKSYMS__) +#elif (defined(CONFIG_QOS_SCHED_SMT_EXPELLER) || \ + defined(CONFIG_SCHED_STEAL)) && !defined(__GENKSYMS__) union { unsigned int qos_idle_h_nr_running; /* qos_level:-1 */ unsigned long qos_idle_h_nr_running_padding; }; - KABI_FILL_HOLE(unsigned long kabi_hole) + unsigned long steal_h_nr_running; #else KABI_RESERVE(3) KABI_RESERVE(4)
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/IAQWPQ
-----------------------------------------
When an rq holds a large number of cfs tasks but only a few of them are marked with steal_task, steal_from() may repeatedly scan long task lists and still fail to pull anything. To bound the cost of such failed attempts, limit how many tasks a single steal attempt may examine via sysctl_sched_max_steal_count. The default value is 32 and the maximum value is 1024.
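For example, the limit can be adjusted at run time through the new sysctl entry, within the 1..1024 range enforced by the table entry below:

  sysctl -w kernel.sched_max_steal_count=64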
Signed-off-by: Cheng Yu <serein.chengyu@huawei.com>
---
 include/linux/sched/sysctl.h |  4 ++++
 kernel/sched/fair.c          |  7 +++++++
 kernel/sysctl.c              | 14 ++++++++++++++
 3 files changed, 25 insertions(+)
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 09214349bddf..9f998be56bdd 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -39,6 +39,10 @@ extern int sysctl_sched_util_low_pct; extern int sysctl_sched_util_ratio; #endif
+#ifdef CONFIG_SCHED_STEAL +extern int sysctl_sched_max_steal_count; +#endif + #ifdef CONFIG_QOS_SCHED_SMART_GRID extern unsigned int sysctl_smart_grid_strategy_ctrl; extern int sysctl_affinity_adjust_delay_ms; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 836f1a7feb74..bcb51ab94df0 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -13124,6 +13124,7 @@ void trigger_load_balance(struct rq *rq) }
#ifdef CONFIG_SCHED_STEAL +int sysctl_sched_max_steal_count = 32; /* * Search the runnable tasks in @cfs_rq in order of next to run, and find * the first one that can be migrated to @dst_rq. @cfs_rq is locked on entry. @@ -13135,14 +13136,20 @@ detach_next_task(struct cfs_rq *cfs_rq, struct rq *dst_rq) int dst_cpu = dst_rq->cpu; struct task_struct *p; struct rq *rq = rq_of(cfs_rq); + int count = 1;
lockdep_assert_rq_held(rq_of(cfs_rq));
list_for_each_entry_reverse(p, &rq->cfs_tasks, se.group_node) { + if (count > sysctl_sched_max_steal_count) + break; + if (can_migrate_task_llc(p, rq, dst_rq)) { detach_task(p, rq, dst_cpu); return p; } + + count++; } return NULL; } diff --git a/kernel/sysctl.c b/kernel/sysctl.c index cc0d98c07c4c..3aac8e236d7d 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -131,6 +131,9 @@ static int hundred_thousand = 100000; #ifdef CONFIG_PERF_EVENTS static int six_hundred_forty_kb = 640 * 1024; #endif +#ifdef CONFIG_SCHED_STEAL +static int max_steal_count = 1024; +#endif
/* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */ static unsigned long dirty_bytes_min = 2 * PAGE_SIZE; @@ -2813,6 +2816,17 @@ static struct ctl_table kern_table[] = { .extra1 = SYSCTL_ZERO, }, #endif +#ifdef CONFIG_SCHED_STEAL + { + .procname = "sched_max_steal_count", + .data = &sysctl_sched_max_steal_count, + .maxlen = sizeof(sysctl_sched_max_steal_count), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ONE, + .extra2 = &max_steal_count, + }, +#endif #ifdef CONFIG_QOS_SCHED_SMART_GRID { .procname = "smart_grid_strategy_ctrl",
From: Zheng Zucheng <zhengzucheng@huawei.com>
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/IAQWPQ
-----------------------------------------
Add h_nr_running and steal_h_nr_running fields to /proc/sched_debug.
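After this change, each cfs_rq section in /proc/sched_debug shows two additional lines when CONFIG_SCHED_STEAL is enabled; the values below are illustrative:

  .h_nr_running                  : 3
  .steal_h_nr_running            : 1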
Signed-off-by: Zheng Zucheng <zhengzucheng@huawei.com>
---
 kernel/sched/debug.c | 5 +++++
 1 file changed, 5 insertions(+)
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 5233ba9fdc69..ff2d6dc59c14 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -594,6 +594,11 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over", cfs_rq->nr_spread_over); SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running); +#ifdef CONFIG_SCHED_STEAL + SEQ_printf(m, " .%-30s: %d\n", "h_nr_running", cfs_rq->h_nr_running); + SEQ_printf(m, " .%-30s: %ld\n", "steal_h_nr_running", + cfs_rq->steal_h_nr_running); +#endif SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); #ifdef CONFIG_SMP SEQ_printf(m, " .%-30s: %lu\n", "load_avg",
From: Zheng Zucheng <zhengzucheng@huawei.com>
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/IAQWPQ
-----------------------------------------
Concurrent writes to cpu.steal_task could walk the task_group tree and update steal_task at the same time. Add a mutex to serialize these writes.
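As the hunk below shows, the mutex is taken before rcu_read_lock() and released after rcu_read_unlock(), so the task_group tree is still walked under RCU while no sleeping lock is acquired inside the read-side critical section. This avoids the problem that forced the qos_level mutex revert at the start of this series.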
Signed-off-by: Zheng Zucheng <zhengzucheng@huawei.com>
---
 kernel/sched/core.c | 6 ++++++
 1 file changed, 6 insertions(+)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 900637a6ac09..b734d561c43b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -9810,6 +9810,8 @@ static inline s64 cpu_qos_read(struct cgroup_subsys_state *css, #endif
#ifdef CONFIG_SCHED_STEAL +static DEFINE_MUTEX(steal_mutex); + static inline s64 cpu_steal_task_read(struct cgroup_subsys_state *css, struct cftype *cft) { @@ -9876,10 +9878,14 @@ static int cpu_steal_task_write(struct cgroup_subsys_state *css, if (steal_task < TG_STEAL_NO || steal_task > TG_STEAL) return -EINVAL;
+ mutex_lock(&steal_mutex); + rcu_read_lock(); walk_tg_tree_from(tg, tg_change_steal, tg_nop, (void *)(&steal_task)); rcu_read_unlock();
+ mutex_unlock(&steal_mutex); + return 0; } #endif
FeedBack: The patch(es) which you have sent to kernel@openeuler.org mailing list has been converted to a pull request successfully! Pull request link: https://gitee.com/openeuler/kernel/pulls/12006 Mailing list address: https://mailweb.openeuler.org/hyperkitty/list/kernel@openeuler.org/message/N...