hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/IAQWPQ
-----------------------------------------
Assume we have a cpu cgroup named "test" with cpu.steal_task set to 1; the tasks in that cgroup are then said to be marked with steal_task.
When a cpu's rq has at least 2 cfs tasks and at least 1 of them is marked with steal_task, the cpu is considered overloaded.
Before a cpu enters idle, it tries to pull tasks from busy cpus through idle balance. If idle balance fails to pull any task, task stealing is triggered and the idle cpu pulls a task from an overloaded cpu.
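
As a rough illustration (not part of the patch), the overload decision described above boils down to the user-space sketch below. struct rq_model and cpu_overloaded() are made-up names for illustration only; in the patch the real checks live in overload_set()/overload_clear() and operate on rq->cfs.h_nr_running and rq->cfs.steal_h_nr_running.

    #include <stdbool.h>
    #include <stdio.h>

    /* Simplified, illustrative model of one cpu's runqueue state. */
    struct rq_model {
            unsigned int h_nr_running;        /* all cfs tasks on the rq */
            unsigned long steal_h_nr_running; /* tasks whose cgroup sets cpu.steal_task=1 */
    };

    /*
     * A cpu is treated as overloaded (a candidate to be stolen from)
     * when it runs at least two cfs tasks and at least one of them is
     * marked with steal_task.
     */
    static bool cpu_overloaded(const struct rq_model *rq)
    {
            return rq->h_nr_running >= 2 && rq->steal_h_nr_running >= 1;
    }

    int main(void)
    {
            struct rq_model rq = { .h_nr_running = 3, .steal_h_nr_running = 1 };

            printf("overloaded: %s\n", cpu_overloaded(&rq) ? "yes" : "no");
            return 0;
    }

In the patch itself this condition is guarded by CONFIG_SCHED_STEAL and the STEAL sched feature, and is evaluated inside overload_set()/overload_clear() rather than in a standalone helper.
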
Signed-off-by: Cheng Yu <serein.chengyu@huawei.com>
---
 kernel/sched/fair.c  | 142 ++++++++++++++++++++++++++++++++++++-------
 kernel/sched/sched.h |   5 +-
 2 files changed, 122 insertions(+), 25 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 5b737961e327..836f1a7feb74 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4465,14 +4465,30 @@ static inline bool steal_enabled(void)
         return sched_feat(STEAL) && allow;
 }
 
+static inline bool group_steal_enabled(int steal_task)
+{
+        return group_steal_used() && is_tg_steal(steal_task);
+}
+
 static void overload_clear(struct rq *rq)
 {
         struct sparsemask *overload_cpus;
         unsigned long time;
+        bool need_clear = false;
 
         if (!steal_enabled())
                 return;
 
+        if (!group_steal_used() && rq->cfs.h_nr_running >= 2)
+                return;
+
+        if (group_steal_used() &&
+            (rq->cfs.h_nr_running < 2 || rq->cfs.steal_h_nr_running == 0))
+                need_clear = true;
+
+        if (!need_clear)
+                return;
+
         time = schedstat_start_time();
         rcu_read_lock();
         overload_cpus = rcu_dereference(rq->cfs_overload_cpus);
@@ -4490,6 +4506,12 @@ static void overload_set(struct rq *rq)
         if (!steal_enabled())
                 return;
 
+        if (rq->cfs.h_nr_running < 2)
+                return;
+
+        if (group_steal_used() && rq->cfs.steal_h_nr_running < 1)
+                return;
+
         time = schedstat_start_time();
         rcu_read_lock();
         overload_cpus = rcu_dereference(rq->cfs_overload_cpus);
@@ -5273,13 +5295,15 @@ static int tg_throttle_down(struct task_group *tg, void *data)
 static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
 {
         struct rq *rq = rq_of(cfs_rq);
-        unsigned int prev_nr = rq->cfs.h_nr_running;
         struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
         struct sched_entity *se;
         long task_delta, idle_task_delta, dequeue = 1;
 #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
         long qos_idle_delta;
 #endif
+#ifdef CONFIG_SCHED_STEAL
+        long steal_delta;
+#endif
 
         raw_spin_lock(&cfs_b->lock);
         /* This will start the period timer if necessary */
@@ -5314,6 +5338,9 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
 #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
         qos_idle_delta = cfs_rq->qos_idle_h_nr_running;
 #endif
+#ifdef CONFIG_SCHED_STEAL
+        steal_delta = cfs_rq->steal_h_nr_running;
+#endif
 
         for_each_sched_entity(se) {
                 struct cfs_rq *qcfs_rq = cfs_rq_of(se);
@@ -5333,6 +5360,9 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
 #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
                 qcfs_rq->qos_idle_h_nr_running -= qos_idle_delta;
 #endif
+#ifdef CONFIG_SCHED_STEAL
+                qcfs_rq->steal_h_nr_running -= steal_delta;
+#endif
 
                 if (qcfs_rq->load.weight)
                         dequeue = 0;
@@ -5340,8 +5370,9 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
 
         if (!se) {
                 sub_nr_running(rq, task_delta);
-                if (prev_nr >= 2 && prev_nr - task_delta < 2)
-                        overload_clear(rq);
+#ifdef CONFIG_SCHED_STEAL
+                overload_clear(rq);
+#endif
         }
 
         /*
@@ -5356,13 +5387,15 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
 void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
 {
         struct rq *rq = rq_of(cfs_rq);
-        unsigned int prev_nr = rq->cfs.h_nr_running;
         struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
         struct sched_entity *se;
         long task_delta, idle_task_delta;
 #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
         long qos_idle_delta;
 #endif
+#ifdef CONFIG_SCHED_STEAL
+        long steal_delta;
+#endif
 
         se = cfs_rq->tg->se[cpu_of(rq)];
 
@@ -5394,6 +5427,10 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
 #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
         qos_idle_delta = cfs_rq->qos_idle_h_nr_running;
 #endif
+#ifdef CONFIG_SCHED_STEAL
+        steal_delta = cfs_rq->steal_h_nr_running;
+#endif
+
         for_each_sched_entity(se) {
                 if (se->on_rq)
                         break;
@@ -5405,6 +5442,9 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
 #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
                 cfs_rq->qos_idle_h_nr_running += qos_idle_delta;
 #endif
+#ifdef CONFIG_SCHED_STEAL
+                cfs_rq->steal_h_nr_running += steal_delta;
+#endif
 
                 /* end evaluation on encountering a throttled cfs_rq */
                 if (cfs_rq_throttled(cfs_rq))
@@ -5422,6 +5462,9 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
 #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
                 cfs_rq->qos_idle_h_nr_running += qos_idle_delta;
 #endif
+#ifdef CONFIG_SCHED_STEAL
+                cfs_rq->steal_h_nr_running += steal_delta;
+#endif
 
                 /* end evaluation on encountering a throttled cfs_rq */
                 if (cfs_rq_throttled(cfs_rq))
@@ -5437,8 +5480,9 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
 
         /* At this point se is NULL and we are at root level*/
         add_nr_running(rq, task_delta);
-        if (prev_nr < 2 && prev_nr + task_delta >= 2)
-                overload_set(rq);
+#ifdef CONFIG_SCHED_STEAL
+        overload_set(rq);
+#endif
 
 unthrottle_throttle:
         /*
@@ -6527,8 +6571,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
         int idle_h_nr_running = task_has_idle_policy(p);
         int task_new = !(flags & ENQUEUE_WAKEUP);
-        unsigned int prev_nr = rq->cfs.h_nr_running;
-
+#ifdef CONFIG_SCHED_STEAL
+        bool tg_steal_enabled = group_steal_enabled(se->steal_task);
+#endif
 #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
         int qos_idle_h_nr_running;
 
@@ -6563,6 +6608,10 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
                 cfs_rq->qos_idle_h_nr_running += qos_idle_h_nr_running;
 #endif
+#ifdef CONFIG_SCHED_STEAL
+                if (tg_steal_enabled)
+                        cfs_rq->steal_h_nr_running++;
+#endif
 
                 /* end evaluation on encountering a throttled cfs_rq */
                 if (cfs_rq_throttled(cfs_rq))
@@ -6583,6 +6632,10 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
                 cfs_rq->qos_idle_h_nr_running += qos_idle_h_nr_running;
 #endif
+#ifdef CONFIG_SCHED_STEAL
+                if (tg_steal_enabled)
+                        cfs_rq->steal_h_nr_running++;
+#endif
 
                 /* end evaluation on encountering a throttled cfs_rq */
                 if (cfs_rq_throttled(cfs_rq))
@@ -6598,8 +6651,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 
         /* At this point se is NULL and we are at root level*/
         add_nr_running(rq, 1);
-        if (prev_nr == 1)
-                overload_set(rq);
+#ifdef CONFIG_SCHED_STEAL
+        overload_set(rq);
+#endif
 
         /*
          * Since new tasks are assigned an initial util_avg equal to
@@ -6658,9 +6712,10 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
         int task_sleep = flags & DEQUEUE_SLEEP;
         int idle_h_nr_running = task_has_idle_policy(p);
-        unsigned int prev_nr = rq->cfs.h_nr_running;
         bool was_sched_idle = sched_idle_rq(rq);
-
+#ifdef CONFIG_SCHED_STEAL
+        bool tg_steal_enabled = group_steal_enabled(se->steal_task);
+#endif
 #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
         int qos_idle_h_nr_running = se->qos_idle ? 1 : 0;
 
@@ -6678,6 +6733,10 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
                 cfs_rq->qos_idle_h_nr_running -= qos_idle_h_nr_running;
 #endif
+#ifdef CONFIG_SCHED_STEAL
+                if (tg_steal_enabled)
+                        cfs_rq->steal_h_nr_running--;
+#endif
 
                 /* end evaluation on encountering a throttled cfs_rq */
                 if (cfs_rq_throttled(cfs_rq))
@@ -6710,6 +6769,10 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
                 cfs_rq->qos_idle_h_nr_running -= qos_idle_h_nr_running;
 #endif
+#ifdef CONFIG_SCHED_STEAL
+                if (tg_steal_enabled)
+                        cfs_rq->steal_h_nr_running--;
+#endif
 
                 /* end evaluation on encountering a throttled cfs_rq */
                 if (cfs_rq_throttled(cfs_rq))
@@ -6719,8 +6782,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 
         /* At this point se is NULL and we are at root level*/
         sub_nr_running(rq, 1);
-        if (prev_nr == 2)
-                overload_clear(rq);
+#ifdef CONFIG_SCHED_STEAL
+        overload_clear(rq);
+#endif
 
         /* balance early to pull high priority tasks */
         if (unlikely(!was_sched_idle && sched_idle_rq(rq)))
@@ -8494,10 +8558,12 @@ static void throttle_qos_cfs_rq(struct cfs_rq *cfs_rq)
 {
         struct rq *rq = rq_of(cfs_rq);
         struct sched_entity *se;
-        unsigned int prev_nr = cfs_rq->h_nr_running;
         long task_delta, idle_task_delta, dequeue = 1;
 #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
         long qos_idle_delta;
+#endif
+#ifdef CONFIG_SCHED_STEAL
+        long steal_delta;
 #endif
 
         se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
 
@@ -8511,6 +8577,10 @@ static void throttle_qos_cfs_rq(struct cfs_rq *cfs_rq)
 #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
         qos_idle_delta = cfs_rq->qos_idle_h_nr_running;
 #endif
+#ifdef CONFIG_SCHED_STEAL
+        steal_delta = cfs_rq->steal_h_nr_running;
+#endif
+
         for_each_sched_entity(se) {
                 struct cfs_rq *qcfs_rq = cfs_rq_of(se);
                 /* throttled entity or throttle-on-deactivate */
@@ -8529,6 +8599,9 @@ static void throttle_qos_cfs_rq(struct cfs_rq *cfs_rq)
 #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
                 qcfs_rq->qos_idle_h_nr_running -= qos_idle_delta;
 #endif
+#ifdef CONFIG_SCHED_STEAL
+                qcfs_rq->steal_h_nr_running -= steal_delta;
+#endif
 
                 if (qcfs_rq->load.weight)
                         dequeue = 0;
@@ -8536,9 +8609,9 @@ static void throttle_qos_cfs_rq(struct cfs_rq *cfs_rq)
 
         if (!se) {
                 sub_nr_running(rq, task_delta);
-                if (prev_nr >= 2 && prev_nr - task_delta < 2)
-                        overload_clear(rq);
-
+#ifdef CONFIG_SCHED_STEAL
+                overload_clear(rq);
+#endif
         }
 
         if (!qos_timer_is_activated(cpu_of(rq)))
@@ -8554,11 +8627,13 @@ static void unthrottle_qos_cfs_rq(struct cfs_rq *cfs_rq)
 {
         struct rq *rq = rq_of(cfs_rq);
         struct sched_entity *se;
-        unsigned int prev_nr = cfs_rq->h_nr_running;
         long task_delta, idle_task_delta;
 #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
         long qos_idle_delta;
 #endif
+#ifdef CONFIG_SCHED_STEAL
+        long steal_delta;
+#endif
 
         se = cfs_rq->tg->se[cpu_of(rq)];
 
@@ -8583,6 +8658,10 @@ static void unthrottle_qos_cfs_rq(struct cfs_rq *cfs_rq)
 #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
         qos_idle_delta = cfs_rq->qos_idle_h_nr_running;
 #endif
+#ifdef CONFIG_SCHED_STEAL
+        steal_delta = cfs_rq->steal_h_nr_running;
+#endif
+
         for_each_sched_entity(se) {
                 if (se->on_rq)
                         break;
@@ -8595,6 +8674,9 @@ static void unthrottle_qos_cfs_rq(struct cfs_rq *cfs_rq)
 #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
                 cfs_rq->qos_idle_h_nr_running += qos_idle_delta;
 #endif
+#ifdef CONFIG_SCHED_STEAL
+                cfs_rq->steal_h_nr_running += steal_delta;
+#endif
 
                 if (cfs_rq_throttled(cfs_rq))
                         goto unthrottle_throttle;
@@ -8611,6 +8693,10 @@ static void unthrottle_qos_cfs_rq(struct cfs_rq *cfs_rq)
 #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
                 cfs_rq->qos_idle_h_nr_running += qos_idle_delta;
 #endif
+#ifdef CONFIG_SCHED_STEAL
+                cfs_rq->steal_h_nr_running += steal_delta;
+#endif
+
                 /* end evaluation on encountering a throttled cfs_rq */
                 if (cfs_rq_throttled(cfs_rq))
                         goto unthrottle_throttle;
@@ -8624,8 +8710,9 @@ static void unthrottle_qos_cfs_rq(struct cfs_rq *cfs_rq)
         }
 
         add_nr_running(rq, task_delta);
-        if (prev_nr < 2 && prev_nr + task_delta >= 2)
-                overload_set(rq);
+#ifdef CONFIG_SCHED_STEAL
+        overload_set(rq);
+#endif
 
 unthrottle_throttle:
         /*
@@ -9793,10 +9880,14 @@ static bool can_migrate_task_llc(struct task_struct *p, struct rq *rq,
                                  struct rq *dst_rq)
 {
         int dst_cpu = dst_rq->cpu;
+        struct task_group *tg = task_group(p);
 
         lockdep_assert_rq_held(rq);
 
-        if (throttled_lb_pair(task_group(p), cpu_of(rq), dst_cpu))
+        if (group_steal_used() && !is_tg_steal(tg->steal_task))
+                return false;
+
+        if (throttled_lb_pair(tg, cpu_of(rq), dst_cpu))
                 return false;
 
         if (!cpumask_test_cpu(dst_cpu, p->cpus_ptr)) {
@@ -13071,10 +13162,14 @@ static int steal_from(struct rq *dst_rq, struct rq_flags *dst_rf, bool *locked,
         int stolen = 0;
         int dst_cpu = dst_rq->cpu;
         struct rq *src_rq = cpu_rq(src_cpu);
+        bool tg_used = group_steal_used();
 
         if (dst_cpu == src_cpu || src_rq->cfs.h_nr_running < 2)
                 return 0;
 
+        if (tg_used && src_rq->cfs.steal_h_nr_running < 1)
+                return 0;
+
         if (*locked) {
                 rq_unpin_lock(dst_rq, dst_rf);
                 raw_spin_rq_unlock(dst_rq);
@@ -13083,7 +13178,8 @@ static int steal_from(struct rq *dst_rq, struct rq_flags *dst_rf, bool *locked,
         rq_lock_irqsave(src_rq, &rf);
         update_rq_clock(src_rq);
 
-        if (src_rq->cfs.h_nr_running < 2 || !cpu_active(src_cpu))
+        if (!cpu_active(src_cpu) || src_rq->cfs.h_nr_running < 2 ||
+            (tg_used && src_rq->cfs.steal_h_nr_running < 1))
                 p = NULL;
         else
                 p = detach_next_task(&src_rq->cfs, dst_rq);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index ceea107a1dc8..65c1dac9cca1 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -722,12 +722,13 @@ struct cfs_rq {
         unsigned int            forceidle_seq;
         KABI_FILL_HOLE(unsigned int kabi_hole)
         u64                     min_vruntime_fi;
-#elif defined CONFIG_QOS_SCHED_SMT_EXPELLER && !defined(__GENKSYMS__)
+#elif (defined(CONFIG_QOS_SCHED_SMT_EXPELLER) || \
+        defined(CONFIG_SCHED_STEAL)) && !defined(__GENKSYMS__)
         union {
                 unsigned int            qos_idle_h_nr_running; /* qos_level:-1 */
                 unsigned long           qos_idle_h_nr_running_padding;
         };
-        KABI_FILL_HOLE(unsigned long kabi_hole)
+        unsigned long           steal_h_nr_running;
 #else
         KABI_RESERVE(3)
         KABI_RESERVE(4)