hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/IAQWPQ
-----------------------------------------
Assume we have a cpu cgroup named "test" with cpu.steal_task set to 1; the tasks in that cgroup are then said to be marked with steal_task.
When a cpu's rq has at least 2 cfs tasks and at least 1 of them is marked with steal_task, the cpu is considered overloaded.
Before a cpu enters idle, it tries to pull tasks from busy cpus through idle balance. If idle balance fails to pull any task, task stealing is triggered and the idle cpu pulls a task from an overloaded cpu.
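
As a rough illustration (not part of the patch), the overload decision described above boils down to the user-space sketch below. struct rq_model and cpu_overloaded() are made-up names for illustration only; in the patch the real checks live in overload_set()/overload_clear() and operate on rq->cfs.h_nr_running and rq->cfs.steal_h_nr_running.

    #include <stdbool.h>
    #include <stdio.h>

    /* Simplified, illustrative model of one cpu's runqueue state. */
    struct rq_model {
            unsigned int h_nr_running;        /* all cfs tasks on the rq */
            unsigned long steal_h_nr_running; /* tasks whose cgroup sets cpu.steal_task=1 */
    };

    /*
     * A cpu is treated as overloaded (a candidate to be stolen from)
     * when it runs at least two cfs tasks and at least one of them is
     * marked with steal_task.
     */
    static bool cpu_overloaded(const struct rq_model *rq)
    {
            return rq->h_nr_running >= 2 && rq->steal_h_nr_running >= 1;
    }

    int main(void)
    {
            struct rq_model rq = { .h_nr_running = 3, .steal_h_nr_running = 1 };

            printf("overloaded: %s\n", cpu_overloaded(&rq) ? "yes" : "no");
            return 0;
    }

In the patch itself this condition is guarded by CONFIG_SCHED_STEAL and the STEAL sched feature, and is evaluated inside overload_set()/overload_clear() rather than in a standalone helper.
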
Signed-off-by: Cheng Yu <serein.chengyu@huawei.com>
---
 kernel/sched/fair.c  | 142 ++++++++++++++++++++++++++++++++++++-------
 kernel/sched/sched.h |   5 +-
 2 files changed, 122 insertions(+), 25 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 5b737961e327..836f1a7feb74 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4465,14 +4465,30 @@ static inline bool steal_enabled(void)
         return sched_feat(STEAL) && allow;
 }
 
+static inline bool group_steal_enabled(int steal_task)
+{
+        return group_steal_used() && is_tg_steal(steal_task);
+}
+
 static void overload_clear(struct rq *rq)
 {
         struct sparsemask *overload_cpus;
         unsigned long time;
+        bool need_clear = false;
 
         if (!steal_enabled())
                 return;
 
+        if (!group_steal_used() && rq->cfs.h_nr_running >= 2)
+                return;
+
+        if (group_steal_used() &&
+            (rq->cfs.h_nr_running < 2 || rq->cfs.steal_h_nr_running == 0))
+                need_clear = true;
+
+        if (!need_clear)
+                return;
+
         time = schedstat_start_time();
         rcu_read_lock();
         overload_cpus = rcu_dereference(rq->cfs_overload_cpus);
@@ -4490,6 +4506,12 @@ static void overload_set(struct rq *rq)
         if (!steal_enabled())
                 return;
 
+        if (rq->cfs.h_nr_running < 2)
+                return;
+
+        if (group_steal_used() && rq->cfs.steal_h_nr_running < 1)
+                return;
+
         time = schedstat_start_time();
         rcu_read_lock();
         overload_cpus = rcu_dereference(rq->cfs_overload_cpus);
@@ -5273,13 +5295,15 @@ static int tg_throttle_down(struct task_group *tg, void *data)
 static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
 {
         struct rq *rq = rq_of(cfs_rq);
-        unsigned int prev_nr = rq->cfs.h_nr_running;
         struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
         struct sched_entity *se;
         long task_delta, idle_task_delta, dequeue = 1;
 #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
         long qos_idle_delta;
 #endif
+#ifdef CONFIG_SCHED_STEAL
+        long steal_delta;
+#endif
 
         raw_spin_lock(&cfs_b->lock);
         /* This will start the period timer if necessary */
@@ -5314,6 +5338,9 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
 #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
         qos_idle_delta = cfs_rq->qos_idle_h_nr_running;
 #endif
+#ifdef CONFIG_SCHED_STEAL
+        steal_delta = cfs_rq->steal_h_nr_running;
+#endif
 
         for_each_sched_entity(se) {
                 struct cfs_rq *qcfs_rq = cfs_rq_of(se);
@@ -5333,6 +5360,9 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
 #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
                 qcfs_rq->qos_idle_h_nr_running -= qos_idle_delta;
 #endif
+#ifdef CONFIG_SCHED_STEAL
+                qcfs_rq->steal_h_nr_running -= steal_delta;
+#endif
 
                 if (qcfs_rq->load.weight)
                         dequeue = 0;
@@ -5340,8 +5370,9 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
 
         if (!se) {
                 sub_nr_running(rq, task_delta);
-                if (prev_nr >= 2 && prev_nr - task_delta < 2)
-                        overload_clear(rq);
+#ifdef CONFIG_SCHED_STEAL
+                overload_clear(rq);
+#endif
         }
 
         /*
@@ -5356,13 +5387,15 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
 void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
 {
         struct rq *rq = rq_of(cfs_rq);
-        unsigned int prev_nr = rq->cfs.h_nr_running;
         struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
         struct sched_entity *se;
         long task_delta, idle_task_delta;
 #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
         long qos_idle_delta;
 #endif
+#ifdef CONFIG_SCHED_STEAL
+        long steal_delta;
+#endif
 
         se = cfs_rq->tg->se[cpu_of(rq)];
 
@@ -5394,6 +5427,10 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
 #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
         qos_idle_delta = cfs_rq->qos_idle_h_nr_running;
 #endif
+#ifdef CONFIG_SCHED_STEAL
+        steal_delta = cfs_rq->steal_h_nr_running;
+#endif
+
         for_each_sched_entity(se) {
                 if (se->on_rq)
                         break;
@@ -5405,6 +5442,9 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
 #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
                 cfs_rq->qos_idle_h_nr_running += qos_idle_delta;
 #endif
+#ifdef CONFIG_SCHED_STEAL
+                cfs_rq->steal_h_nr_running += steal_delta;
+#endif
 
                 /* end evaluation on encountering a throttled cfs_rq */
                 if (cfs_rq_throttled(cfs_rq))
@@ -5422,6 +5462,9 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
 #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
                 cfs_rq->qos_idle_h_nr_running += qos_idle_delta;
 #endif
+#ifdef CONFIG_SCHED_STEAL
+                cfs_rq->steal_h_nr_running += steal_delta;
+#endif
 
                 /* end evaluation on encountering a throttled cfs_rq */
                 if (cfs_rq_throttled(cfs_rq))
@@ -5437,8 +5480,9 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
 
         /* At this point se is NULL and we are at root level*/
         add_nr_running(rq, task_delta);
-        if (prev_nr < 2 && prev_nr + task_delta >= 2)
-                overload_set(rq);
+#ifdef CONFIG_SCHED_STEAL
+        overload_set(rq);
+#endif
 
 unthrottle_throttle:
         /*
@@ -6527,8 +6571,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
         int idle_h_nr_running = task_has_idle_policy(p);
         int task_new = !(flags & ENQUEUE_WAKEUP);
-        unsigned int prev_nr = rq->cfs.h_nr_running;
-
+#ifdef CONFIG_SCHED_STEAL
+        bool tg_steal_enabled = group_steal_enabled(se->steal_task);
+#endif
 #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
         int qos_idle_h_nr_running;
 
@@ -6563,6 +6608,10 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
                 cfs_rq->qos_idle_h_nr_running += qos_idle_h_nr_running;
 #endif
+#ifdef CONFIG_SCHED_STEAL
+                if (tg_steal_enabled)
+                        cfs_rq->steal_h_nr_running++;
+#endif
 
                 /* end evaluation on encountering a throttled cfs_rq */
                 if (cfs_rq_throttled(cfs_rq))
@@ -6583,6 +6632,10 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
                 cfs_rq->qos_idle_h_nr_running += qos_idle_h_nr_running;
 #endif
+#ifdef CONFIG_SCHED_STEAL
+                if (tg_steal_enabled)
+                        cfs_rq->steal_h_nr_running++;
+#endif
 
                 /* end evaluation on encountering a throttled cfs_rq */
                 if (cfs_rq_throttled(cfs_rq))
@@ -6598,8 +6651,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 
         /* At this point se is NULL and we are at root level*/
         add_nr_running(rq, 1);
-        if (prev_nr == 1)
-                overload_set(rq);
+#ifdef CONFIG_SCHED_STEAL
+        overload_set(rq);
+#endif
 
         /*
          * Since new tasks are assigned an initial util_avg equal to
@@ -6658,9 +6712,10 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
         int task_sleep = flags & DEQUEUE_SLEEP;
         int idle_h_nr_running = task_has_idle_policy(p);
-        unsigned int prev_nr = rq->cfs.h_nr_running;
         bool was_sched_idle = sched_idle_rq(rq);
-
+#ifdef CONFIG_SCHED_STEAL
+        bool tg_steal_enabled = group_steal_enabled(se->steal_task);
+#endif
 #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
         int qos_idle_h_nr_running = se->qos_idle ? 1 : 0;
 
@@ -6678,6 +6733,10 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
                 cfs_rq->qos_idle_h_nr_running -= qos_idle_h_nr_running;
 #endif
+#ifdef CONFIG_SCHED_STEAL
+                if (tg_steal_enabled)
+                        cfs_rq->steal_h_nr_running--;
+#endif
 
                 /* end evaluation on encountering a throttled cfs_rq */
                 if (cfs_rq_throttled(cfs_rq))
@@ -6710,6 +6769,10 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
                 cfs_rq->qos_idle_h_nr_running -= qos_idle_h_nr_running;
 #endif
+#ifdef CONFIG_SCHED_STEAL
+                if (tg_steal_enabled)
+                        cfs_rq->steal_h_nr_running--;
+#endif
 
                 /* end evaluation on encountering a throttled cfs_rq */
                 if (cfs_rq_throttled(cfs_rq))
@@ -6719,8 +6782,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 
         /* At this point se is NULL and we are at root level*/
         sub_nr_running(rq, 1);
-        if (prev_nr == 2)
-                overload_clear(rq);
+#ifdef CONFIG_SCHED_STEAL
+        overload_clear(rq);
+#endif
 
         /* balance early to pull high priority tasks */
         if (unlikely(!was_sched_idle && sched_idle_rq(rq)))
@@ -8494,10 +8558,12 @@ static void throttle_qos_cfs_rq(struct cfs_rq *cfs_rq)
 {
         struct rq *rq = rq_of(cfs_rq);
         struct sched_entity *se;
-        unsigned int prev_nr = cfs_rq->h_nr_running;
         long task_delta, idle_task_delta, dequeue = 1;
 #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
         long qos_idle_delta;
+#endif
+#ifdef CONFIG_SCHED_STEAL
+        long steal_delta;
 #endif
 
         se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
 
@@ -8511,6 +8577,10 @@ static void throttle_qos_cfs_rq(struct cfs_rq *cfs_rq)
 #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
         qos_idle_delta = cfs_rq->qos_idle_h_nr_running;
 #endif
+#ifdef CONFIG_SCHED_STEAL
+        steal_delta = cfs_rq->steal_h_nr_running;
+#endif
+
         for_each_sched_entity(se) {
                 struct cfs_rq *qcfs_rq = cfs_rq_of(se);
                 /* throttled entity or throttle-on-deactivate */
@@ -8529,6 +8599,9 @@ static void throttle_qos_cfs_rq(struct cfs_rq *cfs_rq)
 #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
                 qcfs_rq->qos_idle_h_nr_running -= qos_idle_delta;
 #endif
+#ifdef CONFIG_SCHED_STEAL
+                qcfs_rq->steal_h_nr_running -= steal_delta;
+#endif
 
                 if (qcfs_rq->load.weight)
                         dequeue = 0;
@@ -8536,9 +8609,9 @@ static void throttle_qos_cfs_rq(struct cfs_rq *cfs_rq)
 
         if (!se) {
                 sub_nr_running(rq, task_delta);
-                if (prev_nr >= 2 && prev_nr - task_delta < 2)
-                        overload_clear(rq);
-
+#ifdef CONFIG_SCHED_STEAL
+                overload_clear(rq);
+#endif
         }
 
         if (!qos_timer_is_activated(cpu_of(rq)))
@@ -8554,11 +8627,13 @@ static void unthrottle_qos_cfs_rq(struct cfs_rq *cfs_rq)
 {
         struct rq *rq = rq_of(cfs_rq);
         struct sched_entity *se;
-        unsigned int prev_nr = cfs_rq->h_nr_running;
         long task_delta, idle_task_delta;
 #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
         long qos_idle_delta;
 #endif
+#ifdef CONFIG_SCHED_STEAL
+        long steal_delta;
+#endif
 
         se = cfs_rq->tg->se[cpu_of(rq)];
 
@@ -8583,6 +8658,10 @@ static void unthrottle_qos_cfs_rq(struct cfs_rq *cfs_rq)
 #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
         qos_idle_delta = cfs_rq->qos_idle_h_nr_running;
 #endif
+#ifdef CONFIG_SCHED_STEAL
+        steal_delta = cfs_rq->steal_h_nr_running;
+#endif
+
         for_each_sched_entity(se) {
                 if (se->on_rq)
                         break;
@@ -8595,6 +8674,9 @@ static void unthrottle_qos_cfs_rq(struct cfs_rq *cfs_rq)
 #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
                 cfs_rq->qos_idle_h_nr_running += qos_idle_delta;
 #endif
+#ifdef CONFIG_SCHED_STEAL
+                cfs_rq->steal_h_nr_running += steal_delta;
+#endif
 
                 if (cfs_rq_throttled(cfs_rq))
                         goto unthrottle_throttle;
@@ -8611,6 +8693,10 @@ static void unthrottle_qos_cfs_rq(struct cfs_rq *cfs_rq)
 #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
                 cfs_rq->qos_idle_h_nr_running += qos_idle_delta;
 #endif
+#ifdef CONFIG_SCHED_STEAL
+                cfs_rq->steal_h_nr_running += steal_delta;
+#endif
+
                 /* end evaluation on encountering a throttled cfs_rq */
                 if (cfs_rq_throttled(cfs_rq))
                         goto unthrottle_throttle;
@@ -8624,8 +8710,9 @@ static void unthrottle_qos_cfs_rq(struct cfs_rq *cfs_rq)
         }
 
         add_nr_running(rq, task_delta);
-        if (prev_nr < 2 && prev_nr + task_delta >= 2)
-                overload_set(rq);
+#ifdef CONFIG_SCHED_STEAL
+        overload_set(rq);
+#endif
 
 unthrottle_throttle:
         /*
@@ -9793,10 +9880,14 @@ static bool can_migrate_task_llc(struct task_struct *p, struct rq *rq,
                                  struct rq *dst_rq)
 {
         int dst_cpu = dst_rq->cpu;
+        struct task_group *tg = task_group(p);
 
         lockdep_assert_rq_held(rq);
 
-        if (throttled_lb_pair(task_group(p), cpu_of(rq), dst_cpu))
+        if (group_steal_used() && !is_tg_steal(tg->steal_task))
+                return false;
+
+        if (throttled_lb_pair(tg, cpu_of(rq), dst_cpu))
                 return false;
 
         if (!cpumask_test_cpu(dst_cpu, p->cpus_ptr)) {
@@ -13071,10 +13162,14 @@ static int steal_from(struct rq *dst_rq, struct rq_flags *dst_rf, bool *locked,
         int stolen = 0;
         int dst_cpu = dst_rq->cpu;
         struct rq *src_rq = cpu_rq(src_cpu);
+        bool tg_used = group_steal_used();
 
         if (dst_cpu == src_cpu || src_rq->cfs.h_nr_running < 2)
                 return 0;
 
+        if (tg_used && src_rq->cfs.steal_h_nr_running < 1)
+                return 0;
+
         if (*locked) {
                 rq_unpin_lock(dst_rq, dst_rf);
                 raw_spin_rq_unlock(dst_rq);
@@ -13083,7 +13178,8 @@ static int steal_from(struct rq *dst_rq, struct rq_flags *dst_rf, bool *locked,
         rq_lock_irqsave(src_rq, &rf);
         update_rq_clock(src_rq);
 
-        if (src_rq->cfs.h_nr_running < 2 || !cpu_active(src_cpu))
+        if (!cpu_active(src_cpu) || src_rq->cfs.h_nr_running < 2 ||
+            (tg_used && src_rq->cfs.steal_h_nr_running < 1))
                 p = NULL;
         else
                 p = detach_next_task(&src_rq->cfs, dst_rq);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index ceea107a1dc8..65c1dac9cca1 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -722,12 +722,13 @@ struct cfs_rq {
         unsigned int            forceidle_seq;
         KABI_FILL_HOLE(unsigned int kabi_hole)
         u64                     min_vruntime_fi;
-#elif defined CONFIG_QOS_SCHED_SMT_EXPELLER && !defined(__GENKSYMS__)
+#elif (defined(CONFIG_QOS_SCHED_SMT_EXPELLER) || \
+        defined(CONFIG_SCHED_STEAL)) && !defined(__GENKSYMS__)
         union {
                 unsigned int            qos_idle_h_nr_running; /* qos_level:-1 */
                 unsigned long           qos_idle_h_nr_running_padding;
         };
-        KABI_FILL_HOLE(unsigned long kabi_hole)
+        unsigned long           steal_h_nr_running;
 #else
         KABI_RESERVE(3)
         KABI_RESERVE(4)