From: Aaron Lu <ziqianlu@bytedance.com> mainline inclusion from mainline-v6.18-rc1 commit eb962f251fbba251a0d34897d6170f7616d70c52 category: feature bugzilla: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=eb962f251fbba251a0d34897d6170f7616d70c52 -------------------------------- With task based throttle model, the previous way to check cfs_rq's nr_queued to decide if throttled time should be accounted doesn't work as expected, e.g. when a cfs_rq which has a single task is throttled, that task could later block in kernel mode instead of being dequeued on limbo list and accounting this as throttled time is not accurate. Rework throttle time accounting for a cfs_rq as follows: - start accounting when the first task gets throttled in its hierarchy; - stop accounting on unthrottle. Note that there will be a time gap between when a cfs_rq is throttled and when a task in its hierarchy is actually throttled. This accounting mechanism only starts accounting in the latter case. Suggested-by: Chengming Zhou <chengming.zhou@linux.dev> # accounting mechanism Co-developed-by: K Prateek Nayak <kprateek.nayak@amd.com> # simplify implementation Signed-off-by: K Prateek Nayak <kprateek.nayak@amd.com> Signed-off-by: Aaron Lu <ziqianlu@bytedance.com> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Tested-by: Valentin Schneider <vschneid@redhat.com> Tested-by: Matteo Martelli <matteo.martelli@codethink.co.uk> Tested-by: K Prateek Nayak <kprateek.nayak@amd.com> Link: https://lore.kernel.org/r/20250829081120.806-5-ziqianlu@bytedance.com Conflicts: kernel/sched/fair.c kernel/sched/sched.h [Context differences.]
Signed-off-by: Wang Tao <wangtao554@huawei.com> --- kernel/sched/fair.c | 54 +++++++++++++++++++++++++++----------------- kernel/sched/sched.h | 1 + 2 files changed, 34 insertions(+), 21 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 6e1627cabfc7..9f3f447e4baf 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5514,19 +5514,12 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if (cfs_rq->throttled != QOS_THROTTLED) { #endif #ifdef CONFIG_CFS_BANDWIDTH - if (throttled_hierarchy(cfs_rq)) { + if (cfs_rq->pelt_clock_throttled) { struct rq *rq = rq_of(cfs_rq); - if (cfs_rq_throttled(cfs_rq) && !cfs_rq->throttled_clock) - cfs_rq->throttled_clock = rq_clock(rq); - if (!cfs_rq->throttled_clock_self) - cfs_rq->throttled_clock_self = rq_clock(rq); - - if (cfs_rq->pelt_clock_throttled) { - cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) - - cfs_rq->throttled_clock_pelt; - cfs_rq->pelt_clock_throttled = 0; - } + cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) - + cfs_rq->throttled_clock_pelt; + cfs_rq->pelt_clock_throttled = 0; } #endif #ifdef CONFIG_QOS_SCHED @@ -5949,7 +5942,7 @@ static void throttle_cfs_rq_work(struct callback_head *work) update_rq_clock(rq); SCHED_WARN_ON(p->throttled || !list_empty(&p->throttle_node)); - dequeue_task_fair(rq, p, DEQUEUE_SLEEP); + dequeue_task_fair(rq, p, DEQUEUE_SLEEP | DEQUEUE_THROTTLE); list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list); /* * Must not set throttled before dequeue or dequeue will @@ -6111,6 +6104,17 @@ static inline void task_throttle_setup_work(struct task_struct *p) task_work_add(p, &p->sched_throttle_work, TWA_RESUME); } +static void record_throttle_clock(struct cfs_rq *cfs_rq) +{ + struct rq *rq = rq_of(cfs_rq); + + if (cfs_rq_throttled(cfs_rq) && !cfs_rq->throttled_clock) + cfs_rq->throttled_clock = rq_clock(rq); + + if (!cfs_rq->throttled_clock_self) + cfs_rq->throttled_clock_self = rq_clock(rq); +} + static int 
tg_throttle_down(struct task_group *tg, void *data) { struct rq *rq = data; @@ -6119,21 +6123,18 @@ static int tg_throttle_down(struct task_group *tg, void *data) if (cfs_rq->throttle_count++) return 0; - /* group is entering throttled state, stop time */ - SCHED_WARN_ON(cfs_rq->throttled_clock_self); - if (cfs_rq->nr_running) - cfs_rq->throttled_clock_self = rq_clock(rq); - else { - /* - * For cfs_rqs that still have entities enqueued, PELT clock - * stop happens at dequeue time when all entities are dequeued. - */ + /* + * For cfs_rqs that still have entities enqueued, PELT clock + * stop happens at dequeue time when all entities are dequeued. + */ + if (!cfs_rq->nr_running) { list_del_leaf_cfs_rq(cfs_rq); cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq); cfs_rq->pelt_clock_throttled = 1; } + SCHED_WARN_ON(cfs_rq->throttled_clock_self); SCHED_WARN_ON(!list_empty(&cfs_rq->throttled_limbo_list)); return 0; }
@@ -6962,6 +6963,7 @@ static void task_throttle_setup_work(struct task_struct *p) {} static bool task_is_throttled(struct task_struct *p) { return false; } static void dequeue_throttled_task(struct task_struct *p, int flags) {} static bool enqueue_throttled_task(struct task_struct *p) { return false; } +static void record_throttle_clock(struct cfs_rq *cfs_rq) {} static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) { @@ -7739,6 +7741,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se; int task_sleep = flags & DEQUEUE_SLEEP; + bool task_throttled = flags & DEQUEUE_THROTTLE; int idle_h_nr_running = task_has_idle_policy(p); #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER int qos_idle_h_nr_running = task_has_qos_idle_policy(p); @@ -7766,6 +7769,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_is_idle(cfs_rq)) idle_h_nr_running = 1; + if (throttled_hierarchy(cfs_rq) && task_throttled) + record_throttle_clock(cfs_rq); + /* Don't dequeue parent if it has other entities besides us */ if (cfs_rq->load.weight) { /* Avoid re-evaluating load for this entity: */ @@ -7795,6 +7801,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) #endif if (cfs_rq_is_idle(cfs_rq)) idle_h_nr_running = 1; + + if (throttled_hierarchy(cfs_rq) && task_throttled) + record_throttle_clock(cfs_rq); } /* At this point se is NULL and we are at root level*/ diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index a1dacaf7803a..850baeab1085 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2419,6 +2419,7 @@ extern const u32 sched_prio_to_wmult[40]; #define DEQUEUE_SAVE 0x02 /* Matches ENQUEUE_RESTORE */ #define DEQUEUE_MOVE 0x04 /* Matches ENQUEUE_MOVE */ #define DEQUEUE_NOCLOCK 0x08 /* Matches ENQUEUE_NOCLOCK */ +#define DEQUEUE_THROTTLE 0x800 #define 
ENQUEUE_WAKEUP 0x01 #define ENQUEUE_RESTORE 0x02 -- 2.18.0