[PATCH OLK-6.6 00/15] *** backport task throttle ***
Aaron Lu (7): [Backport] sched/fair: Task based throttle time accounting [Backport] sched/fair: Get rid of throttled_lb_pair() [Backport] sched/fair: Propagate load for throttled cfs_rq [Backport] sched/fair: update_cfs_group() for throttled cfs_rqs [Backport] sched/fair: Do not balance task to a throttled cfs_rq [Backport] sched/fair: Prevent cfs_rq from being unthrottled with zero runtime_remaining [Backport] sched/fair: Do not special case tasks in throttled hierarchy K Prateek Nayak (1): [Backport] sched/fair: Start a cfs_rq on throttled hierarchy with PELT clock throttled Valentin Schneider (3): [Backport] sched/fair: Add related data structure for task based throttle [Backport] sched/fair: Implement throttle task work and related helpers [Backport] sched/fair: Switch to task based throttle model Wang Tao (2): [Huawei] sched: Fix kabi broken of struct task_struct and struct cfs_rq [Huawei] sched: Fix kabi broken of struct cfs_rq Zhang Qiao (2): [Huawei] sched/fair: Use separate throttle functions for QoS [Huawei] sched/fair: Use separate qos_throttled and qos_throttle_count include/linux/sched.h | 8 + kernel/sched/core.c | 5 +- kernel/sched/fair.c | 622 ++++++++++++++++++++++++++---------------- kernel/sched/pelt.h | 4 +- kernel/sched/sched.h | 17 +- 5 files changed, 409 insertions(+), 247 deletions(-) -- 2.18.0
From: Valentin Schneider <vschneid@redhat.com> mainline inclusion from mainline-v6.18-rc1 commit 2cd571245b43492867bf1b4252485f3e6647b643 category: feature bugzilla: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i... -------------------------------- Add related data structures for this new throttle functionality. Tested-by: K Prateek Nayak <kprateek.nayak@amd.com> Signed-off-by: Valentin Schneider <vschneid@redhat.com> Signed-off-by: Aaron Lu <ziqianlu@bytedance.com> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Reviewed-by: Chengming Zhou <chengming.zhou@linux.dev> Tested-by: Valentin Schneider <vschneid@redhat.com> Tested-by: Matteo Martelli <matteo.martelli@codethink.co.uk> Link: https://lore.kernel.org/r/20250829081120.806-2-ziqianlu@bytedance.com Conflicts: include/linux/sched.h kernel/sched/core.c kernel/sched/fair.c kernel/sched/sched.h [Context differences.] Signed-off-by: Wang Tao <wangtao554@huawei.com> --- include/linux/sched.h | 5 +++++ kernel/sched/core.c | 3 +++ kernel/sched/fair.c | 13 +++++++++++++ kernel/sched/sched.h | 3 +++ 4 files changed, 24 insertions(+) diff --git a/include/linux/sched.h b/include/linux/sched.h index f5c80c372f74..63aaa1b767bf 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -847,6 +847,11 @@ struct task_struct { #ifdef CONFIG_CGROUP_SCHED struct task_group *sched_task_group; +#ifdef CONFIG_CFS_BANDWIDTH + struct callback_head sched_throttle_work; + struct list_head throttle_node; + bool throttled; +#endif #endif #ifdef CONFIG_UCLAMP_TASK diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 0d071de3ffa5..41744f1640e3 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4547,6 +4547,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) #ifdef CONFIG_FAIR_GROUP_SCHED p->se.cfs_rq = NULL; +#ifdef CONFIG_CFS_BANDWIDTH + init_cfs_throttle_work(p); +#endif #endif #ifdef CONFIG_SCHEDSTATS diff --git 
a/kernel/sched/fair.c b/kernel/sched/fair.c index c3841098c212..ad79472212d1 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5893,6 +5893,18 @@ static inline int throttled_lb_pair(struct task_group *tg, throttled_hierarchy(dest_cfs_rq); } +static void throttle_cfs_rq_work(struct callback_head *work) +{ +} + +void init_cfs_throttle_work(struct task_struct *p) +{ + init_task_work(&p->sched_throttle_work, throttle_cfs_rq_work); + /* Protect against double add, see throttle_cfs_rq() and throttle_cfs_rq_work() */ + p->sched_throttle_work.next = &p->sched_throttle_work; + INIT_LIST_HEAD(&p->throttle_node); +} + static int tg_unthrottle_up(struct task_group *tg, void *data) { struct rq *rq = data; @@ -6690,6 +6702,7 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) #ifdef CONFIG_SCHED_SOFT_QUOTA INIT_LIST_HEAD(&cfs_rq->soft_quota_throttled_list); #endif + INIT_LIST_HEAD(&cfs_rq->throttled_limbo_list); } void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index e9a60a6295e4..f1fd008faaf7 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -765,6 +765,7 @@ struct cfs_rq { #ifdef CONFIG_SMP struct list_head throttled_csd_list; #endif + struct list_head throttled_limbo_list; #endif /* CONFIG_CFS_BANDWIDTH */ #endif /* CONFIG_FAIR_GROUP_SCHED */ @@ -2657,6 +2658,8 @@ extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq); extern void init_dl_task_timer(struct sched_dl_entity *dl_se); extern void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se); +extern void init_cfs_throttle_work(struct task_struct *p); + #define BW_SHIFT 20 #define BW_UNIT (1 << BW_SHIFT) #define RATIO_SHIFT 8 -- 2.18.0
From: Wang Tao <wangtao554@huawei.com> Offering: HULK hulk inclusion category: bugfix bugzilla: NA -------------------------------- Fixes: 3104af4d6492 ("[Backport] sched/fair: Add related data structure for task based throttle") Signed-off-by: Wang Tao <wangtao554@huawei.com> --- include/linux/sched.h | 13 ++++++++----- kernel/sched/sched.h | 6 +++--- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 63aaa1b767bf..45e010dfe7b2 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -847,11 +847,6 @@ struct task_struct { #ifdef CONFIG_CGROUP_SCHED struct task_group *sched_task_group; -#ifdef CONFIG_CFS_BANDWIDTH - struct callback_head sched_throttle_work; - struct list_head throttle_node; - bool throttled; -#endif #endif #ifdef CONFIG_UCLAMP_TASK @@ -1633,11 +1628,19 @@ struct task_struct { KABI_RESERVE(2) /* saved state for "spinlock sleepers" */ KABI_USE2(3, unsigned kabi_reserved_int, unsigned int saved_state) +#ifdef CONFIG_CFS_BANDWIDTH + KABI_REPLACE(_KABI_RESERVE(4); _KABI_RESERVE(5), + struct callback_head sched_throttle_work) + KABI_REPLACE(_KABI_RESERVE(6); _KABI_RESERVE(7), + struct list_head throttle_node) + KABI_USE(8, bool throttled) +#else KABI_RESERVE(4) KABI_RESERVE(5) KABI_RESERVE(6) KABI_RESERVE(7) KABI_RESERVE(8) +#endif KABI_RESERVE(9) KABI_RESERVE(10) KABI_RESERVE(11) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index f1fd008faaf7..8691a409ee6e 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -765,7 +765,6 @@ struct cfs_rq { #ifdef CONFIG_SMP struct list_head throttled_csd_list; #endif - struct list_head throttled_limbo_list; #endif /* CONFIG_CFS_BANDWIDTH */ #endif /* CONFIG_FAIR_GROUP_SCHED */ @@ -790,8 +789,9 @@ struct cfs_rq { KABI_RESERVE(3) KABI_RESERVE(4) #endif - KABI_RESERVE(5) - KABI_RESERVE(6) +#endif + KABI_REPLACE(_KABI_RESERVE(5); _KABI_RESERVE(6), + struct list_head throttled_limbo_list) KABI_RESERVE(7) KABI_RESERVE(8) }; -- 
2.18.0
From: Valentin Schneider <vschneid@redhat.com> mainline inclusion from mainline-v6.18-rc1 commit 7fc2d14392475e368a2a7be458aba4eecdf2439b category: feature bugzilla: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i... -------------------------------- Implement throttle_cfs_rq_work() task work which gets executed on task's ret2user path where the task is dequeued and marked as throttled. Signed-off-by: Valentin Schneider <vschneid@redhat.com> Signed-off-by: Aaron Lu <ziqianlu@bytedance.com> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Reviewed-by: Chengming Zhou <chengming.zhou@linux.dev> Tested-by: Valentin Schneider <vschneid@redhat.com> Tested-by: Matteo Martelli <matteo.martelli@codethink.co.uk> Tested-by: K Prateek Nayak <kprateek.nayak@amd.com> Link: https://lore.kernel.org/r/20250829081120.806-3-ziqianlu@bytedance.com Conflicts: kernel/sched/fair.c [Context differences.] Signed-off-by: Wang Tao <wangtao554@huawei.com> --- kernel/sched/fair.c | 68 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ad79472212d1..b17fee3cbd33 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5893,8 +5893,54 @@ static inline int throttled_lb_pair(struct task_group *tg, throttled_hierarchy(dest_cfs_rq); } +static inline bool task_is_throttled(struct task_struct *p) +{ + return cfs_bandwidth_used() && p->throttled; +} + +static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags); static void throttle_cfs_rq_work(struct callback_head *work) { + struct task_struct *p = container_of(work, struct task_struct, sched_throttle_work); + struct sched_entity *se; + struct cfs_rq *cfs_rq; + struct rq_flags rf; + struct rq *rq; + + SCHED_WARN_ON(p != current); + p->sched_throttle_work.next = &p->sched_throttle_work; + + /* + * If task is exiting, then there won't be a return to userspace, so we + * don't have to 
bother with any of this. + */ + if ((p->flags & PF_EXITING)) + return; + + rq = task_rq_lock(p, &rf); + se = &p->se; + cfs_rq = cfs_rq_of(se); + + /* Raced, forget */ + if (p->sched_class != &fair_sched_class) + goto out; + + /* + * If not in limbo, then either replenish has happened or this + * task got migrated out of the throttled cfs_rq, move along. + */ + if (!cfs_rq->throttle_count) + goto out; + + update_rq_clock(rq); + SCHED_WARN_ON(p->throttled || !list_empty(&p->throttle_node)); + dequeue_task_fair(rq, p, DEQUEUE_SLEEP); + list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list); + p->throttled = true; + resched_curr(rq); + +out: + task_rq_unlock(rq, p, &rf); } void init_cfs_throttle_work(struct task_struct *p) @@ -5934,6 +5980,26 @@ static int tg_unthrottle_up(struct task_group *tg, void *data) return 0; } +static inline bool task_has_throttle_work(struct task_struct *p) +{ + return p->sched_throttle_work.next != &p->sched_throttle_work; +} + +static inline void task_throttle_setup_work(struct task_struct *p) +{ + if (task_has_throttle_work(p)) + return; + + /* + * Kthreads and exiting tasks don't return to userspace, so adding the + * work is pointless + */ + if ((p->flags & (PF_EXITING | PF_KTHREAD))) + return; + + task_work_add(p, &p->sched_throttle_work, TWA_RESUME); +} + static int tg_throttle_down(struct task_group *tg, void *data) { struct rq *rq = data; @@ -6880,6 +6946,8 @@ static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; } static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} static inline void sync_throttle(struct task_group *tg, int cpu) {} static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} +static void task_throttle_setup_work(struct task_struct *p) {} +static bool task_is_throttled(struct task_struct *p) { return false; } static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) { -- 2.18.0
From: Valentin Schneider <vschneid@redhat.com> mainline inclusion from mainline-v6.18-rc1 commit e1fad12dcb66b7f35573c52b665830a1538f9886 category: feature bugzilla: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i... -------------------------------- In current throttle model, when a cfs_rq is throttled, its entity will be dequeued from cpu's rq, making tasks attached to it not able to run, thus achieving the throttle target. This has a drawback though: assume a task is a reader of percpu_rwsem and is waiting. When it gets woken, it can not run till its task group's next period comes, which can be a relatively long time. Waiting writer will have to wait longer due to this and it also makes further reader build up and eventually trigger task hung. To improve this situation, change the throttle model to task based, i.e. when a cfs_rq is throttled, record its throttled status but do not remove it from cpu's rq. Instead, for tasks that belong to this cfs_rq, when they get picked, add a task work to them so that when they return to user, they can be dequeued there. In this way, tasks throttled will not hold any kernel resources. And on unthrottle, enqueue back those tasks so they can continue to run. Throttled cfs_rq's PELT clock is handled differently now: previously the cfs_rq's PELT clock is stopped once it entered throttled state but since now tasks(in kernel mode) can continue to run, change the behaviour to stop PELT clock when the throttled cfs_rq has no tasks left. 
Suggested-by: Chengming Zhou <chengming.zhou@linux.dev> # tag on pick Signed-off-by: Valentin Schneider <vschneid@redhat.com> Signed-off-by: Aaron Lu <ziqianlu@bytedance.com> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Tested-by: Valentin Schneider <vschneid@redhat.com> Tested-by: Chen Yu <yu.c.chen@intel.com> Tested-by: Matteo Martelli <matteo.martelli@codethink.co.uk> Tested-by: K Prateek Nayak <kprateek.nayak@amd.com> Link: https://lore.kernel.org/r/20250829081120.806-4-ziqianlu@bytedance.com Conflicts: kernel/sched/fair.c kernel/sched/pelt.h kernel/sched/sched.h [Context differences. changes from upstream: use throttled_hierarchy() in pick*task_fair() because in 6.6 the check_cfs_rq_runtime() is only done for cfs_rq with curr set.] Signed-off-by: Wang Tao <wangtao554@huawei.com> --- kernel/sched/fair.c | 362 +++++++++++++++++++++---------------------- kernel/sched/pelt.h | 4 +- kernel/sched/sched.h | 3 +- 3 files changed, 182 insertions(+), 187 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b17fee3cbd33..6e1627cabfc7 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5509,24 +5509,29 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if (cfs_rq->nr_running == 1) { check_enqueue_throttle(cfs_rq); - if (!throttled_hierarchy(cfs_rq)) { - list_add_leaf_cfs_rq(cfs_rq); - } else { + list_add_leaf_cfs_rq(cfs_rq); #ifdef CONFIG_QOS_SCHED - if (cfs_rq->throttled != QOS_THROTTLED) { + if (cfs_rq->throttled != QOS_THROTTLED) { #endif #ifdef CONFIG_CFS_BANDWIDTH + if (throttled_hierarchy(cfs_rq)) { struct rq *rq = rq_of(cfs_rq); if (cfs_rq_throttled(cfs_rq) && !cfs_rq->throttled_clock) cfs_rq->throttled_clock = rq_clock(rq); if (!cfs_rq->throttled_clock_self) cfs_rq->throttled_clock_self = rq_clock(rq); -#endif -#ifdef CONFIG_QOS_SCHED + + if (cfs_rq->pelt_clock_throttled) { + cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) - + cfs_rq->throttled_clock_pelt; + 
cfs_rq->pelt_clock_throttled = 0; + } } #endif +#ifdef CONFIG_QOS_SCHED } +#endif } } @@ -5596,8 +5601,18 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) update_cfs_group(se); - if (cfs_rq->nr_running == 0) + if (cfs_rq->nr_running == 0) { update_idle_cfs_rq_clock_pelt(cfs_rq); +#ifdef CONFIG_CFS_BANDWIDTH + if (throttled_hierarchy(cfs_rq)) { + struct rq *rq = rq_of(cfs_rq); + + list_del_leaf_cfs_rq(cfs_rq); + cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq); + cfs_rq->pelt_clock_throttled = 1; + } +#endif + } } static void @@ -5936,6 +5951,10 @@ static void throttle_cfs_rq_work(struct callback_head *work) SCHED_WARN_ON(p->throttled || !list_empty(&p->throttle_node)); dequeue_task_fair(rq, p, DEQUEUE_SLEEP); list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list); + /* + * Must not set throttled before dequeue or dequeue will + * mistakenly regard this task as an already throttled one. + */ p->throttled = true; resched_curr(rq); @@ -5951,32 +5970,124 @@ void init_cfs_throttle_work(struct task_struct *p) INIT_LIST_HEAD(&p->throttle_node); } +/* + * Task is throttled and someone wants to dequeue it again: + * it could be sched/core when core needs to do things like + * task affinity change, task group change, task sched class + * change etc. and in these cases, DEQUEUE_SLEEP is not set; + * or the task is blocked after throttled due to freezer etc. + * and in these cases, DEQUEUE_SLEEP is set. + */ +static void detach_task_cfs_rq(struct task_struct *p); +static void dequeue_throttled_task(struct task_struct *p, int flags) +{ + SCHED_WARN_ON(p->se.on_rq); + list_del_init(&p->throttle_node); + + /* task blocked after throttled */ + if (flags & DEQUEUE_SLEEP) { + p->throttled = false; + return; + } + + /* + * task is migrating off its old cfs_rq, detach + * the task's load from its old cfs_rq. 
+ */ + if (task_on_rq_migrating(p)) + detach_task_cfs_rq(p); +} + +static bool enqueue_throttled_task(struct task_struct *p) +{ + struct cfs_rq *cfs_rq = cfs_rq_of(&p->se); + + /* @p should have gone through dequeue_throttled_task() first */ + SCHED_WARN_ON(!list_empty(&p->throttle_node)); + + /* + * If the throttled task @p is enqueued to a throttled cfs_rq, + * take the fast path by directly putting the task on the + * target cfs_rq's limbo list. + * + * Do not do that when @p is current because the following race can + * cause @p's group_node to be incorectly re-insterted in its rq's + * cfs_tasks list, despite being throttled: + * + * cpuX cpuY + * p ret2user + * throttle_cfs_rq_work() sched_move_task(p) + * LOCK task_rq_lock + * dequeue_task_fair(p) + * UNLOCK task_rq_lock + * LOCK task_rq_lock + * task_current_donor(p) == true + * task_on_rq_queued(p) == true + * dequeue_task(p) + * put_prev_task(p) + * sched_change_group() + * enqueue_task(p) -> p's new cfs_rq + * is throttled, go + * fast path and skip + * actual enqueue + * set_next_task(p) + * list_move(&se->group_node, &rq->cfs_tasks); // bug + * schedule() + * + * In the above race case, @p current cfs_rq is in the same rq as + * its previous cfs_rq because sched_move_task() only moves a task + * to a different group from the same rq, so we can use its current + * cfs_rq to derive rq and test if the task is current. 
+ */ + if (throttled_hierarchy(cfs_rq) && + !task_current(rq_of(cfs_rq), p)) { + list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list); + return true; + } + + /* we can't take the fast path, do an actual enqueue*/ + p->throttled = false; + return false; +} + +static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags); static int tg_unthrottle_up(struct task_group *tg, void *data) { struct rq *rq = data; struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; + struct task_struct *p, *tmp; + + if (--cfs_rq->throttle_count) + return 0; - cfs_rq->throttle_count--; - if (!cfs_rq->throttle_count) { + if (cfs_rq->pelt_clock_throttled) { cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) - cfs_rq->throttled_clock_pelt; + cfs_rq->pelt_clock_throttled = 0; + } - /* Add cfs_rq with load or one or more already running entities to the list */ - if (!cfs_rq_is_decayed(cfs_rq)) - list_add_leaf_cfs_rq(cfs_rq); + if (cfs_rq->throttled_clock_self) { + u64 delta = rq_clock(rq) - cfs_rq->throttled_clock_self; - if (cfs_rq->throttled_clock_self) { - u64 delta = rq_clock(rq) - cfs_rq->throttled_clock_self; + cfs_rq->throttled_clock_self = 0; - cfs_rq->throttled_clock_self = 0; + if (SCHED_WARN_ON((s64)delta < 0)) + delta = 0; - if (SCHED_WARN_ON((s64)delta < 0)) - delta = 0; + cfs_rq->throttled_clock_self_time += delta; + } - cfs_rq->throttled_clock_self_time += delta; - } + /* Re-enqueue the tasks that have been throttled at this level. 
*/ + list_for_each_entry_safe(p, tmp, &cfs_rq->throttled_limbo_list, throttle_node) { + list_del_init(&p->throttle_node); + p->throttled = false; + enqueue_task_fair(rq_of(cfs_rq), p, ENQUEUE_WAKEUP); } + /* Add cfs_rq with load or one or more already running entities to the list */ + if (!cfs_rq_is_decayed(cfs_rq)) + list_add_leaf_cfs_rq(cfs_rq); + return 0; } @@ -6005,30 +6116,33 @@ static int tg_throttle_down(struct task_group *tg, void *data) struct rq *rq = data; struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; + if (cfs_rq->throttle_count++) + return 0; + /* group is entering throttled state, stop time */ - if (!cfs_rq->throttle_count) { - cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq); + SCHED_WARN_ON(cfs_rq->throttled_clock_self); + if (cfs_rq->nr_running) + cfs_rq->throttled_clock_self = rq_clock(rq); + else { + /* + * For cfs_rqs that still have entities enqueued, PELT clock + * stop happens at dequeue time when all entities are dequeued. + */ list_del_leaf_cfs_rq(cfs_rq); - SCHED_WARN_ON(cfs_rq->throttled_clock_self); - if (cfs_rq->nr_running) - cfs_rq->throttled_clock_self = rq_clock(rq); + cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq); + cfs_rq->pelt_clock_throttled = 1; } - cfs_rq->throttle_count++; + SCHED_WARN_ON(!list_empty(&cfs_rq->throttled_limbo_list)); return 0; } static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) { struct rq *rq = rq_of(cfs_rq); - unsigned int prev_nr = rq->cfs.h_nr_running; struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); - struct sched_entity *se; - long task_delta, idle_task_delta, dequeue = 1; -#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER - long qos_idle_delta; -#endif + int dequeue = 1; raw_spin_lock(&cfs_b->lock); /* This will start the period timer if necessary */ @@ -6051,68 +6165,11 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) if (!dequeue) return false; /* Throttle no longer required. 
*/ - se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; - /* freeze hierarchy runnable averages while throttled */ rcu_read_lock(); walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq); rcu_read_unlock(); - task_delta = cfs_rq->h_nr_running; - idle_task_delta = cfs_rq->idle_h_nr_running; -#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER - qos_idle_delta = cfs_rq->qos_idle_h_nr_running; -#endif - - for_each_sched_entity(se) { - struct cfs_rq *qcfs_rq = cfs_rq_of(se); - /* throttled entity or throttle-on-deactivate */ - if (!se->on_rq) - goto done; - - dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP); - - if (cfs_rq_is_idle(group_cfs_rq(se))) - idle_task_delta = cfs_rq->h_nr_running; - - qcfs_rq->h_nr_running -= task_delta; - qcfs_rq->idle_h_nr_running -= idle_task_delta; -#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER - qcfs_rq->qos_idle_h_nr_running -= qos_idle_delta; -#endif - - if (qcfs_rq->load.weight) { - /* Avoid re-evaluating load for this entity: */ - se = parent_entity(se); - break; - } - } - - for_each_sched_entity(se) { - struct cfs_rq *qcfs_rq = cfs_rq_of(se); - /* throttled entity or throttle-on-deactivate */ - if (!se->on_rq) - goto done; - - update_load_avg(qcfs_rq, se, 0); - se_update_runnable(se); - - if (cfs_rq_is_idle(group_cfs_rq(se))) - idle_task_delta = cfs_rq->h_nr_running; - - qcfs_rq->h_nr_running -= task_delta; - qcfs_rq->idle_h_nr_running -= idle_task_delta; -#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER - qcfs_rq->qos_idle_h_nr_running -= qos_idle_delta; -#endif - } - - /* At this point se is NULL and we are at root level*/ - sub_nr_running(rq, task_delta); - if (prev_nr >= 2 && prev_nr - task_delta < 2) - overload_clear(rq); - -done: /* * Note: distribution will already see us throttled via the * throttled-list. rq->lock protects completion. 
@@ -6135,13 +6192,21 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) { struct rq *rq = rq_of(cfs_rq); - unsigned int prev_nr = rq->cfs.h_nr_running; struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); - struct sched_entity *se; - long task_delta, idle_task_delta; -#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER - long qos_idle_delta; -#endif + struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)]; + + /* + * It's possible we are called with !runtime_remaining due to things + * like user changed quota setting(see tg_set_cfs_bandwidth()) or async + * unthrottled us with a positive runtime_remaining but other still + * running entities consumed those runtime before we reached here. + * + * Anyway, we can't unthrottle this cfs_rq without any runtime remaining + * because any enqueue in tg_unthrottle_up() will immediately trigger a + * throttle, which is not supposed to happen on unthrottle path. + */ + if (cfs_rq->runtime_enabled && cfs_rq->runtime_remaining <= 0) + return; se = cfs_rq->tg->se[cpu_of(rq)]; @@ -6187,61 +6252,8 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) if (list_add_leaf_cfs_rq(cfs_rq_of(se))) break; } - goto unthrottle_throttle; - } - - task_delta = cfs_rq->h_nr_running; - idle_task_delta = cfs_rq->idle_h_nr_running; -#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER - qos_idle_delta = cfs_rq->qos_idle_h_nr_running; -#endif - for_each_sched_entity(se) { - struct cfs_rq *qcfs_rq = cfs_rq_of(se); - - if (se->on_rq) - break; - enqueue_entity(qcfs_rq, se, ENQUEUE_WAKEUP); - - if (cfs_rq_is_idle(group_cfs_rq(se))) - idle_task_delta = cfs_rq->h_nr_running; - - qcfs_rq->h_nr_running += task_delta; - qcfs_rq->idle_h_nr_running += idle_task_delta; -#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER - qcfs_rq->qos_idle_h_nr_running += qos_idle_delta; -#endif - - /* end evaluation on encountering a throttled cfs_rq */ - if (cfs_rq_throttled(qcfs_rq)) - goto unthrottle_throttle; } - for_each_sched_entity(se) { - struct cfs_rq *qcfs_rq = 
cfs_rq_of(se); - - update_load_avg(qcfs_rq, se, UPDATE_TG); - se_update_runnable(se); - - if (cfs_rq_is_idle(group_cfs_rq(se))) - idle_task_delta = cfs_rq->h_nr_running; - - qcfs_rq->h_nr_running += task_delta; - qcfs_rq->idle_h_nr_running += idle_task_delta; -#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER - qcfs_rq->qos_idle_h_nr_running += qos_idle_delta; -#endif - - /* end evaluation on encountering a throttled cfs_rq */ - if (cfs_rq_throttled(qcfs_rq)) - goto unthrottle_throttle; - } - - /* At this point se is NULL and we are at root level*/ - add_nr_running(rq, task_delta); - if (prev_nr < 2 && prev_nr + task_delta >= 2) - overload_set(rq); - -unthrottle_throttle: assert_list_leaf_cfs_rq(rq); /* Determine whether we need to wake up potentially idle CPU: */ @@ -6948,6 +6960,8 @@ static inline void sync_throttle(struct task_group *tg, int cpu) {} static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} static void task_throttle_setup_work(struct task_struct *p) {} static bool task_is_throttled(struct task_struct *p) { return false; } +static void dequeue_throttled_task(struct task_struct *p, int flags) {} +static bool enqueue_throttled_task(struct task_struct *p) { return false; } static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) { @@ -7633,6 +7647,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) int task_new = !(flags & ENQUEUE_WAKEUP); unsigned int prev_nr = rq->cfs.h_nr_running; + if (task_is_throttled(p) && enqueue_throttled_task(p)) + return; + /* * The code below (indirectly) updates schedutil which looks at * the cfs_rq utilization to select a frequency. 
@@ -7664,10 +7681,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_is_idle(cfs_rq)) idle_h_nr_running = 1; - /* end evaluation on encountering a throttled cfs_rq */ - if (cfs_rq_throttled(cfs_rq)) - goto enqueue_throttle; - flags = ENQUEUE_WAKEUP; } @@ -7685,10 +7698,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) #endif if (cfs_rq_is_idle(cfs_rq)) idle_h_nr_running = 1; - - /* end evaluation on encountering a throttled cfs_rq */ - if (cfs_rq_throttled(cfs_rq)) - goto enqueue_throttle; } /* At this point se is NULL and we are at root level*/ @@ -7713,7 +7722,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (!task_new) check_update_overutilized_status(rq); -enqueue_throttle: assert_list_leaf_cfs_rq(rq); hrtick_update(rq); @@ -7738,6 +7746,11 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) unsigned int prev_nr = rq->cfs.h_nr_running; bool was_sched_idle = sched_idle_rq(rq); + if (task_is_throttled(p)) { + dequeue_throttled_task(p, flags); + return; + } + util_est_dequeue(&rq->cfs, p); for_each_sched_entity(se) { @@ -7753,10 +7766,6 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_is_idle(cfs_rq)) idle_h_nr_running = 1; - /* end evaluation on encountering a throttled cfs_rq */ - if (cfs_rq_throttled(cfs_rq)) - goto dequeue_throttle; - /* Don't dequeue parent if it has other entities besides us */ if (cfs_rq->load.weight) { /* Avoid re-evaluating load for this entity: */ @@ -7786,11 +7795,6 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) #endif if (cfs_rq_is_idle(cfs_rq)) idle_h_nr_running = 1; - - /* end evaluation on encountering a throttled cfs_rq */ - if (cfs_rq_throttled(cfs_rq)) - goto dequeue_throttle; - } /* At this point se is NULL and we are at root level*/ @@ -7802,7 +7806,6 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) if 
(unlikely(!was_sched_idle && sched_idle_rq(rq))) rq->next_balance = jiffies; -dequeue_throttle: util_est_update(&rq->cfs, p, task_sleep); hrtick_update(rq); } @@ -10219,8 +10222,8 @@ static struct task_struct *pick_task_fair(struct rq *rq) { struct sched_entity *se; struct cfs_rq *cfs_rq; + struct task_struct *p; -again: cfs_rq = &rq->cfs; if (!cfs_rq->nr_running) return NULL; @@ -10235,15 +10238,17 @@ static struct task_struct *pick_task_fair(struct rq *rq) else curr = NULL; - if (unlikely(check_cfs_rq_runtime(cfs_rq))) - goto again; + check_cfs_rq_runtime(cfs_rq); } se = pick_next_entity(cfs_rq, curr); cfs_rq = group_cfs_rq(se); } while (cfs_rq); - return task_of(se); + p = task_of(se); + if (unlikely(throttled_hierarchy(cfs_rq_of(se)))) + task_throttle_setup_work(p); + return p; } #endif @@ -10309,20 +10314,7 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf else curr = NULL; - /* - * This call to check_cfs_rq_runtime() will do the - * throttle and dequeue its entity in the parent(s). - * Therefore the nr_running test will indeed - * be correct. 
- */ - if (unlikely(check_cfs_rq_runtime(cfs_rq))) { - cfs_rq = &rq->cfs; - - if (!cfs_rq->nr_running) - goto idle; - - goto simple; - } + check_cfs_rq_runtime(cfs_rq); } se = pick_next_entity(cfs_rq, curr); @@ -10437,6 +10429,8 @@ done: __maybe_unused; qos_smt_expel(this_cpu, p); #endif + if (unlikely(throttled_hierarchy(cfs_rq_of(&p->se)))) + task_throttle_setup_work(p); return p; idle: diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h index 3a0e0dc28721..ab2b8f49f771 100644 --- a/kernel/sched/pelt.h +++ b/kernel/sched/pelt.h @@ -157,7 +157,7 @@ static inline void update_idle_cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) { u64 throttled; - if (unlikely(cfs_rq->throttle_count)) + if (unlikely(cfs_rq->pelt_clock_throttled)) throttled = U64_MAX; else throttled = cfs_rq->throttled_clock_pelt_time; @@ -168,7 +168,7 @@ static inline void update_idle_cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) /* rq->task_clock normalized against any time this cfs_rq has spent throttled */ static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) { - if (unlikely(cfs_rq->throttle_count)) + if (unlikely(cfs_rq->pelt_clock_throttled)) return cfs_rq->throttled_clock_pelt - cfs_rq->throttled_clock_pelt_time; return rq_clock_pelt(rq_of(cfs_rq)) - cfs_rq->throttled_clock_pelt_time; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 8691a409ee6e..75ec34f5ec9b 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -759,7 +759,8 @@ struct cfs_rq { u64 throttled_clock_pelt_time; u64 throttled_clock_self; u64 throttled_clock_self_time; - int throttled; + unsigned int throttled:2; + bool pelt_clock_throttled:1; int throttle_count; struct list_head throttled_list; #ifdef CONFIG_SMP -- 2.18.0
From: Wang Tao <wangtao554@huawei.com> Offering: HULK hulk inclusion category: bugfix bugzilla: NA -------------------------------- Wrap the conversion of 'throttled' into bitfields in KABI_REPLACE2 so that the size and layout of struct cfs_rq remain unchanged and kabi compatibility is preserved. Fixes: 41838a3bdddf ("[Backport] sched/fair: Switch to task based throttle model") Signed-off-by: Wang Tao <wangtao554@huawei.com> --- kernel/sched/sched.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 75ec34f5ec9b..a1dacaf7803a 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -759,8 +759,9 @@ struct cfs_rq { u64 throttled_clock_pelt_time; u64 throttled_clock_self; u64 throttled_clock_self_time; - unsigned int throttled:2; - bool pelt_clock_throttled:1; + KABI_REPLACE2(int throttled, + unsigned int throttled:2, + bool pelt_clock_throttled:1) int throttle_count; struct list_head throttled_list; #ifdef CONFIG_SMP -- 2.18.0
From: Aaron Lu <ziqianlu@bytedance.com> mainline inclusion from mainline-v6.18-rc1 commit eb962f251fbba251a0d34897d6170f7616d70c52 category: feature bugzilla: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=eb962f251fbba251a0d34897d6170f7616d70c52 -------------------------------- With task based throttle model, the previous way to check cfs_rq's nr_queued to decide if throttled time should be accounted doesn't work as expected, e.g. when a cfs_rq which has a single task is throttled, that task could later block in kernel mode instead of being dequeued on limbo list and accounting this as throttled time is not accurate. Rework throttle time accounting for a cfs_rq as follows: - start accounting when the first task gets throttled in its hierarchy; - stop accounting on unthrottle. Note that there will be a time gap between when a cfs_rq is throttled and when a task in its hierarchy is actually throttled. This accounting mechanism only starts accounting in the latter case. Suggested-by: Chengming Zhou <chengming.zhou@linux.dev> # accounting mechanism Co-developed-by: K Prateek Nayak <kprateek.nayak@amd.com> # simplify implementation Signed-off-by: K Prateek Nayak <kprateek.nayak@amd.com> Signed-off-by: Aaron Lu <ziqianlu@bytedance.com> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Tested-by: Valentin Schneider <vschneid@redhat.com> Tested-by: Matteo Martelli <matteo.martelli@codethink.co.uk> Tested-by: K Prateek Nayak <kprateek.nayak@amd.com> Link: https://lore.kernel.org/r/20250829081120.806-5-ziqianlu@bytedance.com Conflicts: kernel/sched/fair.c kernel/sched/sched.h [Context differences.] 
Signed-off-by: Wang Tao <wangtao554@huawei.com> --- kernel/sched/fair.c | 54 +++++++++++++++++++++++++++----------------- kernel/sched/sched.h | 1 + 2 files changed, 34 insertions(+), 21 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 6e1627cabfc7..9f3f447e4baf 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5514,19 +5514,12 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if (cfs_rq->throttled != QOS_THROTTLED) { #endif #ifdef CONFIG_CFS_BANDWIDTH - if (throttled_hierarchy(cfs_rq)) { + if (cfs_rq->pelt_clock_throttled) { struct rq *rq = rq_of(cfs_rq); - if (cfs_rq_throttled(cfs_rq) && !cfs_rq->throttled_clock) - cfs_rq->throttled_clock = rq_clock(rq); - if (!cfs_rq->throttled_clock_self) - cfs_rq->throttled_clock_self = rq_clock(rq); - - if (cfs_rq->pelt_clock_throttled) { - cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) - - cfs_rq->throttled_clock_pelt; - cfs_rq->pelt_clock_throttled = 0; - } + cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) - + cfs_rq->throttled_clock_pelt; + cfs_rq->pelt_clock_throttled = 0; } #endif #ifdef CONFIG_QOS_SCHED @@ -5949,7 +5942,7 @@ static void throttle_cfs_rq_work(struct callback_head *work) update_rq_clock(rq); SCHED_WARN_ON(p->throttled || !list_empty(&p->throttle_node)); - dequeue_task_fair(rq, p, DEQUEUE_SLEEP); + dequeue_task_fair(rq, p, DEQUEUE_SLEEP | DEQUEUE_THROTTLE); list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list); /* * Must not set throttled before dequeue or dequeue will @@ -6111,6 +6104,17 @@ static inline void task_throttle_setup_work(struct task_struct *p) task_work_add(p, &p->sched_throttle_work, TWA_RESUME); } +static void record_throttle_clock(struct cfs_rq *cfs_rq) +{ + struct rq *rq = rq_of(cfs_rq); + + if (cfs_rq_throttled(cfs_rq) && !cfs_rq->throttled_clock) + cfs_rq->throttled_clock = rq_clock(rq); + + if (!cfs_rq->throttled_clock_self) + cfs_rq->throttled_clock_self = rq_clock(rq); +} + static int 
tg_throttle_down(struct task_group *tg, void *data) { struct rq *rq = data; @@ -6119,21 +6123,18 @@ static int tg_throttle_down(struct task_group *tg, void *data) if (cfs_rq->throttle_count++) return 0; - /* group is entering throttled state, stop time */ - SCHED_WARN_ON(cfs_rq->throttled_clock_self); - if (cfs_rq->nr_running) - cfs_rq->throttled_clock_self = rq_clock(rq); - else { - /* - * For cfs_rqs that still have entities enqueued, PELT clock - * stop happens at dequeue time when all entities are dequeued. - */ + /* + * For cfs_rqs that still have entities enqueued, PELT clock + * stop happens at dequeue time when all entities are dequeued. + */ + if (!cfs_rq->nr_running) { list_del_leaf_cfs_rq(cfs_rq); cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq); cfs_rq->pelt_clock_throttled = 1; } + SCHED_WARN_ON(cfs_rq->throttled_clock_self); SCHED_WARN_ON(!list_empty(&cfs_rq->throttled_limbo_list)); return 0; } @@ -6176,6 +6177,7 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) */ cfs_rq->throttled = 1; SCHED_WARN_ON(cfs_rq->throttled_clock); +<<<<<<< HEAD if (cfs_rq->nr_running) cfs_rq->throttled_clock = rq_clock(rq); @@ -6186,6 +6188,8 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) } #endif +======= +>>>>>>> f5bbe97ae31f... 
[Backport] sched/fair: Task based throttle time accounting return true; } @@ -6962,6 +6966,7 @@ static void task_throttle_setup_work(struct task_struct *p) {} static bool task_is_throttled(struct task_struct *p) { return false; } static void dequeue_throttled_task(struct task_struct *p, int flags) {} static bool enqueue_throttled_task(struct task_struct *p) { return false; } +static void record_throttle_clock(struct cfs_rq *cfs_rq) {} static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) { @@ -7739,6 +7744,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se; int task_sleep = flags & DEQUEUE_SLEEP; + bool task_throttled = flags & DEQUEUE_THROTTLE; int idle_h_nr_running = task_has_idle_policy(p); #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER int qos_idle_h_nr_running = task_has_qos_idle_policy(p); @@ -7766,6 +7772,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_is_idle(cfs_rq)) idle_h_nr_running = 1; + if (throttled_hierarchy(cfs_rq) && task_throttled) + record_throttle_clock(cfs_rq); + /* Don't dequeue parent if it has other entities besides us */ if (cfs_rq->load.weight) { /* Avoid re-evaluating load for this entity: */ @@ -7795,6 +7804,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) #endif if (cfs_rq_is_idle(cfs_rq)) idle_h_nr_running = 1; + + if (throttled_hierarchy(cfs_rq) && task_throttled) + record_throttle_clock(cfs_rq); } /* At this point se is NULL and we are at root level*/ diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index a1dacaf7803a..850baeab1085 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2419,6 +2419,7 @@ extern const u32 sched_prio_to_wmult[40]; #define DEQUEUE_SAVE 0x02 /* Matches ENQUEUE_RESTORE */ #define DEQUEUE_MOVE 0x04 /* Matches ENQUEUE_MOVE */ #define DEQUEUE_NOCLOCK 0x08 /* Matches ENQUEUE_NOCLOCK */ +#define DEQUEUE_THROTTLE 0x800 #define 
ENQUEUE_WAKEUP 0x01 #define ENQUEUE_RESTORE 0x02 -- 2.18.0
From: Aaron Lu <ziqianlu@bytedance.com> mainline inclusion from mainline-v6.18-rc1 commit 5b726e9bf9544a349090879a513a5e00da486c14 category: feature bugzilla: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i... -------------------------------- Now that throttled tasks are dequeued and can not stay on rq's cfs_tasks list, there is no need to take special care of these throttled tasks anymore in load balance. Suggested-by: K Prateek Nayak <kprateek.nayak@amd.com> Signed-off-by: Aaron Lu <ziqianlu@bytedance.com> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Tested-by: Valentin Schneider <vschneid@redhat.com> Tested-by: Matteo Martelli <matteo.martelli@codethink.co.uk> Tested-by: K Prateek Nayak <kprateek.nayak@amd.com> Link: https://lore.kernel.org/r/20250829081120.806-6-ziqianlu@bytedance.com Conflicts: kernel/sched/fair.c [Context differences.] Signed-off-by: Wang Tao <wangtao554@huawei.com> --- kernel/sched/fair.c | 35 +++-------------------------------- 1 file changed, 3 insertions(+), 32 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 9f3f447e4baf..e07209179147 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5884,23 +5884,6 @@ static inline int throttled_hierarchy(struct cfs_rq *cfs_rq) return cfs_bandwidth_used() && cfs_rq->throttle_count; } -/* - * Ensure that neither of the group entities corresponding to src_cpu or - * dest_cpu are members of a throttled hierarchy when performing group - * load-balance operations. 
- */ -static inline int throttled_lb_pair(struct task_group *tg, - int src_cpu, int dest_cpu) -{ - struct cfs_rq *src_cfs_rq, *dest_cfs_rq; - - src_cfs_rq = tg->cfs_rq[src_cpu]; - dest_cfs_rq = tg->cfs_rq[dest_cpu]; - - return throttled_hierarchy(src_cfs_rq) || - throttled_hierarchy(dest_cfs_rq); -} - static inline bool task_is_throttled(struct task_struct *p) { return cfs_bandwidth_used() && p->throttled; @@ -6978,12 +6961,6 @@ static inline int throttled_hierarchy(struct cfs_rq *cfs_rq) return 0; } -static inline int throttled_lb_pair(struct task_group *tg, - int src_cpu, int dest_cpu) -{ - return 0; -} - #ifdef CONFIG_FAIR_GROUP_SCHED void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b, struct cfs_bandwidth *parent) {} static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} @@ -10930,13 +10907,10 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) /* * We do not migrate tasks that are: - * 1) throttled_lb_pair, or - * 2) cannot be migrated to this CPU due to cpus_ptr, or - * 3) running (obviously), or - * 4) are cache-hot on their current CPU. + * 1) cannot be migrated to this CPU due to cpus_ptr, or + * 2) running (obviously), or + * 3) are cache-hot on their current CPU. */ - if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu)) - return 0; #ifdef CONFIG_SCHED_SOFT_DOMAIN /* Do not migrate soft domain tasks between numa. */ @@ -11041,9 +11015,6 @@ can_migrate_task_llc(struct task_struct *p, struct rq *rq, struct rq *dst_rq) lockdep_assert_rq_held(rq); - if (throttled_lb_pair(task_group(p), cpu_of(rq), dst_cpu)) - return false; - if (!cpumask_test_cpu(dst_cpu, p->cpus_ptr)) { schedstat_inc(p->stats.nr_failed_migrations_affine); return false; -- 2.18.0
From: Aaron Lu <ziqianlu@bytedance.com> mainline inclusion from mainline-v6.18-rc1 commit fe8d238e646e16cc431b7a5899f8dda690258ee9 category: feature bugzilla: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=fe8d238e646e16cc431b7a5899f8dda690258ee9 -------------------------------- Before task based throttle model, propagating load will stop at a throttled cfs_rq and that propagate will happen on unthrottle time by update_load_avg(). Now that there is no update_load_avg() on unthrottle for throttled cfs_rq and all load tracking is done by task related operations, let the propagate happen immediately. While at it, add a comment to explain why cfs_rqs that are not affected by throttle have to be added to leaf cfs_rq list in propagate_entity_cfs_rq() per my understanding of commit 0258bdfaff5b ("sched/fair: Fix unfairness caused by missing load decay"). Signed-off-by: Aaron Lu <ziqianlu@bytedance.com> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Reviewed-by: Chengming Zhou <chengming.zhou@linux.dev> Conflicts: kernel/sched/fair.c [Context differences.] 
Signed-off-by: Wang Tao <wangtao554@huawei.com> --- kernel/sched/fair.c | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e07209179147..8e4f20f95e3b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5878,6 +5878,11 @@ static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) return cfs_bandwidth_used() && cfs_rq->throttled; } +static inline bool cfs_rq_pelt_clock_throttled(struct cfs_rq *cfs_rq) +{ + return cfs_bandwidth_used() && cfs_rq->pelt_clock_throttled; +} + /* check whether cfs_rq, or any parent, is throttled */ static inline int throttled_hierarchy(struct cfs_rq *cfs_rq) { @@ -6956,6 +6961,11 @@ static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) return 0; } +static inline bool cfs_rq_pelt_clock_throttled(struct cfs_rq *cfs_rq) +{ + return false; +} + static inline int throttled_hierarchy(struct cfs_rq *cfs_rq) { return 0; @@ -15104,10 +15114,13 @@ static void propagate_entity_cfs_rq(struct sched_entity *se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); - if (cfs_rq_throttled(cfs_rq)) - return; - - if (!throttled_hierarchy(cfs_rq)) + /* + * If a task gets attached to this cfs_rq and before being queued, + * it gets migrated to another CPU due to reasons like affinity + * change, make sure this cfs_rq stays on leaf cfs_rq list to have + * that removed load decayed or it can cause faireness problem. + */ + if (!cfs_rq_pelt_clock_throttled(cfs_rq)) list_add_leaf_cfs_rq(cfs_rq); /* Start to propagate at parent */ @@ -15118,10 +15131,7 @@ static void propagate_entity_cfs_rq(struct sched_entity *se) update_load_avg(cfs_rq, se, UPDATE_TG); - if (cfs_rq_throttled(cfs_rq)) - break; - - if (!throttled_hierarchy(cfs_rq)) + if (!cfs_rq_pelt_clock_throttled(cfs_rq)) list_add_leaf_cfs_rq(cfs_rq); } } -- 2.18.0
From: Aaron Lu <ziqianlu@bytedance.com> mainline inclusion from mainline-v6.18-rc1 commit fcd394866e3db344cbe0bb485d7e3f741ac07245 category: feature bugzilla: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i... -------------------------------- With task based throttle model, tasks in a throttled hierarchy are allowed to continue to run if they are running in kernel mode. For this reason, PELT clock is not stopped for these cfs_rqs in throttled hierarchy when they still have tasks running or queued. Since PELT clock is not stopped, whether to allow update_cfs_group() doing its job for cfs_rqs which are in throttled hierarchy but still have tasks running/queued is a question. The good side is, continue to run update_cfs_group() can get these cfs_rq entities with an up2date weight and that up2date weight can be useful to derive an accurate load for the CPU as well as ensure fairness if multiple tasks of different cgroups are running on the same CPU. OTOH, as Benjamin Segall pointed: when unthrottle comes around the most likely correct distribution is the distribution we had at the time of throttle. In reality, either way may not matter that much if tasks in throttled hierarchy don't run in kernel mode for too long. But in case that happens, let these cfs_rq entities have an up2date weight seems a good thing to do. Signed-off-by: Aaron Lu <ziqianlu@bytedance.com> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Conflicts: kernel/sched/fair.c [Context differences.] 
Signed-off-by: Wang Tao <wangtao554@huawei.com> --- kernel/sched/fair.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8e4f20f95e3b..dc03e016902f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4112,9 +4112,6 @@ static void update_cfs_group(struct sched_entity *se) if (!gcfs_rq) return; - if (throttled_hierarchy(gcfs_rq)) - return; - #ifndef CONFIG_SMP shares = READ_ONCE(gcfs_rq->tg->shares); #else -- 2.18.0
From: Aaron Lu <ziqianlu@bytedance.com> mainline inclusion from mainline-v6.18-rc1 commit 0d4eaf8caf8cd633b23e949e2996b420052c2d45 category: feature bugzilla: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i... -------------------------------- When doing load balance and the target cfs_rq is in throttled hierarchy, whether to allow balancing there is a question. The good side to allow balancing is: if the target CPU is idle or less loaded and the being balanced task is holding some kernel resources, then it seems a good idea to balance the task there and let the task get the CPU earlier and release kernel resources sooner. The bad part is, if the task is not holding any kernel resources, then the balance seems not that useful. While theoretically it's debatable, a performance test[0] which involves 200 cgroups and each cgroup runs hackbench(20 sender, 20 receiver) in pipe mode showed a performance degradation on AMD Genoa when allowing load balance to throttled cfs_rq. Analysis[1] showed hackbench doesn't like task migration across LLC boundary. For this reason, add a check in can_migrate_task() to forbid balancing to a cfs_rq that is in throttled hierarchy. This reduced task migration a lot and performance restored. [0]: https://lore.kernel.org/lkml/20250822110701.GB289@bytedance/ [1]: https://lore.kernel.org/lkml/20250903101102.GB42@bytedance/ Signed-off-by: Aaron Lu <ziqianlu@bytedance.com> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Reviewed-by: K Prateek Nayak <kprateek.nayak@amd.com> Conflicts: kernel/sched/fair.c [Context differences.] 
Signed-off-by: Wang Tao <wangtao554@huawei.com> --- kernel/sched/fair.c | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index dc03e016902f..180a141f3e31 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5886,6 +5886,11 @@ static inline int throttled_hierarchy(struct cfs_rq *cfs_rq) return cfs_bandwidth_used() && cfs_rq->throttle_count; } +static inline int lb_throttled_hierarchy(struct task_struct *p, int dst_cpu) +{ + return throttled_hierarchy(task_group(p)->cfs_rq[dst_cpu]); +} + static inline bool task_is_throttled(struct task_struct *p) { return cfs_bandwidth_used() && p->throttled; @@ -6968,6 +6973,11 @@ static inline int throttled_hierarchy(struct cfs_rq *cfs_rq) return 0; } +static inline int lb_throttled_hierarchy(struct task_struct *p, int dst_cpu) +{ + return 0; +} + #ifdef CONFIG_FAIR_GROUP_SCHED void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b, struct cfs_bandwidth *parent) {} static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} @@ -10914,10 +10924,13 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) /* * We do not migrate tasks that are: - * 1) cannot be migrated to this CPU due to cpus_ptr, or - * 2) running (obviously), or - * 3) are cache-hot on their current CPU. + * 1) target cfs_rq is in throttled hierarchy, or + * 2) cannot be migrated to this CPU due to cpus_ptr, or + * 3) running (obviously), or + * 4) are cache-hot on their current CPU. */ + if (lb_throttled_hierarchy(p, env->dst_cpu)) + return 0; #ifdef CONFIG_SCHED_SOFT_DOMAIN /* Do not migrate soft domain tasks between numa. */ @@ -11022,6 +11035,9 @@ can_migrate_task_llc(struct task_struct *p, struct rq *rq, struct rq *dst_rq) lockdep_assert_rq_held(rq); + if (lb_throttled_hierarchy(p, dst_cpu)) + return false; + if (!cpumask_test_cpu(dst_cpu, p->cpus_ptr)) { schedstat_inc(p->stats.nr_failed_migrations_affine); return false; -- 2.18.0
From: K Prateek Nayak <kprateek.nayak@amd.com> mainline inclusion from mainline-v6.18-rc3 commit 0e4a169d1a2b630c607416d9e3739d80e176ed67 category: bugfix bugzilla: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i... -------------------------------- Matteo reported hitting the assert_list_leaf_cfs_rq() warning from enqueue_task_fair() post commit fe8d238e646e ("sched/fair: Propagate load for throttled cfs_rq") which transitioned to using cfs_rq_pelt_clock_throttled() check for leaf cfs_rq insertions in propagate_entity_cfs_rq(). The "cfs_rq->pelt_clock_throttled" flag is used to indicate if the hierarchy has its PELT frozen. If a cfs_rq's PELT is marked frozen, all its descendants should have their PELT frozen too or weird things can happen as a result of children accumulating PELT signals when the parents have their PELT clock stopped. Another side effect of this is the loss of integrity of the leaf cfs_rq list. As debugged by Aaron, consider the following hierarchy: root(#) / \ A(#) B(*) | C <--- new cgroup | D <--- new cgroup # - Already on leaf cfs_rq list * - Throttled with PELT frozen The newly created cgroups don't have their "pelt_clock_throttled" signal synced with cgroup B. Next, the following series of events occur: 1. online_fair_sched_group() for cgroup D will call propagate_entity_cfs_rq(). (Same can happen if a throttled task is moved to cgroup C and enqueue_task_fair() returns early.) propagate_entity_cfs_rq() adds the cfs_rq of cgroup C to "rq->tmp_alone_branch" since its PELT clock is not marked throttled and cfs_rq of cgroup B is not on the list. cfs_rq of cgroup B is skipped since its PELT is throttled. root cfs_rq already exists on cfs_rq leading to list_add_leaf_cfs_rq() returning early. The cfs_rq of cgroup C is left dangling on the "rq->tmp_alone_branch". 2. A new task wakes up on cgroup A. 
Since the whole hierarchy is already on the leaf cfs_rq list, list_add_leaf_cfs_rq() keeps returning early without any modifications to "rq->tmp_alone_branch". The final assert_list_leaf_cfs_rq() in enqueue_task_fair() sees the dangling reference to cgroup C's cfs_rq in "rq->tmp_alone_branch". !!! Splat !!! Syncing the "pelt_clock_throttled" indicator with parent cfs_rq is not enough since the new cfs_rq is not yet enqueued on the hierarchy. A dequeue on other subtree on the throttled hierarchy can freeze the PELT clock for the parent hierarchy without setting the indicators for this newly added cfs_rq which was never enqueued. Since there are no tasks on the new hierarchy, start a cfs_rq on a throttled hierarchy with its PELT clock throttled. The first enqueue, or the distribution (whichever happens first) will unfreeze the PELT clock and queue the cfs_rq on the leaf cfs_rq list. While at it, add an assert_list_leaf_cfs_rq() in propagate_entity_cfs_rq() to catch such cases in the future. Closes: https://lore.kernel.org/lkml/58a587d694f33c2ea487c700b0d046fa@codethink.co.u... Fixes: e1fad12dcb66 ("sched/fair: Switch to task based throttle model") Reported-by: Matteo Martelli <matteo.martelli@codethink.co.uk> Suggested-by: Aaron Lu <ziqianlu@bytedance.com> Signed-off-by: K Prateek Nayak <kprateek.nayak@amd.com> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Reviewed-by: Aaron Lu <ziqianlu@bytedance.com> Tested-by: Aaron Lu <ziqianlu@bytedance.com> Tested-by: Matteo Martelli <matteo.martelli@codethink.co.uk> Link: https://patch.msgid.link/20251021053522.37583-1-kprateek.nayak@amd.com Conflicts: kernel/sched/fair.c [Context differences.] 
Signed-off-by: Wang Tao <wangtao554@huawei.com> --- kernel/sched/fair.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 180a141f3e31..1317ab8cf063 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6652,6 +6652,16 @@ static void sync_throttle(struct task_group *tg, int cpu) cfs_rq->throttle_count = pcfs_rq->throttle_count; cfs_rq->throttled_clock_pelt = rq_clock_pelt(cpu_rq(cpu)); + + /* + * It is not enough to sync the "pelt_clock_throttled" indicator + * with the parent cfs_rq when the hierarchy is not queued. + * Always join a throttled hierarchy with PELT clock throttled + * and leaf it to the first enqueue, or distribution to + * unthrottle the PELT clock. + */ + if (cfs_rq->throttle_count) + cfs_rq->pelt_clock_throttled = 1; } /* conditionally throttle active cfs_rq's from put_prev_entity() */ @@ -15147,6 +15157,8 @@ static void propagate_entity_cfs_rq(struct sched_entity *se) if (!cfs_rq_pelt_clock_throttled(cfs_rq)) list_add_leaf_cfs_rq(cfs_rq); } + + assert_list_leaf_cfs_rq(rq_of(cfs_rq)); } #else static void propagate_entity_cfs_rq(struct sched_entity *se) { } -- 2.18.0
From: Aaron Lu <ziqianlu@bytedance.com> mainline inclusion from mainline-v6.18-rc5 commit 956dfda6a70885f18c0f8236a461aa2bc4f556ad category: bugfix bugzilla: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i... -------------------------------- When a cfs_rq is to be throttled, its limbo list should be empty and that's why there is a warn in tg_throttle_down() for non empty cfs_rq->throttled_limbo_list. When running a test with the following hierarchy: root / \ A* ... / | \ ... B / \ C* where both A and C have quota settings, that warn on non empty limbo list is triggered for a cfs_rq of C, let's call it cfs_rq_c(and ignore the cpu part of the cfs_rq for the sake of simpler representation). Debug showed it happened like this: Task group C is created and quota is set, so in tg_set_cfs_bandwidth(), cfs_rq_c is initialized with runtime_enabled set, runtime_remaining equals to 0 and *unthrottled*. Before any tasks are enqueued to cfs_rq_c, *multiple* throttled tasks can migrate to cfs_rq_c (e.g., due to task group changes). When enqueue_task_fair(cfs_rq_c, throttled_task) is called and cfs_rq_c is in a throttled hierarchy (e.g., A is throttled), these throttled tasks are directly placed into cfs_rq_c's limbo list by enqueue_throttled_task(). Later, when A is unthrottled, tg_unthrottle_up(cfs_rq_c) enqueues these tasks. The first enqueue triggers check_enqueue_throttle(), and with zero runtime_remaining, cfs_rq_c can be throttled in throttle_cfs_rq() if it can't get more runtime and enters tg_throttle_down(), where the warning is hit due to remaining tasks in the limbo list. I think it's a chaos to trigger throttle on unthrottle path, the status of a being unthrottled cfs_rq can be in a mixed state in the end, so fix this by granting 1ns to cfs_rq in tg_set_cfs_bandwidth(). This ensures cfs_rq_c has a positive runtime_remaining when initialized as unthrottled and cannot enter tg_unthrottle_up() with zero runtime_remaining. 
Also, update outdated comments in tg_throttle_down() since unthrottle_cfs_rq() is no longer called with zero runtime_remaining. While at it, remove a redundant assignment to se in tg_throttle_down(). Fixes: e1fad12dcb66 ("sched/fair: Switch to task based throttle model") Reviewed-By: Benjamin Segall <bsegall@google.com> Suggested-by: Benjamin Segall <bsegall@google.com> Signed-off-by: Aaron Lu <ziqianlu@bytedance.com> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Reviewed-by: K Prateek Nayak <kprateek.nayak@amd.com> Tested-by: K Prateek Nayak <kprateek.nayak@amd.com> Tested-by: Hao Jia <jiahao1@lixiang.com> Link: https://patch.msgid.link/20251030032755.560-1-ziqianlu@bytedance.com Conflicts: kernel/sched/core.c kernel/sched/fair.c [Context differences.] Signed-off-by: Wang Tao <wangtao554@huawei.com> --- kernel/sched/core.c | 2 +- kernel/sched/fair.c | 19 ++++++------------- 2 files changed, 7 insertions(+), 14 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 41744f1640e3..400f6b3b6685 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -11190,7 +11190,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota, guard(rq_lock_irq)(rq); cfs_rq->runtime_enabled = runtime_enabled; - cfs_rq->runtime_remaining = 0; + cfs_rq->runtime_remaining = 1; if (cfs_rq->throttled) unthrottle_cfs_rq(cfs_rq); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1317ab8cf063..60f7a99eda64 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6167,9 +6167,6 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) */ cfs_rq->throttled = 1; SCHED_WARN_ON(cfs_rq->throttled_clock); -<<<<<<< HEAD - if (cfs_rq->nr_running) - cfs_rq->throttled_clock = rq_clock(rq); #ifdef CONFIG_SCHED_SOFT_QUOTA if (cfs_rq->tg->soft_quota == 1) { @@ -6178,8 +6175,6 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) } #endif -======= ->>>>>>> f5bbe97ae31f... 
[Backport] sched/fair: Task based throttle time accounting return true; } @@ -6190,19 +6185,17 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)]; /* - * It's possible we are called with !runtime_remaining due to things - * like user changed quota setting(see tg_set_cfs_bandwidth()) or async - * unthrottled us with a positive runtime_remaining but other still - * running entities consumed those runtime before we reached here. + * It's possible we are called with runtime_remaining < 0 due to things + * like async unthrottled us with a positive runtime_remaining but other + * still running entities consumed those runtime before we reached here. * - * Anyway, we can't unthrottle this cfs_rq without any runtime remaining - * because any enqueue in tg_unthrottle_up() will immediately trigger a - * throttle, which is not supposed to happen on unthrottle path. + * We can't unthrottle this cfs_rq without any runtime remaining because + * any enqueue in tg_unthrottle_up() will immediately trigger a throttle, + * which is not supposed to happen on unthrottle path. */ if (cfs_rq->runtime_enabled && cfs_rq->runtime_remaining <= 0) return; - se = cfs_rq->tg->se[cpu_of(rq)]; #ifdef CONFIG_SCHED_SOFT_QUOTA list_del_init(&cfs_rq->soft_quota_throttled_list); -- 2.18.0
Offering: HULK hulk inclusion category: feature bugzilla: NA -------------------------------- Use tg_qos_throttle_down() and tg_qos_unthrottle_up() for QoS throttle to avoid conflicts with CFS bandwidth throttle's task-based model. This removes the need for separate PELT clock handling logic while keeping the list_add/list_del operations. Signed-off-by: Zhang Qiao <zhangqiao22@huawei.com> --- kernel/sched/fair.c | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 60f7a99eda64..2e4b5d666507 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9722,6 +9722,32 @@ static inline bool is_offline_task(struct task_struct *p) static void start_qos_hrtimer(int cpu); +static int tg_qos_throttle_down(struct task_group *tg, void *data) +{ + struct rq *rq = data; + struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; + + if (!cfs_rq->throttle_count) + list_del_leaf_cfs_rq(cfs_rq); + + cfs_rq->throttle_count++; + + return 0; +} + +static int tg_qos_unthrottle_up(struct task_group *tg, void *data) +{ + struct rq *rq = data; + struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; + + cfs_rq->throttle_count--; + + if (!cfs_rq->throttle_count && !cfs_rq_is_decayed(cfs_rq)) + list_add_leaf_cfs_rq(cfs_rq); + + return 0; +} + static void throttle_qos_cfs_rq(struct cfs_rq *cfs_rq) { struct rq *rq = rq_of(cfs_rq); @@ -9736,7 +9762,7 @@ static void throttle_qos_cfs_rq(struct cfs_rq *cfs_rq) /* freeze hierarchy runnable averages while throttled */ rcu_read_lock(); - walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq); + walk_tg_tree_from(cfs_rq->tg, tg_qos_throttle_down, tg_nop, (void *)rq); rcu_read_unlock(); task_delta = cfs_rq->h_nr_running; @@ -9821,7 +9847,7 @@ static void unthrottle_qos_cfs_rq(struct cfs_rq *cfs_rq) /* update hierarchical throttle state */ rcu_read_lock(); - walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq); + walk_tg_tree_from(cfs_rq->tg, 
tg_nop, tg_qos_unthrottle_up, (void *)rq); rcu_read_unlock(); if (!cfs_rq->load.weight) { -- 2.18.0
From: Aaron Lu <ziqianlu@bytedance.com> mainline inclusion from mainline-v6.18-rc1 commit 253b3f587241967a97a971e23b1e2a7d74244fad category: feature bugzilla: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i... -------------------------------- With the introduction of task based throttle model, task in a throttled hierarchy is allowed to continue to run till it gets throttled on its ret2user path. For this reason, remove those throttled_hierarchy() checks in the following functions so that those tasks can get their turn as normal tasks: dequeue_entities(), check_preempt_wakeup_fair() and yield_to_task_fair(). The benefit of doing it this way is: if those tasks get the chance to run earlier and if they hold any kernel resources, they can release those resources earlier. The downside is, if they don't hold any kernel resources, all they can do is to throttle themselves on their way back to user space so the favor to let them run seems not that useful and for check_preempt_wakeup_fair(), that favor may be bad for curr. K Prateek Nayak pointed out prio_changed_fair() can send a throttled task to check_preempt_wakeup_fair(), further tests showed the affinity change path from move_queued_task() can also send a throttled task to check_preempt_wakeup_fair(), that's why the check of task_is_throttled() in that function. Signed-off-by: Aaron Lu <ziqianlu@bytedance.com> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Conflicts: kernel/sched/fair.c [Context differences.]
Signed-off-by: Wang Tao <wangtao554@huawei.com> --- kernel/sched/fair.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 2e4b5d666507..a7d07ec86cb3 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7780,7 +7780,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) * Bias pick_next to pick a task from this cfs_rq, as * p is sleeping when it is within its sched_slice. */ - if (task_sleep && se && !throttled_hierarchy(cfs_rq)) + if (task_sleep && se) set_next_buddy(se); break; } @@ -9635,7 +9635,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ * lead to a throttle). This both saves work and prevents false * next-buddy nomination below. */ - if (unlikely(throttled_hierarchy(cfs_rq_of(pse)))) + if (task_is_throttled(p)) return; if (sched_feat(NEXT_BUDDY) && !(wake_flags & WF_FORK)) { @@ -10594,8 +10594,8 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p) { struct sched_entity *se = &p->se; - /* throttled hierarchies are not runnable */ - if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se))) + /* !se->on_rq also covers throttled task */ + if (!se->on_rq) return false; /* Tell the scheduler that we'd really like pse to run next. */ -- 2.18.0
Offering: HULK hulk inclusion category: feature bugzilla: NA -------------------------------- Use separate qos_throttled and qos_throttle_count for QoS throttle to avoid conflicts with CFS bandwidth throttle's task-based model. This removes the need for separate PELT clock handling logic while keeping the list_add/list_del operations. Signed-off-by: Zhang Qiao <zhangqiao22@huawei.com> --- kernel/sched/fair.c | 118 ++++++++++++++++++++++++++----------------- kernel/sched/sched.h | 5 ++ 2 files changed, 78 insertions(+), 45 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a7d07ec86cb3..603c67be3b5b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -163,6 +163,20 @@ static int __unthrottle_qos_cfs_rqs(int cpu); static int unthrottle_qos_cfs_rqs(int cpu); static bool qos_smt_expelled(int this_cpu); static bool is_offline_task(struct task_struct *p); +static inline int qos_cfs_rq_throttled(struct cfs_rq *cfs_rq); +static inline int qos_throttled_hierarchy(struct cfs_rq *cfs_rq); + +#else // !CONFIG_QOS_SCHED + +static inline int qos_cfs_rq_throttled(struct cfs_rq *cfs_rq) +{ + return false; +} + +static inline int qos_throttled_hierarchy(struct cfs_rq *cfs_rq) +{ + return false; +} #endif #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER @@ -5506,20 +5520,22 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if (cfs_rq->nr_running == 1) { check_enqueue_throttle(cfs_rq); - list_add_leaf_cfs_rq(cfs_rq); -#ifdef CONFIG_QOS_SCHED - if (cfs_rq->throttled != QOS_THROTTLED) { -#endif + + /* + * Skip adding to leaf list when qos throttled. The cfs_rq is + * not in the leaf list while throttled, and will be added + * back when unthrottled via tg_qos_unthrottle_up(). 
+ */ + if (!qos_throttled_hierarchy(cfs_rq)) + list_add_leaf_cfs_rq(cfs_rq); + #ifdef CONFIG_CFS_BANDWIDTH - if (cfs_rq->pelt_clock_throttled) { - struct rq *rq = rq_of(cfs_rq); + if (cfs_rq->pelt_clock_throttled) { + struct rq *rq = rq_of(cfs_rq); - cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) - - cfs_rq->throttled_clock_pelt; - cfs_rq->pelt_clock_throttled = 0; - } -#endif -#ifdef CONFIG_QOS_SCHED + cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) - + cfs_rq->throttled_clock_pelt; + cfs_rq->pelt_clock_throttled = 0; } #endif } @@ -6201,14 +6217,6 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) list_del_init(&cfs_rq->soft_quota_throttled_list); #endif -#ifdef CONFIG_QOS_SCHED - /* - * if this cfs_rq throttled by qos, not need unthrottle it. - */ - if (cfs_rq->throttled == QOS_THROTTLED) - return; -#endif - cfs_rq->throttled = 0; update_rq_clock(rq); @@ -6373,7 +6381,7 @@ static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b) * so no longer allocate time to cfs_rq in this scenario. 
*/ #ifdef CONFIG_QOS_SCHED - if (cfs_rq->throttled == QOS_THROTTLED && + if (qos_cfs_rq_throttled(cfs_rq) && cfs_rq->runtime_remaining > 0) goto next; #endif @@ -7682,6 +7690,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_is_idle(cfs_rq)) idle_h_nr_running = 1; + /* end evaluation on encountering a throttled cfs_rq */ + if (qos_cfs_rq_throttled(cfs_rq)) + goto enqueue_throttle; flags = ENQUEUE_WAKEUP; } @@ -7700,6 +7711,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) #endif if (cfs_rq_is_idle(cfs_rq)) idle_h_nr_running = 1; + /* end evaluation on encountering a throttled cfs_rq */ + if (qos_cfs_rq_throttled(cfs_rq)) + goto enqueue_throttle; } /* At this point se is NULL and we are at root level*/ @@ -7724,6 +7738,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (!task_new) check_update_overutilized_status(rq); +enqueue_throttle: assert_list_leaf_cfs_rq(rq); hrtick_update(rq); @@ -7769,6 +7784,10 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_is_idle(cfs_rq)) idle_h_nr_running = 1; + /* end evaluation on encountering a throttled cfs_rq */ + if (qos_cfs_rq_throttled(cfs_rq)) + goto dequeue_throttle; + if (throttled_hierarchy(cfs_rq) && task_throttled) record_throttle_clock(cfs_rq); @@ -7802,6 +7821,10 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_is_idle(cfs_rq)) idle_h_nr_running = 1; + /* end evaluation on encountering a throttled cfs_rq */ + if (qos_cfs_rq_throttled(cfs_rq)) + goto dequeue_throttle; + if (throttled_hierarchy(cfs_rq) && task_throttled) record_throttle_clock(cfs_rq); } @@ -7815,6 +7838,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (unlikely(!was_sched_idle && sched_idle_rq(rq))) rq->next_balance = jiffies; +dequeue_throttle: util_est_update(&rq->cfs, p, task_sleep); hrtick_update(rq); } @@ -9727,10 +9751,10 @@ static int 
tg_qos_throttle_down(struct task_group *tg, void *data) struct rq *rq = data; struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; - if (!cfs_rq->throttle_count) + if (!cfs_rq->qos_throttle_count) list_del_leaf_cfs_rq(cfs_rq); - cfs_rq->throttle_count++; + cfs_rq->qos_throttle_count++; return 0; } @@ -9740,14 +9764,23 @@ static int tg_qos_unthrottle_up(struct task_group *tg, void *data) struct rq *rq = data; struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; - cfs_rq->throttle_count--; - - if (!cfs_rq->throttle_count && !cfs_rq_is_decayed(cfs_rq)) + cfs_rq->qos_throttle_count--; + if (!cfs_rq->qos_throttle_count && !cfs_rq_is_decayed(cfs_rq)) list_add_leaf_cfs_rq(cfs_rq); return 0; } +static inline int qos_cfs_rq_throttled(struct cfs_rq *cfs_rq) +{ + return cfs_rq->qos_throttled; +} + +static inline int qos_throttled_hierarchy(struct cfs_rq *cfs_rq) +{ + return cfs_rq->qos_throttle_count; +} + static void throttle_qos_cfs_rq(struct cfs_rq *cfs_rq) { struct rq *rq = rq_of(cfs_rq); @@ -9819,7 +9852,7 @@ static void throttle_qos_cfs_rq(struct cfs_rq *cfs_rq) if (!qos_timer_is_activated(cpu_of(rq))) start_qos_hrtimer(cpu_of(rq)); - cfs_rq->throttled = QOS_THROTTLED; + cfs_rq->qos_throttled = 1; list_add(&cfs_rq->qos_throttled_list, &per_cpu(qos_throttled_cfs_rq, cpu_of(rq))); @@ -9837,10 +9870,7 @@ static void unthrottle_qos_cfs_rq(struct cfs_rq *cfs_rq) se = cfs_rq->tg->se[cpu_of(rq)]; - if (cfs_rq->throttled != QOS_THROTTLED) - return; - - cfs_rq->throttled = 0; + cfs_rq->qos_throttled = 0; update_rq_clock(rq); list_del_init(&cfs_rq->qos_throttled_list); @@ -9882,7 +9912,7 @@ static void unthrottle_qos_cfs_rq(struct cfs_rq *cfs_rq) cfs_rq->qos_idle_h_nr_running += qos_idle_delta; #endif - if (cfs_rq_throttled(cfs_rq)) + if (qos_cfs_rq_throttled(cfs_rq)) goto unthrottle_throttle; } @@ -9899,7 +9929,7 @@ static void unthrottle_qos_cfs_rq(struct cfs_rq *cfs_rq) #endif /* end evaluation on encountering a throttled cfs_rq */ - if (cfs_rq_throttled(cfs_rq)) + if 
(qos_cfs_rq_throttled(cfs_rq)) goto unthrottle_throttle; } @@ -9915,30 +9945,26 @@ static void unthrottle_qos_cfs_rq(struct cfs_rq *cfs_rq) static int __unthrottle_qos_cfs_rqs(int cpu) { struct cfs_rq *cfs_rq, *tmp_rq; - int cfs_bandwidth_throttle = 0; list_for_each_entry_safe(cfs_rq, tmp_rq, &per_cpu(qos_throttled_cfs_rq, cpu), qos_throttled_list) { - if (cfs_rq_throttled(cfs_rq)) { + if (qos_cfs_rq_throttled(cfs_rq)) { unthrottle_qos_cfs_rq(cfs_rq); } - - if (throttled_hierarchy(cfs_rq)) - cfs_bandwidth_throttle = 1; } - return cfs_bandwidth_throttle; + return 0; } static int unthrottle_qos_cfs_rqs(int cpu) { - int throttled = __unthrottle_qos_cfs_rqs(cpu); + __unthrottle_qos_cfs_rqs(cpu); /* * We should not cancel the timer if there is still a cfs_rq * throttling after __unthrottle_qos_cfs_rqs(). */ - if (qos_timer_is_activated(cpu) && !(qos_smt_expelled(cpu) || throttled)) + if (qos_timer_is_activated(cpu) && !qos_smt_expelled(cpu)) cancel_qos_timer(cpu); return cpu_rq(cpu)->cfs.h_nr_running; @@ -9949,6 +9975,9 @@ static bool check_qos_cfs_rq(struct cfs_rq *cfs_rq) if (unlikely(__this_cpu_read(qos_cpu_overload))) return false; + if (cfs_rq && throttled_hierarchy(cfs_rq)) + return false; + if (unlikely(cfs_rq && is_offline_level(cfs_rq->tg->qos_level) && !sched_idle_cpu(smp_processor_id()) && cfs_rq->h_nr_running == cfs_rq->idle_h_nr_running)) { @@ -9969,7 +9998,7 @@ static inline void unthrottle_qos_sched_group(struct cfs_rq *cfs_rq) struct rq_flags rf; rq_lock_irqsave(rq, &rf); - if (is_offline_level(cfs_rq->tg->qos_level) && cfs_rq_throttled(cfs_rq)) + if (is_offline_level(cfs_rq->tg->qos_level) && qos_cfs_rq_throttled(cfs_rq)) unthrottle_qos_cfs_rq(cfs_rq); rq_unlock_irqrestore(rq, &rf); } @@ -10085,7 +10114,6 @@ static bool qos_smt_expelled(int this_cpu) return false; } #endif - #endif #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER @@ -15162,7 +15190,7 @@ static void propagate_entity_cfs_rq(struct sched_entity *se) * change, make sure this cfs_rq stays on leaf cfs_rq 
list to have * that removed load decayed or it can cause faireness problem. */ - if (!cfs_rq_pelt_clock_throttled(cfs_rq)) + if (!cfs_rq_pelt_clock_throttled(cfs_rq) && !qos_throttled_hierarchy(cfs_rq)) list_add_leaf_cfs_rq(cfs_rq); /* Start to propagate at parent */ @@ -15173,7 +15201,7 @@ static void propagate_entity_cfs_rq(struct sched_entity *se) update_load_avg(cfs_rq, se, UPDATE_TG); - if (!cfs_rq_pelt_clock_throttled(cfs_rq)) + if (!cfs_rq_pelt_clock_throttled(cfs_rq) && !qos_throttled_hierarchy(cfs_rq)) list_add_leaf_cfs_rq(cfs_rq); } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 850baeab1085..1e1fcfd37b02 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -794,7 +794,12 @@ struct cfs_rq { #endif KABI_REPLACE(_KABI_RESERVE(5); _KABI_RESERVE(6), struct list_head throttled_limbo_list) +#ifdef CONFIG_QOS_SCHED + KABI_USE2(7, int qos_throttled, int qos_throttle_count) +#else KABI_RESERVE(7) +#endif + KABI_RESERVE(8) }; -- 2.18.0
participants (1)
-
Zhang Qiao