Abel Wu (2):
  sched/eevdf: Sort the rbtree by virtual deadline
  sched/eevdf: O(1) fastpath for task selection

K Prateek Nayak (1):
  sched/eevdf: Skip eligibility check for current entity during wakeup preemption

 include/linux/sched.h   |   2 +-
 kernel/sched/debug.c    |  11 ++-
 kernel/sched/fair.c     | 202 +++++++++++++++++++--------------------
 kernel/sched/features.h |   1 +
 kernel/sched/sched.h    |   1 +
 5 files changed, 107 insertions(+), 110 deletions(-)
From: Abel Wu <wuyun.abel@bytedance.com>

mainline inclusion
from mainline-v6.8
commit 2227a957e1d5b1941be4e4207879ec74f4bb37f8
category: performance
bugzilla: https://gitee.com/openeuler/kernel/issues/I9EHKI
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git/commit/?id=2227a...
--------------------------------
Sort the task timeline by virtual deadline and keep the min_vruntime in the augmented tree, so we can avoid doubling the worst-case cost and make full use of the cached leftmost node to enable O(1) fastpath picking in the next patch.
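To make the structure concrete, here is a small illustrative sketch (toy types and an unweighted eligibility test; the names toy_se and toy_pick are made up for illustration and this is not the kernel code) of the invariant the augmented tree maintains and the pruning it enables:

#include <stddef.h>
#include <stdint.h>

/* Toy node: tree ordered by deadline, augmented with the subtree-min vruntime. */
struct toy_se {
	uint64_t	vruntime;
	uint64_t	deadline;
	uint64_t	min_vruntime;	/* min vruntime over this subtree */
	struct toy_se	*left, *right;	/* children, ordered by deadline */
};

/* Re-establish se->min_vruntime = min(se->vruntime, children's min_vruntime). */
static void toy_min_vruntime_update(struct toy_se *se)
{
	uint64_t min = se->vruntime;

	if (se->left && se->left->min_vruntime < min)
		min = se->left->min_vruntime;
	if (se->right && se->right->min_vruntime < min)
		min = se->right->min_vruntime;
	se->min_vruntime = min;
}

/* Earliest-deadline entity whose vruntime is eligible (<= avg), or NULL. */
static struct toy_se *toy_pick(struct toy_se *node, uint64_t avg_vruntime)
{
	while (node) {
		/* Left subtree has earlier deadlines; descend if it can hold an eligible entity. */
		if (node->left && node->left->min_vruntime <= avg_vruntime) {
			node = node->left;
			continue;
		}
		/* No eligible entity on the left; this node is next in deadline order. */
		if (node->vruntime <= avg_vruntime)
			return node;
		node = node->right;
	}
	return NULL;
}

Every probe either returns or descends one level, so the search stays O(log n); the min_vruntime maintenance is what the RB_DECLARE_CALLBACKS()/rb_add_augmented_cached() machinery in the patch below does on the real rbtree, with weights and wrap-safe arithmetic kept in play.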
Signed-off-by: Abel Wu <wuyun.abel@bytedance.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20231115033647.80785-3-wuyun.abel@bytedance.com
---
 include/linux/sched.h |   2 +-
 kernel/sched/debug.c  |  11 ++-
 kernel/sched/fair.c   | 168 +++++++++++++++++------------------------
 kernel/sched/sched.h  |   1 +
 4 files changed, 77 insertions(+), 105 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index c3452e1830ca..b65d74c5e765 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -572,7 +572,7 @@ struct sched_entity {
 	struct load_weight		load;
 	struct rb_node			run_node;
 	u64				deadline;
-	u64				min_deadline;
+	u64				min_vruntime;
 
 	struct list_head		group_node;
 	unsigned int			on_rq;
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index ebec719e204c..8b3063398bbd 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -628,8 +628,8 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
 
 void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 {
-	s64 left_vruntime = -1, min_vruntime, right_vruntime = -1, spread;
-	struct sched_entity *last, *first;
+	s64 left_vruntime = -1, min_vruntime, right_vruntime = -1, left_deadline = -1, spread;
+	struct sched_entity *last, *first, *root;
 	struct rq *rq = cpu_rq(cpu);
 	unsigned long flags;
 
@@ -644,15 +644,20 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 			SPLIT_NS(cfs_rq->exec_clock));
 
 	raw_spin_rq_lock_irqsave(rq, flags);
+	root = __pick_root_entity(cfs_rq);
+	if (root)
+		left_vruntime = root->min_vruntime;
 	first = __pick_first_entity(cfs_rq);
 	if (first)
-		left_vruntime = first->vruntime;
+		left_deadline = first->deadline;
 	last = __pick_last_entity(cfs_rq);
 	if (last)
 		right_vruntime = last->vruntime;
 	min_vruntime = cfs_rq->min_vruntime;
 	raw_spin_rq_unlock_irqrestore(rq, flags);
 
+	SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "left_deadline",
+			SPLIT_NS(left_deadline));
 	SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "left_vruntime",
 			SPLIT_NS(left_vruntime));
 	SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "min_vruntime",
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7079251d1178..c0a53774eefb 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -708,7 +708,11 @@ static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
 static inline bool entity_before(const struct sched_entity *a,
 				 const struct sched_entity *b)
 {
-	return (s64)(a->vruntime - b->vruntime) < 0;
+	/*
+	 * Tiebreak on vruntime seems unnecessary since it can
+	 * hardly happen.
+	 */
+	return (s64)(a->deadline - b->deadline) < 0;
 }
 
 static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -877,7 +881,7 @@ static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
  * Note: using 'avg_vruntime() > se->vruntime' is inacurate due
  * to the loss in precision caused by the division.
  */
-int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se)
+static int vruntime_eligible(struct cfs_rq *cfs_rq, u64 vruntime)
 {
 	struct sched_entity *curr = cfs_rq->curr;
 	s64 avg = cfs_rq->avg_vruntime;
@@ -890,7 +894,12 @@ int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se)
 		load += weight;
 	}
 
-	return avg >= entity_key(cfs_rq, se) * load;
+	return avg >= (s64)(vruntime - cfs_rq->min_vruntime) * load;
+}
+
+int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+	return vruntime_eligible(cfs_rq, se->vruntime);
 }
 
 static u64 __update_min_vruntime(struct cfs_rq *cfs_rq, u64 vruntime)
@@ -909,9 +918,8 @@ static u64 __update_min_vruntime(struct cfs_rq *cfs_rq, u64 vruntime)
 
 static void update_min_vruntime(struct cfs_rq *cfs_rq)
 {
-	struct sched_entity *se = __pick_first_entity(cfs_rq);
+	struct sched_entity *se = __pick_root_entity(cfs_rq);
 	struct sched_entity *curr = cfs_rq->curr;
-
 	u64 vruntime = cfs_rq->min_vruntime;
 
 	if (curr) {
@@ -923,9 +931,9 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq)
 
 	if (se) {
 		if (!curr)
-			vruntime = se->vruntime;
+			vruntime = se->min_vruntime;
 		else
-			vruntime = min_vruntime(vruntime, se->vruntime);
+			vruntime = min_vruntime(vruntime, se->min_vruntime);
 	}
 
 	/* ensure we never gain time by being placed backwards. */
@@ -938,34 +946,34 @@ static inline bool __entity_less(struct rb_node *a, const struct rb_node *b)
 	return entity_before(__node_2_se(a), __node_2_se(b));
 }
 
-#define deadline_gt(field, lse, rse) ({ (s64)((lse)->field - (rse)->field) > 0; })
+#define vruntime_gt(field, lse, rse) ({ (s64)((lse)->field - (rse)->field) > 0; })
 
-static inline void __update_min_deadline(struct sched_entity *se, struct rb_node *node)
+static inline void __min_vruntime_update(struct sched_entity *se, struct rb_node *node)
 {
 	if (node) {
 		struct sched_entity *rse = __node_2_se(node);
-		if (deadline_gt(min_deadline, se, rse))
-			se->min_deadline = rse->min_deadline;
+		if (vruntime_gt(min_vruntime, se, rse))
+			se->min_vruntime = rse->min_vruntime;
 	}
 }
 
 /*
- * se->min_deadline = min(se->deadline, left->min_deadline, right->min_deadline)
+ * se->min_vruntime = min(se->vruntime, {left,right}->min_vruntime)
  */
-static inline bool min_deadline_update(struct sched_entity *se, bool exit)
+static inline bool min_vruntime_update(struct sched_entity *se, bool exit)
 {
-	u64 old_min_deadline = se->min_deadline;
+	u64 old_min_vruntime = se->min_vruntime;
 	struct rb_node *node = &se->run_node;
 
-	se->min_deadline = se->deadline;
-	__update_min_deadline(se, node->rb_right);
-	__update_min_deadline(se, node->rb_left);
+	se->min_vruntime = se->vruntime;
+	__min_vruntime_update(se, node->rb_right);
+	__min_vruntime_update(se, node->rb_left);
 
-	return se->min_deadline == old_min_deadline;
+	return se->min_vruntime == old_min_vruntime;
 }
 
-RB_DECLARE_CALLBACKS(static, min_deadline_cb, struct sched_entity,
-		     run_node, min_deadline, min_deadline_update);
+RB_DECLARE_CALLBACKS(static, min_vruntime_cb, struct sched_entity,
+		     run_node, min_vruntime, min_vruntime_update);
 
 /*
  * Enqueue an entity into the rb-tree:
@@ -973,18 +981,28 @@ RB_DECLARE_CALLBACKS(static, min_deadline_cb, struct sched_entity,
 static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	avg_vruntime_add(cfs_rq, se);
-	se->min_deadline = se->deadline;
+	se->min_vruntime = se->vruntime;
 	rb_add_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline,
-				__entity_less, &min_deadline_cb);
+				__entity_less, &min_vruntime_cb);
 }
 
 static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	rb_erase_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline,
-				  &min_deadline_cb);
+				  &min_vruntime_cb);
 	avg_vruntime_sub(cfs_rq, se);
 }
 
+struct sched_entity *__pick_root_entity(struct cfs_rq *cfs_rq)
+{
+	struct rb_node *root = cfs_rq->tasks_timeline.rb_root.rb_node;
+
+	if (!root)
+		return NULL;
+
+	return __node_2_se(root);
+}
+
 struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
 {
 	struct rb_node *left = rb_first_cached(&cfs_rq->tasks_timeline);
@@ -1007,23 +1025,28 @@
  *     with the earliest virtual deadline.
  *
  * We can do this in O(log n) time due to an augmented RB-tree. The
- * tree keeps the entries sorted on service, but also functions as a
- * heap based on the deadline by keeping:
+ * tree keeps the entries sorted on deadline, but also functions as a
+ * heap based on the vruntime by keeping:
  *
- *  se->min_deadline = min(se->deadline, se->{left,right}->min_deadline)
+ *  se->min_vruntime = min(se->vruntime, se->{left,right}->min_vruntime)
 *
- * Which allows an EDF like search on (sub)trees.
+ * Which allows tree pruning through eligibility.
 */
-static struct sched_entity *__pick_eevdf(struct cfs_rq *cfs_rq)
+static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
 {
 	struct rb_node *node = cfs_rq->tasks_timeline.rb_root.rb_node;
 	struct sched_entity *curr = cfs_rq->curr;
 	struct sched_entity *best = NULL;
-	struct sched_entity *best_left = NULL;
+
+	/*
+	 * We can safely skip eligibility check if there is only one entity
+	 * in this cfs_rq, saving some cycles.
+	 */
+	if (cfs_rq->nr_running == 1)
+		return curr && curr->on_rq ? curr : __node_2_se(node);
 
 	if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr)))
 		curr = NULL;
-	best = curr;
 
 	/*
 	 * Once selected, run a task until it either becomes non-eligible or
@@ -1032,95 +1055,38 @@ static struct sched_entity *__pick_eevdf(struct cfs_rq *cfs_rq)
 	if (sched_feat(RUN_TO_PARITY) && curr && curr->vlag == curr->deadline)
 		return curr;
 
+	/* Heap search for the EEVD entity */
 	while (node) {
 		struct sched_entity *se = __node_2_se(node);
+		struct rb_node *left = node->rb_left;
 
 		/*
-		 * If this entity is not eligible, try the left subtree.
+		 * Eligible entities in left subtree are always better
+		 * choices, since they have earlier deadlines.
 		 */
-		if (!entity_eligible(cfs_rq, se)) {
-			node = node->rb_left;
+		if (left && vruntime_eligible(cfs_rq,
+					__node_2_se(left)->min_vruntime)) {
+			node = left;
 			continue;
 		}
 
 		/*
-		 * Now we heap search eligible trees for the best (min_)deadline
+		 * The left subtree either is empty or has no eligible
+		 * entity, so check the current node since it is the one
+		 * with earliest deadline that might be eligible.
 		 */
-		if (!best || deadline_gt(deadline, best, se))
+		if (entity_eligible(cfs_rq, se)) {
 			best = se;
-
-		/*
-		 * Every se in a left branch is eligible, keep track of the
-		 * branch with the best min_deadline
-		 */
-		if (node->rb_left) {
-			struct sched_entity *left = __node_2_se(node->rb_left);
-
-			if (!best_left || deadline_gt(min_deadline, best_left, left))
-				best_left = left;
-
-			/*
-			 * min_deadline is in the left branch. rb_left and all
-			 * descendants are eligible, so immediately switch to the second
-			 * loop.
-			 */
-			if (left->min_deadline == se->min_deadline)
-				break;
-		}
-
-		/* min_deadline is at this node, no need to look right */
-		if (se->deadline == se->min_deadline)
 			break;
-
-		/* else min_deadline is in the right branch. */
-		node = node->rb_right;
-	}
-
-	/*
-	 * We ran into an eligible node which is itself the best.
-	 * (Or nr_running == 0 and both are NULL)
-	 */
-	if (!best_left || (s64)(best_left->min_deadline - best->deadline) > 0)
-		return best;
-
-	/*
-	 * Now best_left and all of its children are eligible, and we are just
-	 * looking for deadline == min_deadline
-	 */
-	node = &best_left->run_node;
-	while (node) {
-		struct sched_entity *se = __node_2_se(node);
-
-		/* min_deadline is the current node */
-		if (se->deadline == se->min_deadline)
-			return se;
-
-		/* min_deadline is in the left branch */
-		if (node->rb_left &&
-		    __node_2_se(node->rb_left)->min_deadline == se->min_deadline) {
-			node = node->rb_left;
-			continue;
 		}
 
-		/* else min_deadline is in the right branch */
 		node = node->rb_right;
 	}
-	return NULL;
-}
 
-static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
-{
-	struct sched_entity *se = __pick_eevdf(cfs_rq);
-
-	if (!se) {
-		struct sched_entity *left = __pick_first_entity(cfs_rq);
-		if (left) {
-			pr_err("EEVDF scheduling fail, picking leftmost\n");
-			return left;
-		}
-	}
+	if (!best || (curr && entity_before(curr, best)))
+		best = curr;
 
-	return se;
+	return best;
 }
 
 #ifdef CONFIG_SCHED_DEBUG
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index d12206cb6cb8..f643777adbe4 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2995,6 +2995,7 @@ DEFINE_LOCK_GUARD_2(double_rq_lock, struct rq,
 		    double_rq_lock(_T->lock, _T->lock2),
 		    double_rq_unlock(_T->lock, _T->lock2))
 
+extern struct sched_entity *__pick_root_entity(struct cfs_rq *cfs_rq);
 extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq);
 extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq);
From: Abel Wu <wuyun.abel@bytedance.com>

mainline inclusion
from mainline-v6.8
commit ee4373dc902c0a403dd084b254ce70a78f95466f
category: performance
bugzilla: https://gitee.com/openeuler/kernel/issues/I9EHKI
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git/commit/?id=ee437...
--------------------------------
Since the RB-tree is now sorted by deadline, let's first try the leftmost entity which has the earliest virtual deadline. I've done some benchmarks to see its effectiveness.
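Building on the toy sketch from the previous patch (same made-up toy_se/toy_pick names; this is not the kernel code), the fastpath amounts to checking the cached leftmost node, which by construction has the earliest virtual deadline, before falling back to the heap search:

/* O(1) fastpath: if the earliest-deadline entity is eligible, it wins outright. */
static struct toy_se *toy_pick_fast(struct toy_se *root, struct toy_se *leftmost,
				    uint64_t avg_vruntime)
{
	if (leftmost && leftmost->vruntime <= avg_vruntime)
		return leftmost;

	/* Otherwise fall back to the O(log n) pruning search. */
	return toy_pick(root, avg_vruntime);
}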
All the benchmarks are done inside a normal cpu cgroup in a clean environment with cpu turbo disabled, on a dual-CPU Intel Xeon(R) Platinum 8260 with 2 NUMA nodes each of which has 24C/48T.
hackbench: process/thread + pipe/socket + 1/2/4/8 groups
netperf:   TCP/UDP + STREAM/RR + 24/48/72/96/192 threads
tbench:    loopback 24/48/72/96/192 threads
schbench:  1/2/4/8 mthreads

direct: cfs_rq has only one entity
parity: RUN_TO_PARITY
fast:   O(1) fastpath
slow:   heap search

  (%)         direct    parity      fast      slow
hackbench      92.95      2.02      4.91      0.12
netperf        68.08      6.60     24.18      1.14
tbench         67.55     11.22     20.61      0.62
schbench       69.91      2.65     25.73      1.71
The above results indicate that this fastpath really makes task selection more efficient.
Signed-off-by: Abel Wu <wuyun.abel@bytedance.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20231115033647.80785-4-wuyun.abel@bytedance.com
---
 kernel/sched/fair.c | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c0a53774eefb..ddb6462babda 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1035,6 +1035,7 @@ struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
 static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
 {
 	struct rb_node *node = cfs_rq->tasks_timeline.rb_root.rb_node;
+	struct sched_entity *se = __pick_first_entity(cfs_rq);
 	struct sched_entity *curr = cfs_rq->curr;
 	struct sched_entity *best = NULL;
 
@@ -1043,7 +1044,7 @@ static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
 	 * in this cfs_rq, saving some cycles.
 	 */
 	if (cfs_rq->nr_running == 1)
-		return curr && curr->on_rq ? curr : __node_2_se(node);
+		return curr && curr->on_rq ? curr : se;
 
 	if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr)))
 		curr = NULL;
@@ -1055,9 +1056,14 @@ static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
 	if (sched_feat(RUN_TO_PARITY) && curr && curr->vlag == curr->deadline)
 		return curr;
 
+	/* Pick the leftmost entity if it's eligible */
+	if (se && entity_eligible(cfs_rq, se)) {
+		best = se;
+		goto found;
+	}
+
 	/* Heap search for the EEVD entity */
 	while (node) {
-		struct sched_entity *se = __node_2_se(node);
 		struct rb_node *left = node->rb_left;
 
 		/*
@@ -1070,6 +1076,8 @@ static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
 			continue;
 		}
 
+		se = __node_2_se(node);
+
 		/*
 		 * The left subtree either is empty or has no eligible
 		 * entity, so check the current node since it is the one
@@ -1082,7 +1090,7 @@ static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
 		node = node->rb_right;
 	}
-
+found:
 	if (!best || (curr && entity_before(curr, best)))
 		best = curr;
 
 	return best;
From: K Prateek Nayak <kprateek.nayak@amd.com>

maillist inclusion
category: performance
bugzilla: https://gitee.com/openeuler/kernel/issues/I9EHKI
CVE: NA
Reference: https://lore.kernel.org/lkml/20240325060226.1540-2-kprateek.nayak@amd.com/
--------------------------------
With the curr entity's eligibility check, a wakeup preemption becomes very likely when an entity with positive lag joins the runqueue and pushes the runqueue's avg_vruntime backwards, making the current entity ineligible. This leads to aggressive wakeup preemption which was previously guarded by wakeup_granularity_ns in legacy CFS. One such aggressive preemption scenario with EEVDF in DeathStarBench [1] plays out as follows:

Nginx is running with its vruntime at or just below rq->avg_vruntime (it is eligible) and its virtual deadline still well ahead on the vruntime timeline. Since the system is busy, Nginx wakes a Service task on the same runqueue. The Service has positive lag, so enqueueing it pushes rq->avg_vruntime backwards past Nginx's vruntime; Nginx becomes ineligible and the newly woken Service preempts it long before Nginx's deadline.
When the Nginx server is involuntarily switched out, it cannot accept any incoming requests, leading to longer turnaround times for the clients and thus a loss in DeathStarBench throughput.
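To see the mechanism with made-up numbers (equal weights, ignoring the scaling and placement details of place_entity()): suppose Nginx is current with vruntime 98 and one queued entity sits at vruntime 102, so the weighted average is 100 and Nginx is eligible (98 <= 100). A service task now wakes with a positive lag of 20 and is placed at roughly 100 - 20 = 80; the average drops to (98 + 102 + 80) / 3 ~= 93.3, Nginx is suddenly ineligible (98 > 93.3), and the wakeup-preemption path will pick the newly woken task over it even though Nginx is nowhere near its deadline.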
==================================================================
Test           : DeathStarBench
Units          : Normalized latency
Interpretation : Lower is better
Statistic      : Mean
==================================================================
tip      1.00
eevdf    1.14 (+14.61%)
For the currently running task, skip the eligibility check in pick_eevdf() during a wakeup if it has not yet exhausted the slice promised to it when it was selected, even though the situation has changed since. The behavior is guarded by the RUN_TO_PARITY_WAKEUP sched_feat to simplify testing. With RUN_TO_PARITY_WAKEUP enabled, the performance loss seen with DeathStarBench since the merge of EEVDF disappears. Following are the results from testing on a Dual Socket 3rd Generation EPYC server (2 x 64C/128T):
==================================================================
Test           : DeathStarBench
Units          : Normalized throughput
Interpretation : Higher is better
Statistic      : Mean
==================================================================
Pinning   scaling   tip     run-to-parity-wakeup (pct imp)
1CCD         1      1.00    1.16 (%diff: 16%)
2CCD         2      1.00    1.03 (%diff: 3%)
4CCD         4      1.00    1.12 (%diff: 12%)
8CCD         8      1.00    1.05 (%diff: 6%)
With spec_rstack_overflow=off, the DeathStarBench performance with the proposed solution is the same as the performance on the v6.5 release before EEVDF was merged.
This may lead to a newly waking task waiting longer for its turn on the CPU; however, testing on the same system did not reveal any consistent regressions with the standard benchmarks.
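For context on the curr->vlag == curr->deadline test used in the hunk below: in the mainline code this builds on, set_next_entity() stashes a copy of the just-picked entity's deadline in se->vlag (which is not used for lag while the entity is running), and update_deadline() only moves se->deadline once the promised slice is exhausted, so the equality is a cheap "still within the slice granted at pick time" check. A toy sketch of that convention (simplified, hypothetical helper names, not the kernel code):

#include <stdbool.h>
#include <stdint.h>

struct toy_entity {
	uint64_t	deadline;
	int64_t		vlag;	/* reused as "deadline at pick time" while running */
};

/* Mirrors the HACK in set_next_entity(): remember the deadline we were picked with. */
static void toy_set_next(struct toy_entity *se)
{
	se->vlag = (int64_t)se->deadline;
}

/* True until the slice runs out and the deadline is pushed further ahead. */
static bool toy_within_promised_slice(const struct toy_entity *se)
{
	return se->vlag == (int64_t)se->deadline;
}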
Link: https://github.com/delimitrou/DeathStarBench/ [1]
Signed-off-by: K Prateek Nayak <kprateek.nayak@amd.com>
---
 kernel/sched/fair.c     | 24 ++++++++++++++++++++----
 kernel/sched/features.h |  1 +
 2 files changed, 21 insertions(+), 4 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ddb6462babda..7b0cb2f090da 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1032,7 +1032,7 @@ struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
 *
 * Which allows tree pruning through eligibility.
 */
-static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
+static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq, bool wakeup_preempt)
 {
 	struct rb_node *node = cfs_rq->tasks_timeline.rb_root.rb_node;
 	struct sched_entity *se = __pick_first_entity(cfs_rq);
@@ -1046,7 +1046,23 @@ static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
 	if (cfs_rq->nr_running == 1)
 		return curr && curr->on_rq ? curr : se;
 
-	if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr)))
+	if (curr && !curr->on_rq)
+		curr = NULL;
+
+	/*
+	 * When an entity with positive lag wakes up, it pushes the
+	 * avg_vruntime of the runqueue backwards. This may causes the
+	 * current entity to be ineligible soon into its run leading to
+	 * wakeup preemption.
+	 *
+	 * To prevent such aggressive preemption of the current running
+	 * entity during task wakeups, skip the eligibility check if the
+	 * slice promised to the entity since its selection has not yet
+	 * elapsed.
+	 */
+	if (curr &&
+	    !(sched_feat(RUN_TO_PARITY_WAKEUP) && wakeup_preempt && curr->vlag == curr->deadline) &&
+	    !entity_eligible(cfs_rq, curr))
 		curr = NULL;
 
 	/*
@@ -5576,7 +5592,7 @@ pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
 	    cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next))
 		return cfs_rq->next;
 
-	return pick_eevdf(cfs_rq);
+	return pick_eevdf(cfs_rq, false);
 }
 
 static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
@@ -9245,7 +9261,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
 	/*
 	 * XXX pick_eevdf(cfs_rq) != se ?
 	 */
-	if (pick_eevdf(cfs_rq) == pse)
+	if (pick_eevdf(cfs_rq, true) == pse)
 		goto preempt;
 
 	return;
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index e4789d09f58e..26b1a03bd3d2 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -7,6 +7,7 @@ SCHED_FEAT(PLACE_LAG, true)
 SCHED_FEAT(PLACE_DEADLINE_INITIAL, true)
 SCHED_FEAT(RUN_TO_PARITY, true)
+SCHED_FEAT(RUN_TO_PARITY_WAKEUP, true)
 
 /*
  * Prefer to schedule the task we woke last (assuming it failed