From: Shakeel Butt <shakeelb@google.com>
mainline inclusion
from mainline-v5.13-rc1
commit df77430639c9cf73559bac0f25084518bf9a812d
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I562O9
CVE: NA
backport: openEuler-22.03-LTS
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=df77430639c9cf73559bac0f25084518bf9a812d
--------------------------------
We noticed that the cost of psi increases with the increase in the levels of the cgroups. Particularly the cost of cpu_clock() sticks out as the kernel calls it multiple times as it traverses up the cgroup tree. This patch reduces the calls to cpu_clock().
Performed perf bench on Intel Broadwell with 3 levels of cgroup.
Before the patch:
$ perf bench sched all
# Running sched/messaging benchmark...
# 20 sender and receiver processes per group
# 10 groups == 400 processes run
Total time: 0.747 [sec]
# Running sched/pipe benchmark...
# Executed 1000000 pipe operations between two processes
Total time: 3.516 [sec]
3.516689 usecs/op
284358 ops/sec
After the patch:
$ perf bench sched all
# Running sched/messaging benchmark...
# 20 sender and receiver processes per group
# 10 groups == 400 processes run
Total time: 0.640 [sec]
# Running sched/pipe benchmark...
# Executed 1000000 pipe operations between two processes
Total time: 3.329 [sec]
3.329820 usecs/op
300316 ops/sec
Signed-off-by: Shakeel Butt <shakeelb@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Link: https://lkml.kernel.org/r/20210321205156.4186483-1-shakeelb@google.com
Signed-off-by: Chen Wandun <chenwandun@huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 kernel/sched/psi.c | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 1fb5cfd6a172..de46e44f5e01 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -651,12 +651,10 @@ static void poll_timer_fn(struct timer_list *t) wake_up_interruptible(&group->poll_wait); }
-static void record_times(struct psi_group_cpu *groupc, int cpu) +static void record_times(struct psi_group_cpu *groupc, u64 now) { u32 delta; - u64 now;
- now = cpu_clock(cpu); delta = now - groupc->state_start; groupc->state_start = now;
@@ -683,7 +681,7 @@ static void record_times(struct psi_group_cpu *groupc, int cpu) }
static void psi_group_change(struct psi_group *group, int cpu, - unsigned int clear, unsigned int set, + unsigned int clear, unsigned int set, u64 now, bool wake_clock) { struct psi_group_cpu *groupc; @@ -703,7 +701,7 @@ static void psi_group_change(struct psi_group *group, int cpu, */ write_seqcount_begin(&groupc->seq);
- record_times(groupc, cpu); + record_times(groupc, now);
for (t = 0, m = clear; m; m &= ~(1 << t), t++) { if (!(m & (1 << t))) @@ -810,12 +808,14 @@ void psi_task_change(struct task_struct *task, int clear, int set) struct psi_group *group; bool wake_clock = true; void *iter = NULL; + u64 now;
if (!task->pid) return;
psi_flags_change(task, clear, set);
+ now = cpu_clock(cpu); /* * Periodic aggregation shuts off if there is a period of no * task changes, so we wake it back up if necessary. However, @@ -828,7 +828,7 @@ void psi_task_change(struct task_struct *task, int clear, int set) wake_clock = false;
while ((group = iterate_groups(task, &iter))) - psi_group_change(group, cpu, clear, set, wake_clock); + psi_group_change(group, cpu, clear, set, now, wake_clock); }
void psi_task_switch(struct task_struct *prev, struct task_struct *next, @@ -837,6 +837,7 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, struct psi_group *group, *common = NULL; int cpu = task_cpu(prev); void *iter; + u64 now = cpu_clock(cpu);
if (next->pid) { bool identical_state; @@ -858,7 +859,7 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, break; }
- psi_group_change(group, cpu, 0, TSK_ONCPU, true); + psi_group_change(group, cpu, 0, TSK_ONCPU, now, true); } }
@@ -880,7 +881,7 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
iter = NULL; while ((group = iterate_groups(prev, &iter)) && group != common) - psi_group_change(group, cpu, clear, set, true); + psi_group_change(group, cpu, clear, set, now, true);
/* * TSK_ONCPU is handled up to the common ancestor. If we're tasked @@ -889,7 +890,7 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, if (sleep) { clear &= ~TSK_ONCPU; for (; group; group = iterate_groups(prev, &iter)) - psi_group_change(group, cpu, clear, set, true); + psi_group_change(group, cpu, clear, set, now, true); } } }