From: Johannes Weiner <hannes@cmpxchg.org>

mainline inclusion
from mainline-v6.1-rc1
commit 71dbdde7914d32e86f01ac1f6e54e964c9dfdbd9
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I8BCV4
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
We put all fields updated by the scheduler in the first cacheline of struct psi_group_cpu for performance.
Since we want to add another PSI_IRQ_FULL to track IRQ/SOFTIRQ pressure, we need to reclaim space first. This patch removes the NR_ONCPU task accounting from struct psi_group_cpu and uses one bit in state_mask to track it instead.
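To illustrate the encoding (a minimal userspace sketch, not part of this patch; the NR_PSI_STATES value here is assumed from the mainline enum), the on-CPU flag becomes plain bit arithmetic on state_mask rather than a slot in the tasks[] array:

	#include <stdio.h>

	#define NR_PSI_STATES	7			/* pressure-state bits occupy 0..6 */
	#define PSI_ONCPU	(1 << NR_PSI_STATES)	/* first free bit above them */

	int main(void)
	{
		unsigned int state_mask = 0;

		state_mask |= PSI_ONCPU;	/* a task was scheduled in on this CPU */
		printf("oncpu=%d\n", !!(state_mask & PSI_ONCPU));

		state_mask &= ~PSI_ONCPU;	/* the task was scheduled out */
		printf("oncpu=%d\n", !!(state_mask & PSI_ONCPU));
		return 0;
	}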
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Chengming Zhou <zhouchengming@bytedance.com>
Tested-by: Chengming Zhou <zhouchengming@bytedance.com>
Link: https://lore.kernel.org/r/20220825164111.29534-7-zhouchengming@bytedance.com
Conflict: include/linux/psi_types.h
Signed-off-by: Lu Jialin <lujialin4@huawei.com>
---
 include/linux/psi_types.h | 14 +++++--------
 kernel/sched/psi.c        | 41 ++++++++++++++++++++++++++++-----------
 2 files changed, 35 insertions(+), 20 deletions(-)
diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h
index 0b6e17e7f84f..9b3988ee2ce8 100644
--- a/include/linux/psi_types.h
+++ b/include/linux/psi_types.h
@@ -36,13 +36,6 @@ enum psi_task_count {
 	NR_IOWAIT,
 	NR_MEMSTALL,
 	NR_RUNNING,
-	/*
-	 * This can't have values other than 0 or 1 and could be
-	 * implemented as a bit flag. But for now we still have room
-	 * in the first cacheline of psi_group_cpu, and this way we
-	 * don't have to special case any state tracking for it.
-	 */
-	NR_ONCPU,
 	/*
 	 * For IO and CPU stalls the presence of running/oncpu tasks
 	 * in the domain means a partial rather than a full stall.
@@ -53,7 +46,7 @@ enum psi_task_count {
 	 * threads and memstall ones.
 	 */
 	NR_MEMSTALL_RUNNING,
-	NR_PSI_TASK_COUNTS = 5,
+	NR_PSI_TASK_COUNTS = 4,
 };
 #endif
 
@@ -61,8 +54,9 @@ enum psi_task_count {
 #define TSK_IOWAIT		(1 << NR_IOWAIT)
 #define TSK_MEMSTALL		(1 << NR_MEMSTALL)
 #define TSK_RUNNING		(1 << NR_RUNNING)
-#define TSK_ONCPU		(1 << NR_ONCPU)
 #define TSK_MEMSTALL_RUNNING	(1 << NR_MEMSTALL_RUNNING)
+/* Only one task can be scheduled, no corresponding task count */
+#define TSK_ONCPU		(1 << NR_PSI_TASK_COUNTS)
 
 /* Resources that workloads could be stalled on */
 enum psi_res {
@@ -110,6 +104,8 @@ enum psi_states {
 };
 #endif
 
+/* Use one bit in the state mask to track TSK_ONCPU */
+#define PSI_ONCPU	(1 << NR_PSI_STATES)
+
 enum psi_aggregators {
 	PSI_AVGS = 0,
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 17898826fe24..800d11f9898c 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -228,7 +228,7 @@ void __init psi_init(void)
 	group_init(&psi_system);
 }
 
-static bool test_state(unsigned int *tasks, enum psi_states state)
+static bool test_state(unsigned int *tasks, enum psi_states state, bool oncpu)
 {
 	switch (state) {
 	case PSI_IO_SOME:
@@ -241,9 +241,9 @@ static bool test_state(unsigned int *tasks, enum psi_states state)
 		return unlikely(tasks[NR_MEMSTALL] &&
 			tasks[NR_RUNNING] == tasks[NR_MEMSTALL_RUNNING]);
 	case PSI_CPU_SOME:
-		return unlikely(tasks[NR_RUNNING] > tasks[NR_ONCPU]);
+		return unlikely(tasks[NR_RUNNING] > oncpu);
 	case PSI_CPU_FULL:
-		return unlikely(tasks[NR_RUNNING] && !tasks[NR_ONCPU]);
+		return unlikely(tasks[NR_RUNNING] && !oncpu);
 	case PSI_NONIDLE:
 		return tasks[NR_IOWAIT] || tasks[NR_MEMSTALL] ||
 			tasks[NR_RUNNING];
@@ -696,9 +696,9 @@ static void psi_group_change(struct psi_group *group, int cpu,
 			     bool wake_clock)
 {
 	struct psi_group_cpu *groupc;
-	u32 state_mask = 0;
 	unsigned int t, m;
 	enum psi_states s;
+	u32 state_mask;
 
 	groupc = per_cpu_ptr(group->pcpu, cpu);
 
@@ -714,17 +714,36 @@ static void psi_group_change(struct psi_group *group, int cpu,
 
 	record_times(groupc, now);
 
+	/*
+	 * Start with TSK_ONCPU, which doesn't have a corresponding
+	 * task count - it's just a boolean flag directly encoded in
+	 * the state mask. Clear, set, or carry the current state if
+	 * no changes are requested.
+	 */
+	if (unlikely(clear & TSK_ONCPU)) {
+		state_mask = 0;
+		clear &= ~TSK_ONCPU;
+	} else if (unlikely(set & TSK_ONCPU)) {
+		state_mask = PSI_ONCPU;
+		set &= ~TSK_ONCPU;
+	} else {
+		state_mask = groupc->state_mask & PSI_ONCPU;
+	}
+
+	/*
+	 * The rest of the state mask is calculated based on the task
+	 * counts. Update those first, then construct the mask.
+	 */
 	for (t = 0, m = clear; m; m &= ~(1 << t), t++) {
 		if (!(m & (1 << t)))
 			continue;
 		if (groupc->tasks[t]) {
 			groupc->tasks[t]--;
 		} else if (!psi_bug) {
-			printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u %u] clear=%x set=%x\n",
+			printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u] clear=%x set=%x\n",
 					cpu, t, groupc->tasks[0],
 					groupc->tasks[1], groupc->tasks[2],
-					groupc->tasks[3], groupc->tasks[4],
-					clear, set);
+					groupc->tasks[3], clear, set);
 			psi_bug = 1;
 		}
 	}
@@ -733,9 +752,8 @@ static void psi_group_change(struct psi_group *group, int cpu,
 		if (set & (1 << t))
 			groupc->tasks[t]++;
 
-	/* Calculate state mask representing active states */
 	for (s = 0; s < NR_PSI_STATES; s++) {
-		if (test_state(groupc->tasks, s))
+		if (test_state(groupc->tasks, s, state_mask & PSI_ONCPU))
 			state_mask |= (1 << s);
 	}
 
@@ -747,7 +765,7 @@ static void psi_group_change(struct psi_group *group, int cpu,
 	 * task in a cgroup is in_memstall, the corresponding groupc
 	 * on that cpu is in PSI_MEM_FULL state.
 	 */
-	if (unlikely(groupc->tasks[NR_ONCPU] && cpu_curr(cpu)->in_memstall))
+	if (unlikely((state_mask & PSI_ONCPU) && cpu_curr(cpu)->in_memstall))
 		state_mask |= (1 << PSI_MEM_FULL);
 
 	groupc->state_mask = state_mask;
 
@@ -849,7 +867,8 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
 		 */
 		iter = NULL;
 		while ((group = iterate_groups(next, &iter))) {
-			if (per_cpu_ptr(group->pcpu, cpu)->tasks[NR_ONCPU]) {
+			if (per_cpu_ptr(group->pcpu, cpu)->state_mask &
+			    PSI_ONCPU) {
 				common = group;
 				break;
 			}
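A note on the test_state() change above: oncpu is a bool, so the PSI_CPU_SOME check "tasks[NR_RUNNING] > oncpu" compares against exactly 0 or 1, just as the old tasks[NR_ONCPU] counter did. A standalone sketch of the two CPU checks (simplified types, for illustration only, not kernel code):

	#include <stdbool.h>
	#include <stdio.h>

	/* Some pressure: more runnable tasks than the (at most one) task on CPU */
	static bool cpu_some(unsigned int nr_running, bool oncpu)
	{
		return nr_running > oncpu;
	}

	/* Full pressure: runnable tasks exist but none of them is on CPU */
	static bool cpu_full(unsigned int nr_running, bool oncpu)
	{
		return nr_running && !oncpu;
	}

	int main(void)
	{
		printf("some=%d full=%d\n", cpu_some(2, true), cpu_full(1, false));
		return 0;
	}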