From: Chengming Zhou zhouchengming@bytedance.com
mainline inclusion from mainline-v6.1-rc1 commit 52b1364ba0b105122d6de0e719b36db705011ac1 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I8BCV4
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
Now PSI already tracked workload pressure stall information for CPU, memory and IO. Apart from these, IRQ/SOFTIRQ could have obvious impact on some workload productivity, such as web service workload.
When CONFIG_IRQ_TIME_ACCOUNTING, we can get IRQ/SOFTIRQ delta time from update_rq_clock_task(), in which we can record that delta to CPU curr task's cgroups as PSI_IRQ_FULL status.
Note we don't use PSI_IRQ_SOME since IRQ/SOFTIRQ always happen in the current task on the CPU, make nothing productive could run even if it were runnable, so we only use PSI_IRQ_FULL.
Signed-off-by: Chengming Zhou zhouchengming@bytedance.com Signed-off-by: Peter Zijlstra (Intel) peterz@infradead.org Acked-by: Johannes Weiner hannes@cmpxchg.org Link: https://lore.kernel.org/r/20220825164111.29534-8-zhouchengming@bytedance.com Conflict: Documentation/admin-guide/cgroup-v2.rst include/linux/psi_types.h kernel/sched/psi.c kernel/sched/stats.h Signed-off-by: Lu Jialin lujialin4@huawei.com --- Documentation/admin-guide/cgroup-v2.rst | 6 ++ include/linux/psi.h | 1 + include/linux/psi_types.h | 10 +++- kernel/cgroup/cgroup.c | 26 +++++++++ kernel/sched/core.c | 1 + kernel/sched/psi.c | 74 ++++++++++++++++++++++++- kernel/sched/stats.h | 1 + 7 files changed, 115 insertions(+), 4 deletions(-)
diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 394fb18c9e3d..20c35c289253 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -951,6 +951,12 @@ All cgroup core files are prefixed with "cgroup." it's possible to delete a frozen (and empty) cgroup, as well as create new sub-cgroups.
+irq.pressure + A read-write nested-keyed file. + + Shows pressure stall information for IRQ/SOFTIRQ. See + :ref:`Documentation/accounting/psi.rst <psi>` for details. + Controllers ===========
diff --git a/include/linux/psi.h b/include/linux/psi.h index 86635a5630ba..9e15596968d0 100644 --- a/include/linux/psi.h +++ b/include/linux/psi.h @@ -21,6 +21,7 @@ void psi_init(void); void psi_task_change(struct task_struct *task, int clear, int set); void psi_task_switch(struct task_struct *prev, struct task_struct *next, bool sleep); +void psi_account_irqtime(struct task_struct *task, u32 delta);
void psi_memstall_enter(unsigned long *flags); void psi_memstall_leave(unsigned long *flags); diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h index 9b3988ee2ce8..668f67b56464 100644 --- a/include/linux/psi_types.h +++ b/include/linux/psi_types.h @@ -63,7 +63,10 @@ enum psi_res { PSI_IO, PSI_MEM, PSI_CPU, - NR_PSI_RESOURCES = 3, +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + PSI_IRQ, +#endif + NR_PSI_RESOURCES, };
/* @@ -98,9 +101,12 @@ enum psi_states { PSI_MEM_FULL, PSI_CPU_SOME, PSI_CPU_FULL, +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + PSI_IRQ_FULL, +#endif /* Only per-CPU, to weigh the CPU in the global average: */ PSI_NONIDLE, - NR_PSI_STATES = 7, + NR_PSI_STATES, }; #endif
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 3d778636f2e8..45c422808340 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3751,6 +3751,23 @@ static ssize_t cgroup_cpu_pressure_write(struct kernfs_open_file *of, return cgroup_pressure_write(of, buf, nbytes, PSI_CPU); }
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING +static int cgroup_irq_pressure_show(struct seq_file *seq, void *v) +{ + struct cgroup *cgrp = seq_css(seq)->cgroup; + struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi; + + return psi_show(seq, psi, PSI_IRQ); +} + +static ssize_t cgroup_irq_pressure_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, + loff_t off) +{ + return cgroup_pressure_write(of, buf, nbytes, PSI_IRQ); +} +#endif + static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of, poll_table *pt) { @@ -5155,6 +5172,15 @@ static struct cftype cgroup_base_files[] = { .poll = cgroup_pressure_poll, .release = cgroup_pressure_release, }, +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + { + .name = "irq.pressure", + .seq_show = cgroup_irq_pressure_show, + .write = cgroup_irq_pressure_write, + .poll = cgroup_pressure_poll, + .release = cgroup_pressure_release, + }, +#endif #endif /* CONFIG_PSI */ { } /* terminate */ }; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 201d75a786e0..eb3353b90b8d 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -629,6 +629,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
rq->prev_irq_time += irq_delta; delta -= irq_delta; + psi_account_irqtime(rq->curr, irq_delta); #endif #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING if (static_key_false((¶virt_steal_rq_enabled))) { diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 800d11f9898c..7c255aabc8ac 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -925,6 +925,36 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, } }
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING +void psi_account_irqtime(struct task_struct *task, u32 delta) +{ + int cpu = task_cpu(task); + void *iter = NULL; + struct psi_group *group; + struct psi_group_cpu *groupc; + u64 now; + + if (!task->pid) + return; + + now = cpu_clock(cpu); + + while ((group = iterate_groups(task, &iter))) { + groupc = per_cpu_ptr(group->pcpu, cpu); + + write_seqcount_begin(&groupc->seq); + + record_times(groupc, now); + groupc->times[PSI_IRQ_FULL] += delta; + + write_seqcount_end(&groupc->seq); + + if (group->poll_states & (1 << PSI_IRQ_FULL)) + psi_schedule_poll_work(group, 1); + } +} +#endif + /** * psi_memstall_enter - mark the beginning of a memory stall section * @flags: flags to handle nested sections @@ -1083,6 +1113,7 @@ void cgroup_move_task(struct task_struct *task, struct css_set *to)
int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) { + bool only_full = false; int full; u64 now;
@@ -1097,7 +1128,11 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) group->avg_next_update = update_averages(group, now); mutex_unlock(&group->avgs_lock);
- for (full = 0; full < 2; full++) { +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + only_full = res == PSI_IRQ; +#endif + + for (full = 0; full < 2 - only_full; full++) { unsigned long avg[3] = { 0, }; u64 total = 0; int w; @@ -1111,7 +1146,7 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) }
seq_printf(m, "%s avg10=%lu.%02lu avg60=%lu.%02lu avg300=%lu.%02lu total=%llu\n", - full ? "full" : "some", + full || only_full ? "full" : "some", LOAD_INT(avg[0]), LOAD_FRAC(avg[0]), LOAD_INT(avg[1]), LOAD_FRAC(avg[1]), LOAD_INT(avg[2]), LOAD_FRAC(avg[2]), @@ -1140,6 +1175,11 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group, char *buf, else return ERR_PTR(-EINVAL);
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING + if (res == PSI_IRQ && --state != PSI_IRQ_FULL) + return ERR_PTR(-EINVAL); +#endif + if (state >= PSI_NONIDLE) return ERR_PTR(-EINVAL);
@@ -1423,6 +1463,33 @@ static const struct proc_ops psi_cpu_proc_ops = { .proc_release = psi_fop_release, };
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING +static int psi_irq_show(struct seq_file *m, void *v) +{ + return psi_show(m, &psi_system, PSI_IRQ); +} + +static int psi_irq_open(struct inode *inode, struct file *file) +{ + return single_open(file, psi_irq_show, NULL); +} + +static ssize_t psi_irq_write(struct file *file, const char __user *user_buf, + size_t nbytes, loff_t *ppos) +{ + return psi_write(file, user_buf, nbytes, PSI_IRQ); +} + +static const struct proc_ops psi_irq_proc_ops = { + .proc_open = psi_irq_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_write = psi_irq_write, + .proc_poll = psi_fop_poll, + .proc_release = psi_fop_release, +}; +#endif + static int __init psi_proc_init(void) { if (psi_enable) { @@ -1430,7 +1497,10 @@ static int __init psi_proc_init(void) proc_create("pressure/io", 0, NULL, &psi_io_proc_ops); proc_create("pressure/memory", 0, NULL, &psi_memory_proc_ops); proc_create("pressure/cpu", 0, NULL, &psi_cpu_proc_ops); +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + proc_create("pressure/irq", 0, NULL, &psi_irq_proc_ops); } +#endif return 0; } module_init(psi_proc_init); diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index b8b4e5b2694e..874d8c6e6750 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -170,6 +170,7 @@ static inline void psi_ttwu_dequeue(struct task_struct *p) {} static inline void psi_sched_switch(struct task_struct *prev, struct task_struct *next, bool sleep) {} +static inline void psi_account_irqtime(struct task_struct *task, u32 delta) {} #endif /* CONFIG_PSI */
#ifdef CONFIG_SCHED_INFO