mainline inclusion from mainline-v5.4 commit b1d29ba82cf2bc784f4c963ddd6a2cf29e229b33 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I47QS2 CVE: NA
--------------------------------
Delay accounting already measures the time a task spends in direct reclaim and waiting for swapin, but in low memory situations tasks spend can spend a significant amount of their time waiting on thrashing page cache. This isn't tracked right now.
To know the full impact of memory contention on an individual task, measure the delay when waiting for a recently evicted active cache page to read back into memory.
Also update tools/accounting/getdelays.c:
[hannes@computer accounting]$ sudo ./getdelays -d -p 1 print delayacct stats ON PID 1
CPU count real total virtual total delay total delay average 50318 745000000 847346785 400533713 0.008ms IO count delay total delay average 435 122601218 0ms SWAP count delay total delay average 0 0 0ms RECLAIM count delay total delay average 0 0 0ms THRASHING count delay total delay average 19 12621439 0ms
Link: http://lkml.kernel.org/r/20180828172258.3185-4-hannes@cmpxchg.org Signed-off-by: Johannes Weiner hannes@cmpxchg.org Acked-by: Peter Zijlstra (Intel) peterz@infradead.org Tested-by: Daniel Drake drake@endlessm.com Tested-by: Suren Baghdasaryan surenb@google.com Cc: Christopher Lameter cl@linux.com Cc: Ingo Molnar mingo@redhat.com Cc: Johannes Weiner jweiner@fb.com Cc: Mike Galbraith efault@gmx.de Cc: Peter Enderborg peter.enderborg@sony.com Cc: Randy Dunlap rdunlap@infradead.org Cc: Shakeel Butt shakeelb@google.com Cc: Tejun Heo tj@kernel.org Cc: Vinayak Menon vinmenon@codeaurora.org Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Liu Xinpeng liuxp11@chinatelecom.cn Signed-off-by: Ctyun Kernel ctyuncommiter01@chinatelecom.cn --- include/linux/delayacct.h | 23 +++++++++++++++++++++++ include/uapi/linux/taskstats.h | 6 +++++- kernel/delayacct.c | 15 +++++++++++++++ mm/filemap.c | 11 +++++++++++ tools/accounting/getdelays.c | 8 +++++++- 5 files changed, 61 insertions(+), 2 deletions(-)
diff --git a/include/linux/delayacct.h b/include/linux/delayacct.h index 31c865d..577d1b2 100644 --- a/include/linux/delayacct.h +++ b/include/linux/delayacct.h @@ -57,7 +57,12 @@ struct task_delay_info {
u64 freepages_start; u64 freepages_delay; /* wait for memory reclaim */ + + u64 thrashing_start; + u64 thrashing_delay; /* wait for thrashing page */ + u32 freepages_count; /* total count of memory reclaim */ + u32 thrashing_count; /* total count of thrash waits */ }; #endif
@@ -76,6 +81,8 @@ struct task_delay_info { extern __u64 __delayacct_blkio_ticks(struct task_struct *); extern void __delayacct_freepages_start(void); extern void __delayacct_freepages_end(void); +extern void __delayacct_thrashing_start(void); +extern void __delayacct_thrashing_end(void);
static inline int delayacct_is_task_waiting_on_io(struct task_struct *p) { @@ -156,6 +163,18 @@ static inline void delayacct_freepages_end(void) __delayacct_freepages_end(); }
+static inline void delayacct_thrashing_start(void) +{ + if (current->delays) + __delayacct_thrashing_start(); +} + +static inline void delayacct_thrashing_end(void) +{ + if (current->delays) + __delayacct_thrashing_end(); +} + #else static inline void delayacct_set_flag(int flag) {} @@ -182,6 +201,10 @@ static inline void delayacct_freepages_start(void) {} static inline void delayacct_freepages_end(void) {} +static inline void delayacct_thrashing_start(void) +{} +static inline void delayacct_thrashing_end(void) +{}
#endif /* CONFIG_TASK_DELAY_ACCT */
diff --git a/include/uapi/linux/taskstats.h b/include/uapi/linux/taskstats.h index b7aa7bb..5e8ca16 100644 --- a/include/uapi/linux/taskstats.h +++ b/include/uapi/linux/taskstats.h @@ -34,7 +34,7 @@ */
-#define TASKSTATS_VERSION 8 +#define TASKSTATS_VERSION 9 #define TS_COMM_LEN 32 /* should be >= TASK_COMM_LEN * in linux/sched.h */
@@ -164,6 +164,10 @@ struct taskstats { /* Delay waiting for memory reclaim */ __u64 freepages_count; __u64 freepages_delay_total; + + /* Delay waiting for thrashing page */ + __u64 thrashing_count; + __u64 thrashing_delay_total; };
diff --git a/kernel/delayacct.c b/kernel/delayacct.c index ca8ac28..2a12b98 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -135,9 +135,12 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) d->swapin_delay_total = (tmp < d->swapin_delay_total) ? 0 : tmp; tmp = d->freepages_delay_total + tsk->delays->freepages_delay; d->freepages_delay_total = (tmp < d->freepages_delay_total) ? 0 : tmp; + tmp = d->thrashing_delay_total + tsk->delays->thrashing_delay; + d->thrashing_delay_total = (tmp < d->thrashing_delay_total) ? 0 : tmp; d->blkio_count += tsk->delays->blkio_count; d->swapin_count += tsk->delays->swapin_count; d->freepages_count += tsk->delays->freepages_count; + d->thrashing_count += tsk->delays->thrashing_count; raw_spin_unlock_irqrestore(&tsk->delays->lock, flags);
return 0; @@ -169,3 +172,15 @@ void __delayacct_freepages_end(void) ¤t->delays->freepages_count); }
+void __delayacct_thrashing_start(void) +{ + current->delays->thrashing_start = ktime_get_ns(); +} + +void __delayacct_thrashing_end(void) +{ + delayacct_end(¤t->delays->lock, + ¤t->delays->thrashing_start, + ¤t->delays->thrashing_delay, + ¤t->delays->thrashing_count); +} diff --git a/mm/filemap.c b/mm/filemap.c index 4cf4b09..6c964d8 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -36,6 +36,7 @@ #include <linux/cleancache.h> #include <linux/shmem_fs.h> #include <linux/rmap.h> +#include <linux/delayacct.h> #include "internal.h"
#define CREATE_TRACE_POINTS @@ -1183,8 +1184,15 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q, { struct wait_page_queue wait_page; wait_queue_entry_t *wait = &wait_page.wait; + bool thrashing = false; int ret = 0;
+ if (bit_nr == PG_locked && !PageSwapBacked(page) && + !PageUptodate(page) && PageWorkingset(page)) { + delayacct_thrashing_start(); + thrashing = true; + } + init_wait(wait); wait->flags = lock ? WQ_FLAG_EXCLUSIVE : 0; wait->func = wake_page_function; @@ -1223,6 +1231,9 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q,
finish_wait(q, wait);
+ if (thrashing) + delayacct_thrashing_end(); + /* * A signal could leave PageWaiters set. Clearing it here if * !waitqueue_active would be possible (by open-coding finish_wait), diff --git a/tools/accounting/getdelays.c b/tools/accounting/getdelays.c index 6bf6a20..5ef1c15 100644 --- a/tools/accounting/getdelays.c +++ b/tools/accounting/getdelays.c @@ -203,6 +203,8 @@ static void print_delayacct(struct taskstats *t) "SWAP %15s%15s%15s\n" " %15llu%15llu%15llums\n" "RECLAIM %12s%15s%15s\n" + " %15llu%15llu%15llums\n" + "THRASHING%12s%15s%15s\n" " %15llu%15llu%15llums\n", "count", "real total", "virtual total", "delay total", "delay average", @@ -222,7 +224,11 @@ static void print_delayacct(struct taskstats *t) "count", "delay total", "delay average", (unsigned long long)t->freepages_count, (unsigned long long)t->freepages_delay_total, - average_ms(t->freepages_delay_total, t->freepages_count)); + average_ms(t->freepages_delay_total, t->freepages_count), + "count", "delay total", "delay average", + (unsigned long long)t->thrashing_count, + (unsigned long long)t->thrashing_delay_total, + average_ms(t->thrashing_delay_total, t->thrashing_count)); }
static void task_context_switch_counts(struct taskstats *t)
mainline inclusion from mainline-v5.4 commit 8508cf3ffad4defa202b303e5b6379efc4cd9054 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I47QS2 CVE: NA
--------------------------------
There are several definitions of those functions/macros in places that mess with fixed-point load averages. Provide an official version.
[akpm@linux-foundation.org: fix missed conversion in block/blk-iolatency.c] Link: http://lkml.kernel.org/r/20180828172258.3185-5-hannes@cmpxchg.org Signed-off-by: Johannes Weiner hannes@cmpxchg.org Acked-by: Peter Zijlstra (Intel) peterz@infradead.org Tested-by: Suren Baghdasaryan surenb@google.com Tested-by: Daniel Drake drake@endlessm.com Cc: Christopher Lameter cl@linux.com Cc: Ingo Molnar mingo@redhat.com Cc: Johannes Weiner jweiner@fb.com Cc: Mike Galbraith efault@gmx.de Cc: Peter Enderborg peter.enderborg@sony.com Cc: Randy Dunlap rdunlap@infradead.org Cc: Shakeel Butt shakeelb@google.com Cc: Tejun Heo tj@kernel.org Cc: Vinayak Menon vinmenon@codeaurora.org Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Liu Xinpeng liuxp11@chinatelecom.cn Signed-off-by: Ctyun Kernel ctyuncommiter01@chinatelecom.cn --- arch/powerpc/platforms/cell/cpufreq_spudemand.c | 2 +- arch/powerpc/platforms/cell/spufs/sched.c | 9 +++------ arch/s390/appldata/appldata_os.c | 4 ---- block/blk-iolatency.c | 7 ++++--- drivers/cpuidle/governors/menu.c | 4 ---- fs/proc/loadavg.c | 3 --- include/linux/sched/loadavg.h | 21 +++++++++++++++++---- kernel/debug/kdb/kdb_main.c | 7 +------ kernel/sched/loadavg.c | 15 --------------- 9 files changed, 26 insertions(+), 46 deletions(-)
diff --git a/arch/powerpc/platforms/cell/cpufreq_spudemand.c b/arch/powerpc/platforms/cell/cpufreq_spudemand.c index 882944c..5d8e8b6 100644 --- a/arch/powerpc/platforms/cell/cpufreq_spudemand.c +++ b/arch/powerpc/platforms/cell/cpufreq_spudemand.c @@ -49,7 +49,7 @@ static int calc_freq(struct spu_gov_info_struct *info) cpu = info->policy->cpu; busy_spus = atomic_read(&cbe_spu_info[cpu_to_node(cpu)].busy_spus);
- CALC_LOAD(info->busy_spus, EXP, busy_spus * FIXED_1); + info->busy_spus = calc_load(info->busy_spus, EXP, busy_spus * FIXED_1); pr_debug("cpu %d: busy_spus=%d, info->busy_spus=%ld\n", cpu, busy_spus, info->busy_spus);
diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c index c9ef3c5..9fcccb4 100644 --- a/arch/powerpc/platforms/cell/spufs/sched.c +++ b/arch/powerpc/platforms/cell/spufs/sched.c @@ -987,9 +987,9 @@ static void spu_calc_load(void) unsigned long active_tasks; /* fixed-point */
active_tasks = count_active_contexts() * FIXED_1; - CALC_LOAD(spu_avenrun[0], EXP_1, active_tasks); - CALC_LOAD(spu_avenrun[1], EXP_5, active_tasks); - CALC_LOAD(spu_avenrun[2], EXP_15, active_tasks); + spu_avenrun[0] = calc_load(spu_avenrun[0], EXP_1, active_tasks); + spu_avenrun[1] = calc_load(spu_avenrun[1], EXP_5, active_tasks); + spu_avenrun[2] = calc_load(spu_avenrun[2], EXP_15, active_tasks); }
static void spusched_wake(struct timer_list *unused) @@ -1071,9 +1071,6 @@ void spuctx_switch_state(struct spu_context *ctx, } }
-#define LOAD_INT(x) ((x) >> FSHIFT) -#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100) - static int show_spu_loadavg(struct seq_file *s, void *private) { int a, b, c; diff --git a/arch/s390/appldata/appldata_os.c b/arch/s390/appldata/appldata_os.c index 433a994..54f3756 100644 --- a/arch/s390/appldata/appldata_os.c +++ b/arch/s390/appldata/appldata_os.c @@ -25,10 +25,6 @@
#include "appldata.h"
- -#define LOAD_INT(x) ((x) >> FSHIFT) -#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100) - /* * OS data * diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c index 8897f7c..6f02897 100644 --- a/block/blk-iolatency.c +++ b/block/blk-iolatency.c @@ -141,7 +141,7 @@ struct iolatency_grp { #define BLKIOLATENCY_MAX_WIN_SIZE NSEC_PER_SEC /* * These are the constants used to fake the fixed-point moving average - * calculation just like load average. The call to CALC_LOAD folds + * calculation just like load average. The call to calc_load() folds * (FIXED_1 (2048) - exp_factor) * new_sample into lat_avg. The sampling * window size is bucketed to try to approximately calculate average * latency such that 1/exp (decay rate) is [1 min, 2.5 min) when windows @@ -505,7 +505,7 @@ static void iolatency_check_latencies(struct iolatency_grp *iolat, u64 now) lat_info = &parent->child_lat;
/* - * CALC_LOAD takes in a number stored in fixed point representation. + * calc_load takes in a number stored in fixed point representation. * Because we are using this for IO time in ns, the values stored * are significantly larger than the FIXED_1 denominator (2048). * Therefore, rounding errors in the calculation are negligible and @@ -514,7 +514,8 @@ static void iolatency_check_latencies(struct iolatency_grp *iolat, u64 now) exp_idx = min_t(int, BLKIOLATENCY_NR_EXP_FACTORS - 1, div64_u64(iolat->cur_win_nsec, BLKIOLATENCY_EXP_BUCKET_SIZE)); - CALC_LOAD(iolat->lat_avg, iolatency_exp_factors[exp_idx], stat.mean); + iolat->lat_avg = calc_avg(iolat->lat_avg, + iolatency_exp_factors[exp_idx], stat.mean);
/* Everything is ok and we don't need to adjust the scale. */ if (stat.mean <= iolat->min_lat_nsec && diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c index c447014..69c25bf 100644 --- a/drivers/cpuidle/governors/menu.c +++ b/drivers/cpuidle/governors/menu.c @@ -130,10 +130,6 @@ struct menu_device { int interval_ptr; };
- -#define LOAD_INT(x) ((x) >> FSHIFT) -#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100) - static inline int get_loadavg(unsigned long load) { return LOAD_INT(load) * 10 + LOAD_FRAC(load) / 10; diff --git a/fs/proc/loadavg.c b/fs/proc/loadavg.c index d066947..8468bae 100644 --- a/fs/proc/loadavg.c +++ b/fs/proc/loadavg.c @@ -10,9 +10,6 @@ #include <linux/seqlock.h> #include <linux/time.h>
-#define LOAD_INT(x) ((x) >> FSHIFT) -#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100) - static int loadavg_proc_show(struct seq_file *m, void *v) { unsigned long avnrun[3]; diff --git a/include/linux/sched/loadavg.h b/include/linux/sched/loadavg.h index 80bc84b..cc9cc62 100644 --- a/include/linux/sched/loadavg.h +++ b/include/linux/sched/loadavg.h @@ -22,10 +22,23 @@ #define EXP_5 2014 /* 1/exp(5sec/5min) */ #define EXP_15 2037 /* 1/exp(5sec/15min) */
-#define CALC_LOAD(load,exp,n) \ - load *= exp; \ - load += n*(FIXED_1-exp); \ - load >>= FSHIFT; +/* + * a1 = a0 * e + a * (1 - e) + */ +static inline unsigned long +calc_load(unsigned long load, unsigned long exp, unsigned long active) +{ + unsigned long newload; + + newload = load * exp + active * (FIXED_1 - exp); + if (active >= load) + newload += FIXED_1-1; + + return newload / FIXED_1; +} + +#define LOAD_INT(x) ((x) >> FSHIFT) +#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)
extern void calc_global_load(unsigned long ticks);
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index dc6bf35..cdde394 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -2556,16 +2556,11 @@ static int kdb_summary(int argc, const char **argv) } kdb_printf("%02ld:%02ld\n", val.uptime/(60*60), (val.uptime/60)%60);
- /* lifted from fs/proc/proc_misc.c::loadavg_read_proc() */ - -#define LOAD_INT(x) ((x) >> FSHIFT) -#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100) kdb_printf("load avg %ld.%02ld %ld.%02ld %ld.%02ld\n", LOAD_INT(val.loads[0]), LOAD_FRAC(val.loads[0]), LOAD_INT(val.loads[1]), LOAD_FRAC(val.loads[1]), LOAD_INT(val.loads[2]), LOAD_FRAC(val.loads[2])); -#undef LOAD_INT -#undef LOAD_FRAC + /* Display in kilobytes */ #define K(x) ((x) << (PAGE_SHIFT - 10)) kdb_printf("\nMemTotal: %8lu kB\nMemFree: %8lu kB\n" diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c index a171c12..54fbdfb 100644 --- a/kernel/sched/loadavg.c +++ b/kernel/sched/loadavg.c @@ -91,21 +91,6 @@ long calc_load_fold_active(struct rq *this_rq, long adjust) return delta; }
-/* - * a1 = a0 * e + a * (1 - e) - */ -static unsigned long -calc_load(unsigned long load, unsigned long exp, unsigned long active) -{ - unsigned long newload; - - newload = load * exp + active * (FIXED_1 - exp); - if (active >= load) - newload += FIXED_1-1; - - return newload / FIXED_1; -} - #ifdef CONFIG_NO_HZ_COMMON /* * Handle NO_HZ for the global load-average.
您好,冒昧打扰。
您的补丁自第 2 个补丁之后,有如下编译错误。
On 2021/9/5 22:26, Liu Xinpeng wrote:
- latency such that 1/exp (decay rate) is [1 min, 2.5 min) when windows
@@ -505,7 +505,7 @@ static void iolatency_check_latencies(struct iolatency_grp *iolat, u64 now) lat_info = &parent->child_lat;
/*
* CALC_LOAD takes in a number stored in fixed point representation.
* calc_load takes in a number stored in fixed point representation.
- Because we are using this for IO time in ns, the values stored
- are significantly larger than the FIXED_1 denominator (2048).
- Therefore, rounding errors in the calculation are negligible and
@@ -514,7 +514,8 @@ static void iolatency_check_latencies(struct iolatency_grp *iolat, u64 now) exp_idx = min_t(int, BLKIOLATENCY_NR_EXP_FACTORS - 1, div64_u64(iolat->cur_win_nsec, BLKIOLATENCY_EXP_BUCKET_SIZE));
- CALC_LOAD(iolat->lat_avg, iolatency_exp_factors[exp_idx], stat.mean);
- iolat->lat_avg = calc_avg(iolat->lat_avg,
iolatency_exp_factors[exp_idx], stat.mean);
[FAIL]build_x86_64Sept. 6, 2021, 9:54 a.m. UTC
make ARCH=x86_64 CROSS_COMPILE=x86_64-linux-gnu- allmodconfig && make -j64 ARCH=x86_64 CROSS_COMPILE=x86_64-linux-gnu-
block/blk-iolatency.c: In function ‘iolatency_check_latencies’: block/blk-iolatency.c:517:19: error: implicit declaration of function ‘calc_avg’; did you mean ‘calc_load’? [-Werror=implicit-function-declaration] iolat->lat_avg = calc_avg(iolat->lat_avg, ^~~~~~~~ calc_load cc1: some warnings being treated as errors make[1]: *** [block/blk-iolatency.o] Error 1 make: *** [block] Error 2 make: *** Waiting for unfinished jobs....
[FAIL]build_arm64Sept. 6, 2021, 9:54 a.m. UTC
make ARCH=arm64 CROSS_COMPILE=aarch64-linux-gnu- allmodconfig && make -j64 ARCH=arm64 CROSS_COMPILE=aarch64-linux-gnu-
block/blk-iolatency.c: In function ‘iolatency_check_latencies’: block/blk-iolatency.c:517:19: error: implicit declaration of function ‘calc_avg’; did you mean ‘calc_load’? [-Werror=implicit-function-declaration] iolat->lat_avg = calc_avg(iolat->lat_avg, ^~~~~~~~ calc_load cc1: some warnings being treated as errors make[1]: *** [block/blk-iolatency.o] Error 1 make: *** [block] Error 2 make: *** Waiting for unfinished jobs....
[FAIL]build_storage_arm64Sept. 6, 2021, 9:53 a.m. UTC
cp arch/arm64/configs/storage_ci_defconfig .config && make -j8 ARCH=arm64 CROSS_COMPILE=aarch64-linux-gnu-
block/blk-iolatency.c: In function ‘iolatency_check_latencies’: block/blk-iolatency.c:517:19: error: implicit declaration of function ‘calc_avg’; did you mean ‘calc_load’? [-Werror=implicit-function-declaration] iolat->lat_avg = calc_avg(iolat->lat_avg, ^~~~~~~~ calc_load cc1: some warnings being treated as errors make[1]: *** [block/blk-iolatency.o] Error 1 make: *** [block] Error 2 make: *** Waiting for unfinished jobs.... * _HZ for the global load-average.
kernel 合入的补丁要求每个补丁依次顺序合入都可以正常编译。
谢谢。
---- 成坚
There is no "calc_load --> calc_avg" in patch 02,please check.
Thanks!
From: chengjian (D) Date: 2021-09-07 10:21 To: Liu Xinpeng; kernel@openeuler.org CC: xiexiuqi; cheng_openeuler; Li Bin Subject: Re: [PATCH kernel-4.19 02/29] sched: loadavg: consolidate LOAD_INT, LOAD_FRAC, CALC_LOAD 您好,冒昧打扰。 您的补丁自第 2 个补丁之后,有如下编译错误。 On 2021/9/5 22:26, Liu Xinpeng wrote: * latency such that 1/exp (decay rate) is [1 min, 2.5 min) when windows @@ -505,7 +505,7 @@ static void iolatency_check_latencies(struct iolatency_grp *iolat, u64 now) lat_info = &parent->child_lat;
/* - * CALC_LOAD takes in a number stored in fixed point representation. + * calc_load takes in a number stored in fixed point representation. * Because we are using this for IO time in ns, the values stored * are significantly larger than the FIXED_1 denominator (2048). * Therefore, rounding errors in the calculation are negligible and @@ -514,7 +514,8 @@ static void iolatency_check_latencies(struct iolatency_grp *iolat, u64 now) exp_idx = min_t(int, BLKIOLATENCY_NR_EXP_FACTORS - 1, div64_u64(iolat->cur_win_nsec, BLKIOLATENCY_EXP_BUCKET_SIZE)); - CALC_LOAD(iolat->lat_avg, iolatency_exp_factors[exp_idx], stat.mean); + iolat->lat_avg = calc_avg(iolat->lat_avg, + iolatency_exp_factors[exp_idx], stat.mean);
[FAIL] build_x86_64Sept. 6, 2021, 9:54 a.m. UTC make ARCH=x86_64 CROSS_COMPILE=x86_64-linux-gnu- allmodconfig && make -j64 ARCH=x86_64 CROSS_COMPILE=x86_64-linux-gnu- block/blk-iolatency.c: In function ‘iolatency_check_latencies’: block/blk-iolatency.c:517:19: error: implicit declaration of function ‘calc_avg’; did you mean ‘calc_load’? [-Werror=implicit-function-declaration] iolat->lat_avg = calc_avg(iolat->lat_avg, ^~~~~~~~ calc_load cc1: some warnings being treated as errors make[1]: *** [block/blk-iolatency.o] Error 1 make: *** [block] Error 2 make: *** Waiting for unfinished jobs....
[FAIL] build_arm64Sept. 6, 2021, 9:54 a.m. UTC make ARCH=arm64 CROSS_COMPILE=aarch64-linux-gnu- allmodconfig && make -j64 ARCH=arm64 CROSS_COMPILE=aarch64-linux-gnu- block/blk-iolatency.c: In function ‘iolatency_check_latencies’: block/blk-iolatency.c:517:19: error: implicit declaration of function ‘calc_avg’; did you mean ‘calc_load’? [-Werror=implicit-function-declaration] iolat->lat_avg = calc_avg(iolat->lat_avg, ^~~~~~~~ calc_load cc1: some warnings being treated as errors make[1]: *** [block/blk-iolatency.o] Error 1 make: *** [block] Error 2 make: *** Waiting for unfinished jobs....
[FAIL] build_storage_arm64Sept. 6, 2021, 9:53 a.m. UTC cp arch/arm64/configs/storage_ci_defconfig .config && make -j8 ARCH=arm64 CROSS_COMPILE=aarch64-linux-gnu- block/blk-iolatency.c: In function ‘iolatency_check_latencies’: block/blk-iolatency.c:517:19: error: implicit declaration of function ‘calc_avg’; did you mean ‘calc_load’? [-Werror=implicit-function-declaration] iolat->lat_avg = calc_avg(iolat->lat_avg, ^~~~~~~~ calc_load cc1: some warnings being treated as errors make[1]: *** [block/blk-iolatency.o] Error 1 make: *** [block] Error 2 make: *** Waiting for unfinished jobs.... * _HZ for the global load-average.
kernel 合入的补丁要求每个补丁依次顺序合入都可以正常编译。
谢谢。
---- 成坚
是我弄错版本了。 我会把每个commit再检查和编译一遍。
KABI的问题也在研究,谢谢指点!
Thanks!
From: liuxp11@chinatelecom.cn Date: 2021-09-07 10:47 To: cj.chengjian; kernel CC: xiexiuqi; cheng_openeuler; Li Bin Subject: Re: Re: [PATCH kernel-4.19 02/29] sched: loadavg: consolidate LOAD_INT, LOAD_FRAC, CALC_LOAD There is no "calc_load --> calc_avg" in patch 02,please check.
Thanks!
From: chengjian (D) Date: 2021-09-07 10:21 To: Liu Xinpeng; kernel@openeuler.org CC: xiexiuqi; cheng_openeuler; Li Bin Subject: Re: [PATCH kernel-4.19 02/29] sched: loadavg: consolidate LOAD_INT, LOAD_FRAC, CALC_LOAD 您好,冒昧打扰。 您的补丁自第 2 个补丁之后,有如下编译错误。 On 2021/9/5 22:26, Liu Xinpeng wrote: * latency such that 1/exp (decay rate) is [1 min, 2.5 min) when windows @@ -505,7 +505,7 @@ static void iolatency_check_latencies(struct iolatency_grp *iolat, u64 now) lat_info = &parent->child_lat;
/* - * CALC_LOAD takes in a number stored in fixed point representation. + * calc_load takes in a number stored in fixed point representation. * Because we are using this for IO time in ns, the values stored * are significantly larger than the FIXED_1 denominator (2048). * Therefore, rounding errors in the calculation are negligible and @@ -514,7 +514,8 @@ static void iolatency_check_latencies(struct iolatency_grp *iolat, u64 now) exp_idx = min_t(int, BLKIOLATENCY_NR_EXP_FACTORS - 1, div64_u64(iolat->cur_win_nsec, BLKIOLATENCY_EXP_BUCKET_SIZE)); - CALC_LOAD(iolat->lat_avg, iolatency_exp_factors[exp_idx], stat.mean); + iolat->lat_avg = calc_avg(iolat->lat_avg, + iolatency_exp_factors[exp_idx], stat.mean);
[FAIL] build_x86_64Sept. 6, 2021, 9:54 a.m. UTC make ARCH=x86_64 CROSS_COMPILE=x86_64-linux-gnu- allmodconfig && make -j64 ARCH=x86_64 CROSS_COMPILE=x86_64-linux-gnu- block/blk-iolatency.c: In function ‘iolatency_check_latencies’: block/blk-iolatency.c:517:19: error: implicit declaration of function ‘calc_avg’; did you mean ‘calc_load’? [-Werror=implicit-function-declaration] iolat->lat_avg = calc_avg(iolat->lat_avg, ^~~~~~~~ calc_load cc1: some warnings being treated as errors make[1]: *** [block/blk-iolatency.o] Error 1 make: *** [block] Error 2 make: *** Waiting for unfinished jobs....
[FAIL] build_arm64Sept. 6, 2021, 9:54 a.m. UTC make ARCH=arm64 CROSS_COMPILE=aarch64-linux-gnu- allmodconfig && make -j64 ARCH=arm64 CROSS_COMPILE=aarch64-linux-gnu- block/blk-iolatency.c: In function ‘iolatency_check_latencies’: block/blk-iolatency.c:517:19: error: implicit declaration of function ‘calc_avg’; did you mean ‘calc_load’? [-Werror=implicit-function-declaration] iolat->lat_avg = calc_avg(iolat->lat_avg, ^~~~~~~~ calc_load cc1: some warnings being treated as errors make[1]: *** [block/blk-iolatency.o] Error 1 make: *** [block] Error 2 make: *** Waiting for unfinished jobs....
[FAIL] build_storage_arm64Sept. 6, 2021, 9:53 a.m. UTC cp arch/arm64/configs/storage_ci_defconfig .config && make -j8 ARCH=arm64 CROSS_COMPILE=aarch64-linux-gnu- block/blk-iolatency.c: In function ‘iolatency_check_latencies’: block/blk-iolatency.c:517:19: error: implicit declaration of function ‘calc_avg’; did you mean ‘calc_load’? [-Werror=implicit-function-declaration] iolat->lat_avg = calc_avg(iolat->lat_avg, ^~~~~~~~ calc_load cc1: some warnings being treated as errors make[1]: *** [block/blk-iolatency.o] Error 1 make: *** [block] Error 2 make: *** Waiting for unfinished jobs.... * _HZ for the global load-average.
kernel 合入的补丁要求每个补丁依次顺序合入都可以正常编译。
谢谢。
---- 成坚
mainline inclusion from mainline-v5.4 commit 5c54f5b9edb1aa2eabbb1091c458f1b6776a1896 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I47QS2 CVE: NA
--------------------------------
It's going to be used in a later patch. Keep the churn separate.
Link: http://lkml.kernel.org/r/20180828172258.3185-6-hannes@cmpxchg.org Signed-off-by: Johannes Weiner hannes@cmpxchg.org Acked-by: Peter Zijlstra (Intel) peterz@infradead.org Tested-by: Suren Baghdasaryan surenb@google.com Tested-by: Daniel Drake drake@endlessm.com Cc: Christopher Lameter cl@linux.com Cc: Ingo Molnar mingo@redhat.com Cc: Johannes Weiner jweiner@fb.com Cc: Mike Galbraith efault@gmx.de Cc: Peter Enderborg peter.enderborg@sony.com Cc: Randy Dunlap rdunlap@infradead.org Cc: Shakeel Butt shakeelb@google.com Cc: Tejun Heo tj@kernel.org Cc: Vinayak Menon vinmenon@codeaurora.org Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Liu Xinpeng liuxp11@chinatelecom.cn Signed-off-by: Ctyun Kernel ctyuncommiter01@chinatelecom.cn --- include/linux/sched/loadavg.h | 3 + kernel/sched/loadavg.c | 138 +++++++++++++++++++++--------------------- 2 files changed, 72 insertions(+), 69 deletions(-)
diff --git a/include/linux/sched/loadavg.h b/include/linux/sched/loadavg.h index cc9cc62..4859bea 100644 --- a/include/linux/sched/loadavg.h +++ b/include/linux/sched/loadavg.h @@ -37,6 +37,9 @@ return newload / FIXED_1; }
+extern unsigned long calc_load_n(unsigned long load, unsigned long exp, + unsigned long active, unsigned int n); + #define LOAD_INT(x) ((x) >> FSHIFT) #define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)
diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c index 54fbdfb..28a5165 100644 --- a/kernel/sched/loadavg.c +++ b/kernel/sched/loadavg.c @@ -91,6 +91,75 @@ long calc_load_fold_active(struct rq *this_rq, long adjust) return delta; }
+/** + * fixed_power_int - compute: x^n, in O(log n) time + * + * @x: base of the power + * @frac_bits: fractional bits of @x + * @n: power to raise @x to. + * + * By exploiting the relation between the definition of the natural power + * function: x^n := x*x*...*x (x multiplied by itself for n times), and + * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i, + * (where: n_i \elem {0, 1}, the binary vector representing n), + * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is + * of course trivially computable in O(log_2 n), the length of our binary + * vector. + */ +static unsigned long +fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n) +{ + unsigned long result = 1UL << frac_bits; + + if (n) { + for (;;) { + if (n & 1) { + result *= x; + result += 1UL << (frac_bits - 1); + result >>= frac_bits; + } + n >>= 1; + if (!n) + break; + x *= x; + x += 1UL << (frac_bits - 1); + x >>= frac_bits; + } + } + + return result; +} + +/* + * a1 = a0 * e + a * (1 - e) + * + * a2 = a1 * e + a * (1 - e) + * = (a0 * e + a * (1 - e)) * e + a * (1 - e) + * = a0 * e^2 + a * (1 - e) * (1 + e) + * + * a3 = a2 * e + a * (1 - e) + * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e) + * = a0 * e^3 + a * (1 - e) * (1 + e + e^2) + * + * ... + * + * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1] + * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e) + * = a0 * e^n + a * (1 - e^n) + * + * [1] application of the geometric series: + * + * n 1 - x^(n+1) + * S_n := \Sum x^i = ------------- + * i=0 1 - x + */ +unsigned long +calc_load_n(unsigned long load, unsigned long exp, + unsigned long active, unsigned int n) +{ + return calc_load(load, fixed_power_int(exp, FSHIFT, n), active); +} + #ifdef CONFIG_NO_HZ_COMMON /* * Handle NO_HZ for the global load-average. @@ -210,75 +279,6 @@ static long calc_load_nohz_fold(void) return delta; }
-/** - * fixed_power_int - compute: x^n, in O(log n) time - * - * @x: base of the power - * @frac_bits: fractional bits of @x - * @n: power to raise @x to. - * - * By exploiting the relation between the definition of the natural power - * function: x^n := x*x*...*x (x multiplied by itself for n times), and - * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i, - * (where: n_i \elem {0, 1}, the binary vector representing n), - * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is - * of course trivially computable in O(log_2 n), the length of our binary - * vector. - */ -static unsigned long -fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n) -{ - unsigned long result = 1UL << frac_bits; - - if (n) { - for (;;) { - if (n & 1) { - result *= x; - result += 1UL << (frac_bits - 1); - result >>= frac_bits; - } - n >>= 1; - if (!n) - break; - x *= x; - x += 1UL << (frac_bits - 1); - x >>= frac_bits; - } - } - - return result; -} - -/* - * a1 = a0 * e + a * (1 - e) - * - * a2 = a1 * e + a * (1 - e) - * = (a0 * e + a * (1 - e)) * e + a * (1 - e) - * = a0 * e^2 + a * (1 - e) * (1 + e) - * - * a3 = a2 * e + a * (1 - e) - * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e) - * = a0 * e^3 + a * (1 - e) * (1 + e + e^2) - * - * ... - * - * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1] - * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e) - * = a0 * e^n + a * (1 - e^n) - * - * [1] application of the geometric series: - * - * n 1 - x^(n+1) - * S_n := \Sum x^i = ------------- - * i=0 1 - x - */ -static unsigned long -calc_load_n(unsigned long load, unsigned long exp, - unsigned long active, unsigned int n) -{ - return calc_load(load, fixed_power_int(exp, FSHIFT, n), active); -} - /* * NO_HZ can leave us missing all per-CPU ticks calling * calc_load_fold_active(), but since a NO_HZ CPU folds its delta into
mainline inclusion from mainline-v5.4 commit 1f351d7f7590857ea281579c26e6045b4c548ef4 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I47QS2 CVE: NA
--------------------------------
kernel/sched/sched.h includes "stats.h" half-way through the file. The next patch introduces users of sched.h's rq locking functions and update_rq_clock() in kernel/sched/stats.h. Move those definitions up in the file so they are available in stats.h.
Link: http://lkml.kernel.org/r/20180828172258.3185-7-hannes@cmpxchg.org Signed-off-by: Johannes Weiner hannes@cmpxchg.org Acked-by: Peter Zijlstra (Intel) peterz@infradead.org Tested-by: Suren Baghdasaryan surenb@google.com Tested-by: Daniel Drake drake@endlessm.com Cc: Christopher Lameter cl@linux.com Cc: Ingo Molnar mingo@redhat.com Cc: Johannes Weiner jweiner@fb.com Cc: Mike Galbraith efault@gmx.de Cc: Peter Enderborg peter.enderborg@sony.com Cc: Randy Dunlap rdunlap@infradead.org Cc: Shakeel Butt shakeelb@google.com Cc: Tejun Heo tj@kernel.org Cc: Vinayak Menon vinmenon@codeaurora.org Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Liu Xinpeng liuxp11@chinatelecom.cn Signed-off-by: Ctyun Kernel ctyuncommiter01@chinatelecom.cn --- kernel/sched/sched.h | 164 +++++++++++++++++++++++++-------------------------- 1 file changed, 82 insertions(+), 82 deletions(-)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 1d7bc93..8f2ae75 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1003,6 +1003,8 @@ static inline void update_idle_core(struct rq *rq) { } #define cpu_curr(cpu) (cpu_rq(cpu)->curr) #define raw_rq() raw_cpu_ptr(&runqueues)
+extern void update_rq_clock(struct rq *rq); + static inline u64 __rq_clock_broken(struct rq *rq) { return READ_ONCE(rq->clock); @@ -1121,6 +1123,86 @@ static inline void rq_repin_lock(struct rq *rq, struct rq_flags *rf) #endif }
+struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) + __acquires(rq->lock); + +struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) + __acquires(p->pi_lock) + __acquires(rq->lock); + +static inline void __task_rq_unlock(struct rq *rq, struct rq_flags *rf) + __releases(rq->lock) +{ + rq_unpin_lock(rq, rf); + raw_spin_unlock(&rq->lock); +} + +static inline void +task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf) + __releases(rq->lock) + __releases(p->pi_lock) +{ + rq_unpin_lock(rq, rf); + raw_spin_unlock(&rq->lock); + raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); +} + +static inline void +rq_lock_irqsave(struct rq *rq, struct rq_flags *rf) + __acquires(rq->lock) +{ + raw_spin_lock_irqsave(&rq->lock, rf->flags); + rq_pin_lock(rq, rf); +} + +static inline void +rq_lock_irq(struct rq *rq, struct rq_flags *rf) + __acquires(rq->lock) +{ + raw_spin_lock_irq(&rq->lock); + rq_pin_lock(rq, rf); +} + +static inline void +rq_lock(struct rq *rq, struct rq_flags *rf) + __acquires(rq->lock) +{ + raw_spin_lock(&rq->lock); + rq_pin_lock(rq, rf); +} + +static inline void +rq_relock(struct rq *rq, struct rq_flags *rf) + __acquires(rq->lock) +{ + raw_spin_lock(&rq->lock); + rq_repin_lock(rq, rf); +} + +static inline void +rq_unlock_irqrestore(struct rq *rq, struct rq_flags *rf) + __releases(rq->lock) +{ + rq_unpin_lock(rq, rf); + raw_spin_unlock_irqrestore(&rq->lock, rf->flags); +} + +static inline void +rq_unlock_irq(struct rq *rq, struct rq_flags *rf) + __releases(rq->lock) +{ + rq_unpin_lock(rq, rf); + raw_spin_unlock_irq(&rq->lock); +} + +static inline void +rq_unlock(struct rq *rq, struct rq_flags *rf) + __releases(rq->lock) +{ + rq_unpin_lock(rq, rf); + raw_spin_unlock(&rq->lock); +} + #ifdef CONFIG_NUMA enum numa_topology_type { NUMA_DIRECT, @@ -1777,8 +1859,6 @@ static inline void sub_nr_running(struct rq *rq, unsigned count) sched_update_tick_dependency(rq); }
-extern void update_rq_clock(struct rq *rq); - extern void activate_task(struct rq *rq, struct task_struct *p, int flags); extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags);
@@ -1843,86 +1923,6 @@ unsigned long arch_scale_cpu_capacity(void __always_unused *sd, int cpu) #endif #endif
-struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) - __acquires(rq->lock); - -struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) - __acquires(p->pi_lock) - __acquires(rq->lock); - -static inline void __task_rq_unlock(struct rq *rq, struct rq_flags *rf) - __releases(rq->lock) -{ - rq_unpin_lock(rq, rf); - raw_spin_unlock(&rq->lock); -} - -static inline void -task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf) - __releases(rq->lock) - __releases(p->pi_lock) -{ - rq_unpin_lock(rq, rf); - raw_spin_unlock(&rq->lock); - raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); -} - -static inline void -rq_lock_irqsave(struct rq *rq, struct rq_flags *rf) - __acquires(rq->lock) -{ - raw_spin_lock_irqsave(&rq->lock, rf->flags); - rq_pin_lock(rq, rf); -} - -static inline void -rq_lock_irq(struct rq *rq, struct rq_flags *rf) - __acquires(rq->lock) -{ - raw_spin_lock_irq(&rq->lock); - rq_pin_lock(rq, rf); -} - -static inline void -rq_lock(struct rq *rq, struct rq_flags *rf) - __acquires(rq->lock) -{ - raw_spin_lock(&rq->lock); - rq_pin_lock(rq, rf); -} - -static inline void -rq_relock(struct rq *rq, struct rq_flags *rf) - __acquires(rq->lock) -{ - raw_spin_lock(&rq->lock); - rq_repin_lock(rq, rf); -} - -static inline void -rq_unlock_irqrestore(struct rq *rq, struct rq_flags *rf) - __releases(rq->lock) -{ - rq_unpin_lock(rq, rf); - raw_spin_unlock_irqrestore(&rq->lock, rf->flags); -} - -static inline void -rq_unlock_irq(struct rq *rq, struct rq_flags *rf) - __releases(rq->lock) -{ - rq_unpin_lock(rq, rf); - raw_spin_unlock_irq(&rq->lock); -} - -static inline void -rq_unlock(struct rq *rq, struct rq_flags *rf) - __releases(rq->lock) -{ - rq_unpin_lock(rq, rf); - raw_spin_unlock(&rq->lock); -} - #ifdef CONFIG_SMP #ifdef CONFIG_PREEMPT
mainline inclusion from mainline-v5.4 commit 246b3b3342c9b0a2e24cda2178be87bc36e1c874 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I47QS2 CVE: NA
--------------------------------
do_sched_yield() disables IRQs, looks up this_rq() and locks it. The next patch is adding another site with the same pattern, so provide a convenience function for it.
Link: http://lkml.kernel.org/r/20180828172258.3185-8-hannes@cmpxchg.org Signed-off-by: Johannes Weiner hannes@cmpxchg.org Acked-by: Peter Zijlstra (Intel) peterz@infradead.org Tested-by: Suren Baghdasaryan surenb@google.com Tested-by: Daniel Drake drake@endlessm.com Cc: Christopher Lameter cl@linux.com Cc: Ingo Molnar mingo@redhat.com Cc: Johannes Weiner jweiner@fb.com Cc: Mike Galbraith efault@gmx.de Cc: Peter Enderborg peter.enderborg@sony.com Cc: Randy Dunlap rdunlap@infradead.org Cc: Shakeel Butt shakeelb@google.com Cc: Tejun Heo tj@kernel.org Cc: Vinayak Menon vinmenon@codeaurora.org Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Liu Xinpeng liuxp11@chinatelecom.cn Signed-off-by: Ctyun Kernel ctyuncommiter01@chinatelecom.cn --- kernel/sched/core.c | 4 +--- kernel/sched/sched.h | 12 ++++++++++++ 2 files changed, 13 insertions(+), 3 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d0d6153..81529c4 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5048,9 +5048,7 @@ static void do_sched_yield(void) struct rq_flags rf; struct rq *rq;
- local_irq_disable(); - rq = this_rq(); - rq_lock(rq, &rf); + rq = this_rq_lock_irq(&rf);
schedstat_inc(rq->yld_count); current->sched_class->yield_task(rq); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 8f2ae75..eef13b5 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1203,6 +1203,18 @@ static inline void __task_rq_unlock(struct rq *rq, struct rq_flags *rf) raw_spin_unlock(&rq->lock); }
+static inline struct rq * +this_rq_lock_irq(struct rq_flags *rf) + __acquires(rq->lock) +{ + struct rq *rq; + + local_irq_disable(); + rq = this_rq(); + rq_lock(rq, rf); + return rq; +} + #ifdef CONFIG_NUMA enum numa_topology_type { NUMA_DIRECT,
mainline inclusion from mainline-v5.4 commit eb414681d5a07d28d2ff90dc05f69ec6b232ebd2 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I47QS2 CVE: NA
--------------------------------
When systems are overcommitted and resources become contended, it's hard to tell exactly the impact this has on workload productivity, or how close the system is to lockups and OOM kills. In particular, when machines work multiple jobs concurrently, the impact of overcommit in terms of latency and throughput on the individual job can be enormous.
In order to maximize hardware utilization without sacrificing individual job health or risk complete machine lockups, this patch implements a way to quantify resource pressure in the system.
A kernel built with CONFIG_PSI=y creates files in /proc/pressure/ that expose the percentage of time the system is stalled on CPU, memory, or IO, respectively. Stall states are aggregate versions of the per-task delay accounting delays:
cpu: some tasks are runnable but not executing on a CPU memory: tasks are reclaiming, or waiting for swapin or thrashing cache io: tasks are waiting for io completions
These percentages of walltime can be thought of as pressure percentages, and they give a general sense of system health and productivity loss incurred by resource overcommit. They can also indicate when the system is approaching lockup scenarios and OOMs.
To do this, psi keeps track of the task states associated with each CPU and samples the time they spend in stall states. Every 2 seconds, the samples are averaged across CPUs - weighted by the CPUs' non-idle time to eliminate artifacts from unused CPUs - and translated into percentages of walltime. A running average of those percentages is maintained over 10s, 1m, and 5m periods (similar to the loadaverage).
[hannes@cmpxchg.org: doc fixlet, per Randy] Link: http://lkml.kernel.org/r/20180828205625.GA14030@cmpxchg.org [hannes@cmpxchg.org: code optimization] Link: http://lkml.kernel.org/r/20180907175015.GA8479@cmpxchg.org [hannes@cmpxchg.org: rename psi_clock() to psi_update_work(), per Peter] Link: http://lkml.kernel.org/r/20180907145404.GB11088@cmpxchg.org [hannes@cmpxchg.org: fix build] Link: http://lkml.kernel.org/r/20180913014222.GA2370@cmpxchg.org Link: http://lkml.kernel.org/r/20180828172258.3185-9-hannes@cmpxchg.org Signed-off-by: Johannes Weiner hannes@cmpxchg.org Signed-off-by: Liu Xinpeng liuxp11@chinatelecom.cn Signed-off-by: Ctyun Kernel ctyuncommiter01@chinatelecom.cn --- Documentation/accounting/psi.txt | 64 ++++ include/linux/psi.h | 28 ++ include/linux/psi_types.h | 92 ++++++ include/linux/sched.h | 10 + init/Kconfig | 15 + kernel/fork.c | 4 + kernel/sched/Makefile | 1 + kernel/sched/core.c | 12 +- kernel/sched/psi.c | 657 +++++++++++++++++++++++++++++++++++++++ kernel/sched/sched.h | 2 + kernel/sched/stats.h | 86 +++++ mm/compaction.c | 5 + mm/filemap.c | 15 +- mm/page_alloc.c | 9 + mm/vmscan.c | 9 + 15 files changed, 1003 insertions(+), 6 deletions(-) create mode 100644 Documentation/accounting/psi.txt create mode 100644 include/linux/psi.h create mode 100644 include/linux/psi_types.h create mode 100644 kernel/sched/psi.c
diff --git a/Documentation/accounting/psi.txt b/Documentation/accounting/psi.txt new file mode 100644 index 00000000..3753a82 --- /dev/null +++ b/Documentation/accounting/psi.txt @@ -0,0 +1,64 @@ +================================ +PSI - Pressure Stall Information +================================ + +:Date: April, 2018 +:Author: Johannes Weiner hannes@cmpxchg.org + +When CPU, memory or IO devices are contended, workloads experience +latency spikes, throughput losses, and run the risk of OOM kills. + +Without an accurate measure of such contention, users are forced to +either play it safe and under-utilize their hardware resources, or +roll the dice and frequently suffer the disruptions resulting from +excessive overcommit. + +The psi feature identifies and quantifies the disruptions caused by +such resource crunches and the time impact it has on complex workloads +or even entire systems. + +Having an accurate measure of productivity losses caused by resource +scarcity aids users in sizing workloads to hardware--or provisioning +hardware according to workload demand. + +As psi aggregates this information in realtime, systems can be managed +dynamically using techniques such as load shedding, migrating jobs to +other systems or data centers, or strategically pausing or killing low +priority or restartable batch jobs. + +This allows maximizing hardware utilization without sacrificing +workload health or risking major disruptions such as OOM kills. + +Pressure interface +================== + +Pressure information for each resource is exported through the +respective file in /proc/pressure/ -- cpu, memory, and io. + +The format for CPU is as such: + +some avg10=0.00 avg60=0.00 avg300=0.00 total=0 + +and for memory and IO: + +some avg10=0.00 avg60=0.00 avg300=0.00 total=0 +full avg10=0.00 avg60=0.00 avg300=0.00 total=0 + +The "some" line indicates the share of time in which at least some +tasks are stalled on a given resource. + +The "full" line indicates the share of time in which all non-idle +tasks are stalled on a given resource simultaneously. In this state +actual CPU cycles are going to waste, and a workload that spends +extended time in this state is considered to be thrashing. This has +severe impact on performance, and it's useful to distinguish this +situation from a state where some tasks are stalled but the CPU is +still doing productive work. As such, time spent in this subset of the +stall state is tracked separately and exported in the "full" averages. + +The ratios are tracked as recent trends over ten, sixty, and three +hundred second windows, which gives insight into short term events as +well as medium and long term trends. The total absolute stall time is +tracked and exported as well, to allow detection of latency spikes +which wouldn't necessarily make a dent in the time averages, or to +average trends over custom time frames. diff --git a/include/linux/psi.h b/include/linux/psi.h new file mode 100644 index 00000000..b0daf05 --- /dev/null +++ b/include/linux/psi.h @@ -0,0 +1,28 @@ +#ifndef _LINUX_PSI_H +#define _LINUX_PSI_H + +#include <linux/psi_types.h> +#include <linux/sched.h> + +#ifdef CONFIG_PSI + +extern bool psi_disabled; + +void psi_init(void); + +void psi_task_change(struct task_struct *task, int clear, int set); + +void psi_memstall_tick(struct task_struct *task, int cpu); +void psi_memstall_enter(unsigned long *flags); +void psi_memstall_leave(unsigned long *flags); + +#else /* CONFIG_PSI */ + +static inline void psi_init(void) {} + +static inline void psi_memstall_enter(unsigned long *flags) {} +static inline void psi_memstall_leave(unsigned long *flags) {} + +#endif /* CONFIG_PSI */ + +#endif /* _LINUX_PSI_H */ diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h new file mode 100644 index 00000000..2cf422d --- /dev/null +++ b/include/linux/psi_types.h @@ -0,0 +1,92 @@ +#ifndef _LINUX_PSI_TYPES_H +#define _LINUX_PSI_TYPES_H + +#include <linux/seqlock.h> +#include <linux/types.h> + +#ifdef CONFIG_PSI + +/* Tracked task states */ +enum psi_task_count { + NR_IOWAIT, + NR_MEMSTALL, + NR_RUNNING, + NR_PSI_TASK_COUNTS, +}; + +/* Task state bitmasks */ +#define TSK_IOWAIT (1 << NR_IOWAIT) +#define TSK_MEMSTALL (1 << NR_MEMSTALL) +#define TSK_RUNNING (1 << NR_RUNNING) + +/* Resources that workloads could be stalled on */ +enum psi_res { + PSI_IO, + PSI_MEM, + PSI_CPU, + NR_PSI_RESOURCES, +}; + +/* + * Pressure states for each resource: + * + * SOME: Stalled tasks & working tasks + * FULL: Stalled tasks & no working tasks + */ +enum psi_states { + PSI_IO_SOME, + PSI_IO_FULL, + PSI_MEM_SOME, + PSI_MEM_FULL, + PSI_CPU_SOME, + /* Only per-CPU, to weigh the CPU in the global average: */ + PSI_NONIDLE, + NR_PSI_STATES, +}; + +struct psi_group_cpu { + /* 1st cacheline updated by the scheduler */ + + /* Aggregator needs to know of concurrent changes */ + seqcount_t seq ____cacheline_aligned_in_smp; + + /* States of the tasks belonging to this group */ + unsigned int tasks[NR_PSI_TASK_COUNTS]; + + /* Period time sampling buckets for each state of interest (ns) */ + u32 times[NR_PSI_STATES]; + + /* Time of last task change in this group (rq_clock) */ + u64 state_start; + + /* 2nd cacheline updated by the aggregator */ + + /* Delta detection against the sampling buckets */ + u32 times_prev[NR_PSI_STATES] ____cacheline_aligned_in_smp; +}; + +struct psi_group { + /* Protects data updated during an aggregation */ + struct mutex stat_lock; + + /* Per-cpu task state & time tracking */ + struct psi_group_cpu __percpu *pcpu; + + /* Periodic aggregation state */ + u64 total_prev[NR_PSI_STATES - 1]; + u64 last_update; + u64 next_update; + struct delayed_work clock_work; + + /* Total stall times and sampled pressure averages */ + u64 total[NR_PSI_STATES - 1]; + unsigned long avg[NR_PSI_STATES - 1][3]; +}; + +#else /* CONFIG_PSI */ + +struct psi_group { }; + +#endif /* CONFIG_PSI */ + +#endif /* _LINUX_PSI_TYPES_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 8a09c0c..e1b822d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -25,6 +25,7 @@ #include <linux/latencytop.h> #include <linux/sched/prio.h> #include <linux/signal_types.h> +#include <linux/psi_types.h> #include <linux/mm_types_task.h> #include <linux/task_io_accounting.h> #include <linux/rseq.h> @@ -715,6 +716,10 @@ struct task_struct { unsigned sched_contributes_to_load:1; unsigned sched_migrated:1; unsigned sched_remote_wakeup:1; +#ifdef CONFIG_PSI + unsigned sched_psi_wake_requeue:1; +#endif + /* Force alignment to the next boundary: */ unsigned :0;
@@ -968,6 +973,10 @@ struct task_struct { siginfo_t *last_siginfo;
struct task_io_accounting ioac; +#ifdef CONFIG_PSI + /* Pressure stall state */ + unsigned int psi_flags; +#endif #ifdef CONFIG_TASK_XACCT /* Accumulated RSS usage: */ u64 acct_rss_mem1; @@ -1416,6 +1425,7 @@ static inline int is_global_init(struct task_struct *tsk) #define PF_KTHREAD 0x00200000 /* I am a kernel thread */ #define PF_RANDOMIZE 0x00400000 /* Randomize virtual address space */ #define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */ +#define PF_MEMSTALL 0x01000000 /* Stalled due to lack of memory */ #define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_allowed */ #define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */ #define PF_MEMALLOC_NOCMA 0x10000000 /* All allocation request will have _GFP_MOVABLE cleared */ diff --git a/init/Kconfig b/init/Kconfig index b890162..ef392c2 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -503,6 +503,21 @@ config TASK_IO_ACCOUNTING
Say N if unsure.
+config PSI + bool "Pressure stall information tracking" + help + Collect metrics that indicate how overcommitted the CPU, memory, + and IO capacity are in the system. + + If you say Y here, the kernel will create /proc/pressure/ with the + pressure statistics files cpu, memory, and io. These will indicate + the share of walltime in which some or all tasks in the system are + delayed due to contention of the respective resource. + + For more details see Documentation/accounting/psi.txt. + + Say N if unsure. + endmenu # "CPU/Task time and stats accounting"
config CPU_ISOLATION diff --git a/kernel/fork.c b/kernel/fork.c index 403b8a3..0e5c220 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1895,6 +1895,10 @@ static __latent_entropy struct task_struct *copy_process(
p->default_timer_slack_ns = current->timer_slack_ns;
+#ifdef CONFIG_PSI + p->psi_flags = 0; +#endif + task_io_accounting_init(&p->ioac); acct_clear_integrals(p);
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 7fe1834..21fb5a5 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -29,3 +29,4 @@ obj-$(CONFIG_CPU_FREQ) += cpufreq.o obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o obj-$(CONFIG_MEMBARRIER) += membarrier.o obj-$(CONFIG_CPU_ISOLATION) += isolation.o +obj-$(CONFIG_PSI) += psi.o diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 81529c4..087f028 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -747,8 +747,10 @@ static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags) if (!(flags & ENQUEUE_NOCLOCK)) update_rq_clock(rq);
- if (!(flags & ENQUEUE_RESTORE)) + if (!(flags & ENQUEUE_RESTORE)) { sched_info_queued(rq, p); + psi_enqueue(p, flags & ENQUEUE_WAKEUP); + }
p->sched_class->enqueue_task(rq, p, flags); } @@ -758,8 +760,10 @@ static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags) if (!(flags & DEQUEUE_NOCLOCK)) update_rq_clock(rq);
- if (!(flags & DEQUEUE_SAVE)) + if (!(flags & DEQUEUE_SAVE)) { sched_info_dequeued(rq, p); + psi_dequeue(p, flags & DEQUEUE_SLEEP); + }
p->sched_class->dequeue_task(rq, p, flags); } @@ -2062,6 +2066,7 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags); if (task_cpu(p) != cpu) { wake_flags |= WF_MIGRATED; + psi_ttwu_dequeue(p); set_task_cpu(p, cpu); }
@@ -3124,6 +3129,7 @@ void scheduler_tick(void) curr->sched_class->task_tick(rq, curr, 0); cpu_load_update_active(rq); calc_global_load_tick(rq); + psi_task_tick(rq);
rq_unlock(rq, &rf);
@@ -6200,6 +6206,8 @@ void __init sched_init(void) if (use_sched_idle_time) rq_cputime_init();
+ psi_init(); + scheduler_running = 1; }
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c new file mode 100644 index 00000000..5954145 --- /dev/null +++ b/kernel/sched/psi.c @@ -0,0 +1,657 @@ +/* + * Pressure stall information for CPU, memory and IO + * + * Copyright (c) 2018 Facebook, Inc. + * Author: Johannes Weiner hannes@cmpxchg.org + * + * When CPU, memory and IO are contended, tasks experience delays that + * reduce throughput and introduce latencies into the workload. Memory + * and IO contention, in addition, can cause a full loss of forward + * progress in which the CPU goes idle. + * + * This code aggregates individual task delays into resource pressure + * metrics that indicate problems with both workload health and + * resource utilization. + * + * Model + * + * The time in which a task can execute on a CPU is our baseline for + * productivity. Pressure expresses the amount of time in which this + * potential cannot be realized due to resource contention. + * + * This concept of productivity has two components: the workload and + * the CPU. To measure the impact of pressure on both, we define two + * contention states for a resource: SOME and FULL. + * + * In the SOME state of a given resource, one or more tasks are + * delayed on that resource. This affects the workload's ability to + * perform work, but the CPU may still be executing other tasks. + * + * In the FULL state of a given resource, all non-idle tasks are + * delayed on that resource such that nobody is advancing and the CPU + * goes idle. This leaves both workload and CPU unproductive. + * + * (Naturally, the FULL state doesn't exist for the CPU resource.) + * + * SOME = nr_delayed_tasks != 0 + * FULL = nr_delayed_tasks != 0 && nr_running_tasks == 0 + * + * The percentage of wallclock time spent in those compound stall + * states gives pressure numbers between 0 and 100 for each resource, + * where the SOME percentage indicates workload slowdowns and the FULL + * percentage indicates reduced CPU utilization: + * + * %SOME = time(SOME) / period + * %FULL = time(FULL) / period + * + * Multiple CPUs + * + * The more tasks and available CPUs there are, the more work can be + * performed concurrently. This means that the potential that can go + * unrealized due to resource contention *also* scales with non-idle + * tasks and CPUs. + * + * Consider a scenario where 257 number crunching tasks are trying to + * run concurrently on 256 CPUs. If we simply aggregated the task + * states, we would have to conclude a CPU SOME pressure number of + * 100%, since *somebody* is waiting on a runqueue at all + * times. However, that is clearly not the amount of contention the + * workload is experiencing: only one out of 256 possible exceution + * threads will be contended at any given time, or about 0.4%. + * + * Conversely, consider a scenario of 4 tasks and 4 CPUs where at any + * given time *one* of the tasks is delayed due to a lack of memory. + * Again, looking purely at the task state would yield a memory FULL + * pressure number of 0%, since *somebody* is always making forward + * progress. But again this wouldn't capture the amount of execution + * potential lost, which is 1 out of 4 CPUs, or 25%. + * + * To calculate wasted potential (pressure) with multiple processors, + * we have to base our calculation on the number of non-idle tasks in + * conjunction with the number of available CPUs, which is the number + * of potential execution threads. SOME becomes then the proportion of + * delayed tasks to possibe threads, and FULL is the share of possible + * threads that are unproductive due to delays: + * + * threads = min(nr_nonidle_tasks, nr_cpus) + * SOME = min(nr_delayed_tasks / threads, 1) + * FULL = (threads - min(nr_running_tasks, threads)) / threads + * + * For the 257 number crunchers on 256 CPUs, this yields: + * + * threads = min(257, 256) + * SOME = min(1 / 256, 1) = 0.4% + * FULL = (256 - min(257, 256)) / 256 = 0% + * + * For the 1 out of 4 memory-delayed tasks, this yields: + * + * threads = min(4, 4) + * SOME = min(1 / 4, 1) = 25% + * FULL = (4 - min(3, 4)) / 4 = 25% + * + * [ Substitute nr_cpus with 1, and you can see that it's a natural + * extension of the single-CPU model. ] + * + * Implementation + * + * To assess the precise time spent in each such state, we would have + * to freeze the system on task changes and start/stop the state + * clocks accordingly. Obviously that doesn't scale in practice. + * + * Because the scheduler aims to distribute the compute load evenly + * among the available CPUs, we can track task state locally to each + * CPU and, at much lower frequency, extrapolate the global state for + * the cumulative stall times and the running averages. + * + * For each runqueue, we track: + * + * tSOME[cpu] = time(nr_delayed_tasks[cpu] != 0) + * tFULL[cpu] = time(nr_delayed_tasks[cpu] && !nr_running_tasks[cpu]) + * tNONIDLE[cpu] = time(nr_nonidle_tasks[cpu] != 0) + * + * and then periodically aggregate: + * + * tNONIDLE = sum(tNONIDLE[i]) + * + * tSOME = sum(tSOME[i] * tNONIDLE[i]) / tNONIDLE + * tFULL = sum(tFULL[i] * tNONIDLE[i]) / tNONIDLE + * + * %SOME = tSOME / period + * %FULL = tFULL / period + * + * This gives us an approximation of pressure that is practical + * cost-wise, yet way more sensitive and accurate than periodic + * sampling of the aggregate task states would be. + */ + +#include <linux/sched/loadavg.h> +#include <linux/seq_file.h> +#include <linux/proc_fs.h> +#include <linux/seqlock.h> +#include <linux/cgroup.h> +#include <linux/module.h> +#include <linux/sched.h> +#include <linux/psi.h> +#include "sched.h" + +static int psi_bug __read_mostly; + +bool psi_disabled __read_mostly; +core_param(psi_disabled, psi_disabled, bool, 0644); + +/* Running averages - we need to be higher-res than loadavg */ +#define PSI_FREQ (2*HZ+1) /* 2 sec intervals */ +#define EXP_10s 1677 /* 1/exp(2s/10s) as fixed-point */ +#define EXP_60s 1981 /* 1/exp(2s/60s) */ +#define EXP_300s 2034 /* 1/exp(2s/300s) */ + +/* Sampling frequency in nanoseconds */ +static u64 psi_period __read_mostly; + +/* System-level pressure and stall tracking */ +static DEFINE_PER_CPU(struct psi_group_cpu, system_group_pcpu); +static struct psi_group psi_system = { + .pcpu = &system_group_pcpu, +}; + +static void psi_update_work(struct work_struct *work); + +static void group_init(struct psi_group *group) +{ + int cpu; + + for_each_possible_cpu(cpu) + seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq); + group->next_update = sched_clock() + psi_period; + INIT_DELAYED_WORK(&group->clock_work, psi_update_work); + mutex_init(&group->stat_lock); +} + +void __init psi_init(void) +{ + if (psi_disabled) + return; + + psi_period = jiffies_to_nsecs(PSI_FREQ); + group_init(&psi_system); +} + +static bool test_state(unsigned int *tasks, enum psi_states state) +{ + switch (state) { + case PSI_IO_SOME: + return tasks[NR_IOWAIT]; + case PSI_IO_FULL: + return tasks[NR_IOWAIT] && !tasks[NR_RUNNING]; + case PSI_MEM_SOME: + return tasks[NR_MEMSTALL]; + case PSI_MEM_FULL: + return tasks[NR_MEMSTALL] && !tasks[NR_RUNNING]; + case PSI_CPU_SOME: + return tasks[NR_RUNNING] > 1; + case PSI_NONIDLE: + return tasks[NR_IOWAIT] || tasks[NR_MEMSTALL] || + tasks[NR_RUNNING]; + default: + return false; + } +} + +static void get_recent_times(struct psi_group *group, int cpu, u32 *times) +{ + struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu); + unsigned int tasks[NR_PSI_TASK_COUNTS]; + u64 now, state_start; + unsigned int seq; + int s; + + /* Snapshot a coherent view of the CPU state */ + do { + seq = read_seqcount_begin(&groupc->seq); + now = cpu_clock(cpu); + memcpy(times, groupc->times, sizeof(groupc->times)); + memcpy(tasks, groupc->tasks, sizeof(groupc->tasks)); + state_start = groupc->state_start; + } while (read_seqcount_retry(&groupc->seq, seq)); + + /* Calculate state time deltas against the previous snapshot */ + for (s = 0; s < NR_PSI_STATES; s++) { + u32 delta; + /* + * In addition to already concluded states, we also + * incorporate currently active states on the CPU, + * since states may last for many sampling periods. + * + * This way we keep our delta sampling buckets small + * (u32) and our reported pressure close to what's + * actually happening. + */ + if (test_state(tasks, s)) + times[s] += now - state_start; + + delta = times[s] - groupc->times_prev[s]; + groupc->times_prev[s] = times[s]; + + times[s] = delta; + } +} + +static void calc_avgs(unsigned long avg[3], int missed_periods, + u64 time, u64 period) +{ + unsigned long pct; + + /* Fill in zeroes for periods of no activity */ + if (missed_periods) { + avg[0] = calc_load_n(avg[0], EXP_10s, 0, missed_periods); + avg[1] = calc_load_n(avg[1], EXP_60s, 0, missed_periods); + avg[2] = calc_load_n(avg[2], EXP_300s, 0, missed_periods); + } + + /* Sample the most recent active period */ + pct = div_u64(time * 100, period); + pct *= FIXED_1; + avg[0] = calc_load(avg[0], EXP_10s, pct); + avg[1] = calc_load(avg[1], EXP_60s, pct); + avg[2] = calc_load(avg[2], EXP_300s, pct); +} + +static bool update_stats(struct psi_group *group) +{ + u64 deltas[NR_PSI_STATES - 1] = { 0, }; + unsigned long missed_periods = 0; + unsigned long nonidle_total = 0; + u64 now, expires, period; + int cpu; + int s; + + mutex_lock(&group->stat_lock); + + /* + * Collect the per-cpu time buckets and average them into a + * single time sample that is normalized to wallclock time. + * + * For averaging, each CPU is weighted by its non-idle time in + * the sampling period. This eliminates artifacts from uneven + * loading, or even entirely idle CPUs. + */ + for_each_possible_cpu(cpu) { + u32 times[NR_PSI_STATES]; + u32 nonidle; + + get_recent_times(group, cpu, times); + + nonidle = nsecs_to_jiffies(times[PSI_NONIDLE]); + nonidle_total += nonidle; + + for (s = 0; s < PSI_NONIDLE; s++) + deltas[s] += (u64)times[s] * nonidle; + } + + /* + * Integrate the sample into the running statistics that are + * reported to userspace: the cumulative stall times and the + * decaying averages. + * + * Pressure percentages are sampled at PSI_FREQ. We might be + * called more often when the user polls more frequently than + * that; we might be called less often when there is no task + * activity, thus no data, and clock ticks are sporadic. The + * below handles both. + */ + + /* total= */ + for (s = 0; s < NR_PSI_STATES - 1; s++) + group->total[s] += div_u64(deltas[s], max(nonidle_total, 1UL)); + + /* avgX= */ + now = sched_clock(); + expires = group->next_update; + if (now < expires) + goto out; + if (now - expires > psi_period) + missed_periods = div_u64(now - expires, psi_period); + + /* + * The periodic clock tick can get delayed for various + * reasons, especially on loaded systems. To avoid clock + * drift, we schedule the clock in fixed psi_period intervals. + * But the deltas we sample out of the per-cpu buckets above + * are based on the actual time elapsing between clock ticks. + */ + group->next_update = expires + ((1 + missed_periods) * psi_period); + period = now - (group->last_update + (missed_periods * psi_period)); + group->last_update = now; + + for (s = 0; s < NR_PSI_STATES - 1; s++) { + u32 sample; + + sample = group->total[s] - group->total_prev[s]; + /* + * Due to the lockless sampling of the time buckets, + * recorded time deltas can slip into the next period, + * which under full pressure can result in samples in + * excess of the period length. + * + * We don't want to report non-sensical pressures in + * excess of 100%, nor do we want to drop such events + * on the floor. Instead we punt any overage into the + * future until pressure subsides. By doing this we + * don't underreport the occurring pressure curve, we + * just report it delayed by one period length. + * + * The error isn't cumulative. As soon as another + * delta slips from a period P to P+1, by definition + * it frees up its time T in P. + */ + if (sample > period) + sample = period; + group->total_prev[s] += sample; + calc_avgs(group->avg[s], missed_periods, sample, period); + } +out: + mutex_unlock(&group->stat_lock); + return nonidle_total; +} + +static void psi_update_work(struct work_struct *work) +{ + struct delayed_work *dwork; + struct psi_group *group; + bool nonidle; + + dwork = to_delayed_work(work); + group = container_of(dwork, struct psi_group, clock_work); + + /* + * If there is task activity, periodically fold the per-cpu + * times and feed samples into the running averages. If things + * are idle and there is no data to process, stop the clock. + * Once restarted, we'll catch up the running averages in one + * go - see calc_avgs() and missed_periods. + */ + + nonidle = update_stats(group); + + if (nonidle) { + unsigned long delay = 0; + u64 now; + + now = sched_clock(); + if (group->next_update > now) + delay = nsecs_to_jiffies(group->next_update - now) + 1; + schedule_delayed_work(dwork, delay); + } +} + +static void record_times(struct psi_group_cpu *groupc, int cpu, + bool memstall_tick) +{ + u32 delta; + u64 now; + + now = cpu_clock(cpu); + delta = now - groupc->state_start; + groupc->state_start = now; + + if (test_state(groupc->tasks, PSI_IO_SOME)) { + groupc->times[PSI_IO_SOME] += delta; + if (test_state(groupc->tasks, PSI_IO_FULL)) + groupc->times[PSI_IO_FULL] += delta; + } + + if (test_state(groupc->tasks, PSI_MEM_SOME)) { + groupc->times[PSI_MEM_SOME] += delta; + if (test_state(groupc->tasks, PSI_MEM_FULL)) + groupc->times[PSI_MEM_FULL] += delta; + else if (memstall_tick) { + u32 sample; + /* + * Since we care about lost potential, a + * memstall is FULL when there are no other + * working tasks, but also when the CPU is + * actively reclaiming and nothing productive + * could run even if it were runnable. + * + * When the timer tick sees a reclaiming CPU, + * regardless of runnable tasks, sample a FULL + * tick (or less if it hasn't been a full tick + * since the last state change). + */ + sample = min(delta, (u32)jiffies_to_nsecs(1)); + groupc->times[PSI_MEM_FULL] += sample; + } + } + + if (test_state(groupc->tasks, PSI_CPU_SOME)) + groupc->times[PSI_CPU_SOME] += delta; + + if (test_state(groupc->tasks, PSI_NONIDLE)) + groupc->times[PSI_NONIDLE] += delta; +} + +static void psi_group_change(struct psi_group *group, int cpu, + unsigned int clear, unsigned int set) +{ + struct psi_group_cpu *groupc; + unsigned int t, m; + + groupc = per_cpu_ptr(group->pcpu, cpu); + + /* + * First we assess the aggregate resource states this CPU's + * tasks have been in since the last change, and account any + * SOME and FULL time these may have resulted in. + * + * Then we update the task counts according to the state + * change requested through the @clear and @set bits. + */ + write_seqcount_begin(&groupc->seq); + + record_times(groupc, cpu, false); + + for (t = 0, m = clear; m; m &= ~(1 << t), t++) { + if (!(m & (1 << t))) + continue; + if (groupc->tasks[t] == 0 && !psi_bug) { + printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u] clear=%x set=%x\n", + cpu, t, groupc->tasks[0], + groupc->tasks[1], groupc->tasks[2], + clear, set); + psi_bug = 1; + } + groupc->tasks[t]--; + } + + for (t = 0; set; set &= ~(1 << t), t++) + if (set & (1 << t)) + groupc->tasks[t]++; + + write_seqcount_end(&groupc->seq); + + if (!delayed_work_pending(&group->clock_work)) + schedule_delayed_work(&group->clock_work, PSI_FREQ); +} + +void psi_task_change(struct task_struct *task, int clear, int set) +{ + int cpu = task_cpu(task); + + if (!task->pid) + return; + + if (((task->psi_flags & set) || + (task->psi_flags & clear) != clear) && + !psi_bug) { + printk_deferred(KERN_ERR "psi: inconsistent task state! task=%d:%s cpu=%d psi_flags=%x clear=%x set=%x\n", + task->pid, task->comm, cpu, + task->psi_flags, clear, set); + psi_bug = 1; + } + + task->psi_flags &= ~clear; + task->psi_flags |= set; + + psi_group_change(&psi_system, cpu, clear, set); +} + +void psi_memstall_tick(struct task_struct *task, int cpu) +{ + struct psi_group_cpu *groupc; + + groupc = per_cpu_ptr(psi_system.pcpu, cpu); + write_seqcount_begin(&groupc->seq); + record_times(groupc, cpu, true); + write_seqcount_end(&groupc->seq); +} + +/** + * psi_memstall_enter - mark the beginning of a memory stall section + * @flags: flags to handle nested sections + * + * Marks the calling task as being stalled due to a lack of memory, + * such as waiting for a refault or performing reclaim. + */ +void psi_memstall_enter(unsigned long *flags) +{ + struct rq_flags rf; + struct rq *rq; + + if (psi_disabled) + return; + + *flags = current->flags & PF_MEMSTALL; + if (*flags) + return; + /* + * PF_MEMSTALL setting & accounting needs to be atomic wrt + * changes to the task's scheduling state, otherwise we can + * race with CPU migration. + */ + rq = this_rq_lock_irq(&rf); + + current->flags |= PF_MEMSTALL; + psi_task_change(current, 0, TSK_MEMSTALL); + + rq_unlock_irq(rq, &rf); +} + +/** + * psi_memstall_leave - mark the end of an memory stall section + * @flags: flags to handle nested memdelay sections + * + * Marks the calling task as no longer stalled due to lack of memory. + */ +void psi_memstall_leave(unsigned long *flags) +{ + struct rq_flags rf; + struct rq *rq; + + if (psi_disabled) + return; + + if (*flags) + return; + /* + * PF_MEMSTALL clearing & accounting needs to be atomic wrt + * changes to the task's scheduling state, otherwise we could + * race with CPU migration. + */ + rq = this_rq_lock_irq(&rf); + + current->flags &= ~PF_MEMSTALL; + psi_task_change(current, TSK_MEMSTALL, 0); + + rq_unlock_irq(rq, &rf); +} + +static int psi_show(struct seq_file *m, struct psi_group *group, + enum psi_res res) +{ + int full; + + if (psi_disabled) + return -EOPNOTSUPP; + + update_stats(group); + + for (full = 0; full < 2 - (res == PSI_CPU); full++) { + unsigned long avg[3]; + u64 total; + int w; + + for (w = 0; w < 3; w++) + avg[w] = group->avg[res * 2 + full][w]; + total = div_u64(group->total[res * 2 + full], NSEC_PER_USEC); + + seq_printf(m, "%s avg10=%lu.%02lu avg60=%lu.%02lu avg300=%lu.%02lu total=%llu\n", + full ? "full" : "some", + LOAD_INT(avg[0]), LOAD_FRAC(avg[0]), + LOAD_INT(avg[1]), LOAD_FRAC(avg[1]), + LOAD_INT(avg[2]), LOAD_FRAC(avg[2]), + total); + } + + return 0; +} + +static int psi_io_show(struct seq_file *m, void *v) +{ + return psi_show(m, &psi_system, PSI_IO); +} + +static int psi_memory_show(struct seq_file *m, void *v) +{ + return psi_show(m, &psi_system, PSI_MEM); +} + +static int psi_cpu_show(struct seq_file *m, void *v) +{ + return psi_show(m, &psi_system, PSI_CPU); +} + +static int psi_io_open(struct inode *inode, struct file *file) +{ + return single_open(file, psi_io_show, NULL); +} + +static int psi_memory_open(struct inode *inode, struct file *file) +{ + return single_open(file, psi_memory_show, NULL); +} + +static int psi_cpu_open(struct inode *inode, struct file *file) +{ + return single_open(file, psi_cpu_show, NULL); +} + +static const struct file_operations psi_io_fops = { + .open = psi_io_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static const struct file_operations psi_memory_fops = { + .open = psi_memory_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static const struct file_operations psi_cpu_fops = { + .open = psi_cpu_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init psi_proc_init(void) +{ + proc_mkdir("pressure", NULL); + proc_create("pressure/io", 0, NULL, &psi_io_fops); + proc_create("pressure/memory", 0, NULL, &psi_memory_fops); + proc_create("pressure/cpu", 0, NULL, &psi_cpu_fops); + return 0; +} +module_init(psi_proc_init); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index eef13b5..abb0af2 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -55,6 +55,7 @@ #include <linux/proc_fs.h> #include <linux/prefetch.h> #include <linux/profile.h> +#include <linux/psi.h> #include <linux/rcupdate_wait.h> #include <linux/security.h> #include <linux/stackprotector.h> @@ -330,6 +331,7 @@ bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw) #ifdef CONFIG_CGROUP_SCHED
#include <linux/cgroup.h> +#include <linux/psi.h>
struct cfs_rq; struct rt_rq; diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index f6a7d0b..7cd0104 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -76,6 +76,92 @@ static inline void rq_sched_info_depart (struct rq *rq, unsigned long long delt # define schedstat_end_time(rq, t) do { } while (0) #endif /* CONFIG_SCHEDSTATS */
+#ifdef CONFIG_PSI +/* + * PSI tracks state that persists across sleeps, such as iowaits and + * memory stalls. As a result, it has to distinguish between sleeps, + * where a task's runnable state changes, and requeues, where a task + * and its state are being moved between CPUs and runqueues. + */ +static inline void psi_enqueue(struct task_struct *p, bool wakeup) +{ + int clear = 0, set = TSK_RUNNING; + + if (psi_disabled) + return; + + if (!wakeup || p->sched_psi_wake_requeue) { + if (p->flags & PF_MEMSTALL) + set |= TSK_MEMSTALL; + if (p->sched_psi_wake_requeue) + p->sched_psi_wake_requeue = 0; + } else { + if (p->in_iowait) + clear |= TSK_IOWAIT; + } + + psi_task_change(p, clear, set); +} + +static inline void psi_dequeue(struct task_struct *p, bool sleep) +{ + int clear = TSK_RUNNING, set = 0; + + if (psi_disabled) + return; + + if (!sleep) { + if (p->flags & PF_MEMSTALL) + clear |= TSK_MEMSTALL; + } else { + if (p->in_iowait) + set |= TSK_IOWAIT; + } + + psi_task_change(p, clear, set); +} + +static inline void psi_ttwu_dequeue(struct task_struct *p) +{ + if (psi_disabled) + return; + /* + * Is the task being migrated during a wakeup? Make sure to + * deregister its sleep-persistent psi states from the old + * queue, and let psi_enqueue() know it has to requeue. + */ + if (unlikely(p->in_iowait || (p->flags & PF_MEMSTALL))) { + struct rq_flags rf; + struct rq *rq; + int clear = 0; + + if (p->in_iowait) + clear |= TSK_IOWAIT; + if (p->flags & PF_MEMSTALL) + clear |= TSK_MEMSTALL; + + rq = __task_rq_lock(p, &rf); + psi_task_change(p, clear, 0); + p->sched_psi_wake_requeue = 1; + __task_rq_unlock(rq, &rf); + } +} + +static inline void psi_task_tick(struct rq *rq) +{ + if (psi_disabled) + return; + + if (unlikely(rq->curr->flags & PF_MEMSTALL)) + psi_memstall_tick(rq->curr, cpu_of(rq)); +} +#else /* CONFIG_PSI */ +static inline void psi_enqueue(struct task_struct *p, bool wakeup) {} +static inline void psi_dequeue(struct task_struct *p, bool sleep) {} +static inline void psi_ttwu_dequeue(struct task_struct *p) {} +static inline void psi_task_tick(struct rq *rq) {} +#endif /* CONFIG_PSI */ + #ifdef CONFIG_SCHED_INFO static inline void sched_info_reset_dequeued(struct task_struct *t) { diff --git a/mm/compaction.c b/mm/compaction.c index 5079ddb..120e555 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -22,6 +22,7 @@ #include <linux/kthread.h> #include <linux/freezer.h> #include <linux/page_owner.h> +#include <linux/psi.h> #include "internal.h"
#ifdef CONFIG_COMPACTION @@ -2059,11 +2060,15 @@ static int kcompactd(void *p) pgdat->kcompactd_classzone_idx = pgdat->nr_zones - 1;
while (!kthread_should_stop()) { + unsigned long pflags; + trace_mm_compaction_kcompactd_sleep(pgdat->node_id); wait_event_freezable(pgdat->kcompactd_wait, kcompactd_work_requested(pgdat));
+ psi_memstall_enter(&pflags); kcompactd_do_work(pgdat); + psi_memstall_leave(&pflags); }
return 0; diff --git a/mm/filemap.c b/mm/filemap.c index 6c964d8..94f7f38 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -37,6 +37,7 @@ #include <linux/shmem_fs.h> #include <linux/rmap.h> #include <linux/delayacct.h> +#include <linux/psi.h> #include "internal.h"
#define CREATE_TRACE_POINTS @@ -1185,11 +1186,14 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q, struct wait_page_queue wait_page; wait_queue_entry_t *wait = &wait_page.wait; bool thrashing = false; + unsigned long pflags; int ret = 0;
- if (bit_nr == PG_locked && !PageSwapBacked(page) && + if (bit_nr == PG_locked && !PageUptodate(page) && PageWorkingset(page)) { - delayacct_thrashing_start(); + if (!PageSwapBacked(page)) + delayacct_thrashing_start(); + psi_memstall_enter(&pflags); thrashing = true; }
@@ -1231,8 +1235,11 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q,
finish_wait(q, wait);
- if (thrashing) - delayacct_thrashing_end(); + if (thrashing) { + if (!PageSwapBacked(page)) + delayacct_thrashing_end(); + psi_memstall_leave(&pflags); + }
/* * A signal could leave PageWaiters set. Clearing it here if diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 12e0899..18ab49f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -66,6 +66,7 @@ #include <linux/ftrace.h> #include <linux/lockdep.h> #include <linux/nmi.h> +#include <linux/psi.h> #include <linux/khugepaged.h> #include <linux/ktask.h>
@@ -3696,15 +3697,20 @@ void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...) enum compact_priority prio, enum compact_result *compact_result) { struct page *page; + unsigned long pflags; unsigned int noreclaim_flag;
if (!order) return NULL;
+ psi_memstall_enter(&pflags); noreclaim_flag = memalloc_noreclaim_save(); + *compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac, prio); + memalloc_noreclaim_restore(noreclaim_flag); + psi_memstall_leave(&pflags);
if (*compact_result <= COMPACT_INACTIVE) return NULL; @@ -3911,11 +3917,13 @@ void fs_reclaim_release(gfp_t gfp_mask) struct reclaim_state reclaim_state; int progress; unsigned int noreclaim_flag; + unsigned long pflags;
cond_resched();
/* We now go into synchronous reclaim */ cpuset_memory_pressure_bump(); + psi_memstall_enter(&pflags); fs_reclaim_acquire(gfp_mask); noreclaim_flag = memalloc_noreclaim_save(); reclaim_state.reclaimed_slab = 0; @@ -3927,6 +3935,7 @@ void fs_reclaim_release(gfp_t gfp_mask) current->reclaim_state = NULL; memalloc_noreclaim_restore(noreclaim_flag); fs_reclaim_release(gfp_mask); + psi_memstall_leave(&pflags);
cond_resched();
diff --git a/mm/vmscan.c b/mm/vmscan.c index abcffa3..3069980 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -50,6 +50,7 @@ #include <linux/prefetch.h> #include <linux/printk.h> #include <linux/dax.h> +#include <linux/psi.h>
#include <asm/tlbflush.h> #include <asm/div64.h> @@ -3340,6 +3341,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, { struct zonelist *zonelist; unsigned long nr_reclaimed; + unsigned long pflags; int nid; unsigned int noreclaim_flag; struct scan_control sc = { @@ -3368,9 +3370,13 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, sc.gfp_mask, sc.reclaim_idx);
+ psi_memstall_enter(&pflags); noreclaim_flag = memalloc_noreclaim_save(); + nr_reclaimed = do_try_to_free_pages(zonelist, &sc); + memalloc_noreclaim_restore(noreclaim_flag); + psi_memstall_leave(&pflags);
trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
@@ -3535,6 +3541,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) int i; unsigned long nr_soft_reclaimed; unsigned long nr_soft_scanned; + unsigned long pflags; struct zone *zone; struct scan_control sc = { .gfp_mask = GFP_KERNEL, @@ -3545,6 +3552,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) .may_swap = 1, };
+ psi_memstall_enter(&pflags); __fs_reclaim_acquire();
count_vm_event(PAGEOUTRUN); @@ -3651,6 +3659,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) out: snapshot_refaults(NULL, pgdat); __fs_reclaim_release(); + psi_memstall_leave(&pflags); /* * Return the order kswapd stopped reclaiming at as * prepare_kswapd_sleep() takes it into account. If another caller
mainline inclusion from mainline-v5.4 commit 2ce7135adc9ad081aa3c49744144376ac74fea60 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I47QS2 CVE: NA
--------------------------------
On a system that executes multiple cgrouped jobs and independent workloads, we don't just care about the health of the overall system, but also that of individual jobs, so that we can ensure individual job health, fairness between jobs, or prioritize some jobs over others.
This patch implements pressure stall tracking for cgroups. In kernels with CONFIG_PSI=y, cgroup2 groups will have cpu.pressure, memory.pressure, and io.pressure files that track aggregate pressure stall times for only the tasks inside the cgroup.
Link: http://lkml.kernel.org/r/20180828172258.3185-10-hannes@cmpxchg.org Signed-off-by: Johannes Weiner hannes@cmpxchg.org Signed-off-by: Liu Xinpeng liuxp11@chinatelecom.cn Signed-off-by: Ctyun Kernel ctyuncommiter01@chinatelecom.cn --- Documentation/accounting/psi.txt | 9 +++ Documentation/admin-guide/cgroup-v2.rst | 18 +++++ include/linux/cgroup-defs.h | 4 ++ include/linux/cgroup.h | 15 ++++ include/linux/psi.h | 25 +++++++ init/Kconfig | 4 ++ kernel/cgroup/cgroup.c | 44 +++++++++++- kernel/sched/psi.c | 118 +++++++++++++++++++++++++++++--- 8 files changed, 227 insertions(+), 10 deletions(-)
diff --git a/Documentation/accounting/psi.txt b/Documentation/accounting/psi.txt index 3753a82..b8ca28b 100644 --- a/Documentation/accounting/psi.txt +++ b/Documentation/accounting/psi.txt @@ -62,3 +62,12 @@ well as medium and long term trends. The total absolute stall time is tracked and exported as well, to allow detection of latency spikes which wouldn't necessarily make a dent in the time averages, or to average trends over custom time frames. + +Cgroup2 interface +================= + +In a system with a CONFIG_CGROUP=y kernel and the cgroup2 filesystem +mounted, pressure stall information is also tracked for tasks grouped +into cgroups. Each subdirectory in the cgroupfs mountpoint contains +cpu.pressure, memory.pressure, and io.pressure files; the format is +the same as the /proc/pressure/ files. diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 8c1fe54..f2b4d62 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -972,6 +972,12 @@ All time durations are in microseconds. $PERIOD duration. "max" for $MAX indicates no limit. If only one number is written, $MAX is updated.
+ cpu.pressure + A read-only nested-key file which exists on non-root cgroups. + + Shows pressure stall information for CPU. See + Documentation/accounting/psi.txt for details. +
Memory ------ @@ -1297,6 +1303,12 @@ PAGE_SIZE multiple when read back. higher than the limit for an extended period of time. This reduces the impact on the workload and memory management.
+ memory.pressure + A read-only nested-key file which exists on non-root cgroups. + + Shows pressure stall information for memory. See + Documentation/accounting/psi.txt for details. +
Usage Guidelines ~~~~~~~~~~~~~~~~ @@ -1531,6 +1543,12 @@ IO Interface Files
8:16 rbps=2097152 wbps=max riops=max wiops=max
+ io.pressure + A read-only nested-key file which exists on non-root cgroups. + + Shows pressure stall information for IO. See + Documentation/accounting/psi.txt for details. +
Writeback ~~~~~~~~~ diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index f92264d..daf2233 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -20,6 +20,7 @@ #include <linux/u64_stats_sync.h> #include <linux/workqueue.h> #include <linux/bpf-cgroup.h> +#include <linux/psi_types.h>
#ifdef CONFIG_CGROUPS
@@ -442,6 +443,9 @@ struct cgroup { /* used to schedule release agent */ struct work_struct release_agent_work;
+ /* used to track pressure stalls */ + struct psi_group psi; + /* used to store eBPF programs */ struct cgroup_bpf bpf;
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index d4d3a3a..0e1f062 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -671,6 +671,11 @@ static inline void pr_cont_cgroup_path(struct cgroup *cgrp) pr_cont_kernfs_path(cgrp->kn); }
+static inline struct psi_group *cgroup_psi(struct cgroup *cgrp) +{ + return &cgrp->psi; +} + static inline void cgroup_init_kthreadd(void) { /* @@ -725,6 +730,16 @@ static inline union kernfs_node_id *cgroup_get_kernfs_id(struct cgroup *cgrp) return NULL; }
+static inline struct cgroup *cgroup_parent(struct cgroup *cgrp) +{ + return NULL; +} + +static inline struct psi_group *cgroup_psi(struct cgroup *cgrp) +{ + return NULL; +} + static inline bool task_under_cgroup_hierarchy(struct task_struct *task, struct cgroup *ancestor) { diff --git a/include/linux/psi.h b/include/linux/psi.h index b0daf05..8e0725a 100644 --- a/include/linux/psi.h +++ b/include/linux/psi.h @@ -4,6 +4,9 @@ #include <linux/psi_types.h> #include <linux/sched.h>
+struct seq_file; +struct css_set; + #ifdef CONFIG_PSI
extern bool psi_disabled; @@ -16,6 +19,14 @@ void psi_memstall_enter(unsigned long *flags); void psi_memstall_leave(unsigned long *flags);
+int psi_show(struct seq_file *s, struct psi_group *group, enum psi_res res); + +#ifdef CONFIG_CGROUPS +int psi_cgroup_alloc(struct cgroup *cgrp); +void psi_cgroup_free(struct cgroup *cgrp); +void cgroup_move_task(struct task_struct *p, struct css_set *to); +#endif + #else /* CONFIG_PSI */
static inline void psi_init(void) {} @@ -23,6 +34,20 @@ static inline void psi_init(void) {} static inline void psi_memstall_enter(unsigned long *flags) {} static inline void psi_memstall_leave(unsigned long *flags) {}
+#ifdef CONFIG_CGROUPS +static inline int psi_cgroup_alloc(struct cgroup *cgrp) +{ + return 0; +} +static inline void psi_cgroup_free(struct cgroup *cgrp) +{ +} +static inline void cgroup_move_task(struct task_struct *p, struct css_set *to) +{ + rcu_assign_pointer(p->cgroups, to); +} +#endif + #endif /* CONFIG_PSI */
#endif /* _LINUX_PSI_H */ diff --git a/init/Kconfig b/init/Kconfig index ef392c2..c797e89 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -514,6 +514,10 @@ config PSI the share of walltime in which some or all tasks in the system are delayed due to contention of the respective resource.
+ In kernels with cgroup support, cgroups (cgroup2 only) will + have cpu.pressure, memory.pressure, and io.pressure files, + which aggregate pressure stalls for the grouped tasks only. + For more details see Documentation/accounting/psi.txt.
Say N if unsure. diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index d150a5e..71ac258 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -55,6 +55,7 @@ #include <linux/nsproxy.h> #include <linux/file.h> #include <linux/sched/cputime.h> +#include <linux/psi.h> #include <net/sock.h>
#define CREATE_TRACE_POINTS @@ -836,7 +837,7 @@ static void css_set_move_task(struct task_struct *task, */ WARN_ON_ONCE(task->flags & PF_EXITING);
- rcu_assign_pointer(task->cgroups, to_cset); + cgroup_move_task(task, to_cset); list_add_tail(&task->cg_list, use_mg_tasks ? &to_cset->mg_tasks : &to_cset->tasks); } @@ -3423,6 +3424,21 @@ static int cpu_stat_show(struct seq_file *seq, void *v) return ret; }
+#ifdef CONFIG_PSI +static int cgroup_io_pressure_show(struct seq_file *seq, void *v) +{ + return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_IO); +} +static int cgroup_memory_pressure_show(struct seq_file *seq, void *v) +{ + return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_MEM); +} +static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v) +{ + return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_CPU); +} +#endif + static int cgroup_file_open(struct kernfs_open_file *of) { struct cftype *cft = of->kn->priv; @@ -4604,6 +4620,23 @@ static ssize_t cgroup_threads_write(struct kernfs_open_file *of, .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cpu_stat_show, }, +#ifdef CONFIG_PSI + { + .name = "io.pressure", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = cgroup_io_pressure_show, + }, + { + .name = "memory.pressure", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = cgroup_memory_pressure_show, + }, + { + .name = "cpu.pressure", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = cgroup_cpu_pressure_show, + }, +#endif { } /* terminate */ };
@@ -4664,6 +4697,7 @@ static void css_free_rwork_fn(struct work_struct *work) */ cgroup_put(cgroup_parent(cgrp)); kernfs_put(cgrp->kn); + psi_cgroup_free(cgrp); if (cgroup_on_dfl(cgrp)) cgroup_rstat_exit(cgrp); kfree(cgrp); @@ -4922,10 +4956,14 @@ static struct cgroup *cgroup_create(struct cgroup *parent) cgrp->self.parent = &parent->self; cgrp->root = root; cgrp->level = level; - ret = cgroup_bpf_inherit(cgrp); + ret = psi_cgroup_alloc(cgrp); if (ret) goto out_idr_free;
+ ret = cgroup_bpf_inherit(cgrp); + if (ret) + goto out_psi_free; + spin_lock_irq(&css_set_lock); for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) { cgrp->ancestor_ids[tcgrp->level] = tcgrp->id; @@ -4965,6 +5003,8 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
return cgrp;
+out_psi_free: + psi_cgroup_free(cgrp); out_idr_free: cgroup_idr_remove(&root->cgroup_idr, cgrp->id); out_stat_exit: diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 5954145..7cdecfc 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -473,9 +473,35 @@ static void psi_group_change(struct psi_group *group, int cpu, schedule_delayed_work(&group->clock_work, PSI_FREQ); }
+static struct psi_group *iterate_groups(struct task_struct *task, void **iter) +{ +#ifdef CONFIG_CGROUPS + struct cgroup *cgroup = NULL; + + if (!*iter) + cgroup = task->cgroups->dfl_cgrp; + else if (*iter == &psi_system) + return NULL; + else + cgroup = cgroup_parent(*iter); + + if (cgroup && cgroup_parent(cgroup)) { + *iter = cgroup; + return cgroup_psi(cgroup); + } +#else + if (*iter) + return NULL; +#endif + *iter = &psi_system; + return &psi_system; +} + void psi_task_change(struct task_struct *task, int clear, int set) { int cpu = task_cpu(task); + struct psi_group *group; + void *iter = NULL;
if (!task->pid) return; @@ -492,17 +518,23 @@ void psi_task_change(struct task_struct *task, int clear, int set) task->psi_flags &= ~clear; task->psi_flags |= set;
- psi_group_change(&psi_system, cpu, clear, set); + while ((group = iterate_groups(task, &iter))) + psi_group_change(group, cpu, clear, set); }
void psi_memstall_tick(struct task_struct *task, int cpu) { - struct psi_group_cpu *groupc; + struct psi_group *group; + void *iter = NULL;
- groupc = per_cpu_ptr(psi_system.pcpu, cpu); - write_seqcount_begin(&groupc->seq); - record_times(groupc, cpu, true); - write_seqcount_end(&groupc->seq); + while ((group = iterate_groups(task, &iter))) { + struct psi_group_cpu *groupc; + + groupc = per_cpu_ptr(group->pcpu, cpu); + write_seqcount_begin(&groupc->seq); + record_times(groupc, cpu, true); + write_seqcount_end(&groupc->seq); + } }
/** @@ -565,8 +597,78 @@ void psi_memstall_leave(unsigned long *flags) rq_unlock_irq(rq, &rf); }
-static int psi_show(struct seq_file *m, struct psi_group *group, - enum psi_res res) +#ifdef CONFIG_CGROUPS +int psi_cgroup_alloc(struct cgroup *cgroup) +{ + if (psi_disabled) + return 0; + + cgroup->psi.pcpu = alloc_percpu(struct psi_group_cpu); + if (!cgroup->psi.pcpu) + return -ENOMEM; + group_init(&cgroup->psi); + return 0; +} + +void psi_cgroup_free(struct cgroup *cgroup) +{ + if (psi_disabled) + return; + + cancel_delayed_work_sync(&cgroup->psi.clock_work); + free_percpu(cgroup->psi.pcpu); +} + +/** + * cgroup_move_task - move task to a different cgroup + * @task: the task + * @to: the target css_set + * + * Move task to a new cgroup and safely migrate its associated stall + * state between the different groups. + * + * This function acquires the task's rq lock to lock out concurrent + * changes to the task's scheduling state and - in case the task is + * running - concurrent changes to its stall state. + */ +void cgroup_move_task(struct task_struct *task, struct css_set *to) +{ + bool move_psi = !psi_disabled; + unsigned int task_flags = 0; + struct rq_flags rf; + struct rq *rq; + + if (move_psi) { + rq = task_rq_lock(task, &rf); + + if (task_on_rq_queued(task)) + task_flags = TSK_RUNNING; + else if (task->in_iowait) + task_flags = TSK_IOWAIT; + + if (task->flags & PF_MEMSTALL) + task_flags |= TSK_MEMSTALL; + + if (task_flags) + psi_task_change(task, task_flags, 0); + } + + /* + * Lame to do this here, but the scheduler cannot be locked + * from the outside, so we move cgroups from inside sched/. + */ + rcu_assign_pointer(task->cgroups, to); + + if (move_psi) { + if (task_flags) + psi_task_change(task, 0, task_flags); + + task_rq_unlock(rq, task, &rf); + } +} +#endif /* CONFIG_CGROUPS */ + +int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) { int full;
mainline inclusion from mainline-v5.4 commit 8fcb2312d1e3300e81aa871aad00d4c038cfc184 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I47QS2 CVE: NA
--------------------------------
The existing code triggered an invalid warning about 'rq' possibly being used uninitialized. Instead of doing the silly warning suppression by initializa it to NULL, refactor the code to bail out early instead.
Warning was:
kernel/sched/psi.c: In function `cgroup_move_task': kernel/sched/psi.c:639:13: warning: `rq' may be used uninitialized in this function [-Wmaybe-uninitialized]
Link: http://lkml.kernel.org/r/20181103183339.8669-1-olof@lixom.net Fixes: 2ce7135adc9ad ("psi: cgroup support") Signed-off-by: Olof Johansson olof@lixom.net Reviewed-by: Andrew Morton akpm@linux-foundation.org Acked-by: Johannes Weiner hannes@cmpxchg.org Cc: Ingo Molnar mingo@redhat.com Cc: Peter Zijlstra peterz@infradead.org Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Liu Xinpeng liuxp11@chinatelecom.cn Signed-off-by: Ctyun Kernel ctyuncommiter01@chinatelecom.cn --- kernel/sched/psi.c | 43 ++++++++++++++++++++++--------------------- 1 file changed, 22 insertions(+), 21 deletions(-)
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 7cdecfc..3d7355d 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -633,38 +633,39 @@ void psi_cgroup_free(struct cgroup *cgroup) */ void cgroup_move_task(struct task_struct *task, struct css_set *to) { - bool move_psi = !psi_disabled; unsigned int task_flags = 0; struct rq_flags rf; struct rq *rq;
- if (move_psi) { - rq = task_rq_lock(task, &rf); + if (psi_disabled) { + /* + * Lame to do this here, but the scheduler cannot be locked + * from the outside, so we move cgroups from inside sched/. + */ + rcu_assign_pointer(task->cgroups, to); + return; + }
- if (task_on_rq_queued(task)) - task_flags = TSK_RUNNING; - else if (task->in_iowait) - task_flags = TSK_IOWAIT; + rq = task_rq_lock(task, &rf);
- if (task->flags & PF_MEMSTALL) - task_flags |= TSK_MEMSTALL; + if (task_on_rq_queued(task)) + task_flags = TSK_RUNNING; + else if (task->in_iowait) + task_flags = TSK_IOWAIT;
- if (task_flags) - psi_task_change(task, task_flags, 0); - } + if (task->flags & PF_MEMSTALL) + task_flags |= TSK_MEMSTALL;
- /* - * Lame to do this here, but the scheduler cannot be locked - * from the outside, so we move cgroups from inside sched/. - */ + if (task_flags) + psi_task_change(task, task_flags, 0); + + /* See comment above */ rcu_assign_pointer(task->cgroups, to);
- if (move_psi) { - if (task_flags) - psi_task_change(task, 0, task_flags); + if (task_flags) + psi_task_change(task, 0, task_flags);
- task_rq_unlock(rq, task, &rf); - } + task_rq_unlock(rq, task, &rf); } #endif /* CONFIG_CGROUPS */
mainline inclusion from mainline-v5.4 commit e0c274472d5d27f277af722e017525e0b33784cj category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I47QS2 CVE: NA
--------------------------------
Mel Gorman reports a hackbench regression with psi that would prohibit shipping the suse kernel with it default-enabled, but he'd still like users to be able to opt in at little to no cost to others.
With the current combination of CONFIG_PSI and the psi_disabled bool set from the commandline, this is a challenge. Do the following things to make it easier:
1. Add a config option CONFIG_PSI_DEFAULT_DISABLED that allows distros to enable CONFIG_PSI in their kernel but leave the feature disabled unless a user requests it at boot-time.
To avoid double negatives, rename psi_disabled= to psi=.
2. Make psi_disabled a static branch to eliminate any branch costs when the feature is disabled.
In terms of numbers before and after this patch, Mel says:
: The following is a comparision using CONFIG_PSI=n as a baseline against : your patch and a vanilla kernel : : 4.20.0-rc4 4.20.0-rc4 4.20.0-rc4 : kconfigdisable-v1r1 vanilla psidisable-v1r1 : Amean 1 1.3100 ( 0.00%) 1.3923 ( -6.28%) 1.3427 ( -2.49%) : Amean 3 3.8860 ( 0.00%) 4.1230 * -6.10%* 3.8860 ( -0.00%) : Amean 5 6.8847 ( 0.00%) 8.0390 * -16.77%* 6.7727 ( 1.63%) : Amean 7 9.9310 ( 0.00%) 10.8367 * -9.12%* 9.9910 ( -0.60%) : Amean 12 16.6577 ( 0.00%) 18.2363 * -9.48%* 17.1083 ( -2.71%) : Amean 18 26.5133 ( 0.00%) 27.8833 * -5.17%* 25.7663 ( 2.82%) : Amean 24 34.3003 ( 0.00%) 34.6830 ( -1.12%) 32.0450 ( 6.58%) : Amean 30 40.0063 ( 0.00%) 40.5800 ( -1.43%) 41.5087 ( -3.76%) : Amean 32 40.1407 ( 0.00%) 41.2273 ( -2.71%) 39.9417 ( 0.50%) : : It's showing that the vanilla kernel takes a hit (as the bisection : indicated it would) and that disabling PSI by default is reasonably : close in terms of performance for this particular workload on this : particular machine so;
Link: http://lkml.kernel.org/r/20181127165329.GA29728@cmpxchg.org Signed-off-by: Johannes Weiner hannes@cmpxchg.org Tested-by: Mel Gorman mgorman@techsingularity.net Reported-by: Mel Gorman mgorman@techsingularity.net Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Liu Xinpeng liuxp11@chinatelecom.cn Signed-off-by: Ctyun Kernel ctyuncommiter01@chinatelecom.cn --- Documentation/admin-guide/kernel-parameters.txt | 4 ++++ include/linux/psi.h | 3 ++- init/Kconfig | 9 ++++++++ kernel/sched/psi.c | 30 +++++++++++++++++-------- kernel/sched/stats.h | 8 +++---- 5 files changed, 40 insertions(+), 14 deletions(-)
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 257b0d1..2a5e76c 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -3710,6 +3710,10 @@ before loading. See Documentation/blockdev/ramdisk.txt.
+ psi= [KNL] Enable or disable pressure stall information + tracking. + Format: <bool> + psmouse.proto= [HW,MOUSE] Highest PS2 mouse protocol extension to probe for; one of (bare|imps|exps|lifebook|any). psmouse.rate= [HW,MOUSE] Set desired mouse report rate, in reports diff --git a/include/linux/psi.h b/include/linux/psi.h index 8e0725a..7006008 100644 --- a/include/linux/psi.h +++ b/include/linux/psi.h @@ -1,6 +1,7 @@ #ifndef _LINUX_PSI_H #define _LINUX_PSI_H
+#include <linux/jump_label.h> #include <linux/psi_types.h> #include <linux/sched.h>
@@ -9,7 +10,7 @@
#ifdef CONFIG_PSI
-extern bool psi_disabled; +extern struct static_key_false psi_disabled;
void psi_init(void);
diff --git a/init/Kconfig b/init/Kconfig index c797e89..ea7cfe74 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -522,6 +522,15 @@ config PSI
Say N if unsure.
+config PSI_DEFAULT_DISABLED + bool "Require boot parameter to enable pressure stall information tracking" + default n + depends on PSI + help + If set, pressure stall information tracking will be disabled + per default but can be enabled through passing psi_enable=1 + on the kernel commandline during boot. + endmenu # "CPU/Task time and stats accounting"
config CPU_ISOLATION diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 3d7355d..fe24de3 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -136,8 +136,18 @@
static int psi_bug __read_mostly;
-bool psi_disabled __read_mostly; -core_param(psi_disabled, psi_disabled, bool, 0644); +DEFINE_STATIC_KEY_FALSE(psi_disabled); + +#ifdef CONFIG_PSI_DEFAULT_DISABLED +bool psi_enable; +#else +bool psi_enable = true; +#endif +static int __init setup_psi(char *str) +{ + return kstrtobool(str, &psi_enable) == 0; +} +__setup("psi=", setup_psi);
/* Running averages - we need to be higher-res than loadavg */ #define PSI_FREQ (2*HZ+1) /* 2 sec intervals */ @@ -169,8 +179,10 @@ static void group_init(struct psi_group *group)
void __init psi_init(void) { - if (psi_disabled) + if (!psi_enable) { + static_branch_enable(&psi_disabled); return; + }
psi_period = jiffies_to_nsecs(PSI_FREQ); group_init(&psi_system); @@ -549,7 +561,7 @@ void psi_memstall_enter(unsigned long *flags) struct rq_flags rf; struct rq *rq;
- if (psi_disabled) + if (static_branch_likely(&psi_disabled)) return;
*flags = current->flags & PF_MEMSTALL; @@ -579,7 +591,7 @@ void psi_memstall_leave(unsigned long *flags) struct rq_flags rf; struct rq *rq;
- if (psi_disabled) + if (static_branch_likely(&psi_disabled)) return;
if (*flags) @@ -600,7 +612,7 @@ void psi_memstall_leave(unsigned long *flags) #ifdef CONFIG_CGROUPS int psi_cgroup_alloc(struct cgroup *cgroup) { - if (psi_disabled) + if (static_branch_likely(&psi_disabled)) return 0;
cgroup->psi.pcpu = alloc_percpu(struct psi_group_cpu); @@ -612,7 +624,7 @@ int psi_cgroup_alloc(struct cgroup *cgroup)
void psi_cgroup_free(struct cgroup *cgroup) { - if (psi_disabled) + if (static_branch_likely(&psi_disabled)) return;
cancel_delayed_work_sync(&cgroup->psi.clock_work); @@ -637,7 +649,7 @@ void cgroup_move_task(struct task_struct *task, struct css_set *to) struct rq_flags rf; struct rq *rq;
- if (psi_disabled) { + if (static_branch_likely(&psi_disabled)) { /* * Lame to do this here, but the scheduler cannot be locked * from the outside, so we move cgroups from inside sched/. @@ -673,7 +685,7 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) { int full;
- if (psi_disabled) + if (static_branch_likely(&psi_disabled)) return -EOPNOTSUPP;
update_stats(group); diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 7cd0104..1b49905 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -87,7 +87,7 @@ static inline void psi_enqueue(struct task_struct *p, bool wakeup) { int clear = 0, set = TSK_RUNNING;
- if (psi_disabled) + if (static_branch_likely(&psi_disabled)) return;
if (!wakeup || p->sched_psi_wake_requeue) { @@ -107,7 +107,7 @@ static inline void psi_dequeue(struct task_struct *p, bool sleep) { int clear = TSK_RUNNING, set = 0;
- if (psi_disabled) + if (static_branch_likely(&psi_disabled)) return;
if (!sleep) { @@ -123,7 +123,7 @@ static inline void psi_dequeue(struct task_struct *p, bool sleep)
static inline void psi_ttwu_dequeue(struct task_struct *p) { - if (psi_disabled) + if (static_branch_likely(&psi_disabled)) return; /* * Is the task being migrated during a wakeup? Make sure to @@ -149,7 +149,7 @@ static inline void psi_ttwu_dequeue(struct task_struct *p)
static inline void psi_task_tick(struct rq *rq) { - if (psi_disabled) + if (static_branch_likely(&psi_disabled)) return;
if (unlikely(rq->curr->flags & PF_MEMSTALL))
mainline inclusion from mainline-v5.4 commit 1b69ac6b40ebd85eed73e4dbccde2a36961ab990 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I47QS2 CVE: NA
--------------------------------
psi has provisions to shut off the periodic aggregation worker when there is a period of no task activity - and thus no data that needs aggregating. However, while developing psi monitoring, Suren noticed that the aggregation clock currently won't stay shut off for good.
Debugging this revealed a flaw in the idle design: an aggregation run will see no task activity and decide to go to sleep; shortly thereafter, the kworker thread that executed the aggregation will go idle and cause a scheduling change, during which the psi callback will kick the !pending worker again. This will ping-pong forever, and is equivalent to having no shut-off logic at all (but with more code!)
Fix this by exempting aggregation workers from psi's clock waking logic when the state change is them going to sleep. To do this, tag workers with the last work function they executed, and if in psi we see a worker going to sleep after aggregating psi data, we will not reschedule the aggregation work item.
What if the worker is also executing other items before or after?
Any psi state times that were incurred by work items preceding the aggregation work will have been collected from the per-cpu buckets during the aggregation itself. If there are work items following the aggregation work, the worker's last_func tag will be overwritten and the aggregator will be kept alive to process this genuine new activity.
If the aggregation work is the last thing the worker does, and we decide to go idle, the brief period of non-idle time incurred between the aggregation run and the kworker's dequeue will be stranded in the per-cpu buckets until the clock is woken by later activity. But that should not be a problem. The buckets can hold 4s worth of time, and future activity will wake the clock with a 2s delay, giving us 2s worth of data we can leave behind when disabling aggregation. If it takes a worker more than two seconds to go idle after it finishes its last work item, we likely have bigger problems in the system, and won't notice one sample that was averaged with a bogus per-CPU weight.
Link: http://lkml.kernel.org/r/20190116193501.1910-1-hannes@cmpxchg.org Fixes: eb414681d5a0 ("psi: pressure stall information for CPU, memory, and IO") Signed-off-by: Johannes Weiner hannes@cmpxchg.org Signed-off-by: Liu Xinpeng liuxp11@chinatelecom.cn Signed-off-by: Ctyun Kernel ctyuncommiter01@chinatelecom.cn --- kernel/sched/psi.c | 21 +++++++++++++++++---- kernel/workqueue.c | 23 +++++++++++++++++++++++ kernel/workqueue_internal.h | 6 +++++- 3 files changed, 45 insertions(+), 5 deletions(-)
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index fe24de3..c348478 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -124,6 +124,7 @@ * sampling of the aggregate task states would be. */
+#include "../workqueue_internal.h" #include <linux/sched/loadavg.h> #include <linux/seq_file.h> #include <linux/proc_fs.h> @@ -480,9 +481,6 @@ static void psi_group_change(struct psi_group *group, int cpu, groupc->tasks[t]++;
write_seqcount_end(&groupc->seq); - - if (!delayed_work_pending(&group->clock_work)) - schedule_delayed_work(&group->clock_work, PSI_FREQ); }
static struct psi_group *iterate_groups(struct task_struct *task, void **iter) @@ -513,6 +511,7 @@ void psi_task_change(struct task_struct *task, int clear, int set) { int cpu = task_cpu(task); struct psi_group *group; + bool wake_clock = true; void *iter = NULL;
if (!task->pid) @@ -530,8 +529,22 @@ void psi_task_change(struct task_struct *task, int clear, int set) task->psi_flags &= ~clear; task->psi_flags |= set;
- while ((group = iterate_groups(task, &iter))) + /* + * Periodic aggregation shuts off if there is a period of no + * task changes, so we wake it back up if necessary. However, + * don't do this if the task change is the aggregation worker + * itself going to sleep, or we'll ping-pong forever. + */ + if (unlikely((clear & TSK_RUNNING) && + (task->flags & PF_WQ_WORKER) && + wq_worker_last_func(task) == psi_update_work)) + wake_clock = false; + + while ((group = iterate_groups(task, &iter))) { psi_group_change(group, cpu, clear, set); + if (wake_clock && !delayed_work_pending(&group->clock_work)) + schedule_delayed_work(&group->clock_work, PSI_FREQ); + } }
void psi_memstall_tick(struct task_struct *task, int cpu) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 7438f87..576c186 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -912,6 +912,26 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task) }
/** + * wq_worker_last_func - retrieve worker's last work function + * + * Determine the last function a worker executed. This is called from + * the scheduler to get a worker's last known identity. + * + * CONTEXT: + * spin_lock_irq(rq->lock) + * + * Return: + * The last work function %current executed as a worker, NULL if it + * hasn't executed any work yet. + */ +work_func_t wq_worker_last_func(struct task_struct *task) +{ + struct worker *worker = kthread_data(task); + + return worker->last_func; +} + +/** * worker_set_flags - set worker flags and adjust nr_running accordingly * @worker: self * @flags: flags to set @@ -2197,6 +2217,9 @@ static void process_one_work(struct worker *worker, struct work_struct *work) worker_clr_flags(worker, WORKER_NICED); }
+ /* tag the worker for identification in schedule() */ + worker->last_func = worker->current_func; + /* we're done with it, release */ hash_del(&worker->hentry); worker->current_work = NULL; diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h index 66fbb5a..cb68b03 100644 --- a/kernel/workqueue_internal.h +++ b/kernel/workqueue_internal.h @@ -53,6 +53,9 @@ struct worker {
/* used only by rescuers to point to the target workqueue */ struct workqueue_struct *rescue_wq; /* I: the workqueue to rescue */ + + /* used by the scheduler to determine a worker's last known identity */ + work_func_t last_func; };
/** @@ -67,9 +70,10 @@ static inline struct worker *current_wq_worker(void)
/* * Scheduler hooks for concurrency managed workqueue. Only to be used from - * sched/core.c and workqueue.c. + * sched/ and workqueue.c. */ void wq_worker_waking_up(struct task_struct *task, int cpu); struct task_struct *wq_worker_sleeping(struct task_struct *task); +work_func_t wq_worker_last_func(struct task_struct *task);
#endif /* _KERNEL_WORKQUEUE_INTERNAL_H */
mainline inclusion from mainline-v5.4 commit 4e37504d1c49eec6434d0cc97278d2b51c9e8763 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I47QS2 CVE: NA
-------------------------------- MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit
We've been seeing hard-to-trigger psi crashes when running inside VM instances:
divide error: 0000 [#1] SMP PTI Modules linked in: [...] CPU: 0 PID: 212 Comm: kworker/0:2 Not tainted 4.16.18-119_fbk9_3817_gfe944c98d695 #119 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 0.0.0 02/06/2015 Workqueue: events psi_clock RIP: 0010:psi_update_stats+0x270/0x490 RSP: 0018:ffffc90001117e10 EFLAGS: 00010246 RAX: 0000000000000000 RBX: 0000000000000000 RCX: ffff8800a35a13f8 RDX: 0000000000000000 RSI: ffff8800a35a1340 RDI: 0000000000000000 RBP: 0000000000000658 R08: ffff8800a35a1470 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000 R13: 0000000000000000 R14: 0000000000000000 R15: 00000000000f8502 FS: 0000000000000000(0000) GS:ffff88023fc00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fbe370fa000 CR3: 00000000b1e3a000 CR4: 00000000000006f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: psi_clock+0x12/0x50 process_one_work+0x1e0/0x390 worker_thread+0x2b/0x3c0 ? rescuer_thread+0x330/0x330 kthread+0x113/0x130 ? kthread_create_worker_on_cpu+0x40/0x40 ? SyS_exit_group+0x10/0x10 ret_from_fork+0x35/0x40 Code: 48 0f 47 c7 48 01 c2 45 85 e4 48 89 16 0f 85 e6 00 00 00 4c 8b 49 10 4c 8b 51 08 49 69 d9 f2 07 00 00 48 6b c0 64 4c 8b 29 31 d2 <48> f7 f7 49 69 d5 8d 06 00 00 48 89 c5 4c 69 f0 00 98 0b 00 48
The Code-line points to `period` being 0 inside update_stats(), and we divide by that when calculating that period's pressure percentage.
The elapsed period should never be 0. The reason this can happen is due to an off-by-one in the idle time / missing period calculation combined with a coarse sched_clock() in the virtual machine.
The target time for aggregation is advanced into the future on a fixed grid to prevent clock drift. So when an aggregation runs after some idle period, we can not just set it to "now + psi_period", but have to calculate the downtime and advance the target time relative to itself.
However, if the aggregator was disabled exactly one psi_period (ns), we drop one idle period in the calculation due to a > when we should do >=. In that case, next_update will be advanced from 'now - psi_period' to 'now' when it should be moved to 'now + psi_period'. The run finishes with last_update == next_update == sched_clock().
With hardware clocks, this exact nanosecond match isn't likely in the first place; but if it does happen, the clock will still have moved on and the period non-zero by the time the worker runs. A pointlessly short period, but besides the extra work, no harm no foul. However, a slow sched_clock() like we have on VMs might not have advanced either by the time the worker runs again. And when we calculate the elapsed period, the result, our pressure divisor, will be 0. Ouch.
Fix this by correctly handling the situation when the elapsed time between aggregation runs is precisely two periods, and advance the expiration timestamp correctly to period into the future.
Link: http://lkml.kernel.org/r/20190214193157.15788-1-hannes@cmpxchg.org Signed-off-by: Johannes Weiner hannes@cmpxchg.org Reported-by: Łukasz Siudut lsiudut@fb.com Reviewed-by: Andrew Morton akpm@linux-foundation.org Cc: Peter Zijlstra peterz@infradead.org Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Liu Xinpeng liuxp11@chinatelecom.cn Signed-off-by: Ctyun Kernel ctyuncommiter01@chinatelecom.cn --- kernel/sched/psi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index c348478..0e97ca9 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -322,7 +322,7 @@ static bool update_stats(struct psi_group *group) expires = group->next_update; if (now < expires) goto out; - if (now - expires > psi_period) + if (now - expires >= psi_period) missed_periods = div_u64(now - expires, psi_period);
/*
mainline inclusion from mainline-v5.4 commit 147e1a97c4a0bdd43f55a582a9416bb9092563a9 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I47QS2 CVE: NA
--------------------------------
Patch series "psi: pressure stall monitors", v3.
Android is adopting psi to detect and remedy memory pressure that results in stuttering and decreased responsiveness on mobile devices.
Psi gives us the stall information, but because we're dealing with latencies in the millisecond range, periodically reading the pressure files to detect stalls in a timely fashion is not feasible. Psi also doesn't aggregate its averages at a high enough frequency right now.
This patch series extends the psi interface such that users can configure sensitive latency thresholds and use poll() and friends to be notified when these are breached.
As high-frequency aggregation is costly, it implements an aggregation method that is optimized for fast, short-interval averaging, and makes the aggregation frequency adaptive, such that high-frequency updates only happen while monitored stall events are actively occurring.
With these patches applied, Android can monitor for, and ward off, mounting memory shortages before they cause problems for the user. For example, using memory stall monitors in userspace low memory killer daemon (lmkd) we can detect mounting pressure and kill less important processes before device becomes visibly sluggish.
In our memory stress testing psi memory monitors produce roughly 10x less false positives compared to vmpressure signals. Having ability to specify multiple triggers for the same psi metric allows other parts of Android framework to monitor memory state of the device and act accordingly.
The new interface is straightforward. The user opens one of the pressure files for writing and writes a trigger description into the file descriptor that defines the stall state - some or full, and the maximum stall time over a given window of time. E.g.:
/* Signal when stall time exceeds 100ms of a 1s window */ char trigger[] = "full 100000 1000000"; fd = open("/proc/pressure/memory"); write(fd, trigger, sizeof(trigger)); while (poll() >= 0) { ... } close(fd);
When the monitored stall state is entered, psi adapts its aggregation frequency according to what the configured time window requires in order to emit event signals in a timely fashion. Once the stalling subsides, aggregation reverts back to normal.
The trigger is associated with the open file descriptor. To stop monitoring, the user only needs to close the file descriptor and the trigger is discarded.
Patches 1-4 prepare the psi code for polling support. Patch 5 implements the adaptive polling logic, the pressure growth detection optimized for short intervals, and hooks up write() and poll() on the pressure files.
The patches were developed in collaboration with Johannes Weiner.
This patch (of 5):
Kernfs has a standardized poll/notification mechanism for waking all pollers on all fds when a filesystem node changes. To allow polling for custom events, add a .poll callback that can override the default.
This is in preparation for pollable cgroup pressure files which have per-fd trigger configurations.
Link: http://lkml.kernel.org/r/20190124211518.244221-2-surenb@google.com Signed-off-by: Johannes Weiner hannes@cmpxchg.org Signed-off-by: Suren Baghdasaryan surenb@google.com Cc: Dennis Zhou dennis@kernel.org Cc: Ingo Molnar mingo@redhat.com Cc: Jens Axboe axboe@kernel.dk Cc: Li Zefan lizefan@huawei.com Cc: Peter Zijlstra peterz@infradead.org Cc: Tejun Heo tj@kernel.org Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Liu Xinpeng liuxp11@chinatelecom.cn Signed-off-by: Ctyun Kernel ctyuncommiter01@chinatelecom.cn --- fs/kernfs/file.c | 31 ++++++++++++++++++++----------- include/linux/kernfs.h | 6 ++++++ 2 files changed, 26 insertions(+), 11 deletions(-)
diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index dbf5bc2..2d8b91f 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -832,26 +832,35 @@ void kernfs_drain_open_files(struct kernfs_node *kn) * to see if it supports poll (Neither 'poll' nor 'select' return * an appropriate error code). When in doubt, set a suitable timeout value. */ +__poll_t kernfs_generic_poll(struct kernfs_open_file *of, poll_table *wait) +{ + struct kernfs_node *kn = kernfs_dentry_node(of->file->f_path.dentry); + struct kernfs_open_node *on = kn->attr.open; + + poll_wait(of->file, &on->poll, wait); + + if (of->event != atomic_read(&on->event)) + return DEFAULT_POLLMASK|EPOLLERR|EPOLLPRI; + + return DEFAULT_POLLMASK; +} + static __poll_t kernfs_fop_poll(struct file *filp, poll_table *wait) { struct kernfs_open_file *of = kernfs_of(filp); struct kernfs_node *kn = kernfs_dentry_node(filp->f_path.dentry); - struct kernfs_open_node *on = kn->attr.open; + __poll_t ret;
if (!kernfs_get_active(kn)) - goto trigger; + return DEFAULT_POLLMASK|EPOLLERR|EPOLLPRI;
- poll_wait(filp, &on->poll, wait); + if (kn->attr.ops->poll) + ret = kn->attr.ops->poll(of, wait); + else + ret = kernfs_generic_poll(of, wait);
kernfs_put_active(kn); - - if (of->event != atomic_read(&on->event)) - goto trigger; - - return DEFAULT_POLLMASK; - - trigger: - return DEFAULT_POLLMASK|EPOLLERR|EPOLLPRI; + return ret; }
static void kernfs_notify_workfn(struct work_struct *work) diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index c41790d..ab81a22 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -26,6 +26,7 @@ struct vm_area_struct; struct super_block; struct file_system_type; +struct poll_table_struct;
struct kernfs_open_node; struct kernfs_iattrs; @@ -268,6 +269,9 @@ struct kernfs_ops { ssize_t (*write)(struct kernfs_open_file *of, char *buf, size_t bytes, loff_t off);
+ __poll_t (*poll)(struct kernfs_open_file *of, + struct poll_table_struct *pt); + int (*mmap)(struct kernfs_open_file *of, struct vm_area_struct *vma);
#ifdef CONFIG_DEBUG_LOCK_ALLOC @@ -360,6 +364,8 @@ int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name, int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent, const char *new_name, const void *new_ns); int kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr); +__poll_t kernfs_generic_poll(struct kernfs_open_file *of, + struct poll_table_struct *pt); void kernfs_notify(struct kernfs_node *kn);
const void *kernfs_super_ns(struct super_block *sb);
mainline inclusion from mainline-v5.4 commit dc50537bdd1a0804fa2cbc990565ee9a944e66fa category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I47QS2 CVE: NA
--------------------------------
Cgroup has a standardized poll/notification mechanism for waking all pollers on all fds when a filesystem node changes. To allow polling for custom events, add a .poll callback that can override the default.
This is in preparation for pollable cgroup pressure files which have per-fd trigger configurations.
Link: http://lkml.kernel.org/r/20190124211518.244221-3-surenb@google.com Signed-off-by: Johannes Weiner hannes@cmpxchg.org Signed-off-by: Suren Baghdasaryan surenb@google.com Cc: Dennis Zhou dennis@kernel.org Cc: Ingo Molnar mingo@redhat.com Cc: Jens Axboe axboe@kernel.dk Cc: Li Zefan lizefan@huawei.com Cc: Peter Zijlstra peterz@infradead.org Cc: Tejun Heo tj@kernel.org Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Liu Xinpeng liuxp11@chinatelecom.cn Signed-off-by: Ctyun Kernel ctyuncommiter01@chinatelecom.cn --- include/linux/cgroup-defs.h | 4 ++++ kernel/cgroup/cgroup.c | 12 ++++++++++++ 2 files changed, 16 insertions(+)
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index daf2233..454ea4f 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -32,6 +32,7 @@ struct kernfs_ops; struct kernfs_open_file; struct seq_file; +struct poll_table_struct;
#define MAX_CGROUP_TYPE_NAMELEN 32 #define MAX_CGROUP_ROOT_NAMELEN 64 @@ -579,6 +580,9 @@ struct cftype { ssize_t (*write)(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off);
+ __poll_t (*poll)(struct kernfs_open_file *of, + struct poll_table_struct *pt); + #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lock_class_key lockdep_key; #endif diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 71ac258..1c893a0 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3506,6 +3506,16 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, return ret ?: nbytes; }
+static __poll_t cgroup_file_poll(struct kernfs_open_file *of, poll_table *pt) +{ + struct cftype *cft = of->kn->priv; + + if (cft->poll) + return cft->poll(of, pt); + + return kernfs_generic_poll(of, pt); +} + static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos) { return seq_cft(seq)->seq_start(seq, ppos); @@ -3544,6 +3554,7 @@ static int cgroup_seqfile_show(struct seq_file *m, void *arg) .open = cgroup_file_open, .release = cgroup_file_release, .write = cgroup_file_write, + .poll = cgroup_file_poll, .seq_show = cgroup_seqfile_show, };
@@ -3552,6 +3563,7 @@ static int cgroup_seqfile_show(struct seq_file *m, void *arg) .open = cgroup_file_open, .release = cgroup_file_release, .write = cgroup_file_write, + .poll = cgroup_file_poll, .seq_start = cgroup_seqfile_start, .seq_next = cgroup_seqfile_next, .seq_stop = cgroup_seqfile_stop,
mainline inclusion from mainline-v5.4 commit 33b2d6302abc4ccea1d9b3f095e2e27b02ca264e category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I47QS2 CVE: NA
--------------------------------
Patch series "psi: pressure stall monitors", v6.
This is a respin of: https://lwn.net/ml/linux-kernel/20190308184311.144521-1-surenb%40google.com/
Android is adopting psi to detect and remedy memory pressure that results in stuttering and decreased responsiveness on mobile devices.
Psi gives us the stall information, but because we're dealing with latencies in the millisecond range, periodically reading the pressure files to detect stalls in a timely fashion is not feasible. Psi also doesn't aggregate its averages at a high-enough frequency right now.
This patch series extends the psi interface such that users can configure sensitive latency thresholds and use poll() and friends to be notified when these are breached.
As high-frequency aggregation is costly, it implements an aggregation method that is optimized for fast, short-interval averaging, and makes the aggregation frequency adaptive, such that high-frequency updates only happen while monitored stall events are actively occurring.
With these patches applied, Android can monitor for, and ward off, mounting memory shortages before they cause problems for the user. For example, using memory stall monitors in userspace low memory killer daemon (lmkd) we can detect mounting pressure and kill less important processes before device becomes visibly sluggish. In our memory stress testing psi memory monitors produce roughly 10x less false positives compared to vmpressure signals. Having ability to specify multiple triggers for the same psi metric allows other parts of Android framework to monitor memory state of the device and act accordingly.
The new interface is straight-forward. The user opens one of the pressure files for writing and writes a trigger description into the file descriptor that defines the stall state - some or full, and the maximum stall time over a given window of time. E.g.:
/* Signal when stall time exceeds 100ms of a 1s window */ char trigger[] = "full 100000 1000000" fd = open("/proc/pressure/memory") write(fd, trigger, sizeof(trigger)) while (poll() >= 0) { ... }; close(fd);
When the monitored stall state is entered, psi adapts its aggregation frequency according to what the configured time window requires in order to emit event signals in a timely fashion. Once the stalling subsides, aggregation reverts back to normal.
The trigger is associated with the open file descriptor. To stop monitoring, the user only needs to close the file descriptor and the trigger is discarded.
Patches 1-6 prepare the psi code for polling support. Patch 7 implements the adaptive polling logic, the pressure growth detection optimized for short intervals, and hooks up write() and poll() on the pressure files.
The patches were developed in collaboration with Johannes Weiner.
This patch (of 7):
The psi monitoring patches will need to determine the same states as record_times(). To avoid calculating them twice, maintain a state mask that can be consulted cheaply. Do this in a separate patch to keep the churn in the main feature patch at a minimum.
This adds 4-byte state_mask member into psi_group_cpu struct which results in its first cacheline-aligned part becoming 52 bytes long. Add explicit values to enumeration element counters that affect psi_group_cpu struct size.
Link: http://lkml.kernel.org/r/20190124211518.244221-4-surenb@google.com Signed-off-by: Suren Baghdasaryan surenb@google.com Acked-by: Johannes Weiner hannes@cmpxchg.org Cc: Dennis Zhou dennis@kernel.org Cc: Ingo Molnar mingo@redhat.com Cc: Jens Axboe axboe@kernel.dk Cc: Li Zefan lizefan@huawei.com Cc: Peter Zijlstra peterz@infradead.org Cc: Tejun Heo tj@kernel.org Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Liu Xinpeng liuxp11@chinatelecom.cn Signed-off-by: Ctyun Kernel ctyuncommiter01@chinatelecom.cn --- include/linux/psi_types.h | 9 ++++++--- kernel/sched/psi.c | 29 +++++++++++++++++++---------- 2 files changed, 25 insertions(+), 13 deletions(-)
diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h index 2cf422d..762c6bb 100644 --- a/include/linux/psi_types.h +++ b/include/linux/psi_types.h @@ -11,7 +11,7 @@ enum psi_task_count { NR_IOWAIT, NR_MEMSTALL, NR_RUNNING, - NR_PSI_TASK_COUNTS, + NR_PSI_TASK_COUNTS = 3, };
/* Task state bitmasks */ @@ -24,7 +24,7 @@ enum psi_res { PSI_IO, PSI_MEM, PSI_CPU, - NR_PSI_RESOURCES, + NR_PSI_RESOURCES = 3, };
/* @@ -41,7 +41,7 @@ enum psi_states { PSI_CPU_SOME, /* Only per-CPU, to weigh the CPU in the global average: */ PSI_NONIDLE, - NR_PSI_STATES, + NR_PSI_STATES = 6, };
struct psi_group_cpu { @@ -53,6 +53,9 @@ struct psi_group_cpu { /* States of the tasks belonging to this group */ unsigned int tasks[NR_PSI_TASK_COUNTS];
+ /* Aggregate pressure state derived from the tasks */ + u32 state_mask; + /* Period time sampling buckets for each state of interest (ns) */ u32 times[NR_PSI_STATES];
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 0e97ca9..22c1505 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -213,17 +213,17 @@ static bool test_state(unsigned int *tasks, enum psi_states state) static void get_recent_times(struct psi_group *group, int cpu, u32 *times) { struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu); - unsigned int tasks[NR_PSI_TASK_COUNTS]; u64 now, state_start; + enum psi_states s; unsigned int seq; - int s; + u32 state_mask;
/* Snapshot a coherent view of the CPU state */ do { seq = read_seqcount_begin(&groupc->seq); now = cpu_clock(cpu); memcpy(times, groupc->times, sizeof(groupc->times)); - memcpy(tasks, groupc->tasks, sizeof(groupc->tasks)); + state_mask = groupc->state_mask; state_start = groupc->state_start; } while (read_seqcount_retry(&groupc->seq, seq));
@@ -239,7 +239,7 @@ static void get_recent_times(struct psi_group *group, int cpu, u32 *times) * (u32) and our reported pressure close to what's * actually happening. */ - if (test_state(tasks, s)) + if (state_mask & (1 << s)) times[s] += now - state_start;
delta = times[s] - groupc->times_prev[s]; @@ -407,15 +407,15 @@ static void record_times(struct psi_group_cpu *groupc, int cpu, delta = now - groupc->state_start; groupc->state_start = now;
- if (test_state(groupc->tasks, PSI_IO_SOME)) { + if (groupc->state_mask & (1 << PSI_IO_SOME)) { groupc->times[PSI_IO_SOME] += delta; - if (test_state(groupc->tasks, PSI_IO_FULL)) + if (groupc->state_mask & (1 << PSI_IO_FULL)) groupc->times[PSI_IO_FULL] += delta; }
- if (test_state(groupc->tasks, PSI_MEM_SOME)) { + if (groupc->state_mask & (1 << PSI_MEM_SOME)) { groupc->times[PSI_MEM_SOME] += delta; - if (test_state(groupc->tasks, PSI_MEM_FULL)) + if (groupc->state_mask & (1 << PSI_MEM_FULL)) groupc->times[PSI_MEM_FULL] += delta; else if (memstall_tick) { u32 sample; @@ -436,10 +436,10 @@ static void record_times(struct psi_group_cpu *groupc, int cpu, } }
- if (test_state(groupc->tasks, PSI_CPU_SOME)) + if (groupc->state_mask & (1 << PSI_CPU_SOME)) groupc->times[PSI_CPU_SOME] += delta;
- if (test_state(groupc->tasks, PSI_NONIDLE)) + if (groupc->state_mask & (1 << PSI_NONIDLE)) groupc->times[PSI_NONIDLE] += delta; }
@@ -448,6 +448,8 @@ static void psi_group_change(struct psi_group *group, int cpu, { struct psi_group_cpu *groupc; unsigned int t, m; + enum psi_states s; + u32 state_mask = 0;
groupc = per_cpu_ptr(group->pcpu, cpu);
@@ -480,6 +482,13 @@ static void psi_group_change(struct psi_group *group, int cpu, if (set & (1 << t)) groupc->tasks[t]++;
+ /* Calculate state mask representing active states */ + for (s = 0; s < NR_PSI_STATES; s++) { + if (test_state(groupc->tasks, s)) + state_mask |= (1 << s); + } + groupc->state_mask = state_mask; + write_seqcount_end(&groupc->seq); }
mainline inclusion from mainline-v5.4 commit 9289c5e6a78a5a9397df5fa60eb82b105abcfecf category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I47QS2 CVE: NA
--------------------------------
psi_enable is not used outside of psi.c, make it static.
Link: http://lkml.kernel.org/r/20190319235619.260832-3-surenb@google.com Signed-off-by: Suren Baghdasaryan surenb@google.com Suggested-by: Andrew Morton akpm@linux-foundation.org Acked-by: Johannes Weiner hannes@cmpxchg.org Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Liu Xinpeng liuxp11@chinatelecom.cn Signed-off-by: Ctyun Kernel ctyuncommiter01@chinatelecom.cn --- kernel/sched/psi.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 22c1505..281702d 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -140,9 +140,9 @@ DEFINE_STATIC_KEY_FALSE(psi_disabled);
#ifdef CONFIG_PSI_DEFAULT_DISABLED -bool psi_enable; +static bool psi_enable; #else -bool psi_enable = true; +static bool psi_enable = true; #endif static int __init setup_psi(char *str) {
mainline inclusion from mainline-v5.4 commit bcc78db64168eb6dede056fed2999f75f7ace309 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I47QS2 CVE: NA
--------------------------------
Rename psi_group structure member fields used for calculating psi totals and averages for clear distinction between them and for trigger-related fields that will be added by "psi: introduce psi monitor".
[surenb@google.com: v6] Link: http://lkml.kernel.org/r/20190319235619.260832-4-surenb@google.com Link: http://lkml.kernel.org/r/20190124211518.244221-5-surenb@google.com Signed-off-by: Suren Baghdasaryan surenb@google.com Acked-by: Johannes Weiner hannes@cmpxchg.org Cc: Dennis Zhou dennis@kernel.org Cc: Ingo Molnar mingo@redhat.com Cc: Jens Axboe axboe@kernel.dk Cc: Li Zefan lizefan@huawei.com Cc: Peter Zijlstra peterz@infradead.org Cc: Tejun Heo tj@kernel.org Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Liu Xinpeng liuxp11@chinatelecom.cn Signed-off-by: Ctyun Kernel ctyuncommiter01@chinatelecom.cn --- include/linux/psi_types.h | 14 +++++++------- kernel/sched/psi.c | 41 +++++++++++++++++++++-------------------- 2 files changed, 28 insertions(+), 27 deletions(-)
diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h index 762c6bb..4d1c1f6 100644 --- a/include/linux/psi_types.h +++ b/include/linux/psi_types.h @@ -69,17 +69,17 @@ struct psi_group_cpu { };
struct psi_group { - /* Protects data updated during an aggregation */ - struct mutex stat_lock; + /* Protects data used by the aggregator */ + struct mutex avgs_lock;
/* Per-cpu task state & time tracking */ struct psi_group_cpu __percpu *pcpu;
- /* Periodic aggregation state */ - u64 total_prev[NR_PSI_STATES - 1]; - u64 last_update; - u64 next_update; - struct delayed_work clock_work; + /* Running pressure averages */ + u64 avg_total[NR_PSI_STATES - 1]; + u64 avg_last_update; + u64 avg_next_update; + struct delayed_work avgs_work;
/* Total stall times and sampled pressure averages */ u64 total[NR_PSI_STATES - 1]; diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 281702d..4fb4d991 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -165,7 +165,7 @@ static int __init setup_psi(char *str) .pcpu = &system_group_pcpu, };
-static void psi_update_work(struct work_struct *work); +static void psi_avgs_work(struct work_struct *work);
static void group_init(struct psi_group *group) { @@ -173,9 +173,9 @@ static void group_init(struct psi_group *group)
for_each_possible_cpu(cpu) seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq); - group->next_update = sched_clock() + psi_period; - INIT_DELAYED_WORK(&group->clock_work, psi_update_work); - mutex_init(&group->stat_lock); + group->avg_next_update = sched_clock() + psi_period; + INIT_DELAYED_WORK(&group->avgs_work, psi_avgs_work); + mutex_init(&group->avgs_lock); }
void __init psi_init(void) @@ -278,7 +278,7 @@ static bool update_stats(struct psi_group *group) int cpu; int s;
- mutex_lock(&group->stat_lock); + mutex_lock(&group->avgs_lock);
/* * Collect the per-cpu time buckets and average them into a @@ -319,7 +319,7 @@ static bool update_stats(struct psi_group *group)
/* avgX= */ now = sched_clock(); - expires = group->next_update; + expires = group->avg_next_update; if (now < expires) goto out; if (now - expires >= psi_period) @@ -332,14 +332,14 @@ static bool update_stats(struct psi_group *group) * But the deltas we sample out of the per-cpu buckets above * are based on the actual time elapsing between clock ticks. */ - group->next_update = expires + ((1 + missed_periods) * psi_period); - period = now - (group->last_update + (missed_periods * psi_period)); - group->last_update = now; + group->avg_next_update = expires + ((1 + missed_periods) * psi_period); + period = now - (group->avg_last_update + (missed_periods * psi_period)); + group->avg_last_update = now;
for (s = 0; s < NR_PSI_STATES - 1; s++) { u32 sample;
- sample = group->total[s] - group->total_prev[s]; + sample = group->total[s] - group->avg_total[s]; /* * Due to the lockless sampling of the time buckets, * recorded time deltas can slip into the next period, @@ -359,22 +359,22 @@ static bool update_stats(struct psi_group *group) */ if (sample > period) sample = period; - group->total_prev[s] += sample; + group->avg_total[s] += sample; calc_avgs(group->avg[s], missed_periods, sample, period); } out: - mutex_unlock(&group->stat_lock); + mutex_unlock(&group->avgs_lock); return nonidle_total; }
-static void psi_update_work(struct work_struct *work) +static void psi_avgs_work(struct work_struct *work) { struct delayed_work *dwork; struct psi_group *group; bool nonidle;
dwork = to_delayed_work(work); - group = container_of(dwork, struct psi_group, clock_work); + group = container_of(dwork, struct psi_group, avgs_work);
/* * If there is task activity, periodically fold the per-cpu @@ -391,8 +391,9 @@ static void psi_update_work(struct work_struct *work) u64 now;
now = sched_clock(); - if (group->next_update > now) - delay = nsecs_to_jiffies(group->next_update - now) + 1; + if (group->avg_next_update > now) + delay = nsecs_to_jiffies( + group->avg_next_update - now) + 1; schedule_delayed_work(dwork, delay); } } @@ -546,13 +547,13 @@ void psi_task_change(struct task_struct *task, int clear, int set) */ if (unlikely((clear & TSK_RUNNING) && (task->flags & PF_WQ_WORKER) && - wq_worker_last_func(task) == psi_update_work)) + wq_worker_last_func(task) == psi_avgs_work)) wake_clock = false;
while ((group = iterate_groups(task, &iter))) { psi_group_change(group, cpu, clear, set); - if (wake_clock && !delayed_work_pending(&group->clock_work)) - schedule_delayed_work(&group->clock_work, PSI_FREQ); + if (wake_clock && !delayed_work_pending(&group->avgs_work)) + schedule_delayed_work(&group->avgs_work, PSI_FREQ); } }
@@ -649,7 +650,7 @@ void psi_cgroup_free(struct cgroup *cgroup) if (static_branch_likely(&psi_disabled)) return;
- cancel_delayed_work_sync(&cgroup->psi.clock_work); + cancel_delayed_work_sync(&cgroup->psi.avgs_work); free_percpu(cgroup->psi.pcpu); }
mainline inclusion from mainline-v5.4 commit 7fc70a3999366560ad1d4f2389a78360300c2c6a category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I47QS2 CVE: NA
--------------------------------
Split update_stats into collect_percpu_times and update_averages for collect_percpu_times to be reused later inside psi monitor.
Link: http://lkml.kernel.org/r/20190319235619.260832-5-surenb@google.com Signed-off-by: Suren Baghdasaryan surenb@google.com Acked-by: Johannes Weiner hannes@cmpxchg.org Cc: Dennis Zhou dennis@kernel.org Cc: Ingo Molnar mingo@redhat.com Cc: Jens Axboe axboe@kernel.dk Cc: Li Zefan lizefan@huawei.com Cc: Peter Zijlstra peterz@infradead.org Cc: Tejun Heo tj@kernel.org Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Liu Xinpeng liuxp11@chinatelecom.cn Signed-off-by: Ctyun Kernel ctyuncommiter01@chinatelecom.cn --- kernel/sched/psi.c | 57 ++++++++++++++++++++++++++++++++---------------------- 1 file changed, 34 insertions(+), 23 deletions(-)
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 4fb4d991..ace5ed9 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -269,17 +269,13 @@ static void calc_avgs(unsigned long avg[3], int missed_periods, avg[2] = calc_load(avg[2], EXP_300s, pct); }
-static bool update_stats(struct psi_group *group) +static bool collect_percpu_times(struct psi_group *group) { u64 deltas[NR_PSI_STATES - 1] = { 0, }; - unsigned long missed_periods = 0; unsigned long nonidle_total = 0; - u64 now, expires, period; int cpu; int s;
- mutex_lock(&group->avgs_lock); - /* * Collect the per-cpu time buckets and average them into a * single time sample that is normalized to wallclock time. @@ -317,11 +313,18 @@ static bool update_stats(struct psi_group *group) for (s = 0; s < NR_PSI_STATES - 1; s++) group->total[s] += div_u64(deltas[s], max(nonidle_total, 1UL));
+ return nonidle_total; +} + +static u64 update_averages(struct psi_group *group, u64 now) +{ + unsigned long missed_periods = 0; + u64 expires, period; + u64 avg_next_update; + int s; + /* avgX= */ - now = sched_clock(); expires = group->avg_next_update; - if (now < expires) - goto out; if (now - expires >= psi_period) missed_periods = div_u64(now - expires, psi_period);
@@ -332,7 +335,7 @@ static bool update_stats(struct psi_group *group) * But the deltas we sample out of the per-cpu buckets above * are based on the actual time elapsing between clock ticks. */ - group->avg_next_update = expires + ((1 + missed_periods) * psi_period); + avg_next_update = expires + ((1 + missed_periods) * psi_period); period = now - (group->avg_last_update + (missed_periods * psi_period)); group->avg_last_update = now;
@@ -362,9 +365,8 @@ static bool update_stats(struct psi_group *group) group->avg_total[s] += sample; calc_avgs(group->avg[s], missed_periods, sample, period); } -out: - mutex_unlock(&group->avgs_lock); - return nonidle_total; + + return avg_next_update; }
static void psi_avgs_work(struct work_struct *work) @@ -372,10 +374,16 @@ static void psi_avgs_work(struct work_struct *work) struct delayed_work *dwork; struct psi_group *group; bool nonidle; + u64 now;
dwork = to_delayed_work(work); group = container_of(dwork, struct psi_group, avgs_work);
+ mutex_lock(&group->avgs_lock); + + now = sched_clock(); + + nonidle = collect_percpu_times(group); /* * If there is task activity, periodically fold the per-cpu * times and feed samples into the running averages. If things @@ -383,19 +391,15 @@ static void psi_avgs_work(struct work_struct *work) * Once restarted, we'll catch up the running averages in one * go - see calc_avgs() and missed_periods. */ - - nonidle = update_stats(group); + if (now >= group->avg_next_update) + group->avg_next_update = update_averages(group, now);
if (nonidle) { - unsigned long delay = 0; - u64 now; - - now = sched_clock(); - if (group->avg_next_update > now) - delay = nsecs_to_jiffies( - group->avg_next_update - now) + 1; - schedule_delayed_work(dwork, delay); + schedule_delayed_work(dwork, nsecs_to_jiffies( + group->avg_next_update - now) + 1); } + + mutex_unlock(&group->avgs_lock); }
static void record_times(struct psi_group_cpu *groupc, int cpu, @@ -707,11 +711,18 @@ void cgroup_move_task(struct task_struct *task, struct css_set *to) int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) { int full; + u64 now;
if (static_branch_likely(&psi_disabled)) return -EOPNOTSUPP;
- update_stats(group); + /* Update averages before reporting them */ + mutex_lock(&group->avgs_lock); + now = sched_clock(); + collect_percpu_times(group); + if (now >= group->avg_next_update) + group->avg_next_update = update_averages(group, now); + mutex_unlock(&group->avgs_lock);
for (full = 0; full < 2 - (res == PSI_CPU); full++) { unsigned long avg[3];
mainline inclusion from mainline-v5.4 commit 333f3017c5a893b000b2b4a3529814ce93fa83d7 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I47QS2 CVE: NA
--------------------------------
Introduce changed_states parameter into collect_percpu_times to track the states changed since the last update.
This will be needed to detect whether polled states activated in the monitor patch.
Link: http://lkml.kernel.org/r/20190319235619.260832-6-surenb@google.com Signed-off-by: Suren Baghdasaryan surenb@google.com Acked-by: Johannes Weiner hannes@cmpxchg.org Cc: Dennis Zhou dennis@kernel.org Cc: Ingo Molnar mingo@redhat.com Cc: Jens Axboe axboe@kernel.dk Cc: Li Zefan lizefan@huawei.com Cc: Peter Zijlstra peterz@infradead.org Cc: Tejun Heo tj@kernel.org Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Liu Xinpeng liuxp11@chinatelecom.cn Signed-off-by: Ctyun Kernel ctyuncommiter01@chinatelecom.cn --- kernel/sched/psi.c | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-)
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index ace5ed9..1b99eef 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -210,7 +210,8 @@ static bool test_state(unsigned int *tasks, enum psi_states state) } }
-static void get_recent_times(struct psi_group *group, int cpu, u32 *times) +static void get_recent_times(struct psi_group *group, int cpu, u32 *times, + u32 *pchanged_states) { struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu); u64 now, state_start; @@ -218,6 +219,8 @@ static void get_recent_times(struct psi_group *group, int cpu, u32 *times) unsigned int seq; u32 state_mask;
+ *pchanged_states = 0; + /* Snapshot a coherent view of the CPU state */ do { seq = read_seqcount_begin(&groupc->seq); @@ -246,6 +249,8 @@ static void get_recent_times(struct psi_group *group, int cpu, u32 *times) groupc->times_prev[s] = times[s];
times[s] = delta; + if (delta) + *pchanged_states |= (1 << s); } }
@@ -269,10 +274,11 @@ static void calc_avgs(unsigned long avg[3], int missed_periods, avg[2] = calc_load(avg[2], EXP_300s, pct); }
-static bool collect_percpu_times(struct psi_group *group) +static void collect_percpu_times(struct psi_group *group, u32 *pchanged_states) { u64 deltas[NR_PSI_STATES - 1] = { 0, }; unsigned long nonidle_total = 0; + u32 changed_states = 0; int cpu; int s;
@@ -287,8 +293,11 @@ static bool collect_percpu_times(struct psi_group *group) for_each_possible_cpu(cpu) { u32 times[NR_PSI_STATES]; u32 nonidle; + u32 cpu_changed_states;
- get_recent_times(group, cpu, times); + get_recent_times(group, cpu, times, + &cpu_changed_states); + changed_states |= cpu_changed_states;
nonidle = nsecs_to_jiffies(times[PSI_NONIDLE]); nonidle_total += nonidle; @@ -313,7 +322,8 @@ static bool collect_percpu_times(struct psi_group *group) for (s = 0; s < NR_PSI_STATES - 1; s++) group->total[s] += div_u64(deltas[s], max(nonidle_total, 1UL));
- return nonidle_total; + if (pchanged_states) + *pchanged_states = changed_states; }
static u64 update_averages(struct psi_group *group, u64 now) @@ -373,6 +383,7 @@ static void psi_avgs_work(struct work_struct *work) { struct delayed_work *dwork; struct psi_group *group; + u32 changed_states; bool nonidle; u64 now;
@@ -383,7 +394,8 @@ static void psi_avgs_work(struct work_struct *work)
now = sched_clock();
- nonidle = collect_percpu_times(group); + collect_percpu_times(group, &changed_states); + nonidle = changed_states & (1 << PSI_NONIDLE); /* * If there is task activity, periodically fold the per-cpu * times and feed samples into the running averages. If things @@ -719,7 +731,7 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) /* Update averages before reporting them */ mutex_lock(&group->avgs_lock); now = sched_clock(); - collect_percpu_times(group); + collect_percpu_times(group, NULL); if (now >= group->avg_next_update) group->avg_next_update = update_averages(group, now); mutex_unlock(&group->avgs_lock);
mainline inclusion from mainline-v5.4 commit 8af0c18af1425fc70686c0fdcfc0072cd8431aa0 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I47QS2 CVE: NA
--------------------------------
kthread.h can't be included in psi_types.h because it creates a circular inclusion with kthread.h eventually including psi_types.h and complaining on kthread structures not being defined because they are defined further in the kthread.h. Resolve this by removing psi_types.h inclusion from the headers included from kthread.h.
Link: http://lkml.kernel.org/r/20190319235619.260832-7-surenb@google.com Signed-off-by: Suren Baghdasaryan surenb@google.com Acked-by: Johannes Weiner hannes@cmpxchg.org Cc: Dennis Zhou dennis@kernel.org Cc: Ingo Molnar mingo@redhat.com Cc: Jens Axboe axboe@kernel.dk Cc: Li Zefan lizefan@huawei.com Cc: Peter Zijlstra peterz@infradead.org Cc: Tejun Heo tj@kernel.org Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Liu Xinpeng liuxp11@chinatelecom.cn Signed-off-by: Ctyun Kernel ctyuncommiter01@chinatelecom.cn --- drivers/spi/spi-rockchip.c | 1 + include/linux/kthread.h | 3 ++- include/linux/sched.h | 1 - kernel/kthread.c | 1 + 4 files changed, 4 insertions(+), 2 deletions(-)
diff --git a/drivers/spi/spi-rockchip.c b/drivers/spi/spi-rockchip.c index 185bbdc..e054260 100644 --- a/drivers/spi/spi-rockchip.c +++ b/drivers/spi/spi-rockchip.c @@ -15,6 +15,7 @@
#include <linux/clk.h> #include <linux/dmaengine.h> +#include <linux/interrupt.h> #include <linux/module.h> #include <linux/of.h> #include <linux/pinctrl/consumer.h> diff --git a/include/linux/kthread.h b/include/linux/kthread.h index 8613e49..206f0c3 100644 --- a/include/linux/kthread.h +++ b/include/linux/kthread.h @@ -4,7 +4,6 @@ /* Simple interface for creating and stopping kernel threads without mess. */ #include <linux/err.h> #include <linux/sched.h> -#include <linux/cgroup.h>
__printf(4, 5) struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), @@ -202,6 +201,8 @@ bool kthread_mod_delayed_work(struct kthread_worker *worker,
void kthread_destroy_worker(struct kthread_worker *worker);
+struct cgroup_subsys_state; + #ifdef CONFIG_BLK_CGROUP void kthread_associate_blkcg(struct cgroup_subsys_state *css); struct cgroup_subsys_state *kthread_blkcg(void); diff --git a/include/linux/sched.h b/include/linux/sched.h index e1b822d..b68456e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -25,7 +25,6 @@ #include <linux/latencytop.h> #include <linux/sched/prio.h> #include <linux/signal_types.h> -#include <linux/psi_types.h> #include <linux/mm_types_task.h> #include <linux/task_io_accounting.h> #include <linux/rseq.h> diff --git a/kernel/kthread.c b/kernel/kthread.c index 8873128..2ec3583 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -11,6 +11,7 @@ #include <linux/kthread.h> #include <linux/completion.h> #include <linux/err.h> +#include <linux/cgroup.h> #include <linux/cpuset.h> #include <linux/unistd.h> #include <linux/file.h>
mainline inclusion from mainline-v5.4 commit 0e94682b73bfa6c44c98af7a26771c9c08c055d5 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I47QS2 CVE: NA
--------------------------------
Psi monitor aims to provide a low-latency short-term pressure detection mechanism configurable by users. It allows users to monitor psi metrics growth and trigger events whenever a metric raises above user-defined threshold within user-defined time window.
Time window and threshold are both expressed in usecs. Multiple psi resources with different thresholds and window sizes can be monitored concurrently.
Psi monitors activate when system enters stall state for the monitored psi metric and deactivate upon exit from the stall state. While system is in the stall state psi signal growth is monitored at a rate of 10 times per tracking window. Min window size is 500ms, therefore the min monitoring interval is 50ms. Max window size is 10s with monitoring interval of 1s.
When activated psi monitor stays active for at least the duration of one tracking window to avoid repeated activations/deactivations when psi signal is bouncing.
Notifications to the users are rate-limited to one per tracking window.
Link: http://lkml.kernel.org/r/20190319235619.260832-8-surenb@google.com Signed-off-by: Suren Baghdasaryan surenb@google.com Signed-off-by: Liu Xinpeng liuxp11@chinatelecom.cn Signed-off-by: Ctyun Kernel ctyuncommiter01@chinatelecom.cn --- Documentation/accounting/psi.txt | 107 +++++++++ include/linux/psi.h | 8 + include/linux/psi_types.h | 82 ++++++- kernel/cgroup/cgroup.c | 71 +++++- kernel/sched/psi.c | 494 +++++++++++++++++++++++++++++++++++++-- 5 files changed, 742 insertions(+), 20 deletions(-)
diff --git a/Documentation/accounting/psi.txt b/Documentation/accounting/psi.txt index b8ca28b..4fb40fe 100644 --- a/Documentation/accounting/psi.txt +++ b/Documentation/accounting/psi.txt @@ -63,6 +63,110 @@ tracked and exported as well, to allow detection of latency spikes which wouldn't necessarily make a dent in the time averages, or to average trends over custom time frames.
+Monitoring for pressure thresholds +================================== + +Users can register triggers and use poll() to be woken up when resource +pressure exceeds certain thresholds. + +A trigger describes the maximum cumulative stall time over a specific +time window, e.g. 100ms of total stall time within any 500ms window to +generate a wakeup event. + +To register a trigger user has to open psi interface file under +/proc/pressure/ representing the resource to be monitored and write the +desired threshold and time window. The open file descriptor should be +used to wait for trigger events using select(), poll() or epoll(). +The following format is used: + +<some|full> <stall amount in us> <time window in us> + +For example writing "some 150000 1000000" into /proc/pressure/memory +would add 150ms threshold for partial memory stall measured within +1sec time window. Writing "full 50000 1000000" into /proc/pressure/io +would add 50ms threshold for full io stall measured within 1sec time window. + +Triggers can be set on more than one psi metric and more than one trigger +for the same psi metric can be specified. However for each trigger a separate +file descriptor is required to be able to poll it separately from others, +therefore for each trigger a separate open() syscall should be made even +when opening the same psi interface file. + +Monitors activate only when system enters stall state for the monitored +psi metric and deactivates upon exit from the stall state. While system is +in the stall state psi signal growth is monitored at a rate of 10 times per +tracking window. + +The kernel accepts window sizes ranging from 500ms to 10s, therefore min +monitoring update interval is 50ms and max is 1s. Min limit is set to +prevent overly frequent polling. Max limit is chosen as a high enough number +after which monitors are most likely not needed and psi averages can be used +instead. + +When activated, psi monitor stays active for at least the duration of one +tracking window to avoid repeated activations/deactivations when system is +bouncing in and out of the stall state. + +Notifications to the userspace are rate-limited to one per tracking window. + +The trigger will de-register when the file descriptor used to define the +trigger is closed. + +Userspace monitor usage example +=============================== + +#include <errno.h> +#include <fcntl.h> +#include <stdio.h> +#include <poll.h> +#include <string.h> +#include <unistd.h> + +/* + * Monitor memory partial stall with 1s tracking window size + * and 150ms threshold. + */ +int main() { + const char trig[] = "some 150000 1000000"; + struct pollfd fds; + int n; + + fds.fd = open("/proc/pressure/memory", O_RDWR | O_NONBLOCK); + if (fds.fd < 0) { + printf("/proc/pressure/memory open error: %s\n", + strerror(errno)); + return 1; + } + fds.events = POLLPRI; + + if (write(fds.fd, trig, strlen(trig) + 1) < 0) { + printf("/proc/pressure/memory write error: %s\n", + strerror(errno)); + return 1; + } + + printf("waiting for events...\n"); + while (1) { + n = poll(&fds, 1, -1); + if (n < 0) { + printf("poll error: %s\n", strerror(errno)); + return 1; + } + if (fds.revents & POLLERR) { + printf("got POLLERR, event source is gone\n"); + return 0; + } + if (fds.revents & POLLPRI) { + printf("event triggered!\n"); + } else { + printf("unknown event received: 0x%x\n", fds.revents); + return 1; + } + } + + return 0; +} + Cgroup2 interface =================
@@ -71,3 +175,6 @@ mounted, pressure stall information is also tracked for tasks grouped into cgroups. Each subdirectory in the cgroupfs mountpoint contains cpu.pressure, memory.pressure, and io.pressure files; the format is the same as the /proc/pressure/ files. + +Per-cgroup psi monitors can be specified and used the same way as +system-wide ones. diff --git a/include/linux/psi.h b/include/linux/psi.h index 7006008..af892c2 100644 --- a/include/linux/psi.h +++ b/include/linux/psi.h @@ -4,6 +4,7 @@ #include <linux/jump_label.h> #include <linux/psi_types.h> #include <linux/sched.h> +#include <linux/poll.h>
struct seq_file; struct css_set; @@ -26,6 +27,13 @@ int psi_cgroup_alloc(struct cgroup *cgrp); void psi_cgroup_free(struct cgroup *cgrp); void cgroup_move_task(struct task_struct *p, struct css_set *to); + +struct psi_trigger *psi_trigger_create(struct psi_group *group, + char *buf, size_t nbytes, enum psi_res res); +void psi_trigger_replace(void **trigger_ptr, struct psi_trigger *t); + +__poll_t psi_trigger_poll(void **trigger_ptr, struct file *file, + poll_table *wait); #endif
#else /* CONFIG_PSI */ diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h index 4d1c1f6..07aaf9b 100644 --- a/include/linux/psi_types.h +++ b/include/linux/psi_types.h @@ -1,8 +1,11 @@ #ifndef _LINUX_PSI_TYPES_H #define _LINUX_PSI_TYPES_H
+#include <linux/kthread.h> #include <linux/seqlock.h> #include <linux/types.h> +#include <linux/kref.h> +#include <linux/wait.h>
#ifdef CONFIG_PSI
@@ -44,6 +47,12 @@ enum psi_states { NR_PSI_STATES = 6, };
+enum psi_aggregators { + PSI_AVGS = 0, + PSI_POLL, + NR_PSI_AGGREGATORS, +}; + struct psi_group_cpu { /* 1st cacheline updated by the scheduler */
@@ -65,7 +74,55 @@ struct psi_group_cpu { /* 2nd cacheline updated by the aggregator */
/* Delta detection against the sampling buckets */ - u32 times_prev[NR_PSI_STATES] ____cacheline_aligned_in_smp; + u32 times_prev[NR_PSI_AGGREGATORS][NR_PSI_STATES] + ____cacheline_aligned_in_smp; +}; + +/* PSI growth tracking window */ +struct psi_window { + /* Window size in ns */ + u64 size; + + /* Start time of the current window in ns */ + u64 start_time; + + /* Value at the start of the window */ + u64 start_value; + + /* Value growth in the previous window */ + u64 prev_growth; +}; + +struct psi_trigger { + /* PSI state being monitored by the trigger */ + enum psi_states state; + + /* User-spacified threshold in ns */ + u64 threshold; + + /* List node inside triggers list */ + struct list_head node; + + /* Backpointer needed during trigger destruction */ + struct psi_group *group; + + /* Wait queue for polling */ + wait_queue_head_t event_wait; + + /* Pending event flag */ + int event; + + /* Tracking window */ + struct psi_window win; + + /* + * Time last event was generated. Used for rate-limiting + * events to one per window + */ + u64 last_event_time; + + /* Refcounting to prevent premature destruction */ + struct kref refcount; };
struct psi_group { @@ -79,11 +136,32 @@ struct psi_group { u64 avg_total[NR_PSI_STATES - 1]; u64 avg_last_update; u64 avg_next_update; + + /* Aggregator work control */ struct delayed_work avgs_work;
/* Total stall times and sampled pressure averages */ - u64 total[NR_PSI_STATES - 1]; + u64 total[NR_PSI_AGGREGATORS][NR_PSI_STATES - 1]; unsigned long avg[NR_PSI_STATES - 1][3]; + + /* Monitor work control */ + atomic_t poll_scheduled; + struct kthread_worker __rcu *poll_kworker; + struct kthread_delayed_work poll_work; + + /* Protects data used by the monitor */ + struct mutex trigger_lock; + + /* Configured polling triggers */ + struct list_head triggers; + u32 nr_triggers[NR_PSI_STATES - 1]; + u32 poll_states; + u64 poll_min_period; + + /* Total stall times at the start of monitor activation */ + u64 polling_total[NR_PSI_STATES - 1]; + u64 polling_next_update; + u64 polling_until; };
#else /* CONFIG_PSI */ diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 1c893a0..ac1f70e 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3437,7 +3437,65 @@ static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v) { return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_CPU); } -#endif + +static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, enum psi_res res) +{ + struct psi_trigger *new; + struct cgroup *cgrp; + + cgrp = cgroup_kn_lock_live(of->kn, false); + if (!cgrp) + return -ENODEV; + + cgroup_get(cgrp); + cgroup_kn_unlock(of->kn); + + new = psi_trigger_create(&cgrp->psi, buf, nbytes, res); + if (IS_ERR(new)) { + cgroup_put(cgrp); + return PTR_ERR(new); + } + + psi_trigger_replace(&of->priv, new); + + cgroup_put(cgrp); + + return nbytes; +} + +static ssize_t cgroup_io_pressure_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, + loff_t off) +{ + return cgroup_pressure_write(of, buf, nbytes, PSI_IO); +} + +static ssize_t cgroup_memory_pressure_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, + loff_t off) +{ + return cgroup_pressure_write(of, buf, nbytes, PSI_MEM); +} + +static ssize_t cgroup_cpu_pressure_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, + loff_t off) +{ + return cgroup_pressure_write(of, buf, nbytes, PSI_CPU); +} + +static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of, + poll_table *pt) +{ + return psi_trigger_poll(&of->priv, of->file, pt); +} + +static void cgroup_pressure_release(struct kernfs_open_file *of) +{ + psi_trigger_replace(&of->priv, NULL); +} +#endif /* CONFIG_PSI */
static int cgroup_file_open(struct kernfs_open_file *of) { @@ -4637,18 +4695,27 @@ static ssize_t cgroup_threads_write(struct kernfs_open_file *of, .name = "io.pressure", .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cgroup_io_pressure_show, + .write = cgroup_io_pressure_write, + .poll = cgroup_pressure_poll, + .release = cgroup_pressure_release, }, { .name = "memory.pressure", .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cgroup_memory_pressure_show, + .write = cgroup_memory_pressure_write, + .poll = cgroup_pressure_poll, + .release = cgroup_pressure_release, }, { .name = "cpu.pressure", .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cgroup_cpu_pressure_show, + .write = cgroup_cpu_pressure_write, + .poll = cgroup_pressure_poll, + .release = cgroup_pressure_release, }, -#endif +#endif /* CONFIG_PSI */ { } /* terminate */ };
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 1b99eef..e88918e 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -4,6 +4,9 @@ * Copyright (c) 2018 Facebook, Inc. * Author: Johannes Weiner hannes@cmpxchg.org * + * Polling support by Suren Baghdasaryan surenb@google.com + * Copyright (c) 2018 Google, Inc. + * * When CPU, memory and IO are contended, tasks experience delays that * reduce throughput and introduce latencies into the workload. Memory * and IO contention, in addition, can cause a full loss of forward @@ -129,9 +132,13 @@ #include <linux/seq_file.h> #include <linux/proc_fs.h> #include <linux/seqlock.h> +#include <linux/uaccess.h> #include <linux/cgroup.h> #include <linux/module.h> #include <linux/sched.h> +#include <linux/ctype.h> +#include <linux/file.h> +#include <linux/poll.h> #include <linux/psi.h> #include "sched.h"
@@ -156,6 +163,11 @@ static int __init setup_psi(char *str) #define EXP_60s 1981 /* 1/exp(2s/60s) */ #define EXP_300s 2034 /* 1/exp(2s/300s) */
+/* PSI trigger definitions */ +#define WINDOW_MIN_US 500000 /* Min window size is 500ms */ +#define WINDOW_MAX_US 10000000 /* Max window size is 10s */ +#define UPDATES_PER_WINDOW 10 /* 10 updates per window */ + /* Sampling frequency in nanoseconds */ static u64 psi_period __read_mostly;
@@ -176,6 +188,17 @@ static void group_init(struct psi_group *group) group->avg_next_update = sched_clock() + psi_period; INIT_DELAYED_WORK(&group->avgs_work, psi_avgs_work); mutex_init(&group->avgs_lock); + /* Init trigger-related members */ + atomic_set(&group->poll_scheduled, 0); + mutex_init(&group->trigger_lock); + INIT_LIST_HEAD(&group->triggers); + memset(group->nr_triggers, 0, sizeof(group->nr_triggers)); + group->poll_states = 0; + group->poll_min_period = U32_MAX; + memset(group->polling_total, 0, sizeof(group->polling_total)); + group->polling_next_update = ULLONG_MAX; + group->polling_until = 0; + rcu_assign_pointer(group->poll_kworker, NULL); }
void __init psi_init(void) @@ -210,7 +233,8 @@ static bool test_state(unsigned int *tasks, enum psi_states state) } }
-static void get_recent_times(struct psi_group *group, int cpu, u32 *times, +static void get_recent_times(struct psi_group *group, int cpu, + enum psi_aggregators aggregator, u32 *times, u32 *pchanged_states) { struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu); @@ -245,8 +269,8 @@ static void get_recent_times(struct psi_group *group, int cpu, u32 *times, if (state_mask & (1 << s)) times[s] += now - state_start;
- delta = times[s] - groupc->times_prev[s]; - groupc->times_prev[s] = times[s]; + delta = times[s] - groupc->times_prev[aggregator][s]; + groupc->times_prev[aggregator][s] = times[s];
times[s] = delta; if (delta) @@ -274,7 +298,9 @@ static void calc_avgs(unsigned long avg[3], int missed_periods, avg[2] = calc_load(avg[2], EXP_300s, pct); }
-static void collect_percpu_times(struct psi_group *group, u32 *pchanged_states) +static void collect_percpu_times(struct psi_group *group, + enum psi_aggregators aggregator, + u32 *pchanged_states) { u64 deltas[NR_PSI_STATES - 1] = { 0, }; unsigned long nonidle_total = 0; @@ -295,7 +321,7 @@ static void collect_percpu_times(struct psi_group *group, u32 *pchanged_states) u32 nonidle; u32 cpu_changed_states;
- get_recent_times(group, cpu, times, + get_recent_times(group, cpu, aggregator, times, &cpu_changed_states); changed_states |= cpu_changed_states;
@@ -320,7 +346,8 @@ static void collect_percpu_times(struct psi_group *group, u32 *pchanged_states)
/* total= */ for (s = 0; s < NR_PSI_STATES - 1; s++) - group->total[s] += div_u64(deltas[s], max(nonidle_total, 1UL)); + group->total[aggregator][s] += + div_u64(deltas[s], max(nonidle_total, 1UL));
if (pchanged_states) *pchanged_states = changed_states; @@ -352,7 +379,7 @@ static u64 update_averages(struct psi_group *group, u64 now) for (s = 0; s < NR_PSI_STATES - 1; s++) { u32 sample;
- sample = group->total[s] - group->avg_total[s]; + sample = group->total[PSI_AVGS][s] - group->avg_total[s]; /* * Due to the lockless sampling of the time buckets, * recorded time deltas can slip into the next period, @@ -394,7 +421,7 @@ static void psi_avgs_work(struct work_struct *work)
now = sched_clock();
- collect_percpu_times(group, &changed_states); + collect_percpu_times(group, PSI_AVGS, &changed_states); nonidle = changed_states & (1 << PSI_NONIDLE); /* * If there is task activity, periodically fold the per-cpu @@ -414,6 +441,187 @@ static void psi_avgs_work(struct work_struct *work) mutex_unlock(&group->avgs_lock); }
+/* Trigger tracking window manupulations */ +static void window_reset(struct psi_window *win, u64 now, u64 value, + u64 prev_growth) +{ + win->start_time = now; + win->start_value = value; + win->prev_growth = prev_growth; +} + +/* + * PSI growth tracking window update and growth calculation routine. + * + * This approximates a sliding tracking window by interpolating + * partially elapsed windows using historical growth data from the + * previous intervals. This minimizes memory requirements (by not storing + * all the intermediate values in the previous window) and simplifies + * the calculations. It works well because PSI signal changes only in + * positive direction and over relatively small window sizes the growth + * is close to linear. + */ +static u64 window_update(struct psi_window *win, u64 now, u64 value) +{ + u64 elapsed; + u64 growth; + + elapsed = now - win->start_time; + growth = value - win->start_value; + /* + * After each tracking window passes win->start_value and + * win->start_time get reset and win->prev_growth stores + * the average per-window growth of the previous window. + * win->prev_growth is then used to interpolate additional + * growth from the previous window assuming it was linear. + */ + if (elapsed > win->size) + window_reset(win, now, value, growth); + else { + u32 remaining; + + remaining = win->size - elapsed; + growth += div_u64(win->prev_growth * remaining, win->size); + } + + return growth; +} + +static void init_triggers(struct psi_group *group, u64 now) +{ + struct psi_trigger *t; + + list_for_each_entry(t, &group->triggers, node) + window_reset(&t->win, now, + group->total[PSI_POLL][t->state], 0); + memcpy(group->polling_total, group->total[PSI_POLL], + sizeof(group->polling_total)); + group->polling_next_update = now + group->poll_min_period; +} + +static u64 update_triggers(struct psi_group *group, u64 now) +{ + struct psi_trigger *t; + bool new_stall = false; + u64 *total = group->total[PSI_POLL]; + + /* + * On subsequent updates, calculate growth deltas and let + * watchers know when their specified thresholds are exceeded. + */ + list_for_each_entry(t, &group->triggers, node) { + u64 growth; + + /* Check for stall activity */ + if (group->polling_total[t->state] == total[t->state]) + continue; + + /* + * Multiple triggers might be looking at the same state, + * remember to update group->polling_total[] once we've + * been through all of them. Also remember to extend the + * polling time if we see new stall activity. + */ + new_stall = true; + + /* Calculate growth since last update */ + growth = window_update(&t->win, now, total[t->state]); + if (growth < t->threshold) + continue; + + /* Limit event signaling to once per window */ + if (now < t->last_event_time + t->win.size) + continue; + + /* Generate an event */ + if (cmpxchg(&t->event, 0, 1) == 0) + wake_up_interruptible(&t->event_wait); + t->last_event_time = now; + } + + if (new_stall) + memcpy(group->polling_total, total, + sizeof(group->polling_total)); + + return now + group->poll_min_period; +} + +/* + * Schedule polling if it's not already scheduled. It's safe to call even from + * hotpath because even though kthread_queue_delayed_work takes worker->lock + * spinlock that spinlock is never contended due to poll_scheduled atomic + * preventing such competition. + */ +static void psi_schedule_poll_work(struct psi_group *group, unsigned long delay) +{ + struct kthread_worker *kworker; + + /* Do not reschedule if already scheduled */ + if (atomic_cmpxchg(&group->poll_scheduled, 0, 1) != 0) + return; + + rcu_read_lock(); + + kworker = rcu_dereference(group->poll_kworker); + /* + * kworker might be NULL in case psi_trigger_destroy races with + * psi_task_change (hotpath) which can't use locks + */ + if (likely(kworker)) + kthread_queue_delayed_work(kworker, &group->poll_work, delay); + else + atomic_set(&group->poll_scheduled, 0); + + rcu_read_unlock(); +} + +static void psi_poll_work(struct kthread_work *work) +{ + struct kthread_delayed_work *dwork; + struct psi_group *group; + u32 changed_states; + u64 now; + + dwork = container_of(work, struct kthread_delayed_work, work); + group = container_of(dwork, struct psi_group, poll_work); + + atomic_set(&group->poll_scheduled, 0); + + mutex_lock(&group->trigger_lock); + + now = sched_clock(); + + collect_percpu_times(group, PSI_POLL, &changed_states); + + if (changed_states & group->poll_states) { + /* Initialize trigger windows when entering polling mode */ + if (now > group->polling_until) + init_triggers(group, now); + + /* + * Keep the monitor active for at least the duration of the + * minimum tracking window as long as monitor states are + * changing. + */ + group->polling_until = now + + group->poll_min_period * UPDATES_PER_WINDOW; + } + + if (now > group->polling_until) { + group->polling_next_update = ULLONG_MAX; + goto out; + } + + if (now >= group->polling_next_update) + group->polling_next_update = update_triggers(group, now); + + psi_schedule_poll_work(group, + nsecs_to_jiffies(group->polling_next_update - now) + 1); + +out: + mutex_unlock(&group->trigger_lock); +} + static void record_times(struct psi_group_cpu *groupc, int cpu, bool memstall_tick) { @@ -460,8 +668,8 @@ static void record_times(struct psi_group_cpu *groupc, int cpu, groupc->times[PSI_NONIDLE] += delta; }
-static void psi_group_change(struct psi_group *group, int cpu, - unsigned int clear, unsigned int set) +static u32 psi_group_change(struct psi_group *group, int cpu, + unsigned int clear, unsigned int set) { struct psi_group_cpu *groupc; unsigned int t, m; @@ -507,6 +715,8 @@ static void psi_group_change(struct psi_group *group, int cpu, groupc->state_mask = state_mask;
write_seqcount_end(&groupc->seq); + + return state_mask; }
static struct psi_group *iterate_groups(struct task_struct *task, void **iter) @@ -567,7 +777,11 @@ void psi_task_change(struct task_struct *task, int clear, int set) wake_clock = false;
while ((group = iterate_groups(task, &iter))) { - psi_group_change(group, cpu, clear, set); + u32 state_mask = psi_group_change(group, cpu, clear, set); + + if (state_mask & group->poll_states) + psi_schedule_poll_work(group, 1); + if (wake_clock && !delayed_work_pending(&group->avgs_work)) schedule_delayed_work(&group->avgs_work, PSI_FREQ); } @@ -668,6 +882,8 @@ void psi_cgroup_free(struct cgroup *cgroup)
cancel_delayed_work_sync(&cgroup->psi.avgs_work); free_percpu(cgroup->psi.pcpu); + /* All triggers must be removed by now */ + WARN_ONCE(cgroup->psi.poll_states, "psi: trigger leak\n"); }
/** @@ -731,7 +947,7 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) /* Update averages before reporting them */ mutex_lock(&group->avgs_lock); now = sched_clock(); - collect_percpu_times(group, NULL); + collect_percpu_times(group, PSI_AVGS, NULL); if (now >= group->avg_next_update) group->avg_next_update = update_averages(group, now); mutex_unlock(&group->avgs_lock); @@ -743,7 +959,8 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
for (w = 0; w < 3; w++) avg[w] = group->avg[res * 2 + full][w]; - total = div_u64(group->total[res * 2 + full], NSEC_PER_USEC); + total = div_u64(group->total[PSI_AVGS][res * 2 + full], + NSEC_PER_USEC);
seq_printf(m, "%s avg10=%lu.%02lu avg60=%lu.%02lu avg300=%lu.%02lu total=%llu\n", full ? "full" : "some", @@ -786,25 +1003,270 @@ static int psi_cpu_open(struct inode *inode, struct file *file) return single_open(file, psi_cpu_show, NULL); }
+struct psi_trigger *psi_trigger_create(struct psi_group *group, + char *buf, size_t nbytes, enum psi_res res) +{ + struct psi_trigger *t; + enum psi_states state; + u32 threshold_us; + u32 window_us; + + if (static_branch_likely(&psi_disabled)) + return ERR_PTR(-EOPNOTSUPP); + + if (sscanf(buf, "some %u %u", &threshold_us, &window_us) == 2) + state = PSI_IO_SOME + res * 2; + else if (sscanf(buf, "full %u %u", &threshold_us, &window_us) == 2) + state = PSI_IO_FULL + res * 2; + else + return ERR_PTR(-EINVAL); + + if (state >= PSI_NONIDLE) + return ERR_PTR(-EINVAL); + + if (window_us < WINDOW_MIN_US || + window_us > WINDOW_MAX_US) + return ERR_PTR(-EINVAL); + + /* Check threshold */ + if (threshold_us == 0 || threshold_us > window_us) + return ERR_PTR(-EINVAL); + + t = kmalloc(sizeof(*t), GFP_KERNEL); + if (!t) + return ERR_PTR(-ENOMEM); + + t->group = group; + t->state = state; + t->threshold = threshold_us * NSEC_PER_USEC; + t->win.size = window_us * NSEC_PER_USEC; + window_reset(&t->win, 0, 0, 0); + + t->event = 0; + t->last_event_time = 0; + init_waitqueue_head(&t->event_wait); + kref_init(&t->refcount); + + mutex_lock(&group->trigger_lock); + + if (!rcu_access_pointer(group->poll_kworker)) { + struct sched_param param = { + .sched_priority = MAX_RT_PRIO - 1, + }; + struct kthread_worker *kworker; + + kworker = kthread_create_worker(0, "psimon"); + if (IS_ERR(kworker)) { + kfree(t); + mutex_unlock(&group->trigger_lock); + return ERR_CAST(kworker); + } + sched_setscheduler(kworker->task, SCHED_FIFO, ¶m); + kthread_init_delayed_work(&group->poll_work, + psi_poll_work); + rcu_assign_pointer(group->poll_kworker, kworker); + } + + list_add(&t->node, &group->triggers); + group->poll_min_period = min(group->poll_min_period, + div_u64(t->win.size, UPDATES_PER_WINDOW)); + group->nr_triggers[t->state]++; + group->poll_states |= (1 << t->state); + + mutex_unlock(&group->trigger_lock); + + return t; +} + +static void psi_trigger_destroy(struct kref *ref) +{ + struct psi_trigger *t = container_of(ref, struct psi_trigger, refcount); + struct psi_group *group = t->group; + struct kthread_worker *kworker_to_destroy = NULL; + + if (static_branch_likely(&psi_disabled)) + return; + + /* + * Wakeup waiters to stop polling. Can happen if cgroup is deleted + * from under a polling process. + */ + wake_up_interruptible(&t->event_wait); + + mutex_lock(&group->trigger_lock); + + if (!list_empty(&t->node)) { + struct psi_trigger *tmp; + u64 period = ULLONG_MAX; + + list_del(&t->node); + group->nr_triggers[t->state]--; + if (!group->nr_triggers[t->state]) + group->poll_states &= ~(1 << t->state); + /* reset min update period for the remaining triggers */ + list_for_each_entry(tmp, &group->triggers, node) + period = min(period, div_u64(tmp->win.size, + UPDATES_PER_WINDOW)); + group->poll_min_period = period; + /* Destroy poll_kworker when the last trigger is destroyed */ + if (group->poll_states == 0) { + group->polling_until = 0; + kworker_to_destroy = rcu_dereference_protected( + group->poll_kworker, + lockdep_is_held(&group->trigger_lock)); + rcu_assign_pointer(group->poll_kworker, NULL); + } + } + + mutex_unlock(&group->trigger_lock); + + /* + * Wait for both *trigger_ptr from psi_trigger_replace and + * poll_kworker RCUs to complete their read-side critical sections + * before destroying the trigger and optionally the poll_kworker + */ + synchronize_rcu(); + /* + * Destroy the kworker after releasing trigger_lock to prevent a + * deadlock while waiting for psi_poll_work to acquire trigger_lock + */ + if (kworker_to_destroy) { + kthread_cancel_delayed_work_sync(&group->poll_work); + kthread_destroy_worker(kworker_to_destroy); + } + kfree(t); +} + +void psi_trigger_replace(void **trigger_ptr, struct psi_trigger *new) +{ + struct psi_trigger *old = *trigger_ptr; + + if (static_branch_likely(&psi_disabled)) + return; + + rcu_assign_pointer(*trigger_ptr, new); + if (old) + kref_put(&old->refcount, psi_trigger_destroy); +} + +__poll_t psi_trigger_poll(void **trigger_ptr, + struct file *file, poll_table *wait) +{ + __poll_t ret = DEFAULT_POLLMASK; + struct psi_trigger *t; + + if (static_branch_likely(&psi_disabled)) + return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI; + + rcu_read_lock(); + + t = rcu_dereference(*(void __rcu __force **)trigger_ptr); + if (!t) { + rcu_read_unlock(); + return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI; + } + kref_get(&t->refcount); + + rcu_read_unlock(); + + poll_wait(file, &t->event_wait, wait); + + if (cmpxchg(&t->event, 1, 0) == 1) + ret |= EPOLLPRI; + + kref_put(&t->refcount, psi_trigger_destroy); + + return ret; +} + +static ssize_t psi_write(struct file *file, const char __user *user_buf, + size_t nbytes, enum psi_res res) +{ + char buf[32]; + size_t buf_size; + struct seq_file *seq; + struct psi_trigger *new; + + if (static_branch_likely(&psi_disabled)) + return -EOPNOTSUPP; + + buf_size = min(nbytes, (sizeof(buf) - 1)); + if (copy_from_user(buf, user_buf, buf_size)) + return -EFAULT; + + buf[buf_size - 1] = '\0'; + + new = psi_trigger_create(&psi_system, buf, nbytes, res); + if (IS_ERR(new)) + return PTR_ERR(new); + + seq = file->private_data; + /* Take seq->lock to protect seq->private from concurrent writes */ + mutex_lock(&seq->lock); + psi_trigger_replace(&seq->private, new); + mutex_unlock(&seq->lock); + + return nbytes; +} + +static ssize_t psi_io_write(struct file *file, const char __user *user_buf, + size_t nbytes, loff_t *ppos) +{ + return psi_write(file, user_buf, nbytes, PSI_IO); +} + +static ssize_t psi_memory_write(struct file *file, const char __user *user_buf, + size_t nbytes, loff_t *ppos) +{ + return psi_write(file, user_buf, nbytes, PSI_MEM); +} + +static ssize_t psi_cpu_write(struct file *file, const char __user *user_buf, + size_t nbytes, loff_t *ppos) +{ + return psi_write(file, user_buf, nbytes, PSI_CPU); +} + +static __poll_t psi_fop_poll(struct file *file, poll_table *wait) +{ + struct seq_file *seq = file->private_data; + + return psi_trigger_poll(&seq->private, file, wait); +} + +static int psi_fop_release(struct inode *inode, struct file *file) +{ + struct seq_file *seq = file->private_data; + + psi_trigger_replace(&seq->private, NULL); + return single_release(inode, file); +} + static const struct file_operations psi_io_fops = { .open = psi_io_open, .read = seq_read, .llseek = seq_lseek, - .release = single_release, + .write = psi_io_write, + .poll = psi_fop_poll, + .release = psi_fop_release, };
static const struct file_operations psi_memory_fops = { .open = psi_memory_open, .read = seq_read, .llseek = seq_lseek, - .release = single_release, + .write = psi_memory_write, + .poll = psi_fop_poll, + .release = psi_fop_release, };
static const struct file_operations psi_cpu_fops = { .open = psi_cpu_open, .read = seq_read, .llseek = seq_lseek, - .release = single_release, + .write = psi_cpu_write, + .poll = psi_fop_poll, + .release = psi_fop_release, };
static int __init psi_proc_init(void)
mainline inclusion from mainline-v5.4 commit df5ba5be7425e1df296d40c5f37a39d98ec666a2 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I47QS2 CVE: NA
--------------------------------
Pressure metrics are already recorded and exposed in procfs for the entire system, but any tool which monitors cgroup pressure has to special case the root cgroup to read from procfs. This patch exposes the already recorded pressure metrics on the root cgroup.
Link: http://lkml.kernel.org/r/20190510174938.3361741-1-dschatzberg@fb.com Signed-off-by: Dan Schatzberg dschatzberg@fb.com Acked-by: Johannes Weiner hannes@cmpxchg.org Cc: Tejun Heo tj@kernel.org Cc: Li Zefan lizefan@huawei.com Cc: Ingo Molnar mingo@redhat.com Cc: Peter Zijlstra peterz@infradead.org Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Liu Xinpeng liuxp11@chinatelecom.cn Signed-off-by: Ctyun Kernel ctyuncommiter01@chinatelecom.cn --- include/linux/psi.h | 1 + kernel/cgroup/cgroup.c | 18 ++++++++++++------ kernel/sched/psi.c | 2 +- 3 files changed, 14 insertions(+), 7 deletions(-)
diff --git a/include/linux/psi.h b/include/linux/psi.h index af892c2..7b3de73 100644 --- a/include/linux/psi.h +++ b/include/linux/psi.h @@ -12,6 +12,7 @@ #ifdef CONFIG_PSI
extern struct static_key_false psi_disabled; +extern struct psi_group psi_system;
void psi_init(void);
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index ac1f70e..732d1ed 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3427,15 +3427,24 @@ static int cpu_stat_show(struct seq_file *seq, void *v) #ifdef CONFIG_PSI static int cgroup_io_pressure_show(struct seq_file *seq, void *v) { - return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_IO); + struct cgroup *cgroup = seq_css(seq)->cgroup; + struct psi_group *psi = cgroup->id == 1 ? &psi_system : &cgroup->psi; + + return psi_show(seq, psi, PSI_IO); } static int cgroup_memory_pressure_show(struct seq_file *seq, void *v) { - return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_MEM); + struct cgroup *cgroup = seq_css(seq)->cgroup; + struct psi_group *psi = cgroup->id == 1 ? &psi_system : &cgroup->psi; + + return psi_show(seq, psi, PSI_MEM); } static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v) { - return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_CPU); + struct cgroup *cgroup = seq_css(seq)->cgroup; + struct psi_group *psi = cgroup->id == 1 ? &psi_system : &cgroup->psi; + + return psi_show(seq, psi, PSI_CPU); }
static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf, @@ -4693,7 +4702,6 @@ static ssize_t cgroup_threads_write(struct kernfs_open_file *of, #ifdef CONFIG_PSI { .name = "io.pressure", - .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cgroup_io_pressure_show, .write = cgroup_io_pressure_write, .poll = cgroup_pressure_poll, @@ -4701,7 +4709,6 @@ static ssize_t cgroup_threads_write(struct kernfs_open_file *of, }, { .name = "memory.pressure", - .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cgroup_memory_pressure_show, .write = cgroup_memory_pressure_write, .poll = cgroup_pressure_poll, @@ -4709,7 +4716,6 @@ static ssize_t cgroup_threads_write(struct kernfs_open_file *of, }, { .name = "cpu.pressure", - .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cgroup_cpu_pressure_show, .write = cgroup_cpu_pressure_write, .poll = cgroup_pressure_poll, diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index e88918e..7acc632 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -173,7 +173,7 @@ static int __init setup_psi(char *str)
/* System-level pressure and stall tracking */ static DEFINE_PER_CPU(struct psi_group_cpu, system_group_pcpu); -static struct psi_group psi_system = { +struct psi_group psi_system = { .pcpu = &system_group_pcpu, };
mainline inclusion from mainline-v5.4 commit 14f5c7b46a41a595fc61db37f55721714729e59e category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I47QS2 CVE: NA
--------------------------------
PSI defaults to a FIFO-99 thread, reduce this to FIFO-1.
FIFO-99 is the very highest priority available to SCHED_FIFO and it not a suitable default; it would indicate the psi work is the most important work on the machine.
Since Real-Time tasks will have pre-allocated memory and locked it in place, Real-Time tasks do not care about PSI. All it needs is to be above OTHER.
Signed-off-by: Peter Zijlstra (Intel) peterz@infradead.org Acked-by: Johannes Weiner hannes@cmpxchg.org Tested-by: Suren Baghdasaryan surenb@google.com Cc: Thomas Gleixner tglx@linutronix.de Signed-off-by: Liu Xinpeng liuxp11@chinatelecom.cn Signed-off-by: Ctyun Kernel ctyuncommiter01@chinatelecom.cn --- kernel/sched/psi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 7acc632..7fe2c5f 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -1051,7 +1051,7 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
if (!rcu_access_pointer(group->poll_kworker)) { struct sched_param param = { - .sched_priority = MAX_RT_PRIO - 1, + .sched_priority = 1, }; struct kthread_worker *kworker;
mainline inclusion from mainline-v5.4 commit 04e048cf09d7b5fc995817cdc5ae1acd4482429c category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I47QS2 CVE: NA
-------------------------------- trigger creator
When a process creates a new trigger by writing into /proc/pressure/* files, permissions to write such a file should be used to determine whether the process is allowed to do so or not. Current implementation would also require such a process to have setsched capability. Setting of psi trigger thread's scheduling policy is an implementation detail and should not be exposed to the user level. Remove the permission check by using _nocheck version of the function.
Suggested-by: Nick Kralevich nnk@google.com Signed-off-by: Suren Baghdasaryan surenb@google.com Signed-off-by: Peter Zijlstra (Intel) peterz@infradead.org Cc: lizefan@huawei.com Cc: mingo@redhat.com Cc: akpm@linux-foundation.org Cc: kernel-team@android.com Cc: dennisszhou@gmail.com Cc: dennis@kernel.org Cc: hannes@cmpxchg.org Cc: axboe@kernel.dk Link: https://lkml.kernel.org/r/20190730013310.162367-1-surenb@google.com Signed-off-by: Liu Xinpeng liuxp11@chinatelecom.cn Signed-off-by: Ctyun Kernel ctyuncommiter01@chinatelecom.cn --- kernel/sched/psi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 7fe2c5f..23fbbcc 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -1061,7 +1061,7 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group, mutex_unlock(&group->trigger_lock); return ERR_CAST(kworker); } - sched_setscheduler(kworker->task, SCHED_FIFO, ¶m); + sched_setscheduler_nocheck(kworker->task, SCHED_FIFO, ¶m); kthread_init_delayed_work(&group->poll_work, psi_poll_work); rcu_assign_pointer(group->poll_kworker, kworker);
mainline inclusion from mainline-v5.4 commit 7b2b55da1db10a5525460633ae4b6fb0be060c41 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I47QS2 CVE: NA
--------------------------------
Only when calling the poll syscall the first time can user receive POLLPRI correctly. After that, user always fails to acquire the event signal.
Reproduce case: 1. Get the monitor code in Documentation/accounting/psi.txt 2. Run it, and wait for the event triggered. 3. Kill and restart the process.
The question is why we can end up with poll_scheduled = 1 but the work not running (which would reset it to 0). And the answer is because the scheduling side sees group->poll_kworker under RCU protection and then schedules it, but here we cancel the work and destroy the worker. The cancel needs to pair with resetting the poll_scheduled flag.
Link: http://lkml.kernel.org/r/1566357985-97781-1-git-send-email-joseph.qi@linux.a... Signed-off-by: Jason Xing kerneljasonxing@linux.alibaba.com Signed-off-by: Joseph Qi joseph.qi@linux.alibaba.com Reviewed-by: Caspar Zhang caspar@linux.alibaba.com Reviewed-by: Suren Baghdasaryan surenb@google.com Acked-by: Johannes Weiner hannes@cmpxchg.org Cc: Ingo Molnar mingo@redhat.com Cc: Peter Zijlstra peterz@infradead.org Cc: stable@vger.kernel.org Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Liu Xinpeng liuxp11@chinatelecom.cn Signed-off-by: Ctyun Kernel ctyuncommiter01@chinatelecom.cn --- kernel/sched/psi.c | 8 ++++++++ 1 file changed, 8 insertions(+)
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 23fbbcc..6e52b67 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -1131,7 +1131,15 @@ static void psi_trigger_destroy(struct kref *ref) * deadlock while waiting for psi_poll_work to acquire trigger_lock */ if (kworker_to_destroy) { + /* + * After the RCU grace period has expired, the worker + * can no longer be found through group->poll_kworker. + * But it might have been already scheduled before + * that - deschedule it cleanly before destroying it. + */ kthread_cancel_delayed_work_sync(&group->poll_work); + atomic_set(&group->poll_scheduled, 0); + kthread_destroy_worker(kworker_to_destroy); } kfree(t);
mainline inclusion from mainline-v5.4 commit 4adcdcea717cb2d8436bef00dd689aa5bc76f11b category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I47QS2 CVE: NA
--------------------------------
When passing a equal or more then 32 bytes long string to psi_write(), psi_write() copies 31 bytes to its buf and overwrites buf[30] with '\0'. Which makes the input string 1 byte shorter than it should be.
Fix it by copying sizeof(buf) bytes when nbytes >= sizeof(buf).
This does not cause problems in normal use case like: "some 500000 10000000" or "full 500000 10000000" because they are less than 32 bytes in length.
/* assuming nbytes == 35 */ char buf[32];
buf_size = min(nbytes, (sizeof(buf) - 1)); /* buf_size = 31 */ if (copy_from_user(buf, user_buf, buf_size)) return -EFAULT;
buf[buf_size - 1] = '\0'; /* buf[30] = '\0' */
Before:
%cd /proc/pressure/ %echo "123456789|123456789|123456789|1234" > memory [ 22.473497] nbytes=35,buf_size=31 [ 22.473775] 123456789|123456789|123456789| (print 30 chars) %sh: write error: Invalid argument
%echo "123456789|123456789|123456789|1" > memory [ 64.916162] nbytes=32,buf_size=31 [ 64.916331] 123456789|123456789|123456789| (print 30 chars) %sh: write error: Invalid argument
After:
%cd /proc/pressure/ %echo "123456789|123456789|123456789|1234" > memory [ 254.837863] nbytes=35,buf_size=32 [ 254.838541] 123456789|123456789|123456789|1 (print 31 chars) %sh: write error: Invalid argument
%echo "123456789|123456789|123456789|1" > memory [ 9965.714935] nbytes=32,buf_size=32 [ 9965.715096] 123456789|123456789|123456789|1 (print 31 chars) %sh: write error: Invalid argument
Also remove the superfluous parentheses.
Signed-off-by: Miles Chen miles.chen@mediatek.com Cc: linux-mediatek@lists.infradead.org Cc: wsd_upstream@mediatek.com Cc: Linus Torvalds torvalds@linux-foundation.org Cc: Peter Zijlstra peterz@infradead.org Cc: Thomas Gleixner tglx@linutronix.de Link: https://lkml.kernel.org/r/20190912103452.13281-1-miles.chen@mediatek.com Signed-off-by: Ingo Molnar mingo@kernel.org Signed-off-by: Liu Xinpeng liuxp11@chinatelecom.cn Signed-off-by: Ctyun Kernel ctyuncommiter01@chinatelecom.cn --- kernel/sched/psi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 6e52b67..517e371 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -1198,7 +1198,7 @@ static ssize_t psi_write(struct file *file, const char __user *user_buf, if (static_branch_likely(&psi_disabled)) return -EOPNOTSUPP;
- buf_size = min(nbytes, (sizeof(buf) - 1)); + buf_size = min(nbytes, sizeof(buf)); if (copy_from_user(buf, user_buf, buf_size)) return -EFAULT;
mainline inclusion from mainline-v5.4 commit 74e2bdcb7d16fcfb7ddbda615a91094abc727114 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I47QS2 CVE: NA
--------------------------------
[ Upstream commit 3dfbe25c27eab7c90c8a7e97b4c354a9d24dd985 ]
Jingfeng reports rare div0 crashes in psi on systems with some uptime:
[58914.066423] divide error: 0000 [#1] SMP [58914.070416] Modules linked in: ipmi_poweroff ipmi_watchdog toa overlay fuse tcp_diag inet_diag binfmt_misc aisqos(O) aisqos_hotfixes(O) [58914.083158] CPU: 94 PID: 140364 Comm: kworker/94:2 Tainted: G W OE K 4.9.151-015.ali3000.alios7.x86_64 #1 [58914.093722] Hardware name: Alibaba Alibaba Cloud ECS/Alibaba Cloud ECS, BIOS 3.23.34 02/14/2019 [58914.102728] Workqueue: events psi_update_work [58914.107258] task: ffff8879da83c280 task.stack: ffffc90059dcc000 [58914.113336] RIP: 0010:[] [] psi_update_stats+0x1c1/0x330 [58914.122183] RSP: 0018:ffffc90059dcfd60 EFLAGS: 00010246 [58914.127650] RAX: 0000000000000000 RBX: ffff8858fe98be50 RCX: 000000007744d640 [58914.134947] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 00003594f700648e [58914.142243] RBP: ffffc90059dcfdf8 R08: 0000359500000000 R09: 0000000000000000 [58914.149538] R10: 0000000000000000 R11: 0000000000000000 R12: 0000359500000000 [58914.156837] R13: 0000000000000000 R14: 0000000000000000 R15: ffff8858fe98bd78 [58914.164136] FS: 0000000000000000(0000) GS:ffff887f7f380000(0000) knlGS:0000000000000000 [58914.172529] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [58914.178467] CR2: 00007f2240452090 CR3: 0000005d5d258000 CR4: 00000000007606f0 [58914.185765] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [58914.193061] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [58914.200360] PKRU: 55555554 [58914.203221] Stack: [58914.205383] ffff8858fe98bd48 00000000000002f0 0000002e81036d09 ffffc90059dcfde8 [58914.213168] ffff8858fe98bec8 0000000000000000 0000000000000000 0000000000000000 [58914.220951] 0000000000000000 0000000000000000 0000000000000000 0000000000000000 [58914.228734] Call Trace: [58914.231337] [] psi_update_work+0x22/0x60 [58914.237067] [] process_one_work+0x189/0x420 [58914.243063] [] worker_thread+0x4e/0x4b0 [58914.248701] [] ? process_one_work+0x420/0x420 [58914.254869] [] kthread+0xe6/0x100 [58914.259994] [] ? kthread_park+0x60/0x60 [58914.265640] [] ret_from_fork+0x39/0x50 [58914.271193] Code: 41 29 c3 4d 39 dc 4d 0f 42 dc <49> f7 f1 48 8b 13 48 89 c7 48 c1 [58914.279691] RIP [] psi_update_stats+0x1c1/0x330
The crashing instruction is trying to divide the observed stall time by the sampling period. The period, stored in R8, is not 0, but we are dividing by the lower 32 bits only, which are all 0 in this instance.
We could switch to a 64-bit division, but the period shouldn't be that big in the first place. It's the time between the last update and the next scheduled one, and so should always be around 2s and comfortably fit into 32 bits.
The bug is in the initialization of new cgroups: we schedule the first sampling event in a cgroup as an offset of sched_clock(), but fail to initialize the last_update timestamp, and it defaults to 0. That results in a bogusly large sampling period the first time we run the sampling code, and consequently we underreport pressure for the first 2s of a cgroup's life. But worse, if sched_clock() is sufficiently advanced on the system, and the user gets unlucky, the period's lower 32 bits can all be 0 and the sampling division will crash.
Fix this by initializing the last update timestamp to the creation time of the cgroup, thus correctly marking the start of the first pressure sampling period in a new cgroup.
Reported-by: Jingfeng Xie xiejingfeng@linux.alibaba.com Signed-off-by: Johannes Weiner hannes@cmpxchg.org Signed-off-by: Peter Zijlstra (Intel) peterz@infradead.org Cc: Suren Baghdasaryan surenb@google.com Link: https://lkml.kernel.org/r/20191203183524.41378-2-hannes@cmpxchg.org Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Liu Xinpeng liuxp11@chinatelecom.cn Signed-off-by: Ctyun Kernel ctyuncommiter01@chinatelecom.cn --- kernel/sched/psi.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 517e371..970db46 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -185,7 +185,8 @@ static void group_init(struct psi_group *group)
for_each_possible_cpu(cpu) seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq); - group->avg_next_update = sched_clock() + psi_period; + group->avg_last_update = sched_clock(); + group->avg_next_update = group->avg_last_update + psi_period; INIT_DELAYED_WORK(&group->avgs_work, psi_avgs_work); mutex_init(&group->avgs_lock); /* Init trigger-related members */
mainline inclusion from mainline-v5.4 commit 4e38135180004cf68190242ea795ba8d19898b42 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I47QS2 CVE: NA
--------------------------------
[ Upstream commit c3466952ca1514158d7c16c9cfc48c27d5c5dc0f ]
The psi window size is a u64 an can be up to 10 seconds right now, which exceeds the lower 32 bits of the variable. We currently use div_u64 for it, which is meant only for 32-bit divisors. The result is garbage pressure sampling values and even potential div0 crashes.
Use div64_u64.
Signed-off-by: Johannes Weiner hannes@cmpxchg.org Signed-off-by: Peter Zijlstra (Intel) peterz@infradead.org Reviewed-by: Suren Baghdasaryan surenb@google.com Cc: Jingfeng Xie xiejingfeng@linux.alibaba.com Link: https://lkml.kernel.org/r/20191203183524.41378-3-hannes@cmpxchg.org Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Liu Xinpeng liuxp11@chinatelecom.cn Signed-off-by: Ctyun Kernel ctyuncommiter01@chinatelecom.cn --- kernel/sched/psi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 970db46..ce8f674 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -482,7 +482,7 @@ static u64 window_update(struct psi_window *win, u64 now, u64 value) u32 remaining;
remaining = win->size - elapsed; - growth += div_u64(win->prev_growth * remaining, win->size); + growth += div64_u64(win->prev_growth * remaining, win->size); }
return growth;
mainline inclusion from mainline-v5.4 commit e61c236dcf3416211008774b6c2bfa01753a82c1 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I47QS2 CVE: NA
--------------------------------
commit 6fcca0fa48118e6d63733eb4644c6cd880c15b8f upstream.
Issuing write() with count parameter set to 0 on any file under /proc/pressure/ will cause an OOB write because of the access to buf[buf_size-1] when NUL-termination is performed. Fix this by checking for buf_size to be non-zero.
Signed-off-by: Suren Baghdasaryan surenb@google.com Signed-off-by: Peter Zijlstra (Intel) peterz@infradead.org Signed-off-by: Ingo Molnar mingo@kernel.org Acked-by: Johannes Weiner hannes@cmpxchg.org Link: https://lkml.kernel.org/r/20200203212216.7076-1-surenb@google.com Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Liu Xinpeng liuxp11@chinatelecom.cn Signed-off-by: Ctyun Kernel ctyuncommiter01@chinatelecom.cn --- kernel/sched/psi.c | 3 +++ 1 file changed, 3 insertions(+)
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index ce8f674..9154e74 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -1199,6 +1199,9 @@ static ssize_t psi_write(struct file *file, const char __user *user_buf, if (static_branch_likely(&psi_disabled)) return -EOPNOTSUPP;
+ if (!nbytes) + return -EINVAL; + buf_size = min(nbytes, sizeof(buf)); if (copy_from_user(buf, user_buf, buf_size)) return -EFAULT;
enable the psi feature in kernel config
Signed-off-by: Liu Xinpeng liuxp11@chinatelecom.cn Signed-off-by: Ctyun Kernel ctyuncommiter01@chinatelecom.cn --- arch/x86/configs/openeuler_defconfig | 2 ++ 1 file changed, 2 insertions(+)
diff --git a/arch/x86/configs/openeuler_defconfig b/arch/x86/configs/openeuler_defconfig index ba9c4ae..9de47c5 100644 --- a/arch/x86/configs/openeuler_defconfig +++ b/arch/x86/configs/openeuler_defconfig @@ -102,6 +102,8 @@ CONFIG_TASKSTATS=y CONFIG_TASK_DELAY_ACCT=y CONFIG_TASK_XACCT=y CONFIG_TASK_IO_ACCOUNTING=y +CONFIG_PSI=y +CONFIG_PSI_DEFAULT_DISABLED=y CONFIG_CPU_ISOLATION=y
#
Hi xinpeng,
Thanks for your patchset. PSI is very useful for openEuler, I'm very glad you backporting this patchset.
I'll review and check this series, and hope apply it soon.
This patchset might break some kabi(Kernel Application Binary Interface compatibility), and we need find a way to fix them before apply it.
--- Xie XiuQi
On 2021/9/5 22:26, Liu Xinpeng wrote:
mainline inclusion from mainline-v5.4 commit b1d29ba82cf2bc784f4c963ddd6a2cf29e229b33 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I47QS2 CVE: NA
Delay accounting already measures the time a task spends in direct reclaim and waiting for swapin, but in low memory situations tasks spend can spend a significant amount of their time waiting on thrashing page cache. This isn't tracked right now.
To know the full impact of memory contention on an individual task, measure the delay when waiting for a recently evicted active cache page to read back into memory.
Also update tools/accounting/getdelays.c:
[hannes@computer accounting]$ sudo ./getdelays -d -p 1 print delayacct stats ON PID 1 CPU count real total virtual total delay total delay average 50318 745000000 847346785 400533713 0.008ms IO count delay total delay average 435 122601218 0ms SWAP count delay total delay average 0 0 0ms RECLAIM count delay total delay average 0 0 0ms THRASHING count delay total delay average 19 12621439 0ms
Link: http://lkml.kernel.org/r/20180828172258.3185-4-hannes@cmpxchg.org Signed-off-by: Johannes Weiner hannes@cmpxchg.org Acked-by: Peter Zijlstra (Intel) peterz@infradead.org Tested-by: Daniel Drake drake@endlessm.com Tested-by: Suren Baghdasaryan surenb@google.com Cc: Christopher Lameter cl@linux.com Cc: Ingo Molnar mingo@redhat.com Cc: Johannes Weiner jweiner@fb.com Cc: Mike Galbraith efault@gmx.de Cc: Peter Enderborg peter.enderborg@sony.com Cc: Randy Dunlap rdunlap@infradead.org Cc: Shakeel Butt shakeelb@google.com Cc: Tejun Heo tj@kernel.org Cc: Vinayak Menon vinmenon@codeaurora.org Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Liu Xinpeng liuxp11@chinatelecom.cn Signed-off-by: Ctyun Kernel ctyuncommiter01@chinatelecom.cn
include/linux/delayacct.h | 23 +++++++++++++++++++++++ include/uapi/linux/taskstats.h | 6 +++++- kernel/delayacct.c | 15 +++++++++++++++ mm/filemap.c | 11 +++++++++++ tools/accounting/getdelays.c | 8 +++++++- 5 files changed, 61 insertions(+), 2 deletions(-)
diff --git a/include/linux/delayacct.h b/include/linux/delayacct.h index 31c865d..577d1b2 100644 --- a/include/linux/delayacct.h +++ b/include/linux/delayacct.h @@ -57,7 +57,12 @@ struct task_delay_info {
u64 freepages_start; u64 freepages_delay; /* wait for memory reclaim */
- u64 thrashing_start;
- u64 thrashing_delay; /* wait for thrashing page */
- u32 freepages_count; /* total count of memory reclaim */
- u32 thrashing_count; /* total count of thrash waits */
}; #endif
@@ -76,6 +81,8 @@ struct task_delay_info { extern __u64 __delayacct_blkio_ticks(struct task_struct *); extern void __delayacct_freepages_start(void); extern void __delayacct_freepages_end(void); +extern void __delayacct_thrashing_start(void); +extern void __delayacct_thrashing_end(void);
static inline int delayacct_is_task_waiting_on_io(struct task_struct *p) { @@ -156,6 +163,18 @@ static inline void delayacct_freepages_end(void) __delayacct_freepages_end(); }
+static inline void delayacct_thrashing_start(void) +{
- if (current->delays)
__delayacct_thrashing_start();
+}
+static inline void delayacct_thrashing_end(void) +{
- if (current->delays)
__delayacct_thrashing_end();
+}
#else static inline void delayacct_set_flag(int flag) {} @@ -182,6 +201,10 @@ static inline void delayacct_freepages_start(void) {} static inline void delayacct_freepages_end(void) {} +static inline void delayacct_thrashing_start(void) +{} +static inline void delayacct_thrashing_end(void) +{}
#endif /* CONFIG_TASK_DELAY_ACCT */
diff --git a/include/uapi/linux/taskstats.h b/include/uapi/linux/taskstats.h index b7aa7bb..5e8ca16 100644 --- a/include/uapi/linux/taskstats.h +++ b/include/uapi/linux/taskstats.h @@ -34,7 +34,7 @@ */
-#define TASKSTATS_VERSION 8 +#define TASKSTATS_VERSION 9 #define TS_COMM_LEN 32 /* should be >= TASK_COMM_LEN * in linux/sched.h */
@@ -164,6 +164,10 @@ struct taskstats { /* Delay waiting for memory reclaim */ __u64 freepages_count; __u64 freepages_delay_total;
- /* Delay waiting for thrashing page */
- __u64 thrashing_count;
- __u64 thrashing_delay_total;
};
diff --git a/kernel/delayacct.c b/kernel/delayacct.c index ca8ac28..2a12b98 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -135,9 +135,12 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) d->swapin_delay_total = (tmp < d->swapin_delay_total) ? 0 : tmp; tmp = d->freepages_delay_total + tsk->delays->freepages_delay; d->freepages_delay_total = (tmp < d->freepages_delay_total) ? 0 : tmp;
tmp = d->thrashing_delay_total + tsk->delays->thrashing_delay;
d->thrashing_delay_total = (tmp < d->thrashing_delay_total) ? 0 : tmp; d->blkio_count += tsk->delays->blkio_count; d->swapin_count += tsk->delays->swapin_count; d->freepages_count += tsk->delays->freepages_count;
d->thrashing_count += tsk->delays->thrashing_count; raw_spin_unlock_irqrestore(&tsk->delays->lock, flags);
return 0;
@@ -169,3 +172,15 @@ void __delayacct_freepages_end(void) ¤t->delays->freepages_count); }
+void __delayacct_thrashing_start(void) +{
- current->delays->thrashing_start = ktime_get_ns();
+}
+void __delayacct_thrashing_end(void) +{
- delayacct_end(¤t->delays->lock,
¤t->delays->thrashing_start,
¤t->delays->thrashing_delay,
¤t->delays->thrashing_count);
+} diff --git a/mm/filemap.c b/mm/filemap.c index 4cf4b09..6c964d8 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -36,6 +36,7 @@ #include <linux/cleancache.h> #include <linux/shmem_fs.h> #include <linux/rmap.h> +#include <linux/delayacct.h> #include "internal.h"
#define CREATE_TRACE_POINTS @@ -1183,8 +1184,15 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q, { struct wait_page_queue wait_page; wait_queue_entry_t *wait = &wait_page.wait;
bool thrashing = false; int ret = 0;
if (bit_nr == PG_locked && !PageSwapBacked(page) &&
!PageUptodate(page) && PageWorkingset(page)) {
delayacct_thrashing_start();
thrashing = true;
}
init_wait(wait); wait->flags = lock ? WQ_FLAG_EXCLUSIVE : 0; wait->func = wake_page_function;
@@ -1223,6 +1231,9 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q,
finish_wait(q, wait);
- if (thrashing)
delayacct_thrashing_end();
- /*
- A signal could leave PageWaiters set. Clearing it here if
- !waitqueue_active would be possible (by open-coding finish_wait),
diff --git a/tools/accounting/getdelays.c b/tools/accounting/getdelays.c index 6bf6a20..5ef1c15 100644 --- a/tools/accounting/getdelays.c +++ b/tools/accounting/getdelays.c @@ -203,6 +203,8 @@ static void print_delayacct(struct taskstats *t) "SWAP %15s%15s%15s\n" " %15llu%15llu%15llums\n" "RECLAIM %12s%15s%15s\n"
" %15llu%15llu%15llums\n"
"THRASHING%12s%15s%15s\n" " %15llu%15llu%15llums\n", "count", "real total", "virtual total", "delay total", "delay average",
@@ -222,7 +224,11 @@ static void print_delayacct(struct taskstats *t) "count", "delay total", "delay average", (unsigned long long)t->freepages_count, (unsigned long long)t->freepages_delay_total,
average_ms(t->freepages_delay_total, t->freepages_count));
average_ms(t->freepages_delay_total, t->freepages_count),
"count", "delay total", "delay average",
(unsigned long long)t->thrashing_count,
(unsigned long long)t->thrashing_delay_total,
average_ms(t->thrashing_delay_total, t->thrashing_count));
}
static void task_context_switch_counts(struct taskstats *t)