From: Chunxin Zang zangchunxin@bytedance.com
mainline inclusion from mainline-v5.10-rc1 commit 069c411de40a621c82efd2618663fee51d8c59b8 category: bugfix bugzilla: 175105 CVE: NA
-------------------------------------------------
We have observed that drop_caches can take a considerable amount of time (see the 'ps' output below), especially when there are many memcgs involved, because they add additional overhead.
It is quite unfortunate that the operation cannot be interrupted by a signal currently. Add a check for fatal signals into the main loop so that userspace can control early bailout.
There are two reasons why the loop in drop_slab_node keeps iterating:
1. We have too many memcgs: even if only one object is freed in each memcg, the total number of freed objects is bigger than 10.
2. A single traversal of all memcgs takes a long time. So, by the time one traversal finishes, the memcgs that were traversed first have many objects that can be freed again. On the next traversal, the freed count is bigger than 10 again.
We can get the following info through 'ps':
  root:~# ps -aux | grep drop
  root  357956 ... R    Aug25 21119854:55 echo 3 > /proc/sys/vm/drop_caches
  root 1771385 ... R    Aug16 21146421:17 echo 3 > /proc/sys/vm/drop_caches
  root 1986319 ... R    18:56 117:27 echo 3 > /proc/sys/vm/drop_caches
  root 2002148 ... R    Aug24 5720:39 echo 3 > /proc/sys/vm/drop_caches
  root 2564666 ... R    18:59 113:58 echo 3 > /proc/sys/vm/drop_caches
  root 2639347 ... R    Sep03 2383:39 echo 3 > /proc/sys/vm/drop_caches
  root 3904747 ... R    03:35 993:31 echo 3 > /proc/sys/vm/drop_caches
  root 4016780 ... R    Aug21 7882:18 echo 3 > /proc/sys/vm/drop_caches
Use bpftrace to follow the 'freed' value in drop_slab_node:
  root:~# bpftrace -e 'kprobe:drop_slab_node+70 {@ret=hist(reg("bp")); }'
  Attaching 1 probe...
  ^B^C
  @ret:
  [64, 128)              1 |                                                    |
  [128, 256)            28 |                                                    |
  [256, 512)           107 |@                                                   |
  [512, 1K)            298 |@@@                                                 |
  [1K, 2K)             613 |@@@@@@@                                             |
  [2K, 4K)            4435 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
  [4K, 8K)             442 |@@@@@                                               |
  [8K, 16K)            299 |@@@                                                 |
  [16K, 32K)           100 |@                                                   |
  [32K, 64K)           139 |@                                                   |
  [64K, 128K)           56 |                                                    |
  [128K, 256K)          26 |                                                    |
  [256K, 512K)           2 |                                                    |
In the while loop, we can check whether a fatal signal is pending for the current task (fatal_signal_pending(current)); if so, we should break out of the loop.
Signed-off-by: Chunxin Zang zangchunxin@bytedance.com Signed-off-by: Muchun Song songmuchun@bytedance.com Signed-off-by: Andrew Morton akpm@linux-foundation.org Acked-by: Chris Down chris@chrisdown.name Acked-by: Michal Hocko mhocko@suse.com Cc: Vlastimil Babka vbabka@suse.cz Cc: Matthew Wilcox willy@infradead.org Link: https://lkml.kernel.org/r/20200909152047.27905-1-zangchunxin@bytedance.com Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Liu Shixin liushixin2@huawei.com Reviewed-by: tong tiangen tongtiangen@huawei.com Reviewed-by: tong tiangen tongtiangen@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- mm/vmscan.c | 3 +++ 1 file changed, 3 insertions(+)
diff --git a/mm/vmscan.c b/mm/vmscan.c index bedea8b4024a8..71bf44b27d958 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -739,6 +739,9 @@ void drop_slab_node(int nid) do { struct mem_cgroup *memcg = NULL;
+ if (fatal_signal_pending(current)) + return; + freed = 0; memcg = mem_cgroup_iter(NULL, NULL, NULL); do {
From: Liu Shixin liushixin2@huawei.com
hulk inclusion category: bugfix bugzilla: 175105 CVE: NA
-------------------------------------------------
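The command 'echo 2 > /proc/sys/vm/drop_caches' can cause drop_slab_node to keep looping for a long time. Add /proc/sys/vm/drop_caches_loop_limit to bound the number of loop iterations in drop_slab_node: once the iteration count reaches the limit, the loop is exited early and a message is logged. A value of 0 (the default) means no limit.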
Signed-off-by: Liu Shixin liushixin2@huawei.com Reviewed-by: tong tiangen tongtiangen@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- fs/drop_caches.c | 1 + include/linux/mm.h | 1 + kernel/sysctl.c | 8 ++++++++ mm/vmscan.c | 15 +++++++++++++++ 4 files changed, 25 insertions(+)
diff --git a/fs/drop_caches.c b/fs/drop_caches.c index dc1a1d5d825b4..1f866b32cd150 100644 --- a/fs/drop_caches.c +++ b/fs/drop_caches.c @@ -13,6 +13,7 @@
/* A global variable is a bit ugly, but it keeps the code simple */ int sysctl_drop_caches; +unsigned int sysctl_drop_caches_loop_limit __read_mostly;
static void drop_pagecache_sb(struct super_block *sb, void *unused) { diff --git a/include/linux/mm.h b/include/linux/mm.h index 4ee2f01d9ea52..6d457e38fec7f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2790,6 +2790,7 @@ extern bool process_shares_mm(struct task_struct *p, struct mm_struct *mm);
#ifdef CONFIG_SYSCTL extern int sysctl_drop_caches; +extern unsigned int sysctl_drop_caches_loop_limit; int drop_caches_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); #endif diff --git a/kernel/sysctl.c b/kernel/sysctl.c index a8572ce8dfffe..60d899cb8b4e6 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1487,6 +1487,14 @@ static struct ctl_table vm_table[] = { .extra1 = &one, .extra2 = &four, }, + { + .procname = "drop_caches_loop_limit", + .data = &sysctl_drop_caches_loop_limit, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_douintvec, + }, + #ifdef CONFIG_COMPACTION { .procname = "compact_memory", diff --git a/mm/vmscan.c b/mm/vmscan.c index 71bf44b27d958..d997fe96918ff 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -735,6 +735,9 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid, void drop_slab_node(int nid) { unsigned long freed; +#ifdef CONFIG_SYSCTL + unsigned int counts = 0; +#endif
do { struct mem_cgroup *memcg = NULL; @@ -747,6 +750,18 @@ void drop_slab_node(int nid) do { freed += shrink_slab(GFP_KERNEL, nid, memcg, 0); } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL); + +#ifdef CONFIG_SYSCTL + if (unlikely(sysctl_drop_caches_loop_limit)) { + counts++; + if (counts >= sysctl_drop_caches_loop_limit) { + pr_info("%s (%d): drop_caches early break: %u loops\n", + current->comm, task_pid_nr(current), + counts); + return; + } + } +#endif } while (freed > 10); }