[PATCH OLK-6.6 0/3] adapt LTS patch 5ba723cc37230e616fe47ec19d441ba3168b4c4c
The original LTS patch is incompatible with the atomic mode of the mm
counter. Therefore, revert the patch and adapt it again on top of the
atomic mode.

Baolin Wang (1):
  mm: fix the inaccurate memory statistics issue for users

Quanmin Yan (2):
  Revert "mm: fix the inaccurate memory statistics issue for users"
  mm: add config option for atomic mode in mm counter

 arch/arm64/configs/openeuler_defconfig |  1 +
 arch/x86/configs/openeuler_defconfig   |  1 +
 include/linux/mm.h                     | 53 ++++++++++++++++++++------
 include/trace/events/kmem.h            |  2 +-
 kernel/fork.c                          | 26 ++++++++++---
 mm/Kconfig                             | 12 ++++++
 6 files changed, 78 insertions(+), 17 deletions(-)

-- 
2.43.0
[PATCH OLK-6.6 1/3] Revert "mm: fix the inaccurate memory statistics issue for users"

hulk inclusion
category: bugfix
bugzilla: https://atomgit.com/openeuler/kernel/issues/8456

--------------------------------

This reverts commit 5ba723cc37230e616fe47ec19d441ba3168b4c4c.

The existing LTS patch has an adaptation issue: it is incompatible with
the existing atomic mode of the mm counter. Revert it now and adapt it
again on top of the atomic mode.

Fixes: 5ba723cc3723 ("mm: fix the inaccurate memory statistics issue for users")
Signed-off-by: Quanmin Yan <yanquanmin1@huawei.com>
---
 fs/proc/task_mmu.c | 14 +++++++-------
 include/linux/mm.h |  5 -----
 2 files changed, 7 insertions(+), 12 deletions(-)

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index fa1678a431a5..411c36224ae8 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -33,9 +33,9 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
 	unsigned long text, lib, swap, anon, file, shmem;
 	unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss;
 
-	anon = get_mm_counter_sum(mm, MM_ANONPAGES);
-	file = get_mm_counter_sum(mm, MM_FILEPAGES);
-	shmem = get_mm_counter_sum(mm, MM_SHMEMPAGES);
+	anon = get_mm_counter(mm, MM_ANONPAGES);
+	file = get_mm_counter(mm, MM_FILEPAGES);
+	shmem = get_mm_counter(mm, MM_SHMEMPAGES);
 
 	/*
 	 * Note: to minimize their overhead, mm maintains hiwater_vm and
@@ -56,7 +56,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
 	text = min(text, mm->exec_vm << PAGE_SHIFT);
 	lib = (mm->exec_vm << PAGE_SHIFT) - text;
 
-	swap = get_mm_counter_sum(mm, MM_SWAPENTS);
+	swap = get_mm_counter(mm, MM_SWAPENTS);
 	SEQ_PUT_DEC("VmPeak:\t", hiwater_vm);
 	SEQ_PUT_DEC(" kB\nVmSize:\t", total_vm);
 	SEQ_PUT_DEC(" kB\nVmLck:\t", mm->locked_vm);
@@ -90,12 +90,12 @@ unsigned long task_statm(struct mm_struct *mm,
 			 unsigned long *shared, unsigned long *text,
 			 unsigned long *data, unsigned long *resident)
 {
-	*shared = get_mm_counter_sum(mm, MM_FILEPAGES) +
-		  get_mm_counter_sum(mm, MM_SHMEMPAGES);
+	*shared = get_mm_counter(mm, MM_FILEPAGES) +
+		  get_mm_counter(mm, MM_SHMEMPAGES);
 	*text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK))
 								>> PAGE_SHIFT;
 	*data = mm->data_vm + mm->stack_vm;
-	*resident = *shared + get_mm_counter_sum(mm, MM_ANONPAGES);
+	*resident = *shared + get_mm_counter(mm, MM_ANONPAGES);
 	return mm->total_vm;
 }
 
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 585197c9c2fc..036822cb1b9d 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2653,11 +2653,6 @@ static inline unsigned long get_mm_counter(struct mm_struct *mm, int member)
 	return percpu_counter_atomic_read(fbc);
 }
 
-static inline unsigned long get_mm_counter_sum(struct mm_struct *mm, int member)
-{
-	return percpu_counter_sum_positive(&mm->rss_stat[member]);
-}
-
 void mm_trace_rss_stat(struct mm_struct *mm, int member);
 
 static inline void add_mm_counter(struct mm_struct *mm, int member, long value)
-- 
2.43.0
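For context, a sketch of the incompatibility (annotated against this
series' atomic-mode design; not a claim about the upstream code): the
reverted helper summed the per-CPU counters unconditionally, but in
atomic mode a single-threaded mm never allocates them.

/*
 * The helper removed above, annotated. In OLK-6.6's atomic mode,
 * mm->rss_stat[member].counters is never allocated for a
 * single-threaded process, so the unconditional per-CPU walk inside
 * percpu_counter_sum_positive() would read through an uninitialized
 * percpu pointer.
 */
static inline unsigned long get_mm_counter_sum(struct mm_struct *mm, int member)
{
	return percpu_counter_sum_positive(&mm->rss_stat[member]);
}

Patch 3/3 below re-adds the helper on top of the mode check introduced
in patch 2/3, so the atomic path falls back to
percpu_counter_atomic_read() instead.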
[PATCH OLK-6.6 2/3] mm: add config option for atomic mode in mm counter

hulk inclusion
category: bugfix
bugzilla: https://atomgit.com/openeuler/kernel/issues/8456

--------------------------------

During the conversion of a struct percpu_counter from atomic mode to
percpu mode, the entire percpu counter init process must be an atomic
operation, which means the mm counter may only use atomic mode on a
non-preemptible kernel. Add the MM_COUNTER_ATOMIC config option to
enable atomic mode for the mm counter; atomic mode now requires
PREEMPT_NONE and is disabled by PREEMPT_DYNAMIC. Additionally, add a
cmdline option, 'rss_atomic_disable', to actively disable atomic mode.

Fixes: c333c4444953 ("mm: convert mm's rss stats to use atomic mode")
Signed-off-by: Quanmin Yan <yanquanmin1@huawei.com>
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
---
 arch/arm64/configs/openeuler_defconfig |  1 +
 arch/x86/configs/openeuler_defconfig   |  1 +
 include/linux/mm.h                     | 46 +++++++++++++++++++++++---
 kernel/fork.c                          | 26 ++++++++++++---
 mm/Kconfig                             | 12 +++++++
 5 files changed, 76 insertions(+), 10 deletions(-)

diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig
index 425616aa8422..daef0d8504d7 100644
--- a/arch/arm64/configs/openeuler_defconfig
+++ b/arch/arm64/configs/openeuler_defconfig
@@ -1278,6 +1278,7 @@ CONFIG_DAMON_LRU_SORT=y
 # end of Data Access Monitoring
 
 CONFIG_THP_CONTROL=y
+CONFIG_MM_COUNTER_ATOMIC=y
 # end of Memory Management options
 
 CONFIG_NET=y
diff --git a/arch/x86/configs/openeuler_defconfig b/arch/x86/configs/openeuler_defconfig
index e6c7a62045d9..43c0b2751024 100644
--- a/arch/x86/configs/openeuler_defconfig
+++ b/arch/x86/configs/openeuler_defconfig
@@ -1240,6 +1240,7 @@ CONFIG_DAMON_LRU_SORT=y
 # end of Data Access Monitoring
 
 # CONFIG_THP_CONTROL is not set
+CONFIG_MM_COUNTER_ATOMIC=y
 # end of Memory Management options
 
 CONFIG_NET=y
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 036822cb1b9d..2e5500e215c6 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2643,11 +2643,35 @@ static inline bool get_user_page_fast_only(unsigned long addr,
 /*
  * per-process(per-mm_struct) statistics.
  */
+#ifdef CONFIG_MM_COUNTER_ATOMIC
+extern bool mm_counter_atomic;
+
+static inline bool mm_counter_is_atomic(void)
+{
+	return mm_counter_atomic;
+}
+
+static inline bool mm_counter_is_pcpu(struct percpu_counter *fbc)
+{
+	return !mm_counter_is_atomic() || percpu_counter_initialized(fbc);
+}
+#else
+static inline bool mm_counter_is_atomic(void)
+{
+	return false;
+}
+
+static inline bool mm_counter_is_pcpu(struct percpu_counter *fbc)
+{
+	return true;
+}
+#endif
+
 static inline unsigned long get_mm_counter(struct mm_struct *mm, int member)
 {
 	struct percpu_counter *fbc = &mm->rss_stat[member];
 
-	if (percpu_counter_initialized(fbc))
+	if (mm_counter_is_pcpu(fbc))
 		return percpu_counter_read_positive(fbc);
 
 	return percpu_counter_atomic_read(fbc);
@@ -2659,7 +2683,7 @@ static inline void add_mm_counter(struct mm_struct *mm, int member, long value)
 {
 	struct percpu_counter *fbc = &mm->rss_stat[member];
 
-	if (percpu_counter_initialized(fbc))
+	if (mm_counter_is_pcpu(fbc))
 		percpu_counter_add(fbc, value);
 	else
 		percpu_counter_atomic_add(fbc, value);
@@ -2681,7 +2705,7 @@ static inline s64 mm_counter_sum(struct mm_struct *mm, int member)
 {
 	struct percpu_counter *fbc = &mm->rss_stat[member];
 
-	if (percpu_counter_initialized(fbc))
+	if (mm_counter_is_pcpu(fbc))
 		return percpu_counter_sum(fbc);
 
 	return percpu_counter_atomic_read(fbc);
@@ -2691,17 +2715,29 @@ static inline s64 mm_counter_sum_positive(struct mm_struct *mm, int member)
 {
 	struct percpu_counter *fbc = &mm->rss_stat[member];
 
-	if (percpu_counter_initialized(fbc))
+	if (mm_counter_is_pcpu(fbc))
 		return percpu_counter_sum_positive(fbc);
 
 	return percpu_counter_atomic_read(fbc);
 }
 
-static inline int mm_counter_switch_to_pcpu(struct mm_struct *mm)
+static inline int mm_counter_try_switch_to_pcpu(struct mm_struct *mm)
 {
+	if (!mm_counter_is_atomic())
+		return 0;
+
 	return percpu_counter_switch_to_pcpu_many(mm->rss_stat, NR_MM_COUNTERS);
 }
 
+static inline int mm_counter_init(struct mm_struct *mm)
+{
+	if (mm_counter_is_atomic())
+		return 0;
+
+	return percpu_counter_init_many(mm->rss_stat, 0, GFP_KERNEL_ACCOUNT,
+					NR_MM_COUNTERS);
+}
+
 static inline void mm_counter_destroy(struct mm_struct *mm)
 {
 	percpu_counter_destroy_many(mm->rss_stat, NR_MM_COUNTERS);
diff --git a/kernel/fork.c b/kernel/fork.c
index 021fbacde947..9bffb74d26d5 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1343,6 +1343,18 @@ static void mm_init_uprobes_state(struct mm_struct *mm)
 #endif
 }
 
+#ifdef CONFIG_MM_COUNTER_ATOMIC
+bool mm_counter_atomic __ro_after_init = true;
+
+static int __init disable_rss_atomic_mode(char *str)
+{
+	mm_counter_atomic = false;
+
+	return 0;
+}
+__setup("rss_atomic_disable", disable_rss_atomic_mode);
+#endif /* CONFIG_MM_COUNTER_ATOMIC */
+
 static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
 	struct user_namespace *user_ns)
 {
@@ -1398,11 +1410,16 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
 	if (mm_alloc_cid(mm))
 		goto fail_cid;
 
+	if (mm_counter_init(mm))
+		goto fail_pcpu;
+
 	sp_init_mm(mm);
 	mm->user_ns = get_user_ns(user_ns);
 	lru_gen_init_mm(mm);
 	return mm;
 
+fail_pcpu:
+	mm_destroy_cid(mm);
 fail_cid:
 	destroy_context(mm);
 fail_nocontext:
@@ -1824,14 +1841,13 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
 		return 0;
 
 	/*
-	 * For single-thread processes, rss_stat is in atomic mode, which
+	 * For single-thread processes, the mm counter uses atomic mode, which
 	 * reduces the memory consumption and performance regression caused by
-	 * using percpu. For multiple-thread processes, rss_stat is switched to
-	 * the percpu mode to reduce the error margin.
+	 * using percpu mode. For multiple-thread processes, try to switch to
+	 * percpu mode, but keep using atomic mode if the switch fails.
 	 */
 	if (clone_flags & CLONE_THREAD)
-		if (mm_counter_switch_to_pcpu(oldmm))
-			return -ENOMEM;
+		mm_counter_try_switch_to_pcpu(oldmm);
 
 	if (clone_flags & CLONE_VM) {
 		mmget(oldmm);
diff --git a/mm/Kconfig b/mm/Kconfig
index 12438e8dff88..ded89ba92a6c 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -1537,4 +1537,16 @@ config THP_CONTROL
 	help
 	  This provides interface to control thp policy.
 
+config MM_COUNTER_ATOMIC
+	bool "use atomic mode for mm counters of single-thread processes"
+	depends on PREEMPT_NONE && !PREEMPT_DYNAMIC
+	depends on SMP
+	default n
+	help
+	  For single-thread processes, the mm counter uses atomic counter
+	  operations, which reduces the memory consumption and performance
+	  regression caused by percpu counters. For multiple-thread
+	  processes, the counter is switched to percpu mode to reduce the
+	  error margin.
+
 endmenu
-- 
2.43.0
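A user-space model of the window that the PREEMPT_NONE dependency
closes (illustrative only, with hypothetical names; the real
synchronization lives in percpu_counter_switch_to_pcpu_many()):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_bool is_pcpu;      /* models percpu_counter_initialized() */
static atomic_long atomic_count; /* atomic-mode storage                 */
static long pcpu_count;          /* stands in for the per-CPU counters  */

/* Mode switch, loosely modeled on percpu_counter_switch_to_pcpu_many(). */
static void counter_switch_to_pcpu(void)
{
	pcpu_count = atomic_load(&atomic_count); /* migrate the value...  */
	atomic_store(&is_pcpu, true);            /* ...then flip the mode */
}

int main(void)
{
	/* An updater (think add_mm_counter()) checks the mode...        */
	bool saw_pcpu = atomic_load(&is_pcpu);

	/* ...is "preempted", and the switch completes in between...     */
	counter_switch_to_pcpu();

	/* ...so its add lands in storage that was already migrated.     */
	if (!saw_pcpu)
		atomic_fetch_add(&atomic_count, 1);

	printf("pcpu_count = %ld (the +1 is lost)\n", pcpu_count);
	return 0;
}

On a PREEMPT_NONE kernel without PREEMPT_DYNAMIC, the updater cannot be
preempted between the mode check and the add, which is what the new
Kconfig dependencies encode; booting with 'rss_atomic_disable' is the
runtime escape hatch, forcing percpu mode from mm_init() onward.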
[PATCH OLK-6.6 3/3] mm: fix the inaccurate memory statistics issue for users

From: Baolin Wang <baolin.wang@linux.alibaba.com>

stable inclusion
from stable-v6.6.99
commit 56995226431a37182128d0b0adf39cd003bf94d7
category: bugfix
bugzilla: https://atomgit.com/openeuler/kernel/issues/8456
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=...

--------------------------------

commit 82241a83cd15aaaf28200a40ad1a8b480012edaf upstream.

On some large machines with a high number of CPUs running a 64K pagesize
kernel, we found that the 'RES' field is always 0 displayed by the top
command for some processes, which will cause a lot of confusion for users.

    PID USER      PR  NI    VIRT    RES    SHR S  %CPU  %MEM     TIME+ COMMAND
 875525 root      20   0   12480      0      0 R   0.3   0.0   0:00.08 top
      1 root      20   0  172800      0      0 S   0.0   0.0   0:04.52 systemd

The main reason is that the batch size of the percpu counter is quite
large on these machines, caching a significant percpu value, since
converting mm's rss stats into percpu_counter by commit f1a7941243c1
("mm: convert mm's rss stats into percpu_counter").

Intuitively, the batch number should be optimized, but on some paths,
performance may take precedence over statistical accuracy. Therefore,
introducing a new interface to add the percpu statistical count and
display it to users, which can remove the confusion. In addition, this
change is not expected to be on a performance-critical path, so the
modification should be acceptable.

In addition, the 'mm->rss_stat' is updated by using add_mm_counter()
and dec/inc_mm_counter(), which are all wrappers around
percpu_counter_add_batch(). In percpu_counter_add_batch(), there is
percpu batch caching to avoid 'fbc->lock' contention. This patch changes
task_mem() and task_statm() to get the accurate mm counters under the
'fbc->lock', but this should not exacerbate kernel 'mm->rss_stat' lock
contention due to the percpu batch caching of the mm counters. The
following test also confirm the theoretical analysis.

I run the stress-ng that stresses anon page faults in 32 threads on my
32 cores machine, while simultaneously running a script that starts 32
threads to busy-loop pread each stress-ng thread's /proc/pid/status
interface. From the following data, I did not observe any obvious impact
of this patch on the stress-ng tests.

w/o patch:
stress-ng: info: [6848] 4,399,219,085,152 CPU Cycles    67.327 B/sec
stress-ng: info: [6848] 1,616,524,844,832 Instructions  24.740 B/sec (0.367 instr. per cycle)
stress-ng: info: [6848]        39,529,792 Page Faults Total  0.605 M/sec
stress-ng: info: [6848]        39,529,792 Page Faults Minor  0.605 M/sec

w/patch:
stress-ng: info: [2485] 4,462,440,381,856 CPU Cycles    68.382 B/sec
stress-ng: info: [2485] 1,615,101,503,296 Instructions  24.750 B/sec (0.362 instr. per cycle)
stress-ng: info: [2485]        39,439,232 Page Faults Total  0.604 M/sec
stress-ng: info: [2485]        39,439,232 Page Faults Minor  0.604 M/sec

On comparing a very simple app which just allocates & touches some
memory against v6.1 (which doesn't have f1a7941243c1) and latest Linus
tree (4c06e63b9203) I can see that on latest Linus tree the values for
VmRSS, RssAnon and RssFile from /proc/self/status are all zeroes while
they do report values on v6.1 and a Linus tree with this patch.

Link: https://lkml.kernel.org/r/f4586b17f66f97c174f7fd1f8647374fdb53de1c.174911905...
Fixes: f1a7941243c1 ("mm: convert mm's rss stats into percpu_counter")
Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Reviewed-by: Aboorva Devarajan <aboorvad@linux.ibm.com>
Tested-by: Aboorva Devarajan <aboorvad@linux.ibm.com>
Tested-by: Donet Tom <donettom@linux.ibm.com>
Acked-by: Shakeel Butt <shakeel.butt@linux.dev>
Acked-by: SeongJae Park <sj@kernel.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: David Hildenbrand <david@redhat.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Quanmin Yan <yanquanmin1@huawei.com>
---
 fs/proc/task_mmu.c          | 14 +++++++-------
 include/linux/mm.h          |  2 +-
 include/trace/events/kmem.h |  2 +-
 3 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 411c36224ae8..fa1678a431a5 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -33,9 +33,9 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
 	unsigned long text, lib, swap, anon, file, shmem;
 	unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss;
 
-	anon = get_mm_counter(mm, MM_ANONPAGES);
-	file = get_mm_counter(mm, MM_FILEPAGES);
-	shmem = get_mm_counter(mm, MM_SHMEMPAGES);
+	anon = get_mm_counter_sum(mm, MM_ANONPAGES);
+	file = get_mm_counter_sum(mm, MM_FILEPAGES);
+	shmem = get_mm_counter_sum(mm, MM_SHMEMPAGES);
 
 	/*
 	 * Note: to minimize their overhead, mm maintains hiwater_vm and
@@ -56,7 +56,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
 	text = min(text, mm->exec_vm << PAGE_SHIFT);
 	lib = (mm->exec_vm << PAGE_SHIFT) - text;
 
-	swap = get_mm_counter(mm, MM_SWAPENTS);
+	swap = get_mm_counter_sum(mm, MM_SWAPENTS);
 	SEQ_PUT_DEC("VmPeak:\t", hiwater_vm);
 	SEQ_PUT_DEC(" kB\nVmSize:\t", total_vm);
 	SEQ_PUT_DEC(" kB\nVmLck:\t", mm->locked_vm);
@@ -90,12 +90,12 @@ unsigned long task_statm(struct mm_struct *mm,
 			 unsigned long *shared, unsigned long *text,
 			 unsigned long *data, unsigned long *resident)
 {
-	*shared = get_mm_counter(mm, MM_FILEPAGES) +
-		  get_mm_counter(mm, MM_SHMEMPAGES);
+	*shared = get_mm_counter_sum(mm, MM_FILEPAGES) +
+		  get_mm_counter_sum(mm, MM_SHMEMPAGES);
 	*text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK))
 								>> PAGE_SHIFT;
 	*data = mm->data_vm + mm->stack_vm;
-	*resident = *shared + get_mm_counter(mm, MM_ANONPAGES);
+	*resident = *shared + get_mm_counter_sum(mm, MM_ANONPAGES);
 	return mm->total_vm;
 }
 
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 2e5500e215c6..8ade65d07fb1 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2711,7 +2711,7 @@ static inline s64 mm_counter_sum(struct mm_struct *mm, int member)
 	return percpu_counter_atomic_read(fbc);
 }
 
-static inline s64 mm_counter_sum_positive(struct mm_struct *mm, int member)
+static inline s64 get_mm_counter_sum(struct mm_struct *mm, int member)
 {
 	struct percpu_counter *fbc = &mm->rss_stat[member];
 
diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h
index 28b9d6958724..aeb5e3965ab8 100644
--- a/include/trace/events/kmem.h
+++ b/include/trace/events/kmem.h
@@ -402,7 +402,7 @@ TRACE_EVENT(rss_stat,
 		__entry->mm_id = mm_ptr_to_hash(mm);
 		__entry->curr = !!(current->mm == mm);
 		__entry->member = member;
-		__entry->size = (mm_counter_sum_positive(mm, member)
+		__entry->size = (get_mm_counter_sum(mm, member)
 							<< PAGE_SHIFT);
 	),
 
-- 
2.43.0
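A back-of-the-envelope for the error magnitude described above
(assuming the stock batch of max(32, 2 * num_online_cpus()) computed by
compute_batch_value(), and a hypothetical 256-CPU machine): each CPU may
cache just under one batch of pages before folding into the global
count, so the global count can lag by roughly 256 * 512 = 131072 pages,
which at a 64K page size is 8 GiB, easily enough to show 0 'RES' for
ordinary processes. A minimal model of the two read paths (illustrative
only, not the kernel's percpu_counter code):

#include <stdio.h>

#define MODEL_CPUS 4

/* Simplified stand-in for struct percpu_counter. */
struct pcpu_model {
	long count;            /* fbc->count: the folded-in global value */
	long pcpu[MODEL_CPUS]; /* fbc->counters: cached per-CPU deltas   */
};

/* Modeled on percpu_counter_read_positive(): fast but possibly stale. */
static long read_fast(const struct pcpu_model *c)
{
	return c->count > 0 ? c->count : 0;
}

/* Modeled on percpu_counter_sum_positive(): folds in the cached deltas. */
static long read_sum(const struct pcpu_model *c)
{
	long sum = c->count;

	for (int cpu = 0; cpu < MODEL_CPUS; cpu++)
		sum += c->pcpu[cpu];
	return sum > 0 ? sum : 0;
}

int main(void)
{
	/* A small process whose whole RSS still sits in per-CPU caches. */
	struct pcpu_model rss = { .count = 0, .pcpu = { 100, 80, 0, 12 } };

	printf("RES via fast read: %ld pages\n", read_fast(&rss)); /* 0   */
	printf("RES via full sum:  %ld pages\n", read_sum(&rss));  /* 192 */
	return 0;
}

This is why the fix routes /proc/<pid>/status and /proc/<pid>/statm
through the summing helper while leaving hot paths on the batched fast
path.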
FeedBack: The patch(es) which you have sent to kernel@openeuler.org mailing list has been converted to a pull request successfully!
Pull request link: https://atomgit.com/openeuler/kernel/merge_requests/20500
Mailing list address: https://mailweb.openeuler.org/archives/list/kernel@openeuler.org/message/6WQ...