From: ZhangPeng zhangpeng362@huawei.com
Since commit f1a7941243c1 ("mm: convert mm's rss stats into percpu_counter"), the rss_stats have been converted to percpu_counter, which changes the error margin from (nr_threads * 64) to approximately (nr_cpus ^ 2): each CPU can hold up to percpu_counter_batch unflushed events, and the batch size itself scales with the number of CPUs. However, the new percpu allocation in mm_init() causes a performance regression on fork/exec/shell. Even after commit 14ef95be6f55 ("kernel/fork: group allocation/free of per-cpu counters for mm struct"), the performance of fork/exec/shell is still poor compared to previous kernel versions.
To mitigate the performance regression, we delay the allocation of percpu memory for rss_stats. Therefore, we convert mm's rss stats to use percpu_counter atomic mode. For single-thread processes, rss_stat stays in atomic mode, which avoids the memory consumption and performance regression caused by the percpu allocation. For multi-threaded processes, rss_stat is switched to percpu mode to reduce the error margin. We convert rss_stats from atomic mode to percpu mode only when the second thread is created.
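Condensed from the patches below, the core of the change looks like this (mm_counter_switch_to_pcpu() and percpu_counter_atomic_add() are helpers introduced by this series):

        /* kernel/fork.c, copy_mm(): switch when the second thread is created */
        if (clone_flags & CLONE_THREAD)
                if (mm_counter_switch_to_pcpu(oldmm))
                        return -ENOMEM;

        /* include/linux/mm.h: counter updates pick the current mode at run time */
        static inline void add_mm_counter(struct mm_struct *mm, int member, long value)
        {
                struct percpu_counter *fbc = &mm->rss_stat[member];

                if (percpu_counter_initialized(fbc))
                        percpu_counter_add(fbc, value);
                else
                        percpu_counter_atomic_add(fbc, value);

                mm_trace_rss_stat(mm, member);
        }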
In lmbench tests we get a 2% ~ 4% performance improvement for fork_proc/exec_proc/shell_proc and a 6.7% performance improvement for page_fault (measured before batch mode [1]).
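The figures come from the standard lmbench latency benchmarks; the exact invocations are not recorded here, but they correspond to programs along the lines of (illustrative only):

        lat_proc fork
        lat_proc exec
        lat_proc shell
        lat_pagefault <file>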
The test results are as follows:

             base         base+revert       base+this patch
fork_proc    416.3ms      400.0ms  (3.9%)   398.6ms  (4.2%)
exec_proc    2095.9ms     2061.1ms (1.7%)   2047.7ms (2.3%)
shell_proc   3028.2ms     2954.7ms (2.4%)   2961.2ms (2.2%)
page_fault   0.3603ms     0.3358ms (6.8%)   0.3361ms (6.7%)
[1] https://lore.kernel.org/all/20240412064751.119015-1-wangkefeng.wang@huawei.c...
ChangeLog:
v1->v2:
- Split patch 2 into two patches.
ZhangPeng (3):
  percpu_counter: introduce atomic mode for percpu_counter
  mm: convert mm's rss stats to use atomic mode
  mm: introduce cmdline to disable mm counter atomic mode
 include/linux/mm.h             | 50 +++++++++++++++++++++++++++++-----
 include/linux/percpu_counter.h | 48 ++++++++++++++++++++++++++++++--
 include/trace/events/kmem.h    |  4 +--
 kernel/fork.c                  | 46 ++++++++++++++++++++++++++++---
 lib/percpu_counter.c           | 35 ++++++++++++++++++++++--
 5 files changed, 165 insertions(+), 18 deletions(-)
From: ZhangPeng zhangpeng362@huawei.com
maillist inclusion
category: performance
bugzilla: https://gitee.com/openeuler/kernel/issues/I9IA1I
CVE: NA
Reference: https://lore.kernel.org/all/20240418142008.2775308-1-zhangpeng362@huawei.com...
--------------------------------
Depending on whether counters is NULL, we can support two modes: atomic mode and percpu mode. We implement both modes by grouping the s64 count and the atomic64_t count_atomic in a union. At the same time, we add interfaces for adding and reading in atomic mode, and for switching from atomic mode to percpu mode.
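A rough usage sketch of the new interface (hypothetical caller; fbc is assumed to start with counters == NULL, as mm->rss_stat does in the next patch):

        struct percpu_counter *fbc = ...;       /* counters == NULL: atomic mode */
        s64 val;

        percpu_counter_atomic_add(fbc, 1);
        val = percpu_counter_atomic_read(fbc);

        /* switch to percpu mode once precision/scalability matters */
        if (percpu_counter_switch_to_pcpu_many(fbc, 1))
                return -ENOMEM;         /* fbc stays usable in atomic mode */

        /* from here on, the regular percpu_counter interfaces apply */
        percpu_counter_add(fbc, 1);
        val = percpu_counter_sum(fbc);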
Suggested-by: Jan Kara jack@suse.cz
Signed-off-by: ZhangPeng zhangpeng362@huawei.com
Signed-off-by: Kefeng Wang wangkefeng.wang@huawei.com
---
 include/linux/percpu_counter.h | 48 +++++++++++++++++++++++++++++++---
 lib/percpu_counter.c           | 35 +++++++++++++++++++++++--
 2 files changed, 78 insertions(+), 5 deletions(-)
diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h
index d01351b1526f..1a0f25a27d7b 100644
--- a/include/linux/percpu_counter.h
+++ b/include/linux/percpu_counter.h
@@ -21,7 +21,18 @@
 struct percpu_counter {
         raw_spinlock_t lock;
-        s64 count;
+        /*
+         * Depending on whether counters is NULL, we can support two modes,
+         * atomic mode using count_atomic and percpu mode using count.
+         * The single-thread processes should use atomic mode to reduce the
+         * memory consumption and performance regression.
+         * The multiple-thread processes should use percpu mode to reduce the
+         * error margin.
+         */
+        union {
+                s64 count;
+                atomic64_t count_atomic;
+        };
 #ifdef CONFIG_HOTPLUG_CPU
         struct list_head list;  /* All percpu_counters are on a list */
 #endif
@@ -32,14 +43,14 @@ extern int percpu_counter_batch;
 
 int __percpu_counter_init_many(struct percpu_counter *fbc, s64 amount,
                                gfp_t gfp, u32 nr_counters,
-                               struct lock_class_key *key);
+                               struct lock_class_key *key, bool switch_mode);
 
 #define percpu_counter_init_many(fbc, value, gfp, nr_counters)         \
         ({                                                              \
                 static struct lock_class_key __key;                     \
                                                                         \
                 __percpu_counter_init_many(fbc, value, gfp, nr_counters,\
-                                           &__key);                     \
+                                           &__key, false);              \
         })
@@ -121,6 +132,20 @@ static inline bool percpu_counter_initialized(struct percpu_counter *fbc)
         return (fbc->counters != NULL);
 }
 
+static inline s64 percpu_counter_atomic_read(struct percpu_counter *fbc)
+{
+        return atomic64_read(&fbc->count_atomic);
+}
+
+static inline void percpu_counter_atomic_add(struct percpu_counter *fbc,
+                                             s64 amount)
+{
+        atomic64_add(amount, &fbc->count_atomic);
+}
+
+int percpu_counter_switch_to_pcpu_many(struct percpu_counter *fbc,
+                                       u32 nr_counters);
+
 #else /* !CONFIG_SMP */
 
 struct percpu_counter {
@@ -230,6 +255,23 @@ static inline bool percpu_counter_initialized(struct percpu_counter *fbc)
 static inline void percpu_counter_sync(struct percpu_counter *fbc)
 {
 }
+
+static inline s64 percpu_counter_atomic_read(struct percpu_counter *fbc)
+{
+        return fbc->count;
+}
+
+static inline void percpu_counter_atomic_add(struct percpu_counter *fbc,
+                                             s64 amount)
+{
+        percpu_counter_add(fbc, amount);
+}
+
+static inline int percpu_counter_switch_to_pcpu_many(struct percpu_counter *fbc,
+                                                     u32 nr_counters)
+{
+        return 0;
+}
 #endif /* CONFIG_SMP */
 
 static inline void percpu_counter_inc(struct percpu_counter *fbc)
diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c
index 9073430dc865..7d2eaba4db1d 100644
--- a/lib/percpu_counter.c
+++ b/lib/percpu_counter.c
@@ -153,7 +153,7 @@ EXPORT_SYMBOL(__percpu_counter_sum);
 
 int __percpu_counter_init_many(struct percpu_counter *fbc, s64 amount,
                                gfp_t gfp, u32 nr_counters,
-                               struct lock_class_key *key)
+                               struct lock_class_key *key, bool switch_mode)
 {
         unsigned long flags __maybe_unused;
         size_t counter_size;
@@ -174,7 +174,8 @@ int __percpu_counter_init_many(struct percpu_counter *fbc, s64 amount,
 #ifdef CONFIG_HOTPLUG_CPU
                 INIT_LIST_HEAD(&fbc[i].list);
 #endif
-                fbc[i].count = amount;
+                if (likely(!switch_mode))
+                        fbc[i].count = amount;
                 fbc[i].counters = (void *)counters + (i * counter_size);
 
                 debug_percpu_counter_activate(&fbc[i]);
@@ -278,6 +279,36 @@ int __percpu_counter_compare(struct percpu_counter *fbc, s64 rhs, s32 batch)
 }
 EXPORT_SYMBOL(__percpu_counter_compare);
 
+/*
+ * percpu_counter_switch_to_pcpu_many: Converts struct percpu_counters from
+ * atomic mode to percpu mode.
+ *
+ * Return: 0 if percpu_counter is already in percpu mode or successfully
+ * switched to percpu mode; -ENOMEM if percpu memory allocation fails and
+ * percpu_counter is still in atomic mode.
+ */
+int percpu_counter_switch_to_pcpu_many(struct percpu_counter *fbc,
+                                       u32 nr_counters)
+{
+        static struct lock_class_key __key;
+        unsigned long flags;
+        int ret = 0;
+
+        if (percpu_counter_initialized(fbc))
+                return 0;
+
+        preempt_disable();
+        local_irq_save(flags);
+        if (likely(!percpu_counter_initialized(fbc)))
+                ret = __percpu_counter_init_many(fbc, 0,
+                                GFP_ATOMIC|__GFP_NOWARN|__GFP_ZERO,
+                                nr_counters, &__key, true);
+        local_irq_restore(flags);
+        preempt_enable();
+
+        return ret;
+}
+
 static int __init percpu_counter_startup(void)
 {
         int ret;
From: ZhangPeng zhangpeng362@huawei.com
maillist inclusion
category: performance
bugzilla: https://gitee.com/openeuler/kernel/issues/I9IA1I
CVE: NA
Reference: https://lore.kernel.org/all/20240418142008.2775308-1-zhangpeng362@huawei.com...
--------------------------------
Since commit f1a7941243c1 ("mm: convert mm's rss stats into percpu_counter"), the rss_stats have been converted to percpu_counter, which changes the error margin from (nr_threads * 64) to approximately (nr_cpus ^ 2). However, the new percpu allocation in mm_init() causes a performance regression on fork/exec/shell. Even after commit 14ef95be6f55 ("kernel/fork: group allocation/free of per-cpu counters for mm struct"), the performance of fork/exec/shell is still poor compared to previous kernel versions.
To mitigate the performance regression, we delay the allocation of percpu memory for rss_stats. Therefore, we convert mm's rss stats to use percpu_counter atomic mode. For single-thread processes, rss_stat stays in atomic mode, which avoids the memory consumption and performance regression caused by the percpu allocation. For multi-threaded processes, rss_stat is switched to percpu mode to reduce the error margin. We convert rss_stats from atomic mode to percpu mode only when the second thread is created.
In lmbench tests we get a 2% ~ 4% performance improvement for fork_proc/exec_proc/shell_proc and a 6.7% performance improvement for page_fault (measured before batch mode [1]).
The test results are as follows:
             base         base+revert       base+this patch
fork_proc    416.3ms      400.0ms  (3.9%)   398.6ms  (4.2%)
exec_proc    2095.9ms     2061.1ms (1.7%)   2047.7ms (2.3%)
shell_proc   3028.2ms     2954.7ms (2.4%)   2961.2ms (2.2%)
page_fault   0.3603ms     0.3358ms (6.8%)   0.3361ms (6.7%)
[1] https://lore.kernel.org/all/20240412064751.119015-1-wangkefeng.wang@huawei.c...
Suggested-by: Jan Kara jack@suse.cz
Signed-off-by: ZhangPeng zhangpeng362@huawei.com
Signed-off-by: Kefeng Wang wangkefeng.wang@huawei.com
---
 include/linux/mm.h          | 50 +++++++++++++++++++++++++++++++------
 include/trace/events/kmem.h |  4 +--
 kernel/fork.c               | 20 +++++++++------
 3 files changed, 57 insertions(+), 17 deletions(-)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 46c7b073824c..9166008da1c4 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2581,30 +2581,66 @@ static inline bool get_user_page_fast_only(unsigned long addr,
  */
 static inline unsigned long get_mm_counter(struct mm_struct *mm, int member)
 {
-        return percpu_counter_read_positive(&mm->rss_stat[member]);
+        struct percpu_counter *fbc = &mm->rss_stat[member];
+
+        if (percpu_counter_initialized(fbc))
+                return percpu_counter_read_positive(fbc);
+
+        return percpu_counter_atomic_read(fbc);
 }
 
 void mm_trace_rss_stat(struct mm_struct *mm, int member);
 
 static inline void add_mm_counter(struct mm_struct *mm, int member, long value)
 {
-        percpu_counter_add(&mm->rss_stat[member], value);
+        struct percpu_counter *fbc = &mm->rss_stat[member];
+
+        if (percpu_counter_initialized(fbc))
+                percpu_counter_add(fbc, value);
+        else
+                percpu_counter_atomic_add(fbc, value);
 
         mm_trace_rss_stat(mm, member);
 }
 
 static inline void inc_mm_counter(struct mm_struct *mm, int member)
 {
-        percpu_counter_inc(&mm->rss_stat[member]);
-
-        mm_trace_rss_stat(mm, member);
+        add_mm_counter(mm, member, 1);
 }
 
 static inline void dec_mm_counter(struct mm_struct *mm, int member)
 {
-        percpu_counter_dec(&mm->rss_stat[member]);
+        add_mm_counter(mm, member, -1);
+}
 
-        mm_trace_rss_stat(mm, member);
+static inline s64 mm_counter_sum(struct mm_struct *mm, int member)
+{
+        struct percpu_counter *fbc = &mm->rss_stat[member];
+
+        if (percpu_counter_initialized(fbc))
+                return percpu_counter_sum(fbc);
+
+        return percpu_counter_atomic_read(fbc);
+}
+
+static inline s64 mm_counter_sum_positive(struct mm_struct *mm, int member)
+{
+        struct percpu_counter *fbc = &mm->rss_stat[member];
+
+        if (percpu_counter_initialized(fbc))
+                return percpu_counter_sum_positive(fbc);
+
+        return percpu_counter_atomic_read(fbc);
+}
+
+static inline int mm_counter_switch_to_pcpu(struct mm_struct *mm)
+{
+        return percpu_counter_switch_to_pcpu_many(mm->rss_stat, NR_MM_COUNTERS);
+}
+
+static inline void mm_counter_destroy(struct mm_struct *mm)
+{
+        percpu_counter_destroy_many(mm->rss_stat, NR_MM_COUNTERS);
 }
 
 /* Optimized variant when folio is already known not to be anon */
diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h
index 58688768ef0f..be39ca5af0ba 100644
--- a/include/trace/events/kmem.h
+++ b/include/trace/events/kmem.h
@@ -361,8 +361,8 @@ TRACE_EVENT(rss_stat,
                 __entry->mm_id = mm_ptr_to_hash(mm);
                 __entry->curr = !!(current->mm == mm);
                 __entry->member = member;
-                __entry->size = (percpu_counter_sum_positive(&mm->rss_stat[member])
-                                    << PAGE_SHIFT);
+                __entry->size = (mm_counter_sum_positive(mm, member)
+                                    << PAGE_SHIFT);
         ),
 
         TP_printk("mm_id=%u curr=%d type=%s size=%ldB",
diff --git a/kernel/fork.c b/kernel/fork.c
index 43c7a00bb935..d75c18892eb6 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -853,7 +853,7 @@ static void check_mm(struct mm_struct *mm)
                          "Please make sure 'struct resident_page_types[]' is updated as well");
 
         for (i = 0; i < NR_MM_COUNTERS; i++) {
-                long x = percpu_counter_sum(&mm->rss_stat[i]);
+                long x = mm_counter_sum(mm, i);
 
                 if (unlikely(x))
                         pr_alert("BUG: Bad rss-counter state mm:%p type:%s val:%ld\n",
@@ -954,7 +954,7 @@ void __mmdrop(struct mm_struct *mm)
         put_user_ns(mm->user_ns);
         mm_pasid_drop(mm);
         mm_destroy_cid(mm);
-        percpu_counter_destroy_many(mm->rss_stat, NR_MM_COUNTERS);
+        mm_counter_destroy(mm);
 
         free_mm(mm);
 }
@@ -1357,17 +1357,11 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
         if (mm_alloc_cid(mm))
                 goto fail_cid;
 
-        if (percpu_counter_init_many(mm->rss_stat, 0, GFP_KERNEL_ACCOUNT,
-                                     NR_MM_COUNTERS))
-                goto fail_pcpu;
-
         sp_init_mm(mm);
         mm->user_ns = get_user_ns(user_ns);
         lru_gen_init_mm(mm);
         return mm;
 
-fail_pcpu:
-        mm_destroy_cid(mm);
 fail_cid:
         destroy_context(mm);
 fail_nocontext:
@@ -1783,6 +1777,16 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
         if (!oldmm)
                 return 0;
 
+        /*
+         * For single-thread processes, rss_stat is in atomic mode, which
+         * reduces the memory consumption and performance regression caused by
+         * using percpu. For multiple-thread processes, rss_stat is switched to
+         * the percpu mode to reduce the error margin.
+         */
+        if (clone_flags & CLONE_THREAD)
+                if (mm_counter_switch_to_pcpu(oldmm))
+                        return -ENOMEM;
+
         if (clone_flags & CLONE_VM) {
                 mmget(oldmm);
                 mm = oldmm;
From: ZhangPeng zhangpeng362@huawei.com
hulk inclusion
category: performance
bugzilla: https://gitee.com/openeuler/kernel/issues/I9IA1I
---------------------------
Depending on whether counters is NULL, we can support two modes for the mm counter: atomic mode and percpu mode. Currently, mm counter atomic mode is enabled by default. Introduce the cmdline parameter disable_mm_counter_atomic to disable mm counter atomic mode, which changes mm_counter_atomic_enable from true to false.
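For example, atomic mode can be turned off by appending the parameter to the kernel command line (the boot entry below is only an illustration):

        linux /vmlinuz root=/dev/sda1 disable_mm_counter_atomic

With the parameter present, mm_counter_init() allocates the rss_stat percpu counters at mm_init() time, as was done before this series.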
Signed-off-by: ZhangPeng zhangpeng362@huawei.com
---
 kernel/fork.c | 34 ++++++++++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)
diff --git a/kernel/fork.c b/kernel/fork.c
index d75c18892eb6..9b16283a125c 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -154,6 +154,8 @@ int lockdep_tasklist_lock_is_held(void)
 EXPORT_SYMBOL_GPL(lockdep_tasklist_lock_is_held);
 #endif /* #ifdef CONFIG_PROVE_RCU */
 
+static bool __ro_after_init mm_counter_atomic_enable = true;
+
 int nr_processes(void)
 {
         int cpu;
@@ -1307,6 +1309,25 @@ static void mm_init_uprobes_state(struct mm_struct *mm)
 #endif
 }
 
+static __always_inline int mm_counter_init(struct mm_struct *mm)
+{
+        /*
+         * Depending on whether counters is NULL, we can support two modes for
+         * mm counter, atomic mode and percpu mode. Currently, the mm counter
+         * atomic mode is enabled by default. Introduce cmdline interface
+         * disable_mm_counter_atomic to disable mm counter atomic mode, which
+         * changes mm_counter_atomic_enable from true to false.
+         */
+        if (mm_counter_atomic_enable)
+                return 0;
+
+        if (percpu_counter_init_many(mm->rss_stat, 0, GFP_KERNEL_ACCOUNT,
+                                     NR_MM_COUNTERS))
+                return -ENOMEM;
+
+        return 0;
+}
+
 static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
         struct user_namespace *user_ns)
 {
@@ -1357,11 +1378,16 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
         if (mm_alloc_cid(mm))
                 goto fail_cid;
 
+        if (mm_counter_init(mm))
+                goto fail_pcpu;
+
         sp_init_mm(mm);
         mm->user_ns = get_user_ns(user_ns);
         lru_gen_init_mm(mm);
         return mm;
 
+fail_pcpu:
+        mm_destroy_cid(mm);
 fail_cid:
         destroy_context(mm);
 fail_nocontext:
@@ -3627,3 +3653,11 @@ int sysctl_max_threads(struct ctl_table *table, int write,
 
         return 0;
 }
+
+static int __init disable_mm_counter_atomic(char *buf)
+{
+        mm_counter_atomic_enable = false;
+
+        return 0;
+}
+early_param("disable_mm_counter_atomic", disable_mm_counter_atomic);
FeedBack: The patch(es) which you have sent to kernel@openeuler.org mailing list has been converted to a pull request successfully! Pull request link: https://gitee.com/openeuler/kernel/pulls/6435 Mailing list address: https://mailweb.openeuler.org/hyperkitty/list/kernel@openeuler.org/message/T...