
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IC97W5 ---------------------------------------- The will-it-scale test case signal1 [1] has been observed, and the test results reveal that the signal sending system call lacks linearity. To further investigate this issue, we initiated a series of tests by launching varying numbers of dockers and closely monitored the throughput of each individual docker. The detailed test outcomes are presented as follows: | Dockers |1 |4 |8 |16 |32 |64 | | Throughput |380068 |353204 |308948 |306453 |180659 |129152 | The data clearly demonstrates a discernible trend: as the quantity of dockers increases, the throughput per container progressively declines. In-depth analysis has identified the root cause of this performance degradation. The ucounts module conducts statistics on rlimit, which involves a significant number of atomic operations. These atomic operations, when acting on the same variable, trigger a substantial number of cache misses or remote accesses, ultimately resulting in a drop in performance. To address the above issues, this patch converts the atomic rlimit to a percpu_counter. Summing up the percpu counters is expensive. To overcome this, this patch modifies the conditions for freeing ucounts. Instead of complex checks regarding whether a pending signal is the first or the last one, the ucounts can now be freed only when both the refcount and the rlimits are zero. 
After the optimization, the performance data is shown below, demonstrating that the throughput no longer declines as the number of Docker containers increases: | Dockers |1 |4 |8 |16 |32 |64 | | Throughput |374737 |376377 |374814 |379284 |374950 |377509 | [1] https://github.com/antonblanchard/will-it-scale/blob/master/tests/ Signed-off-by: Chen Ridong <chenridong@huawei.com> --- include/linux/user_namespace.h | 17 ++++ init/main.c | 1 + kernel/ucount.c | 172 ++++++++++++++++++++++++++++++++- 3 files changed, 188 insertions(+), 2 deletions(-) diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index 8ec4a694d1f2a..908d1bba7db17 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -13,6 +13,9 @@ #include <linux/sysctl.h> #include <linux/err.h> #include <linux/kabi.h> +#ifdef CONFIG_UCOUNTS_PERCPU_COUNTER +#include <linux/percpu_counter.h> +#endif #define UID_GID_MAP_MAX_BASE_EXTENTS 5 #define UID_GID_MAP_MAX_EXTENTS 340 @@ -120,7 +123,12 @@ struct ucounts { struct rcu_head rcu; rcuref_t count; atomic_long_t ucount[UCOUNT_COUNTS]; +#ifndef CONFIG_UCOUNTS_PERCPU_COUNTER atomic_long_t rlimit[UCOUNT_RLIMIT_COUNTS]; +#else + struct percpu_counter rlimit[UCOUNT_RLIMIT_COUNTS]; + atomic_long_t freed; +#endif }; extern struct user_namespace init_user_ns; @@ -132,6 +140,11 @@ struct ucounts *inc_ucount(struct user_namespace *ns, kuid_t uid, enum ucount_ty void dec_ucount(struct ucounts *ucounts, enum ucount_type type); struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid); void put_ucounts(struct ucounts *ucounts); +#ifdef CONFIG_UCOUNTS_PERCPU_COUNTER +void __init ucounts_init(void); +#else +static inline void __init ucounts_init(void) { } +#endif static inline struct ucounts * __must_check get_ucounts(struct ucounts *ucounts) { @@ -142,7 +155,11 @@ static inline struct ucounts * __must_check get_ucounts(struct ucounts *ucounts) static inline long get_rlimit_value(struct ucounts *ucounts, enum rlimit_type 
type) { +#ifdef CONFIG_UCOUNTS_PERCPU_COUNTER + return percpu_counter_sum(&ucounts->rlimit[type]); +#else return atomic_long_read(&ucounts->rlimit[type]); +#endif } long inc_rlimit_ucounts_limit(struct ucounts *ucounts, enum rlimit_type type, long v, long limit); diff --git a/init/main.c b/init/main.c index 8fdfa69dba0fa..02a2c5d9be671 100644 --- a/init/main.c +++ b/init/main.c @@ -1050,6 +1050,7 @@ void start_kernel(void) efi_enter_virtual_mode(); #endif thread_stack_cache_init(); + ucounts_init(); cred_init(); fork_init(); proc_caches_init(); diff --git a/kernel/ucount.c b/kernel/ucount.c index 778279318e1dd..20145f12ee3a8 100644 --- a/kernel/ucount.c +++ b/kernel/ucount.c @@ -171,7 +171,13 @@ struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid) new = kzalloc(sizeof(*new), GFP_KERNEL); if (!new) return NULL; - +#ifdef CONFIG_UCOUNTS_PERCPU_COUNTER + if (percpu_counter_init_many(&new->rlimit[0], 0, GFP_KERNEL_ACCOUNT, + UCOUNT_RLIMIT_COUNTS)) { + kfree(new); + return NULL; + } +#endif new->ns = ns; new->uid = uid; rcuref_init(&new->count, 1); @@ -180,6 +186,9 @@ struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid) ucounts = find_ucounts(ns, uid, hashent); if (ucounts) { spin_unlock_irq(&ucounts_lock); +#ifdef CONFIG_UCOUNTS_PERCPU_COUNTER + percpu_counter_destroy_many(&new->rlimit[0], UCOUNT_RLIMIT_COUNTS); +#endif kfree(new); return ucounts; } @@ -190,6 +199,65 @@ struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid) return new; } +#ifdef CONFIG_UCOUNTS_PERCPU_COUNTER +/* + * Whether all the rlimits are zero. + * For now, only UCOUNT_RLIMIT_SIGPENDING is considered. + * Other rlimit can be added. 
+ */ +static bool rlimits_are_zero(struct ucounts *ucounts) +{ + int rtypes[] = { UCOUNT_RLIMIT_SIGPENDING }; + int rtype; + + for (int i = 0; i < sizeof(rtypes) / sizeof(int); ++i) { + rtype = rtypes[i]; + if (get_rlimit_value(ucounts, rtype) > 0) + return false; + } + return true; +} + +/* + * Ucounts can be freed only when the ucounts->count is released + * and the rlimits are zero. + * The caller should hold rcu_read_lock(). + */ +static bool ucounts_can_be_freed(struct ucounts *ucounts) +{ + if (rcuref_read(&ucounts->count) > 0) + return false; + if (!rlimits_are_zero(ucounts)) + return false; + /* Prevent double free */ + return atomic_long_cmpxchg(&ucounts->freed, 0, 1) == 0; +} + +static void free_ucounts(struct ucounts *ucounts) +{ + unsigned long flags; + + spin_lock_irqsave(&ucounts_lock, flags); + hlist_nulls_del_rcu(&ucounts->node); + spin_unlock_irqrestore(&ucounts_lock, flags); + percpu_counter_destroy_many(&ucounts->rlimit[0], UCOUNT_RLIMIT_COUNTS); + put_user_ns(ucounts->ns); + kfree_rcu(ucounts, rcu); +} + +void put_ucounts(struct ucounts *ucounts) +{ + rcu_read_lock(); + if (rcuref_put(&ucounts->count) && + ucounts_can_be_freed(ucounts)) { + rcu_read_unlock(); + free_ucounts(ucounts); + return; + } + rcu_read_unlock(); +} +#else + void put_ucounts(struct ucounts *ucounts) { unsigned long flags; @@ -203,6 +271,7 @@ void put_ucounts(struct ucounts *ucounts) kfree_rcu(ucounts, rcu); } } +#endif // CONFIG_UCOUNTS_PERCPU_COUNTER static inline bool atomic_long_inc_below(atomic_long_t *v, int u) { @@ -251,6 +320,105 @@ void dec_ucount(struct ucounts *ucounts, enum ucount_type type) put_ucounts(ucounts); } +#ifdef CONFIG_UCOUNTS_PERCPU_COUNTER +/* Return 1 if the increment is successful, otherwise return LONG_MAX. 
 */ +long inc_rlimit_ucounts_limit(struct ucounts *ucounts, enum rlimit_type type, + long v, long limit) +{ + struct ucounts *iter; + long max = LONG_MAX; + bool over_limit = false; + + for (iter = ucounts; iter; iter = iter->ns->ucounts) { + max = min(limit, max); + if (!percpu_counter_limited_add(&iter->rlimit[type], max, v)) + over_limit = true; + + max = get_userns_rlimit_max(iter->ns, type); + } + + if (over_limit) + return LONG_MAX; + return 1; +} + +bool dec_rlimit_ucounts(struct ucounts *ucounts, enum rlimit_type type, long v) +{ + struct ucounts *iter; + + for (iter = ucounts; iter; iter = iter->ns->ucounts) + percpu_counter_sub(&iter->rlimit[type], v); + return false; +} + +/* + * The inc_rlimit_get_ucounts does not grab the refcount. + * The rlimit_release should be called every time the rlimit is decremented. + */ +static void do_dec_rlimit_put_ucounts(struct ucounts *ucounts, + struct ucounts *last, enum rlimit_type type) +{ + struct ucounts *iter, *next; + + for (iter = ucounts; iter != last; iter = next) { + bool to_free; + + rcu_read_lock(); + percpu_counter_sub(&iter->rlimit[type], 1); + next = iter->ns->ucounts; + to_free = ucounts_can_be_freed(iter); + rcu_read_unlock(); + /* If ucounts->count is zero and the rlimits are zero, free ucounts */ + if (to_free) + free_ucounts(iter); + } +} + +void dec_rlimit_put_ucounts(struct ucounts *ucounts, enum rlimit_type type) +{ + do_dec_rlimit_put_ucounts(ucounts, NULL, type); +} + +/* + * Though this function does not grab the refcount, it is promised that the + * ucounts will not be freed as long as there are any rlimit pins on it. + * Caller must hold a reference to ucounts or be under rcu_read_lock(). + * + * Return 1 if the increment is successful, otherwise return 0. 
+ */ +long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum rlimit_type type, + bool override_rlimit, long limit) +{ + struct ucounts *iter; + long max = LONG_MAX; + long in_limit = limit; + + if (override_rlimit) + in_limit = LONG_MAX; + + for (iter = ucounts; iter; iter = iter->ns->ucounts) { + /* Cannot exceed the given limit or the ns->rlimit_max */ + max = min(in_limit, max); + if (!percpu_counter_limited_add(&iter->rlimit[type], max, 1)) + goto dec_unwind; + + if (!override_rlimit) + max = get_userns_rlimit_max(iter->ns, type); + } + return 1; +dec_unwind: + do_dec_rlimit_put_ucounts(ucounts, iter, type); + return 0; +} + +void __init ucounts_init(void) +{ + if (percpu_counter_init_many(&init_ucounts.rlimit[0], 0, GFP_KERNEL, + UCOUNT_RLIMIT_COUNTS)) + panic("Cannot create init_ucounts rlimit counters"); +} +#else + long inc_rlimit_ucounts_limit(struct ucounts *ucounts, enum rlimit_type type, long v, long limit) { @@ -332,7 +500,7 @@ long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum rlimit_type type, do_dec_rlimit_put_ucounts(ucounts, iter, type); return 0; } - +#endif bool is_rlimit_overlimit(struct ucounts *ucounts, enum rlimit_type type, unsigned long rlimit) { struct ucounts *iter; -- 2.34.1