
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/IC97W5

----------------------------------------

The will-it-scale test case signal1 [1] was run, and the results reveal
that the signal-sending system call does not scale linearly. To
investigate further, we ran a series of tests launching varying numbers
of Docker containers and monitored the throughput of each one. The
results are as follows:

| Dockers    | 1      | 4      | 8      | 16     | 32     | 64     |
| Throughput | 380068 | 353204 | 308948 | 306453 | 180659 | 129152 |

The data shows a clear trend: as the number of containers increases,
the per-container throughput progressively declines.

Analysis identified the root cause of this degradation: the ucounts
module tracks rlimit usage with atomic operations, and when many CPUs
perform these atomic operations on the same variable, they trigger a
large number of cache misses and remote accesses, which ultimately
causes the performance drop.

To address this, convert the atomic rlimit counters to percpu_counter.
With the optimization applied, throughput no longer declines as the
number of Docker containers increases:

| Dockers    | 1      | 4      | 8      | 16     | 32     | 64     |
| Throughput | 374737 | 376377 | 374814 | 379284 | 374950 | 377509 |

[1] https://github.com/antonblanchard/will-it-scale/blob/master/tests/

Signed-off-by: Chen Ridong <chenridong@huawei.com>
---
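[ Note for reviewers, not intended for the changelog: the conversion
  relies on the generic percpu_counter API. The sketch below is a
  minimal illustration of the pattern used throughout this patch; the
  demo_* names are hypothetical and do not appear in the diff. The key
  property is that the fast path of percpu_counter_limited_add() only
  updates a CPU-local delta, so the shared cacheline is touched only
  when the accumulated deltas might cross the limit, which is what
  removes the contention measured above. ]

#include <linux/init.h>
#include <linux/percpu_counter.h>

static struct percpu_counter demo_counter;

static int __init demo_counter_init(void)
{
	/* allocates the per-CPU slots; can fail, unlike an atomic_long_t */
	return percpu_counter_init(&demo_counter, 0, GFP_KERNEL);
}

static bool demo_charge(long amount, long limit)
{
	/*
	 * Fast path updates only this CPU's delta; the shared count (and
	 * its spinlock) is touched only when the batched deltas might
	 * cross 'limit'. Returns false if the add would exceed the limit.
	 */
	return percpu_counter_limited_add(&demo_counter, limit, amount);
}

static void demo_uncharge(long amount)
{
	percpu_counter_sub(&demo_counter, amount);
}

static s64 demo_read_exact(void)
{
	/* folds all per-CPU deltas: exact, but O(nr_cpus); avoid hot paths */
	return percpu_counter_sum(&demo_counter);
}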
 include/linux/user_namespace.h | 16 ++++--
 init/main.c                    |  1 +
 ipc/mqueue.c                   |  6 +--
 kernel/signal.c                | 11 ++---
 kernel/ucount.c                | 89 +++++++++++++++++++++-------------
 mm/mlock.c                     |  5 +-
 6 files changed, 75 insertions(+), 53 deletions(-)

diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index d504d506a70f1..0f6cf35c831f7 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -13,6 +13,7 @@
 #include <linux/sysctl.h>
 #include <linux/err.h>
 #include <linux/kabi.h>
+#include <linux/percpu_counter.h>
 
 #define UID_GID_MAP_MAX_BASE_EXTENTS 5
 #define UID_GID_MAP_MAX_EXTENTS 340
@@ -121,7 +122,7 @@ struct ucounts {
 	rcuref_t count;
 	atomic_long_t freed;
 	atomic_long_t ucount[UCOUNT_COUNTS];
-	atomic_long_t rlimit[UCOUNT_RLIMIT_COUNTS];
+	struct percpu_counter rlimit[UCOUNT_RLIMIT_COUNTS];
 };
 
 extern struct user_namespace init_user_ns;
@@ -133,6 +134,7 @@ struct ucounts *inc_ucount(struct user_namespace *ns, kuid_t uid, enum ucount_ty
 void dec_ucount(struct ucounts *ucounts, enum ucount_type type);
 struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid);
 void put_ucounts(struct ucounts *ucounts);
+void __init ucounts_init(void);
 
 static inline struct ucounts * __must_check get_ucounts(struct ucounts *ucounts)
 {
@@ -143,13 +145,17 @@ static inline struct ucounts * __must_check get_ucounts(struct ucounts *ucounts)
 
 static inline long get_rlimit_value(struct ucounts *ucounts, enum rlimit_type type)
 {
-	return atomic_long_read(&ucounts->rlimit[type]);
+	return percpu_counter_sum(&ucounts->rlimit[type]);
 }
 
-long inc_rlimit_ucounts(struct ucounts *ucounts, enum rlimit_type type, long v);
-bool dec_rlimit_ucounts(struct ucounts *ucounts, enum rlimit_type type, long v);
+bool inc_rlimit_ucounts_limit(struct ucounts *ucounts, enum rlimit_type type, long v, long limit);
+static inline bool inc_rlimit_ucounts(struct ucounts *ucounts, enum rlimit_type type, long v)
+{
+	return inc_rlimit_ucounts_limit(ucounts, type, v, LONG_MAX);
+}
+void dec_rlimit_ucounts(struct ucounts *ucounts, enum rlimit_type type, long v);
 long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum rlimit_type type,
-			    bool override_rlimit);
+			    bool override_rlimit, long limit);
 void dec_rlimit_put_ucounts(struct ucounts *ucounts, enum rlimit_type type);
 bool is_rlimit_overlimit(struct ucounts *ucounts, enum rlimit_type type, unsigned long max);
 
diff --git a/init/main.c b/init/main.c
index 8fdfa69dba0fa..02a2c5d9be671 100644
--- a/init/main.c
+++ b/init/main.c
@@ -1050,6 +1050,7 @@ void start_kernel(void)
 	efi_enter_virtual_mode();
 #endif
 	thread_stack_cache_init();
+	ucounts_init();
 	cred_init();
 	fork_init();
 	proc_caches_init();
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index ba8215ed663a4..a910c93bea08a 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -371,11 +371,9 @@ static struct inode *mqueue_get_inode(struct super_block *sb,
 		mq_bytes += mq_treesize;
 		info->ucounts = get_ucounts(current_ucounts());
 		if (info->ucounts) {
-			long msgqueue;
-
 			spin_lock(&mq_lock);
-			msgqueue = inc_rlimit_ucounts(info->ucounts, UCOUNT_RLIMIT_MSGQUEUE, mq_bytes);
-			if (msgqueue == LONG_MAX || msgqueue > rlimit(RLIMIT_MSGQUEUE)) {
+			if (!inc_rlimit_ucounts_limit(info->ucounts, UCOUNT_RLIMIT_MSGQUEUE,
+						      mq_bytes, rlimit(RLIMIT_MSGQUEUE))) {
 				dec_rlimit_ucounts(info->ucounts, UCOUNT_RLIMIT_MSGQUEUE, mq_bytes);
 				spin_unlock(&mq_lock);
 				put_ucounts(info->ucounts);
diff --git a/kernel/signal.c b/kernel/signal.c
index c73873d67a63f..c75ef0e3f5264 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -429,17 +429,14 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t gfp_flags,
 	rcu_read_lock();
 	ucounts = task_ucounts(t);
 	sigpending = inc_rlimit_get_ucounts(ucounts, UCOUNT_RLIMIT_SIGPENDING,
-					    override_rlimit);
+					    override_rlimit, task_rlimit(t, RLIMIT_SIGPENDING));
 	rcu_read_unlock();
-	if (!sigpending)
-		return NULL;
-
-	if (override_rlimit || likely(sigpending <= task_rlimit(t, RLIMIT_SIGPENDING))) {
-		q = kmem_cache_alloc(sigqueue_cachep, gfp_flags);
-	} else {
+	if (!sigpending) {
 		print_dropped_signal(sig);
+		return NULL;
 	}
 
+	q = kmem_cache_alloc(sigqueue_cachep, gfp_flags);
 	if (unlikely(q == NULL)) {
 		dec_rlimit_put_ucounts(ucounts, UCOUNT_RLIMIT_SIGPENDING);
 	} else {
diff --git a/kernel/ucount.c b/kernel/ucount.c
index 1e300184f5edb..bdaa6261b7cae 100644
--- a/kernel/ucount.c
+++ b/kernel/ucount.c
@@ -175,11 +175,17 @@ struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid)
 	new->ns = ns;
 	new->uid = uid;
 	rcuref_init(&new->count, 1);
+	if (percpu_counter_init_many(&new->rlimit[0], 0, GFP_KERNEL_ACCOUNT,
+				     UCOUNT_RLIMIT_COUNTS)) {
+		kfree(new);
+		return NULL;
+	}
 
 	spin_lock_irq(&ucounts_lock);
 	ucounts = find_ucounts(ns, uid, hashent);
 	if (ucounts) {
 		spin_unlock_irq(&ucounts_lock);
+		percpu_counter_destroy_many(&new->rlimit[0], UCOUNT_RLIMIT_COUNTS);
 		kfree(new);
 		return ucounts;
 	}
@@ -202,7 +208,7 @@ static bool rlimits_are_zero(struct ucounts *ucounts)
 
 	for (int i = 0; i < sizeof(rtypes) / sizeof(int); ++i) {
 		rtype = rtypes[i];
-		if (atomic_long_read(&ucounts->rlimit[rtype]) > 0)
+		if (get_rlimit_value(ucounts, rtype) > 0)
 			return false;
 	}
 	return true;
@@ -230,7 +236,7 @@ static void free_ucounts(struct ucounts *ucounts)
 	spin_lock_irqsave(&ucounts_lock, flags);
 	hlist_nulls_del_rcu(&ucounts->node);
 	spin_unlock_irqrestore(&ucounts_lock, flags);
-
+	percpu_counter_destroy_many(&ucounts->rlimit[0], UCOUNT_RLIMIT_COUNTS);
 	put_user_ns(ucounts->ns);
 	kfree_rcu(ucounts, rcu);
 }
@@ -294,36 +300,35 @@ void dec_ucount(struct ucounts *ucounts, enum ucount_type type)
 	put_ucounts(ucounts);
 }
 
-long inc_rlimit_ucounts(struct ucounts *ucounts, enum rlimit_type type, long v)
+bool inc_rlimit_ucounts_limit(struct ucounts *ucounts, enum rlimit_type type,
+			      long v, long limit)
 {
 	struct ucounts *iter;
 	long max = LONG_MAX;
-	long ret = 0;
+	bool good = true;
 
 	for (iter = ucounts; iter; iter = iter->ns->ucounts) {
-		long new = atomic_long_add_return(v, &iter->rlimit[type]);
-		if (new < 0 || new > max)
-			ret = LONG_MAX;
-		else if (iter == ucounts)
-			ret = new;
+		max = min(limit, max);
+		if (!percpu_counter_limited_add(&iter->rlimit[type], max, v))
+			good = false;
+
 		max = get_userns_rlimit_max(iter->ns, type);
 	}
-	return ret;
+	return good;
 }
 
-bool dec_rlimit_ucounts(struct ucounts *ucounts, enum rlimit_type type, long v)
+void dec_rlimit_ucounts(struct ucounts *ucounts, enum rlimit_type type, long v)
 {
 	struct ucounts *iter;
-	long new = -1; /* Silence compiler warning */
-	for (iter = ucounts; iter; iter = iter->ns->ucounts) {
-		long dec = atomic_long_sub_return(v, &iter->rlimit[type]);
-		WARN_ON_ONCE(dec < 0);
-		if (iter == ucounts)
-			new = dec;
-	}
-	return (new == 0);
+
+	for (iter = ucounts; iter; iter = iter->ns->ucounts)
+		percpu_counter_sub(&iter->rlimit[type], v);
 }
 
+/*
+ * inc_rlimit_get_ucounts() does not grab the refcount.
+ * dec_rlimit_put_ucounts() should be called every time the rlimit is
+ * decremented.
+ */
 static void do_dec_rlimit_put_ucounts(struct ucounts *ucounts,
 				      struct ucounts *last, enum rlimit_type type)
 {
@@ -332,8 +337,7 @@ static void do_dec_rlimit_put_ucounts(struct ucounts *ucounts,
 		bool to_free;
 
 		rcu_read_lock();
-		long dec = atomic_long_sub_return(1, &iter->rlimit[type]);
-		WARN_ON_ONCE(dec < 0);
+		percpu_counter_sub(&iter->rlimit[type], 1);
 		next = iter->ns->ucounts;
 		to_free = ucounts_can_be_freed(iter);
 		rcu_read_unlock();
@@ -348,29 +352,37 @@ void dec_rlimit_put_ucounts(struct ucounts *ucounts, enum rlimit_type type)
 	do_dec_rlimit_put_ucounts(ucounts, NULL, type);
 }
 
+/*
+ * Though this function does not grab the refcount, it is promised that the
+ * ucounts will not be freed as long as there are any rlimit pins on it.
+ * The caller must hold a reference to ucounts or be under rcu_read_lock().
+ *
+ * Return 1 if the increment succeeds, otherwise return 0.
+ */
 long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum rlimit_type type,
-			    bool override_rlimit)
+			    bool override_rlimit, long limit)
 {
-	/* Caller must hold a reference to ucounts */
 	struct ucounts *iter;
 	long max = LONG_MAX;
-	long dec, ret = 0;
+	long ret = 0;
+	long in_limit = limit;
+
+	if (override_rlimit)
+		in_limit = LONG_MAX;
 
 	for (iter = ucounts; iter; iter = iter->ns->ucounts) {
-		long new = atomic_long_add_return(1, &iter->rlimit[type]);
-		if (new < 0 || new > max)
+		/* Cannot exceed the passed-in limit or the ns->rlimit_max */
+		max = min(in_limit, max);
+		if (!percpu_counter_limited_add(&iter->rlimit[type], max, 1))
 			goto dec_unwind;
-		if (iter == ucounts)
-			ret = new;
+
+		if (!override_rlimit)
 			max = get_userns_rlimit_max(iter->ns, type);
 	}
-	return ret;
+	return 1;
 dec_unwind:
-	dec = atomic_long_sub_return(1, &iter->rlimit[type]);
-	WARN_ON_ONCE(dec < 0);
 	do_dec_rlimit_put_ucounts(ucounts, iter, type);
-	return 0;
+	return ret;
 }
 
 bool is_rlimit_overlimit(struct ucounts *ucounts, enum rlimit_type type, unsigned long rlimit)
@@ -379,15 +391,24 @@ bool is_rlimit_overlimit(struct ucounts *ucounts, enum rlimit_type type, unsigne
 	long max = rlimit;
 	if (rlimit > LONG_MAX)
 		max = LONG_MAX;
+
 	for (iter = ucounts; iter; iter = iter->ns->ucounts) {
-		long val = get_rlimit_value(iter, type);
-		if (val < 0 || val > max)
+		/* returns > 0 if iter->rlimit[type] > max */
+		if (percpu_counter_compare(&iter->rlimit[type], max) > 0)
 			return true;
+
 		max = get_userns_rlimit_max(iter->ns, type);
 	}
 	return false;
 }
 
+void __init ucounts_init(void)
+{
+	if (percpu_counter_init_many(&init_ucounts.rlimit[0], 0, GFP_KERNEL,
+				     UCOUNT_RLIMIT_COUNTS))
+		panic("Cannot create init_ucounts rlimit counters");
+}
+
 static __init int user_namespace_sysctl_init(void)
 {
 #ifdef CONFIG_SYSCTL
diff --git a/mm/mlock.c b/mm/mlock.c
index cd0997d89c7c5..65e5c40c26795 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -825,7 +825,6 @@ static DEFINE_SPINLOCK(shmlock_user_lock);
 int user_shm_lock(size_t size, struct ucounts *ucounts)
 {
 	unsigned long lock_limit, locked;
-	long memlock;
 	int allowed = 0;
 
 	locked = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
@@ -833,9 +832,9 @@ int user_shm_lock(size_t size, struct ucounts *ucounts)
 	if (lock_limit != RLIM_INFINITY)
 		lock_limit >>= PAGE_SHIFT;
 	spin_lock(&shmlock_user_lock);
-	memlock = inc_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, locked);
 
-	if ((memlock == LONG_MAX || memlock > lock_limit) && !capable(CAP_IPC_LOCK)) {
+	if (!inc_rlimit_ucounts_limit(ucounts, UCOUNT_RLIMIT_MEMLOCK, locked, lock_limit)
+	    && !capable(CAP_IPC_LOCK)) {
 		dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, locked);
 		goto out;
 	}
-- 
2.34.1