
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/IC97W5

----------------------------------------

The will-it-scale test case signal1 [1] was run, and the results reveal
that the signal-sending system call does not scale linearly. To
investigate further, we ran a series of tests launching varying numbers
of Docker containers and monitored the throughput of each one. The
results are as follows:

| Dockers    | 1      | 4      | 8      | 16     | 32     | 64     |
| Throughput | 380068 | 353204 | 308948 | 306453 | 180659 | 129152 |

The data shows a clear trend: as the number of containers increases,
the per-container throughput progressively declines.

Analysis identified the root cause of this degradation: the ucounts
module tracks rlimit usage with atomic operations, and when many CPUs
perform these atomic operations on the same variable, they trigger a
large number of cache misses and remote accesses, which ultimately
causes the performance drop.

To address this, convert the atomic rlimit counters to percpu_counter.
With the optimization applied, throughput no longer declines as the
number of Docker containers increases:

| Dockers    | 1      | 4      | 8      | 16     | 32     | 64     |
| Throughput | 374737 | 376377 | 374814 | 379284 | 374950 | 377509 |

[1] https://github.com/antonblanchard/will-it-scale/blob/master/tests/

Signed-off-by: Chen Ridong <chenridong@huawei.com>
---
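[ Note for reviewers, not intended for the changelog: the conversion
  relies on the generic percpu_counter API. The sketch below is a
  minimal illustration of the pattern used throughout this patch; the
  demo_* names are hypothetical and do not appear in the diff. The key
  property is that the fast path of percpu_counter_limited_add() only
  updates a CPU-local delta, so the shared cacheline is touched only
  when the accumulated deltas might cross the limit, which is what
  removes the contention measured above. ]

#include <linux/init.h>
#include <linux/percpu_counter.h>

static struct percpu_counter demo_counter;

static int __init demo_counter_init(void)
{
	/* allocates the per-CPU slots; can fail, unlike an atomic_long_t */
	return percpu_counter_init(&demo_counter, 0, GFP_KERNEL);
}

static bool demo_charge(long amount, long limit)
{
	/*
	 * Fast path updates only this CPU's delta; the shared count (and
	 * its spinlock) is touched only when the batched deltas might
	 * cross 'limit'. Returns false if the add would exceed the limit.
	 */
	return percpu_counter_limited_add(&demo_counter, limit, amount);
}

static void demo_uncharge(long amount)
{
	percpu_counter_sub(&demo_counter, amount);
}

static s64 demo_read_exact(void)
{
	/* folds all per-CPU deltas: exact, but O(nr_cpus); avoid hot paths */
	return percpu_counter_sum(&demo_counter);
}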
 include/linux/user_namespace.h | 16 ++++--
 init/main.c                    |  1 +
 ipc/mqueue.c                   |  6 +--
 kernel/signal.c                | 11 ++---
 kernel/ucount.c                | 89 +++++++++++++++++++++-------------
 mm/mlock.c                     |  5 +-
 6 files changed, 75 insertions(+), 53 deletions(-)

diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index d504d506a70f1..0f6cf35c831f7 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -13,6 +13,7 @@
 #include <linux/sysctl.h>
 #include <linux/err.h>
 #include <linux/kabi.h>
+#include <linux/percpu_counter.h>
 
 #define UID_GID_MAP_MAX_BASE_EXTENTS 5
 #define UID_GID_MAP_MAX_EXTENTS 340
@@ -121,7 +122,7 @@ struct ucounts {
 	rcuref_t count;
 	atomic_long_t freed;
 	atomic_long_t ucount[UCOUNT_COUNTS];
-	atomic_long_t rlimit[UCOUNT_RLIMIT_COUNTS];
+	struct percpu_counter rlimit[UCOUNT_RLIMIT_COUNTS];
 };
 
 extern struct user_namespace init_user_ns;
@@ -133,6 +134,7 @@ struct ucounts *inc_ucount(struct user_namespace *ns, kuid_t uid, enum ucount_ty
 void dec_ucount(struct ucounts *ucounts, enum ucount_type type);
 struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid);
 void put_ucounts(struct ucounts *ucounts);
+void __init ucounts_init(void);
 
 static inline struct ucounts * __must_check get_ucounts(struct ucounts *ucounts)
 {
@@ -143,13 +145,17 @@ static inline struct ucounts * __must_check get_ucounts(struct ucounts *ucounts)
 
 static inline long get_rlimit_value(struct ucounts *ucounts, enum rlimit_type type)
 {
-	return atomic_long_read(&ucounts->rlimit[type]);
+	return percpu_counter_sum(&ucounts->rlimit[type]);
 }
 
-long inc_rlimit_ucounts(struct ucounts *ucounts, enum rlimit_type type, long v);
-bool dec_rlimit_ucounts(struct ucounts *ucounts, enum rlimit_type type, long v);
+bool inc_rlimit_ucounts_limit(struct ucounts *ucounts, enum rlimit_type type, long v, long limit);
+static inline bool inc_rlimit_ucounts(struct ucounts *ucounts, enum rlimit_type type, long v)
+{
+	return inc_rlimit_ucounts_limit(ucounts, type, v, LONG_MAX);
+}
+void dec_rlimit_ucounts(struct ucounts *ucounts, enum rlimit_type type, long v);
 long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum rlimit_type type,
-			    bool override_rlimit);
+			    bool override_rlimit, long limit);
 void dec_rlimit_put_ucounts(struct ucounts *ucounts, enum rlimit_type type);
 bool is_rlimit_overlimit(struct ucounts *ucounts, enum rlimit_type type, unsigned long max);
 
diff --git a/init/main.c b/init/main.c
index 8fdfa69dba0fa..02a2c5d9be671 100644
--- a/init/main.c
+++ b/init/main.c
@@ -1050,6 +1050,7 @@ void start_kernel(void)
 	efi_enter_virtual_mode();
 #endif
 	thread_stack_cache_init();
+	ucounts_init();
 	cred_init();
 	fork_init();
 	proc_caches_init();
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index ba8215ed663a4..a910c93bea08a 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -371,11 +371,9 @@ static struct inode *mqueue_get_inode(struct super_block *sb,
 		mq_bytes += mq_treesize;
 		info->ucounts = get_ucounts(current_ucounts());
 		if (info->ucounts) {
-			long msgqueue;
-
 			spin_lock(&mq_lock);
-			msgqueue = inc_rlimit_ucounts(info->ucounts, UCOUNT_RLIMIT_MSGQUEUE, mq_bytes);
-			if (msgqueue == LONG_MAX || msgqueue > rlimit(RLIMIT_MSGQUEUE)) {
+			if (!inc_rlimit_ucounts_limit(info->ucounts, UCOUNT_RLIMIT_MSGQUEUE,
+						      mq_bytes, rlimit(RLIMIT_MSGQUEUE))) {
 				dec_rlimit_ucounts(info->ucounts, UCOUNT_RLIMIT_MSGQUEUE, mq_bytes);
 				spin_unlock(&mq_lock);
 				put_ucounts(info->ucounts);
diff --git a/kernel/signal.c b/kernel/signal.c
index c73873d67a63f..c75ef0e3f5264 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -429,17 +429,14 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t gfp_flags,
 	rcu_read_lock();
 	ucounts = task_ucounts(t);
 	sigpending = inc_rlimit_get_ucounts(ucounts, UCOUNT_RLIMIT_SIGPENDING,
-					    override_rlimit);
+					    override_rlimit, task_rlimit(t, RLIMIT_SIGPENDING));
 	rcu_read_unlock();
-	if (!sigpending)
-		return NULL;
-
-	if (override_rlimit || likely(sigpending <= task_rlimit(t, RLIMIT_SIGPENDING))) {
-		q = kmem_cache_alloc(sigqueue_cachep, gfp_flags);
-	} else {
+	if (!sigpending) {
 		print_dropped_signal(sig);
+		return NULL;
 	}
 
+	q = kmem_cache_alloc(sigqueue_cachep, gfp_flags);
 	if (unlikely(q == NULL)) {
 		dec_rlimit_put_ucounts(ucounts, UCOUNT_RLIMIT_SIGPENDING);
 	} else {
diff --git a/kernel/ucount.c b/kernel/ucount.c
index 1e300184f5edb..bdaa6261b7cae 100644
--- a/kernel/ucount.c
+++ b/kernel/ucount.c
@@ -175,11 +175,17 @@ struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid)
 	new->ns = ns;
 	new->uid = uid;
 	rcuref_init(&new->count, 1);
+	if (percpu_counter_init_many(&new->rlimit[0], 0, GFP_KERNEL_ACCOUNT,
+				     UCOUNT_RLIMIT_COUNTS)) {
+		kfree(new);
+		return NULL;
+	}
 
 	spin_lock_irq(&ucounts_lock);
 	ucounts = find_ucounts(ns, uid, hashent);
 	if (ucounts) {
 		spin_unlock_irq(&ucounts_lock);
+		percpu_counter_destroy_many(&new->rlimit[0], UCOUNT_RLIMIT_COUNTS);
 		kfree(new);
 		return ucounts;
 	}
@@ -202,7 +208,7 @@ static bool rlimits_are_zero(struct ucounts *ucounts)
 
 	for (int i = 0; i < sizeof(rtypes) / sizeof(int); ++i) {
 		rtype = rtypes[i];
-		if (atomic_long_read(&ucounts->rlimit[rtype]) > 0)
+		if (get_rlimit_value(ucounts, rtype) > 0)
 			return false;
 	}
 	return true;
@@ -230,7 +236,7 @@ static void free_ucounts(struct ucounts *ucounts)
 	spin_lock_irqsave(&ucounts_lock, flags);
 	hlist_nulls_del_rcu(&ucounts->node);
 	spin_unlock_irqrestore(&ucounts_lock, flags);
-
+	percpu_counter_destroy_many(&ucounts->rlimit[0], UCOUNT_RLIMIT_COUNTS);
 	put_user_ns(ucounts->ns);
 	kfree_rcu(ucounts, rcu);
 }
@@ -294,36 +300,35 @@ void dec_ucount(struct ucounts *ucounts, enum ucount_type type)
 	put_ucounts(ucounts);
 }
 
-long inc_rlimit_ucounts(struct ucounts *ucounts, enum rlimit_type type, long v)
+bool inc_rlimit_ucounts_limit(struct ucounts *ucounts, enum rlimit_type type,
+			      long v, long limit)
 {
 	struct ucounts *iter;
 	long max = LONG_MAX;
-	long ret = 0;
+	bool good = true;
 
 	for (iter = ucounts; iter; iter = iter->ns->ucounts) {
-		long new = atomic_long_add_return(v, &iter->rlimit[type]);
-		if (new < 0 || new > max)
-			ret = LONG_MAX;
-		else if (iter == ucounts)
-			ret = new;
+		max = min(limit, max);
+		if (!percpu_counter_limited_add(&iter->rlimit[type], max, v))
+			good = false;
+
 		max = get_userns_rlimit_max(iter->ns, type);
 	}
-	return ret;
+	return good;
 }
 
-bool dec_rlimit_ucounts(struct ucounts *ucounts, enum rlimit_type type, long v)
+void dec_rlimit_ucounts(struct ucounts *ucounts, enum rlimit_type type, long v)
 {
 	struct ucounts *iter;
-	long new = -1; /* Silence compiler warning */
-	for (iter = ucounts; iter; iter = iter->ns->ucounts) {
-		long dec = atomic_long_sub_return(v, &iter->rlimit[type]);
-		WARN_ON_ONCE(dec < 0);
-		if (iter == ucounts)
-			new = dec;
-	}
-	return (new == 0);
+
+	for (iter = ucounts; iter; iter = iter->ns->ucounts)
+		percpu_counter_sub(&iter->rlimit[type], v);
 }
 
+/*
+ * inc_rlimit_get_ucounts() does not grab the refcount.
+ * dec_rlimit_put_ucounts() should be called every time the rlimit is
+ * decremented.
+ */
 static void do_dec_rlimit_put_ucounts(struct ucounts *ucounts,
 				      struct ucounts *last, enum rlimit_type type)
 {
@@ -332,8 +337,7 @@ static void do_dec_rlimit_put_ucounts(struct ucounts *ucounts,
 		bool to_free;
 
 		rcu_read_lock();
-		long dec = atomic_long_sub_return(1, &iter->rlimit[type]);
-		WARN_ON_ONCE(dec < 0);
+		percpu_counter_sub(&iter->rlimit[type], 1);
 		next = iter->ns->ucounts;
 		to_free = ucounts_can_be_freed(iter);
 		rcu_read_unlock();
@@ -348,29 +352,37 @@ void dec_rlimit_put_ucounts(struct ucounts *ucounts, enum rlimit_type type)
 	do_dec_rlimit_put_ucounts(ucounts, NULL, type);
 }
 
+/*
+ * Though this function does not grab the refcount, it is promised that the
+ * ucounts will not be freed as long as there are any rlimit pins on it.
+ * The caller must hold a reference to ucounts or be under rcu_read_lock().
+ *
+ * Return 1 if the increment succeeds, otherwise return 0.
+ */
 long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum rlimit_type type,
-			    bool override_rlimit)
+			    bool override_rlimit, long limit)
 {
-	/* Caller must hold a reference to ucounts */
 	struct ucounts *iter;
 	long max = LONG_MAX;
-	long dec, ret = 0;
+	long ret = 0;
+	long in_limit = limit;
+
+	if (override_rlimit)
+		in_limit = LONG_MAX;
 
 	for (iter = ucounts; iter; iter = iter->ns->ucounts) {
-		long new = atomic_long_add_return(1, &iter->rlimit[type]);
-		if (new < 0 || new > max)
+		/* Cannot exceed the passed-in limit or the ns->rlimit_max */
+		max = min(in_limit, max);
+		if (!percpu_counter_limited_add(&iter->rlimit[type], max, 1))
 			goto dec_unwind;
-		if (iter == ucounts)
-			ret = new;
+
+		if (!override_rlimit)
 			max = get_userns_rlimit_max(iter->ns, type);
 	}
-	return ret;
+	return 1;
 dec_unwind:
-	dec = atomic_long_sub_return(1, &iter->rlimit[type]);
-	WARN_ON_ONCE(dec < 0);
 	do_dec_rlimit_put_ucounts(ucounts, iter, type);
-	return 0;
+	return ret;
 }
 
 bool is_rlimit_overlimit(struct ucounts *ucounts, enum rlimit_type type, unsigned long rlimit)
@@ -379,15 +391,24 @@ bool is_rlimit_overlimit(struct ucounts *ucounts, enum rlimit_type type, unsigne
 	long max = rlimit;
 	if (rlimit > LONG_MAX)
 		max = LONG_MAX;
+
 	for (iter = ucounts; iter; iter = iter->ns->ucounts) {
-		long val = get_rlimit_value(iter, type);
-		if (val < 0 || val > max)
+		/* returns > 0 if iter->rlimit[type] > max */
+		if (percpu_counter_compare(&iter->rlimit[type], max) > 0)
 			return true;
+
 		max = get_userns_rlimit_max(iter->ns, type);
 	}
 	return false;
 }
 
+void __init ucounts_init(void)
+{
+	if (percpu_counter_init_many(&init_ucounts.rlimit[0], 0, GFP_KERNEL,
+				     UCOUNT_RLIMIT_COUNTS))
+		panic("Cannot create init_ucounts rlimit counters");
+}
+
 static __init int user_namespace_sysctl_init(void)
 {
 #ifdef CONFIG_SYSCTL
diff --git a/mm/mlock.c b/mm/mlock.c
index cd0997d89c7c5..65e5c40c26795 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -825,7 +825,6 @@ static DEFINE_SPINLOCK(shmlock_user_lock);
 int user_shm_lock(size_t size, struct ucounts *ucounts)
 {
 	unsigned long lock_limit, locked;
-	long memlock;
 	int allowed = 0;
 
 	locked = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
@@ -833,9 +832,9 @@ int user_shm_lock(size_t size, struct ucounts *ucounts)
 	if (lock_limit != RLIM_INFINITY)
 		lock_limit >>= PAGE_SHIFT;
 	spin_lock(&shmlock_user_lock);
-	memlock = inc_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, locked);
 
-	if ((memlock == LONG_MAX || memlock > lock_limit) && !capable(CAP_IPC_LOCK)) {
+	if (!inc_rlimit_ucounts_limit(ucounts, UCOUNT_RLIMIT_MEMLOCK, locked, lock_limit)
+	    && !capable(CAP_IPC_LOCK)) {
 		dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, locked);
 		goto out;
 	}
-- 
2.34.1