
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IC97W5 ---------------------------------------- The will-it-scale test case signal1 [1] has been observed, and the test results reveal that the signal sending system call lacks linearity. To further investigate this issue, we initiated a series of tests by launching varying numbers of dockers and closely monitored the throughput of each individual docker. The detailed test outcomes are presented as follows: | Dockers |1 |4 |8 |16 |32 |64 | | Throughput |380068 |353204 |308948 |306453 |180659 |129152 | The data clearly demonstrates a discernible trend: as the quantity of dockers increases, the throughput per container progressively declines. In-depth analysis has identified the root cause of this performance degradation. The ucounts module conducts statistics on rlimit, which involves a significant number of atomic operations. These atomic operations, when acting on the same variable, trigger a substantial number of cache misses or remote accesses, ultimately resulting in a drop in performance. To address the above issues, this patch converts the atomic rlimit to a percpu_counter. Summing up the percpu counters is expensive. To overcome this, this patch modifies the conditions for freeing ucounts. Instead of complex checks regarding whether a pending signal is the first or the last one, the ucounts can now be freed only when both the refcount and the rlimits are zero. 
After the optimization, the performance data is shown below, demonstrating that the throughput no longer declines as the number of Docker containers increases: | Dockers |1 |4 |8 |16 |32 |64 | | Throughput |374737 |376377 |374814 |379284 |374950 |377509 | [1] https://github.com/antonblanchard/will-it-scale/blob/master/tests/ Signed-off-by: Chen Ridong <chenridong@huawei.com> --- include/linux/user_namespace.h | 17 ++++ init/main.c | 1 + kernel/ucount.c | 172 ++++++++++++++++++++++++++++++++- 3 files changed, 188 insertions(+), 2 deletions(-) diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index 8ec4a694d1f2a..908d1bba7db17 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -13,6 +13,9 @@ #include <linux/sysctl.h> #include <linux/err.h> #include <linux/kabi.h> +#ifdef CONFIG_UCOUNTS_PERCPU_COUNTER +#include <linux/percpu_counter.h> +#endif #define UID_GID_MAP_MAX_BASE_EXTENTS 5 #define UID_GID_MAP_MAX_EXTENTS 340 @@ -120,7 +123,12 @@ struct ucounts { struct rcu_head rcu; rcuref_t count; atomic_long_t ucount[UCOUNT_COUNTS]; +#ifndef CONFIG_UCOUNTS_PERCPU_COUNTER atomic_long_t rlimit[UCOUNT_RLIMIT_COUNTS]; +#else + struct percpu_counter rlimit[UCOUNT_RLIMIT_COUNTS]; + atomic_long_t freed; +#endif }; extern struct user_namespace init_user_ns; @@ -132,6 +140,11 @@ struct ucounts *inc_ucount(struct user_namespace *ns, kuid_t uid, enum ucount_ty void dec_ucount(struct ucounts *ucounts, enum ucount_type type); struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid); void put_ucounts(struct ucounts *ucounts); +#ifdef CONFIG_UCOUNTS_PERCPU_COUNTER +void __init ucounts_init(void); +#else +static inline void __init ucounts_init(void) { } +#endif static inline struct ucounts * __must_check get_ucounts(struct ucounts *ucounts) { @@ -142,7 +155,11 @@ static inline struct ucounts * __must_check get_ucounts(struct ucounts *ucounts) static inline long get_rlimit_value(struct ucounts *ucounts, enum rlimit_type 
type) { +#ifdef CONFIG_UCOUNTS_PERCPU_COUNTER + return percpu_counter_sum(&ucounts->rlimit[type]); +#else return atomic_long_read(&ucounts->rlimit[type]); +#endif } long inc_rlimit_ucounts_limit(struct ucounts *ucounts, enum rlimit_type type, long v, long limit); diff --git a/init/main.c b/init/main.c index 8fdfa69dba0fa..02a2c5d9be671 100644 --- a/init/main.c +++ b/init/main.c @@ -1050,6 +1050,7 @@ void start_kernel(void) efi_enter_virtual_mode(); #endif thread_stack_cache_init(); + ucounts_init(); cred_init(); fork_init(); proc_caches_init(); diff --git a/kernel/ucount.c b/kernel/ucount.c index 778279318e1dd..20145f12ee3a8 100644 --- a/kernel/ucount.c +++ b/kernel/ucount.c @@ -171,7 +171,13 @@ struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid) new = kzalloc(sizeof(*new), GFP_KERNEL); if (!new) return NULL; - +#ifdef CONFIG_UCOUNTS_PERCPU_COUNTER + if (percpu_counter_init_many(&new->rlimit[0], 0, GFP_KERNEL_ACCOUNT, + UCOUNT_RLIMIT_COUNTS)) { + kfree(new); + return NULL; + } +#endif new->ns = ns; new->uid = uid; rcuref_init(&new->count, 1); @@ -180,6 +186,9 @@ struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid) ucounts = find_ucounts(ns, uid, hashent); if (ucounts) { spin_unlock_irq(&ucounts_lock); +#ifdef CONFIG_UCOUNTS_PERCPU_COUNTER + percpu_counter_destroy_many(&new->rlimit[0], UCOUNT_RLIMIT_COUNTS); +#endif kfree(new); return ucounts; } @@ -190,6 +199,65 @@ struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid) return new; } +#ifdef CONFIG_UCOUNTS_PERCPU_COUNTER +/* + * Whether all the rlimits are zero. + * For now, only UCOUNT_RLIMIT_SIGPENDING is considered. + * Other rlimit can be added. 
+ */ +static bool rlimits_are_zero(struct ucounts *ucounts) +{ + int rtypes[] = { UCOUNT_RLIMIT_SIGPENDING }; + int rtype; + + for (int i = 0; i < sizeof(rtypes) / sizeof(int); ++i) { + rtype = rtypes[i]; + if (get_rlimit_value(ucounts, rtype) > 0) + return false; + } + return true; +} + +/* + * Ucounts can be freed only when the ucounts->count is released + * and the rlimits are zero. + * The caller should hold rcu_read_lock(). + */ +static bool ucounts_can_be_freed(struct ucounts *ucounts) +{ + if (rcuref_read(&ucounts->count) > 0) + return false; + if (!rlimits_are_zero(ucounts)) + return false; + /* Prevent double free */ + return atomic_long_cmpxchg(&ucounts->freed, 0, 1) == 0; +} + +static void free_ucounts(struct ucounts *ucounts) +{ + unsigned long flags; + + spin_lock_irqsave(&ucounts_lock, flags); + hlist_nulls_del_rcu(&ucounts->node); + spin_unlock_irqrestore(&ucounts_lock, flags); + percpu_counter_destroy_many(&ucounts->rlimit[0], UCOUNT_RLIMIT_COUNTS); + put_user_ns(ucounts->ns); + kfree_rcu(ucounts, rcu); +} + +void put_ucounts(struct ucounts *ucounts) +{ + rcu_read_lock(); + if (rcuref_put(&ucounts->count) && + ucounts_can_be_freed(ucounts)) { + rcu_read_unlock(); + free_ucounts(ucounts); + return; + } + rcu_read_unlock(); +} +#else + void put_ucounts(struct ucounts *ucounts) { unsigned long flags; @@ -203,6 +271,7 @@ void put_ucounts(struct ucounts *ucounts) kfree_rcu(ucounts, rcu); } } +#endif // CONFIG_UCOUNTS_PERCPU_COUNTER static inline bool atomic_long_inc_below(atomic_long_t *v, int u) { @@ -251,6 +320,105 @@ void dec_ucount(struct ucounts *ucounts, enum ucount_type type) put_ucounts(ucounts); } +#ifdef CONFIG_UCOUNTS_PERCPU_COUNTER +/* Return 1 if the increment is successful, otherwise return LONG_MAX. 
 */ +long inc_rlimit_ucounts_limit(struct ucounts *ucounts, enum rlimit_type type, + long v, long limit) +{ + struct ucounts *iter; + long max = LONG_MAX; + bool over_limit = false; + + for (iter = ucounts; iter; iter = iter->ns->ucounts) { + max = min(limit, max); + if (!percpu_counter_limited_add(&iter->rlimit[type], max, v)) + over_limit = true; + + max = get_userns_rlimit_max(iter->ns, type); + } + + if (over_limit) + return LONG_MAX; + return 1; +} + +bool dec_rlimit_ucounts(struct ucounts *ucounts, enum rlimit_type type, long v) +{ + struct ucounts *iter; + + for (iter = ucounts; iter; iter = iter->ns->ucounts) + percpu_counter_sub(&iter->rlimit[type], v); + return false; +} + +/* + * The inc_rlimit_get_ucounts does not grab the refcount. + * The rlimit_release should be called every time the rlimit is decremented. + */ +static void do_dec_rlimit_put_ucounts(struct ucounts *ucounts, + struct ucounts *last, enum rlimit_type type) +{ + struct ucounts *iter, *next; + + for (iter = ucounts; iter != last; iter = next) { + bool to_free; + + rcu_read_lock(); + percpu_counter_sub(&iter->rlimit[type], 1); + next = iter->ns->ucounts; + to_free = ucounts_can_be_freed(iter); + rcu_read_unlock(); + /* If ucounts->count is zero and the rlimits are zero, free ucounts */ + if (to_free) + free_ucounts(iter); + } +} + +void dec_rlimit_put_ucounts(struct ucounts *ucounts, enum rlimit_type type) +{ + do_dec_rlimit_put_ucounts(ucounts, NULL, type); +} + +/* + * Though this function does not grab the refcount, it is promised that the + * ucounts will not be freed as long as there are any rlimit pins on it. + * Caller must hold a reference to ucounts or be under rcu_read_lock(). + * + * Return 1 if the increment is successful, otherwise return 0. 
+ */ +long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum rlimit_type type, + bool override_rlimit, long limit) +{ + struct ucounts *iter; + long max = LONG_MAX; + long in_limit = limit; + + if (override_rlimit) + in_limit = LONG_MAX; + + for (iter = ucounts; iter; iter = iter->ns->ucounts) { + /* Cannot exceed the given limit or the ns->rlimit_max */ + max = min(in_limit, max); + if (!percpu_counter_limited_add(&iter->rlimit[type], max, 1)) + goto dec_unwind; + + if (!override_rlimit) + max = get_userns_rlimit_max(iter->ns, type); + } + return 1; +dec_unwind: + do_dec_rlimit_put_ucounts(ucounts, iter, type); + return 0; +} + +void __init ucounts_init(void) +{ + if (percpu_counter_init_many(&init_ucounts.rlimit[0], 0, GFP_KERNEL, + UCOUNT_RLIMIT_COUNTS)) + panic("Cannot create init_ucounts rlimit counters"); +} +#else + long inc_rlimit_ucounts_limit(struct ucounts *ucounts, enum rlimit_type type, long v, long limit) { @@ -332,7 +500,7 @@ long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum rlimit_type type, do_dec_rlimit_put_ucounts(ucounts, iter, type); return 0; } - +#endif bool is_rlimit_overlimit(struct ucounts *ucounts, enum rlimit_type type, unsigned long rlimit) { struct ucounts *iter; -- 2.34.1