From: Peter Zijlstra <peterz@infradead.org>
mainline inclusion
from mainline-v5.13-rc1
commit e40f74c535b8a0ecf3ef0388b51a34cdadb34fb5
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I6VS35
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
Introduce a cpumask that indicates (for each CPU) what direction the CPU hotplug is currently going. Notably, it tracks rollbacks: e.g. when an up fails and we do a roll-back down, it will accurately reflect the direction.
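For illustration only (not part of this patch), a hotplug-aware loop could consult the new predicate like so; the loop body is a hypothetical placeholder:

	int cpu;

	for_each_cpu(cpu, cpu_online_mask) {
		/* cpu_dying_mask tracks direction, including rollbacks, so
		 * a CPU whose failed offline was rolled back up reads as
		 * not dying here. */
		if (cpu_dying(cpu))
			continue;
		/* ... operate only on CPUs that are staying online ... */
	}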
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Valentin Schneider <valentin.schneider@arm.com>
Link: https://lkml.kernel.org/r/20210310150109.151441252@infradead.org
Signed-off-by: Zeng Heng <zengheng4@huawei.com>
---
 include/linux/cpumask.h | 19 +++++++++++++++++++
 kernel/cpu.c            |  6 ++++++
 2 files changed, 25 insertions(+)
diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index 0159986ac9ce..92d5ecad8de6 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -91,10 +91,12 @@ extern struct cpumask __cpu_possible_mask;
 extern struct cpumask __cpu_online_mask;
 extern struct cpumask __cpu_present_mask;
 extern struct cpumask __cpu_active_mask;
+extern struct cpumask __cpu_dying_mask;
 
 #define cpu_possible_mask ((const struct cpumask *)&__cpu_possible_mask)
 #define cpu_online_mask   ((const struct cpumask *)&__cpu_online_mask)
 #define cpu_present_mask  ((const struct cpumask *)&__cpu_present_mask)
 #define cpu_active_mask   ((const struct cpumask *)&__cpu_active_mask)
+#define cpu_dying_mask    ((const struct cpumask *)&__cpu_dying_mask)
 
 extern atomic_t __num_online_cpus;
 
@@ -118,6 +120,11 @@ static inline unsigned int num_online_cpus(void)
 #define cpu_possible(cpu)	cpumask_test_cpu((cpu), cpu_possible_mask)
 #define cpu_present(cpu)	cpumask_test_cpu((cpu), cpu_present_mask)
 #define cpu_active(cpu)		cpumask_test_cpu((cpu), cpu_active_mask)
+static inline int cpumask_test_cpu(int cpu, const struct cpumask *cpumask);
+static inline bool cpu_dying(unsigned int cpu)
+{
+	return cpumask_test_cpu(cpu, cpu_dying_mask);
+}
 #else
 #define num_online_cpus()	1U
 #define num_possible_cpus()	1U
@@ -127,6 +134,10 @@ static inline unsigned int num_online_cpus(void)
 #define cpu_possible(cpu)	((cpu) == 0)
 #define cpu_present(cpu)	((cpu) == 0)
 #define cpu_active(cpu)		((cpu) == 0)
+static inline bool cpu_dying(unsigned int cpu)
+{
+	return false;
+}
 #endif
 
 extern cpumask_t cpus_booted_once_mask;
 
@@ -851,6 +862,14 @@ set_cpu_active(unsigned int cpu, bool active)
 		cpumask_clear_cpu(cpu, &__cpu_active_mask);
 }
 
+static inline void
+set_cpu_dying(unsigned int cpu, bool dying)
+{
+	if (dying)
+		cpumask_set_cpu(cpu, &__cpu_dying_mask);
+	else
+		cpumask_clear_cpu(cpu, &__cpu_dying_mask);
+}
 
 /**
  * to_cpumask - convert an NR_CPUS bitmap to a struct cpumask *
  *
diff --git a/kernel/cpu.c b/kernel/cpu.c
index c06ced18f78a..2a81a9dae3a7 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -157,6 +157,9 @@ static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state,
 	int (*cb)(unsigned int cpu);
 	int ret, cnt;
 
+	if (cpu_dying(cpu) != !bringup)
+		set_cpu_dying(cpu, !bringup);
+
 	if (st->fail == state) {
 		st->fail = CPUHP_INVALID;
 
@@ -2502,6 +2505,9 @@ EXPORT_SYMBOL(__cpu_present_mask);
 struct cpumask __cpu_active_mask __read_mostly;
 EXPORT_SYMBOL(__cpu_active_mask);
 
+struct cpumask __cpu_dying_mask __read_mostly;
+EXPORT_SYMBOL(__cpu_dying_mask);
+
 atomic_t __num_online_cpus __read_mostly;
 EXPORT_SYMBOL(__num_online_cpus);
From: Yury Norov <yury.norov@gmail.com>
mainline inclusion
from mainline-v5.13-rc1
commit 586eaebea5988302c5a8b018096dd6c6f4564940
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I6VS35
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
find_bit would also benefit from small_const_nbits() optimizations. The detailed comment is provided by Rasmus Villemoes.
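To sketch what this enables (illustrative, not part of this diff; my_find_next_bit() is a hypothetical wrapper): when the size is a compile-time constant in [1, BITS_PER_LONG], a find operation can be inlined as a single masked word lookup, falling back to the out-of-line search otherwise. A later patch in this series uses exactly this pattern in find_next_or_bit():

	static inline unsigned long my_find_next_bit(const unsigned long *addr,
			unsigned long size, unsigned long offset)
	{
		if (small_const_nbits(size)) {
			unsigned long val;

			if (unlikely(offset >= size))
				return size;

			/* One word: mask off bits below @offset, then __ffs(). */
			val = *addr & GENMASK(size - 1, offset);
			return val ? __ffs(val) : size;
		}

		return find_next_bit(addr, size, offset);	/* out of line */
	}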
Link: https://lkml.kernel.org/r/20210401003153.97325-6-yury.norov@gmail.com
Suggested-by: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Signed-off-by: Yury Norov <yury.norov@gmail.com>
Acked-by: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Acked-by: Andy Shevchenko <andy.shevchenko@gmail.com>
Cc: Alexey Klimov <aklimov@redhat.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: David Sterba <dsterba@suse.com>
Cc: Dennis Zhou <dennis@kernel.org>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Jianpeng Ma <jianpeng.ma@intel.com>
Cc: Joe Perches <joe@perches.com>
Cc: John Paul Adrian Glaubitz <glaubitz@physik.fu-berlin.de>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Rich Felker <dalias@libc.org>
Cc: Stefano Brivio <sbrivio@redhat.com>
Cc: Wei Yang <richard.weiyang@linux.alibaba.com>
Cc: Wolfram Sang <wsa+renesas@sang-engineering.com>
Cc: Yoshinori Sato <ysato@users.osdn.me>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Zeng Heng <zengheng4@huawei.com>
---
 include/asm-generic/bitsperlong.h | 12 ++++++++++++
 include/linux/bitmap.h            |  8 --------
 2 files changed, 12 insertions(+), 8 deletions(-)
diff --git a/include/asm-generic/bitsperlong.h b/include/asm-generic/bitsperlong.h
index 3905c1c93dc2..1023e2a4bd37 100644
--- a/include/asm-generic/bitsperlong.h
+++ b/include/asm-generic/bitsperlong.h
@@ -23,4 +23,16 @@
 #define BITS_PER_LONG_LONG 64
 #endif
 
+/*
+ * small_const_nbits(n) is true precisely when it is known at compile-time
+ * that BITMAP_SIZE(n) is 1, i.e. 1 <= n <= BITS_PER_LONG. This allows
+ * various bit/bitmap APIs to provide a fast inline implementation. Bitmaps
+ * of size 0 are very rare, and a compile-time-known-size 0 is most likely
+ * a sign of error. They will be handled correctly by the bit/bitmap APIs,
+ * but using the out-of-line functions, so that the inline implementations
+ * can unconditionally dereference the pointer(s).
+ */
+#define small_const_nbits(nbits)	\
+	(__builtin_constant_p(nbits) && (nbits) <= BITS_PER_LONG && (nbits) > 0)
+
 #endif /* __ASM_GENERIC_BITS_PER_LONG */
diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h
index e98da9994cdd..6f8803eaae25 100644
--- a/include/linux/bitmap.h
+++ b/include/linux/bitmap.h
@@ -234,14 +234,6 @@ extern int bitmap_print_to_pagebuf(bool list, char *buf,
 #define BITMAP_FIRST_WORD_MASK(start) (~0UL << ((start) & (BITS_PER_LONG - 1)))
 #define BITMAP_LAST_WORD_MASK(nbits) (~0UL >> (-(nbits) & (BITS_PER_LONG - 1)))
 
-/*
- * The static inlines below do not handle constant nbits==0 correctly,
- * so make such users (should any ever turn up) call the out-of-line
- * versions.
- */
-#define small_const_nbits(nbits)	\
-	(__builtin_constant_p(nbits) && (nbits) <= BITS_PER_LONG && (nbits) > 0)
-
 static inline void bitmap_zero(unsigned long *dst, unsigned int nbits)
 {
 	unsigned int len = BITS_TO_LONGS(nbits) * sizeof(unsigned long);
From: Dave Chinner <dchinner@redhat.com>
mainline inclusion
from mainline-v6.3-rc4
commit 1470afefc3c42df5d1662f87d079b46651bdc95b
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I6VS35
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
Equivalent of for_each_cpu_and, except it ORs the two masks together so it iterates all the CPUs present in either mask.
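A minimal usage sketch (illustrative only; do_something() is a placeholder, and the percpu_counter patch later in this series is the real user):

	int cpu;

	/* Visits every CPU set in either mask, with no temporary cpumask
	 * needed to hold the OR of the two. */
	for_each_cpu_or(cpu, cpu_online_mask, cpu_dying_mask)
		do_something(cpu);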
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Zeng Heng <zengheng4@huawei.com>
---
 include/asm-generic/bitops/find.h | 38 ++++++++++++++++++++++
 include/linux/cpumask.h           | 52 +++++++++++++++++++++++++++++++
 lib/find_bit.c                    | 37 ++++++++++++++++++++++
 3 files changed, 127 insertions(+)
diff --git a/include/asm-generic/bitops/find.h b/include/asm-generic/bitops/find.h
index 9fdf21302fdf..39ef8d3c54dc 100644
--- a/include/asm-generic/bitops/find.h
+++ b/include/asm-generic/bitops/find.h
@@ -97,4 +97,42 @@ extern unsigned long find_next_clump8(unsigned long *clump,
 #define find_first_clump8(clump, bits, size) \
 	find_next_clump8((clump), (bits), (size), 0)
 
+unsigned long _find_next_or_bit(const unsigned long *addr1, const unsigned long *addr2,
+				unsigned long nbits, unsigned long start);
+
+#ifndef find_next_or_bit
+/**
+ * find_next_or_bit - find the next set bit in either memory regions
+ * @addr1: The first address to base the search on
+ * @addr2: The second address to base the search on
+ * @size: The bitmap size in bits
+ * @offset: The bitnumber to start searching at
+ *
+ * Returns the bit number for the next set bit
+ * If no bits are set, returns @size.
+ */
+static inline
+unsigned long find_next_or_bit(const unsigned long *addr1,
+		const unsigned long *addr2, unsigned long size,
+		unsigned long offset)
+{
+	if (small_const_nbits(size)) {
+		unsigned long val;
+
+		if (unlikely(offset >= size))
+			return size;
+
+		val = (*addr1 | *addr2) & GENMASK(size - 1, offset);
+		return val ? __ffs(val) : size;
+	}
+
+	return _find_next_or_bit(addr1, addr2, size, offset);
+}
+#endif
+
+#define for_each_or_bit(bit, addr1, addr2, size) \
+	for ((bit) = 0;								\
+	     (bit) = find_next_or_bit((addr1), (addr2), (size), (bit)), (bit) < (size);\
+	     (bit)++)
+
 #endif /*_ASM_GENERIC_BITOPS_FIND_H_ */
diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index 92d5ecad8de6..7cdec529b1d9 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -325,6 +325,58 @@ extern int cpumask_next_wrap(int n, const struct cpumask *mask, int start, bool
 	     (cpu) < nr_cpu_ids;)
 #endif /* SMP */
 
+/*
+ * We have several different "preferred sizes" for the cpumask
+ * operations, depending on operation.
+ *
+ * For example, the bitmap scanning and operating operations have
+ * optimized routines that work for the single-word case, but only when
+ * the size is constant. So if NR_CPUS fits in one single word, we are
+ * better off using that small constant, in order to trigger the
+ * optimized bit finding. That is 'small_cpumask_size'.
+ *
+ * The clearing and copying operations will similarly perform better
+ * with a constant size, but we limit that size arbitrarily to four
+ * words. We call this 'large_cpumask_size'.
+ *
+ * Finally, some operations just want the exact limit, either because
+ * they set bits or just don't have any faster fixed-sized versions. We
+ * call this just 'nr_cpumask_bits'.
+ *
+ * Note that these optional constants are always guaranteed to be at
+ * least as big as 'nr_cpu_ids' itself is, and all our cpumask
+ * allocations are at least that size (see cpumask_size()). The
+ * optimization comes from being able to potentially use a compile-time
+ * constant instead of a run-time generated exact number of CPUs.
+ */
+#if NR_CPUS <= BITS_PER_LONG
+  #define small_cpumask_bits ((unsigned int)NR_CPUS)
+  #define large_cpumask_bits ((unsigned int)NR_CPUS)
+#elif NR_CPUS <= 4*BITS_PER_LONG
+  #define small_cpumask_bits nr_cpu_ids
+  #define large_cpumask_bits ((unsigned int)NR_CPUS)
+#else
+  #define small_cpumask_bits nr_cpu_ids
+  #define large_cpumask_bits nr_cpu_ids
+#endif
+
+/**
+ * for_each_cpu_or - iterate over every cpu present in either mask
+ * @cpu: the (optionally unsigned) integer iterator
+ * @mask1: the first cpumask pointer
+ * @mask2: the second cpumask pointer
+ *
+ * This saves a temporary CPU mask in many places. It is equivalent to:
+ *	struct cpumask tmp;
+ *	cpumask_or(&tmp, &mask1, &mask2);
+ *	for_each_cpu(cpu, &tmp)
+ *		...
+ *
+ * After the loop, cpu is >= nr_cpu_ids.
+ */
+#define for_each_cpu_or(cpu, mask1, mask2)				\
+	for_each_or_bit(cpu, cpumask_bits(mask1), cpumask_bits(mask2), small_cpumask_bits)
+
 #define CPU_BITS_NONE						\
 {								\
 	[0 ... BITS_TO_LONGS(NR_CPUS)-1] = 0UL			\
diff --git a/lib/find_bit.c b/lib/find_bit.c
index 4a8751010d59..d9aef1da9929 100644
--- a/lib/find_bit.c
+++ b/lib/find_bit.c
@@ -81,6 +81,43 @@ unsigned long find_next_bit(const unsigned long *addr, unsigned long size,
 EXPORT_SYMBOL(find_next_bit);
 #endif
 
+/*
+ * Common helper for find_next_bit() function family
+ * @FETCH: The expression that fetches and pre-processes each word of bitmap(s)
+ * @MUNGE: The expression that post-processes a word containing found bit (may be empty)
+ * @size: The bitmap size in bits
+ * @start: The bitnumber to start searching at
+ */
+#define FIND_NEXT_BIT(FETCH, MUNGE, size, start)			\
+({									\
+	unsigned long mask, idx, tmp, sz = (size), __start = (start);	\
+									\
+	if (unlikely(__start >= sz))					\
+		goto out;						\
+									\
+	mask = MUNGE(BITMAP_FIRST_WORD_MASK(__start));			\
+	idx = __start / BITS_PER_LONG;					\
+									\
+	for (tmp = (FETCH) & mask; !tmp; tmp = (FETCH)) {		\
+		if ((idx + 1) * BITS_PER_LONG >= sz)			\
+			goto out;					\
+		idx++;							\
+	}								\
+									\
+	sz = min(idx * BITS_PER_LONG + __ffs(MUNGE(tmp)), sz);		\
+out:									\
+	sz;								\
+})
+
+#ifndef find_next_or_bit
+unsigned long _find_next_or_bit(const unsigned long *addr1, const unsigned long *addr2,
+				unsigned long nbits, unsigned long start)
+{
+	return FIND_NEXT_BIT(addr1[idx] | addr2[idx], /* nop */, nbits, start);
+}
+EXPORT_SYMBOL(_find_next_or_bit);
+#endif
+
 #ifndef find_next_zero_bit
 unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size,
 				 unsigned long offset)
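For reference, the underlying bitmap-level iterator added above can also be used directly; a minimal sketch (the bitmap names are illustrative):

	DECLARE_BITMAP(a, 128);
	DECLARE_BITMAP(b, 128);
	unsigned int bit;

	/* Walks the bits set in a[] or b[] without materialising the union
	 * bitmap; each step calls find_next_or_bit(). */
	for_each_or_bit(bit, a, b, 128)
		pr_debug("bit %u is set in at least one bitmap\n", bit);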
From: Dave Chinner <dchinner@redhat.com>
mainline inclusion
from mainline-v6.3-rc4
commit 8b57b11cca88f397035a95b9e12b03511847b0e8
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I6VS35
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
In commit f689054aace2 ("percpu_counter: add percpu_counter_sum_all interface") a race condition between a cpu dying and percpu_counter_sum() iterating online CPUs was identified. The solution was to iterate all possible CPUs for summation via percpu_counter_sum_all().
We recently had a percpu_counter_sum() call in XFS trip over this same race condition and it fired a debug assert because the filesystem was unmounting and the counter *should* be zero just before we destroy it. That was reported here:
https://lore.kernel.org/linux-kernel/20230314090649.326642-1-yebin@huaweiclo...
likely as a result of running generic/648, which exercises filesystems in the presence of CPU online/offline events.
The solution to use percpu_counter_sum_all() is an awful one. We use percpu counters and percpu_counter_sum() for accurate and reliable threshold detection for space management, so a summation race condition during these operations can result in overcommit of available space and that may result in filesystem shutdowns.
As percpu_counter_sum_all() iterates all possible CPUs rather than just those online or even those present, the mask can include CPUs that aren't even installed in the machine, or, on machines that can hot-plug CPU-capable nodes, CPUs that don't even have a physical socket present in the machine.
Fundamentally, this race condition is caused by the CPU being offlined being removed from the cpu_online_mask before the notifier that cleans up per-cpu state is run. Hence percpu_counter_sum() will not sum the count for a cpu currently being taken offline, regardless of whether the notifier has run or not. This is the root cause of the bug.
The percpu counter notifier iterates all the registered counters, locks the counter and moves the percpu count to the global sum. This is serialised against other operations that move the percpu counter to the global sum as well as percpu_counter_sum() operations that sum the percpu counts while holding the counter lock.
Hence the notifier is safe to run concurrently with sum operations, and the only thing we actually need to care about is that percpu_counter_sum() iterates dying CPUs. That's trivial to do, and when there are no CPUs dying, it has no additional overhead except for a cpumask_or() operation.
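For reference, the hotplug dead notifier described above has roughly this shape (paraphrased from lib/percpu_counter.c, not part of this diff):

	static int percpu_counter_cpu_dead(unsigned int cpu)
	{
		struct percpu_counter *fbc;

		spin_lock_irq(&percpu_counters_lock);
		list_for_each_entry(fbc, &percpu_counters, list) {
			s32 *pcount;

			/* The fold happens under fbc->lock, the same lock
			 * __percpu_counter_sum() holds while summing, which
			 * is why the two can safely run concurrently. */
			raw_spin_lock(&fbc->lock);
			pcount = per_cpu_ptr(fbc->counters, cpu);
			fbc->count += *pcount;
			*pcount = 0;
			raw_spin_unlock(&fbc->lock);
		}
		spin_unlock_irq(&percpu_counters_lock);
		return 0;
	}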
This change makes percpu_counter_sum() always do the right thing in the presence of CPU hot unplug events and makes percpu_counter_sum_all() unnecessary. This, in turn, means that filesystems like XFS, ext4, and btrfs don't have to work out when they should use percpu_counter_sum() vs. percpu_counter_sum_all() in their space accounting algorithms.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Zeng Heng <zengheng4@huawei.com>
---
 lib/percpu_counter.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)
diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c
index 00f666d94486..8940ca14acca 100644
--- a/lib/percpu_counter.c
+++ b/lib/percpu_counter.c
@@ -119,7 +119,15 @@ EXPORT_SYMBOL(percpu_counter_sync);
 
 /*
  * Add up all the per-cpu counts, return the result. This is a more accurate
- * but much slower version of percpu_counter_read_positive()
+ * but much slower version of percpu_counter_read_positive().
+ *
+ * We use the cpu mask of (cpu_online_mask | cpu_dying_mask) to capture sums
+ * from CPUs that are in the process of being taken offline. Dying cpus have
+ * been removed from the online mask, but may not have had the hotplug dead
+ * notifier called to fold the percpu count back into the global counter sum.
+ * By including dying CPUs in the iteration mask, we avoid this race condition
+ * so __percpu_counter_sum() just does the right thing when CPUs are being
+ * taken offline.
  */
 s64 __percpu_counter_sum(struct percpu_counter *fbc)
 {
@@ -129,7 +137,7 @@ s64 __percpu_counter_sum(struct percpu_counter *fbc)
 
 	raw_spin_lock_irqsave(&fbc->lock, flags);
 	ret = fbc->count;
-	for_each_online_cpu(cpu) {
+	for_each_cpu_or(cpu, cpu_online_mask, cpu_dying_mask) {
 		s32 *pcount = per_cpu_ptr(fbc->counters, cpu);
 		ret += *pcount;
 	}