Backport compact NUMA-aware lock feature v15 from linux maillist, and add support for arch arm64.
bugzilla: https://gitee.com/openeuler/kernel/issues/I7LEG5 Reference: https://lore.kernel.org/linux-arm-kernel/20210514200743.3026725-1-alex.kogan...
Alex Kogan (6): locking/qspinlock: Rename mcs lock/unlock macros and make them more generic locking/qspinlock: Refactor the qspinlock slow path locking/qspinlock: Introduce CNA into the slow path of qspinlock locking/qspinlock: Introduce starvation avoidance into CNA locking/qspinlock: Avoid moving certain threads between waiting queues in CNA locking/qspinlock: Introduce the shuffle reduction optimization into CNA
Wei Li (2): locking/qspinlock: Add CNA support for ARM64 locking/qspinlock: Disable CNA by default
.../admin-guide/kernel-parameters.txt | 18 + arch/arm/include/asm/mcs_spinlock.h | 6 +- arch/arm64/Kconfig | 16 + arch/x86/Kconfig | 20 + arch/x86/include/asm/qspinlock.h | 4 + include/asm-generic/mcs_spinlock.h | 4 +- kernel/locking/mcs_spinlock.h | 20 +- kernel/locking/qspinlock.c | 82 +++- kernel/locking/qspinlock_cna.h | 425 ++++++++++++++++++ kernel/locking/qspinlock_paravirt.h | 2 +- mm/mempolicy.c | 4 + 11 files changed, 578 insertions(+), 23 deletions(-) create mode 100644 kernel/locking/qspinlock_cna.h
From: Alex Kogan alex.kogan@oracle.com
maillist inclusion category: feature bugzilla: 169576 https://gitee.com/openeuler/kernel/issues/I7LEG5
Reference: https://lore.kernel.org/linux-arm-kernel/20210514200743.3026725-2-alex.kogan...
--------------------------------
The mcs unlock macro (arch_mcs_lock_handoff) should accept the value to be stored into the lock argument as another argument. This allows using the same macro in cases where the value to be stored when passing the lock is different from 1.
Signed-off-by: Alex Kogan alex.kogan@oracle.com Reviewed-by: Steve Sistare steven.sistare@oracle.com Reviewed-by: Waiman Long longman@redhat.com Signed-off-by: Wei Li liwei391@huawei.com --- arch/arm/include/asm/mcs_spinlock.h | 6 +++--- include/asm-generic/mcs_spinlock.h | 4 ++-- kernel/locking/mcs_spinlock.h | 18 +++++++++--------- kernel/locking/qspinlock.c | 4 ++-- kernel/locking/qspinlock_paravirt.h | 2 +- 5 files changed, 17 insertions(+), 17 deletions(-)
diff --git a/arch/arm/include/asm/mcs_spinlock.h b/arch/arm/include/asm/mcs_spinlock.h index 529d2cf4d06f..1eb4d733459c 100644 --- a/arch/arm/include/asm/mcs_spinlock.h +++ b/arch/arm/include/asm/mcs_spinlock.h @@ -6,7 +6,7 @@ #include <asm/spinlock.h>
/* MCS spin-locking. */ -#define arch_mcs_spin_lock_contended(lock) \ +#define arch_mcs_spin_wait(lock) \ do { \ /* Ensure prior stores are observed before we enter wfe. */ \ smp_mb(); \ @@ -14,9 +14,9 @@ do { \ wfe(); \ } while (0) \
-#define arch_mcs_spin_unlock_contended(lock) \ +#define arch_mcs_lock_handoff(lock, val) \ do { \ - smp_store_release(lock, 1); \ + smp_store_release((lock), (val)); \ dsb_sev(); \ } while (0)
diff --git a/include/asm-generic/mcs_spinlock.h b/include/asm-generic/mcs_spinlock.h index 10cd4ffc6ba2..f933d99c63e0 100644 --- a/include/asm-generic/mcs_spinlock.h +++ b/include/asm-generic/mcs_spinlock.h @@ -4,8 +4,8 @@ /* * Architectures can define their own: * - * arch_mcs_spin_lock_contended(l) - * arch_mcs_spin_unlock_contended(l) + * arch_mcs_spin_wait(l) + * arch_mcs_lock_handoff(l, val) * * See kernel/locking/mcs_spinlock.c. */ diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h index 85251d8771d9..e794babc519a 100644 --- a/kernel/locking/mcs_spinlock.h +++ b/kernel/locking/mcs_spinlock.h @@ -21,7 +21,7 @@ struct mcs_spinlock { int count; /* nesting count, see qspinlock.c */ };
-#ifndef arch_mcs_spin_lock_contended +#ifndef arch_mcs_spin_wait /* * Using smp_cond_load_acquire() provides the acquire semantics * required so that subsequent operations happen after the @@ -29,20 +29,20 @@ struct mcs_spinlock { * ARM64 would like to do spin-waiting instead of purely * spinning, and smp_cond_load_acquire() provides that behavior. */ -#define arch_mcs_spin_lock_contended(l) \ -do { \ - smp_cond_load_acquire(l, VAL); \ +#define arch_mcs_spin_wait(l) \ +do { \ + smp_cond_load_acquire(l, VAL); \ } while (0) #endif
-#ifndef arch_mcs_spin_unlock_contended +#ifndef arch_mcs_lock_handoff /* * smp_store_release() provides a memory barrier to ensure all * operations in the critical section has been completed before * unlocking. */ -#define arch_mcs_spin_unlock_contended(l) \ - smp_store_release((l), 1) +#define arch_mcs_lock_handoff(l, val) \ + smp_store_release((l), (val)) #endif
/* @@ -91,7 +91,7 @@ void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node) WRITE_ONCE(prev->next, node);
/* Wait until the lock holder passes the lock down. */ - arch_mcs_spin_lock_contended(&node->locked); + arch_mcs_spin_wait(&node->locked); }
/* @@ -115,7 +115,7 @@ void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node) }
/* Pass lock to next waiter. */ - arch_mcs_spin_unlock_contended(&next->locked); + arch_mcs_lock_handoff(&next->locked, 1); }
#endif /* __LINUX_MCS_SPINLOCK_H */ diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index ebe6b8ec7cb3..07c396408e84 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -474,7 +474,7 @@ void __lockfunc queued_spin_lock_slowpath(struct qspinlock *lock, u32 val) WRITE_ONCE(prev->next, node);
pv_wait_node(node, prev); - arch_mcs_spin_lock_contended(&node->locked); + arch_mcs_spin_wait(&node->locked);
/* * While waiting for the MCS lock, the next pointer may have @@ -553,7 +553,7 @@ void __lockfunc queued_spin_lock_slowpath(struct qspinlock *lock, u32 val) if (!next) next = smp_cond_load_relaxed(&node->next, (VAL));
- arch_mcs_spin_unlock_contended(&next->locked); + arch_mcs_lock_handoff(&next->locked, 1); pv_kick_node(lock, next);
release: diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index 6afc249ce697..2b35460d5658 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h @@ -368,7 +368,7 @@ static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node) * * Matches with smp_store_mb() and cmpxchg() in pv_wait_node() * - * The write to next->locked in arch_mcs_spin_unlock_contended() + * The write to next->locked in arch_mcs_lock_handoff() * must be ordered before the read of pn->state in the cmpxchg() * below for the code to work correctly. To guarantee full ordering * irrespective of the success or failure of the cmpxchg(),
From: Alex Kogan alex.kogan@oracle.com
maillist inclusion category: feature bugzilla: 169576 https://gitee.com/openeuler/kernel/issues/I7LEG5
Reference: https://lore.kernel.org/linux-arm-kernel/20210514200743.3026725-3-alex.kogan...
--------------------------------
Move some of the code manipulating the spin lock into separate functions. This would allow easier integration of alternative ways to manipulate that lock.
Signed-off-by: Alex Kogan alex.kogan@oracle.com Reviewed-by: Steve Sistare steven.sistare@oracle.com Reviewed-by: Waiman Long longman@redhat.com Signed-off-by: Wei Li liwei391@huawei.com --- kernel/locking/qspinlock.c | 38 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 36 insertions(+), 2 deletions(-)
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index 07c396408e84..49c3b4c0dce8 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -290,6 +290,34 @@ static __always_inline u32 __pv_wait_head_or_lock(struct qspinlock *lock, #define queued_spin_lock_slowpath native_queued_spin_lock_slowpath #endif
+/* + * __try_clear_tail - try to clear tail by setting the lock value to + * _Q_LOCKED_VAL. + * @lock: Pointer to the queued spinlock structure + * @val: Current value of the lock + * @node: Pointer to the MCS node of the lock holder + */ +static __always_inline bool __try_clear_tail(struct qspinlock *lock, + u32 val, + struct mcs_spinlock *node) +{ + return atomic_try_cmpxchg_relaxed(&lock->val, &val, _Q_LOCKED_VAL); +} + +/* + * __mcs_lock_handoff - pass the MCS lock to the next waiter + * @node: Pointer to the MCS node of the lock holder + * @next: Pointer to the MCS node of the first waiter in the MCS queue + */ +static __always_inline void __mcs_lock_handoff(struct mcs_spinlock *node, + struct mcs_spinlock *next) +{ + arch_mcs_lock_handoff(&next->locked, 1); +} + +#define try_clear_tail __try_clear_tail +#define mcs_lock_handoff __mcs_lock_handoff + #endif /* _GEN_PV_LOCK_SLOWPATH */
/** @@ -536,7 +564,7 @@ void __lockfunc queued_spin_lock_slowpath(struct qspinlock *lock, u32 val) * PENDING will make the uncontended transition fail. */ if ((val & _Q_TAIL_MASK) == tail) { - if (atomic_try_cmpxchg_relaxed(&lock->val, &val, _Q_LOCKED_VAL)) + if (try_clear_tail(lock, val, node)) goto release; /* No contention */ }
@@ -553,7 +581,7 @@ void __lockfunc queued_spin_lock_slowpath(struct qspinlock *lock, u32 val) if (!next) next = smp_cond_load_relaxed(&node->next, (VAL));
- arch_mcs_lock_handoff(&next->locked, 1); + mcs_lock_handoff(node, next); pv_kick_node(lock, next);
release: @@ -580,6 +608,12 @@ EXPORT_SYMBOL(queued_spin_lock_slowpath); #undef pv_kick_node #undef pv_wait_head_or_lock
+#undef try_clear_tail +#define try_clear_tail __try_clear_tail + +#undef mcs_lock_handoff +#define mcs_lock_handoff __mcs_lock_handoff + #undef queued_spin_lock_slowpath #define queued_spin_lock_slowpath __pv_queued_spin_lock_slowpath
From: Alex Kogan alex.kogan@oracle.com
maillist inclusion category: feature bugzilla: 169576 https://gitee.com/openeuler/kernel/issues/I7LEG5
Reference: https://lore.kernel.org/linux-arm-kernel/20210514200743.3026725-4-alex.kogan...
--------------------------------
In CNA, spinning threads are organized in two queues, a primary queue for threads running on the same node as the current lock holder, and a secondary queue for threads running on other nodes. After acquiring the MCS lock and before acquiring the spinlock, the MCS lock holder checks whether the next waiter in the primary queue (if exists) is running on the same NUMA node. If it is not, that waiter is detached from the main queue and moved into the tail of the secondary queue. This way, we gradually filter the primary queue, leaving only waiters running on the same preferred NUMA node. For more details, see https://arxiv.org/abs/1810.05600.
Note that this variant of CNA may introduce starvation by continuously passing the lock between waiters in the main queue. This issue will be addressed later in the series.
Enabling CNA is controlled via a new configuration option (NUMA_AWARE_SPINLOCKS). By default, the CNA variant is patched in at the boot time only if we run on a multi-node machine in native environment and the new config is enabled. (For the time being, the patching requires CONFIG_PARAVIRT_SPINLOCKS to be enabled as well. However, this should be resolved once static_call() is available.) This default behavior can be overridden with the new kernel boot command-line option "numa_spinlock=on/off" (default is "auto").
Signed-off-by: Alex Kogan alex.kogan@oracle.com Reviewed-by: Steve Sistare steven.sistare@oracle.com Reviewed-by: Waiman Long longman@redhat.com Signed-off-by: Wei Li liwei391@huawei.com --- .../admin-guide/kernel-parameters.txt | 10 + arch/x86/Kconfig | 20 ++ arch/x86/include/asm/qspinlock.h | 4 + arch/x86/kernel/alternative.c | 4 + kernel/locking/mcs_spinlock.h | 2 +- kernel/locking/qspinlock.c | 42 ++- kernel/locking/qspinlock_cna.h | 325 ++++++++++++++++++ 7 files changed, 402 insertions(+), 5 deletions(-) create mode 100644 kernel/locking/qspinlock_cna.h
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 9e5bab29685f..4e8eb265d54c 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -3951,6 +3951,16 @@ NUMA balancing. Allowed values are enable and disable
+ numa_spinlock= [NUMA, PV_OPS] Select the NUMA-aware variant + of spinlock. The options are: + auto - Enable this variant if running on a multi-node + machine in native environment. + on - Unconditionally enable this variant. + off - Unconditionally disable this variant. + + Not specifying this option is equivalent to + numa_spinlock=auto. + numa_zonelist_order= [KNL, BOOT] Select zonelist order for NUMA. 'node', 'default' can be specified This can be set from sysctl after boot. diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 53bab123a8ee..0ccde77b685b 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1586,6 +1586,26 @@ config NUMA
Otherwise, you should say N.
+config NUMA_AWARE_SPINLOCKS + bool "Numa-aware spinlocks" + depends on NUMA + depends on QUEUED_SPINLOCKS + depends on 64BIT + # For now, we depend on PARAVIRT_SPINLOCKS to make the patching work. + # This is awkward, but hopefully would be resolved once static_call() + # is available. + depends on PARAVIRT_SPINLOCKS + default y + help + Introduce NUMA (Non Uniform Memory Access) awareness into + the slow path of spinlocks. + + In this variant of qspinlock, the kernel will try to keep the lock + on the same node, thus reducing the number of remote cache misses, + while trading some of the short term fairness for better performance. + + Say N if you want absolute first come first serve fairness. + config AMD_NUMA def_bool y prompt "Old style AMD Opteron NUMA detection" diff --git a/arch/x86/include/asm/qspinlock.h b/arch/x86/include/asm/qspinlock.h index d87451df480b..f48a2a250e57 100644 --- a/arch/x86/include/asm/qspinlock.h +++ b/arch/x86/include/asm/qspinlock.h @@ -27,6 +27,10 @@ static __always_inline u32 queued_fetch_set_pending_acquire(struct qspinlock *lo return val; }
+#ifdef CONFIG_NUMA_AWARE_SPINLOCKS +extern void cna_configure_spin_lock_slowpath(void); +#endif + #ifdef CONFIG_PARAVIRT_SPINLOCKS extern void native_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val); extern void __pv_init_lock_hash(void); diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index f615e0cb6d93..4074238e6a37 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -1371,6 +1371,10 @@ void __init alternative_instructions(void) */ paravirt_set_cap();
+#if defined(CONFIG_NUMA_AWARE_SPINLOCKS) + cna_configure_spin_lock_slowpath(); +#endif + /* * First patch paravirt functions, such that we overwrite the indirect * call with the direct call. diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h index e794babc519a..3926aad129ed 100644 --- a/kernel/locking/mcs_spinlock.h +++ b/kernel/locking/mcs_spinlock.h @@ -17,7 +17,7 @@
struct mcs_spinlock { struct mcs_spinlock *next; - int locked; /* 1 if lock acquired */ + unsigned int locked; /* 1 if lock acquired */ int count; /* nesting count, see qspinlock.c */ };
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index 49c3b4c0dce8..d3f99060b60f 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -11,7 +11,7 @@ * Peter Zijlstra peterz@infradead.org */
-#ifndef _GEN_PV_LOCK_SLOWPATH +#if !defined(_GEN_PV_LOCK_SLOWPATH) && !defined(_GEN_CNA_LOCK_SLOWPATH)
#include <linux/smp.h> #include <linux/bug.h> @@ -72,7 +72,8 @@ /* * On 64-bit architectures, the mcs_spinlock structure will be 16 bytes in * size and four of them will fit nicely in one 64-byte cacheline. For - * pvqspinlock, however, we need more space for extra data. To accommodate + * pvqspinlock, however, we need more space for extra data. The same also + * applies for the NUMA-aware variant of spinlocks (CNA). To accommodate * that, we insert two more long words to pad it up to 32 bytes. IOW, only * two of them can fit in a cacheline in this case. That is OK as it is rare * to have more than 2 levels of slowpath nesting in actual use. We don't @@ -81,7 +82,7 @@ */ struct qnode { struct mcs_spinlock mcs; -#ifdef CONFIG_PARAVIRT_SPINLOCKS +#if defined(CONFIG_PARAVIRT_SPINLOCKS) || defined(CONFIG_NUMA_AWARE_SPINLOCKS) long reserved[2]; #endif }; @@ -105,6 +106,8 @@ struct qnode { * Exactly fits one 64-byte cacheline on a 64-bit architecture. * * PV doubles the storage and uses the second cacheline for PV state. + * CNA also doubles the storage and uses the second cacheline for + * CNA-specific state. */ static DEFINE_PER_CPU_ALIGNED(struct qnode, qnodes[MAX_NODES]);
@@ -318,7 +321,7 @@ static __always_inline void __mcs_lock_handoff(struct mcs_spinlock *node, #define try_clear_tail __try_clear_tail #define mcs_lock_handoff __mcs_lock_handoff
-#endif /* _GEN_PV_LOCK_SLOWPATH */ +#endif /* _GEN_PV_LOCK_SLOWPATH && _GEN_CNA_LOCK_SLOWPATH */
/** * queued_spin_lock_slowpath - acquire the queued spinlock @@ -594,6 +597,37 @@ void __lockfunc queued_spin_lock_slowpath(struct qspinlock *lock, u32 val) } EXPORT_SYMBOL(queued_spin_lock_slowpath);
+/* + * Generate the code for NUMA-aware spinlocks + */ +#if !defined(_GEN_CNA_LOCK_SLOWPATH) && defined(CONFIG_NUMA_AWARE_SPINLOCKS) +#define _GEN_CNA_LOCK_SLOWPATH + +#undef pv_init_node +#define pv_init_node cna_init_node + +#undef pv_wait_head_or_lock +#define pv_wait_head_or_lock cna_wait_head_or_lock + +#undef try_clear_tail +#define try_clear_tail cna_try_clear_tail + +#undef mcs_lock_handoff +#define mcs_lock_handoff cna_lock_handoff + +#undef queued_spin_lock_slowpath +/* + * defer defining queued_spin_lock_slowpath until after the include to + * avoid a name clash with the identically named field in pv_ops.lock + * (see cna_configure_spin_lock_slowpath()) + */ +#include "qspinlock_cna.h" +#define queued_spin_lock_slowpath __cna_queued_spin_lock_slowpath + +#include "qspinlock.c" + +#endif + /* * Generate the paravirt code for queued_spin_unlock_slowpath(). */ diff --git a/kernel/locking/qspinlock_cna.h b/kernel/locking/qspinlock_cna.h new file mode 100644 index 000000000000..ca564e64e5de --- /dev/null +++ b/kernel/locking/qspinlock_cna.h @@ -0,0 +1,325 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _GEN_CNA_LOCK_SLOWPATH +#error "do not include this file" +#endif + +#include <linux/topology.h> + +/* + * Implement a NUMA-aware version of MCS (aka CNA, or compact NUMA-aware lock). + * + * In CNA, spinning threads are organized in two queues, a primary queue for + * threads running on the same NUMA node as the current lock holder, and a + * secondary queue for threads running on other nodes. Schematically, it + * looks like this: + * + * cna_node + * +----------+ +--------+ +--------+ + * |mcs:next | --> |mcs:next| --> ... |mcs:next| --> NULL [Primary queue] + * |mcs:locked| -. +--------+ +--------+ + * +----------+ | + * `----------------------. + * v + * +--------+ +--------+ + * |mcs:next| --> ... |mcs:next| [Secondary queue] + * +--------+ +--------+ + * ^ | + * `--------------------' + * + * N.B. locked := 1 if secondary queue is absent. Otherwise, it contains the + * encoded pointer to the tail of the secondary queue, which is organized as a + * circular list. + * + * After acquiring the MCS lock and before acquiring the spinlock, the MCS lock + * holder checks whether the next waiter in the primary queue (if exists) is + * running on the same NUMA node. If it is not, that waiter is detached from the + * main queue and moved into the tail of the secondary queue. This way, we + * gradually filter the primary queue, leaving only waiters running on the same + * preferred NUMA node. + * + * For more details, see https://arxiv.org/abs/1810.05600. + * + * Authors: Alex Kogan alex.kogan@oracle.com + * Dave Dice dave.dice@oracle.com + */ + +struct cna_node { + struct mcs_spinlock mcs; + u16 numa_node; + u16 real_numa_node; + u32 encoded_tail; /* self */ +}; + +static void __init cna_init_nodes_per_cpu(unsigned int cpu) +{ + struct mcs_spinlock *base = per_cpu_ptr(&qnodes[0].mcs, cpu); + int numa_node = cpu_to_node(cpu); + int i; + + for (i = 0; i < MAX_NODES; i++) { + struct cna_node *cn = (struct cna_node *)grab_mcs_node(base, i); + + cn->real_numa_node = numa_node; + cn->encoded_tail = encode_tail(cpu, i); + /* + * make sure @encoded_tail is not confused with other valid + * values for @locked (0 or 1) + */ + WARN_ON(cn->encoded_tail <= 1); + } +} + +static int __init cna_init_nodes(void) +{ + unsigned int cpu; + + /* + * this will break on 32bit architectures, so we restrict + * the use of CNA to 64bit only (see arch/x86/Kconfig) + */ + BUILD_BUG_ON(sizeof(struct cna_node) > sizeof(struct qnode)); + /* we store an ecoded tail word in the node's @locked field */ + BUILD_BUG_ON(sizeof(u32) > sizeof(unsigned int)); + + for_each_possible_cpu(cpu) + cna_init_nodes_per_cpu(cpu); + + return 0; +} + +static __always_inline void cna_init_node(struct mcs_spinlock *node) +{ + struct cna_node *cn = (struct cna_node *)node; + + cn->numa_node = cn->real_numa_node; +} + +/* + * cna_splice_head -- splice the entire secondary queue onto the head of the + * primary queue. + * + * Returns the new primary head node or NULL on failure. + */ +static struct mcs_spinlock * +cna_splice_head(struct qspinlock *lock, u32 val, + struct mcs_spinlock *node, struct mcs_spinlock *next) +{ + struct mcs_spinlock *head_2nd, *tail_2nd; + u32 new; + + tail_2nd = decode_tail(node->locked); + head_2nd = tail_2nd->next; + + if (next) { + /* + * If the primary queue is not empty, the primary tail doesn't + * need to change and we can simply link the secondary tail to + * the old primary head. + */ + tail_2nd->next = next; + } else { + /* + * When the primary queue is empty, the secondary tail becomes + * the primary tail. + */ + + /* + * Speculatively break the secondary queue's circular link such + * that when the secondary tail becomes the primary tail it all + * works out. + */ + tail_2nd->next = NULL; + + /* + * tail_2nd->next = NULL; old = xchg_tail(lock, tail); + * prev = decode_tail(old); + * try_cmpxchg_release(...); WRITE_ONCE(prev->next, node); + * + * If the following cmpxchg() succeeds, our stores will not + * collide. + */ + new = ((struct cna_node *)tail_2nd)->encoded_tail | + _Q_LOCKED_VAL; + if (!atomic_try_cmpxchg_release(&lock->val, &val, new)) { + /* Restore the secondary queue's circular link. */ + tail_2nd->next = head_2nd; + return NULL; + } + } + + /* The primary queue head now is what was the secondary queue head. */ + return head_2nd; +} + +static inline bool cna_try_clear_tail(struct qspinlock *lock, u32 val, + struct mcs_spinlock *node) +{ + /* + * We're here because the primary queue is empty; check the secondary + * queue for remote waiters. + */ + if (node->locked > 1) { + struct mcs_spinlock *next; + + /* + * When there are waiters on the secondary queue, try to move + * them back onto the primary queue and let them rip. + */ + next = cna_splice_head(lock, val, node, NULL); + if (next) { + arch_mcs_lock_handoff(&next->locked, 1); + return true; + } + + return false; + } + + /* Both queues are empty. Do what MCS does. */ + return __try_clear_tail(lock, val, node); +} + +/* + * cna_splice_next -- splice the next node from the primary queue onto + * the secondary queue. + */ +static void cna_splice_next(struct mcs_spinlock *node, + struct mcs_spinlock *next, + struct mcs_spinlock *nnext) +{ + /* remove 'next' from the main queue */ + node->next = nnext; + + /* stick `next` on the secondary queue tail */ + if (node->locked <= 1) { /* if secondary queue is empty */ + /* create secondary queue */ + next->next = next; + } else { + /* add to the tail of the secondary queue */ + struct mcs_spinlock *tail_2nd = decode_tail(node->locked); + struct mcs_spinlock *head_2nd = tail_2nd->next; + + tail_2nd->next = next; + next->next = head_2nd; + } + + node->locked = ((struct cna_node *)next)->encoded_tail; +} + +/* + * cna_order_queue - check whether the next waiter in the main queue is on + * the same NUMA node as the lock holder; if not, and it has a waiter behind + * it in the main queue, move the former onto the secondary queue. + * Returns 1 if the next waiter runs on the same NUMA node; 0 otherwise. + */ +static int cna_order_queue(struct mcs_spinlock *node) +{ + struct mcs_spinlock *next = READ_ONCE(node->next); + struct cna_node *cn = (struct cna_node *)node; + int numa_node, next_numa_node; + + if (!next) + return 0; + + numa_node = cn->numa_node; + next_numa_node = ((struct cna_node *)next)->numa_node; + + if (next_numa_node != numa_node) { + struct mcs_spinlock *nnext = READ_ONCE(next->next); + + if (nnext) + cna_splice_next(node, next, nnext); + + return 0; + } + return 1; +} + +#define LOCK_IS_BUSY(lock) (atomic_read(&(lock)->val) & _Q_LOCKED_PENDING_MASK) + +/* Abuse the pv_wait_head_or_lock() hook to get some work done */ +static __always_inline u32 cna_wait_head_or_lock(struct qspinlock *lock, + struct mcs_spinlock *node) +{ + /* + * Try and put the time otherwise spent spin waiting on + * _Q_LOCKED_PENDING_MASK to use by sorting our lists. + */ + while (LOCK_IS_BUSY(lock) && !cna_order_queue(node)) + cpu_relax(); + + return 0; /* we lied; we didn't wait, go do so now */ +} + +static inline void cna_lock_handoff(struct mcs_spinlock *node, + struct mcs_spinlock *next) +{ + u32 val = 1; + + if (node->locked > 1) { + struct cna_node *cn = (struct cna_node *)node; + + val = node->locked; /* preseve secondary queue */ + + /* + * We have a local waiter, either real or fake one; + * reload @next in case it was changed by cna_order_queue(). + */ + next = node->next; + + /* + * Pass over NUMA node id of primary queue, to maintain the + * preference even if the next waiter is on a different node. + */ + ((struct cna_node *)next)->numa_node = cn->numa_node; + } + + arch_mcs_lock_handoff(&next->locked, val); +} + +/* + * Constant (boot-param configurable) flag selecting the NUMA-aware variant + * of spinlock. Possible values: -1 (off) / 0 (auto, default) / 1 (on). + */ +static int numa_spinlock_flag; + +static int __init numa_spinlock_setup(char *str) +{ + if (!strcmp(str, "auto")) { + numa_spinlock_flag = 0; + return 1; + } else if (!strcmp(str, "on")) { + numa_spinlock_flag = 1; + return 1; + } else if (!strcmp(str, "off")) { + numa_spinlock_flag = -1; + return 1; + } + + return 0; +} +__setup("numa_spinlock=", numa_spinlock_setup); + +void __cna_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val); + +/* + * Switch to the NUMA-friendly slow path for spinlocks when we have + * multiple NUMA nodes in native environment, unless the user has + * overridden this default behavior by setting the numa_spinlock flag. + */ +void __init cna_configure_spin_lock_slowpath(void) +{ + + if (numa_spinlock_flag < 0) + return; + + if (numa_spinlock_flag == 0 && (nr_node_ids < 2 || + pv_ops.lock.queued_spin_lock_slowpath != + native_queued_spin_lock_slowpath)) + return; + + cna_init_nodes(); + + pv_ops.lock.queued_spin_lock_slowpath = __cna_queued_spin_lock_slowpath; + + pr_info("Enabling CNA spinlock\n"); +}
From: Alex Kogan alex.kogan@oracle.com
maillist inclusion category: feature bugzilla: 169576 https://gitee.com/openeuler/kernel/issues/I7LEG5
Reference: https://lore.kernel.org/linux-arm-kernel/20210514200743.3026725-5-alex.kogan...
--------------------------------
Keep track of the time the thread at the head of the secondary queue has been waiting, and force inter-node handoff once this time passes a preset threshold. The default value for the threshold (1ms) can be overridden with the new kernel boot command-line option "qspinlock.numa_spinlock_threshold_ns".
Signed-off-by: Alex Kogan alex.kogan@oracle.com Reviewed-by: Steve Sistare steven.sistare@oracle.com Reviewed-by: Waiman Long longman@redhat.com Signed-off-by: Wei Li liwei391@huawei.com --- .../admin-guide/kernel-parameters.txt | 8 ++ kernel/locking/qspinlock_cna.h | 81 +++++++++++++++---- 2 files changed, 73 insertions(+), 16 deletions(-)
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 4e8eb265d54c..705ff4b379c8 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4641,6 +4641,14 @@ [KNL] Number of legacy pty's. Overwrites compiled-in default number.
+ qspinlock.numa_spinlock_threshold_ns= [NUMA, PV_OPS] + Set the time threshold in nanoseconds for the + number of intra-node lock hand-offs before the + NUMA-aware spinlock is forced to be passed to + a thread on another NUMA node. Smaller values + result in a more fair, but less performant spinlock, + and vice versa. The default value is 1000000 (=1ms). + quiet [KNL] Disable most log messages
r128= [HW,DRM] diff --git a/kernel/locking/qspinlock_cna.h b/kernel/locking/qspinlock_cna.h index ca564e64e5de..0b991c340fb1 100644 --- a/kernel/locking/qspinlock_cna.h +++ b/kernel/locking/qspinlock_cna.h @@ -4,6 +4,8 @@ #endif
#include <linux/topology.h> +#include <linux/sched/clock.h> +#include <linux/moduleparam.h>
/* * Implement a NUMA-aware version of MCS (aka CNA, or compact NUMA-aware lock). @@ -37,19 +39,39 @@ * gradually filter the primary queue, leaving only waiters running on the same * preferred NUMA node. * + * We change the NUMA node preference after a waiter at the head of the + * secondary queue spins for a certain amount of time (1ms, by default). + * We do that by flushing the secondary queue into the head of the primary queue, + * effectively changing the preference to the NUMA node of the waiter at the head + * of the secondary queue at the time of the flush. + * * For more details, see https://arxiv.org/abs/1810.05600. * * Authors: Alex Kogan alex.kogan@oracle.com * Dave Dice dave.dice@oracle.com */
+#define FLUSH_SECONDARY_QUEUE 1 + struct cna_node { struct mcs_spinlock mcs; u16 numa_node; u16 real_numa_node; u32 encoded_tail; /* self */ + u64 start_time; };
+static ulong numa_spinlock_threshold_ns = 1000000; /* 1ms, by default */ +module_param(numa_spinlock_threshold_ns, ulong, 0644); + +static inline bool intra_node_threshold_reached(struct cna_node *cn) +{ + u64 current_time = local_clock(); + u64 threshold = cn->start_time + numa_spinlock_threshold_ns; + + return current_time > threshold; +} + static void __init cna_init_nodes_per_cpu(unsigned int cpu) { struct mcs_spinlock *base = per_cpu_ptr(&qnodes[0].mcs, cpu); @@ -92,6 +114,7 @@ static __always_inline void cna_init_node(struct mcs_spinlock *node) struct cna_node *cn = (struct cna_node *)node;
cn->numa_node = cn->real_numa_node; + cn->start_time = 0; }
/* @@ -191,8 +214,14 @@ static void cna_splice_next(struct mcs_spinlock *node,
/* stick `next` on the secondary queue tail */ if (node->locked <= 1) { /* if secondary queue is empty */ + struct cna_node *cn = (struct cna_node *)node; + /* create secondary queue */ next->next = next; + + cn->start_time = local_clock(); + /* secondary queue is not empty iff start_time != 0 */ + WARN_ON(!cn->start_time); } else { /* add to the tail of the secondary queue */ struct mcs_spinlock *tail_2nd = decode_tail(node->locked); @@ -240,12 +269,18 @@ static int cna_order_queue(struct mcs_spinlock *node) static __always_inline u32 cna_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node) { - /* - * Try and put the time otherwise spent spin waiting on - * _Q_LOCKED_PENDING_MASK to use by sorting our lists. - */ - while (LOCK_IS_BUSY(lock) && !cna_order_queue(node)) - cpu_relax(); + struct cna_node *cn = (struct cna_node *)node; + + if (!cn->start_time || !intra_node_threshold_reached(cn)) { + /* + * Try and put the time otherwise spent spin waiting on + * _Q_LOCKED_PENDING_MASK to use by sorting our lists. + */ + while (LOCK_IS_BUSY(lock) && !cna_order_queue(node)) + cpu_relax(); + } else { + cn->start_time = FLUSH_SECONDARY_QUEUE; + }
return 0; /* we lied; we didn't wait, go do so now */ } @@ -253,24 +288,38 @@ static __always_inline u32 cna_wait_head_or_lock(struct qspinlock *lock, static inline void cna_lock_handoff(struct mcs_spinlock *node, struct mcs_spinlock *next) { + struct cna_node *cn = (struct cna_node *)node; u32 val = 1;
- if (node->locked > 1) { - struct cna_node *cn = (struct cna_node *)node; + if (cn->start_time != FLUSH_SECONDARY_QUEUE) { + if (node->locked > 1) { + val = node->locked; /* preseve secondary queue */ + + /* + * We have a local waiter, either real or fake one; + * reload @next in case it was changed by cna_order_queue(). + */ + next = node->next;
- val = node->locked; /* preseve secondary queue */ + /* + * Pass over NUMA node id of primary queue, to maintain the + * preference even if the next waiter is on a different node. + */ + ((struct cna_node *)next)->numa_node = cn->numa_node;
+ ((struct cna_node *)next)->start_time = cn->start_time; + } + } else { /* - * We have a local waiter, either real or fake one; - * reload @next in case it was changed by cna_order_queue(). + * We decided to flush the secondary queue; + * this can only happen if that queue is not empty. */ - next = node->next; - + WARN_ON(node->locked <= 1); /* - * Pass over NUMA node id of primary queue, to maintain the - * preference even if the next waiter is on a different node. + * Splice the secondary queue onto the primary queue and pass the lock + * to the longest waiting remote waiter. */ - ((struct cna_node *)next)->numa_node = cn->numa_node; + next = cna_splice_head(NULL, 0, node, next); }
arch_mcs_lock_handoff(&next->locked, val);
From: Alex Kogan alex.kogan@oracle.com
maillist inclusion category: feature bugzilla: 169576 https://gitee.com/openeuler/kernel/issues/I7LEG5
Reference: https://lore.kernel.org/linux-arm-kernel/20210514200743.3026725-6-alex.kogan...
--------------------------------
Prohibit moving certain threads (e.g., in irq and nmi contexts) to the secondary queue. Those prioritized threads will always stay in the primary queue, and so will have a shorter wait time for the lock.
Signed-off-by: Alex Kogan alex.kogan@oracle.com Reviewed-by: Steve Sistare steven.sistare@oracle.com Reviewed-by: Waiman Long longman@redhat.com Signed-off-by: Wei Li liwei391@huawei.com --- kernel/locking/qspinlock_cna.h | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-)
diff --git a/kernel/locking/qspinlock_cna.h b/kernel/locking/qspinlock_cna.h index 0b991c340fb1..ffc5c3301f0f 100644 --- a/kernel/locking/qspinlock_cna.h +++ b/kernel/locking/qspinlock_cna.h @@ -6,6 +6,7 @@ #include <linux/topology.h> #include <linux/sched/clock.h> #include <linux/moduleparam.h> +#include <linux/sched/rt.h>
/* * Implement a NUMA-aware version of MCS (aka CNA, or compact NUMA-aware lock). @@ -37,7 +38,8 @@ * running on the same NUMA node. If it is not, that waiter is detached from the * main queue and moved into the tail of the secondary queue. This way, we * gradually filter the primary queue, leaving only waiters running on the same - * preferred NUMA node. + * preferred NUMA node. Note that certain priortized waiters (e.g., in + * irq and nmi contexts) are excluded from being moved to the secondary queue. * * We change the NUMA node preference after a waiter at the head of the * secondary queue spins for a certain amount of time (1ms, by default). @@ -53,6 +55,8 @@
#define FLUSH_SECONDARY_QUEUE 1
+#define CNA_PRIORITY_NODE 0xffff + struct cna_node { struct mcs_spinlock mcs; u16 numa_node; @@ -111,9 +115,10 @@ static int __init cna_init_nodes(void)
static __always_inline void cna_init_node(struct mcs_spinlock *node) { + bool priority = !in_task() || irqs_disabled() || rt_task(current); struct cna_node *cn = (struct cna_node *)node;
- cn->numa_node = cn->real_numa_node; + cn->numa_node = priority ? CNA_PRIORITY_NODE : cn->real_numa_node; cn->start_time = 0; }
@@ -252,7 +257,7 @@ static int cna_order_queue(struct mcs_spinlock *node) numa_node = cn->numa_node; next_numa_node = ((struct cna_node *)next)->numa_node;
- if (next_numa_node != numa_node) { + if (next_numa_node != numa_node && next_numa_node != CNA_PRIORITY_NODE) { struct mcs_spinlock *nnext = READ_ONCE(next->next);
if (nnext) @@ -272,6 +277,13 @@ static __always_inline u32 cna_wait_head_or_lock(struct qspinlock *lock, struct cna_node *cn = (struct cna_node *)node;
if (!cn->start_time || !intra_node_threshold_reached(cn)) { + /* + * We are at the head of the wait queue, no need to use + * the fake NUMA node ID. + */ + if (cn->numa_node == CNA_PRIORITY_NODE) + cn->numa_node = cn->real_numa_node; + /* * Try and put the time otherwise spent spin waiting on * _Q_LOCKED_PENDING_MASK to use by sorting our lists.
From: Alex Kogan alex.kogan@oracle.com
maillist inclusion category: feature bugzilla: 169576 https://gitee.com/openeuler/kernel/issues/I7LEG5
Reference: https://lore.kernel.org/linux-arm-kernel/20210514200743.3026725-7-alex.kogan...
--------------------------------
This performance optimization chooses probabilistically to avoid moving threads from the main queue into the secondary one when the secondary queue is empty.
It is helpful when the lock is only lightly contended. In particular, it makes CNA less eager to create a secondary queue, but does not introduce any extra delays for threads waiting in that queue once it is created.
Signed-off-by: Alex Kogan alex.kogan@oracle.com Reviewed-by: Steve Sistare steven.sistare@oracle.com Reviewed-by: Waiman Long longman@redhat.com Signed-off-by: Wei Li liwei391@huawei.com --- kernel/locking/qspinlock_cna.h | 39 ++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+)
diff --git a/kernel/locking/qspinlock_cna.h b/kernel/locking/qspinlock_cna.h index ffc5c3301f0f..17d56c739e57 100644 --- a/kernel/locking/qspinlock_cna.h +++ b/kernel/locking/qspinlock_cna.h @@ -7,6 +7,7 @@ #include <linux/sched/clock.h> #include <linux/moduleparam.h> #include <linux/sched/rt.h> +#include <linux/random.h>
/* * Implement a NUMA-aware version of MCS (aka CNA, or compact NUMA-aware lock). @@ -76,6 +77,34 @@ static inline bool intra_node_threshold_reached(struct cna_node *cn) return current_time > threshold; }
+/* + * Controls the probability for enabling the ordering of the main queue + * when the secondary queue is empty. The chosen value reduces the amount + * of unnecessary shuffling of threads between the two waiting queues + * when the contention is low, while responding fast enough and enabling + * the shuffling when the contention is high. + */ +#define SHUFFLE_REDUCTION_PROB_ARG (7) + +/* Per-CPU pseudo-random number seed */ +static DEFINE_PER_CPU(u32, seed); + +/* + * Return false with probability 1 / 2^@num_bits. + * Intuitively, the larger @num_bits the less likely false is to be returned. + * @num_bits must be a number between 0 and 31. + */ +static bool probably(unsigned int num_bits) +{ + u32 s; + + s = this_cpu_read(seed); + s = next_pseudo_random32(s); + this_cpu_write(seed, s); + + return s & ((1 << num_bits) - 1); +} + static void __init cna_init_nodes_per_cpu(unsigned int cpu) { struct mcs_spinlock *base = per_cpu_ptr(&qnodes[0].mcs, cpu); @@ -276,6 +305,16 @@ static __always_inline u32 cna_wait_head_or_lock(struct qspinlock *lock, { struct cna_node *cn = (struct cna_node *)node;
+ if (node->locked <= 1 && probably(SHUFFLE_REDUCTION_PROB_ARG)) { + /* + * When the secondary queue is empty, skip the calls to + * cna_order_queue() below with high probability. This optimization + * reduces the overhead of unnecessary shuffling of threads + * between waiting queues when the lock is only lightly contended. + */ + return 0; + } + if (!cn->start_time || !intra_node_threshold_reached(cn)) { /* * We are at the head of the wait queue, no need to use
hulk inclusion category: feature bugzilla: 169576 https://gitee.com/openeuler/kernel/issues/I7LEG5
--------------------------------
Enabling CNA is controlled via a new configuration option (NUMA_AWARE_SPINLOCKS). Add it for arm64.
Signed-off-by: Wei Li liwei391@huawei.com --- arch/arm64/Kconfig | 16 ++++++++++++++++ arch/x86/kernel/alternative.c | 4 ---- mm/mempolicy.c | 4 ++++ 3 files changed, 20 insertions(+), 4 deletions(-)
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 343e1e1cae10..a302c82c182b 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1400,6 +1400,22 @@ config NODES_SHIFT Specify the maximum number of NUMA Nodes available on the target system. Increases memory reserved to accommodate various tables.
+config NUMA_AWARE_SPINLOCKS + bool "Numa-aware spinlocks" + depends on NUMA + depends on QUEUED_SPINLOCKS + depends on PARAVIRT_SPINLOCKS + default y + help + Introduce NUMA (Non Uniform Memory Access) awareness into + the slow path of spinlocks. + + In this variant of qspinlock, the kernel will try to keep the lock + on the same node, thus reducing the number of remote cache misses, + while trading some of the short term fairness for better performance. + + Say N if you want absolute first come first serve fairness. + source "kernel/Kconfig.hz"
config ARCH_SPARSEMEM_ENABLE diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 4074238e6a37..f615e0cb6d93 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -1371,10 +1371,6 @@ void __init alternative_instructions(void) */ paravirt_set_cap();
-#if defined(CONFIG_NUMA_AWARE_SPINLOCKS) - cna_configure_spin_lock_slowpath(); -#endif - /* * First patch paravirt functions, such that we overwrite the indirect * call with the direct call. diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 1756389a0609..40985c9d92d0 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2944,6 +2944,10 @@ void __init numa_policy_init(void) pr_err("%s: interleaving failed\n", __func__);
check_numabalancing_enable(); + +#if defined(CONFIG_NUMA_AWARE_SPINLOCKS) + cna_configure_spin_lock_slowpath(); +#endif }
/* Reset policy of current process to default */
hulk inclusion category: feature bugzilla: 169576 https://gitee.com/openeuler/kernel/issues/I7LEG5
--------------------------------
Disable CNA by default, this default behavior can be overridden with the kernel boot command-line option "numa_spinlock=on/off/auto".
Signed-off-by: Wei Li liwei391@huawei.com --- kernel/locking/qspinlock_cna.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/kernel/locking/qspinlock_cna.h b/kernel/locking/qspinlock_cna.h index 17d56c739e57..3983505c1118 100644 --- a/kernel/locking/qspinlock_cna.h +++ b/kernel/locking/qspinlock_cna.h @@ -378,9 +378,9 @@ static inline void cna_lock_handoff(struct mcs_spinlock *node,
/* * Constant (boot-param configurable) flag selecting the NUMA-aware variant - * of spinlock. Possible values: -1 (off) / 0 (auto, default) / 1 (on). + * of spinlock. Possible values: -1 (off, default) / 0 (auto) / 1 (on). */ -static int numa_spinlock_flag; +static int numa_spinlock_flag = -1;
static int __init numa_spinlock_setup(char *str) {
反馈: 您发送到kernel@openeuler.org的补丁/补丁集,已成功转换为PR! PR链接地址: https://gitee.com/openeuler/kernel/pulls/1431 邮件列表地址:https://mailweb.openeuler.org/hyperkitty/list/kernel@openeuler.org/message/M...
FeedBack: The patch(es) which you have sent to kernel@openeuler.org mailing list has been converted to a pull request successfully! Pull request link: https://gitee.com/openeuler/kernel/pulls/1431 Mailing list address: https://mailweb.openeuler.org/hyperkitty/list/kernel@openeuler.org/message/M...