tools/include/* has a lot of abstraction layers for building kernel code from userspace, so reuse or add to the abstraction layers in tools/include/ to build ptr_ring for ringtest testing.
The same abstraction layers can be used to build ptr_ring for the ptr_ring benchmark app too, see [1].
1. https://lkml.org/lkml/2021/7/1/275
V2:
1. Rebased on Eugenio's patchset and split patch 1 into more reviewable ones.
2. Only add the interfaces used by ringtest, so that the added code can be built and tested.
3. cpu_relax() only supports x86 and arm64 for now.
4. Use 64 bytes as the default SMP_CACHE_BYTES.
Yunsheng Lin (4):
  tools headers UAPI: add cache aligning related macro
  tools headers UAPI: add kmalloc/vmalloc related interface
  tools headers UAPI: add cpu_relax() implementation for x86 and arm64
  tools/virtio: use common infrastructure to build ptr_ring.h
 tools/include/asm/processor.h    |  26 ++++++++++
 tools/include/linux/cache.h      |  25 ++++++++++
 tools/include/linux/gfp.h        |   2 +
 tools/include/linux/slab.h       |  46 ++++++++++++++++++
 tools/virtio/ringtest/Makefile   |   2 +-
 tools/virtio/ringtest/main.h     |  99 +++----------------------------
 tools/virtio/ringtest/ptr_ring.c | 101 ++-------------------------------
 7 files changed, 109 insertions(+), 192 deletions(-)
 create mode 100644 tools/include/asm/processor.h
 create mode 100644 tools/include/linux/cache.h
 create mode 100644 tools/include/linux/slab.h
The ____cacheline_aligned_in_smp macro, which is used by the ptr_ring lib, is needed to avoid cache bouncing on SMP systems.
So add the related macros in order to build ptr_ring from user space.
As SMP_CACHE_BYTES is 64 bytes on arm64 and most x86 systems, use 64 bytes as the default SMP_CACHE_BYTES if it is not already defined.
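For illustration only (not part of the patch), this is roughly how the macro is meant to be used; the struct and field names below are made up, but the layout mirrors what ptr_ring does with its producer and consumer indices:

#include <linux/cache.h>

/* Hypothetical example: force the producer- and consumer-side fields onto
 * separate cache lines, so a CPU touching only one side does not bounce
 * the cache line the other side is using. */
struct demo_ring {
	int producer ____cacheline_aligned_in_smp;	/* written by producer CPU */
	int consumer ____cacheline_aligned_in_smp;	/* written by consumer CPU */
	void **queue;
};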
Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
---
 tools/include/linux/cache.h | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)
 create mode 100644 tools/include/linux/cache.h
diff --git a/tools/include/linux/cache.h b/tools/include/linux/cache.h
new file mode 100644
index 0000000..df04307
--- /dev/null
+++ b/tools/include/linux/cache.h
@@ -0,0 +1,25 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __TOOLS_LINUX__CACHE_H
+#define __TOOLS_LINUX__CACHE_H
+
+#ifndef CONFIG_SMP
+#define CONFIG_SMP 1
+#endif
+
+#ifndef SMP_CACHE_BYTES
+#define SMP_CACHE_BYTES 64
+#endif
+
+#ifndef ____cacheline_aligned
+#define ____cacheline_aligned __attribute__((__aligned__(SMP_CACHE_BYTES)))
+#endif
+
+#ifndef ____cacheline_aligned_in_smp
+#ifdef CONFIG_SMP
+#define ____cacheline_aligned_in_smp ____cacheline_aligned
+#else
+#define ____cacheline_aligned_in_smp
+#endif /* CONFIG_SMP */
+#endif
+
+#endif /* __LINUX_CACHE_H */
Implement the kmalloc/vmalloc related interfaces based on the malloc interface in user space.
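For illustration only (not part of the patch), a sketch of how the shims are expected to be used; it assumes a translation unit that, like the ringtest code, already pulls in the libc headers (<malloc.h>, <string.h>) and the gfp_t type that slab.h relies on:

#include <linux/slab.h>

/* Zero-initialised, SMP_CACHE_BYTES-aligned allocation via the shims:
 * kcalloc() -> kmalloc_array() -> kmalloc() with __GFP_ZERO set. */
static int *alloc_counters(size_t n)
{
	return kcalloc(n, sizeof(int), 0);	/* release with kfree() */
}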
Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
---
 tools/include/linux/gfp.h  |  2 ++
 tools/include/linux/slab.h | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 48 insertions(+)
 create mode 100644 tools/include/linux/slab.h
diff --git a/tools/include/linux/gfp.h b/tools/include/linux/gfp.h
index 2203075..a660ab9 100644
--- a/tools/include/linux/gfp.h
+++ b/tools/include/linux/gfp.h
@@ -1,4 +1,6 @@
 #ifndef _TOOLS_INCLUDE_LINUX_GFP_H
 #define _TOOLS_INCLUDE_LINUX_GFP_H
 
+#define __GFP_ZERO 0x100u
+
 #endif /* _TOOLS_INCLUDE_LINUX_GFP_H */
diff --git a/tools/include/linux/slab.h b/tools/include/linux/slab.h
new file mode 100644
index 0000000..f0b7da6
--- /dev/null
+++ b/tools/include/linux/slab.h
@@ -0,0 +1,46 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __TOOLS_LINUX_SLAB_H
+#define __TOOLS_LINUX_SLAB_H
+
+#include <linux/gfp.h>
+#include <linux/cache.h>
+
+static inline void *kmalloc(size_t size, gfp_t gfp)
+{
+	void *p;
+
+	p = memalign(SMP_CACHE_BYTES, size);
+	if (!p)
+		return p;
+
+	if (gfp & __GFP_ZERO)
+		memset(p, 0, size);
+
+	return p;
+}
+
+static inline void *kzalloc(size_t size, gfp_t flags)
+{
+	return kmalloc(size, flags | __GFP_ZERO);
+}
+
+static inline void *kmalloc_array(size_t n, size_t size, gfp_t flags)
+{
+	return kmalloc(n * size, flags);
+}
+
+static inline void *kcalloc(size_t n, size_t size, gfp_t flags)
+{
+	return kmalloc_array(n, size, flags | __GFP_ZERO);
+}
+
+static inline void kfree(void *p)
+{
+	free(p);
+}
+
+#define kvmalloc_array kmalloc_array
+#define kvfree kfree
+#define KMALLOC_MAX_SIZE SIZE_MAX
+
+#endif
As x86 and arm64 are the only two systems I can build and test the cpu_relax() implementation on, only add a cpu_relax() implementation for x86 and arm64; other arches can be added easily when needed.
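For illustration only (not part of the patch), the kind of busy-wait loop cpu_relax() is intended for, built against tools/include:

#include <stdbool.h>
#include <asm/processor.h>

static volatile bool ready;

/* Spin until another thread sets 'ready'.  cpu_relax() expands to PAUSE on
 * x86 and YIELD on arm64, and its "memory" clobber also acts as a compiler
 * barrier within the loop. */
static void wait_for_ready(void)
{
	while (!ready)
		cpu_relax();
}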
Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
---
 tools/include/asm/processor.h | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)
 create mode 100644 tools/include/asm/processor.h
diff --git a/tools/include/asm/processor.h b/tools/include/asm/processor.h
new file mode 100644
index 0000000..f9b3902
--- /dev/null
+++ b/tools/include/asm/processor.h
@@ -0,0 +1,26 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef __TOOLS_LINUX_ASM_PROCESSOR_H
+#define __TOOLS_LINUX_ASM_PROCESSOR_H
+
+#if defined(__i386__) || defined(__x86_64__)
+/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */
+static __always_inline void rep_nop(void)
+{
+	asm volatile("rep; nop" ::: "memory");
+}
+
+static __always_inline void cpu_relax(void)
+{
+	rep_nop();
+}
+#elif defined(__aarch64__)
+static inline void cpu_relax(void)
+{
+	asm volatile("yield" ::: "memory");
+}
+#else
+#error "Architecture not supported"
+#endif
+
+#endif
From: Yunsheng Lin
> Sent: 20 July 2021 03:22
>
> As x86 and arm64 are the only two systems I can build and test the
> cpu_relax() implementation on, only add a cpu_relax() implementation for
> x86 and arm64; other arches can be added easily when needed.
...
> +#if defined(__i386__) || defined(__x86_64__)
> +/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */
> +static __always_inline void rep_nop(void)
> +{
> +	asm volatile("rep; nop" ::: "memory");
> +}
Beware, Intel increased the stall for 'rep nop' in some recent cpus to, IIRC, about 200 cycles.
They even document that this might have a detrimental effect. It is basically far too long for the sort of thing it makes sense to busy-wait for.
David
On 2021/7/22 4:53, David Laight wrote:
> From: Yunsheng Lin
>> Sent: 20 July 2021 03:22
>>
>> As x86 and arm64 are the only two systems I can build and test the
>> cpu_relax() implementation on, only add a cpu_relax() implementation for
>> x86 and arm64; other arches can be added easily when needed.
> ...
>> +#if defined(__i386__) || defined(__x86_64__)
>> +/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */
>> +static __always_inline void rep_nop(void)
>> +{
>> +	asm volatile("rep; nop" ::: "memory");
>> +}
>
> Beware, Intel increased the stall for 'rep nop' in some recent cpus to,
> IIRC, about 200 cycles.
> They even document that this might have a detrimental effect. It is
> basically far too long for the sort of thing it makes sense to busy-wait
> for.
Thanks for the info :) I will be wary of that when playing with 'rep nop' on newer x86 cpus.
>> Beware, Intel increased the stall for 'rep nop' in some recent cpus to,
>> IIRC, about 200 cycles.
>> They even document that this might have a detrimental effect. It is
>> basically far too long for the sort of thing it makes sense to busy-wait
>> for.
>
> Thanks for the info :) I will be wary of that when playing with 'rep nop'
> on newer x86 cpus.
See 8.4.7 Pause Latency in Skylake Microarchitecture in the Intel® 64 and IA-32 Architectures Optimization Reference Manual:
The latency of PAUSE instruction in prior generation microarchitecture is about 10 cycles, whereas on Skylake microarchitecture it has been extended to as many as 140 cycles.
An earlier section does explain why you need pause though. One of its effects is to stop the cpu speculatively executing multiple iterations of the wait loop - each with its own pending read of the memory location that is being looked at. Unwinding that isn't free - and was particularly expensive on P4 Netburst - what a surprise, they ran everything except benchmark loops very slowly.
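For illustration only, a rough sketch to ballpark that cost (x86 only; assumes __rdtsc() and _mm_pause() from the compiler's <x86intrin.h>, and that the TSC rate is close to the core clock):

#include <stdio.h>
#include <x86intrin.h>

int main(void)
{
	const unsigned long long iters = 1000000;
	unsigned long long start = __rdtsc();
	unsigned long long i;

	/* _mm_pause() emits the same PAUSE ('rep; nop') instruction. */
	for (i = 0; i < iters; i++)
		_mm_pause();

	printf("~%llu TSC ticks per pause\n", (__rdtsc() - start) / iters);
	return 0;
}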
David
Use the common infrastructure in tools/include to build ptr_ring.h in user space.
Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
---
 tools/virtio/ringtest/Makefile   |   2 +-
 tools/virtio/ringtest/main.h     |  99 +++----------------------------
 tools/virtio/ringtest/ptr_ring.c | 101 ++-------------------------------
 3 files changed, 10 insertions(+), 192 deletions(-)
diff --git a/tools/virtio/ringtest/Makefile b/tools/virtio/ringtest/Makefile
index 85c98c2..89fc024 100644
--- a/tools/virtio/ringtest/Makefile
+++ b/tools/virtio/ringtest/Makefile
@@ -3,7 +3,7 @@ all:
 
 all: ring virtio_ring_0_9 virtio_ring_poll virtio_ring_inorder ptr_ring noring
 
-CFLAGS += -Wall
+CFLAGS += -Wall -I../../include
 CFLAGS += -pthread -O2 -ggdb -flto -fwhole-program
 LDFLAGS += -pthread -O2 -ggdb -flto -fwhole-program
diff --git a/tools/virtio/ringtest/main.h b/tools/virtio/ringtest/main.h
index 6d1fccd..26a8659 100644
--- a/tools/virtio/ringtest/main.h
+++ b/tools/virtio/ringtest/main.h
@@ -10,6 +10,12 @@
 
 #include <stdbool.h>
 
+#include <asm/barrier.h>
+#include <asm/processor.h>
+
+#define smp_acquire smp_rmb
+#define smp_release smp_wmb
+
 extern int param;
 
 extern bool do_exit;
@@ -87,18 +93,6 @@ void wait_for_call(void);
 
 extern unsigned ring_size;
 
-/* Compiler barrier - similar to what Linux uses */
-#define barrier() asm volatile("" ::: "memory")
-
-/* Is there a portable way to do this? */
-#if defined(__x86_64__) || defined(__i386__)
-#define cpu_relax() asm ("rep; nop" ::: "memory")
-#elif defined(__s390x__)
-#define cpu_relax() barrier()
-#else
-#define cpu_relax() assert(0)
-#endif
-
 extern bool do_relax;
 
 static inline void busy_wait(void)
@@ -110,85 +104,4 @@ static inline void busy_wait(void)
 	barrier();
 }
 
-#if defined(__x86_64__) || defined(__i386__)
-#define smp_mb() asm volatile("lock; addl $0,-132(%%rsp)" ::: "memory", "cc")
-#else
-/*
- * Not using __ATOMIC_SEQ_CST since gcc docs say they are only synchronized
- * with other __ATOMIC_SEQ_CST calls.
- */
-#define smp_mb() __sync_synchronize()
-#endif
-
-/*
- * This abuses the atomic builtins for thread fences, and
- * adds a compiler barrier.
- */
-#define smp_release() do { \
-	barrier(); \
-	__atomic_thread_fence(__ATOMIC_RELEASE); \
-} while (0)
-
-#define smp_acquire() do { \
-	__atomic_thread_fence(__ATOMIC_ACQUIRE); \
-	barrier(); \
-} while (0)
-
-#if defined(__i386__) || defined(__x86_64__) || defined(__s390x__)
-#define smp_wmb() barrier()
-#else
-#define smp_wmb() smp_release()
-#endif
-
-#ifdef __alpha__
-#define smp_read_barrier_depends() smp_acquire()
-#else
-#define smp_read_barrier_depends() do {} while(0)
-#endif
-
-static __always_inline
-void __read_once_size(const volatile void *p, void *res, int size)
-{
-	switch (size) { \
-	case 1: *(unsigned char *)res = *(volatile unsigned char *)p; break; \
-	case 2: *(unsigned short *)res = *(volatile unsigned short *)p; break; \
-	case 4: *(unsigned int *)res = *(volatile unsigned int *)p; break; \
-	case 8: *(unsigned long long *)res = *(volatile unsigned long long *)p; break; \
-	default: \
-		barrier(); \
-		__builtin_memcpy((void *)res, (const void *)p, size); \
-		barrier(); \
-	} \
-}
-
-static __always_inline void __write_once_size(volatile void *p, void *res, int size)
-{
-	switch (size) {
-	case 1: *(volatile unsigned char *)p = *(unsigned char *)res; break;
-	case 2: *(volatile unsigned short *)p = *(unsigned short *)res; break;
-	case 4: *(volatile unsigned int *)p = *(unsigned int *)res; break;
-	case 8: *(volatile unsigned long long *)p = *(unsigned long long *)res; break;
-	default:
-		barrier();
-		__builtin_memcpy((void *)p, (const void *)res, size);
-		barrier();
-	}
-}
-
-#define READ_ONCE(x) \
-({ \
-	union { typeof(x) __val; char __c[1]; } __u; \
-	__read_once_size(&(x), __u.__c, sizeof(x)); \
-	smp_read_barrier_depends(); /* Enforce dependency ordering from x */ \
-	__u.__val; \
-})
-
-#define WRITE_ONCE(x, val) \
-({ \
-	union { typeof(x) __val; char __c[1]; } __u = \
-		{ .__val = (typeof(x)) (val) }; \
-	__write_once_size(&(x), __u.__c, sizeof(x)); \
-	__u.__val; \
-})
-
 #endif
diff --git a/tools/virtio/ringtest/ptr_ring.c b/tools/virtio/ringtest/ptr_ring.c
index c9b2633..e9849a3 100644
--- a/tools/virtio/ringtest/ptr_ring.c
+++ b/tools/virtio/ringtest/ptr_ring.c
@@ -10,104 +10,9 @@
 #include <errno.h>
 #include <limits.h>
-#define SMP_CACHE_BYTES 64
-#define cache_line_size() SMP_CACHE_BYTES
-#define ____cacheline_aligned_in_smp __attribute__ ((aligned (SMP_CACHE_BYTES)))
-#define unlikely(x) (__builtin_expect(!!(x), 0))
-#define likely(x) (__builtin_expect(!!(x), 1))
-#define ALIGN(x, a) (((x) + (a) - 1) / (a) * (a))
-#define SIZE_MAX (~(size_t)0)
-#define KMALLOC_MAX_SIZE SIZE_MAX
-
-typedef pthread_spinlock_t spinlock_t;
-
-typedef int gfp_t;
-#define __GFP_ZERO 0x1
-
-static void *kmalloc(unsigned size, gfp_t gfp)
-{
-	void *p = memalign(64, size);
-	if (!p)
-		return p;
-
-	if (gfp & __GFP_ZERO)
-		memset(p, 0, size);
-	return p;
-}
-
-static inline void *kzalloc(unsigned size, gfp_t flags)
-{
-	return kmalloc(size, flags | __GFP_ZERO);
-}
-
-static inline void *kmalloc_array(size_t n, size_t size, gfp_t flags)
-{
-	if (size != 0 && n > SIZE_MAX / size)
-		return NULL;
-	return kmalloc(n * size, flags);
-}
-
-static inline void *kcalloc(size_t n, size_t size, gfp_t flags)
-{
-	return kmalloc_array(n, size, flags | __GFP_ZERO);
-}
-
-static void kfree(void *p)
-{
-	if (p)
-		free(p);
-}
-
-#define kvmalloc_array kmalloc_array
-#define kvfree kfree
-
-static void spin_lock_init(spinlock_t *lock)
-{
-	int r = pthread_spin_init(lock, 0);
-	assert(!r);
-}
-
-static void spin_lock(spinlock_t *lock)
-{
-	int ret = pthread_spin_lock(lock);
-	assert(!ret);
-}
-
-static void spin_unlock(spinlock_t *lock)
-{
-	int ret = pthread_spin_unlock(lock);
-	assert(!ret);
-}
-
-static void spin_lock_bh(spinlock_t *lock)
-{
-	spin_lock(lock);
-}
-
-static void spin_unlock_bh(spinlock_t *lock)
-{
-	spin_unlock(lock);
-}
-
-static void spin_lock_irq(spinlock_t *lock)
-{
-	spin_lock(lock);
-}
-
-static void spin_unlock_irq(spinlock_t *lock)
-{
-	spin_unlock(lock);
-}
-
-static void spin_lock_irqsave(spinlock_t *lock, unsigned long f)
-{
-	spin_lock(lock);
-}
-
-static void spin_unlock_irqrestore(spinlock_t *lock, unsigned long f)
-{
-	spin_unlock(lock);
-}
+#include <linux/cache.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
 
 #include "../../../include/linux/ptr_ring.h"
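As a sketch only (not part of the patch), a single-threaded sanity check that could be dropped into ptr_ring.c above, since that file already pulls in everything ptr_ring.h needs; it uses the public ptr_ring_init()/ptr_ring_produce()/ptr_ring_consume()/ptr_ring_cleanup() API and assumes <assert.h> is among the existing includes:

static void ptr_ring_smoke_test(void)
{
	struct ptr_ring ring;
	int v = 42;

	assert(!ptr_ring_init(&ring, 16, 0));	/* 16-slot ring; 0 gfp flags are fine with the shims */
	assert(!ptr_ring_produce(&ring, &v));	/* 0 on success, -ENOSPC when full */
	assert(ptr_ring_consume(&ring) == &v);	/* returns the pointer just produced */
	ptr_ring_cleanup(&ring, NULL);		/* no per-entry destructor needed */
}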