hulk inclusion
category: performance
bugzilla: https://gitee.com/openeuler/kernel/issues/I9PXLD?from=project-issue
CVE: NA
--------------------------------
In the copy_from/to_user path, the number of bytes already copied must be known exactly for the fixup code, so src and dst are incremented after every single load/store. This creates read-after-write (RAW) dependencies on the address registers, which introduces bubbles into the pipeline and degrades IPC.
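For illustration only (the register numbers below are arbitrary and the snippet is not taken from the patch), the two addressing patterns differ roughly as follows: with post-increment every access serializes on the write-back of the address register, while plain offset addressing lets the accesses issue independently and bumps the pointer once per stage.

    /* post-increment: each load waits for the previous address write-back */
    ldr	x7, [x1], #8
    ldr	x8, [x1], #8

    /* offset addressing: loads are independent, one pointer update per stage */
    ldr	x7, [x1, #0]
    ldr	x8, [x1, #8]
    add	x1, x1, #16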
Instead, update the values of src and dst only once the copy for the current stage has completed, to reduce the data dependency. On its own this would break the fixup logic, so to keep fixup working a new macro, USER_OFF, is introduced: it records how many bytes the load/store at this stage has already covered, so the final number of bytes not copied can still be determined.
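As a rough sketch of the scheme (the FIXUP_OFFSET body below matches the patch; the concrete USER_OFF line is only an example), dst is advanced only after a whole stage completes, so the handler for an access at in-stage offset n reports (end - dst) - n bytes as not copied:

    USER_OFF(8, ldtr A_h, [src, #8])	// a fault here branches to fixup_offset_8

    fixup_offset_8:
    	sub	x0, end, dst	// bytes still outstanding when this stage began
    	sub	x0, x0, 8	// minus the 8 bytes already handled in this stage
    	ret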
The following are the UnixBench test results on HiSilicon KunPeng920:

                                         without this patch    with this patch
System Benchmarks Index Values                INDEX                 INDEX
Dhrystone 2 using register variables         3714.0                3770.5
Double-Precision Whetstone                    797.1                 797.1
Execl Throughput                              665.8                 663.9
File Copy 1024 bufsize 2000 maxblocks        2465.5                2435.5
File Copy 256 bufsize 500 maxblocks          1748.2                1724.4
File Copy 4096 bufsize 8000 maxblocks        3734.4                3634.8
Pipe Throughput                              1019.9                1022.0
Pipe-based Context Switching                  346.9                 394.0
Process Creation                              500.1                 512.0
Shell Scripts (1 concurrent)                 1495.8                1521.4
Shell Scripts (8 concurrent)                 5132.4                5202.2
System Call Overhead                          681.6                 697.2
                                           ========              ========
System Benchmarks Index Score                1325.3                1343.7
Signed-off-by: Li Zetao <lizetao1@huawei.com>
---
 arch/arm64/include/asm/asm-uaccess.h |  29 ----
 arch/arm64/lib/copy_from_user.S      | 241 +++++++++++++++++++++-----
 arch/arm64/lib/copy_to_user.S        | 242 ++++++++++++++++++++++-----
 3 files changed, 406 insertions(+), 106 deletions(-)
diff --git a/arch/arm64/include/asm/asm-uaccess.h b/arch/arm64/include/asm/asm-uaccess.h
index 7bbebfa5b7103..7a872c77c03ac 100644
--- a/arch/arm64/include/asm/asm-uaccess.h
+++ b/arch/arm64/include/asm/asm-uaccess.h
@@ -65,33 +65,4 @@ alternative_else_nop_endif
 9999:	x;					\
 	_asm_extable_uaccess	9999b, l
 
-/*
- * Generate the assembly for LDTR/STTR with exception table entries.
- * This is complicated as there is no post-increment or pair versions of the
- * unprivileged instructions, and USER() only works for single instructions.
- */
-	.macro user_ldp l, reg1, reg2, addr, post_inc
-8888:		ldtr	\reg1, [\addr];
-8889:		ldtr	\reg2, [\addr, #8];
-		add	\addr, \addr, \post_inc;
-
-		_asm_extable_uaccess	8888b, \l;
-		_asm_extable_uaccess	8889b, \l;
-	.endm
-
-	.macro user_stp l, reg1, reg2, addr, post_inc
-8888:		sttr	\reg1, [\addr];
-8889:		sttr	\reg2, [\addr, #8];
-		add	\addr, \addr, \post_inc;
-
-		_asm_extable_uaccess	8888b,\l;
-		_asm_extable_uaccess	8889b,\l;
-	.endm
-
-	.macro user_ldst l, inst, reg, addr, post_inc
-8888:		\inst		\reg, [\addr];
-		add		\addr, \addr, \post_inc;
-
-		_asm_extable_uaccess	8888b, \l;
-	.endm
 #endif
diff --git a/arch/arm64/lib/copy_from_user.S b/arch/arm64/lib/copy_from_user.S
index 34e3179075244..a80b8679c4b58 100644
--- a/arch/arm64/lib/copy_from_user.S
+++ b/arch/arm64/lib/copy_from_user.S
@@ -20,54 +20,219 @@
 *	x0 - bytes not copied
 */
 
-	.macro ldrb1 reg, ptr, val
-	user_ldst 9998f, ldtrb, \reg, \ptr, \val
-	.endm
+dstin	.req	x0
+end	.req	x5
+src	.req	x1
+srcin	.req	x15
+count	.req	x2
+tmp1	.req	x3
+tmp1w	.req	w3
+tmp2	.req	x4
+tmp2w	.req	w4
+dst	.req	x6
 
-	.macro strb1 reg, ptr, val
-	strb \reg, [\ptr], \val
-	.endm
+A_l	.req	x7
+A_h	.req	x8
+B_l	.req	x9
+B_h	.req	x10
+C_l	.req	x11
+C_h	.req	x12
+D_l	.req	x13
+D_h	.req	x14
 
-	.macro ldrh1 reg, ptr, val
-	user_ldst 9997f, ldtrh, \reg, \ptr, \val
-	.endm
+#define USER_OFF(off, x...)	USER(fixup_offset_##off, x)
+#define FIXUP_OFFSET(n)						\
+fixup_offset_##n:						\
+	sub	x0, end, dst;					\
+	sub	x0, x0, n;					\
+	ret
 
-	.macro strh1 reg, ptr, val
-	strh \reg, [\ptr], \val
-	.endm
+FIXUP_OFFSET(0)
+FIXUP_OFFSET(8)
+FIXUP_OFFSET(16)
+FIXUP_OFFSET(24)
+FIXUP_OFFSET(32)
+FIXUP_OFFSET(40)
+FIXUP_OFFSET(48)
+FIXUP_OFFSET(56)
 
-	.macro ldr1 reg, ptr, val
-	user_ldst 9997f, ldtr, \reg, \ptr, \val
-	.endm
+SYM_FUNC_START(__arch_copy_from_user)
+	add	end, x0, x2
+	mov	srcin, x1
 
-	.macro str1 reg, ptr, val
-	str \reg, [\ptr], \val
-	.endm
+	mov	dst, dstin
+	cmp	count, #16
+	/*When memory length is less than 16, the accessed are not aligned.*/
+	b.lo	.Ltiny15
 
-	.macro ldp1 reg1, reg2, ptr, val
-	user_ldp 9997f, \reg1, \reg2, \ptr, \val
-	.endm
+	neg	tmp2, src
+	ands	tmp2, tmp2, #15/* Bytes to reach alignment. */
+	b.eq	.LSrcAligned
+	sub	count, count, tmp2
+	/*
+	 * Copy the leading memory data from src to dst in an increasing
+	 * address order.By this way,the risk of overwriting the source
+	 * memory data is eliminated when the distance between src and
+	 * dst is less than 16. The memory accesses here are alignment.
+	 */
+	tbz	tmp2, #0, 1f
+USER_OFF(0, ldtrb tmp1w, [src, #0])
+	strb	tmp1w, [dst], #1
+	add	src, src, #1
+1:
+	tbz	tmp2, #1, 2f
+USER_OFF(0, ldtrh tmp1w, [src, #0])
+	strh	tmp1w, [dst], #2
+	add	src, src, #2
+2:
+	tbz	tmp2, #2, 3f
+USER_OFF(0, ldtr tmp1w, [src, #0])
+	str	tmp1w, [dst], #4
+	add	src, src, #4
+3:
+	tbz	tmp2, #3, .LSrcAligned
+USER_OFF(0, ldtr tmp1, [src, #0])
+	str	tmp1, [dst], #8
+	add	src, src, #8
 
-	.macro stp1 reg1, reg2, ptr, val
-	stp \reg1, \reg2, [\ptr], \val
-	.endm
+.LSrcAligned:
+	cmp	count, #64
+	b.ge	.Lcpy_over64
+	/*
+	 * Deal with small copies quickly by dropping straight into the
+	 * exit block.
+	 */
+.Ltail63:
+	/*
+	 * Copy up to 48 bytes of data. At this point we only need the
+	 * bottom 6 bits of count to be accurate.
+	 */
+	ands	tmp1, count, #0x30
+	b.eq	.Ltiny15
+	USER_OFF(0, ldtr A_l, [src, #0])
+	USER_OFF(8, ldtr A_h, [src, #8])
+	cmp	tmp1w, #0x20
+	b.eq	1f
+	b.lt	2f
+	stp	A_l, A_h, [dst], #16
+	add	src, src, #16
+	USER_OFF(0, ldtr A_l, [src, #0])
+	USER_OFF(8, ldtr A_h, [src, #8])
+1:
+	stp	A_l, A_h, [dst], #16
+	add	src, src, #16
+	USER_OFF(0, ldtr A_l, [src, #0])
+	USER_OFF(8, ldtr A_h, [src, #8])
+2:
+	stp	A_l, A_h, [dst], #16
+	add	src, src, #16
+.Ltiny15:
+	/*
+	 * Prefer to break one ldp/stp into several load/store to access
+	 * memory in an increasing address order,rather than to load/store 16
+	 * bytes from (src-16) to (dst-16) and to backward the src to aligned
+	 * address,which way is used in original cortex memcpy. If keeping
+	 * the original memcpy process here, memmove need to satisfy the
+	 * precondition that src address is at least 16 bytes bigger than dst
+	 * address,otherwise some source data will be overwritten when memove
+	 * call memcpy directly. To make memmove simpler and decouple the
+	 * memcpy's dependency on memmove, withdrew the original process.
+	 */
+	tbz	count, #3, 1f
+USER_OFF(0, ldtr tmp1, [src, #0])
+	str	tmp1, [dst], #8
+	add	src, src, #8
+1:
+	tbz	count, #2, 2f
+USER_OFF(0, ldtr tmp1w, [src, #0])
+	str	tmp1w, [dst], #4
+	add	src, src, #4
+2:
+	tbz	count, #1, 3f
+USER_OFF(0, ldtrh tmp1w, [src, #0])
+	strh	tmp1w, [dst], #2
+	add	src, src, #2
+3:
+	tbz	count, #0, .Lexitfunc
+USER_OFF(0, ldtrb tmp1w, [src, #0])
+	strb	tmp1w, [dst], #1
+	add	src, src, #1
 
-end	.req	x5
-srcin	.req	x15
-SYM_FUNC_START(__arch_copy_from_user)
-	add	end, x0, x2
-	mov	srcin, x1
-#include "copy_template.S"
+	b	.Lexitfunc
+
+.Lcpy_over64:
+	.p2align	L1_CACHE_SHIFT
+	USER_OFF(0, ldtr A_l, [src, #0])
+	USER_OFF(8, ldtr A_h, [src, #8])
+	subs	count, count, #128
+	b.ge	.Lcpy_body_large
+	/*
+	 * Less than 128 bytes to copy, so handle 64 here and then jump
+	 * to the tail.
+	 */
+	stp	A_l, A_h, [dst, #0]
+USER_OFF(16, ldtr B_l, [src, #16])
+USER_OFF(24, ldtr B_h, [src, #24])
+USER_OFF(32, ldtr C_l, [src, #32])
+USER_OFF(40, ldtr C_h, [src, #40])
+	stp	B_l, B_h, [dst, #16]
+	stp	C_l, C_h, [dst, #32]
+USER_OFF(48, ldtr D_l, [src, #48])
+USER_OFF(56, ldtr D_h, [src, #56])
+	add	src, src, #64
+	stp	D_l, D_h, [dst, #48]
+	add	dst, dst, #64
+
+	tst	count, #0x3f
+	b.ne	.Ltail63
+	b	.Lexitfunc
+
+	/*
+	 * Critical loop.  Start at a new cache line boundary.  Assuming
+	 * 64 bytes per line this ensures the entire loop is in one line.
+	 */
+.Lcpy_body_large:
+	/* pre-get 64 bytes data. */
+USER_OFF(16, ldtr B_l, [src, #16])
+USER_OFF(24, ldtr B_h, [src, #24])
+USER_OFF(32, ldtr C_l, [src, #32])
+USER_OFF(40, ldtr C_h, [src, #40])
+USER_OFF(48, ldtr D_l, [src, #48])
+USER_OFF(56, ldtr D_h, [src, #56])
+	add	src, src, #64
+
+1:
+	/*
+	 * interlace the load of next 64 bytes data block with store of the last
+	 * loaded 64 bytes data.
+	 */
+	stp	A_l, A_h, [dst, #0]
+USER_OFF(0, ldtr A_l, [src, #0])
+USER_OFF(8, ldtr A_h, [src, #8])
+	stp	B_l, B_h, [dst, #16]
+USER_OFF(16, ldtr B_l, [src, #16])
+USER_OFF(24, ldtr B_h, [src, #24])
+	stp	C_l, C_h, [dst, #32]
+USER_OFF(32, ldtr C_l, [src, #32])
+USER_OFF(40, ldtr C_h, [src, #40])
+	stp	D_l, D_h, [dst, #48]
+USER_OFF(48, ldtr D_l, [src, #48])
+	add	dst, dst, #64
+USER_OFF(56, ldtr D_h, [src, #56])
+	add	src, src, #64
+	subs	count, count, #64
+	b.ge	1b
+	stp	A_l, A_h, [dst, #0]
+	stp	B_l, B_h, [dst, #16]
+	stp	C_l, C_h, [dst, #32]
+	stp	D_l, D_h, [dst, #48]
+	add	dst, dst, #64
+
+	tst	count, #0x3f
+	b.ne	.Ltail63
+.Lexitfunc:
 	mov	x0, #0				// Nothing to copy
 	ret
 
-	// Exception fixups
-9997:	cmp	dst, dstin
-	b.ne	9998f
-	// Before being absolutely sure we couldn't copy anything, try harder
-USER(9998f, ldtrb tmp1w, [srcin])
-	strb	tmp1w, [dst], #1
-9998:	sub	x0, end, dst			// bytes not copied
-	ret
 SYM_FUNC_END(__arch_copy_from_user)
 EXPORT_SYMBOL(__arch_copy_from_user)
diff --git a/arch/arm64/lib/copy_to_user.S b/arch/arm64/lib/copy_to_user.S
index 2ac716c0d6d8c..7b69dece56f6d 100644
--- a/arch/arm64/lib/copy_to_user.S
+++ b/arch/arm64/lib/copy_to_user.S
@@ -19,55 +19,219 @@
 * Returns:
 *	x0 - bytes not copied
 */
-	.macro ldrb1 reg, ptr, val
-	KERNEL_ME_SAFE(9998f, ldrb \reg, [\ptr], \val)
-	.endm
 
-	.macro strb1 reg, ptr, val
-	user_ldst 9998f, sttrb, \reg, \ptr, \val
-	.endm
-
-	.macro ldrh1 reg, ptr, val
-	KERNEL_ME_SAFE(9998f, ldrh \reg, [\ptr], \val)
-	.endm
-
-	.macro strh1 reg, ptr, val
-	user_ldst 9997f, sttrh, \reg, \ptr, \val
-	.endm
-
-	.macro ldr1 reg, ptr, val
-	KERNEL_ME_SAFE(9998f, ldr \reg, [\ptr], \val)
-	.endm
+dstin	.req	x0
+src	.req	x1
+end	.req	x5
+srcin	.req	x15
+count	.req	x2
+tmp1	.req	x3
+tmp1w	.req	w3
+tmp2	.req	x4
+tmp2w	.req	w4
+dst	.req	x6
 
-	.macro str1 reg, ptr, val
-	user_ldst 9997f, sttr, \reg, \ptr, \val
-	.endm
+A_l	.req	x7
+A_h	.req	x8
+B_l	.req	x9
+B_h	.req	x10
+C_l	.req	x11
+C_h	.req	x12
+D_l	.req	x13
+D_h	.req	x14
 
-	.macro ldp1 reg1, reg2, ptr, val
-	KERNEL_ME_SAFE(9998f, ldp \reg1, \reg2, [\ptr], \val)
-	.endm
+#define USER_OFF(off, x...)	USER(fixup_offset_##off, x)
+#define FIXUP_OFFSET(n)						\
+fixup_offset_##n:						\
+	sub	x0, end, dst;					\
+	sub	x0, x0, n;					\
+	ret
 
-	.macro stp1 reg1, reg2, ptr, val
-	user_stp 9997f, \reg1, \reg2, \ptr, \val
-	.endm
+FIXUP_OFFSET(0)
+FIXUP_OFFSET(8)
+FIXUP_OFFSET(16)
+FIXUP_OFFSET(24)
+FIXUP_OFFSET(32)
+FIXUP_OFFSET(40)
+FIXUP_OFFSET(48)
+FIXUP_OFFSET(56)
 
-end	.req	x5
-srcin	.req	x15
 SYM_FUNC_START(__arch_copy_to_user)
 	add	end, x0, x2
 	mov	srcin, x1
-#include "copy_template.S"
+	mov	dst, dstin
+	cmp	count, #16
+	/*When memory length is less than 16, the accessed are not aligned.*/
+	b.lo	.Ltiny15
+
+	neg	tmp2, src
+	ands	tmp2, tmp2, #15/* Bytes to reach alignment. */
+	b.eq	.LSrcAligned
+	sub	count, count, tmp2
+	/*
+	 * Copy the leading memory data from src to dst in an increasing
+	 * address order.By this way,the risk of overwriting the source
+	 * memory data is eliminated when the distance between src and
+	 * dst is less than 16. The memory accesses here are alignment.
+	 */
+	tbz	tmp2, #0, 1f
+	ldrb	tmp1w, [src], #1
+USER_OFF(0, sttrb tmp1w, [dst, #0])
+	add	dst, dst, #1
+1:
+	tbz	tmp2, #1, 2f
+	ldrh	tmp1w, [src], #2
+USER_OFF(0, sttrh tmp1w, [dst, #0])
+	add	dst, dst, #2
+2:
+	tbz	tmp2, #2, 3f
+	ldr	tmp1w, [src], #4
+USER_OFF(0, sttr tmp1w, [dst, #0])
+	add	dst, dst, #4
+3:
+	tbz	tmp2, #3, .LSrcAligned
+	ldr	tmp1, [src], #8
+USER_OFF(0, sttr tmp1, [dst, #0])
+	add	dst, dst, #8
+
+.LSrcAligned:
+	cmp	count, #64
+	b.ge	.Lcpy_over64
+	/*
+	 * Deal with small copies quickly by dropping straight into the
+	 * exit block.
+	 */
+.Ltail63:
+	/*
+	 * Copy up to 48 bytes of data. At this point we only need the
+	 * bottom 6 bits of count to be accurate.
+	 */
+	ands	tmp1, count, #0x30
+	b.eq	.Ltiny15
+	ldp	A_l, A_h, [src], #16
+	cmp	tmp1w, #0x20
+	b.eq	1f
+	b.lt	2f
+USER_OFF(0, sttr A_l, [dst, #0])
+USER_OFF(8, sttr A_h, [dst, #8])
+	ldp	A_l, A_h, [src], #16
+	add	dst, dst, #16
+1:
+USER_OFF(0, sttr A_l, [dst, #0])
+USER_OFF(8, sttr A_h, [dst, #8])
+	ldp	A_l, A_h, [src], #16
+	add	dst, dst, #16
+2:
+USER_OFF(0, sttr A_l, [dst, #0])
+USER_OFF(8, sttr A_h, [dst, #8])
+	add	dst, dst, #16
+.Ltiny15:
+	/*
+	 * Prefer to break one ldp/stp into several load/store to access
+	 * memory in an increasing address order,rather than to load/store 16
+	 * bytes from (src-16) to (dst-16) and to backward the src to aligned
+	 * address,which way is used in original cortex memcpy. If keeping
+	 * the original memcpy process here, memmove need to satisfy the
+	 * precondition that src address is at least 16 bytes bigger than dst
+	 * address,otherwise some source data will be overwritten when memove
+	 * call memcpy directly. To make memmove simpler and decouple the
+	 * memcpy's dependency on memmove, withdrew the original process.
+	 */
+	tbz	count, #3, 1f
+	ldr	tmp1, [src], #8
+USER_OFF(0, sttr tmp1, [dst, #0])
+	add	dst, dst, #8
+1:
+	tbz	count, #2, 2f
+	ldr	tmp1w, [src], #4
+USER_OFF(0, sttr tmp1w, [dst, #0])
+	add	dst, dst, #4
+2:
+	tbz	count, #1, 3f
+	ldrh	tmp1w, [src], #2
+USER_OFF(0, sttrh tmp1w, [dst, #0])
+	add	dst, dst, #2
+3:
+	tbz	count, #0, .Lexitfunc
+	ldrb	tmp1w, [src], #1
+USER_OFF(0, sttrb tmp1w, [dst, #0])
+	add	dst, dst, #1
+
+	b	.Lexitfunc
+
+.Lcpy_over64:
+	.p2align	L1_CACHE_SHIFT
+	ldp	A_l, A_h, [src, #0]
+	subs	count, count, #128
+	b.ge	.Lcpy_body_large
+	/*
+	 * Less than 128 bytes to copy, so handle 64 here and then jump
+	 * to the tail.
+	 */
+USER_OFF(0, sttr A_l, [dst, #0])
+USER_OFF(8, sttr A_h, [dst, #8])
+	ldp	B_l, B_h, [src, #16]
+	ldp	C_l, C_h, [src, #32]
+USER_OFF(16, sttr B_l, [dst, #16])
+USER_OFF(24, sttr B_h, [dst, #24])
+USER_OFF(32, sttr C_l, [dst, #32])
+USER_OFF(40, sttr C_h, [dst, #40])
+	ldp	D_l, D_h, [src, #48]
+	add	src, src, #64
+USER_OFF(48, sttr D_l, [dst, #48])
+USER_OFF(56, sttr D_h, [dst, #56])
+	add	dst, dst, #64
+
+	tst	count, #0x3f
+	b.ne	.Ltail63
+	b	.Lexitfunc
+
+	/*
+	 * Critical loop.  Start at a new cache line boundary.  Assuming
+	 * 64 bytes per line this ensures the entire loop is in one line.
+	 */
+.Lcpy_body_large:
+	/* pre-get 64 bytes data. */
+	ldp	B_l, B_h, [src, #16]
+	ldp	C_l, C_h, [src, #32]
+	ldp	D_l, D_h, [src, #48]
+	add	src, src, #64
+1:
+	/*
+	 * interlace the load of next 64 bytes data block with store of the last
+	 * loaded 64 bytes data.
+	 */
+USER_OFF(0, sttr A_l, [dst, #0])
+USER_OFF(8, sttr A_h, [dst, #8])
+	ldp	A_l, A_h, [src, #0]
+USER_OFF(16, sttr B_l, [dst, #16])
+USER_OFF(24, sttr B_h, [dst, #24])
+	ldp	B_l, B_h, [src, #16]
+USER_OFF(32, sttr C_l, [dst, #32])
+USER_OFF(40, sttr C_h, [dst, #40])
+	ldp	C_l, C_h, [src, #32]
+USER_OFF(48, sttr D_l, [dst, #48])
+USER_OFF(56, sttr D_h, [dst, #56])
+	add	dst, dst, #64
+	ldp	D_l, D_h, [src, #48]
+	add	src, src, #64
+	subs	count, count, #64
+	b.ge	1b
+USER_OFF(0, sttr A_l, [dst, #0])
+USER_OFF(8, sttr A_h, [dst, #8])
+USER_OFF(16, sttr B_l, [dst, #16])
+USER_OFF(24, sttr B_h, [dst, #24])
+USER_OFF(32, sttr C_l, [dst, #32])
+USER_OFF(40, sttr C_h, [dst, #40])
+USER_OFF(48, sttr D_l, [dst, #48])
+USER_OFF(56, sttr D_h, [dst, #56])
+	add	dst, dst, #64
+
+	tst	count, #0x3f
+	b.ne	.Ltail63
+.Lexitfunc:
 	mov	x0, #0
 	ret
 
-	// Exception fixups
-9997:	cmp	dst, dstin
-	b.ne	9998f
-	// Before being absolutely sure we couldn't copy anything, try harder
-KERNEL_ME_SAFE(9998f, ldrb tmp1w, [srcin])
-USER(9998f, sttrb tmp1w, [dst])
-	add	dst, dst, #1
-9998:	sub	x0, end, dst			// bytes not copied
-	ret
 SYM_FUNC_END(__arch_copy_to_user)
 EXPORT_SYMBOL(__arch_copy_to_user)
FeedBack: The patch(es) you have sent to the kernel@openeuler.org mailing list have been converted to a pull request successfully!
Pull request link: https://gitee.com/openeuler/kernel/pulls/7473
Mailing list address: https://mailweb.openeuler.org/hyperkitty/list/kernel@openeuler.org/message/P...