hulk inclusion
category: feature
bugzilla: https://atomgit.com/openeuler/kernel/issues/9132

-------------------------------------------------------

Add __arch_copy_from_user_opt() for selected HiSilicon cores. It copies
data from user space with index-addressed ldp/stp pairs, temporarily
disabling PAN for the duration of the copy and restoring it on both the
success and the exception-fixup paths. The new copy_template_opt.S
handles the >= 64-byte bulk copy loop, while the tail is managed by the
existing byte/half/word load/store macros. Also add a
"copy_opt_disable" cmdline parameter to disable the optimization at
boot time.

Enable the optimized path in raw_copy_from_user() for copies >= 4KB on
CPUs with ARM64_HAS_COPY_OPT. The PAN toggle adds a small fixed
overhead, so only large copies are routed to the new path, where that
overhead is outweighed by the higher copy throughput.
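
For reference, the routing decision added to raw_copy_from_user() comes
down to the predicate below (condensed from the uaccess.h hunk in this
patch; the comment is added here for illustration only):

  #define COPY_OPT_THRESHOLD	4096

  /* Only copies of at least 4KB take the optimized path; smaller
   * copies keep using __arch_copy_from_user and never pay the PAN
   * toggle. */
  static __always_inline bool use_copy_opt(unsigned long n)
  {
  	return (n) >= COPY_OPT_THRESHOLD &&
  	       alternative_has_cap_unlikely(ARM64_HAS_COPY_OPT);
  }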
Signed-off-by: Qi Xi <xiqi2@huawei.com>
---
 .../admin-guide/kernel-parameters.txt |   6 +
 arch/arm64/include/asm/asm-uaccess.h  |   7 +
 arch/arm64/include/asm/uaccess.h      |  19 +-
 arch/arm64/kernel/cpufeature.c        |  31 +++
 arch/arm64/lib/copy_from_user.S       |  32 +++
 arch/arm64/lib/copy_template_opt.S    | 189 ++++++++++++++++++
 arch/arm64/tools/cpucaps              |   2 +-
 7 files changed, 283 insertions(+), 3 deletions(-)
 create mode 100644 arch/arm64/lib/copy_template_opt.S

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index d72ab64a69b9..03f59c496cc1 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -718,6 +718,12 @@
 			which is located in node nid, if the allocation fails,
 			they will fallback to the global default memory area.
 
+	copy_opt_disable [ARM64]
+			Disable the optimized copy_from_user path on Hisilicon
+			CPUs that support it. By default the optimization is
+			enabled; this parameter forces all copies through
+			the standard unoptimized path.
+
 	cmo_free_hint=	[PPC] Format: { yes | no }
 			Specify whether pages are marked as being inactive
 			when they are freed.  This is used in CMO environments
diff --git a/arch/arm64/include/asm/asm-uaccess.h b/arch/arm64/include/asm/asm-uaccess.h
index 7bbebfa5b710..4edf6c7dc56f 100644
--- a/arch/arm64/include/asm/asm-uaccess.h
+++ b/arch/arm64/include/asm/asm-uaccess.h
@@ -94,4 +94,11 @@ alternative_else_nop_endif
 
 		_asm_extable_uaccess	8888b, \l;
 	.endm
+
+	.macro user_ldst_pair_index l, inst, reg1, reg2, addr, val
+8888:		\inst	\reg1, \reg2, [\addr, \val];
+
+		_asm_extable_uaccess	8888b, \l;
+	.endm
+
 #endif
diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h
index dd0877a75922..8e2899b36b00 100644
--- a/arch/arm64/include/asm/uaccess.h
+++ b/arch/arm64/include/asm/uaccess.h
@@ -391,13 +391,28 @@ do {									\
 	} while (0);							\
 } while(0)
 
+#define COPY_OPT_THRESHOLD	4096
+
+static __always_inline bool use_copy_opt(unsigned long n)
+{
+	return (n) >= COPY_OPT_THRESHOLD &&
+	       alternative_has_cap_unlikely(ARM64_HAS_COPY_OPT);
+}
+
+extern unsigned long __must_check __arch_copy_from_user_opt(void *to,
+				const void __user *from, unsigned long n);
 extern unsigned long __must_check __arch_copy_from_user(void *to, const void __user *from, unsigned long n);
 #define raw_copy_from_user(to, from, n)					\
 ({									\
 	unsigned long __acfu_ret;					\
 	uaccess_ttbr0_enable();						\
-	__acfu_ret = __arch_copy_from_user((to),			\
-				      __uaccess_mask_ptr(from), (n));	\
+	if (use_copy_opt(n)) {						\
+		__acfu_ret = __arch_copy_from_user_opt((to),		\
+				      __uaccess_mask_ptr(from), (n));	\
+	} else {							\
+		__acfu_ret = __arch_copy_from_user((to),		\
+				      __uaccess_mask_ptr(from), (n));	\
+	}								\
 	uaccess_ttbr0_disable();					\
 	__acfu_ret;							\
 })
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index a1928cf3c887..0ad4b3730b64 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -2576,6 +2576,31 @@ static void cpu_enable_arch_xcall_xint(const struct arm64_cpu_capabilities *__un
 }
 #endif
 
+static bool copy_opt_disable __ro_after_init;
+
+static int __init parse_copy_opt_disable(char *str)
+{
+	copy_opt_disable = true;
+	return 0;
+}
+early_param("copy_opt_disable", parse_copy_opt_disable);
+
+static bool has_copy_opt(const struct arm64_cpu_capabilities *cap, int scope)
+{
+	/* List of CPUs that support copy_from_user_opt */
+	static const struct midr_range copy_opt_cpus[] = {
+		MIDR_ALL_VERSIONS(MIDR_HISI_HIP11),
+		MIDR_ALL_VERSIONS(MIDR_HISI_HIP12),
+		MIDR_ALL_VERSIONS(MIDR_HISI_LINXICORE9100),
+		{ }
+	};
+
+	if (copy_opt_disable)
+		return false;
+
+	return is_midr_in_range_list(copy_opt_cpus);
+}
+
 static const struct arm64_cpu_capabilities arm64_features[] = {
 	{
 		.capability = ARM64_ALWAYS_BOOT,
@@ -3151,6 +3176,12 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
 		.cpu_enable = cpu_enable_arch_xcall_xint,
 	},
 #endif
+	{
+		.desc = "Hisilicon Optimized Copy From User enabled",
+		.capability = ARM64_HAS_COPY_OPT,
+		.type = ARM64_CPUCAP_SYSTEM_FEATURE,
+		.matches = has_copy_opt,
+	},
 	{},
 };
 
diff --git a/arch/arm64/lib/copy_from_user.S b/arch/arm64/lib/copy_from_user.S
index 34e317907524..59f1a4f36982 100644
--- a/arch/arm64/lib/copy_from_user.S
+++ b/arch/arm64/lib/copy_from_user.S
@@ -71,3 +71,35 @@ USER(9998f, ldtrb tmp1w, [srcin])
 	ret
 SYM_FUNC_END(__arch_copy_from_user)
 EXPORT_SYMBOL(__arch_copy_from_user)
+
+	.macro ldp2 reg1, reg2, ptr, val
+	user_ldst_pair_index 9997f, ldp, \reg1, \reg2, \ptr, \val
+	.endm
+
+	.macro stp2 reg1, reg2, ptr, val
+	stp \reg1, \reg2, [\ptr, \val]
+	.endm
+
+SYM_FUNC_START(__arch_copy_from_user_opt)
+	add	end, x0, x2
+	mov	srcin, x1
+
+	ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_HAS_PAN)
+
+#include "copy_template_opt.S"
+	mov	x0, #0				// Nothing to copy
+
+.L__arch_copy_from_user_opt_exit:
+	ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_HAS_PAN)
+	ret
+
+	// Exception fixups
+9997:	cmp	dst, dstin
+	b.ne	9998f
+	// Before being absolutely sure we couldn't copy anything, try harder
+USER(9998f, ldtrb tmp1w, [srcin])
+	strb	tmp1w, [dst], #1
+9998:	sub	x0, end, dst			// bytes not copied
+	b	.L__arch_copy_from_user_opt_exit
+SYM_FUNC_END(__arch_copy_from_user_opt)
+EXPORT_SYMBOL(__arch_copy_from_user_opt)
diff --git a/arch/arm64/lib/copy_template_opt.S b/arch/arm64/lib/copy_template_opt.S
new file mode 100644
index 000000000000..4b57bfe6d337
--- /dev/null
+++ b/arch/arm64/lib/copy_template_opt.S
@@ -0,0 +1,189 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2026 - Huawei Ltd.
+ */
+
+/*
+ * Copy a buffer from src to dest (alignment handled by the hardware)
+ *
+ * Parameters:
+ *	x0 - dest
+ *	x1 - src
+ *	x2 - n
+ * Returns:
+ *	x0 - dest
+ */
+dstin	.req	x0
+src	.req	x1
+count	.req	x2
+tmp1	.req	x3
+tmp1w	.req	w3
+tmp2	.req	x4
+tmp2w	.req	w4
+dst	.req	x6
+
+A_l	.req	x7
+A_h	.req	x8
+B_l	.req	x9
+B_h	.req	x10
+C_l	.req	x11
+C_h	.req	x12
+D_l	.req	x13
+D_h	.req	x14
+
+	mov	dst, dstin
+	cmp	count, #16
+	/*When memory length is less than 16, the accesses are not aligned.*/
+	b.lo	.Ltiny15_opt
+
+	neg	tmp2, src
+	ands	tmp2, tmp2, #15/* Bytes to reach alignment. */
+	b.eq	.LSrcAligned_opt
+	sub	count, count, tmp2
+	/*
+	 * Copy the leading memory data from src to dst in an increasing
+	 * address order.By this way,the risk of overwriting the source
+	 * memory data is eliminated when the distance between src and
+	 * dst is less than 16. The memory accesses here are alignment.
+	 */
+	tbz	tmp2, #0, 1f
+	ldrb1	tmp1w, src, #1
+	strb1	tmp1w, dst, #1
+1:
+	tbz	tmp2, #1, 2f
+	ldrh1	tmp1w, src, #2
+	strh1	tmp1w, dst, #2
+2:
+	tbz	tmp2, #2, 3f
+	ldr1	tmp1w, src, #4
+	str1	tmp1w, dst, #4
+3:
+	tbz	tmp2, #3, .LSrcAligned_opt
+	ldr1	tmp1, src, #8
+	str1	tmp1, dst, #8
+
+.LSrcAligned_opt:
+	cmp	count, #64
+	b.ge	.Lcpy_over64_opt
+	/*
+	 * Deal with small copies quickly by dropping straight into the
+	 * exit block.
+	 */
+.Ltail63_opt:
+	/*
+	 * Copy up to 48 bytes of data. At this point we only need the
+	 * bottom 6 bits of count to be accurate.
+	 */
+	ands	tmp1, count, #0x30
+	b.eq	.Ltiny15_opt
+	cmp	tmp1w, #0x20
+	b.eq	1f
+	b.lt	2f
+	ldp2	A_l, A_h, src, #0
+	stp2	A_l, A_h, dst, #0
+	add	src, src, #16
+	add	dst, dst, #16
+1:
+	ldp2	A_l, A_h, src, #0
+	stp2	A_l, A_h, dst, #0
+	add	src, src, #16
+	add	dst, dst, #16
+2:
+	ldp2	A_l, A_h, src, #0
+	stp2	A_l, A_h, dst, #0
+	add	src, src, #16
+	add	dst, dst, #16
+.Ltiny15_opt:
+	/*
+	 * Prefer to break one ldp/stp into several load/store to access
+	 * memory in an increasing address order,rather than to load/store 16
+	 * bytes from (src-16) to (dst-16) and to backward the src to aligned
+	 * address,which way is used in original cortex memcpy. If keeping
+	 * the original memcpy process here, memmove need to satisfy the
+	 * precondition that src address is at least 16 bytes bigger than dst
+	 * address,otherwise some source data will be overwritten when memove
+	 * call memcpy directly. To make memmove simpler and decouple the
+	 * memcpy's dependency on memmove, withdrew the original process.
+	 */
+	tbz	count, #3, 1f
+	ldr1	tmp1, src, #8
+	str1	tmp1, dst, #8
+1:
+	tbz	count, #2, 2f
+	ldr1	tmp1w, src, #4
+	str1	tmp1w, dst, #4
+2:
+	tbz	count, #1, 3f
+	ldrh1	tmp1w, src, #2
+	strh1	tmp1w, dst, #2
+3:
+	tbz	count, #0, .Lexitfunc_opt
+	ldrb1	tmp1w, src, #1
+	strb1	tmp1w, dst, #1
+
+	b	.Lexitfunc_opt
+
+.Lcpy_over64_opt:
+	subs	count, count, #128
+	b.ge	.Lcpy_body_large_opt
+	/*
+	 * Less than 128 bytes to copy, so handle 64 here and then jump
+	 * to the tail.
+	 */
+	ldp2	A_l, A_h, src, #0
+	stp2	A_l, A_h, dst, #0
+	ldp2	B_l, B_h, src, #16
+	ldp2	C_l, C_h, src, #32
+	stp2	B_l, B_h, dst, #16
+	stp2	C_l, C_h, dst, #32
+	ldp2	D_l, D_h, src, #48
+	stp2	D_l, D_h, dst, #48
+	add	src, src, #64
+	add	dst, dst, #64
+
+	tst	count, #0x3f
+	b.ne	.Ltail63_opt
+	b	.Lexitfunc_opt
+
+	/*
+	 * Critical loop. Start at a new cache line boundary. Assuming
+	 * 64 bytes per line this ensures the entire loop is in one line.
+	 */
+	.p2align	L1_CACHE_SHIFT
+.Lcpy_body_large_opt:
+
+	/* pre-get 64 bytes data. */
+	ldp2	A_l, A_h, src, #0
+	ldp2	B_l, B_h, src, #16
+	ldp2	C_l, C_h, src, #32
+	ldp2	D_l, D_h, src, #48
+	add	src, src, #64
+1:
+	/*
+	 * interlace the load of next 64 bytes data block with store of the last
+	 * loaded 64 bytes data.
+	 */
+	stp2	A_l, A_h, dst, #0
+	ldp2	A_l, A_h, src, #0
+	stp2	B_l, B_h, dst, #16
+	ldp2	B_l, B_h, src, #16
+	stp2	C_l, C_h, dst, #32
+	ldp2	C_l, C_h, src, #32
+	stp2	D_l, D_h, dst, #48
+	ldp2	D_l, D_h, src, #48
+	add	dst, dst, #64
+	add	src, src, #64
+	subs	count, count, #64
+	b.ge	1b
+
+	/* Post-loop: store the last block of data using stp2 */
+	/* (without post-increment) */
+	stp2	A_l, A_h, dst, #0
+	stp2	B_l, B_h, dst, #16
+	stp2	C_l, C_h, dst, #32
+	stp2	D_l, D_h, dst, #48
+	add	dst, dst, #64
+
+	tst	count, #0x3f
+	b.ne	.Ltail63_opt
+.Lexitfunc_opt:
diff --git a/arch/arm64/tools/cpucaps b/arch/arm64/tools/cpucaps
index d8f2db273def..ce0c60d48dff 100644
--- a/arch/arm64/tools/cpucaps
+++ b/arch/arm64/tools/cpucaps
@@ -114,7 +114,7 @@ HAS_LS64
 HAS_LS64_V
 HAS_HW_XCALL_XINT
 WORKAROUND_PHYTIUM_FT3386
-KABI_RESERVE_7
+HAS_COPY_OPT
 KABI_RESERVE_8
 KABI_RESERVE_9
 KABI_RESERVE_10
-- 
2.33.0