hulk inclusion
category: feature
bugzilla: https://atomgit.com/openeuler/kernel/issues/9132

-------------------------------------------------------

Add __arch_copy_from_user_opt for selected HiSilicon cores. This path
copies data from user space with ldp/stp pairs using base-plus-offset
addressing (via the new user_ldst_pair_index uaccess macro), temporarily
disables PAN during the copy, and restores it on both the success path
and the exception fixup path. The new copy_template_opt.S implements the
>=64B bulk copy loop, while the head and tail bytes are handled by the
existing byte/half/word load/store macros.

The optimized routine is gated behind a new ARM64_HAS_COPY_OPT cpucap,
matched by MIDR, and can be disabled with the copy_opt_disable kernel
parameter.

Signed-off-by: Qi Xi <xiqi2@huawei.com>
---
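
Note (kept below the "---" so it is not part of the commit message): this
patch introduces the ARM64_HAS_COPY_OPT capability and the
__arch_copy_from_user_opt routine, but does not itself switch any caller
over to the new routine. A minimal sketch of how a C-level dispatcher could
look is shown below; raw_copy_from_user_dispatch() is a hypothetical name,
and the real arm64 uaccess wrappers also perform TTBR0/PAN bookkeeping that
is omitted here, so treat it as an illustration only.

	/* Illustrative sketch, not part of this patch. */
	unsigned long __must_check
	__arch_copy_from_user(void *to, const void __user *from, unsigned long n);
	unsigned long __must_check
	__arch_copy_from_user_opt(void *to, const void __user *from, unsigned long n);

	static inline unsigned long
	raw_copy_from_user_dispatch(void *to, const void __user *from,
				    unsigned long n)
	{
		/* ARM64_HAS_COPY_OPT is the cpucap added by this patch. */
		if (cpus_have_final_cap(ARM64_HAS_COPY_OPT))
			return __arch_copy_from_user_opt(to, from, n);

		return __arch_copy_from_user(to, from, n);
	}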

 .../admin-guide/kernel-parameters.txt |   6 +
 arch/arm64/include/asm/asm-uaccess.h  |   7 +
 arch/arm64/kernel/cpufeature.c        |  30 +++
 arch/arm64/lib/copy_from_user.S       |  32 +++
 arch/arm64/lib/copy_template_opt.S    | 189 ++++++++++++++++++
 arch/arm64/tools/cpucaps              |   2 +-
 6 files changed, 265 insertions(+), 1 deletion(-)
 create mode 100644 arch/arm64/lib/copy_template_opt.S

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index d72ab64a69b9..03f59c496cc1 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -718,6 +718,12 @@
 			which is located in node nid, if the allocation fails,
 			they will fallback to the global default memory area.
 
+	copy_opt_disable [ARM64]
+			Disable the optimized copy_from_user path on HiSilicon
+			CPUs that support it. By default the optimization is
+			enabled; this parameter forces all copies through
+			the standard unoptimized path.
+
 	cmo_free_hint=	[PPC] Format: { yes | no }
 			Specify whether pages are marked as being inactive
 			when they are freed.  This is used in CMO environments
diff --git a/arch/arm64/include/asm/asm-uaccess.h b/arch/arm64/include/asm/asm-uaccess.h
index 7bbebfa5b710..4edf6c7dc56f 100644
--- a/arch/arm64/include/asm/asm-uaccess.h
+++ b/arch/arm64/include/asm/asm-uaccess.h
@@ -94,4 +94,11 @@ alternative_else_nop_endif
 
 		_asm_extable_uaccess	8888b, \l;
 	.endm
+
+	.macro user_ldst_pair_index l, inst, reg1, reg2, addr, val
+8888:		\inst	\reg1, \reg2, [\addr, \val];
+
+		_asm_extable_uaccess	8888b, \l;
+	.endm
+
 #endif
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index a1928cf3c887..3f8889d5dd8c 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -2576,6 +2576,30 @@ static void cpu_enable_arch_xcall_xint(const struct arm64_cpu_capabilities *__un
 }
 #endif
 
+static bool copy_opt_disable __ro_after_init;
+
+static int __init parse_copy_opt_disable(char *str)
+{
+	copy_opt_disable = true;
+	return 0;
+}
+early_param("copy_opt_disable", parse_copy_opt_disable);
+
+static bool has_copy_opt(const struct arm64_cpu_capabilities *cap, int scope)
+{
+	/* List of CPUs that support __arch_copy_from_user_opt */
+	static const struct midr_range copy_opt_cpus[] = {
+		MIDR_ALL_VERSIONS(MIDR_HISI_HIP12),
+		MIDR_ALL_VERSIONS(MIDR_HISI_LINXICORE9100),
+		{ }
+	};
+
+	if (copy_opt_disable)
+		return false;
+
+	return is_midr_in_range_list(copy_opt_cpus);
+}
+
 static const struct arm64_cpu_capabilities arm64_features[] = {
 	{
 		.capability = ARM64_ALWAYS_BOOT,
@@ -3151,6 +3175,12 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
 		.cpu_enable = cpu_enable_arch_xcall_xint,
 	},
 #endif
+	{
+		.desc = "HiSilicon optimized copy_from_user",
+		.capability = ARM64_HAS_COPY_OPT,
+		.type = ARM64_CPUCAP_SYSTEM_FEATURE,
+		.matches = has_copy_opt,
+	},
 	{},
 };
 
diff --git a/arch/arm64/lib/copy_from_user.S b/arch/arm64/lib/copy_from_user.S
index 34e317907524..59f1a4f36982 100644
--- a/arch/arm64/lib/copy_from_user.S
+++ b/arch/arm64/lib/copy_from_user.S
@@ -71,3 +71,35 @@ USER(9998f, ldtrb tmp1w, [srcin])
 	ret
 SYM_FUNC_END(__arch_copy_from_user)
 EXPORT_SYMBOL(__arch_copy_from_user)
+
+	.macro ldp2 reg1, reg2, ptr, val
+	user_ldst_pair_index 9997f, ldp, \reg1, \reg2, \ptr, \val
+	.endm
+
+	.macro stp2 reg1, reg2, ptr, val
+	stp	\reg1, \reg2, [\ptr, \val]
+	.endm
+
+SYM_FUNC_START(__arch_copy_from_user_opt)
+	add	end, x0, x2
+	mov	srcin, x1
+
+	ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_HAS_PAN)
+
+#include "copy_template_opt.S"
+	mov	x0, #0				// Nothing to copy
+
+.L__arch_copy_from_user_opt_exit:
+	ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_HAS_PAN)
+	ret
+
+	// Exception fixups
+9997:	cmp	dst, dstin
+	b.ne	9998f
+	// Before being absolutely sure we couldn't copy anything, try harder
+USER(9998f, ldtrb tmp1w, [srcin])
+	strb	tmp1w, [dst], #1
+9998:	sub	x0, end, dst			// bytes not copied
+	b	.L__arch_copy_from_user_opt_exit
+SYM_FUNC_END(__arch_copy_from_user_opt)
+EXPORT_SYMBOL(__arch_copy_from_user_opt)
diff --git a/arch/arm64/lib/copy_template_opt.S b/arch/arm64/lib/copy_template_opt.S
new file mode 100644
index 000000000000..c4578ab8e399
--- /dev/null
+++ b/arch/arm64/lib/copy_template_opt.S
@@ -0,0 +1,189 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2026 - Huawei Ltd.
+ */
+
+/*
+ * Copy a buffer from src to dest (alignment handled by the hardware)
+ *
+ * Parameters:
+ *	x0 - dest
+ *	x1 - src
+ *	x2 - n
+ * Returns:
+ *	x0 - dest
+ */
+dstin	.req	x0
+src	.req	x1
+count	.req	x2
+tmp1	.req	x3
+tmp1w	.req	w3
+tmp2	.req	x4
+tmp2w	.req	w4
+dst	.req	x6
+
+A_l	.req	x7
+A_h	.req	x8
+B_l	.req	x9
+B_h	.req	x10
+C_l	.req	x11
+C_h	.req	x12
+D_l	.req	x13
+D_h	.req	x14
+
+	mov	dst, dstin
+	cmp	count, #16
+	/* When the length is less than 16, the accesses are not aligned. */
+	b.lo	.Ltiny15_opt
+
+	neg	tmp2, src
+	ands	tmp2, tmp2, #15		/* Bytes to reach alignment. */
+	b.eq	.LSrcAligned_opt
+	sub	count, count, tmp2
+	/*
+	 * Copy the leading memory data from src to dst in increasing
+	 * address order. This way, the risk of overwriting the source
+	 * memory data is eliminated when the distance between src and
+	 * dst is less than 16. The memory accesses here are aligned.
+	 */
+	tbz	tmp2, #0, 1f
+	ldrb1	tmp1w, src, #1
+	strb1	tmp1w, dst, #1
+1:
+	tbz	tmp2, #1, 2f
+	ldrh1	tmp1w, src, #2
+	strh1	tmp1w, dst, #2
+2:
+	tbz	tmp2, #2, 3f
+	ldr1	tmp1w, src, #4
+	str1	tmp1w, dst, #4
+3:
+	tbz	tmp2, #3, .LSrcAligned_opt
+	ldr1	tmp1, src, #8
+	str1	tmp1, dst, #8
+
+.LSrcAligned_opt:
+	cmp	count, #64
+	b.ge	.Lcpy_over64_opt
+	/*
+	 * Deal with small copies quickly by dropping straight into the
+	 * exit block.
+	 */
+.Ltail63_opt:
+	/*
+	 * Copy up to 48 bytes of data. At this point we only need the
+	 * bottom 6 bits of count to be accurate.
+	 */
+	ands	tmp1, count, #0x30
+	b.eq	.Ltiny15_opt
+	cmp	tmp1w, #0x20
+	b.eq	1f
+	b.lt	2f
+	ldp2	A_l, A_h, src, #0
+	stp2	A_l, A_h, dst, #0
+	add	src, src, #16
+	add	dst, dst, #16
+1:
+	ldp2	A_l, A_h, src, #0
+	stp2	A_l, A_h, dst, #0
+	add	src, src, #16
+	add	dst, dst, #16
+2:
+	ldp2	A_l, A_h, src, #0
+	stp2	A_l, A_h, dst, #0
+	add	src, src, #16
+	add	dst, dst, #16
+.Ltiny15_opt:
+	/*
+	 * Prefer to break one ldp/stp into several loads/stores that access
+	 * memory in increasing address order, rather than load/store 16
+	 * bytes from (src-16) to (dst-16) and move src back to an aligned
+	 * address, as the original Cortex memcpy does. If that process were
+	 * kept here, memmove would need to guarantee that src is at least
+	 * 16 bytes higher than dst, otherwise some source data would be
+	 * overwritten when memmove calls memcpy directly. To keep memmove
+	 * simple and decouple memcpy from memmove, the original process
+	 * was dropped.
+	 */
+	tbz	count, #3, 1f
+	ldr1	tmp1, src, #8
+	str1	tmp1, dst, #8
+1:
+	tbz	count, #2, 2f
+	ldr1	tmp1w, src, #4
+	str1	tmp1w, dst, #4
+2:
+	tbz	count, #1, 3f
+	ldrh1	tmp1w, src, #2
+	strh1	tmp1w, dst, #2
+3:
+	tbz	count, #0, .Lexitfunc_opt
+	ldrb1	tmp1w, src, #1
+	strb1	tmp1w, dst, #1
+
+	b	.Lexitfunc_opt
+
+.Lcpy_over64_opt:
+	subs	count, count, #128
+	b.ge	.Lcpy_body_large_opt
+	/*
+	 * Less than 128 bytes to copy, so handle 64 here and then jump
+	 * to the tail.
+	 */
+	ldp2	A_l, A_h, src, #0
+	stp2	A_l, A_h, dst, #0
+	ldp2	B_l, B_h, src, #16
+	ldp2	C_l, C_h, src, #32
+	stp2	B_l, B_h, dst, #16
+	stp2	C_l, C_h, dst, #32
+	ldp2	D_l, D_h, src, #48
+	stp2	D_l, D_h, dst, #48
+	add	src, src, #64
+	add	dst, dst, #64
+
+	tst	count, #0x3f
+	b.ne	.Ltail63_opt
+	b	.Lexitfunc_opt
+
+	/*
+	 * Critical loop. Start at a new cache line boundary. Assuming
+	 * 64 bytes per line this ensures the entire loop is in one line.
+	 */
+	.p2align	L1_CACHE_SHIFT
+.Lcpy_body_large_opt:
+
+	/* Pre-load the first 64 bytes of data. */
+	ldp2	A_l, A_h, src, #0
+	ldp2	B_l, B_h, src, #16
+	ldp2	C_l, C_h, src, #32
+	ldp2	D_l, D_h, src, #48
+	add	src, src, #64
+1:
+	/*
+	 * Interleave the load of the next 64-byte block with the store of
+	 * the previously loaded 64 bytes of data.
+	 */
+	stp2	A_l, A_h, dst, #0
+	ldp2	A_l, A_h, src, #0
+	stp2	B_l, B_h, dst, #16
+	ldp2	B_l, B_h, src, #16
+	stp2	C_l, C_h, dst, #32
+	ldp2	C_l, C_h, src, #32
+	stp2	D_l, D_h, dst, #48
+	ldp2	D_l, D_h, src, #48
+	add	dst, dst, #64
+	add	src, src, #64
+	subs	count, count, #64
+	b.ge	1b
+
+	/* Post-loop: store the last block of data using stp2 */
+	/* (without post-increment). */
+	stp2	A_l, A_h, dst, #0
+	stp2	B_l, B_h, dst, #16
+	stp2	C_l, C_h, dst, #32
+	stp2	D_l, D_h, dst, #48
+	add	dst, dst, #64
+
+	tst	count, #0x3f
+	b.ne	.Ltail63_opt
+.Lexitfunc_opt:
diff --git a/arch/arm64/tools/cpucaps b/arch/arm64/tools/cpucaps
index d8f2db273def..ce0c60d48dff 100644
--- a/arch/arm64/tools/cpucaps
+++ b/arch/arm64/tools/cpucaps
@@ -114,7 +114,7 @@ HAS_LS64
 HAS_LS64_V
 HAS_HW_XCALL_XINT
 WORKAROUND_PHYTIUM_FT3386
-KABI_RESERVE_7
+HAS_COPY_OPT
 KABI_RESERVE_8
 KABI_RESERVE_9
 KABI_RESERVE_10
-- 
2.33.0
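
For reference (not part of the patch itself): the exception fixup in
__arch_copy_from_user_opt returns the number of bytes that were not copied,
matching the usual copy_from_user() convention. A small C model of that
arithmetic, with hypothetical names chosen purely for illustration:

	/*
	 * "add end, x0, x2" computes end = dest + n on entry, and on a fault
	 * "sub x0, end, dst" reports how many bytes were still left to copy
	 * when the faulting access happened.
	 */
	static unsigned long fixup_bytes_not_copied(unsigned long dest,
						    unsigned long n,
						    unsigned long dst_at_fault)
	{
		unsigned long end = dest + n;		/* add end, x0, x2 */

		return end - dst_at_fault;		/* sub x0, end, dst */
	}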