---
.../admin-guide/kernel-parameters.txt | 3 +
arch/arm64/Kconfig | 11 ++
arch/arm64/include/asm/asm-uaccess.h | 13 ++
arch/arm64/include/asm/cputype.h | 2 +
arch/arm64/include/asm/uaccess.h | 21 ++-
arch/arm64/kernel/cpufeature.c | 51 +++++
arch/arm64/lib/copy_from_user.S | 33 ++++
arch/arm64/lib/copy_template_opt.S | 175 ++++++++++++++++++
arch/arm64/tools/cpucaps | 4 +-
9 files changed, 309 insertions(+), 4 deletions(-)
create mode 100644 arch/arm64/lib/copy_template_opt.S
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index b443a9665e03..e2adafc903c1 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -423,6 +423,9 @@
arm64.nomops [ARM64] Unconditionally disable Memory Copy and Memory
Set instructions support
+ copy_opt_disable [ARM64] Disable the optimized copy_from_user()
+ implementation used on supported HiSilicon CPUs
+
arm64.nomte [ARM64] Unconditionally disable Memory Tagging Extension
support
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 81d2baafdcd6..757d4bbe4251 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -1317,6 +1317,14 @@ config HISILICON_ERRATUM_165010801
system watchdog hardlockup detection might be triggered. The arch_timer
driver addresses this by proactively increasing affected interrupt priorities.
+config ARM64_COPY_FROM_USER_OPT
+ bool "Optimized copy_from_user for Hisilicon CPUs"
+ depends on ARCH_HISI
+ default y
+ help
+ Enable an optimized copy_from_user implementation for Hisilicon
+ CPUs that benefit from LDP instruction based copy routines.
+
config QCOM_FALKOR_ERRATUM_1003
bool "Falkor E1003: Incorrect translation due to ASID change"
default y
@@ -2061,6 +2069,9 @@ config ARM64_PAN
config AS_HAS_LSE_ATOMICS
def_bool $(as-instr,.arch_extension lse)
+config AS_HAS_LSUI
+ def_bool $(as-instr,.arch_extension lsui)
+
config ARM64_LSE_ATOMICS
bool
default ARM64_USE_LSE_ATOMICS
diff --git a/arch/arm64/include/asm/asm-uaccess.h b/arch/arm64/include/asm/asm-uaccess.h
index 7bbebfa5b710..342e77b0d7f2 100644
--- a/arch/arm64/include/asm/asm-uaccess.h
+++ b/arch/arm64/include/asm/asm-uaccess.h
@@ -94,4 +94,17 @@ alternative_else_nop_endif
_asm_extable_uaccess 8888b, \l;
.endm
+
+ /*
+ * Load \reg1/\reg2 from [\addr, \val] with a plain LDP (no writeback) and
+ * emit a uaccess exception-table entry pairing that load with label \l.
+ * NOTE(review): LDP is a privileged access; confirm callers make user
+ * memory accessible to privileged loads (PAN) before using this on user
+ * pointers.
+ */
+ .macro user_ldpair l, reg1, reg2, addr, val
+8888: ldp \reg1, \reg2, [\addr, \val];
+
+ _asm_extable_uaccess 8888b, \l;
+ .endm
+
+ /*
+ * Same shape as user_ldpair, but the load is an LDTP, assembled after
+ * switching on the "lsui" architecture extension.
+ * NOTE(review): ".arch_extension lsui" is emitted unconditionally; confirm
+ * the assembler supports it (cf. CONFIG_AS_HAS_LSUI) wherever this macro
+ * is expanded.
+ */
+ .macro user_ldtpair l, reg1, reg2, addr, val
+8888: .arch_extension lsui
+ ldtp \reg1, \reg2, [\addr, \val];
+
+ _asm_extable_uaccess 8888b, \l;
+ .endm
#endif
diff --git a/arch/arm64/include/asm/cputype.h b/arch/arm64/include/asm/cputype.h
index 490c2ac36ac0..16c4d867f199 100644
--- a/arch/arm64/include/asm/cputype.h
+++ b/arch/arm64/include/asm/cputype.h
@@ -142,6 +142,7 @@
#define HISI_CPU_PART_TSV110 0xD01
#define HISI_CPU_PART_LINXICORE9100 0xD02
+#define HISI_CPU_PART_HIP11 0xD22
#define HISI_CPU_PART_HIP12 0xD06
#define APPLE_CPU_PART_M1_ICESTORM 0x022
@@ -230,6 +231,7 @@
#define MIDR_FUJITSU_A64FX MIDR_CPU_MODEL(ARM_CPU_IMP_FUJITSU, FUJITSU_CPU_PART_A64FX)
#define MIDR_HISI_TSV110 MIDR_CPU_MODEL(ARM_CPU_IMP_HISI, HISI_CPU_PART_TSV110)
#define MIDR_HISI_LINXICORE9100 MIDR_CPU_MODEL(ARM_CPU_IMP_HISI, HISI_CPU_PART_LINXICORE9100)
+#define MIDR_HISI_HIP11 MIDR_CPU_MODEL(ARM_CPU_IMP_HISI, HISI_CPU_PART_HIP11)
#define MIDR_HISI_HIP12 MIDR_CPU_MODEL(ARM_CPU_IMP_HISI, HISI_CPU_PART_HIP12)
#define MIDR_APPLE_M1_ICESTORM MIDR_CPU_MODEL(ARM_CPU_IMP_APPLE, APPLE_CPU_PART_M1_ICESTORM)
#define MIDR_APPLE_M1_FIRESTORM MIDR_CPU_MODEL(ARM_CPU_IMP_APPLE, APPLE_CPU_PART_M1_FIRESTORM)
diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h
index dd0877a75922..ed307e1425b4 100644
--- a/arch/arm64/include/asm/uaccess.h
+++ b/arch/arm64/include/asm/uaccess.h
@@ -392,12 +392,29 @@ do { \
} while(0)
extern unsigned long __must_check __arch_copy_from_user(void *to, const void __user *from, unsigned long n);
+/*
+ * Declared unconditionally: raw_copy_from_user() names this symbol even when
+ * CONFIG_ARM64_COPY_FROM_USER_OPT is off. In that configuration use_copy_opt()
+ * is constant false, so the call is dead code and no definition is needed.
+ */
+extern unsigned long __must_check __arch_copy_from_user_opt(void *to,
+ const void __user *from, unsigned long n);
+#ifdef CONFIG_ARM64_COPY_FROM_USER_OPT
+#define COPY_OPT_THRESHOLD 4096
+/*
+ * Decide whether raw_copy_from_user() should call the optimized routine for a
+ * copy of @n bytes: always when ARM64_HAS_LSUI is set, otherwise only when
+ * ARM64_HAS_COPY_OPT is set and @n is at least COPY_OPT_THRESHOLD.
+ * NOTE(review): the LSUI case is not gated by the "copy_opt_disable" boot
+ * parameter - confirm this is intended.
+ */
+static __always_inline bool use_copy_opt(unsigned long n)
+{
+ if (alternative_has_cap_unlikely(ARM64_HAS_LSUI))
+ return true;
+ return alternative_has_cap_unlikely(ARM64_HAS_COPY_OPT) && n >= COPY_OPT_THRESHOLD;
+}
+#else
+/* Optimized copy not built: never select it. */
+static __always_inline bool use_copy_opt(unsigned long n) { return false; }
+#endif
+/*
+ * raw_copy_from_user(): copy @n bytes from user pointer @from to @to and
+ * evaluate to the callee's return value. @n is captured once so that the
+ * size test in use_copy_opt() and the copy itself see the same value.
+ */
#define raw_copy_from_user(to, from, n) \
({ \
unsigned long __acfu_ret; \
+ unsigned long __acfu_n = (n); \
uaccess_ttbr0_enable(); \
- __acfu_ret = __arch_copy_from_user((to), \
- __uaccess_mask_ptr(from), (n)); \
+ if (use_copy_opt(__acfu_n)) \
+ __acfu_ret = __arch_copy_from_user_opt((to), \
+ __uaccess_mask_ptr(from), __acfu_n); \
+ else \
+ __acfu_ret = __arch_copy_from_user((to), \
+ __uaccess_mask_ptr(from), __acfu_n); \
uaccess_ttbr0_disable(); \
__acfu_ret; \
})
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index c8e3f16387cb..1ed2850ba72d 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -2338,6 +2338,37 @@ static void cpu_enable_dit(const struct arm64_cpu_capabilities *__unused)
set_pstate_dit(1);
}
+#ifdef CONFIG_ARM64_COPY_FROM_USER_OPT
+/* Set when "copy_opt_disable" is present on the kernel command line. */
+static bool copy_opt_disable __ro_after_init;
+
+/*
+ * early_param handler for "copy_opt_disable". It only records the request in
+ * copy_opt_disable; the argument string is ignored. Always returns 0.
+ */
+static int __init copy_opt_disable_param(char *str)
+{
+ copy_opt_disable = true;
+ return 0;
+}
+early_param("copy_opt_disable", copy_opt_disable_param);
+
+/*
+ * .matches callback for ARM64_HAS_COPY_OPT.
+ *
+ * Returns false when "copy_opt_disable" was given on the command line;
+ * otherwise returns the result of is_midr_in_range_list() for the HiSilicon
+ * parts listed below. @entry and @scope are not used.
+ */
+static bool has_copy_opt(const struct arm64_cpu_capabilities *entry, int scope)
+{
+ static const struct midr_range copy_opt_cpus[] = {
+ MIDR_ALL_VERSIONS(MIDR_HISI_LINXICORE9100),
+ MIDR_ALL_VERSIONS(MIDR_HISI_HIP11),
+ MIDR_ALL_VERSIONS(MIDR_HISI_HIP12),
+ { /* sentinel */ }
+ };
+
+ if (copy_opt_disable)
+ return false;
+
+ /* TODO: the original patch left an unspecified "todo" on this line. */
+ return is_midr_in_range_list(copy_opt_cpus);
+}
+
+/*
+ * .cpu_enable hook for ARM64_HAS_COPY_OPT. It only logs that the optimized
+ * path is in use. The message is printed once, because the hook is invoked
+ * on every CPU that enables the capability and would otherwise repeat the
+ * same line per CPU.
+ */
+static void cpu_enable_copy_opt(const struct arm64_cpu_capabilities *__unused)
+{
+ pr_info_once("copy_from_user: optimized implementation enabled\n");
+}
+#endif
+
static void cpu_enable_mops(const struct arm64_cpu_capabilities *__unused)
{
sysreg_clear_set(sctlr_el1, 0, SCTLR_EL1_MSCEn);
@@ -3154,6 +3185,26 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
.matches = has_arch_xcall_xint_support,
.cpu_enable = cpu_enable_arch_xcall_xint,
},
+#endif
+ {
+ /*
+ * NOTE(review): this entry tests ID_AA64ISAR2_EL1[7:4]. The Arm ARM
+ * documents that field as RPRES and places the LSUI field at
+ * ID_AA64ISAR3_EL1[27:24] - confirm the register and field position.
+ */
+ .desc = "Unprivileged Load Store Instructions",
+ .capability = ARM64_HAS_LSUI,
+ .type = ARM64_CPUCAP_SYSTEM_FEATURE,
+ .sys_reg = SYS_ID_AA64ISAR2_EL1,
+ .sign = FTR_UNSIGNED,
+ .field_pos = 4,
+ .field_width = 4,
+ .min_field_value = 1,
+ .matches = has_cpuid_feature,
+ },
+#ifdef CONFIG_ARM64_COPY_FROM_USER_OPT
+ {
+ .desc = "Optimized copy_from_user",
+ .capability = ARM64_HAS_COPY_OPT,
+ .type = ARM64_CPUCAP_BOOT_CPU_FEATURE,
+ .matches = has_copy_opt,
+ .cpu_enable = cpu_enable_copy_opt,
+ },
#endif
{},
};
diff --git a/arch/arm64/lib/copy_from_user.S b/arch/arm64/lib/copy_from_user.S
index 34e317907524..d090836aa9c0 100644
--- a/arch/arm64/lib/copy_from_user.S
+++ b/arch/arm64/lib/copy_from_user.S
@@ -52,6 +52,18 @@
stp \reg1, \reg2, [\ptr], \val
.endm
+ /*
+ * ldp2: load \reg1/\reg2 from the user source at [\ptr, \val] without
+ * writeback; a fault on the load is routed to label 9997.
+ *
+ * When the assembler knows the "lsui" extension, the plain LDP is
+ * patched to LDTP on CPUs with ARM64_HAS_LSUI. Otherwise only the plain
+ * LDP form is emitted, so that toolchains without LSUI support can still
+ * build this file.
+ */
+ .macro ldp2 reg1, reg2, ptr, val
+#ifdef CONFIG_AS_HAS_LSUI
+ alternative_if_not ARM64_HAS_LSUI
+ user_ldpair 9997f, \reg1, \reg2, \ptr, \val
+ alternative_else
+ user_ldtpair 9997f, \reg1, \reg2, \ptr, \val
+ alternative_endif
+#else
+ user_ldpair 9997f, \reg1, \reg2, \ptr, \val
+#endif
+ .endm
+
+ /*
+ * stp2: store \reg1/\reg2 at [\ptr, \val] with a plain STP. Unlike the
+ * post-indexed store macro above, \ptr is not written back, and no
+ * exception-table entry is emitted for the store.
+ */
+ .macro stp2 reg1, reg2, ptr, val
+ stp \reg1, \reg2, [\ptr, \val]
+ .endm
+
end .req x5
srcin .req x15
SYM_FUNC_START(__arch_copy_from_user)
@@ -71,3 +83,24 @@ USER(9998f, ldtrb tmp1w, [srcin])
ret
SYM_FUNC_END(__arch_copy_from_user)
EXPORT_SYMBOL(__arch_copy_from_user)
+
+#ifdef CONFIG_ARM64_COPY_FROM_USER_OPT
+/*
+ * __arch_copy_from_user_opt: copy_from_user variant whose body is
+ * copy_template_opt.S.
+ *
+ * Parameters:
+ * x0 - to
+ * x1 - from
+ * x2 - n
+ * Returns:
+ * x0 - 0 when the template runs to completion, otherwise end - dst
+ * ("bytes not copied") as computed by the fixup path below.
+ */
+SYM_FUNC_START(__arch_copy_from_user_opt)
+ add end, x0, x2 // end = to + n, consumed by the fixup path
+ mov srcin, x1 // keep the original source pointer for the fixup path
+
+#include "copy_template_opt.S"
+ mov x0, #0 // Nothing to copy
+ ret
+
+ // Exception fixups
+9997: cmp dst, dstin // has dst moved past the start of the buffer?
+ b.ne 9998f
+ // Before being absolutely sure we couldn't copy anything, try harder
+USER(9998f, ldtrb tmp1w, [srcin])
+ strb tmp1w, [dst], #1
+9998: sub x0, end, dst // bytes not copied
+ ret
+SYM_FUNC_END(__arch_copy_from_user_opt)
+EXPORT_SYMBOL(__arch_copy_from_user_opt)
+#endif
diff --git a/arch/arm64/lib/copy_template_opt.S b/arch/arm64/lib/copy_template_opt.S
new file mode 100644
index 000000000000..df6c2be11dc0
--- /dev/null
+++ b/arch/arm64/lib/copy_template_opt.S
@@ -0,0 +1,175 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2026 - Huawei Ltd.
+ */
+
+/*
+ * Copy a buffer from src to dest (alignment handled by the hardware)
+ *
+ * Parameters:
+ * x0 - dest
+ * x1 - src
+ * x2 - n
+ * Returns:
+ * x0 - dest
+ *
+ * Note: register aliases (dstin, src, count, tmp1, etc.) are defined by
+ * copy_template.S which is included earlier in copy_from_user.S.
+ */
+
+ mov dst, dstin
+ cmp count, #16
+ /* Fewer than 16 bytes: skip the source alignment step below. */
+ b.lo .Ltiny15_opt
+
+ neg tmp2, src
+ ands tmp2, tmp2, #15/* Bytes needed to bring src to 16-byte alignment. */
+ b.eq .LSrcAligned_opt
+ sub count, count, tmp2
+ /*
+ * Copy the leading bytes from src to dst in increasing address
+ * order. This way, the risk of overwriting source data is
+ * eliminated when the distance between src and dst is less
+ * than 16. The memory accesses here are aligned.
+ */
+ tbz tmp2, #0, 1f
+ ldrb1 tmp1w, src, #1
+ strb1 tmp1w, dst, #1
+1:
+ tbz tmp2, #1, 2f
+ ldrh1 tmp1w, src, #2
+ strh1 tmp1w, dst, #2
+2:
+ tbz tmp2, #2, 3f
+ ldr1 tmp1w, src, #4
+ str1 tmp1w, dst, #4
+3:
+ tbz tmp2, #3, .LSrcAligned_opt
+ ldr1 tmp1, src, #8
+ str1 tmp1, dst, #8
+
+.LSrcAligned_opt:
+ cmp count, #64
+ b.ge .Lcpy_over64_opt
+ /*
+ * Fewer than 64 bytes remain: handle small copies quickly by
+ * dropping straight into the exit block.
+ */
+.Ltail63_opt:
+ /*
+ * Copy up to 48 bytes of data in 16-byte chunks. At this point only
+ * the bottom 6 bits of count need to be accurate.
+ */
+ ands tmp1, count, #0x30
+ b.eq .Ltiny15_opt
+ cmp tmp1w, #0x20
+ b.eq 1f // 0x20: two 16-byte chunks
+ b.lt 2f // 0x10: one 16-byte chunk
+ ldp2 A_l, A_h, src, #0 // 0x30: three 16-byte chunks
+ stp2 A_l, A_h, dst, #0
+ add src, src, #16
+ add dst, dst, #16
+1:
+ ldp2 A_l, A_h, src, #0
+ stp2 A_l, A_h, dst, #0
+ add src, src, #16
+ add dst, dst, #16
+2:
+ ldp2 A_l, A_h, src, #0
+ stp2 A_l, A_h, dst, #0
+ add src, src, #16
+ add dst, dst, #16
+.Ltiny15_opt:
+ /*
+ * Split what could be one ldp/stp into several loads/stores so that
+ * memory is accessed in increasing address order, rather than loading
+ * and storing 16 bytes from (src-16) to (dst-16) and moving src back
+ * to an aligned address, as the original cortex memcpy does. Keeping
+ * that approach would require memmove to guarantee that the src
+ * address is at least 16 bytes above the dst address; otherwise some
+ * source data would be overwritten when memmove calls memcpy
+ * directly. The original sequence was dropped to keep memmove simple
+ * and to decouple memcpy from memmove.
+ */
+ tbz count, #3, 1f
+ ldr1 tmp1, src, #8
+ str1 tmp1, dst, #8
+1:
+ tbz count, #2, 2f
+ ldr1 tmp1w, src, #4
+ str1 tmp1w, dst, #4
+2:
+ tbz count, #1, 3f
+ ldrh1 tmp1w, src, #2
+ strh1 tmp1w, dst, #2
+3:
+ tbz count, #0, .Lexitfunc_opt
+ ldrb1 tmp1w, src, #1
+ strb1 tmp1w, dst, #1
+
+ b .Lexitfunc_opt
+
+.Lcpy_over64_opt:
+ subs count, count, #128 // count is biased by -128 from here on
+ b.ge .Lcpy_body_large_opt
+ /*
+ * Fewer than 128 bytes to copy, so handle 64 here and then jump
+ * to the tail.
+ */
+ ldp2 A_l, A_h, src, #0
+ stp2 A_l, A_h, dst, #0
+ ldp2 B_l, B_h, src, #16
+ ldp2 C_l, C_h, src, #32
+ stp2 B_l, B_h, dst, #16
+ stp2 C_l, C_h, dst, #32
+ ldp2 D_l, D_h, src, #48
+ stp2 D_l, D_h, dst, #48
+ add src, src, #64
+ add dst, dst, #64
+
+ tst count, #0x3f // low 6 bits still give the 0..63 byte remainder
+ b.ne .Ltail63_opt
+ b .Lexitfunc_opt
+
+ /*
+ * Critical loop. Start at a new cache line boundary. Assuming
+ * 64 bytes per line, this ensures the entire loop is in one line.
+ */
+ .p2align L1_CACHE_SHIFT
+.Lcpy_body_large_opt:
+
+ /* Pre-load the first 64 bytes of data. */
+ ldp2 A_l, A_h, src, #0
+ ldp2 B_l, B_h, src, #16
+ ldp2 C_l, C_h, src, #32
+ ldp2 D_l, D_h, src, #48
+ add src, src, #64
+1:
+ /*
+ * Interleave the load of the next 64-byte block with the store of the
+ * previously loaded 64 bytes.
+ */
+ stp2 A_l, A_h, dst, #0
+ ldp2 A_l, A_h, src, #0
+ stp2 B_l, B_h, dst, #16
+ ldp2 B_l, B_h, src, #16
+ stp2 C_l, C_h, dst, #32
+ ldp2 C_l, C_h, src, #32
+ stp2 D_l, D_h, dst, #48
+ ldp2 D_l, D_h, src, #48
+ add dst, dst, #64
+ add src, src, #64
+ subs count, count, #64
+ b.ge 1b
+
+ /* Post-loop: store the last loaded 64-byte block using stp2 */
+ /* (offset addressing, so dst is advanced explicitly afterwards). */
+ stp2 A_l, A_h, dst, #0
+ stp2 B_l, B_h, dst, #16
+ stp2 C_l, C_h, dst, #32
+ stp2 D_l, D_h, dst, #48
+ add dst, dst, #64
+
+ tst count, #0x3f // any 1..63 byte remainder left?
+ b.ne .Ltail63_opt
+.Lexitfunc_opt:
diff --git a/arch/arm64/tools/cpucaps b/arch/arm64/tools/cpucaps
index d8f2db273def..f6445266e886 100644
--- a/arch/arm64/tools/cpucaps
+++ b/arch/arm64/tools/cpucaps
@@ -114,8 +114,8 @@ HAS_LS64
HAS_LS64_V
HAS_HW_XCALL_XINT
WORKAROUND_PHYTIUM_FT3386
-KABI_RESERVE_7
-KABI_RESERVE_8
+HAS_COPY_OPT
+HAS_LSUI
KABI_RESERVE_9
KABI_RESERVE_10
KABI_RESERVE_11
--
2.53.0