--- .../admin-guide/kernel-parameters.txt | 3 + arch/arm64/Kconfig | 11 ++ arch/arm64/include/asm/asm-uaccess.h | 13 ++ arch/arm64/include/asm/cputype.h | 2 + arch/arm64/include/asm/uaccess.h | 21 ++- arch/arm64/kernel/cpufeature.c | 51 +++++ arch/arm64/lib/copy_from_user.S | 33 ++++ arch/arm64/lib/copy_template_opt.S | 175 ++++++++++++++++++ arch/arm64/tools/cpucaps | 4 +- 9 files changed, 309 insertions(+), 4 deletions(-) create mode 100644 arch/arm64/lib/copy_template_opt.S diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index b443a9665e03..e2adafc903c1 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -423,6 +423,9 @@ arm64.nomops [ARM64] Unconditionally disable Memory Copy and Memory Set instructions support + copy_opt_disable [ARM64] Disable optimized copy_from_user + implementation + arm64.nomte [ARM64] Unconditionally disable Memory Tagging Extension support diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 81d2baafdcd6..757d4bbe4251 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1317,6 +1317,14 @@ config HISILICON_ERRATUM_165010801 system watchdog hardlockup detection might be triggered. The arch_timer driver addresses this by proactively increasing affected interrupt priorities. +config ARM64_COPY_FROM_USER_OPT + bool "Optimized copy_from_user for Hisilicon CPUs" + depends on ARCH_HISI + default y + help + Enable an optimized copy_from_user implementation for Hisilicon + CPUs that benefit from LDP instruction based copy routines. 
+ config QCOM_FALKOR_ERRATUM_1003 bool "Falkor E1003: Incorrect translation due to ASID change" default y @@ -2061,6 +2069,9 @@ config ARM64_PAN config AS_HAS_LSE_ATOMICS def_bool $(as-instr,.arch_extension lse) +config AS_HAS_LSUI + def_bool $(as-instr,.arch_extension lsui) + config ARM64_LSE_ATOMICS bool default ARM64_USE_LSE_ATOMICS diff --git a/arch/arm64/include/asm/asm-uaccess.h b/arch/arm64/include/asm/asm-uaccess.h index 7bbebfa5b710..342e77b0d7f2 100644 --- a/arch/arm64/include/asm/asm-uaccess.h +++ b/arch/arm64/include/asm/asm-uaccess.h @@ -94,4 +94,17 @@ alternative_else_nop_endif _asm_extable_uaccess 8888b, \l; .endm + /* user_ldpair: load a register pair from [\addr, \val] with a plain LDP and register an extable entry for it pointing at \l. NOTE(review): LDP is not an unprivileged (LDTR-style) access -- confirm PAN handling at the call sites. */ + .macro user_ldpair l, reg1, reg2, addr, val +8888: ldp \reg1, \reg2, [\addr, \val]; + + _asm_extable_uaccess 8888b, \l; + .endm + /* user_ldtpair: same as user_ldpair but uses LDTP. NOTE(review): .arch_extension lsui is emitted unconditionally although this patch adds CONFIG_AS_HAS_LSUI -- confirm assemblers without LSUI support still build. */ + .macro user_ldtpair l, reg1, reg2, addr, val +8888: .arch_extension lsui + ldtp \reg1, \reg2, [\addr, \val]; + + _asm_extable_uaccess 8888b, \l; + .endm #endif diff --git a/arch/arm64/include/asm/cputype.h b/arch/arm64/include/asm/cputype.h index 490c2ac36ac0..16c4d867f199 100644 --- a/arch/arm64/include/asm/cputype.h +++ b/arch/arm64/include/asm/cputype.h @@ -142,6 +142,7 @@ #define HISI_CPU_PART_TSV110 0xD01 #define HISI_CPU_PART_LINXICORE9100 0xD02 +#define HISI_CPU_PART_HIP11 0xD22 #define HISI_CPU_PART_HIP12 0xD06 #define APPLE_CPU_PART_M1_ICESTORM 0x022 @@ -230,6 +231,7 @@ #define MIDR_FUJITSU_A64FX MIDR_CPU_MODEL(ARM_CPU_IMP_FUJITSU, FUJITSU_CPU_PART_A64FX) #define MIDR_HISI_TSV110 MIDR_CPU_MODEL(ARM_CPU_IMP_HISI, HISI_CPU_PART_TSV110) #define MIDR_HISI_LINXICORE9100 MIDR_CPU_MODEL(ARM_CPU_IMP_HISI, HISI_CPU_PART_LINXICORE9100) +#define MIDR_HISI_HIP11 MIDR_CPU_MODEL(ARM_CPU_IMP_HISI, HISI_CPU_PART_HIP11) #define MIDR_HISI_HIP12 MIDR_CPU_MODEL(ARM_CPU_IMP_HISI, HISI_CPU_PART_HIP12) #define MIDR_APPLE_M1_ICESTORM MIDR_CPU_MODEL(ARM_CPU_IMP_APPLE, APPLE_CPU_PART_M1_ICESTORM) #define MIDR_APPLE_M1_FIRESTORM MIDR_CPU_MODEL(ARM_CPU_IMP_APPLE, APPLE_CPU_PART_M1_FIRESTORM) diff --git 
a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h index dd0877a75922..ed307e1425b4 100644 --- a/arch/arm64/include/asm/uaccess.h +++ b/arch/arm64/include/asm/uaccess.h @@ -392,12 +392,29 @@ do { \ } while(0) extern unsigned long __must_check __arch_copy_from_user(void *to, const void __user *from, unsigned long n); +#ifdef CONFIG_ARM64_COPY_FROM_USER_OPT +#define COPY_OPT_THRESHOLD 4096 /* minimum n for the optimized path when only ARM64_HAS_COPY_OPT is set */ +static __always_inline bool use_copy_opt(unsigned long n) +{ + if (alternative_has_cap_unlikely(ARM64_HAS_LSUI)) /* with LSUI the optimized path is taken for every n */ + return true; + return alternative_has_cap_unlikely(ARM64_HAS_COPY_OPT) && n >= COPY_OPT_THRESHOLD; +} +extern unsigned long __must_check __arch_copy_from_user_opt(void *to, + const void __user *from, unsigned long n); +#else +static __always_inline bool use_copy_opt(unsigned long n) { return false; } +#endif #define raw_copy_from_user(to, from, n) \ ({ \ unsigned long __acfu_ret; \ uaccess_ttbr0_enable(); \ - __acfu_ret = __arch_copy_from_user((to), \ - __uaccess_mask_ptr(from), (n)); \ + if (use_copy_opt(n)) /* NOTE(review): n is expanded twice in this macro */ \ + __acfu_ret = __arch_copy_from_user_opt((to), \ + __uaccess_mask_ptr(from), (n)); \ + else \ + __acfu_ret = __arch_copy_from_user((to), \ + __uaccess_mask_ptr(from), (n)); \ uaccess_ttbr0_disable(); \ __acfu_ret; \ }) diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index c8e3f16387cb..1ed2850ba72d 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -2338,6 +2338,37 @@ static void cpu_enable_dit(const struct arm64_cpu_capabilities *__unused) set_pstate_dit(1); } +#ifdef CONFIG_ARM64_COPY_FROM_USER_OPT +static bool copy_opt_disable __ro_after_init; +/* early_param handler: "copy_opt_disable" on the command line sets the flag; str is not inspected. */ +static int __init copy_opt_disable_param(char *str) +{ + copy_opt_disable = true; + return 0; +} +early_param("copy_opt_disable", copy_opt_disable_param); +/* Returns false when copy_opt_disable is set; otherwise the result of the MIDR list check below. */ +static bool has_copy_opt(const struct arm64_cpu_capabilities *entry, int scope) +{ + static const struct midr_range copy_opt_cpus[] = { + 
MIDR_ALL_VERSIONS(MIDR_HISI_LINXICORE9100), + MIDR_ALL_VERSIONS(MIDR_HISI_HIP11), + MIDR_ALL_VERSIONS(MIDR_HISI_HIP12), + { /* sentinel */ } + }; + /* The copy_opt_disable command-line flag overrides the MIDR match. */ + if (copy_opt_disable) + return false; + + return is_midr_in_range_list(copy_opt_cpus); +} +/* .cpu_enable hook: only logs that the optimized copy_from_user is enabled. */ +static void cpu_enable_copy_opt(const struct arm64_cpu_capabilities *__unused) +{ + pr_info("copy_from_user: optimized implementation enabled\n"); +} +#endif + static void cpu_enable_mops(const struct arm64_cpu_capabilities *__unused) { sysreg_clear_set(sctlr_el1, 0, SCTLR_EL1_MSCEn); @@ -3154,6 +3185,26 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .matches = has_arch_xcall_xint_support, .cpu_enable = cpu_enable_arch_xcall_xint, }, +#endif + { + .desc = "Unprivileged Load Store Instructions", + .capability = ARM64_HAS_LSUI, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .sys_reg = SYS_ID_AA64ISAR3_EL1, /* FEAT_LSUI is reported in ID_AA64ISAR3_EL1.LSUI; ID_AA64ISAR2_EL1[7:4] is RPRES */ + .sign = FTR_UNSIGNED, + .field_pos = 16, /* LSUI field is bits [19:16] */ + .field_width = 4, + .min_field_value = 1, + .matches = has_cpuid_feature, + }, +#ifdef CONFIG_ARM64_COPY_FROM_USER_OPT + { + .desc = "Optimized copy_from_user", + .capability = ARM64_HAS_COPY_OPT, + .type = ARM64_CPUCAP_BOOT_CPU_FEATURE, + .matches = has_copy_opt, + .cpu_enable = cpu_enable_copy_opt, + }, #endif {}, }; diff --git a/arch/arm64/lib/copy_from_user.S b/arch/arm64/lib/copy_from_user.S index 34e317907524..d090836aa9c0 100644 --- a/arch/arm64/lib/copy_from_user.S +++ b/arch/arm64/lib/copy_from_user.S @@ -52,6 +52,18 @@ stp \reg1, \reg2, [\ptr], \val .endm + .macro ldp2 reg1, reg2, ptr, val + alternative_if_not ARM64_HAS_LSUI /* LDP by default; the LDTP variant is used when ARM64_HAS_LSUI is set */ + user_ldpair 9997f, \reg1, \reg2, \ptr, \val + alternative_else + user_ldtpair 9997f, \reg1, \reg2, \ptr, \val + alternative_endif + .endm + /* stp2: plain STP to [ptr, val] using offset addressing (no writeback). */ + .macro stp2 reg1, reg2, ptr, val + stp \reg1, \reg2, [\ptr, \val] + .endm + end .req x5 srcin .req x15 SYM_FUNC_START(__arch_copy_from_user) @@ -71,3 +83,24 @@ USER(9998f, ldtrb tmp1w, [srcin]) ret SYM_FUNC_END(__arch_copy_from_user) EXPORT_SYMBOL(__arch_copy_from_user) + +#ifdef 
CONFIG_ARM64_COPY_FROM_USER_OPT +SYM_FUNC_START(__arch_copy_from_user_opt) + add end, x0, x2 + mov srcin, x1 + +#include "copy_template_opt.S" + mov x0, #0 // Nothing to copy + ret + + // Exception fixups +9997: cmp dst, dstin + b.ne 9998f + // Before being absolutely sure we couldn't copy anything, try harder +USER(9998f, ldtrb tmp1w, [srcin]) + strb tmp1w, [dst], #1 +9998: sub x0, end, dst // bytes not copied + ret +SYM_FUNC_END(__arch_copy_from_user_opt) +EXPORT_SYMBOL(__arch_copy_from_user_opt) +#endif diff --git a/arch/arm64/lib/copy_template_opt.S b/arch/arm64/lib/copy_template_opt.S new file mode 100644 index 000000000000..df6c2be11dc0 --- /dev/null +++ b/arch/arm64/lib/copy_template_opt.S @@ -0,0 +1,175 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2026 - Huawei Ltd. + */ + +/* + * Copy a buffer from src to dest (alignment handled by the hardware) + * + * Parameters: + * x0 - dest + * x1 - src + * x2 - n + * Returns: + * x0 - dest (the including function overwrites x0 with its own return value) + * + * Note: register aliases (dstin, src, count, tmp1, etc.) are defined by + * copy_template.S which is included earlier in copy_from_user.S. + */ + + mov dst, dstin + cmp count, #16 + /* When the length is less than 16 bytes, the accesses are not aligned. */ + b.lo .Ltiny15_opt + + neg tmp2, src + ands tmp2, tmp2, #15 /* Bytes to reach alignment. */ + b.eq .LSrcAligned_opt + sub count, count, tmp2 + /* + * Copy the leading memory data from src to dst in increasing + * address order. This way, the risk of overwriting the source + * memory data is eliminated when the distance between src and + * dst is less than 16. The memory accesses here are aligned. 
+ */ + tbz tmp2, #0, 1f + ldrb1 tmp1w, src, #1 + strb1 tmp1w, dst, #1 +1: + tbz tmp2, #1, 2f + ldrh1 tmp1w, src, #2 + strh1 tmp1w, dst, #2 +2: + tbz tmp2, #2, 3f + ldr1 tmp1w, src, #4 + str1 tmp1w, dst, #4 +3: + tbz tmp2, #3, .LSrcAligned_opt + ldr1 tmp1, src, #8 + str1 tmp1, dst, #8 + +.LSrcAligned_opt: + cmp count, #64 + b.ge .Lcpy_over64_opt + /* + * Deal with small copies quickly by dropping straight into the + * exit block. + */ +.Ltail63_opt: + /* + * Copy up to 48 bytes of data. At this point we only need the + * bottom 6 bits of count to be accurate. + */ + ands tmp1, count, #0x30 + b.eq .Ltiny15_opt + cmp tmp1w, #0x20 + b.eq 1f + b.lt 2f + ldp2 A_l, A_h, src, #0 + stp2 A_l, A_h, dst, #0 + add src, src, #16 + add dst, dst, #16 +1: + ldp2 A_l, A_h, src, #0 + stp2 A_l, A_h, dst, #0 + add src, src, #16 + add dst, dst, #16 +2: + ldp2 A_l, A_h, src, #0 + stp2 A_l, A_h, dst, #0 + add src, src, #16 + add dst, dst, #16 +.Ltiny15_opt: + /* + * Prefer to break one ldp/stp into several loads/stores that access + * memory in increasing address order, rather than loading/storing 16 + * bytes from (src-16) to (dst-16) and moving src back to an aligned + * address, as the original cortex memcpy does. If the original memcpy + * sequence were kept here, memmove would need to satisfy the + * precondition that the src address is at least 16 bytes above the dst + * address; otherwise some source data would be overwritten when memmove + * calls memcpy directly. To keep memmove simple and to decouple + * memcpy from memmove, the original sequence was dropped. 
+ */ + tbz count, #3, 1f + ldr1 tmp1, src, #8 + str1 tmp1, dst, #8 +1: + tbz count, #2, 2f + ldr1 tmp1w, src, #4 + str1 tmp1w, dst, #4 +2: + tbz count, #1, 3f + ldrh1 tmp1w, src, #2 + strh1 tmp1w, dst, #2 +3: + tbz count, #0, .Lexitfunc_opt + ldrb1 tmp1w, src, #1 + strb1 tmp1w, dst, #1 + + b .Lexitfunc_opt + +.Lcpy_over64_opt: + subs count, count, #128 + b.ge .Lcpy_body_large_opt + /* + * Less than 128 bytes to copy, so handle 64 here and then jump + * to the tail. + */ + ldp2 A_l, A_h, src, #0 + stp2 A_l, A_h, dst, #0 + ldp2 B_l, B_h, src, #16 + ldp2 C_l, C_h, src, #32 + stp2 B_l, B_h, dst, #16 + stp2 C_l, C_h, dst, #32 + ldp2 D_l, D_h, src, #48 + stp2 D_l, D_h, dst, #48 + add src, src, #64 + add dst, dst, #64 + + tst count, #0x3f + b.ne .Ltail63_opt + b .Lexitfunc_opt + + /* + * Critical loop. Start at a new cache line boundary. NOTE(review): the + * alignment applies to the preload below; loop label 1: comes after it, so confirm the loop still fits one line. + */ + .p2align L1_CACHE_SHIFT +.Lcpy_body_large_opt: + + /* Preload the first 64 bytes of data. */ + ldp2 A_l, A_h, src, #0 + ldp2 B_l, B_h, src, #16 + ldp2 C_l, C_h, src, #32 + ldp2 D_l, D_h, src, #48 + add src, src, #64 +1: + /* + * Interleave loading the next 64-byte block with storing the + * previously loaded 64 bytes. 
+ */ + stp2 A_l, A_h, dst, #0 + ldp2 A_l, A_h, src, #0 + stp2 B_l, B_h, dst, #16 + ldp2 B_l, B_h, src, #16 + stp2 C_l, C_h, dst, #32 + ldp2 C_l, C_h, src, #32 + stp2 D_l, D_h, dst, #48 + ldp2 D_l, D_h, src, #48 + add dst, dst, #64 + add src, src, #64 + subs count, count, #64 + b.ge 1b + + /* Post-loop: store the final 64-byte block that was already loaded. */ + /* stp2 takes an offset here, so dst is advanced explicitly afterwards. */ + stp2 A_l, A_h, dst, #0 + stp2 B_l, B_h, dst, #16 + stp2 C_l, C_h, dst, #32 + stp2 D_l, D_h, dst, #48 + add dst, dst, #64 + + tst count, #0x3f + b.ne .Ltail63_opt +.Lexitfunc_opt: diff --git a/arch/arm64/tools/cpucaps b/arch/arm64/tools/cpucaps index d8f2db273def..f6445266e886 100644 --- a/arch/arm64/tools/cpucaps +++ b/arch/arm64/tools/cpucaps @@ -114,8 +114,8 @@ HAS_LS64 HAS_LS64_V HAS_HW_XCALL_XINT WORKAROUND_PHYTIUM_FT3386 -KABI_RESERVE_7 -KABI_RESERVE_8 +HAS_COPY_OPT +HAS_LSUI KABI_RESERVE_9 KABI_RESERVE_10 KABI_RESERVE_11 -- 2.53.0
participants (1)
-
Qi Xi