hulk inclusion
category: feature
bugzilla: https://atomgit.com/openeuler/kernel/issues/9132

-------------------------------------------------------

Add __arch_copy_from_user_opt() for selected HiSilicon cores. It copies
data from user space with index-addressed ldp/stp pairs, temporarily
disabling PAN for the duration of the copy and restoring it on both the
success and the exception-fixup paths. The new copy_template_opt.S
handles the >= 64-byte bulk copy loop, while the tail is managed by the
existing byte/half/word load/store macros. Also add a
"copy_opt_disable" cmdline parameter to disable the optimization at
boot time.

Enable the optimized path in raw_copy_from_user() for copies >= 4KB on
CPUs with ARM64_HAS_COPY_OPT. The PAN toggle adds a small fixed
overhead, so only large copies are routed to the new path, where that
overhead is outweighed by the higher copy throughput.
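
For reference, the routing decision added to raw_copy_from_user() comes
down to the predicate below (condensed from the uaccess.h hunk in this
patch; the comment is added here for illustration only):

  #define COPY_OPT_THRESHOLD	4096

  /* Only copies of at least 4KB take the optimized path; smaller
   * copies keep using __arch_copy_from_user and never pay the PAN
   * toggle. */
  static __always_inline bool use_copy_opt(unsigned long n)
  {
  	return (n) >= COPY_OPT_THRESHOLD &&
  	       alternative_has_cap_unlikely(ARM64_HAS_COPY_OPT);
  }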
Signed-off-by: Qi Xi <xiqi2@huawei.com>
---
 .../admin-guide/kernel-parameters.txt |   6 +
 arch/arm64/include/asm/asm-uaccess.h  |   7 +
 arch/arm64/include/asm/uaccess.h      |  19 +-
 arch/arm64/kernel/cpufeature.c        |  31 +++
 arch/arm64/lib/copy_from_user.S       |  32 +++
 arch/arm64/lib/copy_template_opt.S    | 189 ++++++++++++++++++
 arch/arm64/tools/cpucaps              |   2 +-
 7 files changed, 283 insertions(+), 3 deletions(-)
 create mode 100644 arch/arm64/lib/copy_template_opt.S

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index d72ab64a69b9..03f59c496cc1 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -718,6 +718,12 @@
 			which is located in node nid, if the allocation fails,
 			they will fallback to the global default memory area.
 
+	copy_opt_disable [ARM64]
+			Disable the optimized copy_from_user path on Hisilicon
+			CPUs that support it. By default the optimization is
+			enabled; this parameter forces all copies through
+			the standard unoptimized path.
+
 	cmo_free_hint=	[PPC] Format: { yes | no }
 			Specify whether pages are marked as being inactive
 			when they are freed.  This is used in CMO environments
diff --git a/arch/arm64/include/asm/asm-uaccess.h b/arch/arm64/include/asm/asm-uaccess.h
index 7bbebfa5b710..4edf6c7dc56f 100644
--- a/arch/arm64/include/asm/asm-uaccess.h
+++ b/arch/arm64/include/asm/asm-uaccess.h
@@ -94,4 +94,11 @@ alternative_else_nop_endif
 
 		_asm_extable_uaccess	8888b, \l;
 	.endm
+
+	.macro user_ldst_pair_index l, inst, reg1, reg2, addr, val
+8888:		\inst	\reg1, \reg2, [\addr, \val];
+
+		_asm_extable_uaccess	8888b, \l;
+	.endm
+
 #endif
diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h
index dd0877a75922..8e2899b36b00 100644
--- a/arch/arm64/include/asm/uaccess.h
+++ b/arch/arm64/include/asm/uaccess.h
@@ -391,13 +391,28 @@ do {									\
 	} while (0);							\
 } while(0)
 
+#define COPY_OPT_THRESHOLD	4096
+
+static __always_inline bool use_copy_opt(unsigned long n)
+{
+	return (n) >= COPY_OPT_THRESHOLD &&
+	       alternative_has_cap_unlikely(ARM64_HAS_COPY_OPT);
+}
+
+extern unsigned long __must_check __arch_copy_from_user_opt(void *to,
+				const void __user *from, unsigned long n);
 extern unsigned long __must_check __arch_copy_from_user(void *to, const void __user *from, unsigned long n);
 #define raw_copy_from_user(to, from, n)					\
 ({									\
 	unsigned long __acfu_ret;					\
 	uaccess_ttbr0_enable();						\
-	__acfu_ret = __arch_copy_from_user((to),			\
-				      __uaccess_mask_ptr(from), (n));	\
+	if (use_copy_opt(n)) {						\
+		__acfu_ret = __arch_copy_from_user_opt((to),		\
+				      __uaccess_mask_ptr(from), (n));	\
+	} else {							\
+		__acfu_ret = __arch_copy_from_user((to),		\
+				      __uaccess_mask_ptr(from), (n));	\
+	}								\
 	uaccess_ttbr0_disable();					\
 	__acfu_ret;							\
 })
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index a1928cf3c887..0ad4b3730b64 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -2576,6 +2576,31 @@ static void cpu_enable_arch_xcall_xint(const struct arm64_cpu_capabilities *__un
 }
 #endif
 
+static bool copy_opt_disable __ro_after_init;
+
+static int __init parse_copy_opt_disable(char *str)
+{
+	copy_opt_disable = true;
+	return 0;
+}
+early_param("copy_opt_disable", parse_copy_opt_disable);
+
+static bool has_copy_opt(const struct arm64_cpu_capabilities *cap, int scope)
+{
+	/* List of CPUs that support copy_from_user_opt */
+	static const struct midr_range copy_opt_cpus[] = {
+		MIDR_ALL_VERSIONS(MIDR_HISI_HIP11),
+		MIDR_ALL_VERSIONS(MIDR_HISI_HIP12),
+		MIDR_ALL_VERSIONS(MIDR_HISI_LINXICORE9100),
+		{ }
+	};
+
+	if (copy_opt_disable)
+		return false;
+
+	return is_midr_in_range_list(copy_opt_cpus);
+}
+
 static const struct arm64_cpu_capabilities arm64_features[] = {
 	{
 		.capability = ARM64_ALWAYS_BOOT,
@@ -3151,6 +3176,12 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
 		.cpu_enable = cpu_enable_arch_xcall_xint,
 	},
 #endif
+	{
+		.desc = "Hisilicon Optimized Copy From User enabled",
+		.capability = ARM64_HAS_COPY_OPT,
+		.type = ARM64_CPUCAP_SYSTEM_FEATURE,
+		.matches = has_copy_opt,
+	},
 	{},
 };
 
diff --git a/arch/arm64/lib/copy_from_user.S b/arch/arm64/lib/copy_from_user.S
index 34e317907524..59f1a4f36982 100644
--- a/arch/arm64/lib/copy_from_user.S
+++ b/arch/arm64/lib/copy_from_user.S
@@ -71,3 +71,35 @@ USER(9998f, ldtrb tmp1w, [srcin])
 	ret
 SYM_FUNC_END(__arch_copy_from_user)
 EXPORT_SYMBOL(__arch_copy_from_user)
+
+	.macro ldp2 reg1, reg2, ptr, val
+	user_ldst_pair_index 9997f, ldp, \reg1, \reg2, \ptr, \val
+	.endm
+
+	.macro stp2 reg1, reg2, ptr, val
+	stp \reg1, \reg2, [\ptr, \val]
+	.endm
+
+SYM_FUNC_START(__arch_copy_from_user_opt)
+	add	end, x0, x2
+	mov	srcin, x1
+
+	ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_HAS_PAN)
+
+#include "copy_template_opt.S"
+	mov	x0, #0				// Nothing to copy
+
+.L__arch_copy_from_user_opt_exit:
+	ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_HAS_PAN)
+	ret
+
+	// Exception fixups
+9997:	cmp	dst, dstin
+	b.ne	9998f
+	// Before being absolutely sure we couldn't copy anything, try harder
+USER(9998f, ldtrb tmp1w, [srcin])
+	strb	tmp1w, [dst], #1
+9998:	sub	x0, end, dst			// bytes not copied
+	b	.L__arch_copy_from_user_opt_exit
+SYM_FUNC_END(__arch_copy_from_user_opt)
+EXPORT_SYMBOL(__arch_copy_from_user_opt)
diff --git a/arch/arm64/lib/copy_template_opt.S b/arch/arm64/lib/copy_template_opt.S
new file mode 100644
index 000000000000..4b57bfe6d337
--- /dev/null
+++ b/arch/arm64/lib/copy_template_opt.S
@@ -0,0 +1,189 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2026 - Huawei Ltd.
+ */
+
+/*
+ * Copy a buffer from src to dest (alignment handled by the hardware)
+ *
+ * Parameters:
+ *	x0 - dest
+ *	x1 - src
+ *	x2 - n
+ * Returns:
+ *	x0 - dest
+ */
+dstin	.req	x0
+src	.req	x1
+count	.req	x2
+tmp1	.req	x3
+tmp1w	.req	w3
+tmp2	.req	x4
+tmp2w	.req	w4
+dst	.req	x6
+
+A_l	.req	x7
+A_h	.req	x8
+B_l	.req	x9
+B_h	.req	x10
+C_l	.req	x11
+C_h	.req	x12
+D_l	.req	x13
+D_h	.req	x14
+
+	mov	dst, dstin
+	cmp	count, #16
+	/*When memory length is less than 16, the accesses are not aligned.*/
+	b.lo	.Ltiny15_opt
+
+	neg	tmp2, src
+	ands	tmp2, tmp2, #15/* Bytes to reach alignment. */
+	b.eq	.LSrcAligned_opt
+	sub	count, count, tmp2
+	/*
+	 * Copy the leading memory data from src to dst in an increasing
+	 * address order.By this way,the risk of overwriting the source
+	 * memory data is eliminated when the distance between src and
+	 * dst is less than 16. The memory accesses here are alignment.
+	 */
+	tbz	tmp2, #0, 1f
+	ldrb1	tmp1w, src, #1
+	strb1	tmp1w, dst, #1
+1:
+	tbz	tmp2, #1, 2f
+	ldrh1	tmp1w, src, #2
+	strh1	tmp1w, dst, #2
+2:
+	tbz	tmp2, #2, 3f
+	ldr1	tmp1w, src, #4
+	str1	tmp1w, dst, #4
+3:
+	tbz	tmp2, #3, .LSrcAligned_opt
+	ldr1	tmp1, src, #8
+	str1	tmp1, dst, #8
+
+.LSrcAligned_opt:
+	cmp	count, #64
+	b.ge	.Lcpy_over64_opt
+	/*
+	 * Deal with small copies quickly by dropping straight into the
+	 * exit block.
+	 */
+.Ltail63_opt:
+	/*
+	 * Copy up to 48 bytes of data. At this point we only need the
+	 * bottom 6 bits of count to be accurate.
+	 */
+	ands	tmp1, count, #0x30
+	b.eq	.Ltiny15_opt
+	cmp	tmp1w, #0x20
+	b.eq	1f
+	b.lt	2f
+	ldp2	A_l, A_h, src, #0
+	stp2	A_l, A_h, dst, #0
+	add	src, src, #16
+	add	dst, dst, #16
+1:
+	ldp2	A_l, A_h, src, #0
+	stp2	A_l, A_h, dst, #0
+	add	src, src, #16
+	add	dst, dst, #16
+2:
+	ldp2	A_l, A_h, src, #0
+	stp2	A_l, A_h, dst, #0
+	add	src, src, #16
+	add	dst, dst, #16
+.Ltiny15_opt:
+	/*
+	 * Prefer to break one ldp/stp into several load/store to access
+	 * memory in an increasing address order,rather than to load/store 16
+	 * bytes from (src-16) to (dst-16) and to backward the src to aligned
+	 * address,which way is used in original cortex memcpy. If keeping
+	 * the original memcpy process here, memmove need to satisfy the
+	 * precondition that src address is at least 16 bytes bigger than dst
+	 * address,otherwise some source data will be overwritten when memove
+	 * call memcpy directly. To make memmove simpler and decouple the
+	 * memcpy's dependency on memmove, withdrew the original process.
+	 */
+	tbz	count, #3, 1f
+	ldr1	tmp1, src, #8
+	str1	tmp1, dst, #8
+1:
+	tbz	count, #2, 2f
+	ldr1	tmp1w, src, #4
+	str1	tmp1w, dst, #4
+2:
+	tbz	count, #1, 3f
+	ldrh1	tmp1w, src, #2
+	strh1	tmp1w, dst, #2
+3:
+	tbz	count, #0, .Lexitfunc_opt
+	ldrb1	tmp1w, src, #1
+	strb1	tmp1w, dst, #1
+
+	b	.Lexitfunc_opt
+
+.Lcpy_over64_opt:
+	subs	count, count, #128
+	b.ge	.Lcpy_body_large_opt
+	/*
+	 * Less than 128 bytes to copy, so handle 64 here and then jump
+	 * to the tail.
+	 */
+	ldp2	A_l, A_h, src, #0
+	stp2	A_l, A_h, dst, #0
+	ldp2	B_l, B_h, src, #16
+	ldp2	C_l, C_h, src, #32
+	stp2	B_l, B_h, dst, #16
+	stp2	C_l, C_h, dst, #32
+	ldp2	D_l, D_h, src, #48
+	stp2	D_l, D_h, dst, #48
+	add	src, src, #64
+	add	dst, dst, #64
+
+	tst	count, #0x3f
+	b.ne	.Ltail63_opt
+	b	.Lexitfunc_opt
+
+	/*
+	 * Critical loop. Start at a new cache line boundary. Assuming
+	 * 64 bytes per line this ensures the entire loop is in one line.
+	 */
+	.p2align	L1_CACHE_SHIFT
+.Lcpy_body_large_opt:
+
+	/* pre-get 64 bytes data. */
+	ldp2	A_l, A_h, src, #0
+	ldp2	B_l, B_h, src, #16
+	ldp2	C_l, C_h, src, #32
+	ldp2	D_l, D_h, src, #48
+	add	src, src, #64
+1:
+	/*
+	 * interlace the load of next 64 bytes data block with store of the last
+	 * loaded 64 bytes data.
+	 */
+	stp2	A_l, A_h, dst, #0
+	ldp2	A_l, A_h, src, #0
+	stp2	B_l, B_h, dst, #16
+	ldp2	B_l, B_h, src, #16
+	stp2	C_l, C_h, dst, #32
+	ldp2	C_l, C_h, src, #32
+	stp2	D_l, D_h, dst, #48
+	ldp2	D_l, D_h, src, #48
+	add	dst, dst, #64
+	add	src, src, #64
+	subs	count, count, #64
+	b.ge	1b
+
+	/* Post-loop: store the last block of data using stp2 */
+	/* (without post-increment) */
+	stp2	A_l, A_h, dst, #0
+	stp2	B_l, B_h, dst, #16
+	stp2	C_l, C_h, dst, #32
+	stp2	D_l, D_h, dst, #48
+	add	dst, dst, #64
+
+	tst	count, #0x3f
+	b.ne	.Ltail63_opt
+.Lexitfunc_opt:
diff --git a/arch/arm64/tools/cpucaps b/arch/arm64/tools/cpucaps
index d8f2db273def..ce0c60d48dff 100644
--- a/arch/arm64/tools/cpucaps
+++ b/arch/arm64/tools/cpucaps
@@ -114,7 +114,7 @@ HAS_LS64
 HAS_LS64_V
 HAS_HW_XCALL_XINT
 WORKAROUND_PHYTIUM_FT3386
-KABI_RESERVE_7
+HAS_COPY_OPT
 KABI_RESERVE_8
 KABI_RESERVE_9
 KABI_RESERVE_10
-- 
2.33.0