hulk inclusion
category: feature
bugzilla: https://atomgit.com/openeuler/kernel/issues/9132

-------------------------------------------------------

Add __arch_copy_from_user_opt for selected HiSilicon cores. This path
copies data from user space with ldp/stp pairs using base-plus-offset
addressing (via the new user_ldst_pair_index uaccess macro), temporarily
disables PAN during the copy, and restores it on both the success path
and the exception fixup path. The new copy_template_opt.S implements the
>=64B bulk copy loop, while the head and tail bytes are handled by the
existing byte/half/word load/store macros.

The optimized routine is gated behind a new ARM64_HAS_COPY_OPT cpucap,
matched by MIDR, and can be disabled with the copy_opt_disable kernel
parameter.

Signed-off-by: Qi Xi <xiqi2@huawei.com>
---
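
Note (kept below the "---" so it is not part of the commit message): this
patch introduces the ARM64_HAS_COPY_OPT capability and the
__arch_copy_from_user_opt routine, but does not itself switch any caller
over to the new routine. A minimal sketch of how a C-level dispatcher could
look is shown below; raw_copy_from_user_dispatch() is a hypothetical name,
and the real arm64 uaccess wrappers also perform TTBR0/PAN bookkeeping that
is omitted here, so treat it as an illustration only.

	/* Illustrative sketch, not part of this patch. */
	unsigned long __must_check
	__arch_copy_from_user(void *to, const void __user *from, unsigned long n);
	unsigned long __must_check
	__arch_copy_from_user_opt(void *to, const void __user *from, unsigned long n);

	static inline unsigned long
	raw_copy_from_user_dispatch(void *to, const void __user *from,
				    unsigned long n)
	{
		/* ARM64_HAS_COPY_OPT is the cpucap added by this patch. */
		if (cpus_have_final_cap(ARM64_HAS_COPY_OPT))
			return __arch_copy_from_user_opt(to, from, n);

		return __arch_copy_from_user(to, from, n);
	}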

 .../admin-guide/kernel-parameters.txt |   6 +
 arch/arm64/include/asm/asm-uaccess.h  |   7 +
 arch/arm64/kernel/cpufeature.c        |  30 +++
 arch/arm64/lib/copy_from_user.S       |  32 +++
 arch/arm64/lib/copy_template_opt.S    | 189 ++++++++++++++++++
 arch/arm64/tools/cpucaps              |   2 +-
 6 files changed, 265 insertions(+), 1 deletion(-)
 create mode 100644 arch/arm64/lib/copy_template_opt.S

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index d72ab64a69b9..03f59c496cc1 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -718,6 +718,12 @@
 			which is located in node nid, if the allocation fails,
 			they will fallback to the global default memory area.
 
+	copy_opt_disable [ARM64]
+			Disable the optimized copy_from_user path on HiSilicon
+			CPUs that support it. By default the optimization is
+			enabled; this parameter forces all copies through
+			the standard unoptimized path.
+
 	cmo_free_hint=	[PPC] Format: { yes | no }
 			Specify whether pages are marked as being inactive
 			when they are freed.  This is used in CMO environments
diff --git a/arch/arm64/include/asm/asm-uaccess.h b/arch/arm64/include/asm/asm-uaccess.h
index 7bbebfa5b710..4edf6c7dc56f 100644
--- a/arch/arm64/include/asm/asm-uaccess.h
+++ b/arch/arm64/include/asm/asm-uaccess.h
@@ -94,4 +94,11 @@ alternative_else_nop_endif
 
 		_asm_extable_uaccess	8888b, \l;
 	.endm
+
+	.macro user_ldst_pair_index l, inst, reg1, reg2, addr, val
+8888:		\inst	\reg1, \reg2, [\addr, \val];
+
+		_asm_extable_uaccess	8888b, \l;
+	.endm
+
 #endif
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index a1928cf3c887..3f8889d5dd8c 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -2576,6 +2576,30 @@ static void cpu_enable_arch_xcall_xint(const struct arm64_cpu_capabilities *__un
 }
 #endif
 
+static bool copy_opt_disable __ro_after_init;
+
+static int __init parse_copy_opt_disable(char *str)
+{
+	copy_opt_disable = true;
+	return 0;
+}
+early_param("copy_opt_disable", parse_copy_opt_disable);
+
+static bool has_copy_opt(const struct arm64_cpu_capabilities *cap, int scope)
+{
+	/* List of CPUs that support __arch_copy_from_user_opt */
+	static const struct midr_range copy_opt_cpus[] = {
+		MIDR_ALL_VERSIONS(MIDR_HISI_HIP12),
+		MIDR_ALL_VERSIONS(MIDR_HISI_LINXICORE9100),
+		{ }
+	};
+
+	if (copy_opt_disable)
+		return false;
+
+	return is_midr_in_range_list(copy_opt_cpus);
+}
+
 static const struct arm64_cpu_capabilities arm64_features[] = {
 	{
 		.capability = ARM64_ALWAYS_BOOT,
@@ -3151,6 +3175,12 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
 		.cpu_enable = cpu_enable_arch_xcall_xint,
 	},
 #endif
+	{
+		.desc = "HiSilicon optimized copy_from_user",
+		.capability = ARM64_HAS_COPY_OPT,
+		.type = ARM64_CPUCAP_SYSTEM_FEATURE,
+		.matches = has_copy_opt,
+	},
 	{},
 };
 
diff --git a/arch/arm64/lib/copy_from_user.S b/arch/arm64/lib/copy_from_user.S
index 34e317907524..59f1a4f36982 100644
--- a/arch/arm64/lib/copy_from_user.S
+++ b/arch/arm64/lib/copy_from_user.S
@@ -71,3 +71,35 @@ USER(9998f, ldtrb tmp1w, [srcin])
 	ret
 SYM_FUNC_END(__arch_copy_from_user)
 EXPORT_SYMBOL(__arch_copy_from_user)
+
+	.macro ldp2 reg1, reg2, ptr, val
+	user_ldst_pair_index 9997f, ldp, \reg1, \reg2, \ptr, \val
+	.endm
+
+	.macro stp2 reg1, reg2, ptr, val
+	stp	\reg1, \reg2, [\ptr, \val]
+	.endm
+
+SYM_FUNC_START(__arch_copy_from_user_opt)
+	add	end, x0, x2
+	mov	srcin, x1
+
+	ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_HAS_PAN)
+
+#include "copy_template_opt.S"
+	mov	x0, #0				// Nothing to copy
+
+.L__arch_copy_from_user_opt_exit:
+	ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_HAS_PAN)
+	ret
+
+	// Exception fixups
+9997:	cmp	dst, dstin
+	b.ne	9998f
+	// Before being absolutely sure we couldn't copy anything, try harder
+USER(9998f, ldtrb tmp1w, [srcin])
+	strb	tmp1w, [dst], #1
+9998:	sub	x0, end, dst			// bytes not copied
+	b	.L__arch_copy_from_user_opt_exit
+SYM_FUNC_END(__arch_copy_from_user_opt)
+EXPORT_SYMBOL(__arch_copy_from_user_opt)
diff --git a/arch/arm64/lib/copy_template_opt.S b/arch/arm64/lib/copy_template_opt.S
new file mode 100644
index 000000000000..c4578ab8e399
--- /dev/null
+++ b/arch/arm64/lib/copy_template_opt.S
@@ -0,0 +1,189 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2026 - Huawei Ltd.
+ */
+
+/*
+ * Copy a buffer from src to dest (alignment handled by the hardware)
+ *
+ * Parameters:
+ *	x0 - dest
+ *	x1 - src
+ *	x2 - n
+ * Returns:
+ *	x0 - dest
+ */
+dstin	.req	x0
+src	.req	x1
+count	.req	x2
+tmp1	.req	x3
+tmp1w	.req	w3
+tmp2	.req	x4
+tmp2w	.req	w4
+dst	.req	x6
+
+A_l	.req	x7
+A_h	.req	x8
+B_l	.req	x9
+B_h	.req	x10
+C_l	.req	x11
+C_h	.req	x12
+D_l	.req	x13
+D_h	.req	x14
+
+	mov	dst, dstin
+	cmp	count, #16
+	/* When the length is less than 16, the accesses are not aligned. */
+	b.lo	.Ltiny15_opt
+
+	neg	tmp2, src
+	ands	tmp2, tmp2, #15		/* Bytes to reach alignment. */
+	b.eq	.LSrcAligned_opt
+	sub	count, count, tmp2
+	/*
+	 * Copy the leading memory data from src to dst in increasing
+	 * address order. This way, the risk of overwriting the source
+	 * memory data is eliminated when the distance between src and
+	 * dst is less than 16. The memory accesses here are aligned.
+	 */
+	tbz	tmp2, #0, 1f
+	ldrb1	tmp1w, src, #1
+	strb1	tmp1w, dst, #1
+1:
+	tbz	tmp2, #1, 2f
+	ldrh1	tmp1w, src, #2
+	strh1	tmp1w, dst, #2
+2:
+	tbz	tmp2, #2, 3f
+	ldr1	tmp1w, src, #4
+	str1	tmp1w, dst, #4
+3:
+	tbz	tmp2, #3, .LSrcAligned_opt
+	ldr1	tmp1, src, #8
+	str1	tmp1, dst, #8
+
+.LSrcAligned_opt:
+	cmp	count, #64
+	b.ge	.Lcpy_over64_opt
+	/*
+	 * Deal with small copies quickly by dropping straight into the
+	 * exit block.
+	 */
+.Ltail63_opt:
+	/*
+	 * Copy up to 48 bytes of data. At this point we only need the
+	 * bottom 6 bits of count to be accurate.
+	 */
+	ands	tmp1, count, #0x30
+	b.eq	.Ltiny15_opt
+	cmp	tmp1w, #0x20
+	b.eq	1f
+	b.lt	2f
+	ldp2	A_l, A_h, src, #0
+	stp2	A_l, A_h, dst, #0
+	add	src, src, #16
+	add	dst, dst, #16
+1:
+	ldp2	A_l, A_h, src, #0
+	stp2	A_l, A_h, dst, #0
+	add	src, src, #16
+	add	dst, dst, #16
+2:
+	ldp2	A_l, A_h, src, #0
+	stp2	A_l, A_h, dst, #0
+	add	src, src, #16
+	add	dst, dst, #16
+.Ltiny15_opt:
+	/*
+	 * Prefer to break one ldp/stp into several loads/stores that access
+	 * memory in increasing address order, rather than load/store 16
+	 * bytes from (src-16) to (dst-16) and move src back to an aligned
+	 * address, as the original Cortex memcpy does. If that process were
+	 * kept here, memmove would need to guarantee that src is at least
+	 * 16 bytes higher than dst, otherwise some source data would be
+	 * overwritten when memmove calls memcpy directly. To keep memmove
+	 * simple and decouple memcpy from memmove, the original process
+	 * was dropped.
+	 */
+	tbz	count, #3, 1f
+	ldr1	tmp1, src, #8
+	str1	tmp1, dst, #8
+1:
+	tbz	count, #2, 2f
+	ldr1	tmp1w, src, #4
+	str1	tmp1w, dst, #4
+2:
+	tbz	count, #1, 3f
+	ldrh1	tmp1w, src, #2
+	strh1	tmp1w, dst, #2
+3:
+	tbz	count, #0, .Lexitfunc_opt
+	ldrb1	tmp1w, src, #1
+	strb1	tmp1w, dst, #1
+
+	b	.Lexitfunc_opt
+
+.Lcpy_over64_opt:
+	subs	count, count, #128
+	b.ge	.Lcpy_body_large_opt
+	/*
+	 * Less than 128 bytes to copy, so handle 64 here and then jump
+	 * to the tail.
+	 */
+	ldp2	A_l, A_h, src, #0
+	stp2	A_l, A_h, dst, #0
+	ldp2	B_l, B_h, src, #16
+	ldp2	C_l, C_h, src, #32
+	stp2	B_l, B_h, dst, #16
+	stp2	C_l, C_h, dst, #32
+	ldp2	D_l, D_h, src, #48
+	stp2	D_l, D_h, dst, #48
+	add	src, src, #64
+	add	dst, dst, #64
+
+	tst	count, #0x3f
+	b.ne	.Ltail63_opt
+	b	.Lexitfunc_opt
+
+	/*
+	 * Critical loop. Start at a new cache line boundary. Assuming
+	 * 64 bytes per line this ensures the entire loop is in one line.
+	 */
+	.p2align	L1_CACHE_SHIFT
+.Lcpy_body_large_opt:
+
+	/* Pre-load the first 64 bytes of data. */
+	ldp2	A_l, A_h, src, #0
+	ldp2	B_l, B_h, src, #16
+	ldp2	C_l, C_h, src, #32
+	ldp2	D_l, D_h, src, #48
+	add	src, src, #64
+1:
+	/*
+	 * Interleave the load of the next 64-byte block with the store of
+	 * the previously loaded 64 bytes of data.
+	 */
+	stp2	A_l, A_h, dst, #0
+	ldp2	A_l, A_h, src, #0
+	stp2	B_l, B_h, dst, #16
+	ldp2	B_l, B_h, src, #16
+	stp2	C_l, C_h, dst, #32
+	ldp2	C_l, C_h, src, #32
+	stp2	D_l, D_h, dst, #48
+	ldp2	D_l, D_h, src, #48
+	add	dst, dst, #64
+	add	src, src, #64
+	subs	count, count, #64
+	b.ge	1b
+
+	/* Post-loop: store the last block of data using stp2 */
+	/* (without post-increment). */
+	stp2	A_l, A_h, dst, #0
+	stp2	B_l, B_h, dst, #16
+	stp2	C_l, C_h, dst, #32
+	stp2	D_l, D_h, dst, #48
+	add	dst, dst, #64
+
+	tst	count, #0x3f
+	b.ne	.Ltail63_opt
+.Lexitfunc_opt:
diff --git a/arch/arm64/tools/cpucaps b/arch/arm64/tools/cpucaps
index d8f2db273def..ce0c60d48dff 100644
--- a/arch/arm64/tools/cpucaps
+++ b/arch/arm64/tools/cpucaps
@@ -114,7 +114,7 @@ HAS_LS64
 HAS_LS64_V
 HAS_HW_XCALL_XINT
 WORKAROUND_PHYTIUM_FT3386
-KABI_RESERVE_7
+HAS_COPY_OPT
 KABI_RESERVE_8
 KABI_RESERVE_9
 KABI_RESERVE_10
-- 
2.33.0
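
For reference (not part of the patch itself): the exception fixup in
__arch_copy_from_user_opt returns the number of bytes that were not copied,
matching the usual copy_from_user() convention. A small C model of that
arithmetic, with hypothetical names chosen purely for illustration:

	/*
	 * "add end, x0, x2" computes end = dest + n on entry, and on a fault
	 * "sub x0, end, dst" reports how many bytes were still left to copy
	 * when the faulting access happened.
	 */
	static unsigned long fixup_bytes_not_copied(unsigned long dest,
						    unsigned long n,
						    unsigned long dst_at_fault)
	{
		unsigned long end = dest + n;		/* add end, x0, x2 */

		return end - dst_at_fault;		/* sub x0, end, dst */
	}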