[PATCH OLK-6.6 0/3] optimize copy_from_user
Qi Xi (3): arm64: lib: add __arch_copy_from_user_opt with index-addressed ldp/stp arm64: uaccess: enable optimized copy_from_user for >= 4kb copies arm64: uaccess: add CONFIG and cmdline control for copy_from_user_opt .../admin-guide/kernel-parameters.txt | 6 + arch/arm64/Kconfig | 11 + arch/arm64/include/asm/asm-uaccess.h | 7 + arch/arm64/include/asm/uaccess.h | 31 ++- arch/arm64/kernel/cpufeature.c | 35 ++++ arch/arm64/lib/copy_from_user.S | 36 ++++ arch/arm64/lib/copy_template_opt.S | 189 ++++++++++++++++++ arch/arm64/tools/cpucaps | 2 +- 8 files changed, 314 insertions(+), 3 deletions(-) create mode 100644 arch/arm64/lib/copy_template_opt.S -- 2.33.0
hulk inclusion category: feature bugzilla: https://atomgit.com/openeuler/kernel/issues/9132 ------------------------------------------------------- Add __arch_copy_from_user_opt for selected HiSilicon cores. This path uses index-addressed ldp/stp pairs to copy data from user space, temporarily disables PAN during the copy, and restores it on both the success and the exception fixup paths. The new copy_template_opt.S handles the >=64B bulk copy loop while the tail is managed by the existing byte/half/word load/store macros. Signed-off-by: Qi Xi <xiqi2@huawei.com> --- .../admin-guide/kernel-parameters.txt | 6 + arch/arm64/include/asm/asm-uaccess.h | 7 + arch/arm64/kernel/cpufeature.c | 30 +++ arch/arm64/lib/copy_from_user.S | 32 +++ arch/arm64/lib/copy_template_opt.S | 189 ++++++++++++++++++ arch/arm64/tools/cpucaps | 2 +- 6 files changed, 265 insertions(+), 1 deletion(-) create mode 100644 arch/arm64/lib/copy_template_opt.S diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index d72ab64a69b9..03f59c496cc1 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -718,6 +718,12 @@ which is located in node nid, if the allocation fails, they will fallback to the global default memory area. + copy_opt_disable [ARM64] + Disable the optimized copy_from_user path on Hisilicon + CPUs that support it. By default the optimization is + enabled; this parameter forces all copies through + the standard unoptimized path. + cmo_free_hint= [PPC] Format: { yes | no } Specify whether pages are marked as being inactive when they are freed. 
This is used in CMO environments diff --git a/arch/arm64/include/asm/asm-uaccess.h b/arch/arm64/include/asm/asm-uaccess.h index 7bbebfa5b710..4edf6c7dc56f 100644 --- a/arch/arm64/include/asm/asm-uaccess.h +++ b/arch/arm64/include/asm/asm-uaccess.h @@ -94,4 +94,11 @@ alternative_else_nop_endif _asm_extable_uaccess 8888b, \l; .endm + + .macro user_ldst_pair_index l, inst, reg1, reg2, addr, val +8888: \inst \reg1, \reg2, [\addr, \val]; + + _asm_extable_uaccess 8888b, \l; + .endm + #endif diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index a1928cf3c887..3f8889d5dd8c 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -2576,6 +2576,30 @@ static void cpu_enable_arch_xcall_xint(const struct arm64_cpu_capabilities *__un } #endif +static bool copy_opt_disable __ro_after_init; + +static int __init parse_copy_opt_disable(char *str) +{ + copy_opt_disable = true; + return 0; +} +early_param("copy_opt_disable", parse_copy_opt_disable); + +static bool has_copy_opt(const struct arm64_cpu_capabilities *cap, int scope) +{ + /* List of CPUs that support copy_from_user_opt */ + static const struct midr_range copy_opt_cpus[] = { + MIDR_ALL_VERSIONS(MIDR_HISI_HIP12), + MIDR_ALL_VERSIONS(MIDR_HISI_LINXICORE9100), + { } + }; + + if (copy_opt_disable) + return false; + + return is_midr_in_range_list(copy_opt_cpus); +} + static const struct arm64_cpu_capabilities arm64_features[] = { { .capability = ARM64_ALWAYS_BOOT, @@ -3151,6 +3175,12 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .cpu_enable = cpu_enable_arch_xcall_xint, }, #endif + { + .desc = "Hisilicon Optimized Copy From User enabled", + .capability = ARM64_HAS_COPY_OPT, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .matches = has_copy_opt, + }, {}, }; diff --git a/arch/arm64/lib/copy_from_user.S b/arch/arm64/lib/copy_from_user.S index 34e317907524..59f1a4f36982 100644 --- a/arch/arm64/lib/copy_from_user.S +++ b/arch/arm64/lib/copy_from_user.S @@ -71,3 
+71,35 @@ USER(9998f, ldtrb tmp1w, [srcin]) ret SYM_FUNC_END(__arch_copy_from_user) EXPORT_SYMBOL(__arch_copy_from_user) + + .macro ldp2 reg1, reg2, ptr, val + user_ldst_pair_index 9997f, ldp, \reg1, \reg2, \ptr, \val + .endm + + .macro stp2 reg1, reg2, ptr, val + stp \reg1, \reg2, [\ptr, \val] + .endm + +SYM_FUNC_START(__arch_copy_from_user_opt) + add end, x0, x2 + mov srcin, x1 + + ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_HAS_PAN) + +#include "copy_template_opt.S" + mov x0, #0 // Nothing to copy + +.L__arch_copy_from_user_opt_exit: + ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_HAS_PAN) + ret + + // Exception fixups +9997: cmp dst, dstin + b.ne 9998f + // Before being absolutely sure we couldn't copy anything, try harder +USER(9998f, ldtrb tmp1w, [srcin]) + strb tmp1w, [dst], #1 +9998: sub x0, end, dst // bytes not copied + b .L__arch_copy_from_user_opt_exit +SYM_FUNC_END(__arch_copy_from_user_opt) +EXPORT_SYMBOL(__arch_copy_from_user_opt) diff --git a/arch/arm64/lib/copy_template_opt.S b/arch/arm64/lib/copy_template_opt.S new file mode 100644 index 000000000000..c4578ab8e399 --- /dev/null +++ b/arch/arm64/lib/copy_template_opt.S @@ -0,0 +1,189 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2026 - Huawei Ltd. + */ + +/* + * Copy a buffer from src to dest (alignment handled by the hardware) + * + * Parameters: + * x0 - dest + * x1 - src + * x2 - n + * Returns: + * x0 - dest + */ +dstin .req x0 +src .req x1 +count .req x2 +tmp1 .req x3 +tmp1w .req w3 +tmp2 .req x4 +tmp2w .req w4 +dst .req x6 + +A_l .req x7 +A_h .req x8 +B_l .req x9 +B_h .req x10 +C_l .req x11 +C_h .req x12 +D_l .req x13 +D_h .req x14 + + mov dst, dstin + cmp count, #16 + /*When memory length is less than 16, the accesses are not aligned.*/ + b.lo .Ltiny15_opt + + neg tmp2, src + ands tmp2, tmp2, #15/* Bytes to reach alignment. 
*/ + b.eq .LSrcAligned_opt + sub count, count, tmp2 + /* + * Copy the leading memory data from src to dst in an increasing + * address order.By this way,the risk of overwriting the source + * memory data is eliminated when the distance between src and + * dst is less than 16. The memory accesses here are alignment. + */ + tbz tmp2, #0, 1f + ldrb1 tmp1w, src, #1 + strb1 tmp1w, dst, #1 +1: + tbz tmp2, #1, 2f + ldrh1 tmp1w, src, #2 + strh1 tmp1w, dst, #2 +2: + tbz tmp2, #2, 3f + ldr1 tmp1w, src, #4 + str1 tmp1w, dst, #4 +3: + tbz tmp2, #3, .LSrcAligned_opt + ldr1 tmp1, src, #8 + str1 tmp1, dst, #8 + +.LSrcAligned_opt: + cmp count, #64 + b.ge .Lcpy_over64_opt + /* + * Deal with small copies quickly by dropping straight into the + * exit block. + */ +.Ltail63_opt: + /* + * Copy up to 48 bytes of data. At this point we only need the + * bottom 6 bits of count to be accurate. + */ + ands tmp1, count, #0x30 + b.eq .Ltiny15_opt + cmp tmp1w, #0x20 + b.eq 1f + b.lt 2f + ldp2 A_l, A_h, src, #0 + stp2 A_l, A_h, dst, #0 + add src, src, #16 + add dst, dst, #16 +1: + ldp2 A_l, A_h, src, #0 + stp2 A_l, A_h, dst, #0 + add src, src, #16 + add dst, dst, #16 +2: + ldp2 A_l, A_h, src, #0 + stp2 A_l, A_h, dst, #0 + add src, src, #16 + add dst, dst, #16 +.Ltiny15_opt: + /* + * Prefer to break one ldp/stp into several load/store to access + * memory in an increasing address order,rather than to load/store 16 + * bytes from (src-16) to (dst-16) and to backward the src to aligned + * address,which way is used in original cortex memcpy. If keeping + * the original memcpy process here, memmove need to satisfy the + * precondition that src address is at least 16 bytes bigger than dst + * address,otherwise some source data will be overwritten when memove + * call memcpy directly. To make memmove simpler and decouple the + * memcpy's dependency on memmove, withdrew the original process. 
+ */ + tbz count, #3, 1f + ldr1 tmp1, src, #8 + str1 tmp1, dst, #8 +1: + tbz count, #2, 2f + ldr1 tmp1w, src, #4 + str1 tmp1w, dst, #4 +2: + tbz count, #1, 3f + ldrh1 tmp1w, src, #2 + strh1 tmp1w, dst, #2 +3: + tbz count, #0, .Lexitfunc_opt + ldrb1 tmp1w, src, #1 + strb1 tmp1w, dst, #1 + + b .Lexitfunc_opt + +.Lcpy_over64_opt: + subs count, count, #128 + b.ge .Lcpy_body_large_opt + /* + * Less than 128 bytes to copy, so handle 64 here and then jump + * to the tail. + */ + ldp2 A_l, A_h, src, #0 + stp2 A_l, A_h, dst, #0 + ldp2 B_l, B_h, src, #16 + ldp2 C_l, C_h, src, #32 + stp2 B_l, B_h, dst, #16 + stp2 C_l, C_h, dst, #32 + ldp2 D_l, D_h, src, #48 + stp2 D_l, D_h, dst, #48 + add src, src, #64 + add dst, dst, #64 + + tst count, #0x3f + b.ne .Ltail63_opt + b .Lexitfunc_opt + + /* + * Critical loop. Start at a new cache line boundary. Assuming + * 64 bytes per line this ensures the entire loop is in one line. + */ + .p2align L1_CACHE_SHIFT +.Lcpy_body_large_opt: + + /* pre-get 64 bytes data. */ + ldp2 A_l, A_h, src, #0 + ldp2 B_l, B_h, src, #16 + ldp2 C_l, C_h, src, #32 + ldp2 D_l, D_h, src, #48 + add src, src, #64 +1: + /* + * interlace the load of next 64 bytes data block with store of the last + * loaded 64 bytes data. 
+ */ + stp2 A_l, A_h, dst, #0 + ldp2 A_l, A_h, src, #0 + stp2 B_l, B_h, dst, #16 + ldp2 B_l, B_h, src, #16 + stp2 C_l, C_h, dst, #32 + ldp2 C_l, C_h, src, #32 + stp2 D_l, D_h, dst, #48 + ldp2 D_l, D_h, src, #48 + add dst, dst, #64 + add src, src, #64 + subs count, count, #64 + b.ge 1b + + /* Post-loop: store the last block of data using stp2 */ + /* (without post-increment) */ + stp2 A_l, A_h, dst, #0 + stp2 B_l, B_h, dst, #16 + stp2 C_l, C_h, dst, #32 + stp2 D_l, D_h, dst, #48 + add dst, dst, #64 + + tst count, #0x3f + b.ne .Ltail63_opt +.Lexitfunc_opt: diff --git a/arch/arm64/tools/cpucaps b/arch/arm64/tools/cpucaps index d8f2db273def..ce0c60d48dff 100644 --- a/arch/arm64/tools/cpucaps +++ b/arch/arm64/tools/cpucaps @@ -114,7 +114,7 @@ HAS_LS64 HAS_LS64_V HAS_HW_XCALL_XINT WORKAROUND_PHYTIUM_FT3386 -KABI_RESERVE_7 +HAS_COPY_OPT KABI_RESERVE_8 KABI_RESERVE_9 KABI_RESERVE_10 -- 2.33.0
hulk inclusion category: feature bugzilla: https://atomgit.com/openeuler/kernel/issues/9132 ------------------------------------------------------- Use __arch_copy_from_user_opt for copies >= 4KB on CPUs with ARM64_HAS_COPY_OPT. The optimized path turns PAN off during the copy, which adds a small overhead. Only enable it for large copies so the overhead is worth the faster throughput. Signed-off-by: Qi Xi <xiqi2@huawei.com> --- arch/arm64/include/asm/uaccess.h | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h index dd0877a75922..79587588aad4 100644 --- a/arch/arm64/include/asm/uaccess.h +++ b/arch/arm64/include/asm/uaccess.h @@ -391,13 +391,29 @@ do { \ } while (0); \ } while(0) +#define COPY_OPT_THRESHOLD 4096 + +extern unsigned long __must_check __arch_copy_from_user_opt(void *to, + const void __user *from, unsigned long n); + +static __always_inline bool use_copy_opt(unsigned long n) +{ + return (n) >= COPY_OPT_THRESHOLD && + alternative_has_cap_unlikely(ARM64_HAS_COPY_OPT); +} + extern unsigned long __must_check __arch_copy_from_user(void *to, const void __user *from, unsigned long n); #define raw_copy_from_user(to, from, n) \ ({ \ unsigned long __acfu_ret; \ uaccess_ttbr0_enable(); \ - __acfu_ret = __arch_copy_from_user((to), \ - __uaccess_mask_ptr(from), (n)); \ + if (use_copy_opt(n)) { \ + __acfu_ret = __arch_copy_from_user_opt((to), \ + __uaccess_mask_ptr(from), (n)); \ + } else { \ + __acfu_ret = __arch_copy_from_user((to), \ + __uaccess_mask_ptr(from), (n)); \ + } \ uaccess_ttbr0_disable(); \ __acfu_ret; \ }) -- 2.33.0
hulk inclusion category: feature bugzilla: https://atomgit.com/openeuler/kernel/issues/9132 ------------------------------------------------------- Introduce CONFIG_ARM64_COPY_FROM_USER_OPT to control compilation of the optimized copy_from_user implementation, and a "copy_opt_disable" cmdline parameter to disable it. Signed-off-by: Qi Xi <xiqi2@huawei.com> --- arch/arm64/Kconfig | 11 +++++++++++ arch/arm64/include/asm/uaccess.h | 11 +++++++++++ arch/arm64/kernel/cpufeature.c | 5 +++++ arch/arm64/lib/copy_from_user.S | 4 ++++ 4 files changed, 31 insertions(+) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index c3b38c890b45..eac62767857b 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -2626,6 +2626,17 @@ config CLEAR_USER_WORKAROUND help It has better performance while make sttr instruction 32-aligned in __arch_clear_user(). +config ARM64_COPY_FROM_USER_OPT + bool "Hisilicon Optimized Copy From User enabled" + depends on ARCH_HISI + default y + help + Enable an optimized copy_from_user implementation for + supported cores (e.g. Hisilicon HIP12, Linxicore9100). + This trades a small PAN toggle overhead for higher + throughput on large copies. This can be disabled at + boot via the "copy_opt_disable" kernel parameter. 
+ endmenu # "Kernel Features" menu "Boot options" diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h index 79587588aad4..08a4ceef4a81 100644 --- a/arch/arm64/include/asm/uaccess.h +++ b/arch/arm64/include/asm/uaccess.h @@ -391,6 +391,8 @@ do { \ } while (0); \ } while(0) +#ifdef CONFIG_ARM64_COPY_FROM_USER_OPT + #define COPY_OPT_THRESHOLD 4096 extern unsigned long __must_check __arch_copy_from_user_opt(void *to, @@ -402,6 +404,15 @@ static __always_inline bool use_copy_opt(unsigned long n) alternative_has_cap_unlikely(ARM64_HAS_COPY_OPT); } +#else /* !CONFIG_ARM64_COPY_FROM_USER_OPT */ + +static __always_inline bool use_copy_opt(unsigned long n) +{ + return false; +} + +#endif /* CONFIG_ARM64_COPY_FROM_USER_OPT */ + extern unsigned long __must_check __arch_copy_from_user(void *to, const void __user *from, unsigned long n); #define raw_copy_from_user(to, from, n) \ ({ \ diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 3f8889d5dd8c..659d6b9f12a5 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -2576,6 +2576,8 @@ static void cpu_enable_arch_xcall_xint(const struct arm64_cpu_capabilities *__un } #endif +#ifdef CONFIG_ARM64_COPY_FROM_USER_OPT + static bool copy_opt_disable __ro_after_init; static int __init parse_copy_opt_disable(char *str) @@ -2599,6 +2601,7 @@ static bool has_copy_opt(const struct arm64_cpu_capabilities *cap, int scope) return is_midr_in_range_list(copy_opt_cpus); } +#endif static const struct arm64_cpu_capabilities arm64_features[] = { { @@ -3175,12 +3178,14 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .cpu_enable = cpu_enable_arch_xcall_xint, }, #endif +#ifdef CONFIG_ARM64_COPY_FROM_USER_OPT { .desc = "Hisilicon Optimized Copy From User enabled", .capability = ARM64_HAS_COPY_OPT, .type = ARM64_CPUCAP_SYSTEM_FEATURE, .matches = has_copy_opt, }, +#endif {}, }; diff --git a/arch/arm64/lib/copy_from_user.S 
b/arch/arm64/lib/copy_from_user.S index 59f1a4f36982..c3f24258803a 100644 --- a/arch/arm64/lib/copy_from_user.S +++ b/arch/arm64/lib/copy_from_user.S @@ -72,6 +72,8 @@ USER(9998f, ldtrb tmp1w, [srcin]) SYM_FUNC_END(__arch_copy_from_user) EXPORT_SYMBOL(__arch_copy_from_user) +#ifdef CONFIG_ARM64_COPY_FROM_USER_OPT + .macro ldp2 reg1, reg2, ptr, val user_ldst_pair_index 9997f, ldp, \reg1, \reg2, \ptr, \val .endm @@ -103,3 +105,5 @@ USER(9998f, ldtrb tmp1w, [srcin]) b .L__arch_copy_from_user_opt_exit SYM_FUNC_END(__arch_copy_from_user_opt) EXPORT_SYMBOL(__arch_copy_from_user_opt) + +#endif /* CONFIG_ARM64_COPY_FROM_USER_OPT */ -- 2.33.0
反馈: 您发送到kernel@openeuler.org的补丁/补丁集,已成功转换为PR! PR链接地址: https://atomgit.com/openeuler/kernel/merge_requests/22448 邮件列表地址:https://mailweb.openeuler.org/archives/list/kernel@openeuler.org/message/XYH... FeedBack: The patch(es) which you have sent to kernel@openeuler.org mailing list has been converted to a pull request successfully! Pull request link: https://atomgit.com/openeuler/kernel/merge_requests/22448 Mailing list address: https://mailweb.openeuler.org/archives/list/kernel@openeuler.org/message/XYH...
participants (2)
-
patchwork bot -
Qi Xi