[PATCH v1 openEuler-26.09] Add copy to/from/in user with vectorization support
From: Artem Kuzin <artem.kuzin@huawei.com>

kunpeng inclusion
category: feature
bugzilla: https://atomgit.com/openeuler/kernel/issues/8445

-------------------------------------------------

1. The implementation uses st1/ld1 four-register vector instructions, which copy 64 bytes at once.
2. The vectorized copy code is used only if the size of the data block to copy is more than 128 bytes.
3. To use this functionality, set the configuration switch CONFIG_USE_VECTORIZED_COPY=y.
4. The code can be used on any ARMv8 variant.
5. In-kernel copy functions such as memcpy() are not supported yet, but can be enabled in the future.
6. For now we use a lightweight version of register context saving/restoration (4 registers).

We introduce support for vectorization in the copy_from/to/in_user functions. It currently works in parallel with the original FPSIMD/SVE vectorization and does not affect it in any way.

A dedicated thread flag, TIF_KERNEL_FPSIMD, is set while the lightweight vectorization is in use in the kernel. struct thread_struct is extended with two fields: the user space FPSIMD state and the kernel FPSIMD state. The user space state is used by kernel_fpsimd_begin() and kernel_fpsimd_end(), which wrap lightweight FPSIMD context usage in kernel space; the kernel state is used to handle thread switches.

Nested calls of kernel_neon_begin()/kernel_fpsimd_begin() are not supported, and there are no plans to support them in the future; this is not necessary.

We save the lightweight FPSIMD context in kernel_fpsimd_begin() and restore it in kernel_fpsimd_end(). On a thread switch we preserve the kernel FPSIMD context and restore the user space one, if any; this prevents corruption of the user space FPSIMD state. Before switching to the next thread we restore its kernel FPSIMD context, if any.

FPSIMD may be used in bottom halves: if a bottom half preempts a task, the TIF_KERNEL_FPSIMD flag is checked and the contexts are saved/restored around the handler. Context management is quite lightweight and is executed only when the TIF_KERNEL_FPSIMD flag is set.

To enable this feature, manually modify one of the appropriate entries:
/proc/sys/vm/copy_from_user_threshold
/proc/sys/vm/copy_in_user_threshold
/proc/sys/vm/copy_to_user_threshold

Allowed values are the following:
-1        - feature disabled
 0        - feature always enabled
 n (n > 0) - feature enabled if the copied size is greater than n bytes

P.S.: What I personally don't like in the current approach:
1. The additional fields and flag in the task struct look quite ugly.
2. There is no way to configure, from user space, the size of the chunk copied using FPSIMD.
3. The FPSIMD-based memory movement is not generic; it needs to be enabled for memmove(), memcpy() and friends in the future.
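For reference, the fast-path selection that this series wires into raw_copy_from_user() (see the uaccess.h hunk below) boils down to the condensed sketch that follows. The function name is illustrative only; the CONFIG_VECTORIZED_COPY_VALIDATE branch is omitted, and the copy_to_user side is analogous.

/*
 * Condensed sketch of the fast-path selection in raw_copy_from_user().
 * All helpers are the ones introduced or used by the patch below.
 */
static __always_inline unsigned long
raw_copy_from_user_sketch(void *to, const void __user *from, unsigned long n)
{
	unsigned long ret;

	/*
	 * -1 disables the FPSIMD path, small copies stay on the regular path,
	 * and kernel_fpsimd_begin() refuses IRQ/NMI context or IRQs-off.
	 */
	if (sysctl_copy_from_user_threshold == -1 ||
	    n < sysctl_copy_from_user_threshold ||
	    !kernel_fpsimd_begin()) {
		uaccess_ttbr0_enable();
		ret = __arch_copy_from_user(to, __uaccess_mask_ptr(from), n);
		uaccess_ttbr0_disable();
		return ret;
	}

	/* FPSIMD path: v20-v23 were saved by kernel_fpsimd_begin(). */
	uaccess_enable_privileged();
	ret = __arch_copy_from_user_fpsimd(to, __uaccess_mask_ptr(from), n);
	uaccess_disable_privileged();
	kernel_fpsimd_end();

	return ret;
}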
Co-developed-by: Alexander Kozhevnikov <alexander.kozhevnikov@huawei-partners.com> Signed-off-by: Alexander Kozhevnikov <alexander.kozhevnikov@huawei-partners.com> Co-developed-by: Nikita Panov <panov.nikita@huawei.com> Signed-off-by: Nikita Panov <panov.nikita@huawei.com> Signed-off-by: Artem Kuzin <artem.kuzin@huawei.com> --- arch/arm64/Kconfig | 15 ++ arch/arm64/configs/openeuler_defconfig | 2 + arch/arm64/include/asm/fpsimd.h | 15 ++ arch/arm64/include/asm/fpsimdmacros.h | 14 ++ arch/arm64/include/asm/neon.h | 28 ++++ arch/arm64/include/asm/processor.h | 10 ++ arch/arm64/include/asm/thread_info.h | 5 + arch/arm64/include/asm/uaccess.h | 218 ++++++++++++++++++++++++- arch/arm64/kernel/entry-fpsimd.S | 22 +++ arch/arm64/kernel/fpsimd.c | 102 +++++++++++- arch/arm64/kernel/process.c | 2 +- arch/arm64/lib/copy_from_user.S | 30 ++++ arch/arm64/lib/copy_template_fpsimd.S | 180 ++++++++++++++++++++ arch/arm64/lib/copy_to_user.S | 30 ++++ kernel/softirq.c | 34 ++++ kernel/sysctl.c | 34 ++++ 16 files changed, 734 insertions(+), 7 deletions(-) create mode 100644 arch/arm64/lib/copy_template_fpsimd.S diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index c3b38c890b45..8904e6476e3b 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1870,6 +1870,21 @@ config ARM64_ILP32 is an ABI where long and pointers are 32bits but it uses the AARCH64 instruction set. +config USE_VECTORIZED_COPY + bool "Use vectorized instructions in copy_to/from user" + depends on KERNEL_MODE_NEON + default y + help + This option turns on vectorization to speed up copy_to/from_user routines. + +config VECTORIZED_COPY_VALIDATE + bool "Validate result of vectorized copy using regular implementation" + depends on KERNEL_MODE_NEON + depends on USE_VECTORIZED_COPY + default n + help + This option turns on vectorization to speed up copy_to/from_user routines. 
+ menuconfig AARCH32_EL0 bool "Kernel support for 32-bit EL0" depends on ARM64_4K_PAGES || EXPERT diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig index 9e7bc82cba3a..9843dec071bf 100644 --- a/arch/arm64/configs/openeuler_defconfig +++ b/arch/arm64/configs/openeuler_defconfig @@ -527,6 +527,8 @@ CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY=y # CONFIG_RODATA_FULL_DEFAULT_ENABLED is not set # CONFIG_ARM64_SW_TTBR0_PAN is not set CONFIG_ARM64_TAGGED_ADDR_ABI=y +CONFIG_USE_VECTORIZED_COPY=y +# CONFIG_VECTORIZED_COPY_VALIDATE is not set CONFIG_AARCH32_EL0=y # CONFIG_KUSER_HELPERS is not set # CONFIG_COMPAT_ALIGNMENT_FIXUPS is not set diff --git a/arch/arm64/include/asm/fpsimd.h b/arch/arm64/include/asm/fpsimd.h index b6c6949984d8..1fc9089b4a47 100644 --- a/arch/arm64/include/asm/fpsimd.h +++ b/arch/arm64/include/asm/fpsimd.h @@ -46,6 +46,21 @@ struct task_struct; +#ifdef CONFIG_USE_VECTORIZED_COPY +extern void fpsimd_save_state_light(struct fpsimd_state *state); +extern void fpsimd_load_state_light(struct fpsimd_state *state); +#else +static inline void fpsimd_save_state_light(struct fpsimd_state *state) +{ + (void) state; +} + +static inline void fpsimd_load_state_light(struct fpsimd_state *state) +{ + (void) state; +} +#endif + extern void fpsimd_save_state(struct user_fpsimd_state *state); extern void fpsimd_load_state(struct user_fpsimd_state *state); diff --git a/arch/arm64/include/asm/fpsimdmacros.h b/arch/arm64/include/asm/fpsimdmacros.h index cdf6a35e3994..df9d3ed91931 100644 --- a/arch/arm64/include/asm/fpsimdmacros.h +++ b/arch/arm64/include/asm/fpsimdmacros.h @@ -8,6 +8,20 @@ #include <asm/assembler.h> +#ifdef CONFIG_USE_VECTORIZED_COPY +/* Lightweight fpsimd context saving/restoration. + * Necessary for vectorized kernel memory movement + * implementation + */ +.macro fpsimd_save_light state + st1 {v20.16b, v21.16b, v22.16b, v23.16b}, [\state] +.endm + +.macro fpsimd_restore_light state + ld1 {v20.16b, v21.16b, v22.16b, v23.16b}, [\state] +.endm +#endif + .macro fpsimd_save state, tmpnr stp q0, q1, [\state, #16 * 0] stp q2, q3, [\state, #16 * 2] diff --git a/arch/arm64/include/asm/neon.h b/arch/arm64/include/asm/neon.h index d4b1d172a79b..ab84b194d7b3 100644 --- a/arch/arm64/include/asm/neon.h +++ b/arch/arm64/include/asm/neon.h @@ -16,4 +16,32 @@ void kernel_neon_begin(void); void kernel_neon_end(void); +#ifdef CONFIG_USE_VECTORIZED_COPY +bool kernel_fpsimd_begin(void); +void kernel_fpsimd_end(void); +/* Functions to use in non-preemptible context */ +void _kernel_fpsimd_save(struct fpsimd_state *state); +void _kernel_fpsimd_load(struct fpsimd_state *state); +#else +bool kernel_fpsimd_begin(void) +{ + return false; +} + +void kernel_fpsimd_end(void) +{ +} + +/* Functions to use in non-preemptible context */ +void _kernel_fpsimd_save(struct fpsimd_state *state) +{ + (void) state; +} + +void _kernel_fpsimd_load(struct fpsimd_state *state) +{ + (void) state; +} +#endif + #endif /* ! 
__ASM_NEON_H */ diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h index 9e688b1b13d4..9b81dbcd2126 100644 --- a/arch/arm64/include/asm/processor.h +++ b/arch/arm64/include/asm/processor.h @@ -153,6 +153,10 @@ struct cpu_context { unsigned long pc; }; +struct fpsimd_state { + __uint128_t v[4]; +}; + struct thread_struct { struct cpu_context cpu_context; /* cpu context */ @@ -196,6 +200,12 @@ struct thread_struct { KABI_RESERVE(6) KABI_RESERVE(7) KABI_RESERVE(8) +#ifdef CONFIG_USE_VECTORIZED_COPY + KABI_EXTEND( + struct fpsimd_state ustate; + struct fpsimd_state kstate; + ) +#endif }; static inline unsigned int thread_get_vl(struct thread_struct *thread, diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h index 379d24059f5b..60d0be8a2d58 100644 --- a/arch/arm64/include/asm/thread_info.h +++ b/arch/arm64/include/asm/thread_info.h @@ -89,6 +89,9 @@ void arch_setup_new_exec(void); #define TIF_SME 27 /* SME in use */ #define TIF_SME_VL_INHERIT 28 /* Inherit SME vl_onexec across exec */ #define TIF_32BIT_AARCH64 29 /* 32 bit process on AArch64(ILP32) */ +#define TIF_KERNEL_FPSIMD 31 /* Use FPSIMD in kernel */ +#define TIF_PRIV_UACC_ENABLED 32 /* Whether priviliged uaccess was manually enabled */ + #define _TIF_SIGPENDING (1 << TIF_SIGPENDING) #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) @@ -107,6 +110,8 @@ void arch_setup_new_exec(void); #define _TIF_MTE_ASYNC_FAULT (1 << TIF_MTE_ASYNC_FAULT) #define _TIF_NOTIFY_SIGNAL (1 << TIF_NOTIFY_SIGNAL) #define _TIF_32BIT_AARCH64 (1 << TIF_32BIT_AARCH64) +#define _TIF_KERNEL_FPSIMD (1 << TIF_KERNEL_FPSIMD) +#define _TIF_PRIV_UACC_ENABLED (1 << TIF_PRIV_UACC_ENABLED) #define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \ _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE | \ diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h index dd0877a75922..fc9f1a40624d 100644 --- a/arch/arm64/include/asm/uaccess.h +++ b/arch/arm64/include/asm/uaccess.h @@ -26,6 +26,10 @@ #include <asm/memory.h> #include <asm/extable.h> +#ifndef __GENKSYMS__ +#include <asm/neon.h> +#endif + static inline int __access_ok(const void __user *ptr, unsigned long size); /* @@ -134,7 +138,7 @@ static inline void __uaccess_enable_hw_pan(void) CONFIG_ARM64_PAN)); } -static inline void uaccess_disable_privileged(void) +static inline void __uaccess_disable_privileged(void) { mte_disable_tco(); @@ -144,7 +148,22 @@ static inline void uaccess_disable_privileged(void) __uaccess_enable_hw_pan(); } -static inline void uaccess_enable_privileged(void) +static inline void uaccess_disable_privileged(void) +{ + preempt_disable(); + + if (!test_and_clear_thread_flag(TIF_PRIV_UACC_ENABLED)) { + WARN_ON(1); + preempt_enable(); + return; + } + + __uaccess_disable_privileged(); + + preempt_enable(); +} + +static inline void __uaccess_enable_privileged(void) { mte_enable_tco(); @@ -154,6 +173,47 @@ static inline void uaccess_enable_privileged(void) __uaccess_disable_hw_pan(); } +static inline void uaccess_enable_privileged(void) +{ + preempt_disable(); + + if (test_and_set_thread_flag(TIF_PRIV_UACC_ENABLED)) { + WARN_ON(1); + preempt_enable(); + return; + } + + __uaccess_enable_privileged(); + + preempt_enable(); +} + +static inline void uaccess_priviliged_context_switch(struct task_struct *next) +{ + bool curr_enabled = !!test_thread_flag(TIF_PRIV_UACC_ENABLED); + bool next_enabled = !!test_ti_thread_flag(&next->thread_info, TIF_PRIV_UACC_ENABLED); + + if (curr_enabled == next_enabled) + return; + + if 
(curr_enabled) + __uaccess_disable_privileged(); + else + __uaccess_enable_privileged(); +} + +static inline void uaccess_priviliged_state_save(void) +{ + if (test_thread_flag(TIF_PRIV_UACC_ENABLED)) + __uaccess_disable_privileged(); +} + +static inline void uaccess_priviliged_state_restore(void) +{ + if (test_thread_flag(TIF_PRIV_UACC_ENABLED)) + __uaccess_enable_privileged(); +} + /* * Sanitize a uaccess pointer such that it cannot reach any kernel address. * @@ -391,7 +451,97 @@ do { \ } while (0); \ } while(0) -extern unsigned long __must_check __arch_copy_from_user(void *to, const void __user *from, unsigned long n); +#define USER_COPY_CHUNK_SIZE 4096 + +#ifdef CONFIG_USE_VECTORIZED_COPY + +extern int sysctl_copy_from_user_threshold; + +#define verify_fpsimd_copy(to, from, n, ret) \ +({ \ + unsigned long __verify_ret = 0; \ + __verify_ret = memcmp(to, from, ret ? n - ret : n); \ + if (__verify_ret) \ + pr_err("FPSIMD:%s inconsistent state\n", __func__); \ + if (ret) \ + pr_err("FPSIMD:%s failed to copy data, expected=%lu, copied=%lu\n", __func__, n, n - ret); \ + __verify_ret |= ret; \ + __verify_ret; \ +}) + +#define compare_fpsimd_copy(to, from, n, ret_fpsimd, ret) \ +({ \ + unsigned long __verify_ret = 0; \ + __verify_ret = memcmp(to, from, ret ? n - ret : n); \ + if (__verify_ret) \ + pr_err("FIXUP:%s inconsistent state\n", __func__); \ + if (ret) \ + pr_err("FIXUP:%s failed to copy data, expected=%lu, copied=%lu\n", __func__, n, n - ret); \ + __verify_ret |= ret; \ + if (ret_fpsimd != ret) { \ + pr_err("FIXUP:%s difference between FPSIMD %lu and regular %lu\n", __func__, n - ret_fpsimd, n - ret); \ + __verify_ret |= 1; \ + } else { \ + __verify_ret = 0; \ + } \ + __verify_ret; \ +}) + +extern unsigned long __must_check +__arch_copy_from_user(void *to, const void __user *from, unsigned long n); + +extern unsigned long __must_check +__arch_copy_from_user_fpsimd(void *to, const void __user *from, unsigned long n); + +static __always_inline unsigned long __must_check +raw_copy_from_user(void *to, const void __user *from, unsigned long n) +{ + unsigned long __acfu_ret; + + if (sysctl_copy_from_user_threshold == -1 || n < sysctl_copy_from_user_threshold) { + uaccess_ttbr0_enable(); + __acfu_ret = __arch_copy_from_user(to, + __uaccess_mask_ptr(from), n); + uaccess_ttbr0_disable(); + } else { + if (kernel_fpsimd_begin()) { + unsigned long __acfu_ret_fpsimd; + + uaccess_enable_privileged(); + __acfu_ret_fpsimd = __arch_copy_from_user_fpsimd((to), + __uaccess_mask_ptr(from), n); + uaccess_disable_privileged(); + + __acfu_ret = __acfu_ret_fpsimd; + kernel_fpsimd_end(); +#ifdef CONFIG_VECTORIZED_COPY_VALIDATE + if (verify_fpsimd_copy(to, __uaccess_mask_ptr(from), n, + __acfu_ret)) { + + uaccess_ttbr0_enable(); + __acfu_ret = __arch_copy_from_user((to), + __uaccess_mask_ptr(from), n); + uaccess_ttbr0_disable(); + + compare_fpsimd_copy(to, __uaccess_mask_ptr(from), n, + __acfu_ret_fpsimd, __acfu_ret); + } +#endif + } else { + uaccess_ttbr0_enable(); + __acfu_ret = __arch_copy_from_user((to), + __uaccess_mask_ptr(from), n); + uaccess_ttbr0_disable(); + } + } + + + return __acfu_ret; +} +#else +extern unsigned long __must_check +__arch_copy_from_user(void *to, const void __user *from, unsigned long n); + #define raw_copy_from_user(to, from, n) \ ({ \ unsigned long __acfu_ret; \ @@ -402,7 +552,66 @@ extern unsigned long __must_check __arch_copy_from_user(void *to, const void __u __acfu_ret; \ }) -extern unsigned long __must_check __arch_copy_to_user(void __user *to, const void *from, unsigned 
long n); +#endif + +#ifdef CONFIG_USE_VECTORIZED_COPY + +extern int sysctl_copy_to_user_threshold; + +extern unsigned long __must_check +__arch_copy_to_user(void __user *to, const void *from, unsigned long n); + +extern unsigned long __must_check +__arch_copy_to_user_fpsimd(void __user *to, const void *from, unsigned long n); + +static __always_inline unsigned long __must_check +raw_copy_to_user(void __user *to, const void *from, unsigned long n) +{ + unsigned long __actu_ret; + + + if (sysctl_copy_to_user_threshold == -1 || n < sysctl_copy_to_user_threshold) { + uaccess_ttbr0_enable(); + __actu_ret = __arch_copy_to_user(__uaccess_mask_ptr(to), + from, n); + uaccess_ttbr0_disable(); + } else { + if (kernel_fpsimd_begin()) { + unsigned long __actu_ret_fpsimd; + + uaccess_enable_privileged(); + __actu_ret_fpsimd = __arch_copy_to_user_fpsimd(__uaccess_mask_ptr(to), + from, n); + uaccess_disable_privileged(); + + kernel_fpsimd_end(); + __actu_ret = __actu_ret_fpsimd; +#ifdef CONFIG_VECTORIZED_COPY_VALIDATE + if (verify_fpsimd_copy(__uaccess_mask_ptr(to), from, n, + __actu_ret)) { + uaccess_ttbr0_enable(); + __actu_ret = __arch_copy_to_user(__uaccess_mask_ptr(to), + from, n); + uaccess_ttbr0_disable(); + + compare_fpsimd_copy(__uaccess_mask_ptr(to), from, n, + __actu_ret_fpsimd, __actu_ret); + } +#endif + } else { + uaccess_ttbr0_enable(); + __actu_ret = __arch_copy_to_user(__uaccess_mask_ptr(to), + from, n); + uaccess_ttbr0_disable(); + } + } + + return __actu_ret; +} +#else +extern unsigned long __must_check +__arch_copy_to_user(void __user *to, const void *from, unsigned long n); + #define raw_copy_to_user(to, from, n) \ ({ \ unsigned long __actu_ret; \ @@ -412,6 +621,7 @@ extern unsigned long __must_check __arch_copy_to_user(void __user *to, const voi uaccess_ttbr0_disable(); \ __actu_ret; \ }) +#endif static __must_check __always_inline bool user_access_begin(const void __user *ptr, size_t len) { diff --git a/arch/arm64/kernel/entry-fpsimd.S b/arch/arm64/kernel/entry-fpsimd.S index 6325db1a2179..6660465f1b7c 100644 --- a/arch/arm64/kernel/entry-fpsimd.S +++ b/arch/arm64/kernel/entry-fpsimd.S @@ -11,6 +11,28 @@ #include <asm/assembler.h> #include <asm/fpsimdmacros.h> +#ifdef CONFIG_USE_VECTORIZED_COPY +/* + * Save the FP registers. + * + * x0 - pointer to struct fpsimd_state_light + */ +SYM_FUNC_START(fpsimd_save_state_light) + fpsimd_save_light x0 + ret +SYM_FUNC_END(fpsimd_save_state_light) + +/* + * Load the FP registers. + * + * x0 - pointer to struct fpsimd_state_light + */ +SYM_FUNC_START(fpsimd_load_state_light) + fpsimd_restore_light x0 + ret +SYM_FUNC_END(fpsimd_load_state_light) +#endif + /* * Save the FP registers. * diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c index 998906b75075..1b6b1accfbbc 100644 --- a/arch/arm64/kernel/fpsimd.c +++ b/arch/arm64/kernel/fpsimd.c @@ -1579,6 +1579,11 @@ void do_fpsimd_exc(unsigned long esr, struct pt_regs *regs) current); } +#ifdef CONFIG_USE_VECTORIZED_COPY +static void kernel_fpsimd_rollback_changes(void); +static void kernel_fpsimd_restore_changes(struct task_struct *tsk); +#endif + void fpsimd_thread_switch(struct task_struct *next) { bool wrong_task, wrong_cpu; @@ -1587,10 +1592,11 @@ void fpsimd_thread_switch(struct task_struct *next) return; __get_cpu_fpsimd_context(); - +#ifdef CONFIG_USE_VECTORIZED_COPY + kernel_fpsimd_rollback_changes(); +#endif /* Save unsaved fpsimd state, if any: */ fpsimd_save(); - /* * Fix up TIF_FOREIGN_FPSTATE to correctly describe next's * state. 
For kernel threads, FPSIMD registers are never loaded @@ -1603,6 +1609,9 @@ void fpsimd_thread_switch(struct task_struct *next) update_tsk_thread_flag(next, TIF_FOREIGN_FPSTATE, wrong_task || wrong_cpu); +#ifdef CONFIG_USE_VECTORIZED_COPY + kernel_fpsimd_restore_changes(next); +#endif __put_cpu_fpsimd_context(); } @@ -1933,6 +1942,95 @@ void kernel_neon_end(void) } EXPORT_SYMBOL_GPL(kernel_neon_end); +#ifdef CONFIG_USE_VECTORIZED_COPY +bool kernel_fpsimd_begin(void) +{ + if (WARN_ON(!system_capabilities_finalized()) || + !system_supports_fpsimd() || + in_irq() || irqs_disabled() || in_nmi()) + return false; + + preempt_disable(); + if (test_and_set_thread_flag(TIF_KERNEL_FPSIMD)) { + preempt_enable(); + + WARN_ON(1); + return false; + } + + /* + * Leaving streaming mode enabled will cause issues for any kernel + * NEON and leaving streaming mode or ZA enabled may increase power + * consumption. + */ + if (system_supports_sme()) + sme_smstop(); + + fpsimd_save_state_light(¤t->thread.ustate); + preempt_enable(); + + return true; +} +EXPORT_SYMBOL(kernel_fpsimd_begin); + +void kernel_fpsimd_end(void) +{ + if (!system_supports_fpsimd()) + return; + + preempt_disable(); + if (test_and_clear_thread_flag(TIF_KERNEL_FPSIMD)) + fpsimd_load_state_light(¤t->thread.ustate); + + preempt_enable(); +} +EXPORT_SYMBOL(kernel_fpsimd_end); + +void _kernel_fpsimd_save(struct fpsimd_state *state) +{ + if (!system_supports_fpsimd()) + return; + + BUG_ON(preemptible()); + if (test_thread_flag(TIF_KERNEL_FPSIMD)) + fpsimd_save_state_light(state); +} + +void _kernel_fpsimd_load(struct fpsimd_state *state) +{ + if (!system_supports_fpsimd()) + return; + + BUG_ON(preemptible()); + if (test_thread_flag(TIF_KERNEL_FPSIMD)) + fpsimd_load_state_light(state); +} + +static void kernel_fpsimd_rollback_changes(void) +{ + if (!system_supports_fpsimd()) + return; + + BUG_ON(preemptible()); + if (test_thread_flag(TIF_KERNEL_FPSIMD)) { + fpsimd_save_state_light(¤t->thread.kstate); + fpsimd_load_state_light(¤t->thread.ustate); + } +} + +static void kernel_fpsimd_restore_changes(struct task_struct *tsk) +{ + if (!system_supports_fpsimd()) + return; + + BUG_ON(preemptible()); + if (test_ti_thread_flag(task_thread_info(tsk), TIF_KERNEL_FPSIMD)) { + fpsimd_save_state_light(&tsk->thread.ustate); + fpsimd_load_state_light(&tsk->thread.kstate); + } +} +#endif + #ifdef CONFIG_EFI static DEFINE_PER_CPU(struct user_fpsimd_state, efi_fpsimd_state); diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index e9e5ce956f15..fd895189cb7e 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -529,7 +529,7 @@ struct task_struct *__switch_to(struct task_struct *prev, struct task_struct *next) { struct task_struct *last; - + uaccess_priviliged_context_switch(next); fpsimd_thread_switch(next); tls_thread_switch(next); hw_breakpoint_thread_switch(next); diff --git a/arch/arm64/lib/copy_from_user.S b/arch/arm64/lib/copy_from_user.S index 34e317907524..60dc63e10233 100644 --- a/arch/arm64/lib/copy_from_user.S +++ b/arch/arm64/lib/copy_from_user.S @@ -71,3 +71,33 @@ USER(9998f, ldtrb tmp1w, [srcin]) ret SYM_FUNC_END(__arch_copy_from_user) EXPORT_SYMBOL(__arch_copy_from_user) + + + +#ifdef CONFIG_USE_VECTORIZED_COPY + .macro ldsve reg1, reg2, reg3, reg4, ptr + USER(9997f, ld1 {\reg1, \reg2, \reg3, \reg4}, [\ptr]) + .endm + + .macro stsve reg1, reg2, reg3, reg4, ptr + KERNEL_ME_SAFE(9998f, st1 {\reg1, \reg2, \reg3, \reg4}, [\ptr]) + .endm + +SYM_FUNC_START(__arch_copy_from_user_fpsimd) + add end, x0, x2 + mov 
srcin, x1 +#include "copy_template_fpsimd.S" + mov x0, #0 // Nothing to copy + ret + + // Exception fixups +9997: cmp dst, dstin + b.ne 9998f + // Before being absolutely sure we couldn't copy anything, try harder +USER(9998f, ldtrb tmp1w, [srcin]) + strb tmp1w, [dst], #1 +9998: sub x0, end, dst // bytes not copied + ret +SYM_FUNC_END(__arch_copy_from_user_fpsimd) +EXPORT_SYMBOL(__arch_copy_from_user_fpsimd) +#endif \ No newline at end of file diff --git a/arch/arm64/lib/copy_template_fpsimd.S b/arch/arm64/lib/copy_template_fpsimd.S new file mode 100644 index 000000000000..9b2e7ce1e4d2 --- /dev/null +++ b/arch/arm64/lib/copy_template_fpsimd.S @@ -0,0 +1,180 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2013 ARM Ltd. + * Copyright (C) 2013 Linaro. + * + * This code is based on glibc cortex strings work originally authored by Linaro + * be found @ + * + * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/ + * files/head:/src/aarch64/ + */ + +/* + * Copy a buffer from src to dest (alignment handled by the hardware) + * + * Parameters: + * x0 - dest + * x1 - src + * x2 - n + * Returns: + * x0 - dest + */ +dstin .req x0 +src .req x1 +count .req x2 +tmp1 .req x3 +tmp1w .req w3 +tmp2 .req x4 +tmp2w .req w4 +dst .req x6 + +A_l .req x7 +A_h .req x8 +B_l .req x9 +B_h .req x10 +C_l .req x11 +C_h .req x12 +D_l .req x13 +D_h .req x14 + +V_a .req v20 +V_b .req v21 +V_c .req v22 +V_d .req v23 + + mov dst, dstin + cmp count, #16 + /*When memory length is less than 16, the accessed are not aligned.*/ + b.lo .Ltiny15_fpsimd + + neg tmp2, src + ands tmp2, tmp2, #15/* Bytes to reach alignment. */ + b.eq .LSrcAligned_fpsimd + sub count, count, tmp2 + /* + * Copy the leading memory data from src to dst in an increasing + * address order.By this way,the risk of overwriting the source + * memory data is eliminated when the distance between src and + * dst is less than 16. The memory accesses here are alignment. + */ + tbz tmp2, #0, 1f + ldrb1 tmp1w, src, #1 + strb1 tmp1w, dst, #1 +1: + tbz tmp2, #1, 2f + ldrh1 tmp1w, src, #2 + strh1 tmp1w, dst, #2 +2: + tbz tmp2, #2, 3f + ldr1 tmp1w, src, #4 + str1 tmp1w, dst, #4 +3: + tbz tmp2, #3, .LSrcAligned_fpsimd + ldr1 tmp1, src, #8 + str1 tmp1, dst, #8 + +.LSrcAligned_fpsimd: + cmp count, #64 + b.ge .Lcpy_over64_fpsimd + /* + * Deal with small copies quickly by dropping straight into the + * exit block. + */ +.Ltail63_fpsimd: + /* + * Copy up to 48 bytes of data. At this point we only need the + * bottom 6 bits of count to be accurate. + */ + ands tmp1, count, #0x30 + b.eq .Ltiny15_fpsimd + cmp tmp1w, #0x20 + b.eq 1f + b.lt 2f + ldp1 A_l, A_h, src, #16 + stp1 A_l, A_h, dst, #16 +1: + ldp1 A_l, A_h, src, #16 + stp1 A_l, A_h, dst, #16 +2: + ldp1 A_l, A_h, src, #16 + stp1 A_l, A_h, dst, #16 +.Ltiny15_fpsimd: + /* + * Prefer to break one ldp/stp into several load/store to access + * memory in an increasing address order,rather than to load/store 16 + * bytes from (src-16) to (dst-16) and to backward the src to aligned + * address,which way is used in original cortex memcpy. If keeping + * the original memcpy process here, memmove need to satisfy the + * precondition that src address is at least 16 bytes bigger than dst + * address,otherwise some source data will be overwritten when memove + * call memcpy directly. To make memmove simpler and decouple the + * memcpy's dependency on memmove, withdrew the original process. 
+ */ + tbz count, #3, 1f + ldr1 tmp1, src, #8 + str1 tmp1, dst, #8 +1: + tbz count, #2, 2f + ldr1 tmp1w, src, #4 + str1 tmp1w, dst, #4 +2: + tbz count, #1, 3f + ldrh1 tmp1w, src, #2 + strh1 tmp1w, dst, #2 +3: + tbz count, #0, .Lexitfunc_fpsimd + ldrb1 tmp1w, src, #1 + strb1 tmp1w, dst, #1 + + b .Lexitfunc_fpsimd + +.Lcpy_over64_fpsimd: + subs count, count, #128 + b.ge .Lcpy_body_large_fpsimd + /* + * Less than 128 bytes to copy, so handle 64 here and then jump + * to the tail. + */ + ldp1 A_l, A_h, src, #16 + stp1 A_l, A_h, dst, #16 + ldp1 B_l, B_h, src, #16 + ldp1 C_l, C_h, src, #16 + stp1 B_l, B_h, dst, #16 + stp1 C_l, C_h, dst, #16 + ldp1 D_l, D_h, src, #16 + stp1 D_l, D_h, dst, #16 + + tst count, #0x3f + b.ne .Ltail63_fpsimd + b .Lexitfunc_fpsimd + + /* + * Critical loop. Start at a new cache line boundary. Assuming + * 64 bytes per line this ensures the entire loop is in one line. + */ + .p2align L1_CACHE_SHIFT +.Lcpy_body_large_fpsimd: + /* pre-get 64 bytes data. */ + ldsve V_a.16b, V_b.16b, V_c.16b, V_d.16b, src + add src, src, #64 + +1: + /* + * interlace the load of next 64 bytes data block with store of the last + * loaded 64 bytes data. + */ + stsve V_a.16b, V_b.16b, V_c.16b, V_d.16b, dst + ldsve V_a.16b, V_b.16b, V_c.16b, V_d.16b, src + add dst, dst, #64 + add src, src, #64 + + subs count, count, #64 + b.ge 1b + + stsve V_a.16b, V_b.16b, V_c.16b, V_d.16b, dst + add dst, dst, #64 + + tst count, #0x3f + b.ne .Ltail63_fpsimd +.Lexitfunc_fpsimd: diff --git a/arch/arm64/lib/copy_to_user.S b/arch/arm64/lib/copy_to_user.S index 2ac716c0d6d8..c190e5f8a989 100644 --- a/arch/arm64/lib/copy_to_user.S +++ b/arch/arm64/lib/copy_to_user.S @@ -71,3 +71,33 @@ USER(9998f, sttrb tmp1w, [dst]) ret SYM_FUNC_END(__arch_copy_to_user) EXPORT_SYMBOL(__arch_copy_to_user) + + +#ifdef CONFIG_USE_VECTORIZED_COPY + .macro stsve reg1, reg2, reg3, reg4, ptr + USER(9997f, st1 {\reg1, \reg2, \reg3, \reg4}, [\ptr]) + .endm + + .macro ldsve reg1, reg2, reg3, reg4, ptr + KERNEL_ME_SAFE(9998f, ld1 {\reg1, \reg2, \reg3, \reg4}, [\ptr]) + .endm + +SYM_FUNC_START(__arch_copy_to_user_fpsimd) + add end, x0, x2 + mov srcin, x1 +#include "copy_template_fpsimd.S" + mov x0, #0 + ret + + // Exception fixups +9997: cmp dst, dstin + b.ne 9998f + // Before being absolutely sure we couldn't copy anything, try harder +KERNEL_ME_SAFE(9998f, ldrb tmp1w, [srcin]) +USER(9998f, sttrb tmp1w, [dst]) + add dst, dst, #1 +9998: sub x0, end, dst // bytes not copied + ret +SYM_FUNC_END(__arch_copy_to_user_fpsimd) +EXPORT_SYMBOL(__arch_copy_to_user_fpsimd) +#endif diff --git a/kernel/softirq.c b/kernel/softirq.c index f8cf88cc46c6..39b84ffbf4e5 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -30,6 +30,10 @@ #include <asm/softirq_stack.h> +#ifdef CONFIG_USE_VECTORIZED_COPY +#include <asm/fpsimd.h> +#endif + #define CREATE_TRACE_POINTS #include <trace/events/irq.h> @@ -524,6 +528,9 @@ static void handle_softirqs(bool ksirqd) __u32 pending; int softirq_bit; +#ifdef CONFIG_USE_VECTORIZED_COPY + struct fpsimd_state state; +#endif /* * Mask out PF_MEMALLOC as the current task context is borrowed for the * softirq. 
A softirq handled, such as network RX, might set PF_MEMALLOC @@ -533,10 +540,16 @@ static void handle_softirqs(bool ksirqd) pending = local_softirq_pending(); + softirq_handle_begin(); in_hardirq = lockdep_softirq_start(); account_softirq_enter(current); +#ifdef CONFIG_USE_VECTORIZED_COPY + _kernel_fpsimd_save(&state); + uaccess_priviliged_state_save(); +#endif + restart: /* Reset the pending bitmask before enabling irqs */ set_softirq_pending(0); @@ -585,7 +598,14 @@ static void handle_softirqs(bool ksirqd) account_softirq_exit(current); lockdep_softirq_end(in_hardirq); + +#ifdef CONFIG_USE_VECTORIZED_COPY + uaccess_priviliged_state_restore(); + _kernel_fpsimd_load(&state); +#endif + softirq_handle_end(); + current_restore_flags(old_flags, PF_MEMALLOC); } @@ -819,12 +839,21 @@ static void tasklet_action_common(struct softirq_action *a, { struct tasklet_struct *list; +#ifdef CONFIG_USE_VECTORIZED_COPY + struct fpsimd_state state; +#endif + local_irq_disable(); list = tl_head->head; tl_head->head = NULL; tl_head->tail = &tl_head->head; local_irq_enable(); +#ifdef CONFIG_USE_VECTORIZED_COPY + _kernel_fpsimd_save(&state); + uaccess_priviliged_state_save(); +#endif + while (list) { struct tasklet_struct *t = list; @@ -856,6 +885,11 @@ static void tasklet_action_common(struct softirq_action *a, __raise_softirq_irqoff(softirq_nr); local_irq_enable(); } + +#ifdef CONFIG_USE_VECTORIZED_COPY + uaccess_priviliged_state_restore(); + _kernel_fpsimd_load(&state); +#endif } static __latent_entropy void tasklet_action(struct softirq_action *a) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index e84df0818517..6f8e22102bdc 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -137,6 +137,17 @@ int sysctl_legacy_va_layout; #endif /* CONFIG_SYSCTL */ +#ifdef CONFIG_USE_VECTORIZED_COPY +int sysctl_copy_to_user_threshold = -1; +EXPORT_SYMBOL(sysctl_copy_to_user_threshold); + +int sysctl_copy_from_user_threshold = -1; +EXPORT_SYMBOL(sysctl_copy_from_user_threshold); + +int sysctl_copy_in_user_threshold = -1; +EXPORT_SYMBOL(sysctl_copy_in_user_threshold); +#endif + /* * /proc/sys support */ @@ -2250,6 +2261,29 @@ static struct ctl_table vm_table[] = { .extra1 = (void *)&mmap_rnd_compat_bits_min, .extra2 = (void *)&mmap_rnd_compat_bits_max, }, +#endif +#ifdef CONFIG_USE_VECTORIZED_COPY + { + .procname = "copy_to_user_threshold", + .data = &sysctl_copy_to_user_threshold, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "copy_from_user_threshold", + .data = &sysctl_copy_from_user_threshold, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "copy_in_user_threshold", + .data = &sysctl_copy_in_user_threshold, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, #endif { } }; -- 2.34.1
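As a usage note, the thresholds added to kernel/sysctl.c are exposed under /proc/sys/vm and can be tuned from user space (root required). Below is a minimal, illustrative sketch that selects the FPSIMD path for copies of roughly 64 KiB and larger; the value 65536 is an arbitrary example, not a recommendation from the series.

/*
 * Illustrative user-space sketch: write a threshold (in bytes) to the sysctl
 * added by this patch. -1 disables the feature, 0 always enables it.
 */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/vm/copy_to_user_threshold", "w");

	if (!f) {
		perror("copy_to_user_threshold");
		return 1;
	}
	fprintf(f, "%d\n", 65536);
	fclose(f);
	return 0;
}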
FeedBack: The patch(es) which you have sent to kernel@openeuler.org mailing list has been converted to a pull request successfully!
Pull request link: https://atomgit.com/openeuler/kernel/merge_requests/20363
Mailing list address: https://mailweb.openeuler.org/archives/list/kernel@openeuler.org/message/UBD...