From: Artem Kuzin <artem.kuzin@huawei.com>

kunpeng inclusion
category: feature
bugzilla: https://atomgit.com/openeuler/kernel/issues/8445

-------------------------------------------------

1. This implementation uses st1/ld1 4-vector instructions, which allow
   copying 64 bytes at once
2. The vectorized copy code is used only if the data block to copy is
   larger than 128 bytes
3. To use this functionality you need to set the configuration switch
   CONFIG_USE_VECTORIZED_COPY=y
4. The code can be used on any ARMv8 variant
5. In-kernel copy functions such as memcpy() are not supported yet, but
   can be enabled in the future
6. For now we use a lightweight version of register context
   saving/restoration (4 registers)

We introduce support for vectorization of the copy_from/to/in_user
functions. It currently works in parallel with the original FPSIMD/SVE
vectorization and does not affect it in any way.

We add a special flag to the task struct, TIF_KERNEL_FPSIMD, which is
set while the lightweight vectorization is used in the kernel. The task
struct is extended with two fields: the user space fpsimd state and the
kernel fpsimd state. The user space fpsimd state is used by the
kernel_fpsimd_begin()/kernel_fpsimd_end() functions that wrap the use
of lightweight FPSIMD contexts in kernel space. The kernel fpsimd state
is used to manage thread switches.

There is no support for nested calls of
kernel_neon_begin()/kernel_fpsimd_begin(), and there are no plans to
support this in the future; it is not necessary.

We save the lightweight FPSIMD context in kernel_fpsimd_begin() and
restore it in kernel_fpsimd_end(). On a thread switch we preserve the
kernel FPSIMD context and restore the user space one, if any. This
prevents corruption of the user space FPSIMD state. Before switching to
the next thread we restore its kernel FPSIMD context, if any.

It is allowed to use FPSIMD in bottom halves: in case of BH preemption
we check the TIF_KERNEL_FPSIMD flag and save/restore the contexts.
Context management is quite lightweight and is executed only when the
TIF_KERNEL_FPSIMD flag is set.

To enable this feature, you need to manually modify one of the
appropriate entries (see the sketch below):
/proc/sys/vm/copy_from_user_threshold
/proc/sys/vm/copy_in_user_threshold
/proc/sys/vm/copy_to_user_threshold

The allowed values are the following:
-1       - feature disabled
 0       - feature always enabled
 n (n>0) - feature enabled if the copied size is at least n bytes

P.S.: What I personally don't like in the current approach:
1. The additional fields and flag in the task struct look quite ugly
2. There is no way to configure the size of the chunk copied with
   FPSIMD from user space
3. The FPSIMD-based memory movement is not generic; it needs to be
   enabled for memmove(), memcpy() and friends in the future.
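For reference, here is a simplified sketch of how the sysctl threshold
and the kernel_fpsimd_begin()/kernel_fpsimd_end() pair combine in the
raw_copy_from_user() change below. It is illustration only, not part of
the diff: the helper name is made up, and the
CONFIG_VECTORIZED_COPY_VALIDATE path is omitted.

static __always_inline unsigned long
sketch_copy_from_user(void *to, const void __user *from, unsigned long n)
{
	unsigned long ret;

	if (sysctl_copy_from_user_threshold == -1 ||
	    n < sysctl_copy_from_user_threshold ||
	    !kernel_fpsimd_begin()) {
		/* Regular ldp/stp based copy, also the fallback when
		 * FPSIMD cannot be used (IRQ/NMI context, etc.). */
		uaccess_ttbr0_enable();
		ret = __arch_copy_from_user(to, __uaccess_mask_ptr(from), n);
		uaccess_ttbr0_disable();
		return ret;
	}

	/* v20-v23 were saved by kernel_fpsimd_begin(); the ld1/st1 based
	 * copy may clobber them freely. */
	uaccess_enable_privileged();
	ret = __arch_copy_from_user_fpsimd(to, __uaccess_mask_ptr(from), n);
	uaccess_disable_privileged();
	kernel_fpsimd_end();

	return ret;
}

For example, writing 65536 to /proc/sys/vm/copy_from_user_threshold
makes copy_from_user() take the FPSIMD path for copies of 64 KiB and
larger, while writing -1 keeps only the regular path.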
Co-developed-by: Alexander Kozhevnikov <alexander.kozhevnikov@huawei-partners.com> Signed-off-by: Alexander Kozhevnikov <alexander.kozhevnikov@huawei-partners.com> Co-developed-by: Nikita Panov <panov.nikita@huawei.com> Signed-off-by: Nikita Panov <panov.nikita@huawei.com> Signed-off-by: Artem Kuzin <artem.kuzin@huawei.com> --- arch/arm64/Kconfig | 15 ++ arch/arm64/include/asm/fpsimd.h | 15 ++ arch/arm64/include/asm/fpsimdmacros.h | 14 ++ arch/arm64/include/asm/neon.h | 28 +++ arch/arm64/include/asm/processor.h | 10 + arch/arm64/include/asm/thread_info.h | 4 + arch/arm64/include/asm/uaccess.h | 274 +++++++++++++++++++++++++- arch/arm64/kernel/entry-fpsimd.S | 22 +++ arch/arm64/kernel/fpsimd.c | 102 +++++++++- arch/arm64/kernel/process.c | 2 +- arch/arm64/lib/copy_from_user.S | 18 ++ arch/arm64/lib/copy_in_user.S | 19 ++ arch/arm64/lib/copy_template_fpsimd.S | 180 +++++++++++++++++ arch/arm64/lib/copy_to_user.S | 19 ++ kernel/softirq.c | 31 ++- kernel/sysctl.c | 35 ++++ 16 files changed, 780 insertions(+), 8 deletions(-) create mode 100644 arch/arm64/lib/copy_template_fpsimd.S diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index eb30ef59aca2..959af31f7e70 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1470,6 +1470,21 @@ config ARM64_ILP32 is an ABI where long and pointers are 32bits but it uses the AARCH64 instruction set. +config USE_VECTORIZED_COPY + bool "Use vectorized instructions in copy_to/from user" + depends on KERNEL_MODE_NEON + default y + help + This option turns on vectorization to speed up copy_to/from_user routines. + +config VECTORIZED_COPY_VALIDATE + bool "Validate result of vectorized copy using regular implementation" + depends on KERNEL_MODE_NEON + depends on USE_VECTORIZED_COPY + default n + help + This option validates the result of the vectorized copy_to/from_user routines against the regular implementation. + menuconfig AARCH32_EL0 bool "Kernel support for 32-bit EL0" depends on ARM64_4K_PAGES || EXPERT diff --git a/arch/arm64/include/asm/fpsimd.h b/arch/arm64/include/asm/fpsimd.h index 22f6c6e23441..cb53767105ef 100644 --- a/arch/arm64/include/asm/fpsimd.h +++ b/arch/arm64/include/asm/fpsimd.h @@ -46,6 +46,21 @@ struct task_struct; +#ifdef CONFIG_USE_VECTORIZED_COPY +extern void fpsimd_save_state_light(struct fpsimd_state *state); +extern void fpsimd_load_state_light(struct fpsimd_state *state); +#else +static inline void fpsimd_save_state_light(struct fpsimd_state *state) +{ + (void) state; +} + +static inline void fpsimd_load_state_light(struct fpsimd_state *state) +{ + (void) state; +} +#endif + extern void fpsimd_save_state(struct user_fpsimd_state *state); extern void fpsimd_load_state(struct user_fpsimd_state *state); diff --git a/arch/arm64/include/asm/fpsimdmacros.h b/arch/arm64/include/asm/fpsimdmacros.h index ea2577e159f6..62f5f8a0540a 100644 --- a/arch/arm64/include/asm/fpsimdmacros.h +++ b/arch/arm64/include/asm/fpsimdmacros.h @@ -8,6 +8,20 @@ #include <asm/assembler.h> +#ifdef CONFIG_USE_VECTORIZED_COPY +/* Lightweight fpsimd context saving/restoration.
+ * Necessary for vectorized kernel memory movement + * implementation + */ +.macro fpsimd_save_light state + st1 {v20.16b, v21.16b, v22.16b, v23.16b}, [\state] +.endm + +.macro fpsimd_restore_light state + ld1 {v20.16b, v21.16b, v22.16b, v23.16b}, [\state] +.endm +#endif + .macro fpsimd_save state, tmpnr stp q0, q1, [\state, #16 * 0] stp q2, q3, [\state, #16 * 2] diff --git a/arch/arm64/include/asm/neon.h b/arch/arm64/include/asm/neon.h index d4b1d172a79b..ab84b194d7b3 100644 --- a/arch/arm64/include/asm/neon.h +++ b/arch/arm64/include/asm/neon.h @@ -16,4 +16,32 @@ void kernel_neon_begin(void); void kernel_neon_end(void); +#ifdef CONFIG_USE_VECTORIZED_COPY +bool kernel_fpsimd_begin(void); +void kernel_fpsimd_end(void); +/* Functions to use in non-preemptible context */ +void _kernel_fpsimd_save(struct fpsimd_state *state); +void _kernel_fpsimd_load(struct fpsimd_state *state); +#else +bool kernel_fpsimd_begin(void) +{ + return false; +} + +void kernel_fpsimd_end(void) +{ +} + +/* Functions to use in non-preemptible context */ +void _kernel_fpsimd_save(struct fpsimd_state *state) +{ + (void) state; +} + +void _kernel_fpsimd_load(struct fpsimd_state *state) +{ + (void) state; +} +#endif + #endif /* ! __ASM_NEON_H */ diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h index 66186f3ab550..d6ca823f7f0f 100644 --- a/arch/arm64/include/asm/processor.h +++ b/arch/arm64/include/asm/processor.h @@ -137,6 +137,10 @@ struct cpu_context { unsigned long pc; }; +struct fpsimd_state { + __uint128_t v[4]; +}; + struct thread_struct { struct cpu_context cpu_context; /* cpu context */ @@ -174,6 +178,12 @@ struct thread_struct { KABI_RESERVE(6) KABI_RESERVE(7) KABI_RESERVE(8) +#ifdef CONFIG_USE_VECTORIZED_COPY + KABI_EXTEND( + struct fpsimd_state ustate; + struct fpsimd_state kstate; + ) +#endif }; static inline unsigned int thread_get_vl(struct thread_struct *thread, diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h index 390d9612546b..2e395ebcc856 100644 --- a/arch/arm64/include/asm/thread_info.h +++ b/arch/arm64/include/asm/thread_info.h @@ -89,6 +89,8 @@ void arch_release_task_struct(struct task_struct *tsk); #define TIF_PATCH_PENDING 28 /* pending live patching update */ #define TIF_SME 29 /* SME in use */ #define TIF_SME_VL_INHERIT 30 /* Inherit SME vl_onexec across exec */ +#define TIF_KERNEL_FPSIMD 31 /* Use FPSIMD in kernel */ +#define TIF_PRIV_UACC_ENABLED 32 /* Whether priviliged uaccess was manually enabled */ #define _TIF_SIGPENDING (1 << TIF_SIGPENDING) #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) @@ -108,6 +110,8 @@ void arch_release_task_struct(struct task_struct *tsk); #define _TIF_32BIT_AARCH64 (1 << TIF_32BIT_AARCH64) #define _TIF_PATCH_PENDING (1 << TIF_PATCH_PENDING) #define _TIF_POLLING_NRFLAG (1 << TIF_POLLING_NRFLAG) +#define _TIF_KERNEL_FPSIMD (1 << TIF_KERNEL_FPSIMD) +#define _TIF_PRIV_UACC_ENABLED (1 << TIF_PRIV_UACC_ENABLED) #define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \ _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE | \ diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h index 03c2db710f92..4e4eec098cbc 100644 --- a/arch/arm64/include/asm/uaccess.h +++ b/arch/arm64/include/asm/uaccess.h @@ -24,6 +24,10 @@ #include <asm/memory.h> #include <asm/extable.h> +#ifndef __GENKSYMS__ +#include <asm/neon.h> +#endif + #define HAVE_GET_KERNEL_NOFAULT /* @@ -174,7 +178,7 @@ static inline void __uaccess_enable_hw_pan(void) CONFIG_ARM64_PAN)); } -static inline void 
uaccess_disable_privileged(void) +static inline void __uaccess_disable_privileged(void) { if (uaccess_ttbr0_disable()) return; @@ -182,7 +186,22 @@ static inline void uaccess_disable_privileged(void) __uaccess_enable_hw_pan(); } -static inline void uaccess_enable_privileged(void) +static inline void uaccess_disable_privileged(void) +{ + preempt_disable(); + + if (!test_and_clear_thread_flag(TIF_PRIV_UACC_ENABLED)) { + WARN_ON(1); + preempt_enable(); + return; + } + + __uaccess_disable_privileged(); + + preempt_enable(); +} + +static inline void __uaccess_enable_privileged(void) { if (uaccess_ttbr0_enable()) return; @@ -190,6 +209,47 @@ static inline void uaccess_enable_privileged(void) __uaccess_disable_hw_pan(); } +static inline void uaccess_enable_privileged(void) +{ + preempt_disable(); + + if (test_and_set_thread_flag(TIF_PRIV_UACC_ENABLED)) { + WARN_ON(1); + preempt_enable(); + return; + } + + __uaccess_enable_privileged(); + + preempt_enable(); +} + +static inline void uaccess_priviliged_context_switch(struct task_struct *next) +{ + bool curr_enabled = !!test_thread_flag(TIF_PRIV_UACC_ENABLED); + bool next_enabled = !!test_ti_thread_flag(&next->thread_info, TIF_PRIV_UACC_ENABLED); + + if (curr_enabled == next_enabled) + return; + + if (curr_enabled) + __uaccess_disable_privileged(); + else + __uaccess_enable_privileged(); +} + +static inline void uaccess_priviliged_state_save(void) +{ + if (test_thread_flag(TIF_PRIV_UACC_ENABLED)) + __uaccess_disable_privileged(); +} + +static inline void uaccess_priviliged_state_restore(void) +{ + if (test_thread_flag(TIF_PRIV_UACC_ENABLED)) + __uaccess_enable_privileged(); +} + /* * Sanitise a uaccess pointer such that it becomes NULL if above the maximum * user address. In case the pointer is tagged (has the top byte set), untag @@ -386,7 +446,97 @@ do { \ goto err_label; \ } while(0) -extern unsigned long __must_check __arch_copy_from_user(void *to, const void __user *from, unsigned long n); +#define USER_COPY_CHUNK_SIZE 4096 + +#ifdef CONFIG_USE_VECTORIZED_COPY + +extern int sysctl_copy_from_user_threshold; + +#define verify_fpsimd_copy(to, from, n, ret) \ +({ \ + unsigned long __verify_ret = 0; \ + __verify_ret = memcmp(to, from, ret ? n - ret : n); \ + if (__verify_ret) \ + pr_err("FPSIMD:%s inconsistent state\n", __func__); \ + if (ret) \ + pr_err("FPSIMD:%s failed to copy data, expected=%lu, copied=%lu\n", __func__, n, n - ret); \ + __verify_ret |= ret; \ + __verify_ret; \ +}) + +#define compare_fpsimd_copy(to, from, n, ret_fpsimd, ret) \ +({ \ + unsigned long __verify_ret = 0; \ + __verify_ret = memcmp(to, from, ret ? 
n - ret : n); \ + if (__verify_ret) \ + pr_err("FIXUP:%s inconsistent state\n", __func__); \ + if (ret) \ + pr_err("FIXUP:%s failed to copy data, expected=%lu, copied=%lu\n", __func__, n, n - ret); \ + __verify_ret |= ret; \ + if (ret_fpsimd != ret) { \ + pr_err("FIXUP:%s difference between FPSIMD %lu and regular %lu\n", __func__, n - ret_fpsimd, n - ret); \ + __verify_ret |= 1; \ + } else { \ + __verify_ret = 0; \ + } \ + __verify_ret; \ +}) + +extern unsigned long __must_check +__arch_copy_from_user(void *to, const void __user *from, unsigned long n); + +extern unsigned long __must_check +__arch_copy_from_user_fpsimd(void *to, const void __user *from, unsigned long n); + +static __always_inline unsigned long __must_check +raw_copy_from_user(void *to, const void __user *from, unsigned long n) +{ + unsigned long __acfu_ret; + + if (sysctl_copy_from_user_threshold == -1 || n < sysctl_copy_from_user_threshold) { + uaccess_ttbr0_enable(); + __acfu_ret = __arch_copy_from_user(to, + __uaccess_mask_ptr(from), n); + uaccess_ttbr0_disable(); + } else { + if (kernel_fpsimd_begin()) { + unsigned long __acfu_ret_fpsimd; + + uaccess_enable_privileged(); + __acfu_ret_fpsimd = __arch_copy_from_user_fpsimd((to), + __uaccess_mask_ptr(from), n); + uaccess_disable_privileged(); + + __acfu_ret = __acfu_ret_fpsimd; + kernel_fpsimd_end(); +#ifdef CONFIG_VECTORIZED_COPY_VALIDATE + if (verify_fpsimd_copy(to, __uaccess_mask_ptr(from), n, + __acfu_ret)) { + + uaccess_ttbr0_enable(); + __acfu_ret = __arch_copy_from_user((to), + __uaccess_mask_ptr(from), n); + uaccess_ttbr0_disable(); + + compare_fpsimd_copy(to, __uaccess_mask_ptr(from), n, + __acfu_ret_fpsimd, __acfu_ret); + } +#endif + } else { + uaccess_ttbr0_enable(); + __acfu_ret = __arch_copy_from_user((to), + __uaccess_mask_ptr(from), n); + uaccess_ttbr0_disable(); + } + } + + + return __acfu_ret; +} +#else +extern unsigned long __must_check +__arch_copy_from_user(void *to, const void __user *from, unsigned long n); + #define raw_copy_from_user(to, from, n) \ ({ \ unsigned long __acfu_ret; \ @@ -397,7 +547,66 @@ extern unsigned long __must_check __arch_copy_from_user(void *to, const void __u __acfu_ret; \ }) -extern unsigned long __must_check __arch_copy_to_user(void __user *to, const void *from, unsigned long n); +#endif + +#ifdef CONFIG_USE_VECTORIZED_COPY + +extern int sysctl_copy_to_user_threshold; + +extern unsigned long __must_check +__arch_copy_to_user(void __user *to, const void *from, unsigned long n); + +extern unsigned long __must_check +__arch_copy_to_user_fpsimd(void __user *to, const void *from, unsigned long n); + +static __always_inline unsigned long __must_check +raw_copy_to_user(void __user *to, const void *from, unsigned long n) +{ + unsigned long __actu_ret; + + + if (sysctl_copy_to_user_threshold == -1 || n < sysctl_copy_to_user_threshold) { + uaccess_ttbr0_enable(); + __actu_ret = __arch_copy_to_user(__uaccess_mask_ptr(to), + from, n); + uaccess_ttbr0_disable(); + } else { + if (kernel_fpsimd_begin()) { + unsigned long __actu_ret_fpsimd; + + uaccess_enable_privileged(); + __actu_ret_fpsimd = __arch_copy_to_user_fpsimd(__uaccess_mask_ptr(to), + from, n); + uaccess_disable_privileged(); + + kernel_fpsimd_end(); + __actu_ret = __actu_ret_fpsimd; +#ifdef CONFIG_VECTORIZED_COPY_VALIDATE + if (verify_fpsimd_copy(__uaccess_mask_ptr(to), from, n, + __actu_ret)) { + uaccess_ttbr0_enable(); + __actu_ret = __arch_copy_to_user(__uaccess_mask_ptr(to), + from, n); + uaccess_ttbr0_disable(); + + compare_fpsimd_copy(__uaccess_mask_ptr(to), from, n, + 
__actu_ret_fpsimd, __actu_ret); + } +#endif + } else { + uaccess_ttbr0_enable(); + __actu_ret = __arch_copy_to_user(__uaccess_mask_ptr(to), + from, n); + uaccess_ttbr0_disable(); + } + } + + return __actu_ret; +} +#else +extern unsigned long __must_check +__arch_copy_to_user(void __user *to, const void *from, unsigned long n); + #define raw_copy_to_user(to, from, n) \ ({ \ unsigned long __actu_ret; \ @@ -407,7 +616,62 @@ extern unsigned long __must_check __arch_copy_to_user(void __user *to, const voi uaccess_ttbr0_disable(); \ __actu_ret; \ }) +#endif +#ifdef CONFIG_USE_VECTORIZED_COPY + +extern int sysctl_copy_in_user_threshold; + +extern unsigned long __must_check +__arch_copy_in_user(void __user *to, const void __user *from, unsigned long n); + +extern unsigned long __must_check +__arch_copy_in_user_fpsimd(void __user *to, const void __user *from, unsigned long n); + +static __always_inline unsigned long __must_check +raw_copy_in_user(void __user *to, const void __user *from, unsigned long n) +{ + unsigned long __aciu_ret; + + if (sysctl_copy_in_user_threshold == -1 || n < sysctl_copy_in_user_threshold) { + uaccess_ttbr0_enable(); + __aciu_ret = __arch_copy_in_user(__uaccess_mask_ptr(to), + __uaccess_mask_ptr(from), n); + uaccess_ttbr0_disable(); + } else { + if (kernel_fpsimd_begin()) { + unsigned long __aciu_ret_fpsimd; + + uaccess_enable_privileged(); + __aciu_ret_fpsimd = __arch_copy_in_user_fpsimd(__uaccess_mask_ptr(to), + __uaccess_mask_ptr(from), n); + uaccess_disable_privileged(); + + kernel_fpsimd_end(); + __aciu_ret = __aciu_ret_fpsimd; +#ifdef CONFIG_VECTORIZED_COPY_VALIDATE + if (verify_fpsimd_copy(__uaccess_mask_ptr(to), __uaccess_mask_ptr(from), n, + __aciu_ret)) { + uaccess_ttbr0_enable(); + __aciu_ret = __arch_copy_in_user(__uaccess_mask_ptr(to), + __uaccess_mask_ptr(from), n); + uaccess_ttbr0_disable(); + + compare_fpsimd_copy(__uaccess_mask_ptr(to), __uaccess_mask_ptr(from), n, + __aciu_ret_fpsimd, __aciu_ret); + } +#endif + } else { + uaccess_ttbr0_enable(); + __aciu_ret = __arch_copy_in_user(__uaccess_mask_ptr(to), + __uaccess_mask_ptr(from), n); + uaccess_ttbr0_disable(); + } + } + + return __aciu_ret; +} +#else extern unsigned long __must_check __arch_copy_in_user(void __user *to, const void __user *from, unsigned long n); #define raw_copy_in_user(to, from, n) \ ({ \ @@ -419,6 +683,8 @@ extern unsigned long __must_check __arch_copy_in_user(void __user *to, const voi __aciu_ret; \ }) +#endif + #define INLINE_COPY_TO_USER #define INLINE_COPY_FROM_USER diff --git a/arch/arm64/kernel/entry-fpsimd.S b/arch/arm64/kernel/entry-fpsimd.S index 8d12aaac7862..848ca6a351d7 100644 --- a/arch/arm64/kernel/entry-fpsimd.S +++ b/arch/arm64/kernel/entry-fpsimd.S @@ -11,6 +11,28 @@ #include <asm/assembler.h> #include <asm/fpsimdmacros.h> +#ifdef CONFIG_USE_VECTORIZED_COPY +/* + * Save the FP registers. + * + * x0 - pointer to struct fpsimd_state_light + */ +SYM_FUNC_START(fpsimd_save_state_light) + fpsimd_save_light x0 + ret +SYM_FUNC_END(fpsimd_save_state_light) + +/* + * Load the FP registers. + * + * x0 - pointer to struct fpsimd_state_light + */ +SYM_FUNC_START(fpsimd_load_state_light) + fpsimd_restore_light x0 + ret +SYM_FUNC_END(fpsimd_load_state_light) +#endif + /* * Save the FP registers. 
* diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c index c2489a72b0b9..1a08c19a181f 100644 --- a/arch/arm64/kernel/fpsimd.c +++ b/arch/arm64/kernel/fpsimd.c @@ -1492,6 +1492,11 @@ void do_fpsimd_exc(unsigned int esr, struct pt_regs *regs) current); } +#ifdef CONFIG_USE_VECTORIZED_COPY +static void kernel_fpsimd_rollback_changes(void); +static void kernel_fpsimd_restore_changes(struct task_struct *tsk); +#endif + void fpsimd_thread_switch(struct task_struct *next) { bool wrong_task, wrong_cpu; @@ -1500,10 +1505,11 @@ void fpsimd_thread_switch(struct task_struct *next) return; __get_cpu_fpsimd_context(); - +#ifdef CONFIG_USE_VECTORIZED_COPY + kernel_fpsimd_rollback_changes(); +#endif /* Save unsaved fpsimd state, if any: */ fpsimd_save(); - /* * Fix up TIF_FOREIGN_FPSTATE to correctly describe next's * state. For kernel threads, FPSIMD registers are never loaded @@ -1516,6 +1522,9 @@ void fpsimd_thread_switch(struct task_struct *next) update_tsk_thread_flag(next, TIF_FOREIGN_FPSTATE, wrong_task || wrong_cpu); +#ifdef CONFIG_USE_VECTORIZED_COPY + kernel_fpsimd_restore_changes(next); +#endif __put_cpu_fpsimd_context(); } @@ -1835,6 +1844,95 @@ void kernel_neon_end(void) } EXPORT_SYMBOL(kernel_neon_end); +#ifdef CONFIG_USE_VECTORIZED_COPY +bool kernel_fpsimd_begin(void) +{ + if (WARN_ON(!system_capabilities_finalized()) || + !system_supports_fpsimd() || + in_irq() || irqs_disabled() || in_nmi()) + return false; + + preempt_disable(); + if (test_and_set_thread_flag(TIF_KERNEL_FPSIMD)) { + preempt_enable(); + + WARN_ON(1); + return false; + } + + /* + * Leaving streaming mode enabled will cause issues for any kernel + * NEON and leaving streaming mode or ZA enabled may increase power + * consumption. + */ + if (system_supports_sme()) + sme_smstop(); + + fpsimd_save_state_light(¤t->thread.ustate); + preempt_enable(); + + return true; +} +EXPORT_SYMBOL(kernel_fpsimd_begin); + +void kernel_fpsimd_end(void) +{ + if (!system_supports_fpsimd()) + return; + + preempt_disable(); + if (test_and_clear_thread_flag(TIF_KERNEL_FPSIMD)) + fpsimd_load_state_light(¤t->thread.ustate); + + preempt_enable(); +} +EXPORT_SYMBOL(kernel_fpsimd_end); + +void _kernel_fpsimd_save(struct fpsimd_state *state) +{ + if (!system_supports_fpsimd()) + return; + + BUG_ON(preemptible()); + if (test_thread_flag(TIF_KERNEL_FPSIMD)) + fpsimd_save_state_light(state); +} + +void _kernel_fpsimd_load(struct fpsimd_state *state) +{ + if (!system_supports_fpsimd()) + return; + + BUG_ON(preemptible()); + if (test_thread_flag(TIF_KERNEL_FPSIMD)) + fpsimd_load_state_light(state); +} + +static void kernel_fpsimd_rollback_changes(void) +{ + if (!system_supports_fpsimd()) + return; + + BUG_ON(preemptible()); + if (test_thread_flag(TIF_KERNEL_FPSIMD)) { + fpsimd_save_state_light(¤t->thread.kstate); + fpsimd_load_state_light(¤t->thread.ustate); + } +} + +static void kernel_fpsimd_restore_changes(struct task_struct *tsk) +{ + if (!system_supports_fpsimd()) + return; + + BUG_ON(preemptible()); + if (test_ti_thread_flag(task_thread_info(tsk), TIF_KERNEL_FPSIMD)) { + fpsimd_save_state_light(&tsk->thread.ustate); + fpsimd_load_state_light(&tsk->thread.kstate); + } +} +#endif + #ifdef CONFIG_EFI static DEFINE_PER_CPU(struct user_fpsimd_state, efi_fpsimd_state); diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 14300c9e06d5..338d40725a5d 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -572,7 +572,7 @@ __notrace_funcgraph struct task_struct *__switch_to(struct task_struct *prev, 
struct task_struct *next) { struct task_struct *last; - + uaccess_priviliged_context_switch(next); fpsimd_thread_switch(next); tls_thread_switch(next); hw_breakpoint_thread_switch(next); diff --git a/arch/arm64/lib/copy_from_user.S b/arch/arm64/lib/copy_from_user.S index dfc33ce09e72..94290069d97d 100644 --- a/arch/arm64/lib/copy_from_user.S +++ b/arch/arm64/lib/copy_from_user.S @@ -63,6 +63,24 @@ SYM_FUNC_START(__arch_copy_from_user) SYM_FUNC_END(__arch_copy_from_user) EXPORT_SYMBOL(__arch_copy_from_user) +#ifdef CONFIG_USE_VECTORIZED_COPY + .macro ldsve reg1, reg2, reg3, reg4, ptr + USER(9997f, ld1 {\reg1, \reg2, \reg3, \reg4}, [\ptr]) + .endm + + .macro stsve reg1, reg2, reg3, reg4, ptr + USER_MC(9998f, st1 {\reg1, \reg2, \reg3, \reg4}, [\ptr]) + .endm + +SYM_FUNC_START(__arch_copy_from_user_fpsimd) + add end, x0, x2 + mov srcin, x1 +#include "copy_template_fpsimd.S" + mov x0, #0 // Nothing to copy + ret +SYM_FUNC_END(__arch_copy_from_user_fpsimd) +EXPORT_SYMBOL(__arch_copy_from_user_fpsimd) +#endif .section .fixup,"ax" .align 2 9997: cmp dst, dstin diff --git a/arch/arm64/lib/copy_in_user.S b/arch/arm64/lib/copy_in_user.S index dbea3799c3ef..cbc09c377050 100644 --- a/arch/arm64/lib/copy_in_user.S +++ b/arch/arm64/lib/copy_in_user.S @@ -64,6 +64,25 @@ SYM_FUNC_START(__arch_copy_in_user) SYM_FUNC_END(__arch_copy_in_user) EXPORT_SYMBOL(__arch_copy_in_user) +#ifdef CONFIG_USE_VECTORIZED_COPY + .macro ldsve reg1, reg2, reg3, reg4, ptr + USER(9997f, ld1 {\reg1, \reg2, \reg3, \reg4}, [\ptr]) + .endm + + .macro stsve reg1, reg2, reg3, reg4, ptr + USER(9997f, st1 {\reg1, \reg2, \reg3, \reg4}, [\ptr]) + .endm + +SYM_FUNC_START(__arch_copy_in_user_fpsimd) + add end, x0, x2 + mov srcin, x1 +#include "copy_template_fpsimd.S" + mov x0, #0 + ret +SYM_FUNC_END(__arch_copy_in_user_fpsimd) +EXPORT_SYMBOL(__arch_copy_in_user_fpsimd) +#endif + .section .fixup,"ax" .align 2 9997: cmp dst, dstin diff --git a/arch/arm64/lib/copy_template_fpsimd.S b/arch/arm64/lib/copy_template_fpsimd.S new file mode 100644 index 000000000000..9b2e7ce1e4d2 --- /dev/null +++ b/arch/arm64/lib/copy_template_fpsimd.S @@ -0,0 +1,180 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2013 ARM Ltd. + * Copyright (C) 2013 Linaro. + * + * This code is based on glibc cortex strings work originally authored by Linaro + * be found @ + * + * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/ + * files/head:/src/aarch64/ + */ + +/* + * Copy a buffer from src to dest (alignment handled by the hardware) + * + * Parameters: + * x0 - dest + * x1 - src + * x2 - n + * Returns: + * x0 - dest + */ +dstin .req x0 +src .req x1 +count .req x2 +tmp1 .req x3 +tmp1w .req w3 +tmp2 .req x4 +tmp2w .req w4 +dst .req x6 + +A_l .req x7 +A_h .req x8 +B_l .req x9 +B_h .req x10 +C_l .req x11 +C_h .req x12 +D_l .req x13 +D_h .req x14 + +V_a .req v20 +V_b .req v21 +V_c .req v22 +V_d .req v23 + + mov dst, dstin + cmp count, #16 + /*When memory length is less than 16, the accessed are not aligned.*/ + b.lo .Ltiny15_fpsimd + + neg tmp2, src + ands tmp2, tmp2, #15/* Bytes to reach alignment. */ + b.eq .LSrcAligned_fpsimd + sub count, count, tmp2 + /* + * Copy the leading memory data from src to dst in an increasing + * address order.By this way,the risk of overwriting the source + * memory data is eliminated when the distance between src and + * dst is less than 16. The memory accesses here are alignment. 
+ */ + tbz tmp2, #0, 1f + ldrb1 tmp1w, src, #1 + strb1 tmp1w, dst, #1 +1: + tbz tmp2, #1, 2f + ldrh1 tmp1w, src, #2 + strh1 tmp1w, dst, #2 +2: + tbz tmp2, #2, 3f + ldr1 tmp1w, src, #4 + str1 tmp1w, dst, #4 +3: + tbz tmp2, #3, .LSrcAligned_fpsimd + ldr1 tmp1, src, #8 + str1 tmp1, dst, #8 + +.LSrcAligned_fpsimd: + cmp count, #64 + b.ge .Lcpy_over64_fpsimd + /* + * Deal with small copies quickly by dropping straight into the + * exit block. + */ +.Ltail63_fpsimd: + /* + * Copy up to 48 bytes of data. At this point we only need the + * bottom 6 bits of count to be accurate. + */ + ands tmp1, count, #0x30 + b.eq .Ltiny15_fpsimd + cmp tmp1w, #0x20 + b.eq 1f + b.lt 2f + ldp1 A_l, A_h, src, #16 + stp1 A_l, A_h, dst, #16 +1: + ldp1 A_l, A_h, src, #16 + stp1 A_l, A_h, dst, #16 +2: + ldp1 A_l, A_h, src, #16 + stp1 A_l, A_h, dst, #16 +.Ltiny15_fpsimd: + /* + * Prefer to break one ldp/stp into several load/store to access + * memory in an increasing address order,rather than to load/store 16 + * bytes from (src-16) to (dst-16) and to backward the src to aligned + * address,which way is used in original cortex memcpy. If keeping + * the original memcpy process here, memmove need to satisfy the + * precondition that src address is at least 16 bytes bigger than dst + * address,otherwise some source data will be overwritten when memove + * call memcpy directly. To make memmove simpler and decouple the + * memcpy's dependency on memmove, withdrew the original process. + */ + tbz count, #3, 1f + ldr1 tmp1, src, #8 + str1 tmp1, dst, #8 +1: + tbz count, #2, 2f + ldr1 tmp1w, src, #4 + str1 tmp1w, dst, #4 +2: + tbz count, #1, 3f + ldrh1 tmp1w, src, #2 + strh1 tmp1w, dst, #2 +3: + tbz count, #0, .Lexitfunc_fpsimd + ldrb1 tmp1w, src, #1 + strb1 tmp1w, dst, #1 + + b .Lexitfunc_fpsimd + +.Lcpy_over64_fpsimd: + subs count, count, #128 + b.ge .Lcpy_body_large_fpsimd + /* + * Less than 128 bytes to copy, so handle 64 here and then jump + * to the tail. + */ + ldp1 A_l, A_h, src, #16 + stp1 A_l, A_h, dst, #16 + ldp1 B_l, B_h, src, #16 + ldp1 C_l, C_h, src, #16 + stp1 B_l, B_h, dst, #16 + stp1 C_l, C_h, dst, #16 + ldp1 D_l, D_h, src, #16 + stp1 D_l, D_h, dst, #16 + + tst count, #0x3f + b.ne .Ltail63_fpsimd + b .Lexitfunc_fpsimd + + /* + * Critical loop. Start at a new cache line boundary. Assuming + * 64 bytes per line this ensures the entire loop is in one line. + */ + .p2align L1_CACHE_SHIFT +.Lcpy_body_large_fpsimd: + /* pre-get 64 bytes data. */ + ldsve V_a.16b, V_b.16b, V_c.16b, V_d.16b, src + add src, src, #64 + +1: + /* + * interlace the load of next 64 bytes data block with store of the last + * loaded 64 bytes data. 
+ */ + stsve V_a.16b, V_b.16b, V_c.16b, V_d.16b, dst + ldsve V_a.16b, V_b.16b, V_c.16b, V_d.16b, src + add dst, dst, #64 + add src, src, #64 + + subs count, count, #64 + b.ge 1b + + stsve V_a.16b, V_b.16b, V_c.16b, V_d.16b, dst + add dst, dst, #64 + + tst count, #0x3f + b.ne .Ltail63_fpsimd +.Lexitfunc_fpsimd: diff --git a/arch/arm64/lib/copy_to_user.S b/arch/arm64/lib/copy_to_user.S index 34154e7c8577..d0211fce4923 100644 --- a/arch/arm64/lib/copy_to_user.S +++ b/arch/arm64/lib/copy_to_user.S @@ -62,6 +62,25 @@ SYM_FUNC_START(__arch_copy_to_user) SYM_FUNC_END(__arch_copy_to_user) EXPORT_SYMBOL(__arch_copy_to_user) +#ifdef CONFIG_USE_VECTORIZED_COPY + .macro stsve reg1, reg2, reg3, reg4, ptr + USER(9997f, st1 {\reg1, \reg2, \reg3, \reg4}, [\ptr]) + .endm + + .macro ldsve reg1, reg2, reg3, reg4, ptr + USER_MC(9998f, ld1 {\reg1, \reg2, \reg3, \reg4}, [\ptr]) + .endm + +SYM_FUNC_START(__arch_copy_to_user_fpsimd) + add end, x0, x2 + mov srcin, x1 +#include "copy_template_fpsimd.S" + mov x0, #0 + ret +SYM_FUNC_END(__arch_copy_to_user_fpsimd) +EXPORT_SYMBOL(__arch_copy_to_user_fpsimd) +#endif + .section .fixup,"ax" .align 2 9997: cmp dst, dstin diff --git a/kernel/softirq.c b/kernel/softirq.c index 9fc69e6e2c11..e3f73422829d 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -26,6 +26,10 @@ #include <linux/tick.h> #include <linux/irq.h> +#ifdef CONFIG_USE_VECTORIZED_COPY +#include <asm/fpsimd.h> +#endif + #define CREATE_TRACE_POINTS #include <trace/events/irq.h> @@ -262,6 +266,9 @@ asmlinkage __visible void __softirq_entry __do_softirq(void) __u32 pending; int softirq_bit; +#ifdef CONFIG_USE_VECTORIZED_COPY + struct fpsimd_state state; +#endif /* * Mask out PF_MEMALLOC as the current task context is borrowed for the * softirq. A softirq handled, such as network RX, might set PF_MEMALLOC @@ -273,8 +280,11 @@ asmlinkage __visible void __softirq_entry __do_softirq(void) account_irq_enter_time(current); __local_bh_disable_ip(_RET_IP_, SOFTIRQ_OFFSET); +#ifdef CONFIG_USE_VECTORIZED_COPY + _kernel_fpsimd_save(&state); + uaccess_priviliged_state_save(); +#endif in_hardirq = lockdep_softirq_start(); - restart: /* Reset the pending bitmask before enabling irqs */ set_softirq_pending(0); @@ -322,6 +332,11 @@ asmlinkage __visible void __softirq_entry __do_softirq(void) lockdep_softirq_end(in_hardirq); account_irq_exit_time(current); + +#ifdef CONFIG_USE_VECTORIZED_COPY + uaccess_priviliged_state_restore(); + _kernel_fpsimd_load(&state); +#endif __local_bh_enable(SOFTIRQ_OFFSET); WARN_ON_ONCE(in_interrupt()); current_restore_flags(old_flags, PF_MEMALLOC); @@ -612,12 +627,21 @@ static void tasklet_action_common(struct softirq_action *a, { struct tasklet_struct *list; +#ifdef CONFIG_USE_VECTORIZED_COPY + struct fpsimd_state state; +#endif + local_irq_disable(); list = tl_head->head; tl_head->head = NULL; tl_head->tail = &tl_head->head; local_irq_enable(); +#ifdef CONFIG_USE_VECTORIZED_COPY + _kernel_fpsimd_save(&state); + uaccess_priviliged_state_save(); +#endif + while (list) { struct tasklet_struct *t = list; @@ -645,6 +669,11 @@ static void tasklet_action_common(struct softirq_action *a, __raise_softirq_irqoff(softirq_nr); local_irq_enable(); } + +#ifdef CONFIG_USE_VECTORIZED_COPY + uaccess_priviliged_state_restore(); + _kernel_fpsimd_load(&state); +#endif } static __latent_entropy void tasklet_action(struct softirq_action *a) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 0b1c13a05332..9ec07294429b 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -210,6 +210,17 @@ static int 
max_extfrag_threshold = 1000; #endif /* CONFIG_SYSCTL */ +#ifdef CONFIG_USE_VECTORIZED_COPY +int sysctl_copy_to_user_threshold = -1; +EXPORT_SYMBOL(sysctl_copy_to_user_threshold); + +int sysctl_copy_from_user_threshold = -1; +EXPORT_SYMBOL(sysctl_copy_from_user_threshold); + +int sysctl_copy_in_user_threshold = -1; +EXPORT_SYMBOL(sysctl_copy_in_user_threshold); +#endif + #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_SYSCTL) static int bpf_stats_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) @@ -3385,6 +3396,30 @@ static struct ctl_table vm_table[] = { .extra2 = SYSCTL_ONE, }, #endif + +#ifdef CONFIG_USE_VECTORIZED_COPY + { + .procname = "copy_to_user_threshold", + .data = &sysctl_copy_to_user_threshold, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "copy_from_user_threshold", + .data = &sysctl_copy_from_user_threshold, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "copy_in_user_threshold", + .data = &sysctl_copy_in_user_threshold, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, +#endif { } }; -- 2.34.1