From: Yipeng Zou <zouyipeng@huawei.com>
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/release-management/issues/IB6JLE
--------------------------------
The svc exception handling path on arm64, which includes auxiliary work for debug/trace as well as core features such as KPTI, has become overly lengthy.
This inefficiency is particularly noticeable in short syscalls such as lseek() and getpid(), where the syscall function itself comprises a small percentage of the total instructions executed.
To address this, we introduce xcall, a fast svc exception handling path that performs only the essential work: security checks plus context save and restore.
This approach can be seen as a high-speed syscall processing mechanism bridging the gap between vdso and traditional syscalls.
We've implemented a per-task bitmap to enable or disable xcall for specific syscalls.
Users can enable a syscall with the following command:
echo $syscall_nr > /proc/$PID/xcall
To disable a syscall, use:
echo !$syscall_nr > /proc/$PID/xcall
The current status of enabled syscalls can be viewed by:
cat /proc/$PID/xcall
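
As a concrete illustration (not part of this patch), the sketch below shows how a process could drive this interface from userspace for its own PID; the helper name and the choice of getpid() are purely hypothetical:

/*
 * Userspace sketch (illustration only, not part of this patch): enable or
 * disable xcall for one syscall of the calling process via /proc/self/xcall.
 * The helper name and the use of __NR_getpid are hypothetical examples.
 */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>

static int xcall_set(long nr, int enable)
{
	char buf[32];
	int fd, len, ret;

	fd = open("/proc/self/xcall", O_WRONLY);
	if (fd < 0)
		return -1;	/* e.g. booted without "xcall", or CONFIG_FAST_SYSCALL=n */

	len = snprintf(buf, sizeof(buf), "%s%ld", enable ? "" : "!", nr);
	ret = (write(fd, buf, len) == len) ? 0 : -1;
	close(fd);
	return ret;
}

int main(void)
{
	if (xcall_set(__NR_getpid, 1))	/* enable the fast path for getpid() */
		perror("xcall_set");
	return 0;
}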
Finally, we've added a kernel boot parameter to control the xcall feature.
To enable xcall, include "xcall" in the kernel boot command.
By default, xcall is disabled.
This patch only introduces the basic framework: the syscall handling itself has not been trimmed yet, and the xcall path is for now a plain copy of the normal entry path.
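
For reference, the per-syscall filter that the new entry.S macros perform is conceptually equivalent to the following C sketch (illustration only, hypothetical helper name; in the patch the test is open-coded in assembly so it can run before the normal kernel_entry work):

/*
 * Conceptual C equivalent of the check_syscall_nr/check_xcall_enable macros
 * added to entry.S: bounds-check the syscall number, then test its bit in the
 * task's xcall_enable bitmap.
 */
static inline bool xcall_enabled(struct task_struct *tsk, unsigned int sc_no)
{
	if (!tsk->xcall_enable || sc_no >= __NR_syscalls)
		return false;

	return test_bit(sc_no, tsk->xcall_enable);
}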
Signed-off-by: Yipeng Zou <zouyipeng@huawei.com>
---
 arch/Kconfig                    |  19 ++++++
 arch/arm64/Kconfig              |   1 +
 arch/arm64/kernel/asm-offsets.c |   3 +
 arch/arm64/kernel/cpufeature.c  |  28 ++++++++
 arch/arm64/kernel/entry.S       |  60 +++++++++++++++++
 arch/arm64/tools/cpucaps        |   1 +
 fs/proc/base.c                  | 112 ++++++++++++++++++++++++++++++++
 include/linux/sched.h           |   4 ++
 kernel/fork.c                   |  17 +++++
 9 files changed, 245 insertions(+)
diff --git a/arch/Kconfig b/arch/Kconfig
index 98e8ee5a8a74..f7e7c7018602 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -1530,4 +1530,23 @@ config FUNCTION_ALIGNMENT
 	default 4 if FUNCTION_ALIGNMENT_4B
 	default 0
 
+config ARCH_SUPPORTS_FAST_SYSCALL
+	bool
+
+config FAST_SYSCALL
+	bool "Fast Syscall support"
+	depends on ARCH_SUPPORTS_FAST_SYSCALL
+	default n
+	help
+	  This enables support for the fast syscall feature.
+	  The svc exception handling process, which includes auxiliary
+	  functions for debug/trace and core functions like
+	  KPTI, has been identified as overly "lengthy".
+	  This inefficiency is particularly noticeable in short syscalls
+	  such as lseek() and getpid(), where the syscall function itself
+	  comprises a small percentage of the total instructions executed.
+	  To address this, we introduce the concept of fast syscall, a fast svc
+	  exception handling path that only considers necessary features
+	  such as security, context saving, and recovery.
+
 endmenu
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 501ec560a939..2fd2a1712875 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -261,6 +261,7 @@ config ARM64
 	select TRACE_IRQFLAGS_SUPPORT
 	select TRACE_IRQFLAGS_NMI_SUPPORT
 	select HAVE_SOFTIRQ_ON_OWN_STACK
+	select ARCH_SUPPORTS_FAST_SYSCALL if !ARM64_MTE && !KASAN_HW_TAGS
 	help
 	  ARM 64-bit (AArch64) Linux support.
diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c
index e997ad275afb..f20918eb36bc 100644
--- a/arch/arm64/kernel/asm-offsets.c
+++ b/arch/arm64/kernel/asm-offsets.c
@@ -28,6 +28,9 @@
 int main(void)
 {
+#ifdef CONFIG_FAST_SYSCALL
+	DEFINE(TSK_XCALL, offsetof(struct task_struct, xcall_enable));
+#endif
 	DEFINE(TSK_ACTIVE_MM, offsetof(struct task_struct, active_mm));
 	BLANK();
 	DEFINE(TSK_TI_CPU, offsetof(struct task_struct, thread_info.cpu));
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index a7b4ccd7983e..6f0ce5b830f8 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -2375,6 +2375,26 @@ static void mpam_extra_caps(void)
 	__enable_mpam_hcr();
 }
 
+#ifdef CONFIG_FAST_SYSCALL
+static bool is_xcall_support;
+static int __init xcall_setup(char *str)
+{
+	is_xcall_support = true;
+	return 1;
+}
+__setup("xcall", xcall_setup);
+
+bool fast_syscall_enabled(void)
+{
+	return is_xcall_support;
+}
+
+static bool has_xcall_support(const struct arm64_cpu_capabilities *entry, int __unused)
+{
+	return is_xcall_support;
+}
+#endif
+
 static const struct arm64_cpu_capabilities arm64_features[] = {
 	{
 		.capability = ARM64_ALWAYS_BOOT,
@@ -2891,6 +2911,14 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
 		.matches = has_cpuid_feature,
 		ARM64_CPUID_FIELDS(ID_AA64MMFR1_EL1, TWED, IMP)
 	},
+#endif
+#ifdef CONFIG_FAST_SYSCALL
+	{
+		.desc = "Xcall Support",
+		.capability = ARM64_HAS_XCALL,
+		.type = ARM64_CPUCAP_SYSTEM_FEATURE,
+		.matches = has_xcall_support,
+	},
 #endif
 	{},
 };
diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
index 7fcbee0f6c0e..ea6b55dc564a 100644
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
@@ -569,9 +569,69 @@ SYM_CODE_START_LOCAL(__bad_stack)
 SYM_CODE_END(__bad_stack)
 #endif /* CONFIG_VMAP_STACK */
 
+#ifdef CONFIG_FAST_SYSCALL
+	.macro check_esr_el1_ec_svc64
+	/* Only support SVC64 for now */
+	mrs	x20, esr_el1
+	lsr	w20, w20, #ESR_ELx_EC_SHIFT
+	cmp	x20, #ESR_ELx_EC_SVC64
+	.endm
+
+	.macro check_syscall_nr
+	cmp	x8, __NR_syscalls
+	.endm
+
+	.macro check_xcall_enable
+	/* x21 = task_struct->xcall_enable */
+	ldr_this_cpu x20, __entry_task, x21
+	ldr	x21, [x20, #TSK_XCALL]
+	/* x20 = sc_no / 8 */
+	lsr	x20, x8, 3
+	ldr	x21, [x21, x20]
+	/* x8 = sc_no % 8 */
+	and	x8, x8, 7
+	mov	x20, 1
+	lsl	x20, x20, x8
+	and	x21, x21, x20
+	cmp	x21, 0
+	.endm
+
+	.macro check_xcall_pre_kernel_entry
+	stp	x20, x21, [sp, #0]
+	/* is ESR_ELx_EC_SVC64 */
+	check_esr_el1_ec_svc64
+	bne	.Lskip_xcall\@
+	/* x8 >= __NR_syscalls */
+	check_syscall_nr
+	bhs	.Lskip_xcall\@
+	str	x8, [sp, #16]
+	/* is xcall enabled */
+	check_xcall_enable
+	ldr	x8, [sp, #16]
+	beq	.Lskip_xcall\@
+	ldp	x20, x21, [sp, #0]
+	/* do xcall */
+	kernel_entry 0, 64
+	mov	x0, sp
+	bl	el0t_64_sync_handler
+	b	ret_to_user
+.Lskip_xcall\@:
+	ldp	x20, x21, [sp, #0]
+	.endm
+#endif
 	.macro entry_handler el:req, ht:req, regsize:req, label:req
 SYM_CODE_START_LOCAL(el\el\ht\()_\regsize\()_\label)
+#ifdef CONFIG_FAST_SYSCALL
+	.if \el == 0 && \regsize == 64 && \label == sync
+	/* Only support el0 aarch64 sync exception */
+	alternative_if_not ARM64_HAS_XCALL
+	b	.Lret_to_kernel_entry\@
+	alternative_else_nop_endif
+	check_xcall_pre_kernel_entry
+.Lret_to_kernel_entry\@:
+	.endif
+#endif
 	kernel_entry \el, \regsize
 	mov	x0, sp
 	bl	el\el\ht\()_\regsize\()_\label\()_handler
diff --git a/arch/arm64/tools/cpucaps b/arch/arm64/tools/cpucaps
index 2d3df8c73158..1f662b0bc1f7 100644
--- a/arch/arm64/tools/cpucaps
+++ b/arch/arm64/tools/cpucaps
@@ -53,6 +53,7 @@ HAS_TLB_RANGE
 HAS_TWED
 HAS_VIRT_HOST_EXTN
 HAS_WFXT
+HAS_XCALL
 HW_DBM
 KVM_HVHE
 KVM_PROTECTED_MODE
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 63fa766e5feb..276588a25225 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -3394,6 +3394,115 @@ static const struct file_operations proc_pid_sg_level_operations = {
 };
 #endif
 
+#ifdef CONFIG_FAST_SYSCALL
+bool fast_syscall_enabled(void);
+
+static int xcall_show(struct seq_file *m, void *v)
+{
+	struct inode *inode = m->private;
+	struct task_struct *p;
+	unsigned int rs, re;
+
+	if (!fast_syscall_enabled())
+		return -EACCES;
+
+	p = get_proc_task(inode);
+	if (!p)
+		return -ESRCH;
+
+	if (!p->xcall_enable)
+		goto out;
+
+	seq_printf(m, "Enabled Total[%d/%d]:", bitmap_weight(p->xcall_enable, __NR_syscalls),
+		   __NR_syscalls);
+
+	for (rs = 0, bitmap_next_set_region(p->xcall_enable, &rs, &re, __NR_syscalls);
+	     rs < re; rs = re + 1,
+	     bitmap_next_set_region(p->xcall_enable, &rs, &re, __NR_syscalls)) {
+		rs == (re - 1) ? seq_printf(m, "%d,", rs) :
+				 seq_printf(m, "%d-%d,", rs, re - 1);
+	}
+	seq_puts(m, "\n");
+out:
+	put_task_struct(p);
+
+	return 0;
+}
+
+static int xcall_open(struct inode *inode, struct file *filp)
+{
+	return single_open(filp, xcall_show, inode);
+}
+
+static int xcall_enable_one(struct task_struct *p, unsigned int sc_no)
+{
+	bitmap_set(p->xcall_enable, sc_no, 1);
+	return 0;
+}
+
+static int xcall_disable_one(struct task_struct *p, unsigned int sc_no)
+{
+	bitmap_clear(p->xcall_enable, sc_no, 1);
+	return 0;
+}
+
+static ssize_t xcall_write(struct file *file, const char __user *buf,
+			   size_t count, loff_t *offset)
+{
+	struct inode *inode = file_inode(file);
+	struct task_struct *p;
+	char buffer[TASK_COMM_LEN];
+	const size_t maxlen = sizeof(buffer) - 1;
+	unsigned int sc_no = __NR_syscalls;
+	int ret = 0;
+	int is_clear = 0;
+
+	if (!fast_syscall_enabled())
+		return -EACCES;
+
+	memset(buffer, 0, sizeof(buffer));
+	if (!count || copy_from_user(buffer, buf, count > maxlen ? maxlen : count))
+		return -EFAULT;
+
+	p = get_proc_task(inode);
+	if (!p || !p->xcall_enable)
+		return -ESRCH;
+
+	if (buffer[0] == '!')
+		is_clear = 1;
+
+	if (kstrtouint(buffer + is_clear, 10, &sc_no)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (sc_no >= __NR_syscalls) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (!is_clear && !test_bit(sc_no, p->xcall_enable))
+		ret = xcall_enable_one(p, sc_no);
+	else if (is_clear && test_bit(sc_no, p->xcall_enable))
+		ret = xcall_disable_one(p, sc_no);
+	else
+		ret = -EINVAL;
+
+out:
+	put_task_struct(p);
+
+	return ret ? ret : count;
+}
+
+static const struct file_operations proc_pid_xcall_operations = {
+	.open		= xcall_open,
+	.read		= seq_read,
+	.write		= xcall_write,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+#endif
+
 /*
  * Thread groups
  */
@@ -3420,6 +3529,9 @@ static const struct pid_entry tgid_base_stuff[] = {
 #ifdef CONFIG_QOS_SCHED_SMART_GRID
 	REG("smart_grid_level", 0644, proc_pid_sg_level_operations),
 #endif
+#ifdef CONFIG_FAST_SYSCALL
+	REG("xcall", 0644, proc_pid_xcall_operations),
+#endif
 #ifdef CONFIG_SCHED_AUTOGROUP
 	REG("autogroup", S_IRUGO|S_IWUSR, proc_pid_sched_autogroup_operations),
 #endif
diff --git a/include/linux/sched.h b/include/linux/sched.h
index f40411aa7b70..f2f18b9ea002 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1597,7 +1597,11 @@ struct task_struct {
 	 */
 	randomized_struct_fields_end
 
+#if defined(CONFIG_FAST_SYSCALL)
+	KABI_USE(1, unsigned long *xcall_enable)
+#else
 	KABI_RESERVE(1)
+#endif
 	KABI_RESERVE(2)
 	KABI_RESERVE(3)
 	KABI_RESERVE(4)
diff --git a/kernel/fork.c b/kernel/fork.c
index 27d605c64b45..a7cfc3106340 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -636,6 +636,10 @@ void free_task(struct task_struct *tsk)
 #ifdef CONFIG_QOS_SCHED_SMART_GRID
 	if (smart_grid_enabled())
 		sched_grid_qos_free(tsk);
+#endif
+#ifdef CONFIG_FAST_SYSCALL
+	if (tsk->xcall_enable)
+		bitmap_free(tsk->xcall_enable);
 #endif
 	free_task_struct(tsk);
 }
@@ -1251,6 +1255,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
 	tsk->mm_cid_active = 0;
 	tsk->migrate_from_cpu = -1;
 #endif
+
+#ifdef CONFIG_FAST_SYSCALL
+	tsk->xcall_enable = NULL;
+#endif
 	return tsk;
 
 free_stack:
@@ -2415,6 +2423,15 @@ __latent_entropy struct task_struct *copy_process(
 
 	rt_mutex_init_task(p);
 
+#ifdef CONFIG_FAST_SYSCALL
+	p->xcall_enable = bitmap_zalloc(__NR_syscalls, GFP_KERNEL);
+	if (!p->xcall_enable)
+		goto bad_fork_free;
+
+	if (current->xcall_enable)
+		bitmap_copy(p->xcall_enable, current->xcall_enable, __NR_syscalls);
+#endif
+
 #ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
 	if (dynamic_affinity_enabled()) {
 		retval = sched_prefer_cpus_fork(p, current->prefer_cpus);