hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4PM10 CVE: NA
--------------------------------
While the kernel is executing copy_from_user, reading a user page that contains an uncorrected memory error triggers a RAS exception (a synchronous external abort, SEA). This patch identifies that scenario in the kernel's do_sea() handler, sends SIGBUS to the process that invoked copy_from_user, and isolates the affected memory pages, so that the kernel does not panic.
The feature can be switched on or off through both the kernel command line (uce_kernel_recovery=) and the sysctl interface (/proc/sys/kernel/uce_kernel_recovery).
Signed-off-by: Tong Tiangen tongtiangen@huawei.com --- v1->v2: 1. update commit msg. 2. change copy_from_user return value. 3. change copy_from_user proc control bit.
arch/arm64/Kconfig | 9 ++ arch/arm64/include/asm/exception.h | 12 +++ arch/arm64/lib/copy_from_user.S | 5 + arch/arm64/mm/fault.c | 154 +++++++++++++++++++++++++++++ include/linux/kernel.h | 4 + kernel/sysctl.c | 11 +++ 6 files changed, 195 insertions(+)
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index c0f6a275f798..5858cb9a3dac 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -2155,6 +2155,15 @@ config ARCH_HIBERNATION_HEADER config ARCH_SUSPEND_POSSIBLE def_bool y
+config UCE_KERNEL_RECOVERY + bool "uce kernel recovery from special scenario" + def_bool n + help + With ARM v8.2 RAS Extension, SEA are usually triggered when memory errors + are consumed. In some cases, if the error address is in a user page there + is a chance to recover. Such as error occurs in COW and pagecache reading + scenario, we can isolate this page and killing process instead of die. + endmenu
menu "CPU Power Management" diff --git a/arch/arm64/include/asm/exception.h b/arch/arm64/include/asm/exception.h index 0756191f44f6..bad92fe763fb 100644 --- a/arch/arm64/include/asm/exception.h +++ b/arch/arm64/include/asm/exception.h @@ -53,4 +53,16 @@ void do_cp15instr(unsigned int esr, struct pt_regs *regs); void do_el0_svc(struct pt_regs *regs); void do_el0_svc_compat(struct pt_regs *regs); void do_ptrauth_fault(struct pt_regs *regs, unsigned int esr); + +#ifdef CONFIG_UCE_KERNEL_RECOVERY +struct uce_kernel_recovery_info { + int (*fn)(void); + const char *name; + unsigned long addr; + unsigned long size; +}; + +extern int copy_from_user_sea_fallback(void); +#endif + #endif /* __ASM_EXCEPTION_H */ diff --git a/arch/arm64/lib/copy_from_user.S b/arch/arm64/lib/copy_from_user.S index 2cf999e41d30..b03ea0faa118 100644 --- a/arch/arm64/lib/copy_from_user.S +++ b/arch/arm64/lib/copy_from_user.S @@ -60,6 +60,11 @@ SYM_FUNC_START(__arch_copy_from_user) #include "copy_template.S" mov x0, #0 // Nothing to copy ret + + .global copy_from_user_sea_fallback +copy_from_user_sea_fallback: + sub x0, end, dst // bytes not copied + ret SYM_FUNC_END(__arch_copy_from_user) EXPORT_SYMBOL(__arch_copy_from_user)
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 3fc5aceb72eb..7f76e277daa3 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -634,6 +634,127 @@ static int do_bad(unsigned long addr, unsigned int esr, struct pt_regs *regs) return 1; /* "fault" */ }
+#ifdef CONFIG_UCE_KERNEL_RECOVERY +int kernel_access_sea_recovery; + +#define UCE_KER_REC_NUM ARRAY_SIZE(reco_info) +static struct uce_kernel_recovery_info reco_info[] = { + {NULL, NULL, 0, 0}, + {NULL, NULL, 0, 0}, + {copy_from_user_sea_fallback, "__arch_copy_from_user", (unsigned long)__arch_copy_from_user, 0}, +}; + +static int __init kernel_access_sea_recovery_init(void) +{ + unsigned long addr, size, offset; + unsigned int i; + + for (i = 0; i < UCE_KER_REC_NUM; i++) { + addr = reco_info[i].addr; + if (!kallsyms_lookup_size_offset(addr, &size, &offset)) { + pr_info("UCE: symbol %s lookup addr fail.\n", + reco_info[i].name); + size = 0; + } + + reco_info[i].size = size; + } + + return 1; +} +fs_initcall(kernel_access_sea_recovery_init); + +static int __init enable_kernel_access_sea_recovery(char *str) +{ + int max = (1 << UCE_KER_REC_NUM) - 1; + int val; + + if (kstrtoint(str, 0, &val)) + return -EINVAL; + + if (val < 0 || val > max) { + pr_info("UCE: invalid uce_kernel_recovery value %d", val); + return -EINVAL; + } + + kernel_access_sea_recovery = val; + + return 1; +} +__setup("uce_kernel_recovery=", enable_kernel_access_sea_recovery); + +/* + * what is kernel recovery? + * If the process's private data is accessed in the kernel mode to trigger + * special sea fault, it can controlled by killing the process and isolating + * the failure pages instead of die. 
+ */ +static int is_in_kernel_recovery(unsigned int esr, struct pt_regs *regs) +{ + /* + * target insn: ldp-pre, ldp-post, ldp-offset, + * ldr-64bit-pre/pose, ldr-32bit-pre/post, ldrb-pre/post, ldrh-pre/post + */ + u32 target_insn[] = {0xa8c, 0xa9c, 0xa94, 0xf84, 0x784, 0x384, 0xb84}; + void *pc = (void *)instruction_pointer(regs); + struct uce_kernel_recovery_info *info; + bool insn_match = false; + u32 insn; + int i; + + pr_emerg("UCE: %s-%d, kernel recovery: 0x%x, esr: 0x%08x -- %s, %pS\n", + current->comm, current->pid, kernel_access_sea_recovery, esr, + esr_get_class_string(esr), pc); + + if (aarch64_insn_read((void *)pc, &insn)) { + pr_emerg("UCE: insn read fail.\n"); + return -EFAULT; + } + + /* + * We process special ESR: + * EC : 0b100101 Data Abort taken without a change in Exception level. + * DFSC : 0b010000 Synchronous External abort, not on translation table + * walk or hardware update of translation table. + * eg: 0x96000610 + */ + if (ESR_ELx_EC(esr) != ESR_ELx_EC_DABT_CUR || + (esr & ESR_ELx_FSC) != ESR_ELx_FSC_EXTABT) { + pr_emerg("UCE: esr not match.\n"); + return -EINVAL; + } + + insn = (insn >> 20) & 0xffc; + for (i = 0; i < ARRAY_SIZE(target_insn); i++) { + if (insn == target_insn[i]) { + insn_match = true; + break; + } + } + + if (!insn_match) { + pr_emerg("UCE: insn 0x%x is not match.\n", insn); + return -EINVAL; + } + + for (i = 0; i < UCE_KER_REC_NUM; i++) { + if (!((kernel_access_sea_recovery >> i) & 0x1)) + continue; + + info = &reco_info[i]; + if (info->fn && regs->pc >= info->addr && + regs->pc < (info->addr + info->size)) { + pr_emerg("UCE: total match %s success.\n", info->name); + return i; + } + } + + pr_emerg("UCE: symbol is not match or switch if off, kernel recovery %d.\n", + kernel_access_sea_recovery); + return -EINVAL; +} +#endif + static int do_sea(unsigned long addr, unsigned int esr, struct pt_regs *regs) { const struct fault_info *inf; @@ -649,6 +770,39 @@ static int do_sea(unsigned long addr, unsigned int esr, struct 
pt_regs *regs) return 0; }
+#ifdef CONFIG_UCE_KERNEL_RECOVERY + if (!user_mode(regs)) { + int idx; + + if (!current->mm || !kernel_access_sea_recovery) { + pr_emerg("UCE: kernel recovery %d, %s-%d is %s-thread.\n", + kernel_access_sea_recovery, + current->comm, current->pid, + (current->mm) ? "user" : "kernel"); + die("Uncorrected hardware memory error in kernel-access\n", + regs, esr); + } + + idx = is_in_kernel_recovery(esr, regs); + if (idx >= 0 && idx < UCE_KER_REC_NUM) { + current->thread.fault_address = 0; + current->thread.fault_code = esr; + regs->pc = (unsigned long)reco_info[idx].fn; + + if (esr & ESR_ELx_FnV) + siaddr = NULL; + else + siaddr = (void __user *)addr; + + arm64_force_sig_fault(inf->sig, inf->code, siaddr, + "Uncorrected hardware memory use with kernel recovery in kernel-access\n"); + } else { + die("Uncorrected hardware memory error (not match idx or sence switch is off) in kernel-access\n", + regs, esr); + } + } +#endif + if (esr & ESR_ELx_FnV) siaddr = NULL; else diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 78a0907f0b04..b634fb1cce38 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -558,6 +558,10 @@ extern int sysctl_panic_on_stackoverflow;
extern bool crash_kexec_post_notifiers;
+#ifdef CONFIG_UCE_KERNEL_RECOVERY +extern int kernel_access_sea_recovery; +#endif + /* * panic_cpu is used for synchronizing panic() and crash_kexec() execution. It * holds a CPU number which is executing panic() currently. A value of diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 89ef0c1a1642..e38fff657683 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2711,6 +2711,17 @@ static struct ctl_table kern_table[] = { .extra1 = &one_hundred, .extra2 = &one_thousand, }, +#endif +#if defined(CONFIG_UCE_KERNEL_RECOVERY) + { + .procname = "uce_kernel_recovery", + .data = &kernel_access_sea_recovery, + .maxlen = sizeof(kernel_access_sea_recovery), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &four, + .extra2 = &four, + }, #endif { } };