subject: arm64: ras: support sea recovery for copy_from_user
On 2022/2/22 21:09, Tong Tiangen wrote:
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4PM10 CVE: NA
While the kernel is executing copy_from_user, reading a page can trigger a RAS exception. In this solution, we identify this scenario in do_sea(), send a SIGBUS signal to the process that triggered copy_from_user, and isolate the affected memory pages, which prevents a kernel panic.
The feature can be switched on or off through both the kernel command line (uce_kernel_recovery=) and procfs (/proc/sys/kernel/uce_kernel_recovery).
Signed-off-by: Tong Tiangen <tongtiangen@huawei.com>
v1->v2:
- update commit msg.
- change copy_from_user return value.
- change copy_from_user proc control bit.
arch/arm64/Kconfig | 9 ++ arch/arm64/include/asm/exception.h | 12 +++ arch/arm64/lib/copy_from_user.S | 5 + arch/arm64/mm/fault.c | 154 +++++++++++++++++++++++++++++ include/linux/kernel.h | 4 + kernel/sysctl.c | 11 +++ 6 files changed, 195 insertions(+)
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index c0f6a275f798..5858cb9a3dac 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -2155,6 +2155,15 @@ config ARCH_HIBERNATION_HEADER config ARCH_SUSPEND_POSSIBLE def_bool y
+config UCE_KERNEL_RECOVERY
+	bool "uce kernel recovery from special scenario"
+	default n
+	help
+	  With the ARMv8.2 RAS Extension, a synchronous external abort (SEA)
+	  is usually triggered when a memory error is consumed. In some cases,
+	  if the error address is in a user page, there is a chance to recover.
+	  For example, when the error occurs in the COW or pagecache reading
+	  scenario, we can isolate the page and kill the process instead of
+	  dying.
endmenu
menu "CPU Power Management" diff --git a/arch/arm64/include/asm/exception.h b/arch/arm64/include/asm/exception.h index 0756191f44f6..bad92fe763fb 100644 --- a/arch/arm64/include/asm/exception.h +++ b/arch/arm64/include/asm/exception.h @@ -53,4 +53,16 @@ void do_cp15instr(unsigned int esr, struct pt_regs *regs); void do_el0_svc(struct pt_regs *regs); void do_el0_svc_compat(struct pt_regs *regs); void do_ptrauth_fault(struct pt_regs *regs, unsigned int esr);
/* Describes one kernel code region in which a synchronous external abort can be recovered from. */
+#ifdef CONFIG_UCE_KERNEL_RECOVERY +struct uce_kernel_recovery_info {
- int (*fn)(void); /* fallback entry; do_sea() sets regs->pc to it on recovery */
- const char *name; /* symbol name, printed in the UCE log messages */
- unsigned long addr; /* start address of the covered symbol */
- unsigned long size; /* symbol size; starts at 0 and is filled in by kernel_access_sea_recovery_init() */
+};
/* Fallback entry for __arch_copy_from_user, defined in arch/arm64/lib/copy_from_user.S. */
+extern int copy_from_user_sea_fallback(void); +#endif
#endif /* __ASM_EXCEPTION_H */ diff --git a/arch/arm64/lib/copy_from_user.S b/arch/arm64/lib/copy_from_user.S index 2cf999e41d30..b03ea0faa118 100644 --- a/arch/arm64/lib/copy_from_user.S +++ b/arch/arm64/lib/copy_from_user.S @@ -60,6 +60,11 @@ SYM_FUNC_START(__arch_copy_from_user) #include "copy_template.S" mov x0, #0 // Nothing to copy ret
// Fallback entry: do_sea() points regs->pc here when it recovers from an SEA taken inside __arch_copy_from_user.
- .global copy_from_user_sea_fallback
+copy_from_user_sea_fallback:
- sub x0, end, dst // bytes not copied
- ret
SYM_FUNC_END(__arch_copy_from_user) EXPORT_SYMBOL(__arch_copy_from_user)
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 3fc5aceb72eb..7f76e277daa3 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -634,6 +634,127 @@ static int do_bad(unsigned long addr, unsigned int esr, struct pt_regs *regs) return 1; /* "fault" */ }
/* Bitmask of enabled recovery scenarios: bit i enables reco_info[i] (see is_in_kernel_recovery()). */
+#ifdef CONFIG_UCE_KERNEL_RECOVERY +int kernel_access_sea_recovery;
/*
 * Table of recoverable code regions; the array index is the bit position in
 * kernel_access_sea_recovery. Entries whose fn is NULL never match.
 * NOTE(review): as laid out here the copy_from_user entry sits at index 4,
 * i.e. mask value 0x10 - confirm this agrees with the sysctl bounds.
 */
+#define UCE_KER_REC_NUM ARRAY_SIZE(reco_info) +static struct uce_kernel_recovery_info reco_info[] = {
- {NULL, NULL, 0, 0},
- {NULL, NULL, 0, 0},
{NULL, NULL, 0, 0}, /* reserved */ {NULL, NULL, 0, 0}, /* reserved */
/* size is 0 here and is resolved at init time by kernel_access_sea_recovery_init() */
- {copy_from_user_sea_fallback, "__arch_copy_from_user", (unsigned long)__arch_copy_from_user, 0},
+};
+/*
+ * Resolve the size of every recoverable symbol listed in reco_info[] so that
+ * is_in_kernel_recovery() can range-check the faulting PC against it.
+ *
+ * Reserved slots (fn == NULL) are skipped: they have no symbol to look up.
+ * An entry whose lookup fails keeps size 0 and therefore never matches a PC.
+ *
+ * Returns 0, the success value expected from an initcall.
+ */
+static int __init kernel_access_sea_recovery_init(void)
+{
+	unsigned long addr, size, offset;
+	unsigned int i;
+
+	for (i = 0; i < UCE_KER_REC_NUM; i++) {
+		/* Reserved slot: nothing to resolve, and nothing to log. */
+		if (!reco_info[i].fn)
+			continue;
+
+		addr = reco_info[i].addr;
+		if (!kallsyms_lookup_size_offset(addr, &size, &offset)) {
+			pr_info("UCE: symbol %s lookup addr fail.\n",
+				reco_info[i].name);
+			size = 0;
+		}
+		reco_info[i].size = size;
+	}
+
+	return 0;
+}
+fs_initcall(kernel_access_sea_recovery_init);
+/*
+ * Parse the "uce_kernel_recovery=" boot parameter.
+ *
+ * The value is a bitmask in which bit i enables reco_info[i]; it must lie in
+ * [0, (1 << UCE_KER_REC_NUM) - 1]. On success the mask is stored in
+ * kernel_access_sea_recovery.
+ *
+ * Returns 1 when the value is accepted, -EINVAL when it is malformed or out
+ * of range (kernel_access_sea_recovery is left untouched in that case).
+ */
+static int __init enable_kernel_access_sea_recovery(char *str)
+{
+	int max_mask = (1 << UCE_KER_REC_NUM) - 1;
+	int val;
+
+	if (kstrtoint(str, 0, &val)) {
+		pr_info("UCE: invalid uce_kernel_recovery value %s\n", str);
+		return -EINVAL;
+	}
+
+	if (val < 0 || val > max_mask) {
+		pr_info("UCE: invalid uce_kernel_recovery value %d\n", val);
+		return -EINVAL;
+	}
+
+	kernel_access_sea_recovery = val;
+
+	return 1;
+}
+__setup("uce_kernel_recovery=", enable_kernel_access_sea_recovery);
+/*
+ * What is kernel recovery?
+ * If a process's private data is accessed in kernel mode and the access
+ * triggers a special SEA fault, the fault can be handled by killing the
+ * process and isolating the failing pages instead of dying.
+ */
+/*
+ * Decide whether a synchronous external abort taken in kernel mode can be
+ * recovered from.
+ *
+ * The abort qualifies only when all of the following hold:
+ *   - the instruction at the faulting PC can be read;
+ *   - ESR reports a data abort taken without a change in exception level
+ *     whose fault status code is a synchronous external abort;
+ *   - the faulting instruction is one of the load forms in target_insn[];
+ *   - the PC lies inside a reco_info[] entry that has a fallback (fn) and
+ *     whose bit is set in kernel_access_sea_recovery.
+ *
+ * Returns the index of the matching reco_info[] entry, -EFAULT if the
+ * instruction cannot be read, or -EINVAL if any other check fails.
+ */
+static int is_in_kernel_recovery(unsigned int esr, struct pt_regs *regs)
+{
+	/*
+	 * target insn: ldp-pre, ldp-post, ldp-offset,
+	 * ldr-64bit-pre/post, ldr-32bit-pre/post, ldrb-pre/post, ldrh-pre/post
+	 */
+	static const u32 target_insn[] = {
+		0xa8c, 0xa9c, 0xa94, 0xf84, 0x784, 0x384, 0xb84
+	};
+	void *pc = (void *)instruction_pointer(regs);
+	struct uce_kernel_recovery_info *info;
+	bool insn_match = false;
+	u32 insn;
+	int i;
+
+	pr_emerg("UCE: %s-%d, kernel recovery: 0x%x, esr: 0x%08x -- %s, %pS\n",
+		 current->comm, current->pid, kernel_access_sea_recovery, esr,
+		 esr_get_class_string(esr), pc);
+
+	if (aarch64_insn_read(pc, &insn)) {
+		pr_emerg("UCE: insn read fail.\n");
+		return -EFAULT;
+	}
+
+	/*
+	 * We process special ESR:
+	 * EC : 0b100101 Data Abort taken without a change in Exception level.
+	 * DFSC : 0b010000 Synchronous External abort, not on translation table
+	 * walk or hardware update of translation table.
+	 * eg: 0x96000610
+	 */
+	if (ESR_ELx_EC(esr) != ESR_ELx_EC_DABT_CUR ||
+	    (esr & ESR_ELx_FSC) != ESR_ELx_FSC_EXTABT) {
+		pr_emerg("UCE: esr not match.\n");
+		return -EINVAL;
+	}
+
+	/* Keep opcode bits [31:22] (landing in bits [11:2]) for the table match. */
+	insn = (insn >> 20) & 0xffc;
+	for (i = 0; i < ARRAY_SIZE(target_insn); i++) {
+		if (insn == target_insn[i]) {
+			insn_match = true;
+			break;
+		}
+	}
+
+	if (!insn_match) {
+		pr_emerg("UCE: insn 0x%x is not match.\n", insn);
+		return -EINVAL;
+	}
+
+	for (i = 0; i < UCE_KER_REC_NUM; i++) {
+		/* Scenario i is switched off. */
+		if (!((kernel_access_sea_recovery >> i) & 0x1))
+			continue;
+
+		info = &reco_info[i];
+		if (info->fn && regs->pc >= info->addr &&
+		    regs->pc < (info->addr + info->size)) {
+			pr_emerg("UCE: total match %s success.\n", info->name);
+			return i;
+		}
+	}
+
+	pr_emerg("UCE: symbol is not match or switch is off, kernel recovery %d.\n",
+		 kernel_access_sea_recovery);
+	return -EINVAL;
+}
+#endif
static int do_sea(unsigned long addr, unsigned int esr, struct pt_regs *regs) { const struct fault_info *inf; @@ -649,6 +770,39 @@ static int do_sea(unsigned long addr, unsigned int esr, struct pt_regs *regs) return 0; }
+#ifdef CONFIG_UCE_KERNEL_RECOVERY
- if (!user_mode(regs)) {
int idx;
if (!current->mm || !kernel_access_sea_recovery) {
pr_emerg("UCE: kernel recovery %d, %s-%d is %s-thread.\n",
kernel_access_sea_recovery,
current->comm, current->pid,
(current->mm) ? "user" : "kernel");
die("Uncorrected hardware memory error in kernel-access\n",
regs, esr);
return 0;
}
idx = is_in_kernel_recovery(esr, regs);
if (idx >= 0 && idx < UCE_KER_REC_NUM) {
current->thread.fault_address = 0;
current->thread.fault_code = esr;
regs->pc = (unsigned long)reco_info[idx].fn;
if (esr & ESR_ELx_FnV)
siaddr = NULL;
else
siaddr = (void __user *)addr;
arm64_force_sig_fault(inf->sig, inf->code, siaddr,
"Uncorrected hardware memory use with kernel recovery in kernel-access\n");
return 0;
} else {
die("Uncorrected hardware memory error (not match idx or sence switch is off) in kernel-access\n",
regs, esr);
return 0;
}
- }
+#endif
- if (esr & ESR_ELx_FnV) siaddr = NULL; else
diff --git a/include/linux/kernel.h b/include/linux/kernel.h: index 78a0907f0b04..b634fb1cce38 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -558,6 +558,10 @@ extern int sysctl_panic_on_stackoverflow;
extern bool crash_kexec_post_notifiers;
/* Bitmask of enabled UCE kernel recovery scenarios; bit i enables reco_info[i] in arch/arm64/mm/fault.c. */
+#ifdef CONFIG_UCE_KERNEL_RECOVERY +extern int kernel_access_sea_recovery; +#endif
/*
- panic_cpu is used for synchronizing panic() and crash_kexec() execution. It
- holds a CPU number which is executing panic() currently. A value of
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 89ef0c1a1642..e38fff657683 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2711,6 +2711,17 @@ static struct ctl_table kern_table[] = { .extra1 = &one_hundred, .extra2 = &one_thousand, }, +#endif +#if defined(CONFIG_UCE_KERNEL_RECOVERY)
/*
 * Runtime switch for UCE kernel recovery, backed by kernel_access_sea_recovery
 * and exposed as the "uce_kernel_recovery" entry of kern_table.
 * NOTE(review): extra1 and extra2 both point at `four`; if `four` holds 4,
 * proc_dointvec_minmax would presumably accept only the value 4, so 0 (off)
 * could not be written - confirm the intended bounds.
 */
- {
.procname = "uce_kernel_recovery",
.data = &kernel_access_sea_recovery,
.maxlen = sizeof(kernel_access_sea_recovery),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = &four,
.extra2 = &four,
- },
#endif { } };