Hi,
Thanks for your patches.
There are trivial typos; otherwise, looks good to me.
Reviewed-by: Xie XiuQi xiexiuqi@huawei.com
On 2021/8/11 11:37, Tong Tiangen wrote:
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I44R9D CVE: NA
During kernel COW processing, the kernel triggers a RAS exception when reading pages. In this solution, we identify this scenario in the kernel do_sea processing process, send SIGBUS signals to the process that triggers COW and isolate memory pages, preventing kernel panic.
At the same time, we use both cmdline(uce_kernel_recovery) and proc(/proc/sys/ kernel/uce_kernel_recovery) to control this feature on/off.
We use customized functions to distinguish scenarios that need kernel recovery, specifically copy_page_cow(). When a hardware error occurs in this function, we kill the process that triggered the hardware error instead of letting the kernel die.
Signed-off-by: Tong Tiangen tongtiangen@huawei.com Signed-off-by: Chen Huang chenhuang5@huawei.com
arch/arm64/Kconfig | 10 ++ arch/arm64/include/asm/exception.h | 11 +++ arch/arm64/include/asm/page.h | 8 ++ arch/arm64/lib/copy_page.S | 78 +++++++++++++++ arch/arm64/mm/copypage.c | 14 +++ arch/arm64/mm/fault.c | 153 +++++++++++++++++++++++++++++ include/linux/highmem.h | 17 ++++ include/linux/kernel.h | 4 + kernel/sysctl.c | 13 +++ mm/internal.h | 5 + mm/memory.c | 8 ++ 11 files changed, 321 insertions(+)
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 99905ee64625..e968df599e31 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1755,6 +1755,16 @@ config ARCH_HIBERNATION_HEADER config ARCH_SUSPEND_POSSIBLE def_bool y
+config UCE_KERNEL_RECOVERY
- bool "uce kernel recovery from special scenario"
- def_bool y
- depends on ARM64_ERR_RECOV
- help
With ARM v8.2 RAS Extension, SEA are usually triggered when memory errors
are consumed. In some cases, if the error address is in a user page there
is a chance to recover. Such as error occurs in COW and pagecache reading
cenario, we can isolate this page and killing process instead of die.
s/cenario/scenario/
endmenu
menu "CPU Power Management" diff --git a/arch/arm64/include/asm/exception.h b/arch/arm64/include/asm/exception.h index bc30429d8e91..f938690c5ad6 100644 --- a/arch/arm64/include/asm/exception.h +++ b/arch/arm64/include/asm/exception.h @@ -41,4 +41,15 @@ static inline u32 disr_to_esr(u64 disr) return esr; }
+#ifdef CONFIG_UCE_KERNEL_RECOVERY +struct uce_kernel_recovery_info {
- int (*fn)(void);
- const char *name;
- unsigned long addr;
- unsigned long size;
+};
+extern int copy_page_cow_sea_fallback(void); +#endif
#endif /* __ASM_EXCEPTION_H */ diff --git a/arch/arm64/include/asm/page.h b/arch/arm64/include/asm/page.h index 60d02c81a3a2..725bff5bc0ad 100644 --- a/arch/arm64/include/asm/page.h +++ b/arch/arm64/include/asm/page.h @@ -35,6 +35,14 @@ extern void clear_page(void *to); #define clear_user_page(addr,vaddr,pg) __cpu_clear_user_page(addr, vaddr) #define copy_user_page(to,from,vaddr,pg) __cpu_copy_user_page(to, from, vaddr)
+#ifdef CONFIG_UCE_KERNEL_RECOVERY +extern int copy_page_cow(void *to, const void *from); +extern int __cpu_copy_user_page_cow(void *to, const void *from,
unsigned long user);
+#define copy_user_page_cow(to, from, vaddr, pg) \
- __cpu_copy_user_page_cow(to, from, vaddr)
+#endif
typedef struct page *pgtable_t;
#ifdef CONFIG_HAVE_ARCH_PFN_VALID diff --git a/arch/arm64/lib/copy_page.S b/arch/arm64/lib/copy_page.S index 076c43715e64..649cf4eb96bf 100644 --- a/arch/arm64/lib/copy_page.S +++ b/arch/arm64/lib/copy_page.S @@ -87,3 +87,81 @@ alternative_else_nop_endif
ret ENDPROC(copy_page)
+#ifdef CONFIG_UCE_KERNEL_RECOVERY +#The difference between copy_page_cow and copy_page: +# 1) copy_page_cow adds the recovery path of sea fault(copy_page_cow_sea_fallback). +# 2) copy_page_cow with return value: 0 - copy success 1 - copy fail. +/*
- COW copy a page from src to dest (both are page aligned)
- Parameters:
- x0 - dest
- x1 - src
- */
+ENTRY(copy_page_cow) +alternative_if ARM64_HAS_NO_HW_PREFETCH
- // Prefetch three cache lines ahead.
- prfm pldl1strm, [x1, #128]
- prfm pldl1strm, [x1, #256]
- prfm pldl1strm, [x1, #384]
+alternative_else_nop_endif
- ldp x2, x3, [x1]
- ldp x4, x5, [x1, #16]
- ldp x6, x7, [x1, #32]
- ldp x8, x9, [x1, #48]
- ldp x10, x11, [x1, #64]
- ldp x12, x13, [x1, #80]
- ldp x14, x15, [x1, #96]
- ldp x16, x17, [x1, #112]
- mov x18, #(PAGE_SIZE - 128)
- add x1, x1, #128
+1:
- subs x18, x18, #128
+alternative_if ARM64_HAS_NO_HW_PREFETCH
- prfm pldl1strm, [x1, #384]
+alternative_else_nop_endif
- stnp x2, x3, [x0]
- ldp x2, x3, [x1]
- stnp x4, x5, [x0, #16]
- ldp x4, x5, [x1, #16]
- stnp x6, x7, [x0, #32]
- ldp x6, x7, [x1, #32]
- stnp x8, x9, [x0, #48]
- ldp x8, x9, [x1, #48]
- stnp x10, x11, [x0, #64]
- ldp x10, x11, [x1, #64]
- stnp x12, x13, [x0, #80]
- ldp x12, x13, [x1, #80]
- stnp x14, x15, [x0, #96]
- ldp x14, x15, [x1, #96]
- stnp x16, x17, [x0, #112]
- ldp x16, x17, [x1, #112]
- add x0, x0, #128
- add x1, x1, #128
- b.gt 1b
- stnp x2, x3, [x0]
- stnp x4, x5, [x0, #16]
- stnp x6, x7, [x0, #32]
- stnp x8, x9, [x0, #48]
- stnp x10, x11, [x0, #64]
- stnp x12, x13, [x0, #80]
- stnp x14, x15, [x0, #96]
- stnp x16, x17, [x0, #112]
- mov x0, #0
- ret
- .global copy_page_cow_sea_fallback
+copy_page_cow_sea_fallback:
- mov x0, #1
- ret
+ENDPROC(copy_page_cow) +#endif diff --git a/arch/arm64/mm/copypage.c b/arch/arm64/mm/copypage.c index 22e4cb4d6f53..506d166d1dec 100644 --- a/arch/arm64/mm/copypage.c +++ b/arch/arm64/mm/copypage.c @@ -30,6 +30,20 @@ void __cpu_copy_user_page(void *kto, const void *kfrom, unsigned long vaddr) } EXPORT_SYMBOL_GPL(__cpu_copy_user_page);
+#ifdef CONFIG_UCE_KERNEL_RECOVERY +int __cpu_copy_user_page_cow(void *kto, const void *kfrom, unsigned long vaddr) +{
- int ret;
- struct page *page = virt_to_page(kto);
- ret = copy_page_cow(kto, kfrom);
- flush_dcache_page(page);
- return ret;
+} +EXPORT_SYMBOL_GPL(__cpu_copy_user_page_cow); +#endif
void __cpu_clear_user_page(void *kaddr, unsigned long vaddr) { clear_page(kaddr); diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 6cd448d9835c..039802b047af 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -656,6 +656,126 @@ static int do_bad(unsigned long addr, unsigned int esr, struct pt_regs *regs) return 1; /* "fault" */ }
+#ifdef CONFIG_UCE_KERNEL_RECOVERY +int kernel_access_sea_recovery;
+#define UCE_KER_REC_NUM ARRAY_SIZE(reco_info) +static struct uce_kernel_recovery_info reco_info[] = {
- {copy_page_cow_sea_fallback, "copy_page_cow", (unsigned long)copy_page_cow, 0},
+};
+static int __init kernel_access_sea_recovery_init(void) +{
- unsigned long addr, size, offset;
- unsigned int i;
- for (i = 0; i < UCE_KER_REC_NUM; i++) {
addr = reco_info[i].addr;
if (!kallsyms_lookup_size_offset(addr, &size, &offset)) {
pr_info("UCE: symbol %s lookup addr fail.\n",
reco_info[i].name);
size = 0;
}
reco_info[i].size = size;
- }
- return 1;
+} +fs_initcall(kernel_access_sea_recovery_init);
+static int __init enable_kernel_access_sea_recovery(char *str) +{
- int max = (1 << UCE_KER_REC_NUM) - 1;
- int val;
- if (kstrtoint(str, 0, &val))
return -EINVAL;
- if (val < 0 || val > max) {
pr_info("UCE: invalid uce_kernel_recovery value %d", val);
return -EINVAL;
- }
- kernel_access_sea_recovery = val;
- return 1;
+} +__setup("uce_kernel_recovery=", enable_kernel_access_sea_recovery);
+int is_cow_kernel_recovery_enable(void) +{
- return kernel_access_sea_recovery & 0x1;
+}
+/*
- what is kernel recovery?
- If the process's private data is accessed in the kernel mode to trigger
- special sea fault, it can be controlled by killing the process and isolating
- the failure pages instead of die.
- */
+static int is_in_kernel_recovery(unsigned int esr, struct pt_regs *regs) +{
- /*
* target insn: ldp-pre, ldp-post, ldp-offset,
* ldr-64bit-pre/pose, ldr-32bit-pre/post, ldrb-pre/post, ldrh-pre/post
*/
- u32 target_insn[] = {0xa8c, 0xa9c, 0xa94, 0xf84, 0x784, 0x384, 0xb84};
- void *pc = (void *)instruction_pointer(regs);
- struct uce_kernel_recovery_info *info;
- bool insn_match = false;
- u32 insn;
- int i;
- pr_emerg("UCE: %s-%d, kernel recovery: 0x%x, esr: 0x%08x -- %s, %pS\n",
current->comm, current->pid, kernel_access_sea_recovery, esr,
esr_get_class_string(esr), pc);
- if (aarch64_insn_read((void *)pc, &insn)) {
pr_emerg("UCE: insn read fail.\n");
return -EFAULT;
- }
- /*
* We process special ESR:
* EC : 0b100101 Data Abort taken without a change in Exception level.
* DFSC : 0b010000 Synchronous External abort, not on translation table
* walk or hardware update of translation table.
* eg: 0x96000610
*/
- if (ESR_ELx_EC(esr) != ESR_ELx_EC_DABT_CUR ||
(esr & ESR_ELx_FSC) != ESR_ELx_FSC_EXTABT) {
pr_emerg("UCE: esr not match.\n");
return -EINVAL;
- }
- insn = (insn >> 20) & 0xffc;
- for (i = 0; i < ARRAY_SIZE(target_insn); i++) {
if (insn == target_insn[i]) {
insn_match = true;
break;
}
- }
- if (!insn_match) {
pr_emerg("UCE: insn 0x%x is not match.\n", insn);
return -EINVAL;
- }
- for (i = 0; i < UCE_KER_REC_NUM; i++) {
info = &reco_info[i];
if (info->fn && regs->pc >= info->addr &&
regs->pc < (info->addr + info->size)) {
pr_emerg("UCE: total match %s success.\n", info->name);
return i;
}
- }
- pr_emerg("UCE: symbol is not match.\n");
- return -EINVAL;
+} +#endif
static int do_sea(unsigned long addr, unsigned int esr, struct pt_regs *regs) { struct siginfo info; @@ -693,8 +813,41 @@ static int do_sea(unsigned long addr, unsigned int esr, struct pt_regs *regs)
arm64_notify_die(inf->name, regs, &info, esr);
} else { +#ifdef CONFIG_UCE_KERNEL_RECOVERY
int idx;
if (!current->mm || !kernel_access_sea_recovery) {
pr_emerg("UCE: kernel recovery %d, %s-%d is %s-thread.\n",
kernel_access_sea_recovery,
current->comm, current->pid,
(current->mm) ? "user" : "kernel");
die("Uncorrected hardware memory error in kernel-access\n",
regs, esr);
}
idx = is_in_kernel_recovery(esr, regs);
if (idx >= 0 && idx < UCE_KER_REC_NUM) {
clear_siginfo(&info);
info.si_signo = inf->sig;
info.si_errno = 0;
info.si_code = inf->code;
info.si_addr = NULL;
current->thread.fault_address = regs->pc;
current->thread.fault_code = esr;
regs->pc = (unsigned long)reco_info[idx].fn;
arm64_force_sig_info(&info,
"Uncorrected hardware memory use with kernel recovery in kernel-access\n",
current);
} else {
die("Uncorrected hardware memory error (kernel recovery on but not match idx) in kernel-access\n",
regs, esr);
Do you mean "kernel recovery is enabled, but not match the idx" ? This path is the common case if in-kernel access uce. So we could just use the default message, or "Uncorrected hardware memory error without kernel recovery in kernel-access" ?
}
+#else die("Uncorrected hardware memory error in kernel-access\n", regs, esr); +#endif }
return 0; diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 0690679832d4..1fed918bb1e5 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -235,6 +235,23 @@ static inline void copy_user_highpage(struct page *to, struct page *from, kunmap_atomic(vfrom); }
+#ifdef CONFIG_UCE_KERNEL_RECOVERY +static inline int copy_user_highpage_cow(struct page *to, struct page *from,
- unsigned long vaddr, struct vm_area_struct *vma)
+{
- char *vfrom, *vto;
- int ret;
- vfrom = kmap_atomic(from);
- vto = kmap_atomic(to);
- ret = copy_user_page_cow(vto, vfrom, vaddr, to);
- kunmap_atomic(vto);
- kunmap_atomic(vfrom);
- return ret;
+} +#endif
#endif
#ifndef __HAVE_ARCH_COPY_HIGHPAGE diff --git a/include/linux/kernel.h b/include/linux/kernel.h index f24fd248b3ac..122f77b8b0ea 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -531,6 +531,10 @@ extern int sysctl_panic_on_stackoverflow;
extern bool crash_kexec_post_notifiers;
+#ifdef CONFIG_UCE_KERNEL_RECOVERY +extern int kernel_access_sea_recovery; +#endif
/*
- panic_cpu is used for synchronizing panic() and crash_kexec() execution. It
- holds a CPU number which is executing panic() currently. A value of
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 116fe8a13236..53ddcd74b5c3 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -127,6 +127,7 @@ static int __maybe_unused neg_one = -1; static int zero; static int __maybe_unused one = 1; static int __maybe_unused two = 2; +static int __maybe_unused three = 3; static int __maybe_unused four = 4; static unsigned long zero_ul; static unsigned long one_ul = 1; @@ -1254,6 +1255,18 @@ static struct ctl_table kern_table[] = { .extra1 = &zero, .extra2 = &one, }, +#if defined(CONFIG_UCE_KERNEL_RECOVERY)
- {
.procname = "uce_kernel_recovery",
.data = &kernel_access_sea_recovery,
.maxlen = sizeof(kernel_access_sea_recovery),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = &zero,
.extra2 = &three,
- },
+#endif { } };
diff --git a/mm/internal.h b/mm/internal.h index bfa97c43a3fc..b6d3b3660782 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -588,4 +588,9 @@ static inline bool is_migrate_highatomic_page(struct page *page)
void setup_zone_pageset(struct zone *zone); extern struct page *alloc_new_node_page(struct page *page, unsigned long node);
+#ifdef CONFIG_UCE_KERNEL_RECOVERY +extern int is_cow_kernel_recovery_enable(void); +#endif
#endif /* __MM_INTERNAL_H */ diff --git a/mm/memory.c b/mm/memory.c index d9f4e7dd17a6..174b0baf197e 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2212,7 +2212,15 @@ static inline bool cow_user_page(struct page *dst, struct page *src, debug_dma_assert_idle(src);
if (likely(src)) { +#ifdef CONFIG_UCE_KERNEL_RECOVERY
if (is_cow_kernel_recovery_enable()) {
if (copy_user_highpage_cow(dst, src, addr, vma))
return false;
} else
copy_user_highpage(dst, src, addr, vma);
+#else copy_user_highpage(dst, src, addr, vma); +#endif return true; }