hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I44R9D CVE: NA
---------------------------
During kernel COW processing, the kernel triggers a RAS exception when reading pages. In this solution, we identify this scenario in the kernel's do_sea() handling path, send a SIGBUS signal to the process that triggered the COW, and isolate the affected memory pages, preventing a kernel panic.
At the same time, we use both a cmdline parameter (uce_kernel_recovery) and a proc interface (/proc/sys/kernel/uce_kernel_recovery) to switch this feature on/off.
We use customized functions to distinguish scenarios that need kernel recovery, specifically copy_page_cow(). When a hardware error occurs in this function, we kill the process that triggered the hardware error instead of letting the kernel die.
Signed-off-by: Tong Tiangen tongtiangen@huawei.com Signed-off-by: Chen Huang chenhuang5@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Reviewed-by: Xie XiuQi xiexiuqi@huawei.com --- arch/arm64/Kconfig | 10 ++ arch/arm64/include/asm/exception.h | 11 +++ arch/arm64/include/asm/page.h | 8 ++ arch/arm64/lib/copy_page.S | 78 +++++++++++++++ arch/arm64/mm/copypage.c | 14 +++ arch/arm64/mm/fault.c | 153 +++++++++++++++++++++++++++++ include/linux/highmem.h | 17 ++++ include/linux/kernel.h | 4 + kernel/sysctl.c | 13 +++ mm/internal.h | 5 + mm/memory.c | 8 ++ 11 files changed, 321 insertions(+)
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 4486a6ee343a..9dfbd052bffc 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1565,6 +1565,16 @@ config ARCH_HIBERNATION_HEADER config ARCH_SUSPEND_POSSIBLE def_bool y
+config UCE_KERNEL_RECOVERY + bool "uce kernel recovery from special scenario" + def_bool y + depends on ARM64_ERR_RECOV + help + With the ARM v8.2 RAS Extension, SEAs are usually triggered when memory errors + are consumed. In some cases, if the error address is in a user page there + is a chance to recover. When the error occurs in a COW or pagecache reading + scenario, we can isolate the page and kill the process instead of dying. + endmenu
menu "CPU Power Management" diff --git a/arch/arm64/include/asm/exception.h b/arch/arm64/include/asm/exception.h index bc30429d8e91..f938690c5ad6 100644 --- a/arch/arm64/include/asm/exception.h +++ b/arch/arm64/include/asm/exception.h @@ -41,4 +41,15 @@ static inline u32 disr_to_esr(u64 disr) return esr; }
+#ifdef CONFIG_UCE_KERNEL_RECOVERY +struct uce_kernel_recovery_info { + int (*fn)(void); + const char *name; + unsigned long addr; + unsigned long size; +}; + +extern int copy_page_cow_sea_fallback(void); +#endif + #endif /* __ASM_EXCEPTION_H */ diff --git a/arch/arm64/include/asm/page.h b/arch/arm64/include/asm/page.h index 60d02c81a3a2..725bff5bc0ad 100644 --- a/arch/arm64/include/asm/page.h +++ b/arch/arm64/include/asm/page.h @@ -35,6 +35,14 @@ extern void clear_page(void *to); #define clear_user_page(addr,vaddr,pg) __cpu_clear_user_page(addr, vaddr) #define copy_user_page(to,from,vaddr,pg) __cpu_copy_user_page(to, from, vaddr)
+#ifdef CONFIG_UCE_KERNEL_RECOVERY +extern int copy_page_cow(void *to, const void *from); +extern int __cpu_copy_user_page_cow(void *to, const void *from, + unsigned long user); +#define copy_user_page_cow(to, from, vaddr, pg) \ + __cpu_copy_user_page_cow(to, from, vaddr) +#endif + typedef struct page *pgtable_t;
#ifdef CONFIG_HAVE_ARCH_PFN_VALID diff --git a/arch/arm64/lib/copy_page.S b/arch/arm64/lib/copy_page.S index 076c43715e64..649cf4eb96bf 100644 --- a/arch/arm64/lib/copy_page.S +++ b/arch/arm64/lib/copy_page.S @@ -87,3 +87,81 @@ alternative_else_nop_endif
ret ENDPROC(copy_page) + +#ifdef CONFIG_UCE_KERNEL_RECOVERY +#The difference between copy_page_cow and copy_page: +# 1) copy_page_cow adds the recovery path of sea fault(copy_page_cow_sea_fallback). +# 2) copy_page_cow with return value: 0 - copy success 1 - copy fail. +/* + * COW copy a page from src to dest (both are page aligned) + * + * Parameters: + * x0 - dest + * x1 - src + */ +ENTRY(copy_page_cow) +alternative_if ARM64_HAS_NO_HW_PREFETCH + // Prefetch three cache lines ahead. + prfm pldl1strm, [x1, #128] + prfm pldl1strm, [x1, #256] + prfm pldl1strm, [x1, #384] +alternative_else_nop_endif + + ldp x2, x3, [x1] + ldp x4, x5, [x1, #16] + ldp x6, x7, [x1, #32] + ldp x8, x9, [x1, #48] + ldp x10, x11, [x1, #64] + ldp x12, x13, [x1, #80] + ldp x14, x15, [x1, #96] + ldp x16, x17, [x1, #112] + + mov x18, #(PAGE_SIZE - 128) + add x1, x1, #128 +1: + subs x18, x18, #128 + +alternative_if ARM64_HAS_NO_HW_PREFETCH + prfm pldl1strm, [x1, #384] +alternative_else_nop_endif + + stnp x2, x3, [x0] + ldp x2, x3, [x1] + stnp x4, x5, [x0, #16] + ldp x4, x5, [x1, #16] + stnp x6, x7, [x0, #32] + ldp x6, x7, [x1, #32] + stnp x8, x9, [x0, #48] + ldp x8, x9, [x1, #48] + stnp x10, x11, [x0, #64] + ldp x10, x11, [x1, #64] + stnp x12, x13, [x0, #80] + ldp x12, x13, [x1, #80] + stnp x14, x15, [x0, #96] + ldp x14, x15, [x1, #96] + stnp x16, x17, [x0, #112] + ldp x16, x17, [x1, #112] + + add x0, x0, #128 + add x1, x1, #128 + + b.gt 1b + + stnp x2, x3, [x0] + stnp x4, x5, [x0, #16] + stnp x6, x7, [x0, #32] + stnp x8, x9, [x0, #48] + stnp x10, x11, [x0, #64] + stnp x12, x13, [x0, #80] + stnp x14, x15, [x0, #96] + stnp x16, x17, [x0, #112] + + mov x0, #0 + ret + + .global copy_page_cow_sea_fallback +copy_page_cow_sea_fallback: + mov x0, #1 + ret +ENDPROC(copy_page_cow) +#endif diff --git a/arch/arm64/mm/copypage.c b/arch/arm64/mm/copypage.c index 22e4cb4d6f53..506d166d1dec 100644 --- a/arch/arm64/mm/copypage.c +++ b/arch/arm64/mm/copypage.c @@ -30,6 +30,20 @@ void __cpu_copy_user_page(void 
*kto, const void *kfrom, unsigned long vaddr) } EXPORT_SYMBOL_GPL(__cpu_copy_user_page);
+#ifdef CONFIG_UCE_KERNEL_RECOVERY +int __cpu_copy_user_page_cow(void *kto, const void *kfrom, unsigned long vaddr) +{ + int ret; + + struct page *page = virt_to_page(kto); + ret = copy_page_cow(kto, kfrom); + flush_dcache_page(page); + + return ret; +} +EXPORT_SYMBOL_GPL(__cpu_copy_user_page_cow); +#endif + void __cpu_clear_user_page(void *kaddr, unsigned long vaddr) { clear_page(kaddr); diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 6cd448d9835c..039802b047af 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -656,6 +656,126 @@ static int do_bad(unsigned long addr, unsigned int esr, struct pt_regs *regs) return 1; /* "fault" */ }
+#ifdef CONFIG_UCE_KERNEL_RECOVERY +int kernel_access_sea_recovery; + +#define UCE_KER_REC_NUM ARRAY_SIZE(reco_info) +static struct uce_kernel_recovery_info reco_info[] = { + {copy_page_cow_sea_fallback, "copy_page_cow", (unsigned long)copy_page_cow, 0}, +}; + +static int __init kernel_access_sea_recovery_init(void) +{ + unsigned long addr, size, offset; + unsigned int i; + + for (i = 0; i < UCE_KER_REC_NUM; i++) { + addr = reco_info[i].addr; + if (!kallsyms_lookup_size_offset(addr, &size, &offset)) { + pr_info("UCE: symbol %s lookup addr fail.\n", + reco_info[i].name); + size = 0; + } + + reco_info[i].size = size; + } + + return 1; +} +fs_initcall(kernel_access_sea_recovery_init); + +static int __init enable_kernel_access_sea_recovery(char *str) +{ + int max = (1 << UCE_KER_REC_NUM) - 1; + int val; + + if (kstrtoint(str, 0, &val)) + return -EINVAL; + + if (val < 0 || val > max) { + pr_info("UCE: invalid uce_kernel_recovery value %d", val); + return -EINVAL; + } + + kernel_access_sea_recovery = val; + + return 1; +} +__setup("uce_kernel_recovery=", enable_kernel_access_sea_recovery); + +int is_cow_kernel_recovery_enable(void) +{ + return kernel_access_sea_recovery & 0x1; +} + +/* + * what is kernel recovery? + * If the process's private data is accessed in the kernel mode to trigger + * special sea fault, it can controlled by killing the process and isolating + * the failure pages instead of die. 
+ */ +static int is_in_kernel_recovery(unsigned int esr, struct pt_regs *regs) +{ + /* + * target insn: ldp-pre, ldp-post, ldp-offset, + * ldr-64bit-pre/pose, ldr-32bit-pre/post, ldrb-pre/post, ldrh-pre/post + */ + u32 target_insn[] = {0xa8c, 0xa9c, 0xa94, 0xf84, 0x784, 0x384, 0xb84}; + void *pc = (void *)instruction_pointer(regs); + struct uce_kernel_recovery_info *info; + bool insn_match = false; + u32 insn; + int i; + + pr_emerg("UCE: %s-%d, kernel recovery: 0x%x, esr: 0x%08x -- %s, %pS\n", + current->comm, current->pid, kernel_access_sea_recovery, esr, + esr_get_class_string(esr), pc); + + if (aarch64_insn_read((void *)pc, &insn)) { + pr_emerg("UCE: insn read fail.\n"); + return -EFAULT; + } + + /* + * We process special ESR: + * EC : 0b100101 Data Abort taken without a change in Exception level. + * DFSC : 0b010000 Synchronous External abort, not on translation table + * walk or hardware update of translation table. + * eg: 0x96000610 + */ + if (ESR_ELx_EC(esr) != ESR_ELx_EC_DABT_CUR || + (esr & ESR_ELx_FSC) != ESR_ELx_FSC_EXTABT) { + pr_emerg("UCE: esr not match.\n"); + return -EINVAL; + } + + insn = (insn >> 20) & 0xffc; + for (i = 0; i < ARRAY_SIZE(target_insn); i++) { + if (insn == target_insn[i]) { + insn_match = true; + break; + } + } + + if (!insn_match) { + pr_emerg("UCE: insn 0x%x is not match.\n", insn); + return -EINVAL; + } + + for (i = 0; i < UCE_KER_REC_NUM; i++) { + info = &reco_info[i]; + if (info->fn && regs->pc >= info->addr && + regs->pc < (info->addr + info->size)) { + pr_emerg("UCE: total match %s success.\n", info->name); + return i; + } + } + + pr_emerg("UCE: symbol is not match.\n"); + return -EINVAL; +} +#endif + static int do_sea(unsigned long addr, unsigned int esr, struct pt_regs *regs) { struct siginfo info; @@ -693,8 +813,41 @@ static int do_sea(unsigned long addr, unsigned int esr, struct pt_regs *regs)
arm64_notify_die(inf->name, regs, &info, esr); } else { +#ifdef CONFIG_UCE_KERNEL_RECOVERY + int idx; + + if (!current->mm || !kernel_access_sea_recovery) { + pr_emerg("UCE: kernel recovery %d, %s-%d is %s-thread.\n", + kernel_access_sea_recovery, + current->comm, current->pid, + (current->mm) ? "user" : "kernel"); + die("Uncorrected hardware memory error in kernel-access\n", + regs, esr); + } + + idx = is_in_kernel_recovery(esr, regs); + if (idx >= 0 && idx < UCE_KER_REC_NUM) { + clear_siginfo(&info); + info.si_signo = inf->sig; + info.si_errno = 0; + info.si_code = inf->code; + info.si_addr = NULL; + + current->thread.fault_address = regs->pc; + current->thread.fault_code = esr; + regs->pc = (unsigned long)reco_info[idx].fn; + arm64_force_sig_info(&info, + "Uncorrected hardware memory use with kernel recovery in kernel-access\n", + current); + } else { + die("Uncorrected hardware memory error (kernel recovery on but not match idx) in kernel-access\n", + regs, esr); + } + +#else die("Uncorrected hardware memory error in kernel-access\n", regs, esr); +#endif }
return 0; diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 0690679832d4..1fed918bb1e5 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -235,6 +235,23 @@ static inline void copy_user_highpage(struct page *to, struct page *from, kunmap_atomic(vfrom); }
+#ifdef CONFIG_UCE_KERNEL_RECOVERY +static inline int copy_user_highpage_cow(struct page *to, struct page *from, + unsigned long vaddr, struct vm_area_struct *vma) +{ + char *vfrom, *vto; + int ret; + + vfrom = kmap_atomic(from); + vto = kmap_atomic(to); + ret = copy_user_page_cow(vto, vfrom, vaddr, to); + kunmap_atomic(vto); + kunmap_atomic(vfrom); + + return ret; +} +#endif + #endif
#ifndef __HAVE_ARCH_COPY_HIGHPAGE diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 842c0a8cd0b8..06c738d394ab 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -538,6 +538,10 @@ extern int sysctl_panic_on_stackoverflow;
extern bool crash_kexec_post_notifiers;
+#ifdef CONFIG_UCE_KERNEL_RECOVERY +extern int kernel_access_sea_recovery; +#endif + /* * panic_cpu is used for synchronizing panic() and crash_kexec() execution. It * holds a CPU number which is executing panic() currently. A value of diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 91d4fe5b2770..1c4b3470fb5e 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -126,6 +126,7 @@ static int __maybe_unused neg_one = -1; static int zero; static int __maybe_unused one = 1; static int __maybe_unused two = 2; +static int __maybe_unused three = 3; static int __maybe_unused four = 4; static unsigned long zero_ul; static unsigned long one_ul = 1; @@ -1244,6 +1245,18 @@ static struct ctl_table kern_table[] = { .extra1 = &zero, .extra2 = &one, }, +#if defined(CONFIG_UCE_KERNEL_RECOVERY) + { + .procname = "uce_kernel_recovery", + .data = &kernel_access_sea_recovery, + .maxlen = sizeof(kernel_access_sea_recovery), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &three, + }, + +#endif { } };
diff --git a/mm/internal.h b/mm/internal.h index bfa97c43a3fc..b6d3b3660782 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -588,4 +588,9 @@ static inline bool is_migrate_highatomic_page(struct page *page)
void setup_zone_pageset(struct zone *zone); extern struct page *alloc_new_node_page(struct page *page, unsigned long node); + +#ifdef CONFIG_UCE_KERNEL_RECOVERY +extern int is_cow_kernel_recovery_enable(void); +#endif + #endif /* __MM_INTERNAL_H */ diff --git a/mm/memory.c b/mm/memory.c index 6acc5ae64dcd..508e7867378c 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2126,7 +2126,15 @@ static inline bool cow_user_page(struct page *dst, struct page *src, debug_dma_assert_idle(src);
if (likely(src)) { +#ifdef CONFIG_UCE_KERNEL_RECOVERY + if (is_cow_kernel_recovery_enable()) { + if (copy_user_highpage_cow(dst, src, addr, vma)) + return false; + } else + copy_user_highpage(dst, src, addr, vma); +#else copy_user_highpage(dst, src, addr, vma); +#endif return true; }