v1->v2: fix typo.
Tong Tiangen (2):
  uce: cow scenario support kernel recovery
  uce: pagecache reading scenario support kernel recovery
 arch/arm64/Kconfig                          |  10 +
 arch/arm64/include/asm/exception.h          |  12 ++
 arch/arm64/include/asm/page.h               |   8 +
 arch/arm64/include/asm/thread_info.h        |   8 +
 arch/arm64/include/asm/uaccess.h            |   8 +
 arch/arm64/kernel/ras.c                     |   7 +
 arch/arm64/lib/copy_page.S                  |  78 ++++++++
 arch/arm64/lib/copy_template_generic_read.S | 193 ++++++++++++++++++++
 arch/arm64/lib/copy_to_user.S               |  19 ++
 arch/arm64/mm/copypage.c                    |  14 ++
 arch/arm64/mm/fault.c                       | 160 ++++++++++++++++
 include/linux/highmem.h                     |  17 ++
 include/linux/kernel.h                      |   4 +
 include/linux/mm.h                          |   3 +
 include/linux/uio.h                         |   5 +
 kernel/sysctl.c                             |  13 ++
 lib/iov_iter.c                              | 132 +++++++++++++
 mm/filemap.c                                |   8 +-
 mm/internal.h                               |   6 +
 mm/memory-failure.c                         |   7 +
 mm/memory.c                                 |   8 +
 21 files changed, 719 insertions(+), 1 deletion(-)
 create mode 100644 arch/arm64/lib/copy_template_generic_read.S
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I44R9D
CVE: NA
---------------------------
During kernel COW processing, reading the source page can trigger a RAS exception (SEA). In this solution, we identify this scenario in the kernel's do_sea() handling, send a SIGBUS signal to the process that triggered the COW, and isolate the memory page, preventing a kernel panic.
At the same time, this feature can be switched on/off both via the kernel command line (uce_kernel_recovery) and via proc (/proc/sys/kernel/uce_kernel_recovery).
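As a rough sketch of how that control value is consumed (the helpers below mirror the ones added in this series; bit 0 gates the COW scenario and bit 1 gates the pagecache reading scenario added by the second patch, so the sysctl accepts 0-3):

    extern int kernel_access_sea_recovery;  /* backs /proc/sys/kernel/uce_kernel_recovery */

    int is_cow_kernel_recovery_enable(void)
    {
            return kernel_access_sea_recovery & 0x1;  /* bit 0: COW scenario */
    }

    int is_pagecache_reading_kernel_recovery_enable(void)
    {
            return kernel_access_sea_recovery & 0x2;  /* bit 1: pagecache reading scenario */
    }

So, for example, booting with uce_kernel_recovery=1 (or, once both patches are applied, 3) enables recovery, while writing 0 to the proc file disables it.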
We use dedicated functions to identify the scenarios that support kernel recovery, specifically copy_page_cow(). When a hardware error occurs in this function, we kill the process that triggered it instead of letting the kernel die.
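A condensed, illustrative sketch of the do_sea() change below (the wrapper function exists only for illustration and is not part of the patch): when the faulting PC falls inside a registered recovery window such as copy_page_cow(), the PC is redirected to that window's fallback and the task gets SIGBUS instead of a kernel die().

    static void sea_try_kernel_recovery(unsigned int esr, struct pt_regs *regs)
    {
            int idx = is_in_kernel_recovery(esr, regs);

            if (idx < 0 || idx >= UCE_KER_REC_NUM) {
                    die("Uncorrected hardware memory error in kernel-access\n", regs, esr);
                    return;
            }

            current->thread.fault_address = regs->pc;
            current->thread.fault_code = esr;
            /* resume at copy_page_cow_sea_fallback(), which makes copy_page_cow() return 1 */
            regs->pc = (unsigned long)reco_info[idx].fn;
            /* the real patch then uses arm64_force_sig_info() to deliver SIGBUS to current */
    }

The non-zero return from copy_page_cow() lets cow_user_page() abort the copy, so only the faulting process is affected.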
Signed-off-by: Tong Tiangen <tongtiangen@huawei.com>
Signed-off-by: Chen Huang <chenhuang5@huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Reviewed-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 arch/arm64/Kconfig                 |  10 ++
 arch/arm64/include/asm/exception.h |  11 +++
 arch/arm64/include/asm/page.h      |   8 ++
 arch/arm64/lib/copy_page.S         |  78 +++++++++++++++
 arch/arm64/mm/copypage.c           |  14 +++
 arch/arm64/mm/fault.c              | 153 +++++++++++++++++++++++++++++
 include/linux/highmem.h            |  17 ++++
 include/linux/kernel.h             |   4 +
 kernel/sysctl.c                    |  13 +++
 mm/internal.h                      |   5 +
 mm/memory.c                        |   8 ++
 11 files changed, 321 insertions(+)
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 99905ee64625..d57c836d8753 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1755,6 +1755,16 @@ config ARCH_HIBERNATION_HEADER config ARCH_SUSPEND_POSSIBLE def_bool y
+config UCE_KERNEL_RECOVERY + bool "uce kernel recovery from special scenario" + def_bool y + depends on ARM64_ERR_RECOV + help + With ARM v8.2 RAS Extension, SEA are usually triggered when memory errors + are consumed. In some cases, if the error address is in a user page there + is a chance to recover. Such as error occurs in COW and pagecache reading + scenario, we can isolate this page and killing process instead of die. + endmenu
menu "CPU Power Management" diff --git a/arch/arm64/include/asm/exception.h b/arch/arm64/include/asm/exception.h index bc30429d8e91..f938690c5ad6 100644 --- a/arch/arm64/include/asm/exception.h +++ b/arch/arm64/include/asm/exception.h @@ -41,4 +41,15 @@ static inline u32 disr_to_esr(u64 disr) return esr; }
+#ifdef CONFIG_UCE_KERNEL_RECOVERY +struct uce_kernel_recovery_info { + int (*fn)(void); + const char *name; + unsigned long addr; + unsigned long size; +}; + +extern int copy_page_cow_sea_fallback(void); +#endif + #endif /* __ASM_EXCEPTION_H */ diff --git a/arch/arm64/include/asm/page.h b/arch/arm64/include/asm/page.h index 60d02c81a3a2..725bff5bc0ad 100644 --- a/arch/arm64/include/asm/page.h +++ b/arch/arm64/include/asm/page.h @@ -35,6 +35,14 @@ extern void clear_page(void *to); #define clear_user_page(addr,vaddr,pg) __cpu_clear_user_page(addr, vaddr) #define copy_user_page(to,from,vaddr,pg) __cpu_copy_user_page(to, from, vaddr)
+#ifdef CONFIG_UCE_KERNEL_RECOVERY +extern int copy_page_cow(void *to, const void *from); +extern int __cpu_copy_user_page_cow(void *to, const void *from, + unsigned long user); +#define copy_user_page_cow(to, from, vaddr, pg) \ + __cpu_copy_user_page_cow(to, from, vaddr) +#endif + typedef struct page *pgtable_t;
#ifdef CONFIG_HAVE_ARCH_PFN_VALID diff --git a/arch/arm64/lib/copy_page.S b/arch/arm64/lib/copy_page.S index 076c43715e64..649cf4eb96bf 100644 --- a/arch/arm64/lib/copy_page.S +++ b/arch/arm64/lib/copy_page.S @@ -87,3 +87,81 @@ alternative_else_nop_endif
ret ENDPROC(copy_page) + +#ifdef CONFIG_UCE_KERNEL_RECOVERY +#The difference between copy_page_cow and copy_page: +# 1) copy_page_cow adds the recovery path of sea fault(copy_page_cow_sea_fallback). +# 2) copy_page_cow with return value: 0 - copy success 1 - copy fail. +/* + * COW copy a page from src to dest (both are page aligned) + * + * Parameters: + * x0 - dest + * x1 - src + */ +ENTRY(copy_page_cow) +alternative_if ARM64_HAS_NO_HW_PREFETCH + // Prefetch three cache lines ahead. + prfm pldl1strm, [x1, #128] + prfm pldl1strm, [x1, #256] + prfm pldl1strm, [x1, #384] +alternative_else_nop_endif + + ldp x2, x3, [x1] + ldp x4, x5, [x1, #16] + ldp x6, x7, [x1, #32] + ldp x8, x9, [x1, #48] + ldp x10, x11, [x1, #64] + ldp x12, x13, [x1, #80] + ldp x14, x15, [x1, #96] + ldp x16, x17, [x1, #112] + + mov x18, #(PAGE_SIZE - 128) + add x1, x1, #128 +1: + subs x18, x18, #128 + +alternative_if ARM64_HAS_NO_HW_PREFETCH + prfm pldl1strm, [x1, #384] +alternative_else_nop_endif + + stnp x2, x3, [x0] + ldp x2, x3, [x1] + stnp x4, x5, [x0, #16] + ldp x4, x5, [x1, #16] + stnp x6, x7, [x0, #32] + ldp x6, x7, [x1, #32] + stnp x8, x9, [x0, #48] + ldp x8, x9, [x1, #48] + stnp x10, x11, [x0, #64] + ldp x10, x11, [x1, #64] + stnp x12, x13, [x0, #80] + ldp x12, x13, [x1, #80] + stnp x14, x15, [x0, #96] + ldp x14, x15, [x1, #96] + stnp x16, x17, [x0, #112] + ldp x16, x17, [x1, #112] + + add x0, x0, #128 + add x1, x1, #128 + + b.gt 1b + + stnp x2, x3, [x0] + stnp x4, x5, [x0, #16] + stnp x6, x7, [x0, #32] + stnp x8, x9, [x0, #48] + stnp x10, x11, [x0, #64] + stnp x12, x13, [x0, #80] + stnp x14, x15, [x0, #96] + stnp x16, x17, [x0, #112] + + mov x0, #0 + ret + + .global copy_page_cow_sea_fallback +copy_page_cow_sea_fallback: + mov x0, #1 + ret +ENDPROC(copy_page_cow) +#endif diff --git a/arch/arm64/mm/copypage.c b/arch/arm64/mm/copypage.c index 22e4cb4d6f53..506d166d1dec 100644 --- a/arch/arm64/mm/copypage.c +++ b/arch/arm64/mm/copypage.c @@ -30,6 +30,20 @@ void __cpu_copy_user_page(void *kto, const void *kfrom, unsigned long vaddr) } EXPORT_SYMBOL_GPL(__cpu_copy_user_page);
+#ifdef CONFIG_UCE_KERNEL_RECOVERY +int __cpu_copy_user_page_cow(void *kto, const void *kfrom, unsigned long vaddr) +{ + int ret; + + struct page *page = virt_to_page(kto); + ret = copy_page_cow(kto, kfrom); + flush_dcache_page(page); + + return ret; +} +EXPORT_SYMBOL_GPL(__cpu_copy_user_page_cow); +#endif + void __cpu_clear_user_page(void *kaddr, unsigned long vaddr) { clear_page(kaddr); diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 6cd448d9835c..039802b047af 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -656,6 +656,126 @@ static int do_bad(unsigned long addr, unsigned int esr, struct pt_regs *regs) return 1; /* "fault" */ }
+#ifdef CONFIG_UCE_KERNEL_RECOVERY +int kernel_access_sea_recovery; + +#define UCE_KER_REC_NUM ARRAY_SIZE(reco_info) +static struct uce_kernel_recovery_info reco_info[] = { + {copy_page_cow_sea_fallback, "copy_page_cow", (unsigned long)copy_page_cow, 0}, +}; + +static int __init kernel_access_sea_recovery_init(void) +{ + unsigned long addr, size, offset; + unsigned int i; + + for (i = 0; i < UCE_KER_REC_NUM; i++) { + addr = reco_info[i].addr; + if (!kallsyms_lookup_size_offset(addr, &size, &offset)) { + pr_info("UCE: symbol %s lookup addr fail.\n", + reco_info[i].name); + size = 0; + } + + reco_info[i].size = size; + } + + return 1; +} +fs_initcall(kernel_access_sea_recovery_init); + +static int __init enable_kernel_access_sea_recovery(char *str) +{ + int max = (1 << UCE_KER_REC_NUM) - 1; + int val; + + if (kstrtoint(str, 0, &val)) + return -EINVAL; + + if (val < 0 || val > max) { + pr_info("UCE: invalid uce_kernel_recovery value %d", val); + return -EINVAL; + } + + kernel_access_sea_recovery = val; + + return 1; +} +__setup("uce_kernel_recovery=", enable_kernel_access_sea_recovery); + +int is_cow_kernel_recovery_enable(void) +{ + return kernel_access_sea_recovery & 0x1; +} + +/* + * what is kernel recovery? + * If the process's private data is accessed in the kernel mode to trigger + * special sea fault, it can controlled by killing the process and isolating + * the failure pages instead of die. + */ +static int is_in_kernel_recovery(unsigned int esr, struct pt_regs *regs) +{ + /* + * target insn: ldp-pre, ldp-post, ldp-offset, + * ldr-64bit-pre/pose, ldr-32bit-pre/post, ldrb-pre/post, ldrh-pre/post + */ + u32 target_insn[] = {0xa8c, 0xa9c, 0xa94, 0xf84, 0x784, 0x384, 0xb84}; + void *pc = (void *)instruction_pointer(regs); + struct uce_kernel_recovery_info *info; + bool insn_match = false; + u32 insn; + int i; + + pr_emerg("UCE: %s-%d, kernel recovery: 0x%x, esr: 0x%08x -- %s, %pS\n", + current->comm, current->pid, kernel_access_sea_recovery, esr, + esr_get_class_string(esr), pc); + + if (aarch64_insn_read((void *)pc, &insn)) { + pr_emerg("UCE: insn read fail.\n"); + return -EFAULT; + } + + /* + * We process special ESR: + * EC : 0b100101 Data Abort taken without a change in Exception level. + * DFSC : 0b010000 Synchronous External abort, not on translation table + * walk or hardware update of translation table. + * eg: 0x96000610 + */ + if (ESR_ELx_EC(esr) != ESR_ELx_EC_DABT_CUR || + (esr & ESR_ELx_FSC) != ESR_ELx_FSC_EXTABT) { + pr_emerg("UCE: esr not match.\n"); + return -EINVAL; + } + + insn = (insn >> 20) & 0xffc; + for (i = 0; i < ARRAY_SIZE(target_insn); i++) { + if (insn == target_insn[i]) { + insn_match = true; + break; + } + } + + if (!insn_match) { + pr_emerg("UCE: insn 0x%x is not match.\n", insn); + return -EINVAL; + } + + for (i = 0; i < UCE_KER_REC_NUM; i++) { + info = &reco_info[i]; + if (info->fn && regs->pc >= info->addr && + regs->pc < (info->addr + info->size)) { + pr_emerg("UCE: total match %s success.\n", info->name); + return i; + } + } + + pr_emerg("UCE: symbol is not match.\n"); + return -EINVAL; +} +#endif + static int do_sea(unsigned long addr, unsigned int esr, struct pt_regs *regs) { struct siginfo info; @@ -693,8 +813,41 @@ static int do_sea(unsigned long addr, unsigned int esr, struct pt_regs *regs)
arm64_notify_die(inf->name, regs, &info, esr); } else { +#ifdef CONFIG_UCE_KERNEL_RECOVERY + int idx; + + if (!current->mm || !kernel_access_sea_recovery) { + pr_emerg("UCE: kernel recovery %d, %s-%d is %s-thread.\n", + kernel_access_sea_recovery, + current->comm, current->pid, + (current->mm) ? "user" : "kernel"); + die("Uncorrected hardware memory error in kernel-access\n", + regs, esr); + } + + idx = is_in_kernel_recovery(esr, regs); + if (idx >= 0 && idx < UCE_KER_REC_NUM) { + clear_siginfo(&info); + info.si_signo = inf->sig; + info.si_errno = 0; + info.si_code = inf->code; + info.si_addr = NULL; + + current->thread.fault_address = regs->pc; + current->thread.fault_code = esr; + regs->pc = (unsigned long)reco_info[idx].fn; + arm64_force_sig_info(&info, + "Uncorrected hardware memory use with kernel recovery in kernel-access\n", + current); + } else { + die("Uncorrected hardware memory error (kernel recovery on but not match idx) in kernel-access\n", + regs, esr); + } + +#else die("Uncorrected hardware memory error in kernel-access\n", regs, esr); +#endif }
return 0; diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 0690679832d4..1fed918bb1e5 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -235,6 +235,23 @@ static inline void copy_user_highpage(struct page *to, struct page *from, kunmap_atomic(vfrom); }
+#ifdef CONFIG_UCE_KERNEL_RECOVERY +static inline int copy_user_highpage_cow(struct page *to, struct page *from, + unsigned long vaddr, struct vm_area_struct *vma) +{ + char *vfrom, *vto; + int ret; + + vfrom = kmap_atomic(from); + vto = kmap_atomic(to); + ret = copy_user_page_cow(vto, vfrom, vaddr, to); + kunmap_atomic(vto); + kunmap_atomic(vfrom); + + return ret; +} +#endif + #endif
#ifndef __HAVE_ARCH_COPY_HIGHPAGE diff --git a/include/linux/kernel.h b/include/linux/kernel.h index f24fd248b3ac..122f77b8b0ea 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -531,6 +531,10 @@ extern int sysctl_panic_on_stackoverflow;
extern bool crash_kexec_post_notifiers;
+#ifdef CONFIG_UCE_KERNEL_RECOVERY +extern int kernel_access_sea_recovery; +#endif + /* * panic_cpu is used for synchronizing panic() and crash_kexec() execution. It * holds a CPU number which is executing panic() currently. A value of diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 116fe8a13236..53ddcd74b5c3 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -127,6 +127,7 @@ static int __maybe_unused neg_one = -1; static int zero; static int __maybe_unused one = 1; static int __maybe_unused two = 2; +static int __maybe_unused three = 3; static int __maybe_unused four = 4; static unsigned long zero_ul; static unsigned long one_ul = 1; @@ -1254,6 +1255,18 @@ static struct ctl_table kern_table[] = { .extra1 = &zero, .extra2 = &one, }, +#if defined(CONFIG_UCE_KERNEL_RECOVERY) + { + .procname = "uce_kernel_recovery", + .data = &kernel_access_sea_recovery, + .maxlen = sizeof(kernel_access_sea_recovery), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &three, + }, + +#endif { } };
diff --git a/mm/internal.h b/mm/internal.h index bfa97c43a3fc..b6d3b3660782 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -588,4 +588,9 @@ static inline bool is_migrate_highatomic_page(struct page *page)
void setup_zone_pageset(struct zone *zone); extern struct page *alloc_new_node_page(struct page *page, unsigned long node); + +#ifdef CONFIG_UCE_KERNEL_RECOVERY +extern int is_cow_kernel_recovery_enable(void); +#endif + #endif /* __MM_INTERNAL_H */ diff --git a/mm/memory.c b/mm/memory.c index d9f4e7dd17a6..174b0baf197e 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2212,7 +2212,15 @@ static inline bool cow_user_page(struct page *dst, struct page *src, debug_dma_assert_idle(src);
if (likely(src)) { +#ifdef CONFIG_UCE_KERNEL_RECOVERY + if (is_cow_kernel_recovery_enable()) { + if (copy_user_highpage_cow(dst, src, addr, vma)) + return false; + } else + copy_user_highpage(dst, src, addr, vma); +#else copy_user_highpage(dst, src, addr, vma); +#endif return true; }
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I44REB
CVE: NA
---------------------------
Extend kernel recovery to the pagecache reading scenario triggered by user-mode file access.
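A minimal sketch of the read-side hook (the helper name is illustrative; the real change is open-coded in the mm/filemap.c hunk below): the buffered read path only switches to the recoverable copy routine when this scenario is enabled, otherwise behaviour is unchanged.

    static inline size_t filemap_copy_page_to_iter(struct page *page, size_t offset,
                                                   size_t nr, struct iov_iter *iter)
    {
            if (is_pagecache_reading_kernel_recovery_enable())
                    /* uses __arch_copy_to_user_generic_read(), which can fall back on SEA */
                    return copy_page_to_iter_generic_read(page, offset, nr, iter);
            return copy_page_to_iter(page, offset, nr, iter);
    }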
This patch also solves the problem of a process stuck in the D state waiting on PG_writeback in memory_failure() (the background writeback thread times out while writing the page with the hardware error back to disk). In this case, we panic the system to avoid the long service interruption that would be caused by the process staying in the D state.
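A sketch of that policy (the helper is illustrative; the real check is open-coded in memory_failure() below): a failure reported from the recovery path that hits a page still under writeback is turned into a panic rather than an indefinite wait.

    static void uce_check_writeback(struct page *p, unsigned long pfn, int flags)
    {
            if ((flags & MF_UCE_KERNEL_RECOVERY) && PageWriteback(p))
                    panic("UCE in memory failure while Page writeback, panic on page %lx, flags %x",
                          pfn, flags);
    }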
Signed-off-by: Tong Tiangen <tongtiangen@huawei.com>
Signed-off-by: Chen Huang <chenhuang5@huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Reviewed-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 arch/arm64/include/asm/exception.h          |   1 +
 arch/arm64/include/asm/thread_info.h        |   8 +
 arch/arm64/include/asm/uaccess.h            |   8 +
 arch/arm64/kernel/ras.c                     |   7 +
 arch/arm64/lib/copy_template_generic_read.S | 193 ++++++++++++++++++++
 arch/arm64/lib/copy_to_user.S               |  19 ++
 arch/arm64/mm/fault.c                       |   7 +
 include/linux/mm.h                          |   3 +
 include/linux/uio.h                         |   5 +
 lib/iov_iter.c                              | 132 +++++++++++++
 mm/filemap.c                                |   8 +-
 mm/internal.h                               |   1 +
 mm/memory-failure.c                         |   7 +
 13 files changed, 398 insertions(+), 1 deletion(-)
 create mode 100644 arch/arm64/lib/copy_template_generic_read.S
diff --git a/arch/arm64/include/asm/exception.h b/arch/arm64/include/asm/exception.h index f938690c5ad6..559d86ad9e5d 100644 --- a/arch/arm64/include/asm/exception.h +++ b/arch/arm64/include/asm/exception.h @@ -50,6 +50,7 @@ struct uce_kernel_recovery_info { };
extern int copy_page_cow_sea_fallback(void); +extern int copy_generic_read_sea_fallback(void); #endif
#endif /* __ASM_EXCEPTION_H */ diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h index 802a7d10ab82..b91fb7d6c18d 100644 --- a/arch/arm64/include/asm/thread_info.h +++ b/arch/arm64/include/asm/thread_info.h @@ -93,6 +93,10 @@ void arch_release_task_struct(struct task_struct *tsk); #define TIF_SVE_VL_INHERIT 24 /* Inherit sve_vl_onexec across exec */ #define TIF_SSBD 25 /* Wants SSB mitigation */
+#ifdef CONFIG_UCE_KERNEL_RECOVERY +#define TIF_UCE_KERNEL_RECOVERY 26 +#endif + #define _TIF_SIGPENDING (1 << TIF_SIGPENDING) #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) @@ -110,6 +114,10 @@ void arch_release_task_struct(struct task_struct *tsk); #define _TIF_SVE (1 << TIF_SVE) #define _TIF_POLLING_NRFLAG (1 << TIF_POLLING_NRFLAG)
+#ifdef CONFIG_UCE_KERNEL_RECOVERY +#define _TIF_UCE_KERNEL_RECOVERY (1 << TIF_UCE_KERNEL_RECOVERY) +#endif + #define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \ _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE | \ _TIF_UPROBE | _TIF_FSCHECK | _TIF_SEA_NOTIFY) diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h index 35482c9076db..52d3c8c64849 100644 --- a/arch/arm64/include/asm/uaccess.h +++ b/arch/arm64/include/asm/uaccess.h @@ -407,6 +407,14 @@ extern unsigned long __must_check __arch_copy_from_user(void *to, const void __u __arch_copy_from_user((to), __uaccess_mask_ptr(from), (n)); \ })
+#ifdef CONFIG_UCE_KERNEL_RECOVERY +extern unsigned long __must_check __arch_copy_to_user_generic_read(void __user *to, const void *from, unsigned long n); +#define raw_copy_to_user_generic_read(to, from, n) \ +({ \ + __arch_copy_to_user_generic_read(__uaccess_mask_ptr(to), (from), (n)); \ +}) +#endif + extern unsigned long __must_check __arch_copy_to_user(void __user *to, const void *from, unsigned long n); #define raw_copy_to_user(to, from, n) \ ({ \ diff --git a/arch/arm64/kernel/ras.c b/arch/arm64/kernel/ras.c index 181e609e1cd4..b57041f9e6e7 100644 --- a/arch/arm64/kernel/ras.c +++ b/arch/arm64/kernel/ras.c @@ -92,6 +92,13 @@ void sea_notify_process(void) if (!si) panic("Lost physical address for consumed uncorrectable error");
+#ifdef CONFIG_UCE_KERNEL_RECOVERY + if (test_thread_flag(TIF_UCE_KERNEL_RECOVERY)) { + flags |= MF_UCE_KERNEL_RECOVERY; + clear_thread_flag(TIF_UCE_KERNEL_RECOVERY); + } +#endif + clear_thread_flag(TIF_SEA_NOTIFY); do { pfn = si->paddr >> PAGE_SHIFT; diff --git a/arch/arm64/lib/copy_template_generic_read.S b/arch/arm64/lib/copy_template_generic_read.S new file mode 100644 index 000000000000..287193527260 --- /dev/null +++ b/arch/arm64/lib/copy_template_generic_read.S @@ -0,0 +1,193 @@ +/* + * Copyright (C) 2013 ARM Ltd. + * Copyright (C) 2013 Linaro. + * + * This code is based on glibc cortex strings work originally authored by Linaro + * and re-licensed under GPLv2 for the Linux kernel. The original code can + * be found @ + * + * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/ + * files/head:/src/aarch64/ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see http://www.gnu.org/licenses/. + */ + + +/* + * Copy a buffer from src to dest (alignment handled by the hardware) + * + * Parameters: + * x0 - dest + * x1 - src + * x2 - n + * Returns: + * x0 - dest + */ +dstin .req x0 +src .req x1 +count .req x2 +tmp1 .req x3 +tmp1w .req w3 +tmp2 .req x4 +tmp2w .req w4 +dst .req x6 + +A_l .req x7 +A_h .req x8 +B_l .req x9 +B_h .req x10 +C_l .req x11 +C_h .req x12 +D_l .req x13 +D_h .req x14 + + mov dst, dstin + cmp count, #16 + /*When memory length is less than 16, the accessed are not aligned.*/ + b.lo .Ltiny15_gr + + neg tmp2, src + ands tmp2, tmp2, #15/* Bytes to reach alignment. */ + b.eq .LSrcAligned_gr + sub count, count, tmp2 + /* + * Copy the leading memory data from src to dst in an increasing + * address order.By this way,the risk of overwriting the source + * memory data is eliminated when the distance between src and + * dst is less than 16. The memory accesses here are alignment. + */ + tbz tmp2, #0, 1f + ldrb1 tmp1w, src, #1 + strb1 tmp1w, dst, #1 +1: + tbz tmp2, #1, 2f + ldrh1 tmp1w, src, #2 + strh1 tmp1w, dst, #2 +2: + tbz tmp2, #2, 3f + ldr1 tmp1w, src, #4 + str1 tmp1w, dst, #4 +3: + tbz tmp2, #3, .LSrcAligned_gr + ldr1 tmp1, src, #8 + str1 tmp1, dst, #8 + +.LSrcAligned_gr: + cmp count, #64 + b.ge .Lcpy_over64_gr + /* + * Deal with small copies quickly by dropping straight into the + * exit block. + */ +.Ltail63_gr: + /* + * Copy up to 48 bytes of data. At this point we only need the + * bottom 6 bits of count to be accurate. + */ + ands tmp1, count, #0x30 + b.eq .Ltiny15_gr + cmp tmp1w, #0x20 + b.eq 1f + b.lt 2f + ldp1 A_l, A_h, src, #16 + stp1 A_l, A_h, dst, #16 +1: + ldp1 A_l, A_h, src, #16 + stp1 A_l, A_h, dst, #16 +2: + ldp1 A_l, A_h, src, #16 + stp1 A_l, A_h, dst, #16 +.Ltiny15_gr: + /* + * Prefer to break one ldp/stp into several load/store to access + * memory in an increasing address order,rather than to load/store 16 + * bytes from (src-16) to (dst-16) and to backward the src to aligned + * address,which way is used in original cortex memcpy. 
If keeping + * the original memcpy process here, memmove need to satisfy the + * precondition that src address is at least 16 bytes bigger than dst + * address,otherwise some source data will be overwritten when memove + * call memcpy directly. To make memmove simpler and decouple the + * memcpy's dependency on memmove, withdrew the original process. + */ + tbz count, #3, 1f + ldr1 tmp1, src, #8 + str1 tmp1, dst, #8 +1: + tbz count, #2, 2f + ldr1 tmp1w, src, #4 + str1 tmp1w, dst, #4 +2: + tbz count, #1, 3f + ldrh1 tmp1w, src, #2 + strh1 tmp1w, dst, #2 +3: + tbz count, #0, .Lexitfunc_gr + ldrb1 tmp1w, src, #1 + strb1 tmp1w, dst, #1 + + b .Lexitfunc_gr + +.Lcpy_over64_gr: + subs count, count, #128 + b.ge .Lcpy_body_large_gr + /* + * Less than 128 bytes to copy, so handle 64 here and then jump + * to the tail. + */ + ldp1 A_l, A_h, src, #16 + stp1 A_l, A_h, dst, #16 + ldp1 B_l, B_h, src, #16 + ldp1 C_l, C_h, src, #16 + stp1 B_l, B_h, dst, #16 + stp1 C_l, C_h, dst, #16 + ldp1 D_l, D_h, src, #16 + stp1 D_l, D_h, dst, #16 + + tst count, #0x3f + b.ne .Ltail63_gr + b .Lexitfunc_gr + + /* + * Critical loop. Start at a new cache line boundary. Assuming + * 64 bytes per line this ensures the entire loop is in one line. + */ + .p2align L1_CACHE_SHIFT +.Lcpy_body_large_gr: + /* pre-get 64 bytes data. */ + ldp1 A_l, A_h, src, #16 + ldp1 B_l, B_h, src, #16 + ldp1 C_l, C_h, src, #16 + ldp1 D_l, D_h, src, #16 +1: + /* + * interlace the load of next 64 bytes data block with store of the last + * loaded 64 bytes data. + */ + stp1 A_l, A_h, dst, #16 + ldp1 A_l, A_h, src, #16 + stp1 B_l, B_h, dst, #16 + ldp1 B_l, B_h, src, #16 + stp1 C_l, C_h, dst, #16 + ldp1 C_l, C_h, src, #16 + stp1 D_l, D_h, dst, #16 + ldp1 D_l, D_h, src, #16 + subs count, count, #64 + b.ge 1b + stp1 A_l, A_h, dst, #16 + stp1 B_l, B_h, dst, #16 + stp1 C_l, C_h, dst, #16 + stp1 D_l, D_h, dst, #16 + + tst count, #0x3f + b.ne .Ltail63_gr +.Lexitfunc_gr: diff --git a/arch/arm64/lib/copy_to_user.S b/arch/arm64/lib/copy_to_user.S index 6b99b939c50f..769133236f4a 100644 --- a/arch/arm64/lib/copy_to_user.S +++ b/arch/arm64/lib/copy_to_user.S @@ -71,6 +71,25 @@ ENTRY(__arch_copy_to_user) ret ENDPROC(__arch_copy_to_user)
+#ifdef CONFIG_UCE_KERNEL_RECOVERY +ENTRY(__arch_copy_to_user_generic_read) + uaccess_enable_not_uao x3, x4, x5 + add end, x0, x2 +#include "copy_template_generic_read.S" + uaccess_disable_not_uao x3, x4 + + mov x0, #0 + ret + + .global copy_generic_read_sea_fallback +copy_generic_read_sea_fallback: + uaccess_disable_not_uao x3, x4 + + mov x0, #-1 + ret +ENDPROC(__arch_copy_to_user_generic_read) +#endif + .section .fixup,"ax" .align 2 9998: sub x0, end, dst // bytes not copied diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 039802b047af..08040fe73199 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -662,6 +662,7 @@ int kernel_access_sea_recovery; #define UCE_KER_REC_NUM ARRAY_SIZE(reco_info) static struct uce_kernel_recovery_info reco_info[] = { {copy_page_cow_sea_fallback, "copy_page_cow", (unsigned long)copy_page_cow, 0}, + {copy_generic_read_sea_fallback, "__arch_copy_to_user_generic_read", (unsigned long)__arch_copy_to_user_generic_read, 0}, };
static int __init kernel_access_sea_recovery_init(void) @@ -708,6 +709,11 @@ int is_cow_kernel_recovery_enable(void) return kernel_access_sea_recovery & 0x1; }
+int is_pagecache_reading_kernel_recovery_enable(void) +{ + return kernel_access_sea_recovery & 0x2; +} + /* * what is kernel recovery? * If the process's private data is accessed in the kernel mode to trigger @@ -827,6 +833,7 @@ static int do_sea(unsigned long addr, unsigned int esr, struct pt_regs *regs)
idx = is_in_kernel_recovery(esr, regs); if (idx >= 0 && idx < UCE_KER_REC_NUM) { + set_thread_flag(TIF_UCE_KERNEL_RECOVERY); clear_siginfo(&info); info.si_signo = inf->sig; info.si_errno = 0; diff --git a/include/linux/mm.h b/include/linux/mm.h index 8b5f9b8fb1a3..5bd84fa48a44 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2837,6 +2837,9 @@ enum mf_flags { MF_ACTION_REQUIRED = 1 << 1, MF_MUST_KILL = 1 << 2, MF_SOFT_OFFLINE = 1 << 3, +#ifdef CONFIG_UCE_KERNEL_RECOVERY + MF_UCE_KERNEL_RECOVERY = 1 << 4 +#endif }; extern int memory_failure(unsigned long pfn, int flags); extern void memory_failure_queue(unsigned long pfn, int flags); diff --git a/include/linux/uio.h b/include/linux/uio.h index 422b1c01ee0d..13c48b13f21e 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -97,6 +97,11 @@ bool _copy_from_iter_full(void *addr, size_t bytes, struct iov_iter *i); size_t _copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i); bool _copy_from_iter_full_nocache(void *addr, size_t bytes, struct iov_iter *i);
+#ifdef CONFIG_UCE_KERNEL_RECOVERY +size_t copy_page_to_iter_generic_read(struct page *page, size_t offset, + size_t bytes, struct iov_iter *i); +#endif + static __always_inline __must_check size_t copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i) { diff --git a/lib/iov_iter.c b/lib/iov_iter.c index dd8385a20325..66c0a4afbc74 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -130,6 +130,17 @@ } \ }
+#ifdef CONFIG_UCE_KERNEL_RECOVERY +static int copyout_generic_read(void __user *to, const void *from, size_t n) +{ + if (access_ok(to, n)) { + kasan_check_read(from, n); + n = raw_copy_to_user_generic_read(to, from, n); + } + return n; +} +#endif + static int copyout(void __user *to, const void *from, size_t n) { if (access_ok(to, n)) { @@ -148,6 +159,108 @@ static int copyin(void *to, const void __user *from, size_t n) return n; }
+#ifdef CONFIG_UCE_KERNEL_RECOVERY +static size_t copy_page_to_iter_iovec_generic_read(struct page *page, size_t offset, size_t bytes, + struct iov_iter *i) +{ + size_t skip, copy, left, wanted; + const struct iovec *iov; + char __user *buf; + void *kaddr, *from; + + if (unlikely(bytes > i->count)) + bytes = i->count; + + if (unlikely(!bytes)) + return 0; + + might_fault(); + wanted = bytes; + iov = i->iov; + skip = i->iov_offset; + buf = iov->iov_base + skip; + copy = min(bytes, iov->iov_len - skip); + + if (IS_ENABLED(CONFIG_HIGHMEM) && !fault_in_pages_writeable(buf, copy)) { + kaddr = kmap_atomic(page); + from = kaddr + offset; + + /* first chunk, usually the only one */ + left = copyout_generic_read(buf, from, copy); + if (left == -1) { + kunmap_atomic(kaddr); + goto done; + } + copy -= left; + skip += copy; + from += copy; + bytes -= copy; + + while (unlikely(!left && bytes)) { + iov++; + buf = iov->iov_base; + copy = min(bytes, iov->iov_len); + left = copyout_generic_read(buf, from, copy); + if (left == -1) { + kunmap_atomic(kaddr); + goto done; + } + copy -= left; + skip = copy; + from += copy; + bytes -= copy; + } + if (likely(!bytes)) { + kunmap_atomic(kaddr); + goto done; + } + offset = from - kaddr; + buf += copy; + kunmap_atomic(kaddr); + copy = min(bytes, iov->iov_len - skip); + } + /* Too bad - revert to non-atomic kmap */ + + kaddr = kmap(page); + from = kaddr + offset; + left = copyout_generic_read(buf, from, copy); + if (left == -1) { + kunmap(page); + goto done; + } + copy -= left; + skip += copy; + from += copy; + bytes -= copy; + while (unlikely(!left && bytes)) { + iov++; + buf = iov->iov_base; + copy = min(bytes, iov->iov_len); + left = copyout_generic_read(buf, from, copy); + if (left == -1) { + kunmap(page); + goto done; + } + copy -= left; + skip = copy; + from += copy; + bytes -= copy; + } + kunmap(page); + +done: + if (skip == iov->iov_len) { + iov++; + skip = 0; + } + i->count -= wanted - bytes; + i->nr_segs -= iov - i->iov; + i->iov = iov; + i->iov_offset = skip; + return wanted - bytes; +} +#endif + static size_t copy_page_to_iter_iovec(struct page *page, size_t offset, size_t bytes, struct iov_iter *i) { @@ -839,6 +952,25 @@ static inline bool page_copy_sane(struct page *page, size_t offset, size_t n) return false; }
+#ifdef CONFIG_UCE_KERNEL_RECOVERY +size_t copy_page_to_iter_generic_read(struct page *page, size_t offset, size_t bytes, + struct iov_iter *i) +{ + if (unlikely(!page_copy_sane(page, offset, bytes))) + return 0; + if (i->type & (ITER_BVEC|ITER_KVEC)) { + void *kaddr = kmap_atomic(page); + size_t wanted = copy_to_iter(kaddr + offset, bytes, i); + kunmap_atomic(kaddr); + return wanted; + } else if (likely(!(i->type & ITER_PIPE))) + return copy_page_to_iter_iovec_generic_read(page, offset, bytes, i); + else + return copy_page_to_iter_pipe(page, offset, bytes, i); +} +EXPORT_SYMBOL(copy_page_to_iter_generic_read); +#endif + size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes, struct iov_iter *i) { diff --git a/mm/filemap.c b/mm/filemap.c index 6a212192e1b7..5c33c40e8019 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2290,8 +2290,14 @@ static ssize_t generic_file_buffered_read(struct kiocb *iocb, * Ok, we have the page, and it's up-to-date, so * now we can copy it to user space... */ - +#ifdef CONFIG_UCE_KERNEL_RECOVERY + if (is_pagecache_reading_kernel_recovery_enable()) + ret = copy_page_to_iter_generic_read(page, offset, nr, iter); + else + ret = copy_page_to_iter(page, offset, nr, iter); +#else ret = copy_page_to_iter(page, offset, nr, iter); +#endif offset += ret; index += offset >> PAGE_SHIFT; offset &= ~PAGE_MASK; diff --git a/mm/internal.h b/mm/internal.h index b6d3b3660782..3bb7ca86e84e 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -591,6 +591,7 @@ extern struct page *alloc_new_node_page(struct page *page, unsigned long node);
#ifdef CONFIG_UCE_KERNEL_RECOVERY extern int is_cow_kernel_recovery_enable(void); +extern int is_pagecache_reading_kernel_recovery_enable(void); #endif
#endif /* __MM_INTERNAL_H */ diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 145262227bb0..1316e86e0a16 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1395,6 +1395,13 @@ int memory_failure(unsigned long pfn, int flags) if (!PageTransTail(p) && !PageLRU(p) && !PageWriteback(p)) goto identify_page_state;
+#ifdef CONFIG_UCE_KERNEL_RECOVERY + if ((flags & MF_UCE_KERNEL_RECOVERY) && PageWriteback(p)) { + panic("UCE in memory failure while Page writeback, panic on page %lx, flags %x", + pfn, flags); + } +#endif + /* * It's very difficult to mess with pages currently under IO * and in many cases impossible, so we just avoid it here.
Applied.
On 2021/8/12 11:34, Tong Tiangen wrote:
> v1->v2: fix typo.
>
> Tong Tiangen (2):
>   uce: cow scenario support kernel recovery
>   uce: pagecache reading scenario support kernel recovery
>
>  arch/arm64/Kconfig                          |  10 +
>  arch/arm64/include/asm/exception.h          |  12 ++
>  arch/arm64/include/asm/page.h               |   8 +
>  arch/arm64/include/asm/thread_info.h        |   8 +
>  arch/arm64/include/asm/uaccess.h            |   8 +
>  arch/arm64/kernel/ras.c                     |   7 +
>  arch/arm64/lib/copy_page.S                  |  78 ++++++++
>  arch/arm64/lib/copy_template_generic_read.S | 193 ++++++++++++++++++++
>  arch/arm64/lib/copy_to_user.S               |  19 ++
>  arch/arm64/mm/copypage.c                    |  14 ++
>  arch/arm64/mm/fault.c                       | 160 ++++++++++++++++
>  include/linux/highmem.h                     |  17 ++
>  include/linux/kernel.h                      |   4 +
>  include/linux/mm.h                          |   3 +
>  include/linux/uio.h                         |   5 +
>  kernel/sysctl.c                             |  13 ++
>  lib/iov_iter.c                              | 132 +++++++++++++
>  mm/filemap.c                                |   8 +-
>  mm/internal.h                               |   6 +
>  mm/memory-failure.c                         |   7 +
>  mm/memory.c                                 |   8 +
>  21 files changed, 719 insertions(+), 1 deletion(-)
>  create mode 100644 arch/arm64/lib/copy_template_generic_read.S