hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I44REB
CVE: NA
---------------------------
Support kernel recovery for the pagecache-reading scenario: a user-mode file access (such as a read() system call) that makes the kernel copy page cache data to user space. If that copy consumes an uncorrectable error (UCE), the access is now handled by the kernel recovery path instead of being treated as an unrecoverable kernel-mode access.
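For reference, the core of the read-path wiring is sketched below (condensed from the mm/filemap.c hunk in this patch; surrounding context omitted):

    /* generic_file_buffered_read(): copy an up-to-date pagecache page to user space */
    #ifdef CONFIG_UCE_KERNEL_RECOVERY
            if (is_pagecache_reading_kernel_recovery_enable())
                    /* recovery-aware copy: a consumed UCE takes the fallback path added here */
                    ret = copy_page_to_iter_generic_read(page, offset, nr, iter);
            else
                    ret = copy_page_to_iter(page, offset, nr, iter);
    #else
            ret = copy_page_to_iter(page, offset, nr, iter);
    #endif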
This patch also addresses processes getting stuck in the D state while memory_failure() waits on PG_writeback, which happens when the background writeback thread stalls writing the hardware-poisoned page back to disk. In that case we panic the system rather than leave the business process blocked in the D state for a long time.
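That policy reduces to the following check in memory_failure(), condensed from the mm/memory-failure.c hunk below:

    #ifdef CONFIG_UCE_KERNEL_RECOVERY
            /* UCE hit a page under writeback: panic instead of leaving the task in D state */
            if ((flags & MF_UCE_KERNEL_RECOVERY) && PageWriteback(p))
                    panic("UCE in memory failure while Page writeback, panic on page %lx, flags %x",
                            pfn, flags);
    #endif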
Signed-off-by: Tong Tiangen <tongtiangen@huawei.com>
Signed-off-by: Chen Huang <chenhuang5@huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Reviewed-by: Xie XiuQi <xiexiuqi@huawei.com>
---
 arch/arm64/include/asm/exception.h          |   1 +
 arch/arm64/include/asm/thread_info.h        |   8 +
 arch/arm64/include/asm/uaccess.h            |   8 +
 arch/arm64/kernel/ras.c                     |   7 +
 arch/arm64/lib/copy_template_generic_read.S | 193 ++++++++++++++++++++
 arch/arm64/lib/copy_to_user.S               |  19 ++
 arch/arm64/mm/fault.c                       |   7 +
 include/linux/mm.h                          |   3 +
 include/linux/uio.h                         |   5 +
 lib/iov_iter.c                              | 132 +++++++++++++
 mm/filemap.c                                |   8 +-
 mm/internal.h                               |   1 +
 mm/memory-failure.c                         |   7 +
 13 files changed, 398 insertions(+), 1 deletion(-)
 create mode 100644 arch/arm64/lib/copy_template_generic_read.S

diff --git a/arch/arm64/include/asm/exception.h b/arch/arm64/include/asm/exception.h
index f938690c5ad6..559d86ad9e5d 100644
--- a/arch/arm64/include/asm/exception.h
+++ b/arch/arm64/include/asm/exception.h
@@ -50,6 +50,7 @@ struct uce_kernel_recovery_info {
 };
 
 extern int copy_page_cow_sea_fallback(void);
+extern int copy_generic_read_sea_fallback(void);
 #endif
 
 #endif /* __ASM_EXCEPTION_H */
diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h
index d0e638f5e3ec..6bc5fe80fd46 100644
--- a/arch/arm64/include/asm/thread_info.h
+++ b/arch/arm64/include/asm/thread_info.h
@@ -94,6 +94,10 @@ void arch_release_task_struct(struct task_struct *tsk);
 #define TIF_SSBD		25	/* Wants SSB mitigation */
 #define TIF_32BIT_AARCH64	26	/* 32 bit process on AArch64(ILP32) */
 
+#ifdef CONFIG_UCE_KERNEL_RECOVERY
+#define TIF_UCE_KERNEL_RECOVERY	27
+#endif
+
 #define _TIF_SIGPENDING		(1 << TIF_SIGPENDING)
 #define _TIF_NEED_RESCHED	(1 << TIF_NEED_RESCHED)
 #define _TIF_NOTIFY_RESUME	(1 << TIF_NOTIFY_RESUME)
@@ -111,6 +115,10 @@ void arch_release_task_struct(struct task_struct *tsk);
 #define _TIF_32BIT_AARCH64	(1 << TIF_32BIT_AARCH64)
 #define _TIF_POLLING_NRFLAG	(1 << TIF_POLLING_NRFLAG)
 
+#ifdef CONFIG_UCE_KERNEL_RECOVERY
+#define _TIF_UCE_KERNEL_RECOVERY	(1 << TIF_UCE_KERNEL_RECOVERY)
+#endif
+
 #define _TIF_WORK_MASK		(_TIF_NEED_RESCHED | _TIF_SIGPENDING | \
 				 _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE | \
 				 _TIF_UPROBE | _TIF_FSCHECK | _TIF_SEA_NOTIFY)
diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h
index bde376077167..19a7d62593ed 100644
--- a/arch/arm64/include/asm/uaccess.h
+++ b/arch/arm64/include/asm/uaccess.h
@@ -408,6 +408,14 @@ extern unsigned long __must_check __arch_copy_from_user(void *to, const void __u
 	__arch_copy_from_user((to), __uaccess_mask_ptr(from), (n));	\
 })
 
+#ifdef CONFIG_UCE_KERNEL_RECOVERY
+extern unsigned long __must_check __arch_copy_to_user_generic_read(void __user *to, const void *from, unsigned long n);
+#define raw_copy_to_user_generic_read(to, from, n)			\
+({									\
+	__arch_copy_to_user_generic_read(__uaccess_mask_ptr(to), (from), (n));	\
+})
+#endif
+
 extern unsigned long __must_check __arch_copy_to_user(void __user *to, const void *from, unsigned long n);
 #define raw_copy_to_user(to, from, n)					\
 ({									\
diff --git a/arch/arm64/kernel/ras.c b/arch/arm64/kernel/ras.c
index 181e609e1cd4..b57041f9e6e7 100644
--- a/arch/arm64/kernel/ras.c
+++ b/arch/arm64/kernel/ras.c
@@ -92,6 +92,13 @@ void sea_notify_process(void)
 	if (!si)
 		panic("Lost physical address for consumed uncorrectable error");
 
+#ifdef CONFIG_UCE_KERNEL_RECOVERY
+	if (test_thread_flag(TIF_UCE_KERNEL_RECOVERY)) {
+		flags |= MF_UCE_KERNEL_RECOVERY;
+		clear_thread_flag(TIF_UCE_KERNEL_RECOVERY);
+	}
+#endif
+
 	clear_thread_flag(TIF_SEA_NOTIFY);
 	do {
 		pfn = si->paddr >> PAGE_SHIFT;
diff --git a/arch/arm64/lib/copy_template_generic_read.S b/arch/arm64/lib/copy_template_generic_read.S
new file mode 100644
index 000000000000..287193527260
--- /dev/null
+++ b/arch/arm64/lib/copy_template_generic_read.S
@@ -0,0 +1,193 @@
+/*
+ * Copyright (C) 2013 ARM Ltd.
+ * Copyright (C) 2013 Linaro.
+ *
+ * This code is based on glibc cortex strings work originally authored by Linaro
+ * and re-licensed under GPLv2 for the Linux kernel. The original code can
+ * be found @
+ *
+ * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
+ * files/head:/src/aarch64/
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/.
+ */
+
+
+/*
+ * Copy a buffer from src to dest (alignment handled by the hardware)
+ *
+ * Parameters:
+ *	x0 - dest
+ *	x1 - src
+ *	x2 - n
+ * Returns:
+ *	x0 - dest
+ */
+dstin	.req	x0
+src	.req	x1
+count	.req	x2
+tmp1	.req	x3
+tmp1w	.req	w3
+tmp2	.req	x4
+tmp2w	.req	w4
+dst	.req	x6
+
+A_l	.req	x7
+A_h	.req	x8
+B_l	.req	x9
+B_h	.req	x10
+C_l	.req	x11
+C_h	.req	x12
+D_l	.req	x13
+D_h	.req	x14
+
+	mov	dst, dstin
+	cmp	count, #16
+	/*When memory length is less than 16, the accessed are not aligned.*/
+	b.lo	.Ltiny15_gr
+
+	neg	tmp2, src
+	ands	tmp2, tmp2, #15/* Bytes to reach alignment. */
+	b.eq	.LSrcAligned_gr
+	sub	count, count, tmp2
+	/*
+	 * Copy the leading memory data from src to dst in an increasing
+	 * address order.By this way,the risk of overwriting the source
+	 * memory data is eliminated when the distance between src and
+	 * dst is less than 16. The memory accesses here are alignment.
+	 */
+	tbz	tmp2, #0, 1f
+	ldrb1	tmp1w, src, #1
+	strb1	tmp1w, dst, #1
+1:
+	tbz	tmp2, #1, 2f
+	ldrh1	tmp1w, src, #2
+	strh1	tmp1w, dst, #2
+2:
+	tbz	tmp2, #2, 3f
+	ldr1	tmp1w, src, #4
+	str1	tmp1w, dst, #4
+3:
+	tbz	tmp2, #3, .LSrcAligned_gr
+	ldr1	tmp1, src, #8
+	str1	tmp1, dst, #8
+
+.LSrcAligned_gr:
+	cmp	count, #64
+	b.ge	.Lcpy_over64_gr
+	/*
+	 * Deal with small copies quickly by dropping straight into the
+	 * exit block.
+	 */
+.Ltail63_gr:
+	/*
+	 * Copy up to 48 bytes of data. At this point we only need the
+	 * bottom 6 bits of count to be accurate.
+	 */
+	ands	tmp1, count, #0x30
+	b.eq	.Ltiny15_gr
+	cmp	tmp1w, #0x20
+	b.eq	1f
+	b.lt	2f
+	ldp1	A_l, A_h, src, #16
+	stp1	A_l, A_h, dst, #16
+1:
+	ldp1	A_l, A_h, src, #16
+	stp1	A_l, A_h, dst, #16
+2:
+	ldp1	A_l, A_h, src, #16
+	stp1	A_l, A_h, dst, #16
+.Ltiny15_gr:
+	/*
+	 * Prefer to break one ldp/stp into several load/store to access
+	 * memory in an increasing address order,rather than to load/store 16
+	 * bytes from (src-16) to (dst-16) and to backward the src to aligned
+	 * address,which way is used in original cortex memcpy. If keeping
+	 * the original memcpy process here, memmove need to satisfy the
+	 * precondition that src address is at least 16 bytes bigger than dst
+	 * address,otherwise some source data will be overwritten when memove
+	 * call memcpy directly. To make memmove simpler and decouple the
+	 * memcpy's dependency on memmove, withdrew the original process.
+	 */
+	tbz	count, #3, 1f
+	ldr1	tmp1, src, #8
+	str1	tmp1, dst, #8
+1:
+	tbz	count, #2, 2f
+	ldr1	tmp1w, src, #4
+	str1	tmp1w, dst, #4
+2:
+	tbz	count, #1, 3f
+	ldrh1	tmp1w, src, #2
+	strh1	tmp1w, dst, #2
+3:
+	tbz	count, #0, .Lexitfunc_gr
+	ldrb1	tmp1w, src, #1
+	strb1	tmp1w, dst, #1
+
+	b	.Lexitfunc_gr
+
+.Lcpy_over64_gr:
+	subs	count, count, #128
+	b.ge	.Lcpy_body_large_gr
+	/*
+	 * Less than 128 bytes to copy, so handle 64 here and then jump
+	 * to the tail.
+	 */
+	ldp1	A_l, A_h, src, #16
+	stp1	A_l, A_h, dst, #16
+	ldp1	B_l, B_h, src, #16
+	ldp1	C_l, C_h, src, #16
+	stp1	B_l, B_h, dst, #16
+	stp1	C_l, C_h, dst, #16
+	ldp1	D_l, D_h, src, #16
+	stp1	D_l, D_h, dst, #16
+
+	tst	count, #0x3f
+	b.ne	.Ltail63_gr
+	b	.Lexitfunc_gr
+
+	/*
+	 * Critical loop. Start at a new cache line boundary. Assuming
+	 * 64 bytes per line this ensures the entire loop is in one line.
+	 */
+	.p2align	L1_CACHE_SHIFT
+.Lcpy_body_large_gr:
+	/* pre-get 64 bytes data. */
+	ldp1	A_l, A_h, src, #16
+	ldp1	B_l, B_h, src, #16
+	ldp1	C_l, C_h, src, #16
+	ldp1	D_l, D_h, src, #16
+1:
+	/*
+	 * interlace the load of next 64 bytes data block with store of the last
+	 * loaded 64 bytes data.
+	 */
+	stp1	A_l, A_h, dst, #16
+	ldp1	A_l, A_h, src, #16
+	stp1	B_l, B_h, dst, #16
+	ldp1	B_l, B_h, src, #16
+	stp1	C_l, C_h, dst, #16
+	ldp1	C_l, C_h, src, #16
+	stp1	D_l, D_h, dst, #16
+	ldp1	D_l, D_h, src, #16
+	subs	count, count, #64
+	b.ge	1b
+	stp1	A_l, A_h, dst, #16
+	stp1	B_l, B_h, dst, #16
+	stp1	C_l, C_h, dst, #16
+	stp1	D_l, D_h, dst, #16
+
+	tst	count, #0x3f
+	b.ne	.Ltail63_gr
+.Lexitfunc_gr:
diff --git a/arch/arm64/lib/copy_to_user.S b/arch/arm64/lib/copy_to_user.S
index 6b99b939c50f..769133236f4a 100644
--- a/arch/arm64/lib/copy_to_user.S
+++ b/arch/arm64/lib/copy_to_user.S
@@ -71,6 +71,25 @@ ENTRY(__arch_copy_to_user)
 	ret
 ENDPROC(__arch_copy_to_user)
 
+#ifdef CONFIG_UCE_KERNEL_RECOVERY
+ENTRY(__arch_copy_to_user_generic_read)
+	uaccess_enable_not_uao x3, x4, x5
+	add	end, x0, x2
+#include "copy_template_generic_read.S"
+	uaccess_disable_not_uao x3, x4
+
+	mov	x0, #0
+	ret
+
+	.global	copy_generic_read_sea_fallback
+copy_generic_read_sea_fallback:
+	uaccess_disable_not_uao x3, x4
+
+	mov	x0, #-1
+	ret
+ENDPROC(__arch_copy_to_user_generic_read)
+#endif
+
 	.section .fixup,"ax"
 	.align	2
 9998:	sub	x0, end, dst			// bytes not copied
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index 039802b047af..08040fe73199 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -662,6 +662,7 @@ int kernel_access_sea_recovery;
 #define UCE_KER_REC_NUM ARRAY_SIZE(reco_info)
 static struct uce_kernel_recovery_info reco_info[] = {
 	{copy_page_cow_sea_fallback, "copy_page_cow", (unsigned long)copy_page_cow, 0},
+	{copy_generic_read_sea_fallback, "__arch_copy_to_user_generic_read", (unsigned long)__arch_copy_to_user_generic_read, 0},
 };
 
 static int __init kernel_access_sea_recovery_init(void)
@@ -708,6 +709,11 @@ int is_cow_kernel_recovery_enable(void)
 	return kernel_access_sea_recovery & 0x1;
 }
 
+int is_pagecache_reading_kernel_recovery_enable(void)
+{
+	return kernel_access_sea_recovery & 0x2;
+}
+
 /*
  * what is kernel recovery?
  * If the process's private data is accessed in the kernel mode to trigger
@@ -827,6 +833,7 @@ static int do_sea(unsigned long addr, unsigned int esr, struct pt_regs *regs)
 
 	idx = is_in_kernel_recovery(esr, regs);
 	if (idx >= 0 && idx < UCE_KER_REC_NUM) {
+		set_thread_flag(TIF_UCE_KERNEL_RECOVERY);
 		clear_siginfo(&info);
 		info.si_signo = inf->sig;
 		info.si_errno = 0;
diff --git a/include/linux/mm.h b/include/linux/mm.h
index d3d62cd3ee07..75d94ea5d1c2 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2816,6 +2816,9 @@ enum mf_flags {
 	MF_ACTION_REQUIRED = 1 << 1,
 	MF_MUST_KILL = 1 << 2,
 	MF_SOFT_OFFLINE = 1 << 3,
+#ifdef CONFIG_UCE_KERNEL_RECOVERY
+	MF_UCE_KERNEL_RECOVERY = 1 << 4
+#endif
 };
 extern int memory_failure(unsigned long pfn, int flags);
 extern void memory_failure_queue(unsigned long pfn, int flags);
diff --git a/include/linux/uio.h b/include/linux/uio.h
index 4af82ff60264..fd3cd8a9efa4 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -97,6 +97,11 @@ bool _copy_from_iter_full(void *addr, size_t bytes, struct iov_iter *i);
 size_t _copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i);
 bool _copy_from_iter_full_nocache(void *addr, size_t bytes, struct iov_iter *i);
 
+#ifdef CONFIG_UCE_KERNEL_RECOVERY
+size_t copy_page_to_iter_generic_read(struct page *page, size_t offset,
+			size_t bytes, struct iov_iter *i);
+#endif
+
 static __always_inline __must_check
 size_t copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
 {
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index da01bb6551d9..25668139fc1f 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -130,6 +130,17 @@
 	}						\
 }
 
+#ifdef CONFIG_UCE_KERNEL_RECOVERY
+static int copyout_generic_read(void __user *to, const void *from, size_t n)
+{
+	if (access_ok(to, n)) {
+		kasan_check_read(from, n);
+		n = raw_copy_to_user_generic_read(to, from, n);
+	}
+	return n;
+}
+#endif
+
 static int copyout(void __user *to, const void *from, size_t n)
 {
 	if (access_ok(to, n)) {
@@ -148,6 +159,108 @@ static int copyin(void *to, const void __user *from, size_t n)
 	return n;
 }
 
+#ifdef CONFIG_UCE_KERNEL_RECOVERY
+static size_t copy_page_to_iter_iovec_generic_read(struct page *page, size_t offset, size_t bytes,
+			 struct iov_iter *i)
+{
+	size_t skip, copy, left, wanted;
+	const struct iovec *iov;
+	char __user *buf;
+	void *kaddr, *from;
+
+	if (unlikely(bytes > i->count))
+		bytes = i->count;
+
+	if (unlikely(!bytes))
+		return 0;
+
+	might_fault();
+	wanted = bytes;
+	iov = i->iov;
+	skip = i->iov_offset;
+	buf = iov->iov_base + skip;
+	copy = min(bytes, iov->iov_len - skip);
+
+	if (IS_ENABLED(CONFIG_HIGHMEM) && !fault_in_pages_writeable(buf, copy)) {
+		kaddr = kmap_atomic(page);
+		from = kaddr + offset;
+
+		/* first chunk, usually the only one */
+		left = copyout_generic_read(buf, from, copy);
+		if (left == -1) {
+			kunmap_atomic(kaddr);
+			goto done;
+		}
+		copy -= left;
+		skip += copy;
+		from += copy;
+		bytes -= copy;
+
+		while (unlikely(!left && bytes)) {
+			iov++;
+			buf = iov->iov_base;
+			copy = min(bytes, iov->iov_len);
+			left = copyout_generic_read(buf, from, copy);
+			if (left == -1) {
+				kunmap_atomic(kaddr);
+				goto done;
+			}
+			copy -= left;
+			skip = copy;
+			from += copy;
+			bytes -= copy;
+		}
+		if (likely(!bytes)) {
+			kunmap_atomic(kaddr);
+			goto done;
+		}
+		offset = from - kaddr;
+		buf += copy;
+		kunmap_atomic(kaddr);
+		copy = min(bytes, iov->iov_len - skip);
+	}
+	/* Too bad - revert to non-atomic kmap */
+
+	kaddr = kmap(page);
+	from = kaddr + offset;
+	left = copyout_generic_read(buf, from, copy);
+	if (left == -1) {
+		kunmap(page);
+		goto done;
+	}
+	copy -= left;
+	skip += copy;
+	from += copy;
+	bytes -= copy;
+	while (unlikely(!left && bytes)) {
+		iov++;
+		buf = iov->iov_base;
+		copy = min(bytes, iov->iov_len);
+		left = copyout_generic_read(buf, from, copy);
+		if (left == -1) {
+			kunmap(page);
+			goto done;
+		}
+		copy -= left;
+		skip = copy;
+		from += copy;
+		bytes -= copy;
+	}
+	kunmap(page);
+
+done:
+	if (skip == iov->iov_len) {
+		iov++;
+		skip = 0;
+	}
+	i->count -= wanted - bytes;
+	i->nr_segs -= iov - i->iov;
+	i->iov = iov;
+	i->iov_offset = skip;
+	return wanted - bytes;
+}
+#endif
+
 static size_t copy_page_to_iter_iovec(struct page *page, size_t offset, size_t bytes,
 			 struct iov_iter *i)
 {
@@ -839,6 +952,25 @@ static inline bool page_copy_sane(struct page *page, size_t offset, size_t n)
 	return false;
 }
 
+#ifdef CONFIG_UCE_KERNEL_RECOVERY
+size_t copy_page_to_iter_generic_read(struct page *page, size_t offset, size_t bytes,
+			 struct iov_iter *i)
+{
+	if (unlikely(!page_copy_sane(page, offset, bytes)))
+		return 0;
+	if (i->type & (ITER_BVEC|ITER_KVEC)) {
+		void *kaddr = kmap_atomic(page);
+		size_t wanted = copy_to_iter(kaddr + offset, bytes, i);
+		kunmap_atomic(kaddr);
+		return wanted;
+	} else if (likely(!(i->type & ITER_PIPE)))
+		return copy_page_to_iter_iovec_generic_read(page, offset, bytes, i);
+	else
+		return copy_page_to_iter_pipe(page, offset, bytes, i);
+}
+EXPORT_SYMBOL(copy_page_to_iter_generic_read);
+#endif
+
 size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
 			 struct iov_iter *i)
 {
diff --git a/mm/filemap.c b/mm/filemap.c
index dd02515475bc..e80189f91294 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2292,8 +2292,14 @@ static ssize_t generic_file_buffered_read(struct kiocb *iocb,
 		 * Ok, we have the page, and it's up-to-date, so
 		 * now we can copy it to user space...
 		 */
-
+#ifdef CONFIG_UCE_KERNEL_RECOVERY
+		if (is_pagecache_reading_kernel_recovery_enable())
+			ret = copy_page_to_iter_generic_read(page, offset, nr, iter);
+		else
+			ret = copy_page_to_iter(page, offset, nr, iter);
+#else
 		ret = copy_page_to_iter(page, offset, nr, iter);
+#endif
 		offset += ret;
 		index += offset >> PAGE_SHIFT;
 		offset &= ~PAGE_MASK;
diff --git a/mm/internal.h b/mm/internal.h
index b6d3b3660782..3bb7ca86e84e 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -591,6 +591,7 @@ extern struct page *alloc_new_node_page(struct page *page, unsigned long node);
 
 #ifdef CONFIG_UCE_KERNEL_RECOVERY
 extern int is_cow_kernel_recovery_enable(void);
+extern int is_pagecache_reading_kernel_recovery_enable(void);
 #endif
 
 #endif /* __MM_INTERNAL_H */
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 77f29dd9e5c9..52619a9bc3b0 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1403,6 +1403,13 @@ int memory_failure(unsigned long pfn, int flags)
 	if (!PageTransTail(p) && !PageLRU(p) && !PageWriteback(p))
 		goto identify_page_state;
 
+#ifdef CONFIG_UCE_KERNEL_RECOVERY
+	if ((flags & MF_UCE_KERNEL_RECOVERY) && PageWriteback(p)) {
+		panic("UCE in memory failure while Page writeback, panic on page %lx, flags %x",
+			pfn, flags);
+	}
+#endif
+
 	/*
 	 * It's very difficult to mess with pages currently under IO
 	 * and in many cases impossible, so we just avoid it here.