From: Yang Shi <shy828301@gmail.com>
stable inclusion
from stable-v5.15.86
commit a62b1bc603a1ded739e7cf543da29a3eb93cc534
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I6AR36
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=...
--------------------------------
commit dd0f230a0a80ff396c7ce587f16429f2a8131344 upstream.
Memory failure reports failure if the page still has an extra pinned refcount, other than the one held by hwpoison, after the handler is done. The check is not actually necessary for all handlers, so move it into the specific handlers. This also makes the following patch, which keeps shmem pages in the page cache, easier.
In some cases an extra pin is expected, for example when the page is dirty and in the swapcache.
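In sketch form, every handler now ends with the same check; the identifiers below are taken from the hunks in this patch, and extra_pins is the only per-handler input (shown here for the dirty swapcache case, which tolerates the one pin held by the swap cache when the page could not be dropped from the LRU):

	ret = delete_from_lru_cache(p) ? MF_FAILED : MF_DELAYED;
	unlock_page(p);

	/* MF_DELAYED: the page stayed in swap cache, keeping one pin. */
	if (ret == MF_DELAYED)
		extra_pins = true;

	/*
	 * Any reference beyond hwpoison's own (plus the expected pin)
	 * means someone may still use the page: report failure.
	 */
	if (has_extra_refcount(ps, p, extra_pins))
		ret = MF_FAILED;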
Link: https://lkml.kernel.org/r/20211020210755.23964-5-shy828301@gmail.com
Signed-off-by: Yang Shi <shy828301@gmail.com>
Signed-off-by: Naoya Horiguchi <naoya.horiguchi@nec.com>
Suggested-by: Naoya Horiguchi <naoya.horiguchi@nec.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Peter Xu <peterx@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Naoya Horiguchi <naoya.horiguchi@linux.dev>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Ze Zuo <zuoze1@huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Yongqiang Liu <liuyongqiang13@huawei.com>
---
 mm/memory-failure.c | 93 +++++++++++++++++++++++++++++++--------------
 1 file changed, 64 insertions(+), 29 deletions(-)
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 9a816fdf812d..b653637d5a00 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -655,12 +655,44 @@ static int truncate_error_page(struct page *p, unsigned long pfn,
 	return ret;
 }
 
+struct page_state {
+	unsigned long mask;
+	unsigned long res;
+	enum mf_action_page_type type;
+
+	/* Callback ->action() has to unlock the relevant page inside it. */
+	int (*action)(struct page_state *ps, struct page *p);
+};
+
+/*
+ * Return true if page is still referenced by others, otherwise return
+ * false.
+ *
+ * The extra_pins is true when one extra refcount is expected.
+ */
+static bool has_extra_refcount(struct page_state *ps, struct page *p,
+			       bool extra_pins)
+{
+	int count = page_count(p) - 1;
+
+	if (extra_pins)
+		count -= 1;
+
+	if (count > 0) {
+		pr_err("Memory failure: %#lx: %s still referenced by %d users\n",
+		       page_to_pfn(p), action_page_types[ps->type], count);
+		return true;
+	}
+
+	return false;
+}
+
 /*
  * Error hit kernel page.
  * Do nothing, try to be lucky and not touch this instead. For a few cases we
  * could be more sophisticated.
  */
-static int me_kernel(struct page *p, unsigned long pfn)
+static int me_kernel(struct page_state *ps, struct page *p)
 {
 	unlock_page(p);
 	return MF_IGNORED;
@@ -669,9 +701,9 @@ static int me_kernel(struct page *p, unsigned long pfn)
 /*
  * Page in unknown state. Do nothing.
  */
-static int me_unknown(struct page *p, unsigned long pfn)
+static int me_unknown(struct page_state *ps, struct page *p)
 {
-	pr_err("Memory failure: %#lx: Unknown page state\n", pfn);
+	pr_err("Memory failure: %#lx: Unknown page state\n", page_to_pfn(p));
 	unlock_page(p);
 	return MF_FAILED;
 }
@@ -679,7 +711,7 @@ static int me_unknown(struct page *p, unsigned long pfn)
 /*
  * Clean (or cleaned) page cache page.
  */
-static int me_pagecache_clean(struct page *p, unsigned long pfn)
+static int me_pagecache_clean(struct page_state *ps, struct page *p)
 {
 	int ret;
 	struct address_space *mapping;
@@ -716,9 +748,13 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
 	 *
 	 * Open: to take i_mutex or not for this? Right now we don't.
 	 */
-	ret = truncate_error_page(p, pfn, mapping);
+	ret = truncate_error_page(p, page_to_pfn(p), mapping);
 out:
 	unlock_page(p);
+
+	if (has_extra_refcount(ps, p, false))
+		ret = MF_FAILED;
+
 	return ret;
 }
 
@@ -727,7 +763,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
  * Issues: when the error hit a hole page the error is not properly
  * propagated.
  */
-static int me_pagecache_dirty(struct page *p, unsigned long pfn)
+static int me_pagecache_dirty(struct page_state *ps, struct page *p)
 {
 	struct address_space *mapping = page_mapping(p);
 
@@ -771,7 +807,7 @@ static int me_pagecache_dirty(struct page *p, unsigned long pfn)
 		mapping_set_error(mapping, -EIO);
 	}
 
-	return me_pagecache_clean(p, pfn);
+	return me_pagecache_clean(ps, p);
 }
 
 /*
@@ -793,9 +829,10 @@ static int me_pagecache_dirty(struct page *p, unsigned long pfn)
  * Clean swap cache pages can be directly isolated. A later page fault will
  * bring in the known good data from disk.
  */
-static int me_swapcache_dirty(struct page *p, unsigned long pfn)
+static int me_swapcache_dirty(struct page_state *ps, struct page *p)
 {
 	int ret;
+	bool extra_pins = false;
 
 	ClearPageDirty(p);
 	/* Trigger EIO in shmem: */
@@ -803,10 +840,17 @@ static int me_swapcache_dirty(struct page *p, unsigned long pfn)
 	ClearPageUptodate(p);
 
 	ret = delete_from_lru_cache(p) ? MF_FAILED : MF_DELAYED;
 	unlock_page(p);
+
+	if (ret == MF_DELAYED)
+		extra_pins = true;
+
+	if (has_extra_refcount(ps, p, extra_pins))
+		ret = MF_FAILED;
+
 	return ret;
 }
 
-static int me_swapcache_clean(struct page *p, unsigned long pfn)
+static int me_swapcache_clean(struct page_state *ps, struct page *p)
 {
 	int ret;
@@ -814,6 +858,10 @@ static int me_swapcache_clean(struct page *p, unsigned long pfn)
 
 	ret = delete_from_lru_cache(p) ? MF_FAILED : MF_RECOVERED;
 	unlock_page(p);
+
+	if (has_extra_refcount(ps, p, false))
+		ret = MF_FAILED;
+
 	return ret;
 }
 
@@ -823,7 +871,7 @@ static int me_swapcache_clean(struct page *p, unsigned long pfn)
  * - Error on hugepage is contained in hugepage unit (not in raw page unit.)
  * To narrow down kill region to one page, we need to break up pmd.
  */
-static int me_huge_page(struct page *p, unsigned long pfn)
+static int me_huge_page(struct page_state *ps, struct page *p)
 {
 	int res;
 	struct page *hpage = compound_head(p);
@@ -834,7 +882,7 @@ static int me_huge_page(struct page *p, unsigned long pfn)
 
 	mapping = page_mapping(hpage);
 	if (mapping) {
-		res = truncate_error_page(hpage, pfn, mapping);
+		res = truncate_error_page(hpage, page_to_pfn(p), mapping);
 		unlock_page(hpage);
 	} else {
 		res = MF_FAILED;
@@ -852,6 +900,9 @@ static int me_huge_page(struct page *p, unsigned long pfn)
 		}
 	}
 
+	if (has_extra_refcount(ps, p, false))
+		res = MF_FAILED;
+
 	return res;
 }
 
@@ -878,14 +929,7 @@ static int me_huge_page(struct page *p, unsigned long pfn)
 #define slab		(1UL << PG_slab)
 #define reserved	(1UL << PG_reserved)
 
-static struct page_state {
-	unsigned long mask;
-	unsigned long res;
-	enum mf_action_page_type type;
-
-	/* Callback ->action() has to unlock the relevant page inside it. */
-	int (*action)(struct page *p, unsigned long pfn);
-} error_states[] = {
+static struct page_state error_states[] = {
 	{ reserved,	reserved,	MF_MSG_KERNEL,	me_kernel },
 	/*
 	 * free pages are specially detected outside this table:
@@ -946,19 +990,10 @@ static int page_action(struct page_state *ps, struct page *p,
 			unsigned long pfn)
 {
 	int result;
-	int count;
 
 	/* page p should be unlocked after returning from ps->action(). */
-	result = ps->action(p, pfn);
+	result = ps->action(ps, p);
 
-	count = page_count(p) - 1;
-	if (ps->action == me_swapcache_dirty && result == MF_DELAYED)
-		count--;
-	if (count > 0) {
-		pr_err("Memory failure: %#lx: %s still referenced by %d users\n",
-		       pfn, action_page_types[ps->type], count);
-		result = MF_FAILED;
-	}
 	action_result(pfn, ps->type, result);
 
 	/* Could do more checks here if page looks ok */
From: James Houghton <jthoughton@google.com>
stable inclusion
from stable-v5.15.86
commit 30571f28bb35c826219971c63bcf60d2517112ed
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I6AR36
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=...
--------------------------------
[ Upstream commit 8625147cafaa9ba74713d682f5185eb62cb2aedb ]
This change is very similar to the change that was made for shmem [1]; it solves the same problem, but for HugeTLBFS instead.
Currently, when poison is found in a HugeTLB page, the page is removed from the page cache. That means that attempting to map or read that hugepage in the future will result in a new hugepage being allocated instead of notifying the user that the page was poisoned. As [1] states, this is effectively memory corruption.
The fix is to leave the page in the page cache. If the user attempts to use a poisoned HugeTLB page with a syscall, the syscall will fail with EIO, the same error code that shmem uses. For attempts to map the page, the thread will get a BUS_MCEERR_AR SIGBUS.
[1]: commit a76054266661 ("mm: shmem: don't truncate page if memory failure happens")
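In sketch form (mirroring the hugetlbfs_read_iter() hunk below), the read path now checks for poison right after dropping the page lock, and reports the error instead of copying from the page:

	/* Page found in the page cache: don't hand poisoned data out. */
	if (PageHWPoison(page)) {
		put_page(page);		/* drop the reference we took */
		retval = -EIO;		/* same error code shmem uses */
		break;
	}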
Link: https://lkml.kernel.org/r/20221018200125.848471-1-jthoughton@google.com
Signed-off-by: James Houghton <jthoughton@google.com>
Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
Reviewed-by: Naoya Horiguchi <naoya.horiguchi@nec.com>
Tested-by: Naoya Horiguchi <naoya.horiguchi@nec.com>
Reviewed-by: Yang Shi <shy828301@gmail.com>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: James Houghton <jthoughton@google.com>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: stable@vger.kernel.org
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Ze Zuo <zuoze1@huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Yongqiang Liu <liuyongqiang13@huawei.com>
---
 fs/hugetlbfs/inode.c | 13 ++++++-------
 mm/hugetlb.c         |  4 ++++
 mm/memory-failure.c  |  5 ++++-
 3 files changed, 14 insertions(+), 8 deletions(-)
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 5fcd3586c81f..23c86e5b51a9 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -360,6 +360,12 @@ static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to)
 		} else {
 			unlock_page(page);
 
+			if (PageHWPoison(page)) {
+				put_page(page);
+				retval = -EIO;
+				break;
+			}
+
 			/*
 			 * We have the page, copy it to user space buffer.
 			 */
@@ -1021,13 +1027,6 @@ static int hugetlbfs_migrate_page(struct address_space *mapping,
 static int hugetlbfs_error_remove_page(struct address_space *mapping,
 				struct page *page)
 {
-	struct inode *inode = mapping->host;
-	pgoff_t index = page->index;
-
-	remove_huge_page(page);
-	if (unlikely(hugetlb_unreserve_pages(inode, index, index + 1, 1)))
-		hugetlb_fix_reserve_counts(inode);
-
 	return 0;
 }
 
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index c61c3d65eafc..4f61d3be0b11 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -5775,6 +5775,10 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
 	ptl = huge_pte_lockptr(h, dst_mm, dst_pte);
 	spin_lock(ptl);
 
+	ret = -EIO;
+	if (PageHWPoison(page))
+		goto out_release_unlock;
+
 	/*
 	 * Recheck the i_size after holding PT lock to make sure not
 	 * to leave any page mapped (as page_mapped()) beyond the end
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index b653637d5a00..fff715a27253 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -876,6 +876,7 @@ static int me_huge_page(struct page_state *ps, struct page *p)
 	int res;
 	struct page *hpage = compound_head(p);
 	struct address_space *mapping;
+	bool extra_pins = false;
 
 	if (!PageHuge(hpage))
 		return MF_DELAYED;
@@ -883,6 +884,8 @@ static int me_huge_page(struct page_state *ps, struct page *p)
 	mapping = page_mapping(hpage);
 	if (mapping) {
 		res = truncate_error_page(hpage, page_to_pfn(p), mapping);
+		/* The page is kept in page cache. */
+		extra_pins = true;
 		unlock_page(hpage);
 	} else {
 		res = MF_FAILED;
@@ -900,7 +903,7 @@ static int me_huge_page(struct page_state *ps, struct page *p)
 		}
 	}
 
-	if (has_extra_refcount(ps, p, false))
+	if (has_extra_refcount(ps, p, extra_pins))
 		res = MF_FAILED;
 
 	return res;
From: Longlong Xia <xialonglong1@huawei.com>
maillist inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I6AWEO
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git/commit/?...
--------------------------------
The softlockup still occurs in get_swap_pages() under memory pressure. The test setup has 64 CPU cores, 64GB of memory, and 28 zram devices; the disksize of each zram device is 50MB, with the same priority as si. The stress-ng tool is used to increase memory pressure, causing the system to OOM frequently.
The plist_for_each_entry_safe() loop in get_swap_pages() can iterate tens of thousands of times before finding available space (in the extreme case, cond_resched() is never called in scan_swap_map_slots()). Let's add cond_resched() to get_swap_pages() when it fails to find available space, to avoid the softlockup.
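In sketch form (matching the one-line hunk below), the reschedule point sits right after a device's scan comes up empty, before swap_avail_lock is retaken to try the next device on the plist:

	/*
	 * scan_swap_map_slots() found no slot on this si: yield before
	 * moving on, so a long walk over many devices can't stall CPUs.
	 */
	pr_debug("scan_swap_map of si %d failed to find offset\n",
		si->type);
	cond_resched();

	spin_lock(&swap_avail_lock);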
Link: https://lkml.kernel.org/r/20230128094757.1060525-1-xialonglong1@huawei.com
Signed-off-by: Longlong Xia <xialonglong1@huawei.com>
Reviewed-by: "Huang, Ying" <ying.huang@intel.com>
Cc: Chen Wandun <chenwandun@huawei.com>
Cc: Huang Ying <ying.huang@intel.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: Nanyong Sun <sunnanyong@huawei.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: stable@vger.kernel.org
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Longlong Xia <xialonglong1@huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Yongqiang Liu <liuyongqiang13@huawei.com>
---
 mm/swapfile.c | 1 +
 1 file changed, 1 insertion(+)
diff --git a/mm/swapfile.c b/mm/swapfile.c
index ba97c8f635b0..1fc42be5efe8 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1010,6 +1010,7 @@ int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size)
 			goto check_out;
 		pr_debug("scan_swap_map of si %d failed to find offset\n",
 			si->type);
+		cond_resched();
 
 		spin_lock(&swap_avail_lock);
 nextsi: