From: Dave Chinner <dchinner(a)redhat.com>
mainline inclusion
from mainline-v5.19-rc1
commit c230a4a85bcdbfc1a7415deec6caf04e8fca1301
category: bugfix
bugzilla: 187372, https://gitee.com/openeuler/kernel/issues/I5K0OM
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?…
--------------------------------
Ever since we added shadow format buffers to the log items, log
items need to handle the item being released with shadow buffers
attached. Due to the fact this requirement was added at the same
time we added new rmap/reflink intents, we missed the cleanup of
those items.
In theory, this means shadow buffers can be leaked in a very small
window when a shutdown is initiated. Testing with KASAN shows this
leak does not happen in practice - we haven't identified a single
leak in several years of shutdown testing since ~v4.8 kernels.
However, the intent whiteout cleanup mechanism results in every
cancelled intent being in exactly the same state as this tiny race window
creates and so if intents don't clean up shadow buffers on final
release we will leak the shadow buffer for just about every intent
we create.
Hence we start with this patch to close this condition off and
ensure that when whiteouts start to be used we don't leak lots of
memory.
Signed-off-by: Dave Chinner <dchinner(a)redhat.com>
Reviewed-by: Darrick J. Wong <djwong(a)kernel.org>
Reviewed-by: Allison Henderson <allison.henderson(a)oracle.com>
Signed-off-by: Dave Chinner <david(a)fromorbit.com>
conflicts:
fs/xfs/xfs_bmap_item.c
fs/xfs/xfs_icreate_item.c
fs/xfs/xfs_refcount_item.c
fs/xfs/xfs_rmap_item.c
Signed-off-by: Li Nan <linan122(a)huawei.com>
Reviewed-by: Yang Erkun <yangerkun(a)huawei.com>
Reviewed-by: Zhang Yi <yi.zhang(a)huawei.com>
Signed-off-by: Jialin Zhang <zhangjialin11(a)huawei.com>
---
fs/xfs/xfs_bmap_item.c | 2 ++
fs/xfs/xfs_icreate_item.c | 1 +
fs/xfs/xfs_refcount_item.c | 2 ++
fs/xfs/xfs_rmap_item.c | 2 ++
4 files changed, 7 insertions(+)
diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c
index 44ec0f2d5253..e6de8081451f 100644
--- a/fs/xfs/xfs_bmap_item.c
+++ b/fs/xfs/xfs_bmap_item.c
@@ -40,6 +40,7 @@ STATIC void
xfs_bui_item_free(
struct xfs_bui_log_item *buip)
{
+ kmem_free(buip->bui_item.li_lv_shadow);
kmem_cache_free(xfs_bui_zone, buip);
}
@@ -199,6 +200,7 @@ xfs_bud_item_release(
struct xfs_bud_log_item *budp = BUD_ITEM(lip);
xfs_bui_release(budp->bud_buip);
+ kmem_free(budp->bud_item.li_lv_shadow);
kmem_cache_free(xfs_bud_zone, budp);
}
diff --git a/fs/xfs/xfs_icreate_item.c b/fs/xfs/xfs_icreate_item.c
index 9b3994b9c716..aa8c7c261d24 100644
--- a/fs/xfs/xfs_icreate_item.c
+++ b/fs/xfs/xfs_icreate_item.c
@@ -63,6 +63,7 @@ STATIC void
xfs_icreate_item_release(
struct xfs_log_item *lip)
{
+ kmem_free(ICR_ITEM(lip)->ic_item.li_lv_shadow);
kmem_cache_free(xfs_icreate_zone, ICR_ITEM(lip));
}
diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c
index 0dee316283a9..9f4ff45c7a93 100644
--- a/fs/xfs/xfs_refcount_item.c
+++ b/fs/xfs/xfs_refcount_item.c
@@ -35,6 +35,7 @@ STATIC void
xfs_cui_item_free(
struct xfs_cui_log_item *cuip)
{
+ kmem_free(cuip->cui_item.li_lv_shadow);
if (cuip->cui_format.cui_nextents > XFS_CUI_MAX_FAST_EXTENTS)
kmem_free(cuip);
else
@@ -204,6 +205,7 @@ xfs_cud_item_release(
struct xfs_cud_log_item *cudp = CUD_ITEM(lip);
xfs_cui_release(cudp->cud_cuip);
+ kmem_free(cudp->cud_item.li_lv_shadow);
kmem_cache_free(xfs_cud_zone, cudp);
}
diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c
index 20905953fe76..b5447ac7cb9b 100644
--- a/fs/xfs/xfs_rmap_item.c
+++ b/fs/xfs/xfs_rmap_item.c
@@ -35,6 +35,7 @@ STATIC void
xfs_rui_item_free(
struct xfs_rui_log_item *ruip)
{
+ kmem_free(ruip->rui_item.li_lv_shadow);
if (ruip->rui_format.rui_nextents > XFS_RUI_MAX_FAST_EXTENTS)
kmem_free(ruip);
else
@@ -227,6 +228,7 @@ xfs_rud_item_release(
struct xfs_rud_log_item *rudp = RUD_ITEM(lip);
xfs_rui_release(rudp->rud_ruip);
+ kmem_free(rudp->rud_item.li_lv_shadow);
kmem_cache_free(xfs_rud_zone, rudp);
}
--
2.25.1
From: Dave Chinner <dchinner(a)redhat.com>
mainline inclusion
from mainline-v5.19-rc1
commit c230a4a85bcdbfc1a7415deec6caf04e8fca1301
category: bugfix
bugzilla: 187372, https://gitee.com/openeuler/kernel/issues/I5K0OM
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?…
--------------------------------
Ever since we added shadow format buffers to the log items, log
items need to handle the item being released with shadow buffers
attached. Due to the fact this requirement was added at the same
time we added new rmap/reflink intents, we missed the cleanup of
those items.
In theory, this means shadow buffers can be leaked in a very small
window when a shutdown is initiated. Testing with KASAN shows this
leak does not happen in practice - we haven't identified a single
leak in several years of shutdown testing since ~v4.8 kernels.
However, the intent whiteout cleanup mechanism results in every
cancelled intent being in exactly the same state as this tiny race window
creates and so if intents don't clean up shadow buffers on final
release we will leak the shadow buffer for just about every intent
we create.
Hence we start with this patch to close this condition off and
ensure that when whiteouts start to be used we don't leak lots of
memory.
Signed-off-by: Dave Chinner <dchinner(a)redhat.com>
Reviewed-by: Darrick J. Wong <djwong(a)kernel.org>
Reviewed-by: Allison Henderson <allison.henderson(a)oracle.com>
Signed-off-by: Dave Chinner <david(a)fromorbit.com>
conflicts:
fs/xfs/xfs_bmap_item.c
fs/xfs/xfs_icreate_item.c
fs/xfs/xfs_refcount_item.c
fs/xfs/xfs_rmap_item.c
Signed-off-by: Li Nan <linan122(a)huawei.com>
Reviewed-by: Yang Erkun <yangerkun(a)huawei.com>
Reviewed-by: Zhang Yi <yi.zhang(a)huawei.com>
Signed-off-by: Jialin Zhang <zhangjialin11(a)huawei.com>
---
fs/xfs/xfs_bmap_item.c | 2 ++
fs/xfs/xfs_icreate_item.c | 1 +
fs/xfs/xfs_refcount_item.c | 2 ++
fs/xfs/xfs_rmap_item.c | 2 ++
4 files changed, 7 insertions(+)
diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c
index 44ec0f2d5253..e6de8081451f 100644
--- a/fs/xfs/xfs_bmap_item.c
+++ b/fs/xfs/xfs_bmap_item.c
@@ -40,6 +40,7 @@ STATIC void
xfs_bui_item_free(
struct xfs_bui_log_item *buip)
{
+ kmem_free(buip->bui_item.li_lv_shadow);
kmem_cache_free(xfs_bui_zone, buip);
}
@@ -199,6 +200,7 @@ xfs_bud_item_release(
struct xfs_bud_log_item *budp = BUD_ITEM(lip);
xfs_bui_release(budp->bud_buip);
+ kmem_free(budp->bud_item.li_lv_shadow);
kmem_cache_free(xfs_bud_zone, budp);
}
diff --git a/fs/xfs/xfs_icreate_item.c b/fs/xfs/xfs_icreate_item.c
index 9b3994b9c716..aa8c7c261d24 100644
--- a/fs/xfs/xfs_icreate_item.c
+++ b/fs/xfs/xfs_icreate_item.c
@@ -63,6 +63,7 @@ STATIC void
xfs_icreate_item_release(
struct xfs_log_item *lip)
{
+ kmem_free(ICR_ITEM(lip)->ic_item.li_lv_shadow);
kmem_cache_free(xfs_icreate_zone, ICR_ITEM(lip));
}
diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c
index 0dee316283a9..9f4ff45c7a93 100644
--- a/fs/xfs/xfs_refcount_item.c
+++ b/fs/xfs/xfs_refcount_item.c
@@ -35,6 +35,7 @@ STATIC void
xfs_cui_item_free(
struct xfs_cui_log_item *cuip)
{
+ kmem_free(cuip->cui_item.li_lv_shadow);
if (cuip->cui_format.cui_nextents > XFS_CUI_MAX_FAST_EXTENTS)
kmem_free(cuip);
else
@@ -204,6 +205,7 @@ xfs_cud_item_release(
struct xfs_cud_log_item *cudp = CUD_ITEM(lip);
xfs_cui_release(cudp->cud_cuip);
+ kmem_free(cudp->cud_item.li_lv_shadow);
kmem_cache_free(xfs_cud_zone, cudp);
}
diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c
index 20905953fe76..b5447ac7cb9b 100644
--- a/fs/xfs/xfs_rmap_item.c
+++ b/fs/xfs/xfs_rmap_item.c
@@ -35,6 +35,7 @@ STATIC void
xfs_rui_item_free(
struct xfs_rui_log_item *ruip)
{
+ kmem_free(ruip->rui_item.li_lv_shadow);
if (ruip->rui_format.rui_nextents > XFS_RUI_MAX_FAST_EXTENTS)
kmem_free(ruip);
else
@@ -227,6 +228,7 @@ xfs_rud_item_release(
struct xfs_rud_log_item *rudp = RUD_ITEM(lip);
xfs_rui_release(rudp->rud_ruip);
+ kmem_free(rudp->rud_item.li_lv_shadow);
kmem_cache_free(xfs_rud_zone, rudp);
}
--
2.25.1
From: Yang Shi <shy828301(a)gmail.com>
stable inclusion
from stable-v5.15.86
commit a62b1bc603a1ded739e7cf543da29a3eb93cc534
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I6AR36
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id…
--------------------------------
commit dd0f230a0a80ff396c7ce587f16429f2a8131344 upstream.
Memory failure will report failure if the page still has extra pinned
refcount other than from hwpoison after the handler is done. Actually
the check is not necessary for all handlers, so move the check into
specific handlers. This would make the following patch, which keeps shmem
pages in the page cache, easier.
There may be an expected extra pin in some cases, for example, when the
page is dirty and in swapcache.
Link: https://lkml.kernel.org/r/20211020210755.23964-5-shy828301@gmail.com
Signed-off-by: Yang Shi <shy828301(a)gmail.com>
Signed-off-by: Naoya Horiguchi <naoya.horiguchi(a)nec.com>
Suggested-by: Naoya Horiguchi <naoya.horiguchi(a)nec.com>
Cc: Hugh Dickins <hughd(a)google.com>
Cc: Kirill A. Shutemov <kirill.shutemov(a)linux.intel.com>
Cc: Matthew Wilcox <willy(a)infradead.org>
Cc: Oscar Salvador <osalvador(a)suse.de>
Cc: Peter Xu <peterx(a)redhat.com>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds(a)linux-foundation.org>
Cc: Naoya Horiguchi <naoya.horiguchi(a)linux.dev>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Signed-off-by: Ze Zuo <zuoze1(a)huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang(a)huawei.com>
Signed-off-by: Yongqiang Liu <liuyongqiang13(a)huawei.com>
---
mm/memory-failure.c | 93 +++++++++++++++++++++++++++++++--------------
1 file changed, 64 insertions(+), 29 deletions(-)
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 9a816fdf812d..b653637d5a00 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -655,12 +655,44 @@ static int truncate_error_page(struct page *p, unsigned long pfn,
return ret;
}
+struct page_state {
+ unsigned long mask;
+ unsigned long res;
+ enum mf_action_page_type type;
+
+ /* Callback ->action() has to unlock the relevant page inside it. */
+ int (*action)(struct page_state *ps, struct page *p);
+};
+
+/*
+ * Return true if page is still referenced by others, otherwise return
+ * false.
+ *
+ * The extra_pins is true when one extra refcount is expected.
+ */
+static bool has_extra_refcount(struct page_state *ps, struct page *p,
+ bool extra_pins)
+{
+ int count = page_count(p) - 1;
+
+ if (extra_pins)
+ count -= 1;
+
+ if (count > 0) {
+ pr_err("Memory failure: %#lx: %s still referenced by %d users\n",
+ page_to_pfn(p), action_page_types[ps->type], count);
+ return true;
+ }
+
+ return false;
+}
+
/*
* Error hit kernel page.
* Do nothing, try to be lucky and not touch this instead. For a few cases we
* could be more sophisticated.
*/
-static int me_kernel(struct page *p, unsigned long pfn)
+static int me_kernel(struct page_state *ps, struct page *p)
{
unlock_page(p);
return MF_IGNORED;
@@ -669,9 +701,9 @@ static int me_kernel(struct page *p, unsigned long pfn)
/*
* Page in unknown state. Do nothing.
*/
-static int me_unknown(struct page *p, unsigned long pfn)
+static int me_unknown(struct page_state *ps, struct page *p)
{
- pr_err("Memory failure: %#lx: Unknown page state\n", pfn);
+ pr_err("Memory failure: %#lx: Unknown page state\n", page_to_pfn(p));
unlock_page(p);
return MF_FAILED;
}
@@ -679,7 +711,7 @@ static int me_unknown(struct page *p, unsigned long pfn)
/*
* Clean (or cleaned) page cache page.
*/
-static int me_pagecache_clean(struct page *p, unsigned long pfn)
+static int me_pagecache_clean(struct page_state *ps, struct page *p)
{
int ret;
struct address_space *mapping;
@@ -716,9 +748,13 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
*
* Open: to take i_mutex or not for this? Right now we don't.
*/
- ret = truncate_error_page(p, pfn, mapping);
+ ret = truncate_error_page(p, page_to_pfn(p), mapping);
out:
unlock_page(p);
+
+ if (has_extra_refcount(ps, p, false))
+ ret = MF_FAILED;
+
return ret;
}
@@ -727,7 +763,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
* Issues: when the error hit a hole page the error is not properly
* propagated.
*/
-static int me_pagecache_dirty(struct page *p, unsigned long pfn)
+static int me_pagecache_dirty(struct page_state *ps, struct page *p)
{
struct address_space *mapping = page_mapping(p);
@@ -771,7 +807,7 @@ static int me_pagecache_dirty(struct page *p, unsigned long pfn)
mapping_set_error(mapping, -EIO);
}
- return me_pagecache_clean(p, pfn);
+ return me_pagecache_clean(ps, p);
}
/*
@@ -793,9 +829,10 @@ static int me_pagecache_dirty(struct page *p, unsigned long pfn)
* Clean swap cache pages can be directly isolated. A later page fault will
* bring in the known good data from disk.
*/
-static int me_swapcache_dirty(struct page *p, unsigned long pfn)
+static int me_swapcache_dirty(struct page_state *ps, struct page *p)
{
int ret;
+ bool extra_pins = false;
ClearPageDirty(p);
/* Trigger EIO in shmem: */
@@ -803,10 +840,17 @@ static int me_swapcache_dirty(struct page *p, unsigned long pfn)
ret = delete_from_lru_cache(p) ? MF_FAILED : MF_DELAYED;
unlock_page(p);
+
+ if (ret == MF_DELAYED)
+ extra_pins = true;
+
+ if (has_extra_refcount(ps, p, extra_pins))
+ ret = MF_FAILED;
+
return ret;
}
-static int me_swapcache_clean(struct page *p, unsigned long pfn)
+static int me_swapcache_clean(struct page_state *ps, struct page *p)
{
int ret;
@@ -814,6 +858,10 @@ static int me_swapcache_clean(struct page *p, unsigned long pfn)
ret = delete_from_lru_cache(p) ? MF_FAILED : MF_RECOVERED;
unlock_page(p);
+
+ if (has_extra_refcount(ps, p, false))
+ ret = MF_FAILED;
+
return ret;
}
@@ -823,7 +871,7 @@ static int me_swapcache_clean(struct page *p, unsigned long pfn)
* - Error on hugepage is contained in hugepage unit (not in raw page unit.)
* To narrow down kill region to one page, we need to break up pmd.
*/
-static int me_huge_page(struct page *p, unsigned long pfn)
+static int me_huge_page(struct page_state *ps, struct page *p)
{
int res;
struct page *hpage = compound_head(p);
@@ -834,7 +882,7 @@ static int me_huge_page(struct page *p, unsigned long pfn)
mapping = page_mapping(hpage);
if (mapping) {
- res = truncate_error_page(hpage, pfn, mapping);
+ res = truncate_error_page(hpage, page_to_pfn(p), mapping);
unlock_page(hpage);
} else {
res = MF_FAILED;
@@ -852,6 +900,9 @@ static int me_huge_page(struct page *p, unsigned long pfn)
}
}
+ if (has_extra_refcount(ps, p, false))
+ res = MF_FAILED;
+
return res;
}
@@ -878,14 +929,7 @@ static int me_huge_page(struct page *p, unsigned long pfn)
#define slab (1UL << PG_slab)
#define reserved (1UL << PG_reserved)
-static struct page_state {
- unsigned long mask;
- unsigned long res;
- enum mf_action_page_type type;
-
- /* Callback ->action() has to unlock the relevant page inside it. */
- int (*action)(struct page *p, unsigned long pfn);
-} error_states[] = {
+static struct page_state error_states[] = {
{ reserved, reserved, MF_MSG_KERNEL, me_kernel },
/*
* free pages are specially detected outside this table:
@@ -946,19 +990,10 @@ static int page_action(struct page_state *ps, struct page *p,
unsigned long pfn)
{
int result;
- int count;
/* page p should be unlocked after returning from ps->action(). */
- result = ps->action(p, pfn);
+ result = ps->action(ps, p);
- count = page_count(p) - 1;
- if (ps->action == me_swapcache_dirty && result == MF_DELAYED)
- count--;
- if (count > 0) {
- pr_err("Memory failure: %#lx: %s still referenced by %d users\n",
- pfn, action_page_types[ps->type], count);
- result = MF_FAILED;
- }
action_result(pfn, ps->type, result);
/* Could do more checks here if page looks ok */
--
2.25.1