From: Zhihao Cheng <chengzhihao1(a)huawei.com>
maillist inclusion
category: bugfix
bugzilla: https://gitee.com/src-openeuler/kernel/issues/I6C5HV
CVE: NA
Reference: https://lore.kernel.org/lkml/20230110015327.1181863-1-chengzhihao1@huawei.c…
--------------------------------
Following process will make data lost and could lead to a filesystem
corrupted problem:
1. jh(bh) is inserted into T1->t_checkpoint_list, bh is dirty, and
jh->b_transaction = NULL
2. T1 is added into journal->j_checkpoint_transactions.
3. Get bh prepare to write while doing checkpoing:
PA PB
do_get_write_access jbd2_log_do_checkpoint
spin_lock(&jh->b_state_lock)
if (buffer_dirty(bh))
clear_buffer_dirty(bh) // clear buffer dirty
set_buffer_jbddirty(bh)
transaction =
journal->j_checkpoint_transactions
jh = transaction->t_checkpoint_list
if (!buffer_dirty(bh))
__jbd2_journal_remove_checkpoint(jh)
// bh won't be flushed
jbd2_cleanup_journal_tail
__jbd2_journal_file_buffer(jh, transaction, BJ_Reserved)
4. Aborting journal/Power-cut before writing latest bh on journal area.
In this way we get a corrupted filesystem with bh's data lost.
Fix it by moving the clearing of buffer_dirty bit just before the call
to __jbd2_journal_file_buffer(), both bit clearing and jh->b_transaction
assignment are under journal->j_list_lock locked, so that
jbd2_log_do_checkpoint() will wait until jh's new transaction fininshed
even bh is currently not dirty. And journal_shrink_one_cp_list() won't
remove jh from checkpoint list if the buffer head is reused in
do_get_write_access().
Fetch a reproducer in [Link].
Link: https://bugzilla.kernel.org/show_bug.cgi?id=216898
Cc: <stable(a)kernel.org>
Signed-off-by: Zhihao Cheng <chengzhihao1(a)huawei.com>
Signed-off-by: zhanchengbin <zhanchengbin1(a)huawei.com>
Suggested-by: Jan Kara <jack(a)suse.cz>
Conflicts:
fs/jbd2/transaction.c
[ 464170647b5648bb8("jbd2: Make state lock a spinlock") is not
applied. ]
Signed-off-by: Zhihao Cheng <chengzhihao1(a)huawei.com>
Reviewed-by: Yang Erkun <yangerkun(a)huawei.com>
Reviewed-by: Zhang Yi <yi.zhang(a)huawei.com>
Signed-off-by: Yongqiang Liu <liuyongqiang13(a)huawei.com>
---
fs/jbd2/transaction.c | 50 +++++++++++++++++++++++++------------------
1 file changed, 29 insertions(+), 21 deletions(-)
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 3b31bd1f7b77..149190c2ac89 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -935,36 +935,28 @@ do_get_write_access(handle_t *handle, struct journal_head *jh,
* ie. locked but not dirty) or tune2fs (which may actually have
* the buffer dirtied, ugh.) */
- if (buffer_dirty(bh)) {
+ if (buffer_dirty(bh) && jh->b_transaction) {
+ warn_dirty_buffer(bh);
/*
- * First question: is this buffer already part of the current
- * transaction or the existing committing transaction?
- */
- if (jh->b_transaction) {
- J_ASSERT_JH(jh,
- jh->b_transaction == transaction ||
- jh->b_transaction ==
- journal->j_committing_transaction);
- if (jh->b_next_transaction)
- J_ASSERT_JH(jh, jh->b_next_transaction ==
- transaction);
- warn_dirty_buffer(bh);
- }
- /*
- * In any case we need to clean the dirty flag and we must
- * do it under the buffer lock to be sure we don't race
- * with running write-out.
+ * We need to clean the dirty flag and we must do it under the
+ * buffer lock to be sure we don't race with running write-out.
*/
JBUFFER_TRACE(jh, "Journalling dirty buffer");
clear_buffer_dirty(bh);
+ /*
+ * The buffer is going to be added to BJ_Reserved list now and
+ * nothing guarantees jbd2_journal_dirty_metadata() will be
+ * ever called for it. So we need to set jbddirty bit here to
+ * make sure the buffer is dirtied and written out when the
+ * journaling machinery is done with it.
+ */
set_buffer_jbddirty(bh);
}
- unlock_buffer(bh);
-
error = -EROFS;
if (is_handle_aborted(handle)) {
jbd_unlock_bh_state(bh);
+ unlock_buffer(bh);
goto out;
}
error = 0;
@@ -974,8 +966,10 @@ do_get_write_access(handle_t *handle, struct journal_head *jh,
* b_next_transaction points to it
*/
if (jh->b_transaction == transaction ||
- jh->b_next_transaction == transaction)
+ jh->b_next_transaction == transaction) {
+ unlock_buffer(bh);
goto done;
+ }
/*
* this is the first time this transaction is touching this buffer,
@@ -999,10 +993,24 @@ do_get_write_access(handle_t *handle, struct journal_head *jh,
*/
smp_wmb();
spin_lock(&journal->j_list_lock);
+ if (test_clear_buffer_dirty(bh)) {
+ /*
+ * Execute buffer dirty clearing and jh->b_transaction
+ * assignment under journal->j_list_lock locked to
+ * prevent bh being removed from checkpoint list if
+ * the buffer is in an intermediate state (not dirty
+ * and jh->b_transaction is NULL).
+ */
+ JBUFFER_TRACE(jh, "Journalling dirty buffer");
+ set_buffer_jbddirty(bh);
+ }
__jbd2_journal_file_buffer(jh, transaction, BJ_Reserved);
spin_unlock(&journal->j_list_lock);
+ unlock_buffer(bh);
goto done;
}
+ unlock_buffer(bh);
+
/*
* If there is already a copy-out version of this buffer, then we don't
* need to make another one
--
2.25.1
From: Dave Chinner <dchinner(a)redhat.com>
mainline inclusion
from mainline-v5.19-rc1
commit c230a4a85bcdbfc1a7415deec6caf04e8fca1301
category: bugfix
bugzilla: 187372, https://gitee.com/openeuler/kernel/issues/I5K0OM
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?…
--------------------------------
Ever since we added shadown format buffers to the log items, log
items need to handle the item being released with shadow buffers
attached. Due to the fact this requirement was added at the same
time we added new rmap/reflink intents, we missed the cleanup of
those items.
In theory, this means shadow buffers can be leaked in a very small
window when a shutdown is initiated. Testing with KASAN shows this
leak does not happen in practice - we haven't identified a single
leak in several years of shutdown testing since ~v4.8 kernels.
However, the intent whiteout cleanup mechanism results in every
cancelled intent in exactly the same state as this tiny race window
creates and so if intents down clean up shadow buffers on final
release we will leak the shadow buffer for just about every intent
we create.
Hence we start with this patch to close this condition off and
ensure that when whiteouts start to be used we don't leak lots of
memory.
Signed-off-by: Dave Chinner <dchinner(a)redhat.com>
Reviewed-by: Darrick J. Wong <djwong(a)kernel.org>
Reviewed-by: Allison Henderson <allison.henderson(a)oracle.com>
Signed-off-by: Dave Chinner <david(a)fromorbit.com>
conflicts:
fs/xfs/xfs_bmap_item.c
fs/xfs/xfs_icreate_item.c
fs/xfs/xfs_refcount_item.c
fs/xfs/xfs_rmap_item.c
Signed-off-by: Li Nan <linan122(a)huawei.com>
Reviewed-by: Yang Erkun <yangerkun(a)huawei.com>
Reviewed-by: Zhang Yi <yi.zhang(a)huawei.com>
Signed-off-by: Jialin Zhang <zhangjialin11(a)huawei.com>
---
fs/xfs/xfs_bmap_item.c | 2 ++
fs/xfs/xfs_icreate_item.c | 1 +
fs/xfs/xfs_refcount_item.c | 2 ++
fs/xfs/xfs_rmap_item.c | 2 ++
4 files changed, 7 insertions(+)
diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c
index 44ec0f2d5253..e6de8081451f 100644
--- a/fs/xfs/xfs_bmap_item.c
+++ b/fs/xfs/xfs_bmap_item.c
@@ -40,6 +40,7 @@ STATIC void
xfs_bui_item_free(
struct xfs_bui_log_item *buip)
{
+ kmem_free(buip->bui_item.li_lv_shadow);
kmem_cache_free(xfs_bui_zone, buip);
}
@@ -199,6 +200,7 @@ xfs_bud_item_release(
struct xfs_bud_log_item *budp = BUD_ITEM(lip);
xfs_bui_release(budp->bud_buip);
+ kmem_free(budp->bud_item.li_lv_shadow);
kmem_cache_free(xfs_bud_zone, budp);
}
diff --git a/fs/xfs/xfs_icreate_item.c b/fs/xfs/xfs_icreate_item.c
index 9b3994b9c716..aa8c7c261d24 100644
--- a/fs/xfs/xfs_icreate_item.c
+++ b/fs/xfs/xfs_icreate_item.c
@@ -63,6 +63,7 @@ STATIC void
xfs_icreate_item_release(
struct xfs_log_item *lip)
{
+ kmem_free(ICR_ITEM(lip)->ic_item.li_lv_shadow);
kmem_cache_free(xfs_icreate_zone, ICR_ITEM(lip));
}
diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c
index 0dee316283a9..9f4ff45c7a93 100644
--- a/fs/xfs/xfs_refcount_item.c
+++ b/fs/xfs/xfs_refcount_item.c
@@ -35,6 +35,7 @@ STATIC void
xfs_cui_item_free(
struct xfs_cui_log_item *cuip)
{
+ kmem_free(cuip->cui_item.li_lv_shadow);
if (cuip->cui_format.cui_nextents > XFS_CUI_MAX_FAST_EXTENTS)
kmem_free(cuip);
else
@@ -204,6 +205,7 @@ xfs_cud_item_release(
struct xfs_cud_log_item *cudp = CUD_ITEM(lip);
xfs_cui_release(cudp->cud_cuip);
+ kmem_free(cudp->cud_item.li_lv_shadow);
kmem_cache_free(xfs_cud_zone, cudp);
}
diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c
index 20905953fe76..b5447ac7cb9b 100644
--- a/fs/xfs/xfs_rmap_item.c
+++ b/fs/xfs/xfs_rmap_item.c
@@ -35,6 +35,7 @@ STATIC void
xfs_rui_item_free(
struct xfs_rui_log_item *ruip)
{
+ kmem_free(ruip->rui_item.li_lv_shadow);
if (ruip->rui_format.rui_nextents > XFS_RUI_MAX_FAST_EXTENTS)
kmem_free(ruip);
else
@@ -227,6 +228,7 @@ xfs_rud_item_release(
struct xfs_rud_log_item *rudp = RUD_ITEM(lip);
xfs_rui_release(rudp->rud_ruip);
+ kmem_free(rudp->rud_item.li_lv_shadow);
kmem_cache_free(xfs_rud_zone, rudp);
}
--
2.25.1
From: Dave Chinner <dchinner(a)redhat.com>
mainline inclusion
from mainline-v5.19-rc1
commit c230a4a85bcdbfc1a7415deec6caf04e8fca1301
category: bugfix
bugzilla: 187372, https://gitee.com/openeuler/kernel/issues/I5K0OM
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?…
--------------------------------
Ever since we added shadown format buffers to the log items, log
items need to handle the item being released with shadow buffers
attached. Due to the fact this requirement was added at the same
time we added new rmap/reflink intents, we missed the cleanup of
those items.
In theory, this means shadow buffers can be leaked in a very small
window when a shutdown is initiated. Testing with KASAN shows this
leak does not happen in practice - we haven't identified a single
leak in several years of shutdown testing since ~v4.8 kernels.
However, the intent whiteout cleanup mechanism results in every
cancelled intent in exactly the same state as this tiny race window
creates and so if intents down clean up shadow buffers on final
release we will leak the shadow buffer for just about every intent
we create.
Hence we start with this patch to close this condition off and
ensure that when whiteouts start to be used we don't leak lots of
memory.
Signed-off-by: Dave Chinner <dchinner(a)redhat.com>
Reviewed-by: Darrick J. Wong <djwong(a)kernel.org>
Reviewed-by: Allison Henderson <allison.henderson(a)oracle.com>
Signed-off-by: Dave Chinner <david(a)fromorbit.com>
conflicts:
fs/xfs/xfs_bmap_item.c
fs/xfs/xfs_icreate_item.c
fs/xfs/xfs_refcount_item.c
fs/xfs/xfs_rmap_item.c
Signed-off-by: Li Nan <linan122(a)huawei.com>
Reviewed-by: Yang Erkun <yangerkun(a)huawei.com>
Reviewed-by: Zhang Yi <yi.zhang(a)huawei.com>
Signed-off-by: Jialin Zhang <zhangjialin11(a)huawei.com>
---
fs/xfs/xfs_bmap_item.c | 2 ++
fs/xfs/xfs_icreate_item.c | 1 +
fs/xfs/xfs_refcount_item.c | 2 ++
fs/xfs/xfs_rmap_item.c | 2 ++
4 files changed, 7 insertions(+)
diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c
index 44ec0f2d5253..e6de8081451f 100644
--- a/fs/xfs/xfs_bmap_item.c
+++ b/fs/xfs/xfs_bmap_item.c
@@ -40,6 +40,7 @@ STATIC void
xfs_bui_item_free(
struct xfs_bui_log_item *buip)
{
+ kmem_free(buip->bui_item.li_lv_shadow);
kmem_cache_free(xfs_bui_zone, buip);
}
@@ -199,6 +200,7 @@ xfs_bud_item_release(
struct xfs_bud_log_item *budp = BUD_ITEM(lip);
xfs_bui_release(budp->bud_buip);
+ kmem_free(budp->bud_item.li_lv_shadow);
kmem_cache_free(xfs_bud_zone, budp);
}
diff --git a/fs/xfs/xfs_icreate_item.c b/fs/xfs/xfs_icreate_item.c
index 9b3994b9c716..aa8c7c261d24 100644
--- a/fs/xfs/xfs_icreate_item.c
+++ b/fs/xfs/xfs_icreate_item.c
@@ -63,6 +63,7 @@ STATIC void
xfs_icreate_item_release(
struct xfs_log_item *lip)
{
+ kmem_free(ICR_ITEM(lip)->ic_item.li_lv_shadow);
kmem_cache_free(xfs_icreate_zone, ICR_ITEM(lip));
}
diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c
index 0dee316283a9..9f4ff45c7a93 100644
--- a/fs/xfs/xfs_refcount_item.c
+++ b/fs/xfs/xfs_refcount_item.c
@@ -35,6 +35,7 @@ STATIC void
xfs_cui_item_free(
struct xfs_cui_log_item *cuip)
{
+ kmem_free(cuip->cui_item.li_lv_shadow);
if (cuip->cui_format.cui_nextents > XFS_CUI_MAX_FAST_EXTENTS)
kmem_free(cuip);
else
@@ -204,6 +205,7 @@ xfs_cud_item_release(
struct xfs_cud_log_item *cudp = CUD_ITEM(lip);
xfs_cui_release(cudp->cud_cuip);
+ kmem_free(cudp->cud_item.li_lv_shadow);
kmem_cache_free(xfs_cud_zone, cudp);
}
diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c
index 20905953fe76..b5447ac7cb9b 100644
--- a/fs/xfs/xfs_rmap_item.c
+++ b/fs/xfs/xfs_rmap_item.c
@@ -35,6 +35,7 @@ STATIC void
xfs_rui_item_free(
struct xfs_rui_log_item *ruip)
{
+ kmem_free(ruip->rui_item.li_lv_shadow);
if (ruip->rui_format.rui_nextents > XFS_RUI_MAX_FAST_EXTENTS)
kmem_free(ruip);
else
@@ -227,6 +228,7 @@ xfs_rud_item_release(
struct xfs_rud_log_item *rudp = RUD_ITEM(lip);
xfs_rui_release(rudp->rud_ruip);
+ kmem_free(rudp->rud_item.li_lv_shadow);
kmem_cache_free(xfs_rud_zone, rudp);
}
--
2.25.1