From: Zhang Yi yi.zhang@huawei.com
hulk inclusion category: bugfix bugzilla: 50788 CVE: NA ---------------------------
This reverts commit 30a8fc55ac1ba2cf3a8456788ce4ff1593853738. Prepare to backport 235d68069cbd ("jbd2: don't abort the journal when freeing buffers")
Signed-off-by: Zhang Yi yi.zhang@huawei.com Reviewed-by: Yang Erkun yangerkun@huawei.com Reviewed-by: Yang Erkun yangerkun@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- fs/jbd2/transaction.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+)
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 4055929a043cf..ffa6d3530f4bd 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -2085,6 +2085,7 @@ int jbd2_journal_try_to_free_buffers(journal_t *journal, { struct buffer_head *head; struct buffer_head *bh; + bool has_write_io_error = false; int ret = 0;
J_ASSERT(PageLocked(page)); @@ -2109,10 +2110,26 @@ int jbd2_journal_try_to_free_buffers(journal_t *journal, jbd_unlock_bh_state(bh); if (buffer_jbd(bh)) goto busy; + + /* + * If we free a metadata buffer which has been failed to + * write out, the jbd2 checkpoint procedure will not detect + * this failure and may lead to filesystem inconsistency + * after cleanup journal tail. + */ + if (buffer_write_io_error(bh)) { + pr_err("JBD2: Error while async write back metadata bh %llu.", + (unsigned long long)bh->b_blocknr); + has_write_io_error = true; + } } while ((bh = bh->b_this_page) != head);
ret = try_to_free_buffers(page); + busy: + if (has_write_io_error) + jbd2_journal_abort(journal, -EIO); + return ret; }
From: Zhang Yi yi.zhang@huawei.com
hulk inclusion category: bugfix bugzilla: 50788 CVE: NA ---------------------------
This reverts commit 0f9e8e1cd7ac08d79428655945c6cf195e7016a3. Prepare to backpart fcf37549ae19 ("jbd2: ensure abort the journal if detect IO error when writing original buffer back").
Signed-off-by: Zhang Yi yi.zhang@huawei.com Reviewed-by: Yang Erkun yangerkun@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- fs/jbd2/checkpoint.c | 12 ------------ fs/jbd2/journal.c | 14 -------------- include/linux/jbd2.h | 11 ----------- 3 files changed, 37 deletions(-)
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index b1af15ad36dcb..61de87fbf5ec3 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -562,7 +562,6 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh) struct transaction_chp_stats_s *stats; transaction_t *transaction; journal_t *journal; - struct buffer_head *bh = jh2bh(jh);
JBUFFER_TRACE(jh, "entry");
@@ -574,17 +573,6 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh) journal = transaction->t_journal;
JBUFFER_TRACE(jh, "removing from transaction"); - - /* - * If we have failed to write the buffer out to disk, the filesystem - * may become inconsistent. We cannot abort the journal here since - * we hold j_list_lock and we have to careful about races with - * jbd2_journal_destroy(). So mark the writeback IO error in the - * journal here and we abort the journal later from a better context. - */ - if (buffer_write_io_error(bh)) - set_bit(JBD2_CHECKPOINT_IO_ERROR, &journal->j_atomic_flags); - __buffer_unlink(jh); jh->b_cp_transaction = NULL; jbd2_journal_put_journal_head(jh); diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 89fad4c3e13cb..f1bc1c82c4802 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -1429,10 +1429,6 @@ int jbd2_journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid,
if (is_journal_aborted(journal)) return -EIO; - if (test_bit(JBD2_CHECKPOINT_IO_ERROR, &journal->j_atomic_flags)) { - jbd2_journal_abort(journal, -EIO); - return -EIO; - }
BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex)); jbd_debug(1, "JBD2: updating superblock (start %lu, seq %u)\n", @@ -1788,16 +1784,6 @@ int jbd2_journal_destroy(journal_t *journal) J_ASSERT(journal->j_checkpoint_transactions == NULL); spin_unlock(&journal->j_list_lock);
- /* - * OK, all checkpoint transactions have been checked, now check the - * write out io error flag and abort the journal if some buffer failed - * to write back to the original location, otherwise the filesystem - * may become inconsistent. - */ - if (!is_journal_aborted(journal) && - test_bit(JBD2_CHECKPOINT_IO_ERROR, &journal->j_atomic_flags)) - jbd2_journal_abort(journal, -EIO); - if (journal->j_sb_buffer) { if (!is_journal_aborted(journal)) { mutex_lock_io(&journal->j_checkpoint_mutex); diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 5c0446f22bee1..667fce234ac90 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -780,11 +780,6 @@ struct journal_s */ unsigned long j_flags;
- /** - * @j_atomic_flags: Atomic journaling state flags. - */ - unsigned long j_atomic_flags; - /** * @j_errno: * @@ -1281,12 +1276,6 @@ JBD2_FEATURE_INCOMPAT_FUNCS(csum3, CSUM_V3) * data write error in ordered * mode */
-/* - * Journal atomic flag definitions - */ -#define JBD2_CHECKPOINT_IO_ERROR 0x001 /* Detect io error while writing - * buffer back to disk */ - /* * Function declarations for the journaling transaction and buffer * management
From: Zhang Yi yi.zhang@huawei.com
hulk inclusion category: bugfix bugzilla: 50788 CVE: NA ---------------------------
This reverts commit 7348101fc5fb2e6218700c6349a4d92666f8815f. Prepare to backport 1866cba84243 ("jbd2: remove the out label in __jbd2_journal_remove_checkpoint()").
Signed-off-by: Zhang Yi yi.zhang@huawei.com Reviewed-by: Yang Erkun yangerkun@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- fs/jbd2/checkpoint.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-)
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 61de87fbf5ec3..96bf33986d030 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -562,13 +562,13 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh) struct transaction_chp_stats_s *stats; transaction_t *transaction; journal_t *journal; + int ret = 0;
JBUFFER_TRACE(jh, "entry");
- transaction = jh->b_cp_transaction; - if (!transaction) { + if ((transaction = jh->b_cp_transaction) == NULL) { JBUFFER_TRACE(jh, "not on transaction"); - return 0; + goto out; } journal = transaction->t_journal;
@@ -577,9 +577,9 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh) jh->b_cp_transaction = NULL; jbd2_journal_put_journal_head(jh);
- /* Is this transaction empty? */ - if (transaction->t_checkpoint_list || transaction->t_checkpoint_io_list) - return 0; + if (transaction->t_checkpoint_list != NULL || + transaction->t_checkpoint_io_list != NULL) + goto out;
/* * There is one special case to worry about: if we have just pulled the @@ -591,12 +591,10 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh) * See the comment at the end of jbd2_journal_commit_transaction(). */ if (transaction->t_state != T_FINISHED) - return 0; + goto out;
- /* - * OK, that was the last buffer for the transaction, we can now - * safely remove this transaction from the log. - */ + /* OK, that was the last buffer for the transaction: we can now + safely remove this transaction from the log */ stats = &transaction->t_chp_stats; if (stats->cs_chp_time) stats->cs_chp_time = jbd2_time_diff(stats->cs_chp_time, @@ -606,7 +604,9 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh)
__jbd2_journal_drop_transaction(journal, transaction); jbd2_journal_free_transaction(transaction); - return 1; + ret = 1; +out: + return ret; }
/*
From: Yang Shi shy828301@gmail.com
mainline inclusion from mainline-5.13-rc1 commit 41ca668a71e7b03743369a2c6d8b8edc1e943dc8 category: bugfix bugzilla: 50788 CVE: NA ---------------------------
Currently registered shrinker is indicated by non-NULL shrinker->nr_deferred. This approach is fine with nr_deferred at the shrinker level, but the following patches will move MEMCG_AWARE shrinkers' nr_deferred to memcg level, so their shrinker->nr_deferred would always be NULL. This would prevent the shrinkers from unregistering correctly.
Remove SHRINKER_REGISTERING since we could check if shrinker is registered successfully by the new flag.
Link: https://lkml.kernel.org/r/20210311190845.9708-9-shy828301@gmail.com Signed-off-by: Yang Shi shy828301@gmail.com Acked-by: Kirill Tkhai ktkhai@virtuozzo.com Acked-by: Vlastimil Babka vbabka@suse.cz Acked-by: Roman Gushchin guro@fb.com Reviewed-by: Shakeel Butt shakeelb@google.com Cc: Dave Chinner david@fromorbit.com Cc: Johannes Weiner hannes@cmpxchg.org Cc: Michal Hocko mhocko@suse.com Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org
Conflict: mm/vmscan.c
Signed-off-by: Zhang Yi yi.zhang@huawei.com Reviewed-by: Yang Erkun yangerkun@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- include/linux/shrinker.h | 7 ++++--- mm/vmscan.c | 39 +++++++++++++++------------------------ 2 files changed, 19 insertions(+), 27 deletions(-)
diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h index 0f80123650e23..1eac79ce57d44 100644 --- a/include/linux/shrinker.h +++ b/include/linux/shrinker.h @@ -79,13 +79,14 @@ struct shrinker { #define DEFAULT_SEEKS 2 /* A good number if you don't know better. */
/* Flags */ -#define SHRINKER_NUMA_AWARE (1 << 0) -#define SHRINKER_MEMCG_AWARE (1 << 1) +#define SHRINKER_REGISTERED (1 << 0) +#define SHRINKER_NUMA_AWARE (1 << 1) +#define SHRINKER_MEMCG_AWARE (1 << 2) /* * It just makes sense when the shrinker is also MEMCG_AWARE for now, * non-MEMCG_AWARE shrinker should not have this flag set. */ -#define SHRINKER_NONSLAB (1 << 2) +#define SHRINKER_NONSLAB (1 << 3)
extern int prealloc_shrinker(struct shrinker *shrinker); extern void register_shrinker_prepared(struct shrinker *shrinker); diff --git a/mm/vmscan.c b/mm/vmscan.c index bedea8b4024a8..5de64cb7e3188 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -188,18 +188,6 @@ static LIST_HEAD(shrinker_list); static DECLARE_RWSEM(shrinker_rwsem);
#ifdef CONFIG_MEMCG -/* - * We allow subsystems to populate their shrinker-related - * LRU lists before register_shrinker_prepared() is called - * for the shrinker, since we don't want to impose - * restrictions on their internal registration order. - * In this case shrink_slab_memcg() may find corresponding - * bit is set in the shrinkers map. - * - * This value is used by the function to detect registering - * shrinkers and to skip do_shrink_slab() calls for them. - */ -#define SHRINKER_REGISTERING ((struct shrinker *)~0UL)
static DEFINE_IDR(shrinker_idr); static int shrinker_nr_max; @@ -210,7 +198,7 @@ static int prealloc_memcg_shrinker(struct shrinker *shrinker)
down_write(&shrinker_rwsem); /* This may call shrinker, so it must use down_read_trylock() */ - id = idr_alloc(&shrinker_idr, SHRINKER_REGISTERING, 0, 0, GFP_KERNEL); + id = idr_alloc(&shrinker_idr, shrinker, 0, 0, GFP_KERNEL); if (id < 0) goto unlock;
@@ -235,9 +223,9 @@ static void unregister_memcg_shrinker(struct shrinker *shrinker)
BUG_ON(id < 0);
- down_write(&shrinker_rwsem); + lockdep_assert_held(&shrinker_rwsem); + idr_remove(&shrinker_idr, id); - up_write(&shrinker_rwsem); }
static bool global_reclaim(struct scan_control *sc) @@ -412,8 +400,11 @@ void free_prealloced_shrinker(struct shrinker *shrinker) if (!shrinker->nr_deferred) return;
- if (shrinker->flags & SHRINKER_MEMCG_AWARE) + if (shrinker->flags & SHRINKER_MEMCG_AWARE) { + down_write(&shrinker_rwsem); unregister_memcg_shrinker(shrinker); + up_write(&shrinker_rwsem); + }
kfree(shrinker->nr_deferred); shrinker->nr_deferred = NULL; @@ -423,10 +414,7 @@ void register_shrinker_prepared(struct shrinker *shrinker) { down_write(&shrinker_rwsem); list_add_tail(&shrinker->list, &shrinker_list); -#ifdef CONFIG_MEMCG - if (shrinker->flags & SHRINKER_MEMCG_AWARE) - idr_replace(&shrinker_idr, shrinker, shrinker->id); -#endif + shrinker->flags |= SHRINKER_REGISTERED; up_write(&shrinker_rwsem); }
@@ -446,13 +434,16 @@ EXPORT_SYMBOL(register_shrinker); */ void unregister_shrinker(struct shrinker *shrinker) { - if (!shrinker->nr_deferred) + if (!(shrinker->flags & SHRINKER_REGISTERED)) return; - if (shrinker->flags & SHRINKER_MEMCG_AWARE) - unregister_memcg_shrinker(shrinker); + down_write(&shrinker_rwsem); list_del(&shrinker->list); + shrinker->flags &= ~SHRINKER_REGISTERED; + if (shrinker->flags & SHRINKER_MEMCG_AWARE) + unregister_memcg_shrinker(shrinker); up_write(&shrinker_rwsem); + kfree(shrinker->nr_deferred); shrinker->nr_deferred = NULL; } @@ -609,7 +600,7 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, struct shrinker *shrinker;
shrinker = idr_find(&shrinker_idr, i); - if (unlikely(!shrinker || shrinker == SHRINKER_REGISTERING)) { + if (unlikely(!shrinker || !(shrinker->flags & SHRINKER_REGISTERED))) { if (!shrinker) clear_bit(i, map->map); continue;
From: Zhang Yi yi.zhang@huawei.com
mainline inclusion from mainline-5.14-rc1 commit 1866cba842437f3e7a5a8ee5b558744d9ae844d0 category: bugfix bugzilla: 50788 CVE: NA ---------------------------
The 'out' lable just return the 'ret' value and seems not required, so remove this label and switch to return appropriate value immediately. This patch also do some minor cleanup, no logical change.
Signed-off-by: Zhang Yi yi.zhang@huawei.com Reviewed-by: Jan Kara jack@suse.cz Link: https://lore.kernel.org/r/20210610112440.3438139-2-yi.zhang@huawei.com Signed-off-by: Theodore Ts'o tytso@mit.edu Reviewed-by: Yang Erkun yangerkun@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- fs/jbd2/checkpoint.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-)
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 96bf33986d030..61de87fbf5ec3 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -562,13 +562,13 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh) struct transaction_chp_stats_s *stats; transaction_t *transaction; journal_t *journal; - int ret = 0;
JBUFFER_TRACE(jh, "entry");
- if ((transaction = jh->b_cp_transaction) == NULL) { + transaction = jh->b_cp_transaction; + if (!transaction) { JBUFFER_TRACE(jh, "not on transaction"); - goto out; + return 0; } journal = transaction->t_journal;
@@ -577,9 +577,9 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh) jh->b_cp_transaction = NULL; jbd2_journal_put_journal_head(jh);
- if (transaction->t_checkpoint_list != NULL || - transaction->t_checkpoint_io_list != NULL) - goto out; + /* Is this transaction empty? */ + if (transaction->t_checkpoint_list || transaction->t_checkpoint_io_list) + return 0;
/* * There is one special case to worry about: if we have just pulled the @@ -591,10 +591,12 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh) * See the comment at the end of jbd2_journal_commit_transaction(). */ if (transaction->t_state != T_FINISHED) - goto out; + return 0;
- /* OK, that was the last buffer for the transaction: we can now - safely remove this transaction from the log */ + /* + * OK, that was the last buffer for the transaction, we can now + * safely remove this transaction from the log. + */ stats = &transaction->t_chp_stats; if (stats->cs_chp_time) stats->cs_chp_time = jbd2_time_diff(stats->cs_chp_time, @@ -604,9 +606,7 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh)
__jbd2_journal_drop_transaction(journal, transaction); jbd2_journal_free_transaction(transaction); - ret = 1; -out: - return ret; + return 1; }
/*
From: Zhang Yi yi.zhang@huawei.com
mainline inclusion from mainline-5.14-rc1 commit fcf37549ae19e904bc6a5eadf5c25eca36100c5e category: bugfix bugzilla: 50788 CVE: NA ---------------------------
Although we merged c044f3d8360 ("jbd2: abort journal if free a async write error metadata buffer"), there is a race between jbd2_journal_try_to_free_buffers() and jbd2_journal_destroy(), so the jbd2_log_do_checkpoint() may still fail to detect the buffer write io error flag which may lead to filesystem inconsistency.
jbd2_journal_try_to_free_buffers() ext4_put_super() jbd2_journal_destroy() __jbd2_journal_remove_checkpoint() detect buffer write error jbd2_log_do_checkpoint() jbd2_cleanup_journal_tail() <--- lead to inconsistency jbd2_journal_abort()
Fix this issue by introducing a new atomic flag which only have one JBD2_CHECKPOINT_IO_ERROR bit now, and set it in __jbd2_journal_remove_checkpoint() when freeing a checkpoint buffer which has write_io_error flag. Then jbd2_journal_destroy() will detect this mark and abort the journal to prevent updating log tail.
Signed-off-by: Zhang Yi yi.zhang@huawei.com Reviewed-by: Jan Kara jack@suse.cz Link: https://lore.kernel.org/r/20210610112440.3438139-3-yi.zhang@huawei.com Signed-off-by: Theodore Ts'o tytso@mit.edu Reviewed-by: Yang Erkun yangerkun@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- fs/jbd2/checkpoint.c | 12 ++++++++++++ fs/jbd2/journal.c | 14 ++++++++++++++ include/linux/jbd2.h | 11 +++++++++++ 3 files changed, 37 insertions(+)
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 61de87fbf5ec3..80c5fcfa94f6d 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -562,6 +562,7 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh) struct transaction_chp_stats_s *stats; transaction_t *transaction; journal_t *journal; + struct buffer_head *bh = jh2bh(jh);
JBUFFER_TRACE(jh, "entry");
@@ -573,6 +574,17 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh) journal = transaction->t_journal;
JBUFFER_TRACE(jh, "removing from transaction"); + + /* + * If we have failed to write the buffer out to disk, the filesystem + * may become inconsistent. We cannot abort the journal here since + * we hold j_list_lock and we have to be careful about races with + * jbd2_journal_destroy(). So mark the writeback IO error in the + * journal here and we abort the journal later from a better context. + */ + if (buffer_write_io_error(bh)) + set_bit(JBD2_CHECKPOINT_IO_ERROR, &journal->j_atomic_flags); + __buffer_unlink(jh); jh->b_cp_transaction = NULL; jbd2_journal_put_journal_head(jh); diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index f1bc1c82c4802..89fad4c3e13cb 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -1429,6 +1429,10 @@ int jbd2_journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid,
if (is_journal_aborted(journal)) return -EIO; + if (test_bit(JBD2_CHECKPOINT_IO_ERROR, &journal->j_atomic_flags)) { + jbd2_journal_abort(journal, -EIO); + return -EIO; + }
BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex)); jbd_debug(1, "JBD2: updating superblock (start %lu, seq %u)\n", @@ -1784,6 +1788,16 @@ int jbd2_journal_destroy(journal_t *journal) J_ASSERT(journal->j_checkpoint_transactions == NULL); spin_unlock(&journal->j_list_lock);
+ /* + * OK, all checkpoint transactions have been checked, now check the + * write out io error flag and abort the journal if some buffer failed + * to write back to the original location, otherwise the filesystem + * may become inconsistent. + */ + if (!is_journal_aborted(journal) && + test_bit(JBD2_CHECKPOINT_IO_ERROR, &journal->j_atomic_flags)) + jbd2_journal_abort(journal, -EIO); + if (journal->j_sb_buffer) { if (!is_journal_aborted(journal)) { mutex_lock_io(&journal->j_checkpoint_mutex); diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 667fce234ac90..5c0446f22bee1 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -780,6 +780,11 @@ struct journal_s */ unsigned long j_flags;
+ /** + * @j_atomic_flags: Atomic journaling state flags. + */ + unsigned long j_atomic_flags; + /** * @j_errno: * @@ -1276,6 +1281,12 @@ JBD2_FEATURE_INCOMPAT_FUNCS(csum3, CSUM_V3) * data write error in ordered * mode */
+/* + * Journal atomic flag definitions + */ +#define JBD2_CHECKPOINT_IO_ERROR 0x001 /* Detect io error while writing + * buffer back to disk */ + /* * Function declarations for the journaling transaction and buffer * management
From: Zhang Yi yi.zhang@huawei.com
mainline inclusion from mainline-5.14-rc1 commit 235d68069cbd158cb00835d434e9e9accf9a6dd4 category: bugfix bugzilla: 50788 CVE: NA ---------------------------
Now that we can be sure the journal is aborted once a buffer has failed to be written back to disk, we can remove the journal abort logic in jbd2_journal_try_to_free_buffers() which was introduced in commit c044f3d8360d ("jbd2: abort journal if free a async write error metadata buffer"), because it may cost and propably is not safe.
Signed-off-by: Zhang Yi yi.zhang@huawei.com Reviewed-by: Jan Kara jack@suse.cz Link: https://lore.kernel.org/r/20210610112440.3438139-4-yi.zhang@huawei.com Signed-off-by: Theodore Ts'o tytso@mit.edu Reviewed-by: Yang Erkun yangerkun@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- fs/jbd2/transaction.c | 17 ----------------- 1 file changed, 17 deletions(-)
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index ffa6d3530f4bd..4055929a043cf 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -2085,7 +2085,6 @@ int jbd2_journal_try_to_free_buffers(journal_t *journal, { struct buffer_head *head; struct buffer_head *bh; - bool has_write_io_error = false; int ret = 0;
J_ASSERT(PageLocked(page)); @@ -2110,26 +2109,10 @@ int jbd2_journal_try_to_free_buffers(journal_t *journal, jbd_unlock_bh_state(bh); if (buffer_jbd(bh)) goto busy; - - /* - * If we free a metadata buffer which has been failed to - * write out, the jbd2 checkpoint procedure will not detect - * this failure and may lead to filesystem inconsistency - * after cleanup journal tail. - */ - if (buffer_write_io_error(bh)) { - pr_err("JBD2: Error while async write back metadata bh %llu.", - (unsigned long long)bh->b_blocknr); - has_write_io_error = true; - } } while ((bh = bh->b_this_page) != head);
ret = try_to_free_buffers(page); - busy: - if (has_write_io_error) - jbd2_journal_abort(journal, -EIO); - return ret; }
From: Zhang Yi yi.zhang@huawei.com
mainline inclusion from mainline-5.14-rc1 commit 214eb5a4d8a2032fb9f0711d1b202eb88ee02920 category: bugfix bugzilla: 50788 CVE: NA ---------------------------
Now that __jbd2_journal_remove_checkpoint() can detect buffer io error and mark journal checkpoint error, then we abort the journal later before updating log tail to ensure the filesystem works consistently. So we could remove other redundant buffer io error checkes.
Signed-off-by: Zhang Yi yi.zhang@huawei.com Reviewed-by: Jan Kara jack@suse.cz Link: https://lore.kernel.org/r/20210610112440.3438139-5-yi.zhang@huawei.com Signed-off-by: Theodore Ts'o tytso@mit.edu Reviewed-by: Yang Erkun yangerkun@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- fs/jbd2/checkpoint.c | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-)
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 80c5fcfa94f6d..6198e17d8b62c 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -91,8 +91,7 @@ static int __try_to_free_cp_buf(struct journal_head *jh) int ret = 0; struct buffer_head *bh = jh2bh(jh);
- if (jh->b_transaction == NULL && !buffer_locked(bh) && - !buffer_dirty(bh) && !buffer_write_io_error(bh)) { + if (!jh->b_transaction && !buffer_locked(bh) && !buffer_dirty(bh)) { JBUFFER_TRACE(jh, "remove from checkpoint list"); ret = __jbd2_journal_remove_checkpoint(jh) + 1; } @@ -226,7 +225,6 @@ int jbd2_log_do_checkpoint(journal_t *journal) * OK, we need to start writing disk blocks. Take one transaction * and write it. */ - result = 0; spin_lock(&journal->j_list_lock); if (!journal->j_checkpoint_transactions) goto out; @@ -293,8 +291,6 @@ int jbd2_log_do_checkpoint(journal_t *journal) goto restart; } if (!buffer_dirty(bh)) { - if (unlikely(buffer_write_io_error(bh)) && !result) - result = -EIO; BUFFER_TRACE(bh, "remove from checkpoint"); if (__jbd2_journal_remove_checkpoint(jh)) /* The transaction was released; we're done */ @@ -354,8 +350,6 @@ int jbd2_log_do_checkpoint(journal_t *journal) spin_lock(&journal->j_list_lock); goto restart2; } - if (unlikely(buffer_write_io_error(bh)) && !result) - result = -EIO;
/* * Now in whatever state the buffer currently is, we @@ -367,10 +361,7 @@ int jbd2_log_do_checkpoint(journal_t *journal) } out: spin_unlock(&journal->j_list_lock); - if (result < 0) - jbd2_journal_abort(journal, result); - else - result = jbd2_cleanup_journal_tail(journal); + result = jbd2_cleanup_journal_tail(journal);
return (result < 0) ? result : 0; }
From: Zhang Yi yi.zhang@huawei.com
mainline inclusion from mainline-5.14-rc1 commit 4ba3fcdde7e36af93610ceb3cc38365b14539865 category: bugfix bugzilla: 50788 CVE: NA ---------------------------
Current metadata buffer release logic in bdev_try_to_free_page() have a lot of use-after-free issues when umount filesystem concurrently, and it is difficult to fix directly because ext4 is the only user of s_op->bdev_try_to_free_page callback and we may have to add more special refcount or lock that is only used by ext4 into the common vfs layer, which is unacceptable.
One better solution is remove the bdev_try_to_free_page callback, but the real problem is we cannot easily release journal_head on the checkpointed buffer, so try_to_free_buffers() cannot release buffers and page under memory pressure, which is more likely to trigger out-of-memory. So we cannot remove the callback directly before we find another way to release journal_head.
This patch introduce a shrinker to free journal_head on the checkpointed transaction. After the journal_head got freed, try_to_free_buffers() could free buffer properly.
Signed-off-by: Zhang Yi yi.zhang@huawei.com Suggested-by: Jan Kara jack@suse.cz Reviewed-by: Jan Kara jack@suse.cz Link: https://lore.kernel.org/r/20210610112440.3438139-6-yi.zhang@huawei.com Signed-off-by: Theodore Ts'o tytso@mit.edu
Conflict: fs/jbd2/journal.c
Signed-off-by: Zhang Yi yi.zhang@huawei.com Reviewed-by: Yang Erkun yangerkun@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- fs/ext4/super.c | 8 ++ fs/jbd2/checkpoint.c | 147 ++++++++++++++++++++++++++++++++++++ fs/jbd2/journal.c | 87 +++++++++++++++++++++ include/linux/jbd2.h | 26 +++++++ include/trace/events/jbd2.h | 101 +++++++++++++++++++++++++ 5 files changed, 369 insertions(+)
diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 53490fc88446a..8ee9ffdd5417e 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1161,6 +1161,7 @@ static void ext4_put_super(struct super_block *sb) destroy_workqueue(sbi->rsv_conversion_wq);
if (sbi->s_journal) { + jbd2_journal_unregister_shrinker(sbi->s_journal); aborted = is_journal_aborted(sbi->s_journal); err = jbd2_journal_destroy(sbi->s_journal); sbi->s_journal = NULL; @@ -4853,6 +4854,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_ea_block_cache = NULL; } if (sbi->s_journal) { + jbd2_journal_unregister_shrinker(sbi->s_journal); jbd2_journal_destroy(sbi->s_journal); sbi->s_journal = NULL; } @@ -5171,6 +5173,12 @@ static int ext4_load_journal(struct super_block *sb, ext4_commit_super(sb); }
+ err = jbd2_journal_register_shrinker(journal); + if (err) { + EXT4_SB(sb)->s_journal = NULL; + goto err_out; + } + return 0;
err_out: diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 6198e17d8b62c..24cf40fd34d14 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -79,6 +79,18 @@ static inline void __buffer_relink_io(struct journal_head *jh) transaction->t_checkpoint_io_list = jh; }
+/* + * Check a checkpoint buffer could be release or not. + * + * Requires j_list_lock + */ +static inline bool __cp_buffer_busy(struct journal_head *jh) +{ + struct buffer_head *bh = jh2bh(jh); + + return (jh->b_transaction || buffer_locked(bh) || buffer_dirty(bh)); +} + /* * Try to release a checkpointed buffer from its transaction. * Returns 1 if we released it and 2 if we also released the @@ -456,6 +468,137 @@ static int journal_clean_one_cp_list(struct journal_head *jh, bool destroy) return 0; }
+/* + * journal_shrink_one_cp_list + * + * Find 'nr_to_scan' written-back checkpoint buffers in the given list + * and try to release them. If the whole transaction is released, set + * the 'released' parameter. Return the number of released checkpointed + * buffers. + * + * Called with j_list_lock held. + */ +static unsigned long journal_shrink_one_cp_list(struct journal_head *jh, + unsigned long *nr_to_scan, + bool *released) +{ + struct journal_head *last_jh; + struct journal_head *next_jh = jh; + unsigned long nr_freed = 0; + int ret; + + if (!jh || *nr_to_scan == 0) + return 0; + + last_jh = jh->b_cpprev; + do { + jh = next_jh; + next_jh = jh->b_cpnext; + + (*nr_to_scan)--; + if (__cp_buffer_busy(jh)) + continue; + + nr_freed++; + ret = __jbd2_journal_remove_checkpoint(jh); + if (ret) { + *released = true; + break; + } + + if (need_resched()) + break; + } while (jh != last_jh && *nr_to_scan); + + return nr_freed; +} + +/* + * jbd2_journal_shrink_checkpoint_list + * + * Find 'nr_to_scan' written-back checkpoint buffers in the journal + * and try to release them. Return the number of released checkpointed + * buffers. + * + * Called with j_list_lock held. + */ +unsigned long jbd2_journal_shrink_checkpoint_list(journal_t *journal, + unsigned long *nr_to_scan) +{ + transaction_t *transaction, *last_transaction, *next_transaction; + bool released; + tid_t first_tid = 0, last_tid = 0, next_tid = 0; + tid_t tid = 0; + unsigned long nr_freed = 0; + unsigned long nr_scanned = *nr_to_scan; + +again: + spin_lock(&journal->j_list_lock); + if (!journal->j_checkpoint_transactions) { + spin_unlock(&journal->j_list_lock); + goto out; + } + + /* + * Get next shrink transaction, resume previous scan or start + * over again. If some others do checkpoint and drop transaction + * from the checkpoint list, we ignore saved j_shrink_transaction + * and start over unconditionally. + */ + if (journal->j_shrink_transaction) + transaction = journal->j_shrink_transaction; + else + transaction = journal->j_checkpoint_transactions; + + if (!first_tid) + first_tid = transaction->t_tid; + last_transaction = journal->j_checkpoint_transactions->t_cpprev; + next_transaction = transaction; + last_tid = last_transaction->t_tid; + do { + transaction = next_transaction; + next_transaction = transaction->t_cpnext; + tid = transaction->t_tid; + released = false; + + nr_freed += journal_shrink_one_cp_list(transaction->t_checkpoint_list, + nr_to_scan, &released); + if (*nr_to_scan == 0) + break; + if (need_resched() || spin_needbreak(&journal->j_list_lock)) + break; + if (released) + continue; + + nr_freed += journal_shrink_one_cp_list(transaction->t_checkpoint_io_list, + nr_to_scan, &released); + if (*nr_to_scan == 0) + break; + if (need_resched() || spin_needbreak(&journal->j_list_lock)) + break; + } while (transaction != last_transaction); + + if (transaction != last_transaction) { + journal->j_shrink_transaction = next_transaction; + next_tid = next_transaction->t_tid; + } else { + journal->j_shrink_transaction = NULL; + next_tid = 0; + } + + spin_unlock(&journal->j_list_lock); + cond_resched(); + + if (*nr_to_scan && next_tid) + goto again; +out: + nr_scanned -= *nr_to_scan; + trace_jbd2_shrink_checkpoint_list(journal, first_tid, tid, last_tid, + nr_freed, nr_scanned, next_tid); + + return nr_freed; +} + /* * journal_clean_checkpoint_list * @@ -578,6 +721,7 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh)
__buffer_unlink(jh); jh->b_cp_transaction = NULL; + percpu_counter_dec(&journal->j_jh_shrink_count); jbd2_journal_put_journal_head(jh);
/* Is this transaction empty? */ @@ -640,6 +784,7 @@ void __jbd2_journal_insert_checkpoint(struct journal_head *jh, jh->b_cpnext->b_cpprev = jh; } transaction->t_checkpoint_list = jh; + percpu_counter_inc(&transaction->t_journal->j_jh_shrink_count); }
/* @@ -655,6 +800,8 @@ void __jbd2_journal_insert_checkpoint(struct journal_head *jh, void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transaction) { assert_spin_locked(&journal->j_list_lock); + + journal->j_shrink_transaction = NULL; if (transaction->t_cpnext) { transaction->t_cpnext->t_cpprev = transaction->t_cpprev; transaction->t_cpprev->t_cpnext = transaction->t_cpnext; diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 89fad4c3e13cb..9dce0e9af5ac4 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -1743,6 +1743,91 @@ int jbd2_journal_load(journal_t *journal) return -EIO; }
+/** + * jbd2_journal_shrink_scan() + * + * Scan the checkpointed buffer on the checkpoint list and release the + * journal_head. + */ +static unsigned long jbd2_journal_shrink_scan(struct shrinker *shrink, + struct shrink_control *sc) +{ + journal_t *journal = container_of(shrink, journal_t, j_shrinker); + unsigned long nr_to_scan = sc->nr_to_scan; + unsigned long nr_shrunk; + unsigned long count; + + count = percpu_counter_read_positive(&journal->j_jh_shrink_count); + trace_jbd2_shrink_scan_enter(journal, sc->nr_to_scan, count); + + nr_shrunk = jbd2_journal_shrink_checkpoint_list(journal, &nr_to_scan); + + count = percpu_counter_read_positive(&journal->j_jh_shrink_count); + trace_jbd2_shrink_scan_exit(journal, nr_to_scan, nr_shrunk, count); + + return nr_shrunk; +} + +/** + * jbd2_journal_shrink_count() + * + * Count the number of checkpoint buffers on the checkpoint list. + */ +static unsigned long jbd2_journal_shrink_count(struct shrinker *shrink, + struct shrink_control *sc) +{ + journal_t *journal = container_of(shrink, journal_t, j_shrinker); + unsigned long count; + + count = percpu_counter_read_positive(&journal->j_jh_shrink_count); + trace_jbd2_shrink_count(journal, sc->nr_to_scan, count); + + return count; +} + +/** + * jbd2_journal_register_shrinker() + * @journal: Journal to act on. + * + * Init a percpu counter to record the checkpointed buffers on the checkpoint + * list and register a shrinker to release their journal_head. + */ +int jbd2_journal_register_shrinker(journal_t *journal) +{ + int err; + + journal->j_shrink_transaction = NULL; + + err = percpu_counter_init(&journal->j_jh_shrink_count, 0, GFP_KERNEL); + if (err) + return err; + + journal->j_shrinker.scan_objects = jbd2_journal_shrink_scan; + journal->j_shrinker.count_objects = jbd2_journal_shrink_count; + journal->j_shrinker.seeks = DEFAULT_SEEKS; + journal->j_shrinker.batch = journal->j_max_transaction_buffers; + + err = register_shrinker(&journal->j_shrinker); + if (err) { + percpu_counter_destroy(&journal->j_jh_shrink_count); + return err; + } + + return 0; +} + +/** + * jbd2_journal_unregister_shrinker() + * @journal: Journal to act on. + * + * Unregister the checkpointed buffer shrinker and destroy the percpu counter. + */ +void jbd2_journal_unregister_shrinker(journal_t *journal) +{ + percpu_counter_destroy(&journal->j_jh_shrink_count); + unregister_shrinker(&journal->j_shrinker); +} + /** * void jbd2_journal_destroy() - Release a journal_t structure. * @journal: Journal to act on. @@ -1815,6 +1900,8 @@ int jbd2_journal_destroy(journal_t *journal) brelse(journal->j_sb_buffer); }
+ jbd2_journal_unregister_shrinker(journal); + if (journal->j_proc_entry) jbd2_stats_proc_exit(journal); iput(journal->j_inode); diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 5c0446f22bee1..151d3b58512a1 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -901,6 +901,29 @@ struct journal_s */ struct buffer_head *j_chkpt_bhs[JBD2_NR_BATCH];
+ /** + * @j_shrinker: + * + * Journal head shrinker, reclaim buffer's journal head which + * has been written back. + */ + struct shrinker j_shrinker; + + /** + * @j_jh_shrink_count: + * + * Number of journal buffers on the checkpoint list. [j_list_lock] + */ + struct percpu_counter j_jh_shrink_count; + + /** + * @j_shrink_transaction: + * + * Record next transaction will shrink on the checkpoint list. + * [j_list_lock] + */ + transaction_t *j_shrink_transaction; + /** * @j_head: * @@ -1323,6 +1346,7 @@ extern void jbd2_journal_commit_transaction(journal_t *);
/* Checkpoint list management */ void __jbd2_journal_clean_checkpoint_list(journal_t *journal, bool destroy); +unsigned long jbd2_journal_shrink_checkpoint_list(journal_t *journal, unsigned long *nr_to_scan); int __jbd2_journal_remove_checkpoint(struct journal_head *); void jbd2_journal_destroy_checkpoint(journal_t *journal); void __jbd2_journal_insert_checkpoint(struct journal_head *, transaction_t *); @@ -1434,6 +1458,8 @@ extern int jbd2_journal_set_features (journal_t *, unsigned long, unsigned long, unsigned long); extern void jbd2_journal_clear_features (journal_t *, unsigned long, unsigned long, unsigned long); +extern int jbd2_journal_register_shrinker(journal_t *journal); +extern void jbd2_journal_unregister_shrinker(journal_t *journal); extern int jbd2_journal_load (journal_t *journal); extern int jbd2_journal_destroy (journal_t *); extern int jbd2_journal_recover (journal_t *journal); diff --git a/include/trace/events/jbd2.h b/include/trace/events/jbd2.h index d16a32867f3a6..a4dfe005983d3 100644 --- a/include/trace/events/jbd2.h +++ b/include/trace/events/jbd2.h @@ -394,6 +394,107 @@ TRACE_EVENT(jbd2_lock_buffer_stall, __entry->stall_ms) );
+DECLARE_EVENT_CLASS(jbd2_journal_shrink, + + TP_PROTO(journal_t *journal, unsigned long nr_to_scan, + unsigned long count), + + TP_ARGS(journal, nr_to_scan, count), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned long, nr_to_scan) + __field(unsigned long, count) + ), + + TP_fast_assign( + __entry->dev = journal->j_fs_dev->bd_dev; + __entry->nr_to_scan = nr_to_scan; + __entry->count = count; + ), + + TP_printk("dev %d,%d nr_to_scan %lu count %lu", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->nr_to_scan, __entry->count) +); + +DEFINE_EVENT(jbd2_journal_shrink, jbd2_shrink_count, + + TP_PROTO(journal_t *journal, unsigned long nr_to_scan, unsigned long count), + + TP_ARGS(journal, nr_to_scan, count) +); + +DEFINE_EVENT(jbd2_journal_shrink, jbd2_shrink_scan_enter, + + TP_PROTO(journal_t *journal, unsigned long nr_to_scan, unsigned long count), + + TP_ARGS(journal, nr_to_scan, count) +); + +TRACE_EVENT(jbd2_shrink_scan_exit, + + TP_PROTO(journal_t *journal, unsigned long nr_to_scan, + unsigned long nr_shrunk, unsigned long count), + + TP_ARGS(journal, nr_to_scan, nr_shrunk, count), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned long, nr_to_scan) + __field(unsigned long, nr_shrunk) + __field(unsigned long, count) + ), + + TP_fast_assign( + __entry->dev = journal->j_fs_dev->bd_dev; + __entry->nr_to_scan = nr_to_scan; + __entry->nr_shrunk = nr_shrunk; + __entry->count = count; + ), + + TP_printk("dev %d,%d nr_to_scan %lu nr_shrunk %lu count %lu", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->nr_to_scan, __entry->nr_shrunk, + __entry->count) +); + +TRACE_EVENT(jbd2_shrink_checkpoint_list, + + TP_PROTO(journal_t *journal, tid_t first_tid, tid_t tid, tid_t last_tid, + unsigned long nr_freed, unsigned long nr_scanned, + tid_t next_tid), + + TP_ARGS(journal, first_tid, tid, last_tid, nr_freed, + nr_scanned, next_tid), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(tid_t, first_tid) + __field(tid_t, tid) + __field(tid_t, last_tid) + __field(unsigned long, nr_freed) + __field(unsigned long, nr_scanned) + __field(tid_t, next_tid) + ), + + TP_fast_assign( + __entry->dev = journal->j_fs_dev->bd_dev; + __entry->first_tid = first_tid; + __entry->tid = tid; + __entry->last_tid = last_tid; + __entry->nr_freed = nr_freed; + __entry->nr_scanned = nr_scanned; + __entry->next_tid = next_tid; + ), + + TP_printk("dev %d,%d shrink transaction %u-%u(%u) freed %lu " + "scanned %lu next transaction %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->first_tid, __entry->tid, __entry->last_tid, + __entry->nr_freed, __entry->nr_scanned, __entry->next_tid) +); + #endif /* _TRACE_JBD2_H */
/* This part must be outside protection */
From: Zhang Yi yi.zhang@huawei.com
mainline inclusion from mainline-5.14-rc1 commit dbf2bab7935b65689f3b39178cf87374f0334ead category: bugfix bugzilla: 50788 CVE: NA ---------------------------
Now that __try_to_free_cp_buf() remove checkpointed buffer or transaction when the buffer is not 'busy', which is only called by journal_clean_one_cp_list(). This patch simplify this function by remove __try_to_free_cp_buf() and invoke __cp_buffer_busy() directly.
Signed-off-by: Zhang Yi yi.zhang@huawei.com Reviewed-by: Jan Kara jack@suse.cz Link: https://lore.kernel.org/r/20210610112440.3438139-7-yi.zhang@huawei.com Signed-off-by: Theodore Ts'o tytso@mit.edu Reviewed-by: Yang Erkun yangerkun@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- fs/jbd2/checkpoint.c | 30 ++++-------------------------- 1 file changed, 4 insertions(+), 26 deletions(-)
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 24cf40fd34d14..d0b9200caab7e 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -91,25 +91,6 @@ static inline bool __cp_buffer_busy(struct journal_head *jh) return (jh->b_transaction || buffer_locked(bh) || buffer_dirty(bh)); }
-/* - * Try to release a checkpointed buffer from its transaction. - * Returns 1 if we released it and 2 if we also released the - * whole transaction. - * - * Requires j_list_lock - */ -static int __try_to_free_cp_buf(struct journal_head *jh) -{ - int ret = 0; - struct buffer_head *bh = jh2bh(jh); - - if (!jh->b_transaction && !buffer_locked(bh) && !buffer_dirty(bh)) { - JBUFFER_TRACE(jh, "remove from checkpoint list"); - ret = __jbd2_journal_remove_checkpoint(jh) + 1; - } - return ret; -} - /* * __jbd2_log_wait_for_space: wait until there is space in the journal. * @@ -438,7 +419,6 @@ static int journal_clean_one_cp_list(struct journal_head *jh, bool destroy) { struct journal_head *last_jh; struct journal_head *next_jh = jh; - int ret;
if (!jh) return 0; @@ -447,13 +427,11 @@ static int journal_clean_one_cp_list(struct journal_head *jh, bool destroy) do { jh = next_jh; next_jh = jh->b_cpnext; - if (!destroy) - ret = __try_to_free_cp_buf(jh); - else - ret = __jbd2_journal_remove_checkpoint(jh) + 1; - if (!ret) + + if (!destroy && __cp_buffer_busy(jh)) return 0; - if (ret == 2) + + if (__jbd2_journal_remove_checkpoint(jh)) return 1; /* * This function only frees up some memory
From: Zhang Yi yi.zhang@huawei.com
mainline inclusion from mainline-5.14-rc1 commit 3b672e3aedffc9f092e7e7eae0050a97a8ca508e category: bugfix bugzilla: 50788 CVE: NA ---------------------------
After we introduce a jbd2 shrinker to release checkpointed buffer's journal head, we could free buffer without bdev_try_to_free_page() under memory pressure. So this patch remove the whole bdev_try_to_free_page() callback directly. It also remove many use-after-free issues relate to it together.
Signed-off-by: Zhang Yi yi.zhang@huawei.com Reviewed-by: Jan Kara jack@suse.cz Link: https://lore.kernel.org/r/20210610112440.3438139-8-yi.zhang@huawei.com Signed-off-by: Theodore Ts'o tytso@mit.edu
Conflict: fs/ext4/super.c
Signed-off-by: Zhang Yi yi.zhang@huawei.com Reviewed-by: Yang Erkun yangerkun@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- fs/ext4/super.c | 21 --------------------- 1 file changed, 21 deletions(-)
diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 8ee9ffdd5417e..6f06280e682e9 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1416,26 +1416,6 @@ static int ext4_nfs_commit_metadata(struct inode *inode) return ext4_write_inode(inode, &wbc); }
-/* - * Try to release metadata pages (indirect blocks, directories) which are - * mapped via the block device. Since these pages could have journal heads - * which would prevent try_to_free_buffers() from freeing them, we must use - * jbd2 layer's try_to_free_buffers() function to release them. - */ -static int bdev_try_to_free_page(struct super_block *sb, struct page *page, - gfp_t wait) -{ - journal_t *journal = EXT4_SB(sb)->s_journal; - - WARN_ON(PageChecked(page)); - if (!page_has_buffers(page)) - return 0; - if (journal) - return jbd2_journal_try_to_free_buffers(journal, page, - wait & ~__GFP_DIRECT_RECLAIM); - return try_to_free_buffers(page); -} - #ifdef CONFIG_EXT4_FS_ENCRYPTION static int ext4_get_context(struct inode *inode, void *ctx, size_t len) { @@ -1613,7 +1593,6 @@ static const struct super_operations ext4_sops = { .quota_write = ext4_quota_write, .get_dquots = ext4_get_dquots, #endif - .bdev_try_to_free_page = bdev_try_to_free_page, };
static const struct export_operations ext4_export_ops = {
From: Zhang Yi yi.zhang@huawei.com
mainline inclusion from mainline-5.14-rc1 commit acc6100d3ffa24bdd2add8ea85fb66811bcce5d4 category: bugfix bugzilla: 50788 CVE: NA ---------------------------
After remove the unique user of sop->bdev_try_to_free_page() callback, we could remove the callback and the corresponding blkdev_releasepage() at all.
Signed-off-by: Zhang Yi yi.zhang@huawei.com Reviewed-by: Jan Kara jack@suse.cz Link: https://lore.kernel.org/r/20210610112440.3438139-9-yi.zhang@huawei.com Signed-off-by: Theodore Ts'o tytso@mit.edu Reviewed-by: Yang Erkun yangerkun@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- fs/block_dev.c | 15 --------------- include/linux/fs.h | 1 - 2 files changed, 16 deletions(-)
diff --git a/fs/block_dev.c b/fs/block_dev.c index 06f73a1a1f66b..77dbfc832bb9f 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -2082,20 +2082,6 @@ ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to) } EXPORT_SYMBOL_GPL(blkdev_read_iter);
-/* - * Try to release a page associated with block device when the system - * is under memory pressure. - */ -static int blkdev_releasepage(struct page *page, gfp_t wait) -{ - struct super_block *super = BDEV_I(page->mapping->host)->bdev.bd_super; - - if (super && super->s_op->bdev_try_to_free_page) - return super->s_op->bdev_try_to_free_page(super, page, wait); - - return try_to_free_buffers(page); -} - static int blkdev_writepages(struct address_space *mapping, struct writeback_control *wbc) { @@ -2109,7 +2095,6 @@ static const struct address_space_operations def_blk_aops = { .write_begin = blkdev_write_begin, .write_end = blkdev_write_end, .writepages = blkdev_writepages, - .releasepage = blkdev_releasepage, .direct_IO = blkdev_direct_IO, .is_dirty_writeback = buffer_check_dirty_writeback, }; diff --git a/include/linux/fs.h b/include/linux/fs.h index 394da46d143c2..1a8f8dd19eef6 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1927,7 +1927,6 @@ struct super_operations { ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t); struct dquot **(*get_dquots)(struct inode *); #endif - int (*bdev_try_to_free_page)(struct super_block*, struct page*, gfp_t); long (*nr_cached_objects)(struct super_block *, struct shrink_control *); long (*free_cached_objects)(struct super_block *,
From: Zhang Yi yi.zhang@huawei.com
mainline inclusion from mainline-5.14-rc1 commit 16aa4c9a1fbe763c147a964cdc1f5be8ed98ed13 category: bugfix bugzilla: 50788 CVE: NA ---------------------------
Export jbd2_journal_[un]register_shrinker() to fix this error when ext4 is built as a module:
ERROR: modpost: "jbd2_journal_unregister_shrinker" undefined! ERROR: modpost: "jbd2_journal_register_shrinker" undefined!
Fixes: 4ba3fcdde7e3 ("jbd2,ext4: add a shrinker to release checkpointed buffers") Signed-off-by: Zhang Yi yi.zhang@huawei.com Reviewed-by: Jan Kara jack@suse.cz Link: https://lore.kernel.org/r/20210630083638.140218-1-yi.zhang@huawei.com Signed-off-by: Theodore Ts'o tytso@mit.edu Reviewed-by: Yang Erkun yangerkun@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- fs/jbd2/journal.c | 2 ++ 1 file changed, 2 insertions(+)
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 9dce0e9af5ac4..e96c379f54162 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -1815,6 +1815,7 @@ int jbd2_journal_register_shrinker(journal_t *journal)
return 0; } +EXPORT_SYMBOL(jbd2_journal_register_shrinker);
/** * jbd2_journal_unregister_shrinker() @@ -1827,6 +1828,7 @@ void jbd2_journal_unregister_shrinker(journal_t *journal) percpu_counter_destroy(&journal->j_jh_shrink_count); unregister_shrinker(&journal->j_shrinker); } +EXPORT_SYMBOL(jbd2_journal_unregister_shrinker);
/** * void jbd2_journal_destroy() - Release a journal_t structure.
From: Theodore Ts'o tytso@mit.edu
mainline inclusion from mainline-5.14-rc1 commit 0705e8d1e2207ceeb83dc6e1751b6b82718b353a category: bugfix bugzilla: 50788 CVE: NA ---------------------------
The function jbd2_journal_unregister_shrinker() was getting called twice when the file system was getting unmounted. On Power and ARM platforms this was causing kernel crash when unmounting the file system, when a percpu_counter was destroyed twice.
Fix this by removing jbd2_journal_[un]register_shrinker() functions, and inlining the shrinker setup and teardown into journal_init_common() and jbd2_journal_destroy(). This means that ext4 and ocfs2 now no longer need to know about registering and unregistering jbd2's shrinker.
Also, while we're at it, rename the percpu counter from j_jh_shrink_count to j_checkpoint_jh_count, since this makes it clearer what this counter is intended to track.
Link: https://lore.kernel.org/r/20210705145025.3363130-1-tytso@mit.edu Fixes: 4ba3fcdde7e3 ("jbd2,ext4: add a shrinker to release checkpointed buffers") Reported-by: Jon Hunter jonathanh@nvidia.com Reported-by: Sachin Sant sachinp@linux.vnet.ibm.com Tested-by: Sachin Sant sachinp@linux.vnet.ibm.com Tested-by: Jon Hunter jonathanh@nvidia.com Reviewed-by: Jan Kara jack@suse.cz Signed-off-by: Theodore Ts'o tytso@mit.edu
Conflict: fs/jbd2/journal.c
Signed-off-by: Zhang Yi yi.zhang@huawei.com Reviewed-by: Yang Erkun yangerkun@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- fs/ext4/super.c | 8 --- fs/jbd2/checkpoint.c | 4 +- fs/jbd2/journal.c | 149 +++++++++++++++++-------------------------- include/linux/jbd2.h | 6 +- 4 files changed, 64 insertions(+), 103 deletions(-)
diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 6f06280e682e9..d793e597c0623 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1161,7 +1161,6 @@ static void ext4_put_super(struct super_block *sb) destroy_workqueue(sbi->rsv_conversion_wq);
if (sbi->s_journal) { - jbd2_journal_unregister_shrinker(sbi->s_journal); aborted = is_journal_aborted(sbi->s_journal); err = jbd2_journal_destroy(sbi->s_journal); sbi->s_journal = NULL; @@ -4833,7 +4832,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_ea_block_cache = NULL; } if (sbi->s_journal) { - jbd2_journal_unregister_shrinker(sbi->s_journal); jbd2_journal_destroy(sbi->s_journal); sbi->s_journal = NULL; } @@ -5152,12 +5150,6 @@ static int ext4_load_journal(struct super_block *sb, ext4_commit_super(sb); }
- err = jbd2_journal_register_shrinker(journal); - if (err) { - EXT4_SB(sb)->s_journal = NULL; - goto err_out; - } - return 0;
err_out: diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index d0b9200caab7e..035a1fe46e065 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -699,7 +699,7 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh)
__buffer_unlink(jh); jh->b_cp_transaction = NULL; - percpu_counter_dec(&journal->j_jh_shrink_count); + percpu_counter_dec(&journal->j_checkpoint_jh_count); jbd2_journal_put_journal_head(jh);
/* Is this transaction empty? */ @@ -762,7 +762,7 @@ void __jbd2_journal_insert_checkpoint(struct journal_head *jh, jh->b_cpnext->b_cpprev = jh; } transaction->t_checkpoint_list = jh; - percpu_counter_inc(&transaction->t_journal->j_jh_shrink_count); + percpu_counter_inc(&transaction->t_journal->j_checkpoint_jh_count); }
/* diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index e96c379f54162..7984a918e635e 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -1114,6 +1114,48 @@ static int jbd2_min_tag_size(void) return sizeof(journal_block_tag_t) - 4; }
+/** + * jbd2_journal_shrink_scan() + * + * Scan the checkpointed buffer on the checkpoint list and release the + * journal_head. + */ +static unsigned long jbd2_journal_shrink_scan(struct shrinker *shrink, + struct shrink_control *sc) +{ + journal_t *journal = container_of(shrink, journal_t, j_shrinker); + unsigned long nr_to_scan = sc->nr_to_scan; + unsigned long nr_shrunk; + unsigned long count; + + count = percpu_counter_read_positive(&journal->j_checkpoint_jh_count); + trace_jbd2_shrink_scan_enter(journal, sc->nr_to_scan, count); + + nr_shrunk = jbd2_journal_shrink_checkpoint_list(journal, &nr_to_scan); + + count = percpu_counter_read_positive(&journal->j_checkpoint_jh_count); + trace_jbd2_shrink_scan_exit(journal, nr_to_scan, nr_shrunk, count); + + return nr_shrunk; +} + +/** + * jbd2_journal_shrink_count() + * + * Count the number of checkpoint buffers on the checkpoint list. + */ +static unsigned long jbd2_journal_shrink_count(struct shrinker *shrink, + struct shrink_control *sc) +{ + journal_t *journal = container_of(shrink, journal_t, j_shrinker); + unsigned long count; + + count = percpu_counter_read_positive(&journal->j_checkpoint_jh_count); + trace_jbd2_shrink_count(journal, sc->nr_to_scan, count); + + return count; +} + /* * Management for journal control blocks: functions to create and * destroy journal_t structures, and to initialise and read existing @@ -1190,9 +1232,23 @@ static journal_t *journal_init_common(struct block_device *bdev, journal->j_sb_buffer = bh; journal->j_superblock = (journal_superblock_t *)bh->b_data;
+ journal->j_shrink_transaction = NULL; + journal->j_shrinker.scan_objects = jbd2_journal_shrink_scan; + journal->j_shrinker.count_objects = jbd2_journal_shrink_count; + journal->j_shrinker.seeks = DEFAULT_SEEKS; + journal->j_shrinker.batch = journal->j_max_transaction_buffers; + + if (percpu_counter_init(&journal->j_checkpoint_jh_count, 0, GFP_KERNEL)) + goto err_cleanup; + + if (register_shrinker(&journal->j_shrinker)) { + percpu_counter_destroy(&journal->j_checkpoint_jh_count); + goto err_cleanup; + } return journal;
err_cleanup: + brelse(journal->j_sb_buffer); kfree(journal->j_wbuf); jbd2_journal_destroy_revoke(journal); kfree(journal); @@ -1743,93 +1799,6 @@ int jbd2_journal_load(journal_t *journal) return -EIO; }
-/** - * jbd2_journal_shrink_scan() - * - * Scan the checkpointed buffer on the checkpoint list and release the - * journal_head. - */ -static unsigned long jbd2_journal_shrink_scan(struct shrinker *shrink, - struct shrink_control *sc) -{ - journal_t *journal = container_of(shrink, journal_t, j_shrinker); - unsigned long nr_to_scan = sc->nr_to_scan; - unsigned long nr_shrunk; - unsigned long count; - - count = percpu_counter_read_positive(&journal->j_jh_shrink_count); - trace_jbd2_shrink_scan_enter(journal, sc->nr_to_scan, count); - - nr_shrunk = jbd2_journal_shrink_checkpoint_list(journal, &nr_to_scan); - - count = percpu_counter_read_positive(&journal->j_jh_shrink_count); - trace_jbd2_shrink_scan_exit(journal, nr_to_scan, nr_shrunk, count); - - return nr_shrunk; -} - -/** - * jbd2_journal_shrink_count() - * - * Count the number of checkpoint buffers on the checkpoint list. - */ -static unsigned long jbd2_journal_shrink_count(struct shrinker *shrink, - struct shrink_control *sc) -{ - journal_t *journal = container_of(shrink, journal_t, j_shrinker); - unsigned long count; - - count = percpu_counter_read_positive(&journal->j_jh_shrink_count); - trace_jbd2_shrink_count(journal, sc->nr_to_scan, count); - - return count; -} - -/** - * jbd2_journal_register_shrinker() - * @journal: Journal to act on. - * - * Init a percpu counter to record the checkpointed buffers on the checkpoint - * list and register a shrinker to release their journal_head. - */ -int jbd2_journal_register_shrinker(journal_t *journal) -{ - int err; - - journal->j_shrink_transaction = NULL; - - err = percpu_counter_init(&journal->j_jh_shrink_count, 0, GFP_KERNEL); - if (err) - return err; - - journal->j_shrinker.scan_objects = jbd2_journal_shrink_scan; - journal->j_shrinker.count_objects = jbd2_journal_shrink_count; - journal->j_shrinker.seeks = DEFAULT_SEEKS; - journal->j_shrinker.batch = journal->j_max_transaction_buffers; - - err = register_shrinker(&journal->j_shrinker); - if (err) { - percpu_counter_destroy(&journal->j_jh_shrink_count); - return err; - } - - return 0; -} -EXPORT_SYMBOL(jbd2_journal_register_shrinker); - -/** - * jbd2_journal_unregister_shrinker() - * @journal: Journal to act on. - * - * Unregister the checkpointed buffer shrinker and destroy the percpu counter. - */ -void jbd2_journal_unregister_shrinker(journal_t *journal) -{ - percpu_counter_destroy(&journal->j_jh_shrink_count); - unregister_shrinker(&journal->j_shrinker); -} -EXPORT_SYMBOL(jbd2_journal_unregister_shrinker); - /** * void jbd2_journal_destroy() - Release a journal_t structure. * @journal: Journal to act on. @@ -1902,8 +1871,10 @@ int jbd2_journal_destroy(journal_t *journal) brelse(journal->j_sb_buffer); }
- jbd2_journal_unregister_shrinker(journal); - + if (journal->j_shrinker.flags & SHRINKER_REGISTERED) { + percpu_counter_destroy(&journal->j_checkpoint_jh_count); + unregister_shrinker(&journal->j_shrinker); + } if (journal->j_proc_entry) jbd2_stats_proc_exit(journal); iput(journal->j_inode); diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 151d3b58512a1..5ecab235ed6a6 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -910,11 +910,11 @@ struct journal_s struct shrinker j_shrinker;
/** - * @j_jh_shrink_count: + * @j_checkpoint_jh_count: * * Number of journal buffers on the checkpoint list. [j_list_lock] */ - struct percpu_counter j_jh_shrink_count; + struct percpu_counter j_checkpoint_jh_count;
/** * @j_shrink_transaction: @@ -1458,8 +1458,6 @@ extern int jbd2_journal_set_features (journal_t *, unsigned long, unsigned long, unsigned long); extern void jbd2_journal_clear_features (journal_t *, unsigned long, unsigned long, unsigned long); -extern int jbd2_journal_register_shrinker(journal_t *journal); -extern void jbd2_journal_unregister_shrinker(journal_t *journal); extern int jbd2_journal_load (journal_t *journal); extern int jbd2_journal_destroy (journal_t *); extern int jbd2_journal_recover (journal_t *journal);