From: Jan Kara jack@suse.cz
mainline inclusion from mainline-v6.4-rc2 commit 00d873c17e29cc32d90ca852b82685f1673acaa5 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I9SYGK
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
Ext4 has a filesystem wide lock protecting ext4_writepages() calls to avoid races with switching of journalled data flag or inode format. This lock can however cause a deadlock like:
CPU0 CPU1
ext4_writepages() percpu_down_read(sbi->s_writepages_rwsem); ext4_change_inode_journal_flag() percpu_down_write(sbi->s_writepages_rwsem); - blocks, all readers block from now on ext4_do_writepages() ext4_init_io_end() kmem_cache_zalloc(io_end_cachep, GFP_KERNEL) fs_reclaim frees dentry... dentry_unlink_inode() iput() - last ref => iput_final() - inode dirty => write_inode_now()... ext4_writepages() tries to acquire sbi->s_writepages_rwsem and blocks forever
Make sure we cannot recurse into filesystem reclaim from writeback code to avoid the deadlock.
Reported-by: syzbot+6898da502aef574c5f8a@syzkaller.appspotmail.com Link: https://lore.kernel.org/all/0000000000004c66b405fa108e27@google.com Fixes: c8585c6fcaf2 ("ext4: fix races between changing inode journal mode and ext4_writepages") CC: stable@vger.kernel.org Signed-off-by: Jan Kara jack@suse.cz Link: https://lore.kernel.org/r/20230504124723.20205-1-jack@suse.cz Signed-off-by: Theodore Ts'o tytso@mit.edu
Conflicts: fs/ext4/ext4.h fs/ext4/super.c fs/ext4/inode.c [Because we have merged in b66c23ec7fcf ("ext4: fix race between writepages and remount") and we don't have ext4_do_writepages() yet and i_mmap_sem hasn't switched to invalidate_lock yet.] Signed-off-by: Baokun Li libaokun1@huawei.com --- fs/ext4/ext4.h | 25 +++++++++++++++++++++++++ fs/ext4/inode.c | 18 ++++++++++-------- fs/ext4/migrate.c | 11 ++++++----- fs/ext4/super.c | 12 +++++++----- 4 files changed, 48 insertions(+), 18 deletions(-)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index b701f7ac7f66..a9530484147d 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -37,6 +37,7 @@ #include <linux/falloc.h> #include <linux/percpu-rwsem.h> #include <linux/fiemap.h> +#include <linux/sched/mm.h> #ifdef __KERNEL__ #include <linux/compat.h> #endif @@ -1701,6 +1702,30 @@ static inline struct ext4_inode_info *EXT4_I(struct inode *inode) return container_of(inode, struct ext4_inode_info, vfs_inode); }
+static inline int ext4_writepages_down_read(struct super_block *sb) +{ + percpu_down_read(&EXT4_SB(sb)->s_writepages_rwsem); + return memalloc_nofs_save(); +} + +static inline void ext4_writepages_up_read(struct super_block *sb, int ctx) +{ + memalloc_nofs_restore(ctx); + percpu_up_read(&EXT4_SB(sb)->s_writepages_rwsem); +} + +static inline int ext4_writepages_down_write(struct super_block *sb) +{ + percpu_down_write(&EXT4_SB(sb)->s_writepages_rwsem); + return memalloc_nofs_save(); +} + +static inline void ext4_writepages_up_write(struct super_block *sb, int ctx) +{ + memalloc_nofs_restore(ctx); + percpu_up_write(&EXT4_SB(sb)->s_writepages_rwsem); +} + static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) { return ino == EXT4_ROOT_INO || diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 412af3c5fde4..0f8d23aa2838 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2709,13 +2709,14 @@ static int ext4_writepages(struct address_space *mapping, struct blk_plug plug; bool give_up_on_write = false; unsigned long retry_warn_ddl = 0; + int alloc_ctx;
#define RETRY_WARN_TIMEOUT (30 * HZ)
if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) return -EIO;
- percpu_down_read(&sbi->s_writepages_rwsem); + alloc_ctx = ext4_writepages_down_read(inode->i_sb); trace_ext4_writepages(inode, wbc);
/* @@ -2936,7 +2937,7 @@ static int ext4_writepages(struct address_space *mapping, out_writepages: trace_ext4_writepages_result(inode, wbc, ret, nr_to_write - wbc->nr_to_write); - percpu_up_read(&sbi->s_writepages_rwsem); + ext4_writepages_up_read(inode->i_sb, alloc_ctx); return ret; }
@@ -2947,17 +2948,18 @@ static int ext4_dax_writepages(struct address_space *mapping, long nr_to_write = wbc->nr_to_write; struct inode *inode = mapping->host; struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); + int alloc_ctx;
if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) return -EIO;
- percpu_down_read(&sbi->s_writepages_rwsem); + alloc_ctx = ext4_writepages_down_read(inode->i_sb); trace_ext4_writepages(inode, wbc);
ret = dax_writeback_mapping_range(mapping, sbi->s_daxdev, wbc); trace_ext4_writepages_result(inode, wbc, ret, nr_to_write - wbc->nr_to_write); - percpu_up_read(&sbi->s_writepages_rwsem); + ext4_writepages_up_read(inode->i_sb, alloc_ctx); return ret; }
@@ -6107,7 +6109,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) journal_t *journal; handle_t *handle; int err; - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + int alloc_ctx;
/* * We have to be very careful here: changing a data block's @@ -6145,7 +6147,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) } }
- percpu_down_write(&sbi->s_writepages_rwsem); + alloc_ctx = ext4_writepages_down_write(inode->i_sb); jbd2_journal_lock_updates(journal);
/* @@ -6162,7 +6164,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) err = jbd2_journal_flush(journal); if (err < 0) { jbd2_journal_unlock_updates(journal); - percpu_up_write(&sbi->s_writepages_rwsem); + ext4_writepages_up_write(inode->i_sb, alloc_ctx); return err; } ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA); @@ -6170,7 +6172,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) ext4_set_aops(inode);
jbd2_journal_unlock_updates(journal); - percpu_up_write(&sbi->s_writepages_rwsem); + ext4_writepages_up_write(inode->i_sb, alloc_ctx);
if (val) up_write(&EXT4_I(inode)->i_mmap_sem); diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index b0ea646454ac..50e449ec62bf 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -409,7 +409,6 @@ static int free_ext_block(handle_t *handle, struct inode *inode)
int ext4_ext_migrate(struct inode *inode) { - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); handle_t *handle; int retval = 0, i; __le32 *i_data; @@ -419,6 +418,7 @@ int ext4_ext_migrate(struct inode *inode) unsigned long max_entries; __u32 goal, tmp_csum_seed; uid_t owner[2]; + int alloc_ctx;
/* * If the filesystem does not support extents, or the inode @@ -435,7 +435,7 @@ int ext4_ext_migrate(struct inode *inode) */ return retval;
- percpu_down_write(&sbi->s_writepages_rwsem); + alloc_ctx = ext4_writepages_down_write(inode->i_sb);
/* * Worst case we can touch the allocation bitmaps and a block @@ -587,7 +587,7 @@ int ext4_ext_migrate(struct inode *inode) unlock_new_inode(tmp_inode); iput(tmp_inode); out_unlock: - percpu_up_write(&sbi->s_writepages_rwsem); + ext4_writepages_up_write(inode->i_sb, alloc_ctx); return retval; }
@@ -606,6 +606,7 @@ int ext4_ind_migrate(struct inode *inode) ext4_fsblk_t blk; handle_t *handle; int ret, ret2 = 0; + int alloc_ctx;
if (!ext4_has_feature_extents(inode->i_sb) || (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) @@ -622,7 +623,7 @@ int ext4_ind_migrate(struct inode *inode) if (test_opt(inode->i_sb, DELALLOC)) ext4_alloc_da_blocks(inode);
- percpu_down_write(&sbi->s_writepages_rwsem); + alloc_ctx = ext4_writepages_down_write(inode->i_sb);
handle = ext4_journal_start(inode, EXT4_HT_MIGRATE, 1); if (IS_ERR(handle)) { @@ -666,6 +667,6 @@ int ext4_ind_migrate(struct inode *inode) ext4_journal_stop(handle); up_write(&EXT4_I(inode)->i_data_sem); out_unlock: - percpu_up_write(&sbi->s_writepages_rwsem); + ext4_writepages_up_write(inode->i_sb, alloc_ctx); return ret; } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 90f886184fc6..bfa592bfd298 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -5961,6 +5961,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) ext4_group_t g; unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; int err = 0; + int alloc_ctx; #ifdef CONFIG_QUOTA int enable_quota = 0; int i, j; @@ -6014,13 +6015,13 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) * here s_writepages_rwsem to avoid race between writepages ops and * remount. */ - percpu_down_write(&sbi->s_writepages_rwsem); + alloc_ctx = ext4_writepages_down_write(sb); if (!parse_options(data, sb, NULL, &journal_ioprio, 1)) { err = -EINVAL; - percpu_up_write(&sbi->s_writepages_rwsem); + ext4_writepages_up_write(sb, alloc_ctx); goto restore_opts; } - percpu_up_write(&sbi->s_writepages_rwsem); + ext4_writepages_up_write(sb, alloc_ctx);
if ((old_opts.s_mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) ^ test_opt(sb, JOURNAL_CHECKSUM)) { @@ -6247,7 +6248,8 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) if ((sb->s_flags & SB_RDONLY) && !(old_sb_flags & SB_RDONLY) && sb_any_quota_suspended(sb)) dquot_resume(sb, -1); - percpu_down_write(&sbi->s_writepages_rwsem); + + alloc_ctx = ext4_writepages_down_write(sb); sb->s_flags = old_sb_flags; sbi->s_mount_opt = old_opts.s_mount_opt; sbi->s_mount_opt2 = old_opts.s_mount_opt2; @@ -6256,7 +6258,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) sbi->s_commit_interval = old_opts.s_commit_interval; sbi->s_min_batch_time = old_opts.s_min_batch_time; sbi->s_max_batch_time = old_opts.s_max_batch_time; - percpu_up_write(&sbi->s_writepages_rwsem); + ext4_writepages_up_write(sb, alloc_ctx);
if (!test_opt(sb, BLOCK_VALIDITY) && sbi->s_system_blks) ext4_release_system_zone(sb);
反馈: 您发送到kernel@openeuler.org的补丁/补丁集,已成功转换为PR! PR链接地址: https://gitee.com/openeuler/kernel/pulls/8925 邮件列表地址:https://mailweb.openeuler.org/hyperkitty/list/kernel@openeuler.org/message/W...
FeedBack: The patch(es) which you have sent to kernel@openeuler.org mailing list has been converted to a pull request successfully! Pull request link: https://gitee.com/openeuler/kernel/pulls/8925 Mailing list address: https://mailweb.openeuler.org/hyperkitty/list/kernel@openeuler.org/message/W...