From: Jan Kara jack@suse.cz
mainline inclusion from mainline-v6.4-rc2 commit 00d873c17e29cc32d90ca852b82685f1673acaa5 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I9SYGK CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
Ext4 has a filesystem wide lock protecting ext4_writepages() calls to avoid races with switching of journalled data flag or inode format. This lock can however cause a deadlock like:
CPU0 CPU1
ext4_writepages() percpu_down_read(sbi->s_writepages_rwsem); ext4_change_inode_journal_flag() percpu_down_write(sbi->s_writepages_rwsem); - blocks, all readers block from now on ext4_do_writepages() ext4_init_io_end() kmem_cache_zalloc(io_end_cachep, GFP_KERNEL) fs_reclaim frees dentry... dentry_unlink_inode() iput() - last ref => iput_final() - inode dirty => write_inode_now()... ext4_writepages() tries to acquire sbi->s_writepages_rwsem and blocks forever
Make sure we cannot recurse into filesystem reclaim from writeback code to avoid the deadlock.
Reported-by: syzbot+6898da502aef574c5f8a@syzkaller.appspotmail.com Link: https://lore.kernel.org/all/0000000000004c66b405fa108e27@google.com Fixes: c8585c6fcaf2 ("ext4: fix races between changing inode journal mode and ext4_writepages") CC: stable@vger.kernel.org Signed-off-by: Jan Kara jack@suse.cz Link: https://lore.kernel.org/r/20230504124723.20205-1-jack@suse.cz Signed-off-by: Theodore Ts'o tytso@mit.edu
Conflicts: fs/ext4/ext4.h fs/ext4/super.c fs/ext4/inode.c fs/ext4/migrate.c [Because we have merged in 41d10c74ff04 ("ext4: fix race between writepages and remount") and we don't have ext4_do_writepages() yet and i_mmap_sem hasn't switched to invalidate_lock yet.] Signed-off-by: Baokun Li libaokun1@huawei.com --- V1->V2: Correct fs/ext4/migrate.c
fs/ext4/ext4.h | 25 +++++++++++++++++++++++++ fs/ext4/inode.c | 18 ++++++++++-------- fs/ext4/migrate.c | 11 ++++++----- fs/ext4/super.c | 11 ++++++----- 4 files changed, 47 insertions(+), 18 deletions(-)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 6df919b154b4..1794f5ce1b46 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -37,6 +37,7 @@ #include <crypto/hash.h> #include <linux/falloc.h> #include <linux/percpu-rwsem.h> +#include <linux/sched/mm.h> #ifdef __KERNEL__ #include <linux/compat.h> #endif @@ -1571,6 +1572,30 @@ static inline struct ext4_inode_info *EXT4_I(struct inode *inode) return container_of(inode, struct ext4_inode_info, vfs_inode); }
+static inline int ext4_writepages_down_read(struct super_block *sb) +{ + percpu_down_read(&EXT4_SB(sb)->s_writepages_rwsem); + return memalloc_nofs_save(); +} + +static inline void ext4_writepages_up_read(struct super_block *sb, int ctx) +{ + memalloc_nofs_restore(ctx); + percpu_up_read(&EXT4_SB(sb)->s_writepages_rwsem); +} + +static inline int ext4_writepages_down_write(struct super_block *sb) +{ + percpu_down_write(&EXT4_SB(sb)->s_writepages_rwsem); + return memalloc_nofs_save(); +} + +static inline void ext4_writepages_up_write(struct super_block *sb, int ctx) +{ + memalloc_nofs_restore(ctx); + percpu_up_write(&EXT4_SB(sb)->s_writepages_rwsem); +} + static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) { return ino == EXT4_ROOT_INO || diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 35033bb43149..7110fda36c14 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2744,13 +2744,14 @@ static int ext4_writepages(struct address_space *mapping, struct blk_plug plug; bool give_up_on_write = false; unsigned long retry_warn_ddl = 0; + int alloc_ctx;
#define RETRY_WARN_TIMEOUT (30 * HZ)
if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) return -EIO;
- percpu_down_read(&sbi->s_writepages_rwsem); + alloc_ctx = ext4_writepages_down_read(inode->i_sb); trace_ext4_writepages(inode, wbc);
/* @@ -2981,7 +2982,7 @@ static int ext4_writepages(struct address_space *mapping, out_writepages: trace_ext4_writepages_result(inode, wbc, ret, nr_to_write - wbc->nr_to_write); - percpu_up_read(&sbi->s_writepages_rwsem); + ext4_writepages_up_read(inode->i_sb, alloc_ctx); return ret; }
@@ -2992,17 +2993,18 @@ static int ext4_dax_writepages(struct address_space *mapping, long nr_to_write = wbc->nr_to_write; struct inode *inode = mapping->host; struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); + int alloc_ctx;
if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) return -EIO;
- percpu_down_read(&sbi->s_writepages_rwsem); + alloc_ctx = ext4_writepages_down_read(inode->i_sb); trace_ext4_writepages(inode, wbc);
ret = dax_writeback_mapping_range(mapping, inode->i_sb->s_bdev, wbc); trace_ext4_writepages_result(inode, wbc, ret, nr_to_write - wbc->nr_to_write); - percpu_up_read(&sbi->s_writepages_rwsem); + ext4_writepages_up_read(inode->i_sb, alloc_ctx); return ret; }
@@ -6245,7 +6247,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) journal_t *journal; handle_t *handle; int err; - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + int alloc_ctx;
/* * We have to be very careful here: changing a data block's @@ -6285,7 +6287,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) } }
- percpu_down_write(&sbi->s_writepages_rwsem); + alloc_ctx = ext4_writepages_down_write(inode->i_sb); jbd2_journal_lock_updates(journal);
/* @@ -6302,7 +6304,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) err = jbd2_journal_flush(journal); if (err < 0) { jbd2_journal_unlock_updates(journal); - percpu_up_write(&sbi->s_writepages_rwsem); + ext4_writepages_up_write(inode->i_sb, alloc_ctx); ext4_inode_resume_unlocked_dio(inode); return err; } @@ -6311,7 +6313,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) ext4_set_aops(inode);
jbd2_journal_unlock_updates(journal); - percpu_up_write(&sbi->s_writepages_rwsem); + ext4_writepages_up_write(inode->i_sb, alloc_ctx);
if (val) up_write(&EXT4_I(inode)->i_mmap_sem); diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index ed9e7816efbb..443a09087dac 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -407,7 +407,6 @@ static int free_ext_block(handle_t *handle, struct inode *inode)
int ext4_ext_migrate(struct inode *inode) { - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); handle_t *handle; int retval = 0, i; __le32 *i_data; @@ -417,6 +416,7 @@ int ext4_ext_migrate(struct inode *inode) unsigned long max_entries; __u32 goal, tmp_csum_seed; uid_t owner[2]; + int alloc_ctx;
/* * If the filesystem does not support extents, or the inode @@ -432,7 +432,7 @@ int ext4_ext_migrate(struct inode *inode) */ return retval;
- percpu_down_write(&sbi->s_writepages_rwsem); + alloc_ctx = ext4_writepages_down_write(inode->i_sb);
/* * Worst case we can touch the allocation bitmaps and a block @@ -584,7 +584,7 @@ int ext4_ext_migrate(struct inode *inode) unlock_new_inode(tmp_inode); iput(tmp_inode); out_unlock: - percpu_up_write(&sbi->s_writepages_rwsem); + ext4_writepages_up_write(inode->i_sb, alloc_ctx); return retval; }
@@ -603,6 +603,7 @@ int ext4_ind_migrate(struct inode *inode) ext4_fsblk_t blk; handle_t *handle; int ret; + int alloc_ctx;
if (!ext4_has_feature_extents(inode->i_sb) || (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) @@ -619,7 +620,7 @@ int ext4_ind_migrate(struct inode *inode) if (test_opt(inode->i_sb, DELALLOC)) ext4_alloc_da_blocks(inode);
- percpu_down_write(&sbi->s_writepages_rwsem); + alloc_ctx = ext4_writepages_down_write(inode->i_sb);
handle = ext4_journal_start(inode, EXT4_HT_MIGRATE, 1); if (IS_ERR(handle)) { @@ -661,6 +662,6 @@ int ext4_ind_migrate(struct inode *inode) ext4_journal_stop(handle); up_write(&EXT4_I(inode)->i_data_sem); out_unlock: - percpu_up_write(&sbi->s_writepages_rwsem); + ext4_writepages_up_write(inode->i_sb, alloc_ctx); return ret; } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 1cfec3ac7c9e..8761fe5037a6 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -5559,6 +5559,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) ext4_group_t g; unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; int err = 0; + int alloc_ctx; #ifdef CONFIG_QUOTA int i, j; char *to_free[EXT4_MAXQUOTAS]; @@ -5611,13 +5612,13 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) * s_writepages_rwsem to avoid race between writepages ops and * remount. */ - percpu_down_write(&sbi->s_writepages_rwsem); + alloc_ctx = ext4_writepages_down_write(sb); if (!parse_options(data, sb, NULL, &journal_ioprio, 1)) { err = -EINVAL; - percpu_up_write(&sbi->s_writepages_rwsem); + ext4_writepages_up_write(sb, alloc_ctx); goto restore_opts; } - percpu_up_write(&sbi->s_writepages_rwsem); + ext4_writepages_up_write(sb, alloc_ctx);
if ((old_opts.s_mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) ^ test_opt(sb, JOURNAL_CHECKSUM)) { @@ -5850,7 +5851,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) sb_any_quota_suspended(sb)) dquot_resume(sb, -1);
- percpu_down_write(&sbi->s_writepages_rwsem); + alloc_ctx = ext4_writepages_down_write(sb); sb->s_flags = old_sb_flags; sbi->s_mount_opt = old_opts.s_mount_opt; sbi->s_mount_opt2 = old_opts.s_mount_opt2; @@ -5859,7 +5860,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) sbi->s_commit_interval = old_opts.s_commit_interval; sbi->s_min_batch_time = old_opts.s_min_batch_time; sbi->s_max_batch_time = old_opts.s_max_batch_time; - percpu_up_write(&sbi->s_writepages_rwsem); + ext4_writepages_up_write(sb, alloc_ctx);
if (!test_opt(sb, BLOCK_VALIDITY) && sbi->system_blks) ext4_release_system_zone(sb);