This patch set fix CVE-2024-50225.
Naohiro Aota (1): btrfs: fix error propagation of split bios
Qu Wenruo (1): btrfs: merge btrfs_orig_bbio_end_io() into btrfs_bio_end_io()
fs/btrfs/bio.c | 62 ++++++++++++++++++-------------------------------- fs/btrfs/bio.h | 3 +++ 2 files changed, 25 insertions(+), 40 deletions(-)
From: Qu Wenruo wqu@suse.com
mainline inclusion from mainline-v6.10-rc2 commit 9ca0e58cb752b09816f56f7a3147a39773d5e831 category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/IB37AG CVE: CVE-2024-50225
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
There are only two differences between the two functions:
- btrfs_orig_bbio_end_io() does extra error propagation This is mostly to allow tolerance for write errors.
- btrfs_orig_bbio_end_io() does extra pending_ios check This check can handle both the original bio, or the cloned one. (All accounting happens in the original one).
This makes btrfs_orig_bbio_end_io() a much safer call. In fact we already had a double freeing error due to usage of btrfs_bio_end_io() in the error path of btrfs_submit_chunk().
So just move the whole content of btrfs_orig_bbio_end_io() into btrfs_bio_end_io().
For normal paths this brings no change, because they are already calling btrfs_orig_bbio_end_io() in the first place.
For error paths (not only inside bio.c but also external callers), this change will introduce extra checks, especially for external callers, as they will error out without submitting the btrfs bio.
But considering it's already in the error path, such slower but much safer checks are still an overall win.
Signed-off-by: Qu Wenruo wqu@suse.com Reviewed-by: David Sterba dsterba@suse.com Signed-off-by: David Sterba dsterba@suse.com Conflicts: fs/btrfs/bio.c [conflict due to not merge 02c372e1f016 ("btrfs: add support for inserting raid stripe extents")] Signed-off-by: Long Li leo.lilong@huawei.com --- fs/btrfs/bio.c | 29 +++++++++++------------------ 1 file changed, 11 insertions(+), 18 deletions(-)
diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index 650972895652..597a512dfa04 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -121,12 +121,6 @@ static void __btrfs_bio_end_io(struct btrfs_bio *bbio) } }
-void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status) -{ - bbio->bio.bi_status = status; - __btrfs_bio_end_io(bbio); -} - static void btrfs_orig_write_end_io(struct bio *bio);
static void btrfs_bbio_propagate_error(struct btrfs_bio *bbio, @@ -148,8 +142,9 @@ static void btrfs_bbio_propagate_error(struct btrfs_bio *bbio, } }
-static void btrfs_orig_bbio_end_io(struct btrfs_bio *bbio) +void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status) { + bbio->bio.bi_status = status; if (bbio->bio.bi_pool == &btrfs_clone_bioset) { struct btrfs_bio *orig_bbio = bbio->private;
@@ -180,7 +175,7 @@ static int prev_repair_mirror(struct btrfs_failed_bio *fbio, int cur_mirror) static void btrfs_repair_done(struct btrfs_failed_bio *fbio) { if (atomic_dec_and_test(&fbio->repair_count)) { - btrfs_orig_bbio_end_io(fbio->bbio); + btrfs_bio_end_io(fbio->bbio, fbio->bbio->bio.bi_status); mempool_free(fbio, &btrfs_failed_bio_pool); } } @@ -321,7 +316,7 @@ static void btrfs_check_read_bio(struct btrfs_bio *bbio, struct btrfs_device *de if (fbio) btrfs_repair_done(fbio); else - btrfs_orig_bbio_end_io(bbio); + btrfs_bio_end_io(bbio, bbio->bio.bi_status); }
static void btrfs_log_dev_io_error(struct bio *bio, struct btrfs_device *dev) @@ -355,7 +350,7 @@ static void btrfs_end_bio_work(struct work_struct *work) if (is_data_bbio(bbio)) btrfs_check_read_bio(bbio, bbio->bio.bi_private); else - btrfs_orig_bbio_end_io(bbio); + btrfs_bio_end_io(bbio, bbio->bio.bi_status); }
static void btrfs_simple_end_io(struct bio *bio) @@ -375,7 +370,7 @@ static void btrfs_simple_end_io(struct bio *bio) } else { if (bio_op(bio) == REQ_OP_ZONE_APPEND && !bio->bi_status) btrfs_record_physical_zoned(bbio); - btrfs_orig_bbio_end_io(bbio); + btrfs_bio_end_io(bbio, bbio->bio.bi_status); } }
@@ -389,7 +384,7 @@ static void btrfs_raid56_end_io(struct bio *bio) if (bio_op(bio) == REQ_OP_READ && is_data_bbio(bbio)) btrfs_check_read_bio(bbio, NULL); else - btrfs_orig_bbio_end_io(bbio); + btrfs_bio_end_io(bbio, bbio->bio.bi_status);
btrfs_put_bioc(bioc); } @@ -416,7 +411,7 @@ static void btrfs_orig_write_end_io(struct bio *bio) else bio->bi_status = BLK_STS_OK;
- btrfs_orig_bbio_end_io(bbio); + btrfs_bio_end_io(bbio, bbio->bio.bi_status); btrfs_put_bioc(bioc); }
@@ -577,7 +572,7 @@ static void run_one_async_done(struct btrfs_work *work)
/* If an error occurred we just want to clean up the bio and move on. */ if (bio->bi_status) { - btrfs_orig_bbio_end_io(async->bbio); + btrfs_bio_end_io(async->bbio, async->bbio->bio.bi_status); return; }
@@ -729,11 +724,9 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) ASSERT(bbio->bio.bi_pool == &btrfs_clone_bioset); ASSERT(remaining);
- remaining->bio.bi_status = ret; - btrfs_orig_bbio_end_io(remaining); + btrfs_bio_end_io(remaining, ret); } - bbio->bio.bi_status = ret; - btrfs_orig_bbio_end_io(bbio); + btrfs_bio_end_io(bbio, ret); /* Do not submit another chunk */ return true; }
From: Naohiro Aota naohiro.aota@wdc.com
mainline inclusion from mainline-v6.10-rc2 commit d48e1dea3931de64c26717adc2b89743c7ab6594 category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/IB37AG CVE: CVE-2024-50225
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
The purpose of btrfs_bbio_propagate_error() shall be propagating an error of split bio to its original btrfs_bio, and tell the error to the upper layer. However, it's not working well on some cases.
* Case 1. Immediate (or quick) end_bio with an error
When btrfs sends btrfs_bio to mirrored devices, btrfs calls btrfs_bio_end_io() when all the mirroring bios are completed. If that btrfs_bio was split, it is from btrfs_clone_bioset and its end_io function is btrfs_orig_write_end_io. For this case, btrfs_bbio_propagate_error() accesses the orig_bbio's bio context to increase the error count.
That works well in most cases. However, if the end_io is called enough fast, orig_bbio's (remaining part after split) bio context may not be properly set at that time. Since the bio context is set when the orig_bbio (the last btrfs_bio) is sent to devices, that might be too late for earlier split btrfs_bio's completion. That will result in NULL pointer dereference.
That bug is easily reproducible by running btrfs/146 on zoned devices [1] and it shows the following trace.
[1] You need raid-stripe-tree feature as it create "-d raid0 -m raid1" FS.
BUG: kernel NULL pointer dereference, address: 0000000000000020 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 0 P4D 0 Oops: Oops: 0000 [#1] PREEMPT SMP PTI CPU: 1 UID: 0 PID: 13 Comm: kworker/u32:1 Not tainted 6.11.0-rc7-BTRFS-ZNS+ #474 Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 Workqueue: writeback wb_workfn (flush-btrfs-5) RIP: 0010:btrfs_bio_end_io+0xae/0xc0 [btrfs] BTRFS error (device dm-0): bdev /dev/mapper/error-test errs: wr 2, rd 0, flush 0, corrupt 0, gen 0 RSP: 0018:ffffc9000006f248 EFLAGS: 00010246 RAX: 0000000000000000 RBX: ffff888005a7f080 RCX: ffffc9000006f1dc RDX: 0000000000000000 RSI: 000000000000000a RDI: ffff888005a7f080 RBP: ffff888011dfc540 R08: 0000000000000000 R09: 0000000000000001 R10: ffffffff82e508e0 R11: 0000000000000005 R12: ffff88800ddfbe58 R13: ffff888005a7f080 R14: ffff888005a7f158 R15: ffff888005a7f158 FS: 0000000000000000(0000) GS:ffff88803ea80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000020 CR3: 0000000002e22006 CR4: 0000000000370ef0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: <TASK> ? __die_body.cold+0x19/0x26 ? page_fault_oops+0x13e/0x2b0 ? _printk+0x58/0x73 ? do_user_addr_fault+0x5f/0x750 ? exc_page_fault+0x76/0x240 ? asm_exc_page_fault+0x22/0x30 ? btrfs_bio_end_io+0xae/0xc0 [btrfs] ? btrfs_log_dev_io_error+0x7f/0x90 [btrfs] btrfs_orig_write_end_io+0x51/0x90 [btrfs] dm_submit_bio+0x5c2/0xa50 [dm_mod] ? find_held_lock+0x2b/0x80 ? blk_try_enter_queue+0x90/0x1e0 __submit_bio+0xe0/0x130 ? ktime_get+0x10a/0x160 ? lockdep_hardirqs_on+0x74/0x100 submit_bio_noacct_nocheck+0x199/0x410 btrfs_submit_bio+0x7d/0x150 [btrfs] btrfs_submit_chunk+0x1a1/0x6d0 [btrfs] ? lockdep_hardirqs_on+0x74/0x100 ? __folio_start_writeback+0x10/0x2c0 btrfs_submit_bbio+0x1c/0x40 [btrfs] submit_one_bio+0x44/0x60 [btrfs] submit_extent_folio+0x13f/0x330 [btrfs] ? btrfs_set_range_writeback+0xa3/0xd0 [btrfs] extent_writepage_io+0x18b/0x360 [btrfs] extent_write_locked_range+0x17c/0x340 [btrfs] ? __pfx_end_bbio_data_write+0x10/0x10 [btrfs] run_delalloc_cow+0x71/0xd0 [btrfs] btrfs_run_delalloc_range+0x176/0x500 [btrfs] ? find_lock_delalloc_range+0x119/0x260 [btrfs] writepage_delalloc+0x2ab/0x480 [btrfs] extent_write_cache_pages+0x236/0x7d0 [btrfs] btrfs_writepages+0x72/0x130 [btrfs] do_writepages+0xd4/0x240 ? find_held_lock+0x2b/0x80 ? wbc_attach_and_unlock_inode+0x12c/0x290 ? wbc_attach_and_unlock_inode+0x12c/0x290 __writeback_single_inode+0x5c/0x4c0 ? do_raw_spin_unlock+0x49/0xb0 writeback_sb_inodes+0x22c/0x560 __writeback_inodes_wb+0x4c/0xe0 wb_writeback+0x1d6/0x3f0 wb_workfn+0x334/0x520 process_one_work+0x1ee/0x570 ? lock_is_held_type+0xc6/0x130 worker_thread+0x1d1/0x3b0 ? __pfx_worker_thread+0x10/0x10 kthread+0xee/0x120 ? __pfx_kthread+0x10/0x10 ret_from_fork+0x30/0x50 ? __pfx_kthread+0x10/0x10 ret_from_fork_asm+0x1a/0x30 </TASK> Modules linked in: dm_mod btrfs blake2b_generic xor raid6_pq rapl CR2: 0000000000000020
* Case 2. Earlier completion of orig_bbio for mirrored btrfs_bios
btrfs_bbio_propagate_error() assumes the end_io function for orig_bbio is called last among split bios. In that case, btrfs_orig_write_end_io() sets the bio->bi_status to BLK_STS_IOERR by seeing the bioc->error [2]. Otherwise, the increased orig_bio's bioc->error is not checked by anyone and return BLK_STS_OK to the upper layer.
[2] Actually, this is not true. Because we only increases orig_bioc->errors by max_errors, the condition "atomic_read(&bioc->error) > bioc->max_errors" is still not met if only one split btrfs_bio fails.
* Case 3. Later completion of orig_bbio for un-mirrored btrfs_bios
In contrast to the above case, btrfs_bbio_propagate_error() is not working well if un-mirrored orig_bbio is completed last. It sets orig_bbio->bio.bi_status to the btrfs_bio's error. But, that is easily over-written by orig_bbio's completion status. If the status is BLK_STS_OK, the upper layer would not know the failure.
* Solution
Considering the above cases, we can only save the error status in the orig_bbio (remaining part after split) itself as it is always available. Also, the saved error status should be propagated when all the split btrfs_bios are finished (i.e, bbio->pending_ios == 0).
This commit introduces "status" to btrfs_bbio and saves the first error of split bios to original btrfs_bio's "status" variable. When all the split bios are finished, the saved status is loaded into original btrfs_bio's status.
With this commit, btrfs/146 on zoned devices does not hit the NULL pointer dereference anymore.
Fixes: 852eee62d31a ("btrfs: allow btrfs_submit_bio to split bios") CC: stable@vger.kernel.org # 6.6+ Reviewed-by: Qu Wenruo wqu@suse.com Reviewed-by: Christoph Hellwig hch@lst.de Reviewed-by: Johannes Thumshirn johannes.thumshirn@wdc.com Signed-off-by: Naohiro Aota naohiro.aota@wdc.com Signed-off-by: David Sterba dsterba@suse.com Signed-off-by: Long Li leo.lilong@huawei.com --- fs/btrfs/bio.c | 37 +++++++++++++------------------------ fs/btrfs/bio.h | 3 +++ 2 files changed, 16 insertions(+), 24 deletions(-)
diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index 597a512dfa04..5fbede35b100 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -50,6 +50,7 @@ void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_fs_info *fs_info, bbio->end_io = end_io; bbio->private = private; atomic_set(&bbio->pending_ios, 1); + WRITE_ONCE(bbio->status, BLK_STS_OK); }
/* @@ -121,41 +122,29 @@ static void __btrfs_bio_end_io(struct btrfs_bio *bbio) } }
-static void btrfs_orig_write_end_io(struct bio *bio); - -static void btrfs_bbio_propagate_error(struct btrfs_bio *bbio, - struct btrfs_bio *orig_bbio) -{ - /* - * For writes we tolerate nr_mirrors - 1 write failures, so we can't - * just blindly propagate a write failure here. Instead increment the - * error count in the original I/O context so that it is guaranteed to - * be larger than the error tolerance. - */ - if (bbio->bio.bi_end_io == &btrfs_orig_write_end_io) { - struct btrfs_io_stripe *orig_stripe = orig_bbio->bio.bi_private; - struct btrfs_io_context *orig_bioc = orig_stripe->bioc; - - atomic_add(orig_bioc->max_errors, &orig_bioc->error); - } else { - orig_bbio->bio.bi_status = bbio->bio.bi_status; - } -} - void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status) { bbio->bio.bi_status = status; if (bbio->bio.bi_pool == &btrfs_clone_bioset) { struct btrfs_bio *orig_bbio = bbio->private;
- if (bbio->bio.bi_status) - btrfs_bbio_propagate_error(bbio, orig_bbio); btrfs_cleanup_bio(bbio); bbio = orig_bbio; }
- if (atomic_dec_and_test(&bbio->pending_ios)) + /* + * At this point, bbio always points to the original btrfs_bio. Save + * the first error in it. + */ + if (status != BLK_STS_OK) + cmpxchg(&bbio->status, BLK_STS_OK, status); + + if (atomic_dec_and_test(&bbio->pending_ios)) { + /* Load split bio's error which might be set above. */ + if (status == BLK_STS_OK) + bbio->bio.bi_status = READ_ONCE(bbio->status); __btrfs_bio_end_io(bbio); + } }
static int next_repair_mirror(struct btrfs_failed_bio *fbio, int cur_mirror) diff --git a/fs/btrfs/bio.h b/fs/btrfs/bio.h index ca79decee060..264dda60a1cb 100644 --- a/fs/btrfs/bio.h +++ b/fs/btrfs/bio.h @@ -77,6 +77,9 @@ struct btrfs_bio { /* File system that this I/O operates on. */ struct btrfs_fs_info *fs_info;
+ /* Save the first error status of split bio. */ + blk_status_t status; + /* * This member must come last, bio_alloc_bioset will allocate enough * bytes for entire btrfs_bio but relies on bio being last.
反馈: 您发送到kernel@openeuler.org的补丁/补丁集,已成功转换为PR! PR链接地址: https://gitee.com/openeuler/kernel/pulls/13307 邮件列表地址:https://mailweb.openeuler.org/hyperkitty/list/kernel@openeuler.org/message/D...
FeedBack: The patch(es) which you have sent to kernel@openeuler.org mailing list has been converted to a pull request successfully! Pull request link: https://gitee.com/openeuler/kernel/pulls/13307 Mailing list address: https://mailweb.openeuler.org/hyperkitty/list/kernel@openeuler.org/message/D...