From: Ye Bin <yebin10@huawei.com>
mainline inclusion
from mainline-v5.16-rc3
commit 1d0254e6b47e73222fd3d6ae95cccbaafe5b3ecf
category: bugfix
bugzilla: 185746
CVE: NA
-----------------------------------------------
I got the following issue:

[ 567.094140] __io_remove_buffers: [1]start ctx=0xffff8881067bf000 bgid=65533 buf=0xffff8881fefe1680
[ 594.360799] watchdog: BUG: soft lockup - CPU#2 stuck for 26s! [kworker/u32:5:108]
[ 594.364987] Modules linked in:
[ 594.365405] irq event stamp: 604180238
[ 594.365906] hardirqs last enabled at (604180237): [<ffffffff93fec9bd>] _raw_spin_unlock_irqrestore+0x2d/0x50
[ 594.367181] hardirqs last disabled at (604180238): [<ffffffff93fbbadb>] sysvec_apic_timer_interrupt+0xb/0xc0
[ 594.368420] softirqs last enabled at (569080666): [<ffffffff94200654>] __do_softirq+0x654/0xa9e
[ 594.369551] softirqs last disabled at (569080575): [<ffffffff913e1d6a>] irq_exit_rcu+0x1ca/0x250
[ 594.370692] CPU: 2 PID: 108 Comm: kworker/u32:5 Tainted: G L 5.15.0-next-20211112+ #88
[ 594.371891] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS ?-20190727_073836-buildvm-ppc64le-16.ppc.fedoraproject.org-3.fc31 04/01/2014
[ 594.373604] Workqueue: events_unbound io_ring_exit_work
[ 594.374303] RIP: 0010:_raw_spin_unlock_irqrestore+0x33/0x50
[ 594.375037] Code: 48 83 c7 18 53 48 89 f3 48 8b 74 24 10 e8 55 f5 55 fd 48 89 ef e8 ed a7 56 fd 80 e7 02 74 06 e8 43 13 7b fd fb bf 01 00 00 00 <e8> f8 78 474
[ 594.377433] RSP: 0018:ffff888101587a70 EFLAGS: 00000202
[ 594.378120] RAX: 0000000024030f0d RBX: 0000000000000246 RCX: 1ffffffff2f09106
[ 594.379053] RDX: 0000000000000000 RSI: ffffffff9449f0e0 RDI: 0000000000000001
[ 594.379991] RBP: ffffffff9586cdc0 R08: 0000000000000001 R09: fffffbfff2effcab
[ 594.380923] R10: ffffffff977fe557 R11: fffffbfff2effcaa R12: ffff8881b8f3def0
[ 594.381858] R13: 0000000000000246 R14: ffff888153a8b070 R15: 0000000000000000
[ 594.382787] FS:  0000000000000000(0000) GS:ffff888399c00000(0000) knlGS:0000000000000000
[ 594.383851] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 594.384602] CR2: 00007fcbe71d2000 CR3: 00000000b4216000 CR4: 00000000000006e0
[ 594.385540] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 594.386474] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[ 594.387403] Call Trace:
[ 594.387738]  <TASK>
[ 594.388042]  find_and_remove_object+0x118/0x160
[ 594.389321]  delete_object_full+0xc/0x20
[ 594.389852]  kfree+0x193/0x470
[ 594.390275]  __io_remove_buffers.part.0+0xed/0x147
[ 594.390931]  io_ring_ctx_free+0x342/0x6a2
[ 594.392159]  io_ring_exit_work+0x41e/0x486
[ 594.396419]  process_one_work+0x906/0x15a0
[ 594.399185]  worker_thread+0x8b/0xd80
[ 594.400259]  kthread+0x3bf/0x4a0
[ 594.401847]  ret_from_fork+0x22/0x30
[ 594.402343]  </TASK>
Message from syslogd@localhost at Nov 13 09:09:54 ...
 kernel:watchdog: BUG: soft lockup - CPU#2 stuck for 26s! [kworker/u32:5:108]

[ 596.793660] __io_remove_buffers: [2099199]start ctx=0xffff8881067bf000 bgid=65533 buf=0xffff8881fefe1680
We can reproduce this issue with the following syzkaller log:

r0 = syz_io_uring_setup(0x401, &(0x7f0000000300), &(0x7f0000003000/0x2000)=nil, &(0x7f0000ff8000/0x4000)=nil, &(0x7f0000000280)=<r1=>0x0, &(0x7f0000000380)=<r2=>0x0)
sendmsg$ETHTOOL_MSG_FEATURES_SET(0xffffffffffffffff, &(0x7f0000003080)={0x0, 0x0, &(0x7f0000003040)={&(0x7f0000000040)=ANY=[], 0x18}}, 0x0)
syz_io_uring_submit(r1, r2, &(0x7f0000000240)=@IORING_OP_PROVIDE_BUFFERS={0x1f, 0x5, 0x0, 0x401, 0x1, 0x0, 0x100, 0x0, 0x1, {0xfffd}}, 0x0)
io_uring_enter(r0, 0x3a2d, 0x0, 0x0, 0x0, 0x0)
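In plain terms, the reproducer keeps queuing IORING_OP_PROVIDE_BUFFERS requests (0x401 = 1025 buffers each) into one buffer group and then tears the ring down. A rough liburing equivalent is sketched below; it is untested, and the entry counts, buffer sizes, and CQE handling are illustrative assumptions, not part of the original report.

/* Hedged sketch of a userspace reproducer; constants mirror the
 * syzkaller log above, everything else is an assumption. */
#include <liburing.h>
#include <stdio.h>
#include <stdlib.h>

#define NR_BUFS  1025   /* 0x401 buffers per request, as in the log     */
#define BUF_LEN  4096
#define NR_REQS  2048   /* 2048 * 1025 ~= the ~2.1M list nodes reported */

int main(void)
{
	struct io_uring ring;
	struct io_uring_cqe *cqe;
	void *mem;
	int i;

	if (io_uring_queue_init(8, &ring, 0) < 0)
		return 1;
	mem = malloc((size_t)NR_BUFS * BUF_LEN);
	if (!mem)
		return 1;

	for (i = 0; i < NR_REQS; i++) {
		struct io_uring_sqe *sqe = io_uring_get_sqe(&ring);

		/* group id 0xfffd matches the {0xfffd} in the log above */
		io_uring_prep_provide_buffers(sqe, mem, BUF_LEN, NR_BUFS,
					      0xfffd, 1);
		io_uring_submit_and_wait(&ring, 1);
		if (!io_uring_peek_cqe(&ring, &cqe))
			io_uring_cqe_seen(&ring, cqe);
	}

	/* Exit without ever consuming the buffers: ring teardown must
	 * now kfree() every node on buf->list in __io_remove_buffers(). */
	io_uring_queue_exit(&ring);
	return 0;
}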
The reason for the above issue is that 'buf->list' has 2,100,000 nodes; freeing them in one uninterrupted loop monopolizes the CPU and triggers the soft lockup.
To solve this issue, add a schedule point to the while loop in '__io_remove_buffers'.
After adding the schedule point we reran the reproducer and got the following data:

[ 240.141864] __io_remove_buffers: [1]start ctx=0xffff888170603000 bgid=65533 buf=0xffff8881116fcb00
[ 268.408260] __io_remove_buffers: [1]start ctx=0xffff8881b92d2000 bgid=65533 buf=0xffff888130c83180
[ 275.899234] __io_remove_buffers: [2099199]start ctx=0xffff888170603000 bgid=65533 buf=0xffff8881116fcb00
[ 296.741404] __io_remove_buffers: [1]start ctx=0xffff8881b659c000 bgid=65533 buf=0xffff8881010fe380
[ 305.090059] __io_remove_buffers: [2099199]start ctx=0xffff8881b92d2000 bgid=65533 buf=0xffff888130c83180
[ 325.415746] __io_remove_buffers: [1]start ctx=0xffff8881b92d1000 bgid=65533 buf=0xffff8881a17d8f00
[ 333.160318] __io_remove_buffers: [2099199]start ctx=0xffff8881b659c000 bgid=65533 buf=0xffff8881010fe380
...
Fixes: 8bab4c09f24e ("io_uring: allow conditional reschedule for intensive iterators")
Signed-off-by: Ye Bin <yebin10@huawei.com>
Link: https://lore.kernel.org/r/20211122024737.2198530-1-yebin10@huawei.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Conflicts:
	fs/io_uring.c
Signed-off-by: Ye Bin <yebin10@huawei.com>
Reviewed-by: Zhang Yi <yi.zhang@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 fs/io_uring.c | 1 +
 1 file changed, 1 insertion(+)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index d07388600bbed..f3b5f9d670df3 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -3462,6 +3462,7 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx, struct io_buffer *buf,
 			kfree(nxt);
 			if (++i == nbufs)
 				return i;
+			cond_resched();
 		}
 		i++;
 		kfree(buf);
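For reference, the patched loop then reads roughly as follows; this is a sketch reconstructed around the hunk above, with the surrounding lines abbreviated, and the exact context may differ slightly in this backport.

/* Sketch of __io_remove_buffers() after the fix; see the hunk above
 * for the authoritative change. */
static int __io_remove_buffers(struct io_ring_ctx *ctx, struct io_buffer *buf,
			       int bgid, unsigned nbufs)
{
	unsigned i = 0;

	/* ... sanity checks elided ... */

	/* the head kbuf is the list itself */
	while (!list_empty(&buf->list)) {
		struct io_buffer *nxt;

		nxt = list_first_entry(&buf->list, struct io_buffer, list);
		list_del(&nxt->list);
		kfree(nxt);
		if (++i == nbufs)
			return i;
		cond_resched();	/* yield so a ~2.1M-node list cannot pin
				 * the CPU and trip the watchdog */
	}
	i++;
	kfree(buf);
	/* ... buffer-group bookkeeping elided ... */
	return i;
}

cond_resched() only reschedules when the scheduler actually wants the CPU, so the common short-list case pays essentially nothing for this.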
From: Jens Axboe <axboe@kernel.dk>
mainline inclusion
from mainline-v5.11-rc1
commit ac0648a56c1ff66c1cbf735075ad33a26cbc50de
category: bugfix
bugzilla: 185834
CVE: NA
-----------------------------------------------
io_file_data_ref_zero() can be invoked from soft-irq context by the RCU core, hence we need to ensure that the file_data lock is bottom-half safe. Use the _bh() variants when grabbing this lock.
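The deadlock being avoided is the classic process-context vs. bottom-half pattern: if a lock is taken with plain spin_lock() in process context and a softirq that also wants it fires on the same CPU, the softirq spins forever on a lock its own CPU holds. A minimal illustration follows; the names are hypothetical, only the pattern matches the io_uring case.

/* Hypothetical illustration of why _bh() variants are needed. */
#include <linux/spinlock.h>
#include <linux/rcupdate.h>

static DEFINE_SPINLOCK(shared_lock);

/* Softirq side: e.g. an RCU callback, like the percpu_ref release path
 * that ends up in io_file_data_ref_zero(). Bottom halves are already
 * disabled here, so taking the lock plainly is safe in this context. */
static void softirq_side(struct rcu_head *rcu)
{
	spin_lock(&shared_lock);
	/* ... touch shared state ... */
	spin_unlock(&shared_lock);
}

/* Process-context side: plain spin_lock() here can deadlock, because
 * the softirq above may preempt us while we hold the lock. _bh()
 * blocks bottom halves for the critical section, closing that window. */
static void process_side(void)
{
	spin_lock_bh(&shared_lock);
	/* ... touch shared state ... */
	spin_unlock_bh(&shared_lock);
}

Because io_file_data_ref_zero() may itself be reached from more than one context, the patch below converts every acquisition of the lock to the _bh() variants rather than mixing the two forms.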
Reported-by: syzbot+1f4ba1e5520762c523c6@syzkaller.appspotmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Ye Bin <yebin10@huawei.com>
Reviewed-by: Zhang Yi <yi.zhang@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 fs/io_uring.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index f3b5f9d670df3..f933d4f0edb4c 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -6664,9 +6664,9 @@ static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
 	if (!data)
 		return -ENXIO;
 
-	spin_lock(&data->lock);
+	spin_lock_bh(&data->lock);
 	ref_node = data->node;
-	spin_unlock(&data->lock);
+	spin_unlock_bh(&data->lock);
 	if (ref_node)
 		percpu_ref_kill(&ref_node->refs);
 
@@ -6951,7 +6951,7 @@ static void io_file_data_ref_zero(struct percpu_ref *ref)
 	data = ref_node->file_data;
 	ctx = data->ctx;
 
-	spin_lock(&data->lock);
+	spin_lock_bh(&data->lock);
 	ref_node->done = true;
 
 	while (!list_empty(&data->ref_list)) {
@@ -6963,7 +6963,7 @@ static void io_file_data_ref_zero(struct percpu_ref *ref)
 		list_del(&ref_node->node);
 		first_add |= llist_add(&ref_node->llist, &ctx->file_put_llist);
 	}
-	spin_unlock(&data->lock);
+	spin_unlock_bh(&data->lock);
 
 	if (percpu_ref_is_dying(&data->refs))
 		delay = 0;
@@ -7086,9 +7086,9 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
 	}
 
 	file_data->node = ref_node;
-	spin_lock(&file_data->lock);
+	spin_lock_bh(&file_data->lock);
 	list_add_tail(&ref_node->node, &file_data->ref_list);
-	spin_unlock(&file_data->lock);
+	spin_unlock_bh(&file_data->lock);
 	percpu_ref_get(&file_data->refs);
 	return ret;
 out_fput:
@@ -7245,10 +7245,10 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
 
 	if (needs_switch) {
 		percpu_ref_kill(&data->node->refs);
-		spin_lock(&data->lock);
+		spin_lock_bh(&data->lock);
 		list_add_tail(&ref_node->node, &data->ref_list);
 		data->node = ref_node;
-		spin_unlock(&data->lock);
+		spin_unlock_bh(&data->lock);
 		percpu_ref_get(&ctx->file_data->refs);
 	} else
 		destroy_fixed_file_ref_node(ref_node);