From: Baokun Li <libaokun1@huawei.com>
hulk inclusion
category: bugfix
bugzilla: 187222, https://gitee.com/openeuler/kernel/issues/I5H3KE
CVE: NA
--------------------------------
Hulk Robot reported a BUG:
==================================================================
kernel BUG at fs/ext4/extents_status.c:762!
invalid opcode: 0000 [#1] SMP KASAN PTI
[...]
Call Trace:
 ext4_cache_extents+0x238/0x2f0
 ext4_find_extent+0x785/0xa40
 ext4_fiemap+0x36d/0xe90
 do_vfs_ioctl+0x6af/0x1200
[...]
==================================================================
Above issue may happen as follows:
-------------------------------------
            cpu1                               cpu2
_____________________|_____________________
do_vfs_ioctl
 ext4_ioctl
  ext4_ioctl_setflags
   ext4_ind_migrate
                                do_vfs_ioctl
                                 ioctl_fiemap
                                  ext4_fiemap
                                   ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)
                                   ext4_fill_fiemap_extents
    down_write(&EXT4_I(inode)->i_data_sem);
    ext4_ext_check_inode
    ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS)
    memset(ei->i_data, 0, sizeof(ei->i_data))
    up_write(&EXT4_I(inode)->i_data_sem);
                                    down_read(&EXT4_I(inode)->i_data_sem);
                                    ext4_find_extent
                                     ext4_cache_extents
                                      ext4_es_cache_extent
                                       BUG_ON(end < lblk)
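The BUG_ON that fires is the range check in ext4_es_cache_extent(), which computes an inclusive end block as end = lblk + len - 1. A minimal userspace sketch of that arithmetic (not ext4 code; it assumes a zero-length extent is decoded from the wiped i_data):

```c
#include <stdio.h>
#include <stdint.h>

typedef uint32_t ext4_lblk_t;	/* 32-bit logical block number, as in ext4 */

int main(void)
{
	/* illustrative values: a garbage extent read after the memset */
	ext4_lblk_t lblk = 1, len = 0;

	/* inclusive end block; len == 0 makes it wrap to lblk - 1 */
	ext4_lblk_t end = lblk + len - 1;

	/* the condition checked by BUG_ON(end < lblk) in the trace above */
	printf("end=%u lblk=%u -> BUG_ON(end < lblk) fires: %s\n",
	       end, lblk, end < lblk ? "yes" : "no");
	return 0;
}
```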
We can easily reproduce this problem with the following syzkaller testcase:
```
02:37:07 executing program 3:
r0 = openat(0xffffffffffffff9c, &(0x7f0000000040)='./file0\x00', 0x26e1, 0x0)
ioctl$FS_IOC_FSSETXATTR(r0, 0x40086602, &(0x7f0000000080)={0x17e})
mkdirat(0xffffffffffffff9c, &(0x7f00000000c0)='./file1\x00', 0x1ff)
r1 = openat(0xffffffffffffff9c, &(0x7f0000000100)='./file1\x00', 0x0, 0x0)
ioctl$FS_IOC_FIEMAP(r1, 0xc020660b, &(0x7f0000000180)={0x0, 0x1, 0x0, 0xef3, 0x6, []}) (async, rerun: 32)
ioctl$FS_IOC_FSSETXATTR(r1, 0x40086602, &(0x7f0000000140)={0x17e}) (rerun: 32)
```
To solve this issue, use __generic_block_fiemap() instead of generic_block_fiemap() and take inode_lock_shared() to avoid the race condition.
Reported-by: Hulk Robot <hulkci@huawei.com>
Signed-off-by: Baokun Li <libaokun1@huawei.com>
Reviewed-by: Zhang Yi <yi.zhang@huawei.com>
Signed-off-by: Yongqiang Liu <liuyongqiang13@huawei.com>
---
 fs/ext4/extents.c | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index d08e6d4afba6..8e5ed3e315cd 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -5267,13 +5267,18 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 		return error;
 	}

+	inode_lock_shared(inode);
 	/* fallback to generic here if not in extents fmt */
-	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
-		return generic_block_fiemap(inode, fieinfo, start, len,
+	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
+		error = __generic_block_fiemap(inode, fieinfo, start, len,
 			ext4_get_block);
+		goto out_unlock;
+	}

-	if (fiemap_check_flags(fieinfo, EXT4_FIEMAP_FLAGS))
-		return -EBADR;
+	if (fiemap_check_flags(fieinfo, EXT4_FIEMAP_FLAGS)) {
+		error = -EBADR;
+		goto out_unlock;
+	}

 	if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) {
 		error = ext4_xattr_fiemap(inode, fieinfo);
@@ -5294,6 +5299,8 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 		error = ext4_fill_fiemap_extents(inode, start_blk,
 						 len_blks, fieinfo);
 	}
+out_unlock:
+	inode_unlock_shared(inode);
 	return error;
 }
From: Yang Yang <yang.yang@vivo.com>
mainline inclusion
from mainline-v5.10-rc1
commit 47ce030b7ac5a5259a9a5919f230b52497afc31a
category: bugfix
bugzilla: 186769, https://gitee.com/openeuler/kernel/issues/I5FYJY
CVE: NA
--------------------------------
blk_exit_queue() will free elevator_data, while blk_mq_run_work_fn() may still access it. Move the cancellation of hctx->run_work ahead of blk_exit_queue() to avoid the use-after-free.
Fixes: 1b97871b501f ("blk-mq: move cancel of hctx->run_work into blk_mq_hw_sysfs_release")
Signed-off-by: Yang Yang <yang.yang@vivo.com>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Reviewed-by: Jason Yan <yanaijie@huawei.com>
Signed-off-by: Yongqiang Liu <liuyongqiang13@huawei.com>
---
 block/blk-mq-sysfs.c | 2 --
 block/blk-sysfs.c    | 9 ++++++++-
 2 files changed, 8 insertions(+), 3 deletions(-)
diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
index 014c7d84ff99..4ca4f0b1b619 100644
--- a/block/blk-mq-sysfs.c
+++ b/block/blk-mq-sysfs.c
@@ -35,8 +35,6 @@ static void blk_mq_hw_sysfs_release(struct kobject *kobj)
 	struct blk_mq_hw_ctx *hctx = container_of(kobj, struct blk_mq_hw_ctx,
 						  kobj);

-	cancel_delayed_work_sync(&hctx->run_work);
-
 	if (hctx->flags & BLK_MQ_F_BLOCKING)
 		cleanup_srcu_struct(hctx->srcu);
 	blk_free_flush_queue(hctx->fq);
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 90fab2253367..8ee343b6b0ce 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -837,9 +837,16 @@ static void __blk_release_queue(struct work_struct *work)

 	blk_free_queue_stats(q->stats);

-	if (q->mq_ops)
+	if (q->mq_ops) {
+		struct blk_mq_hw_ctx *hctx;
+		int i;
+
 		cancel_delayed_work_sync(&q->requeue_work);

+		queue_for_each_hw_ctx(q, hctx, i)
+			cancel_delayed_work_sync(&hctx->run_work);
+	}
+
 	blk_exit_rl(q, &q->root_rl);

 	if (q->queue_tags)
From: Ming Lei <ming.lei@redhat.com>
mainline inclusion
from mainline-v5.16-rc2
commit 2a19b28f7929866e1cec92a3619f4de9f2d20005
category: bugfix
bugzilla: 186769, https://gitee.com/openeuler/kernel/issues/I5FYJY
CVE: NA
--------------------------------
To avoid slowing down queue destruction, we don't call blk_mq_quiesce_queue() in blk_cleanup_queue(); instead, canceling the dispatch work is delayed to blk_release_queue().
However, this approach caused a kernel oops [1], reported by Changhui. The log shows that the scsi_device can be freed before blk_release_queue() runs, which is expected, since the scsi_device is released after the scsi disk is closed and the scsi_device is removed.
Fix the issue by canceling the blk-mq dispatch work in both blk_cleanup_queue() and disk_release():
1) when disk_release() is run, the disk has been closed and any sync dispatch activity has finished, so canceling the dispatch work is enough to quiesce filesystem I/O dispatch activity.

2) in blk_cleanup_queue(), we only care about passthrough requests, and a passthrough request is always explicitly allocated and freed by its caller, so once the queue is frozen, all sync dispatch activity for passthrough requests has finished. Then it is enough to just cancel the dispatch work to avoid any further dispatch activity.
[1] kernel panic log
[12622.769416] BUG: kernel NULL pointer dereference, address: 0000000000000300
[12622.777186] #PF: supervisor read access in kernel mode
[12622.782918] #PF: error_code(0x0000) - not-present page
[12622.788649] PGD 0 P4D 0
[12622.791474] Oops: 0000 [#1] PREEMPT SMP PTI
[12622.796138] CPU: 10 PID: 744 Comm: kworker/10:1H Kdump: loaded Not tainted 5.15.0+ #1
[12622.804877] Hardware name: Dell Inc. PowerEdge R730/0H21J3, BIOS 1.5.4 10/002/2015
[12622.813321] Workqueue: kblockd blk_mq_run_work_fn
[12622.818572] RIP: 0010:sbitmap_get+0x75/0x190
[12622.823336] Code: 85 80 00 00 00 41 8b 57 08 85 d2 0f 84 b1 00 00 00 45 31 e4 48 63 cd 48 8d 1c 49 48 c1 e3 06 49 03 5f 10 4c 8d 6b 40 83 f0 01 <48> 8b 33 44 89 f2 4c 89 ef 0f b6 c8 e8 fa f3 ff ff 83 f8 ff 75 58
[12622.844290] RSP: 0018:ffffb00a446dbd40 EFLAGS: 00010202
[12622.850120] RAX: 0000000000000001 RBX: 0000000000000300 RCX: 0000000000000004
[12622.858082] RDX: 0000000000000006 RSI: 0000000000000082 RDI: ffffa0b7a2dfe030
[12622.866042] RBP: 0000000000000004 R08: 0000000000000001 R09: ffffa0b742721334
[12622.874003] R10: 0000000000000008 R11: 0000000000000008 R12: 0000000000000000
[12622.881964] R13: 0000000000000340 R14: 0000000000000000 R15: ffffa0b7a2dfe030
[12622.889926] FS:  0000000000000000(0000) GS:ffffa0baafb40000(0000) knlGS:0000000000000000
[12622.898956] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[12622.905367] CR2: 0000000000000300 CR3: 0000000641210001 CR4: 00000000001706e0
[12622.913328] Call Trace:
[12622.916055]  <TASK>
[12622.918394]  scsi_mq_get_budget+0x1a/0x110
[12622.922969]  __blk_mq_do_dispatch_sched+0x1d4/0x320
[12622.928404]  ? pick_next_task_fair+0x39/0x390
[12622.933268]  __blk_mq_sched_dispatch_requests+0xf4/0x140
[12622.939194]  blk_mq_sched_dispatch_requests+0x30/0x60
[12622.944829]  __blk_mq_run_hw_queue+0x30/0xa0
[12622.949593]  process_one_work+0x1e8/0x3c0
[12622.954059]  worker_thread+0x50/0x3b0
[12622.958144]  ? rescuer_thread+0x370/0x370
[12622.962616]  kthread+0x158/0x180
[12622.966218]  ? set_kthread_struct+0x40/0x40
[12622.970884]  ret_from_fork+0x22/0x30
[12622.974875]  </TASK>
[12622.977309] Modules linked in: scsi_debug rpcsec_gss_krb5 auth_rpcgss nfsv4 dns_resolver nfs lockd grace fscache netfs sunrpc dm_multipath intel_rapl_msr intel_rapl_common dell_wmi_descriptor sb_edac rfkill video x86_pkg_temp_thermal intel_powerclamp dcdbas coretemp kvm_intel kvm mgag200 irqbypass i2c_algo_bit rapl drm_kms_helper ipmi_ssif intel_cstate intel_uncore syscopyarea sysfillrect sysimgblt fb_sys_fops pcspkr cec mei_me lpc_ich mei ipmi_si ipmi_devintf ipmi_msghandler acpi_power_meter drm fuse xfs libcrc32c sr_mod cdrom sd_mod t10_pi sg ixgbe ahci libahci crct10dif_pclmul crc32_pclmul crc32c_intel libata megaraid_sas ghash_clmulni_intel tg3 wdat_wdt mdio dca wmi dm_mirror dm_region_hash dm_log dm_mod [last unloaded: scsi_debug]
Reported-by: ChanghuiZhong <czhong@redhat.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: "Martin K. Petersen" <martin.petersen@oracle.com>
Cc: Bart Van Assche <bvanassche@acm.org>
Cc: linux-scsi@vger.kernel.org
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Link: https://lore.kernel.org/r/20211116014343.610501-1-ming.lei@redhat.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Conflicts:
	./block/blk-mq.c
	./block/blk-mq.h
	./block/blk-sysfs.c
	./block/genhd.c
	./block/blk-core.c
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Reviewed-by: Jason Yan <yanaijie@huawei.com>
Signed-off-by: Yongqiang Liu <liuyongqiang13@huawei.com>
---
 block/blk-core.c  |  4 +++-
 block/blk-mq.c    | 13 +++++++++++++
 block/blk-mq.h    |  2 ++
 block/blk-sysfs.c | 10 ----------
 block/genhd.c     |  3 +++
 5 files changed, 21 insertions(+), 11 deletions(-)
diff --git a/block/blk-core.c b/block/blk-core.c
index bced58322fcc..e98827d25ef8 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -843,8 +843,10 @@ void blk_cleanup_queue(struct request_queue *q)

 	blk_exit_queue(q);

-	if (q->mq_ops)
+	if (q->mq_ops) {
+		blk_mq_cancel_work_sync(q);
 		blk_mq_exit_queue(q);
+	}

 	percpu_ref_exit(&q->q_usage_counter);

 	spin_lock_irq(lock);
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 9604d2b39745..1bf6713fb2cb 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -3502,6 +3502,19 @@ static bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie)
 	return __blk_mq_poll(hctx, rq);
 }

+void blk_mq_cancel_work_sync(struct request_queue *q)
+{
+	if (q->mq_ops) {
+		struct blk_mq_hw_ctx *hctx;
+		int i;
+
+		cancel_delayed_work_sync(&q->requeue_work);
+
+		queue_for_each_hw_ctx(q, hctx, i)
+			cancel_delayed_work_sync(&hctx->run_work);
+	}
+}
+
 static int __init blk_mq_init(void)
 {
 	cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL,
diff --git a/block/blk-mq.h b/block/blk-mq.h
index b3540d62c541..c6ec9aa12fb2 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -98,6 +98,8 @@ extern int blk_mq_sysfs_register(struct request_queue *q);
 extern void blk_mq_sysfs_unregister(struct request_queue *q);
 extern void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx);

+void blk_mq_cancel_work_sync(struct request_queue *q);
+
 void blk_mq_release(struct request_queue *q);

 /**
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 8ee343b6b0ce..6beca7743c11 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -837,16 +837,6 @@ static void __blk_release_queue(struct work_struct *work)

 	blk_free_queue_stats(q->stats);

-	if (q->mq_ops) {
-		struct blk_mq_hw_ctx *hctx;
-		int i;
-
-		cancel_delayed_work_sync(&q->requeue_work);
-
-		queue_for_each_hw_ctx(q, hctx, i)
-			cancel_delayed_work_sync(&hctx->run_work);
-	}
-
 	blk_exit_rl(q, &q->root_rl);

 	if (q->queue_tags)
diff --git a/block/genhd.c b/block/genhd.c
index 124f8d94584c..3c23299976b1 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1274,6 +1274,9 @@ static void disk_release(struct device *dev)
 	struct gendisk *disk = dev_to_disk(dev);

 	blk_free_devt(dev->devt);
+
+	blk_mq_cancel_work_sync(disk->queue);
+
 	disk_release_events(disk);
 	kfree(disk->random);
 	disk_replace_part_tbl(disk, NULL);
From: Yu Kuai <yukuai3@huawei.com>
hulk inclusion
category: bugfix
bugzilla: 186769, https://gitee.com/openeuler/kernel/issues/I5FYJY
CVE: NA
--------------------------------
Our test reported the following crash:
BUG: kernel NULL pointer dereference, address: 0000000000000018
PGD 0 P4D 0
Oops: 0000 [#1] SMP NOPTI
CPU: 6 PID: 265 Comm: kworker/6:1H Kdump: loaded Tainted: G O 5.10.0-60.17.0.h43.eulerosv2r11.x86_64 #1
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.1-0-ga5cab58-20220320_160524-szxrtosci10000 04/01/2014
Workqueue: kblockd blk_mq_run_work_fn
RIP: 0010:blk_mq_delay_run_hw_queues+0xb6/0xe0
RSP: 0018:ffffacc6803d3d88 EFLAGS: 00010246
RAX: 0000000000000006 RBX: ffff99e2c3d25008 RCX: 00000000ffffffff
RDX: 0000000000000000 RSI: 0000000000000003 RDI: ffff99e2c911ae18
RBP: ffffacc6803d3dd8 R08: 0000000000000000 R09: ffff99e2c0901f6c
R10: 0000000000000018 R11: 0000000000000018 R12: ffff99e2c911ae18
R13: 0000000000000000 R14: 0000000000000003 R15: ffff99e2c911ae18
FS:  0000000000000000(0000) GS:ffff99e6bbf00000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000000000018 CR3: 000000007460a006 CR4: 00000000003706e0
Call Trace:
 __blk_mq_do_dispatch_sched+0x2a7/0x2c0
 ? newidle_balance+0x23e/0x2f0
 __blk_mq_sched_dispatch_requests+0x13f/0x190
 blk_mq_sched_dispatch_requests+0x30/0x60
 __blk_mq_run_hw_queue+0x47/0xd0
 process_one_work+0x1b0/0x350
 worker_thread+0x49/0x300
 ? rescuer_thread+0x3a0/0x3a0
 kthread+0xfe/0x140
 ? kthread_park+0x90/0x90
 ret_from_fork+0x22/0x30
Digging into the vmcore shows that the queue has been cleaned up (blk_cleanup_queue() is done) and the tag set has been freed (blk_mq_free_tag_set() is done).
There are two problems here:
1) blk_mq_delay_run_hw_queues() will only be called from __blk_mq_do_dispatch_sched() if e->type->ops.has_work() returns true. This seems impossible because blk_cleanup_queue() is done and there should be no I/O. However, bfq_has_work() can return true even if no I/O is queued: it checks busy queues, and a bfq_queue can stay busy after dispatching all of its requests.
2) 'hctx->run_work' still exists after blk_cleanup_queue(). blk_mq_cancel_work_sync() is called from blk_cleanup_queue() to cancel all the 'run_work', but there is no guarantee that new 'run_work' won't be queued after that (and before blk_mq_exit_queue() is done).
The first problem is not the root cause; this patch just fixes the second problem, by grabbing 'q_usage_counter' before queuing 'hctx->run_work'.
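The guard works like a classic tryget-or-refuse pattern. A userspace analogue (illustration only; percpu_ref is modeled with a plain atomic counter, and the helper names are made up):

```c
#include <stdio.h>
#include <stdbool.h>
#include <stdatomic.h>

static atomic_long count = 1;		/* > 0 means the queue is alive */

static bool ref_tryget(void)
{
	long v = atomic_load(&count);

	while (v > 0)
		if (atomic_compare_exchange_weak(&count, &v, v + 1))
			return true;
	return false;			/* queue is being torn down */
}

static void ref_put(void)
{
	atomic_fetch_sub(&count, 1);
}

static bool queue_run_work(void)
{
	if (!ref_tryget())
		return false;		/* refuse to queue new run_work */
	/* ... kblockd_mod_delayed_work_on() would run here ... */
	ref_put();
	return true;
}

int main(void)
{
	printf("before cleanup: queued=%d\n", queue_run_work());
	atomic_store(&count, 0);	/* models blk_cleanup_queue() killing the ref */
	printf("after cleanup:  queued=%d\n", queue_run_work());
	return 0;
}
```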
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Reviewed-by: Jason Yan <yanaijie@huawei.com>
Signed-off-by: Yongqiang Liu <liuyongqiang13@huawei.com>
---
 block/blk-mq.c | 8 ++++++++
 1 file changed, 8 insertions(+)
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 1bf6713fb2cb..34d4fdb4e717 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1476,8 +1476,16 @@ static void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async,
 		put_cpu();
 	}

+	/*
+	 * No need to queue work if there is no io, and this can avoid race
+	 * with blk_cleanup_queue().
+	 */
+	if (!percpu_ref_tryget(&hctx->queue->q_usage_counter))
+		return;
+
 	kblockd_mod_delayed_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->run_work,
 				    msecs_to_jiffies(msecs));
+	percpu_ref_put(&hctx->queue->q_usage_counter);
 }

 void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
From: Yu Kuai <yukuai3@huawei.com>
mainline inclusion
from mainline-v5.19-rc1
commit ddc25c86b466d2359b57bc7798f167baa1735a44
category: bugfix
bugzilla: 186769, https://gitee.com/openeuler/kernel/issues/I5FYJY
CVE: NA
--------------------------------
bfq_has_work() currently uses busy_queues, which is not accurate: a bfq_queue being busy does not mean it has requests. Since bfqd already has a counter 'queued' that records how many requests are in bfq, use it instead of busy_queues.
Note that bfq_has_work() can be called with 'bfqd->lock' held, thus the lock can't be taken in bfq_has_work() to protect 'bfqd->queued'.
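A toy userspace model of the difference (not bfq code; the struct and values are made up for illustration): a queue can still be counted as busy after its last request has been dispatched, so checking busy_queues overreports pending work, while checking the request counter does not:

```c
#include <stdio.h>
#include <stdbool.h>

/* toy stand-in for struct bfq_data, illustration only */
struct toy_bfqd {
	int busy_queues;	/* queues in service; may outlive their requests */
	int queued;		/* requests actually held by the scheduler */
};

/* old check: "some queue is busy"; new check: "some request is queued" */
static bool has_work_old(const struct toy_bfqd *d) { return d->busy_queues > 0; }
static bool has_work_new(const struct toy_bfqd *d) { return d->queued > 0; }

int main(void)
{
	/* all requests dispatched, but one queue still counts as busy */
	struct toy_bfqd d = { .busy_queues = 1, .queued = 0 };

	printf("old: %d, new: %d\n", has_work_old(&d), has_work_new(&d));
	return 0;
}
```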
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Link: https://lore.kernel.org/r/20220513023507.2625717-3-yukuai3@huawei.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Reviewed-by: Jason Yan <yanaijie@huawei.com>
Signed-off-by: Yongqiang Liu <liuyongqiang13@huawei.com>
---
 block/bfq-iosched.c | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 8f4275d1b11a..2fbec936b7c1 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -1685,7 +1685,11 @@ static void bfq_add_request(struct request *rq)

 	bfq_log_bfqq(bfqd, bfqq, "add_request %d", rq_is_sync(rq));
 	bfqq->queued[rq_is_sync(rq)]++;
-	bfqd->queued++;
+	/*
+	 * Updating of 'bfqd->queued' is protected by 'bfqd->lock', however, it
+	 * may be read without holding the lock in bfq_has_work().
+	 */
+	WRITE_ONCE(bfqd->queued, bfqd->queued + 1);

 	elv_rb_add(&bfqq->sort_list, rq);

@@ -1803,7 +1807,11 @@ static void bfq_remove_request(struct request_queue *q,
 	if (rq->queuelist.prev != &rq->queuelist)
 		list_del_init(&rq->queuelist);
 	bfqq->queued[sync]--;
-	bfqd->queued--;
+	/*
+	 * Updating of 'bfqd->queued' is protected by 'bfqd->lock', however, it
+	 * may be read without holding the lock in bfq_has_work().
+	 */
+	WRITE_ONCE(bfqd->queued, bfqd->queued - 1);
 	elv_rb_del(&bfqq->sort_list, rq);

 	elv_rqhash_del(q, rq);
@@ -3989,11 +3997,11 @@ static bool bfq_has_work(struct blk_mq_hw_ctx *hctx)
 	struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;

 	/*
-	 * Avoiding lock: a race on bfqd->busy_queues should cause at
+	 * Avoiding lock: a race on bfqd->queued should cause at
 	 * most a call to dispatch for nothing
 	 */
 	return !list_empty_careful(&bfqd->dispatch) ||
-		bfqd->busy_queues > 0;
+		READ_ONCE(bfqd->queued);
 }

 static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
From: Yu Kuai <yukuai3@huawei.com>
hulk inclusion
category: bugfix
bugzilla: 186769, https://gitee.com/openeuler/kernel/issues/I5FYJY
CVE: NA
--------------------------------
Our test reported the following crash:
run fstests generic/349 at 2022-05-20 20:55:10
sd 3:0:0:0: Power-on or device reset occurred
BUG: kernel NULL pointer dereference, address: 0000000000000030
Call Trace:
 disk_release+0x42/0x170
 device_release+0x92/0x120
 kobject_put+0x183/0x350
 put_disk+0x23/0x30
 sg_device_destroy+0x77/0xd0
 sg_remove_device+0x1b8/0x220
 device_del+0x19b/0x610
 ? kfree_const+0x3e/0x50
 ? kobject_put+0x1d1/0x350
 device_unregister+0x36/0xa0
 __scsi_remove_device+0x1ba/0x240
 scsi_forget_host+0x95/0xd0
 scsi_remove_host+0xba/0x1f0
 sdebug_driver_remove+0x30/0x110 [scsi_debug]
 device_release_driver_internal+0x1ab/0x340
 device_release_driver+0x16/0x20
 bus_remove_device+0x167/0x220
 device_del+0x23e/0x610
 device_unregister+0x36/0xa0
 sdebug_do_remove_host+0x159/0x190 [scsi_debug]
 scsi_debug_exit+0x2d/0x120 [scsi_debug]
 __se_sys_delete_module+0x34c/0x420
 ? exit_to_user_mode_prepare+0x93/0x210
 __x64_sys_delete_module+0x1a/0x30
 do_syscall_64+0x4d/0x70
 entry_SYSCALL_64_after_hwframe+0x44/0xa9
This crash has happened since commit 2a19b28f7929 ("blk-mq: cancel blk-mq dispatch work in both blk_cleanup_queue and disk_release()") was backported from mainline.
commit 61a35cfc2633 ("block: hold a request_queue reference for the lifetime of struct gendisk") is not backported, thus we can't ensure the request_queue still exists when disk_release() runs, which is why blk_mq_cancel_work_sync() triggers the problem in disk_release(). However, backporting it would require too many prerequisite patches and would break kabi.
Since we didn't backport the related patches that tear down filesystem I/O in del_gendisk() (which fix issues introduced by the refactoring that moved the bdi from the request_queue to the disk), there is no need to call blk_mq_cancel_work_sync() from disk_release() at all. This patch simply removes blk_mq_cancel_work_sync() from disk_release() to fix the above crash.
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Reviewed-by: Jason Yan <yanaijie@huawei.com>
Signed-off-by: Yongqiang Liu <liuyongqiang13@huawei.com>
---
 block/genhd.c | 3 ---
 1 file changed, 3 deletions(-)
diff --git a/block/genhd.c b/block/genhd.c
index 3c23299976b1..124f8d94584c 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1274,9 +1274,6 @@ static void disk_release(struct device *dev)
 	struct gendisk *disk = dev_to_disk(dev);

 	blk_free_devt(dev->devt);
-
-	blk_mq_cancel_work_sync(disk->queue);
-
 	disk_release_events(disk);
 	kfree(disk->random);
 	disk_replace_part_tbl(disk, NULL);
From: Yu Kuai <yukuai3@huawei.com>
hulk inclusion
category: bugfix
bugzilla: 186731, https://gitee.com/src-openeuler/kernel/issues/I5HWTF
CVE: NA
--------------------------------
If a new configuration is submitted while a bio is throttled, the new waiting time is recalculated regardless of how long the bio has already waited:
tg_conf_updated
 throtl_start_new_slice
 tg_update_disptime
 throtl_schedule_next_dispatch
An I/O hang can therefore be triggered by repeatedly submitting a new configuration before the throttled bio is dispatched.
Fix the problem by respecting the time that the throttled bio has already waited. To do that, instead of starting a new slice in tg_conf_updated(), just update 'bytes_disp' and 'io_disp' based on the new configuration.
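As a worked example of the rescaling (a minimal userspace sketch of the proportional update implemented below; the numbers are illustrative): a group that has already dispatched 5 MiB under a 10 MiB/s limit keeps 512 KiB of "already dispatched" budget when the limit drops to 1 MiB/s, instead of being charged a full fresh slice:

```c
#include <stdio.h>
#include <stdint.h>

/* proportional carry-over of the dispatched budget, with overflow guard */
static uint64_t update_bytes_disp(uint64_t dispatched, uint64_t new_limit,
				  uint64_t old_limit)
{
	if (new_limit == old_limit || !dispatched)
		return dispatched;
	if (UINT64_MAX / dispatched < new_limit)
		return 0;	/* would overflow; let bios dispatch a bit early */
	return dispatched * new_limit / old_limit;
}

int main(void)
{
	uint64_t disp = 5ULL << 20;		/* 5 MiB already dispatched */
	uint64_t old_limit = 10ULL << 20;	/* old limit: 10 MiB/s */
	uint64_t new_limit = 1ULL << 20;	/* new limit: 1 MiB/s */

	/* prints 524288 (512 KiB): the time already waited is not thrown away */
	printf("rescaled bytes_disp = %llu\n",
	       (unsigned long long)update_bytes_disp(disp, new_limit, old_limit));
	return 0;
}
```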
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Reviewed-by: Jason Yan <yanaijie@huawei.com>
Signed-off-by: Yongqiang Liu <liuyongqiang13@huawei.com>
---
 block/blk-throttle.c | 80 ++++++++++++++++++++++++++++++++++++++------
 1 file changed, 67 insertions(+), 13 deletions(-)
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 95d28cf8ff0f..1c5ac48a9de8 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -1394,7 +1394,57 @@ static int tg_print_conf_uint(struct seq_file *sf, void *v)
 	return 0;
 }

-static void tg_conf_updated(struct throtl_grp *tg, bool global)
+static u64 throtl_update_bytes_disp(u64 dispatched, u64 new_limit,
+				    u64 old_limit)
+{
+	if (new_limit == old_limit)
+		return dispatched;
+
+	if (!dispatched)
+		return 0;
+
+	/*
+	 * In the case that multiply will overflow, just return 0. It will only
+	 * let bios to be dispatched earlier.
+	 */
+	if (div64_u64(U64_MAX, dispatched) < new_limit)
+		return 0;
+
+	dispatched *= new_limit;
+	return div64_u64(dispatched, old_limit);
+}
+
+static u32 throtl_update_io_disp(u32 dispatched, u32 new_limit, u32 old_limit)
+{
+	if (new_limit == old_limit)
+		return dispatched;
+
+	if (!dispatched)
+		return 0;
+	/*
+	 * In the case that multiply will overflow, just return 0. It will only
+	 * let bios to be dispatched earlier.
+	 */
+	if (UINT_MAX / dispatched < new_limit)
+		return 0;
+
+	dispatched *= new_limit;
+	return dispatched / old_limit;
+}
+
+static void throtl_update_slice(struct throtl_grp *tg, u64 *old_limits)
+{
+	tg->bytes_disp[READ] = throtl_update_bytes_disp(tg->bytes_disp[READ],
+			tg_bps_limit(tg, READ), old_limits[0]);
+	tg->bytes_disp[WRITE] = throtl_update_bytes_disp(tg->bytes_disp[WRITE],
+			tg_bps_limit(tg, WRITE), old_limits[1]);
+	tg->io_disp[READ] = throtl_update_io_disp(tg->io_disp[READ],
+			tg_iops_limit(tg, READ), (u32)old_limits[2]);
+	tg->io_disp[WRITE] = throtl_update_io_disp(tg->io_disp[WRITE],
+			tg_iops_limit(tg, WRITE), (u32)old_limits[3]);
+}
+
+static void tg_conf_updated(struct throtl_grp *tg, u64 *old_limits, bool global)
 {
 	struct throtl_service_queue *sq = &tg->service_queue;
 	struct cgroup_subsys_state *pos_css;
@@ -1433,16 +1483,7 @@ static void tg_conf_updated(struct throtl_grp *tg, bool global)
 			parent_tg->latency_target);
 	}

-	/*
-	 * We're already holding queue_lock and know @tg is valid. Let's
-	 * apply the new config directly.
-	 *
-	 * Restart the slices for both READ and WRITES. It might happen
-	 * that a group's limit are dropped suddenly and we don't want to
-	 * account recently dispatched IO with new low rate.
-	 */
-	throtl_start_new_slice(tg, 0);
-	throtl_start_new_slice(tg, 1);
+	throtl_update_slice(tg, old_limits);

 	if (tg->flags & THROTL_TG_PENDING) {
 		tg_update_disptime(tg);
@@ -1475,6 +1516,14 @@ static inline int throtl_restart_syscall_when_busy(int errno)
 	return ret;
 }

+static void tg_get_limits(struct throtl_grp *tg, u64 *limits)
+{
+	limits[0] = tg_bps_limit(tg, READ);
+	limits[1] = tg_bps_limit(tg, WRITE);
+	limits[2] = tg_iops_limit(tg, READ);
+	limits[3] = tg_iops_limit(tg, WRITE);
+}
+
 static ssize_t tg_set_conf(struct kernfs_open_file *of,
 			   char *buf, size_t nbytes, loff_t off, bool is_u64)
 {
@@ -1483,6 +1532,7 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of,
 	struct throtl_grp *tg;
 	int ret;
 	u64 v;
+	u64 old_limits[4];

 	ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx);
 	if (ret)
@@ -1499,12 +1549,14 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of,
 		v = U64_MAX;

 	tg = blkg_to_tg(ctx.blkg);
+	tg_get_limits(tg, old_limits);
+
 	if (is_u64)
 		*(u64 *)((void *)tg + of_cft(of)->private) = v;
 	else
 		*(unsigned int *)((void *)tg + of_cft(of)->private) = v;

-	tg_conf_updated(tg, false);
+	tg_conf_updated(tg, old_limits, false);
 	ret = 0;
 out_finish:
 	blkg_conf_finish(&ctx);
@@ -1650,6 +1702,7 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of,
 	struct blkg_conf_ctx ctx;
 	struct throtl_grp *tg;
 	u64 v[4];
+	u64 old_limits[4];
 	unsigned long idle_time;
 	unsigned long latency_time;
 	int ret;
@@ -1668,6 +1721,7 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of,
 	v[1] = tg->bps_conf[WRITE][index];
 	v[2] = tg->iops_conf[READ][index];
 	v[3] = tg->iops_conf[WRITE][index];
+	tg_get_limits(tg, old_limits);

 	idle_time = tg->idletime_threshold_conf;
 	latency_time = tg->latency_target_conf;
@@ -1754,7 +1808,7 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of,
 		tg->td->limit_index = LIMIT_LOW;
 	} else
 		tg->td->limit_index = LIMIT_MAX;
-	tg_conf_updated(tg, index == LIMIT_LOW &&
+	tg_conf_updated(tg, old_limits, index == LIMIT_LOW &&
 		tg->td->limit_valid[LIMIT_LOW]);
 	ret = 0;
 out_finish:
From: Yu Kuai <yukuai3@huawei.com>
mainline inclusion
from mainline-v5.19-rc1
commit 2895f1831e911ca87d4efdf43e35eb72a0c7e66e
category: bugfix
bugzilla: 187081, https://gitee.com/src-openeuler/kernel/issues/I5H341
CVE: NA
--------------------------------
Otherwise I/O will hang, because a request is only completed if the cmd has the 'NBD_CMD_INFLIGHT' flag set.
Fixes: 07175cb1baf4 ("nbd: make sure request completion won't concurrent")
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Link: https://lore.kernel.org/r/20220521073749.3146892-4-yukuai3@huawei.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Conflict: fake timeout is not supported yet; call clear_bit() in
nbd_handle_reply() directly.
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Reviewed-by: Jason Yan <yanaijie@huawei.com>
Signed-off-by: Yongqiang Liu <liuyongqiang13@huawei.com>
---
 drivers/block/nbd.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index c0a4a8c123b0..6834ba0e7e2c 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -375,13 +375,14 @@ static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req,
 	if (!mutex_trylock(&cmd->lock))
 		return BLK_EH_RESET_TIMER;

-	if (!__test_and_clear_bit(NBD_CMD_INFLIGHT, &cmd->flags)) {
+	if (!test_bit(NBD_CMD_INFLIGHT, &cmd->flags)) {
 		mutex_unlock(&cmd->lock);
 		return BLK_EH_DONE;
 	}

 	if (!refcount_inc_not_zero(&nbd->config_refs)) {
 		cmd->status = BLK_STS_TIMEOUT;
+		__clear_bit(NBD_CMD_INFLIGHT, &cmd->flags);
 		mutex_unlock(&cmd->lock);
 		goto done;
 	}
@@ -422,6 +423,7 @@ static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req,
 	}
 	set_bit(NBD_RT_TIMEDOUT, &config->runtime_flags);
 	cmd->status = BLK_STS_IOERR;
+	__clear_bit(NBD_CMD_INFLIGHT, &cmd->flags);
 	mutex_unlock(&cmd->lock);
 	sock_shutdown(nbd);
 	nbd_config_put(nbd);
@@ -693,7 +695,7 @@ static struct nbd_cmd *nbd_handle_reply(struct nbd_device *nbd, int index,
 	cmd = blk_mq_rq_to_pdu(req);

 	mutex_lock(&cmd->lock);
-	if (!__test_and_clear_bit(NBD_CMD_INFLIGHT, &cmd->flags)) {
+	if (!test_bit(NBD_CMD_INFLIGHT, &cmd->flags)) {
 		dev_err(disk_to_dev(nbd->disk), "Suspicious reply %d (status %u flags %lu)",
 			tag, cmd->status, cmd->flags);
 		ret = -ENOENT;
@@ -761,6 +763,8 @@ static struct nbd_cmd *nbd_handle_reply(struct nbd_device *nbd, int index,
 		}
 	}
 out:
+	if (!ret)
+		__clear_bit(NBD_CMD_INFLIGHT, &cmd->flags);
 	mutex_unlock(&cmd->lock);
 	return ret ? ERR_PTR(ret) : cmd;
 }