From: Yu Kuai <yukuai3@huawei.com>

hulk inclusion
category: bugfix
bugzilla: 34280, https://gitee.com/openeuler/kernel/issues/I4AKY4
CVE: NA
-----------------------------------------------
This reverts commit a139227441d1a01bfce70e591dd6b17e126561fa.
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Reviewed-by: Hou Tao <houtao1@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 drivers/block/nbd.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index a82a5dc845c6f..6a72c07ce3cba 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -763,8 +763,7 @@ static void recv_work(struct work_struct *work) kfree(args); }
-static void nbd_clear_req(struct blk_mq_hw_ctx *hctx, - struct request *req, void *data, bool reserved) +static void nbd_clear_req(struct request *req, void *data, bool reserved) { struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req);
@@ -778,7 +777,7 @@ static void nbd_clear_req(struct blk_mq_hw_ctx *hctx, static void nbd_clear_que(struct nbd_device *nbd) { blk_mq_quiesce_queue(nbd->disk->queue); - blk_mq_queue_tag_inflight_iter(nbd->disk->queue, nbd_clear_req, NULL); + blk_mq_tagset_busy_iter(&nbd->tag_set, nbd_clear_req, NULL); blk_mq_unquiesce_queue(nbd->disk->queue); dev_dbg(disk_to_dev(nbd->disk), "queue cleared\n"); }
From: Yu Kuai <yukuai3@huawei.com>

hulk inclusion
category: bugfix
bugzilla: 34280, https://gitee.com/openeuler/kernel/issues/I4AKY4
CVE: NA
-----------------------------------------------
This reverts commit cb2e0ca05823681c8565565bb309a782f40a691c.
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Reviewed-by: Hou Tao <houtao1@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 block/blk-mq-debugfs.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index e098b79cab47e..f0865b6ea1e19 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -427,8 +427,7 @@ struct show_busy_params { * Note: the state of a request may change while this function is in progress, * e.g. due to a concurrent blk_mq_finish_request() call. */ -static void hctx_show_busy_rq(struct blk_mq_hw_ctx *hctx, - struct request *rq, void *data, bool reserved) +static void hctx_show_busy_rq(struct request *rq, void *data, bool reserved) { const struct show_busy_params *params = data;
@@ -443,7 +442,7 @@ static int hctx_busy_show(void *data, struct seq_file *m) struct blk_mq_hw_ctx *hctx = data; struct show_busy_params params = { .m = m, .hctx = hctx };
- blk_mq_queue_tag_inflight_iter(hctx->queue, hctx_show_busy_rq, + blk_mq_tagset_busy_iter(hctx->queue->tag_set, hctx_show_busy_rq, &params);
return 0;
From: Yu Kuai <yukuai3@huawei.com>

hulk inclusion
category: bugfix
bugzilla: 34280, https://gitee.com/openeuler/kernel/issues/I4AKY4
CVE: NA
-----------------------------------------------
This reverts commit 15c87cc96c186fc0927d2e0dac0887643293c73c.
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Reviewed-by: Hou Tao <houtao1@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 block/blk-mq-tag.c | 77 ++++++++----------------------------
 block/blk-mq.c | 6 ++--
 include/linux/blk-mq.h | 3 +-
 3 files changed, 19 insertions(+), 67 deletions(-)
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index ce7f7188625eb..c8997e5c30f8a 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -217,51 +217,37 @@ struct bt_iter_data { busy_iter_fn *fn; void *data; bool reserved; - bool inflight; };
static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data) { struct bt_iter_data *iter_data = data; struct blk_mq_hw_ctx *hctx = iter_data->hctx; + struct blk_mq_tags *tags = hctx->tags; bool reserved = iter_data->reserved; - struct blk_mq_tags *tags; struct request *rq;
- tags = hctx->sched_tags ? hctx->sched_tags : hctx->tags; - if (!reserved) bitnr += tags->nr_reserved_tags; + rq = tags->rqs[bitnr];
/* - * Because tags->rqs[] will not been cleaned when free driver tag - * and there is a window between get driver tag and write tags->rqs[], - * so we may see stale rq in tags->rqs[] which may have been freed. - * Using static_rqs[] is safer. - */ - rq = tags->static_rqs[bitnr]; - - /* - * There is a small window between get tag and blk_mq_rq_ctx_init, - * so rq->q and rq->mq_hctx maybe different. + * We can hit rq == NULL here, because the tagging functions + * test and set the bit before assining ->rqs[]. */ - if (rq && rq->q == hctx->queue && - (!iter_data->inflight || - blk_mq_rq_state(rq) == MQ_RQ_IN_FLIGHT)) + if (rq && rq->q == hctx->queue) iter_data->fn(hctx, rq, iter_data->data, reserved); return true; }
-static void bt_for_each(struct blk_mq_hw_ctx *hctx, - struct sbitmap_queue *bt, busy_iter_fn *fn, - void *data, bool reserved, bool inflight) +static void bt_for_each(struct blk_mq_hw_ctx *hctx, struct sbitmap_queue *bt, + busy_iter_fn *fn, void *data, bool reserved) { struct bt_iter_data iter_data = { .hctx = hctx, .fn = fn, .data = data, .reserved = reserved, - .inflight = inflight, };
sbitmap_for_each_set(&bt->sb, bt_iter, &iter_data); @@ -359,23 +345,22 @@ void blk_mq_tagset_wait_completed_request(struct blk_mq_tag_set *tagset) } EXPORT_SYMBOL(blk_mq_tagset_wait_completed_request);
-static void __blk_mq_queue_tag_busy_iter(struct request_queue *q, - busy_iter_fn *fn, void *priv, bool inflight) +void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn, + void *priv) { struct blk_mq_hw_ctx *hctx; int i;
/* - * Get a reference of the queue unless it has been zero. We use this - * to avoid the race with the code that would modify the hctxs after - * freeze and drain the queue, including updating nr_hw_queues, io - * scheduler switching and queue clean up. + * __blk_mq_update_nr_hw_queues will update the nr_hw_queues and + * queue_hw_ctx after freeze the queue, so we use q_usage_counter + * to avoid race with it. */ if (!percpu_ref_tryget(&q->q_usage_counter)) return;
queue_for_each_hw_ctx(q, hctx, i) { - struct blk_mq_tags *tags; + struct blk_mq_tags *tags = hctx->tags;
/* * If not software queues are currently mapped to this @@ -384,45 +369,13 @@ static void __blk_mq_queue_tag_busy_iter(struct request_queue *q, if (!blk_mq_hw_queue_mapped(hctx)) continue;
- tags = hctx->sched_tags ? hctx->sched_tags : hctx->tags; - if (tags->nr_reserved_tags) - bt_for_each(hctx, &tags->breserved_tags, - fn, priv, true, inflight); - bt_for_each(hctx, &tags->bitmap_tags, - fn, priv, false, inflight); - /* - * flush_rq represents the rq with REQ_PREFLUSH and REQ_FUA - * (if FUA is not supported by device) to be issued to - * device. So we need to consider it when iterate inflight - * rqs, but needn't to count it when iterate busy tags. - */ - if (inflight && - blk_mq_rq_state(hctx->fq->flush_rq) == MQ_RQ_IN_FLIGHT) - fn(hctx, hctx->fq->flush_rq, priv, false); + bt_for_each(hctx, &tags->breserved_tags, fn, priv, true); + bt_for_each(hctx, &tags->bitmap_tags, fn, priv, false); } blk_queue_exit(q); }
-/* - * Iterate all the busy tags including pending and in-flight ones. - */ -void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn, - void *priv) -{ - __blk_mq_queue_tag_busy_iter(q, fn, priv, false); -} - -/* - * Iterate all the inflight tags. - */ -void blk_mq_queue_tag_inflight_iter(struct request_queue *q, - busy_iter_fn *fn, void *priv) -{ - __blk_mq_queue_tag_busy_iter(q, fn, priv, true); -} -EXPORT_SYMBOL(blk_mq_queue_tag_inflight_iter); - static int bt_alloc(struct sbitmap_queue *bt, unsigned int depth, bool round_robin, int node) { diff --git a/block/blk-mq.c b/block/blk-mq.c index b37721774895b..b37d0f6156f9d 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -112,7 +112,7 @@ void blk_mq_in_flight(struct request_queue *q, struct hd_struct *part, struct mq_inflight mi = { .part = part, .inflight = inflight, };
inflight[0] = inflight[1] = 0; - blk_mq_queue_tag_inflight_iter(q, blk_mq_check_inflight, &mi); + blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi); }
static void blk_mq_check_inflight_rw(struct blk_mq_hw_ctx *hctx, @@ -131,7 +131,7 @@ void blk_mq_in_flight_rw(struct request_queue *q, struct hd_struct *part, struct mq_inflight mi = { .part = part, .inflight = inflight, };
inflight[0] = inflight[1] = 0; - blk_mq_queue_tag_inflight_iter(q, blk_mq_check_inflight_rw, &mi); + blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight_rw, &mi); }
void blk_freeze_queue_start(struct request_queue *q) @@ -917,7 +917,7 @@ static void blk_mq_timeout_work(struct work_struct *work) if (!percpu_ref_tryget(&q->q_usage_counter)) return;
- blk_mq_queue_tag_inflight_iter(q, blk_mq_check_expired, &next); + blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &next);
if (next != 0) { mod_timer(&q->timeout, next); diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index f81bbcdbce0f8..3695a43eb556a 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -325,8 +325,7 @@ void blk_freeze_queue_start(struct request_queue *q); void blk_mq_freeze_queue_wait(struct request_queue *q); int blk_mq_freeze_queue_wait_timeout(struct request_queue *q, unsigned long timeout); -void blk_mq_queue_tag_inflight_iter(struct request_queue *q, busy_iter_fn *fn, - void *priv); + int blk_mq_map_queues(struct blk_mq_tag_set *set); void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues);
From: Ming Lei <ming.lei@redhat.com>

mainline inclusion
from mainline-v5.13
commit 2e315dc07df009c3e29d6926871f62a30cfae394
category: bugfix
bugzilla: 34280, https://gitee.com/openeuler/kernel/issues/I4AKY4
CVE: NA
-----------------------------------------------
Grab rq->refcount before calling ->fn in blk_mq_tagset_busy_iter(); this prevents the request from being re-used while ->fn is running. The approach is the same as the one used when handling timeouts.

Fix request use-after-free (UAF) related to completion races or queue release:

- If a request is referenced before rq->q is frozen, the queue won't finish freezing before the reference is dropped during iteration.

- If a request is referenced after rq->q is frozen, refcount_inc_not_zero() will return false and the request is not iterated over.

However, one request UAF is still not covered: refcount_inc_not_zero() may read an already freed request; this is handled in the next patch.
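A minimal sketch of the resulting bt_iter() body (reconstructed from the diff below for illustration; not an additional hunk):

	rq = blk_mq_find_and_get_req(tags, bitnr); /* refcount_inc_not_zero() */
	if (!rq)
		return true;                       /* freed or not assigned yet */

	if (rq->q == hctx->queue)
		iter_data->fn(hctx, rq, iter_data->data, reserved);
	blk_mq_put_rq_ref(rq, hctx);               /* may free the request */
	return true;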
Tested-by: John Garry <john.garry@huawei.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Link: https://lore.kernel.org/r/20210511152236.763464-3-ming.lei@redhat.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>

Conflicts:
- commit ea4f995ee8b8 ("blk-mq: cache request hardware queue mapping") is not backported, so rq->mq_hctx doesn't exist; use blk_mq_map_queue() instead.
- commit 602380d28e28 ("blk-mq: add blk_mq_all_tag_iter") is not backported, so bt_tags_iter_data->flags doesn't exist; use 'reserved'.
- commit 22f614bc0f37 ("blk-mq: fix blk_mq_all_tag_iter") is not backported, so static_rqs is not used in bt_tags_iter().

Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Reviewed-by: Hou Tao <houtao1@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 block/blk-mq-tag.c | 25 ++++++++++++++++++++++---
 block/blk-mq.c | 13 +++++++++----
 block/blk-mq.h | 1 +
 3 files changed, 32 insertions(+), 7 deletions(-)
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index c8997e5c30f8a..dcf65c9b767ba 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -219,6 +219,16 @@ struct bt_iter_data { bool reserved; };
+static struct request *blk_mq_find_and_get_req(struct blk_mq_tags *tags, + unsigned int bitnr) +{ + struct request *rq = tags->rqs[bitnr]; + + if (!rq || !refcount_inc_not_zero(&rq->ref)) + return NULL; + return rq; +} + static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data) { struct bt_iter_data *iter_data = data; @@ -235,8 +245,13 @@ static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data) * We can hit rq == NULL here, because the tagging functions * test and set the bit before assining ->rqs[]. */ - if (rq && rq->q == hctx->queue) + rq = blk_mq_find_and_get_req(tags, bitnr); + if (!rq) + return true; + + if (rq->q == hctx->queue) iter_data->fn(hctx, rq, iter_data->data, reserved); + blk_mq_put_rq_ref(rq, hctx); return true; }
@@ -274,9 +289,13 @@ static bool bt_tags_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data) * We can hit rq == NULL here, because the tagging functions * test and set the bit before assining ->rqs[]. */ - rq = tags->rqs[bitnr]; - if (rq && blk_mq_request_started(rq)) + rq = blk_mq_find_and_get_req(tags, bitnr); + if (!rq) + return true; + + if (blk_mq_request_started(rq)) iter_data->fn(rq, iter_data->data, reserved); + blk_mq_put_rq_ref(rq, blk_mq_map_queue(rq->q, rq->mq_ctx->cpu));
return true; } diff --git a/block/blk-mq.c b/block/blk-mq.c index b37d0f6156f9d..303c71e81146e 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -854,6 +854,14 @@ static bool blk_mq_req_expired(struct request *rq, unsigned long *next) return false; }
+void blk_mq_put_rq_ref(struct request *rq, struct blk_mq_hw_ctx *hctx) +{ + if (is_flush_rq(rq, hctx)) + rq->end_io(rq, 0); + else if (refcount_dec_and_test(&rq->ref)) + __blk_mq_free_request(rq); +} + static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx, struct request *rq, void *priv, bool reserved) { @@ -887,10 +895,7 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx, if (blk_mq_req_expired(rq, next)) blk_mq_rq_timed_out(rq, reserved);
- if (is_flush_rq(rq, hctx)) - rq->end_io(rq, 0); - else if (refcount_dec_and_test(&rq->ref)) - __blk_mq_free_request(rq); + blk_mq_put_rq_ref(rq, hctx); }
static void blk_mq_timeout_work(struct work_struct *work) diff --git a/block/blk-mq.h b/block/blk-mq.h index debc646e1bed8..1fc3debe69f94 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -45,6 +45,7 @@ void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list); bool blk_mq_get_driver_tag(struct request *rq); struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *start); +void blk_mq_put_rq_ref(struct request *rq, struct blk_mq_hw_ctx *hctx);
/* * Internal helpers for allocating/freeing the request map
From: Ming Lei <ming.lei@redhat.com>

mainline inclusion
from mainline-v5.13
commit bd63141d585bef14f4caf111f6d0e27fe2300ec6
category: bugfix
bugzilla: 34280, https://gitee.com/openeuler/kernel/issues/I4AKY4
CVE: NA
-----------------------------------------------
refcount_inc_not_zero() in bt_tags_iter() may still read an already freed request.

Fix the issue with the following approach:

1) hold a per-tags spinlock while reading ->rqs[tag] and calling refcount_inc_not_zero() in bt_tags_iter()

2) clear stale requests referenced via ->rqs[tag] before freeing the request pool; the per-tags spinlock is held while clearing the stale ->rqs[tag] entries

After the stale requests have been cleared, bt_tags_iter() can no longer observe a freed request, and the clearing also waits for pending request references to be dropped.

The idea of clearing ->rqs[] is borrowed from an earlier patch by John Garry and a recent patch by David Jeffery.
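A minimal sketch of the two sides of the handshake (reconstructed from the diffs below for illustration; not an additional hunk):

	/* reader side: bt_tags_iter() -> blk_mq_find_and_get_req() */
	spin_lock_irqsave(&tags->lock, flags);
	rq = tags->rqs[bitnr];
	if (!rq || !refcount_inc_not_zero(&rq->ref))
		rq = NULL;
	spin_unlock_irqrestore(&tags->lock, flags);

	/* free side: blk_mq_clear_rq_mapping() */
	cmpxchg(&drv_tags->rqs[i], rq, NULL);          /* clear stale reference */
	...
	spin_lock_irqsave(&drv_tags->lock, flags);     /* wait for pending readers */
	spin_unlock_irqrestore(&drv_tags->lock, flags);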
Tested-by: John Garry <john.garry@huawei.com>
Reviewed-by: David Jeffery <djeffery@redhat.com>
Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Link: https://lore.kernel.org/r/20210511152236.763464-4-ming.lei@redhat.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Reviewed-by: Hou Tao <houtao1@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 block/blk-mq-tag.c | 9 +++++++--
 block/blk-mq-tag.h | 6 ++++++
 block/blk-mq.c | 46 +++++++++++++++++++++++++++++++++++++++++-----
 3 files changed, 54 insertions(+), 7 deletions(-)
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index dcf65c9b767ba..04986f7d87406 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -222,10 +222,14 @@ struct bt_iter_data { static struct request *blk_mq_find_and_get_req(struct blk_mq_tags *tags, unsigned int bitnr) { - struct request *rq = tags->rqs[bitnr]; + struct request *rq; + unsigned long flags;
+ spin_lock_irqsave(&tags->lock, flags); + rq = tags->rqs[bitnr]; if (!rq || !refcount_inc_not_zero(&rq->ref)) - return NULL; + rq = NULL; + spin_unlock_irqrestore(&tags->lock, flags); return rq; }
@@ -439,6 +443,7 @@ struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,
tags->nr_tags = total_tags; tags->nr_reserved_tags = reserved_tags; + spin_lock_init(&tags->lock);
return blk_mq_init_bitmap_tags(tags, node, alloc_policy); } diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h index 61deab0b5a5a5..1a6cfb608f8aa 100644 --- a/block/blk-mq-tag.h +++ b/block/blk-mq-tag.h @@ -19,6 +19,12 @@ struct blk_mq_tags { struct request **rqs; struct request **static_rqs; struct list_head page_list; + + /* + * used to clear request reference in rqs[] before freeing one + * request pool + */ + spinlock_t lock; };
diff --git a/block/blk-mq.c b/block/blk-mq.c index 303c71e81146e..a0067147dd72d 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2024,6 +2024,45 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) return cookie; }
+static size_t order_to_size(unsigned int order) +{ + return (size_t)PAGE_SIZE << order; +} + +/* called before freeing request pool in @tags */ +static void blk_mq_clear_rq_mapping(struct blk_mq_tag_set *set, + struct blk_mq_tags *tags, unsigned int hctx_idx) +{ + struct blk_mq_tags *drv_tags = set->tags[hctx_idx]; + struct page *page; + unsigned long flags; + + list_for_each_entry(page, &tags->page_list, lru) { + unsigned long start = (unsigned long)page_address(page); + unsigned long end = start + order_to_size(page->private); + int i; + + for (i = 0; i < set->queue_depth; i++) { + struct request *rq = drv_tags->rqs[i]; + unsigned long rq_addr = (unsigned long)rq; + + if (rq_addr >= start && rq_addr < end) { + WARN_ON_ONCE(refcount_read(&rq->ref) != 0); + cmpxchg(&drv_tags->rqs[i], rq, NULL); + } + } + } + + /* + * Wait until all pending iteration is done. + * + * Request reference is cleared and it is guaranteed to be observed + * after the ->lock is released. + */ + spin_lock_irqsave(&drv_tags->lock, flags); + spin_unlock_irqrestore(&drv_tags->lock, flags); +} + void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, unsigned int hctx_idx) { @@ -2042,6 +2081,8 @@ void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, } }
+ blk_mq_clear_rq_mapping(set, tags, hctx_idx); + while (!list_empty(&tags->page_list)) { page = list_first_entry(&tags->page_list, struct page, lru); list_del_init(&page->lru); @@ -2101,11 +2142,6 @@ struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, return tags; }
-static size_t order_to_size(unsigned int order) -{ - return (size_t)PAGE_SIZE << order; -} - static int blk_mq_init_request(struct blk_mq_tag_set *set, struct request *rq, unsigned int hctx_idx, int node) {
From: Ming Lei <ming.lei@redhat.com>

mainline inclusion
from mainline-v5.13
commit 364b61818f65045479e42e76ed8dd6f051778280
category: bugfix
bugzilla: 34280, https://gitee.com/openeuler/kernel/issues/I4AKY4
CVE: NA
-----------------------------------------------
Before freeing the request queue, clear the flush request reference in tags->rqs[] so that a potential UAF can be avoided.

Based on a patch written by David Jeffery.
Tested-by: John Garry <john.garry@huawei.com>
Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Reviewed-by: David Jeffery <djeffery@redhat.com>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Link: https://lore.kernel.org/r/20210511152236.763464-5-ming.lei@redhat.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Reviewed-by: Hou Tao <houtao1@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 block/blk-mq.c | 35 ++++++++++++++++++++++++++++++++++-
 1 file changed, 34 insertions(+), 1 deletion(-)
diff --git a/block/blk-mq.c b/block/blk-mq.c index a0067147dd72d..d223e771eb2e0 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2272,16 +2272,49 @@ static void blk_mq_remove_cpuhp(struct blk_mq_hw_ctx *hctx) &hctx->cpuhp_dead); }
+/* + * Before freeing hw queue, clearing the flush request reference in + * tags->rqs[] for avoiding potential UAF. + */ +static void blk_mq_clear_flush_rq_mapping(struct blk_mq_tags *tags, + unsigned int queue_depth, struct request *flush_rq) +{ + int i; + unsigned long flags; + + /* The hw queue may not be mapped yet */ + if (!tags) + return; + + WARN_ON_ONCE(refcount_read(&flush_rq->ref) != 0); + + for (i = 0; i < queue_depth; i++) + cmpxchg(&tags->rqs[i], flush_rq, NULL); + + /* + * Wait until all pending iteration is done. + * + * Request reference is cleared and it is guaranteed to be observed + * after the ->lock is released. + */ + spin_lock_irqsave(&tags->lock, flags); + spin_unlock_irqrestore(&tags->lock, flags); +} + /* hctx->ctxs will be freed in queue's release handler */ static void blk_mq_exit_hctx(struct request_queue *q, struct blk_mq_tag_set *set, struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) { + struct request *flush_rq = hctx->fq->flush_rq; + if (blk_mq_hw_queue_mapped(hctx)) blk_mq_tag_idle(hctx);
+ blk_mq_clear_flush_rq_mapping(set->tags[hctx_idx], + set->queue_depth, flush_rq); if (set->ops->exit_request) - set->ops->exit_request(set, hctx->fq->flush_rq, hctx_idx); + set->ops->exit_request(set, flush_rq, hctx_idx);
if (set->ops->exit_hctx) set->ops->exit_hctx(hctx, hctx_idx);
From: Ming Lei <ming.lei@redhat.com>

mainline inclusion
from mainline-v5.14-rc7
commit c797b40ccc340b8a66f7a7842aecc90bf749f087
category: bugfix
bugzilla: 34280, https://gitee.com/openeuler/kernel/issues/I4AKY4
CVE: NA
-----------------------------------------------
Inside blk_mq_queue_tag_busy_iter() the request's refcount is already grabbed before calling ->fn(), so there is no need to grab it one more time in blk_mq_check_expired().

Meanwhile, remove the extra request-expiry check in blk_mq_check_expired().
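With the refcount handling dropped, blk_mq_check_expired() reduces to roughly the following (a sketch reconstructed from the diff below; the in-tree comment is shortened here):

	static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
			struct request *rq, void *priv, bool reserved)
	{
		unsigned long *next = priv;

		/* the iterator already holds a reference to @rq */
		if (blk_mq_req_expired(rq, next))
			blk_mq_rq_timed_out(rq, reserved);
	}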
Cc: Keith Busch <kbusch@kernel.org>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: John Garry <john.garry@huawei.com>
Link: https://lore.kernel.org/r/20210811155202.629575-1-ming.lei@redhat.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Reviewed-by: Hou Tao <houtao1@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 block/blk-mq.c | 30 +++++-------------------------
 1 file changed, 5 insertions(+), 25 deletions(-)
diff --git a/block/blk-mq.c b/block/blk-mq.c index d223e771eb2e0..ed1d9bd917c23 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -868,34 +868,14 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx, unsigned long *next = priv;
/* - * Just do a quick check if it is expired before locking the request in - * so we're not unnecessarilly synchronizing across CPUs. - */ - if (!blk_mq_req_expired(rq, next)) - return; - - /* - * We have reason to believe the request may be expired. Take a - * reference on the request to lock this request lifetime into its - * currently allocated context to prevent it from being reallocated in - * the event the completion by-passes this timeout handler. - * - * If the reference was already released, then the driver beat the - * timeout handler to posting a natural completion. - */ - if (!refcount_inc_not_zero(&rq->ref)) - return; - - /* - * The request is now locked and cannot be reallocated underneath the - * timeout handler's processing. Re-verify this exact request is truly - * expired; if it is not expired, then the request was completed and - * reallocated as a new request. + * blk_mq_queue_tag_busy_iter() has locked the request, so it cannot + * be reallocated underneath the timeout handler's processing, then + * the expire check is reliable. If the request is not expired, then + * it was completed and reallocated as a new request after returning + * from blk_mq_check_expired(). */ if (blk_mq_req_expired(rq, next)) blk_mq_rq_timed_out(rq, reserved); - - blk_mq_put_rq_ref(rq, hctx); }
static void blk_mq_timeout_work(struct work_struct *work)
From: Yu Kuai <yukuai3@huawei.com>

hulk inclusion
category: bugfix
bugzilla: 34280, https://gitee.com/openeuler/kernel/issues/I4AKY4
CVE: NA
-----------------------------------------------
Prepare to fix a kernel panic when iterating over flush requests; no functional changes.
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Reviewed-by: Hou Tao <houtao1@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 block/blk-core.c | 7 ++++++-
 block/blk.h | 1 +
 2 files changed, 7 insertions(+), 1 deletion(-)
diff --git a/block/blk-core.c b/block/blk-core.c index 7760bebeabcd7..47b04b45df7f5 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -183,7 +183,7 @@ void blk_queue_congestion_threshold(struct request_queue *q) q->nr_congestion_off = nr; }
-void blk_rq_init(struct request_queue *q, struct request *rq) +void __blk_rq_init(struct request_queue *q, struct request *rq) { memset(rq, 0, sizeof(*rq));
@@ -198,6 +198,11 @@ void blk_rq_init(struct request_queue *q, struct request *rq) rq->internal_tag = -1; rq->start_time_ns = ktime_get_ns(); rq->part = NULL; +} + +void blk_rq_init(struct request_queue *q, struct request *rq) +{ + __blk_rq_init(q, rq); refcount_set(&rq->ref, 1); } EXPORT_SYMBOL(blk_rq_init); diff --git a/block/blk.h b/block/blk.h index 8bb450ef6659b..62464a303b3b0 100644 --- a/block/blk.h +++ b/block/blk.h @@ -262,6 +262,7 @@ int elevator_switch_mq(struct request_queue *q, void elevator_exit(struct request_queue *, struct elevator_queue *); int elv_register_queue(struct request_queue *q); void elv_unregister_queue(struct request_queue *q); +void __blk_rq_init(struct request_queue *q, struct request *rq);
struct hd_struct *__disk_get_part(struct gendisk *disk, int partno);
From: Ming Lei <ming.lei@redhat.com>

mainline inclusion
from mainline-v5.14-rc7
commit c2da19ed50554ce52ecbad3655c98371fe58599f
category: bugfix
bugzilla: 34280, https://gitee.com/openeuler/kernel/issues/I4AKY4
CVE: NA
-----------------------------------------------
To fix a use-after-free while iterating over requests, commit 2e315dc07df0 ("blk-mq: grab rq->refcount before calling ->fn in blk_mq_tagset_busy_iter") grabbed the request's refcount before calling ->fn. It turns out this can cause a kernel panic when iterating over a flush request:

1) the old flush request's tag has just been released, and the tag is reused by a new request, but ->rqs[] hasn't been updated yet

2) the flush request can be re-used for submitting a new flush command, so blk_rq_init() is called at the same time

3) meanwhile blk_mq_queue_tag_busy_iter() is called, and the old flush request is retrieved from ->rqs[tag]; when blk_mq_put_rq_ref() is called, flush_rq->end_io may not have been updated yet, so a NULL pointer dereference is triggered in blk_mq_put_rq_ref().

Fix the issue by calling refcount_set(&flush_rq->ref, 1) after flush_rq->end_io is set. So far the only other caller of blk_rq_init() is scsi_ioctl_reset(), in which the request doesn't enter the block IO stack and the request reference count isn't used, so the change is safe.
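A minimal sketch of the resulting ordering (reconstructed from the diff below and from blk_mq_find_and_get_req() added earlier in this series; illustrative only):

	/* writer, blk_kick_flush(): publish the flush request */
	flush_rq->end_io = flush_end_io;
	smp_wmb();                            /* order WRITE ->end_io before WRITE ->ref */
	refcount_set(&flush_rq->ref, 1);

	/* reader, blk_mq_find_and_get_req(): pairs with the smp_wmb() above */
	if (rq && refcount_inc_not_zero(&rq->ref))
		return rq;                    /* rq->end_io is now safe to read */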
Fixes: 2e315dc07df0 ("blk-mq: grab rq->refcount before calling ->fn in blk_mq_tagset_busy_iter")
Reported-by: "Blank-Burian, Markus, Dr." <blankburian@uni-muenster.de>
Tested-by: "Blank-Burian, Markus, Dr." <blankburian@uni-muenster.de>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: John Garry <john.garry@huawei.com>
Link: https://lore.kernel.org/r/20210811142624.618598-1-ming.lei@redhat.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>

Conflicts:
- use __blk_rq_init() instead of blk_rq_init().

Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Reviewed-by: Hou Tao <houtao1@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 block/blk-flush.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)
diff --git a/block/blk-flush.c b/block/blk-flush.c index c357e5c16d89c..c87a0ffba9c74 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -340,7 +340,7 @@ static bool blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq, */ fq->flush_pending_idx ^= 1;
- blk_rq_init(q, flush_rq); + __blk_rq_init(q, flush_rq);
/* * In case of none scheduler, borrow tag from the first request @@ -371,6 +371,15 @@ static bool blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq, flush_rq->rq_disk = first_rq->rq_disk; flush_rq->end_io = flush_end_io;
+ /* + * Order WRITE ->end_io and WRITE rq->ref, and its pair is the one + * implied in refcount_inc_not_zero() called from + * blk_mq_find_and_get_req(), which orders WRITE/READ flush_rq->ref + * and READ flush_rq->end_io + */ + smp_wmb(); + refcount_set(&flush_rq->ref, 1); + return blk_flush_queue_rq(flush_rq, false); }
From: Ming Lei <ming.lei@redhat.com>

mainline inclusion
from mainline-v5.14-rc7
commit a9ed27a764156929efe714033edb3e9023c5f321
category: bugfix
bugzilla: 34280, https://gitee.com/openeuler/kernel/issues/I4AKY4
CVE: NA
-----------------------------------------------
is_flush_rq() is called from bt_iter()/bt_tags_iter(), and runs the following check:
hctx->fq->flush_rq == req
but the hctx passed from bt_iter()/bt_tags_iter() may be NULL because of:

1) memory re-ordering in blk_mq_rq_ctx_init():

	rq->mq_hctx = data->hctx;
	...
	refcount_set(&rq->ref, 1);
OR
2) tag re-use, where ->rqs[] hasn't been updated with the new request yet.

Fix the issue by rewriting is_flush_rq() as:
return rq->end_io == flush_end_io;
which turns out to be simpler to follow and immune to the data race, since the WRITE of rq->end_io is ordered before refcount_set(&rq->ref, 1).
Fixes: 2e315dc07df0 ("blk-mq: grab rq->refcount before calling ->fn in blk_mq_tagset_busy_iter")
Cc: "Blank-Burian, Markus, Dr." <blankburian@uni-muenster.de>
Cc: Yufen Yu <yuyufen@huawei.com>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Link: https://lore.kernel.org/r/20210818010925.607383-1-ming.lei@redhat.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>

Conflicts:
- rq->mq_hctx does not exist; however, the problem still exists because rq->mq_ctx is used to get the hctx.
- the second parameter 'hctx' of blk_mq_put_rq_ref() can be removed.

Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Reviewed-by: Hou Tao <houtao1@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 block/blk-flush.c | 5 +++++
 block/blk-mq-tag.c | 4 ++--
 block/blk-mq.c | 4 ++--
 block/blk-mq.h | 2 +-
 block/blk.h | 6 +-----
 5 files changed, 11 insertions(+), 10 deletions(-)
diff --git a/block/blk-flush.c b/block/blk-flush.c index c87a0ffba9c74..022a2bbb012d7 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -295,6 +295,11 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error) spin_unlock_irqrestore(&fq->mq_flush_lock, flags); }
+bool is_flush_rq(struct request *rq) +{ + return rq->end_io == flush_end_io; +} + /** * blk_kick_flush - consider issuing flush request * @q: request_queue being kicked diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 04986f7d87406..5c740716c5e5a 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -255,7 +255,7 @@ static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
if (rq->q == hctx->queue) iter_data->fn(hctx, rq, iter_data->data, reserved); - blk_mq_put_rq_ref(rq, hctx); + blk_mq_put_rq_ref(rq); return true; }
@@ -299,7 +299,7 @@ static bool bt_tags_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
if (blk_mq_request_started(rq)) iter_data->fn(rq, iter_data->data, reserved); - blk_mq_put_rq_ref(rq, blk_mq_map_queue(rq->q, rq->mq_ctx->cpu)); + blk_mq_put_rq_ref(rq);
return true; } diff --git a/block/blk-mq.c b/block/blk-mq.c index ed1d9bd917c23..6da37cbb975c9 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -854,9 +854,9 @@ static bool blk_mq_req_expired(struct request *rq, unsigned long *next) return false; }
-void blk_mq_put_rq_ref(struct request *rq, struct blk_mq_hw_ctx *hctx) +void blk_mq_put_rq_ref(struct request *rq) { - if (is_flush_rq(rq, hctx)) + if (is_flush_rq(rq)) rq->end_io(rq, 0); else if (refcount_dec_and_test(&rq->ref)) __blk_mq_free_request(rq); diff --git a/block/blk-mq.h b/block/blk-mq.h index 1fc3debe69f94..bbb0c1d8849b4 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -45,7 +45,7 @@ void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list); bool blk_mq_get_driver_tag(struct request *rq); struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *start); -void blk_mq_put_rq_ref(struct request *rq, struct blk_mq_hw_ctx *hctx); +void blk_mq_put_rq_ref(struct request *rq);
/* * Internal helpers for allocating/freeing the request map diff --git a/block/blk.h b/block/blk.h index 62464a303b3b0..c2fa239ca78f2 100644 --- a/block/blk.h +++ b/block/blk.h @@ -140,11 +140,7 @@ static inline void __blk_get_queue(struct request_queue *q) kobject_get(&q->kobj); }
-static inline bool -is_flush_rq(struct request *req, struct blk_mq_hw_ctx *hctx) -{ - return hctx->fq->flush_rq == req; -} +bool is_flush_rq(struct request *req);
struct blk_flush_queue *blk_alloc_flush_queue(struct request_queue *q, int node, int cmd_size, gfp_t flags);
From: Yu Kuai <yukuai3@huawei.com>

hulk inclusion
category: bugfix
bugzilla: 34280, https://gitee.com/openeuler/kernel/issues/I4AKY4
CVE: NA
-----------------------------------------------
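Move the 'lock' spinlock that earlier patches in this series added to struct blk_mq_tags into a new struct blk_mq_tags_wrapper that embeds struct blk_mq_tags, and access it through the blk_mq_tags_to_wrapper() / blk_mq_tags_lock_irqsave() helpers, so that the layout of struct blk_mq_tags itself stays unchanged (presumably to avoid changing KABI). No functional change intended.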
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Reviewed-by: Hou Tao <houtao1@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 block/blk-mq-tag.c | 16 +++++++++-------
 block/blk-mq-tag.h | 10 ++++++++++
 block/blk-mq.c | 8 ++++----
 3 files changed, 23 insertions(+), 11 deletions(-)
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 5c740716c5e5a..bee92ab06a5e3 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -225,11 +225,11 @@ static struct request *blk_mq_find_and_get_req(struct blk_mq_tags *tags, struct request *rq; unsigned long flags;
- spin_lock_irqsave(&tags->lock, flags); + blk_mq_tags_lock_irqsave(tags, flags); rq = tags->rqs[bitnr]; if (!rq || !refcount_inc_not_zero(&rq->ref)) rq = NULL; - spin_unlock_irqrestore(&tags->lock, flags); + blk_mq_tags_unlock_irqrestore(tags, flags); return rq; }
@@ -422,7 +422,7 @@ static struct blk_mq_tags *blk_mq_init_bitmap_tags(struct blk_mq_tags *tags, free_bitmap_tags: sbitmap_queue_free(&tags->bitmap_tags); free_tags: - kfree(tags); + kfree(blk_mq_tags_to_wrapper(tags)); return NULL; }
@@ -431,19 +431,21 @@ struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags, int node, int alloc_policy) { struct blk_mq_tags *tags; + struct blk_mq_tags_wrapper *tags_wrapper;
if (total_tags > BLK_MQ_TAG_MAX) { pr_err("blk-mq: tag depth too large\n"); return NULL; }
- tags = kzalloc_node(sizeof(*tags), GFP_KERNEL, node); - if (!tags) + tags_wrapper = kzalloc_node(sizeof(*tags_wrapper), GFP_KERNEL, node); + if (!tags_wrapper) return NULL;
+ tags = &tags_wrapper->tags; tags->nr_tags = total_tags; tags->nr_reserved_tags = reserved_tags; - spin_lock_init(&tags->lock); + spin_lock_init(&tags_wrapper->lock);
return blk_mq_init_bitmap_tags(tags, node, alloc_policy); } @@ -452,7 +454,7 @@ void blk_mq_free_tags(struct blk_mq_tags *tags) { sbitmap_queue_free(&tags->bitmap_tags); sbitmap_queue_free(&tags->breserved_tags); - kfree(tags); + kfree(blk_mq_tags_to_wrapper(tags)); }
int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx, diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h index 1a6cfb608f8aa..bc7a4c58c76f7 100644 --- a/block/blk-mq-tag.h +++ b/block/blk-mq-tag.h @@ -19,6 +19,10 @@ struct blk_mq_tags { struct request **rqs; struct request **static_rqs; struct list_head page_list; +}; + +struct blk_mq_tags_wrapper { + struct blk_mq_tags tags;
/* * used to clear request reference in rqs[] before freeing one @@ -27,6 +31,12 @@ struct blk_mq_tags { spinlock_t lock; };
+#define blk_mq_tags_to_wrapper(t) \ + container_of(t, struct blk_mq_tags_wrapper, tags) +#define blk_mq_tags_lock_irqsave(tags, flags) \ + spin_lock_irqsave(&blk_mq_tags_to_wrapper(tags)->lock, flags) +#define blk_mq_tags_unlock_irqrestore(tags, flags) \ + spin_unlock_irqrestore(&blk_mq_tags_to_wrapper(tags)->lock, flags)
extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, unsigned int reserved_tags, int node, int alloc_policy); extern void blk_mq_free_tags(struct blk_mq_tags *tags); diff --git a/block/blk-mq.c b/block/blk-mq.c index 6da37cbb975c9..7106c94ea58fe 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2039,8 +2039,8 @@ static void blk_mq_clear_rq_mapping(struct blk_mq_tag_set *set, * Request reference is cleared and it is guaranteed to be observed * after the ->lock is released. */ - spin_lock_irqsave(&drv_tags->lock, flags); - spin_unlock_irqrestore(&drv_tags->lock, flags); + blk_mq_tags_lock_irqsave(drv_tags, flags); + blk_mq_tags_unlock_irqrestore(drv_tags, flags); }
void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, @@ -2277,8 +2277,8 @@ static void blk_mq_clear_flush_rq_mapping(struct blk_mq_tags *tags, * Request reference is cleared and it is guaranteed to be observed * after the ->lock is released. */ - spin_lock_irqsave(&tags->lock, flags); - spin_unlock_irqrestore(&tags->lock, flags); + blk_mq_tags_lock_irqsave(tags, flags); + blk_mq_tags_unlock_irqrestore(tags, flags); }
/* hctx->ctxs will be freed in queue's release handler */