Bart Van Assche (1):
  blk-mq: remove blk_mq_put_ctx()

Jens Axboe (4):
  block: add blk_time_get_ns() and blk_time_get() helpers
  block: cache current nsec time in struct blk_plug
  block: update cached timestamp post schedule/preemption
  block: limit block time caching to in_task() context

Yu Kuai (25):
  blk-mq: export blk-mq-debugfs apis
  block: fix that blk_time_get_ns() doesn't update time after schedule
  block: fix kabi broken in struct blk_plug
  block: support to record bio allocation time
  block: support to record bio allocation task
  block: support to record bio allocation time in request
  block: fix kabi broken for struct request
  block: support to record bio allocation task in request
  block: fix kabi broken for struct blk_mq_alloc_data
  blk-mq-debugfs: factor out a new helper to show allocated request
  block: support to record when request is completed
  block-io-hierarchy: core hierarchy stats implementation
  block-io-hierarchy: core hierarchy iodump implementation
  blk-io-hierarchy: support to record the number of slow IO
  blk-io-hierarchy: support new bio based stage blk-throtl
  blk-io-hierarchy: support new bio based stage blk-wbt
  blk-io-hierarchy: support new bio based stage gettag
  blk-io-hierarchy: support new rq based stage plug
  blk-io-hierarchy: support new rq based stage mq-deadline
  blk-io-hierarchy: support new rq based stage bfq
  blk-io-hierarchy: support new rq based stage kyber
  blk-io-hierarchy: support new rq based stage hctx
  blk-io-hierarchy: support new rq based stage requeue
  blk-io-hierarchy: support new rq based stage rq_driver
  blk-io-hierarchy: support new stage for bio lifetime
 arch/arm64/configs/openeuler_defconfig |   1 +
 arch/x86/configs/openeuler_defconfig   |   1 +
 block/Kconfig                          |   8 +
 block/Makefile                         |   1 +
 block/bfq-cgroup.c                     |  15 +-
 block/bfq-iosched.c                    |  28 +-
 block/bio.c                            |  17 +
 block/blk-core.c                       |  37 +-
 block/blk-flush.c                      |  19 +-
 block/blk-io-hierarchy/Kconfig         | 145 +++++
 block/blk-io-hierarchy/Makefile        |   8 +
 block/blk-io-hierarchy/debugfs.c       | 230 ++++
 block/blk-io-hierarchy/iodump.c        | 754 +++++++++++++++++++++++++
 block/blk-io-hierarchy/iodump.h        | 103 ++++
 block/blk-io-hierarchy/stats.c         | 445 +++++++++++++++
 block/blk-io-hierarchy/stats.h         | 372 ++++++++++++
 block/blk-merge.c                      |   1 +
 block/blk-mq-debugfs.c                 |  21 +-
 block/blk-mq-debugfs.h                 |  10 +
 block/blk-mq-sched.c                   |   7 +-
 block/blk-mq-tag.c                     |  13 +-
 block/blk-mq.c                         |  76 ++-
 block/blk-mq.h                         |  56 +-
 block/blk-sysfs.c                      |  16 +
 block/blk-throttle.c                   |  29 +-
 block/blk-wbt.c                        |  12 +-
 block/blk.h                            |  84 ++-
 block/kyber-iosched.c                  |  10 +-
 block/mq-deadline.c                    |  15 +-
 include/linux/blk_types.h              |  55 +-
 include/linux/blkdev.h                 |  16 +
 include/linux/sched.h                  |   2 +
 kernel/sched/core.c                    |   6 +-
 33 files changed, 2532 insertions(+), 81 deletions(-)
 create mode 100644 block/blk-io-hierarchy/Kconfig
 create mode 100644 block/blk-io-hierarchy/Makefile
 create mode 100644 block/blk-io-hierarchy/debugfs.c
 create mode 100644 block/blk-io-hierarchy/iodump.c
 create mode 100644 block/blk-io-hierarchy/iodump.h
 create mode 100644 block/blk-io-hierarchy/stats.c
 create mode 100644 block/blk-io-hierarchy/stats.h
hulk inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/IAGRKP
CVE: NA
--------------------------------
These APIs will be used later to create new debugfs files. No functional changes.
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
---
 block/blk-mq-debugfs.c | 4 ++--
 block/blk-mq-debugfs.h | 9 +++++++++
 2 files changed, 11 insertions(+), 2 deletions(-)
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index f0865b6ea1e1..1c8113f8acb1 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -811,8 +811,8 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_ctx_attrs[] = {
 	{},
 };
 
-static bool debugfs_create_files(struct dentry *parent, void *data,
-				 const struct blk_mq_debugfs_attr *attr)
+bool debugfs_create_files(struct dentry *parent, void *data,
+			  const struct blk_mq_debugfs_attr *attr)
 {
 	if (IS_ERR_OR_NULL(parent))
 		return false;
diff --git a/block/blk-mq-debugfs.h b/block/blk-mq-debugfs.h
index a9160be12be0..b900fd7c90e1 100644
--- a/block/blk-mq-debugfs.h
+++ b/block/blk-mq-debugfs.h
@@ -31,6 +31,15 @@ void blk_mq_debugfs_unregister_sched(struct request_queue *q);
 int blk_mq_debugfs_register_sched_hctx(struct request_queue *q,
 				       struct blk_mq_hw_ctx *hctx);
 void blk_mq_debugfs_unregister_sched_hctx(struct blk_mq_hw_ctx *hctx);
+
+bool debugfs_create_files(struct dentry *parent, void *data,
+			  const struct blk_mq_debugfs_attr *attr);
+
+static inline bool blk_mq_debugfs_enabled(struct request_queue *q)
+{
+	return !IS_ERR_OR_NULL(q->debugfs_dir);
+}
+
 #else
 static inline int blk_mq_debugfs_register(struct request_queue *q)
 {
From: Jens Axboe <axboe@kernel.dk>
mainline inclusion
from mainline-v6.9-rc1
commit 08420cf70cfb32eed2a0abfeb5c54c5651bd0c99
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/IAGRKP
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
Convert any user of ktime_get_ns() to use blk_time_get_ns(), and ktime_get() to blk_time_get(), so we have a unified API for querying the current time in nanoseconds or as ktime.
No functional changes intended, this patch just wraps ktime_get_ns() and ktime_get() with a block helper.
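For reference, the mainline patch adds both helpers to block/blk.h as plain pass-throughs (the backported diff below only carries the _ns variant); a minimal sketch:

	static inline u64 blk_time_get_ns(void)
	{
		return ktime_get_ns();
	}

	static inline ktime_t blk_time_get(void)
	{
		return ns_to_ktime(blk_time_get_ns());
	}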
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Conflicts:
	block/bfq-cgroup.c
	block/bfq-iosched.c
	block/blk-cgroup.c
	block/blk-flush.c
	block/blk-iocost.c
	block/blk-iolatency.c
	block/blk-core.c
	block/blk-mq-debugfs.h
	block/blk-mq.c
	block/blk-throttle.c
	block/blk-wbt.c
	block/blk.h
	block/kyber-iosched.c
[Lots of conflicts in context, reimplement the patch for current context]
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
---
 block/bfq-cgroup.c    | 15 ++++++++-------
 block/bfq-iosched.c   | 17 +++++++++--------
 block/blk-core.c      |  8 ++++----
 block/blk-mq.c        |  6 +++---
 block/blk-throttle.c  |  6 +++---
 block/blk.h           |  4 ++++
 block/kyber-iosched.c |  2 +-
 7 files changed, 32 insertions(+), 26 deletions(-)
diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index b663cd8b9e46..25a407e5142d 100644 --- a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c @@ -22,6 +22,7 @@ #include <linux/sbitmap.h> #include <linux/delay.h>
+#include "blk.h" #include "bfq-iosched.h"
#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP) @@ -60,7 +61,7 @@ static void bfqg_stats_update_group_wait_time(struct bfqg_stats *stats) if (!bfqg_stats_waiting(stats)) return;
- now = ktime_get_ns(); + now = blk_time_get_ns(); if (now > stats->start_group_wait_time) blkg_stat_add(&stats->group_wait_time, now - stats->start_group_wait_time); @@ -77,7 +78,7 @@ static void bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg, return; if (bfqg == curr_bfqg) return; - stats->start_group_wait_time = ktime_get_ns(); + stats->start_group_wait_time = blk_time_get_ns(); bfqg_stats_mark_waiting(stats); }
@@ -89,7 +90,7 @@ static void bfqg_stats_end_empty_time(struct bfqg_stats *stats) if (!bfqg_stats_empty(stats)) return;
- now = ktime_get_ns(); + now = blk_time_get_ns(); if (now > stats->start_empty_time) blkg_stat_add(&stats->empty_time, now - stats->start_empty_time); @@ -116,7 +117,7 @@ void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) if (bfqg_stats_empty(stats)) return;
- stats->start_empty_time = ktime_get_ns(); + stats->start_empty_time = blk_time_get_ns(); bfqg_stats_mark_empty(stats); }
@@ -125,7 +126,7 @@ void bfqg_stats_update_idle_time(struct bfq_group *bfqg) struct bfqg_stats *stats = &bfqg->stats;
if (bfqg_stats_idling(stats)) { - u64 now = ktime_get_ns(); + u64 now = blk_time_get_ns();
if (now > stats->start_idle_time) blkg_stat_add(&stats->idle_time, @@ -138,7 +139,7 @@ void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) { struct bfqg_stats *stats = &bfqg->stats;
- stats->start_idle_time = ktime_get_ns(); + stats->start_idle_time = blk_time_get_ns(); bfqg_stats_mark_idling(stats); }
@@ -175,7 +176,7 @@ void bfqg_stats_update_completion(struct bfq_group *bfqg, u64 start_time_ns, u64 io_start_time_ns, unsigned int op) { struct bfqg_stats *stats = &bfqg->stats; - u64 now = ktime_get_ns(); + u64 now = blk_time_get_ns();
if (now > io_start_time_ns) blkg_rwstat_add(&stats->service_time, op, diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 473d9e31ff87..cf15937eed5e 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -844,7 +844,7 @@ static struct request *bfq_check_fifo(struct bfq_queue *bfqq,
rq = rq_entry_fifo(bfqq->fifo.next);
- if (rq == last || ktime_get_ns() < rq->fifo_time) + if (rq == last || blk_time_get_ns() < rq->fifo_time) return NULL;
bfq_log_bfqq(bfqq->bfqd, bfqq, "check_fifo: returned %p", rq); @@ -1566,7 +1566,7 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, * bfq_bfqq_update_budg_for_activation for * details on the usage of the next variable. */ - arrived_in_time = ktime_get_ns() <= + arrived_in_time = blk_time_get_ns() <= bfqq->ttime.last_end_request + bfqd->bfq_slice_idle * 3;
@@ -2605,7 +2605,7 @@ static void bfq_reset_rate_computation(struct bfq_data *bfqd, struct request *rq) { if (rq != NULL) { /* new rq dispatch now, reset accordingly */ - bfqd->last_dispatch = bfqd->first_dispatch = ktime_get_ns(); + bfqd->last_dispatch = bfqd->first_dispatch = blk_time_get_ns(); bfqd->peak_rate_samples = 1; bfqd->sequential_samples = 0; bfqd->tot_sectors_dispatched = bfqd->last_rq_max_size = @@ -2762,7 +2762,7 @@ static void bfq_update_rate_reset(struct bfq_data *bfqd, struct request *rq) */ static void bfq_update_peak_rate(struct bfq_data *bfqd, struct request *rq) { - u64 now_ns = ktime_get_ns(); + u64 now_ns = blk_time_get_ns();
if (bfqd->peak_rate_samples == 0) { /* first dispatch */ bfq_log(bfqd, "update_peak_rate: goto reset, samples %d", @@ -4410,7 +4410,7 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, bfq_clear_bfqq_sync(bfqq);
/* set end request to minus infinity from now */ - bfqq->ttime.last_end_request = ktime_get_ns() + 1; + bfqq->ttime.last_end_request = blk_time_get_ns() + 1;
bfq_mark_bfqq_IO_bound(bfqq);
@@ -4528,7 +4528,7 @@ static void bfq_update_io_thinktime(struct bfq_data *bfqd, struct bfq_queue *bfqq) { struct bfq_ttime *ttime = &bfqq->ttime; - u64 elapsed = ktime_get_ns() - bfqq->ttime.last_end_request; + u64 elapsed = blk_time_get_ns() - bfqq->ttime.last_end_request;
elapsed = min_t(u64, elapsed, 2ULL * bfqd->bfq_slice_idle);
@@ -4697,7 +4697,8 @@ static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) bfq_add_request(rq); idle_timer_disabled = waiting && !bfq_bfqq_wait_request(bfqq);
- rq->fifo_time = ktime_get_ns() + bfqd->bfq_fifo_expire[rq_is_sync(rq)]; + rq->fifo_time = blk_time_get_ns() + + bfqd->bfq_fifo_expire[rq_is_sync(rq)]; list_add_tail(&rq->queuelist, &bfqq->fifo);
bfq_rq_enqueued(bfqd, bfqq, rq); @@ -4853,7 +4854,7 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) bfq_weights_tree_remove(bfqd, bfqq); }
- now_ns = ktime_get_ns(); + now_ns = blk_time_get_ns();
bfqq->ttime.last_end_request = now_ns;
diff --git a/block/blk-core.c b/block/blk-core.c index acf5585b0557..847fd7585952 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -454,7 +454,7 @@ void __blk_rq_init(struct request_queue *q, struct request *rq) RB_CLEAR_NODE(&rq->rb_node); rq->tag = -1; rq->internal_tag = -1; - rq->start_time_ns = ktime_get_ns(); + rq->start_time_ns = blk_time_get_ns(); rq->part = NULL; }
@@ -2952,7 +2952,7 @@ blk_status_t __blk_insert_cloned_request(struct request_queue *q, u64 now = 0;
if (blk_mq_need_time_stamp(rq)) - now = ktime_get_ns(); + now = blk_time_get_ns();
blk_account_io_done(rq, now); } @@ -3304,7 +3304,7 @@ void blk_start_request(struct request *req) blk_dequeue_request(req);
if (test_bit(QUEUE_FLAG_STATS, &req->q->queue_flags)) { - req->io_start_time_ns = ktime_get_ns(); + req->io_start_time_ns = blk_time_get_ns(); #ifdef CONFIG_BLK_DEV_THROTTLING_LOW req->throtl_size = blk_rq_sectors(req); #endif @@ -3509,7 +3509,7 @@ EXPORT_SYMBOL_GPL(blk_unprep_request); void blk_finish_request(struct request *req, blk_status_t error) { struct request_queue *q = req->q; - u64 now = ktime_get_ns(); + u64 now = blk_time_get_ns();
lockdep_assert_held(req->q->queue_lock); WARN_ON_ONCE(q->mq_ops); diff --git a/block/blk-mq.c b/block/blk-mq.c index aa4b3c608249..76dd32ee6172 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -366,7 +366,7 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, RB_CLEAR_NODE(&rq->rb_node); rq->rq_disk = NULL; rq->part = NULL; - rq->start_time_ns = ktime_get_ns(); + rq->start_time_ns = blk_time_get_ns(); rq->io_start_time_ns = 0; rq->nr_phys_segments = 0; #if defined(CONFIG_BLK_DEV_INTEGRITY) @@ -576,7 +576,7 @@ EXPORT_SYMBOL_GPL(blk_mq_free_request);
inline void __blk_mq_end_request(struct request *rq, blk_status_t error) { - u64 now = ktime_get_ns(); + u64 now = blk_time_get_ns();
if (rq->rq_flags & RQF_STATS) { blk_mq_poll_stats_start(rq->q); @@ -724,7 +724,7 @@ void blk_mq_start_request(struct request *rq) trace_block_rq_issue(q, rq);
if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) { - rq->io_start_time_ns = ktime_get_ns(); + rq->io_start_time_ns = blk_time_get_ns(); #ifdef CONFIG_BLK_DEV_THROTTLING_LOW rq->throtl_size = blk_rq_sectors(rq); #endif diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 598191286557..0795935574d3 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -1910,7 +1910,7 @@ static bool throtl_tg_is_idle(struct throtl_grp *tg) time = min_t(unsigned long, MAX_IDLE_TIME, 4 * tg->idletime_threshold); ret = tg->latency_target == DFL_LATENCY_TARGET || tg->idletime_threshold == DFL_IDLE_THRESHOLD || - (ktime_get_ns() >> 10) - tg->last_finish_time > time || + (blk_time_get_ns() >> 10) - tg->last_finish_time > time || tg->avg_idletime > tg->idletime_threshold || (tg->latency_target && tg->bio_cnt && tg->bad_bio_cnt * 5 < tg->bio_cnt); @@ -2140,7 +2140,7 @@ static void throtl_downgrade_check(struct throtl_grp *tg)
static void blk_throtl_update_idletime(struct throtl_grp *tg) { - unsigned long now = ktime_get_ns() >> 10; + unsigned long now = blk_time_get_ns() >> 10; unsigned long last_finish_time = tg->last_finish_time;
if (now <= last_finish_time || last_finish_time == 0 || @@ -2403,7 +2403,7 @@ void blk_throtl_bio_endio(struct bio *bio) return; tg = blkg_to_tg(blkg);
- finish_time_ns = ktime_get_ns(); + finish_time_ns = blk_time_get_ns(); tg->last_finish_time = finish_time_ns >> 10;
start_time = bio_issue_time(&bio->bi_issue) >> 10; diff --git a/block/blk.h b/block/blk.h index 965e9c507654..7dba4947ef02 100644 --- a/block/blk.h +++ b/block/blk.h @@ -479,4 +479,8 @@ static inline void blk_free_queue_dispatch_async(struct request_queue *q) } #endif
+static inline u64 blk_time_get_ns(void) +{ + return ktime_get_ns(); +} #endif /* BLK_INTERNAL_H */ diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c index 833e9eaae640..f370d3e3f6e0 100644 --- a/block/kyber-iosched.c +++ b/block/kyber-iosched.c @@ -584,7 +584,7 @@ static void kyber_completed_request(struct request *rq) if (blk_stat_is_active(kqd->cb)) return;
- now = ktime_get_ns(); + now = blk_time_get_ns(); if (now < rq->io_start_time_ns) return;
From: Jens Axboe <axboe@kernel.dk>
mainline inclusion
from mainline-v6.9-rc1
commit da4c8c3d0975f031ef82d39927102e39fa6ddfac
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/IAGRKP
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
Querying the current time is the most costly thing we do in the block layer per IO, and depending on kernel config settings, we may do it many times per IO.
None of the callers actually need nsec granularity. Take advantage of that by caching the current time in the plug, with the assumption here being that any time checking will be temporally close enough that the slight loss of precision doesn't matter.
If the block plug gets flushed, e.g. on preempt or schedule out, then we invalidate the cached clock.
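The intended issue-side pattern, sketched (an illustrative fragment, not code from the diff below):

	struct blk_plug plug;

	blk_start_plug(&plug);		/* plug->cur_ktime = 0 */
	submit_bio(bio1);		/* first blk_time_get_ns() reads the clock */
	submit_bio(bio2);		/* subsequent calls reuse plug->cur_ktime */
	blk_finish_plug(&plug);		/* flushed; the cached time is dropped */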
On a basic peak IOPS test case with iostats enabled, this changes the performance from:
IOPS=108.41M, BW=52.93GiB/s, IOS/call=31/31
IOPS=108.43M, BW=52.94GiB/s, IOS/call=32/32
IOPS=108.29M, BW=52.88GiB/s, IOS/call=31/32
IOPS=108.35M, BW=52.91GiB/s, IOS/call=32/32
IOPS=108.42M, BW=52.94GiB/s, IOS/call=31/31
IOPS=108.40M, BW=52.93GiB/s, IOS/call=32/32
IOPS=108.31M, BW=52.89GiB/s, IOS/call=32/31
to
IOPS=118.79M, BW=58.00GiB/s, IOS/call=31/32
IOPS=118.62M, BW=57.92GiB/s, IOS/call=31/31
IOPS=118.80M, BW=58.01GiB/s, IOS/call=32/31
IOPS=118.78M, BW=58.00GiB/s, IOS/call=32/32
IOPS=118.69M, BW=57.95GiB/s, IOS/call=32/31
IOPS=118.62M, BW=57.92GiB/s, IOS/call=32/31
IOPS=118.63M, BW=57.92GiB/s, IOS/call=31/32
which is more than a 9% improvement in performance. Looking at perf diff, we can see a huge reduction in time overhead:
    10.55%    -9.88%  [kernel.vmlinux]  [k] read_tsc
     1.31%    -1.22%  [kernel.vmlinux]  [k] ktime_get
Note that since this relies on blk_plug for the caching, it's only applicable to the issue side. But this is where most of the time calls happen anyway. On the completion side, cached time stamping is done with struct io_comp_batch, as long as the driver supports it.
It's also worth noting that the above testing doesn't enable any of the higher cost CPU items on the block layer side, like wbt, cgroups, iocost, etc, which all would add additional time querying and hence overhead. IOW, results would likely look even better in comparison with those enabled, as distros would do.
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Conflicts:
	block/blk-core.c
	block/blk.h
	include/linux/blkdev.h
[the definition of struct blk_plug is different from mainline.]
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
---
 block/blk-core.c       |  1 +
 block/blk.h            | 14 +++++++++++++-
 include/linux/blkdev.h |  1 +
 3 files changed, 15 insertions(+), 1 deletion(-)
diff --git a/block/blk-core.c b/block/blk-core.c
index 847fd7585952..d63d82a628ac 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -3919,6 +3919,7 @@ void blk_start_plug(struct blk_plug *plug)
 	INIT_LIST_HEAD(&plug->list);
 	INIT_LIST_HEAD(&plug->mq_list);
 	INIT_LIST_HEAD(&plug->cb_list);
+	plug->cur_ktime = 0;
 	/*
 	 * Store ordering should not be needed here, since a potential
 	 * preempt will imply a full memory barrier
diff --git a/block/blk.h b/block/blk.h
index 7dba4947ef02..0c8de338b391 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -481,6 +481,18 @@ static inline void blk_free_queue_dispatch_async(struct request_queue *q)
 
 static inline u64 blk_time_get_ns(void)
 {
-	return ktime_get_ns();
+	struct blk_plug *plug = current->plug;
+
+	if (!plug)
+		return ktime_get_ns();
+
+	/*
+	 * 0 could very well be a valid time, but rather than flag "this is
+	 * a valid timestamp" separately, just accept that we'll do an extra
+	 * ktime_get_ns() if we just happen to get 0 as the current time.
+	 */
+	if (!plug->cur_ktime)
+		plug->cur_ktime = ktime_get_ns();
+	return plug->cur_ktime;
 }
 #endif /* BLK_INTERNAL_H */
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index c848f4205729..961bc66a0dd1 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1372,6 +1372,7 @@ struct blk_plug {
 	struct list_head list; /* requests */
 	struct list_head mq_list; /* blk-mq requests */
 	struct list_head cb_list; /* md requires an unplug callback */
+	u64 cur_ktime;
 };
 #define BLK_MAX_REQUEST_COUNT 16
 #define BLK_PLUG_FLUSH_SIZE (128 * 1024)
From: Jens Axboe <axboe@kernel.dk>
mainline inclusion
from mainline-v6.9-rc1
commit 06b23f92af87a84d70881b2ecaa72e00f7838264
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/IAGRKP
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
Mark the task as having a cached timestamp when we assign it, so we can efficiently check if it needs updating after being scheduled back in. This covers both the actual schedule-out case, which would've flushed the plug, and the preemption case, which doesn't touch the plugged requests (for many reasons, one of them being that we'd need to have preemption disabled around plug state manipulation).
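Condensed, the flag's lifecycle looks like this (a sketch of the diff below, not new code):

	/* issue side, first cached read while plugged: */
	plug->cur_ktime = ktime_get_ns();
	current->flags |= PF_BLOCK_TS;

	/* on being scheduled back in (sched_update_worker()): */
	if (tsk->flags & PF_BLOCK_TS)
		blk_plug_invalidate_ts(tsk);	/* zero cur_ktime, clear the flag */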
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Conflicts:
	block/blk.h
	block/blk-core.c
	include/linux/blkdev.h
	include/linux/sched.h
[Lots of conflicts in context, reimplement the patch for current context]
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
---
 block/blk-core.c       |  2 ++
 block/blk.h            |  4 +++-
 include/linux/blkdev.h | 16 ++++++++++++++++
 include/linux/sched.h  |  1 +
 kernel/sched/core.c    |  6 ++++--
 5 files changed, 26 insertions(+), 3 deletions(-)
diff --git a/block/blk-core.c b/block/blk-core.c
index d63d82a628ac..c8a033facf4d 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -4061,6 +4061,8 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
 	 */
 	if (q)
 		queue_unplugged(q, depth, from_schedule);
+
+	current->flags &= ~PF_BLOCK_TS;
 }
 
 void blk_finish_plug(struct blk_plug *plug)
diff --git a/block/blk.h b/block/blk.h
index 0c8de338b391..c92748b2aebd 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -491,8 +491,10 @@ static inline u64 blk_time_get_ns(void)
 	 * a valid timestamp" separately, just accept that we'll do an extra
 	 * ktime_get_ns() if we just happen to get 0 as the current time.
 	 */
-	if (!plug->cur_ktime)
+	if (!plug->cur_ktime) {
 		plug->cur_ktime = ktime_get_ns();
+		current->flags |= PF_BLOCK_TS;
+	}
 	return plug->cur_ktime;
 }
 #endif /* BLK_INTERNAL_H */
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 961bc66a0dd1..8fe611439db5 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1436,6 +1436,18 @@ static inline struct request *blk_map_queue_find_tag(struct blk_queue_tag *bqt,
 	return bqt->tag_index[tag];
 }
 
+/*
+ * tsk == current here
+ */
+static inline void blk_plug_invalidate_ts(struct task_struct *tsk)
+{
+	struct blk_plug *plug = tsk->plug;
+
+	if (plug)
+		plug->cur_ktime = 0;
+	current->flags &= ~PF_BLOCK_TS;
+}
+
 extern int blkdev_issue_flush(struct block_device *, gfp_t, sector_t *);
 extern int blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
 		sector_t nr_sects, gfp_t gfp_mask, struct page *page);
@@ -2151,6 +2163,10 @@ static inline bool blk_needs_flush_plug(struct task_struct *tsk)
 	return false;
 }
 
+static inline void blk_plug_invalidate_ts(struct task_struct *tsk)
+{
+}
+
 static inline int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask,
 				     sector_t *error_sector)
 {
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 26255b76ca52..14eba475138a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1495,6 +1495,7 @@ extern struct pid *cad_pid;
 #define PF_UCE_KERNEL_RECOVERY	0x02000000	/* Task in uce kernel recovery state */
 #define PF_NO_SETAFFINITY	0x04000000	/* Userland is not allowed to meddle with cpus_allowed */
 #define PF_MCE_EARLY		0x08000000	/* Early kill for mce process policy */
+#define PF_BLOCK_TS		0x10000000	/* plug has ts that needs updating */
 #define PF_IO_WORKER		0x20000000	/* Task is an IO worker */
 #define PF_FREEZER_SKIP		0x40000000	/* Freezer should not count it as freezable */
 #define PF_SUSPEND_TASK		0x80000000	/* This thread called freeze_processes() and should not be frozen */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index fe9f91f39e2f..e37428598155 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3581,10 +3581,12 @@ static inline void sched_submit_work(struct task_struct *tsk)
 
 static void sched_update_worker(struct task_struct *tsk)
 {
-	if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
+	if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER | PF_BLOCK_TS)) {
+		if (tsk->flags & PF_BLOCK_TS)
+			blk_plug_invalidate_ts(tsk);
 		if (tsk->flags & PF_WQ_WORKER)
 			wq_worker_running(tsk);
-		else
+		else if (tsk->flags & PF_IO_WORKER)
 			io_wq_worker_running(tsk);
 	}
 }
From: Jens Axboe <axboe@kernel.dk>
mainline inclusion
from mainline-v6.9-rc1
commit b874d4aae58b92144ec2c8fa5dc0a27c98388fcc
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/IAGRKP
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
We should not have any callers of this from non-task context, but Jakub ran into one [1] from blk-iocost. Rather than risk running into others, or future ones, just limit blk_time_get_ns() to when it is called from a task. Any other usage is invalid.
[1] https://lore.kernel.org/lkml/CAHk-=wiOaBLqarS2uFhM1YdwOvCX4CZaWkeyNDY1zONpbY...
Fixes: da4c8c3d0975 ("block: cache current nsec time in struct blk_plug")
Reported-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
---
 block/blk.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/block/blk.h b/block/blk.h
index c92748b2aebd..646940e1bd57 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -483,7 +483,7 @@ static inline u64 blk_time_get_ns(void)
 {
 	struct blk_plug *plug = current->plug;
 
-	if (!plug)
+	if (!plug || !in_task())
 		return ktime_get_ns();
 
 	/*
mainline inclusion
from mainline-v6.9-rc4
commit 3ec4848913d695245716ea45ca4872d9dff097a5
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/IAGRKP
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
While monitoring the iocost throttle time of IO, it was found that the time is always zero after the io_schedule() in ioc_rqos_throttle(); for example, with the following debug patch:
+	printk("%s-%d: %s enter %llu\n", current->comm, current->pid, __func__, blk_time_get_ns());
 	while (true) {
 		set_current_state(TASK_UNINTERRUPTIBLE);
 		if (wait.committed)
 			break;
 		io_schedule();
 	}
+	printk("%s-%d: %s exit %llu\n", current->comm, current->pid, __func__, blk_time_get_ns());
It can be observed that blk_time_get_ns() always returns the same time:
[ 1068.096579] fio-1268: ioc_rqos_throttle enter 1067901962288
[ 1068.272587] fio-1268: ioc_rqos_throttle exit 1067901962288
[ 1068.274389] fio-1268: ioc_rqos_throttle enter 1067901962288
[ 1068.472690] fio-1268: ioc_rqos_throttle exit 1067901962288
[ 1068.474485] fio-1268: ioc_rqos_throttle enter 1067901962288
[ 1068.672656] fio-1268: ioc_rqos_throttle exit 1067901962288
[ 1068.674451] fio-1268: ioc_rqos_throttle enter 1067901962288
[ 1068.872655] fio-1268: ioc_rqos_throttle exit 1067901962288
And I think the root cause is that 'PF_BLOCK_TS' is always cleared by blk_flush_plug() before schedule(), hence blk_plug_invalidate_ts() will never be called:
blk_time_get_ns
 plug->cur_ktime = ktime_get_ns();
 current->flags |= PF_BLOCK_TS;

io_schedule:
 io_schedule_prepare
  blk_flush_plug
   __blk_flush_plug
    /* the flag is cleared, while time is not */
    current->flags &= ~PF_BLOCK_TS;
 schedule
 sched_update_worker
  /* the flag is not set, hence plug->cur_ktime is not cleared */
  if (tsk->flags & PF_BLOCK_TS)
   blk_plug_invalidate_ts()

blk_time_get_ns
 /* got the time stashed before schedule */
 return plug->cur_ktime;
Fix the problem by clearing cached time in __blk_flush_plug().
Fixes: 06b23f92af87 ("block: update cached timestamp post schedule/preemption")
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Link: https://lore.kernel.org/r/20240411032349.3051233-2-yukuai1@huaweicloud.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Conflicts:
	block/blk-core.c
[__blk_flush_plug() doesn't exist, modify blk_flush_plug_list() instead.]
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
---
 block/blk-core.c | 1 +
 1 file changed, 1 insertion(+)
diff --git a/block/blk-core.c b/block/blk-core.c
index c8a033facf4d..47b37c2fa073 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -4062,6 +4062,7 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
 	if (q)
 		queue_unplugged(q, depth, from_schedule);
 
+	plug->cur_ktime = 0;
 	current->flags &= ~PF_BLOCK_TS;
 }
hulk inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/IAGRKP
CVE: NA
--------------------------------
blk_plug is always used with current, hence add the new field into task_struct to fix the kabi breakage.
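A minimal sketch of the resulting layout, assuming the existing openEuler task_struct_resvd indirection (the accessor below is illustrative, not part of the patch):

	/*
	 * task_struct itself is kabi-frozen, but it already carries a
	 * pointer to a kernel-private extension struct, so the cached
	 * timestamp moves from struct blk_plug into that extension.
	 */
	struct task_struct_resvd {
		/* ... existing fields ... */
		u64 cur_ktime;
	};

	static inline u64 task_cached_ktime(struct task_struct *tsk)
	{
		return tsk->_resvd->cur_ktime;
	}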
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
---
 block/blk-core.c       |  4 ++--
 block/blk.h            | 11 ++++++-----
 include/linux/blkdev.h |  3 +--
 include/linux/sched.h  |  1 +
 4 files changed, 10 insertions(+), 9 deletions(-)
diff --git a/block/blk-core.c b/block/blk-core.c
index 47b37c2fa073..caf3a897739e 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -3919,11 +3919,11 @@ void blk_start_plug(struct blk_plug *plug)
 	INIT_LIST_HEAD(&plug->list);
 	INIT_LIST_HEAD(&plug->mq_list);
 	INIT_LIST_HEAD(&plug->cb_list);
-	plug->cur_ktime = 0;
 	/*
 	 * Store ordering should not be needed here, since a potential
 	 * preempt will imply a full memory barrier
 	 */
+	tsk->_resvd->cur_ktime = 0;
 	tsk->plug = plug;
 }
 EXPORT_SYMBOL(blk_start_plug);
@@ -4062,7 +4062,7 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
 	if (q)
 		queue_unplugged(q, depth, from_schedule);
 
-	plug->cur_ktime = 0;
+	current->_resvd->cur_ktime = 0;
 	current->flags &= ~PF_BLOCK_TS;
 }
 
diff --git a/block/blk.h b/block/blk.h
index 646940e1bd57..3ded5dd1227d 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -481,7 +481,8 @@ static inline void blk_free_queue_dispatch_async(struct request_queue *q)
 
 static inline u64 blk_time_get_ns(void)
 {
-	struct blk_plug *plug = current->plug;
+	struct task_struct *tsk = current;
+	struct blk_plug *plug = tsk->plug;
 
 	if (!plug || !in_task())
 		return ktime_get_ns();
@@ -491,10 +492,10 @@ static inline u64 blk_time_get_ns(void)
 	 * a valid timestamp" separately, just accept that we'll do an extra
 	 * ktime_get_ns() if we just happen to get 0 as the current time.
 	 */
-	if (!plug->cur_ktime) {
-		plug->cur_ktime = ktime_get_ns();
-		current->flags |= PF_BLOCK_TS;
+	if (!tsk->_resvd->cur_ktime) {
+		tsk->_resvd->cur_ktime = ktime_get_ns();
+		tsk->flags |= PF_BLOCK_TS;
 	}
-	return plug->cur_ktime;
+	return tsk->_resvd->cur_ktime;
 }
 #endif /* BLK_INTERNAL_H */
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 8fe611439db5..241f59eb5b64 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1372,7 +1372,6 @@ struct blk_plug {
 	struct list_head list; /* requests */
 	struct list_head mq_list; /* blk-mq requests */
 	struct list_head cb_list; /* md requires an unplug callback */
-	u64 cur_ktime;
 };
 #define BLK_MAX_REQUEST_COUNT 16
 #define BLK_PLUG_FLUSH_SIZE (128 * 1024)
@@ -1444,7 +1443,7 @@ static inline void blk_plug_invalidate_ts(struct task_struct *tsk)
 	struct blk_plug *plug = tsk->plug;
 
 	if (plug)
-		plug->cur_ktime = 0;
+		current->_resvd->cur_ktime = 0;
 	current->flags &= ~PF_BLOCK_TS;
 }
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 14eba475138a..d2eceea955b0 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -635,6 +635,7 @@ struct task_struct_resvd {
 #ifdef CONFIG_QOS_SCHED_SMART_GRID
 	struct sched_grid_qos *grid_qos;
 #endif
+	u64 cur_ktime;
 };
 
 struct task_struct {
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/IAGRKP
CVE: NA
--------------------------------
A new config, BLK_BIO_ALLOC_TIME, is added to control the behaviour, and the recorded time will be used later by blk-io-hierarchy. Also fix the problem that a split bio is not tracked if CONFIG_EULEROS_BLK_IO_TIME is enabled (the split handling is sketched below).
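The split fix amounts to making a clone inherit the parent's stamp instead of being restamped at bio_init() time; a hypothetical sketch (the helper name is illustrative, not from this patch):

	static inline void bio_inherit_alloc_time(struct bio *split,
						  struct bio *parent)
	{
	#ifdef CONFIG_BLK_BIO_ALLOC_TIME
		/* account the split from the original allocation */
		split->bi_alloc_time_ns = parent->bi_alloc_time_ns;
	#endif
	}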
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
---
 block/Kconfig             | 3 +++
 block/bio.c               | 4 ++++
 include/linux/blk_types.h | 4 ++++
 3 files changed, 11 insertions(+)
diff --git a/block/Kconfig b/block/Kconfig
index da71e56f8682..de3574766ef5 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -26,6 +26,9 @@ menuconfig BLOCK
 
 if BLOCK
 
+config BLK_BIO_ALLOC_TIME
+	bool
+
 config LBDAF
 	bool "Support for large (2TB+) block devices and files"
 	depends on !64BIT
diff --git a/block/bio.c b/block/bio.c
index 06193e854577..200d17093a8e 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -285,6 +285,10 @@ void bio_init(struct bio *bio, struct bio_vec *table,
 
 	bio->bi_io_vec = table;
 	bio->bi_max_vecs = max_vecs;
+
+#ifdef CONFIG_BLK_BIO_ALLOC_TIME
+	bio->bi_alloc_time_ns = blk_time_get_ns();
+#endif
 }
 EXPORT_SYMBOL(bio_init);
 
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 8075b9955bb3..efafee166996 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -203,7 +203,11 @@ struct bio {
 
 	struct bio_set *bi_pool;
 
+#if defined(CONFIG_BLK_BIO_ALLOC_TIME) && !defined(__GENKSYMS__)
+	u64 bi_alloc_time_ns;
+#else
 	KABI_RESERVE(1)
+#endif
 	KABI_RESERVE(2)
 	KABI_RESERVE(3)
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/IAGRKP
CVE: NA
--------------------------------
A new config, BLK_BIO_ALLOC_TASK, is added to control the behaviour, and the recorded task will be used later by blk-io-hierarchy to dump thread info to user space.
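The bio pins a reference-counted struct pid rather than a raw task pointer, so a later dump stays safe even if the submitter has already exited; the lifetime, sketched (the dump side belongs to a later patch and is illustrative here):

	/* allocation (bio_init): take a reference on the submitter's pid */
	bio->pid = find_get_pid(current->pid);

	/* dump side: resolving may legitimately fail if the task is gone */
	struct task_struct *task = get_pid_task(bio->pid, PIDTYPE_PID);

	if (task) {
		/* ... print task->comm, pid_nr(bio->pid) ... */
		put_task_struct(task);
	}

	/* teardown (bio_uninit): drop the reference */
	put_pid(bio->pid);
	bio->pid = NULL;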
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
---
 block/Kconfig             |  3 +++
 block/bio.c               | 10 ++++++++++
 include/linux/blk_types.h |  4 ++++
 3 files changed, 17 insertions(+)
diff --git a/block/Kconfig b/block/Kconfig
index de3574766ef5..9b512a000af7 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -29,6 +29,9 @@ if BLOCK
 config BLK_BIO_ALLOC_TIME
 	bool
 
+config BLK_BIO_ALLOC_TASK
+	bool
+
 config LBDAF
 	bool "Support for large (2TB+) block devices and files"
 	depends on !64BIT
diff --git a/block/bio.c b/block/bio.c
index 200d17093a8e..b32dc89bb704 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -245,6 +245,12 @@ struct bio_vec *bvec_alloc(gfp_t gfp_mask, int nr, unsigned long *idx,
 void bio_uninit(struct bio *bio)
 {
 	bio_disassociate_task(bio);
+#ifdef CONFIG_BLK_BIO_ALLOC_TASK
+	if (bio->pid) {
+		put_pid(bio->pid);
+		bio->pid = NULL;
+	}
+#endif
 }
 EXPORT_SYMBOL(bio_uninit);
 
@@ -289,6 +295,10 @@ void bio_init(struct bio *bio, struct bio_vec *table,
 #ifdef CONFIG_BLK_BIO_ALLOC_TIME
 	bio->bi_alloc_time_ns = blk_time_get_ns();
 #endif
+
+#ifdef CONFIG_BLK_BIO_ALLOC_TASK
+	bio->pid = find_get_pid(current->pid);
+#endif
 }
 EXPORT_SYMBOL(bio_init);
 
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index efafee166996..416cf84a0624 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -208,7 +208,11 @@ struct bio {
 #else
 	KABI_RESERVE(1)
 #endif
+#if defined(CONFIG_BLK_BIO_ALLOC_TASK) && !defined(__GENKSYMS__)
+	struct pid *pid;
+#else
 	KABI_RESERVE(2)
+#endif
 	KABI_RESERVE(3)
 
 	/*
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/IAGRKP
CVE: NA
--------------------------------
The time will be used later for dumping requests in blk-io-hierarchy. A request keeps the earliest allocation time among its merged bios, as the helpers below enforce.
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
---
 block/blk-core.c       |  4 ++++
 block/blk-flush.c      |  1 +
 block/blk-merge.c      |  1 +
 block/blk-mq.c         |  1 +
 block/blk.h            | 45 ++++++++++++++++++++++++++++++++++++++++++
 include/linux/blkdev.h |  3 +++
 6 files changed, 55 insertions(+)
diff --git a/block/blk-core.c b/block/blk-core.c index caf3a897739e..5db196c0ef87 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2106,6 +2106,7 @@ bool bio_attempt_back_merge(struct request_queue *q, struct request *req, req->biotail->bi_next = bio; req->biotail = bio; req->__data_len += bio->bi_iter.bi_size; + blk_rq_update_bi_alloc_time(req, bio, NULL);
blk_account_io_start(req, false); return true; @@ -2129,6 +2130,7 @@ bool bio_attempt_front_merge(struct request_queue *q, struct request *req,
req->__sector = bio->bi_iter.bi_sector; req->__data_len += bio->bi_iter.bi_size; + blk_rq_update_bi_alloc_time(req, bio, NULL);
blk_account_io_start(req, false); return true; @@ -2149,6 +2151,7 @@ bool bio_attempt_discard_merge(struct request_queue *q, struct request *req, req->biotail = bio; req->__data_len += bio->bi_iter.bi_size; req->nr_phys_segments = segments + 1; + blk_rq_update_bi_alloc_time(req, bio, NULL);
blk_account_io_start(req, false); return true; @@ -3727,6 +3730,7 @@ void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
rq->__data_len = bio->bi_iter.bi_size; rq->bio = rq->biotail = bio; + blk_rq_update_bi_alloc_time(rq, bio, NULL);
if (bio->bi_disk) rq->rq_disk = bio->bi_disk; diff --git a/block/blk-flush.c b/block/blk-flush.c index c1bfcde165af..5dda142819b2 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -377,6 +377,7 @@ static bool blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq, flush_rq->rq_flags |= RQF_FLUSH_SEQ; flush_rq->rq_disk = first_rq->rq_disk; flush_rq->end_io = flush_end_io; + blk_rq_init_bi_alloc_time(flush_rq, first_rq);
/* * Order WRITE ->end_io and WRITE rq->ref, and its pair is the one diff --git a/block/blk-merge.c b/block/blk-merge.c index d2fabe1fdf32..9f9d803e064b 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -784,6 +784,7 @@ static struct request *attempt_merge(struct request_queue *q, req->biotail = next->biotail;
req->__data_len += blk_rq_bytes(next); + blk_rq_update_bi_alloc_time(req, NULL, next);
if (!blk_discard_mergable(req)) elv_merge_requests(q, req, next); diff --git a/block/blk-mq.c b/block/blk-mq.c index 76dd32ee6172..f96f4bb8be92 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -367,6 +367,7 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, rq->rq_disk = NULL; rq->part = NULL; rq->start_time_ns = blk_time_get_ns(); + blk_rq_init_bi_alloc_time(rq, NULL); rq->io_start_time_ns = 0; rq->nr_phys_segments = 0; #if defined(CONFIG_BLK_DEV_INTEGRITY) diff --git a/block/blk.h b/block/blk.h index 3ded5dd1227d..e1278f790ac7 100644 --- a/block/blk.h +++ b/block/blk.h @@ -147,6 +147,51 @@ static inline void __blk_get_queue(struct request_queue *q) kobject_get(&q->kobj); }
+#ifdef CONFIG_BLK_BIO_ALLOC_TIME +static inline u64 blk_time_get_ns(void); +static inline void blk_rq_init_bi_alloc_time(struct request *rq, + struct request *first_rq) +{ + rq->bi_alloc_time_ns = first_rq ? first_rq->bi_alloc_time_ns : + blk_time_get_ns(); +} + +/* + * Used in following cases to updated request bi_alloc_time_ns: + * + * 1) Allocate a new @rq for @bio; + * 2) @bio is merged to @rq, in this case @merged_rq should be NULL; + * 3) @merged_rq is merged to @rq, in this case @bio should be NULL; + */ +static inline void blk_rq_update_bi_alloc_time(struct request *rq, + struct bio *bio, + struct request *merged_rq) +{ + if (bio) { + if (rq->bi_alloc_time_ns > bio->bi_alloc_time_ns) + rq->bi_alloc_time_ns = bio->bi_alloc_time_ns; + return; + } + + if (WARN_ON_ONCE(!merged_rq)) + return; + + if (rq->bi_alloc_time_ns > merged_rq->bi_alloc_time_ns) + rq->bi_alloc_time_ns = merged_rq->bi_alloc_time_ns; +} +#else /* CONFIG_BLK_BIO_ALLOC_TIME */ +static inline void blk_rq_init_bi_alloc_time(struct request *rq, + struct request *first_rq) +{ +} + +static inline void blk_rq_update_bi_alloc_time(struct request *rq, + struct bio *bio, + struct request *merged_rq) +{ +} +#endif /* CONFIG_BLK_BIO_ALLOC_TIME */ + bool is_flush_rq(struct request *req);
struct blk_flush_queue *blk_alloc_flush_queue(struct request_queue *q, diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 241f59eb5b64..c487c32b1bf4 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -225,6 +225,9 @@ struct request { u64 start_time_ns; /* Time that I/O was submitted to the device. */ u64 io_start_time_ns; +#ifdef CONFIG_BLK_BIO_ALLOC_TIME + u64 bi_alloc_time_ns; +#endif
#ifdef CONFIG_BLK_WBT unsigned short wbt_flags;
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/IAGRKP
CVE: NA
--------------------------------
Add a new struct request_wrapper to fix the kabi breakage caused by adding new fields to struct request:

Before:      |request|cmd|
Kabi broken: |request with new fields|cmd|
After:       |request_wrapper with new fields|request|cmd|
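The pointer arithmetic this layout enables, restated from the diff below:

	static inline struct request_wrapper *request_to_wrapper(void *rq)
	{
		/* the wrapper sits immediately before the request */
		return rq - sizeof(struct request_wrapper);
	}

Note that the wrapper only exists for requests allocated through blk-mq (plus the flush request), which is why the blk.h helpers below bail out when !q->mq_ops.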
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
---
 block/blk-flush.c      | 10 ++++++----
 block/blk-mq.c         |  5 +++--
 block/blk-mq.h         | 11 +++++++++++
 block/blk.h            | 25 +++++++++++++++++++------
 include/linux/blkdev.h |  3 ---
 5 files changed, 39 insertions(+), 15 deletions(-)
diff --git a/block/blk-flush.c b/block/blk-flush.c index 5dda142819b2..a09c11678184 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -602,7 +602,8 @@ struct blk_flush_queue *blk_alloc_flush_queue(struct request_queue *q, int node, int cmd_size, gfp_t flags) { struct blk_flush_queue *fq; - int rq_sz = sizeof(struct request); + struct request_wrapper *wrapper; + int rq_sz = sizeof(struct request) + sizeof(struct request_wrapper);
fq = kzalloc_node(sizeof(*fq), flags, node); if (!fq) @@ -612,10 +613,11 @@ struct blk_flush_queue *blk_alloc_flush_queue(struct request_queue *q, spin_lock_init(&fq->mq_flush_lock);
rq_sz = round_up(rq_sz + cmd_size, cache_line_size()); - fq->flush_rq = kzalloc_node(rq_sz, flags, node); - if (!fq->flush_rq) + wrapper = kzalloc_node(rq_sz, flags, node); + if (!wrapper) goto fail_rq;
+ fq->flush_rq = (struct request *)(wrapper + 1); INIT_LIST_HEAD(&fq->flush_queue[0]); INIT_LIST_HEAD(&fq->flush_queue[1]); INIT_LIST_HEAD(&fq->flush_data_in_flight); @@ -634,6 +636,6 @@ void blk_free_flush_queue(struct blk_flush_queue *fq) if (!fq) return;
- kfree(fq->flush_rq); + kfree(request_to_wrapper(fq->flush_rq)); kfree(fq); } diff --git a/block/blk-mq.c b/block/blk-mq.c index f96f4bb8be92..fdd440c3f31b 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2241,7 +2241,8 @@ int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, * rq_size is the size of the request plus driver payload, rounded * to the cacheline size */ - rq_size = round_up(sizeof(struct request) + set->cmd_size, + rq_size = round_up(sizeof(struct request) + + sizeof(struct request_wrapper) + set->cmd_size, cache_line_size()); left = rq_size * depth;
@@ -2282,7 +2283,7 @@ int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, to_do = min(entries_per_page, depth - i); left -= to_do * rq_size; for (j = 0; j < to_do; j++) { - struct request *rq = p; + struct request *rq = p + sizeof(struct request_wrapper);
tags->static_rqs[i] = rq; if (blk_mq_init_request(set, rq, hctx_idx, node)) { diff --git a/block/blk-mq.h b/block/blk-mq.h index c6ec9aa12fb2..380362e37504 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -36,6 +36,17 @@ struct blk_mq_ctx { struct kobject kobj; } ____cacheline_aligned_in_smp;
+struct request_wrapper { +#ifdef CONFIG_BLK_BIO_ALLOC_TIME + u64 bi_alloc_time_ns; +#endif +} ____cacheline_aligned_in_smp; + +static inline struct request_wrapper *request_to_wrapper(void *rq) +{ + return rq - sizeof(struct request_wrapper); +} + void blk_mq_freeze_queue(struct request_queue *q); void blk_mq_exit_queue(struct request_queue *q); int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr); diff --git a/block/blk.h b/block/blk.h index e1278f790ac7..b7af0eff95b7 100644 --- a/block/blk.h +++ b/block/blk.h @@ -152,8 +152,12 @@ static inline u64 blk_time_get_ns(void); static inline void blk_rq_init_bi_alloc_time(struct request *rq, struct request *first_rq) { - rq->bi_alloc_time_ns = first_rq ? first_rq->bi_alloc_time_ns : - blk_time_get_ns(); + if (!rq->q->mq_ops) + return; + + request_to_wrapper(rq)->bi_alloc_time_ns = + first_rq ? request_to_wrapper(first_rq)->bi_alloc_time_ns : + blk_time_get_ns(); }
/* @@ -167,17 +171,26 @@ static inline void blk_rq_update_bi_alloc_time(struct request *rq, struct bio *bio, struct request *merged_rq) { + struct request_wrapper *rq_wrapper; + struct request_wrapper *merged_rq_wrapper; + + if (!rq->q->mq_ops) + return; + + rq_wrapper = request_to_wrapper(rq); if (bio) { - if (rq->bi_alloc_time_ns > bio->bi_alloc_time_ns) - rq->bi_alloc_time_ns = bio->bi_alloc_time_ns; + if (rq_wrapper->bi_alloc_time_ns > bio->bi_alloc_time_ns) + rq_wrapper->bi_alloc_time_ns = bio->bi_alloc_time_ns; return; }
if (WARN_ON_ONCE(!merged_rq)) return;
- if (rq->bi_alloc_time_ns > merged_rq->bi_alloc_time_ns) - rq->bi_alloc_time_ns = merged_rq->bi_alloc_time_ns; + merged_rq_wrapper = request_to_wrapper(merged_rq); + if (rq_wrapper->bi_alloc_time_ns > merged_rq_wrapper->bi_alloc_time_ns) + rq_wrapper->bi_alloc_time_ns = + merged_rq_wrapper->bi_alloc_time_ns; } #else /* CONFIG_BLK_BIO_ALLOC_TIME */ static inline void blk_rq_init_bi_alloc_time(struct request *rq, diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index c487c32b1bf4..241f59eb5b64 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -225,9 +225,6 @@ struct request { u64 start_time_ns; /* Time that I/O was submitted to the device. */ u64 io_start_time_ns; -#ifdef CONFIG_BLK_BIO_ALLOC_TIME - u64 bi_alloc_time_ns; -#endif
#ifdef CONFIG_BLK_WBT unsigned short wbt_flags;
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/IAGRKP
CVE: NA
--------------------------------
The task will be used later for dumping requests in blk-io-hierarchy.
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
---
 block/blk-flush.c |  2 ++
 block/blk-mq.c    |  8 +++++++-
 block/blk-mq.h    | 30 ++++++++++++++++++++++++++++++
 3 files changed, 39 insertions(+), 1 deletion(-)
diff --git a/block/blk-flush.c b/block/blk-flush.c index a09c11678184..2bc03d6f7d2a 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -245,6 +245,7 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error) * avoiding use-after-free. */ WRITE_ONCE(flush_rq->state, MQ_RQ_IDLE); + blk_mq_put_alloc_task(flush_rq); if (fq->rq_status != BLK_STS_OK) { error = fq->rq_status; fq->rq_status = BLK_STS_OK; @@ -378,6 +379,7 @@ static bool blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq, flush_rq->rq_disk = first_rq->rq_disk; flush_rq->end_io = flush_end_io; blk_rq_init_bi_alloc_time(flush_rq, first_rq); + blk_mq_get_alloc_task(flush_rq, first_rq->bio);
/* * Order WRITE ->end_io and WRITE rq->ref, and its pair is the one diff --git a/block/blk-mq.c b/block/blk-mq.c index fdd440c3f31b..be503078aadf 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -368,6 +368,8 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, rq->part = NULL; rq->start_time_ns = blk_time_get_ns(); blk_rq_init_bi_alloc_time(rq, NULL); + blk_mq_get_alloc_task(rq, data->bio); + rq->io_start_time_ns = 0; rq->nr_phys_segments = 0; #if defined(CONFIG_BLK_DEV_INTEGRITY) @@ -533,6 +535,7 @@ static void __blk_mq_free_request(struct request *rq) struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu); const int sched_tag = rq->internal_tag;
+ blk_mq_put_alloc_task(rq); if (rq->tag != -1) blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag); if (sched_tag != -1) @@ -1976,7 +1979,10 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) { const int is_sync = op_is_sync(bio->bi_opf); const int is_flush_fua = op_is_flush(bio->bi_opf); - struct blk_mq_alloc_data data = { .flags = 0 }; + struct blk_mq_alloc_data data = { + .flags = 0, + .bio = bio + }; struct request *rq; unsigned int request_count = 0; struct blk_plug *plug; diff --git a/block/blk-mq.h b/block/blk-mq.h index 380362e37504..91ddf85d0a9b 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -40,6 +40,9 @@ struct request_wrapper { #ifdef CONFIG_BLK_BIO_ALLOC_TIME u64 bi_alloc_time_ns; #endif +#ifdef CONFIG_BLK_BIO_ALLOC_TASK + struct pid *pid; +#endif } ____cacheline_aligned_in_smp;
static inline struct request_wrapper *request_to_wrapper(void *rq) @@ -153,6 +156,8 @@ struct blk_mq_alloc_data { /* input & output parameter */ struct blk_mq_ctx *ctx; struct blk_mq_hw_ctx *hctx; + + struct bio *bio; };
static inline struct blk_mq_tags *blk_mq_tags_from_data(struct blk_mq_alloc_data *data) @@ -245,4 +250,29 @@ static inline void blk_mq_free_requests(struct list_head *list) } }
+#ifdef CONFIG_BLK_BIO_ALLOC_TASK +static inline void blk_mq_get_alloc_task(struct request *rq, struct bio *bio) +{ + request_to_wrapper(rq)->pid = bio ? get_pid(bio->pid) : + find_get_pid(current->pid); +} + +static inline void blk_mq_put_alloc_task(struct request *rq) +{ + struct request_wrapper *rq_wrapper = request_to_wrapper(rq); + + if (rq_wrapper->pid) { + put_pid(rq_wrapper->pid); + rq_wrapper->pid = NULL; + } +} +#else +static inline void blk_mq_get_alloc_task(struct request *rq, struct bio *bio) +{ +} +static inline void blk_mq_put_alloc_task(struct request *rq) +{ +} +#endif + #endif
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/IAGRKP
CVE: NA
--------------------------------
blk_mq_alloc_data is only used internally and is always used as a stack variable. Fix the kabi breakage by adding the new field at the tail, hidden from genksyms.
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
---
 block/blk-mq.h | 2 ++
 1 file changed, 2 insertions(+)
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 91ddf85d0a9b..d924e4bd72b4 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -157,7 +157,9 @@ struct blk_mq_alloc_data {
 	struct blk_mq_ctx *ctx;
 	struct blk_mq_hw_ctx *hctx;
 
+#ifndef __GENKSYMS__
 	struct bio *bio;
+#endif
 };
 
 static inline struct blk_mq_tags *blk_mq_tags_from_data(struct blk_mq_alloc_data *data)
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/IAGRKP
CVE: NA
--------------------------------
Currently, the helper __blk_mq_debugfs_rq_show() calls the driver API 'mq_ops->show_rq'; however, not all drivers implement this API. In order to have a unified API to dump requests for all drivers, factor out a new helper from __blk_mq_debugfs_rq_show().
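A hypothetical caller of the factored-out helper (e.g. the later iodump code), showing the brace contract described in the helper's comment:

	static void my_rq_show(struct seq_file *m, struct request *rq)
	{
		debugfs_rq_show(m, rq);			/* prints "{.op=..." */
		seq_printf(m, ", .extra=%u", 0);	/* caller's own fields */
		seq_puts(m, "}\n");			/* caller closes the brace */
	}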
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
---
 block/blk-mq-debugfs.c | 15 +++++++++++++--
 block/blk-mq-debugfs.h |  1 +
 2 files changed, 14 insertions(+), 2 deletions(-)
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 1c8113f8acb1..e50d51a8839c 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -355,9 +355,13 @@ static const char *blk_mq_rq_state_name(enum mq_rq_state rq_state)
 	return blk_mq_rq_state_name_array[rq_state];
 }
 
-int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq)
+/*
+ * This helper will dump general information for @rq into @m, started with '{'
+ * and doesn't end with '}', caller must include a closing curly brace '}' at
+ * the end after adding the custom string.
+ */
+void debugfs_rq_show(struct seq_file *m, struct request *rq)
 {
-	const struct blk_mq_ops *const mq_ops = rq->q->mq_ops;
 	const unsigned int op = rq->cmd_flags & REQ_OP_MASK;
 
 	seq_printf(m, "%p {.op=", rq);
@@ -374,6 +378,13 @@ int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq)
 	seq_printf(m, ", .state=%s", blk_mq_rq_state_name(blk_mq_rq_state(rq)));
 	seq_printf(m, ", .tag=%d, .internal_tag=%d", rq->tag,
 		   rq->internal_tag);
+}
+
+int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq)
+{
+	const struct blk_mq_ops *const mq_ops = rq->q->mq_ops;
+
+	debugfs_rq_show(m, rq);
 	if (mq_ops->show_rq)
 		mq_ops->show_rq(m, rq);
 	seq_puts(m, "}\n");
diff --git a/block/blk-mq-debugfs.h b/block/blk-mq-debugfs.h
index b900fd7c90e1..70549712b0a2 100644
--- a/block/blk-mq-debugfs.h
+++ b/block/blk-mq-debugfs.h
@@ -15,6 +15,7 @@ struct blk_mq_debugfs_attr {
 	const struct seq_operations *seq_ops;
 };
 
+void debugfs_rq_show(struct seq_file *m, struct request *rq);
 int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq);
 int blk_mq_debugfs_rq_show(struct seq_file *m, void *v);
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/IAGRKP
CVE: NA
--------------------------------
The time will be reused when completing the bio, to avoid reading the ns time again.
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
---
 block/blk-mq.c | 6 +++++-
 block/blk-mq.h | 1 +
 2 files changed, 6 insertions(+), 1 deletion(-)
diff --git a/block/blk-mq.c b/block/blk-mq.c
index be503078aadf..539586b05646 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -371,6 +371,7 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
 	blk_mq_get_alloc_task(rq, data->bio);
 
 	rq->io_start_time_ns = 0;
+	request_to_wrapper(rq)->io_end_time_ns = 0;
 	rq->nr_phys_segments = 0;
 #if defined(CONFIG_BLK_DEV_INTEGRITY)
 	rq->nr_integrity_segments = 0;
@@ -580,7 +581,10 @@ EXPORT_SYMBOL_GPL(blk_mq_free_request);
 
 inline void __blk_mq_end_request(struct request *rq, blk_status_t error)
 {
-	u64 now = blk_time_get_ns();
+	u64 now = request_to_wrapper(rq)->io_end_time_ns;
+
+	if (!now)
+		now = blk_time_get_ns();
 
 	if (rq->rq_flags & RQF_STATS) {
 		blk_mq_poll_stats_start(rq->q);
diff --git a/block/blk-mq.h b/block/blk-mq.h
index d924e4bd72b4..6d860f6ddc30 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -37,6 +37,7 @@ struct blk_mq_ctx {
 } ____cacheline_aligned_in_smp;
 
 struct request_wrapper {
+	u64 io_end_time_ns;
 #ifdef CONFIG_BLK_BIO_ALLOC_TIME
 	u64 bi_alloc_time_ns;
 #endif
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/IAGRKP
CVE: NA
--------------------------------
Include the main structure definitions and provide helpers for the different IO stages to record IO stats.
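The accounting model, sketched: each stage keeps per-cpu dispatch/complete/latency counters, bumped through the io_hierarchy_add()/io_hierarchy_inc() macros in stats.c below and only summed over all CPUs when the debugfs file is read. Illustrative call sites (the stats pointer and start_ns are assumptions):

	/* entering a stage */
	io_hierarchy_inc(stats, dispatched, STAT_READ);

	/* leaving a stage: count the completion and accumulate latency */
	io_hierarchy_inc(stats, completed, STAT_READ);
	io_hierarchy_add(stats, nsecs, STAT_READ, blk_time_get_ns() - start_ns);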
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
---
 arch/arm64/configs/openeuler_defconfig |   1 +
 arch/x86/configs/openeuler_defconfig   |   1 +
 block/Kconfig                          |   2 +
 block/Makefile                         |   1 +
 block/bio.c                            |   2 +
 block/blk-flush.c                      |   2 +
 block/blk-io-hierarchy/Kconfig         |  12 +
 block/blk-io-hierarchy/Makefile        |   7 +
 block/blk-io-hierarchy/debugfs.c       | 194 +++++++++++++
 block/blk-io-hierarchy/stats.c         | 388 +++++++++++++++++++++++++
 block/blk-io-hierarchy/stats.h         | 309 ++++++++++++++++++++
 block/blk-mq.c                         |   2 +
 block/blk-mq.h                         |   5 +
 block/blk.h                            |   7 +-
 include/linux/blk_types.h              |  15 +-
 15 files changed, 945 insertions(+), 3 deletions(-)
 create mode 100644 block/blk-io-hierarchy/Kconfig
 create mode 100644 block/blk-io-hierarchy/Makefile
 create mode 100644 block/blk-io-hierarchy/debugfs.c
 create mode 100644 block/blk-io-hierarchy/stats.c
 create mode 100644 block/blk-io-hierarchy/stats.h
diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig index 4de42999f905..71e12eb64467 100644 --- a/arch/arm64/configs/openeuler_defconfig +++ b/arch/arm64/configs/openeuler_defconfig @@ -822,6 +822,7 @@ CONFIG_BLK_DEBUG_FS=y CONFIG_BLK_DEBUG_FS_ZONED=y # CONFIG_BLK_SED_OPAL is not set # CONFIG_BLK_BIO_DISPATCH_ASYNC is not set +# CONFIG_BLK_IO_HIERARCHY_STATS is not set
# # Partition Types diff --git a/arch/x86/configs/openeuler_defconfig b/arch/x86/configs/openeuler_defconfig index 928b4379af4d..7993f0f3e7a4 100644 --- a/arch/x86/configs/openeuler_defconfig +++ b/arch/x86/configs/openeuler_defconfig @@ -891,6 +891,7 @@ CONFIG_BLK_WBT_MQ=y CONFIG_BLK_DEBUG_FS=y # CONFIG_BLK_SED_OPAL is not set # CONFIG_BLK_BIO_DISPATCH_ASYNC is not set +# CONFIG_BLK_IO_HIERARCHY_STATS is not set
# # Partition Types diff --git a/block/Kconfig b/block/Kconfig index 9b512a000af7..8804f21df151 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -219,6 +219,8 @@ config BLK_BIO_DISPATCH_ASYNC feature will require special care in the driver to work. If unsure, say N here.
+source "block/blk-io-hierarchy/Kconfig" + menu "Partition Types"
source "block/partitions/Kconfig" diff --git a/block/Makefile b/block/Makefile index 572b33f32c07..bb711b0c307a 100644 --- a/block/Makefile +++ b/block/Makefile @@ -37,3 +37,4 @@ obj-$(CONFIG_BLK_WBT) += blk-wbt.o obj-$(CONFIG_BLK_DEBUG_FS) += blk-mq-debugfs.o obj-$(CONFIG_BLK_DEBUG_FS_ZONED)+= blk-mq-debugfs-zoned.o obj-$(CONFIG_BLK_SED_OPAL) += sed-opal.o +obj-$(CONFIG_BLK_IO_HIERARCHY_STATS) += blk-io-hierarchy/ diff --git a/block/bio.c b/block/bio.c index b32dc89bb704..c3aeae529dfd 100644 --- a/block/bio.c +++ b/block/bio.c @@ -33,6 +33,7 @@ #include <trace/events/block.h> #include "blk.h" #include "blk-rq-qos.h" +#include "blk-io-hierarchy/stats.h"
/* * Test patch to inline a certain number of bi_io_vec's inside the bio @@ -251,6 +252,7 @@ void bio_uninit(struct bio *bio) bio->pid = NULL; } #endif + bio_free_hierarchy_data(bio); } EXPORT_SYMBOL(bio_uninit);
diff --git a/block/blk-flush.c b/block/blk-flush.c index 2bc03d6f7d2a..6f08e1d87f47 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -75,6 +75,7 @@ #include "blk-mq.h" #include "blk-mq-tag.h" #include "blk-mq-sched.h" +#include "blk-io-hierarchy/stats.h"
/* PREFLUSH/FUA sequences */ enum { @@ -380,6 +381,7 @@ static bool blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq, flush_rq->end_io = flush_end_io; blk_rq_init_bi_alloc_time(flush_rq, first_rq); blk_mq_get_alloc_task(flush_rq, first_rq->bio); + blk_rq_hierarchy_stats_init(flush_rq);
/* * Order WRITE ->end_io and WRITE rq->ref, and its pair is the one diff --git a/block/blk-io-hierarchy/Kconfig b/block/blk-io-hierarchy/Kconfig new file mode 100644 index 000000000000..2b2b725ba224 --- /dev/null +++ b/block/blk-io-hierarchy/Kconfig @@ -0,0 +1,12 @@ +# SPDX-License-Identifier: GPL-2.0-only + +menuconfig BLK_IO_HIERARCHY_STATS + bool "Enable hierarchy io stats" + default n + depends on BLK_DEBUG_FS=y + help + Enabling this lets the block layer to record additional information + in different io stages. Such information can be helpful to debug + performance and problems like io hang. + + If unsure, say N. diff --git a/block/blk-io-hierarchy/Makefile b/block/blk-io-hierarchy/Makefile new file mode 100644 index 000000000000..1fb663c75521 --- /dev/null +++ b/block/blk-io-hierarchy/Makefile @@ -0,0 +1,7 @@ +# +# Make file for blk_io_hierarchy_stats +# + +obj-$(CONFIG_BLK_IO_HIERARCHY_STATS) += blk_io_hierarchy_stats.o + +blk_io_hierarchy_stats-y := stats.o debugfs.o diff --git a/block/blk-io-hierarchy/debugfs.c b/block/blk-io-hierarchy/debugfs.c new file mode 100644 index 000000000000..e4c8751371f1 --- /dev/null +++ b/block/blk-io-hierarchy/debugfs.c @@ -0,0 +1,194 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2024. Huawei Technologies Co., Ltd. All rights reserved. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include <linux/debugfs.h> +#include <linux/blkdev.h> + +#include "../blk-mq-debugfs.h" +#include "stats.h" + +static const char *stage_name[NR_STAGE_GROUPS] = { +}; + +const char *hierarchy_stage_name(enum stage_group stage) +{ + return stage_name[stage]; +} + +static int __hierarchy_stats_show(struct hierarchy_stats_data *hstats_data, + struct seq_file *m, enum stage_group stage) +{ + u64 dispatched[NR_NEW_STAT_GROUPS] = {0}; + u64 completed[NR_NEW_STAT_GROUPS] = {0}; + u64 latency[NR_NEW_STAT_GROUPS] = {0}; + int cpu; + int i; + + for_each_possible_cpu(cpu) { + struct hierarchy_stats *stat = + per_cpu_ptr(hstats_data->hstats, cpu); + + for (i = 0; i < NR_NEW_STAT_GROUPS; ++i) { + dispatched[i] += stat->dispatched[i]; + completed[i] += stat->completed[i]; + latency[i] += stage_is_rq(stage) ? 
+ stat->jiffies[i] : stat->nsecs[i]; + } + } + + if (stage_is_rq(stage)) + for (i = 0; i < NR_NEW_STAT_GROUPS; ++i) + latency[i] = + jiffies_to_msecs(latency[i]) * NSEC_PER_MSEC; + + seq_printf(m, "%llu %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu", + dispatched[STAT_READ], completed[STAT_READ], + latency[STAT_READ], dispatched[STAT_WRITE], + completed[STAT_WRITE], latency[STAT_WRITE], + dispatched[STAT_DISCARD], completed[STAT_DISCARD], + latency[STAT_DISCARD], dispatched[STAT_FLUSH], + completed[STAT_FLUSH], latency[STAT_FLUSH]); + + seq_putc(m, '\n'); + return 0; +} + +static void *hierarchy_stats_start(struct seq_file *m, loff_t *pos) +{ + enum stage_group stage = *pos; + + if (stage < 0 || stage >= NR_STAGE_GROUPS) + return NULL; + + return pos; +} + +static void *hierarchy_stats_next(struct seq_file *m, void *v, loff_t *pos) +{ + enum stage_group stage = ++(*pos); + + if (stage >= 0 && stage < NR_STAGE_GROUPS) + return pos; + + return NULL; +} + +static void hierarchy_stats_stop(struct seq_file *m, void *v) +{ +} + +static int hierarchy_stats_show(struct seq_file *m, void *v) +{ + enum stage_group stage = (*(loff_t *)v); + struct blk_io_hierarchy_stats *stats = m->private; + struct hierarchy_stats_data *hstats_data = + get_hstats_data(stats, stage); + + if (!hstats_data) + return 0; + + seq_printf(m, "%s ", hierarchy_stage_name(stage)); + __hierarchy_stats_show(hstats_data, m, stage); + put_hstats_data(stats, hstats_data); + return 0; +} + +static const struct seq_operations hierarchy_stats_ops = { + .start = hierarchy_stats_start, + .next = hierarchy_stats_next, + .stop = hierarchy_stats_stop, + .show = hierarchy_stats_show, +}; + +static int hierarchy_stats_show_single(void *v, struct seq_file *m) +{ + struct hierarchy_stage *hstage = v; + + return __hierarchy_stats_show(hstage->hstats_data, m, hstage->stage); +} + +static const struct blk_mq_debugfs_attr hierarchy_debugfs_attrs[] = { + {"stats", 0400, hierarchy_stats_show_single}, + {}, +}; + +static const struct blk_mq_debugfs_attr hierarchy_stats_attr[] = { + {"stats", 0400, .seq_ops = &hierarchy_stats_ops}, + {}, +}; + +static void hierarchy_register_stage(struct blk_io_hierarchy_stats *stats, + enum stage_group stage) +{ + struct hierarchy_stage *hstage = stats->hstage[stage]; + struct dentry *dir; + + if (!stage_name[stage] || hstage->debugfs_dir) + return; + + dir = debugfs_create_dir(stage_name[stage], stats->debugfs_dir); + if (IS_ERR(dir)) + return; + + hstage->debugfs_dir = dir; + debugfs_create_files(dir, hstage, hierarchy_debugfs_attrs); +} + +static void hierarchy_unregister_stage(struct blk_io_hierarchy_stats *stats, + enum stage_group stage) +{ + struct hierarchy_stage *hstage = stats->hstage[stage]; + + if (!stage_name[stage] || !hstage->debugfs_dir) + return; + + debugfs_remove_recursive(hstage->debugfs_dir); + hstage->debugfs_dir = NULL; +} + +void blk_mq_debugfs_register_hierarchy(struct request_queue *q, + enum stage_group stage) +{ + struct blk_io_hierarchy_stats *stats = + queue_to_wrapper(q)->io_hierarchy_stats; + + if (!blk_mq_hierarchy_registered(q, stage) || + !blk_mq_debugfs_enabled(q)) + return; + + hierarchy_register_stage(stats, stage); +} + +void blk_mq_debugfs_unregister_hierarchy(struct request_queue *q, + enum stage_group stage) +{ + struct blk_io_hierarchy_stats *stats = + queue_to_wrapper(q)->io_hierarchy_stats; + + if (!blk_mq_hierarchy_registered(q, stage) || + !blk_mq_debugfs_enabled(q)) + return; + + hierarchy_unregister_stage(stats, stage); +} + +void 
blk_mq_debugfs_create_default_hierarchy_attr(struct request_queue *q) +{ + struct blk_io_hierarchy_stats *stats = + queue_to_wrapper(q)->io_hierarchy_stats; + + if (!blk_mq_debugfs_enabled(q)) + return; + + debugfs_create_files(stats->debugfs_dir, stats, hierarchy_stats_attr); +} diff --git a/block/blk-io-hierarchy/stats.c b/block/blk-io-hierarchy/stats.c new file mode 100644 index 000000000000..b173ac5e2410 --- /dev/null +++ b/block/blk-io-hierarchy/stats.c @@ -0,0 +1,388 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2024. Huawei Technologies Co., Ltd. All rights reserved. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include <linux/module.h> +#include <linux/debugfs.h> + +#include "stats.h" +#include "../blk.h" +#include "../blk-mq-debugfs.h" + +#define io_hierarchy_add(statsp, field, group, nr) \ + this_cpu_add((statsp)->hstats->field[group], nr) +#define io_hierarchy_inc(statsp, field, group) \ + io_hierarchy_add(statsp, field, group, 1) + +#define PRE_ALLOC_BIO_CNT 8 + +static mempool_t *hdata_pool; + +void blk_mq_debugfs_register_hierarchy_stats(struct request_queue *q) +{ + struct blk_io_hierarchy_stats *stats; + enum stage_group stage; + + stats = queue_to_wrapper(q)->io_hierarchy_stats; + if (!stats || !blk_mq_debugfs_enabled(q)) + return; + + stats->debugfs_dir = debugfs_create_dir("blk_io_hierarchy", + q->debugfs_dir); + blk_mq_debugfs_create_default_hierarchy_attr(q); + + for (stage = 0; stage < NR_STAGE_GROUPS; ++stage) + blk_mq_debugfs_register_hierarchy(q, stage); +} + +static void bio_alloc_hierarchy_data(struct bio *bio) +{ + if (!bio->hdata) { + struct bio_hierarchy_data *hdata = + mempool_alloc(hdata_pool, GFP_NOIO); + + bio->hdata = hdata; + } +} + +void bio_free_hierarchy_data(struct bio *bio) +{ + if (!bio->hdata) + return; + + mempool_free(bio->hdata, hdata_pool); + bio->hdata = NULL; +} + +void blk_mq_debugfs_unregister_hierarchy_stats(struct request_queue *q) +{ + struct blk_io_hierarchy_stats *stats; + enum stage_group stage; + + stats = queue_to_wrapper(q)->io_hierarchy_stats; + if (!stats || !blk_mq_debugfs_enabled(q)) + return; + + for (stage = 0; stage < NR_STAGE_GROUPS; ++stage) + blk_mq_debugfs_unregister_hierarchy(q, stage); + + debugfs_remove_recursive(stats->debugfs_dir); + stats->debugfs_dir = NULL; +} + +int blk_io_hierarchy_stats_alloc(struct request_queue *q) +{ + struct blk_io_hierarchy_stats *stats; + + if (!q->mq_ops) + return 0; + + stats = kzalloc(sizeof(struct blk_io_hierarchy_stats), GFP_KERNEL); + if (!stats) + return -ENOMEM; + + spin_lock_init(&stats->hstage_lock); + stats->q = q; + queue_to_wrapper(q)->io_hierarchy_stats = stats; + + return 0; +} + +void blk_io_hierarchy_stats_free(struct request_queue *q) +{ + struct blk_io_hierarchy_stats *stats = + queue_to_wrapper(q)->io_hierarchy_stats; + + if (!stats) + return; + + queue_to_wrapper(q)->io_hierarchy_stats = NULL; + kfree(stats); +} + +bool blk_mq_hierarchy_registered(struct request_queue *q, + enum stage_group stage) +{ + struct blk_io_hierarchy_stats *stats = + queue_to_wrapper(q)->io_hierarchy_stats; + + if (!stats) + return false; + + return 
stats->hstage[stage] != NULL; +} +EXPORT_SYMBOL_GPL(blk_mq_hierarchy_registered); + +static struct hierarchy_stats_data *alloc_hstats_data(void) +{ + struct hierarchy_stats_data *hstats_data; + + hstats_data = kmalloc(sizeof(*hstats_data), GFP_KERNEL); + if (!hstats_data) + return NULL; + + hstats_data->hstats = alloc_percpu(struct hierarchy_stats); + if (!hstats_data->hstats) { + kfree(hstats_data); + return NULL; + } + + hstats_data->ref = 1; + return hstats_data; +} + +struct hierarchy_stats_data *get_hstats_data( + struct blk_io_hierarchy_stats *stats, + enum stage_group stage) +{ + struct hierarchy_stage *hstage; + struct hierarchy_stats_data *hstats_data = NULL; + + spin_lock(&stats->hstage_lock); + hstage = stats->hstage[stage]; + if (hstage) { + hstats_data = hstage->hstats_data; + if (hstats_data) + hstats_data->ref++; + } + spin_unlock(&stats->hstage_lock); + + return hstats_data; +} + +static void __put_hstats_data(struct blk_io_hierarchy_stats *stats, + struct hierarchy_stats_data *hstats_data) +{ + if (--hstats_data->ref == 0) { + free_percpu(hstats_data->hstats); + kfree(hstats_data); + } +} + +void put_hstats_data(struct blk_io_hierarchy_stats *stats, + struct hierarchy_stats_data *hstats_data) +{ + spin_lock(&stats->hstage_lock); + __put_hstats_data(stats, hstats_data); + spin_unlock(&stats->hstage_lock); +} + +void blk_mq_register_hierarchy(struct request_queue *q, enum stage_group stage) +{ + struct blk_io_hierarchy_stats *stats = + queue_to_wrapper(q)->io_hierarchy_stats; + struct hierarchy_stage *hstage; + + if (!stats || !hierarchy_stage_name(stage)) + return; + + if (blk_mq_hierarchy_registered(q, stage)) { + pr_warn("blk-io-hierarchy: disk %s is registering stage %s again.", + kobject_name(q->kobj.parent), + hierarchy_stage_name(stage)); + return; + } + + /* + * Alloc memory before freeze queue, prevent deadlock if new IO is + * issued by memory reclaim. 
+ */ + hstage = kmalloc(sizeof(*hstage), GFP_KERNEL); + if (!hstage) + return; + + hstage->hstats_data = alloc_hstats_data(); + if (!hstage->hstats_data) { + kfree(hstage); + return; + } + + hstage->stage = stage; + hstage->unbalanced_warned = false; + hstage->debugfs_dir = NULL; + + blk_mq_freeze_queue(q); + + WRITE_ONCE(stats->hstage[stage], hstage); + blk_mq_debugfs_register_hierarchy(q, stage); + + blk_mq_unfreeze_queue(q); +} +EXPORT_SYMBOL_GPL(blk_mq_register_hierarchy); + +void blk_mq_unregister_hierarchy(struct request_queue *q, + enum stage_group stage) +{ + struct blk_io_hierarchy_stats *stats = + queue_to_wrapper(q)->io_hierarchy_stats; + struct hierarchy_stage *hstage; + + if (!blk_mq_hierarchy_registered(q, stage)) + return; + + blk_mq_debugfs_unregister_hierarchy(q, stage); + + spin_lock(&stats->hstage_lock); + hstage = stats->hstage[stage]; + stats->hstage[stage] = NULL; + __put_hstats_data(stats, hstage->hstats_data); + spin_unlock(&stats->hstage_lock); + + kfree(hstage); +} +EXPORT_SYMBOL_GPL(blk_mq_unregister_hierarchy); + +static enum stat_group bio_hierarchy_op(struct bio *bio) +{ + if (op_is_discard(bio->bi_opf)) + return STAT_DISCARD; + + if (op_is_flush(bio->bi_opf) && !bio_sectors(bio)) + return STAT_FLUSH; + + if (op_is_write(bio->bi_opf)) + return STAT_WRITE; + + return STAT_READ; +} + + +void bio_hierarchy_start_io_acct(struct bio *bio, enum stage_group stage) +{ + struct request_queue *q = bio->bi_disk->queue; + struct hierarchy_stage *hstage; + + if (!blk_mq_hierarchy_registered(q, stage)) + return; + + hstage = queue_to_wrapper(q)->io_hierarchy_stats->hstage[stage]; + bio_alloc_hierarchy_data(bio); + io_hierarchy_inc(hstage->hstats_data, dispatched, + bio_hierarchy_op(bio)); + bio->hdata->time = blk_time_get_ns(); +} + +void __bio_hierarchy_end_io_acct(struct bio *bio, enum stage_group stage, + u64 time) +{ + struct request_queue *q = bio->bi_disk->queue; + struct hierarchy_stage *hstage; + u64 duration; + enum stat_group op; + + if (!blk_mq_hierarchy_registered(q, stage)) + return; + + op = bio_hierarchy_op(bio); + duration = time - bio->hdata->time; + hstage = queue_to_wrapper(q)->io_hierarchy_stats->hstage[stage]; + + io_hierarchy_inc(hstage->hstats_data, completed, op); + io_hierarchy_add(hstage->hstats_data, nsecs, op, duration); +} + +static enum stat_group rq_hierarchy_op(struct request *rq) +{ + if (op_is_discard(rq->cmd_flags)) + return STAT_DISCARD; + + if (is_flush_rq(rq)) + return STAT_FLUSH; + + if (op_is_write(rq->cmd_flags)) + return STAT_WRITE; + + return STAT_READ; +} + +static void rq_hierarchy_warn_unbalanced(struct request *rq, + struct hierarchy_stage *hstage, + enum stage_group old_stage, + enum stage_group new_stage) +{ + if (hstage->unbalanced_warned) + return; + + pr_warn("blk-io-hierarchy: disk %s stage %d(%s) -> %d(%s) unbalanced accounting.", + kobject_name(rq->q->kobj.parent), + old_stage, hierarchy_stage_name(old_stage), + new_stage, hierarchy_stage_name(new_stage)); + hstage->unbalanced_warned = true; +} + +void blk_rq_hierarchy_stats_complete(struct request *rq) +{ + struct hierarchy_stage *hstage; + enum stage_group stage; + + stage = request_to_wrapper(rq)->stage; + if (stage == NR_RQ_STAGE_GROUPS) + return; + + if (!blk_mq_hierarchy_registered(rq->q, stage)) + return; + + hstage = queue_to_wrapper(rq->q)->io_hierarchy_stats->hstage[stage]; + rq_hierarchy_warn_unbalanced(rq, hstage, stage, NR_RQ_STAGE_GROUPS); + __rq_hierarchy_end_io_acct(rq, hstage); +} + +void __rq_hierarchy_start_io_acct(struct request *rq, + struct 
hierarchy_stage *hstage) +{ + struct request_wrapper *rq_wrapper = request_to_wrapper(rq); + + blk_rq_hierarchy_stats_complete(rq); + io_hierarchy_inc(hstage->hstats_data, dispatched, rq_hierarchy_op(rq)); + WRITE_ONCE(rq_wrapper->hierarchy_time, jiffies); + + /* + * Paired with barrier in hierarchy_show_rq_fn(), make sure + * hierarchy_time is set before stage. + */ + smp_store_release(&rq_wrapper->stage, hstage->stage); +} +EXPORT_SYMBOL_GPL(__rq_hierarchy_start_io_acct); + +void __rq_hierarchy_end_io_acct(struct request *rq, + struct hierarchy_stage *hstage) +{ + enum stat_group op; + unsigned long duration; + struct request_wrapper *rq_wrapper; + + rq_wrapper = request_to_wrapper(rq); + if (rq_wrapper->stage != hstage->stage) { + rq_hierarchy_warn_unbalanced(rq, hstage, rq_wrapper->stage, + hstage->stage); + return; + } + + op = rq_hierarchy_op(rq); + duration = jiffies - rq_wrapper->hierarchy_time; + + io_hierarchy_inc(hstage->hstats_data, completed, op); + io_hierarchy_add(hstage->hstats_data, jiffies, op, duration); + WRITE_ONCE(rq_wrapper->stage, NR_RQ_STAGE_GROUPS); +} +EXPORT_SYMBOL_GPL(__rq_hierarchy_end_io_acct); + +static int __init hierarchy_stats_init(void) +{ + hdata_pool = mempool_create_kmalloc_pool(PRE_ALLOC_BIO_CNT, + sizeof(struct bio_hierarchy_data)); + if (!hdata_pool) + panic("Failed to create hdata_pool\n"); + + return 0; +} +module_init(hierarchy_stats_init); diff --git a/block/blk-io-hierarchy/stats.h b/block/blk-io-hierarchy/stats.h new file mode 100644 index 000000000000..5f2f0ce2e34c --- /dev/null +++ b/block/blk-io-hierarchy/stats.h @@ -0,0 +1,309 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2024. Huawei Technologies Co., Ltd. All rights reserved. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#ifndef BLK_IO_HIERARCHY_STATS_H +#define BLK_IO_HIERARCHY_STATS_H + +#ifdef CONFIG_BLK_IO_HIERARCHY_STATS + +#include <linux/blkdev.h> +#include <linux/blk_types.h> +#include "../blk.h" + +struct bio_hierarchy_data { + u64 time; +}; + +struct hierarchy_stats { + union { + /* for bio based stages. */ + u64 nsecs[NR_NEW_STAT_GROUPS]; + /* for request based stages. 
*/ + unsigned long jiffies[NR_NEW_STAT_GROUPS]; + }; + unsigned long dispatched[NR_NEW_STAT_GROUPS]; + unsigned long completed[NR_NEW_STAT_GROUPS]; +}; + +struct hierarchy_stats_data { + int ref; + struct hierarchy_stats __percpu *hstats; +}; + +struct hierarchy_stage { + enum stage_group stage; + bool unbalanced_warned; + struct dentry *debugfs_dir; + struct hierarchy_stats_data *hstats_data; +}; + +struct blk_io_hierarchy_stats { + struct request_queue *q; + struct dentry *debugfs_dir; + spinlock_t hstage_lock; + struct hierarchy_stage *hstage[NR_STAGE_GROUPS]; +}; + +static inline bool stage_is_bio(enum stage_group stage) +{ + return stage >= 0 && stage < NR_BIO_STAGE_GROUPS; +} + +static inline bool stage_is_rq(enum stage_group stage) +{ + return stage >= NR_BIO_STAGE_GROUPS && stage < NR_RQ_STAGE_GROUPS; +} + +const char *hierarchy_stage_name(enum stage_group stage); +int blk_io_hierarchy_stats_alloc(struct request_queue *q); +void blk_io_hierarchy_stats_free(struct request_queue *q); + +/* APIs for stage registration */ +bool blk_mq_hierarchy_registered(struct request_queue *q, + enum stage_group stage); +void blk_mq_register_hierarchy(struct request_queue *q, enum stage_group stage); +void blk_mq_unregister_hierarchy(struct request_queue *q, + enum stage_group stage); + +/* APIs for disk level debugfs */ +void blk_mq_debugfs_register_hierarchy_stats(struct request_queue *q); +void blk_mq_debugfs_unregister_hierarchy_stats(struct request_queue *q); +void blk_mq_debugfs_create_default_hierarchy_attr(struct request_queue *q); + +/* APIs for stage level debugfs */ +void blk_mq_debugfs_register_hierarchy(struct request_queue *q, + enum stage_group stage); +void blk_mq_debugfs_unregister_hierarchy(struct request_queue *q, + enum stage_group stage); +struct hierarchy_stats_data *get_hstats_data( + struct blk_io_hierarchy_stats *stats, + enum stage_group stage); +void put_hstats_data(struct blk_io_hierarchy_stats *stats, + struct hierarchy_stats_data *hstats_data); + +/* APIs for bio based stage io accounting */ +void bio_hierarchy_start_io_acct(struct bio *bio, enum stage_group stage); +void __bio_hierarchy_end_io_acct(struct bio *bio, enum stage_group stage, + u64 time); +void bio_free_hierarchy_data(struct bio *bio); + +static inline void bio_hierarchy_end_io_acct(struct bio *bio, + enum stage_group stage) +{ + __bio_hierarchy_end_io_acct(bio, stage, blk_time_get_ns()); +} + +static inline void bio_list_hierarchy_end_io_acct(struct bio_list *list, + enum stage_group stage) +{ + u64 time = blk_time_get_ns(); + struct bio *bio; + + bio_list_for_each(bio, list) + __bio_hierarchy_end_io_acct(bio, stage, time); +} + +/* APIs for request based stage io accounting */ +void blk_rq_hierarchy_stats_complete(struct request *rq); +void __rq_hierarchy_start_io_acct(struct request *rq, + struct hierarchy_stage *hstage); +void __rq_hierarchy_end_io_acct(struct request *rq, + struct hierarchy_stage *hstage); + +static inline void rq_hierarchy_start_io_acct(struct request *rq, + enum stage_group stage) +{ + if (!blk_mq_hierarchy_registered(rq->q, stage)) + return; + + __rq_hierarchy_start_io_acct(rq, + queue_to_wrapper(rq->q)->io_hierarchy_stats->hstage[stage]); +} + +static inline void rq_hierarchy_end_io_acct(struct request *rq, + enum stage_group stage) +{ + if (!blk_mq_hierarchy_registered(rq->q, stage)) + return; + + __rq_hierarchy_end_io_acct(rq, + queue_to_wrapper(rq->q)->io_hierarchy_stats->hstage[stage]); +} + +static inline void rq_list_hierarchy_start_io_acct(struct list_head *head, + enum 
stage_group stage) +{ + struct request *rq; + struct hierarchy_stage *hstage; + + if (list_empty(head)) + return; + + rq = list_first_entry(head, struct request, queuelist); + if (!blk_mq_hierarchy_registered(rq->q, stage)) + return; + + hstage = queue_to_wrapper(rq->q)->io_hierarchy_stats->hstage[stage]; + list_for_each_entry(rq, head, queuelist) + __rq_hierarchy_start_io_acct(rq, hstage); +} + +static inline void rq_list_hierarchy_end_io_acct(struct list_head *head, + enum stage_group stage) +{ + struct request *rq; + struct hierarchy_stage *hstage; + + if (list_empty(head)) + return; + + rq = list_first_entry(head, struct request, queuelist); + if (!blk_mq_hierarchy_registered(rq->q, stage)) + return; + + hstage = queue_to_wrapper(rq->q)->io_hierarchy_stats->hstage[stage]; + list_for_each_entry(rq, head, queuelist) + __rq_hierarchy_end_io_acct(rq, hstage); +} + +static inline void blk_rq_hierarchy_stats_init(struct request *rq) +{ + request_to_wrapper(rq)->stage = NR_RQ_STAGE_GROUPS; + request_to_wrapper(rq)->flush_done = false; +} + +static inline void blk_rq_hierarchy_set_flush_done(struct request *rq) +{ + request_to_wrapper(rq)->flush_done = true; +} + +static inline bool blk_rq_hierarchy_is_flush_done(struct request *rq) +{ + return request_to_wrapper(rq)->flush_done; +} + +#else /* CONFIG_BLK_IO_HIERARCHY_STATS */ + +static inline int +blk_io_hierarchy_stats_alloc(struct request_queue *q) +{ + return 0; +} + +static inline void +blk_io_hierarchy_stats_free(struct request_queue *q) +{ +} + +static inline bool +blk_mq_hierarchy_registered(struct request_queue *q, enum stage_group stage) +{ + return false; +} + +static inline void +blk_mq_register_hierarchy(struct request_queue *q, enum stage_group stage) +{ +} + +static inline void +blk_mq_unregister_hierarchy(struct request_queue *q, enum stage_group stage) +{ +} + +static inline void +blk_mq_debugfs_register_hierarchy_stats(struct request_queue *q) +{ +} + +static inline void +blk_mq_debugfs_unregister_hierarchy_stats(struct request_queue *q) +{ +} + +static inline void +blk_mq_debugfs_register_hierarchy(struct request_queue *q, + enum stage_group stage) +{ +} + +static inline void +blk_mq_debugfs_unregister_hierarchy(struct request_queue *q, + enum stage_group stage) +{ +} + +static inline void +bio_hierarchy_start_io_acct(struct bio *bio, enum stage_group stage) +{ +} + +static inline void +bio_hierarchy_end_io_acct(struct bio *bio, enum stage_group stage) +{ +} + +static inline void +bio_list_hierarchy_end_io_acct(struct bio_list *list, enum stage_group stage) +{ +} + +static inline void +bio_free_hierarchy_data(struct bio *bio) +{ +} + +static inline void +blk_rq_hierarchy_set_flush_done(struct request *rq) +{ +} + +static inline bool +blk_rq_hierarchy_is_flush_done(struct request *rq) +{ + return false; +} + +static inline void +blk_rq_hierarchy_stats_complete(struct request *rq) +{ +} + +static inline void +rq_hierarchy_start_io_acct(struct request *rq, enum stage_group stage) +{ +} + +static inline void +rq_hierarchy_end_io_acct(struct request *rq, enum stage_group stage) +{ +} + +static inline void +rq_list_hierarchy_start_io_acct(struct list_head *head, enum stage_group stage) +{ +} + +static inline void +rq_list_hierarchy_end_io_acct(struct list_head *head, enum stage_group stage) +{ +} + +static inline void +blk_rq_hierarchy_stats_init(struct request *rq) +{ +} + +#endif /* CONFIG_BLK_IO_HIERARCHY_STATS */ + +#endif /* BLK_IO_HIERARCHY_STATS_H */ diff --git a/block/blk-mq.c b/block/blk-mq.c index 
539586b05646..955e80f4d0dc 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -36,6 +36,7 @@ #include "blk-stat.h" #include "blk-mq-sched.h" #include "blk-rq-qos.h" +#include "blk-io-hierarchy/stats.h"
static bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie); static void blk_mq_poll_stats_start(struct request_queue *q); @@ -369,6 +370,7 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, rq->start_time_ns = blk_time_get_ns(); blk_rq_init_bi_alloc_time(rq, NULL); blk_mq_get_alloc_task(rq, data->bio); + blk_rq_hierarchy_stats_init(rq);
rq->io_start_time_ns = 0; request_to_wrapper(rq)->io_end_time_ns = 0; diff --git a/block/blk-mq.h b/block/blk-mq.h index 6d860f6ddc30..b2a9efb43209 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -44,6 +44,11 @@ struct request_wrapper { #ifdef CONFIG_BLK_BIO_ALLOC_TASK struct pid *pid; #endif +#ifdef CONFIG_BLK_IO_HIERARCHY_STATS + bool flush_done; + enum stage_group stage; + unsigned long hierarchy_time; +#endif } ____cacheline_aligned_in_smp;
static inline struct request_wrapper *request_to_wrapper(void *rq) diff --git a/block/blk.h b/block/blk.h index b7af0eff95b7..9d8a59762843 100644 --- a/block/blk.h +++ b/block/blk.h @@ -55,10 +55,13 @@ struct request_queue_wrapper { int __percpu *last_dispatch_cpu; #endif struct mutex sysfs_dir_lock; +#ifdef CONFIG_BLK_IO_HIERARCHY_STATS + struct blk_io_hierarchy_stats *io_hierarchy_stats; +#endif };
-#define queue_to_wrapper(q) \ - container_of(q, struct request_queue_wrapper, q) +#define queue_to_wrapper(__q) \ + container_of((__q), struct request_queue_wrapper, q)
extern struct kmem_cache *blk_requestq_cachep; extern struct kmem_cache *request_cachep; diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 416cf84a0624..0e1334c4a43e 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -213,7 +213,11 @@ struct bio { #else KABI_RESERVE(2) #endif +#if defined(CONFIG_BLK_IO_HIERARCHY_STATS) && !defined(__GENKSYMS__) + struct bio_hierarchy_data *hdata; +#else KABI_RESERVE(3) +#endif
/* * We can inline a number of vecs at the end of the bio, to avoid @@ -376,7 +380,16 @@ enum stat_group { STAT_WRITE, STAT_DISCARD,
- NR_STAT_GROUPS + NR_STAT_GROUPS, + STAT_FLUSH = NR_STAT_GROUPS, + NR_NEW_STAT_GROUPS, +}; + +enum stage_group { + STAGE_BIO_RESERVE, + NR_BIO_STAGE_GROUPS, + NR_RQ_STAGE_GROUPS, + NR_STAGE_GROUPS, };
#define bio_op(bio) \
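To make the calling convention concrete, here is a minimal usage sketch (illustration only, not part of the patch): STAGE_EXAMPLE and the example_* functions are invented names, but the APIs are the ones introduced above.

/*
 * Sketch: how a bio based stage consumes the hierarchy stats APIs,
 * assuming a hypothetical STAGE_EXAMPLE entry in enum stage_group with
 * a matching stage_name[] string.
 */
#include "blk-io-hierarchy/stats.h"

static void example_register(struct request_queue *q)
{
	/* allocates the percpu counters and creates the debugfs dir */
	blk_mq_register_hierarchy(q, STAGE_EXAMPLE);
}

static void example_hold_bio(struct bio *bio)
{
	/* dispatched[op]++, start timestamp stored in bio->hdata */
	bio_hierarchy_start_io_acct(bio, STAGE_EXAMPLE);

	/* ... the bio waits in this stage ... */

	/* completed[op]++, latency added to nsecs[op] */
	bio_hierarchy_end_io_acct(bio, STAGE_EXAMPLE);
}

static void example_unregister(struct request_queue *q)
{
	blk_mq_unregister_hierarchy(q, STAGE_EXAMPLE);
}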
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/IAGRKP
CVE: NA
--------------------------------
Include the main structure definitions and provide helpers for different IO stages to dump inflight IO.
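As a usage illustration (not part of the patch), a userspace tool could drive the new entries roughly as below; the disk "sda" and the "throtl" stage directory are assumptions here, since named stages are only registered by later patches in this series. The threshold file takes milliseconds.

#include <stdio.h>
#include <stdlib.h>

/* assumed path; depends on the disk and on which stages are registered */
#define STAGE_DIR "/sys/kernel/debug/block/sda/blk_io_hierarchy/throtl"

int main(void)
{
	char line[256];
	FILE *f;

	/* dump only IO that has been inflight for at least 500ms */
	f = fopen(STAGE_DIR "/threshold", "w");
	if (!f)
		return EXIT_FAILURE;
	fprintf(f, "500");
	fclose(f);

	/* one line per inflight IO above the threshold */
	f = fopen(STAGE_DIR "/io_dump", "r");
	if (!f)
		return EXIT_FAILURE;
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);

	return EXIT_SUCCESS;
}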
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
---
 block/blk-io-hierarchy/Kconfig   |  17 ++
 block/blk-io-hierarchy/Makefile  |   1 +
 block/blk-io-hierarchy/debugfs.c |   2 +
 block/blk-io-hierarchy/iodump.c  | 489 +++++++++++++++++++++++++++++++
 block/blk-io-hierarchy/iodump.h  |  66 +++++
 block/blk-io-hierarchy/stats.c   |  10 +
 block/blk-io-hierarchy/stats.h   |   8 +
 block/blk-mq-debugfs.c           |   2 +
 8 files changed, 595 insertions(+)
 create mode 100644 block/blk-io-hierarchy/iodump.c
 create mode 100644 block/blk-io-hierarchy/iodump.h
diff --git a/block/blk-io-hierarchy/Kconfig b/block/blk-io-hierarchy/Kconfig index 2b2b725ba224..f35f174835ef 100644 --- a/block/blk-io-hierarchy/Kconfig +++ b/block/blk-io-hierarchy/Kconfig @@ -10,3 +10,20 @@ menuconfig BLK_IO_HIERARCHY_STATS performance and problems like io hang.
If unsure, say N. + +if BLK_IO_HIERARCHY_STATS + +config HIERARCHY_IO_DUMP + bool "Support to dump io that is throttled" + default n + select BLK_BIO_ALLOC_TIME + select BLK_BIO_ALLOC_TASK + depends on BLK_DEV_IO_TRACE + help + Enabling this will create new debugfs entries that show users the detailed + information of IOs that are submitted but not done yet, and users can + filter the result by IO stage or IO latency. + + If unsure, say N. + +endif diff --git a/block/blk-io-hierarchy/Makefile b/block/blk-io-hierarchy/Makefile index 1fb663c75521..9b989d379e58 100644 --- a/block/blk-io-hierarchy/Makefile +++ b/block/blk-io-hierarchy/Makefile @@ -5,3 +5,4 @@ obj-$(CONFIG_BLK_IO_HIERARCHY_STATS) += blk_io_hierarchy_stats.o
blk_io_hierarchy_stats-y := stats.o debugfs.o +obj-$(CONFIG_HIERARCHY_IO_DUMP) += iodump.o diff --git a/block/blk-io-hierarchy/debugfs.c b/block/blk-io-hierarchy/debugfs.c index e4c8751371f1..99f1c753dece 100644 --- a/block/blk-io-hierarchy/debugfs.c +++ b/block/blk-io-hierarchy/debugfs.c @@ -16,6 +16,7 @@
#include "../blk-mq-debugfs.h" #include "stats.h" +#include "iodump.h"
static const char *stage_name[NR_STAGE_GROUPS] = { }; @@ -142,6 +143,7 @@ static void hierarchy_register_stage(struct blk_io_hierarchy_stats *stats,
hstage->debugfs_dir = dir; debugfs_create_files(dir, hstage, hierarchy_debugfs_attrs); + io_hierarchy_register_iodump(hstage); }
static void hierarchy_unregister_stage(struct blk_io_hierarchy_stats *stats, diff --git a/block/blk-io-hierarchy/iodump.c b/block/blk-io-hierarchy/iodump.c new file mode 100644 index 000000000000..df3621342c2a --- /dev/null +++ b/block/blk-io-hierarchy/iodump.c @@ -0,0 +1,489 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2024. Huawei Technologies Co., Ltd. All rights reserved. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include <linux/seq_file.h> +#include <linux/blktrace_api.h> +#include <linux/blk-cgroup.h> +#include <linux/sched/task.h> + +#include "iodump.h" +#include "../blk.h" +#include "../blk-mq-debugfs.h" + +#define RWB_LEN 6 +#define PATH_LEN 64 +#define ms_to_ns(time) (time * NSEC_PER_MSEC) +#define DEFAULT_THRESHOLD 1000 + +static DEFINE_MUTEX(dump_mutex); + +struct bio_dump_data { + u64 stat_time; + struct list_head head; + spinlock_t lock; +}; + +struct rq_dump_data { + struct request_queue *q; + enum stage_group stage; + unsigned int tag; + unsigned int total_tags; + bool has_elevator; + bool enter_queue; +}; + +int blk_io_hierarchy_iodump_init(struct request_queue *q, + struct hierarchy_stage *hstage) +{ + hstage->threshold = DEFAULT_THRESHOLD; + + if (stage_is_bio(hstage->stage)) { + struct bio_dump_data *bio_ddata = + kmalloc(sizeof(*bio_ddata), GFP_KERNEL); + + if (!bio_ddata) + return -ENOMEM; + + INIT_LIST_HEAD(&bio_ddata->head); + spin_lock_init(&bio_ddata->lock); + hstage->dump_data = bio_ddata; + return 0; + } + + if (stage_is_rq(hstage->stage)) { + struct rq_dump_data *rq_ddata = + kzalloc(sizeof(*rq_ddata), GFP_KERNEL); + + if (!rq_ddata) + return -ENOMEM; + + rq_ddata->q = q; + rq_ddata->stage = hstage->stage; + hstage->dump_data = rq_ddata; + return 0; + } + + return -EINVAL; +} + +void blk_io_hierarchy_iodump_exit(struct request_queue *q, + enum stage_group stage) +{ + struct hierarchy_stage *hstage = + queue_to_wrapper(q)->io_hierarchy_stats->hstage[stage]; + + if (stage_is_bio(hstage->stage)) { + struct bio_dump_data *bio_ddata = hstage->dump_data; + + WARN(!list_empty(&bio_ddata->head), + "blk-io-hierarchy: disk %s stage %s unregistered whih throttled IO.\n", + kobject_name(q->kobj.parent), hierarchy_stage_name(stage)); + } + + kfree(hstage->dump_data); + hstage->dump_data = NULL; +} + +void hierarchy_add_bio(struct hierarchy_stage *hstage, struct bio *bio) +{ + unsigned long flags; + struct bio_hierarchy_data *data = bio->hdata; + struct bio_dump_data *bio_ddata = hstage->dump_data; + + spin_lock_irqsave(&bio_ddata->lock, flags); + list_add_tail(&data->hierarchy_list, &bio_ddata->head); + spin_unlock_irqrestore(&bio_ddata->lock, flags); +} + +void hierarchy_remove_bio(struct hierarchy_stage *hstage, struct bio *bio) +{ + unsigned long flags; + struct bio_hierarchy_data *data = bio->hdata; + struct bio_dump_data *bio_ddata = hstage->dump_data; + + spin_lock_irqsave(&bio_ddata->lock, flags); + list_del_init(&data->hierarchy_list); + spin_unlock_irqrestore(&bio_ddata->lock, flags); +} + +void bio_hierarchy_data_init(struct bio *bio, struct bio_hierarchy_data *hdata) +{ + hdata->bio = bio; + 
INIT_LIST_HEAD(&hdata->hierarchy_list); +} + +static void *bio_hierarchy_list_start(struct seq_file *m, loff_t *pos) + __acquires(&bio_ddata->lock) +{ + struct hierarchy_stage *hstage = m->private; + struct bio_dump_data *bio_ddata = hstage->dump_data; + + spin_lock_irq(&bio_ddata->lock); + bio_ddata->stat_time = blk_time_get_ns(); + + return seq_list_start(&bio_ddata->head, *pos); +} + +static void *bio_hierarchy_list_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct hierarchy_stage *hstage = m->private; + struct bio_dump_data *bio_ddata = hstage->dump_data; + + return seq_list_next(v, &bio_ddata->head, pos); +} + +static void bio_hierarchy_list_stop(struct seq_file *m, void *v) + __releases(&hstage->lock) +{ + struct hierarchy_stage *hstage = m->private; + struct bio_dump_data *bio_ddata = hstage->dump_data; + + spin_unlock_irq(&bio_ddata->lock); +} + +static void __hierarchy_show_bio(struct seq_file *m, + struct bio_hierarchy_data *data, + enum stage_group stage, u64 duration) +{ + char rwbs[RWB_LEN]; + char path[PATH_LEN] = {0}; + struct bio *bio = data->bio; + struct task_struct *task = get_pid_task(bio->pid, PIDTYPE_PID); + + blk_fill_rwbs(rwbs, bio->bi_opf, bio->bi_iter.bi_size); + cgroup_path(bio->bi_css->cgroup, path, PATH_LEN); + + seq_printf(m, "%s-%d %s stage %s bio %s %lu + %u cgroup %s started %llu ns ago\n", + task ? task->comm : "null", task ? task->pid : 0, + bio->bi_disk->disk_name, hierarchy_stage_name(stage), + rwbs, bio->bi_iter.bi_sector, bio_sectors(bio), path, + duration); + + if (task) + put_task_struct(task); +} + +static u64 get_duration(u64 a, u64 b) +{ + return a > b ? a - b : 0; +} + +static void hierarchy_show_bio(struct seq_file *m, + struct bio_hierarchy_data *data) +{ + u64 duration; + struct hierarchy_stage *hstage = m->private; + struct bio_dump_data *bio_ddata = hstage->dump_data; + + duration = get_duration(bio_ddata->stat_time, data->time); + if (hstage->threshold > ns_to_ms(duration)) + return; + + __hierarchy_show_bio(m, data, hstage->stage, duration); +} + +static int bio_hierarchy_list_show(struct seq_file *m, void *v) +{ + struct bio_hierarchy_data *data = + list_entry(v, struct bio_hierarchy_data, hierarchy_list); + + hierarchy_show_bio(m, data); + return 0; +} + +static const struct seq_operations hierarchy_bio_dump_ops = { + .start = bio_hierarchy_list_start, + .next = bio_hierarchy_list_next, + .stop = bio_hierarchy_list_stop, + .show = bio_hierarchy_list_show, +}; + +static int threshold_show(void *data, struct seq_file *m) +{ + struct hierarchy_stage *hstage = data; + + seq_printf(m, "%lu\n", hstage->threshold); + return 0; +} + +/* + * max size needed by different bases to express U64 + * HEX: "0xFFFFFFFFFFFFFFFF" --> 18 + * DEC: "18446744073709551615" --> 20 + * OCT: "01777777777777777777777" --> 23 + * pick the max one to define NUMBER_BUF_LEN + */ +#define MAX_BUF_LEN 24 +static ssize_t threshold_store(void *data, const char __user *buf, size_t count, + loff_t *ppos) +{ + int err; + unsigned long val; + char b[MAX_BUF_LEN + 1]; + struct hierarchy_stage *hstage = data; + + if (count > MAX_BUF_LEN) + return -EINVAL; + + if (copy_from_user(b, buf, count)) + return -EFAULT; + + b[count] = 0; + err = kstrtoul(b, 0, &val); + if (!err) + hstage->threshold = val; + + return err ? 
err : count; +} + +static void rq_hierarchy_init_dump_data(struct rq_dump_data *rq_ddata) +{ + struct request_queue *q = rq_ddata->q; + + rq_ddata->has_elevator = !!q->elevator; + + if (rq_ddata->has_elevator) + rq_ddata->total_tags = q->nr_hw_queues * q->nr_requests; + else + rq_ddata->total_tags = q->nr_hw_queues * + q->tag_set->queue_depth; +} + +static bool __rq_hierarchy_start(struct rq_dump_data *rq_ddata, + unsigned int tag) +{ + /* + * Grab .q_usage_counter so request pool won't go away, then no + * request use-after-free is possible during iteration. If queue is + * frozen, there won't be any inflight requests. + */ + if (!percpu_ref_tryget(&rq_ddata->q->q_usage_counter)) { + rq_ddata->enter_queue = false; + return false; + } + + rq_ddata->enter_queue = true; + rq_hierarchy_init_dump_data(rq_ddata); + rq_ddata->tag = tag; + + return tag < rq_ddata->total_tags + rq_ddata->q->nr_hw_queues; +} + +static bool __rq_hierarchy_next(struct rq_dump_data *rq_ddata) +{ + rq_ddata->tag++; + + return rq_ddata->tag < rq_ddata->total_tags + rq_ddata->q->nr_hw_queues; +} + +static void __rq_hierarchy_stop(struct rq_dump_data *rq_ddata) +{ + if (rq_ddata->enter_queue) { + percpu_ref_put(&rq_ddata->q->q_usage_counter); + rq_ddata->enter_queue = false; + } +} + +static void *rq_hierarchy_start(struct seq_file *m, loff_t *pos) + __acquires(&dump_mutex) +{ + struct hierarchy_stage *hstage = m->private; + struct rq_dump_data *rq_ddata = hstage->dump_data; + + mutex_lock(&dump_mutex); + + if (__rq_hierarchy_start(rq_ddata, *pos)) + return rq_ddata; + + return NULL; +} + +static void *rq_hierarchy_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct rq_dump_data *rq_ddata = v; + + if (__rq_hierarchy_next(rq_ddata)) { + *pos = rq_ddata->tag; + return rq_ddata; + } + + (*pos)++; + return NULL; +} + +static void rq_hierarchy_stop(struct seq_file *m, void *v) + __releases(&dump_mutex) +{ + struct hierarchy_stage *hstage = m->private; + struct rq_dump_data *rq_ddata = hstage->dump_data; + + __rq_hierarchy_stop(rq_ddata); + mutex_unlock(&dump_mutex); +} + +static struct request *hierarchy_find_and_get_rq(struct rq_dump_data *rq_ddata) +{ + struct request *rq; + struct request_wrapper *rq_wrapper; + struct blk_mq_hw_ctx *hctx; + struct request_queue *q = rq_ddata->q; + unsigned int nr_tag = rq_ddata->tag; + unsigned int hctx_id; + + if (nr_tag >= rq_ddata->total_tags) { + hctx_id = nr_tag - rq_ddata->total_tags; + if (hctx_id >= q->nr_hw_queues) + return NULL; + + hctx = q->queue_hw_ctx[hctx_id]; + rq = hctx->fq->flush_rq; + } else if (rq_ddata->has_elevator) { + hctx_id = nr_tag / q->nr_requests; + if (hctx_id >= q->nr_hw_queues) + return NULL; + + hctx = q->queue_hw_ctx[hctx_id]; + rq = hctx->sched_tags->static_rqs[nr_tag % q->nr_requests]; + } else { + hctx_id = nr_tag / q->tag_set->queue_depth; + if (hctx_id >= q->nr_hw_queues) + return NULL; + + hctx = q->queue_hw_ctx[hctx_id]; + if (!hctx->tags) + return NULL; + + rq = hctx->tags->static_rqs[nr_tag % q->tag_set->queue_depth]; + } + + rq_wrapper = request_to_wrapper(rq); + /* + * fast path to avoid refcount cas operations for the request that + * is from other shared request_queue or other stages. + */ + if (rq->q != q || READ_ONCE(rq_wrapper->stage) != rq_ddata->stage) + return NULL; + + if (!refcount_inc_not_zero(&rq->ref)) + return NULL; + + /* Check again after request is pinned, in case request is resued. 
*/ + if (rq->q != q) { + blk_mq_put_rq_ref(rq); + return NULL; + } + + /* + * Barrier is paired with the smp_store_release() in + * rq_hierarchy_start_io_acct(), so that if stage is read, uninitialized + * hierarchy_time won't be read. + */ + if (smp_load_acquire(&rq_wrapper->stage) != rq_ddata->stage) { + blk_mq_put_rq_ref(rq); + return NULL; + } + + return rq; +} + +static void hierarchy_show_rq(struct seq_file *m, struct request *rq, + u64 duration) +{ + struct request_wrapper *rq_wrapper = request_to_wrapper(rq); + struct task_struct *task = get_pid_task(rq_wrapper->pid, PIDTYPE_PID); + const char *name = hierarchy_stage_name(rq_wrapper->stage); + + seq_printf(m, "%s-%d %s stage %s ", task ? task->comm : "null", + task ? task->pid : 0, + rq->rq_disk ? rq->rq_disk->disk_name : "?", + name ? name : "?"); + debugfs_rq_show(m, rq); + seq_printf(m, " started %llu ns ago}\n", duration); + + if (task) + put_task_struct(task); +} + +static int rq_hierarchy_show(struct seq_file *m, void *v) +{ + u64 duration; + unsigned long htime; + struct hierarchy_stage *hstage = m->private; + struct request_wrapper *rq_wrapper; + struct request *rq = hierarchy_find_and_get_rq(v); + + if (!rq) + return 0; + + rq_wrapper = request_to_wrapper(rq); + htime = READ_ONCE(rq_wrapper->hierarchy_time); + htime = time_after(jiffies, htime) ? jiffies - htime : 0; + duration = jiffies_to_msecs(htime); + if (hstage->threshold <= duration) + hierarchy_show_rq(m, rq, ms_to_ns(duration)); + + blk_mq_put_rq_ref(rq); + return 0; +} + +static const struct seq_operations hierarchy_rq_dump_ops = { + .start = rq_hierarchy_start, + .next = rq_hierarchy_next, + .stop = rq_hierarchy_stop, + .show = rq_hierarchy_show, +}; + +static const struct blk_mq_debugfs_attr hierarchy_threshold_attr[] = { + { + "threshold", + 0600, + threshold_show, + threshold_store, + }, + {}, +}; + +static const struct blk_mq_debugfs_attr hierarchy_bio_dump_attr[] = { + { + "io_dump", + 0400, + .seq_ops = &hierarchy_bio_dump_ops, + }, + {}, +}; + +static const struct blk_mq_debugfs_attr hierarchy_rq_dump_attr[] = { + { + "io_dump", + 0400, + .seq_ops = &hierarchy_rq_dump_ops, + }, + {}, +}; + +void io_hierarchy_register_iodump(struct hierarchy_stage *hstage) +{ + const struct blk_mq_debugfs_attr *attr; + + if (stage_is_bio(hstage->stage)) + attr = hierarchy_bio_dump_attr; + else if (stage_is_rq(hstage->stage)) + attr = hierarchy_rq_dump_attr; + else + attr = NULL; + + debugfs_create_files(hstage->debugfs_dir, hstage, + hierarchy_threshold_attr); + if (attr) + debugfs_create_files(hstage->debugfs_dir, hstage, attr); +} diff --git a/block/blk-io-hierarchy/iodump.h b/block/blk-io-hierarchy/iodump.h new file mode 100644 index 000000000000..547666e418a0 --- /dev/null +++ b/block/blk-io-hierarchy/iodump.h @@ -0,0 +1,66 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2024. Huawei Technologies Co., Ltd. All rights reserved. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ */ + +#ifndef BLK_IO_HIERARCHY_IODUMP_H +#define BLK_IO_HIERARCHY_IODUMP_H + +#ifdef CONFIG_HIERARCHY_IO_DUMP + +#include "stats.h" + +#define ns_to_ms(time) div_u64(time, NSEC_PER_MSEC) + +int blk_io_hierarchy_iodump_init(struct request_queue *q, + struct hierarchy_stage *hstage); +void blk_io_hierarchy_iodump_exit(struct request_queue *q, + enum stage_group stage); +void hierarchy_add_bio(struct hierarchy_stage *hstage, struct bio *bio); +void hierarchy_remove_bio(struct hierarchy_stage *hstage, struct bio *bio); +void bio_hierarchy_data_init(struct bio *bio, struct bio_hierarchy_data *hdata); +void io_hierarchy_register_iodump(struct hierarchy_stage *hstage); + +#else +static inline int +blk_io_hierarchy_iodump_init(struct request_queue *q, + struct hierarchy_stage *hstage) +{ + return 0; +} + +static inline void +blk_io_hierarchy_iodump_exit(struct request_queue *q, enum stage_group stage) +{ +} + +static inline void +hierarchy_add_bio(struct hierarchy_stage *hstage, struct bio *bio) +{ +} + +static inline void +hierarchy_remove_bio(struct hierarchy_stage *hstage, struct bio *bio) +{ +} + +static inline void +bio_hierarchy_data_init(struct bio *bio, struct bio_hierarchy_data *hdata) +{ +} + +static inline void +io_hierarchy_register_iodump(struct hierarchy_stage *hstage) +{ +} + +#endif +#endif diff --git a/block/blk-io-hierarchy/stats.c b/block/blk-io-hierarchy/stats.c index b173ac5e2410..5bc7476d63d9 100644 --- a/block/blk-io-hierarchy/stats.c +++ b/block/blk-io-hierarchy/stats.c @@ -15,6 +15,7 @@ #include <linux/debugfs.h>
#include "stats.h" +#include "iodump.h" #include "../blk.h" #include "../blk-mq-debugfs.h"
@@ -50,6 +51,7 @@ static void bio_alloc_hierarchy_data(struct bio *bio) struct bio_hierarchy_data *hdata = mempool_alloc(hdata_pool, GFP_NOIO);
+ bio_hierarchy_data_init(bio, hdata); bio->hdata = hdata; } } @@ -209,6 +211,11 @@ void blk_mq_register_hierarchy(struct request_queue *q, enum stage_group stage) hstage->stage = stage; hstage->unbalanced_warned = false; hstage->debugfs_dir = NULL; + if (blk_io_hierarchy_iodump_init(q, hstage) < 0) { + put_hstats_data(stats, hstage->hstats_data); + kfree(hstage); + return; + }
blk_mq_freeze_queue(q);
@@ -230,6 +237,7 @@ void blk_mq_unregister_hierarchy(struct request_queue *q, return;
blk_mq_debugfs_unregister_hierarchy(q, stage); + blk_io_hierarchy_iodump_exit(q, stage);
spin_lock(&stats->hstage_lock); hstage = stats->hstage[stage]; @@ -269,6 +277,7 @@ void bio_hierarchy_start_io_acct(struct bio *bio, enum stage_group stage) io_hierarchy_inc(hstage->hstats_data, dispatched, bio_hierarchy_op(bio)); bio->hdata->time = blk_time_get_ns(); + hierarchy_add_bio(hstage, bio); }
void __bio_hierarchy_end_io_acct(struct bio *bio, enum stage_group stage, @@ -286,6 +295,7 @@ void __bio_hierarchy_end_io_acct(struct bio *bio, enum stage_group stage, duration = time - bio->hdata->time; hstage = queue_to_wrapper(q)->io_hierarchy_stats->hstage[stage];
+ hierarchy_remove_bio(hstage, bio); io_hierarchy_inc(hstage->hstats_data, completed, op); io_hierarchy_add(hstage->hstats_data, nsecs, op, duration); } diff --git a/block/blk-io-hierarchy/stats.h b/block/blk-io-hierarchy/stats.h index 5f2f0ce2e34c..b295e9e59a64 100644 --- a/block/blk-io-hierarchy/stats.h +++ b/block/blk-io-hierarchy/stats.h @@ -22,6 +22,10 @@
struct bio_hierarchy_data { u64 time; +#ifdef CONFIG_HIERARCHY_IO_DUMP + struct bio *bio; + struct list_head hierarchy_list; +#endif };
struct hierarchy_stats { @@ -45,6 +49,10 @@ struct hierarchy_stage { bool unbalanced_warned; struct dentry *debugfs_dir; struct hierarchy_stats_data *hstats_data; +#ifdef CONFIG_HIERARCHY_IO_DUMP + unsigned long threshold; + void *dump_data; +#endif };
struct blk_io_hierarchy_stats { diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index e50d51a8839c..5ee91901d9cc 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -23,6 +23,7 @@ #include "blk-mq.h" #include "blk-mq-debugfs.h" #include "blk-mq-tag.h" +#include "blk-io-hierarchy/stats.h"
static void print_stat(struct seq_file *m, struct blk_rq_stat *stat) { @@ -872,6 +873,7 @@ int blk_mq_debugfs_register(struct request_queue *q) goto err; }
+ blk_mq_debugfs_register_hierarchy_stats(q); return 0;
err:
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/IAGRKP
CVE: NA
--------------------------------
The default threshold is 1s, and the number of IOs exceeding the threshold will be recorded in the stats as well.
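For reference, the per-stage "stats" line then carries twelve accounting fields (dispatched, completed, and latency in nanoseconds, for each of READ/WRITE/DISCARD/FLUSH) followed by the four new slow-IO counters. A hedged userspace parsing sketch, with the field order reconstructed from the seq_printf() calls in this series:

#include <stdio.h>

#define GROUPS 4	/* READ, WRITE, DISCARD, FLUSH */

struct stage_stats {
	unsigned long long dispatched[GROUPS];
	unsigned long long completed[GROUPS];
	unsigned long long latency[GROUPS];	/* nanoseconds */
	unsigned long long slow[GROUPS];	/* new in this patch */
};

static int parse_stats(FILE *f, struct stage_stats *s)
{
	int i;

	for (i = 0; i < GROUPS; i++)
		if (fscanf(f, "%llu %llu %llu", &s->dispatched[i],
			   &s->completed[i], &s->latency[i]) != 3)
			return -1;

	/* the slow-IO counters are appended at the end of the line */
	for (i = 0; i < GROUPS; i++)
		if (fscanf(f, "%llu", &s->slow[i]) != 1)
			return -1;

	return 0;
}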
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
---
 block/blk-io-hierarchy/debugfs.c |  1 +
 block/blk-io-hierarchy/iodump.c  | 26 ++++++++++++++++++++++
 block/blk-io-hierarchy/iodump.h  | 37 ++++++++++++++++++++++++++++++++
 block/blk-io-hierarchy/stats.c   |  2 ++
 block/blk-io-hierarchy/stats.h   |  3 +++
 5 files changed, 69 insertions(+)
diff --git a/block/blk-io-hierarchy/debugfs.c b/block/blk-io-hierarchy/debugfs.c index 99f1c753dece..1d35bf4e19c5 100644 --- a/block/blk-io-hierarchy/debugfs.c +++ b/block/blk-io-hierarchy/debugfs.c @@ -60,6 +60,7 @@ static int __hierarchy_stats_show(struct hierarchy_stats_data *hstats_data, latency[STAT_DISCARD], dispatched[STAT_FLUSH], completed[STAT_FLUSH], latency[STAT_FLUSH]);
+ hierarchy_show_slow_io(hstats_data, m); seq_putc(m, '\n'); return 0; } diff --git a/block/blk-io-hierarchy/iodump.c b/block/blk-io-hierarchy/iodump.c index df3621342c2a..7e8029dfdd96 100644 --- a/block/blk-io-hierarchy/iodump.c +++ b/block/blk-io-hierarchy/iodump.c @@ -487,3 +487,29 @@ void io_hierarchy_register_iodump(struct hierarchy_stage *hstage) if (attr) debugfs_create_files(hstage->debugfs_dir, hstage, attr); } + +void hierarchy_account_slow_io(struct hierarchy_stage *hstage, + enum stat_group op, unsigned long duration) +{ + if (hstage->threshold <= duration) + this_cpu_inc(hstage->hstats_data->hstats->slow[op]); +} + +void hierarchy_show_slow_io(struct hierarchy_stats_data *hstats_data, + struct seq_file *m) +{ + u64 slow[NR_NEW_STAT_GROUPS] = {0}; + int cpu; + int i; + + for_each_possible_cpu(cpu) { + struct hierarchy_stats *stat = + per_cpu_ptr(hstats_data->hstats, cpu); + + for (i = 0; i < NR_NEW_STAT_GROUPS; ++i) + slow[i] += stat->slow[i]; + } + + seq_printf(m, " %llu %llu %llu %llu", slow[STAT_READ], slow[STAT_WRITE], + slow[STAT_DISCARD], slow[STAT_FLUSH]); +} diff --git a/block/blk-io-hierarchy/iodump.h b/block/blk-io-hierarchy/iodump.h index 547666e418a0..f8ef0d8669f6 100644 --- a/block/blk-io-hierarchy/iodump.h +++ b/block/blk-io-hierarchy/iodump.h @@ -29,6 +29,25 @@ void hierarchy_remove_bio(struct hierarchy_stage *hstage, struct bio *bio); void bio_hierarchy_data_init(struct bio *bio, struct bio_hierarchy_data *hdata); void io_hierarchy_register_iodump(struct hierarchy_stage *hstage);
+void hierarchy_account_slow_io(struct hierarchy_stage *hstage, + enum stat_group op, unsigned long duration); +void hierarchy_show_slow_io(struct hierarchy_stats_data *hstats_data, + struct seq_file *m); + +static inline void +hierarchy_account_slow_io_ns(struct hierarchy_stage *hstage, + enum stat_group op, u64 duration) +{ + hierarchy_account_slow_io(hstage, op, ns_to_ms(duration)); +} + +static inline void +hierarchy_account_slow_io_jiffies(struct hierarchy_stage *hstage, + enum stat_group op, unsigned long duration) +{ + hierarchy_account_slow_io(hstage, op, jiffies_to_msecs(duration)); +} + #else static inline int blk_io_hierarchy_iodump_init(struct request_queue *q, @@ -62,5 +81,23 @@ io_hierarchy_register_iodump(struct hierarchy_stage *hstage) { }
+static inline void +hierarchy_account_slow_io(struct hierarchy_stage *hstage, + enum stat_group op, unsigned long duration) +{ +} + +static inline void +hierarchy_account_slow_io_ns(struct hierarchy_stage *hstage, + enum stat_group op, u64 duration) +{ +} + +static inline void +hierarchy_account_slow_io_jiffies(struct hierarchy_stage *hstage, + enum stat_group op, unsigned long duration) +{ +} + #endif #endif diff --git a/block/blk-io-hierarchy/stats.c b/block/blk-io-hierarchy/stats.c index 5bc7476d63d9..430c5fd6a3e3 100644 --- a/block/blk-io-hierarchy/stats.c +++ b/block/blk-io-hierarchy/stats.c @@ -298,6 +298,7 @@ void __bio_hierarchy_end_io_acct(struct bio *bio, enum stage_group stage, hierarchy_remove_bio(hstage, bio); io_hierarchy_inc(hstage->hstats_data, completed, op); io_hierarchy_add(hstage->hstats_data, nsecs, op, duration); + hierarchy_account_slow_io_ns(hstage, op, duration); }
static enum stat_group rq_hierarchy_op(struct request *rq) @@ -382,6 +383,7 @@ void __rq_hierarchy_end_io_acct(struct request *rq,
io_hierarchy_inc(hstage->hstats_data, completed, op); io_hierarchy_add(hstage->hstats_data, jiffies, op, duration); + hierarchy_account_slow_io_jiffies(hstage, op, duration); WRITE_ONCE(rq_wrapper->stage, NR_RQ_STAGE_GROUPS); } EXPORT_SYMBOL_GPL(__rq_hierarchy_end_io_acct); diff --git a/block/blk-io-hierarchy/stats.h b/block/blk-io-hierarchy/stats.h index b295e9e59a64..d77cf2b58ade 100644 --- a/block/blk-io-hierarchy/stats.h +++ b/block/blk-io-hierarchy/stats.h @@ -37,6 +37,9 @@ struct hierarchy_stats { }; unsigned long dispatched[NR_NEW_STAT_GROUPS]; unsigned long completed[NR_NEW_STAT_GROUPS]; +#ifdef CONFIG_HIERARCHY_IO_DUMP + unsigned long slow[NR_NEW_STAT_GROUPS]; +#endif };
struct hierarchy_stats_data {
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/IAGRKP
CVE: NA
--------------------------------
If blk-throtl is enabled, the following new debugfs entries will be created as well:
/sys/kernel/debug/block/sda/blk_io_hierarchy/
|-- throtl
|   |-- io_dump
|   |-- stats
|   `-- threshold
Users can use these entries to analyze how IO behaves in blk-throtl.
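In short, the accounting added below pairs up as follows (a simplified sketch of the hunks in this patch, not a verbatim excerpt):

/* a bio enters the throtl stage when it is queued to a throttle group */
static void throtl_queue_sketch(struct bio *bio)
{
	bio_hierarchy_start_io_acct(bio, STAGE_THROTTLE);
	/* ... throtl_add_bio_tg() ... */
}

/* all dispatched bios leave the stage with a single timestamp read */
static void throtl_dispatch_sketch(struct bio_list *bios)
{
	bio_list_hierarchy_end_io_acct(bios, STAGE_THROTTLE);
	/* ... generic_make_request() on each popped bio ... */
}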
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
---
 block/blk-io-hierarchy/Kconfig   | 11 +++++++++++
 block/blk-io-hierarchy/debugfs.c |  3 +++
 block/blk-mq.c                   | 11 +++++++++--
 block/blk-throttle.c             | 23 +++++++++++++++++++++++
 include/linux/blk_types.h        |  3 +++
 5 files changed, 49 insertions(+), 2 deletions(-)
diff --git a/block/blk-io-hierarchy/Kconfig b/block/blk-io-hierarchy/Kconfig index f35f174835ef..2c15b5a7a006 100644 --- a/block/blk-io-hierarchy/Kconfig +++ b/block/blk-io-hierarchy/Kconfig @@ -26,4 +26,15 @@ config HIERARCHY_IO_DUMP
If unsure, say N.
+config HIERARCHY_THROTTLE + bool "Enable hierarchy stats layer blk-throttle" + default n + depends on BLK_DEV_THROTTLING=y + help + Enabling this lets blk hierarchy stats record additional information + for blk-throttle. Such information can be helpful to debug performance + and problems like io hang. + + If unsure, say N. + endif diff --git a/block/blk-io-hierarchy/debugfs.c b/block/blk-io-hierarchy/debugfs.c index 1d35bf4e19c5..5b61646553ae 100644 --- a/block/blk-io-hierarchy/debugfs.c +++ b/block/blk-io-hierarchy/debugfs.c @@ -19,6 +19,9 @@ #include "iodump.h"
static const char *stage_name[NR_STAGE_GROUPS] = { +#ifdef CONFIG_HIERARCHY_THROTTLE + [STAGE_THROTTLE] = "throtl", +#endif };
const char *hierarchy_stage_name(enum stage_group stage) diff --git a/block/blk-mq.c b/block/blk-mq.c index 955e80f4d0dc..8f23109c797c 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2772,6 +2772,8 @@ void blk_mq_release(struct request_queue *q) struct blk_mq_hw_ctx *hctx, *next; int i;
+ blk_io_hierarchy_stats_free(q); + queue_for_each_hw_ctx(q, hctx, i) WARN_ON_ONCE(hctx && list_empty(&hctx->hctx_list));
@@ -2909,14 +2911,17 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, /* mark the queue as mq asap */ q->mq_ops = set->ops;
+ if (blk_io_hierarchy_stats_alloc(q)) + goto err_exit; + q->poll_cb = blk_stat_alloc_callback(blk_mq_poll_stats_fn, blk_mq_poll_stats_bkt, BLK_MQ_POLL_STATS_BKTS, q); if (!q->poll_cb) - goto err_exit; + goto err_hierarchy_exit;
if (blk_mq_alloc_ctxs(q)) - goto err_exit; + goto err_hierarchy_exit;
/* init q->mq_kobj and sw queues' kobjects */ blk_mq_sysfs_init(q); @@ -2986,6 +2991,8 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, q->nr_hw_queues = 0; err_sys_init: blk_mq_sysfs_deinit(q); +err_hierarchy_exit: + blk_io_hierarchy_stats_free(q); err_exit: q->mq_ops = NULL; return ERR_PTR(-ENOMEM); diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 0795935574d3..a1867a2f4f18 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -14,6 +14,7 @@ #include <linux/sched/signal.h> #include <linux/delay.h> #include "blk.h" +#include "blk-io-hierarchy/stats.h"
/* Max dispatch from a group in 1 round */ static int throtl_grp_quantum = 8; @@ -1350,6 +1351,8 @@ static void blk_throtl_dispatch_work_fn(struct work_struct *work) bio_list_add(&bio_list_on_stack, bio); spin_unlock_irq(q->queue_lock);
+ bio_list_hierarchy_end_io_acct(&bio_list_on_stack, STAGE_THROTTLE); + if (!bio_list_empty(&bio_list_on_stack)) { blk_start_plug(&plug); while((bio = bio_list_pop(&bio_list_on_stack))) @@ -2333,6 +2336,20 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
tg->last_low_overflow_time[rw] = jiffies;
+ /* + * This is slow path now, bio_hierarchy_start_io_acct() might spend + * some time to allocate memory. However, it's safe because 'tg' is + * pinned by this bio, and io charge should still be accurate because + * slice is already started from tg_may_dispatch(). + */ + spin_unlock_irq(q->queue_lock); + rcu_read_unlock(); + + bio_hierarchy_start_io_acct(bio, STAGE_THROTTLE); + + rcu_read_lock(); + spin_lock_irq(q->queue_lock); + td->nr_queued[rw]++; throtl_add_bio_tg(bio, qn, tg); throttled = true; @@ -2505,6 +2522,8 @@ void blk_throtl_drain(struct request_queue *q) bio_list_add(&bio_list_on_stack, bio); spin_unlock_irq(q->queue_lock);
+ bio_list_hierarchy_end_io_acct(&bio_list_on_stack, STAGE_THROTTLE); + if (!bio_list_empty(&bio_list_on_stack)) while ((bio = bio_list_pop(&bio_list_on_stack))) generic_make_request(bio); @@ -2561,6 +2580,8 @@ void blk_throtl_exit(struct request_queue *q) del_timer_sync(&q->td->service_queue.pending_timer); throtl_shutdown_wq(q); blkcg_deactivate_policy(q, &blkcg_policy_throtl); + blk_mq_unregister_hierarchy(q, STAGE_THROTTLE); + free_percpu(q->td->latency_buckets[READ]); free_percpu(q->td->latency_buckets[WRITE]); kfree(q->td); @@ -2593,6 +2614,8 @@ void blk_throtl_register_queue(struct request_queue *q) td->track_bio_latency = !queue_is_rq_based(q); if (!td->track_bio_latency) blk_stat_enable_accounting(q); + + blk_mq_register_hierarchy(q, STAGE_THROTTLE); }
#ifdef CONFIG_BLK_DEV_THROTTLING_LOW diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 0e1334c4a43e..ea5564ac6f11 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -386,6 +386,9 @@ enum stat_group { };
enum stage_group { +#ifdef CONFIG_BLK_DEV_THROTTLING + STAGE_THROTTLE, +#endif STAGE_BIO_RESERVE, NR_BIO_STAGE_GROUPS, NR_RQ_STAGE_GROUPS,
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/IAGRKP
CVE: NA
--------------------------------
If blk-wbt is enabled, the following new debugfs entries will be created as well:
/sys/kernel/debug/block/sda/blk_io_hierarchy/
|-- wbt
|   |-- io_dump
|   |-- stats
|   `-- threshold
Users can use these entries to analyze how IO behaves in blk-wbt.
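The effect of the blk-wbt hunks below can be summarized by this simplified sketch (not a verbatim excerpt): only bios that actually sleep waiting for a writeback-throttling budget enter the stage, so the recorded latency is pure wait time.

static void wbt_wait_sketch(struct bio *bio, bool got_budget)
{
	if (got_budget)
		return;	/* fast path: the stage is never entered */

	bio_hierarchy_start_io_acct(bio, STAGE_WBT);
	/* ... sleep on rqw->wait until a budget becomes available ... */
	bio_hierarchy_end_io_acct(bio, STAGE_WBT);
}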
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
---
 block/blk-io-hierarchy/Kconfig   | 11 +++++++++++
 block/blk-io-hierarchy/debugfs.c |  3 +++
 block/blk-wbt.c                  | 12 ++++++++++--
 include/linux/blk_types.h        |  3 +++
 4 files changed, 27 insertions(+), 2 deletions(-)
diff --git a/block/blk-io-hierarchy/Kconfig b/block/blk-io-hierarchy/Kconfig index 2c15b5a7a006..ad1b7abc7610 100644 --- a/block/blk-io-hierarchy/Kconfig +++ b/block/blk-io-hierarchy/Kconfig @@ -37,4 +37,15 @@ config HIERARCHY_THROTTLE
If unsure, say N.
+config HIERARCHY_WBT + bool "Enable hierarchy stats layer blk-wbt" + default n + depends on BLK_WBT + help + Enabling this lets blk hierarchy stats record additional information + for blk-wbt. Such information can be helpful to debug performance + and problems like io hang. + + If unsure, say N. + endif diff --git a/block/blk-io-hierarchy/debugfs.c b/block/blk-io-hierarchy/debugfs.c index 5b61646553ae..327ed5c88edc 100644 --- a/block/blk-io-hierarchy/debugfs.c +++ b/block/blk-io-hierarchy/debugfs.c @@ -22,6 +22,9 @@ static const char *stage_name[NR_STAGE_GROUPS] = { #ifdef CONFIG_HIERARCHY_THROTTLE [STAGE_THROTTLE] = "throtl", #endif +#ifdef CONFIG_HIERARCHY_WBT + [STAGE_WBT] = "wbt", +#endif };
const char *hierarchy_stage_name(enum stage_group stage) diff --git a/block/blk-wbt.c b/block/blk-wbt.c index 94b5eff0cd3a..407974ee768c 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -29,6 +29,9 @@
#include "blk-wbt.h" #include "blk-rq-qos.h" +#ifndef __GENKSYMS__ +#include "blk-io-hierarchy/stats.h" +#endif
#define CREATE_TRACE_POINTS #include <trace/events/wbt.h> @@ -533,11 +536,12 @@ static int wbt_wake_function(struct wait_queue_entry *curr, unsigned int mode, * the timer to kick off queuing again. */ static void __wbt_wait(struct rq_wb *rwb, enum wbt_flags wb_acct, - unsigned long rw, spinlock_t *lock) + struct bio *bio, spinlock_t *lock) __releases(lock) __acquires(lock) { struct rq_wait *rqw = get_rq_wait(rwb, wb_acct); + unsigned long rw = bio->bi_opf; struct wbt_wait_data data = { .wq = { .func = wbt_wake_function, @@ -554,6 +558,7 @@ static void __wbt_wait(struct rq_wb *rwb, enum wbt_flags wb_acct, if (!has_sleeper && rq_wait_inc_below(rqw, get_limit(rwb, rw))) return;
+ bio_hierarchy_start_io_acct(bio, STAGE_WBT); has_sleeper = !__prepare_to_wait_exclusive(&rqw->wait, &data.wq, TASK_UNINTERRUPTIBLE); do { @@ -588,6 +593,7 @@ static void __wbt_wait(struct rq_wb *rwb, enum wbt_flags wb_acct, } while (1);
finish_wait(&rqw->wait, &data.wq); + bio_hierarchy_end_io_acct(bio, STAGE_WBT); }
static inline bool wbt_should_throttle(struct rq_wb *rwb, struct bio *bio) @@ -652,7 +658,7 @@ static void wbt_wait(struct rq_qos *rqos, struct bio *bio, spinlock_t *lock) return; }
- __wbt_wait(rwb, flags, bio->bi_opf, lock); + __wbt_wait(rwb, flags, bio, lock);
if (!blk_stat_is_active(rwb->cb)) rwb_arm_timer(rwb); @@ -770,6 +776,7 @@ static void wbt_exit(struct rq_qos *rqos) struct rq_wb *rwb = RQWB(rqos); struct request_queue *q = rqos->q;
+ blk_mq_unregister_hierarchy(q, STAGE_WBT); blk_stat_remove_callback(q, rwb->cb); blk_stat_free_callback(rwb->cb); kfree(rwb); @@ -848,6 +855,7 @@ int wbt_init(struct request_queue *q)
blk_mq_unfreeze_queue(q); wbt_set_write_cache(q, test_bit(QUEUE_FLAG_WC, &q->queue_flags)); + blk_mq_register_hierarchy(q, STAGE_WBT);
return 0; } diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index ea5564ac6f11..7ffe59d6d64e 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -388,6 +388,9 @@ enum stat_group { enum stage_group { #ifdef CONFIG_BLK_DEV_THROTTLING STAGE_THROTTLE, +#endif +#ifdef CONFIG_BLK_WBT + STAGE_WBT, #endif STAGE_BIO_RESERVE, NR_BIO_STAGE_GROUPS,
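A side note on the #ifndef __GENKSYMS__ guard around the new include in blk-wbt.c above: this is the usual kABI-preservation idiom. genksyms defines __GENKSYMS__ while computing symbol CRCs, so hiding the new header from it keeps the exported symbol versions unchanged:

  /* Hide the new dependency from genksyms so kABI symbol CRCs stay stable. */
  #ifndef __GENKSYMS__
  #include "blk-io-hierarchy/stats.h"
  #endif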
From: Bart Van Assche bvanassche@acm.org
mainline inclusion from mainline-v5.3-rc1 commit c05f42206f4de12b6807270fc669b45472f1bdb7 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IAGRKP
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
No code that occurs between blk_mq_get_ctx() and blk_mq_put_ctx() depends on preemption being disabled for its correctness. Since removing the CPU preemption calls does not measurably affect performance, simplify the blk-mq code by removing the blk_mq_put_ctx() function and also by not disabling preemption in blk_mq_get_ctx().
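For background, the two CPU-lookup idioms involved look roughly like this (a generic sketch, not part of the patch):

  /* Pinned lookup: preemption stays disabled until the matching
   * put_cpu(), so the task cannot migrate off this CPU in between. */
  int cpu = get_cpu();
  /* ... per-cpu state is stable here ... */
  put_cpu();

  /* Unpinned lookup: just reads the current CPU number; the task may
   * migrate immediately afterwards. That is harmless in blk-mq because
   * the ctx is only a placement hint and any ctx is acceptable. */
  int hint = raw_smp_processor_id();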
Cc: Hannes Reinecke hare@suse.com Cc: Omar Sandoval osandov@fb.com Reviewed-by: Christoph Hellwig hch@lst.de Reviewed-by: Ming Lei ming.lei@redhat.com Signed-off-by: Bart Van Assche bvanassche@acm.org Signed-off-by: Jens Axboe axboe@kernel.dk
Conflicts: block/blk-mq-sched.c block/blk-mq-tag.c block/blk-mq.c block/blk-mq.h block/kyber-iosched.c [Context conflicts] Signed-off-by: Yu Kuai yukuai3@huawei.com --- block/blk-mq-sched.c | 5 +---- block/blk-mq-tag.c | 8 -------- block/blk-mq.c | 16 +++------------- block/blk-mq.h | 7 +------ block/kyber-iosched.c | 1 - 5 files changed, 5 insertions(+), 32 deletions(-)
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 0fb33abac3f6..52b119cc6616 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -389,10 +389,8 @@ bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio) struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu); bool ret = false;
- if (e && e->type->ops.mq.bio_merge) { - blk_mq_put_ctx(ctx); + if (e && e->type->ops.mq.bio_merge) return e->type->ops.mq.bio_merge(hctx, bio); - }
if ((hctx->flags & BLK_MQ_F_SHOULD_MERGE) && !list_empty_careful(&ctx->rq_list)) { @@ -402,7 +400,6 @@ bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio) spin_unlock(&ctx->lock); }
- blk_mq_put_ctx(ctx); return ret; }
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index bee92ab06a5e..65464f0fe0fa 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -113,7 +113,6 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) struct sbq_wait_state *ws; DEFINE_WAIT(wait); unsigned int tag_offset; - bool drop_ctx; int tag;
if (data->flags & BLK_MQ_REQ_RESERVED) { @@ -136,7 +135,6 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) return BLK_MQ_TAG_FAIL;
ws = bt_wait_ptr(bt, data->hctx); - drop_ctx = data->ctx == NULL; do { struct sbitmap_queue *bt_prev;
@@ -162,9 +160,6 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) if (tag != -1) break;
- if (data->ctx) - blk_mq_put_ctx(data->ctx); - bt_prev = bt; io_schedule();
@@ -189,9 +184,6 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) ws = bt_wait_ptr(bt, data->hctx); } while (1);
- if (drop_ctx && data->ctx) - blk_mq_put_ctx(data->ctx); - finish_wait(&ws->wait, &wait);
found_tag: diff --git a/block/blk-mq.c b/block/blk-mq.c index 8f23109c797c..b4cd46e75a1e 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -406,13 +406,13 @@ static struct request *blk_mq_get_request(struct request_queue *q, struct elevator_queue *e = q->elevator; struct request *rq; unsigned int tag; - bool put_ctx_on_error = false; + bool clear_ctx_on_error = false;
blk_queue_enter_live(q); data->q = q; if (likely(!data->ctx)) { data->ctx = blk_mq_get_ctx(q); - put_ctx_on_error = true; + clear_ctx_on_error = true; } if (likely(!data->hctx)) data->hctx = blk_mq_map_queue(q, data->ctx->cpu); @@ -436,10 +436,8 @@ static struct request *blk_mq_get_request(struct request_queue *q,
tag = blk_mq_get_tag(data); if (tag == BLK_MQ_TAG_FAIL) { - if (put_ctx_on_error) { - blk_mq_put_ctx(data->ctx); + if (clear_ctx_on_error) data->ctx = NULL; - } blk_queue_exit(q); return NULL; } @@ -476,8 +474,6 @@ struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op, if (!rq) return ERR_PTR(-EWOULDBLOCK);
- blk_mq_put_ctx(alloc_data.ctx); - rq->__data_len = 0; rq->__sector = (sector_t) -1; rq->bio = rq->biotail = NULL; @@ -2032,7 +2028,6 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
plug = current->plug; if (unlikely(is_flush_fua)) { - blk_mq_put_ctx(data.ctx); blk_mq_bio_to_request(rq, bio);
/* bypass scheduler for flush rq */ @@ -2041,7 +2036,6 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) } else if (plug && q->nr_hw_queues == 1) { struct request *last = NULL;
- blk_mq_put_ctx(data.ctx); blk_mq_bio_to_request(rq, bio);
/* @@ -2081,8 +2075,6 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) list_del_init(&same_queue_rq->queuelist); list_add_tail(&rq->queuelist, &plug->mq_list);
- blk_mq_put_ctx(data.ctx); - if (same_queue_rq) { data.hctx = blk_mq_map_queue(q, same_queue_rq->mq_ctx->cpu); @@ -2091,11 +2083,9 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) } } else if ((q->nr_hw_queues > 1 && is_sync) || (!q->elevator && !data.hctx->dispatch_busy)) { - blk_mq_put_ctx(data.ctx); blk_mq_bio_to_request(rq, bio); blk_mq_try_issue_directly(data.hctx, rq, &cookie); } else { - blk_mq_put_ctx(data.ctx); blk_mq_bio_to_request(rq, bio); blk_mq_sched_insert_request(rq, false, true, true); } diff --git a/block/blk-mq.h b/block/blk-mq.h index b2a9efb43209..2d5de4c71692 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -145,12 +145,7 @@ static inline struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q, */ static inline struct blk_mq_ctx *blk_mq_get_ctx(struct request_queue *q) { - return __blk_mq_get_ctx(q, get_cpu()); -} - -static inline void blk_mq_put_ctx(struct blk_mq_ctx *ctx) -{ - put_cpu(); + return __blk_mq_get_ctx(q, raw_smp_processor_id()); }
struct blk_mq_alloc_data { diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c index f370d3e3f6e0..6dd2d3ac7528 100644 --- a/block/kyber-iosched.c +++ b/block/kyber-iosched.c @@ -517,7 +517,6 @@ static bool kyber_bio_merge(struct blk_mq_hw_ctx *hctx_q, struct bio *bio) spin_lock(&kcq->lock); merged = blk_mq_bio_list_merge(hctx->queue, rq_list, bio); spin_unlock(&kcq->lock); - blk_mq_put_ctx(ctx);
return merged; }
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IAGRKP CVE: NA
--------------------------------
As with blk-throttle, the following new debugfs entries will be created as well:
/sys/kernel/debug/block/sda/blk_io_hierarchy/ |-- gettag | |-- io_dump | |-- stats | `-- threshold
Users can use them to analyze how IO behaves in the gettag stage.
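A typical debugging flow is to raise the dump threshold and then inspect only the stragglers; a sketch of usage, assuming the threshold is interpreted in milliseconds as the iodump code later in this series suggests:

  $ echo 1000 > /sys/kernel/debug/block/sda/blk_io_hierarchy/gettag/threshold
  $ cat /sys/kernel/debug/block/sda/blk_io_hierarchy/gettag/io_dump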
Signed-off-by: Yu Kuai yukuai3@huawei.com --- block/blk-core.c | 7 +++++++ block/blk-io-hierarchy/Kconfig | 10 ++++++++++ block/blk-io-hierarchy/debugfs.c | 3 +++ block/blk-mq-tag.c | 5 +++++ block/blk-sysfs.c | 11 +++++++++++ include/linux/blk_types.h | 2 +- 6 files changed, 37 insertions(+), 1 deletion(-)
diff --git a/block/blk-core.c b/block/blk-core.c index 5db196c0ef87..bd232d9106ad 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -43,6 +43,7 @@ #include "blk-mq.h" #include "blk-mq-sched.h" #include "blk-rq-qos.h" +#include "blk-io-hierarchy/stats.h"
#ifdef CONFIG_DEBUG_FS struct dentry *blk_debugfs_root; @@ -1001,6 +1002,11 @@ void blk_exit_queue(struct request_queue *q) bdi_put(q->backing_dev_info); }
+static void blk_mq_unregister_default_hierarchy(struct request_queue *q) +{ + blk_mq_unregister_hierarchy(q, STAGE_GETTAG); +} + /** * blk_cleanup_queue - shutdown a request queue * @q: request queue to shutdown @@ -1088,6 +1094,7 @@ void blk_cleanup_queue(struct request_queue *q) blk_exit_queue(q);
if (q->mq_ops) { + blk_mq_unregister_default_hierarchy(q); blk_mq_cancel_work_sync(q); blk_mq_exit_queue(q); } diff --git a/block/blk-io-hierarchy/Kconfig b/block/blk-io-hierarchy/Kconfig index ad1b7abc7610..2b9ccc2060b6 100644 --- a/block/blk-io-hierarchy/Kconfig +++ b/block/blk-io-hierarchy/Kconfig @@ -48,4 +48,14 @@ config HIERARCHY_WBT
If unsure, say N.
+config HIERARCHY_GETTAG + bool "Enable hierarchy stats layer gettag" + default n + help + Enabling this lets blk hierarchy stats to record additional information + for gettag. Such information can be helpful to debug performance + and problems like io hang. + + If unsure, say N. + endif diff --git a/block/blk-io-hierarchy/debugfs.c b/block/blk-io-hierarchy/debugfs.c index 327ed5c88edc..79062cd90c08 100644 --- a/block/blk-io-hierarchy/debugfs.c +++ b/block/blk-io-hierarchy/debugfs.c @@ -25,6 +25,9 @@ static const char *stage_name[NR_STAGE_GROUPS] = { #ifdef CONFIG_HIERARCHY_WBT [STAGE_WBT] = "wbt", #endif +#ifdef CONFIG_HIERARCHY_GETTAG + [STAGE_GETTAG] = "gettag", +#endif };
const char *hierarchy_stage_name(enum stage_group stage) diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 65464f0fe0fa..f7b21d7f136e 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -13,6 +13,7 @@ #include "blk.h" #include "blk-mq.h" #include "blk-mq-tag.h" +#include "blk-io-hierarchy/stats.h"
bool blk_mq_has_free_tags(struct blk_mq_tags *tags) { @@ -134,6 +135,8 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) if (data->flags & BLK_MQ_REQ_NOWAIT) return BLK_MQ_TAG_FAIL;
+ if (data->bio) + bio_hierarchy_start_io_acct(data->bio, STAGE_GETTAG); ws = bt_wait_ptr(bt, data->hctx); do { struct sbitmap_queue *bt_prev; @@ -185,6 +188,8 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) } while (1);
finish_wait(&ws->wait, &wait); + if (data->bio) + bio_hierarchy_end_io_acct(data->bio, STAGE_GETTAG);
found_tag: return tag + tag_offset; diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 1c4d795bbdc4..1a8872409ab8 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -17,6 +17,7 @@ #include "blk-mq.h" #include "blk-mq-debugfs.h" #include "blk-wbt.h" +#include "blk-io-hierarchy/stats.h"
struct queue_sysfs_entry { struct attribute attr; @@ -924,6 +925,14 @@ struct kobj_type blk_queue_ktype = { .release = blk_release_queue, };
+static void blk_mq_register_default_hierarchy(struct request_queue *q) +{ + if (!q->mq_ops) + return; + + blk_mq_register_hierarchy(q, STAGE_GETTAG); +} + /** * blk_register_queue - register a block layer queue with sysfs * @disk: Disk of which the request queue should be registered with sysfs. @@ -973,6 +982,8 @@ int blk_register_queue(struct gendisk *disk) has_elevator = true; }
+ blk_mq_register_default_hierarchy(q); + blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q); wbt_enable_default(q); blk_throtl_register_queue(q); diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 7ffe59d6d64e..3e9068552ab0 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -392,7 +392,7 @@ enum stage_group { #ifdef CONFIG_BLK_WBT STAGE_WBT, #endif - STAGE_BIO_RESERVE, + STAGE_GETTAG, NR_BIO_STAGE_GROUPS, NR_RQ_STAGE_GROUPS, NR_STAGE_GROUPS,
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IAGRKP CVE: NA
--------------------------------
As with blk-throttle, the following new debugfs entries will be created as well:
/sys/kernel/debug/block/sda/blk_io_hierarchy/ |-- plug | |-- io_dump | |-- stats | `-- threshold
Users can use them to analyze how IO behaves in the plug stage.
Signed-off-by: Yu Kuai yukuai3@huawei.com --- block/blk-core.c | 1 + block/blk-io-hierarchy/Kconfig | 10 ++++++++++ block/blk-io-hierarchy/debugfs.c | 3 +++ block/blk-mq.c | 6 ++++++ block/blk-sysfs.c | 1 + include/linux/blk_types.h | 1 + 6 files changed, 22 insertions(+)
diff --git a/block/blk-core.c b/block/blk-core.c index bd232d9106ad..ae7785661722 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1005,6 +1005,7 @@ void blk_exit_queue(struct request_queue *q) static void blk_mq_unregister_default_hierarchy(struct request_queue *q) { blk_mq_unregister_hierarchy(q, STAGE_GETTAG); + blk_mq_unregister_hierarchy(q, STAGE_PLUG); }
/** diff --git a/block/blk-io-hierarchy/Kconfig b/block/blk-io-hierarchy/Kconfig index 2b9ccc2060b6..9bdb124dd65c 100644 --- a/block/blk-io-hierarchy/Kconfig +++ b/block/blk-io-hierarchy/Kconfig @@ -58,4 +58,14 @@ config HIERARCHY_GETTAG
If unsure, say N.
+config HIERARCHY_PLUG + bool "Enable hierarchy stats layer plug" + default n + help + Enabling this lets blk hierarchy stats to record additional information + for plug. Such information can be helpful to debug performance + and problems like io hang. + + If unsure, say N. + endif diff --git a/block/blk-io-hierarchy/debugfs.c b/block/blk-io-hierarchy/debugfs.c index 79062cd90c08..f77e93092ae5 100644 --- a/block/blk-io-hierarchy/debugfs.c +++ b/block/blk-io-hierarchy/debugfs.c @@ -28,6 +28,9 @@ static const char *stage_name[NR_STAGE_GROUPS] = { #ifdef CONFIG_HIERARCHY_GETTAG [STAGE_GETTAG] = "gettag", #endif +#ifdef CONFIG_HIERARCHY_PLUG + [STAGE_PLUG] = "plug", +#endif };
const char *hierarchy_stage_name(enum stage_group stage) diff --git a/block/blk-mq.c b/block/blk-mq.c index b4cd46e75a1e..0926601c6a60 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1798,6 +1798,8 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) if (rq->mq_ctx != this_ctx) { if (this_ctx) { trace_block_unplug(this_q, depth, !from_schedule); + rq_list_hierarchy_end_io_acct(&ctx_list, + STAGE_PLUG); blk_mq_sched_insert_requests(this_q, this_ctx, &ctx_list, from_schedule); @@ -1818,6 +1820,7 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) */ if (this_ctx) { trace_block_unplug(this_q, depth, !from_schedule); + rq_list_hierarchy_end_io_acct(&ctx_list, STAGE_PLUG); blk_mq_sched_insert_requests(this_q, this_ctx, &ctx_list, from_schedule); } @@ -2058,6 +2061,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) trace_block_plug(q); }
+ rq_hierarchy_start_io_acct(rq, STAGE_PLUG); list_add_tail(&rq->queuelist, &plug->mq_list); } else if (plug && !blk_queue_nomerges(q)) { blk_mq_bio_to_request(rq, bio); @@ -2073,11 +2077,13 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) same_queue_rq = NULL; if (same_queue_rq) list_del_init(&same_queue_rq->queuelist); + rq_hierarchy_start_io_acct(rq, STAGE_PLUG); list_add_tail(&rq->queuelist, &plug->mq_list);
if (same_queue_rq) { data.hctx = blk_mq_map_queue(q, same_queue_rq->mq_ctx->cpu); + rq_hierarchy_end_io_acct(same_queue_rq, STAGE_PLUG); blk_mq_try_issue_directly(data.hctx, same_queue_rq, &cookie); } diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 1a8872409ab8..b1665e507417 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -931,6 +931,7 @@ static void blk_mq_register_default_hierarchy(struct request_queue *q) return;
blk_mq_register_hierarchy(q, STAGE_GETTAG); + blk_mq_register_hierarchy(q, STAGE_PLUG); }
/** diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 3e9068552ab0..3251b706ae01 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -394,6 +394,7 @@ enum stage_group { #endif STAGE_GETTAG, NR_BIO_STAGE_GROUPS, + STAGE_PLUG = NR_BIO_STAGE_GROUPS, NR_RQ_STAGE_GROUPS, NR_STAGE_GROUPS, };
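The enum above is deliberately laid out so that bio-based stages sit below NR_BIO_STAGE_GROUPS and rq-based stages occupy the range up to NR_RQ_STAGE_GROUPS. The stage_is_bio()/stage_is_rq() helpers used elsewhere in this series presumably key off that layout; a minimal sketch of such predicates (hypothetical — the real definitions live in blk-io-hierarchy/stats.h and are not shown in these hunks):

  static inline bool stage_is_bio(enum stage_group stage)
  {
  	return stage < NR_BIO_STAGE_GROUPS;
  }

  static inline bool stage_is_rq(enum stage_group stage)
  {
  	return stage >= NR_BIO_STAGE_GROUPS && stage < NR_RQ_STAGE_GROUPS;
  }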
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IAGRKP CVE: NA
--------------------------------
As with blk-throttle, the following new debugfs entries will be created as well:
/sys/kernel/debug/block/sda/blk_io_hierarchy/ |-- deadline | |-- io_dump | |-- stats | `-- threshold
Users can use them to analyze how IO behaves in mq-deadline.
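The accounting pattern for an rq-based scheduler stage, here and in the bfq and kyber patches that follow, is symmetric: accounting starts when requests are inserted into the elevator and ends when they leave it, whether by dispatch or by being merged away and freed. Condensed from the mq-deadline hunks below (a sketch, not the full functions):

  static void dd_insert_requests(struct blk_mq_hw_ctx *hctx,
  				struct list_head *list, bool at_head)
  {
  	/* start: the requests now belong to the scheduler */
  	rq_list_hierarchy_start_io_acct(list, STAGE_DEADLINE);
  	/* ... insert each request under dd->lock ... */
  }

  static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
  {
  	struct deadline_data *dd = hctx->queue->elevator->elevator_data;
  	struct request *rq;

  	spin_lock(&dd->lock);
  	rq = __dd_dispatch_request(dd);
  	spin_unlock(&dd->lock);

  	/* end: the request leaves the scheduler for dispatch */
  	if (rq)
  		rq_hierarchy_end_io_acct(rq, STAGE_DEADLINE);
  	return rq;
  }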
Signed-off-by: Yu Kuai yukuai3@huawei.com --- block/blk-io-hierarchy/Kconfig | 11 +++++++++++ block/blk-io-hierarchy/debugfs.c | 3 +++ block/mq-deadline.c | 15 ++++++++++++++- include/linux/blk_types.h | 3 +++ 4 files changed, 31 insertions(+), 1 deletion(-)
diff --git a/block/blk-io-hierarchy/Kconfig b/block/blk-io-hierarchy/Kconfig index 9bdb124dd65c..d48fe3d9c673 100644 --- a/block/blk-io-hierarchy/Kconfig +++ b/block/blk-io-hierarchy/Kconfig @@ -68,4 +68,15 @@ config HIERARCHY_PLUG
If unsure, say N.
+config HIERARCHY_DEADLINE + bool "Enable hierarchy stats layer mq-deadline" + default n + depends on MQ_IOSCHED_DEADLINE + help + Enabling this lets blk hierarchy stats to record additional information + for mq-deadline. Such information can be helpful to debug performance + and problems like io hang. + + If unsure, say N. + endif diff --git a/block/blk-io-hierarchy/debugfs.c b/block/blk-io-hierarchy/debugfs.c index f77e93092ae5..6362c335c2ff 100644 --- a/block/blk-io-hierarchy/debugfs.c +++ b/block/blk-io-hierarchy/debugfs.c @@ -31,6 +31,9 @@ static const char *stage_name[NR_STAGE_GROUPS] = { #ifdef CONFIG_HIERARCHY_PLUG [STAGE_PLUG] = "plug", #endif +#ifdef CONFIG_HIERARCHY_DEADLINE + [STAGE_DEADLINE] = "deadline", +#endif };
const char *hierarchy_stage_name(enum stage_group stage) diff --git a/block/mq-deadline.c b/block/mq-deadline.c index 7ad820050675..aa51abb3eaa4 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -22,6 +22,7 @@ #include "blk-mq-debugfs.h" #include "blk-mq-tag.h" #include "blk-mq-sched.h" +#include "blk-io-hierarchy/stats.h"
/* * See Documentation/block/deadline-iosched.txt @@ -61,6 +62,8 @@ struct deadline_data { spinlock_t lock; spinlock_t zone_lock; struct list_head dispatch; + + struct request_queue *q; };
static inline struct rb_root * @@ -386,6 +389,8 @@ static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx) rq = __dd_dispatch_request(dd); spin_unlock(&dd->lock);
+ if (rq) + rq_hierarchy_end_io_acct(rq, STAGE_DEADLINE); return rq; }
@@ -396,6 +401,7 @@ static void dd_exit_queue(struct elevator_queue *e) BUG_ON(!list_empty(&dd->fifo_list[READ])); BUG_ON(!list_empty(&dd->fifo_list[WRITE]));
+ blk_mq_unregister_hierarchy(dd->q, STAGE_DEADLINE); kfree(dd); }
@@ -427,11 +433,13 @@ static int dd_init_queue(struct request_queue *q, struct elevator_type *e) dd->writes_starved = writes_starved; dd->front_merges = 1; dd->fifo_batch = fifo_batch; + dd->q = q; spin_lock_init(&dd->lock); spin_lock_init(&dd->zone_lock); INIT_LIST_HEAD(&dd->dispatch);
q->elevator = eq; + blk_mq_register_hierarchy(q, STAGE_DEADLINE); return 0; }
@@ -469,8 +477,10 @@ static bool dd_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio) ret = blk_mq_sched_try_merge(q, bio, &free); spin_unlock(&dd->lock);
- if (free) + if (free) { + rq_hierarchy_end_io_acct(free, STAGE_DEADLINE); blk_mq_free_request(free); + }
return ret; } @@ -493,6 +503,7 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, blk_req_zone_write_unlock(rq);
if (blk_mq_sched_try_insert_merge(q, rq, &free)) { + rq_list_hierarchy_end_io_acct(&free, STAGE_DEADLINE); blk_mq_free_requests(&free); return; } @@ -527,6 +538,8 @@ static void dd_insert_requests(struct blk_mq_hw_ctx *hctx, struct request_queue *q = hctx->queue; struct deadline_data *dd = q->elevator->elevator_data;
+ rq_list_hierarchy_start_io_acct(list, STAGE_DEADLINE); + spin_lock(&dd->lock); while (!list_empty(list)) { struct request *rq; diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 3251b706ae01..24de7817b35f 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -395,6 +395,9 @@ enum stage_group { STAGE_GETTAG, NR_BIO_STAGE_GROUPS, STAGE_PLUG = NR_BIO_STAGE_GROUPS, +#if IS_ENABLED(CONFIG_MQ_IOSCHED_DEADLINE) + STAGE_DEADLINE, +#endif NR_RQ_STAGE_GROUPS, NR_STAGE_GROUPS, };
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IAGRKP CVE: NA
--------------------------------
As with blk-throttle, the following new debugfs entries will be created as well:
/sys/kernel/debug/block/sda/blk_io_hierarchy/ |-- bfq | |-- io_dump | |-- stats | `-- threshold
Users can use them to analyze how IO behaves in bfq.
Signed-off-by: Yu Kuai yukuai3@huawei.com --- block/bfq-iosched.c | 11 ++++++++++- block/blk-io-hierarchy/Kconfig | 11 +++++++++++ block/blk-io-hierarchy/debugfs.c | 3 +++ include/linux/blk_types.h | 3 +++ 4 files changed, 27 insertions(+), 1 deletion(-)
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index cf15937eed5e..59acc50945d2 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -140,6 +140,7 @@ #include "blk-mq-sched.h" #include "bfq-iosched.h" #include "blk-wbt.h" +#include "blk-io-hierarchy/stats.h"
#define BFQ_BFQQ_FNS(name) \ void bfq_mark_bfqq_##name(struct bfq_queue *bfqq) \ @@ -1882,8 +1883,10 @@ static bool bfq_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio) ret = blk_mq_sched_try_merge(q, bio, &free);
spin_unlock_irq(&bfqd->lock); - if (free) + if (free) { + rq_hierarchy_end_io_acct(free, STAGE_BFQ); blk_mq_free_request(free); + }
return ret; } @@ -4168,6 +4171,8 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) idle_timer_disabled ? in_serv_queue : NULL, idle_timer_disabled);
+ if (rq) + rq_hierarchy_end_io_acct(rq, STAGE_BFQ); return rq; }
@@ -4751,6 +4756,7 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, spin_lock_irq(&bfqd->lock); if (blk_mq_sched_try_insert_merge(q, rq, &free)) { spin_unlock_irq(&bfqd->lock); + rq_list_hierarchy_end_io_acct(&free, STAGE_BFQ); blk_mq_free_requests(&free); return; } @@ -4798,6 +4804,7 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx, struct list_head *list, bool at_head) { + rq_list_hierarchy_start_io_acct(list, STAGE_BFQ); while (!list_empty(list)) { struct request *rq;
@@ -5395,6 +5402,7 @@ static void bfq_exit_queue(struct elevator_queue *e) struct bfq_queue *bfqq, *n; struct request_queue *q = bfqd->queue;
+ blk_mq_unregister_hierarchy(q, STAGE_BFQ); hrtimer_cancel(&bfqd->idle_slice_timer);
spin_lock_irq(&bfqd->lock); @@ -5561,6 +5569,7 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group);
wbt_disable_default(q); + blk_mq_register_hierarchy(q, STAGE_BFQ); return 0;
out_free: diff --git a/block/blk-io-hierarchy/Kconfig b/block/blk-io-hierarchy/Kconfig index d48fe3d9c673..57c23cde0150 100644 --- a/block/blk-io-hierarchy/Kconfig +++ b/block/blk-io-hierarchy/Kconfig @@ -79,4 +79,15 @@ config HIERARCHY_DEADLINE
If unsure, say N.
+config HIERARCHY_BFQ + bool "Enable hierarchy stats layer bfq" + default n + depends on IOSCHED_BFQ + help + Enabling this lets blk hierarchy stats to record additional information + for bfq. Such information can be helpful to debug performance + and problems like io hang. + + If unsure, say N. + endif diff --git a/block/blk-io-hierarchy/debugfs.c b/block/blk-io-hierarchy/debugfs.c index 6362c335c2ff..691ac117d20d 100644 --- a/block/blk-io-hierarchy/debugfs.c +++ b/block/blk-io-hierarchy/debugfs.c @@ -34,6 +34,9 @@ static const char *stage_name[NR_STAGE_GROUPS] = { #ifdef CONFIG_HIERARCHY_DEADLINE [STAGE_DEADLINE] = "deadline", #endif +#ifdef CONFIG_HIERARCHY_BFQ + [STAGE_BFQ] = "bfq", +#endif };
const char *hierarchy_stage_name(enum stage_group stage) diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 24de7817b35f..40812dba1337 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -397,6 +397,9 @@ enum stage_group { STAGE_PLUG = NR_BIO_STAGE_GROUPS, #if IS_ENABLED(CONFIG_MQ_IOSCHED_DEADLINE) STAGE_DEADLINE, +#endif +#if IS_ENABLED(CONFIG_IOSCHED_BFQ) + STAGE_BFQ, #endif NR_RQ_STAGE_GROUPS, NR_STAGE_GROUPS,
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IAGRKP CVE: NA
--------------------------------
As with blk-throttle, the following new debugfs entries will be created as well:
/sys/kernel/debug/block/sda/blk_io_hierarchy/ |-- kyber | |-- io_dump | |-- stats | `-- threshold
Users can use them to analyze how IO behaves in kyber.
Signed-off-by: Yu Kuai yukuai3@huawei.com --- block/blk-io-hierarchy/Kconfig | 11 +++++++++++ block/blk-io-hierarchy/debugfs.c | 3 +++ block/kyber-iosched.c | 7 +++++++ include/linux/blk_types.h | 3 +++ 4 files changed, 24 insertions(+)
diff --git a/block/blk-io-hierarchy/Kconfig b/block/blk-io-hierarchy/Kconfig index 57c23cde0150..bc4f7ae14572 100644 --- a/block/blk-io-hierarchy/Kconfig +++ b/block/blk-io-hierarchy/Kconfig @@ -90,4 +90,15 @@ config HIERARCHY_BFQ
If unsure, say N.
+config HIERARCHY_KYBER + bool "Enable hierarchy stats layer kyber" + default n + depends on MQ_IOSCHED_KYBER + help + Enabling this lets blk hierarchy stats to record additional information + for kyber. Such information can be helpful to debug performance + and problems like io hang. + + If unsure, say N. + endif diff --git a/block/blk-io-hierarchy/debugfs.c b/block/blk-io-hierarchy/debugfs.c index 691ac117d20d..f174acd9406e 100644 --- a/block/blk-io-hierarchy/debugfs.c +++ b/block/blk-io-hierarchy/debugfs.c @@ -37,6 +37,9 @@ static const char *stage_name[NR_STAGE_GROUPS] = { #ifdef CONFIG_HIERARCHY_BFQ [STAGE_BFQ] = "bfq", #endif +#ifdef CONFIG_HIERARCHY_KYBER + [STAGE_KYBER] = "kyber", +#endif };
const char *hierarchy_stage_name(enum stage_group stage) diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c index 6dd2d3ac7528..fe0cb5ab76af 100644 --- a/block/kyber-iosched.c +++ b/block/kyber-iosched.c @@ -30,6 +30,7 @@ #include "blk-mq-sched.h" #include "blk-mq-tag.h" #include "blk-stat.h" +#include "blk-io-hierarchy/stats.h"
/* Scheduling domains. */ enum { @@ -365,6 +366,7 @@ static int kyber_init_sched(struct request_queue *q, struct elevator_type *e)
blk_stat_add_callback(q, kqd->cb);
+ blk_mq_register_hierarchy(q, STAGE_KYBER); return 0; }
@@ -374,6 +376,7 @@ static void kyber_exit_sched(struct elevator_queue *e) struct request_queue *q = kqd->q; int i;
+ blk_mq_unregister_hierarchy(q, STAGE_KYBER); blk_stat_remove_callback(q, kqd->cb);
for (i = 0; i < KYBER_NUM_DOMAINS; i++) @@ -532,6 +535,7 @@ static void kyber_insert_requests(struct blk_mq_hw_ctx *hctx, struct kyber_hctx_data *khd = hctx->sched_data; struct request *rq, *next;
+ rq_list_hierarchy_start_io_acct(rq_list, STAGE_KYBER); list_for_each_entry_safe(rq, next, rq_list, queuelist) { unsigned int sched_domain = kyber_sched_domain(rq->cmd_flags); struct kyber_ctx_queue *kcq = &khd->kcqs[rq->mq_ctx->index_hw]; @@ -771,6 +775,9 @@ static struct request *kyber_dispatch_request(struct blk_mq_hw_ctx *hctx) rq = NULL; out: spin_unlock(&khd->lock); + + if (rq) + rq_hierarchy_end_io_acct(rq, STAGE_KYBER); return rq; }
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 40812dba1337..b49b6ec475a9 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -400,6 +400,9 @@ enum stage_group { #endif #if IS_ENABLED(CONFIG_IOSCHED_BFQ) STAGE_BFQ, +#endif +#if IS_ENABLED(CONFIG_MQ_IOSCHED_KYBER) + STAGE_KYBER, #endif NR_RQ_STAGE_GROUPS, NR_STAGE_GROUPS,
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IAGRKP CVE: NA
--------------------------------
As with blk-throttle, the following new debugfs entries will be created as well:
/sys/kernel/debug/block/sda/blk_io_hierarchy/ |-- hctx | |-- io_dump | |-- stats | `-- threshold
Users can use them to analyze how IO behaves in the hctx stage.
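The hctx stage measures how long requests sit on the hctx dispatch list, i.e. the fallback path taken when requests cannot be dispatched immediately or a flush sequence parks them. The pairing, condensed from the hunks below:

  /* start: requests are parked on hctx->dispatch */
  rq_list_hierarchy_start_io_acct(list, STAGE_HCTX);
  spin_lock(&hctx->lock);
  list_splice_tail_init(list, &hctx->dispatch);
  spin_unlock(&hctx->lock);

  /* end: requests are pulled back off for an actual dispatch attempt */
  rq_list_hierarchy_end_io_acct(&rq_list, STAGE_HCTX);
  blk_mq_dispatch_rq_list(q, &rq_list, false);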
Signed-off-by: Yu Kuai yukuai3@huawei.com --- block/blk-core.c | 1 + block/blk-flush.c | 2 ++ block/blk-io-hierarchy/Kconfig | 10 ++++++++++ block/blk-io-hierarchy/debugfs.c | 3 +++ block/blk-mq-sched.c | 2 ++ block/blk-mq.c | 3 +++ block/blk-sysfs.c | 1 + include/linux/blk_types.h | 1 + 8 files changed, 23 insertions(+)
diff --git a/block/blk-core.c b/block/blk-core.c index ae7785661722..c9f75d0dcdbe 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1006,6 +1006,7 @@ static void blk_mq_unregister_default_hierarchy(struct request_queue *q) { blk_mq_unregister_hierarchy(q, STAGE_GETTAG); blk_mq_unregister_hierarchy(q, STAGE_PLUG); + blk_mq_unregister_hierarchy(q, STAGE_HCTX); }
/** diff --git a/block/blk-flush.c b/block/blk-flush.c index 6f08e1d87f47..f3944533307c 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -188,6 +188,7 @@ static bool blk_flush_complete_seq(struct request *rq, if (list_empty(pending)) fq->flush_pending_since = jiffies; list_move_tail(&rq->flush.list, pending); + rq_hierarchy_start_io_acct(rq, STAGE_HCTX); break;
case REQ_FSEQ_DATA: @@ -276,6 +277,7 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error) unsigned int seq = blk_flush_cur_seq(rq);
BUG_ON(seq != REQ_FSEQ_PREFLUSH && seq != REQ_FSEQ_POSTFLUSH); + rq_hierarchy_end_io_acct(rq, STAGE_HCTX); queued |= blk_flush_complete_seq(rq, fq, seq, error); }
diff --git a/block/blk-io-hierarchy/Kconfig b/block/blk-io-hierarchy/Kconfig index bc4f7ae14572..fb597c58e2f4 100644 --- a/block/blk-io-hierarchy/Kconfig +++ b/block/blk-io-hierarchy/Kconfig @@ -101,4 +101,14 @@ config HIERARCHY_KYBER
If unsure, say N.
+config HIERARCHY_HCTX + bool "Enable hierarchy stats layer hctx" + default n + help + Enabling this lets blk hierarchy stats to record additional information + for hctx. Such information can be helpful to debug performance + and problems like io hang. + + If unsure, say N. + endif diff --git a/block/blk-io-hierarchy/debugfs.c b/block/blk-io-hierarchy/debugfs.c index f174acd9406e..312756b25e31 100644 --- a/block/blk-io-hierarchy/debugfs.c +++ b/block/blk-io-hierarchy/debugfs.c @@ -40,6 +40,9 @@ static const char *stage_name[NR_STAGE_GROUPS] = { #ifdef CONFIG_HIERARCHY_KYBER [STAGE_KYBER] = "kyber", #endif +#ifdef CONFIG_HIERARCHY_HCTX + [STAGE_HCTX] = "hctx", +#endif };
const char *hierarchy_stage_name(enum stage_group stage) diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 52b119cc6616..1c8befbe7b69 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -15,6 +15,7 @@ #include "blk-mq-sched.h" #include "blk-mq-tag.h" #include "blk-wbt.h" +#include "blk-io-hierarchy/stats.h"
void blk_mq_sched_free_hctx_data(struct request_queue *q, void (*exit)(struct blk_mq_hw_ctx *)) @@ -250,6 +251,7 @@ int __blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) */ if (!list_empty(&rq_list)) { blk_mq_sched_mark_restart_hctx(hctx); + rq_list_hierarchy_end_io_acct(&rq_list, STAGE_HCTX); if (blk_mq_dispatch_rq_list(q, &rq_list, false)) { if (has_sched_dispatch) ret = blk_mq_do_dispatch_sched(hctx); diff --git a/block/blk-mq.c b/block/blk-mq.c index 0926601c6a60..cb801a98a8f2 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1323,6 +1323,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, if (!list_empty(list)) { bool needs_restart;
+ rq_list_hierarchy_start_io_acct(list, STAGE_HCTX); spin_lock(&hctx->lock); list_splice_tail_init(list, &hctx->dispatch); spin_unlock(&hctx->lock); @@ -1732,6 +1733,7 @@ void blk_mq_request_bypass_insert(struct request *rq, bool at_head, struct blk_mq_ctx *ctx = rq->mq_ctx; struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(rq->q, ctx->cpu);
+ rq_hierarchy_start_io_acct(rq, STAGE_HCTX); spin_lock(&hctx->lock); if (at_head) list_add(&rq->queuelist, &hctx->dispatch); @@ -2334,6 +2336,7 @@ static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node) if (list_empty(&tmp)) return 0;
+ rq_list_hierarchy_start_io_acct(&tmp, STAGE_HCTX); spin_lock(&hctx->lock); list_splice_tail_init(&tmp, &hctx->dispatch); spin_unlock(&hctx->lock); diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index b1665e507417..f309b99b11dc 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -932,6 +932,7 @@ static void blk_mq_register_default_hierarchy(struct request_queue *q)
blk_mq_register_hierarchy(q, STAGE_GETTAG); blk_mq_register_hierarchy(q, STAGE_PLUG); + blk_mq_register_hierarchy(q, STAGE_HCTX); }
/** diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index b49b6ec475a9..4e20306b9ef7 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -404,6 +404,7 @@ enum stage_group { #if IS_ENABLED(CONFIG_MQ_IOSCHED_KYBER) STAGE_KYBER, #endif + STAGE_HCTX, NR_RQ_STAGE_GROUPS, NR_STAGE_GROUPS, };
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IAGRKP CVE: NA
--------------------------------
As with blk-throttle, the following new debugfs entries will be created as well:
/sys/kernel/debug/block/sda/blk_io_hierarchy/ |-- requeue | |-- io_dump | |-- stats | `-- threshold
Users can use them to analyze how IO behaves in the requeue stage.
Signed-off-by: Yu Kuai yukuai3@huawei.com --- block/blk-core.c | 1 + block/blk-io-hierarchy/Kconfig | 10 ++++++++++ block/blk-io-hierarchy/debugfs.c | 3 +++ block/blk-mq.c | 2 ++ block/blk-sysfs.c | 1 + include/linux/blk_types.h | 1 + 6 files changed, 18 insertions(+)
diff --git a/block/blk-core.c b/block/blk-core.c index c9f75d0dcdbe..f19b35f90800 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1007,6 +1007,7 @@ static void blk_mq_unregister_default_hierarchy(struct request_queue *q) blk_mq_unregister_hierarchy(q, STAGE_GETTAG); blk_mq_unregister_hierarchy(q, STAGE_PLUG); blk_mq_unregister_hierarchy(q, STAGE_HCTX); + blk_mq_unregister_hierarchy(q, STAGE_REQUEUE); }
/** diff --git a/block/blk-io-hierarchy/Kconfig b/block/blk-io-hierarchy/Kconfig index fb597c58e2f4..1d8a781167da 100644 --- a/block/blk-io-hierarchy/Kconfig +++ b/block/blk-io-hierarchy/Kconfig @@ -111,4 +111,14 @@ config HIERARCHY_HCTX
If unsure, say N.
+config HIERARCHY_REQUEUE + bool "Enable hierarchy stats layer requeue" + default n + help + Enabling this lets blk hierarchy stats to record additional information + for requeue. Such information can be helpful to debug performance + and problems like io hang. + + If unsure, say N. + endif diff --git a/block/blk-io-hierarchy/debugfs.c b/block/blk-io-hierarchy/debugfs.c index 312756b25e31..00e225860842 100644 --- a/block/blk-io-hierarchy/debugfs.c +++ b/block/blk-io-hierarchy/debugfs.c @@ -43,6 +43,9 @@ static const char *stage_name[NR_STAGE_GROUPS] = { #ifdef CONFIG_HIERARCHY_HCTX [STAGE_HCTX] = "hctx", #endif +#ifdef CONFIG_HIERARCHY_REQUEUE + [STAGE_REQUEUE] = "requeue", +#endif };
const char *hierarchy_stage_name(enum stage_group stage) diff --git a/block/blk-mq.c b/block/blk-mq.c index cb801a98a8f2..9b30d7bb27a2 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -793,6 +793,7 @@ static void blk_mq_requeue_work(struct work_struct *work) spin_lock_irq(&q->requeue_lock); list_splice_init(&q->requeue_list, &rq_list); spin_unlock_irq(&q->requeue_lock); + rq_list_hierarchy_end_io_acct(&rq_list, STAGE_REQUEUE);
list_for_each_entry_safe(rq, next, &rq_list, queuelist) { if (!(rq->rq_flags & (RQF_SOFTBARRIER | RQF_DONTPREP))) @@ -832,6 +833,7 @@ void blk_mq_add_to_requeue_list(struct request *rq, bool at_head, */ BUG_ON(rq->rq_flags & RQF_SOFTBARRIER);
+ rq_hierarchy_start_io_acct(rq, STAGE_REQUEUE); spin_lock_irqsave(&q->requeue_lock, flags); if (at_head) { rq->rq_flags |= RQF_SOFTBARRIER; diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index f309b99b11dc..1ffe7c2fa4b6 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -933,6 +933,7 @@ static void blk_mq_register_default_hierarchy(struct request_queue *q) blk_mq_register_hierarchy(q, STAGE_GETTAG); blk_mq_register_hierarchy(q, STAGE_PLUG); blk_mq_register_hierarchy(q, STAGE_HCTX); + blk_mq_register_hierarchy(q, STAGE_REQUEUE); }
/** diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 4e20306b9ef7..0f664c7104b0 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -405,6 +405,7 @@ enum stage_group { STAGE_KYBER, #endif STAGE_HCTX, + STAGE_REQUEUE, NR_RQ_STAGE_GROUPS, NR_STAGE_GROUPS, };
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IAGRKP CVE: NA
--------------------------------
As with blk-throttle, the following new debugfs entries will be created as well:
/sys/kernel/debug/block/sda/blk_io_hierarchy/ |-- rq_driver | |-- io_dump | |-- stats | `-- threshold
Users can use them to analyze how IO behaves in the rq_driver stage.
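The rq_driver stage brackets the window between blk_mq_start_request() and completion, with two corner cases handled in the hunks below: a flush request with data completes twice (once for the data part, once for the flush part), and a request that was never started must not have its accounting ended. Condensed:

  /* in mq_flush_data_end_io(): the data part is done, remember that */
  blk_rq_hierarchy_set_flush_done(rq);

  /* in __blk_mq_end_request(): end accounting exactly once */
  if (blk_mq_request_started(rq) && !blk_rq_hierarchy_is_flush_done(rq))
  	rq_hierarchy_end_io_acct(rq, STAGE_RQ_DRIVER);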
Signed-off-by: Yu Kuai yukuai3@huawei.com --- block/blk-core.c | 1 + block/blk-flush.c | 2 ++ block/blk-io-hierarchy/Kconfig | 10 ++++++++++ block/blk-io-hierarchy/debugfs.c | 3 +++ block/blk-mq.c | 8 ++++++++ block/blk-sysfs.c | 1 + include/linux/blk_types.h | 1 + 7 files changed, 26 insertions(+)
diff --git a/block/blk-core.c b/block/blk-core.c index f19b35f90800..a72ceb688f59 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1008,6 +1008,7 @@ static void blk_mq_unregister_default_hierarchy(struct request_queue *q) blk_mq_unregister_hierarchy(q, STAGE_PLUG); blk_mq_unregister_hierarchy(q, STAGE_HCTX); blk_mq_unregister_hierarchy(q, STAGE_REQUEUE); + blk_mq_unregister_hierarchy(q, STAGE_RQ_DRIVER); }
/** diff --git a/block/blk-flush.c b/block/blk-flush.c index f3944533307c..bc65d38eea4a 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -455,6 +455,8 @@ static void mq_flush_data_end_io(struct request *rq, blk_status_t error) blk_mq_put_driver_tag_hctx(hctx, rq); }
+ blk_rq_hierarchy_set_flush_done(rq); + /* * After populating an empty queue, kick it to avoid stall. Read * the comment in flush_end_io(). diff --git a/block/blk-io-hierarchy/Kconfig b/block/blk-io-hierarchy/Kconfig index 1d8a781167da..2c2055a4502e 100644 --- a/block/blk-io-hierarchy/Kconfig +++ b/block/blk-io-hierarchy/Kconfig @@ -121,4 +121,14 @@ config HIERARCHY_REQUEUE
If unsure, say N.
+config HIERARCHY_RQ_DRIVER + bool "Enable hierarchy stats layer rq_driver" + default n + help + Enabling this lets blk hierarchy stats to record additional information + for rq_driver. Such information can be helpful to debug performance + and problems like io hang. + + If unsure, say N. + endif diff --git a/block/blk-io-hierarchy/debugfs.c b/block/blk-io-hierarchy/debugfs.c index 00e225860842..7a386133da94 100644 --- a/block/blk-io-hierarchy/debugfs.c +++ b/block/blk-io-hierarchy/debugfs.c @@ -46,6 +46,9 @@ static const char *stage_name[NR_STAGE_GROUPS] = { #ifdef CONFIG_HIERARCHY_REQUEUE [STAGE_REQUEUE] = "requeue", #endif +#ifdef CONFIG_HIERARCHY_RQ_DRIVER + [STAGE_RQ_DRIVER] = "rq_driver", +#endif };
const char *hierarchy_stage_name(enum stage_group stage) diff --git a/block/blk-mq.c b/block/blk-mq.c index 9b30d7bb27a2..80df133486ad 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -589,6 +589,12 @@ inline void __blk_mq_end_request(struct request *rq, blk_status_t error) blk_stat_add(rq, now); }
+ /* + * Avoid accounting flush request with data twice and request that is + * not started. + */ + if (blk_mq_request_started(rq) && !blk_rq_hierarchy_is_flush_done(rq)) + rq_hierarchy_end_io_acct(rq, STAGE_RQ_DRIVER); blk_account_io_done(rq, now);
if (rq->end_io) { @@ -728,6 +734,7 @@ void blk_mq_start_request(struct request *rq) blk_mq_sched_started_request(rq);
trace_block_rq_issue(q, rq); + rq_hierarchy_start_io_acct(rq, STAGE_RQ_DRIVER);
if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) { rq->io_start_time_ns = blk_time_get_ns(); @@ -766,6 +773,7 @@ static void __blk_mq_requeue_request(struct request *rq) if (blk_mq_request_started(rq)) { WRITE_ONCE(rq->state, MQ_RQ_IDLE); rq->rq_flags &= ~RQF_TIMED_OUT; + rq_hierarchy_end_io_acct(rq, STAGE_RQ_DRIVER); if (q->dma_drain_size && blk_rq_bytes(rq)) rq->nr_phys_segments--; } diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 1ffe7c2fa4b6..93af23610f98 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -934,6 +934,7 @@ static void blk_mq_register_default_hierarchy(struct request_queue *q) blk_mq_register_hierarchy(q, STAGE_PLUG); blk_mq_register_hierarchy(q, STAGE_HCTX); blk_mq_register_hierarchy(q, STAGE_REQUEUE); + blk_mq_register_hierarchy(q, STAGE_RQ_DRIVER); }
/** diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 0f664c7104b0..04cc9b6bd524 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -406,6 +406,7 @@ enum stage_group { #endif STAGE_HCTX, STAGE_REQUEUE, + STAGE_RQ_DRIVER, NR_RQ_STAGE_GROUPS, NR_STAGE_GROUPS, };
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IAGRKP CVE: NA
--------------------------------
As with blk-throttle, the following new debugfs entries will be created as well:
/sys/kernel/debug/block/sda/blk_io_hierarchy/ |-- bio | |-- io_dump | |-- stats | `-- threshold
Users can use them to analyze how IO behaves across the whole bio lifetime.
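With every patch in this series applied, the top-level directory gives a one-stop view of all registered stages; an illustrative listing, where the exact set of entries depends on the enabled HIERARCHY_* Kconfig options and the active elevator:

  $ ls /sys/kernel/debug/block/sda/blk_io_hierarchy/
  bio  gettag  hctx  kyber  plug  requeue  rq_driver  throtl  wbt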
Signed-off-by: Yu Kuai yukuai3@huawei.com --- block/bio.c | 1 + block/blk-core.c | 10 +- block/blk-io-hierarchy/Kconfig | 11 ++ block/blk-io-hierarchy/debugfs.c | 3 + block/blk-io-hierarchy/iodump.c | 241 ++++++++++++++++++++++++++++++- block/blk-io-hierarchy/stats.c | 47 +++++- block/blk-io-hierarchy/stats.h | 52 +++++++ block/blk-mq.c | 4 + block/blk-sysfs.c | 1 + include/linux/blk_types.h | 13 ++ 10 files changed, 380 insertions(+), 3 deletions(-)
diff --git a/block/bio.c b/block/bio.c index c3aeae529dfd..194ba44e9783 100644 --- a/block/bio.c +++ b/block/bio.c @@ -252,6 +252,7 @@ void bio_uninit(struct bio *bio) bio->pid = NULL; } #endif + bio_hierarchy_end(bio); bio_free_hierarchy_data(bio); } EXPORT_SYMBOL(bio_uninit); diff --git a/block/blk-core.c b/block/blk-core.c index a72ceb688f59..0c74101424dc 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -538,8 +538,10 @@ static void req_bio_endio(struct request *rq, struct bio *bio, bio_advance(bio, nbytes);
/* don't actually finish bio if it's part of flush sequence */ - if (bio->bi_iter.bi_size == 0 && !(rq->rq_flags & RQF_FLUSH_SEQ)) + if (bio->bi_iter.bi_size == 0 && !(rq->rq_flags & RQF_FLUSH_SEQ)) { + req_bio_hierarchy_end(rq, bio); bio_endio(bio); + } }
void blk_dump_rq_flags(struct request *rq, char *msg) @@ -2627,6 +2629,12 @@ generic_make_request_checks(struct bio *bio) */ create_io_context(GFP_ATOMIC, q->node);
+ /* + * On the one hand REQ_PREFLUSH | REQ_FUA can be cleared above, on the + * other hand it doesn't make sense to count invalid bio. Split bio will + * be accounted separately. + */ + bio_hierarchy_start(bio); if (!blkcg_bio_issue_check(q, bio)) return false;
diff --git a/block/blk-io-hierarchy/Kconfig b/block/blk-io-hierarchy/Kconfig index 2c2055a4502e..01019f6aa425 100644 --- a/block/blk-io-hierarchy/Kconfig +++ b/block/blk-io-hierarchy/Kconfig @@ -13,6 +13,17 @@ menuconfig BLK_IO_HIERARCHY_STATS
if BLK_IO_HIERARCHY_STATS
+config HIERARCHY_BIO + bool "Support to record stats for bio lifetime" + default n + select BLK_BIO_ALLOC_TIME + help + Enabling this lets blk hierarchy stats to record additional information + for bio. Such information can be helpful to debug performance and + problems like io hang. + + If unsure, say N. + config HIERARCHY_IO_DUMP bool "Support to dump io that is throttled" default n diff --git a/block/blk-io-hierarchy/debugfs.c b/block/blk-io-hierarchy/debugfs.c index 7a386133da94..29c17e116773 100644 --- a/block/blk-io-hierarchy/debugfs.c +++ b/block/blk-io-hierarchy/debugfs.c @@ -49,6 +49,9 @@ static const char *stage_name[NR_STAGE_GROUPS] = { #ifdef CONFIG_HIERARCHY_RQ_DRIVER [STAGE_RQ_DRIVER] = "rq_driver", #endif +#ifdef CONFIG_HIERARCHY_BIO + [STAGE_BIO] = "bio", +#endif };
const char *hierarchy_stage_name(enum stage_group stage) diff --git a/block/blk-io-hierarchy/iodump.c b/block/blk-io-hierarchy/iodump.c index 7e8029dfdd96..90a1a58000d8 100644 --- a/block/blk-io-hierarchy/iodump.c +++ b/block/blk-io-hierarchy/iodump.c @@ -42,6 +42,22 @@ struct rq_dump_data { bool enter_queue; };
+#ifdef CONFIG_HIERARCHY_BIO +struct pos_data { + enum stage_group stage; + unsigned int count; +}; + +struct bio_stage_dump_data { + union { + loff_t pos; + struct pos_data pdata; + }; + struct rq_dump_data rq_ddata; + u64 stat_time; +}; +#endif + int blk_io_hierarchy_iodump_init(struct request_queue *q, struct hierarchy_stage *hstage) { @@ -73,6 +89,23 @@ int blk_io_hierarchy_iodump_init(struct request_queue *q, return 0; }
+#ifdef CONFIG_HIERARCHY_BIO + BUILD_BUG_ON(sizeof(struct pos_data) != sizeof(loff_t)); + + if (hstage->stage == STAGE_BIO) { + struct bio_stage_dump_data *bstage_ddata = + kzalloc(sizeof(*bstage_ddata), GFP_KERNEL); + + if (!bstage_ddata) + return -ENOMEM; + + bstage_ddata->rq_ddata.q = q; + bstage_ddata->rq_ddata.stage = hstage->stage; + hstage->dump_data = bstage_ddata; + return 0; + } +#endif + return -EINVAL; }
@@ -371,7 +404,8 @@ static struct request *hierarchy_find_and_get_rq(struct rq_dump_data *rq_ddata) * fast path to avoid refcount cas operations for the request that * is from other shared request_queue or other stages. */ - if (rq->q != q || READ_ONCE(rq_wrapper->stage) != rq_ddata->stage) + if (rq->q != q || (rq_ddata->stage != STAGE_BIO && + READ_ONCE(rq_wrapper->stage) != rq_ddata->stage)) return NULL;
if (!refcount_inc_not_zero(&rq->ref)) @@ -383,6 +417,9 @@ static struct request *hierarchy_find_and_get_rq(struct rq_dump_data *rq_ddata) return NULL; }
+ if (rq_ddata->stage == STAGE_BIO) + return rq; + /* * Barrier is paired with the smp_store_release() in * rq_hierarchy_start_io_acct(), so that if stage is read, uninitialized @@ -471,6 +508,206 @@ static const struct blk_mq_debugfs_attr hierarchy_rq_dump_attr[] = { {}, };
+#ifdef CONFIG_HIERARCHY_BIO +static struct bio_dump_data *get_bio_stage_ddata(struct request_queue *q, + enum stage_group stage) +{ + struct blk_io_hierarchy_stats *stats = + queue_to_wrapper(q)->io_hierarchy_stats; + struct hierarchy_stage *hstage = READ_ONCE(stats->hstage[stage]); + + if (!hstage) + return NULL; + + return hstage->dump_data; +} + +static void bio_stage_start_next_stage(struct bio_stage_dump_data *bstage_ddata, + loff_t *pos) +{ + struct pos_data *pdata = &bstage_ddata->pdata; + + pdata->stage++; + if (!stage_is_bio(pdata->stage)) + pdata->stage = STAGE_BIO; + pdata->count = 0; + + *pos = bstage_ddata->pos; +} + +static void bio_stage_start_next_io(struct bio_stage_dump_data *bstage_ddata, + loff_t *pos) +{ + struct pos_data *pdata = &bstage_ddata->pdata; + + if (stage_is_bio(pdata->stage)) + pdata->count++; + else + pdata->count = bstage_ddata->rq_ddata.tag; + + *pos = bstage_ddata->pos; +} + +static void __bio_stage_hierarchy_stop(struct bio_stage_dump_data *bstage_ddata) +{ + struct pos_data *pdata = &bstage_ddata->pdata; + struct rq_dump_data *rq_ddata = &bstage_ddata->rq_ddata; + + if (stage_is_bio(pdata->stage)) { + struct bio_dump_data *bio_ddata = + get_bio_stage_ddata(rq_ddata->q, pdata->stage); + + spin_unlock_irq(&bio_ddata->lock); + } + + if (rq_ddata->enter_queue) { + percpu_ref_put(&rq_ddata->q->q_usage_counter); + rq_ddata->enter_queue = false; + } +} + +void *__bio_stage_hierarchy_start(struct bio_stage_dump_data *bstage_ddata, + loff_t *pos) +{ + struct pos_data *pdata = &bstage_ddata->pdata; + struct rq_dump_data *rq_ddata = &bstage_ddata->rq_ddata; + +retry: + if (stage_is_bio(pdata->stage)) { + struct list_head *list; + struct bio_dump_data *bio_ddata = + get_bio_stage_ddata(rq_ddata->q, pdata->stage); + + if (!bio_ddata) { + bio_stage_start_next_stage(bstage_ddata, pos); + goto retry; + } + + spin_lock_irq(&bio_ddata->lock); + list = seq_list_start(&bio_ddata->head, pdata->count); + if (list) + return list; + + spin_unlock_irq(&bio_ddata->lock); + bio_stage_start_next_stage(bstage_ddata, pos); + goto retry; + } + + if (pdata->stage == STAGE_BIO && + __rq_hierarchy_start(rq_ddata, pdata->count)) + return bstage_ddata; + + return NULL; +} + +static void *bio_stage_hierarchy_start(struct seq_file *m, loff_t *pos) +{ + struct hierarchy_stage *hstage = m->private; + struct bio_stage_dump_data *bstage_ddata = hstage->dump_data; + + mutex_lock(&dump_mutex); + bstage_ddata->pos = *pos; + bstage_ddata->stat_time = blk_time_get_ns(); + + return __bio_stage_hierarchy_start(bstage_ddata, pos); +} + +static void *bio_stage_hierarchy_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct hierarchy_stage *hstage = m->private; + struct bio_stage_dump_data *bstage_ddata = hstage->dump_data; + struct rq_dump_data *rq_ddata = &bstage_ddata->rq_ddata; + struct pos_data *pdata = &bstage_ddata->pdata; + + if (stage_is_bio(pdata->stage)) { + struct bio_dump_data *bio_ddata = + get_bio_stage_ddata(rq_ddata->q, pdata->stage); + struct list_head *list = ((struct list_head *)v)->next; + + if (list != &bio_ddata->head) { + bio_stage_start_next_io(bstage_ddata, pos); + return list; + } + + spin_unlock_irq(&bio_ddata->lock); + + bio_stage_start_next_stage(bstage_ddata, pos); + return __bio_stage_hierarchy_start(bstage_ddata, pos); + } + + if (pdata->stage == STAGE_BIO && + __rq_hierarchy_next(rq_ddata)) { + bio_stage_start_next_io(bstage_ddata, pos); + return bstage_ddata; + } + + (*pos)++; + return NULL; +} + +static void bio_stage_hierarchy_stop(struct seq_file *m, void *v) 
+{ + struct hierarchy_stage *hstage = m->private; + struct bio_stage_dump_data *bstage_ddata = hstage->dump_data; + + __bio_stage_hierarchy_stop(bstage_ddata); + mutex_unlock(&dump_mutex); +} + +static int bio_stage_hierarchy_show(struct seq_file *m, void *v) +{ + struct hierarchy_stage *hstage = m->private; + struct bio_stage_dump_data *bstage_ddata = hstage->dump_data; + struct rq_dump_data *rq_ddata = &bstage_ddata->rq_ddata; + struct pos_data *pdata = &bstage_ddata->pdata; + u64 duration; + + if (stage_is_bio(pdata->stage)) { + struct bio_hierarchy_data *data = list_entry( + v, struct bio_hierarchy_data, hierarchy_list); + + duration = get_duration(bstage_ddata->stat_time, + data->bio->bi_alloc_time_ns); + if (hstage->threshold <= ns_to_ms(duration)) + __hierarchy_show_bio(m, data, pdata->stage, duration); + } else if (pdata->stage == STAGE_BIO) { + struct request *rq = hierarchy_find_and_get_rq(rq_ddata); + + if (rq) { + duration = get_duration(bstage_ddata->stat_time, + request_to_wrapper(rq)->bi_alloc_time_ns); + if (hstage->threshold <= ns_to_ms(duration)) + hierarchy_show_rq(m, rq, duration); + blk_mq_put_rq_ref(rq); + } + } + + return 0; +} + +static const struct seq_operations bio_stage_hierarchy_ops = { + .start = bio_stage_hierarchy_start, + .next = bio_stage_hierarchy_next, + .stop = bio_stage_hierarchy_stop, + .show = bio_stage_hierarchy_show, +}; + +static const struct blk_mq_debugfs_attr bio_stage_dump_attr[] = { + { + "io_dump", + 0400, + .seq_ops = &bio_stage_hierarchy_ops, + }, + {}, +}; + +#else /* CONFIG_HIERARCHY_BIO */ +static const struct blk_mq_debugfs_attr bio_stage_dump_attr[] = { + {}, +}; + +#endif + void io_hierarchy_register_iodump(struct hierarchy_stage *hstage) { const struct blk_mq_debugfs_attr *attr; @@ -479,6 +716,8 @@ void io_hierarchy_register_iodump(struct hierarchy_stage *hstage) attr = hierarchy_bio_dump_attr; else if (stage_is_rq(hstage->stage)) attr = hierarchy_rq_dump_attr; + else if (hstage->stage == STAGE_BIO) + attr = bio_stage_dump_attr; else attr = NULL;
diff --git a/block/blk-io-hierarchy/stats.c b/block/blk-io-hierarchy/stats.c index 430c5fd6a3e3..16288db68370 100644 --- a/block/blk-io-hierarchy/stats.c +++ b/block/blk-io-hierarchy/stats.c @@ -254,7 +254,8 @@ static enum stat_group bio_hierarchy_op(struct bio *bio) if (op_is_discard(bio->bi_opf)) return STAT_DISCARD;
- if (op_is_flush(bio->bi_opf) && !bio_sectors(bio)) + if (op_is_flush(bio->bi_opf) && + !(bio_sectors(bio) || (bio->bi_opf & REQ_HAS_DATA))) return STAT_FLUSH;
if (op_is_write(bio->bi_opf)) @@ -388,6 +389,50 @@ void __rq_hierarchy_end_io_acct(struct request *rq, } EXPORT_SYMBOL_GPL(__rq_hierarchy_end_io_acct);
+#ifdef CONFIG_HIERARCHY_BIO +void bio_hierarchy_start(struct bio *bio) +{ + struct request_queue_wrapper *q_wrapper; + struct gendisk *disk = bio->bi_disk; + struct hierarchy_stage *hstage; + + if (bio_flagged(bio, BIO_HIERARCHY_ACCT)) + return; + + if (!blk_mq_hierarchy_registered(disk->queue, STAGE_BIO)) + return; + + bio_set_flag(bio, BIO_HIERARCHY_ACCT); + if (bio_has_data(bio)) + bio->bi_opf |= REQ_HAS_DATA; + q_wrapper = queue_to_wrapper(disk->queue); + hstage = q_wrapper->io_hierarchy_stats->hstage[STAGE_BIO]; + io_hierarchy_inc(hstage->hstats_data, dispatched, + bio_hierarchy_op(bio)); +} + +void __bio_hierarchy_end(struct bio *bio, u64 now) +{ + struct request_queue_wrapper *q_wrapper; + struct gendisk *disk = bio->bi_disk; + struct hierarchy_stage *hstage; + enum stat_group op; + u64 duration; + + op = bio_hierarchy_op(bio); + duration = now - bio->bi_alloc_time_ns; + q_wrapper = queue_to_wrapper(disk->queue); + hstage = q_wrapper->io_hierarchy_stats->hstage[STAGE_BIO]; + + io_hierarchy_inc(hstage->hstats_data, completed, op); + io_hierarchy_add(hstage->hstats_data, nsecs, op, duration); + hierarchy_account_slow_io_ns(hstage, op, duration); + + bio_clear_flag(bio, BIO_HIERARCHY_ACCT); + bio->bi_opf &= ~REQ_HAS_DATA; +} +#endif + static int __init hierarchy_stats_init(void) { hdata_pool = mempool_create_kmalloc_pool(PRE_ALLOC_BIO_CNT, diff --git a/block/blk-io-hierarchy/stats.h b/block/blk-io-hierarchy/stats.h index d77cf2b58ade..c9ce61c824ca 100644 --- a/block/blk-io-hierarchy/stats.h +++ b/block/blk-io-hierarchy/stats.h @@ -203,6 +203,41 @@ static inline bool blk_rq_hierarchy_is_flush_done(struct request *rq) return request_to_wrapper(rq)->flush_done; }
+#ifdef CONFIG_HIERARCHY_BIO +void bio_hierarchy_start(struct bio *bio); +void __bio_hierarchy_end(struct bio *bio, u64 now); + +static inline void bio_hierarchy_end(struct bio *bio) +{ + if (!bio_flagged(bio, BIO_HIERARCHY_ACCT)) + return; + + if (!blk_mq_hierarchy_registered(bio->bi_disk->queue, STAGE_BIO)) + return; + + __bio_hierarchy_end(bio, blk_time_get_ns()); +} + +static inline void req_bio_hierarchy_end(struct request *rq, struct bio *bio) +{ + u64 now; + + if (!bio_flagged(bio, BIO_HIERARCHY_ACCT)) + return; + + if (!blk_mq_hierarchy_registered(bio->bi_disk->queue, STAGE_BIO)) + return; + + now = request_to_wrapper(rq)->io_end_time_ns; + if (!now) { + now = blk_time_get_ns(); + request_to_wrapper(rq)->io_end_time_ns = now; + } + + __bio_hierarchy_end(bio, now); +} +#endif + #else /* CONFIG_BLK_IO_HIERARCHY_STATS */
static inline int @@ -317,4 +352,21 @@ blk_rq_hierarchy_stats_init(struct request *rq)
#endif /* CONFIG_BLK_IO_HIERARCHY_STATS */
+#if !defined(CONFIG_BLK_IO_HIERARCHY_STATS) || !defined(CONFIG_HIERARCHY_BIO) +static inline void +bio_hierarchy_start(struct bio *bio) +{ +} + +static inline void +bio_hierarchy_end(struct bio *bio) +{ +} + +static inline void +req_bio_hierarchy_end(struct request *rq, struct bio *bio) +{ +} +#endif + #endif /* BLK_IO_HIERARCHY_STATS_H */ diff --git a/block/blk-mq.c b/block/blk-mq.c index 80df133486ad..d7d3589c1e5a 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2015,6 +2015,9 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
blk_queue_split(q, &bio);
+ /* account for split bio. */ + bio_hierarchy_start(bio); + if (!bio_integrity_prep(bio)) return BLK_QC_T_NONE;
@@ -2781,6 +2784,7 @@ void blk_mq_release(struct request_queue *q) struct blk_mq_hw_ctx *hctx, *next; int i;
+ blk_mq_unregister_hierarchy(q, STAGE_BIO); blk_io_hierarchy_stats_free(q);
queue_for_each_hw_ctx(q, hctx, i) diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 93af23610f98..719687a394ea 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -935,6 +935,7 @@ static void blk_mq_register_default_hierarchy(struct request_queue *q) blk_mq_register_hierarchy(q, STAGE_HCTX); blk_mq_register_hierarchy(q, STAGE_REQUEUE); blk_mq_register_hierarchy(q, STAGE_RQ_DRIVER); + blk_mq_register_hierarchy(q, STAGE_BIO); }
/** diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 04cc9b6bd524..f77227229f58 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -232,6 +232,12 @@ struct bio { /* * bio flags */ +#ifdef CONFIG_BLK_IO_HIERARCHY_STATS +#define BIO_HIERARCHY_ACCT 0 /* + * This bio has already been subjected to + * blk-io-hierarchy, don't do it again. + */ +#endif #define BIO_SEG_VALID 1 /* bi_phys_segments valid */ #define BIO_CLONED 2 /* doesn't own data */ #define BIO_BOUNCED 3 /* bio is a bounce bio */ @@ -346,6 +352,9 @@ enum req_flag_bits { /* for driver use */ __REQ_DRV, __REQ_SWAP, /* swapping request. */ +#ifdef CONFIG_BLK_IO_HIERARCHY_STATS + _REQ_HAS_DATA, /* io contain data. */ +#endif __REQ_NR_BITS, /* stops here */ };
@@ -368,6 +377,9 @@ enum req_flag_bits {
#define REQ_DRV (1ULL << __REQ_DRV) #define REQ_SWAP (1ULL << __REQ_SWAP) +#ifdef CONFIG_BLK_IO_HIERARCHY_STATS +#define REQ_HAS_DATA (1UL << _REQ_HAS_DATA) +#endif
#define REQ_FAILFAST_MASK \ (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER) @@ -408,6 +420,7 @@ enum stage_group { STAGE_REQUEUE, STAGE_RQ_DRIVER, NR_RQ_STAGE_GROUPS, + STAGE_BIO = NR_RQ_STAGE_GROUPS, NR_STAGE_GROUPS, };
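One subtlety worth spelling out: REQ_HAS_DATA exists because classification happens again at completion time. A flush bio that carried data may reach __bio_hierarchy_end() with bi_size already zero, so bio_sectors() alone would misclassify it as a pure flush; the flag, set in bio_hierarchy_start() while the data was still attached, preserves the original classification. The check, reconstructed from the stats.c hunk above (the read/write fallback at the end is inferred, not shown in the hunk):

  static enum stat_group bio_hierarchy_op(struct bio *bio)
  {
  	if (op_is_discard(bio->bi_opf))
  		return STAT_DISCARD;

  	/* only a bio that never carried data counts as a pure flush */
  	if (op_is_flush(bio->bi_opf) &&
  	    !(bio_sectors(bio) || (bio->bi_opf & REQ_HAS_DATA)))
  		return STAT_FLUSH;

  	if (op_is_write(bio->bi_opf))
  		return STAT_WRITE;

  	return STAT_READ;
  }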
Feedback: The patch(es) you sent to the kernel@openeuler.org mailing list have been converted to a pull request successfully! Pull request link: https://gitee.com/openeuler/kernel/pulls/11005 Mailing list address: https://mailweb.openeuler.org/hyperkitty/list/kernel@openeuler.org/message/W...