hulk inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/IAGRKP
CVE: NA
--------------------------------
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
---
 block/Kconfig                    |   2 +
 block/Makefile                   |   1 +
 block/bfq-iosched.c              |  11 +-
 block/blk-core.c                 |  15 +
 block/blk-flush.c                |   5 +
 block/blk-io-hierarchy/Kconfig   | 156 +++++++
 block/blk-io-hierarchy/Makefile  |   8 +
 block/blk-io-hierarchy/debugfs.c | 230 ++++++++++
 block/blk-io-hierarchy/iodump.c  | 753 +++++++++++++++++++++++++++++++
 block/blk-io-hierarchy/iodump.h  | 100 ++++
 block/blk-io-hierarchy/stats.c   | 331 ++++++++++++++
 block/blk-io-hierarchy/stats.h   | 323 +++++++++++++
 block/blk-mq-debugfs.c           |  16 +-
 block/blk-mq-debugfs.h           |   8 +
 block/blk-mq-sched.c             |   7 +-
 block/blk-mq-tag.c               |  13 +-
 block/blk-mq.c                   |  51 ++-
 block/blk-mq.h                   |  36 +-
 block/blk-sysfs.c                |  16 +
 block/blk-throttle.c             |  21 +
 block/blk-wbt.c                  |  12 +-
 block/blk.h                      |  58 +++
 block/kyber-iosched.c            |   8 +-
 block/mq-deadline.c              |  15 +-
 include/linux/blk_types.h        |  41 +-
 include/linux/blkdev.h           |   9 +
 26 files changed, 2202 insertions(+), 44 deletions(-)
 create mode 100644 block/blk-io-hierarchy/Kconfig
 create mode 100644 block/blk-io-hierarchy/Makefile
 create mode 100644 block/blk-io-hierarchy/debugfs.c
 create mode 100644 block/blk-io-hierarchy/iodump.c
 create mode 100644 block/blk-io-hierarchy/iodump.h
 create mode 100644 block/blk-io-hierarchy/stats.c
 create mode 100644 block/blk-io-hierarchy/stats.h
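A note for reviewers trying out the result: the per-disk blk_io_hierarchy/stats file (created in blk-io-hierarchy/debugfs.c below) emits one line per registered stage with 12 counters (dispatched, completed and cumulative latency for READ, WRITE, DISCARD and FLUSH, in that order), followed by 4 per-op slow-IO counters when CONFIG_HIERARCHY_IO_DUMP is set. Latencies are always reported in nanoseconds; request-based stages convert their jiffies counters via jiffies_to_msecs() * NSEC_PER_MSEC. A hedged userspace sketch of a reader; the path assumes debugfs is mounted at /sys/kernel/debug and that the queue debugfs dir is block/<disk>, which this patch does not change:

/* hypothetical reader for /sys/kernel/debug/block/<disk>/blk_io_hierarchy/stats */
#include <stdio.h>
#include <inttypes.h>

int main(int argc, char **argv)
{
	char path[256], stage[32];
	/* dispatched/completed/latency(ns) for READ, WRITE, DISCARD, FLUSH */
	uint64_t v[12];
	FILE *f;

	snprintf(path, sizeof(path),
		 "/sys/kernel/debug/block/%s/blk_io_hierarchy/stats",
		 argc > 1 ? argv[1] : "sda");
	f = fopen(path, "r");
	if (!f) {
		perror(path);
		return 1;
	}
	while (fscanf(f, "%31s", stage) == 1) {
		int i;

		for (i = 0; i < 12; i++)
			if (fscanf(f, "%" SCNu64, &v[i]) != 1)
				goto out;
		printf("%-10s reads %" PRIu64 "/%" PRIu64 " avg %.1f us, writes %" PRIu64 "/%" PRIu64 " avg %.1f us\n",
		       stage, v[1], v[0], v[1] ? v[2] / 1000.0 / v[1] : 0,
		       v[4], v[3], v[4] ? v[5] / 1000.0 / v[4] : 0);
		/* skip rest of line, incl. optional slow-IO counters (HIERARCHY_IO_DUMP) */
		fscanf(f, "%*[^\n]");
	}
out:
	fclose(f);
	return 0;
}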
diff --git a/block/Kconfig b/block/Kconfig index da71e56f8682..770cd3fa1367 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -213,6 +213,8 @@ config BLK_BIO_DISPATCH_ASYNC feature will require special care in the driver to work. If unsure, say N here.
+source "block/blk-io-hierarchy/Kconfig" + menu "Partition Types"
source "block/partitions/Kconfig" diff --git a/block/Makefile b/block/Makefile index 572b33f32c07..bb711b0c307a 100644 --- a/block/Makefile +++ b/block/Makefile @@ -37,3 +37,4 @@ obj-$(CONFIG_BLK_WBT) += blk-wbt.o obj-$(CONFIG_BLK_DEBUG_FS) += blk-mq-debugfs.o obj-$(CONFIG_BLK_DEBUG_FS_ZONED)+= blk-mq-debugfs-zoned.o obj-$(CONFIG_BLK_SED_OPAL) += sed-opal.o +obj-$(CONFIG_BLK_IO_HIERARCHY_STATS) += blk-io-hierarchy/ diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 473d9e31ff87..2cb1bca71d39 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -140,6 +140,7 @@ #include "blk-mq-sched.h" #include "bfq-iosched.h" #include "blk-wbt.h" +#include "blk-io-hierarchy/stats.h"
#define BFQ_BFQQ_FNS(name) \ void bfq_mark_bfqq_##name(struct bfq_queue *bfqq) \ @@ -1882,8 +1883,10 @@ static bool bfq_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio) ret = blk_mq_sched_try_merge(q, bio, &free);
spin_unlock_irq(&bfqd->lock); - if (free) + if (free) { + rq_hierarchy_end_io_acct(free, STAGE_BFQ); blk_mq_free_request(free); + }
return ret; } @@ -4168,6 +4171,8 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) idle_timer_disabled ? in_serv_queue : NULL, idle_timer_disabled);
+ if (rq) + rq_hierarchy_end_io_acct(rq, STAGE_BFQ); return rq; }
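The bfq hooks in the hunks above and below follow a single invariant: a request enters the elevator stage when it is inserted and must leave it on every exit path, whether it is dispatched or freed after a merge. A minimal sketch of that pattern (not part of the patch) for a hypothetical elevator with a made-up STAGE_FOO, using only the APIs this patch adds in blk-io-hierarchy/stats.h; the stage must also be registered in ->init_sched() and unregistered in ->exit_sched(), as bfq_init_queue()/bfq_exit_queue() do below:

static void foo_insert_requests(struct blk_mq_hw_ctx *hctx,
				struct list_head *list, bool at_head)
{
	/* every request entering the scheduler starts stage accounting */
	rq_list_hierarchy_start_io_acct(list, STAGE_FOO);
	/* ... actually queue the requests ... */
}

static struct request *foo_dispatch_request(struct blk_mq_hw_ctx *hctx)
{
	struct request *rq = foo_pick_request(hctx);	/* hypothetical helper */

	/* ... and ends it on every exit path: dispatch ... */
	if (rq)
		rq_hierarchy_end_io_acct(rq, STAGE_FOO);
	return rq;
}

static void foo_free_merged_request(struct request *rq)
{
	/* ... or being freed after a merge, as in bfq_bio_merge() above */
	rq_hierarchy_end_io_acct(rq, STAGE_FOO);
	blk_mq_free_request(rq);
}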
@@ -4750,6 +4755,7 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, spin_lock_irq(&bfqd->lock); if (blk_mq_sched_try_insert_merge(q, rq, &free)) { spin_unlock_irq(&bfqd->lock); + rq_list_hierarchy_end_io_acct(&free, STAGE_BFQ); blk_mq_free_requests(&free); return; } @@ -4797,6 +4803,7 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx, struct list_head *list, bool at_head) { + rq_list_hierarchy_start_io_acct(list, STAGE_BFQ); while (!list_empty(list)) { struct request *rq;
@@ -5394,6 +5401,7 @@ static void bfq_exit_queue(struct elevator_queue *e) struct bfq_queue *bfqq, *n; struct request_queue *q = bfqd->queue;
+ blk_mq_unregister_hierarchy(q, STAGE_BFQ); hrtimer_cancel(&bfqd->idle_slice_timer);
spin_lock_irq(&bfqd->lock); @@ -5560,6 +5568,7 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group);
wbt_disable_default(q); + blk_mq_register_hierarchy(q, STAGE_BFQ); return 0;
out_free: diff --git a/block/blk-core.c b/block/blk-core.c index acf5585b0557..03b8c2367164 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -43,6 +43,7 @@ #include "blk-mq.h" #include "blk-mq-sched.h" #include "blk-rq-qos.h" +#include "blk-io-hierarchy/stats.h"
#ifdef CONFIG_DEBUG_FS struct dentry *blk_debugfs_root; @@ -1001,6 +1002,15 @@ void blk_exit_queue(struct request_queue *q) bdi_put(q->backing_dev_info); }
+static void blk_mq_unregister_default_hierarchy(struct request_queue *q) +{ + blk_mq_unregister_hierarchy(q, STAGE_GETTAG); + blk_mq_unregister_hierarchy(q, STAGE_PLUG); + blk_mq_unregister_hierarchy(q, STAGE_HCTX); + blk_mq_unregister_hierarchy(q, STAGE_REQUEUE); + blk_mq_unregister_hierarchy(q, STAGE_RQ_DRIVER); +} + /** * blk_cleanup_queue - shutdown a request queue * @q: request queue to shutdown @@ -1088,6 +1098,7 @@ void blk_cleanup_queue(struct request_queue *q) blk_exit_queue(q);
if (q->mq_ops) { + blk_mq_unregister_default_hierarchy(q); blk_mq_cancel_work_sync(q); blk_mq_exit_queue(q); } @@ -3919,6 +3930,8 @@ void blk_start_plug(struct blk_plug *plug) INIT_LIST_HEAD(&plug->list); INIT_LIST_HEAD(&plug->mq_list); INIT_LIST_HEAD(&plug->cb_list); + plug->cur_ktime = 0; + /* * Store ordering should not be needed here, since a potential * preempt will imply a full memory barrier @@ -4060,6 +4073,8 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule) */ if (q) queue_unplugged(q, depth, from_schedule); + + plug->cur_ktime = 0; }
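The new plug->cur_ktime field is only initialized and cleared in the hunks above; its consumer, blk_time_get_ns(), is added in blk.h (see the diffstat) and is not visible in this section. Presumably it follows the mainline helper of the same name: cache the clock in the current plug so the hot paths that now timestamp every bio and request do not call ktime_get_ns() repeatedly. A simplified sketch of that assumed semantics:

/* assumed semantics of blk_time_get_ns(); the real definition is in blk.h */
static inline u64 blk_time_get_ns(void)
{
	struct blk_plug *plug = current->plug;

	if (!plug)
		return ktime_get_ns();

	/*
	 * 0 means "no cached value": blk_start_plug() and
	 * blk_flush_plug_list() reset it (the hunks above) so a stale
	 * timestamp never survives across an unplug.
	 */
	if (!plug->cur_ktime)
		plug->cur_ktime = ktime_get_ns();

	return plug->cur_ktime;
}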
void blk_finish_plug(struct blk_plug *plug) diff --git a/block/blk-flush.c b/block/blk-flush.c index c1bfcde165af..384fce3b6bf6 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -75,6 +75,7 @@ #include "blk-mq.h" #include "blk-mq-tag.h" #include "blk-mq-sched.h" +#include "blk-io-hierarchy/stats.h"
/* PREFLUSH/FUA sequences */ enum { @@ -187,6 +188,7 @@ static bool blk_flush_complete_seq(struct request *rq, if (list_empty(pending)) fq->flush_pending_since = jiffies; list_move_tail(&rq->flush.list, pending); + rq_hierarchy_start_io_acct(rq, STAGE_HCTX); break;
case REQ_FSEQ_DATA: @@ -245,6 +247,7 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error) * avoiding use-after-free. */ WRITE_ONCE(flush_rq->state, MQ_RQ_IDLE); + blk_mq_put_alloc_task(flush_rq); if (fq->rq_status != BLK_STS_OK) { error = fq->rq_status; fq->rq_status = BLK_STS_OK; @@ -274,6 +277,7 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error) unsigned int seq = blk_flush_cur_seq(rq);
BUG_ON(seq != REQ_FSEQ_PREFLUSH && seq != REQ_FSEQ_POSTFLUSH); + rq_hierarchy_end_io_acct(rq, STAGE_HCTX); queued |= blk_flush_complete_seq(rq, fq, seq, error); }
@@ -377,6 +381,7 @@ static bool blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq, flush_rq->rq_flags |= RQF_FLUSH_SEQ; flush_rq->rq_disk = first_rq->rq_disk; flush_rq->end_io = flush_end_io; + blk_mq_get_alloc_task(flush_rq, first_rq->bio);
/* * Order WRITE ->end_io and WRITE rq->ref, and its pair is the one diff --git a/block/blk-io-hierarchy/Kconfig b/block/blk-io-hierarchy/Kconfig new file mode 100644 index 000000000000..ce72d0593fce --- /dev/null +++ b/block/blk-io-hierarchy/Kconfig @@ -0,0 +1,156 @@ +# SPDX-License-Identifier: GPL-2.0-only + +menuconfig BLK_IO_HIERARCHY_STATS + bool "Enable hierarchy io stats" + default n + depends on BLK_DEBUG_FS=y + help + Enabling this lets the block layer to record additional information + in different io stages. Such information can be helpful to debug + performance and problems like io hang. + + If unsure, say N. + +if BLK_IO_HIERARCHY_STATS + +config HIERARCHY_BIO + bool "Support to record stats for bio lifetime" + default n + select BLK_BIO_ALLOC_TIME + help + Enabling this lets blk hierarchy stats to record additional information + for bio. Such information can be helpful to debug performance and + problems like io hang. + + If unsure, say N. + +config HIERARCHY_IO_DUMP + bool "Support to dump io that is throttled" + default n + select BLK_BIO_ALLOC_TIME + select BLK_BIO_ALLOC_TASK + depends on BLK_DEV_IO_TRACE + help + Enable this will create new debugfs entries to show user the detailed + information of IO that are submitted and not done yet, and user can + filter the result by IO stage or IO latency. + + If unsure, say N. + +config HIERARCHY_THROTTLE + bool "Enable hierarchy stats layer blk-throttle" + default n + depends on BLK_DEV_THROTTLING=y + help + Enabling this lets blk hierarchy stats to record additional information + for blk-throttle. Such information can be helpful to debug performance + and problems like io hang. + + If unsure, say N. + +config HIERARCHY_WBT + bool "Enable hierarchy stats layer blk-wbt" + default n + depends on BLK_WBT + help + Enabling this lets blk hierarchy stats to record additional information + for blk-wbt. Such information can be helpful to debug performance + and problems like io hang. + + If unsure, say N. + +config HIERARCHY_IOCOST + bool "Enable hierarchy stats layer blk-iocost" + default n + depends on BLK_CGROUP_IOCOST + help + Enabling this lets blk hierarchy stats to record additional information + for blk-iocost. Such information can be helpful to debug performance + and problems like io hang. + + If unsure, say N. + +config HIERARCHY_GETTAG + bool "Enable hierarchy stats layer get-tag" + default n + help + Enabling this lets blk hierarchy stats to record additional information + for getting tag. Such information can be helpful to debug performance + and problems like io hang. + + If unsure, say N. + +config HIERARCHY_PLUG + bool "Enable hierarchy stats layer plug" + default n + help + Enabling this lets blk hierarchy stats to record additional information + for plug. Such information can be helpful to debug performance + and problems like io hang. + + If unsure, say N. + +config HIERARCHY_DEADLINE + bool "Enable hierarchy stats layer mq-deadline" + default n + depends on MQ_IOSCHED_DEADLINE + help + Enabling this lets blk hierarchy stats to record additional information + for mq-deadline. Such information can be helpful to debug performance + and problems like io hang. + + If unsure, say N. + +config HIERARCHY_BFQ + bool "Enable hierarchy stats layer bfq" + default n + depends on IOSCHED_BFQ + help + Enabling this lets blk hierarchy stats to record additional information + for bfq. Such information can be helpful to debug performance + and problems like io hang. + + If unsure, say N. 
+ +config HIERARCHY_KYBER + bool "Enable hierarchy stats layer kyber" + default n + depends on MQ_IOSCHED_KYBER + help + Enabling this lets blk hierarchy stats record additional information + for kyber. Such information can be helpful to debug performance + issues and problems like io hang. + + If unsure, say N. + +config HIERARCHY_HCTX + bool "Enable hierarchy stats layer hctx" + default n + help + Enabling this lets blk hierarchy stats record additional information + for hctx. Such information can be helpful to debug performance + issues and problems like io hang. + + If unsure, say N. + +config HIERARCHY_REQUEUE + bool "Enable hierarchy stats layer requeue" + default n + help + Enabling this lets blk hierarchy stats record additional information + for requeue. Such information can be helpful to debug performance + issues and problems like io hang. + + If unsure, say N. + +config HIERARCHY_RQ_DRIVER + bool "Enable hierarchy stats layer rq_driver" + default n + help + Enabling this lets blk hierarchy stats record additional information + for rq_driver. Such information can be helpful to debug performance + issues and problems like io hang. + + If unsure, say N. + +endif diff --git a/block/blk-io-hierarchy/Makefile b/block/blk-io-hierarchy/Makefile new file mode 100644 index 000000000000..9b989d379e58 --- /dev/null +++ b/block/blk-io-hierarchy/Makefile @@ -0,0 +1,8 @@ +# +# Makefile for blk_io_hierarchy_stats +# + +obj-$(CONFIG_BLK_IO_HIERARCHY_STATS) += blk_io_hierarchy_stats.o + +blk_io_hierarchy_stats-y := stats.o debugfs.o +obj-$(CONFIG_HIERARCHY_IO_DUMP) += iodump.o diff --git a/block/blk-io-hierarchy/debugfs.c b/block/blk-io-hierarchy/debugfs.c new file mode 100644 index 000000000000..cb7ff2866c49 --- /dev/null +++ b/block/blk-io-hierarchy/debugfs.c @@ -0,0 +1,230 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2023. Huawei Technologies Co., Ltd. All rights reserved. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details.
+ */ + +#include <linux/debugfs.h> +#include <linux/blkdev.h> + +#include "../blk-mq-debugfs.h" +#include "stats.h" +#include "iodump.h" + +static const char *stage_name[NR_STAGE_GROUPS] = { +#ifdef CONFIG_HIERARCHY_THROTTLE + [STAGE_THROTTLE] = "throtl", +#endif +#ifdef CONFIG_HIERARCHY_WBT + [STAGE_WBT] = "wbt", +#endif +#ifdef CONFIG_HIERARCHY_IOCOST + [STAGE_IOCOST] = "iocost", +#endif +#ifdef CONFIG_HIERARCHY_GETTAG + [STAGE_GETTAG] = "gettag", +#endif +#ifdef CONFIG_HIERARCHY_PLUG + [STAGE_PLUG] = "plug", +#endif +#ifdef CONFIG_HIERARCHY_DEADLINE + [STAGE_DEADLINE] = "deadline", +#endif +#ifdef CONFIG_HIERARCHY_BFQ + [STAGE_BFQ] = "bfq", +#endif +#ifdef CONFIG_HIERARCHY_KYBER + [STAGE_KYBER] = "kyber", +#endif +#ifdef CONFIG_HIERARCHY_HCTX + [STAGE_HCTX] = "hctx", +#endif +#ifdef CONFIG_HIERARCHY_REQUEUE + [STAGE_REQUEUE] = "requeue", +#endif +#ifdef CONFIG_HIERARCHY_RQ_DRIVER + [STAGE_RQ_DRIVER] = "rq_driver", +#endif +#ifdef CONFIG_HIERARCHY_BIO + [STAGE_BIO] = "bio", +#endif +}; + +const char *hierarchy_stage_name(enum stage_group stage) +{ + return stage_name[stage]; +} + +static int __hierarchy_stats_show(void *data, struct seq_file *m) +{ + struct hierarchy_stage *hstage = data; + u64 dispatched[NEW_NR_STAT_GROUPS] = {0}; + u64 completed[NEW_NR_STAT_GROUPS] = {0}; + u64 latency[NEW_NR_STAT_GROUPS] = {0}; + int cpu; + int i; + + for_each_possible_cpu(cpu) { + struct hierarchy_stats *stat = per_cpu_ptr(hstage->hstats, cpu); + + for (i = 0; i < NEW_NR_STAT_GROUPS; ++i) { + dispatched[i] += stat->dispatched[i]; + completed[i] += stat->completed[i]; + latency[i] += stage_is_rq(hstage->stage) ? + stat->jiffies[i] : stat->nsecs[i]; + } + } + + if (stage_is_rq(hstage->stage)) + for (i = 0; i < NEW_NR_STAT_GROUPS; ++i) + latency[i] = + jiffies_to_msecs(latency[i]) * NSEC_PER_MSEC; + + seq_printf(m, "%llu %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu", + dispatched[STAT_READ], completed[STAT_READ], + latency[STAT_READ], dispatched[STAT_WRITE], + completed[STAT_WRITE], latency[STAT_WRITE], + dispatched[STAT_DISCARD], completed[STAT_DISCARD], + latency[STAT_DISCARD], dispatched[STAT_FLUSH], + completed[STAT_FLUSH], latency[STAT_FLUSH]); + + hierarchy_show_slow_io(hstage, m); + seq_puts(m, "\n"); + return 0; +} + +static void *hierarchy_stats_start(struct seq_file *m, loff_t *pos) +{ + int ret; + enum stage_group stage = *pos; + struct blk_io_hierarchy_stats *stats = m->private; + + ret = blk_queue_enter(stats->q, 0); + if (ret) + return ERR_PTR(ret); + + if (stage < 0 || stage >= NR_STAGE_GROUPS) + return NULL; + + return pos; +} + +static void *hierarchy_stats_next(struct seq_file *m, void *v, loff_t *pos) +{ + enum stage_group stage = ++(*pos); + + if (stage >= 0 && stage < NR_STAGE_GROUPS) + return pos; + + return NULL; +} + +static void hierarchy_stats_stop(struct seq_file *m, void *v) +{ + struct blk_io_hierarchy_stats *stats = m->private; + + if (!IS_ERR(v)) + blk_queue_exit(stats->q); +} + +static int hierarchy_stats_show(struct seq_file *m, void *v) +{ + enum stage_group stage = (*(loff_t *)v); + struct blk_io_hierarchy_stats *stats = m->private; + struct hierarchy_stage *hstage = stats->hstage[stage]; + + if (!hstage) + return 0; + + seq_printf(m, "%s ", hierarchy_stage_name(stage)); + __hierarchy_stats_show(hstage, m); + return 0; +} + +static const struct seq_operations hierarchy_stats_ops = { + .start = hierarchy_stats_start, + .next = hierarchy_stats_next, + .stop = hierarchy_stats_stop, + .show = hierarchy_stats_show, +}; + +static const struct blk_mq_debugfs_attr 
hierarchy_debugfs_attrs[] = { + {"stats", 0400, __hierarchy_stats_show}, + {}, +}; + +static const struct blk_mq_debugfs_attr hierarchy_stats_attr[] = { + {"stats", 0400, .seq_ops = &hierarchy_stats_ops}, + {}, +}; + +static void hierarchy_register_stage(struct blk_io_hierarchy_stats *stats, + enum stage_group stage) +{ + struct hierarchy_stage *hstage = stats->hstage[stage]; + struct dentry *dir; + + if (!stage_name[stage] || hstage->debugfs_dir) + return; + + dir = debugfs_create_dir(stage_name[stage], stats->debugfs_dir); + if (IS_ERR(dir)) + return; + + hstage->debugfs_dir = dir; + debugfs_create_files(dir, hstage, hierarchy_debugfs_attrs); + io_hierarchy_register_iodump(hstage); +} + +static void hierarchy_unregister_stage(struct blk_io_hierarchy_stats *stats, + enum stage_group stage) +{ + struct hierarchy_stage *hstage = stats->hstage[stage]; + + if (!stage_name[stage] || !hstage->debugfs_dir) + return; + + debugfs_remove_recursive(hstage->debugfs_dir); + hstage->debugfs_dir = NULL; +} + +void blk_mq_debugfs_register_hierarchy(struct request_queue *q, + enum stage_group stage) +{ + struct blk_io_hierarchy_stats *stats = q->io_hierarchy_stats; + + if (!blk_mq_hierarchy_registered(q, stage) || + !blk_mq_debugfs_enabled(q)) + return; + + hierarchy_register_stage(stats, stage); +} + +void blk_mq_debugfs_unregister_hierarchy(struct request_queue *q, + enum stage_group stage) +{ + struct blk_io_hierarchy_stats *stats = q->io_hierarchy_stats; + + if (!blk_mq_hierarchy_registered(q, stage) || + !blk_mq_debugfs_enabled(q)) + return; + + hierarchy_unregister_stage(stats, stage); +} + +void blk_mq_debugfs_create_default_hierarchy_attr(struct request_queue *q) +{ + struct blk_io_hierarchy_stats *stats = q->io_hierarchy_stats; + + if (!blk_mq_debugfs_enabled(q)) + return; + + debugfs_create_files(stats->debugfs_dir, stats, hierarchy_stats_attr); +} diff --git a/block/blk-io-hierarchy/iodump.c b/block/blk-io-hierarchy/iodump.c new file mode 100644 index 000000000000..49ad2292873c --- /dev/null +++ b/block/blk-io-hierarchy/iodump.c @@ -0,0 +1,753 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2024. Huawei Technologies Co., Ltd. All rights reserved. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ */ + +#include <linux/seq_file.h> +#include <linux/blktrace_api.h> +#include <linux/blk-cgroup.h> +#include <linux/sched/task.h> + +#include "iodump.h" +#include "../blk.h" +#include "../blk-mq-debugfs.h" + +#define RWB_LEN 6 +#define PATH_LEN 64 +#define ms_to_ns(time) ((time) * NSEC_PER_MSEC) +#define DEFAULT_THRESHOLD 1000 + +static DEFINE_MUTEX(dump_mutex); + +struct bio_dump_data { + u64 stat_time; + struct list_head head; + spinlock_t lock; +}; + +struct rq_dump_data { + struct request_queue *q; + enum stage_group stage; + unsigned int tag; + unsigned int total_tags; + bool shared; + bool has_elevator; + bool enter_queue; +}; + +#ifdef CONFIG_HIERARCHY_BIO +struct pos_data { + enum stage_group stage; + unsigned int count; +}; + +struct bio_stage_dump_data { + union { + loff_t pos; + struct pos_data pdata; + }; + struct rq_dump_data rq_ddata; + u64 stat_time; +}; +#endif + +static struct blk_mq_hw_ctx *queue_hctx(struct request_queue *q, int id) +{ + return q->queue_hw_ctx[id]; } + +int blk_io_hierarchy_iodump_init(struct request_queue *q, + struct hierarchy_stage *hstage) +{ + hstage->threshold = DEFAULT_THRESHOLD; + + if (stage_is_bio(hstage->stage)) { + struct bio_dump_data *bio_ddata = + kmalloc(sizeof(*bio_ddata), GFP_KERNEL); + + if (!bio_ddata) + return -ENOMEM; + + INIT_LIST_HEAD(&bio_ddata->head); + spin_lock_init(&bio_ddata->lock); + hstage->dump_data = bio_ddata; + return 0; + } + + if (stage_is_rq(hstage->stage)) { + struct rq_dump_data *rq_ddata = + kzalloc(sizeof(*rq_ddata), GFP_KERNEL); + + if (!rq_ddata) + return -ENOMEM; + + rq_ddata->q = q; + rq_ddata->stage = hstage->stage; + hstage->dump_data = rq_ddata; + return 0; + } + +#ifdef CONFIG_HIERARCHY_BIO + BUILD_BUG_ON(sizeof(struct pos_data) != sizeof(loff_t)); + + if (hstage->stage == STAGE_BIO) { + struct bio_stage_dump_data *bstage_ddata = + kzalloc(sizeof(*bstage_ddata), GFP_KERNEL); + + if (!bstage_ddata) + return -ENOMEM; + + bstage_ddata->rq_ddata.q = q; + bstage_ddata->rq_ddata.stage = hstage->stage; + hstage->dump_data = bstage_ddata; + return 0; + } +#endif + + return -EINVAL; +} + +void blk_io_hierarchy_iodump_exit(struct request_queue *q, + enum stage_group stage) +{ + struct hierarchy_stage *hstage = q->io_hierarchy_stats->hstage[stage]; + + if (stage_is_bio(hstage->stage)) { + struct bio_dump_data *bio_ddata = hstage->dump_data; + + WARN(!list_empty(&bio_ddata->head), + "blk-io-hierarchy: disk %s stage %s unregistered with throttled IO.\n", + kobject_name(q->kobj.parent), hierarchy_stage_name(stage)); + } + + kfree(hstage->dump_data); + hstage->dump_data = NULL; +} + +void hierarchy_add_bio(struct hierarchy_stage *hstage, struct bio *bio) +{ + unsigned long flags; + struct bio_hierarchy_data *data = bio->hdata; + struct bio_dump_data *bio_ddata = hstage->dump_data; + + spin_lock_irqsave(&bio_ddata->lock, flags); + list_add_tail(&data->hierarchy_list, &bio_ddata->head); + spin_unlock_irqrestore(&bio_ddata->lock, flags); +} + +void hierarchy_remove_bio(struct hierarchy_stage *hstage, struct bio *bio) +{ + unsigned long flags; + struct bio_hierarchy_data *data = bio->hdata; + struct bio_dump_data *bio_ddata = hstage->dump_data; + + spin_lock_irqsave(&bio_ddata->lock, flags); + list_del_init(&data->hierarchy_list); + spin_unlock_irqrestore(&bio_ddata->lock, flags); +} + +void bio_hierarchy_data_init(struct bio *bio, struct bio_hierarchy_data *hdata) +{ + hdata->bio = bio; + INIT_LIST_HEAD(&hdata->hierarchy_list); +} + +static void *bio_hierarchy_list_start(struct seq_file *m, loff_t *pos) +
__acquires(&bio_ddata->lock) +{ + struct hierarchy_stage *hstage = m->private; + struct bio_dump_data *bio_ddata = hstage->dump_data; + + spin_lock_irq(&bio_ddata->lock); + bio_ddata->stat_time = blk_time_get_ns(); + + return seq_list_start(&bio_ddata->head, *pos); +} + +static void *bio_hierarchy_list_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct hierarchy_stage *hstage = m->private; + struct bio_dump_data *bio_ddata = hstage->dump_data; + + return seq_list_next(v, &bio_ddata->head, pos); +} + +static void bio_hierarchy_list_stop(struct seq_file *m, void *v) + __releases(&bio_ddata->lock) +{ + struct hierarchy_stage *hstage = m->private; + struct bio_dump_data *bio_ddata = hstage->dump_data; + + spin_unlock_irq(&bio_ddata->lock); +} + +static void __hierarchy_show_bio(struct seq_file *m, + struct bio_hierarchy_data *data, + enum stage_group stage, u64 duration) +{ + char rwbs[RWB_LEN]; + char path[PATH_LEN] = {0}; + struct bio *bio = data->bio; + struct task_struct *task = get_pid_task(bio->pid, PIDTYPE_PID); + + blk_fill_rwbs(rwbs, bio->bi_opf, bio->bi_iter.bi_size); + cgroup_path(bio->bi_css->cgroup, path, PATH_LEN); + + seq_printf(m, "%s-%d %s stage %s bio %s %lu + %u cgroup %s started %llu ns ago\n", + task ? task->comm : "null", task ? task->pid : 0, + bio->bi_disk->disk_name, hierarchy_stage_name(stage), + rwbs, bio->bi_iter.bi_sector, bio_sectors(bio), path, + duration); + + if (task) + put_task_struct(task); +} + +static u64 get_duration(u64 a, u64 b) +{ + return a > b ? a - b : 0; +} + +static void hierarchy_show_bio(struct seq_file *m, + struct bio_hierarchy_data *data) +{ + u64 duration; + struct hierarchy_stage *hstage = m->private; + struct bio_dump_data *bio_ddata = hstage->dump_data; + + duration = get_duration(bio_ddata->stat_time, data->time); + if (hstage->threshold > ns_to_ms(duration)) + return; + + __hierarchy_show_bio(m, data, hstage->stage, duration); +} + +static int bio_hierarchy_list_show(struct seq_file *m, void *v) +{ + struct bio_hierarchy_data *data = + list_entry(v, struct bio_hierarchy_data, hierarchy_list); + + hierarchy_show_bio(m, data); + return 0; +} + +static const struct seq_operations hierarchy_bio_dump_ops = { + .start = bio_hierarchy_list_start, + .next = bio_hierarchy_list_next, + .stop = bio_hierarchy_list_stop, + .show = bio_hierarchy_list_show, +}; + +static int threshold_show(void *data, struct seq_file *m) +{ + struct hierarchy_stage *hstage = data; + + seq_printf(m, "%lu\n", hstage->threshold); + return 0; +} + +/* + * max size needed by different bases to express U64 + * HEX: "0xFFFFFFFFFFFFFFFF" --> 18 + * DEC: "18446744073709551615" --> 20 + * OCT: "01777777777777777777777" --> 23 + * pick the max one to define MAX_BUF_LEN + */ +#define MAX_BUF_LEN 24 +static ssize_t threshold_store(void *data, const char __user *buf, size_t count, + loff_t *ppos) +{ + int err; + unsigned long val; + char b[MAX_BUF_LEN + 1]; + struct hierarchy_stage *hstage = data; + + if (count > MAX_BUF_LEN) + return -EINVAL; + + if (copy_from_user(b, buf, count)) + return -EFAULT; + + b[count] = 0; + err = kstrtoul(b, 0, &val); + if (!err) + hstage->threshold = val; + + return err ?
err : count; +} + +static void rq_hierarchy_init_dump_data(struct rq_dump_data *rq_ddata) +{ + struct request_queue *q = rq_ddata->q; + + rq_ddata->shared = blk_mq_is_sbitmap_shared(q->tag_set->flags); + rq_ddata->has_elevator = !!q->elevator; + + if (rq_ddata->has_elevator) + rq_ddata->total_tags = q->nr_hw_queues * q->nr_requests; + else + rq_ddata->total_tags = q->nr_hw_queues * q->tag_set->queue_depth; +} + +static bool __rq_hierarchy_start(struct rq_dump_data *rq_ddata, + unsigned int tag) +{ + /* + * Grab .q_usage_counter so request pool won't go away, then no + * request use-after-free is possible during iteration. If queue is + * frozen, there won't be any inflight requests. + */ + if (!percpu_ref_tryget(&rq_ddata->q->q_usage_counter)) { + rq_ddata->enter_queue = false; + return false; + } + + rq_ddata->enter_queue = true; + rq_hierarchy_init_dump_data(rq_ddata); + rq_ddata->tag = tag; + + return tag < rq_ddata->total_tags + rq_ddata->q->nr_hw_queues; +} + +static bool __rq_hierarchy_next(struct rq_dump_data *rq_ddata) +{ + rq_ddata->tag++; + + return rq_ddata->tag < rq_ddata->total_tags + rq_ddata->q->nr_hw_queues; +} + +static void __rq_hierarchy_stop(struct rq_dump_data *rq_ddata) +{ + if (rq_ddata->enter_queue) { + percpu_ref_put(&rq_ddata->q->q_usage_counter); + rq_ddata->enter_queue = false; + } +} + +static void *rq_hierarchy_start(struct seq_file *m, loff_t *pos) + __acquires(&dump_mutex) +{ + struct hierarchy_stage *hstage = m->private; + struct rq_dump_data *rq_ddata = hstage->dump_data; + + mutex_lock(&dump_mutex); + + if (__rq_hierarchy_start(rq_ddata, *pos)) + return rq_ddata; + + return NULL; +} + +static void *rq_hierarchy_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct rq_dump_data *rq_ddata = v; + + if (__rq_hierarchy_next(rq_ddata)) { + *pos = rq_ddata->tag; + return rq_ddata; + } + + (*pos)++; + return NULL; +} + +static void rq_hierarchy_stop(struct seq_file *m, void *v) + __releases(&dump_mutex) +{ + struct hierarchy_stage *hstage = m->private; + struct rq_dump_data *rq_ddata = hstage->dump_data; + + __rq_hierarchy_stop(rq_ddata); + mutex_unlock(&dump_mutex); +} + +static struct request *hierarchy_find_and_get_rq(struct rq_dump_data *rq_ddata) +{ + struct request *rq; + struct blk_mq_hw_ctx *hctx; + struct request_queue *q = rq_ddata->q; + unsigned int nr_tag = rq_ddata->tag; + unsigned int hctx_id; + + if (nr_tag >= rq_ddata->total_tags) { + hctx_id = nr_tag - rq_ddata->total_tags; + if (hctx_id >= q->nr_hw_queues) + return NULL; + + hctx = queue_hctx(q, hctx_id); + rq = hctx->fq->flush_rq; + } else if (rq_ddata->shared) { + return NULL; + } else if (rq_ddata->has_elevator) { + hctx_id = nr_tag / q->nr_requests; + if (hctx_id >= q->nr_hw_queues) + return NULL; + + hctx = queue_hctx(q, hctx_id); + rq = hctx->sched_tags->static_rqs[nr_tag % q->nr_requests]; + } else { + hctx_id = nr_tag / q->tag_set->queue_depth; + if (hctx_id >= q->nr_hw_queues) + return NULL; + + hctx = queue_hctx(q, hctx_id); + if (!hctx->tags) + return NULL; + + rq = hctx->tags->static_rqs[nr_tag % q->tag_set->queue_depth]; + } + + /* + * Fast path to avoid refcount CAS operations for requests that come + * from another shared request_queue or another stage. + */ + if (rq->q != q || (rq_ddata->stage != STAGE_BIO && + READ_ONCE(rq->stage) != rq_ddata->stage)) + return NULL; + + if (!refcount_inc_not_zero(&rq->ref)) + return NULL; + + /* Check again after request is pinned, in case request is reused.
*/ + if (rq->q != q) { + blk_mq_put_rq_ref(rq); + return NULL; + } + + if (rq_ddata->stage == STAGE_BIO) + return rq; + + /* + * Barrier is paired with the smp_store_release() in + * rq_hierarchy_start_io_acct(), so that if stage is read, uninitialized + * hierarchy_time won't be read. + */ + if (smp_load_acquire(&rq->stage) != rq_ddata->stage) { + blk_mq_put_rq_ref(rq); + return NULL; + } + + return rq; +} + +static void hierarchy_show_rq(struct seq_file *m, struct request *rq, + u64 duration) +{ + struct task_struct *task = get_pid_task(rq->pid, PIDTYPE_PID); + const char *name = hierarchy_stage_name(rq->stage); + + seq_printf(m, "%s-%d %s stage %s ", task ? task->comm : "null", + task ? task->pid : 0, + rq->rq_disk ? rq->rq_disk->disk_name : "?", + name ? name : "?"); + debugfs_rq_show(m, rq); + seq_printf(m, " started %llu ns ago}\n", duration); + + if (task) + put_task_struct(task); +} + +static int rq_hierarchy_show(struct seq_file *m, void *v) +{ + u64 duration; + unsigned long htime; + struct hierarchy_stage *hstage = m->private; + struct request *rq = hierarchy_find_and_get_rq(v); + + if (!rq) + return 0; + + htime = READ_ONCE(rq->hierarchy_time); + htime = time_after(jiffies, htime) ? jiffies - htime : 0; + duration = jiffies_to_msecs(htime); + if (hstage->threshold <= duration) + hierarchy_show_rq(m, rq, ms_to_ns(duration)); + + blk_mq_put_rq_ref(rq); + return 0; +} + +static const struct seq_operations hierarchy_rq_dump_ops = { + .start = rq_hierarchy_start, + .next = rq_hierarchy_next, + .stop = rq_hierarchy_stop, + .show = rq_hierarchy_show, +}; + +static const struct blk_mq_debugfs_attr hierarchy_threshold_attr[] = { + { + "threshold", + 0600, + threshold_show, + threshold_store, + }, + {}, +}; + +static const struct blk_mq_debugfs_attr hierarchy_bio_dump_attr[] = { + { + "io_dump", + 0400, + .seq_ops = &hierarchy_bio_dump_ops, + }, + {}, +}; + +static const struct blk_mq_debugfs_attr hierarchy_rq_dump_attr[] = { + { + "io_dump", + 0400, + .seq_ops = &hierarchy_rq_dump_ops, + }, + {}, +}; + +#ifdef CONFIG_HIERARCHY_BIO +static struct bio_dump_data *get_bio_stage_ddata(struct request_queue *q, + enum stage_group stage) +{ + struct blk_io_hierarchy_stats *stats = q->io_hierarchy_stats; + struct hierarchy_stage *hstage = READ_ONCE(stats->hstage[stage]); + + if (!hstage) + return NULL; + + return hstage->dump_data; +} + +static void bio_stage_start_next_stage(struct bio_stage_dump_data *bstage_ddata, + loff_t *pos) +{ + struct pos_data *pdata = &bstage_ddata->pdata; + + pdata->stage++; + if (!stage_is_bio(pdata->stage)) + pdata->stage = STAGE_BIO; + pdata->count = 0; + + *pos = bstage_ddata->pos; +} + +static void bio_stage_start_next_io(struct bio_stage_dump_data *bstage_ddata, + loff_t *pos) +{ + struct pos_data *pdata = &bstage_ddata->pdata; + + if (stage_is_bio(pdata->stage)) + pdata->count++; + else + pdata->count = bstage_ddata->rq_ddata.tag; + + *pos = bstage_ddata->pos; +} + +static void __bio_stage_hierarchy_stop(struct bio_stage_dump_data *bstage_ddata) +{ + struct pos_data *pdata = &bstage_ddata->pdata; + struct rq_dump_data *rq_ddata = &bstage_ddata->rq_ddata; + + if (stage_is_bio(pdata->stage)) { + struct bio_dump_data *bio_ddata = + get_bio_stage_ddata(rq_ddata->q, pdata->stage); + + spin_unlock_irq(&bio_ddata->lock); + } + + if (rq_ddata->enter_queue) { + percpu_ref_put(&rq_ddata->q->q_usage_counter); + rq_ddata->enter_queue = false; + } +} + +void *__bio_stage_hierarchy_start(struct bio_stage_dump_data *bstage_ddata, + loff_t *pos) +{ + struct pos_data *pdata = 
&bstage_ddata->pdata; + struct rq_dump_data *rq_ddata = &bstage_ddata->rq_ddata; + +retry: + if (stage_is_bio(pdata->stage)) { + struct list_head *list; + struct bio_dump_data *bio_ddata = + get_bio_stage_ddata(rq_ddata->q, pdata->stage); + + if (!bio_ddata) { + bio_stage_start_next_stage(bstage_ddata, pos); + goto retry; + } + + spin_lock_irq(&bio_ddata->lock); + list = seq_list_start(&bio_ddata->head, pdata->count); + if (list) + return list; + + spin_unlock_irq(&bio_ddata->lock); + bio_stage_start_next_stage(bstage_ddata, pos); + goto retry; + } + + if (pdata->stage == STAGE_BIO && + __rq_hierarchy_start(rq_ddata, pdata->count)) + return bstage_ddata; + + return NULL; +} + +static void *bio_stage_hierarchy_start(struct seq_file *m, loff_t *pos) +{ + struct hierarchy_stage *hstage = m->private; + struct bio_stage_dump_data *bstage_ddata = hstage->dump_data; + + mutex_lock(&dump_mutex); + bstage_ddata->pos = *pos; + bstage_ddata->stat_time = blk_time_get_ns(); + + return __bio_stage_hierarchy_start(bstage_ddata, pos); +} + +static void *bio_stage_hierarchy_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct hierarchy_stage *hstage = m->private; + struct bio_stage_dump_data *bstage_ddata = hstage->dump_data; + struct rq_dump_data *rq_ddata = &bstage_ddata->rq_ddata; + struct pos_data *pdata = &bstage_ddata->pdata; + + if (stage_is_bio(pdata->stage)) { + struct bio_dump_data *bio_ddata = + get_bio_stage_ddata(rq_ddata->q, pdata->stage); + struct list_head *list = ((struct list_head *)v)->next; + + if (list != &bio_ddata->head) { + bio_stage_start_next_io(bstage_ddata, pos); + return list; + } + + spin_unlock_irq(&bio_ddata->lock); + + bio_stage_start_next_stage(bstage_ddata, pos); + return __bio_stage_hierarchy_start(bstage_ddata, pos); + } + + if (pdata->stage == STAGE_BIO && + __rq_hierarchy_next(rq_ddata)) { + bio_stage_start_next_io(bstage_ddata, pos); + return bstage_ddata; + } + + (*pos)++; + return NULL; +} + +static void bio_stage_hierarchy_stop(struct seq_file *m, void *v) +{ + struct hierarchy_stage *hstage = m->private; + struct bio_stage_dump_data *bstage_ddata = hstage->dump_data; + + __bio_stage_hierarchy_stop(bstage_ddata); + mutex_unlock(&dump_mutex); +} + +static int bio_stage_hierarchy_show(struct seq_file *m, void *v) +{ + struct hierarchy_stage *hstage = m->private; + struct bio_stage_dump_data *bstage_ddata = hstage->dump_data; + struct rq_dump_data *rq_ddata = &bstage_ddata->rq_ddata; + struct pos_data *pdata = &bstage_ddata->pdata; + u64 duration; + + if (stage_is_bio(pdata->stage)) { + struct bio_hierarchy_data *data = list_entry( + v, struct bio_hierarchy_data, hierarchy_list); + + duration = get_duration(bstage_ddata->stat_time, + data->bio->bi_alloc_time_ns); + if (hstage->threshold <= ns_to_ms(duration)) + __hierarchy_show_bio(m, data, pdata->stage, duration); + } else if (pdata->stage == STAGE_BIO) { + struct request *rq = hierarchy_find_and_get_rq(rq_ddata); + + if (rq) { + duration = get_duration(bstage_ddata->stat_time, + rq->bi_alloc_time_ns); + if (hstage->threshold <= ns_to_ms(duration)) + hierarchy_show_rq(m, rq, duration); + blk_mq_put_rq_ref(rq); + } + } + + return 0; +} + +static const struct seq_operations bio_stage_hierarchy_ops = { + .start = bio_stage_hierarchy_start, + .next = bio_stage_hierarchy_next, + .stop = bio_stage_hierarchy_stop, + .show = bio_stage_hierarchy_show, +}; + +static const struct blk_mq_debugfs_attr bio_stage_dump_attr[] = { + { + "io_dump", + 0400, + .seq_ops = &bio_stage_hierarchy_ops, + }, + {}, +}; + +#else /* 
CONFIG_HIERARCHY_BIO */ +static const struct blk_mq_debugfs_attr bio_stage_dump_attr[] = { + {}, +}; + +#endif + +void io_hierarchy_register_iodump(struct hierarchy_stage *hstage) +{ + const struct blk_mq_debugfs_attr *attr; + + if (stage_is_bio(hstage->stage)) + attr = hierarchy_bio_dump_attr; + else if (stage_is_rq(hstage->stage)) + attr = hierarchy_rq_dump_attr; + else if (hstage->stage == STAGE_BIO) + attr = bio_stage_dump_attr; + else + attr = NULL; + + debugfs_create_files(hstage->debugfs_dir, hstage, + hierarchy_threshold_attr); + if (attr) + debugfs_create_files(hstage->debugfs_dir, hstage, attr); +} + +void hierarchy_account_slow_io(struct hierarchy_stage *hstage, + enum stat_group op, unsigned long duration) +{ + if (hstage->threshold <= duration) + this_cpu_inc(hstage->hstats->slow[op]); +} + +void hierarchy_show_slow_io(struct hierarchy_stage *hstage, struct seq_file *m) +{ + u64 slow[NEW_NR_STAT_GROUPS] = {0}; + int cpu; + int i; + + for_each_possible_cpu(cpu) { + struct hierarchy_stats *stat = per_cpu_ptr(hstage->hstats, cpu); + + for (i = 0; i < NEW_NR_STAT_GROUPS; ++i) + slow[i] += stat->slow[i]; + } + + seq_printf(m, " %llu %llu %llu %llu", slow[STAT_READ], slow[STAT_WRITE], + slow[STAT_DISCARD], slow[STAT_FLUSH]); +} diff --git a/block/blk-io-hierarchy/iodump.h b/block/blk-io-hierarchy/iodump.h new file mode 100644 index 000000000000..2f9e159f2588 --- /dev/null +++ b/block/blk-io-hierarchy/iodump.h @@ -0,0 +1,100 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2024. Huawei Technologies Co., Ltd. All rights reserved. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ */ + +#ifndef BLK_IO_HIERARCHY_IODUMP_H +#define BLK_IO_HIERARCHY_IODUMP_H + +#ifdef CONFIG_HIERARCHY_IO_DUMP + +#include "stats.h" + +#define ns_to_ms(time) div_u64(time, NSEC_PER_MSEC) + +int blk_io_hierarchy_iodump_init(struct request_queue *q, + struct hierarchy_stage *hstage); +void blk_io_hierarchy_iodump_exit(struct request_queue *q, + enum stage_group stage); +void hierarchy_add_bio(struct hierarchy_stage *hstage, struct bio *bio); +void hierarchy_remove_bio(struct hierarchy_stage *hstage, struct bio *bio); +void bio_hierarchy_data_init(struct bio *bio, struct bio_hierarchy_data *hdata); +void io_hierarchy_register_iodump(struct hierarchy_stage *hstage); + +void hierarchy_account_slow_io(struct hierarchy_stage *hstage, + enum stat_group op, unsigned long duration); +void hierarchy_show_slow_io(struct hierarchy_stage *hstage, struct seq_file *m); + +static inline void +hierarchy_account_slow_io_ns(struct hierarchy_stage *hstage, + enum stat_group op, u64 duration) +{ + hierarchy_account_slow_io(hstage, op, ns_to_ms(duration)); +} + +static inline void +hierarchy_account_slow_io_jiffies(struct hierarchy_stage *hstage, + enum stat_group op, unsigned long duration) +{ + hierarchy_account_slow_io(hstage, op, jiffies_to_msecs(duration)); +} + +#else +static inline int +blk_io_hierarchy_iodump_init(struct request_queue *q, + struct hierarchy_stage *hstage) +{ + return 0; +} + +static inline void +blk_io_hierarchy_iodump_exit(struct request_queue *q, enum stage_group stage) +{ +} + +static inline void +hierarchy_add_bio(struct hierarchy_stage *hstage, struct bio *bio) +{ +} + +static inline void +hierarchy_remove_bio(struct hierarchy_stage *hstage, struct bio *bio) +{ +} + +static inline void +bio_hierarchy_data_init(struct bio *bio, struct bio_hierarchy_data *hdata) +{ +} + +static inline void +io_hierarchy_register_iodump(struct hierarchy_stage *hstage) +{ +} + +static inline void +hierarchy_account_slow_io_ns(struct hierarchy_stage *hstage, + enum stat_group op, u64 duration) +{ +} + +static inline void +hierarchy_account_slow_io_jiffies(struct hierarchy_stage *hstage, + enum stat_group op, unsigned long duration) +{ +} + +static inline void +hierarchy_show_slow_io(struct hierarchy_stage *hstage, struct seq_file *m) +{ +} +#endif +#endif diff --git a/block/blk-io-hierarchy/stats.c b/block/blk-io-hierarchy/stats.c new file mode 100644 index 000000000000..52a23413f468 --- /dev/null +++ b/block/blk-io-hierarchy/stats.c @@ -0,0 +1,331 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2023. Huawei Technologies Co., Ltd. All rights reserved. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ */ + +#include <linux/module.h> +#include <linux/debugfs.h> + +#include "stats.h" +#include "iodump.h" +#include "../blk.h" +#include "../blk-mq-debugfs.h" + +#define io_hierarchy_add(statsp, field, group, nr) \ + this_cpu_add((statsp)->field[group], nr) +#define io_hierarchy_inc(statsp, field, group) \ + io_hierarchy_add(statsp, field, group, 1) + +#define PRE_ALLOC_BIO_CNT 8 + +static mempool_t *hdata_pool; + +void blk_mq_debugfs_register_hierarchy_stats(struct request_queue *q) +{ + struct blk_io_hierarchy_stats *stats; + enum stage_group stage; + + stats = q->io_hierarchy_stats; + if (!stats || !blk_mq_debugfs_enabled(q)) + return; + + stats->debugfs_dir = debugfs_create_dir("blk_io_hierarchy", + q->debugfs_dir); + blk_mq_debugfs_create_default_hierarchy_attr(q); + + for (stage = 0; stage < NR_STAGE_GROUPS; ++stage) + blk_mq_debugfs_register_hierarchy(q, stage); +} + +static void bio_alloc_hierarchy_data(struct bio *bio) +{ + if (!bio->hdata) { + struct bio_hierarchy_data *hdata = + mempool_alloc(hdata_pool, GFP_NOIO); + + bio_hierarchy_data_init(bio, hdata); + bio->hdata = hdata; + } +} + +void bio_free_hierarchy_data(struct bio *bio) +{ + if (!bio->hdata) + return; + + mempool_free(bio->hdata, hdata_pool); + bio->hdata = NULL; +} + +void blk_mq_debugfs_unregister_hierarchy_stats(struct request_queue *q) +{ + struct blk_io_hierarchy_stats *stats; + enum stage_group stage; + + stats = q->io_hierarchy_stats; + if (!stats || !blk_mq_debugfs_enabled(q)) + return; + + for (stage = 0; stage < NR_STAGE_GROUPS; ++stage) + blk_mq_debugfs_unregister_hierarchy(q, stage); + + debugfs_remove_recursive(stats->debugfs_dir); + stats->debugfs_dir = NULL; +} + +int blk_io_hierarchy_stats_alloc(struct request_queue *q) +{ + struct blk_io_hierarchy_stats *stats; + + if (!q->mq_ops) + return 0; + + stats = kzalloc(sizeof(struct blk_io_hierarchy_stats), GFP_KERNEL); + if (!stats) + return -ENOMEM; + + stats->q = q; + q->io_hierarchy_stats = stats; + + return 0; +} + +void blk_io_hierarchy_stats_free(struct request_queue *q) +{ + struct blk_io_hierarchy_stats *stats = q->io_hierarchy_stats; + + if (!stats) + return; + + q->io_hierarchy_stats = NULL; + kfree(stats); +} + +bool blk_mq_hierarchy_registered(struct request_queue *q, + enum stage_group stage) +{ + struct blk_io_hierarchy_stats *stats = q->io_hierarchy_stats; + + if (!stats) + return false; + + return stats->hstage[stage] != NULL; +} +EXPORT_SYMBOL_GPL(blk_mq_hierarchy_registered); + +void blk_mq_register_hierarchy(struct request_queue *q, enum stage_group stage) +{ + struct blk_io_hierarchy_stats *stats = q->io_hierarchy_stats; + struct hierarchy_stage *hstage; + + if (!stats || !hierarchy_stage_name(stage)) + return; + + if (blk_mq_hierarchy_registered(q, stage)) { + pr_warn("blk-io-hierarchy: disk %s is registering stage %s again.", + kobject_name(q->kobj.parent), + hierarchy_stage_name(stage)); + return; + } + + /* + * Allocate memory before freezing the queue, to prevent deadlock if + * new IO is issued by memory reclaim.
+ */ + hstage = kmalloc(sizeof(*hstage), GFP_KERNEL); + if (!hstage) + return; + + hstage->hstats = alloc_percpu(struct hierarchy_stats); + if (!hstage->hstats) { + kfree(hstage); + return; + } + + hstage->stage = stage; + hstage->debugfs_dir = NULL; + if (blk_io_hierarchy_iodump_init(q, hstage) < 0) { + free_percpu(hstage->hstats); + kfree(hstage); + return; + } + + blk_mq_freeze_queue(q); + + WRITE_ONCE(stats->hstage[stage], hstage); + blk_mq_debugfs_register_hierarchy(q, stage); + + blk_mq_unfreeze_queue(q); +} +EXPORT_SYMBOL_GPL(blk_mq_register_hierarchy); + +void blk_mq_unregister_hierarchy(struct request_queue *q, + enum stage_group stage) +{ + struct blk_io_hierarchy_stats *stats = q->io_hierarchy_stats; + struct hierarchy_stage *hstage; + + if (!blk_mq_hierarchy_registered(q, stage)) + return; + + blk_mq_debugfs_unregister_hierarchy(q, stage); + blk_io_hierarchy_iodump_exit(q, stage); + + hstage = stats->hstage[stage]; + stats->hstage[stage] = NULL; + free_percpu(hstage->hstats); + kfree(hstage); +} +EXPORT_SYMBOL_GPL(blk_mq_unregister_hierarchy); + +static enum stat_group bio_hierarchy_op(struct bio *bio) +{ + if (op_is_discard(bio->bi_opf)) + return STAT_DISCARD; + + if (op_is_flush(bio->bi_opf) && + !(bio_sectors(bio) || bio_flagged(bio, BIO_HAS_DATA))) + return STAT_FLUSH; + + if (op_is_write(bio->bi_opf)) + return STAT_WRITE; + + return STAT_READ; +} + +void bio_hierarchy_start_io_acct(struct bio *bio, enum stage_group stage) +{ + struct request_queue *q = bio->bi_disk->queue; + struct hierarchy_stage *hstage; + + if (!blk_mq_hierarchy_registered(q, stage)) + return; + + hstage = q->io_hierarchy_stats->hstage[stage]; + bio_alloc_hierarchy_data(bio); + io_hierarchy_inc(hstage->hstats, dispatched, bio_hierarchy_op(bio)); + bio->hdata->time = blk_time_get_ns(); + hierarchy_add_bio(hstage, bio); +} + +void __bio_hierarchy_end_io_acct(struct bio *bio, enum stage_group stage, + u64 time) +{ + struct request_queue *q = bio->bi_disk->queue; + struct hierarchy_stage *hstage; + u64 duration; + enum stat_group op; + + if (!blk_mq_hierarchy_registered(q, stage)) + return; + + op = bio_hierarchy_op(bio); + duration = time - bio->hdata->time; + hstage = q->io_hierarchy_stats->hstage[stage]; + + hierarchy_remove_bio(hstage, bio); + io_hierarchy_inc(hstage->hstats, completed, op); + io_hierarchy_add(hstage->hstats, nsecs, op, duration); + hierarchy_account_slow_io_ns(hstage, op, duration); +} + +static enum stat_group rq_hierarchy_op(struct request *rq) +{ + if (op_is_discard(rq->cmd_flags)) + return STAT_DISCARD; + + if (is_flush_rq(rq)) + return STAT_FLUSH; + + if (op_is_write(rq->cmd_flags)) + return STAT_WRITE; + + return STAT_READ; +} + +void __rq_hierarchy_start_io_acct(struct request *rq, + struct hierarchy_stage *hstage) +{ + io_hierarchy_inc(hstage->hstats, dispatched, rq_hierarchy_op(rq)); + WRITE_ONCE(rq->hierarchy_time, jiffies); + + /* + * Paired with the smp_load_acquire() in hierarchy_find_and_get_rq(), + * making sure hierarchy_time is set before stage.
+ */ + smp_store_release(&rq->stage, hstage->stage); +} +EXPORT_SYMBOL_GPL(__rq_hierarchy_start_io_acct); + +void __rq_hierarchy_end_io_acct(struct request *rq, + struct hierarchy_stage *hstage) +{ + unsigned long duration = jiffies - rq->hierarchy_time; + enum stat_group op = rq_hierarchy_op(rq); + + io_hierarchy_inc(hstage->hstats, completed, op); + io_hierarchy_add(hstage->hstats, jiffies, op, duration); + hierarchy_account_slow_io_jiffies(hstage, op, duration); + WRITE_ONCE(rq->stage, NR_RQ_STAGE_GROUPS); +} +EXPORT_SYMBOL_GPL(__rq_hierarchy_end_io_acct); + +#ifdef CONFIG_HIERARCHY_BIO +void bio_hierarchy_start(struct bio *bio) +{ + struct gendisk *disk = bio->bi_disk; + struct hierarchy_stage *hstage; + + if (bio_flagged(bio, BIO_HIERARCHY_ACCT)) + return; + + if (!blk_mq_hierarchy_registered(disk->queue, STAGE_BIO)) + return; + + bio_set_flag(bio, BIO_HIERARCHY_ACCT); + if (bio_has_data(bio)) + bio_set_flag(bio, BIO_HAS_DATA); + hstage = disk->queue->io_hierarchy_stats->hstage[STAGE_BIO]; + io_hierarchy_inc(hstage->hstats, dispatched, bio_hierarchy_op(bio)); +} + +void __bio_hierarchy_end(struct bio *bio, u64 now) +{ + struct gendisk *disk = bio->bi_disk; + struct hierarchy_stage *hstage; + u64 duration; + enum stat_group op; + + op = bio_hierarchy_op(bio); + duration = now - bio->bi_alloc_time_ns; + hstage = disk->queue->io_hierarchy_stats->hstage[STAGE_BIO]; + + io_hierarchy_inc(hstage->hstats, completed, op); + io_hierarchy_add(hstage->hstats, nsecs, op, duration); + hierarchy_account_slow_io_ns(hstage, op, duration); + + bio_clear_flag(bio, BIO_HIERARCHY_ACCT); + bio_clear_flag(bio, BIO_HAS_DATA); +} + +#endif + +static int __init hierarchy_stats_init(void) +{ + hdata_pool = mempool_create_kmalloc_pool(PRE_ALLOC_BIO_CNT, + sizeof(struct bio_hierarchy_data)); + if (!hdata_pool) + panic("Failed to create hdata_pool\n"); + + return 0; +} +module_init(hierarchy_stats_init); diff --git a/block/blk-io-hierarchy/stats.h b/block/blk-io-hierarchy/stats.h new file mode 100644 index 000000000000..ed3e5ddc084a --- /dev/null +++ b/block/blk-io-hierarchy/stats.h @@ -0,0 +1,323 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2023. Huawei Technologies Co., Ltd. All rights reserved. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#ifndef BLK_IO_HIERARCHY_STATS_H +#define BLK_IO_HIERARCHY_STATS_H + +#ifdef CONFIG_BLK_IO_HIERARCHY_STATS + +#include <linux/blkdev.h> +#include <linux/blk_types.h> +#include "../blk.h" + +struct bio_hierarchy_data { + u64 time; +#ifdef CONFIG_HIERARCHY_IO_DUMP + struct bio *bio; + struct list_head hierarchy_list; +#endif +}; + +struct hierarchy_stats { + union { + /* for bio based stages. */ + u64 nsecs[NEW_NR_STAT_GROUPS]; + /* for request based stages. 
*/ + unsigned long jiffies[NEW_NR_STAT_GROUPS]; + }; + unsigned long dispatched[NEW_NR_STAT_GROUPS]; + unsigned long completed[NEW_NR_STAT_GROUPS]; +#ifdef CONFIG_HIERARCHY_IO_DUMP + unsigned long slow[NEW_NR_STAT_GROUPS]; +#endif + +}; + +struct hierarchy_stage { + enum stage_group stage; + struct dentry *debugfs_dir; + struct hierarchy_stats __percpu *hstats; +#ifdef CONFIG_HIERARCHY_IO_DUMP + unsigned long threshold; + void *dump_data; +#endif +}; + +struct blk_io_hierarchy_stats { + struct request_queue *q; + struct dentry *debugfs_dir; + struct hierarchy_stage *hstage[NR_STAGE_GROUPS]; +}; + +static inline bool stage_is_bio(enum stage_group stage) +{ + return stage >= 0 && stage < NR_BIO_STAGE_GROUPS; +} + +static inline bool stage_is_rq(enum stage_group stage) +{ + return stage >= NR_BIO_STAGE_GROUPS && stage < NR_RQ_STAGE_GROUPS; +} + +const char *hierarchy_stage_name(enum stage_group stage); +int blk_io_hierarchy_stats_alloc(struct request_queue *q); +void blk_io_hierarchy_stats_free(struct request_queue *q); + +/* APIs for stage registration */ +bool blk_mq_hierarchy_registered(struct request_queue *q, + enum stage_group stage); +void blk_mq_register_hierarchy(struct request_queue *q, enum stage_group stage); +void blk_mq_unregister_hierarchy(struct request_queue *q, + enum stage_group stage); + +/* APIs for disk level debugfs */ +void blk_mq_debugfs_register_hierarchy_stats(struct request_queue *q); +void blk_mq_debugfs_unregister_hierarchy_stats(struct request_queue *q); +void blk_mq_debugfs_create_default_hierarchy_attr(struct request_queue *q); + +/* APIs for stage level debugfs */ +void blk_mq_debugfs_register_hierarchy(struct request_queue *q, + enum stage_group stage); +void blk_mq_debugfs_unregister_hierarchy(struct request_queue *q, + enum stage_group stage); + +/* APIs for bio based stage io accounting */ +void bio_hierarchy_start_io_acct(struct bio *bio, enum stage_group stage); +void __bio_hierarchy_end_io_acct(struct bio *bio, enum stage_group stage, + u64 time); +void bio_free_hierarchy_data(struct bio *bio); + +static inline void bio_hierarchy_end_io_acct(struct bio *bio, + enum stage_group stage) +{ + __bio_hierarchy_end_io_acct(bio, stage, blk_time_get_ns()); +} + +static inline void bio_list_hierarchy_end_io_acct(struct bio_list *list, + enum stage_group stage) +{ + u64 time = blk_time_get_ns(); + struct bio *bio; + + bio_list_for_each(bio, list) + __bio_hierarchy_end_io_acct(bio, stage, time); +} + +/* APIs for request based stage io accounting */ +void __rq_hierarchy_start_io_acct(struct request *rq, + struct hierarchy_stage *hstage); +void __rq_hierarchy_end_io_acct(struct request *rq, + struct hierarchy_stage *hstage); + +static inline void rq_hierarchy_start_io_acct(struct request *rq, + enum stage_group stage) +{ + if (!blk_mq_hierarchy_registered(rq->q, stage)) + return; + + __rq_hierarchy_start_io_acct( + rq, rq->q->io_hierarchy_stats->hstage[stage]); +} + +static inline void rq_hierarchy_end_io_acct(struct request *rq, + enum stage_group stage) +{ + if (!blk_mq_hierarchy_registered(rq->q, stage)) + return; + + __rq_hierarchy_end_io_acct( + rq, rq->q->io_hierarchy_stats->hstage[stage]); +} + +static inline void rq_list_hierarchy_start_io_acct(struct list_head *head, + enum stage_group stage) +{ + struct request *rq; + struct hierarchy_stage *hstage; + + if (list_empty(head)) + return; + + rq = list_first_entry(head, struct request, queuelist); + if (!blk_mq_hierarchy_registered(rq->q, stage)) + return; + + hstage = 
rq->q->io_hierarchy_stats->hstage[stage]; + list_for_each_entry(rq, head, queuelist) + __rq_hierarchy_start_io_acct(rq, hstage); +} + +static inline void rq_list_hierarchy_end_io_acct(struct list_head *head, + enum stage_group stage) +{ + struct request *rq; + struct hierarchy_stage *hstage; + + if (list_empty(head)) + return; + + rq = list_first_entry(head, struct request, queuelist); + if (!blk_mq_hierarchy_registered(rq->q, stage)) + return; + + hstage = rq->q->io_hierarchy_stats->hstage[stage]; + list_for_each_entry(rq, head, queuelist) + __rq_hierarchy_end_io_acct(rq, hstage); +} + +#ifdef CONFIG_HIERARCHY_BIO +void bio_hierarchy_start(struct bio *bio); +void __bio_hierarchy_end(struct bio *bio, u64 now); + +static inline void bio_hierarchy_end(struct bio *bio) +{ + if (!bio_flagged(bio, BIO_HIERARCHY_ACCT)) + return; + + if (!blk_mq_hierarchy_registered(bio->bi_disk->queue, STAGE_BIO)) + return; + + __bio_hierarchy_end(bio, blk_time_get_ns()); +} + +static inline void req_bio_hierarchy_end(struct request *rq, struct bio *bio) +{ + u64 now; + + if (!bio_flagged(bio, BIO_HIERARCHY_ACCT)) + return; + + if (!blk_mq_hierarchy_registered(bio->bi_disk->queue, STAGE_BIO)) + return; + + now = rq->io_end_time_ns; + if (!now) { + now = blk_time_get_ns(); + rq->io_end_time_ns = now; + } + + __bio_hierarchy_end(bio, now); +} +#endif + +#else /* CONFIG_BLK_IO_HIERARCHY_STATS */ + +static inline int +blk_io_hierarchy_stats_alloc(struct request_queue *q) +{ + return 0; +} + +static inline void +blk_io_hierarchy_stats_free(struct request_queue *q) +{ +} + +static inline bool +blk_mq_hierarchy_registered(struct request_queue *q, enum stage_group stage) +{ + return false; +} + +static inline void +blk_mq_register_hierarchy(struct request_queue *q, enum stage_group stage) +{ +} + +static inline void +blk_mq_unregister_hierarchy(struct request_queue *q, enum stage_group stage) +{ +} + +static inline void +blk_mq_debugfs_register_hierarchy_stats(struct request_queue *q) +{ +} + +static inline void +blk_mq_debugfs_unregister_hierarchy_stats(struct request_queue *q) +{ +} + +static inline void +blk_mq_debugfs_register_hierarchy(struct request_queue *q, + enum stage_group stage) +{ +} + +static inline void +blk_mq_debugfs_unregister_hierarchy(struct request_queue *q, + enum stage_group stage) +{ +} + +static inline void +bio_hierarchy_start_io_acct(struct bio *bio, enum stage_group stage) +{ +} + +static inline void +bio_hierarchy_end_io_acct(struct bio *bio, enum stage_group stage) +{ +} + +static inline void +bio_list_hierarchy_end_io_acct(struct bio_list *list, enum stage_group stage) +{ +} + +static inline void +bio_free_hierarchy_data(struct bio *bio) +{ +} + +static inline void +rq_hierarchy_start_io_acct(struct request *rq, enum stage_group stage) +{ +} + +static inline void +rq_hierarchy_end_io_acct(struct request *rq, enum stage_group stage) +{ +} + +static inline void +rq_list_hierarchy_start_io_acct(struct list_head *head, enum stage_group stage) +{ +} + +static inline void +rq_list_hierarchy_end_io_acct(struct list_head *head, enum stage_group stage) +{ +} + +#endif /* CONFIG_BLK_IO_HIERARCHY_STATS */ + +#if !defined(CONFIG_BLK_IO_HIERARCHY_STATS) || !defined(CONFIG_HIERARCHY_BIO) +static inline void +bio_hierarchy_start(struct bio *bio) +{ +} + +static inline void +bio_hierarchy_end(struct bio *bio) +{ +} + +static inline void +req_bio_hierarchy_end(struct request *rq, struct bio *bio) +{ +} +#endif + +#endif /* BLK_IO_HIERARCHY_STATS_H */ diff --git a/block/blk-mq-debugfs.c 
b/block/blk-mq-debugfs.c index f0865b6ea1e1..a0909e56d669 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -23,6 +23,7 @@ #include "blk-mq.h" #include "blk-mq-debugfs.h" #include "blk-mq-tag.h" +#include "blk-io-hierarchy/stats.h"
static void print_stat(struct seq_file *m, struct blk_rq_stat *stat) { @@ -355,9 +356,8 @@ static const char *blk_mq_rq_state_name(enum mq_rq_state rq_state) return blk_mq_rq_state_name_array[rq_state]; }
-int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq) +void debugfs_rq_show(struct seq_file *m, struct request *rq) { - const struct blk_mq_ops *const mq_ops = rq->q->mq_ops; const unsigned int op = rq->cmd_flags & REQ_OP_MASK;
seq_printf(m, "%p {.op=", rq); @@ -374,6 +374,13 @@ int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq) seq_printf(m, ", .state=%s", blk_mq_rq_state_name(blk_mq_rq_state(rq))); seq_printf(m, ", .tag=%d, .internal_tag=%d", rq->tag, rq->internal_tag); +} + +int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq) +{ + const struct blk_mq_ops *const mq_ops = rq->q->mq_ops; + + debugfs_rq_show(m, rq); if (mq_ops->show_rq) mq_ops->show_rq(m, rq); seq_puts(m, "}\n"); @@ -811,8 +818,8 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_ctx_attrs[] = { {}, };
-static bool debugfs_create_files(struct dentry *parent, void *data, - const struct blk_mq_debugfs_attr *attr) +bool debugfs_create_files(struct dentry *parent, void *data, + const struct blk_mq_debugfs_attr *attr) { if (IS_ERR_OR_NULL(parent)) return false; @@ -861,6 +868,7 @@ int blk_mq_debugfs_register(struct request_queue *q) goto err; }
+ blk_mq_debugfs_register_hierarchy_stats(q); return 0;
err: diff --git a/block/blk-mq-debugfs.h b/block/blk-mq-debugfs.h index a9160be12be0..73a3796bd03c 100644 --- a/block/blk-mq-debugfs.h +++ b/block/blk-mq-debugfs.h @@ -31,6 +31,14 @@ void blk_mq_debugfs_unregister_sched(struct request_queue *q); int blk_mq_debugfs_register_sched_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx); void blk_mq_debugfs_unregister_sched_hctx(struct blk_mq_hw_ctx *hctx); +bool debugfs_create_files(struct dentry *parent, void *data, + const struct blk_mq_debugfs_attr *attr); +void debugfs_rq_show(struct seq_file *m, struct request *rq); + +static inline bool blk_mq_debugfs_enabled(struct request_queue *q) +{ + return !IS_ERR_OR_NULL(q->debugfs_dir); +} #else static inline int blk_mq_debugfs_register(struct request_queue *q) { diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 0fb33abac3f6..1c8befbe7b69 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -15,6 +15,7 @@ #include "blk-mq-sched.h" #include "blk-mq-tag.h" #include "blk-wbt.h" +#include "blk-io-hierarchy/stats.h"
void blk_mq_sched_free_hctx_data(struct request_queue *q, void (*exit)(struct blk_mq_hw_ctx *)) @@ -250,6 +251,7 @@ int __blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) */ if (!list_empty(&rq_list)) { blk_mq_sched_mark_restart_hctx(hctx); + rq_list_hierarchy_end_io_acct(&rq_list, STAGE_HCTX); if (blk_mq_dispatch_rq_list(q, &rq_list, false)) { if (has_sched_dispatch) ret = blk_mq_do_dispatch_sched(hctx); @@ -389,10 +391,8 @@ bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio) struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu); bool ret = false;
- if (e && e->type->ops.mq.bio_merge) { - blk_mq_put_ctx(ctx); + if (e && e->type->ops.mq.bio_merge) return e->type->ops.mq.bio_merge(hctx, bio); - }
if ((hctx->flags & BLK_MQ_F_SHOULD_MERGE) && !list_empty_careful(&ctx->rq_list)) { @@ -402,7 +402,6 @@ bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio) spin_unlock(&ctx->lock); }
- blk_mq_put_ctx(ctx); return ret; }
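A note on the STAGE_HCTX pairing used in the hunk above and in the blk-mq.c hunks further down: the stage brackets the time requests sit on hctx->dispatch. Every path that splices requests onto the list starts the stage, and the drain path ends it before dispatch. A minimal sketch of the pairing (illustrative only, not an additional hunk):

	/* producer: requests parked on hctx->dispatch enter the stage */
	rq_list_hierarchy_start_io_acct(list, STAGE_HCTX);
	spin_lock(&hctx->lock);
	list_splice_tail_init(list, &hctx->dispatch);
	spin_unlock(&hctx->lock);

	/* consumer: requests pulled back off the list leave the stage */
	spin_lock(&hctx->lock);
	list_splice_init(&hctx->dispatch, &rq_list);
	spin_unlock(&hctx->lock);
	rq_list_hierarchy_end_io_acct(&rq_list, STAGE_HCTX);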
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index bee92ab06a5e..f7b21d7f136e 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -13,6 +13,7 @@ #include "blk.h" #include "blk-mq.h" #include "blk-mq-tag.h" +#include "blk-io-hierarchy/stats.h"
bool blk_mq_has_free_tags(struct blk_mq_tags *tags) { @@ -113,7 +114,6 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) struct sbq_wait_state *ws; DEFINE_WAIT(wait); unsigned int tag_offset; - bool drop_ctx; int tag;
if (data->flags & BLK_MQ_REQ_RESERVED) { @@ -135,8 +135,9 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) if (data->flags & BLK_MQ_REQ_NOWAIT) return BLK_MQ_TAG_FAIL;
+ if (data->bio) + bio_hierarchy_start_io_acct(data->bio, STAGE_GETTAG); ws = bt_wait_ptr(bt, data->hctx); - drop_ctx = data->ctx == NULL; do { struct sbitmap_queue *bt_prev;
@@ -162,9 +163,6 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) if (tag != -1) break;
- if (data->ctx) - blk_mq_put_ctx(data->ctx); - bt_prev = bt; io_schedule();
@@ -189,10 +187,9 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) ws = bt_wait_ptr(bt, data->hctx); } while (1);
- if (drop_ctx && data->ctx) - blk_mq_put_ctx(data->ctx); - finish_wait(&ws->wait, &wait); + if (data->bio) + bio_hierarchy_end_io_acct(data->bio, STAGE_GETTAG);
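Note that STAGE_GETTAG only accounts the slow path: accounting starts once the bio is about to sleep for a tag and ends when a tag is finally found, so uncontended allocations cost nothing. The shape of the pattern, sketched with example_try_get() as an illustrative placeholder for the real tag lookup:

	static int example_wait_for_tag(struct bio *bio)
	{
		int tag;

		bio_hierarchy_start_io_acct(bio, STAGE_GETTAG);
		do {
			tag = example_try_get();	/* stands in for __blk_mq_get_tag() */
			if (tag != -1)
				break;
			io_schedule();			/* sleep until a tag is freed */
		} while (1);
		bio_hierarchy_end_io_acct(bio, STAGE_GETTAG);

		return tag;
	}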
found_tag: return tag + tag_offset; diff --git a/block/blk-mq.c b/block/blk-mq.c index aa4b3c608249..7f24ff0692d4 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -36,6 +36,7 @@ #include "blk-stat.h" #include "blk-mq-sched.h" #include "blk-rq-qos.h" +#include "blk-io-hierarchy/stats.h"
static bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie); static void blk_mq_poll_stats_start(struct request_queue *q); @@ -368,6 +369,8 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, rq->part = NULL; rq->start_time_ns = ktime_get_ns(); rq->io_start_time_ns = 0; + blk_mq_get_alloc_task(rq, data->bio); + rq->nr_phys_segments = 0; #if defined(CONFIG_BLK_DEV_INTEGRITY) rq->nr_integrity_segments = 0; @@ -400,13 +403,13 @@ static struct request *blk_mq_get_request(struct request_queue *q, struct elevator_queue *e = q->elevator; struct request *rq; unsigned int tag; - bool put_ctx_on_error = false; + bool clear_ctx_on_error = false;
blk_queue_enter_live(q); data->q = q; if (likely(!data->ctx)) { data->ctx = blk_mq_get_ctx(q); - put_ctx_on_error = true; + clear_ctx_on_error = true; } if (likely(!data->hctx)) data->hctx = blk_mq_map_queue(q, data->ctx->cpu); @@ -430,10 +433,8 @@ static struct request *blk_mq_get_request(struct request_queue *q,
tag = blk_mq_get_tag(data); if (tag == BLK_MQ_TAG_FAIL) { - if (put_ctx_on_error) { - blk_mq_put_ctx(data->ctx); + if (clear_ctx_on_error) data->ctx = NULL; - } blk_queue_exit(q); return NULL; } @@ -470,8 +471,6 @@ struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op, if (!rq) return ERR_PTR(-EWOULDBLOCK);
- blk_mq_put_ctx(alloc_data.ctx); - rq->__data_len = 0; rq->__sector = (sector_t) -1; rq->bio = rq->biotail = NULL; @@ -532,6 +531,8 @@ static void __blk_mq_free_request(struct request *rq) struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu); const int sched_tag = rq->internal_tag;
+ blk_mq_put_alloc_task(rq); + if (rq->tag != -1) blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag); if (sched_tag != -1) @@ -583,6 +584,8 @@ inline void __blk_mq_end_request(struct request *rq, blk_status_t error) blk_stat_add(rq, now); }
+ if (blk_mq_request_started(rq)) + rq_hierarchy_end_io_acct(rq, STAGE_RQ_DRIVER); blk_account_io_done(rq, now);
if (rq->end_io) { @@ -722,6 +725,7 @@ void blk_mq_start_request(struct request *rq) blk_mq_sched_started_request(rq);
trace_block_rq_issue(q, rq); + rq_hierarchy_start_io_acct(rq, STAGE_RQ_DRIVER);
if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) { rq->io_start_time_ns = ktime_get_ns(); @@ -762,6 +766,7 @@ static void __blk_mq_requeue_request(struct request *rq) rq->rq_flags &= ~RQF_TIMED_OUT; if (q->dma_drain_size && blk_rq_bytes(rq)) rq->nr_phys_segments--; + rq_hierarchy_end_io_acct(rq, STAGE_RQ_DRIVER); } }
@@ -787,6 +792,7 @@ static void blk_mq_requeue_work(struct work_struct *work) spin_lock_irq(&q->requeue_lock); list_splice_init(&q->requeue_list, &rq_list); spin_unlock_irq(&q->requeue_lock); + rq_list_hierarchy_end_io_acct(&rq_list, STAGE_REQUEUE);
list_for_each_entry_safe(rq, next, &rq_list, queuelist) { if (!(rq->rq_flags & (RQF_SOFTBARRIER | RQF_DONTPREP))) @@ -826,6 +832,7 @@ void blk_mq_add_to_requeue_list(struct request *rq, bool at_head, */ BUG_ON(rq->rq_flags & RQF_SOFTBARRIER);
+ rq_hierarchy_start_io_acct(rq, STAGE_REQUEUE); spin_lock_irqsave(&q->requeue_lock, flags); if (at_head) { rq->rq_flags |= RQF_SOFTBARRIER; @@ -1317,6 +1324,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, if (!list_empty(list)) { bool needs_restart;
+ rq_list_hierarchy_start_io_acct(list, STAGE_HCTX); spin_lock(&hctx->lock); list_splice_tail_init(list, &hctx->dispatch); spin_unlock(&hctx->lock); @@ -1726,6 +1734,7 @@ void blk_mq_request_bypass_insert(struct request *rq, bool at_head, struct blk_mq_ctx *ctx = rq->mq_ctx; struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(rq->q, ctx->cpu);
+ rq_hierarchy_start_io_acct(rq, STAGE_HCTX); spin_lock(&hctx->lock); if (at_head) list_add(&rq->queuelist, &hctx->dispatch); @@ -1792,6 +1801,8 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) if (rq->mq_ctx != this_ctx) { if (this_ctx) { trace_block_unplug(this_q, depth, !from_schedule); + rq_list_hierarchy_end_io_acct(&ctx_list, + STAGE_PLUG); blk_mq_sched_insert_requests(this_q, this_ctx, &ctx_list, from_schedule); @@ -1812,6 +1823,7 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) */ if (this_ctx) { trace_block_unplug(this_q, depth, !from_schedule); + rq_list_hierarchy_end_io_acct(&ctx_list, STAGE_PLUG); blk_mq_sched_insert_requests(this_q, this_ctx, &ctx_list, from_schedule); } @@ -1975,7 +1987,10 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) { const int is_sync = op_is_sync(bio->bi_opf); const int is_flush_fua = op_is_flush(bio->bi_opf); - struct blk_mq_alloc_data data = { .flags = 0 }; + struct blk_mq_alloc_data data = { + .flags = 0, + .bio = bio, + }; struct request *rq; unsigned int request_count = 0; struct blk_plug *plug; @@ -2019,7 +2034,6 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
plug = current->plug; if (unlikely(is_flush_fua)) { - blk_mq_put_ctx(data.ctx); blk_mq_bio_to_request(rq, bio);
/* bypass scheduler for flush rq */ @@ -2028,7 +2042,6 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) } else if (plug && q->nr_hw_queues == 1) { struct request *last = NULL;
- blk_mq_put_ctx(data.ctx); blk_mq_bio_to_request(rq, bio);
/* @@ -2051,6 +2064,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) trace_block_plug(q); }
+ rq_hierarchy_start_io_acct(rq, STAGE_PLUG); list_add_tail(&rq->queuelist, &plug->mq_list); } else if (plug && !blk_queue_nomerges(q)) { blk_mq_bio_to_request(rq, bio); @@ -2066,23 +2080,21 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) same_queue_rq = NULL; if (same_queue_rq) list_del_init(&same_queue_rq->queuelist); + rq_hierarchy_start_io_acct(rq, STAGE_PLUG); list_add_tail(&rq->queuelist, &plug->mq_list);
- blk_mq_put_ctx(data.ctx); - if (same_queue_rq) { data.hctx = blk_mq_map_queue(q, same_queue_rq->mq_ctx->cpu); + rq_hierarchy_end_io_acct(same_queue_rq, STAGE_PLUG); blk_mq_try_issue_directly(data.hctx, same_queue_rq, &cookie); } } else if ((q->nr_hw_queues > 1 && is_sync) || (!q->elevator && !data.hctx->dispatch_busy)) { - blk_mq_put_ctx(data.ctx); blk_mq_bio_to_request(rq, bio); blk_mq_try_issue_directly(data.hctx, rq, &cookie); } else { - blk_mq_put_ctx(data.ctx); blk_mq_bio_to_request(rq, bio); blk_mq_sched_insert_request(rq, false, true, true); } @@ -2324,6 +2336,7 @@ static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node) if (list_empty(&tmp)) return 0;
+ rq_list_hierarchy_start_io_acct(&tmp, STAGE_HCTX); spin_lock(&hctx->lock); list_splice_tail_init(&tmp, &hctx->dispatch); spin_unlock(&hctx->lock); @@ -2758,6 +2771,9 @@ void blk_mq_release(struct request_queue *q) struct blk_mq_hw_ctx *hctx, *next; int i;
+ blk_mq_unregister_hierarchy(q, STAGE_BIO); + blk_io_hierarchy_stats_free(q); + queue_for_each_hw_ctx(q, hctx, i) WARN_ON_ONCE(hctx && list_empty(&hctx->hctx_list));
@@ -2895,11 +2911,14 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, /* mark the queue as mq asap */ q->mq_ops = set->ops;
+ if (blk_io_hierarchy_stats_alloc(q)) + goto err_exit; + q->poll_cb = blk_stat_alloc_callback(blk_mq_poll_stats_fn, blk_mq_poll_stats_bkt, BLK_MQ_POLL_STATS_BKTS, q); if (!q->poll_cb) - goto err_exit; + goto err_hierarchy_stats;
if (blk_mq_alloc_ctxs(q)) goto err_exit; @@ -2972,6 +2991,8 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, q->nr_hw_queues = 0; err_sys_init: blk_mq_sysfs_deinit(q); +err_hierarchy_stats: + blk_io_hierarchy_stats_free(q); err_exit: q->mq_ops = NULL; return ERR_PTR(-ENOMEM); diff --git a/block/blk-mq.h b/block/blk-mq.h index c6ec9aa12fb2..1bba4eb18332 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -125,12 +125,7 @@ static inline struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q, */ static inline struct blk_mq_ctx *blk_mq_get_ctx(struct request_queue *q) { - return __blk_mq_get_ctx(q, get_cpu()); -} - -static inline void blk_mq_put_ctx(struct blk_mq_ctx *ctx) -{ - put_cpu(); + return __blk_mq_get_ctx(q, raw_smp_processor_id()); }
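The blk_mq_get_ctx() change above is what makes every blk_mq_put_ctx() deletion in this patch safe: the ctx is now looked up without disabling preemption, so there is no put to pair with. In outline (a before/after sketch, not an additional hunk):

	/* before: preemption disabled from get_cpu() until put_cpu() */
	ctx = __blk_mq_get_ctx(q, get_cpu());
	/* ... use ctx ... */
	blk_mq_put_ctx(ctx);

	/*
	 * after: a plain CPU read with no pairing; migrating right after
	 * the read is harmless because ctx only selects a software queue,
	 * and any software queue can be used from any CPU.
	 */
	ctx = __blk_mq_get_ctx(q, raw_smp_processor_id());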
struct blk_mq_alloc_data { @@ -142,6 +137,7 @@ struct blk_mq_alloc_data { /* input & output parameter */ struct blk_mq_ctx *ctx; struct blk_mq_hw_ctx *hctx; + struct bio *bio; };
static inline struct blk_mq_tags *blk_mq_tags_from_data(struct blk_mq_alloc_data *data) @@ -234,4 +230,32 @@ static inline void blk_mq_free_requests(struct list_head *list) } }
+static inline bool blk_mq_is_sbitmap_shared(unsigned int flags) +{ + return false; +} + +#ifdef CONFIG_BLK_BIO_ALLOC_TASK +static inline void blk_mq_get_alloc_task(struct request *rq, struct bio *bio) +{ + rq->pid = bio ? get_pid(bio->pid) : get_pid(task_pid(current)); +} + +static inline void blk_mq_put_alloc_task(struct request *rq) +{ + if (rq->pid) { + put_pid(rq->pid); + rq->pid = NULL; + } +} +#else +static inline void blk_mq_get_alloc_task(struct request *rq, struct bio *bio) +{ +} + +static inline void blk_mq_put_alloc_task(struct request *rq) +{ +} +#endif + #endif diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 1c4d795bbdc4..719687a394ea 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -17,6 +17,7 @@ #include "blk-mq.h" #include "blk-mq-debugfs.h" #include "blk-wbt.h" +#include "blk-io-hierarchy/stats.h"
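The CONFIG_BLK_BIO_ALLOC_TASK helpers above pin the allocating task's struct pid on the request so the dump side can resolve it back to a task name later. A hypothetical consumer, sketched here only to show the intended pid lifecycle (rq_dump_task() is not part of this patch):

	static void rq_dump_task(struct seq_file *m, struct request *rq)
	{
		struct task_struct *task;

		rcu_read_lock();
		task = rq->pid ? pid_task(rq->pid, PIDTYPE_PID) : NULL;
		if (task)
			seq_printf(m, " .comm=%s, .pid=%d",
				   task->comm, pid_nr(rq->pid));
		rcu_read_unlock();
	}

Holding a reference on the struct pid, rather than a task pointer or a raw pid_t, means the request never pins a whole task_struct and a recycled pid number cannot be misattributed.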
struct queue_sysfs_entry { struct attribute attr; @@ -924,6 +925,19 @@ struct kobj_type blk_queue_ktype = { .release = blk_release_queue, };
+static void blk_mq_register_default_hierarchy(struct request_queue *q) +{ + if (!q->mq_ops) + return; + + blk_mq_register_hierarchy(q, STAGE_GETTAG); + blk_mq_register_hierarchy(q, STAGE_PLUG); + blk_mq_register_hierarchy(q, STAGE_HCTX); + blk_mq_register_hierarchy(q, STAGE_REQUEUE); + blk_mq_register_hierarchy(q, STAGE_RQ_DRIVER); + blk_mq_register_hierarchy(q, STAGE_BIO); +} + /** * blk_register_queue - register a block layer queue with sysfs * @disk: Disk of which the request queue should be registered with sysfs. @@ -973,6 +987,8 @@ int blk_register_queue(struct gendisk *disk) has_elevator = true; }
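For orientation: once the default stages above are registered, the per-stage counters surface under the queue's debugfs directory. The exact file set comes from blk-io-hierarchy/debugfs.c and iodump.c, which are not quoted in this excerpt, so the layout below is inferred from the stage names and may differ in detail:

	/sys/kernel/debug/block/<disk>/blk_io_hierarchy/
		gettag/  plug/  hctx/  requeue/  rq_driver/  bio/
			stats		per-stage dispatched/completed counters
			threshold	slow-IO threshold (CONFIG_HIERARCHY_IO_DUMP)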
+ blk_mq_register_default_hierarchy(q); + blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q); wbt_enable_default(q); blk_throtl_register_queue(q); diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 598191286557..446864c27c3b 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -14,6 +14,7 @@ #include <linux/sched/signal.h> #include <linux/delay.h> #include "blk.h" +#include "blk-io-hierarchy/stats.h"
/* Max dispatch from a group in 1 round */ static int throtl_grp_quantum = 8; @@ -1350,6 +1351,8 @@ static void blk_throtl_dispatch_work_fn(struct work_struct *work) bio_list_add(&bio_list_on_stack, bio); spin_unlock_irq(q->queue_lock);
+ bio_list_hierarchy_end_io_acct(&bio_list_on_stack, STAGE_THROTTLE); + if (!bio_list_empty(&bio_list_on_stack)) { blk_start_plug(&plug); while((bio = bio_list_pop(&bio_list_on_stack))) @@ -2333,6 +2336,20 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
tg->last_low_overflow_time[rw] = jiffies;
+ /* + * This is the slow path now; bio_hierarchy_start_io_acct() may spend + * some time allocating memory. Dropping the locks is safe because 'tg' + * is pinned by this bio, and the IO charge stays accurate because + * the slice was already started from tg_may_dispatch(). + */ + spin_unlock_irq(q->queue_lock); + rcu_read_unlock(); + + bio_hierarchy_start_io_acct(bio, STAGE_THROTTLE); + + rcu_read_lock(); + spin_lock_irq(q->queue_lock); + td->nr_queued[rw]++; throtl_add_bio_tg(bio, qn, tg); throttled = true; @@ -2561,6 +2578,8 @@ void blk_throtl_exit(struct request_queue *q) del_timer_sync(&q->td->service_queue.pending_timer); throtl_shutdown_wq(q); blkcg_deactivate_policy(q, &blkcg_policy_throtl); + blk_mq_unregister_hierarchy(q, STAGE_THROTTLE); + free_percpu(q->td->latency_buckets[READ]); free_percpu(q->td->latency_buckets[WRITE]); kfree(q->td); @@ -2593,6 +2612,8 @@ void blk_throtl_register_queue(struct request_queue *q) td->track_bio_latency = !queue_is_rq_based(q); if (!td->track_bio_latency) blk_stat_enable_accounting(q); + + blk_mq_register_hierarchy(q, STAGE_THROTTLE); }
#ifdef CONFIG_BLK_DEV_THROTTLING_LOW diff --git a/block/blk-wbt.c b/block/blk-wbt.c index 94b5eff0cd3a..87d7816af6e0 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -29,6 +29,7 @@
#include "blk-wbt.h" #include "blk-rq-qos.h" +#include "blk-io-hierarchy/stats.h"
#define CREATE_TRACE_POINTS #include <trace/events/wbt.h> @@ -532,11 +533,12 @@ static int wbt_wake_function(struct wait_queue_entry *curr, unsigned int mode, * Block if we will exceed our limit, or if we are currently waiting for * the timer to kick off queuing again. */ -static void __wbt_wait(struct rq_wb *rwb, enum wbt_flags wb_acct, - unsigned long rw, spinlock_t *lock) +static void __wbt_wait(struct rq_wb *rwb, struct bio *bio, + enum wbt_flags wb_acct, spinlock_t *lock) __releases(lock) __acquires(lock) { + unsigned long rw = bio->bi_opf; struct rq_wait *rqw = get_rq_wait(rwb, wb_acct); struct wbt_wait_data data = { .wq = { @@ -554,6 +556,7 @@ static void __wbt_wait(struct rq_wb *rwb, enum wbt_flags wb_acct, if (!has_sleeper && rq_wait_inc_below(rqw, get_limit(rwb, rw))) return;
+ bio_hierarchy_start_io_acct(bio, STAGE_WBT); has_sleeper = !__prepare_to_wait_exclusive(&rqw->wait, &data.wq, TASK_UNINTERRUPTIBLE); do { @@ -588,6 +591,7 @@ static void __wbt_wait(struct rq_wb *rwb, enum wbt_flags wb_acct, } while (1);
finish_wait(&rqw->wait, &data.wq); + bio_hierarchy_end_io_acct(bio, STAGE_WBT); }
static inline bool wbt_should_throttle(struct rq_wb *rwb, struct bio *bio) @@ -652,7 +656,7 @@ static void wbt_wait(struct rq_qos *rqos, struct bio *bio, spinlock_t *lock) return; }
- __wbt_wait(rwb, flags, bio->bi_opf, lock); + __wbt_wait(rwb, bio, flags, lock);
if (!blk_stat_is_active(rwb->cb)) rwb_arm_timer(rwb); @@ -770,6 +774,7 @@ static void wbt_exit(struct rq_qos *rqos) struct rq_wb *rwb = RQWB(rqos); struct request_queue *q = rqos->q;
+ blk_mq_unregister_hierarchy(q, STAGE_WBT); blk_stat_remove_callback(q, rwb->cb); blk_stat_free_callback(rwb->cb); kfree(rwb); @@ -845,6 +850,7 @@ int wbt_init(struct request_queue *q)
rwb->min_lat_nsec = wbt_default_latency_nsec(q); wbt_set_queue_depth(q, blk_queue_depth(q)); + blk_mq_register_hierarchy(q, STAGE_WBT);
blk_mq_unfreeze_queue(q); wbt_set_write_cache(q, test_bit(QUEUE_FLAG_WC, &q->queue_flags)); diff --git a/block/blk.h b/block/blk.h index 965e9c507654..162b42388610 100644 --- a/block/blk.h +++ b/block/blk.h @@ -175,6 +175,51 @@ static inline void blk_queue_enter_live(struct request_queue *q) percpu_ref_get(&q->q_usage_counter); }
+#ifdef CONFIG_BLK_BIO_ALLOC_TIME +static inline u64 blk_time_get_ns(void); +static inline void blk_rq_init_bi_alloc_time(struct request *rq, + struct request *first_rq) +{ + rq->bi_alloc_time_ns = first_rq ? first_rq->bi_alloc_time_ns : + blk_time_get_ns(); +} + +/* + * Used in the following cases to update the request's bi_alloc_time_ns: + * + * 1) A new @rq is allocated for @bio; + * 2) @bio is merged into @rq; in this case @merged_rq should be NULL; + * 3) @merged_rq is merged into @rq; in this case @bio should be NULL. + */ +static inline void blk_rq_update_bi_alloc_time(struct request *rq, + struct bio *bio, + struct request *merged_rq) +{ + if (bio) { + if (rq->bi_alloc_time_ns > bio->bi_alloc_time_ns) + rq->bi_alloc_time_ns = bio->bi_alloc_time_ns; + return; + } + + if (!merged_rq) + return; + + if (rq->bi_alloc_time_ns > merged_rq->bi_alloc_time_ns) + rq->bi_alloc_time_ns = merged_rq->bi_alloc_time_ns; +} +#else /* CONFIG_BLK_BIO_ALLOC_TIME */ +static inline void blk_rq_init_bi_alloc_time(struct request *rq, + struct request *first_rq) +{ +} + +static inline void blk_rq_update_bi_alloc_time(struct request *rq, + struct bio *bio, + struct request *merged_rq) +{ +} +#endif + #ifdef CONFIG_BLK_DEV_INTEGRITY void blk_flush_integrity(void); bool __bio_integrity_endio(struct bio *); @@ -479,4 +524,17 @@ static inline void blk_free_queue_dispatch_async(struct request_queue *q) } #endif
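How the helpers above are meant to be called (a sketch; the real call sites live in the allocation and merge paths elsewhere in this patch, and the example_* names are placeholders):

	static void example_rq_alloc(struct request *rq, struct bio *bio)
	{
		blk_rq_init_bi_alloc_time(rq, NULL);		/* fresh request */
		blk_rq_update_bi_alloc_time(rq, bio, NULL);	/* case 1/2 */
	}

	static void example_rq_merge(struct request *rq, struct request *next)
	{
		/* case 3: @next merged into @rq, keep the oldest time */
		blk_rq_update_bi_alloc_time(rq, NULL, next);
	}

Both paths keep the minimum, so bi_alloc_time_ns always reflects the oldest bio behind the request. The clock is read through blk_time_get_ns(), defined just below, which caches the first ktime_get_ns() in the current plug so a batched submission does not hit the clocksource once per bio.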
+static inline u64 blk_time_get_ns(void) +{ + struct blk_plug *plug = current->plug; + + if (!plug || !in_task()) + return ktime_get_ns(); + + if (!plug->cur_ktime) + plug->cur_ktime = ktime_get_ns(); + + return plug->cur_ktime; +} + #endif /* BLK_INTERNAL_H */ diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c index 833e9eaae640..04ff97c076fb 100644 --- a/block/kyber-iosched.c +++ b/block/kyber-iosched.c @@ -30,6 +30,7 @@ #include "blk-mq-sched.h" #include "blk-mq-tag.h" #include "blk-stat.h" +#include "blk-io-hierarchy/stats.h"
/* Scheduling domains. */ enum { @@ -365,6 +366,7 @@ static int kyber_init_sched(struct request_queue *q, struct elevator_type *e)
blk_stat_add_callback(q, kqd->cb);
+ blk_mq_register_hierarchy(q, STAGE_KYBER); return 0; }
@@ -374,6 +376,7 @@ static void kyber_exit_sched(struct elevator_queue *e) struct request_queue *q = kqd->q; int i;
+ blk_mq_unregister_hierarchy(kqd->q, STAGE_KYBER); blk_stat_remove_callback(q, kqd->cb);
for (i = 0; i < KYBER_NUM_DOMAINS; i++) @@ -517,7 +520,6 @@ static bool kyber_bio_merge(struct blk_mq_hw_ctx *hctx_q, struct bio *bio) spin_lock(&kcq->lock); merged = blk_mq_bio_list_merge(hctx->queue, rq_list, bio); spin_unlock(&kcq->lock); - blk_mq_put_ctx(ctx);
return merged; } @@ -533,6 +535,7 @@ static void kyber_insert_requests(struct blk_mq_hw_ctx *hctx, struct kyber_hctx_data *khd = hctx->sched_data; struct request *rq, *next;
+ rq_list_hierarchy_start_io_acct(rq_list, STAGE_KYBER); list_for_each_entry_safe(rq, next, rq_list, queuelist) { unsigned int sched_domain = kyber_sched_domain(rq->cmd_flags); struct kyber_ctx_queue *kcq = &khd->kcqs[rq->mq_ctx->index_hw]; @@ -772,6 +775,9 @@ static struct request *kyber_dispatch_request(struct blk_mq_hw_ctx *hctx) rq = NULL; out: spin_unlock(&khd->lock); + + if (rq) + rq_hierarchy_end_io_acct(rq, STAGE_KYBER); return rq; }
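Every elevator in this patch wires the same four hooks; kyber above and mq-deadline below differ only in the stage constant. The skeleton, with example_*, example_pick_request() and STAGE_EXAMPLE as placeholders:

	static int example_init_sched(struct request_queue *q, struct elevator_type *e)
	{
		/* ... allocate and install elevator data ... */
		blk_mq_register_hierarchy(q, STAGE_EXAMPLE);
		return 0;
	}

	static void example_exit_sched(struct elevator_queue *e)
	{
		struct example_data *ed = e->elevator_data;

		/*
		 * exit only receives the elevator_queue, so the request_queue
		 * must be stashed at init time - which is exactly why the
		 * mq-deadline hunk below adds dd->q.
		 */
		blk_mq_unregister_hierarchy(ed->q, STAGE_EXAMPLE);
		kfree(ed);
	}

	static void example_insert_requests(struct blk_mq_hw_ctx *hctx,
					    struct list_head *list, bool at_head)
	{
		rq_list_hierarchy_start_io_acct(list, STAGE_EXAMPLE);
		/* ... queue the requests ... */
	}

	static struct request *example_dispatch_request(struct blk_mq_hw_ctx *hctx)
	{
		struct request *rq = example_pick_request(hctx);

		if (rq)
			rq_hierarchy_end_io_acct(rq, STAGE_EXAMPLE);
		return rq;
	}

Merged-and-freed requests must also leave the stage, which is why the bio-merge and insert-merge hunks call rq_hierarchy_end_io_acct() or rq_list_hierarchy_end_io_acct() on the to-be-freed requests before blk_mq_free_request().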
diff --git a/block/mq-deadline.c b/block/mq-deadline.c index 7ad820050675..aa51abb3eaa4 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -22,6 +22,7 @@ #include "blk-mq-debugfs.h" #include "blk-mq-tag.h" #include "blk-mq-sched.h" +#include "blk-io-hierarchy/stats.h"
/* * See Documentation/block/deadline-iosched.txt @@ -61,6 +62,8 @@ struct deadline_data { spinlock_t lock; spinlock_t zone_lock; struct list_head dispatch; + + struct request_queue *q; };
static inline struct rb_root * @@ -386,6 +389,8 @@ static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx) rq = __dd_dispatch_request(dd); spin_unlock(&dd->lock);
+ if (rq) + rq_hierarchy_end_io_acct(rq, STAGE_DEADLINE); return rq; }
@@ -396,6 +401,7 @@ static void dd_exit_queue(struct elevator_queue *e) BUG_ON(!list_empty(&dd->fifo_list[READ])); BUG_ON(!list_empty(&dd->fifo_list[WRITE]));
+ blk_mq_unregister_hierarchy(dd->q, STAGE_DEADLINE); kfree(dd); }
@@ -427,11 +433,13 @@ static int dd_init_queue(struct request_queue *q, struct elevator_type *e) dd->writes_starved = writes_starved; dd->front_merges = 1; dd->fifo_batch = fifo_batch; + dd->q = q; spin_lock_init(&dd->lock); spin_lock_init(&dd->zone_lock); INIT_LIST_HEAD(&dd->dispatch);
q->elevator = eq; + blk_mq_register_hierarchy(q, STAGE_DEADLINE); return 0; }
@@ -469,8 +477,10 @@ static bool dd_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio) ret = blk_mq_sched_try_merge(q, bio, &free); spin_unlock(&dd->lock);
- if (free) + if (free) { + rq_hierarchy_end_io_acct(free, STAGE_DEADLINE); blk_mq_free_request(free); + }
return ret; } @@ -493,6 +503,7 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, blk_req_zone_write_unlock(rq);
if (blk_mq_sched_try_insert_merge(q, rq, &free)) { + rq_list_hierarchy_end_io_acct(&free, STAGE_DEADLINE); blk_mq_free_requests(&free); return; } @@ -527,6 +538,8 @@ static void dd_insert_requests(struct blk_mq_hw_ctx *hctx, struct request_queue *q = hctx->queue; struct deadline_data *dd = q->elevator->elevator_data;
+ rq_list_hierarchy_start_io_acct(list, STAGE_DEADLINE); + spin_lock(&dd->lock); while (!list_empty(list)) { struct request *rq; diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 8075b9955bb3..c2867571bcc7 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -203,6 +203,9 @@ struct bio {
struct bio_set *bi_pool;
+ u64 bi_alloc_time_ns; + struct bio_hierarchy_data *hdata; + struct pid *pid; KABI_RESERVE(1) KABI_RESERVE(2) KABI_RESERVE(3) @@ -234,6 +237,13 @@ struct bio { * of this bio. */ #define BIO_QUEUE_ENTERED 11 /* can use blk_queue_enter_live() */ #define BIO_TRACKED 12 /* set if bio goes through the rq_qos path */ +#ifdef CONFIG_BLK_IO_HIERARCHY_STATS +#define BIO_HAS_DATA 13 /* bio contains data. */ +#define BIO_HIERARCHY_ACCT 14 /* + * This bio has already been accounted by + * blk-io-hierarchy; don't account it again. + */ +#endif
/* See BVEC_POOL_OFFSET below before adding new flags */
@@ -368,7 +378,36 @@ enum stat_group { STAT_WRITE, STAT_DISCARD,
- NR_STAT_GROUPS + NR_STAT_GROUPS, + STAT_FLUSH = NR_STAT_GROUPS, + NEW_NR_STAT_GROUPS, +}; + +enum stage_group { +#ifdef CONFIG_BLK_DEV_THROTTLING + STAGE_THROTTLE, +#endif +#ifdef CONFIG_BLK_WBT + STAGE_WBT, +#endif + STAGE_GETTAG, + NR_BIO_STAGE_GROUPS, + STAGE_PLUG = NR_BIO_STAGE_GROUPS, +#if IS_ENABLED(CONFIG_MQ_IOSCHED_DEADLINE) + STAGE_DEADLINE, +#endif +#if IS_ENABLED(CONFIG_IOSCHED_BFQ) + STAGE_BFQ, +#endif +#if IS_ENABLED(CONFIG_MQ_IOSCHED_KYBER) + STAGE_KYBER, +#endif + STAGE_HCTX, + STAGE_REQUEUE, + STAGE_RQ_DRIVER, + NR_RQ_STAGE_GROUPS, + STAGE_BIO = NR_RQ_STAGE_GROUPS, + NR_STAGE_GROUPS, };
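The ordering of enum stage_group is load-bearing: stage_is_bio() and stage_is_rq() in stats.h classify a stage purely by comparing against the sentinel values, so new stages must be inserted inside the correct window. The resulting partition, for reference:

	/*
	 * [0, NR_BIO_STAGE_GROUPS)			bio stages: throttle,
	 *						wbt, gettag - no request
	 *						exists yet
	 * [NR_BIO_STAGE_GROUPS, NR_RQ_STAGE_GROUPS)	request stages: plug,
	 *						elevators, hctx, requeue,
	 *						rq_driver
	 * STAGE_BIO					whole-bio lifetime,
	 *						guarded by CONFIG_HIERARCHY_BIO
	 */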
#define bio_op(bio) \ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index c848f4205729..713c42987851 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -277,6 +277,12 @@ struct request { #ifdef CONFIG_BLK_CGROUP struct request_list *rl; /* rl this rq is alloced from */ #endif + + enum stage_group stage; + unsigned long hierarchy_time; + u64 io_end_time_ns; + u64 bi_alloc_time_ns; + struct pid *pid; };
static inline bool blk_op_is_scsi(unsigned int op) @@ -703,6 +709,8 @@ struct request_queue {
#define BLK_MAX_WRITE_HINTS 5 u64 write_hints[BLK_MAX_WRITE_HINTS]; + + struct blk_io_hierarchy_stats *io_hierarchy_stats; };
#define QUEUE_FLAG_QUEUED 0 /* uses generic tag queueing */ @@ -1372,6 +1380,7 @@ struct blk_plug { struct list_head list; /* requests */ struct list_head mq_list; /* blk-mq requests */ struct list_head cb_list; /* md requires an unplug callback */ + u64 cur_ktime; }; #define BLK_MAX_REQUEST_COUNT 16 #define BLK_PLUG_FLUSH_SIZE (128 * 1024)
Feedback: The patch(es) you sent to the kernel@openeuler.org mailing list have been converted to a pull request successfully! Pull request link: https://gitee.com/openeuler/kernel/pulls/10692 Mailing list address: https://mailweb.openeuler.org/hyperkitty/list/kernel@openeuler.org/message/I...