hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/release-management/issues/IB4E8P
CVE: NA
--------------------------------
Include the main structure definitions and provide helpers for the different IO stages to record IO stats and dump inflight IOs.
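A bio based stage is expected to bracket the window being measured with the paired accounting helpers added here. A minimal sketch, with the throttle stage as a purely illustrative caller:

	/* the bio enters the stage, e.g. is about to be throttled */
	bio_hierarchy_start_io_acct(bio, STAGE_THROTTLE);
	...
	/* the bio leaves the stage */
	bio_hierarchy_end_io_acct(bio, STAGE_THROTTLE);

Request based stages use rq_hierarchy_start_io_acct()/rq_hierarchy_end_io_acct() in the same way, with latency recorded in jiffies instead of nanoseconds. With CONFIG_HIERARCHY_IO_DUMP enabled, inflight IOs of a registered stage can be read from the per-stage debugfs entry 'io_dump', filtered by the latency threshold (in ms) written to the per-stage 'threshold' entry.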
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
---
 block/bio.c                      |   5 +
 block/blk-flush.c                |   4 +
 block/blk-io-hierarchy/Kconfig   |  13 +
 block/blk-io-hierarchy/Makefile  |   1 +
 block/blk-io-hierarchy/debugfs.c | 134 +++++-
 block/blk-io-hierarchy/iodump.c  | 745 +++++++++++++++++++++++++++++++
 block/blk-io-hierarchy/iodump.h  |  96 ++++
 block/blk-io-hierarchy/stats.c   | 234 +++++++++-
 block/blk-io-hierarchy/stats.h   | 245 +++++++++-
 block/blk-mq-debugfs.c           |  15 +-
 block/blk-mq-debugfs.h           |   1 +
 block/blk-mq.c                   |   2 +
 include/linux/blk-mq.h           |   5 +
 include/linux/blk_types.h        |  16 +-
 14 files changed, 1473 insertions(+), 43 deletions(-)
 create mode 100644 block/blk-io-hierarchy/iodump.c
 create mode 100644 block/blk-io-hierarchy/iodump.h
diff --git a/block/bio.c b/block/bio.c index c5569f8b65af..d64b0da22e38 100644 --- a/block/bio.c +++ b/block/bio.c @@ -297,6 +297,11 @@ void bio_init(struct bio *bio, struct block_device *bdev, struct bio_vec *table, #ifdef CONFIG_BLK_BIO_ALLOC_TASK bio->pid = get_pid(task_pid(current)); #endif + +#ifdef CONFIG_BLK_IO_HIERARCHY_STATS + bio->hierarchy_time = 0; + INIT_LIST_HEAD(&bio->hierarchy_list); +#endif } EXPORT_SYMBOL(bio_init);
diff --git a/block/blk-flush.c b/block/blk-flush.c index 4f64194f2eb6..4628a9ee1904 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -73,6 +73,7 @@ #include "blk.h" #include "blk-mq.h" #include "blk-mq-sched.h" +#include "blk-io-hierarchy/stats.h"
/* PREFLUSH/FUA sequences */ enum { @@ -343,6 +344,7 @@ static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq, flush_rq->end_io = flush_end_io; blk_rq_init_bi_alloc_time(flush_rq, first_rq); blk_mq_get_alloc_task(flush_rq, first_rq->bio); + blk_rq_hierarchy_stats_init(flush_rq); /* * Order WRITE ->end_io and WRITE rq->ref, and its pair is the one * implied in refcount_inc_not_zero() called from @@ -373,6 +375,8 @@ static enum rq_end_io_ret mq_flush_data_end_io(struct request *rq, blk_mq_put_driver_tag(rq); }
+ blk_rq_hierarchy_set_flush_done(rq); + /* * After populating an empty queue, kick it to avoid stall. Read * the comment in flush_end_io(). diff --git a/block/blk-io-hierarchy/Kconfig b/block/blk-io-hierarchy/Kconfig index a12476c73fa5..2c15b5a7a006 100644 --- a/block/blk-io-hierarchy/Kconfig +++ b/block/blk-io-hierarchy/Kconfig @@ -13,6 +13,19 @@ menuconfig BLK_IO_HIERARCHY_STATS
if BLK_IO_HIERARCHY_STATS
+config HIERARCHY_IO_DUMP
+	bool "Support dumping IOs that are throttled"
+	default n
+	select BLK_BIO_ALLOC_TIME
+	select BLK_BIO_ALLOC_TASK
+	depends on BLK_DEV_IO_TRACE
+	help
+	  Enabling this will create new debugfs entries to show users detailed
+	  information about IOs that have been submitted but are not done yet,
+	  and users can filter the results by IO stage or IO latency.
+
+	  If unsure, say N.
+
 config HIERARCHY_THROTTLE
 	bool "Enable hierarchy stats layer blk-throttle"
 	default n
diff --git a/block/blk-io-hierarchy/Makefile b/block/blk-io-hierarchy/Makefile
index 1fb663c75521..9b989d379e58 100644
--- a/block/blk-io-hierarchy/Makefile
+++ b/block/blk-io-hierarchy/Makefile
@@ -5,3 +5,4 @@ obj-$(CONFIG_BLK_IO_HIERARCHY_STATS) += blk_io_hierarchy_stats.o
blk_io_hierarchy_stats-y := stats.o debugfs.o +obj-$(CONFIG_HIERARCHY_IO_DUMP) += iodump.o diff --git a/block/blk-io-hierarchy/debugfs.c b/block/blk-io-hierarchy/debugfs.c index 9072a091c013..4cf30f172b86 100644 --- a/block/blk-io-hierarchy/debugfs.c +++ b/block/blk-io-hierarchy/debugfs.c @@ -12,13 +12,49 @@ */
#include <linux/debugfs.h> +#include <linux/blkdev.h>
+#include "../blk-mq-debugfs.h" #include "stats.h" +#include "iodump.h"
static const char *stage_name[NR_STAGE_GROUPS] = { #ifdef CONFIG_HIERARCHY_THROTTLE [STAGE_THROTTLE] = "throtl", #endif +#ifdef CONFIG_HIERARCHY_WBT + [STAGE_WBT] = "wbt", +#endif +#ifdef CONFIG_HIERARCHY_IOCOST + [STAGE_IOCOST] = "iocost", +#endif +#ifdef CONFIG_HIERARCHY_GETTAG + [STAGE_GETTAG] = "gettag", +#endif +#ifdef CONFIG_HIERARCHY_PLUG + [STAGE_PLUG] = "plug", +#endif +#ifdef CONFIG_HIERARCHY_DEADLINE + [STAGE_DEADLINE] = "deadline", +#endif +#ifdef CONFIG_HIERARCHY_BFQ + [STAGE_BFQ] = "bfq", +#endif +#ifdef CONFIG_HIERARCHY_KYBER + [STAGE_KYBER] = "kyber", +#endif +#ifdef CONFIG_HIERARCHY_HCTX + [STAGE_HCTX] = "hctx", +#endif +#ifdef CONFIG_HIERARCHY_REQUEUE + [STAGE_REQUEUE] = "requeue", +#endif +#ifdef CONFIG_HIERARCHY_RQ_DRIVER + [STAGE_RQ_DRIVER] = "rq_driver", +#endif +#ifdef CONFIG_HIERARCHY_BIO + [STAGE_BIO] = "bio", +#endif };
const char *hierarchy_stage_name(enum stage_group stage) @@ -26,26 +62,33 @@ const char *hierarchy_stage_name(enum stage_group stage) return stage_name[stage]; }
-static int hierarchy_stats_show(void *data, struct seq_file *m) +static int __hierarchy_stats_show(struct hierarchy_stats_data *hstats_data, + struct seq_file *m, enum stage_group stage) { - struct hierarchy_stage *hstage = data; - int cpu; u64 dispatched[NR_STAT_GROUPS] = {0}; u64 completed[NR_STAT_GROUPS] = {0}; u64 latency[NR_STAT_GROUPS] = {0}; + int cpu; + int i;
for_each_possible_cpu(cpu) { - int i; - struct hierarchy_stats *stat = per_cpu_ptr(hstage->hstats, cpu); + struct hierarchy_stats *stat = + per_cpu_ptr(hstats_data->hstats, cpu);
for (i = 0; i < NR_STAT_GROUPS; ++i) { dispatched[i] += stat->dispatched[i]; completed[i] += stat->completed[i]; - latency[i] += stat->nsecs[i]; + latency[i] += stage_is_rq(stage) ? + stat->jiffies[i] : stat->nsecs[i]; } }
- seq_printf(m, "%llu %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu\n", + if (stage_is_rq(stage)) + for (i = 0; i < NR_STAT_GROUPS; ++i) + latency[i] = + jiffies_to_msecs(latency[i]) * NSEC_PER_MSEC; + + seq_printf(m, "%llu %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu", dispatched[STAT_READ], completed[STAT_READ], latency[STAT_READ], dispatched[STAT_WRITE], completed[STAT_WRITE], latency[STAT_WRITE], @@ -53,11 +96,71 @@ static int hierarchy_stats_show(void *data, struct seq_file *m) latency[STAT_DISCARD], dispatched[STAT_FLUSH], completed[STAT_FLUSH], latency[STAT_FLUSH]);
+ hierarchy_show_slow_io(hstats_data, m); + seq_putc(m, '\n'); return 0; }
-static struct blk_mq_debugfs_attr hierarchy_debugfs_attrs[] = { - {"stats", 0400, hierarchy_stats_show}, +static void *hierarchy_stats_start(struct seq_file *m, loff_t *pos) +{ + enum stage_group stage = *pos; + + if (stage < 0 || stage >= NR_STAGE_GROUPS) + return NULL; + + return pos; +} + +static void *hierarchy_stats_next(struct seq_file *m, void *v, loff_t *pos) +{ + enum stage_group stage = ++(*pos); + + if (stage >= 0 && stage < NR_STAGE_GROUPS) + return pos; + + return NULL; +} + +static void hierarchy_stats_stop(struct seq_file *m, void *v) +{ +} + +static int hierarchy_stats_show(struct seq_file *m, void *v) +{ + enum stage_group stage = (*(loff_t *)v); + struct blk_io_hierarchy_stats *stats = m->private; + struct hierarchy_stats_data *hstats_data = get_hstats_data(stats, stage); + + if (!hstats_data) + return 0; + + seq_printf(m, "%s ", hierarchy_stage_name(stage)); + __hierarchy_stats_show(hstats_data, m, stage); + put_hstats_data(stats, hstats_data); + return 0; +} + +static const struct seq_operations hierarchy_stats_ops = { + .start = hierarchy_stats_start, + .next = hierarchy_stats_next, + .stop = hierarchy_stats_stop, + .show = hierarchy_stats_show, +}; + +static int hierarchy_stats_show_single(void *v, struct seq_file *m) +{ + struct hierarchy_stage *hstage = v; + + return __hierarchy_stats_show(hstage->hstats_data, m, hstage->stage); +} + +static const struct blk_mq_debugfs_attr hierarchy_debugfs_attrs[] = { + {"stats", 0400, hierarchy_stats_show_single}, + {}, +}; + +static const struct blk_mq_debugfs_attr hierarchy_stats_attr[] = { + {"stats", 0400, .seq_ops = &hierarchy_stats_ops}, {}, };
@@ -76,6 +179,7 @@ static void hierarchy_register_stage(struct blk_io_hierarchy_stats *stats,
hstage->debugfs_dir = dir; debugfs_create_files(dir, hstage, hierarchy_debugfs_attrs); + io_hierarchy_register_iodump(hstage); }
static void hierarchy_unregister_stage(struct blk_io_hierarchy_stats *stats, @@ -117,3 +221,15 @@ void blk_mq_debugfs_unregister_hierarchy(struct request_queue *q,
 	hierarchy_unregister_stage(stats, stage);
 }
+
+void blk_mq_debugfs_create_default_hierarchy_attr(struct request_queue *q)
+{
+	struct blk_io_hierarchy_stats *stats = q->io_hierarchy_stats;
+
+	lockdep_assert_held(&q->debugfs_mutex);
+
+	if (!blk_mq_debugfs_enabled(q))
+		return;
+
+	debugfs_create_files(stats->debugfs_dir, stats, hierarchy_stats_attr);
+}
diff --git a/block/blk-io-hierarchy/iodump.c b/block/blk-io-hierarchy/iodump.c
new file mode 100644
index 000000000000..d7fcc458acb3
--- /dev/null
+++ b/block/blk-io-hierarchy/iodump.c
@@ -0,0 +1,745 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2024. Huawei Technologies Co., Ltd. All rights reserved.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/seq_file.h>
+#include <linux/blktrace_api.h>
+#include <linux/blk-cgroup.h>
+
+#include "iodump.h"
+#include "../blk.h"
+#include "../blk-mq.h"
+#include "../blk-cgroup.h"
+#include "../blk-mq-debugfs.h"
+
+#define RWB_LEN 6
+#define PATH_LEN 64
+#define ms_to_ns(time) ((time) * NSEC_PER_MSEC)
+#define DEFAULT_THRESHOLD 1000
+
+static DEFINE_MUTEX(dump_mutex);
+
+struct bio_dump_data {
+	u64 stat_time;
+	struct list_head head;
+	spinlock_t lock;
+};
+
+struct rq_dump_data {
+	struct request_queue *q;
+	enum stage_group stage;
+	unsigned int tag;
+	unsigned int total_tags;
+	bool shared;
+	bool has_elevator;
+	bool enter_queue;
+};
+
+#ifdef CONFIG_HIERARCHY_BIO
+struct pos_data {
+	enum stage_group stage;
+	unsigned int count;
+};
+
+struct bio_stage_dump_data {
+	union {
+		loff_t pos;
+		struct pos_data pdata;
+	};
+	struct rq_dump_data rq_ddata;
+	u64 stat_time;
+};
+#endif
+
+int blk_io_hierarchy_iodump_init(struct request_queue *q,
+				 struct hierarchy_stage *hstage)
+{
+	hstage->threshold = DEFAULT_THRESHOLD;
+
+	if (stage_is_bio(hstage->stage)) {
+		struct bio_dump_data *bio_ddata =
+			kmalloc(sizeof(*bio_ddata), GFP_KERNEL);
+
+		if (!bio_ddata)
+			return -ENOMEM;
+
+		INIT_LIST_HEAD(&bio_ddata->head);
+		spin_lock_init(&bio_ddata->lock);
+		hstage->dump_data = bio_ddata;
+		return 0;
+	}
+
+	if (stage_is_rq(hstage->stage)) {
+		struct rq_dump_data *rq_ddata =
+			kzalloc(sizeof(*rq_ddata), GFP_KERNEL);
+
+		if (!rq_ddata)
+			return -ENOMEM;
+
+		rq_ddata->q = q;
+		rq_ddata->stage = hstage->stage;
+		hstage->dump_data = rq_ddata;
+		return 0;
+	}
+
+#ifdef CONFIG_HIERARCHY_BIO
+	BUILD_BUG_ON(sizeof(struct pos_data) != sizeof(loff_t));
+
+	if (hstage->stage == STAGE_BIO) {
+		struct bio_stage_dump_data *bstage_ddata =
+			kzalloc(sizeof(*bstage_ddata), GFP_KERNEL);
+
+		if (!bstage_ddata)
+			return -ENOMEM;
+
+		bstage_ddata->rq_ddata.q = q;
+		bstage_ddata->rq_ddata.stage = hstage->stage;
+		hstage->dump_data = bstage_ddata;
+		return 0;
+	}
+#endif
+
+	return -EINVAL;
+}
+
+void blk_io_hierarchy_iodump_exit(struct request_queue *q,
+				  enum stage_group stage)
+{
+	struct hierarchy_stage *hstage = q->io_hierarchy_stats->hstage[stage];
+
+	if (stage_is_bio(hstage->stage)) {
+		struct bio_dump_data *bio_ddata = hstage->dump_data;
+
+		WARN(!list_empty(&bio_ddata->head),
+		     "blk-io-hierarchy: disk %s stage %s unregistered with throttled IO.\n",
+		     kobject_name(q->mq_kobj->parent),
+		     hierarchy_stage_name(stage));
+	}
+
+	kfree(hstage->dump_data);
+	hstage->dump_data = NULL;
+}
+
+void hierarchy_add_bio(struct hierarchy_stage *hstage, struct bio *bio)
+{
+	unsigned long flags;
+	struct bio_dump_data *bio_ddata = hstage->dump_data;
+
+	spin_lock_irqsave(&bio_ddata->lock, flags);
+	list_add_tail(&bio->hierarchy_list, &bio_ddata->head);
+	spin_unlock_irqrestore(&bio_ddata->lock, flags);
+}
+
+void hierarchy_remove_bio(struct hierarchy_stage *hstage, struct bio *bio)
+{
+	unsigned long flags;
+	struct bio_dump_data *bio_ddata = hstage->dump_data;
+
+	spin_lock_irqsave(&bio_ddata->lock, flags);
+	list_del_init(&bio->hierarchy_list);
+	spin_unlock_irqrestore(&bio_ddata->lock, flags);
+}
+
+static void *bio_hierarchy_list_start(struct seq_file *m, loff_t *pos)
+	__acquires(&bio_ddata->lock)
+{
+	struct hierarchy_stage *hstage = m->private;
+	struct bio_dump_data *bio_ddata = hstage->dump_data;
+
+	spin_lock_irq(&bio_ddata->lock);
+	bio_ddata->stat_time = blk_time_get_ns();
+
+	return seq_list_start(&bio_ddata->head, *pos);
+}
+
+static void *bio_hierarchy_list_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	struct hierarchy_stage *hstage = m->private;
+	struct bio_dump_data *bio_ddata = hstage->dump_data;
+
+	return seq_list_next(v, &bio_ddata->head, pos);
+}
+
+static void bio_hierarchy_list_stop(struct seq_file *m, void *v)
+	__releases(&bio_ddata->lock)
+{
+	struct hierarchy_stage *hstage = m->private;
+	struct bio_dump_data *bio_ddata = hstage->dump_data;
+
+	spin_unlock_irq(&bio_ddata->lock);
+}
+
+static void __hierarchy_show_bio(struct seq_file *m, struct bio *bio,
+				 enum stage_group stage, u64 duration)
+{
+	char rwbs[RWB_LEN];
+	char path[PATH_LEN] = {0};
+	struct task_struct *task = get_pid_task(bio->pid, PIDTYPE_PID);
+
+	blk_fill_rwbs(rwbs, bio->bi_opf);
+	blkg_path(bio->bi_blkg, path, PATH_LEN);
+
+	seq_printf(m, "%s-%d %s stage %s bio %s %llu + %u cgroup %s started %llu ns ago\n",
+		   task ? task->comm : "null", task ? task->pid : 0,
+		   bio->bi_bdev->bd_disk->disk_name,
+		   hierarchy_stage_name(stage), rwbs, bio->bi_iter.bi_sector,
+		   bio_sectors(bio), path, duration);
+
+	if (task)
+		put_task_struct(task);
+}
+
+static u64 get_duration(u64 a, u64 b)
+{
+	return a > b ? a - b : 0;
+}
+
+static void hierarchy_show_bio(struct seq_file *m, struct bio *bio)
+{
+	u64 duration;
+	struct hierarchy_stage *hstage = m->private;
+	struct bio_dump_data *bio_ddata = hstage->dump_data;
+
+	duration = get_duration(bio_ddata->stat_time, bio->hierarchy_time);
+	if (hstage->threshold > ns_to_ms(duration))
+		return;
+
+	__hierarchy_show_bio(m, bio, hstage->stage, duration);
+}
+
+static int bio_hierarchy_list_show(struct seq_file *m, void *v)
+{
+	struct bio *bio = list_entry(v, struct bio, hierarchy_list);
+
+	hierarchy_show_bio(m, bio);
+	return 0;
+}
+
+static const struct seq_operations hierarchy_bio_dump_ops = {
+	.start = bio_hierarchy_list_start,
+	.next = bio_hierarchy_list_next,
+	.stop = bio_hierarchy_list_stop,
+	.show = bio_hierarchy_list_show,
+};
+
+static int threshold_show(void *data, struct seq_file *m)
+{
+	struct hierarchy_stage *hstage = data;
+
+	seq_printf(m, "%lu\n", hstage->threshold);
+	return 0;
+}
+
+/*
+ * max size needed by different bases to express U64
+ * HEX: "0xFFFFFFFFFFFFFFFF" --> 18
+ * DEC: "18446744073709551615" --> 20
+ * OCT: "01777777777777777777777" --> 23
+ * pick the max one to define MAX_BUF_LEN
+ */
+#define MAX_BUF_LEN 24
+static ssize_t threshold_store(void *data, const char __user *buf, size_t count,
+			       loff_t *ppos)
+{
+	int err;
+	unsigned long val;
+	char b[MAX_BUF_LEN + 1];
+	struct hierarchy_stage *hstage = data;
+
+	if (count > MAX_BUF_LEN)
+		return -EINVAL;
+
+	if (copy_from_user(b, buf, count))
+		return -EFAULT;
+
+	b[count] = 0;
+	err = kstrtoul(b, 0, &val);
+	if (!err)
+		hstage->threshold = val;
+
+	return err ? err : count;
+}
+
+static void rq_hierarchy_init_dump_data(struct rq_dump_data *rq_ddata)
+{
+	struct request_queue *q = rq_ddata->q;
+
+	rq_ddata->shared = blk_mq_is_shared_tags(q->tag_set->flags);
+	rq_ddata->has_elevator = !!q->elevator;
+
+	if (rq_ddata->shared)
+		rq_ddata->total_tags = rq_ddata->has_elevator ?
+				       q->nr_requests :
+				       q->tag_set->shared_tags->nr_tags;
+	else if (rq_ddata->has_elevator)
+		rq_ddata->total_tags = q->nr_hw_queues * q->nr_requests;
+	else
+		rq_ddata->total_tags = q->nr_hw_queues * q->tag_set->queue_depth;
+}
+
+static bool __rq_hierarchy_start(struct rq_dump_data *rq_ddata,
+				 unsigned int tag)
+{
+	/*
+	 * Grab .q_usage_counter so request pool won't go away, then no
+	 * request use-after-free is possible during iteration. If queue is
+	 * frozen, there won't be any inflight requests.
+	 */
+	if (!percpu_ref_tryget(&rq_ddata->q->q_usage_counter)) {
+		rq_ddata->enter_queue = false;
+		return false;
+	}
+
+	rq_ddata->enter_queue = true;
+	rq_hierarchy_init_dump_data(rq_ddata);
+	rq_ddata->tag = tag;
+
+	return tag < rq_ddata->total_tags + rq_ddata->q->nr_hw_queues;
+}
+
+static bool __rq_hierarchy_next(struct rq_dump_data *rq_ddata)
+{
+	rq_ddata->tag++;
+
+	return rq_ddata->tag < rq_ddata->total_tags + rq_ddata->q->nr_hw_queues;
+}
+
+static void __rq_hierarchy_stop(struct rq_dump_data *rq_ddata)
+{
+	if (rq_ddata->enter_queue) {
+		percpu_ref_put(&rq_ddata->q->q_usage_counter);
+		rq_ddata->enter_queue = false;
+	}
+}
+
+static void *rq_hierarchy_start(struct seq_file *m, loff_t *pos)
+	__acquires(&dump_mutex)
+{
+	struct hierarchy_stage *hstage = m->private;
+	struct rq_dump_data *rq_ddata = hstage->dump_data;
+
+	mutex_lock(&dump_mutex);
+
+	if (__rq_hierarchy_start(rq_ddata, *pos))
+		return rq_ddata;
+
+	return NULL;
+}
+
+static void *rq_hierarchy_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	struct rq_dump_data *rq_ddata = v;
+
+	if (__rq_hierarchy_next(rq_ddata)) {
+		*pos = rq_ddata->tag;
+		return rq_ddata;
+	}
+
+	(*pos)++;
+	return NULL;
+}
+
+static void rq_hierarchy_stop(struct seq_file *m, void *v)
+	__releases(&dump_mutex)
+{
+	struct hierarchy_stage *hstage = m->private;
+	struct rq_dump_data *rq_ddata = hstage->dump_data;
+
+	__rq_hierarchy_stop(rq_ddata);
+	mutex_unlock(&dump_mutex);
+}
+
+static struct request *hierarchy_find_and_get_rq(struct rq_dump_data *rq_ddata)
+{
+	struct request *rq;
+	struct blk_mq_hw_ctx *hctx;
+	struct request_queue *q = rq_ddata->q;
+	unsigned int nr_tag = rq_ddata->tag;
+	unsigned int hctx_id;
+
+	if (nr_tag >= rq_ddata->total_tags) {
+		hctx_id = nr_tag - rq_ddata->total_tags;
+		if (hctx_id >= q->nr_hw_queues)
+			return NULL;
+
+		hctx = xa_load(&q->hctx_table, hctx_id);
+		rq = hctx->fq->flush_rq;
+	} else if (rq_ddata->shared) {
+		struct blk_mq_tags *tags = rq_ddata->has_elevator ?
+			q->sched_shared_tags : q->tag_set->shared_tags;
+
+		rq = tags->static_rqs[nr_tag];
+	} else if (rq_ddata->has_elevator) {
+		hctx_id = nr_tag / q->nr_requests;
+		if (hctx_id >= q->nr_hw_queues)
+			return NULL;
+
+		hctx = xa_load(&q->hctx_table, hctx_id);
+		rq = hctx->sched_tags->static_rqs[nr_tag % q->nr_requests];
+	} else {
+		hctx_id = nr_tag / q->tag_set->queue_depth;
+		if (hctx_id >= q->nr_hw_queues)
+			return NULL;
+
+		hctx = xa_load(&q->hctx_table, hctx_id);
+		if (!hctx->tags)
+			return NULL;
+
+		rq = hctx->tags->static_rqs[nr_tag % q->tag_set->queue_depth];
+	}
+
+	/*
+	 * fast path to avoid refcount cas operations for the request that
+	 * is from other shared request_queue or other stages.
+	 */
+	if (rq->q != q || (rq_ddata->stage != STAGE_BIO &&
+	    READ_ONCE(rq->stage) != rq_ddata->stage))
+		return NULL;
+
+	if (!req_ref_inc_not_zero(rq))
+		return NULL;
+
+	/* Check again after request is pinned, in case request is reused. */
+	if (rq->q != q) {
+		blk_mq_put_rq_ref(rq);
+		return NULL;
+	}
+
+	if (rq_ddata->stage == STAGE_BIO)
+		return rq;
+
+	/*
+	 * Barrier is paired with the smp_store_release() in
+	 * __rq_hierarchy_start_io_acct(), so that if stage is read, uninitialized
+	 * hierarchy_time won't be read.
+ */ + if (smp_load_acquire(&rq->stage) != rq_ddata->stage) { + blk_mq_put_rq_ref(rq); + return NULL; + } + + return rq; +} + +static void hierarchy_show_rq(struct seq_file *m, struct request *rq, + u64 duration) +{ + struct task_struct *task = get_pid_task(rq->pid, PIDTYPE_PID); + const char *name = hierarchy_stage_name(rq->stage); + + seq_printf(m, "%s-%d %s stage %s ", task ? task->comm : "null", + task ? task->pid : 0, + rq->q->disk ? rq->q->disk->disk_name : "?", + name ? name : "?"); + debugfs_rq_show(m, rq); + seq_printf(m, " started %llu ns ago}\n", duration); + + if (task) + put_task_struct(task); +} + +static int rq_hierarchy_show(struct seq_file *m, void *v) +{ + u64 duration; + unsigned long htime; + struct hierarchy_stage *hstage = m->private; + struct request *rq = hierarchy_find_and_get_rq(v); + + if (!rq) + return 0; + + htime = READ_ONCE(rq->hierarchy_time); + htime = time_after(jiffies, htime) ? jiffies - htime : 0; + duration = jiffies_to_msecs(htime); + if (hstage->threshold <= duration) + hierarchy_show_rq(m, rq, ms_to_ns(duration)); + + blk_mq_put_rq_ref(rq); + return 0; +} + +static const struct seq_operations hierarchy_rq_dump_ops = { + .start = rq_hierarchy_start, + .next = rq_hierarchy_next, + .stop = rq_hierarchy_stop, + .show = rq_hierarchy_show, +}; + +static const struct blk_mq_debugfs_attr hierarchy_threshold_attr[] = { + { + "threshold", + 0600, + threshold_show, + threshold_store, + }, + {}, +}; + +static const struct blk_mq_debugfs_attr hierarchy_bio_dump_attr[] = { + { + "io_dump", + 0400, + .seq_ops = &hierarchy_bio_dump_ops, + }, + {}, +}; + +static const struct blk_mq_debugfs_attr hierarchy_rq_dump_attr[] = { + { + "io_dump", + 0400, + .seq_ops = &hierarchy_rq_dump_ops, + }, + {}, +}; + +#ifdef CONFIG_HIERARCHY_BIO +static struct bio_dump_data *get_bio_stage_ddata(struct request_queue *q, + enum stage_group stage) +{ + struct blk_io_hierarchy_stats *stats = q->io_hierarchy_stats; + struct hierarchy_stage *hstage = READ_ONCE(stats->hstage[stage]); + + if (!hstage) + return NULL; + + return hstage->dump_data; +} + +static void bio_stage_start_next_stage(struct bio_stage_dump_data *bstage_ddata, + loff_t *pos) +{ + struct pos_data *pdata = &bstage_ddata->pdata; + + pdata->stage++; + if (!stage_is_bio(pdata->stage)) + pdata->stage = STAGE_BIO; + pdata->count = 0; + + *pos = bstage_ddata->pos; +} + +static void bio_stage_start_next_io(struct bio_stage_dump_data *bstage_ddata, + loff_t *pos) +{ + struct pos_data *pdata = &bstage_ddata->pdata; + + if (stage_is_bio(pdata->stage)) + pdata->count++; + else + pdata->count = bstage_ddata->rq_ddata.tag; + + *pos = bstage_ddata->pos; +} + +static void __bio_stage_hierarchy_stop(struct bio_stage_dump_data *bstage_ddata) +{ + struct pos_data *pdata = &bstage_ddata->pdata; + struct rq_dump_data *rq_ddata = &bstage_ddata->rq_ddata; + + if (stage_is_bio(pdata->stage)) { + struct bio_dump_data *bio_ddata = + get_bio_stage_ddata(rq_ddata->q, pdata->stage); + + spin_unlock_irq(&bio_ddata->lock); + } + + if (rq_ddata->enter_queue) { + percpu_ref_put(&rq_ddata->q->q_usage_counter); + rq_ddata->enter_queue = false; + } +} + +void *__bio_stage_hierarchy_start(struct bio_stage_dump_data *bstage_ddata, + loff_t *pos) +{ + struct pos_data *pdata = &bstage_ddata->pdata; + struct rq_dump_data *rq_ddata = &bstage_ddata->rq_ddata; + +retry: + if (stage_is_bio(pdata->stage)) { + struct list_head *list; + struct bio_dump_data *bio_ddata = + get_bio_stage_ddata(rq_ddata->q, pdata->stage); + + if (!bio_ddata) { + 
bio_stage_start_next_stage(bstage_ddata, pos); + goto retry; + } + + spin_lock_irq(&bio_ddata->lock); + list = seq_list_start(&bio_ddata->head, pdata->count); + if (list) + return list; + + spin_unlock_irq(&bio_ddata->lock); + bio_stage_start_next_stage(bstage_ddata, pos); + goto retry; + } + + if (pdata->stage == STAGE_BIO && + __rq_hierarchy_start(rq_ddata, pdata->count)) + return bstage_ddata; + + return NULL; +} + +static void *bio_stage_hierarchy_start(struct seq_file *m, loff_t *pos) +{ + struct hierarchy_stage *hstage = m->private; + struct bio_stage_dump_data *bstage_ddata = hstage->dump_data; + + mutex_lock(&dump_mutex); + bstage_ddata->pos = *pos; + bstage_ddata->stat_time = blk_time_get_ns(); + + return __bio_stage_hierarchy_start(bstage_ddata, pos); +} + +static void *bio_stage_hierarchy_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct hierarchy_stage *hstage = m->private; + struct bio_stage_dump_data *bstage_ddata = hstage->dump_data; + struct rq_dump_data *rq_ddata = &bstage_ddata->rq_ddata; + struct pos_data *pdata = &bstage_ddata->pdata; + + if (stage_is_bio(pdata->stage)) { + struct bio_dump_data *bio_ddata = + get_bio_stage_ddata(rq_ddata->q, pdata->stage); + struct list_head *list = ((struct list_head *)v)->next; + + if (list != &bio_ddata->head) { + bio_stage_start_next_io(bstage_ddata, pos); + return list; + } + + spin_unlock_irq(&bio_ddata->lock); + + bio_stage_start_next_stage(bstage_ddata, pos); + return __bio_stage_hierarchy_start(bstage_ddata, pos); + } + + if (pdata->stage == STAGE_BIO && + __rq_hierarchy_next(rq_ddata)) { + bio_stage_start_next_io(bstage_ddata, pos); + return bstage_ddata; + } + + (*pos)++; + return NULL; +} + +static void bio_stage_hierarchy_stop(struct seq_file *m, void *v) +{ + struct hierarchy_stage *hstage = m->private; + struct bio_stage_dump_data *bstage_ddata = hstage->dump_data; + + __bio_stage_hierarchy_stop(bstage_ddata); + mutex_unlock(&dump_mutex); +} + +static int bio_stage_hierarchy_show(struct seq_file *m, void *v) +{ + struct hierarchy_stage *hstage = m->private; + struct bio_stage_dump_data *bstage_ddata = hstage->dump_data; + struct rq_dump_data *rq_ddata = &bstage_ddata->rq_ddata; + struct pos_data *pdata = &bstage_ddata->pdata; + u64 duration; + + if (stage_is_bio(pdata->stage)) { + struct bio *bio = list_entry(v, struct bio, hierarchy_list); + + duration = get_duration(bstage_ddata->stat_time, + bio->bi_alloc_time_ns); + if (hstage->threshold <= ns_to_ms(duration)) + __hierarchy_show_bio(m, bio, pdata->stage, duration); + } else if (pdata->stage == STAGE_BIO) { + struct request *rq = hierarchy_find_and_get_rq(rq_ddata); + + if (rq) { + duration = get_duration(bstage_ddata->stat_time, + rq->bi_alloc_time_ns); + if (hstage->threshold <= ns_to_ms(duration)) + hierarchy_show_rq(m, rq, duration); + blk_mq_put_rq_ref(rq); + } + } + + return 0; +} + +static const struct seq_operations bio_stage_hierarchy_ops = { + .start = bio_stage_hierarchy_start, + .next = bio_stage_hierarchy_next, + .stop = bio_stage_hierarchy_stop, + .show = bio_stage_hierarchy_show, +}; + +static const struct blk_mq_debugfs_attr bio_stage_dump_attr[] = { + { + "io_dump", + 0400, + .seq_ops = &bio_stage_hierarchy_ops, + }, + {}, +}; + +#else /* CONFIG_HIERARCHY_BIO */ +static const struct blk_mq_debugfs_attr bio_stage_dump_attr[] = { + {}, +}; + +#endif + +void io_hierarchy_register_iodump(struct hierarchy_stage *hstage) +{ + const struct blk_mq_debugfs_attr *attr; + + if (stage_is_bio(hstage->stage)) + attr = hierarchy_bio_dump_attr; + else if 
(stage_is_rq(hstage->stage)) + attr = hierarchy_rq_dump_attr; + else if (hstage->stage == STAGE_BIO) + attr = bio_stage_dump_attr; + else + attr = NULL; + + debugfs_create_files(hstage->debugfs_dir, hstage, + hierarchy_threshold_attr); + if (attr) + debugfs_create_files(hstage->debugfs_dir, hstage, attr); +} + +void hierarchy_account_slow_io(struct hierarchy_stage *hstage, + enum stat_group op, unsigned long duration) +{ + if (hstage->threshold <= duration) + this_cpu_inc(hstage->hstats_data->hstats->slow[op]); +} + +void hierarchy_show_slow_io(struct hierarchy_stats_data *hstats_data, + struct seq_file *m) +{ + u64 slow[NR_STAT_GROUPS] = {0}; + int cpu; + int i; + + for_each_possible_cpu(cpu) { + struct hierarchy_stats *stat = + per_cpu_ptr(hstats_data->hstats, cpu); + + for (i = 0; i < NR_STAT_GROUPS; ++i) + slow[i] += stat->slow[i]; + } + + seq_printf(m, " %llu %llu %llu %llu", slow[STAT_READ], slow[STAT_WRITE], + slow[STAT_DISCARD], slow[STAT_FLUSH]); +} diff --git a/block/blk-io-hierarchy/iodump.h b/block/blk-io-hierarchy/iodump.h new file mode 100644 index 000000000000..2f14999e42b5 --- /dev/null +++ b/block/blk-io-hierarchy/iodump.h @@ -0,0 +1,96 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2024. Huawei Technologies Co., Ltd. All rights reserved. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#ifndef BLK_IO_HIERARCHY_IODUMP_H +#define BLK_IO_HIERARCHY_IODUMP_H + +#ifdef CONFIG_HIERARCHY_IO_DUMP + +#include "stats.h" + +#define ns_to_ms(time) div_u64(time, NSEC_PER_MSEC) + +int blk_io_hierarchy_iodump_init(struct request_queue *q, + struct hierarchy_stage *hstage); +void blk_io_hierarchy_iodump_exit(struct request_queue *q, + enum stage_group stage); +void hierarchy_add_bio(struct hierarchy_stage *hstage, struct bio *bio); +void hierarchy_remove_bio(struct hierarchy_stage *hstage, struct bio *bio); +void io_hierarchy_register_iodump(struct hierarchy_stage *hstage); + +void hierarchy_account_slow_io(struct hierarchy_stage *hstage, + enum stat_group op, unsigned long duration); +void hierarchy_show_slow_io(struct hierarchy_stats_data *hstats_data, + struct seq_file *m); + +static inline void +hierarchy_account_slow_io_ns(struct hierarchy_stage *hstage, + enum stat_group op, u64 duration) +{ + hierarchy_account_slow_io(hstage, op, ns_to_ms(duration)); +} + +static inline void +hierarchy_account_slow_io_jiffies(struct hierarchy_stage *hstage, + enum stat_group op, unsigned long duration) +{ + hierarchy_account_slow_io(hstage, op, jiffies_to_msecs(duration)); +} + +#else +static inline int +blk_io_hierarchy_iodump_init(struct request_queue *q, + struct hierarchy_stage *hstage) +{ + return 0; +} + +static inline void +blk_io_hierarchy_iodump_exit(struct request_queue *q, enum stage_group stage) +{ +} + +static inline void +hierarchy_add_bio(struct hierarchy_stage *hstage, struct bio *bio) +{ +} + +static inline void +hierarchy_remove_bio(struct hierarchy_stage *hstage, struct bio *bio) +{ +} + +static inline void +io_hierarchy_register_iodump(struct hierarchy_stage *hstage) +{ +} + +static inline void +hierarchy_account_slow_io_ns(struct hierarchy_stage 
*hstage, + enum stat_group op, u64 duration) +{ +} + +static inline void +hierarchy_account_slow_io_jiffies(struct hierarchy_stage *hstage, + enum stat_group op, unsigned long duration) +{ +} + +static inline void +hierarchy_show_slow_io(struct hierarchy_stats_data *hstats_data, + struct seq_file *m) +{ +} +#endif +#endif diff --git a/block/blk-io-hierarchy/stats.c b/block/blk-io-hierarchy/stats.c index e717bf790709..01b08fe5e8bb 100644 --- a/block/blk-io-hierarchy/stats.c +++ b/block/blk-io-hierarchy/stats.c @@ -11,14 +11,16 @@ * GNU General Public License for more details. */
+#include <linux/module.h> #include <linux/debugfs.h>
#include "stats.h" +#include "iodump.h" #include "../blk.h" #include "../blk-mq-debugfs.h"
#define io_hierarchy_add(statsp, field, group, nr) \ - this_cpu_add((statsp)->field[group], nr) + this_cpu_add((statsp)->hstats->field[group], nr) #define io_hierarchy_inc(statsp, field, group) \ io_hierarchy_add(statsp, field, group, 1)
@@ -35,6 +37,7 @@ void blk_mq_debugfs_register_hierarchy_stats(struct request_queue *q)
stats->debugfs_dir = debugfs_create_dir("blk_io_hierarchy", q->debugfs_dir); + blk_mq_debugfs_create_default_hierarchy_attr(q);
for (stage = 0; stage < NR_STAGE_GROUPS; ++stage) blk_mq_debugfs_register_hierarchy(q, stage); @@ -69,6 +72,7 @@ int blk_io_hierarchy_stats_alloc(struct request_queue *q) if (!stats) return -ENOMEM;
+ spin_lock_init(&stats->hstage_lock); stats->q = q; q->io_hierarchy_stats = stats;
@@ -96,6 +100,61 @@ bool blk_mq_hierarchy_registered(struct request_queue *q,
return stats->hstage[stage] != NULL; } +EXPORT_SYMBOL_GPL(blk_mq_hierarchy_registered); + +static struct hierarchy_stats_data *alloc_hstats_data(void) +{ + struct hierarchy_stats_data *hstats_data; + + hstats_data = kmalloc(sizeof(*hstats_data), GFP_KERNEL); + if (!hstats_data) + return NULL; + + hstats_data->hstats = alloc_percpu(struct hierarchy_stats); + if (!hstats_data->hstats) { + kfree(hstats_data); + return NULL; + } + + hstats_data->ref = 1; + return hstats_data; +} + +struct hierarchy_stats_data *get_hstats_data( + struct blk_io_hierarchy_stats *stats, + enum stage_group stage) +{ + struct hierarchy_stage *hstage; + struct hierarchy_stats_data *hstats_data = NULL; + + spin_lock(&stats->hstage_lock); + hstage = stats->hstage[stage]; + if (hstage) { + hstats_data = hstage->hstats_data; + if (hstats_data) + hstats_data->ref++; + } + spin_unlock(&stats->hstage_lock); + + return hstats_data; +} + +static void __put_hstats_data(struct blk_io_hierarchy_stats *stats, + struct hierarchy_stats_data *hstats_data) +{ + if (--hstats_data->ref == 0) { + free_percpu(hstats_data->hstats); + kfree(hstats_data); + } +} + +void put_hstats_data(struct blk_io_hierarchy_stats *stats, + struct hierarchy_stats_data *hstats_data) +{ + spin_lock(&stats->hstage_lock); + __put_hstats_data(stats, hstats_data); + spin_unlock(&stats->hstage_lock); +}
void blk_mq_register_hierarchy(struct request_queue *q, enum stage_group stage) { @@ -107,7 +166,8 @@ void blk_mq_register_hierarchy(struct request_queue *q, enum stage_group stage)
if (blk_mq_hierarchy_registered(q, stage)) { pr_warn("blk-io-hierarchy: disk %s is registering stage %s again.", - q->disk->disk_name, hierarchy_stage_name(stage)); + kobject_name(q->mq_kobj->parent), + hierarchy_stage_name(stage)); return; }
@@ -119,26 +179,31 @@ void blk_mq_register_hierarchy(struct request_queue *q, enum stage_group stage) if (!hstage) return;
- hstage->hstats = alloc_percpu(struct hierarchy_stats); - if (!hstage->hstats) { + hstage->hstats_data = alloc_hstats_data(); + if (!hstage->hstats_data) { kfree(hstage); return; }
hstage->stage = stage; + hstage->unbalanced_warned = false; hstage->debugfs_dir = NULL; + if (blk_io_hierarchy_iodump_init(q, hstage) < 0) { + put_hstats_data(stats, hstage->hstats_data); + kfree(hstage); + return; + }
blk_mq_freeze_queue(q); - blk_mq_quiesce_queue(q);
mutex_lock(&q->debugfs_mutex); - stats->hstage[stage] = hstage; + WRITE_ONCE(stats->hstage[stage], hstage); blk_mq_debugfs_register_hierarchy(q, stage); mutex_unlock(&q->debugfs_mutex);
- blk_mq_unquiesce_queue(q); blk_mq_unfreeze_queue(q); } +EXPORT_SYMBOL_GPL(blk_mq_register_hierarchy);
void blk_mq_unregister_hierarchy(struct request_queue *q, enum stage_group stage) @@ -152,21 +217,27 @@ void blk_mq_unregister_hierarchy(struct request_queue *q, mutex_lock(&q->debugfs_mutex);
blk_mq_debugfs_unregister_hierarchy(q, stage); + blk_io_hierarchy_iodump_exit(q, stage);
+ spin_lock(&stats->hstage_lock); hstage = stats->hstage[stage]; stats->hstage[stage] = NULL; - free_percpu(hstage->hstats); + __put_hstats_data(stats, hstage->hstats_data); + spin_unlock(&stats->hstage_lock); + kfree(hstage);
mutex_unlock(&q->debugfs_mutex); } +EXPORT_SYMBOL_GPL(blk_mq_unregister_hierarchy);
-static enum stat_group hierarchy_op(const struct bio *bio) +static enum stat_group bio_hierarchy_op(struct bio *bio) { if (op_is_discard(bio->bi_opf)) return STAT_DISCARD;
- if (op_is_flush(bio->bi_opf) && !bio_sectors(bio)) + if (op_is_flush(bio->bi_opf) && + !(bio_sectors(bio) || bio_flagged(bio, BIO_HAS_DATA))) return STAT_FLUSH;
if (op_is_write(bio->bi_opf)) @@ -185,32 +256,151 @@ void bio_hierarchy_start_io_acct(struct bio *bio, enum stage_group stage) return;
hstage = q->io_hierarchy_stats->hstage[stage]; - io_hierarchy_inc(hstage->hstats, dispatched, hierarchy_op(bio)); + io_hierarchy_inc(hstage->hstats_data, dispatched, bio_hierarchy_op(bio)); bio->hierarchy_time = blk_time_get_ns(); + hierarchy_add_bio(hstage, bio); }
-void bio_hierarchy_end_io_acct(struct bio *bio, enum stage_group stage, - u64 time) +void __bio_hierarchy_end_io_acct(struct bio *bio, enum stage_group stage, + u64 time) { struct request_queue *q = bio->bi_bdev->bd_queue; struct hierarchy_stage *hstage; + u64 duration; enum stat_group op;
if (!blk_mq_hierarchy_registered(q, stage)) return;
- op = hierarchy_op(bio); + op = bio_hierarchy_op(bio); + duration = time - bio->hierarchy_time; hstage = q->io_hierarchy_stats->hstage[stage]; - io_hierarchy_inc(hstage->hstats, completed, op); - io_hierarchy_add(hstage->hstats, nsecs, op, time - bio->hierarchy_time); + + hierarchy_remove_bio(hstage, bio); + io_hierarchy_inc(hstage->hstats_data, completed, op); + io_hierarchy_add(hstage->hstats_data, nsecs, op, duration); + hierarchy_account_slow_io_ns(hstage, op, duration); +} + +static enum stat_group rq_hierarchy_op(struct request *rq) +{ + if (op_is_discard(rq->cmd_flags)) + return STAT_DISCARD; + + if (is_flush_rq(rq)) + return STAT_FLUSH; + + if (op_is_write(rq->cmd_flags)) + return STAT_WRITE; + + return STAT_READ; }
-void bio_list_hierarchy_end_io_acct(struct bio_list *list, - enum stage_group stage) +static void rq_hierarchy_warn_unbalanced(struct request *rq, + struct hierarchy_stage *hstage, + enum stage_group old_stage, + enum stage_group new_stage) { - u64 time = blk_time_get_ns(); - struct bio *bio; + if (hstage->unbalanced_warned) + return;
-	bio_list_for_each(bio, list)
-		bio_hierarchy_end_io_acct(bio, stage, time);
+	pr_warn("blk-io-hierarchy: disk %s stage %d(%s) -> %d(%s) unbalanced accounting.",
+		kobject_name(rq->q->mq_kobj->parent),
+		old_stage, hierarchy_stage_name(old_stage),
+		new_stage, hierarchy_stage_name(new_stage));
+	hstage->unbalanced_warned = true;
 }
+
+void blk_rq_hierarchy_stats_complete(struct request *rq)
+{
+	struct hierarchy_stage *hstage;
+	enum stage_group stage;
+
+	stage = rq->stage;
+	if (stage == NR_RQ_STAGE_GROUPS)
+		return;
+
+	if (!blk_mq_hierarchy_registered(rq->q, stage))
+		return;
+
+	hstage = rq->q->io_hierarchy_stats->hstage[stage];
+	rq_hierarchy_warn_unbalanced(rq, hstage, stage, NR_RQ_STAGE_GROUPS);
+	__rq_hierarchy_end_io_acct(rq, hstage);
+}
+
+void __rq_hierarchy_start_io_acct(struct request *rq,
+				  struct hierarchy_stage *hstage)
+{
+	blk_rq_hierarchy_stats_complete(rq);
+	io_hierarchy_inc(hstage->hstats_data, dispatched, rq_hierarchy_op(rq));
+	WRITE_ONCE(rq->hierarchy_time, jiffies);
+
+	/*
+	 * Paired with barrier in hierarchy_find_and_get_rq(), make sure
+	 * hierarchy_time is set before stage.
+	 */
+	smp_store_release(&rq->stage, hstage->stage);
+}
+EXPORT_SYMBOL_GPL(__rq_hierarchy_start_io_acct);
+
+void __rq_hierarchy_end_io_acct(struct request *rq,
+				struct hierarchy_stage *hstage)
+{
+	enum stat_group op;
+	unsigned long duration;
+
+	if (rq->stage != hstage->stage) {
+		rq_hierarchy_warn_unbalanced(rq, hstage, rq->stage,
+					     hstage->stage);
+		return;
+	}
+
+	op = rq_hierarchy_op(rq);
+	duration = jiffies - rq->hierarchy_time;
+
+	io_hierarchy_inc(hstage->hstats_data, completed, op);
+	io_hierarchy_add(hstage->hstats_data, jiffies, op, duration);
+	hierarchy_account_slow_io_jiffies(hstage, op, duration);
+	WRITE_ONCE(rq->stage, NR_RQ_STAGE_GROUPS);
+}
+EXPORT_SYMBOL_GPL(__rq_hierarchy_end_io_acct);
+
+#ifdef CONFIG_HIERARCHY_BIO
+void bio_hierarchy_start(struct bio *bio)
+{
+	struct gendisk *disk = bio->bi_bdev->bd_disk;
+	struct hierarchy_stage *hstage;
+
+	if (bio_flagged(bio, BIO_HIERARCHY_ACCT))
+		return;
+
+	if (!blk_mq_hierarchy_registered(disk->queue, STAGE_BIO))
+		return;
+
+	bio_set_flag(bio, BIO_HIERARCHY_ACCT);
+	if (bio_has_data(bio))
+		bio_set_flag(bio, BIO_HAS_DATA);
+	hstage = disk->queue->io_hierarchy_stats->hstage[STAGE_BIO];
+	io_hierarchy_inc(hstage->hstats_data, dispatched, bio_hierarchy_op(bio));
+}
+
+void __bio_hierarchy_end(struct bio *bio, u64 now)
+{
+	struct gendisk *disk = bio->bi_bdev->bd_disk;
+	struct hierarchy_stage *hstage;
+	u64 duration;
+	enum stat_group op;
+
+	op = bio_hierarchy_op(bio);
+	duration = now - bio->bi_alloc_time_ns;
+	hstage = disk->queue->io_hierarchy_stats->hstage[STAGE_BIO];
+
+	io_hierarchy_inc(hstage->hstats_data, completed, op);
+	io_hierarchy_add(hstage->hstats_data, nsecs, op, duration);
+	hierarchy_account_slow_io_ns(hstage, op, duration);
+
+	bio_clear_flag(bio, BIO_HIERARCHY_ACCT);
+	bio_clear_flag(bio, BIO_HAS_DATA);
+}
+
+#endif
diff --git a/block/blk-io-hierarchy/stats.h b/block/blk-io-hierarchy/stats.h
index 0a86d1235715..8166e71969bd 100644
--- a/block/blk-io-hierarchy/stats.h
+++ b/block/blk-io-hierarchy/stats.h
@@ -17,30 +17,64 @@ #ifdef CONFIG_BLK_IO_HIERARCHY_STATS
#include <linux/blkdev.h> -#include "../blk-mq-debugfs.h" +#include <linux/blk_types.h> +#include "../blk.h"
struct bio_hierarchy_data { u64 time; +#ifdef CONFIG_HIERARCHY_IO_DUMP + struct bio *bio; + struct list_head hierarchy_list; +#endif };
struct hierarchy_stats { - u64 nsecs[NR_STAT_GROUPS]; + union { + /* for bio based stages. */ + u64 nsecs[NR_STAT_GROUPS]; + /* for request based stages. */ + unsigned long jiffies[NR_STAT_GROUPS]; + }; unsigned long dispatched[NR_STAT_GROUPS]; unsigned long completed[NR_STAT_GROUPS]; +#ifdef CONFIG_HIERARCHY_IO_DUMP + unsigned long slow[NR_STAT_GROUPS]; +#endif +}; + +struct hierarchy_stats_data { + int ref; + struct hierarchy_stats __percpu *hstats; };
struct hierarchy_stage { enum stage_group stage; + bool unbalanced_warned; struct dentry *debugfs_dir; - struct hierarchy_stats __percpu *hstats; + struct hierarchy_stats_data *hstats_data; +#ifdef CONFIG_HIERARCHY_IO_DUMP + unsigned long threshold; + void *dump_data; +#endif };
struct blk_io_hierarchy_stats { struct request_queue *q; struct dentry *debugfs_dir; + spinlock_t hstage_lock; struct hierarchy_stage *hstage[NR_STAGE_GROUPS]; };
+static inline bool stage_is_bio(enum stage_group stage) +{ + return stage >= 0 && stage < NR_BIO_STAGE_GROUPS; +} + +static inline bool stage_is_rq(enum stage_group stage) +{ + return stage >= NR_BIO_STAGE_GROUPS && stage < NR_RQ_STAGE_GROUPS; +} + const char *hierarchy_stage_name(enum stage_group stage); int blk_io_hierarchy_stats_alloc(struct request_queue *q); void blk_io_hierarchy_stats_free(struct request_queue *q); @@ -55,19 +89,154 @@ void blk_mq_unregister_hierarchy(struct request_queue *q, /* APIs for disk level debugfs */ void blk_mq_debugfs_register_hierarchy_stats(struct request_queue *q); void blk_mq_debugfs_unregister_hierarchy_stats(struct request_queue *q); +void blk_mq_debugfs_create_default_hierarchy_attr(struct request_queue *q);
/* APIs for stage level debugfs */ void blk_mq_debugfs_register_hierarchy(struct request_queue *q, enum stage_group stage); void blk_mq_debugfs_unregister_hierarchy(struct request_queue *q, enum stage_group stage); +struct hierarchy_stats_data *get_hstats_data( + struct blk_io_hierarchy_stats *stats, + enum stage_group stage); +void put_hstats_data(struct blk_io_hierarchy_stats *stats, + struct hierarchy_stats_data *hstats_data);
/* APIs for bio based stage io accounting */ void bio_hierarchy_start_io_acct(struct bio *bio, enum stage_group stage); -void bio_hierarchy_end_io_acct(struct bio *bio, enum stage_group stage, - u64 time); -void bio_list_hierarchy_end_io_acct(struct bio_list *list, - enum stage_group stage); +void __bio_hierarchy_end_io_acct(struct bio *bio, enum stage_group stage, + u64 time); + +static inline void bio_hierarchy_end_io_acct(struct bio *bio, + enum stage_group stage) +{ + __bio_hierarchy_end_io_acct(bio, stage, blk_time_get_ns()); +} + +static inline void bio_list_hierarchy_end_io_acct(struct bio_list *list, + enum stage_group stage) +{ + u64 time = blk_time_get_ns(); + struct bio *bio; + + bio_list_for_each(bio, list) + __bio_hierarchy_end_io_acct(bio, stage, time); +} + +/* APIs for request based stage io accounting */ +void blk_rq_hierarchy_stats_complete(struct request *rq); +void __rq_hierarchy_start_io_acct(struct request *rq, + struct hierarchy_stage *hstage); +void __rq_hierarchy_end_io_acct(struct request *rq, + struct hierarchy_stage *hstage); + +static inline void rq_hierarchy_start_io_acct(struct request *rq, + enum stage_group stage) +{ + if (!blk_mq_hierarchy_registered(rq->q, stage)) + return; + + __rq_hierarchy_start_io_acct( + rq, rq->q->io_hierarchy_stats->hstage[stage]); +} + +static inline void rq_hierarchy_end_io_acct(struct request *rq, + enum stage_group stage) +{ + if (!blk_mq_hierarchy_registered(rq->q, stage)) + return; + + __rq_hierarchy_end_io_acct( + rq, rq->q->io_hierarchy_stats->hstage[stage]); +} + +static inline void rq_list_hierarchy_start_io_acct(struct list_head *head, + enum stage_group stage) +{ + struct request *rq; + struct hierarchy_stage *hstage; + + if (list_empty(head)) + return; + + rq = list_first_entry(head, struct request, queuelist); + if (!blk_mq_hierarchy_registered(rq->q, stage)) + return; + + hstage = rq->q->io_hierarchy_stats->hstage[stage]; + list_for_each_entry(rq, head, queuelist) + __rq_hierarchy_start_io_acct(rq, hstage); +} + +static inline void rq_list_hierarchy_end_io_acct(struct list_head *head, + enum stage_group stage) +{ + struct request *rq; + struct hierarchy_stage *hstage; + + if (list_empty(head)) + return; + + rq = list_first_entry(head, struct request, queuelist); + if (!blk_mq_hierarchy_registered(rq->q, stage)) + return; + + hstage = rq->q->io_hierarchy_stats->hstage[stage]; + list_for_each_entry(rq, head, queuelist) + __rq_hierarchy_end_io_acct(rq, hstage); +} + +static inline void blk_rq_hierarchy_stats_init(struct request *rq) +{ + rq->stage = NR_RQ_STAGE_GROUPS; + rq->flush_done = false; +} + +static inline void blk_rq_hierarchy_set_flush_done(struct request *rq) +{ + rq->flush_done = true; +} + +static inline bool blk_rq_hierarchy_is_flush_done(struct request *rq) +{ + return rq->flush_done; +} + +#ifdef CONFIG_HIERARCHY_BIO +void bio_hierarchy_start(struct bio *bio); +void __bio_hierarchy_end(struct bio *bio, u64 now); + +static inline void bio_hierarchy_end(struct bio *bio) +{ + if (!bio_flagged(bio, BIO_HIERARCHY_ACCT)) + return; + + if (!blk_mq_hierarchy_registered(bio->bi_bdev->bd_queue, STAGE_BIO)) + return; + + __bio_hierarchy_end(bio, blk_time_get_ns()); +} + +static inline void req_bio_hierarchy_end(struct request *rq, struct bio *bio) +{ + u64 now; + + if (!bio_flagged(bio, BIO_HIERARCHY_ACCT)) + return; + + if (!blk_mq_hierarchy_registered(bio->bi_bdev->bd_queue, STAGE_BIO)) + return; + + now = rq->io_end_time_ns; + if (!now) { + now = blk_time_get_ns(); + rq->io_end_time_ns = now; + } + + 
__bio_hierarchy_end(bio, now); +} +#endif + #else /* CONFIG_BLK_IO_HIERARCHY_STATS */
static inline int @@ -125,7 +294,7 @@ bio_hierarchy_start_io_acct(struct bio *bio, enum stage_group stage) }
static inline void -bio_hierarchy_end_io_acct(struct bio *bio, enum stage_group stage, u64 time) +bio_hierarchy_end_io_acct(struct bio *bio, enum stage_group stage) { }
@@ -133,5 +302,65 @@ static inline void bio_list_hierarchy_end_io_acct(struct bio_list *list, enum stage_group stage) { } + +static inline void +blk_rq_hierarchy_set_flush_done(struct request *rq) +{ +} + +static inline bool +blk_rq_hierarchy_is_flush_done(struct request *rq) +{ + return false; +} + +static inline void +blk_rq_hierarchy_stats_complete(struct request *rq) +{ +} + +static inline void +rq_hierarchy_start_io_acct(struct request *rq, enum stage_group stage) +{ +} + +static inline void +rq_hierarchy_end_io_acct(struct request *rq, enum stage_group stage) +{ +} + +static inline void +rq_list_hierarchy_start_io_acct(struct list_head *head, enum stage_group stage) +{ +} + +static inline void +rq_list_hierarchy_end_io_acct(struct list_head *head, enum stage_group stage) +{ +} + +static inline void +blk_rq_hierarchy_stats_init(struct request *rq) +{ +} + #endif /* CONFIG_BLK_IO_HIERARCHY_STATS */ + +#if !defined(CONFIG_BLK_IO_HIERARCHY_STATS) || !defined(CONFIG_HIERARCHY_BIO) +static inline void +bio_hierarchy_start(struct bio *bio) +{ +} + +static inline void +bio_hierarchy_end(struct bio *bio) +{ +} + +static inline void +req_bio_hierarchy_end(struct request *rq, struct bio *bio) +{ +} +#endif + #endif /* BLK_IO_HIERARCHY_STATS_H */ diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index a955ee42765f..efe99cfae51d 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -281,9 +281,13 @@ static const char *blk_mq_rq_state_name(enum mq_rq_state rq_state) return blk_mq_rq_state_name_array[rq_state]; }
-int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq)
+/*
+ * Dump general information for @rq into @m. The output starts with '{'
+ * but does not emit the closing '}'; the caller must append the closing
+ * curly brace after adding any custom string.
+ */
+void debugfs_rq_show(struct seq_file *m, struct request *rq)
 {
-	const struct blk_mq_ops *const mq_ops = rq->q->mq_ops;
 	const enum req_op op = req_op(rq);
 	const char *op_str = blk_op_str(op);
@@ -301,6 +305,13 @@ int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq) seq_printf(m, ", .state=%s", blk_mq_rq_state_name(blk_mq_rq_state(rq))); seq_printf(m, ", .tag=%d, .internal_tag=%d", rq->tag, rq->internal_tag); +} + +int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq) +{ + const struct blk_mq_ops *const mq_ops = rq->q->mq_ops; + + debugfs_rq_show(m, rq); if (mq_ops->show_rq) mq_ops->show_rq(m, rq); seq_puts(m, "}\n"); diff --git a/block/blk-mq-debugfs.h b/block/blk-mq-debugfs.h index 4c422580ce84..4f70a87094b0 100644 --- a/block/blk-mq-debugfs.h +++ b/block/blk-mq-debugfs.h @@ -17,6 +17,7 @@ struct blk_mq_debugfs_attr { const struct seq_operations *seq_ops; };
+void debugfs_rq_show(struct seq_file *m, struct request *rq); int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq); int blk_mq_debugfs_rq_show(struct seq_file *m, void *v);
diff --git a/block/blk-mq.c b/block/blk-mq.c index 1159a06e2543..835ea9495396 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -388,6 +388,7 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, rq->end_io = NULL; rq->end_io_data = NULL;
+ blk_rq_hierarchy_stats_init(rq); blk_rq_init_bi_alloc_time(rq, NULL); blk_mq_get_alloc_task(rq, data->bio);
@@ -712,6 +713,7 @@ static void __blk_mq_free_request(struct request *rq) struct blk_mq_hw_ctx *hctx = rq->mq_hctx; const int sched_tag = rq->internal_tag;
+ blk_rq_hierarchy_stats_complete(rq); blk_mq_put_alloc_task(rq); blk_crypto_free_request(rq); blk_pm_mark_last_busy(rq); diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index da56055731d6..4c4416fd2df7 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -201,8 +201,13 @@ struct request { KABI_RESERVE(2) #endif KABI_USE(3, u64 io_end_time_ns) +#ifdef CONFIG_BLK_IO_HIERARCHY_STATS + KABI_USE2(4, bool flush_done, enum stage_group stage) + KABI_USE(5, unsigned long hierarchy_time) +#else KABI_RESERVE(4) KABI_RESERVE(5) +#endif KABI_RESERVE(6) KABI_RESERVE(7) KABI_RESERVE(8) diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 95759212836a..e8c1d0790923 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -292,11 +292,13 @@ struct bio {
#ifdef CONFIG_BLK_IO_HIERARCHY_STATS KABI_USE(1, u64 hierarchy_time) + KABI_REPLACE(_KABI_RESERVE(2); _KABI_RESERVE(3), + struct list_head hierarchy_list) #else KABI_RESERVE(1) -#endif KABI_RESERVE(2) KABI_RESERVE(3) +#endif #ifdef CONFIG_BLK_BIO_ALLOC_TIME KABI_USE(4, u64 bi_alloc_time_ns) #else @@ -341,6 +343,13 @@ enum { BIO_QOS_MERGED, /* but went through rq_qos merge path */ BIO_REMAPPED, BIO_ZONE_WRITE_LOCKED, /* Owns a zoned device zone write lock */ +#ifdef CONFIG_BLK_IO_HIERARCHY_STATS + BIO_HAS_DATA, /* bio contain data. */ + BIO_HIERARCHY_ACCT, /* + * This bio has already been subjected to + * blk-io-hierarchy, don't do it again. + */ +#endif BIO_FLAG_LAST };
@@ -473,7 +482,10 @@ enum stage_group { #endif STAGE_RESERVE, NR_BIO_STAGE_GROUPS, - NR_STAGE_GROUPS = NR_BIO_STAGE_GROUPS, + STAGE_PLUG = NR_BIO_STAGE_GROUPS, + NR_RQ_STAGE_GROUPS, + STAGE_BIO = NR_RQ_STAGE_GROUPS, + NR_STAGE_GROUPS, };
static inline enum req_op bio_op(const struct bio *bio)