Jens Axboe (5): block: move cgroup time handling code into blk.h block: add blk_time_get_ns() and blk_time_get() helpers block: cache current nsec time in struct blk_plug block: update cached timestamp post schedule/preemption block: limit block time caching to in_task() context
Kundan Kumar (1): block: skip QUEUE_FLAG_STATS and rq-qos for passthrough io
Yu Kuai (20): blk-io-hierarchy: support hierarchy stats for blk-throttle block: fix that blk_time_get_ns() doesn't update time after schedule block: fix kabi broken after caching ns time in blk_plug block: support to record bio allocation time block: support to record bio allocation task block: support to record bio allocation time in request block: support to record bio allocation task in request block: support to record when request is completed block-io-hierarchy: core hierarchy stats and iodump implementation blk-io-hierarchy: support new bio based stage wbt blk-io-hierarchy: support new bio based stage iocost blk-io-hierarchy: support new bio based stage gettag blk-io-hierarchy: support new rq based stage plug blk-io-hierarchy: support new rq based stage mq-deadline blk-io-hierarchy: support new rq based stage bfq blk-io-hierarchy: support new rq based stage kyber blk-io-hierarchy: support new rq based stage hctx blk-io-hierarchy: support new rq based stage requeue blk-io-hierarchy: support new rq based stage rq_driver blk-io-hierarchy: support hierarchy stats for bio lifetime
arch/arm64/configs/openeuler_defconfig | 1 + arch/powerpc/configs/openeuler_defconfig | 1 + arch/riscv/configs/openeuler_defconfig | 1 + arch/x86/configs/openeuler_defconfig | 1 + block/Kconfig | 7 + block/Makefile | 1 + block/bfq-cgroup.c | 14 +- block/bfq-iosched.c | 39 +- block/bio.c | 22 + block/blk-cgroup.c | 2 +- block/blk-cgroup.h | 4 + block/blk-core.c | 11 + block/blk-flush.c | 13 +- block/blk-io-hierarchy/Kconfig | 156 +++++ block/blk-io-hierarchy/Makefile | 8 + block/blk-io-hierarchy/debugfs.c | 235 +++++++ block/blk-io-hierarchy/iodump.c | 745 +++++++++++++++++++++++ block/blk-io-hierarchy/iodump.h | 96 +++ block/blk-io-hierarchy/stats.c | 406 ++++++++++++ block/blk-io-hierarchy/stats.h | 378 ++++++++++++ block/blk-iocost.c | 14 +- block/blk-iolatency.c | 8 +- block/blk-map.c | 1 + block/blk-merge.c | 4 + block/blk-mq-debugfs.c | 22 +- block/blk-mq-debugfs.h | 8 + block/blk-mq-sched.c | 2 + block/blk-mq-tag.c | 5 + block/blk-mq.c | 125 +++- block/blk-mq.h | 25 + block/blk-rq-qos.c | 9 +- block/blk-rq-qos.h | 5 +- block/blk-sysfs.c | 14 + block/blk-throttle.c | 14 +- block/blk-wbt.c | 40 +- block/blk.h | 67 ++ block/kyber-iosched.c | 7 + block/mq-deadline.c | 15 +- include/linux/blk-mq.h | 33 +- include/linux/blk_types.h | 94 +-- include/linux/blkdev.h | 22 +- include/linux/sched.h | 2 +- kernel/sched/core.c | 6 +- 43 files changed, 2569 insertions(+), 114 deletions(-) create mode 100644 block/blk-io-hierarchy/Kconfig create mode 100644 block/blk-io-hierarchy/Makefile create mode 100644 block/blk-io-hierarchy/debugfs.c create mode 100644 block/blk-io-hierarchy/iodump.c create mode 100644 block/blk-io-hierarchy/iodump.h create mode 100644 block/blk-io-hierarchy/stats.c create mode 100644 block/blk-io-hierarchy/stats.h
Offering: HULK hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/release-management/issues/IB4E8P CVE: NA
--------------------------------
Add support for accounting the number of I/Os throttled by blk-throttle and the latency they incur while throttled.
Signed-off-by: Yu Kuai yukuai3@huawei.com --- arch/arm64/configs/openeuler_defconfig | 1 + arch/powerpc/configs/openeuler_defconfig | 1 + arch/riscv/configs/openeuler_defconfig | 1 + arch/x86/configs/openeuler_defconfig | 1 + block/Kconfig | 1 + block/Makefile | 1 + block/blk-io-hierarchy/Kconfig | 27 +++ block/blk-io-hierarchy/Makefile | 7 + block/blk-io-hierarchy/debugfs.c | 119 +++++++++++++ block/blk-io-hierarchy/stats.c | 216 +++++++++++++++++++++++ block/blk-io-hierarchy/stats.h | 137 ++++++++++++++ block/blk-mq-debugfs.c | 7 +- block/blk-mq-debugfs.h | 7 + block/blk-mq.c | 10 +- block/blk-throttle.c | 8 + include/linux/blk_types.h | 14 +- include/linux/blkdev.h | 4 + 17 files changed, 558 insertions(+), 4 deletions(-) create mode 100644 block/blk-io-hierarchy/Kconfig create mode 100644 block/blk-io-hierarchy/Makefile create mode 100644 block/blk-io-hierarchy/debugfs.c create mode 100644 block/blk-io-hierarchy/stats.c create mode 100644 block/blk-io-hierarchy/stats.h
diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig index ddbc29bf75f8..7ca156b31e51 100644 --- a/arch/arm64/configs/openeuler_defconfig +++ b/arch/arm64/configs/openeuler_defconfig @@ -952,6 +952,7 @@ CONFIG_BLK_DEBUG_FS_ZONED=y CONFIG_BLK_DEV_DETECT_WRITING_PART0=y CONFIG_BLK_DEV_WRITE_MOUNTED_DUMP=y CONFIG_BLK_IO_HUNG_TASK_CHECK=y +# CONFIG_BLK_IO_HIERARCHY_STATS is not set
# # Partition Types diff --git a/arch/powerpc/configs/openeuler_defconfig b/arch/powerpc/configs/openeuler_defconfig index 19754209e3c8..d725e7c82254 100644 --- a/arch/powerpc/configs/openeuler_defconfig +++ b/arch/powerpc/configs/openeuler_defconfig @@ -679,6 +679,7 @@ CONFIG_BLK_DEV_THROTTLING=y CONFIG_BLK_DEBUG_FS=y # CONFIG_BLK_SED_OPAL is not set # CONFIG_BLK_INLINE_ENCRYPTION is not set +# CONFIG_BLK_IO_HIERARCHY_STATS is not set
# # Partition Types diff --git a/arch/riscv/configs/openeuler_defconfig b/arch/riscv/configs/openeuler_defconfig index 026582613f2c..295dc52b3994 100644 --- a/arch/riscv/configs/openeuler_defconfig +++ b/arch/riscv/configs/openeuler_defconfig @@ -646,6 +646,7 @@ CONFIG_BLK_INLINE_ENCRYPTION=y CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK=y # CONFIG_BLK_DEV_DETECT_WRITING_PART0 is not set # CONFIG_BLK_DEV_WRITE_MOUNTED_DUMP is not set +# CONFIG_BLK_IO_HIERARCHY_STATS is not set
# # Partition Types diff --git a/arch/x86/configs/openeuler_defconfig b/arch/x86/configs/openeuler_defconfig index a2b6381ef2c3..576d961c7a40 100644 --- a/arch/x86/configs/openeuler_defconfig +++ b/arch/x86/configs/openeuler_defconfig @@ -988,6 +988,7 @@ CONFIG_BLK_DEBUG_FS_ZONED=y CONFIG_BLK_DEV_DETECT_WRITING_PART0=y CONFIG_BLK_DEV_WRITE_MOUNTED_DUMP=y CONFIG_BLK_IO_HUNG_TASK_CHECK=y +# CONFIG_BLK_IO_HIERARCHY_STATS is not set
# # Partition Types diff --git a/block/Kconfig b/block/Kconfig index 04bb49f13176..8fd2a8cb539e 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -275,6 +275,7 @@ config BLK_IO_HUNG_TASK_CHECK if this is set, hungtask will complain about slow io even if such io is not hanged. Be careful to enable hungtask panic in this case.
+source "block/blk-io-hierarchy/Kconfig" source "block/partitions/Kconfig"
config BLK_MQ_PCI diff --git a/block/Makefile b/block/Makefile index 46ada9dc8bbf..bfba1d2afc0e 100644 --- a/block/Makefile +++ b/block/Makefile @@ -40,3 +40,4 @@ obj-$(CONFIG_BLK_INLINE_ENCRYPTION) += blk-crypto.o blk-crypto-profile.o \ blk-crypto-sysfs.o obj-$(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK) += blk-crypto-fallback.o obj-$(CONFIG_BLOCK_HOLDER_DEPRECATED) += holder.o +obj-$(CONFIG_BLK_IO_HIERARCHY_STATS) += blk-io-hierarchy/ diff --git a/block/blk-io-hierarchy/Kconfig b/block/blk-io-hierarchy/Kconfig new file mode 100644 index 000000000000..a12476c73fa5 --- /dev/null +++ b/block/blk-io-hierarchy/Kconfig @@ -0,0 +1,27 @@ +# SPDX-License-Identifier: GPL-2.0-only + +menuconfig BLK_IO_HIERARCHY_STATS + bool "Enable hierarchy io stats" + default n + depends on BLK_DEBUG_FS=y + help + Enabling this lets the block layer to record additional information + in different io stages. Such information can be helpful to debug + performance and problems like io hang. + + If unsure, say N. + +if BLK_IO_HIERARCHY_STATS + +config HIERARCHY_THROTTLE + bool "Enable hierarchy stats layer blk-throttle" + default n + depends on BLK_DEV_THROTTLING=y + help + Enabling this lets blk hierarchy stats to record additional information + for blk-throttle. Such information can be helpful to debug performance + and problems like io hang. + + If unsure, say N. + +endif diff --git a/block/blk-io-hierarchy/Makefile b/block/blk-io-hierarchy/Makefile new file mode 100644 index 000000000000..1fb663c75521 --- /dev/null +++ b/block/blk-io-hierarchy/Makefile @@ -0,0 +1,7 @@ +# +# Make file for blk_io_hierarchy_stats +# + +obj-$(CONFIG_BLK_IO_HIERARCHY_STATS) += blk_io_hierarchy_stats.o + +blk_io_hierarchy_stats-y := stats.o debugfs.o diff --git a/block/blk-io-hierarchy/debugfs.c b/block/blk-io-hierarchy/debugfs.c new file mode 100644 index 000000000000..9072a091c013 --- /dev/null +++ b/block/blk-io-hierarchy/debugfs.c @@ -0,0 +1,119 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2023. 
Huawei Technologies Co., Ltd. All rights reserved. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include <linux/debugfs.h> + +#include "stats.h" + +static const char *stage_name[NR_STAGE_GROUPS] = { +#ifdef CONFIG_HIERARCHY_THROTTLE + [STAGE_THROTTLE] = "throtl", +#endif +}; + +const char *hierarchy_stage_name(enum stage_group stage) +{ + return stage_name[stage]; +} + +static int hierarchy_stats_show(void *data, struct seq_file *m) +{ + struct hierarchy_stage *hstage = data; + int cpu; + u64 dispatched[NR_STAT_GROUPS] = {0}; + u64 completed[NR_STAT_GROUPS] = {0}; + u64 latency[NR_STAT_GROUPS] = {0}; + + for_each_possible_cpu(cpu) { + int i; + struct hierarchy_stats *stat = per_cpu_ptr(hstage->hstats, cpu); + + for (i = 0; i < NR_STAT_GROUPS; ++i) { + dispatched[i] += stat->dispatched[i]; + completed[i] += stat->completed[i]; + latency[i] += stat->nsecs[i]; + } + } + + seq_printf(m, "%llu %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu\n", + dispatched[STAT_READ], completed[STAT_READ], + latency[STAT_READ], dispatched[STAT_WRITE], + completed[STAT_WRITE], latency[STAT_WRITE], + dispatched[STAT_DISCARD], completed[STAT_DISCARD], + latency[STAT_DISCARD], dispatched[STAT_FLUSH], + completed[STAT_FLUSH], latency[STAT_FLUSH]); + + return 0; +} + +static struct blk_mq_debugfs_attr hierarchy_debugfs_attrs[] = { + {"stats", 0400, hierarchy_stats_show}, + {}, +}; + +static void hierarchy_register_stage(struct blk_io_hierarchy_stats *stats, + enum stage_group stage) +{ + struct hierarchy_stage *hstage = stats->hstage[stage]; + struct dentry *dir; + + 
if (!stage_name[stage] || hstage->debugfs_dir) + return; + + dir = debugfs_create_dir(stage_name[stage], stats->debugfs_dir); + if (IS_ERR(dir)) + return; + + hstage->debugfs_dir = dir; + debugfs_create_files(dir, hstage, hierarchy_debugfs_attrs); +} + +static void hierarchy_unregister_stage(struct blk_io_hierarchy_stats *stats, + enum stage_group stage) +{ + struct hierarchy_stage *hstage = stats->hstage[stage]; + + if (!stage_name[stage] || !hstage->debugfs_dir) + return; + + debugfs_remove_recursive(hstage->debugfs_dir); + hstage->debugfs_dir = NULL; +} + +void blk_mq_debugfs_register_hierarchy(struct request_queue *q, + enum stage_group stage) +{ + struct blk_io_hierarchy_stats *stats = q->io_hierarchy_stats; + + lockdep_assert_held(&q->debugfs_mutex); + + if (!blk_mq_hierarchy_registered(q, stage) || + !blk_mq_debugfs_enabled(q)) + return; + + hierarchy_register_stage(stats, stage); +} + +void blk_mq_debugfs_unregister_hierarchy(struct request_queue *q, + enum stage_group stage) +{ + struct blk_io_hierarchy_stats *stats = q->io_hierarchy_stats; + + lockdep_assert_held(&q->debugfs_mutex); + + if (!blk_mq_hierarchy_registered(q, stage) || + !blk_mq_debugfs_enabled(q)) + return; + + hierarchy_unregister_stage(stats, stage); +} diff --git a/block/blk-io-hierarchy/stats.c b/block/blk-io-hierarchy/stats.c new file mode 100644 index 000000000000..f078ebc5f668 --- /dev/null +++ b/block/blk-io-hierarchy/stats.c @@ -0,0 +1,216 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2023. Huawei Technologies Co., Ltd. All rights reserved. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + */ + +#include <linux/debugfs.h> + +#include "stats.h" +#include "../blk.h" +#include "../blk-mq-debugfs.h" + +#define io_hierarchy_add(statsp, field, group, nr) \ + this_cpu_add((statsp)->field[group], nr) +#define io_hierarchy_inc(statsp, field, group) \ + io_hierarchy_add(statsp, field, group, 1) + +void blk_mq_debugfs_register_hierarchy_stats(struct request_queue *q) +{ + struct blk_io_hierarchy_stats *stats; + enum stage_group stage; + + lockdep_assert_held(&q->debugfs_mutex); + + stats = q->io_hierarchy_stats; + if (!stats || !blk_mq_debugfs_enabled(q)) + return; + + stats->debugfs_dir = debugfs_create_dir("blk_io_hierarchy", + q->debugfs_dir); + + for (stage = 0; stage < NR_STAGE_GROUPS; ++stage) + blk_mq_debugfs_register_hierarchy(q, stage); +} + +void blk_mq_debugfs_unregister_hierarchy_stats(struct request_queue *q) +{ + struct blk_io_hierarchy_stats *stats; + enum stage_group stage; + + lockdep_assert_held(&q->debugfs_mutex); + + stats = q->io_hierarchy_stats; + if (!stats || !blk_mq_debugfs_enabled(q)) + return; + + for (stage = 0; stage < NR_STAGE_GROUPS; ++stage) + blk_mq_debugfs_unregister_hierarchy(q, stage); + + debugfs_remove_recursive(stats->debugfs_dir); + stats->debugfs_dir = NULL; +} + +int blk_io_hierarchy_stats_alloc(struct request_queue *q) +{ + struct blk_io_hierarchy_stats *stats; + + if (!queue_is_mq(q)) + return 0; + + stats = kzalloc(sizeof(struct blk_io_hierarchy_stats), GFP_KERNEL); + if (!stats) + return -ENOMEM; + + stats->q = q; + q->io_hierarchy_stats = stats; + + return 0; +} + +void blk_io_hierarchy_stats_free(struct request_queue *q) +{ + struct blk_io_hierarchy_stats *stats = q->io_hierarchy_stats; + + if (!stats) + return; + + q->io_hierarchy_stats = NULL; + kfree(stats); +} + +bool blk_mq_hierarchy_registered(struct request_queue *q, + enum stage_group stage) +{ + struct blk_io_hierarchy_stats *stats = q->io_hierarchy_stats; + + if (!stats) + return false; + + 
return stats->hstage[stage] != NULL; +} + +void blk_mq_register_hierarchy(struct request_queue *q, enum stage_group stage) +{ + struct blk_io_hierarchy_stats *stats = q->io_hierarchy_stats; + struct hierarchy_stage *hstage; + + if (!stats || !hierarchy_stage_name(stage)) + return; + + if (blk_mq_hierarchy_registered(q, stage)) { + pr_warn("blk-io-hierarchy: disk %s is registering stage %s again.", + q->disk->disk_name, hierarchy_stage_name(stage)); + return; + } + + /* + * Alloc memory before freeze queue, prevent deadlock if new IO is + * issued by memory reclaim. + */ + hstage = kmalloc(sizeof(*hstage), GFP_KERNEL); + if (!hstage) + return; + + hstage->hstats = alloc_percpu(struct hierarchy_stats); + if (!hstage->hstats) { + kfree(hstage); + return; + } + + hstage->stage = stage; + hstage->debugfs_dir = NULL; + + blk_mq_freeze_queue(q); + blk_mq_quiesce_queue(q); + + mutex_lock(&q->debugfs_mutex); + stats->hstage[stage] = hstage; + blk_mq_debugfs_register_hierarchy(q, stage); + mutex_unlock(&q->debugfs_mutex); + + blk_mq_unquiesce_queue(q); + blk_mq_unfreeze_queue(q); +} + +void blk_mq_unregister_hierarchy(struct request_queue *q, + enum stage_group stage) +{ + struct blk_io_hierarchy_stats *stats = q->io_hierarchy_stats; + struct hierarchy_stage *hstage; + + if (!blk_mq_hierarchy_registered(q, stage)) + return; + + mutex_lock(&q->debugfs_mutex); + + blk_mq_debugfs_unregister_hierarchy(q, stage); + + hstage = stats->hstage[stage]; + stats->hstage[stage] = NULL; + free_percpu(hstage->hstats); + kfree(hstage); + + mutex_unlock(&q->debugfs_mutex); +} + +static enum stat_group hierarchy_op(const struct bio *bio) +{ + if (op_is_discard(bio->bi_opf)) + return STAT_DISCARD; + + if (op_is_flush(bio->bi_opf) && !bio_sectors(bio)) + return STAT_FLUSH; + + if (op_is_write(bio->bi_opf)) + return STAT_WRITE; + + return STAT_READ; +} + + +void bio_hierarchy_start_io_acct(struct bio *bio, enum stage_group stage) +{ + struct request_queue *q = bio->bi_bdev->bd_queue; + struct 
hierarchy_stage *hstage; + + if (!blk_mq_hierarchy_registered(q, stage)) + return; + + hstage = q->io_hierarchy_stats->hstage[stage]; + io_hierarchy_inc(hstage->hstats, dispatched, hierarchy_op(bio)); + bio->hierarchy_time = ktime_get_ns(); +} + +void bio_hierarchy_end_io_acct(struct bio *bio, enum stage_group stage, + u64 time) +{ + struct request_queue *q = bio->bi_bdev->bd_queue; + struct hierarchy_stage *hstage; + enum stat_group op; + + if (!blk_mq_hierarchy_registered(q, stage)) + return; + + op = hierarchy_op(bio); + hstage = q->io_hierarchy_stats->hstage[stage]; + io_hierarchy_inc(hstage->hstats, completed, op); + io_hierarchy_add(hstage->hstats, nsecs, op, time - bio->hierarchy_time); +} + +void bio_list_hierarchy_end_io_acct(struct bio_list *list, + enum stage_group stage) +{ + u64 time = ktime_get_ns(); + struct bio *bio; + + bio_list_for_each(bio, list) + bio_hierarchy_end_io_acct(bio, stage, time); +} diff --git a/block/blk-io-hierarchy/stats.h b/block/blk-io-hierarchy/stats.h new file mode 100644 index 000000000000..0a86d1235715 --- /dev/null +++ b/block/blk-io-hierarchy/stats.h @@ -0,0 +1,137 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2023. Huawei Technologies Co., Ltd. All rights reserved. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ */ + +#ifndef BLK_IO_HIERARCHY_STATS_H +#define BLK_IO_HIERARCHY_STATS_H + +#ifdef CONFIG_BLK_IO_HIERARCHY_STATS + +#include <linux/blkdev.h> +#include "../blk-mq-debugfs.h" + +struct bio_hierarchy_data { + u64 time; +}; + +struct hierarchy_stats { + u64 nsecs[NR_STAT_GROUPS]; + unsigned long dispatched[NR_STAT_GROUPS]; + unsigned long completed[NR_STAT_GROUPS]; +}; + +struct hierarchy_stage { + enum stage_group stage; + struct dentry *debugfs_dir; + struct hierarchy_stats __percpu *hstats; +}; + +struct blk_io_hierarchy_stats { + struct request_queue *q; + struct dentry *debugfs_dir; + struct hierarchy_stage *hstage[NR_STAGE_GROUPS]; +}; + +const char *hierarchy_stage_name(enum stage_group stage); +int blk_io_hierarchy_stats_alloc(struct request_queue *q); +void blk_io_hierarchy_stats_free(struct request_queue *q); + +/* APIs for stage registration */ +bool blk_mq_hierarchy_registered(struct request_queue *q, + enum stage_group stage); +void blk_mq_register_hierarchy(struct request_queue *q, enum stage_group stage); +void blk_mq_unregister_hierarchy(struct request_queue *q, + enum stage_group stage); + +/* APIs for disk level debugfs */ +void blk_mq_debugfs_register_hierarchy_stats(struct request_queue *q); +void blk_mq_debugfs_unregister_hierarchy_stats(struct request_queue *q); + +/* APIs for stage level debugfs */ +void blk_mq_debugfs_register_hierarchy(struct request_queue *q, + enum stage_group stage); +void blk_mq_debugfs_unregister_hierarchy(struct request_queue *q, + enum stage_group stage); + +/* APIs for bio based stage io accounting */ +void bio_hierarchy_start_io_acct(struct bio *bio, enum stage_group stage); +void bio_hierarchy_end_io_acct(struct bio *bio, enum stage_group stage, + u64 time); +void bio_list_hierarchy_end_io_acct(struct bio_list *list, + enum stage_group stage); +#else /* CONFIG_BLK_IO_HIERARCHY_STATS */ + +static inline int +blk_io_hierarchy_stats_alloc(struct request_queue *q) +{ + return 0; +} + +static inline void 
+blk_io_hierarchy_stats_free(struct request_queue *q) +{ +} + +static inline bool +blk_mq_hierarchy_registered(struct request_queue *q, enum stage_group stage) +{ + return false; +} + +static inline void +blk_mq_register_hierarchy(struct request_queue *q, enum stage_group stage) +{ +} + +static inline void +blk_mq_unregister_hierarchy(struct request_queue *q, enum stage_group stage) +{ +} + +static inline void +blk_mq_debugfs_register_hierarchy_stats(struct request_queue *q) +{ +} + +static inline void +blk_mq_debugfs_unregister_hierarchy_stats(struct request_queue *q) +{ +} + +static inline void +blk_mq_debugfs_register_hierarchy(struct request_queue *q, + enum stage_group stage) +{ +} + +static inline void +blk_mq_debugfs_unregister_hierarchy(struct request_queue *q, + enum stage_group stage) +{ +} + +static inline void +bio_hierarchy_start_io_acct(struct bio *bio, enum stage_group stage) +{ +} + +static inline void +bio_hierarchy_end_io_acct(struct bio *bio, enum stage_group stage, u64 time) +{ +} + +static inline void +bio_list_hierarchy_end_io_acct(struct bio_list *list, enum stage_group stage) +{ +} +#endif /* CONFIG_BLK_IO_HIERARCHY_STATS */ +#endif /* BLK_IO_HIERARCHY_STATS_H */ diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 9ad108307344..a955ee42765f 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -12,6 +12,7 @@ #include "blk-mq-debugfs.h" #include "blk-mq-sched.h" #include "blk-rq-qos.h" +#include "blk-io-hierarchy/stats.h"
static int queue_poll_stat_show(void *data, struct seq_file *m) { @@ -642,8 +643,8 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_ctx_attrs[] = { {}, };
-static void debugfs_create_files(struct dentry *parent, void *data, - const struct blk_mq_debugfs_attr *attr) +void debugfs_create_files(struct dentry *parent, void *data, + const struct blk_mq_debugfs_attr *attr) { if (IS_ERR_OR_NULL(parent)) return; @@ -686,6 +687,8 @@ void blk_mq_debugfs_register(struct request_queue *q) rqos = rqos->next; } } + + blk_mq_debugfs_register_hierarchy_stats(q); }
static void blk_mq_debugfs_register_ctx(struct blk_mq_hw_ctx *hctx, diff --git a/block/blk-mq-debugfs.h b/block/blk-mq-debugfs.h index 9c7d4b6117d4..4c422580ce84 100644 --- a/block/blk-mq-debugfs.h +++ b/block/blk-mq-debugfs.h @@ -35,6 +35,13 @@ void blk_mq_debugfs_unregister_sched_hctx(struct blk_mq_hw_ctx *hctx);
void blk_mq_debugfs_register_rqos(struct rq_qos *rqos); void blk_mq_debugfs_unregister_rqos(struct rq_qos *rqos); +void debugfs_create_files(struct dentry *parent, void *data, + const struct blk_mq_debugfs_attr *attr); + +static inline bool blk_mq_debugfs_enabled(struct request_queue *q) +{ + return !IS_ERR_OR_NULL(q->debugfs_dir); +} #else static inline void blk_mq_debugfs_register(struct request_queue *q) { diff --git a/block/blk-mq.c b/block/blk-mq.c index ed43c89132a7..630147c43a35 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -41,6 +41,7 @@ #include "blk-stat.h" #include "blk-mq-sched.h" #include "blk-rq-qos.h" +#include "blk-io-hierarchy/stats.h"
static DEFINE_PER_CPU(struct llist_head, blk_cpu_done); static DEFINE_PER_CPU(call_single_data_t, blk_cpu_csd); @@ -4117,6 +4118,8 @@ void blk_mq_release(struct request_queue *q) struct blk_mq_hw_ctx *hctx, *next; unsigned long i;
+ blk_io_hierarchy_stats_free(q); + queue_for_each_hw_ctx(q, hctx, i) WARN_ON_ONCE(hctx && list_empty(&hctx->hctx_list));
@@ -4315,9 +4318,12 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, /* mark the queue as mq asap */ q->mq_ops = set->ops;
- if (blk_mq_alloc_ctxs(q)) + if (blk_io_hierarchy_stats_alloc(q)) goto err_exit;
+ if (blk_mq_alloc_ctxs(q)) + goto err_hierarchy_stats; + /* init q->mq_kobj and sw queues' kobjects */ blk_mq_sysfs_init(q);
@@ -4352,6 +4358,8 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
err_hctxs: blk_mq_release(q); +err_hierarchy_stats: + blk_io_hierarchy_stats_free(q); err_exit: q->mq_ops = NULL; return -ENOMEM; diff --git a/block/blk-throttle.c b/block/blk-throttle.c index b1cbb3a1da0f..62662bcd2392 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -14,6 +14,7 @@ #include "blk-cgroup-rwstat.h" #include "blk-stat.h" #include "blk-throttle.h" +#include "blk-io-hierarchy/stats.h"
/* Max dispatch from a group in 1 round */ #define THROTL_GRP_QUANTUM 8 @@ -1291,6 +1292,8 @@ static void blk_throtl_dispatch_work_fn(struct work_struct *work) bio_list_add(&bio_list_on_stack, bio); spin_unlock_irq(&q->queue_lock);
+ bio_list_hierarchy_end_io_acct(&bio_list_on_stack, STAGE_THROTTLE); + if (!bio_list_empty(&bio_list_on_stack)) { blk_start_plug(&plug); while ((bio = bio_list_pop(&bio_list_on_stack))) @@ -2283,6 +2286,7 @@ bool __blk_throtl_bio(struct bio *bio) td->nr_queued[rw]++; throtl_add_bio_tg(bio, qn, tg); throttled = true; + bio_hierarchy_start_io_acct(bio, STAGE_THROTTLE);
/* * Update @tg's dispatch time and force schedule dispatch if @tg @@ -2443,6 +2447,8 @@ void blk_throtl_exit(struct gendisk *disk) del_timer_sync(&q->td->service_queue.pending_timer); throtl_shutdown_wq(q); blkcg_deactivate_policy(disk, &blkcg_policy_throtl); + blk_mq_unregister_hierarchy(q, STAGE_THROTTLE); + free_percpu(q->td->latency_buckets[READ]); free_percpu(q->td->latency_buckets[WRITE]); kfree(q->td); @@ -2477,6 +2483,8 @@ void blk_throtl_register(struct gendisk *disk) if (!td->track_bio_latency) blk_stat_enable_accounting(q); #endif + + blk_mq_register_hierarchy(q, STAGE_THROTTLE); }
#ifdef CONFIG_BLK_DEV_THROTTLING_LOW diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index ad92611a85f8..8739f169a99e 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -332,8 +332,11 @@ struct bio {
struct bio_set *bi_pool;
- +#ifdef CONFIG_BLK_IO_HIERARCHY_STATS + KABI_USE(1, u64 hierarchy_time) +#else KABI_RESERVE(1) +#endif KABI_RESERVE(2) KABI_RESERVE(3) KABI_RESERVE(4) @@ -498,6 +501,15 @@ enum stat_group { NR_STAT_GROUPS };
+enum stage_group { +#ifdef CONFIG_BLK_DEV_THROTTLING + STAGE_THROTTLE, +#endif + STAGE_RESERVE, + NR_BIO_STAGE_GROUPS, + NR_STAGE_GROUPS = NR_BIO_STAGE_GROUPS, +}; + static inline enum req_op bio_op(const struct bio *bio) { return bio->bi_opf & REQ_OP_MASK; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index a41707b09d8d..bf58ae46cc59 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -575,7 +575,11 @@ struct request_queue {
bool mq_sysfs_init_done;
+#ifdef CONFIG_BLK_IO_HIERARCHY_STATS + KABI_USE(1, struct blk_io_hierarchy_stats *io_hierarchy_stats) +#else KABI_RESERVE(1) +#endif KABI_RESERVE(2) KABI_RESERVE(3) KABI_RESERVE(4)
From: Kundan Kumar kundan.kumar@samsung.com
mainline inclusion from mainline-v6.8-rc1 commit 847c5bcdfb41704e52930783b028302f415a3209 category: feature bugzilla: https://gitee.com/openeuler/release-management/issues/IB4E8P CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
Write-back throttling (WBT) enables QUEUE_FLAG_STATS on the request queue. But WBT does not make sense for passthrough io, so skip QUEUE_FLAG_STATS processing.
Also skip rq_qos_issue/done for passthrough io.
Overall, the change gives a ~11% improvement in peak performance.
Signed-off-by: Kundan Kumar kundan.kumar@samsung.com Signed-off-by: Kanchan Joshi joshi.k@samsung.com Link: https://lore.kernel.org/r/20231123190331.7934-1-kundan.kumar@samsung.com Signed-off-by: Jens Axboe axboe@kernel.dk Signed-off-by: Yu Kuai yukuai3@huawei.com --- block/blk-mq.c | 3 ++- block/blk-rq-qos.h | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-)
diff --git a/block/blk-mq.c b/block/blk-mq.c index 630147c43a35..dab6b477f3e0 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1258,7 +1258,8 @@ void blk_mq_start_request(struct request *rq)
trace_block_rq_issue(rq);
- if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) { + if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags) && + !blk_rq_is_passthrough(rq)) { rq->io_start_time_ns = ktime_get_ns(); rq->stats_sectors = blk_rq_sectors(rq); rq->rq_flags |= RQF_STATS; diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h index f48ee150d667..37245c97ee61 100644 --- a/block/blk-rq-qos.h +++ b/block/blk-rq-qos.h @@ -118,7 +118,7 @@ static inline void rq_qos_cleanup(struct request_queue *q, struct bio *bio)
static inline void rq_qos_done(struct request_queue *q, struct request *rq) { - if (q->rq_qos) + if (q->rq_qos && !blk_rq_is_passthrough(rq)) __rq_qos_done(q->rq_qos, rq); }
From: Jens Axboe axboe@kernel.dk
mainline inclusion from mainline-v6.9-rc1 commit c4e47bbb00dad9240f4c054859950e962042ecb8 category: feature bugzilla: https://gitee.com/openeuler/release-management/issues/IB4E8P CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
In preparation for moving time keeping into blk.h, move the cgroup-related timestamp code into blk.h as well. This helps avoid a circular dependency, and blk.h is also a more appropriate home for this code because it is private to the block layer.
Leave struct bio_issue in blk_types.h as it's a proper type definition.
Signed-off-by: Jens Axboe axboe@kernel.dk Signed-off-by: Yu Kuai yukuai3@huawei.com --- block/blk-cgroup.h | 1 + block/blk.h | 42 +++++++++++++++++++++++++++++++++++++++ include/linux/blk_types.h | 42 --------------------------------------- 3 files changed, 43 insertions(+), 42 deletions(-)
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index e4bf964cf9a6..1ee4bda5c641 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -19,6 +19,7 @@ #include <linux/kthread.h> #include <linux/blk-mq.h> #include <linux/llist.h> +#include "blk.h"
struct blkcg_gq; struct blkg_policy_data; diff --git a/block/blk.h b/block/blk.h index 3874d6037230..2dbb2bd8b347 100644 --- a/block/blk.h +++ b/block/blk.h @@ -520,4 +520,46 @@ static inline int req_ref_read(struct request *req) return atomic_read(&req->ref); }
+/* + * From most significant bit: + * 1 bit: reserved for other usage, see below + * 12 bits: original size of bio + * 51 bits: issue time of bio + */ +#define BIO_ISSUE_RES_BITS 1 +#define BIO_ISSUE_SIZE_BITS 12 +#define BIO_ISSUE_RES_SHIFT (64 - BIO_ISSUE_RES_BITS) +#define BIO_ISSUE_SIZE_SHIFT (BIO_ISSUE_RES_SHIFT - BIO_ISSUE_SIZE_BITS) +#define BIO_ISSUE_TIME_MASK ((1ULL << BIO_ISSUE_SIZE_SHIFT) - 1) +#define BIO_ISSUE_SIZE_MASK \ + (((1ULL << BIO_ISSUE_SIZE_BITS) - 1) << BIO_ISSUE_SIZE_SHIFT) +#define BIO_ISSUE_RES_MASK (~((1ULL << BIO_ISSUE_RES_SHIFT) - 1)) + +/* Reserved bit for blk-throtl */ +#define BIO_ISSUE_THROTL_SKIP_LATENCY (1ULL << 63) + +static inline u64 __bio_issue_time(u64 time) +{ + return time & BIO_ISSUE_TIME_MASK; +} + +static inline u64 bio_issue_time(struct bio_issue *issue) +{ + return __bio_issue_time(issue->value); +} + +static inline sector_t bio_issue_size(struct bio_issue *issue) +{ + return ((issue->value & BIO_ISSUE_SIZE_MASK) >> BIO_ISSUE_SIZE_SHIFT); +} + +static inline void bio_issue_init(struct bio_issue *issue, + sector_t size) +{ + size &= (1ULL << BIO_ISSUE_SIZE_BITS) - 1; + issue->value = ((issue->value & BIO_ISSUE_RES_MASK) | + (ktime_get_ns() & BIO_ISSUE_TIME_MASK) | + ((u64)size << BIO_ISSUE_SIZE_SHIFT)); +} + #endif /* BLK_INTERNAL_H */ diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 8739f169a99e..858bb4b71474 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -223,52 +223,10 @@ static inline bool blk_path_error(blk_status_t error) return true; }
-/* - * From most significant bit: - * 1 bit: reserved for other usage, see below - * 12 bits: original size of bio - * 51 bits: issue time of bio - */ -#define BIO_ISSUE_RES_BITS 1 -#define BIO_ISSUE_SIZE_BITS 12 -#define BIO_ISSUE_RES_SHIFT (64 - BIO_ISSUE_RES_BITS) -#define BIO_ISSUE_SIZE_SHIFT (BIO_ISSUE_RES_SHIFT - BIO_ISSUE_SIZE_BITS) -#define BIO_ISSUE_TIME_MASK ((1ULL << BIO_ISSUE_SIZE_SHIFT) - 1) -#define BIO_ISSUE_SIZE_MASK \ - (((1ULL << BIO_ISSUE_SIZE_BITS) - 1) << BIO_ISSUE_SIZE_SHIFT) -#define BIO_ISSUE_RES_MASK (~((1ULL << BIO_ISSUE_RES_SHIFT) - 1)) - -/* Reserved bit for blk-throtl */ -#define BIO_ISSUE_THROTL_SKIP_LATENCY (1ULL << 63) - struct bio_issue { u64 value; };
-static inline u64 __bio_issue_time(u64 time) -{ - return time & BIO_ISSUE_TIME_MASK; -} - -static inline u64 bio_issue_time(struct bio_issue *issue) -{ - return __bio_issue_time(issue->value); -} - -static inline sector_t bio_issue_size(struct bio_issue *issue) -{ - return ((issue->value & BIO_ISSUE_SIZE_MASK) >> BIO_ISSUE_SIZE_SHIFT); -} - -static inline void bio_issue_init(struct bio_issue *issue, - sector_t size) -{ - size &= (1ULL << BIO_ISSUE_SIZE_BITS) - 1; - issue->value = ((issue->value & BIO_ISSUE_RES_MASK) | - (ktime_get_ns() & BIO_ISSUE_TIME_MASK) | - ((u64)size << BIO_ISSUE_SIZE_SHIFT)); -} - typedef __u32 __bitwise blk_opf_t;
typedef unsigned int blk_qc_t;
From: Jens Axboe axboe@kernel.dk
mainline inclusion from mainline-v6.9-rc1 commit 08420cf70cfb32eed2a0abfeb5c54c5651bd0c99 category: feature bugzilla: https://gitee.com/openeuler/release-management/issues/IB4E8P CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
Convert any user of ktime_get_ns() to use blk_time_get_ns(), and ktime_get() to blk_time_get(), so we have a unified API for querying the current time in nanoseconds or as ktime.
No functional changes intended, this patch just wraps ktime_get_ns() and ktime_get() with a block helper.
Reviewed-by: Johannes Thumshirn johannes.thumshirn@wdc.com Signed-off-by: Jens Axboe axboe@kernel.dk Conflicts: block/blk-cgroup.c [commit a65983d90bfb ("sched/psi: Introduce fine grained stall time collect for cgroup reclaim") changes context in blkcg_maybe_throttle_blkg().] block/blk-io-hierarchy/stats.c [Also change ktime_get_ns() from self-innovate patch.] Signed-off-by: Yu Kuai yukuai3@huawei.com --- block/bfq-cgroup.c | 14 +++++++------- block/bfq-iosched.c | 28 ++++++++++++++-------------- block/blk-cgroup.c | 2 +- block/blk-flush.c | 2 +- block/blk-io-hierarchy/stats.c | 4 ++-- block/blk-iocost.c | 8 ++++---- block/blk-iolatency.c | 6 +++--- block/blk-mq.c | 16 ++++++++-------- block/blk-throttle.c | 6 +++--- block/blk-wbt.c | 6 +++--- block/blk.h | 13 ++++++++++++- 11 files changed, 58 insertions(+), 47 deletions(-)
diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index 2c90e5de0acd..d442ee358fc2 100644 --- a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c @@ -127,7 +127,7 @@ static void bfqg_stats_update_group_wait_time(struct bfqg_stats *stats) if (!bfqg_stats_waiting(stats)) return;
- now = ktime_get_ns(); + now = blk_time_get_ns(); if (now > stats->start_group_wait_time) bfq_stat_add(&stats->group_wait_time, now - stats->start_group_wait_time); @@ -144,7 +144,7 @@ static void bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg, return; if (bfqg == curr_bfqg) return; - stats->start_group_wait_time = ktime_get_ns(); + stats->start_group_wait_time = blk_time_get_ns(); bfqg_stats_mark_waiting(stats); }
@@ -156,7 +156,7 @@ static void bfqg_stats_end_empty_time(struct bfqg_stats *stats) if (!bfqg_stats_empty(stats)) return;
- now = ktime_get_ns(); + now = blk_time_get_ns(); if (now > stats->start_empty_time) bfq_stat_add(&stats->empty_time, now - stats->start_empty_time); @@ -183,7 +183,7 @@ void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) if (bfqg_stats_empty(stats)) return;
- stats->start_empty_time = ktime_get_ns(); + stats->start_empty_time = blk_time_get_ns(); bfqg_stats_mark_empty(stats); }
@@ -192,7 +192,7 @@ void bfqg_stats_update_idle_time(struct bfq_group *bfqg) struct bfqg_stats *stats = &bfqg->stats;
if (bfqg_stats_idling(stats)) { - u64 now = ktime_get_ns(); + u64 now = blk_time_get_ns();
if (now > stats->start_idle_time) bfq_stat_add(&stats->idle_time, @@ -205,7 +205,7 @@ void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) { struct bfqg_stats *stats = &bfqg->stats;
- stats->start_idle_time = ktime_get_ns(); + stats->start_idle_time = blk_time_get_ns(); bfqg_stats_mark_idling(stats); }
@@ -242,7 +242,7 @@ void bfqg_stats_update_completion(struct bfq_group *bfqg, u64 start_time_ns, u64 io_start_time_ns, blk_opf_t opf) { struct bfqg_stats *stats = &bfqg->stats; - u64 now = ktime_get_ns(); + u64 now = blk_time_get_ns();
if (now > io_start_time_ns) blkg_rwstat_add(&stats->service_time, opf, diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 7e0dcded5713..05372a78cd51 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -1005,7 +1005,7 @@ static struct request *bfq_check_fifo(struct bfq_queue *bfqq,
rq = rq_entry_fifo(bfqq->fifo.next);
- if (rq == last || ktime_get_ns() < rq->fifo_time) + if (rq == last || blk_time_get_ns() < rq->fifo_time) return NULL;
bfq_log_bfqq(bfqq->bfqd, bfqq, "check_fifo: returned %p", rq); @@ -1829,7 +1829,7 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, * bfq_bfqq_update_budg_for_activation for * details on the usage of the next variable. */ - arrived_in_time = ktime_get_ns() <= + arrived_in_time = blk_time_get_ns() <= bfqq->ttime.last_end_request + bfqd->bfq_slice_idle * 3; unsigned int act_idx = bfq_actuator_index(bfqd, rq->bio); @@ -2208,7 +2208,7 @@ static void bfq_add_request(struct request *rq) struct request *next_rq, *prev; unsigned int old_wr_coeff = bfqq->wr_coeff; bool interactive = false; - u64 now_ns = ktime_get_ns(); + u64 now_ns = blk_time_get_ns();
bfq_log_bfqq(bfqd, bfqq, "add_request %d", rq_is_sync(rq)); bfqq->queued[rq_is_sync(rq)]++; @@ -2262,7 +2262,7 @@ static void bfq_add_request(struct request *rq) bfqd->rqs_injected && bfqd->tot_rq_in_driver > 0)) && time_is_before_eq_jiffies(bfqq->decrease_time_jif + msecs_to_jiffies(10))) { - bfqd->last_empty_occupied_ns = ktime_get_ns(); + bfqd->last_empty_occupied_ns = blk_time_get_ns(); /* * Start the state machine for measuring the * total service time of rq: setting @@ -3296,7 +3296,7 @@ static void bfq_set_budget_timeout(struct bfq_data *bfqd, else timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight;
- bfqd->last_budget_start = ktime_get(); + bfqd->last_budget_start = blk_time_get();
bfqq->budget_timeout = jiffies + bfqd->bfq_timeout * timeout_coeff; @@ -3396,7 +3396,7 @@ static void bfq_arm_slice_timer(struct bfq_data *bfqd) else if (bfqq->wr_coeff > 1) sl = max_t(u32, sl, 20ULL * NSEC_PER_MSEC);
- bfqd->last_idling_start = ktime_get(); + bfqd->last_idling_start = blk_time_get(); bfqd->last_idling_start_jiffies = jiffies;
hrtimer_start(&bfqd->idle_slice_timer, ns_to_ktime(sl), @@ -3435,7 +3435,7 @@ static void bfq_reset_rate_computation(struct bfq_data *bfqd, struct request *rq) { if (rq != NULL) { /* new rq dispatch now, reset accordingly */ - bfqd->last_dispatch = bfqd->first_dispatch = ktime_get_ns(); + bfqd->last_dispatch = bfqd->first_dispatch = blk_time_get_ns(); bfqd->peak_rate_samples = 1; bfqd->sequential_samples = 0; bfqd->tot_sectors_dispatched = bfqd->last_rq_max_size = @@ -3592,7 +3592,7 @@ static void bfq_update_rate_reset(struct bfq_data *bfqd, struct request *rq) */ static void bfq_update_peak_rate(struct bfq_data *bfqd, struct request *rq) { - u64 now_ns = ktime_get_ns(); + u64 now_ns = blk_time_get_ns();
if (bfqd->peak_rate_samples == 0) { /* first dispatch */ bfq_log(bfqd, "update_peak_rate: goto reset, samples %d", @@ -4164,7 +4164,7 @@ static bool bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bfqq, if (compensate) delta_ktime = bfqd->last_idling_start; else - delta_ktime = ktime_get(); + delta_ktime = blk_time_get(); delta_ktime = ktime_sub(delta_ktime, bfqd->last_budget_start); delta_usecs = ktime_to_us(delta_ktime);
@@ -5593,7 +5593,7 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, struct bfq_io_cq *bic, pid_t pid, int is_sync, unsigned int act_idx) { - u64 now_ns = ktime_get_ns(); + u64 now_ns = blk_time_get_ns();
bfqq->actuator_idx = act_idx; RB_CLEAR_NODE(&bfqq->entity.rb_node); @@ -5903,7 +5903,7 @@ static void bfq_update_io_thinktime(struct bfq_data *bfqd, */ if (bfqq->dispatched || bfq_bfqq_busy(bfqq)) return; - elapsed = ktime_get_ns() - bfqq->ttime.last_end_request; + elapsed = blk_time_get_ns() - bfqq->ttime.last_end_request; elapsed = min_t(u64, elapsed, 2ULL * bfqd->bfq_slice_idle);
ttime->ttime_samples = (7*ttime->ttime_samples + 256) / 8; @@ -6195,7 +6195,7 @@ static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) bfq_add_request(rq); idle_timer_disabled = waiting && !bfq_bfqq_wait_request(bfqq);
- rq->fifo_time = ktime_get_ns() + bfqd->bfq_fifo_expire[rq_is_sync(rq)]; + rq->fifo_time = blk_time_get_ns() + bfqd->bfq_fifo_expire[rq_is_sync(rq)]; list_add_tail(&rq->queuelist, &bfqq->fifo);
bfq_rq_enqueued(bfqd, bfqq, rq); @@ -6371,7 +6371,7 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) bfq_weights_tree_remove(bfqq); }
- now_ns = ktime_get_ns(); + now_ns = blk_time_get_ns();
bfqq->ttime.last_end_request = now_ns;
@@ -6586,7 +6586,7 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) static void bfq_update_inject_limit(struct bfq_data *bfqd, struct bfq_queue *bfqq) { - u64 tot_time_ns = ktime_get_ns() - bfqd->last_empty_occupied_ns; + u64 tot_time_ns = blk_time_get_ns() - bfqd->last_empty_occupied_ns; unsigned int old_limit = bfqq->inject_limit;
if (bfqq->last_serv_time_ns > 0 && bfqd->rqs_injected) { diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index f6cb2cd8ab33..cd3e8ac50fd0 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1891,7 +1891,7 @@ static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay) { unsigned long pflags = 0; bool clamp; - u64 now = ktime_to_ns(ktime_get()); + u64 now = blk_time_get_ns(); u64 exp; u64 delay_nsec = 0; int tok; diff --git a/block/blk-flush.c b/block/blk-flush.c index 313f0ffcce42..315ef8542380 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -143,7 +143,7 @@ static void blk_account_io_flush(struct request *rq) part_stat_lock(); part_stat_inc(part, ios[STAT_FLUSH]); part_stat_add(part, nsecs[STAT_FLUSH], - ktime_get_ns() - rq->start_time_ns); + blk_time_get_ns() - rq->start_time_ns); part_stat_unlock(); }
diff --git a/block/blk-io-hierarchy/stats.c b/block/blk-io-hierarchy/stats.c index f078ebc5f668..e717bf790709 100644 --- a/block/blk-io-hierarchy/stats.c +++ b/block/blk-io-hierarchy/stats.c @@ -186,7 +186,7 @@ void bio_hierarchy_start_io_acct(struct bio *bio, enum stage_group stage)
hstage = q->io_hierarchy_stats->hstage[stage]; io_hierarchy_inc(hstage->hstats, dispatched, hierarchy_op(bio)); - bio->hierarchy_time = ktime_get_ns(); + bio->hierarchy_time = blk_time_get_ns(); }
void bio_hierarchy_end_io_acct(struct bio *bio, enum stage_group stage, @@ -208,7 +208,7 @@ void bio_hierarchy_end_io_acct(struct bio *bio, enum stage_group stage, void bio_list_hierarchy_end_io_acct(struct bio_list *list, enum stage_group stage) { - u64 time = ktime_get_ns(); + u64 time = blk_time_get_ns(); struct bio *bio;
bio_list_for_each(bio, list) diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 3a5ba10ba2b1..67c8941b2c1d 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -829,7 +829,7 @@ static int ioc_autop_idx(struct ioc *ioc, struct gendisk *disk)
/* step up/down based on the vrate */ vrate_pct = div64_u64(ioc->vtime_base_rate * 100, VTIME_PER_USEC); - now_ns = ktime_get_ns(); + now_ns = blk_time_get_ns();
if (p->too_fast_vrate_pct && p->too_fast_vrate_pct <= vrate_pct) { if (!ioc->autop_too_fast_at) @@ -1044,7 +1044,7 @@ static void ioc_now(struct ioc *ioc, struct ioc_now *now) unsigned seq; u64 vrate;
- now->now_ns = ktime_get(); + now->now_ns = blk_time_get_ns(); now->now = ktime_to_us(now->now_ns); vrate = atomic64_read(&ioc->vtime_rate);
@@ -2823,7 +2823,7 @@ static void ioc_rqos_done(struct rq_qos *rqos, struct request *rq) return; }
- on_q_ns = ktime_get_ns() - rq->alloc_time_ns; + on_q_ns = blk_time_get_ns() - rq->alloc_time_ns; rq_wait_ns = rq->start_time_ns - rq->alloc_time_ns; size_nsec = div64_u64(calc_size_vtime_cost(rq, ioc), VTIME_PER_NSEC);
@@ -2906,7 +2906,7 @@ static int blk_iocost_init(struct gendisk *disk) ioc->vtime_base_rate = VTIME_PER_USEC; atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC); seqcount_spinlock_init(&ioc->period_seqcount, &ioc->lock); - ioc->period_at = ktime_to_us(ktime_get()); + ioc->period_at = ktime_to_us(blk_time_get()); atomic64_set(&ioc->cur_period, 0); atomic_set(&ioc->hweight_gen, 0);
diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c index c1a6aba1d59e..ebb522788d97 100644 --- a/block/blk-iolatency.c +++ b/block/blk-iolatency.c @@ -609,7 +609,7 @@ static void blkcg_iolatency_done_bio(struct rq_qos *rqos, struct bio *bio) if (!iolat->blkiolat->enabled) return;
- now = ktime_to_ns(ktime_get()); + now = blk_time_get_ns(); while (blkg && blkg->parent) { iolat = blkg_to_lat(blkg); if (!iolat) { @@ -661,7 +661,7 @@ static void blkiolatency_timer_fn(struct timer_list *t) struct blk_iolatency *blkiolat = from_timer(blkiolat, t, timer); struct blkcg_gq *blkg; struct cgroup_subsys_state *pos_css; - u64 now = ktime_to_ns(ktime_get()); + u64 now = blk_time_get_ns();
rcu_read_lock(); blkg_for_each_descendant_pre(blkg, pos_css, @@ -985,7 +985,7 @@ static void iolatency_pd_init(struct blkg_policy_data *pd) struct blkcg_gq *blkg = lat_to_blkg(iolat); struct rq_qos *rqos = iolat_rq_qos(blkg->q); struct blk_iolatency *blkiolat = BLKIOLATENCY(rqos); - u64 now = ktime_to_ns(ktime_get()); + u64 now = blk_time_get_ns(); int cpu;
if (blk_queue_nonrot(blkg->q)) diff --git a/block/blk-mq.c b/block/blk-mq.c index dab6b477f3e0..389da90d73cc 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -324,7 +324,7 @@ void blk_rq_init(struct request_queue *q, struct request *rq) RB_CLEAR_NODE(&rq->rb_node); rq->tag = BLK_MQ_NO_TAG; rq->internal_tag = BLK_MQ_NO_TAG; - rq->start_time_ns = ktime_get_ns(); + rq->start_time_ns = blk_time_get_ns(); rq->part = NULL; blk_crypto_rq_set_defaults(rq); } @@ -334,7 +334,7 @@ EXPORT_SYMBOL(blk_rq_init); static inline void blk_mq_rq_time_init(struct request *rq, u64 alloc_time_ns) { if (blk_mq_need_time_stamp(rq)) - rq->start_time_ns = ktime_get_ns(); + rq->start_time_ns = blk_time_get_ns(); else rq->start_time_ns = 0;
@@ -446,7 +446,7 @@ static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data)
/* alloc_time includes depth and tag waits */ if (blk_queue_rq_alloc_time(q)) - alloc_time_ns = ktime_get_ns(); + alloc_time_ns = blk_time_get_ns();
if (data->cmd_flags & REQ_NOWAIT) data->flags |= BLK_MQ_REQ_NOWAIT; @@ -629,7 +629,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
/* alloc_time includes depth and tag waits */ if (blk_queue_rq_alloc_time(q)) - alloc_time_ns = ktime_get_ns(); + alloc_time_ns = blk_time_get_ns();
/* * If the tag allocator sleeps we could get an allocation for a @@ -1042,7 +1042,7 @@ static inline void __blk_mq_end_request_acct(struct request *rq, u64 now) inline void __blk_mq_end_request(struct request *rq, blk_status_t error) { if (blk_mq_need_time_stamp(rq)) - __blk_mq_end_request_acct(rq, ktime_get_ns()); + __blk_mq_end_request_acct(rq, blk_time_get_ns());
blk_mq_finish_request(rq);
@@ -1090,7 +1090,7 @@ void blk_mq_end_request_batch(struct io_comp_batch *iob) u64 now = 0;
if (iob->need_ts) - now = ktime_get_ns(); + now = blk_time_get_ns();
while ((rq = rq_list_pop(&iob->req_list)) != NULL) { prefetch(rq->bio); @@ -1260,7 +1260,7 @@ void blk_mq_start_request(struct request *rq)
if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags) && !blk_rq_is_passthrough(rq)) { - rq->io_start_time_ns = ktime_get_ns(); + rq->io_start_time_ns = blk_time_get_ns(); rq->stats_sectors = blk_rq_sectors(rq); rq->rq_flags |= RQF_STATS; rq_qos_issue(q, rq); @@ -3125,7 +3125,7 @@ blk_status_t blk_insert_cloned_request(struct request *rq) blk_mq_run_dispatch_ops(q, ret = blk_mq_request_issue_directly(rq, true)); if (ret) - blk_account_io_done(rq, ktime_get_ns()); + blk_account_io_done(rq, blk_time_get_ns()); return ret; } EXPORT_SYMBOL_GPL(blk_insert_cloned_request); diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 62662bcd2392..61fc85f8f53a 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -1844,7 +1844,7 @@ static bool throtl_tg_is_idle(struct throtl_grp *tg) time = min_t(unsigned long, MAX_IDLE_TIME, 4 * tg->idletime_threshold); ret = tg->latency_target == DFL_LATENCY_TARGET || tg->idletime_threshold == DFL_IDLE_THRESHOLD || - (ktime_get_ns() >> 10) - tg->last_finish_time > time || + (blk_time_get_ns() >> 10) - tg->last_finish_time > time || tg->avg_idletime > tg->idletime_threshold || (tg->latency_target && tg->bio_cnt && tg->bad_bio_cnt * 5 < tg->bio_cnt); @@ -2089,7 +2089,7 @@ static void blk_throtl_update_idletime(struct throtl_grp *tg) if (last_finish_time == 0) return;
- now = ktime_get_ns() >> 10; + now = blk_time_get_ns() >> 10; if (now <= last_finish_time || last_finish_time == tg->checked_last_finish_time) return; @@ -2357,7 +2357,7 @@ void blk_throtl_bio_endio(struct bio *bio) if (!tg->td->limit_valid[LIMIT_LOW]) return;
- finish_time_ns = ktime_get_ns(); + finish_time_ns = blk_time_get_ns(); tg->last_finish_time = finish_time_ns >> 10;
start_time = bio_issue_time(&bio->bi_issue) >> 10; diff --git a/block/blk-wbt.c b/block/blk-wbt.c index 296cdfea7c2a..a212120591c7 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -29,6 +29,7 @@ #include "blk-wbt.h" #include "blk-rq-qos.h" #include "elevator.h" +#include "blk.h"
#define CREATE_TRACE_POINTS #include <trace/events/wbt.h> @@ -275,13 +276,12 @@ static inline bool stat_sample_valid(struct blk_rq_stat *stat)
static u64 rwb_sync_issue_lat(struct rq_wb *rwb) { - u64 now, issue = READ_ONCE(rwb->sync_issue); + u64 issue = READ_ONCE(rwb->sync_issue);
if (!issue || !rwb->sync_cookie) return 0;
- now = ktime_to_ns(ktime_get()); - return now - issue; + return blk_time_get_ns() - issue; }
static inline unsigned int wbt_inflight(struct rq_wb *rwb) diff --git a/block/blk.h b/block/blk.h index 2dbb2bd8b347..2a0ac69e238b 100644 --- a/block/blk.h +++ b/block/blk.h @@ -4,6 +4,7 @@
#include <linux/blk-crypto.h> #include <linux/memblock.h> /* for max_pfn/max_low_pfn */ +#include <linux/timekeeping.h> #include <xen/xen.h> #include "blk-crypto-internal.h"
@@ -520,6 +521,16 @@ static inline int req_ref_read(struct request *req) return atomic_read(&req->ref); }
+static inline u64 blk_time_get_ns(void) +{ + return ktime_get_ns(); +} + +static inline ktime_t blk_time_get(void) +{ + return ns_to_ktime(blk_time_get_ns()); +} + /* * From most significant bit: * 1 bit: reserved for other usage, see below @@ -558,7 +569,7 @@ static inline void bio_issue_init(struct bio_issue *issue, { size &= (1ULL << BIO_ISSUE_SIZE_BITS) - 1; issue->value = ((issue->value & BIO_ISSUE_RES_MASK) | - (ktime_get_ns() & BIO_ISSUE_TIME_MASK) | + (blk_time_get_ns() & BIO_ISSUE_TIME_MASK) | ((u64)size << BIO_ISSUE_SIZE_SHIFT)); }
From: Jens Axboe axboe@kernel.dk
mainline inclusion from mainline-v6.9-rc1 commit da4c8c3d0975f031ef82d39927102e39fa6ddfac category: feature bugzilla: https://gitee.com/openeuler/release-management/issues/IB4E8P CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
Querying the current time is the most costly thing we do in the block layer per IO, and depending on kernel config settings, we may do it many times per IO.
None of the callers actually need nsec granularity. Take advantage of that by caching the current time in the plug, with the assumption here being that any time checking will be temporally close enough that the slight loss of precision doesn't matter.
If the block plug gets flushed, eg on preempt or schedule out, then we invalidate the cached clock.
On a basic peak IOPS test case with iostats enabled, this changes the performance from:
IOPS=108.41M, BW=52.93GiB/s, IOS/call=31/31 IOPS=108.43M, BW=52.94GiB/s, IOS/call=32/32 IOPS=108.29M, BW=52.88GiB/s, IOS/call=31/32 IOPS=108.35M, BW=52.91GiB/s, IOS/call=32/32 IOPS=108.42M, BW=52.94GiB/s, IOS/call=31/31 IOPS=108.40M, BW=52.93GiB/s, IOS/call=32/32 IOPS=108.31M, BW=52.89GiB/s, IOS/call=32/31
to
IOPS=118.79M, BW=58.00GiB/s, IOS/call=31/32 IOPS=118.62M, BW=57.92GiB/s, IOS/call=31/31 IOPS=118.80M, BW=58.01GiB/s, IOS/call=32/31 IOPS=118.78M, BW=58.00GiB/s, IOS/call=32/32 IOPS=118.69M, BW=57.95GiB/s, IOS/call=32/31 IOPS=118.62M, BW=57.92GiB/s, IOS/call=32/31 IOPS=118.63M, BW=57.92GiB/s, IOS/call=31/32
which is more than a 9% improvement in performance. Looking at perf diff, we can see a huge reduction in time overhead:
10.55% -9.88% [kernel.vmlinux] [k] read_tsc 1.31% -1.22% [kernel.vmlinux] [k] ktime_get
Note that since this relies on blk_plug for the caching, it's only applicable to the issue side. But this is where most of the time calls happen anyway. On the completion side, cached time stamping is done with struct io_comp_batch, as long as the driver supports it.
It's also worth noting that the above testing doesn't enable any of the higher cost CPU items on the block layer side, like wbt, cgroups, iocost, etc, which all would add additional time querying and hence overhead. IOW, results would likely look even better in comparison with those enabled, as distros would do.
Reviewed-by: Johannes Thumshirn johannes.thumshirn@wdc.com Signed-off-by: Jens Axboe axboe@kernel.dk Signed-off-by: Yu Kuai yukuai3@huawei.com --- block/blk-core.c | 1 + block/blk.h | 14 +++++++++++++- include/linux/blkdev.h | 1 + 3 files changed, 15 insertions(+), 1 deletion(-)
diff --git a/block/blk-core.c b/block/blk-core.c index 76a1b6d245b2..337f4b163b3a 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1101,6 +1101,7 @@ void blk_start_plug_nr_ios(struct blk_plug *plug, unsigned short nr_ios) if (tsk->plug) return;
+ plug->cur_ktime = 0; plug->mq_list = NULL; plug->cached_rq = NULL; plug->nr_ios = min_t(unsigned short, nr_ios, BLK_MAX_REQUEST_COUNT); diff --git a/block/blk.h b/block/blk.h index 2a0ac69e238b..bf407eace2df 100644 --- a/block/blk.h +++ b/block/blk.h @@ -523,7 +523,19 @@ static inline int req_ref_read(struct request *req)
static inline u64 blk_time_get_ns(void) { - return ktime_get_ns(); + struct blk_plug *plug = current->plug; + + if (!plug) + return ktime_get_ns(); + + /* + * 0 could very well be a valid time, but rather than flag "this is + * a valid timestamp" separately, just accept that we'll do an extra + * ktime_get_ns() if we just happen to get 0 as the current time. + */ + if (!plug->cur_ktime) + plug->cur_ktime = ktime_get_ns(); + return plug->cur_ktime; }
static inline ktime_t blk_time_get(void) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index bf58ae46cc59..d363f46ef1c3 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1032,6 +1032,7 @@ struct blk_plug {
/* if ios_left is > 1, we can batch tag/rq allocations */ struct request *cached_rq; + u64 cur_ktime; unsigned short nr_ios;
unsigned short rq_count;
From: Jens Axboe axboe@kernel.dk
mainline inclusion from mainline-v6.9-rc1 commit 06b23f92af87a84d70881b2ecaa72e00f7838264 category: feature bugzilla: https://gitee.com/openeuler/release-management/issues/IB4E8P CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
Mark the task as having a cached timestamp when we assign it, so we can efficiently check if it needs updating after being scheduled back in. This covers both the actual schedule out case, which would've flushed the plug, and the preemption case which doesn't touch the plugged requests (for many reasons, one of them being that we'd then need to have preemption disabled around plug state manipulation).
Reviewed-by: Johannes Thumshirn johannes.thumshirn@wdc.com Signed-off-by: Jens Axboe axboe@kernel.dk Signed-off-by: Yu Kuai yukuai3@huawei.com --- block/blk-core.c | 2 ++ block/blk.h | 4 +++- include/linux/blkdev.h | 16 ++++++++++++++++ include/linux/sched.h | 2 +- kernel/sched/core.c | 6 ++++-- 5 files changed, 26 insertions(+), 4 deletions(-)
diff --git a/block/blk-core.c b/block/blk-core.c index 337f4b163b3a..e7d825435e48 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1201,6 +1201,8 @@ void __blk_flush_plug(struct blk_plug *plug, bool from_schedule) */ if (unlikely(!rq_list_empty(plug->cached_rq))) blk_mq_free_plug_rqs(plug); + + current->flags &= ~PF_BLOCK_TS; }
/** diff --git a/block/blk.h b/block/blk.h index bf407eace2df..5d00d7f7910e 100644 --- a/block/blk.h +++ b/block/blk.h @@ -533,8 +533,10 @@ static inline u64 blk_time_get_ns(void) * a valid timestamp" separately, just accept that we'll do an extra * ktime_get_ns() if we just happen to get 0 as the current time. */ - if (!plug->cur_ktime) + if (!plug->cur_ktime) { plug->cur_ktime = ktime_get_ns(); + current->flags |= PF_BLOCK_TS; + } return plug->cur_ktime; }
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index d363f46ef1c3..3a80bedef6ee 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1067,6 +1067,18 @@ static inline void blk_flush_plug(struct blk_plug *plug, bool async) __blk_flush_plug(plug, async); }
+/* + * tsk == current here + */ +static inline void blk_plug_invalidate_ts(struct task_struct *tsk) +{ + struct blk_plug *plug = tsk->plug; + + if (plug) + plug->cur_ktime = 0; + current->flags &= ~PF_BLOCK_TS; +} + int blkdev_issue_flush(struct block_device *bdev); long nr_blockdev_pages(void); #else /* CONFIG_BLOCK */ @@ -1090,6 +1102,10 @@ static inline void blk_flush_plug(struct blk_plug *plug, bool async) { }
+static inline void blk_plug_invalidate_ts(struct task_struct *tsk) +{ +} + static inline int blkdev_issue_flush(struct block_device *bdev) { return 0; diff --git a/include/linux/sched.h b/include/linux/sched.h index b8be76b0c120..fe23380288b7 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1840,7 +1840,7 @@ extern struct pid *cad_pid; #define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_mask */ #define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */ #define PF_MEMALLOC_PIN 0x10000000 /* Allocation context constrained to zones which allow long term pinning. */ -#define PF__HOLE__20000000 0x20000000 +#define PF_BLOCK_TS 0x20000000 /* plug has ts that needs updating */ #define PF__HOLE__40000000 0x40000000 #define PF_SUSPEND_TASK 0x80000000 /* This thread called freeze_processes() and should not be frozen */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7a9aff1dbb3c..462571b26f88 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6808,10 +6808,12 @@ static inline void sched_submit_work(struct task_struct *tsk)
static void sched_update_worker(struct task_struct *tsk) { - if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) { + if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER | PF_BLOCK_TS)) { + if (tsk->flags & PF_BLOCK_TS) + blk_plug_invalidate_ts(tsk); if (tsk->flags & PF_WQ_WORKER) wq_worker_running(tsk); - else + else if (tsk->flags & PF_IO_WORKER) io_wq_worker_running(tsk); } }
From: Jens Axboe axboe@kernel.dk
mainline inclusion from mainline-v6.9-rc1 commit b874d4aae58b92144ec2c8fa5dc0a27c98388fcc category: feature bugzilla: https://gitee.com/openeuler/release-management/issues/IB4E8P CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
We should not have any callers of this from non-task context, but Jakub ran [1] into one from blk-iocost. Rather than risk running into others, or future ones, just limit blk_time_get_ns() to when it is called from a task. Any other usage is invalid.
[1] https://lore.kernel.org/lkml/CAHk-=wiOaBLqarS2uFhM1YdwOvCX4CZaWkeyNDY1zONpbY...
Fixes: da4c8c3d0975 ("block: cache current nsec time in struct blk_plug") Reported-by: Jakub Kicinski kuba@kernel.org Signed-off-by: Jens Axboe axboe@kernel.dk Signed-off-by: Yu Kuai yukuai3@huawei.com --- block/blk.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/block/blk.h b/block/blk.h index 5d00d7f7910e..0c469a84c9e0 100644 --- a/block/blk.h +++ b/block/blk.h @@ -525,7 +525,7 @@ static inline u64 blk_time_get_ns(void) { struct blk_plug *plug = current->plug;
- if (!plug) + if (!plug || !in_task()) return ktime_get_ns();
/*
mainline inclusion from mainline-v6.9-rc4 commit 3ec4848913d695245716ea45ca4872d9dff097a5 category: feature bugzilla: https://gitee.com/openeuler/release-management/issues/IB4E8P CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
While monitoring the throttle time of IO from iocost, it's found that such time is always zero after the io_schedule() from ioc_rqos_throttle, for example, with the following debug patch:
+ printk("%s-%d: %s enter %llu\n", current->comm, current->pid, __func__, blk_time_get_ns()); while (true) { set_current_state(TASK_UNINTERRUPTIBLE); if (wait.committed) break; io_schedule(); } + printk("%s-%d: %s exit %llu\n", current->comm, current->pid, __func__, blk_time_get_ns());
It can be observed that blk_time_get_ns() always returns the same time:
[ 1068.096579] fio-1268: ioc_rqos_throttle enter 1067901962288 [ 1068.272587] fio-1268: ioc_rqos_throttle exit 1067901962288 [ 1068.274389] fio-1268: ioc_rqos_throttle enter 1067901962288 [ 1068.472690] fio-1268: ioc_rqos_throttle exit 1067901962288 [ 1068.474485] fio-1268: ioc_rqos_throttle enter 1067901962288 [ 1068.672656] fio-1268: ioc_rqos_throttle exit 1067901962288 [ 1068.674451] fio-1268: ioc_rqos_throttle enter 1067901962288 [ 1068.872655] fio-1268: ioc_rqos_throttle exit 1067901962288
And I think the root cause is that 'PF_BLOCK_TS' is always cleared by blk_flush_plug() before schedule(), hence blk_plug_invalidate_ts() will never be called:
blk_time_get_ns plug->cur_ktime = ktime_get_ns(); current->flags |= PF_BLOCK_TS;
io_schedule: io_schedule_prepare blk_flush_plug __blk_flush_plug /* the flag is cleared, while time is not */ current->flags &= ~PF_BLOCK_TS; schedule sched_update_worker /* the flag is not set, hence plug->cur_ktime is not cleared */ if (tsk->flags & PF_BLOCK_TS) blk_plug_invalidate_ts()
blk_time_get_ns /* got the time stashed before schedule */ return plug->cur_ktime;
Fix the problem by clearing cached time in __blk_flush_plug().
Fixes: 06b23f92af87 ("block: update cached timestamp post schedule/preemption") Signed-off-by: Yu Kuai yukuai3@huawei.com Link: https://lore.kernel.org/r/20240411032349.3051233-2-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe axboe@kernel.dk --- block/blk-core.c | 1 + 1 file changed, 1 insertion(+)
diff --git a/block/blk-core.c b/block/blk-core.c index e7d825435e48..0ba11c853051 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1202,6 +1202,7 @@ void __blk_flush_plug(struct blk_plug *plug, bool from_schedule) if (unlikely(!rq_list_empty(plug->cached_rq))) blk_mq_free_plug_rqs(plug);
+ plug->cur_ktime = 0; current->flags &= ~PF_BLOCK_TS; }
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/release-management/issues/IB4E8P CVE: NA
--------------------------------
- new field 'cur_ktime' is added to struct blk_plug; - "blk.h" is included from blk-wbt.c; - "blk.h" is included from blk-cgroup.h;
Signed-off-by: Yu Kuai yukuai3@huawei.com --- block/blk-cgroup.h | 3 +++ block/blk-wbt.c | 3 +++ include/linux/blkdev.h | 3 +-- 3 files changed, 7 insertions(+), 2 deletions(-)
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 1ee4bda5c641..82c800699da8 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -19,7 +19,10 @@ #include <linux/kthread.h> #include <linux/blk-mq.h> #include <linux/llist.h> + +#ifndef __GENKSYMS__ #include "blk.h" +#endif
struct blkcg_gq; struct blkg_policy_data; diff --git a/block/blk-wbt.c b/block/blk-wbt.c index a212120591c7..4933a7738ebd 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -29,7 +29,10 @@ #include "blk-wbt.h" #include "blk-rq-qos.h" #include "elevator.h" + +#ifndef __GENKSYMS__ #include "blk.h" +#endif
#define CREATE_TRACE_POINTS #include <trace/events/wbt.h> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 3a80bedef6ee..b00846ed412c 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1032,7 +1032,6 @@ struct blk_plug {
/* if ios_left is > 1, we can batch tag/rq allocations */ struct request *cached_rq; - u64 cur_ktime; unsigned short nr_ios;
unsigned short rq_count; @@ -1042,7 +1041,7 @@ struct blk_plug {
struct list_head cb_list; /* md requires an unplug callback */
- KABI_RESERVE(1) + KABI_USE(1, u64 cur_ktime) KABI_RESERVE(2) KABI_RESERVE(3) };
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/release-management/issues/IB4E8P CVE: NA
--------------------------------
A new config BLK_BIO_ALLOC_TIME is added to control the behaviour, and the time will be used later for blk-io-hierarchy.
Signed-off-by: Yu Kuai yukuai3@huawei.com --- block/Kconfig | 3 +++ block/bio.c | 4 ++++ include/linux/blk_types.h | 4 ++++ 3 files changed, 11 insertions(+)
diff --git a/block/Kconfig b/block/Kconfig index 8fd2a8cb539e..371f4f9ab298 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -36,6 +36,9 @@ config BLOCK_LEGACY_AUTOLOAD created on demand, but scripts that manually create device nodes and then call losetup might rely on this behavior.
+config BLK_BIO_ALLOC_TIME + bool + config BLK_RQ_ALLOC_TIME bool
diff --git a/block/bio.c b/block/bio.c index bf6bec43b27e..060c91e50936 100644 --- a/block/bio.c +++ b/block/bio.c @@ -282,6 +282,10 @@ void bio_init(struct bio *bio, struct block_device *bdev, struct bio_vec *table, bio->bi_max_vecs = max_vecs; bio->bi_io_vec = table; bio->bi_pool = NULL; + +#ifdef CONFIG_BLK_BIO_ALLOC_TIME + bio->bi_alloc_time_ns = blk_time_get_ns(); +#endif } EXPORT_SYMBOL(bio_init);
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 858bb4b71474..1d86790dd07c 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -297,7 +297,11 @@ struct bio { #endif KABI_RESERVE(2) KABI_RESERVE(3) +#ifdef CONFIG_BLK_BIO_ALLOC_TIME + KABI_USE(4, u64 bi_alloc_time_ns) +#else KABI_RESERVE(4) +#endif KABI_RESERVE(5) KABI_RESERVE(6) KABI_RESERVE(7)
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/release-management/issues/IB4E8P CVE: NA
--------------------------------
A new config BLK_BIO_ALLOC_TASK is added to control the behaviour, and the task_struct will be used later for blk-io-hierarchy to dump thread info to user.
Signed-off-by: Yu Kuai yukuai3@huawei.com --- block/Kconfig | 3 +++ block/bio.c | 11 +++++++++++ include/linux/blk_types.h | 4 ++++ 3 files changed, 18 insertions(+)
diff --git a/block/Kconfig b/block/Kconfig index 371f4f9ab298..7018fdcaa459 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -39,6 +39,9 @@ config BLOCK_LEGACY_AUTOLOAD config BLK_BIO_ALLOC_TIME bool
+config BLK_BIO_ALLOC_TASK + bool + config BLK_RQ_ALLOC_TIME bool
diff --git a/block/bio.c b/block/bio.c index 060c91e50936..c5569f8b65af 100644 --- a/block/bio.c +++ b/block/bio.c @@ -223,6 +223,13 @@ void bio_uninit(struct bio *bio) bio_integrity_free(bio);
bio_crypt_free_ctx(bio); + +#ifdef CONFIG_BLK_BIO_ALLOC_TASK + if (bio->pid) { + put_pid(bio->pid); + bio->pid = NULL; + } +#endif } EXPORT_SYMBOL(bio_uninit);
@@ -286,6 +293,10 @@ void bio_init(struct bio *bio, struct block_device *bdev, struct bio_vec *table, #ifdef CONFIG_BLK_BIO_ALLOC_TIME bio->bi_alloc_time_ns = blk_time_get_ns(); #endif + +#ifdef CONFIG_BLK_BIO_ALLOC_TASK + bio->pid = get_pid(task_pid(current)); +#endif } EXPORT_SYMBOL(bio_init);
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 1d86790dd07c..95759212836a 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -302,7 +302,11 @@ struct bio { #else KABI_RESERVE(4) #endif +#ifdef CONFIG_BLK_BIO_ALLOC_TASK + KABI_USE(5, struct pid *pid) +#else KABI_RESERVE(5) +#endif KABI_RESERVE(6) KABI_RESERVE(7) KABI_RESERVE(8)
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/release-management/issues/IB4E8P CVE: NA
--------------------------------
The time will be used later for dumping requests in blk-io-hierarchy.
Signed-off-by: Yu Kuai yukuai3@huawei.com --- block/blk-flush.c | 1 + block/blk-map.c | 1 + block/blk-merge.c | 4 ++++ block/blk-mq.c | 34 ++++++++++++++++++++++++++++++++++ include/linux/blk-mq.h | 22 ++++++++++++++++++++++ 5 files changed, 62 insertions(+)
diff --git a/block/blk-flush.c b/block/blk-flush.c index 315ef8542380..ff462409db1c 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -340,6 +340,7 @@ static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq, flush_rq->cmd_flags |= (flags & REQ_DRV) | (flags & REQ_FAILFAST_MASK); flush_rq->rq_flags |= RQF_FLUSH_SEQ; flush_rq->end_io = flush_end_io; + blk_rq_init_bi_alloc_time(flush_rq, first_rq); /* * Order WRITE ->end_io and WRITE rq->ref, and its pair is the one * implied in refcount_inc_not_zero() called from diff --git a/block/blk-map.c b/block/blk-map.c index 0aadbaf7a9dd..a9c75f6c7127 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -552,6 +552,7 @@ int blk_rq_append_bio(struct request *rq, struct bio *bio) rq->biotail = bio; rq->__data_len += (bio)->bi_iter.bi_size; bio_crypt_free_ctx(bio); + blk_rq_update_bi_alloc_time(rq, bio, NULL); }
return 0; diff --git a/block/blk-merge.c b/block/blk-merge.c index 5db8228c46fc..9a29a2212e5c 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -866,6 +866,7 @@ static struct request *attempt_merge(struct request_queue *q, req->biotail = next->biotail;
req->__data_len += blk_rq_bytes(next); + blk_rq_update_bi_alloc_time(req, NULL, next);
if (!blk_discard_mergable(req)) elv_merge_requests(q, req, next); @@ -996,6 +997,7 @@ static enum bio_merge_status bio_attempt_back_merge(struct request *req, req->biotail->bi_next = bio; req->biotail = bio; req->__data_len += bio->bi_iter.bi_size; + blk_rq_update_bi_alloc_time(req, bio, NULL);
bio_crypt_free_ctx(bio);
@@ -1024,6 +1026,7 @@ static enum bio_merge_status bio_attempt_front_merge(struct request *req,
req->__sector = bio->bi_iter.bi_sector; req->__data_len += bio->bi_iter.bi_size; + blk_rq_update_bi_alloc_time(req, bio, NULL);
bio_crypt_do_front_merge(req, bio);
@@ -1048,6 +1051,7 @@ static enum bio_merge_status bio_attempt_discard_merge(struct request_queue *q, req->biotail = bio; req->__data_len += bio->bi_iter.bi_size; req->nr_phys_segments = segments + 1; + blk_rq_update_bi_alloc_time(req, bio, NULL);
blk_account_io_merge_bio(req); return BIO_MERGE_OK; diff --git a/block/blk-mq.c b/block/blk-mq.c index 389da90d73cc..b375e1ec6c09 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -387,6 +387,8 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, rq->end_io = NULL; rq->end_io_data = NULL;
+ blk_rq_init_bi_alloc_time(rq, NULL); + blk_crypto_rq_set_defaults(rq); INIT_LIST_HEAD(&rq->queuelist); /* tag was already set */ @@ -4948,6 +4950,38 @@ void blk_mq_cancel_work_sync(struct request_queue *q) cancel_delayed_work_sync(&hctx->run_work); }
+#ifdef CONFIG_BLK_BIO_ALLOC_TIME +void blk_rq_init_bi_alloc_time(struct request *rq, struct request *first_rq) +{ + rq->bi_alloc_time_ns = first_rq ? first_rq->bi_alloc_time_ns : + blk_time_get_ns(); +} + +/* + * Used in following cases to updated request bi_alloc_time_ns: + * + * 1) Allocate a new @rq for @bio; + * 2) @bio is merged to @rq, in this case @merged_rq should be NULL; + * 3) @merged_rq is merged to @rq, in this case @bio should be NULL; + */ +void blk_rq_update_bi_alloc_time(struct request *rq, struct bio *bio, + struct request *merged_rq) +{ + if (bio) { + if (rq->bi_alloc_time_ns > bio->bi_alloc_time_ns) + rq->bi_alloc_time_ns = bio->bi_alloc_time_ns; + return; + } + + if (!merged_rq) + return; + + if (rq->bi_alloc_time_ns > merged_rq->bi_alloc_time_ns) + rq->bi_alloc_time_ns = merged_rq->bi_alloc_time_ns; +} +EXPORT_SYMBOL_GPL(blk_rq_update_bi_alloc_time); +#endif + static int __init blk_mq_init(void) { int i; diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 2f7d8aeec4a7..9e8a860f74f4 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -190,7 +190,11 @@ struct request { rq_end_io_fn *end_io; void *end_io_data;
+#ifdef CONFIG_BLK_BIO_ALLOC_TIME + KABI_USE(1, u64 bi_alloc_time_ns) +#else KABI_RESERVE(1) +#endif KABI_RESERVE(2) KABI_RESERVE(3) KABI_RESERVE(4) @@ -957,6 +961,23 @@ static inline bool blk_should_fake_timeout(struct request_queue *q) return false; }
+#ifdef CONFIG_BLK_BIO_ALLOC_TIME +void blk_rq_init_bi_alloc_time(struct request *rq, struct request *first_rq); +void blk_rq_update_bi_alloc_time(struct request *rq, struct bio *bio, + struct request *merged_rq); +#else /* CONFIG_BLK_BIO_ALLOC_TIME */ +static inline void blk_rq_init_bi_alloc_time(struct request *rq, + struct request *first_rq) +{ +} + +static inline void blk_rq_update_bi_alloc_time(struct request *rq, + struct bio *bio, + struct request *merged_rq) +{ +} +#endif + /** * blk_mq_rq_from_pdu - cast a PDU to a request * @pdu: the PDU (Protocol Data Unit) to be casted @@ -1005,6 +1026,7 @@ static inline void blk_rq_bio_prep(struct request *rq, struct bio *bio, rq->__data_len = bio->bi_iter.bi_size; rq->bio = rq->biotail = bio; rq->ioprio = bio_prio(bio); + blk_rq_update_bi_alloc_time(rq, bio, NULL); }
void blk_mq_hctx_set_fq_lock_class(struct blk_mq_hw_ctx *hctx,
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/release-management/issues/IB4E8P CVE: NA
--------------------------------
The task will be used later for dumping requests in blk-io-hierarchy.
Signed-off-by: Yu Kuai yukuai3@huawei.com --- block/blk-flush.c | 2 ++ block/blk-mq.c | 3 +++ block/blk-mq.h | 25 +++++++++++++++++++++++++ include/linux/blk-mq.h | 4 ++++ 4 files changed, 34 insertions(+)
diff --git a/block/blk-flush.c b/block/blk-flush.c index ff462409db1c..4f64194f2eb6 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -238,6 +238,7 @@ static enum rq_end_io_ret flush_end_io(struct request *flush_rq, * avoiding use-after-free. */ WRITE_ONCE(flush_rq->state, MQ_RQ_IDLE); + blk_mq_put_alloc_task(flush_rq); if (fq->rq_status != BLK_STS_OK) { error = fq->rq_status; fq->rq_status = BLK_STS_OK; @@ -341,6 +342,7 @@ static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq, flush_rq->rq_flags |= RQF_FLUSH_SEQ; flush_rq->end_io = flush_end_io; blk_rq_init_bi_alloc_time(flush_rq, first_rq); + blk_mq_get_alloc_task(flush_rq, first_rq->bio); /* * Order WRITE ->end_io and WRITE rq->ref, and its pair is the one * implied in refcount_inc_not_zero() called from diff --git a/block/blk-mq.c b/block/blk-mq.c index b375e1ec6c09..56d6d59f9735 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -388,6 +388,7 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, rq->end_io_data = NULL;
blk_rq_init_bi_alloc_time(rq, NULL); + blk_mq_get_alloc_task(rq, data->bio);
blk_crypto_rq_set_defaults(rq); INIT_LIST_HEAD(&rq->queuelist); @@ -710,6 +711,7 @@ static void __blk_mq_free_request(struct request *rq) struct blk_mq_hw_ctx *hctx = rq->mq_hctx; const int sched_tag = rq->internal_tag;
+ blk_mq_put_alloc_task(rq); blk_crypto_free_request(rq); blk_pm_mark_last_busy(rq); rq->mq_hctx = NULL; @@ -2921,6 +2923,7 @@ static struct request *blk_mq_get_new_requests(struct request_queue *q, .q = q, .nr_tags = 1, .cmd_flags = bio->bi_opf, + .bio = bio, }; struct request *rq;
diff --git a/block/blk-mq.h b/block/blk-mq.h index 8b9aac701035..29ee2c3fe00b 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -158,6 +158,8 @@ struct blk_mq_alloc_data { /* input & output parameter */ struct blk_mq_ctx *ctx; struct blk_mq_hw_ctx *hctx; + + KABI_EXTEND(struct bio *bio) };
struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, @@ -445,4 +447,27 @@ do { \ #define blk_mq_run_dispatch_ops(q, dispatch_ops) \ __blk_mq_run_dispatch_ops(q, true, dispatch_ops) \
+#ifdef CONFIG_BLK_BIO_ALLOC_TASK +static inline void blk_mq_get_alloc_task(struct request *rq, struct bio *bio) +{ + rq->pid = bio ? get_pid(bio->pid) : get_pid(task_pid(current)); +} + +static inline void blk_mq_put_alloc_task(struct request *rq) +{ + if (rq->pid) { + put_pid(rq->pid); + rq->pid = NULL; + } +} +#else +static inline void blk_mq_get_alloc_task(struct request *rq, struct bio *bio) +{ +} + +static inline void blk_mq_put_alloc_task(struct request *rq) +{ +} +#endif + #endif diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 9e8a860f74f4..b301ebe67eb4 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -195,7 +195,11 @@ struct request { #else KABI_RESERVE(1) #endif +#ifdef CONFIG_BLK_BIO_ALLOC_TASK + KABI_USE(2, struct pid *pid) +#else KABI_RESERVE(2) +#endif KABI_RESERVE(3) KABI_RESERVE(4) KABI_RESERVE(5)
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/release-management/issues/IB4E8P CVE: NA
--------------------------------
The time will be reused for completing bio, to avoid getting ns time again.
Signed-off-by: Yu Kuai yukuai3@huawei.com --- block/blk-mq.c | 10 ++++++++-- include/linux/blk-mq.h | 2 +- 2 files changed, 9 insertions(+), 3 deletions(-)
diff --git a/block/blk-mq.c b/block/blk-mq.c index 56d6d59f9735..1159a06e2543 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -379,6 +379,7 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
rq->part = NULL; rq->io_start_time_ns = 0; + rq->io_end_time_ns = 0; rq->stats_sectors = 0; rq->nr_phys_segments = 0; #if defined(CONFIG_BLK_DEV_INTEGRITY) @@ -1045,8 +1046,13 @@ static inline void __blk_mq_end_request_acct(struct request *rq, u64 now)
inline void __blk_mq_end_request(struct request *rq, blk_status_t error) { - if (blk_mq_need_time_stamp(rq)) - __blk_mq_end_request_acct(rq, blk_time_get_ns()); + if (blk_mq_need_time_stamp(rq)) { + u64 now = rq->io_end_time_ns; + + if (!now) + now = blk_time_get_ns(); + __blk_mq_end_request_acct(rq, now); + }
blk_mq_finish_request(rq);
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index b301ebe67eb4..da56055731d6 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -200,7 +200,7 @@ struct request { #else KABI_RESERVE(2) #endif - KABI_RESERVE(3) + KABI_USE(3, u64 io_end_time_ns) KABI_RESERVE(4) KABI_RESERVE(5) KABI_RESERVE(6)
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/release-management/issues/IB4E8P CVE: NA
--------------------------------
Include the main structure definitions and provide helpers for the different IO stages to record IO stats and dump inflight IO.
Signed-off-by: Yu Kuai yukuai3@huawei.com --- block/bio.c | 5 + block/blk-flush.c | 4 + block/blk-io-hierarchy/Kconfig | 13 + block/blk-io-hierarchy/Makefile | 1 + block/blk-io-hierarchy/debugfs.c | 134 +++++- block/blk-io-hierarchy/iodump.c | 745 +++++++++++++++++++++++++++++++ block/blk-io-hierarchy/iodump.h | 96 ++++ block/blk-io-hierarchy/stats.c | 234 +++++++++- block/blk-io-hierarchy/stats.h | 245 +++++++++- block/blk-mq-debugfs.c | 15 +- block/blk-mq-debugfs.h | 1 + block/blk-mq.c | 2 + include/linux/blk-mq.h | 5 + include/linux/blk_types.h | 16 +- 14 files changed, 1473 insertions(+), 43 deletions(-) create mode 100644 block/blk-io-hierarchy/iodump.c create mode 100644 block/blk-io-hierarchy/iodump.h
diff --git a/block/bio.c b/block/bio.c index c5569f8b65af..d64b0da22e38 100644 --- a/block/bio.c +++ b/block/bio.c @@ -297,6 +297,11 @@ void bio_init(struct bio *bio, struct block_device *bdev, struct bio_vec *table, #ifdef CONFIG_BLK_BIO_ALLOC_TASK bio->pid = get_pid(task_pid(current)); #endif + +#ifdef CONFIG_BLK_IO_HIERARCHY_STATS + bio->hierarchy_time = 0; + INIT_LIST_HEAD(&bio->hierarchy_list); +#endif } EXPORT_SYMBOL(bio_init);
diff --git a/block/blk-flush.c b/block/blk-flush.c index 4f64194f2eb6..4628a9ee1904 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -73,6 +73,7 @@ #include "blk.h" #include "blk-mq.h" #include "blk-mq-sched.h" +#include "blk-io-hierarchy/stats.h"
/* PREFLUSH/FUA sequences */ enum { @@ -343,6 +344,7 @@ static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq, flush_rq->end_io = flush_end_io; blk_rq_init_bi_alloc_time(flush_rq, first_rq); blk_mq_get_alloc_task(flush_rq, first_rq->bio); + blk_rq_hierarchy_stats_init(flush_rq); /* * Order WRITE ->end_io and WRITE rq->ref, and its pair is the one * implied in refcount_inc_not_zero() called from @@ -373,6 +375,8 @@ static enum rq_end_io_ret mq_flush_data_end_io(struct request *rq, blk_mq_put_driver_tag(rq); }
+ blk_rq_hierarchy_set_flush_done(rq); + /* * After populating an empty queue, kick it to avoid stall. Read * the comment in flush_end_io(). diff --git a/block/blk-io-hierarchy/Kconfig b/block/blk-io-hierarchy/Kconfig index a12476c73fa5..2c15b5a7a006 100644 --- a/block/blk-io-hierarchy/Kconfig +++ b/block/blk-io-hierarchy/Kconfig @@ -13,6 +13,19 @@ menuconfig BLK_IO_HIERARCHY_STATS
if BLK_IO_HIERARCHY_STATS
+config HIERARCHY_IO_DUMP + bool "Support to dump io that is throttled" + default n + select BLK_BIO_ALLOC_TIME + select BLK_BIO_ALLOC_TASK + depends on BLK_DEV_IO_TRACE + help + Enable this will create new debugfs entries to show user the detailed + information of IO that are submitted and not done yet, and user can + filter the result by IO stage or IO latency. + + If unsure, say N. + config HIERARCHY_THROTTLE bool "Enable hierarchy stats layer blk-throttle" default n diff --git a/block/blk-io-hierarchy/Makefile b/block/blk-io-hierarchy/Makefile index 1fb663c75521..9b989d379e58 100644 --- a/block/blk-io-hierarchy/Makefile +++ b/block/blk-io-hierarchy/Makefile @@ -5,3 +5,4 @@ obj-$(CONFIG_BLK_IO_HIERARCHY_STATS) += blk_io_hierarchy_stats.o
blk_io_hierarchy_stats-y := stats.o debugfs.o +obj-$(CONFIG_HIERARCHY_IO_DUMP) += iodump.o diff --git a/block/blk-io-hierarchy/debugfs.c b/block/blk-io-hierarchy/debugfs.c index 9072a091c013..4cf30f172b86 100644 --- a/block/blk-io-hierarchy/debugfs.c +++ b/block/blk-io-hierarchy/debugfs.c @@ -12,13 +12,49 @@ */
#include <linux/debugfs.h> +#include <linux/blkdev.h>
+#include "../blk-mq-debugfs.h" #include "stats.h" +#include "iodump.h"
static const char *stage_name[NR_STAGE_GROUPS] = { #ifdef CONFIG_HIERARCHY_THROTTLE [STAGE_THROTTLE] = "throtl", #endif +#ifdef CONFIG_HIERARCHY_WBT + [STAGE_WBT] = "wbt", +#endif +#ifdef CONFIG_HIERARCHY_IOCOST + [STAGE_IOCOST] = "iocost", +#endif +#ifdef CONFIG_HIERARCHY_GETTAG + [STAGE_GETTAG] = "gettag", +#endif +#ifdef CONFIG_HIERARCHY_PLUG + [STAGE_PLUG] = "plug", +#endif +#ifdef CONFIG_HIERARCHY_DEADLINE + [STAGE_DEADLINE] = "deadline", +#endif +#ifdef CONFIG_HIERARCHY_BFQ + [STAGE_BFQ] = "bfq", +#endif +#ifdef CONFIG_HIERARCHY_KYBER + [STAGE_KYBER] = "kyber", +#endif +#ifdef CONFIG_HIERARCHY_HCTX + [STAGE_HCTX] = "hctx", +#endif +#ifdef CONFIG_HIERARCHY_REQUEUE + [STAGE_REQUEUE] = "requeue", +#endif +#ifdef CONFIG_HIERARCHY_RQ_DRIVER + [STAGE_RQ_DRIVER] = "rq_driver", +#endif +#ifdef CONFIG_HIERARCHY_BIO + [STAGE_BIO] = "bio", +#endif };
const char *hierarchy_stage_name(enum stage_group stage) @@ -26,26 +62,33 @@ const char *hierarchy_stage_name(enum stage_group stage) return stage_name[stage]; }
-static int hierarchy_stats_show(void *data, struct seq_file *m) +static int __hierarchy_stats_show(struct hierarchy_stats_data *hstats_data, + struct seq_file *m, enum stage_group stage) { - struct hierarchy_stage *hstage = data; - int cpu; u64 dispatched[NR_STAT_GROUPS] = {0}; u64 completed[NR_STAT_GROUPS] = {0}; u64 latency[NR_STAT_GROUPS] = {0}; + int cpu; + int i;
for_each_possible_cpu(cpu) { - int i; - struct hierarchy_stats *stat = per_cpu_ptr(hstage->hstats, cpu); + struct hierarchy_stats *stat = + per_cpu_ptr(hstats_data->hstats, cpu);
for (i = 0; i < NR_STAT_GROUPS; ++i) { dispatched[i] += stat->dispatched[i]; completed[i] += stat->completed[i]; - latency[i] += stat->nsecs[i]; + latency[i] += stage_is_rq(stage) ? + stat->jiffies[i] : stat->nsecs[i]; } }
- seq_printf(m, "%llu %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu\n", + if (stage_is_rq(stage)) + for (i = 0; i < NR_STAT_GROUPS; ++i) + latency[i] = + jiffies_to_msecs(latency[i]) * NSEC_PER_MSEC; + + seq_printf(m, "%llu %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu", dispatched[STAT_READ], completed[STAT_READ], latency[STAT_READ], dispatched[STAT_WRITE], completed[STAT_WRITE], latency[STAT_WRITE], @@ -53,11 +96,71 @@ static int hierarchy_stats_show(void *data, struct seq_file *m) latency[STAT_DISCARD], dispatched[STAT_FLUSH], completed[STAT_FLUSH], latency[STAT_FLUSH]);
+ hierarchy_show_slow_io(hstats_data, m); + seq_putc(m, '\n'); return 0; }
-static struct blk_mq_debugfs_attr hierarchy_debugfs_attrs[] = { - {"stats", 0400, hierarchy_stats_show}, +static void *hierarchy_stats_start(struct seq_file *m, loff_t *pos) +{ + enum stage_group stage = *pos; + + if (stage < 0 || stage >= NR_STAGE_GROUPS) + return NULL; + + return pos; +} + +static void *hierarchy_stats_next(struct seq_file *m, void *v, loff_t *pos) +{ + enum stage_group stage = ++(*pos); + + if (stage >= 0 && stage < NR_STAGE_GROUPS) + return pos; + + return NULL; +} + +static void hierarchy_stats_stop(struct seq_file *m, void *v) +{ +} + +static int hierarchy_stats_show(struct seq_file *m, void *v) +{ + enum stage_group stage = (*(loff_t *)v); + struct blk_io_hierarchy_stats *stats = m->private; + struct hierarchy_stats_data *hstats_data = get_hstats_data(stats, stage); + + if (!hstats_data) + return 0; + + seq_printf(m, "%s ", hierarchy_stage_name(stage)); + __hierarchy_stats_show(hstats_data, m, stage); + put_hstats_data(stats, hstats_data); + return 0; +} + +static const struct seq_operations hierarchy_stats_ops = { + .start = hierarchy_stats_start, + .next = hierarchy_stats_next, + .stop = hierarchy_stats_stop, + .show = hierarchy_stats_show, +}; + +static int hierarchy_stats_show_single(void *v, struct seq_file *m) +{ + struct hierarchy_stage *hstage = v; + + return __hierarchy_stats_show(hstage->hstats_data, m, hstage->stage); +} + +static const struct blk_mq_debugfs_attr hierarchy_debugfs_attrs[] = { + {"stats", 0400, hierarchy_stats_show_single}, + {}, +}; + +static const struct blk_mq_debugfs_attr hierarchy_stats_attr[] = { + {"stats", 0400, .seq_ops = &hierarchy_stats_ops}, {}, };
@@ -76,6 +179,7 @@ static void hierarchy_register_stage(struct blk_io_hierarchy_stats *stats,
hstage->debugfs_dir = dir; debugfs_create_files(dir, hstage, hierarchy_debugfs_attrs); + io_hierarchy_register_iodump(hstage); }
static void hierarchy_unregister_stage(struct blk_io_hierarchy_stats *stats, @@ -117,3 +221,15 @@ void blk_mq_debugfs_unregister_hierarchy(struct request_queue *q,
hierarchy_unregister_stage(stats, stage); } + +void blk_mq_debugfs_create_default_hierarchy_attr(struct request_queue *q) +{ + struct blk_io_hierarchy_stats *stats = q->io_hierarchy_stats; + + lockdep_assert_held(&q->debugfs_mutex); + + if (!blk_mq_debugfs_enabled(q)) + return; + + debugfs_create_files(stats->debugfs_dir, stats, hierarchy_stats_attr); +} diff --git a/block/blk-io-hierarchy/iodump.c b/block/blk-io-hierarchy/iodump.c new file mode 100644 index 000000000000..d7fcc458acb3 --- /dev/null +++ b/block/blk-io-hierarchy/iodump.c @@ -0,0 +1,745 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2024. Huawei Technologies Co., Ltd. All rights reserved. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ */ + +#include <linux/seq_file.h> +#include <linux/blktrace_api.h> +#include <linux/blk-cgroup.h> + +#include "iodump.h" +#include "../blk.h" +#include "../blk-mq.h" +#include "../blk-cgroup.h" +#include "../blk-mq-debugfs.h" + +#define RWB_LEN 6 +#define PATH_LEN 64 +#define ms_to_ns(time) (time * NSEC_PER_MSEC) +#define DEFAULT_THRESHOLD 1000 + +static DEFINE_MUTEX(dump_mutex); + +struct bio_dump_data { + u64 stat_time; + struct list_head head; + spinlock_t lock; +}; + +struct rq_dump_data { + struct request_queue *q; + enum stage_group stage; + unsigned int tag; + unsigned int total_tags; + bool shared; + bool has_elevator; + bool enter_queue; +}; + +#ifdef CONFIG_HIERARCHY_BIO +struct pos_data { + enum stage_group stage; + unsigned int count; +}; + +struct bio_stage_dump_data { + union { + loff_t pos; + struct pos_data pdata; + }; + struct rq_dump_data rq_ddata; + u64 stat_time; +}; +#endif + +int blk_io_hierarchy_iodump_init(struct request_queue *q, + struct hierarchy_stage *hstage) +{ + hstage->threshold = DEFAULT_THRESHOLD; + + if (stage_is_bio(hstage->stage)) { + struct bio_dump_data *bio_ddata = + kmalloc(sizeof(*bio_ddata), GFP_KERNEL); + + if (!bio_ddata) + return -ENOMEM; + + INIT_LIST_HEAD(&bio_ddata->head); + spin_lock_init(&bio_ddata->lock); + hstage->dump_data = bio_ddata; + return 0; + } + + if (stage_is_rq(hstage->stage)) { + struct rq_dump_data *rq_ddata = + kzalloc(sizeof(*rq_ddata), GFP_KERNEL); + + if (!rq_ddata) + return -ENOMEM; + + rq_ddata->q = q; + rq_ddata->stage = hstage->stage; + hstage->dump_data = rq_ddata; + return 0; + } + +#ifdef CONFIG_HIERARCHY_BIO + BUILD_BUG_ON(sizeof(struct pos_data) != sizeof(loff_t)); + + if (hstage->stage == STAGE_BIO) { + struct bio_stage_dump_data *bstage_ddata = + kzalloc(sizeof(*bstage_ddata), GFP_KERNEL); + + if (!bstage_ddata) + return -ENOMEM; + + bstage_ddata->rq_ddata.q = q; + bstage_ddata->rq_ddata.stage = hstage->stage; + hstage->dump_data = bstage_ddata; + return 0; + } +#endif + + return 
-EINVAL; +} + +void blk_io_hierarchy_iodump_exit(struct request_queue *q, + enum stage_group stage) +{ + struct hierarchy_stage *hstage = q->io_hierarchy_stats->hstage[stage]; + + if (stage_is_bio(hstage->stage)) { + struct bio_dump_data *bio_ddata = hstage->dump_data; + + WARN(!list_empty(&bio_ddata->head), + "blk-io-hierarchy: disk %s stage %s unregistered whih throttled IO.\n", + kobject_name(q->mq_kobj->parent), hierarchy_stage_name(stage)); + } + + kfree(hstage->dump_data); + hstage->dump_data = NULL; +} + +void hierarchy_add_bio(struct hierarchy_stage *hstage, struct bio *bio) +{ + unsigned long flags; + struct bio_dump_data *bio_ddata = hstage->dump_data; + + spin_lock_irqsave(&bio_ddata->lock, flags); + list_add_tail(&bio->hierarchy_list, &bio_ddata->head); + spin_unlock_irqrestore(&bio_ddata->lock, flags); +} + +void hierarchy_remove_bio(struct hierarchy_stage *hstage, struct bio *bio) +{ + unsigned long flags; + struct bio_dump_data *bio_ddata = hstage->dump_data; + + spin_lock_irqsave(&bio_ddata->lock, flags); + list_del_init(&bio->hierarchy_list); + spin_unlock_irqrestore(&bio_ddata->lock, flags); +} + +static void *bio_hierarchy_list_start(struct seq_file *m, loff_t *pos) + __acquires(&bio_ddata->lock) +{ + struct hierarchy_stage *hstage = m->private; + struct bio_dump_data *bio_ddata = hstage->dump_data; + + spin_lock_irq(&bio_ddata->lock); + bio_ddata->stat_time = blk_time_get_ns(); + + return seq_list_start(&bio_ddata->head, *pos); +} + +static void *bio_hierarchy_list_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct hierarchy_stage *hstage = m->private; + struct bio_dump_data *bio_ddata = hstage->dump_data; + + return seq_list_next(v, &bio_ddata->head, pos); +} + +static void bio_hierarchy_list_stop(struct seq_file *m, void *v) + __releases(&hstage->lock) +{ + struct hierarchy_stage *hstage = m->private; + struct bio_dump_data *bio_ddata = hstage->dump_data; + + spin_unlock_irq(&bio_ddata->lock); +} + +static void 
__hierarchy_show_bio(struct seq_file *m, struct bio *bio, + enum stage_group stage, u64 duration) +{ + char rwbs[RWB_LEN]; + char path[PATH_LEN] = {0}; + struct task_struct *task = get_pid_task(bio->pid, PIDTYPE_PID); + + blk_fill_rwbs(rwbs, bio->bi_opf); + blkg_path(bio->bi_blkg, path, PATH_LEN); + + seq_printf(m, "%s-%d %s stage %s bio %s %llu + %u cgroup %s started %llu ns ago\n", + task ? task->comm : "null", task ? task->pid : 0, + bio->bi_bdev->bd_disk->disk_name, + hierarchy_stage_name(stage), rwbs, bio->bi_iter.bi_sector, + bio_sectors(bio), path, duration); + + if (task) + put_task_struct(task); +} + +static u64 get_duration(u64 a, u64 b) +{ + return a > b ? a - b : 0; +} + +static void hierarchy_show_bio(struct seq_file *m, struct bio *bio) +{ + u64 duration; + struct hierarchy_stage *hstage = m->private; + struct bio_dump_data *bio_ddata = hstage->dump_data; + + duration = get_duration(bio_ddata->stat_time, bio->hierarchy_time); + if (hstage->threshold > ns_to_ms(duration)) + return; + + __hierarchy_show_bio(m, bio, hstage->stage, duration); +} + +static int bio_hierarchy_list_show(struct seq_file *m, void *v) +{ + struct bio *bio = list_entry(v, struct bio, hierarchy_list); + + hierarchy_show_bio(m, bio); + return 0; +} + +static const struct seq_operations hierarchy_bio_dump_ops = { + .start = bio_hierarchy_list_start, + .next = bio_hierarchy_list_next, + .stop = bio_hierarchy_list_stop, + .show = bio_hierarchy_list_show, +}; + +static int threshold_show(void *data, struct seq_file *m) +{ + struct hierarchy_stage *hstage = data; + + seq_printf(m, "%lu\n", hstage->threshold); + return 0; +} + +/* + * max size needed by different bases to express U64 + * HEX: "0xFFFFFFFFFFFFFFFF" --> 18 + * DEC: "18446744073709551615" --> 20 + * OCT: "01777777777777777777777" --> 23 + * pick the max one to define NUMBER_BUF_LEN + */ +#define MAX_BUF_LEN 24 +static ssize_t threshold_store(void *data, const char __user *buf, size_t count, + loff_t *ppos) +{ + int err; + 
unsigned long val; + char b[MAX_BUF_LEN + 1]; + struct hierarchy_stage *hstage = data; + + if (count > MAX_BUF_LEN) + return -EINVAL; + + if (copy_from_user(b, buf, count)) + return -EFAULT; + + b[count] = 0; + err = kstrtoul(b, 0, &val); + if (!err) + hstage->threshold = val; + + return err ? err : count; +} + +static void rq_hierarchy_init_dump_data(struct rq_dump_data *rq_ddata) +{ + struct request_queue *q = rq_ddata->q; + + rq_ddata->shared = blk_mq_is_shared_tags(q->tag_set->flags); + rq_ddata->has_elevator = !!q->elevator; + + if (rq_ddata->shared) + rq_ddata->total_tags = rq_ddata->has_elevator ? + q->nr_requests : + q->tag_set->shared_tags->nr_tags; + else if (rq_ddata->has_elevator) + rq_ddata->total_tags = q->nr_hw_queues * q->nr_requests; + else + rq_ddata->total_tags = q->nr_hw_queues * q->tag_set->queue_depth; +} + +static bool __rq_hierarchy_start(struct rq_dump_data *rq_ddata, + unsigned int tag) +{ + /* + * Grab .q_usage_counter so request pool won't go away, then no + * request use-after-free is possible during iteration. If queue is + * frozen, there won't be any inflight requests. 
+ */ + if (!percpu_ref_tryget(&rq_ddata->q->q_usage_counter)) { + rq_ddata->enter_queue = false; + return false; + } + + rq_ddata->enter_queue = true; + rq_hierarchy_init_dump_data(rq_ddata); + rq_ddata->tag = tag; + + return tag < rq_ddata->total_tags + rq_ddata->q->nr_hw_queues; +} + +static bool __rq_hierarchy_next(struct rq_dump_data *rq_ddata) +{ + rq_ddata->tag++; + + return rq_ddata->tag < rq_ddata->total_tags + rq_ddata->q->nr_hw_queues; +} + +static void __rq_hierarchy_stop(struct rq_dump_data *rq_ddata) +{ + if (rq_ddata->enter_queue) { + percpu_ref_put(&rq_ddata->q->q_usage_counter); + rq_ddata->enter_queue = false; + } +} + +static void *rq_hierarchy_start(struct seq_file *m, loff_t *pos) + __acquires(&dump_mutex) +{ + struct hierarchy_stage *hstage = m->private; + struct rq_dump_data *rq_ddata = hstage->dump_data; + + mutex_lock(&dump_mutex); + + if (__rq_hierarchy_start(rq_ddata, *pos)) + return rq_ddata; + + return NULL; +} + +static void *rq_hierarchy_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct rq_dump_data *rq_ddata = v; + + if (__rq_hierarchy_next(rq_ddata)) { + *pos = rq_ddata->tag; + return rq_ddata; + } + + (*pos)++; + return NULL; +} + +static void rq_hierarchy_stop(struct seq_file *m, void *v) + __releases(&dump_mutex) +{ + struct hierarchy_stage *hstage = m->private; + struct rq_dump_data *rq_ddata = hstage->dump_data; + + __rq_hierarchy_stop(rq_ddata); + mutex_unlock(&dump_mutex); +} + +static struct request *hierarchy_find_and_get_rq(struct rq_dump_data *rq_ddata) +{ + struct request *rq; + struct blk_mq_hw_ctx *hctx; + struct request_queue *q = rq_ddata->q; + unsigned int nr_tag = rq_ddata->tag; + unsigned int hctx_id; + + if (nr_tag >= rq_ddata->total_tags) { + hctx_id = nr_tag - rq_ddata->total_tags; + if (hctx_id >= q->nr_hw_queues) + return NULL; + + hctx = xa_load(&q->hctx_table, hctx_id); + rq = hctx->fq->flush_rq; + } else if (rq_ddata->shared) { + struct blk_mq_tags *tags = rq_ddata->has_elevator ? 
+ q->sched_shared_tags : q->tag_set->shared_tags; + + rq = tags->static_rqs[nr_tag]; + } else if (rq_ddata->has_elevator) { + hctx_id = nr_tag / q->nr_requests; + if (hctx_id >= q->nr_hw_queues) + return NULL; + + hctx = xa_load(&q->hctx_table, hctx_id); + rq = hctx->sched_tags->static_rqs[nr_tag % q->nr_requests]; + } else { + hctx_id = nr_tag / q->tag_set->queue_depth; + if (hctx_id >= q->nr_hw_queues) + return NULL; + + hctx = xa_load(&q->hctx_table, hctx_id); + if (!hctx->tags) + return NULL; + + rq = hctx->tags->static_rqs[nr_tag % q->tag_set->queue_depth]; + } + + /* + * fast path to avoid refcount cas operations for the request that + * is from other shared request_queue or other stages. + */ + if (rq->q != q || (rq_ddata->stage != STAGE_BIO && + READ_ONCE(rq->stage) != rq_ddata->stage)) + return NULL; + + if (!req_ref_inc_not_zero(rq)) + return NULL; + + /* Check again after request is pinned, in case request is reused. */ + if (rq->q != q) { + blk_mq_put_rq_ref(rq); + return NULL; + } + + if (rq_ddata->stage == STAGE_BIO) + return rq; + + /* + * Barrier is paired with the smp_store_release() in + * __rq_hierarchy_start_io_acct(), so that if stage is read, + * uninitialized hierarchy_time won't be read. + */ + if (smp_load_acquire(&rq->stage) != rq_ddata->stage) { + blk_mq_put_rq_ref(rq); + return NULL; + } + + return rq; +} + +static void hierarchy_show_rq(struct seq_file *m, struct request *rq, + u64 duration) +{ + struct task_struct *task = get_pid_task(rq->pid, PIDTYPE_PID); + const char *name = hierarchy_stage_name(rq->stage); + + seq_printf(m, "%s-%d %s stage %s ", task ? task->comm : "null", + task ? task->pid : 0, + rq->q->disk ? rq->q->disk->disk_name : "?", + name ? 
name : "?"); + debugfs_rq_show(m, rq); + seq_printf(m, " started %llu ns ago}\n", duration); + + if (task) + put_task_struct(task); +} + +static int rq_hierarchy_show(struct seq_file *m, void *v) +{ + u64 duration; + unsigned long htime; + struct hierarchy_stage *hstage = m->private; + struct request *rq = hierarchy_find_and_get_rq(v); + + if (!rq) + return 0; + + htime = READ_ONCE(rq->hierarchy_time); + htime = time_after(jiffies, htime) ? jiffies - htime : 0; + duration = jiffies_to_msecs(htime); + if (hstage->threshold <= duration) + hierarchy_show_rq(m, rq, ms_to_ns(duration)); + + blk_mq_put_rq_ref(rq); + return 0; +} + +static const struct seq_operations hierarchy_rq_dump_ops = { + .start = rq_hierarchy_start, + .next = rq_hierarchy_next, + .stop = rq_hierarchy_stop, + .show = rq_hierarchy_show, +}; + +static const struct blk_mq_debugfs_attr hierarchy_threshold_attr[] = { + { + "threshold", + 0600, + threshold_show, + threshold_store, + }, + {}, +}; + +static const struct blk_mq_debugfs_attr hierarchy_bio_dump_attr[] = { + { + "io_dump", + 0400, + .seq_ops = &hierarchy_bio_dump_ops, + }, + {}, +}; + +static const struct blk_mq_debugfs_attr hierarchy_rq_dump_attr[] = { + { + "io_dump", + 0400, + .seq_ops = &hierarchy_rq_dump_ops, + }, + {}, +}; + +#ifdef CONFIG_HIERARCHY_BIO +static struct bio_dump_data *get_bio_stage_ddata(struct request_queue *q, + enum stage_group stage) +{ + struct blk_io_hierarchy_stats *stats = q->io_hierarchy_stats; + struct hierarchy_stage *hstage = READ_ONCE(stats->hstage[stage]); + + if (!hstage) + return NULL; + + return hstage->dump_data; +} + +static void bio_stage_start_next_stage(struct bio_stage_dump_data *bstage_ddata, + loff_t *pos) +{ + struct pos_data *pdata = &bstage_ddata->pdata; + + pdata->stage++; + if (!stage_is_bio(pdata->stage)) + pdata->stage = STAGE_BIO; + pdata->count = 0; + + *pos = bstage_ddata->pos; +} + +static void bio_stage_start_next_io(struct bio_stage_dump_data *bstage_ddata, + loff_t *pos) +{ + struct 
pos_data *pdata = &bstage_ddata->pdata; + + if (stage_is_bio(pdata->stage)) + pdata->count++; + else + pdata->count = bstage_ddata->rq_ddata.tag; + + *pos = bstage_ddata->pos; +} + +static void __bio_stage_hierarchy_stop(struct bio_stage_dump_data *bstage_ddata) +{ + struct pos_data *pdata = &bstage_ddata->pdata; + struct rq_dump_data *rq_ddata = &bstage_ddata->rq_ddata; + + if (stage_is_bio(pdata->stage)) { + struct bio_dump_data *bio_ddata = + get_bio_stage_ddata(rq_ddata->q, pdata->stage); + + spin_unlock_irq(&bio_ddata->lock); + } + + if (rq_ddata->enter_queue) { + percpu_ref_put(&rq_ddata->q->q_usage_counter); + rq_ddata->enter_queue = false; + } +} + +void *__bio_stage_hierarchy_start(struct bio_stage_dump_data *bstage_ddata, + loff_t *pos) +{ + struct pos_data *pdata = &bstage_ddata->pdata; + struct rq_dump_data *rq_ddata = &bstage_ddata->rq_ddata; + +retry: + if (stage_is_bio(pdata->stage)) { + struct list_head *list; + struct bio_dump_data *bio_ddata = + get_bio_stage_ddata(rq_ddata->q, pdata->stage); + + if (!bio_ddata) { + bio_stage_start_next_stage(bstage_ddata, pos); + goto retry; + } + + spin_lock_irq(&bio_ddata->lock); + list = seq_list_start(&bio_ddata->head, pdata->count); + if (list) + return list; + + spin_unlock_irq(&bio_ddata->lock); + bio_stage_start_next_stage(bstage_ddata, pos); + goto retry; + } + + if (pdata->stage == STAGE_BIO && + __rq_hierarchy_start(rq_ddata, pdata->count)) + return bstage_ddata; + + return NULL; +} + +static void *bio_stage_hierarchy_start(struct seq_file *m, loff_t *pos) +{ + struct hierarchy_stage *hstage = m->private; + struct bio_stage_dump_data *bstage_ddata = hstage->dump_data; + + mutex_lock(&dump_mutex); + bstage_ddata->pos = *pos; + bstage_ddata->stat_time = blk_time_get_ns(); + + return __bio_stage_hierarchy_start(bstage_ddata, pos); +} + +static void *bio_stage_hierarchy_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct hierarchy_stage *hstage = m->private; + struct bio_stage_dump_data 
*bstage_ddata = hstage->dump_data; + struct rq_dump_data *rq_ddata = &bstage_ddata->rq_ddata; + struct pos_data *pdata = &bstage_ddata->pdata; + + if (stage_is_bio(pdata->stage)) { + struct bio_dump_data *bio_ddata = + get_bio_stage_ddata(rq_ddata->q, pdata->stage); + struct list_head *list = ((struct list_head *)v)->next; + + if (list != &bio_ddata->head) { + bio_stage_start_next_io(bstage_ddata, pos); + return list; + } + + spin_unlock_irq(&bio_ddata->lock); + + bio_stage_start_next_stage(bstage_ddata, pos); + return __bio_stage_hierarchy_start(bstage_ddata, pos); + } + + if (pdata->stage == STAGE_BIO && + __rq_hierarchy_next(rq_ddata)) { + bio_stage_start_next_io(bstage_ddata, pos); + return bstage_ddata; + } + + (*pos)++; + return NULL; +} + +static void bio_stage_hierarchy_stop(struct seq_file *m, void *v) +{ + struct hierarchy_stage *hstage = m->private; + struct bio_stage_dump_data *bstage_ddata = hstage->dump_data; + + __bio_stage_hierarchy_stop(bstage_ddata); + mutex_unlock(&dump_mutex); +} + +static int bio_stage_hierarchy_show(struct seq_file *m, void *v) +{ + struct hierarchy_stage *hstage = m->private; + struct bio_stage_dump_data *bstage_ddata = hstage->dump_data; + struct rq_dump_data *rq_ddata = &bstage_ddata->rq_ddata; + struct pos_data *pdata = &bstage_ddata->pdata; + u64 duration; + + if (stage_is_bio(pdata->stage)) { + struct bio *bio = list_entry(v, struct bio, hierarchy_list); + + duration = get_duration(bstage_ddata->stat_time, + bio->bi_alloc_time_ns); + if (hstage->threshold <= ns_to_ms(duration)) + __hierarchy_show_bio(m, bio, pdata->stage, duration); + } else if (pdata->stage == STAGE_BIO) { + struct request *rq = hierarchy_find_and_get_rq(rq_ddata); + + if (rq) { + duration = get_duration(bstage_ddata->stat_time, + rq->bi_alloc_time_ns); + if (hstage->threshold <= ns_to_ms(duration)) + hierarchy_show_rq(m, rq, duration); + blk_mq_put_rq_ref(rq); + } + } + + return 0; +} + +static const struct seq_operations bio_stage_hierarchy_ops = { + 
.start = bio_stage_hierarchy_start, + .next = bio_stage_hierarchy_next, + .stop = bio_stage_hierarchy_stop, + .show = bio_stage_hierarchy_show, +}; + +static const struct blk_mq_debugfs_attr bio_stage_dump_attr[] = { + { + "io_dump", + 0400, + .seq_ops = &bio_stage_hierarchy_ops, + }, + {}, +}; + +#else /* CONFIG_HIERARCHY_BIO */ +static const struct blk_mq_debugfs_attr bio_stage_dump_attr[] = { + {}, +}; + +#endif + +void io_hierarchy_register_iodump(struct hierarchy_stage *hstage) +{ + const struct blk_mq_debugfs_attr *attr; + + if (stage_is_bio(hstage->stage)) + attr = hierarchy_bio_dump_attr; + else if (stage_is_rq(hstage->stage)) + attr = hierarchy_rq_dump_attr; + else if (hstage->stage == STAGE_BIO) + attr = bio_stage_dump_attr; + else + attr = NULL; + + debugfs_create_files(hstage->debugfs_dir, hstage, + hierarchy_threshold_attr); + if (attr) + debugfs_create_files(hstage->debugfs_dir, hstage, attr); +} + +void hierarchy_account_slow_io(struct hierarchy_stage *hstage, + enum stat_group op, unsigned long duration) +{ + if (hstage->threshold <= duration) + this_cpu_inc(hstage->hstats_data->hstats->slow[op]); +} + +void hierarchy_show_slow_io(struct hierarchy_stats_data *hstats_data, + struct seq_file *m) +{ + u64 slow[NR_STAT_GROUPS] = {0}; + int cpu; + int i; + + for_each_possible_cpu(cpu) { + struct hierarchy_stats *stat = + per_cpu_ptr(hstats_data->hstats, cpu); + + for (i = 0; i < NR_STAT_GROUPS; ++i) + slow[i] += stat->slow[i]; + } + + seq_printf(m, " %llu %llu %llu %llu", slow[STAT_READ], slow[STAT_WRITE], + slow[STAT_DISCARD], slow[STAT_FLUSH]); +} diff --git a/block/blk-io-hierarchy/iodump.h b/block/blk-io-hierarchy/iodump.h new file mode 100644 index 000000000000..2f14999e42b5 --- /dev/null +++ b/block/blk-io-hierarchy/iodump.h @@ -0,0 +1,96 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2024. Huawei Technologies Co., Ltd. All rights reserved. 
+ * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#ifndef BLK_IO_HIERARCHY_IODUMP_H +#define BLK_IO_HIERARCHY_IODUMP_H + +#ifdef CONFIG_HIERARCHY_IO_DUMP + +#include "stats.h" + +#define ns_to_ms(time) div_u64(time, NSEC_PER_MSEC) + +int blk_io_hierarchy_iodump_init(struct request_queue *q, + struct hierarchy_stage *hstage); +void blk_io_hierarchy_iodump_exit(struct request_queue *q, + enum stage_group stage); +void hierarchy_add_bio(struct hierarchy_stage *hstage, struct bio *bio); +void hierarchy_remove_bio(struct hierarchy_stage *hstage, struct bio *bio); +void io_hierarchy_register_iodump(struct hierarchy_stage *hstage); + +void hierarchy_account_slow_io(struct hierarchy_stage *hstage, + enum stat_group op, unsigned long duration); +void hierarchy_show_slow_io(struct hierarchy_stats_data *hstats_data, + struct seq_file *m); + +static inline void +hierarchy_account_slow_io_ns(struct hierarchy_stage *hstage, + enum stat_group op, u64 duration) +{ + hierarchy_account_slow_io(hstage, op, ns_to_ms(duration)); +} + +static inline void +hierarchy_account_slow_io_jiffies(struct hierarchy_stage *hstage, + enum stat_group op, unsigned long duration) +{ + hierarchy_account_slow_io(hstage, op, jiffies_to_msecs(duration)); +} + +#else +static inline int +blk_io_hierarchy_iodump_init(struct request_queue *q, + struct hierarchy_stage *hstage) +{ + return 0; +} + +static inline void +blk_io_hierarchy_iodump_exit(struct request_queue *q, enum stage_group stage) +{ +} + +static inline void +hierarchy_add_bio(struct hierarchy_stage *hstage, struct bio *bio) +{ +} + 
+static inline void +hierarchy_remove_bio(struct hierarchy_stage *hstage, struct bio *bio) +{ +} + +static inline void +io_hierarchy_register_iodump(struct hierarchy_stage *hstage) +{ +} + +static inline void +hierarchy_account_slow_io_ns(struct hierarchy_stage *hstage, + enum stat_group op, u64 duration) +{ +} + +static inline void +hierarchy_account_slow_io_jiffies(struct hierarchy_stage *hstage, + enum stat_group op, unsigned long duration) +{ +} + +static inline void +hierarchy_show_slow_io(struct hierarchy_stats_data *hstats_data, + struct seq_file *m) +{ +} +#endif +#endif diff --git a/block/blk-io-hierarchy/stats.c b/block/blk-io-hierarchy/stats.c index e717bf790709..01b08fe5e8bb 100644 --- a/block/blk-io-hierarchy/stats.c +++ b/block/blk-io-hierarchy/stats.c @@ -11,14 +11,16 @@ * GNU General Public License for more details. */
+#include <linux/module.h> #include <linux/debugfs.h>
#include "stats.h" +#include "iodump.h" #include "../blk.h" #include "../blk-mq-debugfs.h"
#define io_hierarchy_add(statsp, field, group, nr) \ - this_cpu_add((statsp)->field[group], nr) + this_cpu_add((statsp)->hstats->field[group], nr) #define io_hierarchy_inc(statsp, field, group) \ io_hierarchy_add(statsp, field, group, 1)
@@ -35,6 +37,7 @@ void blk_mq_debugfs_register_hierarchy_stats(struct request_queue *q)
stats->debugfs_dir = debugfs_create_dir("blk_io_hierarchy", q->debugfs_dir); + blk_mq_debugfs_create_default_hierarchy_attr(q);
for (stage = 0; stage < NR_STAGE_GROUPS; ++stage) blk_mq_debugfs_register_hierarchy(q, stage); @@ -69,6 +72,7 @@ int blk_io_hierarchy_stats_alloc(struct request_queue *q) if (!stats) return -ENOMEM;
+ spin_lock_init(&stats->hstage_lock); stats->q = q; q->io_hierarchy_stats = stats;
@@ -96,6 +100,61 @@ bool blk_mq_hierarchy_registered(struct request_queue *q,
return stats->hstage[stage] != NULL; } +EXPORT_SYMBOL_GPL(blk_mq_hierarchy_registered); + +static struct hierarchy_stats_data *alloc_hstats_data(void) +{ + struct hierarchy_stats_data *hstats_data; + + hstats_data = kmalloc(sizeof(*hstats_data), GFP_KERNEL); + if (!hstats_data) + return NULL; + + hstats_data->hstats = alloc_percpu(struct hierarchy_stats); + if (!hstats_data->hstats) { + kfree(hstats_data); + return NULL; + } + + hstats_data->ref = 1; + return hstats_data; +} + +struct hierarchy_stats_data *get_hstats_data( + struct blk_io_hierarchy_stats *stats, + enum stage_group stage) +{ + struct hierarchy_stage *hstage; + struct hierarchy_stats_data *hstats_data = NULL; + + spin_lock(&stats->hstage_lock); + hstage = stats->hstage[stage]; + if (hstage) { + hstats_data = hstage->hstats_data; + if (hstats_data) + hstats_data->ref++; + } + spin_unlock(&stats->hstage_lock); + + return hstats_data; +} + +static void __put_hstats_data(struct blk_io_hierarchy_stats *stats, + struct hierarchy_stats_data *hstats_data) +{ + if (--hstats_data->ref == 0) { + free_percpu(hstats_data->hstats); + kfree(hstats_data); + } +} + +void put_hstats_data(struct blk_io_hierarchy_stats *stats, + struct hierarchy_stats_data *hstats_data) +{ + spin_lock(&stats->hstage_lock); + __put_hstats_data(stats, hstats_data); + spin_unlock(&stats->hstage_lock); +}
void blk_mq_register_hierarchy(struct request_queue *q, enum stage_group stage) { @@ -107,7 +166,8 @@ void blk_mq_register_hierarchy(struct request_queue *q, enum stage_group stage)
if (blk_mq_hierarchy_registered(q, stage)) { pr_warn("blk-io-hierarchy: disk %s is registering stage %s again.", - q->disk->disk_name, hierarchy_stage_name(stage)); + kobject_name(q->mq_kobj->parent), + hierarchy_stage_name(stage)); return; }
@@ -119,26 +179,31 @@ void blk_mq_register_hierarchy(struct request_queue *q, enum stage_group stage) if (!hstage) return;
- hstage->hstats = alloc_percpu(struct hierarchy_stats); - if (!hstage->hstats) { + hstage->hstats_data = alloc_hstats_data(); + if (!hstage->hstats_data) { kfree(hstage); return; }
hstage->stage = stage; + hstage->unbalanced_warned = false; hstage->debugfs_dir = NULL; + if (blk_io_hierarchy_iodump_init(q, hstage) < 0) { + put_hstats_data(stats, hstage->hstats_data); + kfree(hstage); + return; + }
blk_mq_freeze_queue(q); - blk_mq_quiesce_queue(q);
mutex_lock(&q->debugfs_mutex); - stats->hstage[stage] = hstage; + WRITE_ONCE(stats->hstage[stage], hstage); blk_mq_debugfs_register_hierarchy(q, stage); mutex_unlock(&q->debugfs_mutex);
- blk_mq_unquiesce_queue(q); blk_mq_unfreeze_queue(q); } +EXPORT_SYMBOL_GPL(blk_mq_register_hierarchy);
void blk_mq_unregister_hierarchy(struct request_queue *q, enum stage_group stage) @@ -152,21 +217,27 @@ void blk_mq_unregister_hierarchy(struct request_queue *q, mutex_lock(&q->debugfs_mutex);
blk_mq_debugfs_unregister_hierarchy(q, stage); + blk_io_hierarchy_iodump_exit(q, stage);
+ spin_lock(&stats->hstage_lock); hstage = stats->hstage[stage]; stats->hstage[stage] = NULL; - free_percpu(hstage->hstats); + __put_hstats_data(stats, hstage->hstats_data); + spin_unlock(&stats->hstage_lock); + kfree(hstage);
mutex_unlock(&q->debugfs_mutex); } +EXPORT_SYMBOL_GPL(blk_mq_unregister_hierarchy);
-static enum stat_group hierarchy_op(const struct bio *bio) +static enum stat_group bio_hierarchy_op(struct bio *bio) { if (op_is_discard(bio->bi_opf)) return STAT_DISCARD;
- if (op_is_flush(bio->bi_opf) && !bio_sectors(bio)) + if (op_is_flush(bio->bi_opf) && + !(bio_sectors(bio) || bio_flagged(bio, BIO_HAS_DATA))) return STAT_FLUSH;
if (op_is_write(bio->bi_opf)) @@ -185,32 +256,151 @@ void bio_hierarchy_start_io_acct(struct bio *bio, enum stage_group stage) return;
hstage = q->io_hierarchy_stats->hstage[stage]; - io_hierarchy_inc(hstage->hstats, dispatched, hierarchy_op(bio)); + io_hierarchy_inc(hstage->hstats_data, dispatched, bio_hierarchy_op(bio)); bio->hierarchy_time = blk_time_get_ns(); + hierarchy_add_bio(hstage, bio); }
-void bio_hierarchy_end_io_acct(struct bio *bio, enum stage_group stage, - u64 time) +void __bio_hierarchy_end_io_acct(struct bio *bio, enum stage_group stage, + u64 time) { struct request_queue *q = bio->bi_bdev->bd_queue; struct hierarchy_stage *hstage; + u64 duration; enum stat_group op;
if (!blk_mq_hierarchy_registered(q, stage)) return;
- op = hierarchy_op(bio); + op = bio_hierarchy_op(bio); + duration = time - bio->hierarchy_time; hstage = q->io_hierarchy_stats->hstage[stage]; - io_hierarchy_inc(hstage->hstats, completed, op); - io_hierarchy_add(hstage->hstats, nsecs, op, time - bio->hierarchy_time); + + hierarchy_remove_bio(hstage, bio); + io_hierarchy_inc(hstage->hstats_data, completed, op); + io_hierarchy_add(hstage->hstats_data, nsecs, op, duration); + hierarchy_account_slow_io_ns(hstage, op, duration); +} + +static enum stat_group rq_hierarchy_op(struct request *rq) +{ + if (op_is_discard(rq->cmd_flags)) + return STAT_DISCARD; + + if (is_flush_rq(rq)) + return STAT_FLUSH; + + if (op_is_write(rq->cmd_flags)) + return STAT_WRITE; + + return STAT_READ; }
-void bio_list_hierarchy_end_io_acct(struct bio_list *list, - enum stage_group stage) +static void rq_hierarchy_warn_unbalanced(struct request *rq, + struct hierarchy_stage *hstage, + enum stage_group old_stage, + enum stage_group new_stage) { - u64 time = blk_time_get_ns(); - struct bio *bio; + if (hstage->unbalanced_warned) + return;
- bio_list_for_each(bio, list) - bio_hierarchy_end_io_acct(bio, stage, time); + pr_warn("blk-io-hierarchy: disk %s stage %d(%s) -> %d(%s) unbalanced accounting.", + kobject_name(rq->q->mq_kobj->parent), + old_stage, hierarchy_stage_name(old_stage), + new_stage, hierarchy_stage_name(new_stage)); + hstage->unbalanced_warned = true; } + +void blk_rq_hierarchy_stats_complete(struct request *rq) +{ + struct hierarchy_stage *hstage; + enum stage_group stage; + + stage = rq->stage; + if (stage == NR_RQ_STAGE_GROUPS) + return; + + if (!blk_mq_hierarchy_registered(rq->q, stage)) + return; + + hstage = rq->q->io_hierarchy_stats->hstage[stage]; + rq_hierarchy_warn_unbalanced(rq, hstage, stage, NR_RQ_STAGE_GROUPS); + __rq_hierarchy_end_io_acct(rq, hstage); +} + +void __rq_hierarchy_start_io_acct(struct request *rq, + struct hierarchy_stage *hstage) +{ + blk_rq_hierarchy_stats_complete(rq); + io_hierarchy_inc(hstage->hstats_data, dispatched, rq_hierarchy_op(rq)); + WRITE_ONCE(rq->hierarchy_time, jiffies); + + /* + * Paired with smp_load_acquire() in hierarchy_find_and_get_rq(), make + * sure hierarchy_time is set before stage. 
+ */ + smp_store_release(&rq->stage, hstage->stage); +} +EXPORT_SYMBOL_GPL(__rq_hierarchy_start_io_acct); + +void __rq_hierarchy_end_io_acct(struct request *rq, + struct hierarchy_stage *hstage) +{ + enum stat_group op; + unsigned long duration; + + if (rq->stage != hstage->stage) { + rq_hierarchy_warn_unbalanced(rq, hstage, rq->stage, + hstage->stage); + return; + } + + op = rq_hierarchy_op(rq); + duration = jiffies - rq->hierarchy_time; + + io_hierarchy_inc(hstage->hstats_data, completed, op); + io_hierarchy_add(hstage->hstats_data, jiffies, op, duration); + hierarchy_account_slow_io_jiffies(hstage, op, duration); + WRITE_ONCE(rq->stage, NR_RQ_STAGE_GROUPS); +} +EXPORT_SYMBOL_GPL(__rq_hierarchy_end_io_acct); + +#ifdef CONFIG_HIERARCHY_BIO +void bio_hierarchy_start(struct bio *bio) +{ + struct gendisk *disk = bio->bi_bdev->bd_disk; + struct hierarchy_stage *hstage; + + if (bio_flagged(bio, BIO_HIERARCHY_ACCT)) + return; + + if (!blk_mq_hierarchy_registered(disk->queue, STAGE_BIO)) + return; + + bio_set_flag(bio, BIO_HIERARCHY_ACCT); + if (bio_has_data(bio)) + bio_set_flag(bio, BIO_HAS_DATA); + hstage = disk->queue->io_hierarchy_stats->hstage[STAGE_BIO]; + io_hierarchy_inc(hstage->hstats_data, dispatched, bio_hierarchy_op(bio)); +} + +void __bio_hierarchy_end(struct bio *bio, u64 now) +{ + struct gendisk *disk = bio->bi_bdev->bd_disk; + struct hierarchy_stage *hstage; + u64 duration; + enum stat_group op; + + op = bio_hierarchy_op(bio); + duration = now - bio->bi_alloc_time_ns; + hstage = disk->queue->io_hierarchy_stats->hstage[STAGE_BIO]; + + io_hierarchy_inc(hstage->hstats_data, completed, op); + io_hierarchy_add(hstage->hstats_data, nsecs, op, duration); + hierarchy_account_slow_io_ns(hstage, op, duration); + + bio_clear_flag(bio, BIO_HIERARCHY_ACCT); + bio_clear_flag(bio, BIO_HAS_DATA); +} + +#endif diff --git a/block/blk-io-hierarchy/stats.h b/block/blk-io-hierarchy/stats.h index 0a86d1235715..8166e71969bd 100644 --- a/block/blk-io-hierarchy/stats.h +++ 
b/block/blk-io-hierarchy/stats.h @@ -17,30 +17,64 @@ #ifdef CONFIG_BLK_IO_HIERARCHY_STATS
#include <linux/blkdev.h> -#include "../blk-mq-debugfs.h" +#include <linux/blk_types.h> +#include "../blk.h"
struct bio_hierarchy_data { u64 time; +#ifdef CONFIG_HIERARCHY_IO_DUMP + struct bio *bio; + struct list_head hierarchy_list; +#endif };
struct hierarchy_stats { - u64 nsecs[NR_STAT_GROUPS]; + union { + /* for bio based stages. */ + u64 nsecs[NR_STAT_GROUPS]; + /* for request based stages. */ + unsigned long jiffies[NR_STAT_GROUPS]; + }; unsigned long dispatched[NR_STAT_GROUPS]; unsigned long completed[NR_STAT_GROUPS]; +#ifdef CONFIG_HIERARCHY_IO_DUMP + unsigned long slow[NR_STAT_GROUPS]; +#endif +}; + +struct hierarchy_stats_data { + int ref; + struct hierarchy_stats __percpu *hstats; };
struct hierarchy_stage { enum stage_group stage; + bool unbalanced_warned; struct dentry *debugfs_dir; - struct hierarchy_stats __percpu *hstats; + struct hierarchy_stats_data *hstats_data; +#ifdef CONFIG_HIERARCHY_IO_DUMP + unsigned long threshold; + void *dump_data; +#endif };
struct blk_io_hierarchy_stats { struct request_queue *q; struct dentry *debugfs_dir; + spinlock_t hstage_lock; struct hierarchy_stage *hstage[NR_STAGE_GROUPS]; };
+static inline bool stage_is_bio(enum stage_group stage) +{ + return stage >= 0 && stage < NR_BIO_STAGE_GROUPS; +} + +static inline bool stage_is_rq(enum stage_group stage) +{ + return stage >= NR_BIO_STAGE_GROUPS && stage < NR_RQ_STAGE_GROUPS; +} + const char *hierarchy_stage_name(enum stage_group stage); int blk_io_hierarchy_stats_alloc(struct request_queue *q); void blk_io_hierarchy_stats_free(struct request_queue *q); @@ -55,19 +89,154 @@ void blk_mq_unregister_hierarchy(struct request_queue *q, /* APIs for disk level debugfs */ void blk_mq_debugfs_register_hierarchy_stats(struct request_queue *q); void blk_mq_debugfs_unregister_hierarchy_stats(struct request_queue *q); +void blk_mq_debugfs_create_default_hierarchy_attr(struct request_queue *q);
/* APIs for stage level debugfs */ void blk_mq_debugfs_register_hierarchy(struct request_queue *q, enum stage_group stage); void blk_mq_debugfs_unregister_hierarchy(struct request_queue *q, enum stage_group stage); +struct hierarchy_stats_data *get_hstats_data( + struct blk_io_hierarchy_stats *stats, + enum stage_group stage); +void put_hstats_data(struct blk_io_hierarchy_stats *stats, + struct hierarchy_stats_data *hstats_data);
/* APIs for bio based stage io accounting */ void bio_hierarchy_start_io_acct(struct bio *bio, enum stage_group stage); -void bio_hierarchy_end_io_acct(struct bio *bio, enum stage_group stage, - u64 time); -void bio_list_hierarchy_end_io_acct(struct bio_list *list, - enum stage_group stage); +void __bio_hierarchy_end_io_acct(struct bio *bio, enum stage_group stage, + u64 time); + +static inline void bio_hierarchy_end_io_acct(struct bio *bio, + enum stage_group stage) +{ + __bio_hierarchy_end_io_acct(bio, stage, blk_time_get_ns()); +} + +static inline void bio_list_hierarchy_end_io_acct(struct bio_list *list, + enum stage_group stage) +{ + u64 time = blk_time_get_ns(); + struct bio *bio; + + bio_list_for_each(bio, list) + __bio_hierarchy_end_io_acct(bio, stage, time); +} + +/* APIs for request based stage io accounting */ +void blk_rq_hierarchy_stats_complete(struct request *rq); +void __rq_hierarchy_start_io_acct(struct request *rq, + struct hierarchy_stage *hstage); +void __rq_hierarchy_end_io_acct(struct request *rq, + struct hierarchy_stage *hstage); + +static inline void rq_hierarchy_start_io_acct(struct request *rq, + enum stage_group stage) +{ + if (!blk_mq_hierarchy_registered(rq->q, stage)) + return; + + __rq_hierarchy_start_io_acct( + rq, rq->q->io_hierarchy_stats->hstage[stage]); +} + +static inline void rq_hierarchy_end_io_acct(struct request *rq, + enum stage_group stage) +{ + if (!blk_mq_hierarchy_registered(rq->q, stage)) + return; + + __rq_hierarchy_end_io_acct( + rq, rq->q->io_hierarchy_stats->hstage[stage]); +} + +static inline void rq_list_hierarchy_start_io_acct(struct list_head *head, + enum stage_group stage) +{ + struct request *rq; + struct hierarchy_stage *hstage; + + if (list_empty(head)) + return; + + rq = list_first_entry(head, struct request, queuelist); + if (!blk_mq_hierarchy_registered(rq->q, stage)) + return; + + hstage = rq->q->io_hierarchy_stats->hstage[stage]; + list_for_each_entry(rq, head, queuelist) + 
__rq_hierarchy_start_io_acct(rq, hstage); +} + +static inline void rq_list_hierarchy_end_io_acct(struct list_head *head, + enum stage_group stage) +{ + struct request *rq; + struct hierarchy_stage *hstage; + + if (list_empty(head)) + return; + + rq = list_first_entry(head, struct request, queuelist); + if (!blk_mq_hierarchy_registered(rq->q, stage)) + return; + + hstage = rq->q->io_hierarchy_stats->hstage[stage]; + list_for_each_entry(rq, head, queuelist) + __rq_hierarchy_end_io_acct(rq, hstage); +} + +static inline void blk_rq_hierarchy_stats_init(struct request *rq) +{ + rq->stage = NR_RQ_STAGE_GROUPS; + rq->flush_done = false; +} + +static inline void blk_rq_hierarchy_set_flush_done(struct request *rq) +{ + rq->flush_done = true; +} + +static inline bool blk_rq_hierarchy_is_flush_done(struct request *rq) +{ + return rq->flush_done; +} + +#ifdef CONFIG_HIERARCHY_BIO +void bio_hierarchy_start(struct bio *bio); +void __bio_hierarchy_end(struct bio *bio, u64 now); + +static inline void bio_hierarchy_end(struct bio *bio) +{ + if (!bio_flagged(bio, BIO_HIERARCHY_ACCT)) + return; + + if (!blk_mq_hierarchy_registered(bio->bi_bdev->bd_queue, STAGE_BIO)) + return; + + __bio_hierarchy_end(bio, blk_time_get_ns()); +} + +static inline void req_bio_hierarchy_end(struct request *rq, struct bio *bio) +{ + u64 now; + + if (!bio_flagged(bio, BIO_HIERARCHY_ACCT)) + return; + + if (!blk_mq_hierarchy_registered(bio->bi_bdev->bd_queue, STAGE_BIO)) + return; + + now = rq->io_end_time_ns; + if (!now) { + now = blk_time_get_ns(); + rq->io_end_time_ns = now; + } + + __bio_hierarchy_end(bio, now); +} +#endif + #else /* CONFIG_BLK_IO_HIERARCHY_STATS */
static inline int @@ -125,7 +294,7 @@ bio_hierarchy_start_io_acct(struct bio *bio, enum stage_group stage) }
static inline void -bio_hierarchy_end_io_acct(struct bio *bio, enum stage_group stage, u64 time) +bio_hierarchy_end_io_acct(struct bio *bio, enum stage_group stage) { }
@@ -133,5 +302,65 @@ static inline void bio_list_hierarchy_end_io_acct(struct bio_list *list, enum stage_group stage) { } + +static inline void +blk_rq_hierarchy_set_flush_done(struct request *rq) +{ +} + +static inline bool +blk_rq_hierarchy_is_flush_done(struct request *rq) +{ + return false; +} + +static inline void +blk_rq_hierarchy_stats_complete(struct request *rq) +{ +} + +static inline void +rq_hierarchy_start_io_acct(struct request *rq, enum stage_group stage) +{ +} + +static inline void +rq_hierarchy_end_io_acct(struct request *rq, enum stage_group stage) +{ +} + +static inline void +rq_list_hierarchy_start_io_acct(struct list_head *head, enum stage_group stage) +{ +} + +static inline void +rq_list_hierarchy_end_io_acct(struct list_head *head, enum stage_group stage) +{ +} + +static inline void +blk_rq_hierarchy_stats_init(struct request *rq) +{ +} + #endif /* CONFIG_BLK_IO_HIERARCHY_STATS */ + +#if !defined(CONFIG_BLK_IO_HIERARCHY_STATS) || !defined(CONFIG_HIERARCHY_BIO) +static inline void +bio_hierarchy_start(struct bio *bio) +{ +} + +static inline void +bio_hierarchy_end(struct bio *bio) +{ +} + +static inline void +req_bio_hierarchy_end(struct request *rq, struct bio *bio) +{ +} +#endif + #endif /* BLK_IO_HIERARCHY_STATS_H */ diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index a955ee42765f..efe99cfae51d 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -281,9 +281,13 @@ static const char *blk_mq_rq_state_name(enum mq_rq_state rq_state) return blk_mq_rq_state_name_array[rq_state]; }
-int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq) +/* + * This helper will dump general information for @rq into @m, started with '{' + * and doesn't end with '}', caller must include a closing curly brace '}' at + * the end after adding the custom string. + */ +void debugfs_rq_show(struct seq_file *m, struct request *rq) { - const struct blk_mq_ops *const mq_ops = rq->q->mq_ops; const enum req_op op = req_op(rq); const char *op_str = blk_op_str(op);
@@ -301,6 +305,13 @@ int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq) seq_printf(m, ", .state=%s", blk_mq_rq_state_name(blk_mq_rq_state(rq))); seq_printf(m, ", .tag=%d, .internal_tag=%d", rq->tag, rq->internal_tag); +} + +int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq) +{ + const struct blk_mq_ops *const mq_ops = rq->q->mq_ops; + + debugfs_rq_show(m, rq); if (mq_ops->show_rq) mq_ops->show_rq(m, rq); seq_puts(m, "}\n"); diff --git a/block/blk-mq-debugfs.h b/block/blk-mq-debugfs.h index 4c422580ce84..4f70a87094b0 100644 --- a/block/blk-mq-debugfs.h +++ b/block/blk-mq-debugfs.h @@ -17,6 +17,7 @@ struct blk_mq_debugfs_attr { const struct seq_operations *seq_ops; };
+void debugfs_rq_show(struct seq_file *m, struct request *rq); int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq); int blk_mq_debugfs_rq_show(struct seq_file *m, void *v);
diff --git a/block/blk-mq.c b/block/blk-mq.c index 1159a06e2543..835ea9495396 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -388,6 +388,7 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, rq->end_io = NULL; rq->end_io_data = NULL;
+ blk_rq_hierarchy_stats_init(rq); blk_rq_init_bi_alloc_time(rq, NULL); blk_mq_get_alloc_task(rq, data->bio);
@@ -712,6 +713,7 @@ static void __blk_mq_free_request(struct request *rq) struct blk_mq_hw_ctx *hctx = rq->mq_hctx; const int sched_tag = rq->internal_tag;
+ blk_rq_hierarchy_stats_complete(rq); blk_mq_put_alloc_task(rq); blk_crypto_free_request(rq); blk_pm_mark_last_busy(rq); diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index da56055731d6..4c4416fd2df7 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -201,8 +201,13 @@ struct request { KABI_RESERVE(2) #endif KABI_USE(3, u64 io_end_time_ns) +#ifdef CONFIG_BLK_IO_HIERARCHY_STATS + KABI_USE2(4, bool flush_done, enum stage_group stage) + KABI_USE(5, unsigned long hierarchy_time) +#else KABI_RESERVE(4) KABI_RESERVE(5) +#endif KABI_RESERVE(6) KABI_RESERVE(7) KABI_RESERVE(8) diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 95759212836a..e8c1d0790923 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -292,11 +292,13 @@ struct bio {
#ifdef CONFIG_BLK_IO_HIERARCHY_STATS KABI_USE(1, u64 hierarchy_time) + KABI_REPLACE(_KABI_RESERVE(2); _KABI_RESERVE(3), + struct list_head hierarchy_list) #else KABI_RESERVE(1) -#endif KABI_RESERVE(2) KABI_RESERVE(3) +#endif #ifdef CONFIG_BLK_BIO_ALLOC_TIME KABI_USE(4, u64 bi_alloc_time_ns) #else @@ -341,6 +343,13 @@ enum { BIO_QOS_MERGED, /* but went through rq_qos merge path */ BIO_REMAPPED, BIO_ZONE_WRITE_LOCKED, /* Owns a zoned device zone write lock */ +#ifdef CONFIG_BLK_IO_HIERARCHY_STATS + BIO_HAS_DATA, /* bio contain data. */ + BIO_HIERARCHY_ACCT, /* + * This bio has already been subjected to + * blk-io-hierarchy, don't do it again. + */ +#endif BIO_FLAG_LAST };
@@ -473,7 +482,10 @@ enum stage_group { #endif STAGE_RESERVE, NR_BIO_STAGE_GROUPS, - NR_STAGE_GROUPS = NR_BIO_STAGE_GROUPS, + STAGE_PLUG = NR_BIO_STAGE_GROUPS, + NR_RQ_STAGE_GROUPS, + STAGE_BIO = NR_RQ_STAGE_GROUPS, + NR_STAGE_GROUPS, };
static inline enum req_op bio_op(const struct bio *bio)
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/release-management/issues/IB4E8P CVE: NA
--------------------------------
Like blk-throttle, if wbt is enabled, the following new debugfs entries will be created as well:
/sys/kernel/debug/block/sda/blk_io_hierarchy/
|-- wbt
|   |-- io_dump
|   |-- stats
|   `-- threshold
Users can use them to analyze how IO behaves in wbt.
Signed-off-by: Yu Kuai yukuai3@huawei.com --- block/blk-io-hierarchy/Kconfig | 11 +++++++++++ block/blk-iolatency.c | 2 +- block/blk-rq-qos.c | 9 ++++++++- block/blk-rq-qos.h | 3 ++- block/blk-wbt.c | 31 ++++++++++++++++++++++++------- include/linux/blk_types.h | 3 +++ 6 files changed, 49 insertions(+), 10 deletions(-)
diff --git a/block/blk-io-hierarchy/Kconfig b/block/blk-io-hierarchy/Kconfig index 2c15b5a7a006..ad1b7abc7610 100644 --- a/block/blk-io-hierarchy/Kconfig +++ b/block/blk-io-hierarchy/Kconfig @@ -37,4 +37,15 @@ config HIERARCHY_THROTTLE
If unsure, say N.
+config HIERARCHY_WBT + bool "Enable hierarchy stats layer blk-wbt" + default n + depends on BLK_WBT + help + Enabling this lets blk hierarchy stats to record additional information + for blk-wbt. Such information can be helpful to debug performance + and problems like io hang. + + If unsure, say N. + endif diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c index ebb522788d97..b256043bbed6 100644 --- a/block/blk-iolatency.c +++ b/block/blk-iolatency.c @@ -306,7 +306,7 @@ static void __blkcg_iolatency_throttle(struct rq_qos *rqos, return; }
- rq_qos_wait(rqw, iolat, iolat_acquire_inflight, iolat_cleanup_cb); + rq_qos_wait(rqw, iolat, iolat_acquire_inflight, iolat_cleanup_cb, NULL); }
#define SCALE_DOWN_FACTOR 2 diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c index dc510f493ba5..6ea4bdf4f802 100644 --- a/block/blk-rq-qos.c +++ b/block/blk-rq-qos.c @@ -230,6 +230,7 @@ static int rq_qos_wake_function(struct wait_queue_entry *curr, * @private_data: caller provided specific data * @acquire_inflight_cb: inc the rqw->inflight counter if we can * @cleanup_cb: the callback to cleanup in case we race with a waker + * @io_acct_cb: the callback for io accounting * * This provides a uniform place for the rq_qos users to do their throttling. * Since you can end up with a lot of things sleeping at once, this manages the @@ -242,7 +243,7 @@ static int rq_qos_wake_function(struct wait_queue_entry *curr, */ void rq_qos_wait(struct rq_wait *rqw, void *private_data, acquire_inflight_cb_t *acquire_inflight_cb, - cleanup_cb_t *cleanup_cb) + cleanup_cb_t *cleanup_cb, io_acct_cb_t *io_acct_cb) { struct rq_qos_wait_data data = { .wq = { @@ -260,6 +261,9 @@ void rq_qos_wait(struct rq_wait *rqw, void *private_data, if (!has_sleeper && acquire_inflight_cb(rqw, private_data)) return;
+ if (io_acct_cb) + io_acct_cb(private_data, true); + has_sleeper = !prepare_to_wait_exclusive(&rqw->wait, &data.wq, TASK_UNINTERRUPTIBLE); do { @@ -284,6 +288,9 @@ void rq_qos_wait(struct rq_wait *rqw, void *private_data, set_current_state(TASK_UNINTERRUPTIBLE); } while (1); finish_wait(&rqw->wait, &data.wq); + + if (io_acct_cb) + io_acct_cb(private_data, false); }
void rq_qos_exit(struct request_queue *q) diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h index 37245c97ee61..93d1ba692973 100644 --- a/block/blk-rq-qos.h +++ b/block/blk-rq-qos.h @@ -91,10 +91,11 @@ void rq_qos_del(struct rq_qos *rqos);
typedef bool (acquire_inflight_cb_t)(struct rq_wait *rqw, void *private_data); typedef void (cleanup_cb_t)(struct rq_wait *rqw, void *private_data); +typedef void (io_acct_cb_t)(void *private_data, bool start_acct);
void rq_qos_wait(struct rq_wait *rqw, void *private_data, acquire_inflight_cb_t *acquire_inflight_cb, - cleanup_cb_t *cleanup_cb); + cleanup_cb_t *cleanup_cb, io_acct_cb_t *io_acct_cb); bool rq_wait_inc_below(struct rq_wait *rq_wait, unsigned int limit); bool rq_depth_scale_up(struct rq_depth *rqd); bool rq_depth_scale_down(struct rq_depth *rqd, bool hard_throttle); diff --git a/block/blk-wbt.c b/block/blk-wbt.c index 4933a7738ebd..6b81f2c47279 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -32,6 +32,7 @@
#ifndef __GENKSYMS__ #include "blk.h" +#include "blk-io-hierarchy/stats.h" #endif
#define CREATE_TRACE_POINTS @@ -564,38 +565,51 @@ static inline unsigned int get_limit(struct rq_wb *rwb, blk_opf_t opf) }
struct wbt_wait_data { + struct bio *bio; struct rq_wb *rwb; enum wbt_flags wb_acct; - blk_opf_t opf; };
static bool wbt_inflight_cb(struct rq_wait *rqw, void *private_data) { struct wbt_wait_data *data = private_data; - return rq_wait_inc_below(rqw, get_limit(data->rwb, data->opf)); + + return rq_wait_inc_below(rqw, get_limit(data->rwb, data->bio->bi_opf)); }
static void wbt_cleanup_cb(struct rq_wait *rqw, void *private_data) { struct wbt_wait_data *data = private_data; + wbt_rqw_done(data->rwb, rqw, data->wb_acct); }
+static void wbt_io_acct_cb(void *private_data, bool start) +{ + struct wbt_wait_data *data = private_data; + + if (start) + bio_hierarchy_start_io_acct(data->bio, STAGE_WBT); + else + bio_hierarchy_end_io_acct(data->bio, STAGE_WBT); +} + /* * Block if we will exceed our limit, or if we are currently waiting for * the timer to kick off queuing again. */ static void __wbt_wait(struct rq_wb *rwb, enum wbt_flags wb_acct, - blk_opf_t opf) + struct bio *bio) { struct rq_wait *rqw = get_rq_wait(rwb, wb_acct); struct wbt_wait_data data = { .rwb = rwb, .wb_acct = wb_acct, - .opf = opf, + .bio = bio, };
- rq_qos_wait(rqw, &data, wbt_inflight_cb, wbt_cleanup_cb); + rq_qos_wait(rqw, &data, wbt_inflight_cb, wbt_cleanup_cb, + wbt_io_acct_cb); }
static inline bool wbt_should_throttle(struct bio *bio) @@ -659,7 +673,7 @@ static void wbt_wait(struct rq_qos *rqos, struct bio *bio) return; }
- __wbt_wait(rwb, flags, bio->bi_opf); + __wbt_wait(rwb, flags, bio);
if (!blk_stat_is_active(rwb->cb)) rwb_arm_timer(rwb); @@ -773,8 +787,10 @@ static void wbt_queue_depth_changed(struct rq_qos *rqos) static void wbt_exit(struct rq_qos *rqos) { struct rq_wb *rwb = RQWB(rqos); + struct request_queue *q = rqos->disk->queue;
- blk_stat_remove_callback(rqos->disk->queue, rwb->cb); + blk_mq_unregister_hierarchy(q, STAGE_WBT); + blk_stat_remove_callback(q, rwb->cb); blk_stat_free_callback(rwb->cb); kfree(rwb); } @@ -937,6 +953,7 @@ int wbt_init(struct gendisk *disk) goto err_free;
blk_stat_add_callback(q, rwb->cb); + blk_mq_register_hierarchy(q, STAGE_WBT);
return 0;
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index e8c1d0790923..fe683cfbc157 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -479,6 +479,9 @@ enum stat_group { enum stage_group { #ifdef CONFIG_BLK_DEV_THROTTLING STAGE_THROTTLE, +#endif +#ifdef CONFIG_BLK_WBT + STAGE_WBT, #endif STAGE_RESERVE, NR_BIO_STAGE_GROUPS,
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/release-management/issues/IB4E8P CVE: NA
--------------------------------
Like blk-throttle, if iocost is enabled, the following new debugfs entries will be created as well:
/sys/kernel/debug/block/sda/blk_io_hierarchy/
|-- iocost
|   |-- io_dump
|   |-- stats
|   `-- threshold
Users can use them to analyze how IO behaves in iocost.
Signed-off-by: Yu Kuai yukuai3@huawei.com --- block/blk-io-hierarchy/Kconfig | 11 +++++++++++ block/blk-iocost.c | 6 ++++++ include/linux/blk_types.h | 3 +++ 3 files changed, 20 insertions(+)
diff --git a/block/blk-io-hierarchy/Kconfig b/block/blk-io-hierarchy/Kconfig index ad1b7abc7610..43ada3d9f0a5 100644 --- a/block/blk-io-hierarchy/Kconfig +++ b/block/blk-io-hierarchy/Kconfig @@ -48,4 +48,15 @@ config HIERARCHY_WBT
If unsure, say N.
+config HIERARCHY_IOCOST + bool "Enable hierarchy stats layer iocost" + default n + depends on BLK_CGROUP_IOCOST + help + Enabling this lets blk hierarchy stats to record additional information + for blk-iocost. Such information can be helpful to debug performance + and problems like io hang. + + If unsure, say N. + endif diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 67c8941b2c1d..5c1366a2182b 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -184,6 +184,7 @@ #include "blk-stat.h" #include "blk-wbt.h" #include "blk-cgroup.h" +#include "blk-io-hierarchy/stats.h"
#ifdef CONFIG_TRACEPOINTS
@@ -2722,12 +2723,14 @@ static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio)
iocg_unlock(iocg, ioc_locked, &flags);
+ bio_hierarchy_start_io_acct(bio, STAGE_IOCOST); while (true) { set_current_state(TASK_UNINTERRUPTIBLE); if (wait.committed) break; io_schedule(); } + bio_hierarchy_end_io_acct(bio, STAGE_IOCOST);
/* waker already committed us, proceed */ finish_wait(&iocg->waitq, &wait.wait); @@ -2853,6 +2856,7 @@ static void ioc_rqos_exit(struct rq_qos *rqos) { struct ioc *ioc = rqos_to_ioc(rqos);
+ blk_mq_unregister_hierarchy(rqos->disk->queue, STAGE_IOCOST); blkcg_deactivate_policy(rqos->disk, &blkcg_policy_iocost);
spin_lock_irq(&ioc->lock); @@ -2928,6 +2932,8 @@ static int blk_iocost_init(struct gendisk *disk) ret = blkcg_activate_policy(disk, &blkcg_policy_iocost); if (ret) goto err_del_qos; + + blk_mq_register_hierarchy(disk->queue, STAGE_IOCOST); return 0;
err_del_qos: diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index fe683cfbc157..eb794e8820ec 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -482,6 +482,9 @@ enum stage_group { #endif #ifdef CONFIG_BLK_WBT STAGE_WBT, +#endif +#ifdef CONFIG_BLK_CGROUP_IOCOST + STAGE_IOCOST, #endif STAGE_RESERVE, NR_BIO_STAGE_GROUPS,
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/release-management/issues/IB4E8P CVE: NA
--------------------------------
Like blk-throttle, the following new debugfs entries will be created for rq-based disks:
/sys/kernel/debug/block/sda/blk_io_hierarchy/
|-- gettag
|   |-- io_dump
|   |-- stats
|   `-- threshold
Users can use them to analyze how IO behaves in gettag.
Signed-off-by: Yu Kuai yukuai3@huawei.com --- block/blk-io-hierarchy/Kconfig | 10 ++++++++++ block/blk-mq-tag.c | 5 +++++ block/blk-mq.c | 1 + block/blk-sysfs.c | 4 ++++ include/linux/blk_types.h | 2 +- 5 files changed, 21 insertions(+), 1 deletion(-)
diff --git a/block/blk-io-hierarchy/Kconfig b/block/blk-io-hierarchy/Kconfig index 43ada3d9f0a5..54c2cb88199a 100644 --- a/block/blk-io-hierarchy/Kconfig +++ b/block/blk-io-hierarchy/Kconfig @@ -59,4 +59,14 @@ config HIERARCHY_IOCOST
If unsure, say N.
+config HIERARCHY_GETTAG + bool "Enable hierarchy stats layer get-tag" + default n + help + Enabling this lets blk hierarchy stats to record additional information + for getting tag. Such information can be helpful to debug performance + and problems like io hang. + + If unsure, say N. + endif diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 2cafcf11ee8b..80a5a60f6e51 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -13,6 +13,7 @@ #include "blk.h" #include "blk-mq.h" #include "blk-mq-sched.h" +#include "blk-io-hierarchy/stats.h"
/* * Recalculate wakeup batch when tag is shared by hctx. @@ -159,6 +160,8 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) if (data->flags & BLK_MQ_REQ_NOWAIT) return BLK_MQ_NO_TAG;
+ if (data->bio) + bio_hierarchy_start_io_acct(data->bio, STAGE_GETTAG); ws = bt_wait_ptr(bt, data->hctx); do { struct sbitmap_queue *bt_prev; @@ -210,6 +213,8 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) } while (1);
sbitmap_finish_wait(bt, ws, &wait); + if (data->bio) + bio_hierarchy_end_io_acct(data->bio, STAGE_GETTAG);
found_tag: /* diff --git a/block/blk-mq.c b/block/blk-mq.c index 835ea9495396..f32644ca3c12 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -4385,6 +4385,7 @@ void blk_mq_exit_queue(struct request_queue *q) { struct blk_mq_tag_set *set = q->tag_set;
+ blk_mq_unregister_hierarchy(q, STAGE_GETTAG); /* Checks hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED. */ blk_mq_exit_hw_queues(q, set, set->nr_hw_queues); /* May clear BLK_MQ_F_TAG_QUEUE_SHARED in hctx->flags. */ diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index e58b1574e023..65be33558d28 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -19,6 +19,7 @@ #include "blk-wbt.h" #include "blk-cgroup.h" #include "blk-throttle.h" +#include "blk-io-hierarchy/stats.h"
struct queue_sysfs_entry { struct attribute attr; @@ -860,6 +861,9 @@ int blk_register_queue(struct gendisk *disk) if (ret) goto out_elv_unregister;
+ if (queue_is_mq(q)) + blk_mq_register_hierarchy(q, STAGE_GETTAG); + blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q); wbt_enable_default(disk); blk_throtl_register(disk); diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index eb794e8820ec..cea20cdc351a 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -486,7 +486,7 @@ enum stage_group { #ifdef CONFIG_BLK_CGROUP_IOCOST STAGE_IOCOST, #endif - STAGE_RESERVE, + STAGE_GETTAG, NR_BIO_STAGE_GROUPS, STAGE_PLUG = NR_BIO_STAGE_GROUPS, NR_RQ_STAGE_GROUPS,
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/release-management/issues/IB4E8P CVE: NA
--------------------------------
Like blk-throttle, the following new debugfs entries will be created for rq-based disks:
/sys/kernel/debug/block/sda/blk_io_hierarchy/
|-- plug
|   |-- io_dump
|   |-- stats
|   `-- threshold
Users can use them to analyze how IO behaves in plug.
Signed-off-by: Yu Kuai yukuai3@huawei.com --- block/blk-io-hierarchy/Kconfig | 10 ++++++++++ block/blk-io-hierarchy/stats.h | 12 ++++++++++++ block/blk-mq.c | 12 +++++++++++- block/blk-sysfs.c | 8 +++++++- 4 files changed, 40 insertions(+), 2 deletions(-)
diff --git a/block/blk-io-hierarchy/Kconfig b/block/blk-io-hierarchy/Kconfig index 54c2cb88199a..342c6a72fffb 100644 --- a/block/blk-io-hierarchy/Kconfig +++ b/block/blk-io-hierarchy/Kconfig @@ -69,4 +69,14 @@ config HIERARCHY_GETTAG
If unsure, say N.
+config HIERARCHY_PLUG + bool "Enable hierarchy stats layer plug" + default n + help + Enabling this lets blk hierarchy stats to record additional information + for plug. Such information can be helpful to debug performance + and problems like io hang. + + If unsure, say N. + endif diff --git a/block/blk-io-hierarchy/stats.h b/block/blk-io-hierarchy/stats.h index 8166e71969bd..dc397a90c4b1 100644 --- a/block/blk-io-hierarchy/stats.h +++ b/block/blk-io-hierarchy/stats.h @@ -186,6 +186,14 @@ static inline void rq_list_hierarchy_end_io_acct(struct list_head *head, __rq_hierarchy_end_io_acct(rq, hstage); }
+static inline void plug_list_hierarchy_end_io_acct(struct blk_plug *plug) +{ + struct request *rq; + + rq_list_for_each(&plug->mq_list, rq) + rq_hierarchy_end_io_acct(rq, STAGE_PLUG); +} + static inline void blk_rq_hierarchy_stats_init(struct request *rq) { rq->stage = NR_RQ_STAGE_GROUPS; @@ -339,6 +347,10 @@ rq_list_hierarchy_end_io_acct(struct list_head *head, enum stage_group stage) { }
+static inline void plug_list_hierarchy_end_io_acct(struct blk_plug *plug) +{ +} + static inline void blk_rq_hierarchy_stats_init(struct request *rq) { diff --git a/block/blk-mq.c b/block/blk-mq.c index f32644ca3c12..bf1329ada77b 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1308,6 +1308,8 @@ static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq) { struct request *last = rq_list_peek(&plug->mq_list);
+ rq_hierarchy_start_io_acct(rq, STAGE_PLUG); + if (!plug->rq_count) { trace_block_plug(rq->q); } else if (plug->rq_count >= blk_plug_max_rq_count(plug) || @@ -2840,6 +2842,8 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) */ if (plug->rq_count == 0) return; + + plug_list_hierarchy_end_io_acct(plug); plug->rq_count = 0;
if (!plug->multiple_queues && !plug->has_elevator && !from_schedule) { @@ -4380,12 +4384,18 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, } EXPORT_SYMBOL(blk_mq_init_allocated_queue);
+static void blk_mq_unregister_default_hierarchy(struct request_queue *q) +{ + blk_mq_unregister_hierarchy(q, STAGE_GETTAG); + blk_mq_unregister_hierarchy(q, STAGE_PLUG); +} + /* tags can _not_ be used after returning from blk_mq_exit_queue */ void blk_mq_exit_queue(struct request_queue *q) { struct blk_mq_tag_set *set = q->tag_set;
- blk_mq_unregister_hierarchy(q, STAGE_GETTAG); + blk_mq_unregister_default_hierarchy(q); /* Checks hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED. */ blk_mq_exit_hw_queues(q, set, set->nr_hw_queues); /* May clear BLK_MQ_F_TAG_QUEUE_SHARED in hctx->flags. */ diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 65be33558d28..1539a137502e 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -819,6 +819,12 @@ static void blk_debugfs_remove(struct gendisk *disk) mutex_unlock(&q->debugfs_mutex); }
+static void blk_mq_register_default_hierarchy(struct request_queue *q) +{ + blk_mq_register_hierarchy(q, STAGE_GETTAG); + blk_mq_register_hierarchy(q, STAGE_PLUG); +} + /** * blk_register_queue - register a block layer queue with sysfs * @disk: Disk of which the request queue should be registered with sysfs. @@ -862,7 +868,7 @@ int blk_register_queue(struct gendisk *disk) goto out_elv_unregister;
if (queue_is_mq(q)) - blk_mq_register_hierarchy(q, STAGE_GETTAG); + blk_mq_register_default_hierarchy(q);
blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q); wbt_enable_default(disk);
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/release-management/issues/IB4E8P CVE: NA
--------------------------------
Like blk-throttle, the following new debugfs entries will be created for rq-based disks if mq-deadline is enabled:
/sys/kernel/debug/block/sda/blk_io_hierarchy/
|-- deadline
|   |-- io_dump
|   |-- stats
|   `-- threshold
Users can use them to analyze how IO behaves in mq-deadline.
Signed-off-by: Yu Kuai yukuai3@huawei.com --- block/blk-io-hierarchy/Kconfig | 11 +++++++++++ block/mq-deadline.c | 15 ++++++++++++++- include/linux/blk_types.h | 3 +++ 3 files changed, 28 insertions(+), 1 deletion(-)
diff --git a/block/blk-io-hierarchy/Kconfig b/block/blk-io-hierarchy/Kconfig index 342c6a72fffb..b94f4c69402a 100644 --- a/block/blk-io-hierarchy/Kconfig +++ b/block/blk-io-hierarchy/Kconfig @@ -79,4 +79,15 @@ config HIERARCHY_PLUG
If unsure, say N.
+config HIERARCHY_DEADLINE + bool "Enable hierarchy stats layer mq-deadline" + default n + depends on MQ_IOSCHED_DEADLINE + help + Enabling this lets blk hierarchy stats to record additional information + for mq-deadline. Such information can be helpful to debug performance + and problems like io hang. + + If unsure, say N. + endif diff --git a/block/mq-deadline.c b/block/mq-deadline.c index 78a8aa204c15..a9cf8e19f9d1 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -23,6 +23,7 @@ #include "blk-mq.h" #include "blk-mq-debugfs.h" #include "blk-mq-sched.h" +#include "blk-io-hierarchy/stats.h"
/* * See Documentation/block/deadline-iosched.rst @@ -103,6 +104,8 @@ struct deadline_data {
spinlock_t lock; spinlock_t zone_lock; + + struct request_queue *q; };
/* Maps an I/O priority class to a deadline scheduler priority. */ @@ -618,6 +621,8 @@ static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx) unlock: spin_unlock(&dd->lock);
+ if (rq) + rq_hierarchy_end_io_acct(rq, STAGE_DEADLINE); return rq; }
@@ -696,6 +701,7 @@ static void dd_exit_sched(struct elevator_queue *e) stats->dispatched, atomic_read(&stats->completed)); }
+ blk_mq_unregister_hierarchy(dd->q, STAGE_DEADLINE); kfree(dd); }
@@ -735,6 +741,7 @@ static int dd_init_sched(struct request_queue *q, struct elevator_type *e) dd->last_dir = DD_WRITE; dd->fifo_batch = fifo_batch; dd->prio_aging_expire = prio_aging_expire; + dd->q = q; spin_lock_init(&dd->lock); spin_lock_init(&dd->zone_lock);
@@ -742,6 +749,7 @@ static int dd_init_sched(struct request_queue *q, struct elevator_type *e) blk_queue_flag_set(QUEUE_FLAG_SQ_SCHED, q);
q->elevator = eq; + blk_mq_register_hierarchy(q, STAGE_DEADLINE); return 0;
put_eq: @@ -796,8 +804,10 @@ static bool dd_bio_merge(struct request_queue *q, struct bio *bio, ret = blk_mq_sched_try_merge(q, bio, nr_segs, &free); spin_unlock(&dd->lock);
- if (free) + if (free) { + rq_hierarchy_end_io_acct(free, STAGE_DEADLINE); blk_mq_free_request(free); + }
return ret; } @@ -882,6 +892,8 @@ static void dd_insert_requests(struct blk_mq_hw_ctx *hctx, struct deadline_data *dd = q->elevator->elevator_data; LIST_HEAD(free);
+ rq_list_hierarchy_start_io_acct(list, STAGE_DEADLINE); + spin_lock(&dd->lock); while (!list_empty(list)) { struct request *rq; @@ -892,6 +904,7 @@ static void dd_insert_requests(struct blk_mq_hw_ctx *hctx, } spin_unlock(&dd->lock);
+ rq_list_hierarchy_end_io_acct(&free, STAGE_DEADLINE); blk_mq_free_requests(&free); }
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index cea20cdc351a..78387dd66b51 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -489,6 +489,9 @@ enum stage_group { STAGE_GETTAG, NR_BIO_STAGE_GROUPS, STAGE_PLUG = NR_BIO_STAGE_GROUPS, +#if IS_ENABLED(CONFIG_MQ_IOSCHED_DEADLINE) + STAGE_DEADLINE, +#endif NR_RQ_STAGE_GROUPS, STAGE_BIO = NR_RQ_STAGE_GROUPS, NR_STAGE_GROUPS,
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/release-management/issues/IB4E8P CVE: NA
--------------------------------
Like blk-throttle, the following new debugfs entries will be created for rq-based disks if bfq is enabled:
/sys/kernel/debug/block/sda/blk_io_hierarchy/
|-- bfq
|   |-- io_dump
|   |-- stats
|   `-- threshold
Users can use them to analyze how IO behaves in bfq.
Signed-off-by: Yu Kuai yukuai3@huawei.com --- block/bfq-iosched.c | 11 ++++++++++- block/blk-io-hierarchy/Kconfig | 11 +++++++++++ include/linux/blk_types.h | 3 +++ 3 files changed, 24 insertions(+), 1 deletion(-)
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 05372a78cd51..b350d2c51bfc 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -132,6 +132,7 @@ #include "blk-mq-sched.h" #include "bfq-iosched.h" #include "blk-wbt.h" +#include "blk-io-hierarchy/stats.h"
#define BFQ_BFQQ_FNS(name) \ void bfq_mark_bfqq_##name(struct bfq_queue *bfqq) \ @@ -2476,8 +2477,10 @@ static bool bfq_bio_merge(struct request_queue *q, struct bio *bio, ret = blk_mq_sched_try_merge(q, bio, nr_segs, &free);
spin_unlock_irq(&bfqd->lock); - if (free) + if (free) { + rq_hierarchy_end_io_acct(free, STAGE_BFQ); blk_mq_free_request(free); + }
return ret; } @@ -5322,6 +5325,8 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) idle_timer_disabled ? in_serv_queue : NULL, idle_timer_disabled);
+ if (rq) + rq_hierarchy_end_io_acct(rq, STAGE_BFQ); return rq; }
@@ -6255,6 +6260,7 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, bfqq = bfq_init_rq(rq); if (blk_mq_sched_try_insert_merge(q, rq, &free)) { spin_unlock_irq(&bfqd->lock); + rq_list_hierarchy_end_io_acct(&free, STAGE_BFQ); blk_mq_free_requests(&free); return; } @@ -6297,6 +6303,7 @@ static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx, struct list_head *list, blk_insert_t flags) { + rq_list_hierarchy_start_io_acct(list, STAGE_BFQ); while (!list_empty(list)) { struct request *rq;
@@ -7168,6 +7175,7 @@ static void bfq_exit_queue(struct elevator_queue *e) struct bfq_queue *bfqq, *n; unsigned int actuator;
+ blk_mq_unregister_hierarchy(bfqd->queue, STAGE_BFQ); hrtimer_cancel(&bfqd->idle_slice_timer);
spin_lock_irq(&bfqd->lock); @@ -7385,6 +7393,7 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) set_bit(ELEVATOR_FLAG_DISABLE_WBT, &eq->flags); wbt_disable_default(q->disk); blk_stat_enable_accounting(q); + blk_mq_register_hierarchy(q, STAGE_BFQ);
return 0;
diff --git a/block/blk-io-hierarchy/Kconfig b/block/blk-io-hierarchy/Kconfig index b94f4c69402a..e978fa5c162c 100644 --- a/block/blk-io-hierarchy/Kconfig +++ b/block/blk-io-hierarchy/Kconfig @@ -90,4 +90,15 @@ config HIERARCHY_DEADLINE
If unsure, say N.
+config HIERARCHY_BFQ + bool "Enable hierarchy stats layer bfq" + default n + depends on IOSCHED_BFQ + help + Enabling this lets blk hierarchy stats to record additional information + for bfq. Such information can be helpful to debug performance + and problems like io hang. + + If unsure, say N. + endif diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 78387dd66b51..98df563ef755 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -491,6 +491,9 @@ enum stage_group { STAGE_PLUG = NR_BIO_STAGE_GROUPS, #if IS_ENABLED(CONFIG_MQ_IOSCHED_DEADLINE) STAGE_DEADLINE, +#endif +#if IS_ENABLED(CONFIG_IOSCHED_BFQ) + STAGE_BFQ, #endif NR_RQ_STAGE_GROUPS, STAGE_BIO = NR_RQ_STAGE_GROUPS,
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/release-management/issues/IB4E8P CVE: NA
--------------------------------
Like blk-throttle, the following new debugfs entries will be created for rq-based disks if kyber is enabled:
/sys/kernel/debug/block/sda/blk_io_hierarchy/
|-- kyber
|   |-- io_dump
|   |-- stats
|   `-- threshold
Users can use them to analyze how IO behaves in kyber.
Signed-off-by: Yu Kuai yukuai3@huawei.com --- block/blk-io-hierarchy/Kconfig | 11 +++++++++++ block/kyber-iosched.c | 7 +++++++ include/linux/blk_types.h | 3 +++ 3 files changed, 21 insertions(+)
diff --git a/block/blk-io-hierarchy/Kconfig b/block/blk-io-hierarchy/Kconfig index e978fa5c162c..3cc51933def3 100644 --- a/block/blk-io-hierarchy/Kconfig +++ b/block/blk-io-hierarchy/Kconfig @@ -101,4 +101,15 @@ config HIERARCHY_BFQ
If unsure, say N.
+config HIERARCHY_KYBER + bool "Enable hierarchy stats layer kyber" + default n + depends on MQ_IOSCHED_KYBER + help + Enabling this lets blk hierarchy stats to record additional information + for kyber. Such information can be helpful to debug performance + and problems like io hang. + + If unsure, say N. + endif diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c index 4155594aefc6..299d6edce8ea 100644 --- a/block/kyber-iosched.c +++ b/block/kyber-iosched.c @@ -18,6 +18,7 @@ #include "blk-mq.h" #include "blk-mq-debugfs.h" #include "blk-mq-sched.h" +#include "blk-io-hierarchy/stats.h"
#define CREATE_TRACE_POINTS #include <trace/events/kyber.h> @@ -424,6 +425,7 @@ static int kyber_init_sched(struct request_queue *q, struct elevator_type *e) eq->elevator_data = kqd; q->elevator = eq;
+ blk_mq_register_hierarchy(q, STAGE_KYBER); return 0; }
@@ -432,6 +434,7 @@ static void kyber_exit_sched(struct elevator_queue *e) struct kyber_queue_data *kqd = e->elevator_data; int i;
+ blk_mq_unregister_hierarchy(kqd->q, STAGE_KYBER); timer_shutdown_sync(&kqd->timer); blk_stat_disable_accounting(kqd->q);
@@ -594,6 +597,7 @@ static void kyber_insert_requests(struct blk_mq_hw_ctx *hctx, struct kyber_hctx_data *khd = hctx->sched_data; struct request *rq, *next;
+ rq_list_hierarchy_start_io_acct(rq_list, STAGE_KYBER); list_for_each_entry_safe(rq, next, rq_list, queuelist) { unsigned int sched_domain = kyber_sched_domain(rq->cmd_flags); struct kyber_ctx_queue *kcq = &khd->kcqs[rq->mq_ctx->index_hw[hctx->type]]; @@ -843,6 +847,9 @@ static struct request *kyber_dispatch_request(struct blk_mq_hw_ctx *hctx) rq = NULL; out: spin_unlock(&khd->lock); + + if (rq) + rq_hierarchy_end_io_acct(rq, STAGE_KYBER); return rq; }
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 98df563ef755..95fdc8fc2dc5 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -494,6 +494,9 @@ enum stage_group { #endif #if IS_ENABLED(CONFIG_IOSCHED_BFQ) STAGE_BFQ, +#endif +#if IS_ENABLED(CONFIG_MQ_IOSCHED_KYBER) + STAGE_KYBER, #endif NR_RQ_STAGE_GROUPS, STAGE_BIO = NR_RQ_STAGE_GROUPS,
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/release-management/issues/IB4E8P CVE: NA
--------------------------------
Like blk-throttle, the following new debugfs entries will be created for rq-based disks:
/sys/kernel/debug/block/sda/blk_io_hierarchy/ |-- hctx | |-- io_dump | |-- stats | `-- threshold
Users can use them to analyze how IO behaves in hctx.
Signed-off-by: Yu Kuai yukuai3@huawei.com --- block/blk-flush.c | 2 ++ block/blk-io-hierarchy/Kconfig | 10 ++++++++++ block/blk-mq-sched.c | 2 ++ block/blk-mq.c | 5 +++++ block/blk-sysfs.c | 1 + include/linux/blk_types.h | 1 + 6 files changed, 21 insertions(+)
diff --git a/block/blk-flush.c b/block/blk-flush.c index 4628a9ee1904..354ed5d2a853 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -185,6 +185,7 @@ static void blk_flush_complete_seq(struct request *rq, if (list_empty(pending)) fq->flush_pending_since = jiffies; list_add_tail(&rq->queuelist, pending); + rq_hierarchy_start_io_acct(rq, STAGE_HCTX); break;
case REQ_FSEQ_DATA: @@ -264,6 +265,7 @@ static enum rq_end_io_ret flush_end_io(struct request *flush_rq,
BUG_ON(seq != REQ_FSEQ_PREFLUSH && seq != REQ_FSEQ_POSTFLUSH); list_del_init(&rq->queuelist); + rq_hierarchy_end_io_acct(rq, STAGE_HCTX); blk_flush_complete_seq(rq, fq, seq, error); }
diff --git a/block/blk-io-hierarchy/Kconfig b/block/blk-io-hierarchy/Kconfig index 3cc51933def3..6a36f7b83f51 100644 --- a/block/blk-io-hierarchy/Kconfig +++ b/block/blk-io-hierarchy/Kconfig @@ -112,4 +112,14 @@ config HIERARCHY_KYBER
If unsure, say N.
+config HIERARCHY_HCTX + bool "Enable hierarchy stats layer hctx" + default n + help + Enabling this lets blk hierarchy stats to record additional information + for hctx. Such information can be helpful to debug performance + and problems like io hang. + + If unsure, say N. + endif diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 67c95f31b15b..7b48630b63a7 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -15,6 +15,7 @@ #include "blk-mq-debugfs.h" #include "blk-mq-sched.h" #include "blk-wbt.h" +#include "blk-io-hierarchy/stats.h"
/* * Mark a hardware queue as needing a restart. @@ -298,6 +299,7 @@ static int __blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) */ if (!list_empty(&rq_list)) { blk_mq_sched_mark_restart_hctx(hctx); + rq_list_hierarchy_end_io_acct(&rq_list, STAGE_HCTX); if (!blk_mq_dispatch_rq_list(hctx, &rq_list, 0)) return 0; need_dispatch = true; diff --git a/block/blk-mq.c b/block/blk-mq.c index bf1329ada77b..9e67d097e29b 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2147,6 +2147,7 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list, if (nr_budgets) blk_mq_release_budgets(q, list);
+ rq_list_hierarchy_start_io_acct(list, STAGE_HCTX); spin_lock(&hctx->lock); list_splice_tail_init(list, &hctx->dispatch); spin_unlock(&hctx->lock); @@ -2508,6 +2509,7 @@ static void blk_mq_request_bypass_insert(struct request *rq, blk_insert_t flags) { struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
+ rq_hierarchy_start_io_acct(rq, STAGE_HCTX); spin_lock(&hctx->lock); if (flags & BLK_MQ_INSERT_AT_HEAD) list_add(&rq->queuelist, &hctx->dispatch); @@ -2815,6 +2817,7 @@ static void blk_mq_dispatch_plug_list(struct blk_plug *plug, bool from_sched) percpu_ref_get(&this_hctx->queue->q_usage_counter); /* passthrough requests should never be issued to the I/O scheduler */ if (is_passthrough) { + rq_list_hierarchy_start_io_acct(&list, STAGE_HCTX); spin_lock(&this_hctx->lock); list_splice_tail_init(&list, &this_hctx->dispatch); spin_unlock(&this_hctx->lock); @@ -3618,6 +3621,7 @@ static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node) if (list_empty(&tmp)) return 0;
+ rq_list_hierarchy_start_io_acct(&tmp, STAGE_HCTX); spin_lock(&hctx->lock); list_splice_tail_init(&tmp, &hctx->dispatch); spin_unlock(&hctx->lock); @@ -4388,6 +4392,7 @@ static void blk_mq_unregister_default_hierarchy(struct request_queue *q) { blk_mq_unregister_hierarchy(q, STAGE_GETTAG); blk_mq_unregister_hierarchy(q, STAGE_PLUG); + blk_mq_unregister_hierarchy(q, STAGE_HCTX); }
/* tags can _not_ be used after returning from blk_mq_exit_queue */ diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 1539a137502e..428db09db8e8 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -823,6 +823,7 @@ static void blk_mq_register_default_hierarchy(struct request_queue *q) { blk_mq_register_hierarchy(q, STAGE_GETTAG); blk_mq_register_hierarchy(q, STAGE_PLUG); + blk_mq_register_hierarchy(q, STAGE_HCTX); }
/** diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 95fdc8fc2dc5..898dc10c99e2 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -498,6 +498,7 @@ enum stage_group { #if IS_ENABLED(CONFIG_MQ_IOSCHED_KYBER) STAGE_KYBER, #endif + STAGE_HCTX, NR_RQ_STAGE_GROUPS, STAGE_BIO = NR_RQ_STAGE_GROUPS, NR_STAGE_GROUPS,
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/release-management/issues/IB4E8P CVE: NA
--------------------------------
Like blk-throttle, the following new debugfs entries will be created for rq-based disks:
/sys/kernel/debug/block/sda/blk_io_hierarchy/ |-- requeue | |-- io_dump | |-- stats | `-- threshold
Users can use them to analyze how IO behaves in requeue.
Signed-off-by: Yu Kuai yukuai3@huawei.com --- block/blk-flush.c | 2 ++ block/blk-io-hierarchy/Kconfig | 10 ++++++++++ block/blk-mq.c | 5 +++++ block/blk-sysfs.c | 1 + include/linux/blk_types.h | 1 + 5 files changed, 19 insertions(+)
diff --git a/block/blk-flush.c b/block/blk-flush.c index 354ed5d2a853..4033518e3ffe 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -190,6 +190,7 @@ static void blk_flush_complete_seq(struct request *rq,
case REQ_FSEQ_DATA: fq->flush_data_in_flight++; + rq_hierarchy_start_io_acct(rq, STAGE_REQUEUE); spin_lock(&q->requeue_lock); list_move(&rq->queuelist, &q->requeue_list); spin_unlock(&q->requeue_lock); @@ -356,6 +357,7 @@ static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq, smp_wmb(); req_ref_set(flush_rq, 1);
+ rq_hierarchy_start_io_acct(flush_rq, STAGE_REQUEUE); spin_lock(&q->requeue_lock); list_add_tail(&flush_rq->queuelist, &q->flush_list); spin_unlock(&q->requeue_lock); diff --git a/block/blk-io-hierarchy/Kconfig b/block/blk-io-hierarchy/Kconfig index 6a36f7b83f51..6b85c5bc5fb6 100644 --- a/block/blk-io-hierarchy/Kconfig +++ b/block/blk-io-hierarchy/Kconfig @@ -122,4 +122,14 @@ config HIERARCHY_HCTX
If unsure, say N.
+config HIERARCHY_REQUEUE + bool "Enable hierarchy stats layer requeue" + default n + help + Enabling this lets blk hierarchy stats to record additional information + for requeue. Such information can be helpful to debug performance + and problems like io hang. + + If unsure, say N. + endif diff --git a/block/blk-mq.c b/block/blk-mq.c index 9e67d097e29b..f68d8d7a2e69 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1474,6 +1474,7 @@ void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list) /* this request will be re-inserted to io scheduler queue */ blk_mq_sched_requeue_request(rq);
+ rq_hierarchy_start_io_acct(rq, STAGE_REQUEUE); spin_lock_irqsave(&q->requeue_lock, flags); list_add_tail(&rq->queuelist, &q->requeue_list); spin_unlock_irqrestore(&q->requeue_lock, flags); @@ -1496,6 +1497,9 @@ static void blk_mq_requeue_work(struct work_struct *work) list_splice_init(&q->flush_list, &flush_list); spin_unlock_irq(&q->requeue_lock);
+ rq_list_hierarchy_end_io_acct(&rq_list, STAGE_REQUEUE); + rq_list_hierarchy_end_io_acct(&flush_list, STAGE_REQUEUE); + while (!list_empty(&rq_list)) { rq = list_entry(rq_list.next, struct request, queuelist); /* @@ -4393,6 +4397,7 @@ static void blk_mq_unregister_default_hierarchy(struct request_queue *q) blk_mq_unregister_hierarchy(q, STAGE_GETTAG); blk_mq_unregister_hierarchy(q, STAGE_PLUG); blk_mq_unregister_hierarchy(q, STAGE_HCTX); + blk_mq_unregister_hierarchy(q, STAGE_REQUEUE); }
/* tags can _not_ be used after returning from blk_mq_exit_queue */ diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 428db09db8e8..141f32a62022 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -824,6 +824,7 @@ static void blk_mq_register_default_hierarchy(struct request_queue *q) blk_mq_register_hierarchy(q, STAGE_GETTAG); blk_mq_register_hierarchy(q, STAGE_PLUG); blk_mq_register_hierarchy(q, STAGE_HCTX); + blk_mq_register_hierarchy(q, STAGE_REQUEUE); }
/** diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 898dc10c99e2..fe6569942370 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -499,6 +499,7 @@ enum stage_group { STAGE_KYBER, #endif STAGE_HCTX, + STAGE_REQUEUE, NR_RQ_STAGE_GROUPS, STAGE_BIO = NR_RQ_STAGE_GROUPS, NR_STAGE_GROUPS,
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/release-management/issues/IB4E8P CVE: NA
--------------------------------
Like blk-throttle, the following new debugfs entries will be created for rq-based disks:
/sys/kernel/debug/block/sda/blk_io_hierarchy/ |-- rq_driver | |-- io_dump | |-- stats | `-- threshold
Users can use them to analyze how IO behaves in rq_driver.
Signed-off-by: Yu Kuai yukuai3@huawei.com --- block/blk-io-hierarchy/Kconfig | 10 ++++++++++ block/blk-mq.c | 14 ++++++++++++++ block/blk-sysfs.c | 1 + include/linux/blk_types.h | 1 + 4 files changed, 26 insertions(+)
diff --git a/block/blk-io-hierarchy/Kconfig b/block/blk-io-hierarchy/Kconfig index 6b85c5bc5fb6..0761a4bfb95d 100644 --- a/block/blk-io-hierarchy/Kconfig +++ b/block/blk-io-hierarchy/Kconfig @@ -132,4 +132,14 @@ config HIERARCHY_REQUEUE
If unsure, say N.
+config HIERARCHY_RQ_DRIVER + bool "Enable hierarchy stats layer rq_driver" + default n + help + Enabling this lets blk hierarchy stats to record additional information + for rq_driver. Such information can be helpful to debug performance + and problems like io hang. + + If unsure, say N. + endif diff --git a/block/blk-mq.c b/block/blk-mq.c index f68d8d7a2e69..3c4b0ea44c00 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1058,6 +1058,13 @@ inline void __blk_mq_end_request(struct request *rq, blk_status_t error)
blk_mq_finish_request(rq);
+ /* + * Avoid accounting flush request with data twice and request that is + * not started. + */ + if (blk_mq_request_started(rq) && !blk_rq_hierarchy_is_flush_done(rq)) + rq_hierarchy_end_io_acct(rq, STAGE_RQ_DRIVER); + if (rq->end_io) { rq_qos_done(rq->q, rq); if (rq->end_io(rq, error) == RQ_END_IO_FREE) @@ -1116,6 +1123,10 @@ void blk_mq_end_request_batch(struct io_comp_batch *iob)
rq_qos_done(rq->q, rq);
+ /* Avoid accounting flush request with data twice. */ + if (!blk_rq_hierarchy_is_flush_done(rq)) + rq_hierarchy_end_io_acct(rq, STAGE_RQ_DRIVER); + /* * If end_io handler returns NONE, then it still has * ownership of the request. @@ -1269,6 +1280,7 @@ void blk_mq_start_request(struct request *rq) struct request_queue *q = rq->q;
trace_block_rq_issue(rq); + rq_hierarchy_start_io_acct(rq, STAGE_RQ_DRIVER);
if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags) && !blk_rq_is_passthrough(rq)) { @@ -1461,6 +1473,7 @@ static void __blk_mq_requeue_request(struct request *rq) if (blk_mq_request_started(rq)) { WRITE_ONCE(rq->state, MQ_RQ_IDLE); rq->rq_flags &= ~RQF_TIMED_OUT; + rq_hierarchy_end_io_acct(rq, STAGE_RQ_DRIVER); } }
@@ -4398,6 +4411,7 @@ static void blk_mq_unregister_default_hierarchy(struct request_queue *q) blk_mq_unregister_hierarchy(q, STAGE_PLUG); blk_mq_unregister_hierarchy(q, STAGE_HCTX); blk_mq_unregister_hierarchy(q, STAGE_REQUEUE); + blk_mq_unregister_hierarchy(q, STAGE_RQ_DRIVER); }
/* tags can _not_ be used after returning from blk_mq_exit_queue */ diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 141f32a62022..866449af8681 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -825,6 +825,7 @@ static void blk_mq_register_default_hierarchy(struct request_queue *q) blk_mq_register_hierarchy(q, STAGE_PLUG); blk_mq_register_hierarchy(q, STAGE_HCTX); blk_mq_register_hierarchy(q, STAGE_REQUEUE); + blk_mq_register_hierarchy(q, STAGE_RQ_DRIVER); }
/** diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index fe6569942370..1f8b62f663a1 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -500,6 +500,7 @@ enum stage_group { #endif STAGE_HCTX, STAGE_REQUEUE, + STAGE_RQ_DRIVER, NR_RQ_STAGE_GROUPS, STAGE_BIO = NR_RQ_STAGE_GROUPS, NR_STAGE_GROUPS,
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/release-management/issues/IB4E8P CVE: NA
--------------------------------
Like blk-throttle, the following new debugfs entries will be created for rq-based disks:
/sys/kernel/debug/block/sda/blk_io_hierarchy/ |-- bio | |-- io_dump | |-- stats | `-- threshold
Users can use them to analyze how bios behave.
Signed-off-by: Yu Kuai yukuai3@huawei.com --- block/bio.c | 2 ++ block/blk-core.c | 7 +++++++ block/blk-io-hierarchy/Kconfig | 11 +++++++++++ block/blk-mq.c | 14 ++++++++++++-- block/blk-sysfs.c | 1 + 5 files changed, 33 insertions(+), 2 deletions(-)
diff --git a/block/bio.c b/block/bio.c index d64b0da22e38..6f142d4c09ea 100644 --- a/block/bio.c +++ b/block/bio.c @@ -24,6 +24,7 @@ #include "blk.h" #include "blk-rq-qos.h" #include "blk-cgroup.h" +#include "blk-io-hierarchy/stats.h"
#define ALLOC_CACHE_THRESHOLD 16 #define ALLOC_CACHE_MAX 256 @@ -230,6 +231,7 @@ void bio_uninit(struct bio *bio) bio->pid = NULL; } #endif + bio_hierarchy_end(bio); } EXPORT_SYMBOL(bio_uninit);
diff --git a/block/blk-core.c b/block/blk-core.c index 0ba11c853051..80ea58299ef6 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -50,6 +50,7 @@ #include "blk-cgroup.h" #include "blk-throttle.h" #include "blk-ioprio.h" +#include "blk-io-hierarchy/stats.h"
struct dentry *blk_debugfs_root;
@@ -835,6 +836,12 @@ void submit_bio_noacct(struct bio *bio) break; }
+ /* + * On the one hand REQ_PREFLUSH | REQ_FUA can be cleared above, on the + * other hand it doesn't make sense to count invalid bio. Split bio will + * be accounted separately. + */ + bio_hierarchy_start(bio); if (blk_throtl_bio(bio)) return; submit_bio_noacct_nocheck(bio); diff --git a/block/blk-io-hierarchy/Kconfig b/block/blk-io-hierarchy/Kconfig index 0761a4bfb95d..ae8adf238059 100644 --- a/block/blk-io-hierarchy/Kconfig +++ b/block/blk-io-hierarchy/Kconfig @@ -142,4 +142,15 @@ config HIERARCHY_RQ_DRIVER
If unsure, say N.
+config HIERARCHY_BIO + bool "Support to record stats for bio lifetime" + default n + select BLK_BIO_ALLOC_TIME + help + Enabling this lets blk hierarchy stats to record additional information + for bio. Such information can be helpful to debug performance + and problems like io hang. + + If unsure, say N. + endif diff --git a/block/blk-mq.c b/block/blk-mq.c index 3c4b0ea44c00..7a9c9e4dc50e 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -790,8 +790,10 @@ static void req_bio_endio(struct request *rq, struct bio *bio, if (unlikely(rq->rq_flags & RQF_QUIET)) bio_set_flag(bio, BIO_QUIET); /* don't actually finish bio if it's part of flush sequence */ - if (bio->bi_iter.bi_size == 0 && !(rq->rq_flags & RQF_FLUSH_SEQ)) + if (bio->bi_iter.bi_size == 0 && !(rq->rq_flags & RQF_FLUSH_SEQ)) { + req_bio_hierarchy_end(rq, bio); bio_endio(bio); + } }
static void blk_account_io_completion(struct request *req, unsigned int bytes) @@ -856,8 +858,10 @@ static void blk_complete_request(struct request *req) if (req_op(req) == REQ_OP_ZONE_APPEND) bio->bi_iter.bi_sector = req->__sector;
- if (!is_flush) + if (!is_flush) { + req_bio_hierarchy_end(req, bio); bio_endio(bio); + } bio = next; } while (bio);
@@ -1115,6 +1119,7 @@ void blk_mq_end_request_batch(struct io_comp_batch *iob) prefetch(rq->bio); prefetch(rq->rq_next);
+ rq->io_end_time_ns = now; blk_complete_request(rq); if (iob->need_ts) __blk_mq_end_request_acct(rq, now); @@ -3043,6 +3048,8 @@ void blk_mq_submit_bio(struct bio *bio) bio = __bio_split_to_limits(bio, &q->limits, &nr_segs); if (!bio) return; + /* account for split bio. */ + bio_hierarchy_start(bio); } if (!bio_integrity_prep(bio)) return; @@ -3058,6 +3065,8 @@ void blk_mq_submit_bio(struct bio *bio) bio = __bio_split_to_limits(bio, &q->limits, &nr_segs); if (!bio) goto fail; + /* account for split bio. */ + bio_hierarchy_start(bio); } if (!bio_integrity_prep(bio)) goto fail; @@ -4412,6 +4421,7 @@ static void blk_mq_unregister_default_hierarchy(struct request_queue *q) blk_mq_unregister_hierarchy(q, STAGE_HCTX); blk_mq_unregister_hierarchy(q, STAGE_REQUEUE); blk_mq_unregister_hierarchy(q, STAGE_RQ_DRIVER); + blk_mq_unregister_hierarchy(q, STAGE_BIO); }
/* tags can _not_ be used after returning from blk_mq_exit_queue */ diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 866449af8681..7a38a1d5dceb 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -826,6 +826,7 @@ static void blk_mq_register_default_hierarchy(struct request_queue *q) blk_mq_register_hierarchy(q, STAGE_HCTX); blk_mq_register_hierarchy(q, STAGE_REQUEUE); blk_mq_register_hierarchy(q, STAGE_RQ_DRIVER); + blk_mq_register_hierarchy(q, STAGE_BIO); }
/**
反馈: 您发送到kernel@openeuler.org的补丁/补丁集,已成功转换为PR! PR链接地址: https://gitee.com/openeuler/kernel/pulls/13893 邮件列表地址:https://mailweb.openeuler.org/hyperkitty/list/kernel@openeuler.org/message/S...
FeedBack: The patch(es) which you have sent to kernel@openeuler.org mailing list has been converted to a pull request successfully! Pull request link: https://gitee.com/openeuler/kernel/pulls/13893 Mailing list address: https://mailweb.openeuler.org/hyperkitty/list/kernel@openeuler.org/message/S...