[PATCH openEuler-1.0-LTS] drm/nouveau/dispnv04: fix null pointer dereference in nv17_tv_get_ld_modes
by Yuan Can 05 Aug '24
From: Ma Ke <make24(a)iscas.ac.cn>
stable inclusion
from stable-v4.19.317
commit 9289cd3450d1da3e271ef4b054d4d2932c41243e
category: bugfix
bugzilla: https://gitee.com/src-openeuler/kernel/issues/IAGENV
CVE: CVE-2024-41095
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id…
--------------------------------
commit 66edf3fb331b6c55439b10f9862987b0916b3726 upstream.
In nv17_tv_get_ld_modes(), the return value of drm_mode_duplicate() is
assigned to mode, which will lead to a possible NULL pointer dereference
on failure of drm_mode_duplicate(). Add a check to avoid npd.
Cc: stable(a)vger.kernel.org
Signed-off-by: Ma Ke <make24(a)iscas.ac.cn>
Signed-off-by: Lyude Paul <lyude(a)redhat.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20240625081828.2620794-1-make…
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Signed-off-by: Yuan Can <yuancan(a)huawei.com>
---
drivers/gpu/drm/nouveau/dispnv04/tvnv17.c | 2 ++
1 file changed, 2 insertions(+)
diff --git a/drivers/gpu/drm/nouveau/dispnv04/tvnv17.c b/drivers/gpu/drm/nouveau/dispnv04/tvnv17.c
index 8fd8124d72ba..a01613ad1608 100644
--- a/drivers/gpu/drm/nouveau/dispnv04/tvnv17.c
+++ b/drivers/gpu/drm/nouveau/dispnv04/tvnv17.c
@@ -208,6 +208,8 @@ static int nv17_tv_get_ld_modes(struct drm_encoder *encoder,
struct drm_display_mode *mode;
mode = drm_mode_duplicate(encoder->dev, tv_mode);
+ if (!mode)
+ continue;
mode->clock = tv_norm->tv_enc_mode.vrefresh *
mode->htotal / 1000 *
--
2.17.1
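
The hunk above is truncated by the archive. Below is a minimal C sketch of the pattern after the fix, not the verbatim nouveau source: the candidate-mode table, the loop bounds and the count bookkeeping are assumptions; only the NULL check on drm_mode_duplicate() mirrors the patch.

/*
 * Illustrative sketch only (assumed context, not the driver as merged):
 * duplicate each candidate TV mode and skip it when drm_mode_duplicate()
 * returns NULL, instead of dereferencing the NULL pointer.
 */
static int example_get_ld_modes(struct drm_encoder *encoder,
				struct drm_connector *connector,
				const struct drm_display_mode *candidates,
				int n_candidates)
{
	int i, count = 0;

	for (i = 0; i < n_candidates; i++) {
		struct drm_display_mode *mode;

		mode = drm_mode_duplicate(encoder->dev, &candidates[i]);
		if (!mode)
			continue;	/* allocation failed: skip this mode */

		/* ... adjust mode->clock etc. as in the patched function ... */

		drm_mode_probed_add(connector, mode);
		count++;
	}

	return count;
}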
hulk inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/IAGRKP
CVE: NA
--------------------------------
Signed-off-by: Yu Kuai <yukuai3(a)huawei.com>
---
block/Kconfig | 2 +
block/Makefile | 1 +
block/bfq-iosched.c | 11 +-
block/blk-core.c | 15 +
block/blk-flush.c | 5 +
block/blk-io-hierarchy/Kconfig | 156 +++++++
block/blk-io-hierarchy/Makefile | 8 +
block/blk-io-hierarchy/debugfs.c | 230 ++++++++++
block/blk-io-hierarchy/iodump.c | 753 +++++++++++++++++++++++++++++++
block/blk-io-hierarchy/iodump.h | 100 ++++
block/blk-io-hierarchy/stats.c | 331 ++++++++++++++
block/blk-io-hierarchy/stats.h | 323 +++++++++++++
block/blk-mq-debugfs.c | 16 +-
block/blk-mq-debugfs.h | 8 +
block/blk-mq-sched.c | 7 +-
block/blk-mq-tag.c | 13 +-
block/blk-mq.c | 51 ++-
block/blk-mq.h | 36 +-
block/blk-sysfs.c | 16 +
block/blk-throttle.c | 21 +
block/blk-wbt.c | 12 +-
block/blk.h | 58 +++
block/kyber-iosched.c | 8 +-
block/mq-deadline.c | 15 +-
include/linux/blk_types.h | 41 +-
include/linux/blkdev.h | 9 +
26 files changed, 2202 insertions(+), 44 deletions(-)
create mode 100644 block/blk-io-hierarchy/Kconfig
create mode 100644 block/blk-io-hierarchy/Makefile
create mode 100644 block/blk-io-hierarchy/debugfs.c
create mode 100644 block/blk-io-hierarchy/iodump.c
create mode 100644 block/blk-io-hierarchy/iodump.h
create mode 100644 block/blk-io-hierarchy/stats.c
create mode 100644 block/blk-io-hierarchy/stats.h
diff --git a/block/Kconfig b/block/Kconfig
index da71e56f8682..770cd3fa1367 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -213,6 +213,8 @@ config BLK_BIO_DISPATCH_ASYNC
feature will require special care in the driver to work. If unsure,
say N here.
+source "block/blk-io-hierarchy/Kconfig"
+
menu "Partition Types"
source "block/partitions/Kconfig"
diff --git a/block/Makefile b/block/Makefile
index 572b33f32c07..bb711b0c307a 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -37,3 +37,4 @@ obj-$(CONFIG_BLK_WBT) += blk-wbt.o
obj-$(CONFIG_BLK_DEBUG_FS) += blk-mq-debugfs.o
obj-$(CONFIG_BLK_DEBUG_FS_ZONED)+= blk-mq-debugfs-zoned.o
obj-$(CONFIG_BLK_SED_OPAL) += sed-opal.o
+obj-$(CONFIG_BLK_IO_HIERARCHY_STATS) += blk-io-hierarchy/
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 473d9e31ff87..2cb1bca71d39 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -140,6 +140,7 @@
#include "blk-mq-sched.h"
#include "bfq-iosched.h"
#include "blk-wbt.h"
+#include "blk-io-hierarchy/stats.h"
#define BFQ_BFQQ_FNS(name) \
void bfq_mark_bfqq_##name(struct bfq_queue *bfqq) \
@@ -1882,8 +1883,10 @@ static bool bfq_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio)
ret = blk_mq_sched_try_merge(q, bio, &free);
spin_unlock_irq(&bfqd->lock);
- if (free)
+ if (free) {
+ rq_hierarchy_end_io_acct(free, STAGE_BFQ);
blk_mq_free_request(free);
+ }
return ret;
}
@@ -4168,6 +4171,8 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
idle_timer_disabled ? in_serv_queue : NULL,
idle_timer_disabled);
+ if (rq)
+ rq_hierarchy_end_io_acct(rq, STAGE_BFQ);
return rq;
}
@@ -4750,6 +4755,7 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
spin_lock_irq(&bfqd->lock);
if (blk_mq_sched_try_insert_merge(q, rq, &free)) {
spin_unlock_irq(&bfqd->lock);
+ rq_list_hierarchy_end_io_acct(&free, STAGE_BFQ);
blk_mq_free_requests(&free);
return;
}
@@ -4797,6 +4803,7 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx,
struct list_head *list, bool at_head)
{
+ rq_list_hierarchy_start_io_acct(list, STAGE_BFQ);
while (!list_empty(list)) {
struct request *rq;
@@ -5394,6 +5401,7 @@ static void bfq_exit_queue(struct elevator_queue *e)
struct bfq_queue *bfqq, *n;
struct request_queue *q = bfqd->queue;
+ blk_mq_unregister_hierarchy(q, STAGE_BFQ);
hrtimer_cancel(&bfqd->idle_slice_timer);
spin_lock_irq(&bfqd->lock);
@@ -5560,6 +5568,7 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group);
wbt_disable_default(q);
+ blk_mq_register_hierarchy(q, STAGE_BFQ);
return 0;
out_free:
diff --git a/block/blk-core.c b/block/blk-core.c
index acf5585b0557..03b8c2367164 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -43,6 +43,7 @@
#include "blk-mq.h"
#include "blk-mq-sched.h"
#include "blk-rq-qos.h"
+#include "blk-io-hierarchy/stats.h"
#ifdef CONFIG_DEBUG_FS
struct dentry *blk_debugfs_root;
@@ -1001,6 +1002,15 @@ void blk_exit_queue(struct request_queue *q)
bdi_put(q->backing_dev_info);
}
+static void blk_mq_unregister_default_hierarchy(struct request_queue *q)
+{
+ blk_mq_unregister_hierarchy(q, STAGE_GETTAG);
+ blk_mq_unregister_hierarchy(q, STAGE_PLUG);
+ blk_mq_unregister_hierarchy(q, STAGE_HCTX);
+ blk_mq_unregister_hierarchy(q, STAGE_REQUEUE);
+ blk_mq_unregister_hierarchy(q, STAGE_RQ_DRIVER);
+}
+
/**
* blk_cleanup_queue - shutdown a request queue
* @q: request queue to shutdown
@@ -1088,6 +1098,7 @@ void blk_cleanup_queue(struct request_queue *q)
blk_exit_queue(q);
if (q->mq_ops) {
+ blk_mq_unregister_default_hierarchy(q);
blk_mq_cancel_work_sync(q);
blk_mq_exit_queue(q);
}
@@ -3919,6 +3930,8 @@ void blk_start_plug(struct blk_plug *plug)
INIT_LIST_HEAD(&plug->list);
INIT_LIST_HEAD(&plug->mq_list);
INIT_LIST_HEAD(&plug->cb_list);
+ plug->cur_ktime = 0;
+
/*
* Store ordering should not be needed here, since a potential
* preempt will imply a full memory barrier
@@ -4060,6 +4073,8 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
*/
if (q)
queue_unplugged(q, depth, from_schedule);
+
+ plug->cur_ktime = 0;
}
void blk_finish_plug(struct blk_plug *plug)
diff --git a/block/blk-flush.c b/block/blk-flush.c
index c1bfcde165af..384fce3b6bf6 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -75,6 +75,7 @@
#include "blk-mq.h"
#include "blk-mq-tag.h"
#include "blk-mq-sched.h"
+#include "blk-io-hierarchy/stats.h"
/* PREFLUSH/FUA sequences */
enum {
@@ -187,6 +188,7 @@ static bool blk_flush_complete_seq(struct request *rq,
if (list_empty(pending))
fq->flush_pending_since = jiffies;
list_move_tail(&rq->flush.list, pending);
+ rq_hierarchy_start_io_acct(rq, STAGE_HCTX);
break;
case REQ_FSEQ_DATA:
@@ -245,6 +247,7 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error)
* avoiding use-after-free.
*/
WRITE_ONCE(flush_rq->state, MQ_RQ_IDLE);
+ blk_mq_put_alloc_task(flush_rq);
if (fq->rq_status != BLK_STS_OK) {
error = fq->rq_status;
fq->rq_status = BLK_STS_OK;
@@ -274,6 +277,7 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error)
unsigned int seq = blk_flush_cur_seq(rq);
BUG_ON(seq != REQ_FSEQ_PREFLUSH && seq != REQ_FSEQ_POSTFLUSH);
+ rq_hierarchy_end_io_acct(rq, STAGE_HCTX);
queued |= blk_flush_complete_seq(rq, fq, seq, error);
}
@@ -377,6 +381,7 @@ static bool blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq,
flush_rq->rq_flags |= RQF_FLUSH_SEQ;
flush_rq->rq_disk = first_rq->rq_disk;
flush_rq->end_io = flush_end_io;
+ blk_mq_get_alloc_task(flush_rq, first_rq->bio);
/*
* Order WRITE ->end_io and WRITE rq->ref, and its pair is the one
diff --git a/block/blk-io-hierarchy/Kconfig b/block/blk-io-hierarchy/Kconfig
new file mode 100644
index 000000000000..ce72d0593fce
--- /dev/null
+++ b/block/blk-io-hierarchy/Kconfig
@@ -0,0 +1,156 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+menuconfig BLK_IO_HIERARCHY_STATS
+ bool "Enable hierarchy io stats"
+ default n
+ depends on BLK_DEBUG_FS=y
+ help
+ Enabling this lets the block layer record additional information
+ in different io stages. Such information can be helpful for debugging
+ performance issues and problems like io hang.
+
+ If unsure, say N.
+
+if BLK_IO_HIERARCHY_STATS
+
+config HIERARCHY_BIO
+ bool "Support to record stats for bio lifetime"
+ default n
+ select BLK_BIO_ALLOC_TIME
+ help
+ Enabling this lets blk hierarchy stats record additional information
+ for bio. Such information can be helpful for debugging performance
+ issues and problems like io hang.
+
+ If unsure, say N.
+
+config HIERARCHY_IO_DUMP
+ bool "Support to dump io that is throttled"
+ default n
+ select BLK_BIO_ALLOC_TIME
+ select BLK_BIO_ALLOC_TASK
+ depends on BLK_DEV_IO_TRACE
+ help
+ Enabling this will create new debugfs entries that show the user detailed
+ information about IOs that have been submitted but are not done yet; the
+ user can filter the result by IO stage or IO latency.
+
+ If unsure, say N.
+
+config HIERARCHY_THROTTLE
+ bool "Enable hierarchy stats layer blk-throttle"
+ default n
+ depends on BLK_DEV_THROTTLING=y
+ help
+ Enabling this lets blk hierarchy stats record additional information
+ for blk-throttle. Such information can be helpful for debugging
+ performance issues and problems like io hang.
+
+ If unsure, say N.
+
+config HIERARCHY_WBT
+ bool "Enable hierarchy stats layer blk-wbt"
+ default n
+ depends on BLK_WBT
+ help
+ Enabling this lets blk hierarchy stats record additional information
+ for blk-wbt. Such information can be helpful for debugging performance
+ issues and problems like io hang.
+
+ If unsure, say N.
+
+config HIERARCHY_IOCOST
+ bool "Enable hierarchy stats layer blk-iocost"
+ default n
+ depends on BLK_CGROUP_IOCOST
+ help
+ Enabling this lets blk hierarchy stats record additional information
+ for blk-iocost. Such information can be helpful for debugging
+ performance issues and problems like io hang.
+
+ If unsure, say N.
+
+config HIERARCHY_GETTAG
+ bool "Enable hierarchy stats layer get-tag"
+ default n
+ help
+ Enabling this lets blk hierarchy stats record additional information
+ for tag allocation. Such information can be helpful for debugging
+ performance issues and problems like io hang.
+
+ If unsure, say N.
+
+config HIERARCHY_PLUG
+ bool "Enable hierarchy stats layer plug"
+ default n
+ help
+ Enabling this lets blk hierarchy stats record additional information
+ for plug. Such information can be helpful for debugging performance
+ issues and problems like io hang.
+
+ If unsure, say N.
+
+config HIERARCHY_DEADLINE
+ bool "Enable hierarchy stats layer mq-deadline"
+ default n
+ depends on MQ_IOSCHED_DEADLINE
+ help
+ Enabling this lets blk hierarchy stats record additional information
+ for mq-deadline. Such information can be helpful for debugging
+ performance issues and problems like io hang.
+
+ If unsure, say N.
+
+config HIERARCHY_BFQ
+ bool "Enable hierarchy stats layer bfq"
+ default n
+ depends on IOSCHED_BFQ
+ help
+ Enabling this lets blk hierarchy stats record additional information
+ for bfq. Such information can be helpful for debugging performance
+ issues and problems like io hang.
+
+ If unsure, say N.
+
+config HIERARCHY_KYBER
+ bool "Enable hierarchy stats layer kyber"
+ default n
+ depends on MQ_IOSCHED_KYBER
+ help
+ Enabling this lets blk hierarchy stats record additional information
+ for kyber. Such information can be helpful for debugging performance
+ issues and problems like io hang.
+
+ If unsure, say N.
+
+config HIERARCHY_HCTX
+ bool "Enable hierarchy stats layer hctx"
+ default n
+ help
+ Enabling this lets blk hierarchy stats record additional information
+ for hctx. Such information can be helpful for debugging performance
+ issues and problems like io hang.
+
+ If unsure, say N.
+
+config HIERARCHY_REQUEUE
+ bool "Enable hierarchy stats layer requeue"
+ default n
+ help
+ Enabling this lets blk hierarchy stats record additional information
+ for requeue. Such information can be helpful for debugging performance
+ issues and problems like io hang.
+
+ If unsure, say N.
+
+config HIERARCHY_RQ_DRIVER
+ bool "Enable hierarchy stats layer rq_driver"
+ default n
+ help
+ Enabling this lets blk hierarchy stats record additional information
+ for rq_driver. Such information can be helpful for debugging performance
+ issues and problems like io hang.
+
+ If unsure, say N.
+
+endif
diff --git a/block/blk-io-hierarchy/Makefile b/block/blk-io-hierarchy/Makefile
new file mode 100644
index 000000000000..9b989d379e58
--- /dev/null
+++ b/block/blk-io-hierarchy/Makefile
@@ -0,0 +1,8 @@
+#
+# Make file for blk_io_hierarchy_stats
+#
+
+obj-$(CONFIG_BLK_IO_HIERARCHY_STATS) += blk_io_hierarchy_stats.o
+
+blk_io_hierarchy_stats-y := stats.o debugfs.o
+obj-$(CONFIG_HIERARCHY_IO_DUMP) += iodump.o
diff --git a/block/blk-io-hierarchy/debugfs.c b/block/blk-io-hierarchy/debugfs.c
new file mode 100644
index 000000000000..cb7ff2866c49
--- /dev/null
+++ b/block/blk-io-hierarchy/debugfs.c
@@ -0,0 +1,230 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2023. Huawei Technologies Co., Ltd. All rights reserved.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/debugfs.h>
+#include <linux/blkdev.h>
+
+#include "../blk-mq-debugfs.h"
+#include "stats.h"
+#include "iodump.h"
+
+static const char *stage_name[NR_STAGE_GROUPS] = {
+#ifdef CONFIG_HIERARCHY_THROTTLE
+ [STAGE_THROTTLE] = "throtl",
+#endif
+#ifdef CONFIG_HIERARCHY_WBT
+ [STAGE_WBT] = "wbt",
+#endif
+#ifdef CONFIG_HIERARCHY_IOCOST
+ [STAGE_IOCOST] = "iocost",
+#endif
+#ifdef CONFIG_HIERARCHY_GETTAG
+ [STAGE_GETTAG] = "gettag",
+#endif
+#ifdef CONFIG_HIERARCHY_PLUG
+ [STAGE_PLUG] = "plug",
+#endif
+#ifdef CONFIG_HIERARCHY_DEADLINE
+ [STAGE_DEADLINE] = "deadline",
+#endif
+#ifdef CONFIG_HIERARCHY_BFQ
+ [STAGE_BFQ] = "bfq",
+#endif
+#ifdef CONFIG_HIERARCHY_KYBER
+ [STAGE_KYBER] = "kyber",
+#endif
+#ifdef CONFIG_HIERARCHY_HCTX
+ [STAGE_HCTX] = "hctx",
+#endif
+#ifdef CONFIG_HIERARCHY_REQUEUE
+ [STAGE_REQUEUE] = "requeue",
+#endif
+#ifdef CONFIG_HIERARCHY_RQ_DRIVER
+ [STAGE_RQ_DRIVER] = "rq_driver",
+#endif
+#ifdef CONFIG_HIERARCHY_BIO
+ [STAGE_BIO] = "bio",
+#endif
+};
+
+const char *hierarchy_stage_name(enum stage_group stage)
+{
+ return stage_name[stage];
+}
+
+static int __hierarchy_stats_show(void *data, struct seq_file *m)
+{
+ struct hierarchy_stage *hstage = data;
+ u64 dispatched[NEW_NR_STAT_GROUPS] = {0};
+ u64 completed[NEW_NR_STAT_GROUPS] = {0};
+ u64 latency[NEW_NR_STAT_GROUPS] = {0};
+ int cpu;
+ int i;
+
+ for_each_possible_cpu(cpu) {
+ struct hierarchy_stats *stat = per_cpu_ptr(hstage->hstats, cpu);
+
+ for (i = 0; i < NEW_NR_STAT_GROUPS; ++i) {
+ dispatched[i] += stat->dispatched[i];
+ completed[i] += stat->completed[i];
+ latency[i] += stage_is_rq(hstage->stage) ?
+ stat->jiffies[i] : stat->nsecs[i];
+ }
+ }
+
+ if (stage_is_rq(hstage->stage))
+ for (i = 0; i < NEW_NR_STAT_GROUPS; ++i)
+ latency[i] =
+ jiffies_to_msecs(latency[i]) * NSEC_PER_MSEC;
+
+ seq_printf(m, "%llu %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu",
+ dispatched[STAT_READ], completed[STAT_READ],
+ latency[STAT_READ], dispatched[STAT_WRITE],
+ completed[STAT_WRITE], latency[STAT_WRITE],
+ dispatched[STAT_DISCARD], completed[STAT_DISCARD],
+ latency[STAT_DISCARD], dispatched[STAT_FLUSH],
+ completed[STAT_FLUSH], latency[STAT_FLUSH]);
+
+ hierarchy_show_slow_io(hstage, m);
+ seq_puts(m, "\n");
+ return 0;
+}
+
+static void *hierarchy_stats_start(struct seq_file *m, loff_t *pos)
+{
+ int ret;
+ enum stage_group stage = *pos;
+ struct blk_io_hierarchy_stats *stats = m->private;
+
+ ret = blk_queue_enter(stats->q, 0);
+ if (ret)
+ return ERR_PTR(ret);
+
+ if (stage < 0 || stage >= NR_STAGE_GROUPS)
+ return NULL;
+
+ return pos;
+}
+
+static void *hierarchy_stats_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ enum stage_group stage = ++(*pos);
+
+ if (stage >= 0 && stage < NR_STAGE_GROUPS)
+ return pos;
+
+ return NULL;
+}
+
+static void hierarchy_stats_stop(struct seq_file *m, void *v)
+{
+ struct blk_io_hierarchy_stats *stats = m->private;
+
+ if (!IS_ERR(v))
+ blk_queue_exit(stats->q);
+}
+
+static int hierarchy_stats_show(struct seq_file *m, void *v)
+{
+ enum stage_group stage = (*(loff_t *)v);
+ struct blk_io_hierarchy_stats *stats = m->private;
+ struct hierarchy_stage *hstage = stats->hstage[stage];
+
+ if (!hstage)
+ return 0;
+
+ seq_printf(m, "%s ", hierarchy_stage_name(stage));
+ __hierarchy_stats_show(hstage, m);
+ return 0;
+}
+
+static const struct seq_operations hierarchy_stats_ops = {
+ .start = hierarchy_stats_start,
+ .next = hierarchy_stats_next,
+ .stop = hierarchy_stats_stop,
+ .show = hierarchy_stats_show,
+};
+
+static const struct blk_mq_debugfs_attr hierarchy_debugfs_attrs[] = {
+ {"stats", 0400, __hierarchy_stats_show},
+ {},
+};
+
+static const struct blk_mq_debugfs_attr hierarchy_stats_attr[] = {
+ {"stats", 0400, .seq_ops = &hierarchy_stats_ops},
+ {},
+};
+
+static void hierarchy_register_stage(struct blk_io_hierarchy_stats *stats,
+ enum stage_group stage)
+{
+ struct hierarchy_stage *hstage = stats->hstage[stage];
+ struct dentry *dir;
+
+ if (!stage_name[stage] || hstage->debugfs_dir)
+ return;
+
+ dir = debugfs_create_dir(stage_name[stage], stats->debugfs_dir);
+ if (IS_ERR(dir))
+ return;
+
+ hstage->debugfs_dir = dir;
+ debugfs_create_files(dir, hstage, hierarchy_debugfs_attrs);
+ io_hierarchy_register_iodump(hstage);
+}
+
+static void hierarchy_unregister_stage(struct blk_io_hierarchy_stats *stats,
+ enum stage_group stage)
+{
+ struct hierarchy_stage *hstage = stats->hstage[stage];
+
+ if (!stage_name[stage] || !hstage->debugfs_dir)
+ return;
+
+ debugfs_remove_recursive(hstage->debugfs_dir);
+ hstage->debugfs_dir = NULL;
+}
+
+void blk_mq_debugfs_register_hierarchy(struct request_queue *q,
+ enum stage_group stage)
+{
+ struct blk_io_hierarchy_stats *stats = q->io_hierarchy_stats;
+
+ if (!blk_mq_hierarchy_registered(q, stage) ||
+ !blk_mq_debugfs_enabled(q))
+ return;
+
+ hierarchy_register_stage(stats, stage);
+}
+
+void blk_mq_debugfs_unregister_hierarchy(struct request_queue *q,
+ enum stage_group stage)
+{
+ struct blk_io_hierarchy_stats *stats = q->io_hierarchy_stats;
+
+ if (!blk_mq_hierarchy_registered(q, stage) ||
+ !blk_mq_debugfs_enabled(q))
+ return;
+
+ hierarchy_unregister_stage(stats, stage);
+}
+
+void blk_mq_debugfs_create_default_hierarchy_attr(struct request_queue *q)
+{
+ struct blk_io_hierarchy_stats *stats = q->io_hierarchy_stats;
+
+ if (!blk_mq_debugfs_enabled(q))
+ return;
+
+ debugfs_create_files(stats->debugfs_dir, stats, hierarchy_stats_attr);
+}
diff --git a/block/blk-io-hierarchy/iodump.c b/block/blk-io-hierarchy/iodump.c
new file mode 100644
index 000000000000..49ad2292873c
--- /dev/null
+++ b/block/blk-io-hierarchy/iodump.c
@@ -0,0 +1,753 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2024. Huawei Technologies Co., Ltd. All rights reserved.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/seq_file.h>
+#include <linux/blktrace_api.h>
+#include <linux/blk-cgroup.h>
+#include <linux/sched/task.h>
+
+#include "iodump.h"
+#include "../blk.h"
+#include "../blk-mq-debugfs.h"
+
+#define RWB_LEN 6
+#define PATH_LEN 64
+#define ms_to_ns(time) (time * NSEC_PER_MSEC)
+#define DEFAULT_THRESHOLD 1000
+
+static DEFINE_MUTEX(dump_mutex);
+
+struct bio_dump_data {
+ u64 stat_time;
+ struct list_head head;
+ spinlock_t lock;
+};
+
+struct rq_dump_data {
+ struct request_queue *q;
+ enum stage_group stage;
+ unsigned int tag;
+ unsigned int total_tags;
+ bool shared;
+ bool has_elevator;
+ bool enter_queue;
+};
+
+#ifdef CONFIG_HIERARCHY_BIO
+struct pos_data {
+ enum stage_group stage;
+ unsigned int count;
+};
+
+struct bio_stage_dump_data {
+ union {
+ loff_t pos;
+ struct pos_data pdata;
+ };
+ struct rq_dump_data rq_ddata;
+ u64 stat_time;
+};
+#endif
+
+static struct blk_mq_hw_ctx *queue_hctx(struct request_queue *q, int id)
+{
+ return q->queue_hw_ctx[id];
+}
+
+int blk_io_hierarchy_iodump_init(struct request_queue *q,
+ struct hierarchy_stage *hstage)
+{
+ hstage->threshold = DEFAULT_THRESHOLD;
+
+ if (stage_is_bio(hstage->stage)) {
+ struct bio_dump_data *bio_ddata =
+ kmalloc(sizeof(*bio_ddata), GFP_KERNEL);
+
+ if (!bio_ddata)
+ return -ENOMEM;
+
+ INIT_LIST_HEAD(&bio_ddata->head);
+ spin_lock_init(&bio_ddata->lock);
+ hstage->dump_data = bio_ddata;
+ return 0;
+ }
+
+ if (stage_is_rq(hstage->stage)) {
+ struct rq_dump_data *rq_ddata =
+ kzalloc(sizeof(*rq_ddata), GFP_KERNEL);
+
+ if (!rq_ddata)
+ return -ENOMEM;
+
+ rq_ddata->q = q;
+ rq_ddata->stage = hstage->stage;
+ hstage->dump_data = rq_ddata;
+ return 0;
+ }
+
+#ifdef CONFIG_HIERARCHY_BIO
+ BUILD_BUG_ON(sizeof(struct pos_data) != sizeof(loff_t));
+
+ if (hstage->stage == STAGE_BIO) {
+ struct bio_stage_dump_data *bstage_ddata =
+ kzalloc(sizeof(*bstage_ddata), GFP_KERNEL);
+
+ if (!bstage_ddata)
+ return -ENOMEM;
+
+ bstage_ddata->rq_ddata.q = q;
+ bstage_ddata->rq_ddata.stage = hstage->stage;
+ hstage->dump_data = bstage_ddata;
+ return 0;
+ }
+#endif
+
+ return -EINVAL;
+}
+
+void blk_io_hierarchy_iodump_exit(struct request_queue *q,
+ enum stage_group stage)
+{
+ struct hierarchy_stage *hstage = q->io_hierarchy_stats->hstage[stage];
+
+ if (stage_is_bio(hstage->stage)) {
+ struct bio_dump_data *bio_ddata = hstage->dump_data;
+
+ WARN(!list_empty(&bio_ddata->head),
+ "blk-io-hierarchy: disk %s stage %s unregistered whih throttled IO.\n",
+ kobject_name(q->kobj.parent), hierarchy_stage_name(stage));
+ }
+
+ kfree(hstage->dump_data);
+ hstage->dump_data = NULL;
+}
+
+void hierarchy_add_bio(struct hierarchy_stage *hstage, struct bio *bio)
+{
+ unsigned long flags;
+ struct bio_hierarchy_data *data = bio->hdata;
+ struct bio_dump_data *bio_ddata = hstage->dump_data;
+
+ spin_lock_irqsave(&bio_ddata->lock, flags);
+ list_add_tail(&data->hierarchy_list, &bio_ddata->head);
+ spin_unlock_irqrestore(&bio_ddata->lock, flags);
+}
+
+void hierarchy_remove_bio(struct hierarchy_stage *hstage, struct bio *bio)
+{
+ unsigned long flags;
+ struct bio_hierarchy_data *data = bio->hdata;
+ struct bio_dump_data *bio_ddata = hstage->dump_data;
+
+ spin_lock_irqsave(&bio_ddata->lock, flags);
+ list_del_init(&data->hierarchy_list);
+ spin_unlock_irqrestore(&bio_ddata->lock, flags);
+}
+
+void bio_hierarchy_data_init(struct bio *bio, struct bio_hierarchy_data *hdata)
+{
+ hdata->bio = bio;
+ INIT_LIST_HEAD(&hdata->hierarchy_list);
+}
+
+static void *bio_hierarchy_list_start(struct seq_file *m, loff_t *pos)
+ __acquires(&bio_ddata->lock)
+{
+ struct hierarchy_stage *hstage = m->private;
+ struct bio_dump_data *bio_ddata = hstage->dump_data;
+
+ spin_lock_irq(&bio_ddata->lock);
+ bio_ddata->stat_time = blk_time_get_ns();
+
+ return seq_list_start(&bio_ddata->head, *pos);
+}
+
+static void *bio_hierarchy_list_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ struct hierarchy_stage *hstage = m->private;
+ struct bio_dump_data *bio_ddata = hstage->dump_data;
+
+ return seq_list_next(v, &bio_ddata->head, pos);
+}
+
+static void bio_hierarchy_list_stop(struct seq_file *m, void *v)
+ __releases(&hstage->lock)
+{
+ struct hierarchy_stage *hstage = m->private;
+ struct bio_dump_data *bio_ddata = hstage->dump_data;
+
+ spin_unlock_irq(&bio_ddata->lock);
+}
+
+static void __hierarchy_show_bio(struct seq_file *m,
+ struct bio_hierarchy_data *data,
+ enum stage_group stage, u64 duration)
+{
+ char rwbs[RWB_LEN];
+ char path[PATH_LEN] = {0};
+ struct bio *bio = data->bio;
+ struct task_struct *task = get_pid_task(bio->pid, PIDTYPE_PID);
+
+ blk_fill_rwbs(rwbs, bio->bi_opf, bio->bi_iter.bi_size);
+ cgroup_path(bio->bi_css->cgroup, path, PATH_LEN);
+
+ seq_printf(m, "%s-%d %s stage %s bio %s %lu + %u cgroup %s started %llu ns ago\n",
+ task ? task->comm : "null", task ? task->pid : 0,
+ bio->bi_disk->disk_name, hierarchy_stage_name(stage),
+ rwbs, bio->bi_iter.bi_sector, bio_sectors(bio), path,
+ duration);
+
+ if (task)
+ put_task_struct(task);
+}
+
+static u64 get_duration(u64 a, u64 b)
+{
+ return a > b ? a - b : 0;
+}
+
+static void hierarchy_show_bio(struct seq_file *m,
+ struct bio_hierarchy_data *data)
+{
+ u64 duration;
+ struct hierarchy_stage *hstage = m->private;
+ struct bio_dump_data *bio_ddata = hstage->dump_data;
+
+ duration = get_duration(bio_ddata->stat_time, data->time);
+ if (hstage->threshold > ns_to_ms(duration))
+ return;
+
+ __hierarchy_show_bio(m, data, hstage->stage, duration);
+}
+
+static int bio_hierarchy_list_show(struct seq_file *m, void *v)
+{
+ struct bio_hierarchy_data *data =
+ list_entry(v, struct bio_hierarchy_data, hierarchy_list);
+
+ hierarchy_show_bio(m, data);
+ return 0;
+}
+
+static const struct seq_operations hierarchy_bio_dump_ops = {
+ .start = bio_hierarchy_list_start,
+ .next = bio_hierarchy_list_next,
+ .stop = bio_hierarchy_list_stop,
+ .show = bio_hierarchy_list_show,
+};
+
+static int threshold_show(void *data, struct seq_file *m)
+{
+ struct hierarchy_stage *hstage = data;
+
+ seq_printf(m, "%lu\n", hstage->threshold);
+ return 0;
+}
+
+/*
+ * max size needed by different bases to express U64
+ * HEX: "0xFFFFFFFFFFFFFFFF" --> 18
+ * DEC: "18446744073709551615" --> 20
+ * OCT: "01777777777777777777777" --> 23
+ * pick the max one to define MAX_BUF_LEN
+ */
+#define MAX_BUF_LEN 24
+static ssize_t threshold_store(void *data, const char __user *buf, size_t count,
+ loff_t *ppos)
+{
+ int err;
+ unsigned long val;
+ char b[MAX_BUF_LEN + 1];
+ struct hierarchy_stage *hstage = data;
+
+ if (count > MAX_BUF_LEN)
+ return -EINVAL;
+
+ if (copy_from_user(b, buf, count))
+ return -EFAULT;
+
+ b[count] = 0;
+ err = kstrtoul(b, 0, &val);
+ if (!err)
+ hstage->threshold = val;
+
+ return err ? err : count;
+}
+
+static void rq_hierarchy_init_dump_data(struct rq_dump_data *rq_ddata)
+{
+ struct request_queue *q = rq_ddata->q;
+
+ rq_ddata->shared = blk_mq_is_sbitmap_shared(q->tag_set->flags);
+ rq_ddata->has_elevator = !!q->elevator;
+
+ if (rq_ddata->has_elevator)
+ rq_ddata->total_tags = q->nr_hw_queues * q->nr_requests;
+ else
+ rq_ddata->total_tags = q->nr_hw_queues * q->tag_set->queue_depth;
+}
+
+static bool __rq_hierarchy_start(struct rq_dump_data *rq_ddata,
+ unsigned int tag)
+{
+ /*
+ * Grab .q_usage_counter so request pool won't go away, then no
+ * request use-after-free is possible during iteration. If queue is
+ * frozen, there won't be any inflight requests.
+ */
+ if (!percpu_ref_tryget(&rq_ddata->q->q_usage_counter)) {
+ rq_ddata->enter_queue = false;
+ return false;
+ }
+
+ rq_ddata->enter_queue = true;
+ rq_hierarchy_init_dump_data(rq_ddata);
+ rq_ddata->tag = tag;
+
+ return tag < rq_ddata->total_tags + rq_ddata->q->nr_hw_queues;
+}
+
+static bool __rq_hierarchy_next(struct rq_dump_data *rq_ddata)
+{
+ rq_ddata->tag++;
+
+ return rq_ddata->tag < rq_ddata->total_tags + rq_ddata->q->nr_hw_queues;
+}
+
+static void __rq_hierarchy_stop(struct rq_dump_data *rq_ddata)
+{
+ if (rq_ddata->enter_queue) {
+ percpu_ref_put(&rq_ddata->q->q_usage_counter);
+ rq_ddata->enter_queue = false;
+ }
+}
+
+static void *rq_hierarchy_start(struct seq_file *m, loff_t *pos)
+ __acquires(&dump_mutex)
+{
+ struct hierarchy_stage *hstage = m->private;
+ struct rq_dump_data *rq_ddata = hstage->dump_data;
+
+ mutex_lock(&dump_mutex);
+
+ if (__rq_hierarchy_start(rq_ddata, *pos))
+ return rq_ddata;
+
+ return NULL;
+}
+
+static void *rq_hierarchy_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ struct rq_dump_data *rq_ddata = v;
+
+ if (__rq_hierarchy_next(rq_ddata)) {
+ *pos = rq_ddata->tag;
+ return rq_ddata;
+ }
+
+ (*pos)++;
+ return NULL;
+}
+
+static void rq_hierarchy_stop(struct seq_file *m, void *v)
+ __releases(&dump_mutex)
+{
+ struct hierarchy_stage *hstage = m->private;
+ struct rq_dump_data *rq_ddata = hstage->dump_data;
+
+ __rq_hierarchy_stop(rq_ddata);
+ mutex_unlock(&dump_mutex);
+}
+
+static struct request *hierarchy_find_and_get_rq(struct rq_dump_data *rq_ddata)
+{
+ struct request *rq;
+ struct blk_mq_hw_ctx *hctx;
+ struct request_queue *q = rq_ddata->q;
+ unsigned int nr_tag = rq_ddata->tag;
+ unsigned int hctx_id;
+
+ if (nr_tag >= rq_ddata->total_tags) {
+ hctx_id = nr_tag - rq_ddata->total_tags;
+ if (hctx_id >= q->nr_hw_queues)
+ return NULL;
+
+ hctx = queue_hctx(q, hctx_id);
+ rq = hctx->fq->flush_rq;
+ } else if (rq_ddata->shared) {
+ return NULL;
+ } else if (rq_ddata->has_elevator) {
+ hctx_id = nr_tag / q->nr_requests;
+ if (hctx_id >= q->nr_hw_queues)
+ return NULL;
+
+ hctx = queue_hctx(q, hctx_id);
+ rq = hctx->sched_tags->static_rqs[nr_tag % q->nr_requests];
+ } else {
+ hctx_id = nr_tag / q->tag_set->queue_depth;
+ if (hctx_id >= q->nr_hw_queues)
+ return NULL;
+
+ hctx = queue_hctx(q, hctx_id);
+ if (!hctx->tags)
+ return NULL;
+
+ rq = hctx->tags->static_rqs[nr_tag % q->tag_set->queue_depth];
+ }
+
+ /*
+ * Fast path to avoid refcount cas operations for requests that come
+ * from another shared request_queue or from other stages.
+ */
+ if (rq->q != q || (rq_ddata->stage != STAGE_BIO &&
+ READ_ONCE(rq->stage) != rq_ddata->stage))
+ return NULL;
+
+ if (!refcount_inc_not_zero(&rq->ref))
+ return NULL;
+
+ /* Check again after the request is pinned, in case the request is reused. */
+ if (rq->q != q) {
+ blk_mq_put_rq_ref(rq);
+ return NULL;
+ }
+
+ if (rq_ddata->stage == STAGE_BIO)
+ return rq;
+
+ /*
+ * Barrier is paired with the smp_store_release() in
+ * rq_hierarchy_start_io_acct(), so that if stage is read, uninitialized
+ * hierarchy_time won't be read.
+ */
+ if (smp_load_acquire(&rq->stage) != rq_ddata->stage) {
+ blk_mq_put_rq_ref(rq);
+ return NULL;
+ }
+
+ return rq;
+}
+
+static void hierarchy_show_rq(struct seq_file *m, struct request *rq,
+ u64 duration)
+{
+ struct task_struct *task = get_pid_task(rq->pid, PIDTYPE_PID);
+ const char *name = hierarchy_stage_name(rq->stage);
+
+ seq_printf(m, "%s-%d %s stage %s ", task ? task->comm : "null",
+ task ? task->pid : 0,
+ rq->rq_disk ? rq->rq_disk->disk_name : "?",
+ name ? name : "?");
+ debugfs_rq_show(m, rq);
+ seq_printf(m, " started %llu ns ago}\n", duration);
+
+ if (task)
+ put_task_struct(task);
+}
+
+static int rq_hierarchy_show(struct seq_file *m, void *v)
+{
+ u64 duration;
+ unsigned long htime;
+ struct hierarchy_stage *hstage = m->private;
+ struct request *rq = hierarchy_find_and_get_rq(v);
+
+ if (!rq)
+ return 0;
+
+ htime = READ_ONCE(rq->hierarchy_time);
+ htime = time_after(jiffies, htime) ? jiffies - htime : 0;
+ duration = jiffies_to_msecs(htime);
+ if (hstage->threshold <= duration)
+ hierarchy_show_rq(m, rq, ms_to_ns(duration));
+
+ blk_mq_put_rq_ref(rq);
+ return 0;
+}
+
+static const struct seq_operations hierarchy_rq_dump_ops = {
+ .start = rq_hierarchy_start,
+ .next = rq_hierarchy_next,
+ .stop = rq_hierarchy_stop,
+ .show = rq_hierarchy_show,
+};
+
+static const struct blk_mq_debugfs_attr hierarchy_threshold_attr[] = {
+ {
+ "threshold",
+ 0600,
+ threshold_show,
+ threshold_store,
+ },
+ {},
+};
+
+static const struct blk_mq_debugfs_attr hierarchy_bio_dump_attr[] = {
+ {
+ "io_dump",
+ 0400,
+ .seq_ops = &hierarchy_bio_dump_ops,
+ },
+ {},
+};
+
+static const struct blk_mq_debugfs_attr hierarchy_rq_dump_attr[] = {
+ {
+ "io_dump",
+ 0400,
+ .seq_ops = &hierarchy_rq_dump_ops,
+ },
+ {},
+};
+
+#ifdef CONFIG_HIERARCHY_BIO
+static struct bio_dump_data *get_bio_stage_ddata(struct request_queue *q,
+ enum stage_group stage)
+{
+ struct blk_io_hierarchy_stats *stats = q->io_hierarchy_stats;
+ struct hierarchy_stage *hstage = READ_ONCE(stats->hstage[stage]);
+
+ if (!hstage)
+ return NULL;
+
+ return hstage->dump_data;
+}
+
+static void bio_stage_start_next_stage(struct bio_stage_dump_data *bstage_ddata,
+ loff_t *pos)
+{
+ struct pos_data *pdata = &bstage_ddata->pdata;
+
+ pdata->stage++;
+ if (!stage_is_bio(pdata->stage))
+ pdata->stage = STAGE_BIO;
+ pdata->count = 0;
+
+ *pos = bstage_ddata->pos;
+}
+
+static void bio_stage_start_next_io(struct bio_stage_dump_data *bstage_ddata,
+ loff_t *pos)
+{
+ struct pos_data *pdata = &bstage_ddata->pdata;
+
+ if (stage_is_bio(pdata->stage))
+ pdata->count++;
+ else
+ pdata->count = bstage_ddata->rq_ddata.tag;
+
+ *pos = bstage_ddata->pos;
+}
+
+static void __bio_stage_hierarchy_stop(struct bio_stage_dump_data *bstage_ddata)
+{
+ struct pos_data *pdata = &bstage_ddata->pdata;
+ struct rq_dump_data *rq_ddata = &bstage_ddata->rq_ddata;
+
+ if (stage_is_bio(pdata->stage)) {
+ struct bio_dump_data *bio_ddata =
+ get_bio_stage_ddata(rq_ddata->q, pdata->stage);
+
+ spin_unlock_irq(&bio_ddata->lock);
+ }
+
+ if (rq_ddata->enter_queue) {
+ percpu_ref_put(&rq_ddata->q->q_usage_counter);
+ rq_ddata->enter_queue = false;
+ }
+}
+
+void *__bio_stage_hierarchy_start(struct bio_stage_dump_data *bstage_ddata,
+ loff_t *pos)
+{
+ struct pos_data *pdata = &bstage_ddata->pdata;
+ struct rq_dump_data *rq_ddata = &bstage_ddata->rq_ddata;
+
+retry:
+ if (stage_is_bio(pdata->stage)) {
+ struct list_head *list;
+ struct bio_dump_data *bio_ddata =
+ get_bio_stage_ddata(rq_ddata->q, pdata->stage);
+
+ if (!bio_ddata) {
+ bio_stage_start_next_stage(bstage_ddata, pos);
+ goto retry;
+ }
+
+ spin_lock_irq(&bio_ddata->lock);
+ list = seq_list_start(&bio_ddata->head, pdata->count);
+ if (list)
+ return list;
+
+ spin_unlock_irq(&bio_ddata->lock);
+ bio_stage_start_next_stage(bstage_ddata, pos);
+ goto retry;
+ }
+
+ if (pdata->stage == STAGE_BIO &&
+ __rq_hierarchy_start(rq_ddata, pdata->count))
+ return bstage_ddata;
+
+ return NULL;
+}
+
+static void *bio_stage_hierarchy_start(struct seq_file *m, loff_t *pos)
+{
+ struct hierarchy_stage *hstage = m->private;
+ struct bio_stage_dump_data *bstage_ddata = hstage->dump_data;
+
+ mutex_lock(&dump_mutex);
+ bstage_ddata->pos = *pos;
+ bstage_ddata->stat_time = blk_time_get_ns();
+
+ return __bio_stage_hierarchy_start(bstage_ddata, pos);
+}
+
+static void *bio_stage_hierarchy_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ struct hierarchy_stage *hstage = m->private;
+ struct bio_stage_dump_data *bstage_ddata = hstage->dump_data;
+ struct rq_dump_data *rq_ddata = &bstage_ddata->rq_ddata;
+ struct pos_data *pdata = &bstage_ddata->pdata;
+
+ if (stage_is_bio(pdata->stage)) {
+ struct bio_dump_data *bio_ddata =
+ get_bio_stage_ddata(rq_ddata->q, pdata->stage);
+ struct list_head *list = ((struct list_head *)v)->next;
+
+ if (list != &bio_ddata->head) {
+ bio_stage_start_next_io(bstage_ddata, pos);
+ return list;
+ }
+
+ spin_unlock_irq(&bio_ddata->lock);
+
+ bio_stage_start_next_stage(bstage_ddata, pos);
+ return __bio_stage_hierarchy_start(bstage_ddata, pos);
+ }
+
+ if (pdata->stage == STAGE_BIO &&
+ __rq_hierarchy_next(rq_ddata)) {
+ bio_stage_start_next_io(bstage_ddata, pos);
+ return bstage_ddata;
+ }
+
+ (*pos)++;
+ return NULL;
+}
+
+static void bio_stage_hierarchy_stop(struct seq_file *m, void *v)
+{
+ struct hierarchy_stage *hstage = m->private;
+ struct bio_stage_dump_data *bstage_ddata = hstage->dump_data;
+
+ __bio_stage_hierarchy_stop(bstage_ddata);
+ mutex_unlock(&dump_mutex);
+}
+
+static int bio_stage_hierarchy_show(struct seq_file *m, void *v)
+{
+ struct hierarchy_stage *hstage = m->private;
+ struct bio_stage_dump_data *bstage_ddata = hstage->dump_data;
+ struct rq_dump_data *rq_ddata = &bstage_ddata->rq_ddata;
+ struct pos_data *pdata = &bstage_ddata->pdata;
+ u64 duration;
+
+ if (stage_is_bio(pdata->stage)) {
+ struct bio_hierarchy_data *data = list_entry(
+ v, struct bio_hierarchy_data, hierarchy_list);
+
+ duration = get_duration(bstage_ddata->stat_time,
+ data->bio->bi_alloc_time_ns);
+ if (hstage->threshold <= ns_to_ms(duration))
+ __hierarchy_show_bio(m, data, pdata->stage, duration);
+ } else if (pdata->stage == STAGE_BIO) {
+ struct request *rq = hierarchy_find_and_get_rq(rq_ddata);
+
+ if (rq) {
+ duration = get_duration(bstage_ddata->stat_time,
+ rq->bi_alloc_time_ns);
+ if (hstage->threshold <= ns_to_ms(duration))
+ hierarchy_show_rq(m, rq, duration);
+ blk_mq_put_rq_ref(rq);
+ }
+ }
+
+ return 0;
+}
+
+static const struct seq_operations bio_stage_hierarchy_ops = {
+ .start = bio_stage_hierarchy_start,
+ .next = bio_stage_hierarchy_next,
+ .stop = bio_stage_hierarchy_stop,
+ .show = bio_stage_hierarchy_show,
+};
+
+static const struct blk_mq_debugfs_attr bio_stage_dump_attr[] = {
+ {
+ "io_dump",
+ 0400,
+ .seq_ops = &bio_stage_hierarchy_ops,
+ },
+ {},
+};
+
+#else /* CONFIG_HIERARCHY_BIO */
+static const struct blk_mq_debugfs_attr bio_stage_dump_attr[] = {
+ {},
+};
+
+#endif
+
+void io_hierarchy_register_iodump(struct hierarchy_stage *hstage)
+{
+ const struct blk_mq_debugfs_attr *attr;
+
+ if (stage_is_bio(hstage->stage))
+ attr = hierarchy_bio_dump_attr;
+ else if (stage_is_rq(hstage->stage))
+ attr = hierarchy_rq_dump_attr;
+ else if (hstage->stage == STAGE_BIO)
+ attr = bio_stage_dump_attr;
+ else
+ attr = NULL;
+
+ debugfs_create_files(hstage->debugfs_dir, hstage,
+ hierarchy_threshold_attr);
+ if (attr)
+ debugfs_create_files(hstage->debugfs_dir, hstage, attr);
+}
+
+void hierarchy_account_slow_io(struct hierarchy_stage *hstage,
+ enum stat_group op, unsigned long duration)
+{
+ if (hstage->threshold <= duration)
+ this_cpu_inc(hstage->hstats->slow[op]);
+}
+
+void hierarchy_show_slow_io(struct hierarchy_stage *hstage, struct seq_file *m)
+{
+ u64 slow[NEW_NR_STAT_GROUPS] = {0};
+ int cpu;
+ int i;
+
+ for_each_possible_cpu(cpu) {
+ struct hierarchy_stats *stat = per_cpu_ptr(hstage->hstats, cpu);
+
+ for (i = 0; i < NEW_NR_STAT_GROUPS; ++i)
+ slow[i] += stat->slow[i];
+ }
+
+ seq_printf(m, " %llu %llu %llu %llu", slow[STAT_READ], slow[STAT_WRITE],
+ slow[STAT_DISCARD], slow[STAT_FLUSH]);
+}
diff --git a/block/blk-io-hierarchy/iodump.h b/block/blk-io-hierarchy/iodump.h
new file mode 100644
index 000000000000..2f9e159f2588
--- /dev/null
+++ b/block/blk-io-hierarchy/iodump.h
@@ -0,0 +1,100 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2024. Huawei Technologies Co., Ltd. All rights reserved.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef BLK_IO_HIERARCHY_IODUMP_H
+#define BLK_IO_HIERARCHY_IODUMP_H
+
+#ifdef CONFIG_HIERARCHY_IO_DUMP
+
+#include "stats.h"
+
+#define ns_to_ms(time) div_u64(time, NSEC_PER_MSEC)
+
+int blk_io_hierarchy_iodump_init(struct request_queue *q,
+ struct hierarchy_stage *hstage);
+void blk_io_hierarchy_iodump_exit(struct request_queue *q,
+ enum stage_group stage);
+void hierarchy_add_bio(struct hierarchy_stage *hstage, struct bio *bio);
+void hierarchy_remove_bio(struct hierarchy_stage *hstage, struct bio *bio);
+void bio_hierarchy_data_init(struct bio *bio, struct bio_hierarchy_data *hdata);
+void io_hierarchy_register_iodump(struct hierarchy_stage *hstage);
+
+void hierarchy_account_slow_io(struct hierarchy_stage *hstage,
+ enum stat_group op, unsigned long duration);
+void hierarchy_show_slow_io(struct hierarchy_stage *hstage, struct seq_file *m);
+
+static inline void
+hierarchy_account_slow_io_ns(struct hierarchy_stage *hstage,
+ enum stat_group op, u64 duration)
+{
+ hierarchy_account_slow_io(hstage, op, ns_to_ms(duration));
+}
+
+static inline void
+hierarchy_account_slow_io_jiffies(struct hierarchy_stage *hstage,
+ enum stat_group op, unsigned long duration)
+{
+ hierarchy_account_slow_io(hstage, op, jiffies_to_msecs(duration));
+}
+
+#else
+static inline int
+blk_io_hierarchy_iodump_init(struct request_queue *q,
+ struct hierarchy_stage *hstage)
+{
+ return 0;
+}
+
+static inline void
+blk_io_hierarchy_iodump_exit(struct request_queue *q, enum stage_group stage)
+{
+}
+
+static inline void
+hierarchy_add_bio(struct hierarchy_stage *hstage, struct bio *bio)
+{
+}
+
+static inline void
+hierarchy_remove_bio(struct hierarchy_stage *hstage, struct bio *bio)
+{
+}
+
+static inline void
+bio_hierarchy_data_init(struct bio *bio, struct bio_hierarchy_data *hdata)
+{
+}
+
+static inline void
+io_hierarchy_register_iodump(struct hierarchy_stage *hstage)
+{
+}
+
+static inline void
+hierarchy_account_slow_io_ns(struct hierarchy_stage *hstage,
+ enum stat_group op, u64 duration)
+{
+}
+
+static inline void
+hierarchy_account_slow_io_jiffies(struct hierarchy_stage *hstage,
+ enum stat_group op, unsigned long duration)
+{
+}
+
+static inline void
+hierarchy_show_slow_io(struct hierarchy_stage *hstage, struct seq_file *m)
+{
+}
+#endif
+#endif
diff --git a/block/blk-io-hierarchy/stats.c b/block/blk-io-hierarchy/stats.c
new file mode 100644
index 000000000000..52a23413f468
--- /dev/null
+++ b/block/blk-io-hierarchy/stats.c
@@ -0,0 +1,331 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2023. Huawei Technologies Co., Ltd. All rights reserved.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/module.h>
+#include <linux/debugfs.h>
+
+#include "stats.h"
+#include "iodump.h"
+#include "../blk.h"
+#include "../blk-mq-debugfs.h"
+
+#define io_hierarchy_add(statsp, field, group, nr) \
+ this_cpu_add((statsp)->field[group], nr)
+#define io_hierarchy_inc(statsp, field, group) \
+ io_hierarchy_add(statsp, field, group, 1)
+
+#define PRE_ALLOC_BIO_CNT 8
+
+static mempool_t *hdata_pool;
+
+void blk_mq_debugfs_register_hierarchy_stats(struct request_queue *q)
+{
+ struct blk_io_hierarchy_stats *stats;
+ enum stage_group stage;
+
+ stats = q->io_hierarchy_stats;
+ if (!stats || !blk_mq_debugfs_enabled(q))
+ return;
+
+ stats->debugfs_dir = debugfs_create_dir("blk_io_hierarchy",
+ q->debugfs_dir);
+ blk_mq_debugfs_create_default_hierarchy_attr(q);
+
+ for (stage = 0; stage < NR_STAGE_GROUPS; ++stage)
+ blk_mq_debugfs_register_hierarchy(q, stage);
+}
+
+static void bio_alloc_hierarchy_data(struct bio *bio)
+{
+ if (!bio->hdata) {
+ struct bio_hierarchy_data *hdata =
+ mempool_alloc(hdata_pool, GFP_NOIO);
+
+ bio_hierarchy_data_init(bio, hdata);
+ bio->hdata = hdata;
+ }
+}
+
+void bio_free_hierarchy_data(struct bio *bio)
+{
+ if (!bio->hdata)
+ return;
+
+ mempool_free(bio->hdata, hdata_pool);
+ bio->hdata = NULL;
+}
+
+void blk_mq_debugfs_unregister_hierarchy_stats(struct request_queue *q)
+{
+ struct blk_io_hierarchy_stats *stats;
+ enum stage_group stage;
+
+ stats = q->io_hierarchy_stats;
+ if (!stats || !blk_mq_debugfs_enabled(q))
+ return;
+
+ for (stage = 0; stage < NR_STAGE_GROUPS; ++stage)
+ blk_mq_debugfs_unregister_hierarchy(q, stage);
+
+ debugfs_remove_recursive(stats->debugfs_dir);
+ stats->debugfs_dir = NULL;
+}
+
+int blk_io_hierarchy_stats_alloc(struct request_queue *q)
+{
+ struct blk_io_hierarchy_stats *stats;
+
+ if (!q->mq_ops)
+ return 0;
+
+ stats = kzalloc(sizeof(struct blk_io_hierarchy_stats), GFP_KERNEL);
+ if (!stats)
+ return -ENOMEM;
+
+ stats->q = q;
+ q->io_hierarchy_stats = stats;
+
+ return 0;
+}
+
+void blk_io_hierarchy_stats_free(struct request_queue *q)
+{
+ struct blk_io_hierarchy_stats *stats = q->io_hierarchy_stats;
+
+ if (!stats)
+ return;
+
+ q->io_hierarchy_stats = NULL;
+ kfree(stats);
+}
+
+bool blk_mq_hierarchy_registered(struct request_queue *q,
+ enum stage_group stage)
+{
+ struct blk_io_hierarchy_stats *stats = q->io_hierarchy_stats;
+
+ if (!stats)
+ return false;
+
+ return stats->hstage[stage] != NULL;
+}
+EXPORT_SYMBOL_GPL(blk_mq_hierarchy_registered);
+
+void blk_mq_register_hierarchy(struct request_queue *q, enum stage_group stage)
+{
+ struct blk_io_hierarchy_stats *stats = q->io_hierarchy_stats;
+ struct hierarchy_stage *hstage;
+
+ if (!stats || !hierarchy_stage_name(stage))
+ return;
+
+ if (blk_mq_hierarchy_registered(q, stage)) {
+ pr_warn("blk-io-hierarchy: disk %s is registering stage %s again.",
+ kobject_name(q->kobj.parent),
+ hierarchy_stage_name(stage));
+ return;
+ }
+
+ /*
+ * Allocate memory before freezing the queue, to prevent a deadlock if
+ * new IO is issued by memory reclaim.
+ */
+ hstage = kmalloc(sizeof(*hstage), GFP_KERNEL);
+ if (!hstage)
+ return;
+
+ hstage->hstats = alloc_percpu(struct hierarchy_stats);
+ if (!hstage->hstats) {
+ kfree(hstage);
+ return;
+ }
+
+ hstage->stage = stage;
+ hstage->debugfs_dir = NULL;
+ if (blk_io_hierarchy_iodump_init(q, hstage) < 0) {
+ free_percpu(hstage->hstats);
+ kfree(hstage);
+ return;
+ }
+
+ blk_mq_freeze_queue(q);
+
+ WRITE_ONCE(stats->hstage[stage], hstage);
+ blk_mq_debugfs_register_hierarchy(q, stage);
+
+ blk_mq_unfreeze_queue(q);
+}
+EXPORT_SYMBOL_GPL(blk_mq_register_hierarchy);
+
+void blk_mq_unregister_hierarchy(struct request_queue *q,
+ enum stage_group stage)
+{
+ struct blk_io_hierarchy_stats *stats = q->io_hierarchy_stats;
+ struct hierarchy_stage *hstage;
+
+ if (!blk_mq_hierarchy_registered(q, stage))
+ return;
+
+ blk_mq_debugfs_unregister_hierarchy(q, stage);
+ blk_io_hierarchy_iodump_exit(q, stage);
+
+ hstage = stats->hstage[stage];
+ stats->hstage[stage] = NULL;
+ free_percpu(hstage->hstats);
+ kfree(hstage);
+}
+EXPORT_SYMBOL_GPL(blk_mq_unregister_hierarchy);
+
+static enum stat_group bio_hierarchy_op(struct bio *bio)
+{
+ if (op_is_discard(bio->bi_opf))
+ return STAT_DISCARD;
+
+ if (op_is_flush(bio->bi_opf) &&
+ !(bio_sectors(bio) || bio_flagged(bio, BIO_HAS_DATA)))
+ return STAT_FLUSH;
+
+ if (op_is_write(bio->bi_opf))
+ return STAT_WRITE;
+
+ return STAT_READ;
+}
+
+
+void bio_hierarchy_start_io_acct(struct bio *bio, enum stage_group stage)
+{
+ struct request_queue *q = bio->bi_disk->queue;
+ struct hierarchy_stage *hstage;
+
+ if (!blk_mq_hierarchy_registered(q, stage))
+ return;
+
+ hstage = q->io_hierarchy_stats->hstage[stage];
+ bio_alloc_hierarchy_data(bio);
+ io_hierarchy_inc(hstage->hstats, dispatched, bio_hierarchy_op(bio));
+ bio->hdata->time = blk_time_get_ns();
+ hierarchy_add_bio(hstage, bio);
+}
+
+void __bio_hierarchy_end_io_acct(struct bio *bio, enum stage_group stage,
+ u64 time)
+{
+ struct request_queue *q = bio->bi_disk->queue;
+ struct hierarchy_stage *hstage;
+ u64 duration;
+ enum stat_group op;
+
+ if (!blk_mq_hierarchy_registered(q, stage))
+ return;
+
+ op = bio_hierarchy_op(bio);
+ duration = time - bio->hdata->time;
+ hstage = q->io_hierarchy_stats->hstage[stage];
+
+ hierarchy_remove_bio(hstage, bio);
+ io_hierarchy_inc(hstage->hstats, completed, op);
+ io_hierarchy_add(hstage->hstats, nsecs, op, duration);
+ hierarchy_account_slow_io_ns(hstage, op, duration);
+}
+
+static enum stat_group rq_hierarchy_op(struct request *rq)
+{
+ if (op_is_discard(rq->cmd_flags))
+ return STAT_DISCARD;
+
+ if (is_flush_rq(rq))
+ return STAT_FLUSH;
+
+ if (op_is_write(rq->cmd_flags))
+ return STAT_WRITE;
+
+ return STAT_READ;
+}
+
+void __rq_hierarchy_start_io_acct(struct request *rq,
+ struct hierarchy_stage *hstage)
+{
+ io_hierarchy_inc(hstage->hstats, dispatched, rq_hierarchy_op(rq));
+ WRITE_ONCE(rq->hierarchy_time, jiffies);
+
+ /*
+ * Paired with the acquire barrier in hierarchy_find_and_get_rq(), make
+ * sure hierarchy_time is set before stage.
+ */
+ smp_store_release(&rq->stage, hstage->stage);
+}
+EXPORT_SYMBOL_GPL(__rq_hierarchy_start_io_acct);
+
+void __rq_hierarchy_end_io_acct(struct request *rq,
+ struct hierarchy_stage *hstage)
+{
+ unsigned long duration = jiffies - rq->hierarchy_time;
+ enum stat_group op = rq_hierarchy_op(rq);
+
+ io_hierarchy_inc(hstage->hstats, completed, op);
+ io_hierarchy_add(hstage->hstats, jiffies, op, duration);
+ hierarchy_account_slow_io_jiffies(hstage, op, duration);
+ WRITE_ONCE(rq->stage, NR_RQ_STAGE_GROUPS);
+}
+EXPORT_SYMBOL_GPL(__rq_hierarchy_end_io_acct);
+
+#ifdef CONFIG_HIERARCHY_BIO
+void bio_hierarchy_start(struct bio *bio)
+{
+ struct gendisk *disk = bio->bi_disk;
+ struct hierarchy_stage *hstage;
+
+ if (bio_flagged(bio, BIO_HIERARCHY_ACCT))
+ return;
+
+ if (!blk_mq_hierarchy_registered(disk->queue, STAGE_BIO))
+ return;
+
+ bio_set_flag(bio, BIO_HIERARCHY_ACCT);
+ if (bio_has_data(bio))
+ bio_set_flag(bio, BIO_HAS_DATA);
+ hstage = disk->queue->io_hierarchy_stats->hstage[STAGE_BIO];
+ io_hierarchy_inc(hstage->hstats, dispatched, bio_hierarchy_op(bio));
+}
+
+void __bio_hierarchy_end(struct bio *bio, u64 now)
+{
+ struct gendisk *disk = bio->bi_disk;
+ struct hierarchy_stage *hstage;
+ u64 duration;
+ enum stat_group op;
+
+ op = bio_hierarchy_op(bio);
+ duration = now - bio->bi_alloc_time_ns;
+ hstage = disk->queue->io_hierarchy_stats->hstage[STAGE_BIO];
+
+ io_hierarchy_inc(hstage->hstats, completed, op);
+ io_hierarchy_add(hstage->hstats, nsecs, op, duration);
+ hierarchy_account_slow_io_ns(hstage, op, duration);
+
+ bio_clear_flag(bio, BIO_HIERARCHY_ACCT);
+ bio_clear_flag(bio, BIO_HAS_DATA);
+}
+
+#endif
+
+static int __init hierarchy_stats_init(void)
+{
+ hdata_pool = mempool_create_kmalloc_pool(PRE_ALLOC_BIO_CNT,
+ sizeof(struct bio_hierarchy_data));
+ if (!hdata_pool)
+ panic("Failed to create hdata_pool\n");
+
+ return 0;
+}
+module_init(hierarchy_stats_init);
diff --git a/block/blk-io-hierarchy/stats.h b/block/blk-io-hierarchy/stats.h
new file mode 100644
index 000000000000..ed3e5ddc084a
--- /dev/null
+++ b/block/blk-io-hierarchy/stats.h
@@ -0,0 +1,323 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2023. Huawei Technologies Co., Ltd. All rights reserved.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef BLK_IO_HIERARCHY_STATS_H
+#define BLK_IO_HIERARCHY_STATS_H
+
+#ifdef CONFIG_BLK_IO_HIERARCHY_STATS
+
+#include <linux/blkdev.h>
+#include <linux/blk_types.h>
+#include "../blk.h"
+
+struct bio_hierarchy_data {
+ u64 time;
+#ifdef CONFIG_HIERARCHY_IO_DUMP
+ struct bio *bio;
+ struct list_head hierarchy_list;
+#endif
+};
+
+struct hierarchy_stats {
+ union {
+ /* for bio based stages. */
+ u64 nsecs[NEW_NR_STAT_GROUPS];
+ /* for request based stages. */
+ unsigned long jiffies[NEW_NR_STAT_GROUPS];
+ };
+ unsigned long dispatched[NEW_NR_STAT_GROUPS];
+ unsigned long completed[NEW_NR_STAT_GROUPS];
+#ifdef CONFIG_HIERARCHY_IO_DUMP
+ unsigned long slow[NEW_NR_STAT_GROUPS];
+#endif
+
+};
+
+struct hierarchy_stage {
+ enum stage_group stage;
+ struct dentry *debugfs_dir;
+ struct hierarchy_stats __percpu *hstats;
+#ifdef CONFIG_HIERARCHY_IO_DUMP
+ unsigned long threshold;
+ void *dump_data;
+#endif
+};
+
+struct blk_io_hierarchy_stats {
+ struct request_queue *q;
+ struct dentry *debugfs_dir;
+ struct hierarchy_stage *hstage[NR_STAGE_GROUPS];
+};
+
+static inline bool stage_is_bio(enum stage_group stage)
+{
+ return stage >= 0 && stage < NR_BIO_STAGE_GROUPS;
+}
+
+static inline bool stage_is_rq(enum stage_group stage)
+{
+ return stage >= NR_BIO_STAGE_GROUPS && stage < NR_RQ_STAGE_GROUPS;
+}
+
+const char *hierarchy_stage_name(enum stage_group stage);
+int blk_io_hierarchy_stats_alloc(struct request_queue *q);
+void blk_io_hierarchy_stats_free(struct request_queue *q);
+
+/* APIs for stage registration */
+bool blk_mq_hierarchy_registered(struct request_queue *q,
+ enum stage_group stage);
+void blk_mq_register_hierarchy(struct request_queue *q, enum stage_group stage);
+void blk_mq_unregister_hierarchy(struct request_queue *q,
+ enum stage_group stage);
+
+/* APIs for disk level debugfs */
+void blk_mq_debugfs_register_hierarchy_stats(struct request_queue *q);
+void blk_mq_debugfs_unregister_hierarchy_stats(struct request_queue *q);
+void blk_mq_debugfs_create_default_hierarchy_attr(struct request_queue *q);
+
+/* APIs for stage level debugfs */
+void blk_mq_debugfs_register_hierarchy(struct request_queue *q,
+ enum stage_group stage);
+void blk_mq_debugfs_unregister_hierarchy(struct request_queue *q,
+ enum stage_group stage);
+
+/* APIs for bio based stage io accounting */
+void bio_hierarchy_start_io_acct(struct bio *bio, enum stage_group stage);
+void __bio_hierarchy_end_io_acct(struct bio *bio, enum stage_group stage,
+ u64 time);
+void bio_free_hierarchy_data(struct bio *bio);
+
+static inline void bio_hierarchy_end_io_acct(struct bio *bio,
+ enum stage_group stage)
+{
+ __bio_hierarchy_end_io_acct(bio, stage, blk_time_get_ns());
+}
+
+static inline void bio_list_hierarchy_end_io_acct(struct bio_list *list,
+ enum stage_group stage)
+{
+ u64 time = blk_time_get_ns();
+ struct bio *bio;
+
+ bio_list_for_each(bio, list)
+ __bio_hierarchy_end_io_acct(bio, stage, time);
+}
+
+/* APIs for request based stage io accounting */
+void __rq_hierarchy_start_io_acct(struct request *rq,
+ struct hierarchy_stage *hstage);
+void __rq_hierarchy_end_io_acct(struct request *rq,
+ struct hierarchy_stage *hstage);
+
+static inline void rq_hierarchy_start_io_acct(struct request *rq,
+ enum stage_group stage)
+{
+ if (!blk_mq_hierarchy_registered(rq->q, stage))
+ return;
+
+ __rq_hierarchy_start_io_acct(
+ rq, rq->q->io_hierarchy_stats->hstage[stage]);
+}
+
+static inline void rq_hierarchy_end_io_acct(struct request *rq,
+ enum stage_group stage)
+{
+ if (!blk_mq_hierarchy_registered(rq->q, stage))
+ return;
+
+ __rq_hierarchy_end_io_acct(
+ rq, rq->q->io_hierarchy_stats->hstage[stage]);
+}
+
+static inline void rq_list_hierarchy_start_io_acct(struct list_head *head,
+ enum stage_group stage)
+{
+ struct request *rq;
+ struct hierarchy_stage *hstage;
+
+ if (list_empty(head))
+ return;
+
+ rq = list_first_entry(head, struct request, queuelist);
+ if (!blk_mq_hierarchy_registered(rq->q, stage))
+ return;
+
+ hstage = rq->q->io_hierarchy_stats->hstage[stage];
+ list_for_each_entry(rq, head, queuelist)
+ __rq_hierarchy_start_io_acct(rq, hstage);
+}
+
+static inline void rq_list_hierarchy_end_io_acct(struct list_head *head,
+ enum stage_group stage)
+{
+ struct request *rq;
+ struct hierarchy_stage *hstage;
+
+ if (list_empty(head))
+ return;
+
+ rq = list_first_entry(head, struct request, queuelist);
+ if (!blk_mq_hierarchy_registered(rq->q, stage))
+ return;
+
+ hstage = rq->q->io_hierarchy_stats->hstage[stage];
+ list_for_each_entry(rq, head, queuelist)
+ __rq_hierarchy_end_io_acct(rq, hstage);
+}
+
+#ifdef CONFIG_HIERARCHY_BIO
+void bio_hierarchy_start(struct bio *bio);
+void __bio_hierarchy_end(struct bio *bio, u64 now);
+
+static inline void bio_hierarchy_end(struct bio *bio)
+{
+ if (!bio_flagged(bio, BIO_HIERARCHY_ACCT))
+ return;
+
+ if (!blk_mq_hierarchy_registered(bio->bi_disk->queue, STAGE_BIO))
+ return;
+
+ __bio_hierarchy_end(bio, blk_time_get_ns());
+}
+
+static inline void req_bio_hierarchy_end(struct request *rq, struct bio *bio)
+{
+ u64 now;
+
+ if (!bio_flagged(bio, BIO_HIERARCHY_ACCT))
+ return;
+
+ if (!blk_mq_hierarchy_registered(bio->bi_disk->queue, STAGE_BIO))
+ return;
+
+ now = rq->io_end_time_ns;
+ if (!now) {
+ now = blk_time_get_ns();
+ rq->io_end_time_ns = now;
+ }
+
+ __bio_hierarchy_end(bio, now);
+}
+#endif
+
+#else /* CONFIG_BLK_IO_HIERARCHY_STATS */
+
+static inline int
+blk_io_hierarchy_stats_alloc(struct request_queue *q)
+{
+ return 0;
+}
+
+static inline void
+blk_io_hierarchy_stats_free(struct request_queue *q)
+{
+}
+
+static inline bool
+blk_mq_hierarchy_registered(struct request_queue *q, enum stage_group stage)
+{
+ return false;
+}
+
+static inline void
+blk_mq_register_hierarchy(struct request_queue *q, enum stage_group stage)
+{
+}
+
+static inline void
+blk_mq_unregister_hierarchy(struct request_queue *q, enum stage_group stage)
+{
+}
+
+static inline void
+blk_mq_debugfs_register_hierarchy_stats(struct request_queue *q)
+{
+}
+
+static inline void
+blk_mq_debugfs_unregister_hierarchy_stats(struct request_queue *q)
+{
+}
+
+static inline void
+blk_mq_debugfs_register_hierarchy(struct request_queue *q,
+ enum stage_group stage)
+{
+}
+
+static inline void
+blk_mq_debugfs_unregister_hierarchy(struct request_queue *q,
+ enum stage_group stage)
+{
+}
+
+static inline void
+bio_hierarchy_start_io_acct(struct bio *bio, enum stage_group stage)
+{
+}
+
+static inline void
+bio_hierarchy_end_io_acct(struct bio *bio, enum stage_group stage)
+{
+}
+
+static inline void
+bio_list_hierarchy_end_io_acct(struct bio_list *list, enum stage_group stage)
+{
+}
+
+static inline void
+bio_free_hierarchy_data(struct bio *bio)
+{
+}
+
+static inline void
+rq_hierarchy_start_io_acct(struct request *rq, enum stage_group stage)
+{
+}
+
+static inline void
+rq_hierarchy_end_io_acct(struct request *rq, enum stage_group stage)
+{
+}
+
+static inline void
+rq_list_hierarchy_start_io_acct(struct list_head *head, enum stage_group stage)
+{
+}
+
+static inline void
+rq_list_hierarchy_end_io_acct(struct list_head *head, enum stage_group stage)
+{
+}
+
+#endif /* CONFIG_BLK_IO_HIERARCHY_STATS */
+
+#if !defined(CONFIG_BLK_IO_HIERARCHY_STATS) || !defined(CONFIG_HIERARCHY_BIO)
+static inline void
+bio_hierarchy_start(struct bio *bio)
+{
+}
+
+static inline void
+bio_hierarchy_end(struct bio *bio)
+{
+}
+
+static inline void
+req_bio_hierarchy_end(struct request *rq, struct bio *bio)
+{
+}
+#endif
+
+#endif /* BLK_IO_HIERARCHY_STATS_H */
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index f0865b6ea1e1..a0909e56d669 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -23,6 +23,7 @@
#include "blk-mq.h"
#include "blk-mq-debugfs.h"
#include "blk-mq-tag.h"
+#include "blk-io-hierarchy/stats.h"
static void print_stat(struct seq_file *m, struct blk_rq_stat *stat)
{
@@ -355,9 +356,8 @@ static const char *blk_mq_rq_state_name(enum mq_rq_state rq_state)
return blk_mq_rq_state_name_array[rq_state];
}
-int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq)
+void debugfs_rq_show(struct seq_file *m, struct request *rq)
{
- const struct blk_mq_ops *const mq_ops = rq->q->mq_ops;
const unsigned int op = rq->cmd_flags & REQ_OP_MASK;
seq_printf(m, "%p {.op=", rq);
@@ -374,6 +374,13 @@ int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq)
seq_printf(m, ", .state=%s", blk_mq_rq_state_name(blk_mq_rq_state(rq)));
seq_printf(m, ", .tag=%d, .internal_tag=%d", rq->tag,
rq->internal_tag);
+}
+
+int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq)
+{
+ const struct blk_mq_ops *const mq_ops = rq->q->mq_ops;
+
+ debugfs_rq_show(m, rq);
if (mq_ops->show_rq)
mq_ops->show_rq(m, rq);
seq_puts(m, "}\n");
@@ -811,8 +818,8 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_ctx_attrs[] = {
{},
};
-static bool debugfs_create_files(struct dentry *parent, void *data,
- const struct blk_mq_debugfs_attr *attr)
+bool debugfs_create_files(struct dentry *parent, void *data,
+ const struct blk_mq_debugfs_attr *attr)
{
if (IS_ERR_OR_NULL(parent))
return false;
@@ -861,6 +868,7 @@ int blk_mq_debugfs_register(struct request_queue *q)
goto err;
}
+ blk_mq_debugfs_register_hierarchy_stats(q);
return 0;
err:
diff --git a/block/blk-mq-debugfs.h b/block/blk-mq-debugfs.h
index a9160be12be0..73a3796bd03c 100644
--- a/block/blk-mq-debugfs.h
+++ b/block/blk-mq-debugfs.h
@@ -31,6 +31,14 @@ void blk_mq_debugfs_unregister_sched(struct request_queue *q);
int blk_mq_debugfs_register_sched_hctx(struct request_queue *q,
struct blk_mq_hw_ctx *hctx);
void blk_mq_debugfs_unregister_sched_hctx(struct blk_mq_hw_ctx *hctx);
+bool debugfs_create_files(struct dentry *parent, void *data,
+ const struct blk_mq_debugfs_attr *attr);
+void debugfs_rq_show(struct seq_file *m, struct request *rq);
+
+static inline bool blk_mq_debugfs_enabled(struct request_queue *q)
+{
+ return !IS_ERR_OR_NULL(q->debugfs_dir);
+}
#else
static inline int blk_mq_debugfs_register(struct request_queue *q)
{
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index 0fb33abac3f6..1c8befbe7b69 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -15,6 +15,7 @@
#include "blk-mq-sched.h"
#include "blk-mq-tag.h"
#include "blk-wbt.h"
+#include "blk-io-hierarchy/stats.h"
void blk_mq_sched_free_hctx_data(struct request_queue *q,
void (*exit)(struct blk_mq_hw_ctx *))
@@ -250,6 +251,7 @@ int __blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
*/
if (!list_empty(&rq_list)) {
blk_mq_sched_mark_restart_hctx(hctx);
+ rq_list_hierarchy_end_io_acct(&rq_list, STAGE_HCTX);
if (blk_mq_dispatch_rq_list(q, &rq_list, false)) {
if (has_sched_dispatch)
ret = blk_mq_do_dispatch_sched(hctx);
@@ -389,10 +391,8 @@ bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio)
struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
bool ret = false;
- if (e && e->type->ops.mq.bio_merge) {
- blk_mq_put_ctx(ctx);
+ if (e && e->type->ops.mq.bio_merge)
return e->type->ops.mq.bio_merge(hctx, bio);
- }
if ((hctx->flags & BLK_MQ_F_SHOULD_MERGE) &&
!list_empty_careful(&ctx->rq_list)) {
@@ -402,7 +402,6 @@ bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio)
spin_unlock(&ctx->lock);
}
- blk_mq_put_ctx(ctx);
return ret;
}
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index bee92ab06a5e..f7b21d7f136e 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -13,6 +13,7 @@
#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-tag.h"
+#include "blk-io-hierarchy/stats.h"
bool blk_mq_has_free_tags(struct blk_mq_tags *tags)
{
@@ -113,7 +114,6 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
struct sbq_wait_state *ws;
DEFINE_WAIT(wait);
unsigned int tag_offset;
- bool drop_ctx;
int tag;
if (data->flags & BLK_MQ_REQ_RESERVED) {
@@ -135,8 +135,9 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
if (data->flags & BLK_MQ_REQ_NOWAIT)
return BLK_MQ_TAG_FAIL;
+ if (data->bio)
+ bio_hierarchy_start_io_acct(data->bio, STAGE_GETTAG);
ws = bt_wait_ptr(bt, data->hctx);
- drop_ctx = data->ctx == NULL;
do {
struct sbitmap_queue *bt_prev;
@@ -162,9 +163,6 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
if (tag != -1)
break;
- if (data->ctx)
- blk_mq_put_ctx(data->ctx);
-
bt_prev = bt;
io_schedule();
@@ -189,10 +187,9 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
ws = bt_wait_ptr(bt, data->hctx);
} while (1);
- if (drop_ctx && data->ctx)
- blk_mq_put_ctx(data->ctx);
-
finish_wait(&ws->wait, &wait);
+ if (data->bio)
+ bio_hierarchy_end_io_acct(data->bio, STAGE_GETTAG);
found_tag:
return tag + tag_offset;
diff --git a/block/blk-mq.c b/block/blk-mq.c
index aa4b3c608249..7f24ff0692d4 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -36,6 +36,7 @@
#include "blk-stat.h"
#include "blk-mq-sched.h"
#include "blk-rq-qos.h"
+#include "blk-io-hierarchy/stats.h"
static bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie);
static void blk_mq_poll_stats_start(struct request_queue *q);
@@ -368,6 +369,8 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
rq->part = NULL;
rq->start_time_ns = ktime_get_ns();
rq->io_start_time_ns = 0;
+ blk_mq_get_alloc_task(rq, data->bio);
+
rq->nr_phys_segments = 0;
#if defined(CONFIG_BLK_DEV_INTEGRITY)
rq->nr_integrity_segments = 0;
@@ -400,13 +403,13 @@ static struct request *blk_mq_get_request(struct request_queue *q,
struct elevator_queue *e = q->elevator;
struct request *rq;
unsigned int tag;
- bool put_ctx_on_error = false;
+ bool clear_ctx_on_error = false;
blk_queue_enter_live(q);
data->q = q;
if (likely(!data->ctx)) {
data->ctx = blk_mq_get_ctx(q);
- put_ctx_on_error = true;
+ clear_ctx_on_error = true;
}
if (likely(!data->hctx))
data->hctx = blk_mq_map_queue(q, data->ctx->cpu);
@@ -430,10 +433,8 @@ static struct request *blk_mq_get_request(struct request_queue *q,
tag = blk_mq_get_tag(data);
if (tag == BLK_MQ_TAG_FAIL) {
- if (put_ctx_on_error) {
- blk_mq_put_ctx(data->ctx);
+ if (clear_ctx_on_error)
data->ctx = NULL;
- }
blk_queue_exit(q);
return NULL;
}
@@ -470,8 +471,6 @@ struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
if (!rq)
return ERR_PTR(-EWOULDBLOCK);
- blk_mq_put_ctx(alloc_data.ctx);
-
rq->__data_len = 0;
rq->__sector = (sector_t) -1;
rq->bio = rq->biotail = NULL;
@@ -532,6 +531,8 @@ static void __blk_mq_free_request(struct request *rq)
struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
const int sched_tag = rq->internal_tag;
+ blk_mq_put_alloc_task(rq);
+
if (rq->tag != -1)
blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag);
if (sched_tag != -1)
@@ -583,6 +584,8 @@ inline void __blk_mq_end_request(struct request *rq, blk_status_t error)
blk_stat_add(rq, now);
}
+ if (blk_mq_request_started(rq))
+ rq_hierarchy_end_io_acct(rq, STAGE_RQ_DRIVER);
blk_account_io_done(rq, now);
if (rq->end_io) {
@@ -722,6 +725,7 @@ void blk_mq_start_request(struct request *rq)
blk_mq_sched_started_request(rq);
trace_block_rq_issue(q, rq);
+ rq_hierarchy_start_io_acct(rq, STAGE_RQ_DRIVER);
if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) {
rq->io_start_time_ns = ktime_get_ns();
@@ -762,6 +766,7 @@ static void __blk_mq_requeue_request(struct request *rq)
rq->rq_flags &= ~RQF_TIMED_OUT;
if (q->dma_drain_size && blk_rq_bytes(rq))
rq->nr_phys_segments--;
+ rq_hierarchy_end_io_acct(rq, STAGE_RQ_DRIVER);
}
}
@@ -787,6 +792,7 @@ static void blk_mq_requeue_work(struct work_struct *work)
spin_lock_irq(&q->requeue_lock);
list_splice_init(&q->requeue_list, &rq_list);
spin_unlock_irq(&q->requeue_lock);
+ rq_list_hierarchy_end_io_acct(&rq_list, STAGE_REQUEUE);
list_for_each_entry_safe(rq, next, &rq_list, queuelist) {
if (!(rq->rq_flags & (RQF_SOFTBARRIER | RQF_DONTPREP)))
@@ -826,6 +832,7 @@ void blk_mq_add_to_requeue_list(struct request *rq, bool at_head,
*/
BUG_ON(rq->rq_flags & RQF_SOFTBARRIER);
+ rq_hierarchy_start_io_acct(rq, STAGE_REQUEUE);
spin_lock_irqsave(&q->requeue_lock, flags);
if (at_head) {
rq->rq_flags |= RQF_SOFTBARRIER;
@@ -1317,6 +1324,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
if (!list_empty(list)) {
bool needs_restart;
+ rq_list_hierarchy_start_io_acct(list, STAGE_HCTX);
spin_lock(&hctx->lock);
list_splice_tail_init(list, &hctx->dispatch);
spin_unlock(&hctx->lock);
@@ -1726,6 +1734,7 @@ void blk_mq_request_bypass_insert(struct request *rq, bool at_head,
struct blk_mq_ctx *ctx = rq->mq_ctx;
struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(rq->q, ctx->cpu);
+ rq_hierarchy_start_io_acct(rq, STAGE_HCTX);
spin_lock(&hctx->lock);
if (at_head)
list_add(&rq->queuelist, &hctx->dispatch);
@@ -1792,6 +1801,8 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
if (rq->mq_ctx != this_ctx) {
if (this_ctx) {
trace_block_unplug(this_q, depth, !from_schedule);
+ rq_list_hierarchy_end_io_acct(&ctx_list,
+ STAGE_PLUG);
blk_mq_sched_insert_requests(this_q, this_ctx,
&ctx_list,
from_schedule);
@@ -1812,6 +1823,7 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
*/
if (this_ctx) {
trace_block_unplug(this_q, depth, !from_schedule);
+ rq_list_hierarchy_end_io_acct(&ctx_list, STAGE_PLUG);
blk_mq_sched_insert_requests(this_q, this_ctx, &ctx_list,
from_schedule);
}
@@ -1975,7 +1987,10 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
{
const int is_sync = op_is_sync(bio->bi_opf);
const int is_flush_fua = op_is_flush(bio->bi_opf);
- struct blk_mq_alloc_data data = { .flags = 0 };
+ struct blk_mq_alloc_data data = {
+ .flags = 0,
+ .bio = bio,
+ };
struct request *rq;
unsigned int request_count = 0;
struct blk_plug *plug;
@@ -2019,7 +2034,6 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
plug = current->plug;
if (unlikely(is_flush_fua)) {
- blk_mq_put_ctx(data.ctx);
blk_mq_bio_to_request(rq, bio);
/* bypass scheduler for flush rq */
@@ -2028,7 +2042,6 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
} else if (plug && q->nr_hw_queues == 1) {
struct request *last = NULL;
- blk_mq_put_ctx(data.ctx);
blk_mq_bio_to_request(rq, bio);
/*
@@ -2051,6 +2064,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
trace_block_plug(q);
}
+ rq_hierarchy_start_io_acct(rq, STAGE_PLUG);
list_add_tail(&rq->queuelist, &plug->mq_list);
} else if (plug && !blk_queue_nomerges(q)) {
blk_mq_bio_to_request(rq, bio);
@@ -2066,23 +2080,21 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
same_queue_rq = NULL;
if (same_queue_rq)
list_del_init(&same_queue_rq->queuelist);
+ rq_hierarchy_start_io_acct(rq, STAGE_PLUG);
list_add_tail(&rq->queuelist, &plug->mq_list);
- blk_mq_put_ctx(data.ctx);
-
if (same_queue_rq) {
data.hctx = blk_mq_map_queue(q,
same_queue_rq->mq_ctx->cpu);
+ rq_hierarchy_end_io_acct(same_queue_rq, STAGE_PLUG);
blk_mq_try_issue_directly(data.hctx, same_queue_rq,
&cookie);
}
} else if ((q->nr_hw_queues > 1 && is_sync) || (!q->elevator &&
!data.hctx->dispatch_busy)) {
- blk_mq_put_ctx(data.ctx);
blk_mq_bio_to_request(rq, bio);
blk_mq_try_issue_directly(data.hctx, rq, &cookie);
} else {
- blk_mq_put_ctx(data.ctx);
blk_mq_bio_to_request(rq, bio);
blk_mq_sched_insert_request(rq, false, true, true);
}
@@ -2324,6 +2336,7 @@ static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node)
if (list_empty(&tmp))
return 0;
+ rq_list_hierarchy_start_io_acct(&tmp, STAGE_HCTX);
spin_lock(&hctx->lock);
list_splice_tail_init(&tmp, &hctx->dispatch);
spin_unlock(&hctx->lock);
@@ -2758,6 +2771,9 @@ void blk_mq_release(struct request_queue *q)
struct blk_mq_hw_ctx *hctx, *next;
int i;
+ blk_mq_unregister_hierarchy(q, STAGE_BIO);
+ blk_io_hierarchy_stats_free(q);
+
queue_for_each_hw_ctx(q, hctx, i)
WARN_ON_ONCE(hctx && list_empty(&hctx->hctx_list));
@@ -2895,11 +2911,14 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
/* mark the queue as mq asap */
q->mq_ops = set->ops;
+ if (blk_io_hierarchy_stats_alloc(q))
+ goto err_exit;
+
q->poll_cb = blk_stat_alloc_callback(blk_mq_poll_stats_fn,
blk_mq_poll_stats_bkt,
BLK_MQ_POLL_STATS_BKTS, q);
if (!q->poll_cb)
- goto err_exit;
+ goto err_hierarchy_stats;
if (blk_mq_alloc_ctxs(q))
goto err_exit;
@@ -2972,6 +2991,8 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
q->nr_hw_queues = 0;
err_sys_init:
blk_mq_sysfs_deinit(q);
+err_hierarchy_stats:
+ blk_io_hierarchy_stats_free(q);
err_exit:
q->mq_ops = NULL;
return ERR_PTR(-ENOMEM);
diff --git a/block/blk-mq.h b/block/blk-mq.h
index c6ec9aa12fb2..1bba4eb18332 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -125,12 +125,7 @@ static inline struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q,
*/
static inline struct blk_mq_ctx *blk_mq_get_ctx(struct request_queue *q)
{
- return __blk_mq_get_ctx(q, get_cpu());
-}
-
-static inline void blk_mq_put_ctx(struct blk_mq_ctx *ctx)
-{
- put_cpu();
+ return __blk_mq_get_ctx(q, raw_smp_processor_id());
}
struct blk_mq_alloc_data {
@@ -142,6 +137,7 @@ struct blk_mq_alloc_data {
/* input & output parameter */
struct blk_mq_ctx *ctx;
struct blk_mq_hw_ctx *hctx;
+ struct bio *bio;
};
static inline struct blk_mq_tags *blk_mq_tags_from_data(struct blk_mq_alloc_data *data)
@@ -234,4 +230,32 @@ static inline void blk_mq_free_requests(struct list_head *list)
}
}
+static inline bool blk_mq_is_sbitmap_shared(unsigned int flags)
+{
+ return false;
+}
+
+#ifdef CONFIG_BLK_BIO_ALLOC_TASK
+static inline void blk_mq_get_alloc_task(struct request *rq, struct bio *bio)
+{
+ rq->pid = bio ? get_pid(bio->pid) : get_pid(task_pid(current));
+}
+
+static inline void blk_mq_put_alloc_task(struct request *rq)
+{
+ if (rq->pid) {
+ put_pid(rq->pid);
+ rq->pid = NULL;
+ }
+}
+#else
+static inline void blk_mq_get_alloc_task(struct request *rq, struct bio *bio)
+{
+}
+
+static inline void blk_mq_put_alloc_task(struct request *rq)
+{
+}
+#endif
+
#endif
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 1c4d795bbdc4..719687a394ea 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -17,6 +17,7 @@
#include "blk-mq.h"
#include "blk-mq-debugfs.h"
#include "blk-wbt.h"
+#include "blk-io-hierarchy/stats.h"
struct queue_sysfs_entry {
struct attribute attr;
@@ -924,6 +925,19 @@ struct kobj_type blk_queue_ktype = {
.release = blk_release_queue,
};
+static void blk_mq_register_default_hierarchy(struct request_queue *q)
+{
+ if (!q->mq_ops)
+ return;
+
+ blk_mq_register_hierarchy(q, STAGE_GETTAG);
+ blk_mq_register_hierarchy(q, STAGE_PLUG);
+ blk_mq_register_hierarchy(q, STAGE_HCTX);
+ blk_mq_register_hierarchy(q, STAGE_REQUEUE);
+ blk_mq_register_hierarchy(q, STAGE_RQ_DRIVER);
+ blk_mq_register_hierarchy(q, STAGE_BIO);
+}
+
/**
* blk_register_queue - register a block layer queue with sysfs
* @disk: Disk of which the request queue should be registered with sysfs.
@@ -973,6 +987,8 @@ int blk_register_queue(struct gendisk *disk)
has_elevator = true;
}
+ blk_mq_register_default_hierarchy(q);
+
blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q);
wbt_enable_default(q);
blk_throtl_register_queue(q);
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 598191286557..446864c27c3b 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -14,6 +14,7 @@
#include <linux/sched/signal.h>
#include <linux/delay.h>
#include "blk.h"
+#include "blk-io-hierarchy/stats.h"
/* Max dispatch from a group in 1 round */
static int throtl_grp_quantum = 8;
@@ -1350,6 +1351,8 @@ static void blk_throtl_dispatch_work_fn(struct work_struct *work)
bio_list_add(&bio_list_on_stack, bio);
spin_unlock_irq(q->queue_lock);
+ bio_list_hierarchy_end_io_acct(&bio_list_on_stack, STAGE_THROTTLE);
+
if (!bio_list_empty(&bio_list_on_stack)) {
blk_start_plug(&plug);
while((bio = bio_list_pop(&bio_list_on_stack)))
@@ -2333,6 +2336,20 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
tg->last_low_overflow_time[rw] = jiffies;
+ /*
+ * This is the slow path now, and bio_hierarchy_start_io_acct() might
+ * spend some time allocating memory. However, it's safe because 'tg' is
+ * pinned by this bio, and the io charge is still accurate because the
+ * slice has already been started from tg_may_dispatch().
+ */
+ spin_unlock_irq(q->queue_lock);
+ rcu_read_unlock();
+
+ bio_hierarchy_start_io_acct(bio, STAGE_THROTTLE);
+
+ rcu_read_lock();
+ spin_lock_irq(q->queue_lock);
+
td->nr_queued[rw]++;
throtl_add_bio_tg(bio, qn, tg);
throttled = true;
@@ -2561,6 +2578,8 @@ void blk_throtl_exit(struct request_queue *q)
del_timer_sync(&q->td->service_queue.pending_timer);
throtl_shutdown_wq(q);
blkcg_deactivate_policy(q, &blkcg_policy_throtl);
+ blk_mq_unregister_hierarchy(q, STAGE_THROTTLE);
+
free_percpu(q->td->latency_buckets[READ]);
free_percpu(q->td->latency_buckets[WRITE]);
kfree(q->td);
@@ -2593,6 +2612,8 @@ void blk_throtl_register_queue(struct request_queue *q)
td->track_bio_latency = !queue_is_rq_based(q);
if (!td->track_bio_latency)
blk_stat_enable_accounting(q);
+
+ blk_mq_register_hierarchy(q, STAGE_THROTTLE);
}
#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index 94b5eff0cd3a..87d7816af6e0 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -29,6 +29,7 @@
#include "blk-wbt.h"
#include "blk-rq-qos.h"
+#include "blk-io-hierarchy/stats.h"
#define CREATE_TRACE_POINTS
#include <trace/events/wbt.h>
@@ -532,11 +533,12 @@ static int wbt_wake_function(struct wait_queue_entry *curr, unsigned int mode,
* Block if we will exceed our limit, or if we are currently waiting for
* the timer to kick off queuing again.
*/
-static void __wbt_wait(struct rq_wb *rwb, enum wbt_flags wb_acct,
- unsigned long rw, spinlock_t *lock)
+static void __wbt_wait(struct rq_wb *rwb, struct bio *bio,
+ enum wbt_flags wb_acct, spinlock_t *lock)
__releases(lock)
__acquires(lock)
{
+ unsigned long rw = bio->bi_opf;
struct rq_wait *rqw = get_rq_wait(rwb, wb_acct);
struct wbt_wait_data data = {
.wq = {
@@ -554,6 +556,7 @@ static void __wbt_wait(struct rq_wb *rwb, enum wbt_flags wb_acct,
if (!has_sleeper && rq_wait_inc_below(rqw, get_limit(rwb, rw)))
return;
+ bio_hierarchy_start_io_acct(bio, STAGE_WBT);
has_sleeper = !__prepare_to_wait_exclusive(&rqw->wait, &data.wq,
TASK_UNINTERRUPTIBLE);
do {
@@ -588,6 +591,7 @@ static void __wbt_wait(struct rq_wb *rwb, enum wbt_flags wb_acct,
} while (1);
finish_wait(&rqw->wait, &data.wq);
+ bio_hierarchy_end_io_acct(bio, STAGE_WBT);
}
static inline bool wbt_should_throttle(struct rq_wb *rwb, struct bio *bio)
@@ -652,7 +656,7 @@ static void wbt_wait(struct rq_qos *rqos, struct bio *bio, spinlock_t *lock)
return;
}
- __wbt_wait(rwb, flags, bio->bi_opf, lock);
+ __wbt_wait(rwb, bio, flags, lock);
if (!blk_stat_is_active(rwb->cb))
rwb_arm_timer(rwb);
@@ -770,6 +774,7 @@ static void wbt_exit(struct rq_qos *rqos)
struct rq_wb *rwb = RQWB(rqos);
struct request_queue *q = rqos->q;
+ blk_mq_unregister_hierarchy(q, STAGE_WBT);
blk_stat_remove_callback(q, rwb->cb);
blk_stat_free_callback(rwb->cb);
kfree(rwb);
@@ -845,6 +850,7 @@ int wbt_init(struct request_queue *q)
rwb->min_lat_nsec = wbt_default_latency_nsec(q);
wbt_set_queue_depth(q, blk_queue_depth(q));
+ blk_mq_register_hierarchy(q, STAGE_WBT);
blk_mq_unfreeze_queue(q);
wbt_set_write_cache(q, test_bit(QUEUE_FLAG_WC, &q->queue_flags));
diff --git a/block/blk.h b/block/blk.h
index 965e9c507654..162b42388610 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -175,6 +175,51 @@ static inline void blk_queue_enter_live(struct request_queue *q)
percpu_ref_get(&q->q_usage_counter);
}
+#ifdef CONFIG_BLK_BIO_ALLOC_TIME
+static inline u64 blk_time_get_ns(void);
+static inline void blk_rq_init_bi_alloc_time(struct request *rq,
+ struct request *first_rq)
+{
+ rq->bi_alloc_time_ns = first_rq ? first_rq->bi_alloc_time_ns :
+ blk_time_get_ns();
+}
+
+/*
+ * Used in the following cases to update the request bi_alloc_time_ns:
+ *
+ * 1) Allocate a new @rq for @bio;
+ * 2) @bio is merged to @rq, in this case @merged_rq should be NULL;
+ * 3) @merged_rq is merged to @rq, in this case @bio should be NULL;
+ */
+static inline void blk_rq_update_bi_alloc_time(struct request *rq,
+ struct bio *bio,
+ struct request *merged_rq)
+{
+ if (bio) {
+ if (rq->bi_alloc_time_ns > bio->bi_alloc_time_ns)
+ rq->bi_alloc_time_ns = bio->bi_alloc_time_ns;
+ return;
+ }
+
+ if (!merged_rq)
+ return;
+
+ if (rq->bi_alloc_time_ns > merged_rq->bi_alloc_time_ns)
+ rq->bi_alloc_time_ns = merged_rq->bi_alloc_time_ns;
+}
+#else /* CONFIG_BLK_BIO_ALLOC_TIME */
+static inline void blk_rq_init_bi_alloc_time(struct request *rq,
+ struct request *first_rq)
+{
+}
+
+static inline void blk_rq_update_bi_alloc_time(struct request *rq,
+ struct bio *bio,
+ struct request *merged_rq)
+{
+}
+#endif
+
#ifdef CONFIG_BLK_DEV_INTEGRITY
void blk_flush_integrity(void);
bool __bio_integrity_endio(struct bio *);
@@ -479,4 +524,17 @@ static inline void blk_free_queue_dispatch_async(struct request_queue *q)
}
#endif
+static inline u64 blk_time_get_ns(void)
+{
+ struct blk_plug *plug = current->plug;
+
+ if (!plug || !in_task())
+ return ktime_get_ns();
+
+ if (!plug->cur_ktime)
+ plug->cur_ktime = ktime_get_ns();
+
+ return plug->cur_ktime;
+}
+
#endif /* BLK_INTERNAL_H */
diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c
index 833e9eaae640..04ff97c076fb 100644
--- a/block/kyber-iosched.c
+++ b/block/kyber-iosched.c
@@ -30,6 +30,7 @@
#include "blk-mq-sched.h"
#include "blk-mq-tag.h"
#include "blk-stat.h"
+#include "blk-io-hierarchy/stats.h"
/* Scheduling domains. */
enum {
@@ -365,6 +366,7 @@ static int kyber_init_sched(struct request_queue *q, struct elevator_type *e)
blk_stat_add_callback(q, kqd->cb);
+ blk_mq_register_hierarchy(q, STAGE_KYBER);
return 0;
}
@@ -374,6 +376,7 @@ static void kyber_exit_sched(struct elevator_queue *e)
struct request_queue *q = kqd->q;
int i;
+ blk_mq_unregister_hierarchy(kqd->q, STAGE_KYBER);
blk_stat_remove_callback(q, kqd->cb);
for (i = 0; i < KYBER_NUM_DOMAINS; i++)
@@ -517,7 +520,6 @@ static bool kyber_bio_merge(struct blk_mq_hw_ctx *hctx_q, struct bio *bio)
spin_lock(&kcq->lock);
merged = blk_mq_bio_list_merge(hctx->queue, rq_list, bio);
spin_unlock(&kcq->lock);
- blk_mq_put_ctx(ctx);
return merged;
}
@@ -533,6 +535,7 @@ static void kyber_insert_requests(struct blk_mq_hw_ctx *hctx,
struct kyber_hctx_data *khd = hctx->sched_data;
struct request *rq, *next;
+ rq_list_hierarchy_start_io_acct(rq_list, STAGE_KYBER);
list_for_each_entry_safe(rq, next, rq_list, queuelist) {
unsigned int sched_domain = kyber_sched_domain(rq->cmd_flags);
struct kyber_ctx_queue *kcq = &khd->kcqs[rq->mq_ctx->index_hw];
@@ -772,6 +775,9 @@ static struct request *kyber_dispatch_request(struct blk_mq_hw_ctx *hctx)
rq = NULL;
out:
spin_unlock(&khd->lock);
+
+ if (rq)
+ rq_hierarchy_end_io_acct(rq, STAGE_KYBER);
return rq;
}
diff --git a/block/mq-deadline.c b/block/mq-deadline.c
index 7ad820050675..aa51abb3eaa4 100644
--- a/block/mq-deadline.c
+++ b/block/mq-deadline.c
@@ -22,6 +22,7 @@
#include "blk-mq-debugfs.h"
#include "blk-mq-tag.h"
#include "blk-mq-sched.h"
+#include "blk-io-hierarchy/stats.h"
/*
* See Documentation/block/deadline-iosched.txt
@@ -61,6 +62,8 @@ struct deadline_data {
spinlock_t lock;
spinlock_t zone_lock;
struct list_head dispatch;
+
+ struct request_queue *q;
};
static inline struct rb_root *
@@ -386,6 +389,8 @@ static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
rq = __dd_dispatch_request(dd);
spin_unlock(&dd->lock);
+ if (rq)
+ rq_hierarchy_end_io_acct(rq, STAGE_DEADLINE);
return rq;
}
@@ -396,6 +401,7 @@ static void dd_exit_queue(struct elevator_queue *e)
BUG_ON(!list_empty(&dd->fifo_list[READ]));
BUG_ON(!list_empty(&dd->fifo_list[WRITE]));
+ blk_mq_unregister_hierarchy(dd->q, STAGE_DEADLINE);
kfree(dd);
}
@@ -427,11 +433,13 @@ static int dd_init_queue(struct request_queue *q, struct elevator_type *e)
dd->writes_starved = writes_starved;
dd->front_merges = 1;
dd->fifo_batch = fifo_batch;
+ dd->q = q;
spin_lock_init(&dd->lock);
spin_lock_init(&dd->zone_lock);
INIT_LIST_HEAD(&dd->dispatch);
q->elevator = eq;
+ blk_mq_register_hierarchy(q, STAGE_DEADLINE);
return 0;
}
@@ -469,8 +477,10 @@ static bool dd_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio)
ret = blk_mq_sched_try_merge(q, bio, &free);
spin_unlock(&dd->lock);
- if (free)
+ if (free) {
+ rq_hierarchy_end_io_acct(free, STAGE_DEADLINE);
blk_mq_free_request(free);
+ }
return ret;
}
@@ -493,6 +503,7 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
blk_req_zone_write_unlock(rq);
if (blk_mq_sched_try_insert_merge(q, rq, &free)) {
+ rq_list_hierarchy_end_io_acct(&free, STAGE_DEADLINE);
blk_mq_free_requests(&free);
return;
}
@@ -527,6 +538,8 @@ static void dd_insert_requests(struct blk_mq_hw_ctx *hctx,
struct request_queue *q = hctx->queue;
struct deadline_data *dd = q->elevator->elevator_data;
+ rq_list_hierarchy_start_io_acct(list, STAGE_DEADLINE);
+
spin_lock(&dd->lock);
while (!list_empty(list)) {
struct request *rq;
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 8075b9955bb3..c2867571bcc7 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -203,6 +203,9 @@ struct bio {
struct bio_set *bi_pool;
+ u64 bi_alloc_time_ns;
+ struct bio_hierarchy_data *hdata;
+ struct pid *pid;
KABI_RESERVE(1)
KABI_RESERVE(2)
KABI_RESERVE(3)
@@ -234,6 +237,13 @@ struct bio {
* of this bio. */
#define BIO_QUEUE_ENTERED 11 /* can use blk_queue_enter_live() */
#define BIO_TRACKED 12 /* set if bio goes through the rq_qos path */
+#ifdef CONFIG_BLK_IO_HIERARCHY_STATS
+#define BIO_HAS_DATA 13 /* bio contains data. */
+#define BIO_HIERARCHY_ACCT 14 /*
+ * This bio has already been accounted by
+ * blk-io-hierarchy, don't account it again.
+ */
+#endif
/* See BVEC_POOL_OFFSET below before adding new flags */
@@ -368,7 +378,36 @@ enum stat_group {
STAT_WRITE,
STAT_DISCARD,
- NR_STAT_GROUPS
+ NR_STAT_GROUPS,
+ STAT_FLUSH = NR_STAT_GROUPS,
+ NEW_NR_STAT_GROUPS,
+};
+
+enum stage_group {
+#ifdef CONFIG_BLK_DEV_THROTTLING
+ STAGE_THROTTLE,
+#endif
+#ifdef CONFIG_BLK_WBT
+ STAGE_WBT,
+#endif
+ STAGE_GETTAG,
+ NR_BIO_STAGE_GROUPS,
+ STAGE_PLUG = NR_BIO_STAGE_GROUPS,
+#if IS_ENABLED(CONFIG_MQ_IOSCHED_DEADLINE)
+ STAGE_DEADLINE,
+#endif
+#if IS_ENABLED(CONFIG_IOSCHED_BFQ)
+ STAGE_BFQ,
+#endif
+#if IS_ENABLED(CONFIG_MQ_IOSCHED_KYBER)
+ STAGE_KYBER,
+#endif
+ STAGE_HCTX,
+ STAGE_REQUEUE,
+ STAGE_RQ_DRIVER,
+ NR_RQ_STAGE_GROUPS,
+ STAGE_BIO = NR_RQ_STAGE_GROUPS,
+ NR_STAGE_GROUPS,
};
#define bio_op(bio) \
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index c848f4205729..713c42987851 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -277,6 +277,12 @@ struct request {
#ifdef CONFIG_BLK_CGROUP
struct request_list *rl; /* rl this rq is alloced from */
#endif
+
+ enum stage_group stage;
+ unsigned long hierarchy_time;
+ u64 io_end_time_ns;
+ u64 bi_alloc_time_ns;
+ struct pid *pid;
};
static inline bool blk_op_is_scsi(unsigned int op)
@@ -703,6 +709,8 @@ struct request_queue {
#define BLK_MAX_WRITE_HINTS 5
u64 write_hints[BLK_MAX_WRITE_HINTS];
+
+ struct blk_io_hierarchy_stats *io_hierarchy_stats;
};
#define QUEUE_FLAG_QUEUED 0 /* uses generic tag queueing */
@@ -1372,6 +1380,7 @@ struct blk_plug {
struct list_head list; /* requests */
struct list_head mq_list; /* blk-mq requests */
struct list_head cb_list; /* md requires an unplug callback */
+ u64 cur_ktime;
};
#define BLK_MAX_REQUEST_COUNT 16
#define BLK_PLUG_FLUSH_SIZE (128 * 1024)
--
2.39.2
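To make the intended usage of the new hooks concrete: a request enters a stage through rq_hierarchy_start_io_acct() (or the rq_list_* variant) and leaves it through rq_hierarchy_end_io_acct(), and the bio based helpers pair up the same way. Below is a simplified sketch modeled on the mq-deadline hunks above; the my_sched_* names are hypothetical and the sketch is an illustration only, not part of the patch.
/* Hypothetical elevator callbacks showing the start/end pairing for a
 * request based stage (here STAGE_DEADLINE, as in the mq-deadline hunks). */
static void my_sched_insert_requests(struct blk_mq_hw_ctx *hctx,
                                     struct list_head *list)
{
        /* requests start waiting in this stage */
        rq_list_hierarchy_start_io_acct(list, STAGE_DEADLINE);

        /* ... move the requests onto the scheduler's internal lists ... */
}

static struct request *my_sched_dispatch_request(struct blk_mq_hw_ctx *hctx)
{
        struct request *rq = NULL;

        /* ... pick a request from the scheduler's internal lists ... */

        if (rq)
                /* the wait in this stage ends once the request is dispatched */
                rq_hierarchy_end_io_acct(rq, STAGE_DEADLINE);
        return rq;
}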

[PATCH OLK-5.10] Bluetooth: qca: Fix BT enable failure again for QCA6390 after warm reboot
by liwei 05 Aug '24
From: Zijun Hu <quic_zijuhu(a)quicinc.com>
mainline inclusion
from mainline-v6.10-rc7
commit 88e72239ead9814b886db54fc4ee39ef3c2b8f26
category: bugfix
bugzilla: https://gitee.com/src-openeuler/kernel/issues/IAGRQD
CVE: CVE-2024-42137
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?…
--------------------------------
Commit 272970be3dab ("Bluetooth: hci_qca: Fix driver shutdown on closed
serdev") causes the following regression: BT cannot be enabled after the
steps below if the property enable-gpios is not configured within DT|ACPI
for QCA6390:
cold boot -> enable BT -> disable BT -> warm reboot -> BT enable failure
That commit fixes a use-after-free issue within qca_serdev_shutdown() by
adding a condition to avoid the serdev being flushed or written to after
it is closed, but it also introduces this regression for the steps above,
since the VSC is no longer sent to reset the controller during a warm
reboot.
Fix this by sending the VSC to reset the controller within
qca_serdev_shutdown() if BT was ever enabled; the use-after-free issue is
also fixed by this change, since the serdev is still open when it is
flushed or written to.
Verified on the reporter's machine, a Dell XPS 13 9310 laptop, against the
two kernel commits below:
commit e00fc2700a3f ("Bluetooth: btusb: Fix triggering coredump
implementation for QCA") of bluetooth-next tree.
commit b23d98d46d28 ("Bluetooth: btusb: Fix triggering coredump
implementation for QCA") of linus mainline tree.
Fixes: 272970be3dab ("Bluetooth: hci_qca: Fix driver shutdown on closed serdev")
Cc: stable(a)vger.kernel.org
Reported-by: Wren Turkal <wt(a)penguintechs.org>
Closes: https://bugzilla.kernel.org/show_bug.cgi?id=218726
Signed-off-by: Zijun Hu <quic_zijuhu(a)quicinc.com>
Tested-by: Wren Turkal <wt(a)penguintechs.org>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz(a)intel.com>
Signed-off-by: dengquan <dengquan9(a)huawei.com>
---
drivers/bluetooth/hci_qca.c | 18 +++++++++++++++---
1 file changed, 15 insertions(+), 3 deletions(-)
diff --git a/drivers/bluetooth/hci_qca.c b/drivers/bluetooth/hci_qca.c
index 6e0c0762fbab..24cfc552e6a8 100644
--- a/drivers/bluetooth/hci_qca.c
+++ b/drivers/bluetooth/hci_qca.c
@@ -2076,15 +2076,27 @@ static void qca_serdev_shutdown(struct device *dev)
struct qca_serdev *qcadev = serdev_device_get_drvdata(serdev);
struct hci_uart *hu = &qcadev->serdev_hu;
struct hci_dev *hdev = hu->hdev;
- struct qca_data *qca = hu->priv;
const u8 ibs_wake_cmd[] = { 0xFD };
const u8 edl_reset_soc_cmd[] = { 0x01, 0x00, 0xFC, 0x01, 0x05 };
if (qcadev->btsoc_type == QCA_QCA6390) {
- if (test_bit(QCA_BT_OFF, &qca->flags) ||
- !test_bit(HCI_RUNNING, &hdev->flags))
+ /* The purpose of sending the VSC is to reset the SoC into an initial
+ * state, and that state ensures the next hdev->setup() succeeds.
+ * If HCI_QUIRK_NON_PERSISTENT_SETUP is set, it means that
+ * hdev->setup() can do its job regardless of the SoC state,
+ * so there is no need to send the VSC.
+ * If HCI_SETUP is set, it means that hdev->setup() was never
+ * invoked and the SoC is already in the initial state, so the
+ * VSC is not needed in that case either.
+ */
+ if (test_bit(HCI_QUIRK_NON_PERSISTENT_SETUP, &hdev->quirks) ||
+ hci_dev_test_flag(hdev, HCI_SETUP))
return;
+ /* The serdev must be in the open state when control logic arrives
+ * here, which also fixes the use-after-free issue caused by the
+ * serdev being flushed or written to after it is closed.
+ */
serdev_device_write_flush(serdev);
ret = serdev_device_write_buf(serdev, ibs_wake_cmd,
sizeof(ibs_wake_cmd));
--
2.25.1
From: JunBin Li <lijunbin4(a)huawei.com>
virtCCA feature
JunBin Li (1):
vfio modify
include/uapi/linux/vfio.h | 3 +++
1 file changed, 3 insertions(+)
--
2.25.1
From: JunBin Li <lijunbin4(a)huawei.com>
virtCCA feature
JunBin Li (1):
vfio modify
drivers/vfio/pci/vfio_pci_rdwr.c | 61 ++++++++++++++++++++++++++++++++
1 file changed, 61 insertions(+)
--
2.25.1

[PATCH OLK-5.10] drm/shmem-helper: Fix BUG_ON() on mmap(PROT_WRITE, MAP_PRIVATE)
by Pu Lehui 05 Aug '24
From: "Wachowski, Karol" <karol.wachowski(a)intel.com>
mainline inclusion
from mainline-v6.10-rc2
commit 39bc27bd688066a63e56f7f64ad34fae03fbe3b8
category: bugfix
bugzilla: https://gitee.com/src-openeuler/kernel/issues/IACS4Z
CVE: CVE-2024-39497
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?…
--------------------------------
Lack of check for copy-on-write (COW) mapping in drm_gem_shmem_mmap
allows users to call mmap with PROT_WRITE and MAP_PRIVATE flag
causing a kernel panic due to BUG_ON in vmf_insert_pfn_prot:
BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
Return -EINVAL early if COW mapping is detected.
This bug affects all drm drivers using default shmem helpers.
It can be reproduced by this simple example:
void *ptr = mmap(0, size, PROT_WRITE, MAP_PRIVATE, fd, mmap_offset);
ptr[0] = 0;
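For reference, a more self-contained version of the reproducer is sketched below. It is an illustration only: the device node, buffer size and the dumb-buffer ioctls are assumptions about a typical shmem-helper based driver, not part of the report, and header locations and required privileges may differ between systems.
/* Hypothetical standalone reproducer sketch; assumes /dev/dri/card0 is
 * backed by a driver using the shmem GEM helpers and supporting dumb
 * buffers, and that no display server is holding the device. */
#include <fcntl.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <unistd.h>
#include <drm/drm.h>

int main(void)
{
        struct drm_mode_create_dumb create = { .width = 64, .height = 64, .bpp = 32 };
        struct drm_mode_map_dumb map = { 0 };
        int fd = open("/dev/dri/card0", O_RDWR);
        char *ptr;

        if (fd < 0)
                return 1;
        if (ioctl(fd, DRM_IOCTL_MODE_CREATE_DUMB, &create))
                return 1;
        map.handle = create.handle;
        if (ioctl(fd, DRM_IOCTL_MODE_MAP_DUMB, &map))
                return 1;

        /* PROT_WRITE + MAP_PRIVATE is the problematic combination */
        ptr = mmap(0, create.size, PROT_WRITE, MAP_PRIVATE, fd, (off_t)map.offset);
        if (ptr == MAP_FAILED)
                return 1;
        ptr[0] = 0;     /* faults; triggers the BUG_ON() on unpatched kernels */
        return 0;
}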
Fixes: 2194a63a818d ("drm: Add library for shmem backed GEM objects")
Cc: Noralf Trønnes <noralf(a)tronnes.org>
Cc: Eric Anholt <eric(a)anholt.net>
Cc: Rob Herring <robh(a)kernel.org>
Cc: Maarten Lankhorst <maarten.lankhorst(a)linux.intel.com>
Cc: Maxime Ripard <mripard(a)kernel.org>
Cc: Thomas Zimmermann <tzimmermann(a)suse.de>
Cc: David Airlie <airlied(a)gmail.com>
Cc: Daniel Vetter <daniel(a)ffwll.ch>
Cc: dri-devel(a)lists.freedesktop.org
Cc: <stable(a)vger.kernel.org> # v5.2+
Signed-off-by: Wachowski, Karol <karol.wachowski(a)intel.com>
Signed-off-by: Jacek Lawrynowicz <jacek.lawrynowicz(a)linux.intel.com>
Signed-off-by: Daniel Vetter <daniel.vetter(a)ffwll.ch>
Link: https://patchwork.freedesktop.org/patch/msgid/20240520100514.925681-1-jacek…
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Conflicts:
drivers/gpu/drm/drm_gem_shmem_helper.c
[The conflicts were due to not merge commit 21aa27ddc582]
Signed-off-by: Pu Lehui <pulehui(a)huawei.com>
---
drivers/gpu/drm/drm_gem_shmem_helper.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/drivers/gpu/drm/drm_gem_shmem_helper.c b/drivers/gpu/drm/drm_gem_shmem_helper.c
index e8f07305e279..0422c557755c 100644
--- a/drivers/gpu/drm/drm_gem_shmem_helper.c
+++ b/drivers/gpu/drm/drm_gem_shmem_helper.c
@@ -630,6 +630,9 @@ int drm_gem_shmem_mmap(struct drm_gem_object *obj, struct vm_area_struct *vma)
return ret;
}
+ if (is_cow_mapping(vma->vm_flags))
+ return -EINVAL;
+
shmem = to_drm_gem_shmem_obj(obj);
ret = drm_gem_shmem_get_pages(shmem);
--
2.34.1

[PATCH openEuler-22.03-LTS-SP1] drm/shmem-helper: Fix BUG_ON() on mmap(PROT_WRITE, MAP_PRIVATE)
by Pu Lehui 05 Aug '24
From: "Wachowski, Karol" <karol.wachowski(a)intel.com>
mainline inclusion
from mainline-v6.10-rc2
commit 39bc27bd688066a63e56f7f64ad34fae03fbe3b8
category: bugfix
bugzilla: https://gitee.com/src-openeuler/kernel/issues/IACS4Z
CVE: CVE-2024-39497
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?…
--------------------------------
Lack of check for copy-on-write (COW) mapping in drm_gem_shmem_mmap
allows users to call mmap with PROT_WRITE and MAP_PRIVATE flag
causing a kernel panic due to BUG_ON in vmf_insert_pfn_prot:
BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
Return -EINVAL early if COW mapping is detected.
This bug affects all drm drivers using default shmem helpers.
It can be reproduced by this simple example:
void *ptr = mmap(0, size, PROT_WRITE, MAP_PRIVATE, fd, mmap_offset);
ptr[0] = 0;
Fixes: 2194a63a818d ("drm: Add library for shmem backed GEM objects")
Cc: Noralf Trønnes <noralf(a)tronnes.org>
Cc: Eric Anholt <eric(a)anholt.net>
Cc: Rob Herring <robh(a)kernel.org>
Cc: Maarten Lankhorst <maarten.lankhorst(a)linux.intel.com>
Cc: Maxime Ripard <mripard(a)kernel.org>
Cc: Thomas Zimmermann <tzimmermann(a)suse.de>
Cc: David Airlie <airlied(a)gmail.com>
Cc: Daniel Vetter <daniel(a)ffwll.ch>
Cc: dri-devel(a)lists.freedesktop.org
Cc: <stable(a)vger.kernel.org> # v5.2+
Signed-off-by: Wachowski, Karol <karol.wachowski(a)intel.com>
Signed-off-by: Jacek Lawrynowicz <jacek.lawrynowicz(a)linux.intel.com>
Signed-off-by: Daniel Vetter <daniel.vetter(a)ffwll.ch>
Link: https://patchwork.freedesktop.org/patch/msgid/20240520100514.925681-1-jacek…
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Conflicts:
drivers/gpu/drm/drm_gem_shmem_helper.c
[The conflicts were due to not merge commit 21aa27ddc582]
Signed-off-by: Pu Lehui <pulehui(a)huawei.com>
---
drivers/gpu/drm/drm_gem_shmem_helper.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/drivers/gpu/drm/drm_gem_shmem_helper.c b/drivers/gpu/drm/drm_gem_shmem_helper.c
index cfacce0418a4..46ebde9c7282 100644
--- a/drivers/gpu/drm/drm_gem_shmem_helper.c
+++ b/drivers/gpu/drm/drm_gem_shmem_helper.c
@@ -613,6 +613,9 @@ int drm_gem_shmem_mmap(struct drm_gem_object *obj, struct vm_area_struct *vma)
return dma_buf_mmap(obj->dma_buf, vma, 0);
}
+ if (is_cow_mapping(vma->vm_flags))
+ return -EINVAL;
+
shmem = to_drm_gem_shmem_obj(obj);
ret = drm_gem_shmem_get_pages(shmem);
--
2.34.1

[PATCH openEuler-1.0-LTS] ocfs2: fix DIO failure due to insufficient transaction credits
by heyujie 05 Aug '24
From: Jan Kara <jack(a)suse.cz>
mainline inclusion
from mainline-v6.10-rc6
commit be346c1a6eeb49d8fda827d2a9522124c2f72f36
category: bugfix
bugzilla: https://gitee.com/src-openeuler/kernel/issues/IAGENQ
CVE: CVE-2024-42077
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?…
--------------------------------
The code in ocfs2_dio_end_io_write() estimates number of necessary
transaction credits using ocfs2_calc_extend_credits(). This however does
not take into account that the IO could be arbitrarily large and can
contain arbitrary number of extents.
Extent tree manipulations do often extend the current transaction but not
in all of the cases. For example if we have only single block extents in
the tree, ocfs2_mark_extent_written() will end up calling
ocfs2_replace_extent_rec() all the time and we will never extend the
current transaction and eventually exhaust all the transaction credits if
the IO contains many single block extents. Once that happens a
WARN_ON(jbd2_handle_buffer_credits(handle) <= 0) is triggered in
jbd2_journal_dirty_metadata() and subsequently OCFS2 aborts in response to
this error. This was actually triggered by one of our customers on a
heavily fragmented OCFS2 filesystem.
To fix the issue make sure the transaction always has enough credits for
one extent insert before each call of ocfs2_mark_extent_written().
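Put differently, the per-extent loop keeps running as before, but the handle is topped up on every iteration instead of relying on the single up-front estimate. In simplified form (error handling as in the patch below):
        /* sketch of the fixed loop in ocfs2_dio_end_io_write() */
        list_for_each_entry(ue, &dwc->dw_zero_list, ue_node) {
                /* ensure enough credits remain for one more extent insert */
                ret = ocfs2_assure_trans_credits(handle, credits);
                if (ret < 0) {
                        mlog_errno(ret);
                        break;
                }
                /* ... then mark the next unwritten extent written, as before ... */
        }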
Heming Zhao said:
------
PANIC: "Kernel panic - not syncing: OCFS2: (device dm-1): panic forced after error"
PID: xxx TASK: xxxx CPU: 5 COMMAND: "SubmitThread-CA"
#0 machine_kexec at ffffffff8c069932
#1 __crash_kexec at ffffffff8c1338fa
#2 panic at ffffffff8c1d69b9
#3 ocfs2_handle_error at ffffffffc0c86c0c [ocfs2]
#4 __ocfs2_abort at ffffffffc0c88387 [ocfs2]
#5 ocfs2_journal_dirty at ffffffffc0c51e98 [ocfs2]
#6 ocfs2_split_extent at ffffffffc0c27ea3 [ocfs2]
#7 ocfs2_change_extent_flag at ffffffffc0c28053 [ocfs2]
#8 ocfs2_mark_extent_written at ffffffffc0c28347 [ocfs2]
#9 ocfs2_dio_end_io_write at ffffffffc0c2bef9 [ocfs2]
Link: https://lkml.kernel.org/r/20240617095543.6971-1-jack@suse.cz
Link: https://lkml.kernel.org/r/20240614145243.8837-1-jack@suse.cz
Fixes: c15471f79506 ("ocfs2: fix sparse file & data ordering issue in direct io")
Signed-off-by: Jan Kara <jack(a)suse.cz>
Reviewed-by: Joseph Qi <joseph.qi(a)linux.alibaba.com>
Reviewed-by: Heming Zhao <heming.zhao(a)suse.com>
Cc: Mark Fasheh <mark(a)fasheh.com>
Cc: Joel Becker <jlbec(a)evilplan.org>
Cc: Junxiao Bi <junxiao.bi(a)oracle.com>
Cc: Changwei Ge <gechangwei(a)live.cn>
Cc: Gang He <ghe(a)suse.com>
Cc: Jun Piao <piaojun(a)huawei.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
Signed-off-by: He Yujie <coka.heyujie(a)huawei.com>
---
fs/ocfs2/aops.c | 5 +++++
fs/ocfs2/journal.c | 17 +++++++++++++++++
fs/ocfs2/journal.h | 2 ++
fs/ocfs2/ocfs2_trace.h | 2 ++
4 files changed, 26 insertions(+)
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 543efa3e5655..d0c5e0bcdb1f 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -2382,6 +2382,11 @@ static int ocfs2_dio_end_io_write(struct inode *inode,
}
list_for_each_entry(ue, &dwc->dw_zero_list, ue_node) {
+ ret = ocfs2_assure_trans_credits(handle, credits);
+ if (ret < 0) {
+ mlog_errno(ret);
+ break;
+ }
ret = ocfs2_mark_extent_written(inode, &et, handle,
ue->ue_cpos, 1,
ue->ue_phys,
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 08ca85613418..1f2ce86136a8 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -463,6 +463,23 @@ int ocfs2_extend_trans(handle_t *handle, int nblocks)
return status;
}
+/*
+ * Make sure handle has at least 'nblocks' credits available. If it does not
+ * have that many credits available, we will try to extend the handle to have
+ * enough credits. If that fails, we will restart transaction to have enough
+ * credits. Similar notes regarding data consistency and locking implications
+ * as for ocfs2_extend_trans() apply here.
+ */
+int ocfs2_assure_trans_credits(handle_t *handle, int nblocks)
+{
+ int old_nblks = jbd2_handle_buffer_credits(handle);
+
+ trace_ocfs2_assure_trans_credits(old_nblks);
+ if (old_nblks >= nblocks)
+ return 0;
+ return ocfs2_extend_trans(handle, nblocks - old_nblks);
+}
+
/*
* If we have fewer than thresh credits, extend by OCFS2_MAX_TRANS_DATA.
* If that fails, restart the transaction & regain write access for the
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 497a4171ef61..654054e558cc 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -258,6 +258,8 @@ handle_t *ocfs2_start_trans(struct ocfs2_super *osb,
int ocfs2_commit_trans(struct ocfs2_super *osb,
handle_t *handle);
int ocfs2_extend_trans(handle_t *handle, int nblocks);
+int ocfs2_assure_trans_credits(handle_t *handle,
+ int nblocks);
int ocfs2_allocate_extend_trans(handle_t *handle,
int thresh);
diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h
index 2ee76a90ba8f..70410ec8cebb 100644
--- a/fs/ocfs2/ocfs2_trace.h
+++ b/fs/ocfs2/ocfs2_trace.h
@@ -2576,6 +2576,8 @@ DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_commit_cache_end);
DEFINE_OCFS2_INT_INT_EVENT(ocfs2_extend_trans);
+DEFINE_OCFS2_INT_EVENT(ocfs2_assure_trans_credits);
+
DEFINE_OCFS2_INT_EVENT(ocfs2_extend_trans_restart);
DEFINE_OCFS2_INT_INT_EVENT(ocfs2_allocate_extend_trans);
--
2.34.1

[PATCH OLK-5.10] ocfs2: fix DIO failure due to insufficient transaction credits
by heyujie 05 Aug '24
From: Jan Kara <jack(a)suse.cz>
mainline inclusion
from mainline-v6.10-rc6
commit be346c1a6eeb49d8fda827d2a9522124c2f72f36
category: bugfix
bugzilla: https://gitee.com/src-openeuler/kernel/issues/IAGENQ
CVE: CVE-2024-42077
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?…
--------------------------------
The code in ocfs2_dio_end_io_write() estimates number of necessary
transaction credits using ocfs2_calc_extend_credits(). This however does
not take into account that the IO could be arbitrarily large and can
contain arbitrary number of extents.
Extent tree manipulations do often extend the current transaction but not
in all of the cases. For example if we have only single block extents in
the tree, ocfs2_mark_extent_written() will end up calling
ocfs2_replace_extent_rec() all the time and we will never extend the
current transaction and eventually exhaust all the transaction credits if
the IO contains many single block extents. Once that happens a
WARN_ON(jbd2_handle_buffer_credits(handle) <= 0) is triggered in
jbd2_journal_dirty_metadata() and subsequently OCFS2 aborts in response to
this error. This was actually triggered by one of our customers on a
heavily fragmented OCFS2 filesystem.
To fix the issue make sure the transaction always has enough credits for
one extent insert before each call of ocfs2_mark_extent_written().
Heming Zhao said:
------
PANIC: "Kernel panic - not syncing: OCFS2: (device dm-1): panic forced after error"
PID: xxx TASK: xxxx CPU: 5 COMMAND: "SubmitThread-CA"
#0 machine_kexec at ffffffff8c069932
#1 __crash_kexec at ffffffff8c1338fa
#2 panic at ffffffff8c1d69b9
#3 ocfs2_handle_error at ffffffffc0c86c0c [ocfs2]
#4 __ocfs2_abort at ffffffffc0c88387 [ocfs2]
#5 ocfs2_journal_dirty at ffffffffc0c51e98 [ocfs2]
#6 ocfs2_split_extent at ffffffffc0c27ea3 [ocfs2]
#7 ocfs2_change_extent_flag at ffffffffc0c28053 [ocfs2]
#8 ocfs2_mark_extent_written at ffffffffc0c28347 [ocfs2]
#9 ocfs2_dio_end_io_write at ffffffffc0c2bef9 [ocfs2]
Link: https://lkml.kernel.org/r/20240617095543.6971-1-jack@suse.cz
Link: https://lkml.kernel.org/r/20240614145243.8837-1-jack@suse.cz
Fixes: c15471f79506 ("ocfs2: fix sparse file & data ordering issue in direct io")
Signed-off-by: Jan Kara <jack(a)suse.cz>
Reviewed-by: Joseph Qi <joseph.qi(a)linux.alibaba.com>
Reviewed-by: Heming Zhao <heming.zhao(a)suse.com>
Cc: Mark Fasheh <mark(a)fasheh.com>
Cc: Joel Becker <jlbec(a)evilplan.org>
Cc: Junxiao Bi <junxiao.bi(a)oracle.com>
Cc: Changwei Ge <gechangwei(a)live.cn>
Cc: Gang He <ghe(a)suse.com>
Cc: Jun Piao <piaojun(a)huawei.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
Signed-off-by: He Yujie <coka.heyujie(a)huawei.com>
---
fs/ocfs2/aops.c | 5 +++++
fs/ocfs2/journal.c | 17 +++++++++++++++++
fs/ocfs2/journal.h | 2 ++
fs/ocfs2/ocfs2_trace.h | 2 ++
4 files changed, 26 insertions(+)
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index aae893cf9e26..5089f4d09dce 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -2373,6 +2373,11 @@ static int ocfs2_dio_end_io_write(struct inode *inode,
}
list_for_each_entry(ue, &dwc->dw_zero_list, ue_node) {
+ ret = ocfs2_assure_trans_credits(handle, credits);
+ if (ret < 0) {
+ mlog_errno(ret);
+ break;
+ }
ret = ocfs2_mark_extent_written(inode, &et, handle,
ue->ue_cpos, 1,
ue->ue_phys,
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 0534800a472a..dfa6ff2756fb 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -449,6 +449,23 @@ int ocfs2_extend_trans(handle_t *handle, int nblocks)
return status;
}
+/*
+ * Make sure handle has at least 'nblocks' credits available. If it does not
+ * have that many credits available, we will try to extend the handle to have
+ * enough credits. If that fails, we will restart transaction to have enough
+ * credits. Similar notes regarding data consistency and locking implications
+ * as for ocfs2_extend_trans() apply here.
+ */
+int ocfs2_assure_trans_credits(handle_t *handle, int nblocks)
+{
+ int old_nblks = jbd2_handle_buffer_credits(handle);
+
+ trace_ocfs2_assure_trans_credits(old_nblks);
+ if (old_nblks >= nblocks)
+ return 0;
+ return ocfs2_extend_trans(handle, nblocks - old_nblks);
+}
+
/*
* If we have fewer than thresh credits, extend by OCFS2_MAX_TRANS_DATA.
* If that fails, restart the transaction & regain write access for the
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index eb7a21bac71e..bc5d77cb3c50 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -244,6 +244,8 @@ handle_t *ocfs2_start_trans(struct ocfs2_super *osb,
int ocfs2_commit_trans(struct ocfs2_super *osb,
handle_t *handle);
int ocfs2_extend_trans(handle_t *handle, int nblocks);
+int ocfs2_assure_trans_credits(handle_t *handle,
+ int nblocks);
int ocfs2_allocate_extend_trans(handle_t *handle,
int thresh);
diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h
index dc4bce1649c1..7a9cfd61145a 100644
--- a/fs/ocfs2/ocfs2_trace.h
+++ b/fs/ocfs2/ocfs2_trace.h
@@ -2578,6 +2578,8 @@ DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_commit_cache_end);
DEFINE_OCFS2_INT_INT_EVENT(ocfs2_extend_trans);
+DEFINE_OCFS2_INT_EVENT(ocfs2_assure_trans_credits);
+
DEFINE_OCFS2_INT_EVENT(ocfs2_extend_trans_restart);
DEFINE_OCFS2_INT_INT_EVENT(ocfs2_allocate_extend_trans);
--
2.34.1

[PATCH openEuler-22.03-LTS-SP1] ocfs2: fix DIO failure due to insufficient transaction credits
by heyujie 05 Aug '24
From: Jan Kara <jack(a)suse.cz>
mainline inclusion
from mainline-v6.10-rc6
commit be346c1a6eeb49d8fda827d2a9522124c2f72f36
category: bugfix
bugzilla: https://gitee.com/src-openeuler/kernel/issues/IAGENQ
CVE: CVE-2024-42077
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?…
--------------------------------
The code in ocfs2_dio_end_io_write() estimates number of necessary
transaction credits using ocfs2_calc_extend_credits(). This however does
not take into account that the IO could be arbitrarily large and can
contain arbitrary number of extents.
Extent tree manipulations do often extend the current transaction but not
in all of the cases. For example if we have only single block extents in
the tree, ocfs2_mark_extent_written() will end up calling
ocfs2_replace_extent_rec() all the time and we will never extend the
current transaction and eventually exhaust all the transaction credits if
the IO contains many single block extents. Once that happens a
WARN_ON(jbd2_handle_buffer_credits(handle) <= 0) is triggered in
jbd2_journal_dirty_metadata() and subsequently OCFS2 aborts in response to
this error. This was actually triggered by one of our customers on a
heavily fragmented OCFS2 filesystem.
To fix the issue make sure the transaction always has enough credits for
one extent insert before each call of ocfs2_mark_extent_written().
Heming Zhao said:
------
PANIC: "Kernel panic - not syncing: OCFS2: (device dm-1): panic forced after error"
PID: xxx TASK: xxxx CPU: 5 COMMAND: "SubmitThread-CA"
#0 machine_kexec at ffffffff8c069932
#1 __crash_kexec at ffffffff8c1338fa
#2 panic at ffffffff8c1d69b9
#3 ocfs2_handle_error at ffffffffc0c86c0c [ocfs2]
#4 __ocfs2_abort at ffffffffc0c88387 [ocfs2]
#5 ocfs2_journal_dirty at ffffffffc0c51e98 [ocfs2]
#6 ocfs2_split_extent at ffffffffc0c27ea3 [ocfs2]
#7 ocfs2_change_extent_flag at ffffffffc0c28053 [ocfs2]
#8 ocfs2_mark_extent_written at ffffffffc0c28347 [ocfs2]
#9 ocfs2_dio_end_io_write at ffffffffc0c2bef9 [ocfs2]
Link: https://lkml.kernel.org/r/20240617095543.6971-1-jack@suse.cz
Link: https://lkml.kernel.org/r/20240614145243.8837-1-jack@suse.cz
Fixes: c15471f79506 ("ocfs2: fix sparse file & data ordering issue in direct io")
Signed-off-by: Jan Kara <jack(a)suse.cz>
Reviewed-by: Joseph Qi <joseph.qi(a)linux.alibaba.com>
Reviewed-by: Heming Zhao <heming.zhao(a)suse.com>
Cc: Mark Fasheh <mark(a)fasheh.com>
Cc: Joel Becker <jlbec(a)evilplan.org>
Cc: Junxiao Bi <junxiao.bi(a)oracle.com>
Cc: Changwei Ge <gechangwei(a)live.cn>
Cc: Gang He <ghe(a)suse.com>
Cc: Jun Piao <piaojun(a)huawei.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
Signed-off-by: He Yujie <coka.heyujie(a)huawei.com>
---
fs/ocfs2/aops.c | 5 +++++
fs/ocfs2/journal.c | 17 +++++++++++++++++
fs/ocfs2/journal.h | 2 ++
fs/ocfs2/ocfs2_trace.h | 2 ++
4 files changed, 26 insertions(+)
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 6b06de78f2af..b8667311c292 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -2359,6 +2359,11 @@ static int ocfs2_dio_end_io_write(struct inode *inode,
}
list_for_each_entry(ue, &dwc->dw_zero_list, ue_node) {
+ ret = ocfs2_assure_trans_credits(handle, credits);
+ if (ret < 0) {
+ mlog_errno(ret);
+ break;
+ }
ret = ocfs2_mark_extent_written(inode, &et, handle,
ue->ue_cpos, 1,
ue->ue_phys,
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index db52e843002a..ef188543c89f 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -449,6 +449,23 @@ int ocfs2_extend_trans(handle_t *handle, int nblocks)
return status;
}
+/*
+ * Make sure handle has at least 'nblocks' credits available. If it does not
+ * have that many credits available, we will try to extend the handle to have
+ * enough credits. If that fails, we will restart transaction to have enough
+ * credits. Similar notes regarding data consistency and locking implications
+ * as for ocfs2_extend_trans() apply here.
+ */
+int ocfs2_assure_trans_credits(handle_t *handle, int nblocks)
+{
+ int old_nblks = jbd2_handle_buffer_credits(handle);
+
+ trace_ocfs2_assure_trans_credits(old_nblks);
+ if (old_nblks >= nblocks)
+ return 0;
+ return ocfs2_extend_trans(handle, nblocks - old_nblks);
+}
+
/*
* If we have fewer than thresh credits, extend by OCFS2_MAX_TRANS_DATA.
* If that fails, restart the transaction & regain write access for the
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index bfe611ed1b1d..77bf7842894f 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -243,6 +243,8 @@ handle_t *ocfs2_start_trans(struct ocfs2_super *osb,
int ocfs2_commit_trans(struct ocfs2_super *osb,
handle_t *handle);
int ocfs2_extend_trans(handle_t *handle, int nblocks);
+int ocfs2_assure_trans_credits(handle_t *handle,
+ int nblocks);
int ocfs2_allocate_extend_trans(handle_t *handle,
int thresh);
diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h
index dc4bce1649c1..7a9cfd61145a 100644
--- a/fs/ocfs2/ocfs2_trace.h
+++ b/fs/ocfs2/ocfs2_trace.h
@@ -2578,6 +2578,8 @@ DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_commit_cache_end);
DEFINE_OCFS2_INT_INT_EVENT(ocfs2_extend_trans);
+DEFINE_OCFS2_INT_EVENT(ocfs2_assure_trans_credits);
+
DEFINE_OCFS2_INT_EVENT(ocfs2_extend_trans_restart);
DEFINE_OCFS2_INT_INT_EVENT(ocfs2_allocate_extend_trans);
--
2.34.1