hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IAGRKP CVE: NA
--------------------------------
Like blk-throttle, following new debugfs entries will be created as well:
/sys/kernel/debug/block/sda/blk_io_hierarchy/ |-- bio | |-- io_dump | |-- stats | `-- threshold
User can use them to analyze how IO behaves for bio lifetime.
Signed-off-by: Yu Kuai yukuai3@huawei.com --- block/bio.c | 1 + block/blk-core.c | 10 +- block/blk-io-hierarchy/Kconfig | 11 ++ block/blk-io-hierarchy/debugfs.c | 3 + block/blk-io-hierarchy/iodump.c | 241 ++++++++++++++++++++++++++++++- block/blk-io-hierarchy/stats.c | 47 +++++- block/blk-io-hierarchy/stats.h | 52 +++++++ block/blk-mq.c | 4 + block/blk-sysfs.c | 1 + include/linux/blk_types.h | 13 ++ 10 files changed, 380 insertions(+), 3 deletions(-)
diff --git a/block/bio.c b/block/bio.c index 274af65487b9..ff18f6839063 100644 --- a/block/bio.c +++ b/block/bio.c @@ -252,6 +252,7 @@ void bio_uninit(struct bio *bio) bio->pid = NULL; } #endif + bio_hierarchy_end(bio); bio_free_hierarchy_data(bio); } EXPORT_SYMBOL(bio_uninit); diff --git a/block/blk-core.c b/block/blk-core.c index a72ceb688f59..0c74101424dc 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -538,8 +538,10 @@ static void req_bio_endio(struct request *rq, struct bio *bio, bio_advance(bio, nbytes);
/* don't actually finish bio if it's part of flush sequence */ - if (bio->bi_iter.bi_size == 0 && !(rq->rq_flags & RQF_FLUSH_SEQ)) + if (bio->bi_iter.bi_size == 0 && !(rq->rq_flags & RQF_FLUSH_SEQ)) { + req_bio_hierarchy_end(rq, bio); bio_endio(bio); + } }
void blk_dump_rq_flags(struct request *rq, char *msg) @@ -2627,6 +2629,12 @@ generic_make_request_checks(struct bio *bio) */ create_io_context(GFP_ATOMIC, q->node);
+ /* + * On the one hand REQ_PREFLUSH | REQ_FUA can be cleared above, on the + * other hand it doesn't make sense to count invalid bio. Split bio will + * be accounted separately. + */ + bio_hierarchy_start(bio); if (!blkcg_bio_issue_check(q, bio)) return false;
diff --git a/block/blk-io-hierarchy/Kconfig b/block/blk-io-hierarchy/Kconfig index 2c2055a4502e..01019f6aa425 100644 --- a/block/blk-io-hierarchy/Kconfig +++ b/block/blk-io-hierarchy/Kconfig @@ -13,6 +13,17 @@ menuconfig BLK_IO_HIERARCHY_STATS
if BLK_IO_HIERARCHY_STATS
+config HIERARCHY_BIO + bool "Support to record stats for bio lifetime" + default n + select BLK_BIO_ALLOC_TIME + help + Enabling this lets blk hierarchy stats to record additional information + for bio. Such information can be helpful to debug performance and + problems like io hang. + + If unsure, say N. + config HIERARCHY_IO_DUMP bool "Support to dump io that is throttled" default n diff --git a/block/blk-io-hierarchy/debugfs.c b/block/blk-io-hierarchy/debugfs.c index 7a386133da94..29c17e116773 100644 --- a/block/blk-io-hierarchy/debugfs.c +++ b/block/blk-io-hierarchy/debugfs.c @@ -49,6 +49,9 @@ static const char *stage_name[NR_STAGE_GROUPS] = { #ifdef CONFIG_HIERARCHY_RQ_DRIVER [STAGE_RQ_DRIVER] = "rq_driver", #endif +#ifdef CONFIG_HIERARCHY_BIO + [STAGE_BIO] = "bio", +#endif };
const char *hierarchy_stage_name(enum stage_group stage) diff --git a/block/blk-io-hierarchy/iodump.c b/block/blk-io-hierarchy/iodump.c index d03b12008310..9cc56c5a0dfe 100644 --- a/block/blk-io-hierarchy/iodump.c +++ b/block/blk-io-hierarchy/iodump.c @@ -42,6 +42,22 @@ struct rq_dump_data { bool enter_queue; };
+#ifdef CONFIG_HIERARCHY_BIO +struct pos_data { + enum stage_group stage; + unsigned int count; +}; + +struct bio_stage_dump_data { + union { + loff_t pos; + struct pos_data pdata; + }; + struct rq_dump_data rq_ddata; + u64 stat_time; +}; +#endif + int blk_io_hierarchy_iodump_init(struct request_queue *q, struct hierarchy_stage *hstage) { @@ -73,6 +89,23 @@ int blk_io_hierarchy_iodump_init(struct request_queue *q, return 0; }
+#ifdef CONFIG_HIERARCHY_BIO + BUILD_BUG_ON(sizeof(struct pos_data) != sizeof(loff_t)); + + if (hstage->stage == STAGE_BIO) { + struct bio_stage_dump_data *bstage_ddata = + kzalloc(sizeof(*bstage_ddata), GFP_KERNEL); + + if (!bstage_ddata) + return -ENOMEM; + + bstage_ddata->rq_ddata.q = q; + bstage_ddata->rq_ddata.stage = hstage->stage; + hstage->dump_data = bstage_ddata; + return 0; + } +#endif + return -EINVAL; }
@@ -373,7 +406,8 @@ static struct request *hierarchy_find_and_get_rq(struct rq_dump_data *rq_ddata) * fast path to avoid refcount cas operations for the request that * is from other shared request_queue or other stages. */ - if (rq->q != q || READ_ONCE(rq_wrapper->stage) != rq_ddata->stage) + if (rq->q != q || (rq_ddata->stage != STAGE_BIO && + READ_ONCE(rq_wrapper->stage) != rq_ddata->stage)) return NULL;
if (!refcount_inc_not_zero(&rq->ref)) @@ -385,6 +419,9 @@ static struct request *hierarchy_find_and_get_rq(struct rq_dump_data *rq_ddata) return NULL; }
+ if (rq_ddata->stage == STAGE_BIO) + return rq; + /* * Barrier is paired with the smp_store_release() in * rq_hierarchy_start_io_acct(), so that if stage is read, uninitialized @@ -473,6 +510,206 @@ static const struct blk_mq_debugfs_attr hierarchy_rq_dump_attr[] = { {}, };
+#ifdef CONFIG_HIERARCHY_BIO
+static struct bio_dump_data *get_bio_stage_ddata(struct request_queue *q,
+						 enum stage_group stage)
+{
+	struct blk_io_hierarchy_stats *stats =
+		queue_to_wrapper(q)->io_hierarchy_stats;
+	struct hierarchy_stage *hstage = READ_ONCE(stats->hstage[stage]);
+
+	if (!hstage)
+		return NULL;
+
+	return hstage->dump_data;
+}
+
+static void bio_stage_start_next_stage(struct bio_stage_dump_data *bstage_ddata,
+				       loff_t *pos)
+{
+	struct pos_data *pdata = &bstage_ddata->pdata;
+
+	pdata->stage++;
+	if (!stage_is_bio(pdata->stage))
+		pdata->stage = STAGE_BIO;
+	pdata->count = 0;
+
+	*pos = bstage_ddata->pos;
+}
+
+static void bio_stage_start_next_io(struct bio_stage_dump_data *bstage_ddata,
+				    loff_t *pos)
+{
+	struct pos_data *pdata = &bstage_ddata->pdata;
+
+	if (stage_is_bio(pdata->stage))
+		pdata->count++;
+	else
+		pdata->count = bstage_ddata->rq_ddata.tag;
+
+	*pos = bstage_ddata->pos;
+}
+
+static void __bio_stage_hierarchy_stop(struct bio_stage_dump_data *bstage_ddata)
+{
+	struct pos_data *pdata = &bstage_ddata->pdata;
+	struct rq_dump_data *rq_ddata = &bstage_ddata->rq_ddata;
+
+	if (stage_is_bio(pdata->stage)) {
+		struct bio_dump_data *bio_ddata =
+			get_bio_stage_ddata(rq_ddata->q, pdata->stage);
+
+		spin_unlock_irq(&bio_ddata->lock);
+	}
+
+	if (rq_ddata->enter_queue) {
+		percpu_ref_put(&rq_ddata->q->q_usage_counter);
+		rq_ddata->enter_queue = false;
+	}
+}
+
+static void *__bio_stage_hierarchy_start(struct bio_stage_dump_data *bstage_ddata,
+					 loff_t *pos)
+{
+	struct pos_data *pdata = &bstage_ddata->pdata;
+	struct rq_dump_data *rq_ddata = &bstage_ddata->rq_ddata;
+
+retry:
+	if (stage_is_bio(pdata->stage)) {
+		struct list_head *list;
+		struct bio_dump_data *bio_ddata =
+			get_bio_stage_ddata(rq_ddata->q, pdata->stage);
+
+		if (!bio_ddata) {
+			bio_stage_start_next_stage(bstage_ddata, pos);
+			goto retry;
+		}
+
+		spin_lock_irq(&bio_ddata->lock);
+		list = seq_list_start(&bio_ddata->head, pdata->count);
+		if (list)
+			return list;
+
+		spin_unlock_irq(&bio_ddata->lock);
+		bio_stage_start_next_stage(bstage_ddata, pos);
+		goto retry;
+	}
+
+	if (pdata->stage == STAGE_BIO &&
+	    __rq_hierarchy_start(rq_ddata, pdata->count))
+		return bstage_ddata;
+
+	return NULL;
+}
+
+static void *bio_stage_hierarchy_start(struct seq_file *m, loff_t *pos)
+{
+	struct hierarchy_stage *hstage = m->private;
+	struct bio_stage_dump_data *bstage_ddata = hstage->dump_data;
+
+	mutex_lock(&dump_mutex);
+	bstage_ddata->pos = *pos;
+	bstage_ddata->stat_time = blk_time_get_ns();
+
+	return __bio_stage_hierarchy_start(bstage_ddata, pos);
+}
+
+static void *bio_stage_hierarchy_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	struct hierarchy_stage *hstage = m->private;
+	struct bio_stage_dump_data *bstage_ddata = hstage->dump_data;
+	struct rq_dump_data *rq_ddata = &bstage_ddata->rq_ddata;
+	struct pos_data *pdata = &bstage_ddata->pdata;
+
+	if (stage_is_bio(pdata->stage)) {
+		struct bio_dump_data *bio_ddata =
+			get_bio_stage_ddata(rq_ddata->q, pdata->stage);
+		struct list_head *list = ((struct list_head *)v)->next;
+
+		if (list != &bio_ddata->head) {
+			bio_stage_start_next_io(bstage_ddata, pos);
+			return list;
+		}
+
+		spin_unlock_irq(&bio_ddata->lock);
+
+		bio_stage_start_next_stage(bstage_ddata, pos);
+		return __bio_stage_hierarchy_start(bstage_ddata, pos);
+	}
+
+	if (pdata->stage == STAGE_BIO &&
+	    __rq_hierarchy_next(rq_ddata)) {
+		bio_stage_start_next_io(bstage_ddata, pos);
+		return bstage_ddata;
+	}
+
+	(*pos)++;
+	return NULL;
+}
+
+static void bio_stage_hierarchy_stop(struct seq_file *m, void *v)
+{
+	struct hierarchy_stage *hstage = m->private;
+	struct bio_stage_dump_data *bstage_ddata = hstage->dump_data;
+
+	__bio_stage_hierarchy_stop(bstage_ddata);
+	mutex_unlock(&dump_mutex);
+}
+
+static int bio_stage_hierarchy_show(struct seq_file *m, void *v)
+{
+	struct hierarchy_stage *hstage = m->private;
+	struct bio_stage_dump_data *bstage_ddata = hstage->dump_data;
+	struct rq_dump_data *rq_ddata = &bstage_ddata->rq_ddata;
+	struct pos_data *pdata = &bstage_ddata->pdata;
+	u64 duration;
+
+	if (stage_is_bio(pdata->stage)) {
+		struct bio_hierarchy_data *data = list_entry(
+				v, struct bio_hierarchy_data, hierarchy_list);
+
+		duration = get_duration(bstage_ddata->stat_time,
+					data->bio->bi_alloc_time_ns);
+		if (hstage->threshold <= ns_to_ms(duration))
+			__hierarchy_show_bio(m, data, pdata->stage, duration);
+	} else if (pdata->stage == STAGE_BIO) {
+		struct request *rq = hierarchy_find_and_get_rq(rq_ddata);
+
+		if (rq) {
+			duration = get_duration(bstage_ddata->stat_time,
+					request_to_wrapper(rq)->bi_alloc_time_ns);
+			if (hstage->threshold <= ns_to_ms(duration))
+				hierarchy_show_rq(m, rq, duration);
+			blk_mq_put_rq_ref(rq);
+		}
+	}
+
+	return 0;
+}
+
+static const struct seq_operations bio_stage_hierarchy_ops = {
+	.start	= bio_stage_hierarchy_start,
+	.next	= bio_stage_hierarchy_next,
+	.stop	= bio_stage_hierarchy_stop,
+	.show	= bio_stage_hierarchy_show,
+};
+
+static const struct blk_mq_debugfs_attr bio_stage_dump_attr[] = {
+	{
+		"io_dump",
+		0400,
+		.seq_ops = &bio_stage_hierarchy_ops,
+	},
+	{},
+};
+
+#else /* CONFIG_HIERARCHY_BIO */
+static const struct blk_mq_debugfs_attr bio_stage_dump_attr[] = {
+	{},
+};
+
+#endif
+
 void io_hierarchy_register_iodump(struct hierarchy_stage *hstage)
 {
 	const struct blk_mq_debugfs_attr *attr;
@@ -481,6 +718,8 @@ void io_hierarchy_register_iodump(struct hierarchy_stage *hstage)
 		attr = hierarchy_bio_dump_attr;
 	else if (stage_is_rq(hstage->stage))
 		attr = hierarchy_rq_dump_attr;
+	else if (hstage->stage == STAGE_BIO)
+		attr = bio_stage_dump_attr;
 	else
 		attr = NULL;
diff --git a/block/blk-io-hierarchy/stats.c b/block/blk-io-hierarchy/stats.c index 4dd1d2eb43d2..b9e79b435149 100644 --- a/block/blk-io-hierarchy/stats.c +++ b/block/blk-io-hierarchy/stats.c @@ -238,7 +238,8 @@ static enum stat_group bio_hierarchy_op(struct bio *bio) if (op_is_discard(bio->bi_opf)) return STAT_DISCARD;
- if (op_is_flush(bio->bi_opf) && !bio_sectors(bio)) + if (op_is_flush(bio->bi_opf) && + !(bio_sectors(bio) || (bio->bi_opf & REQ_HAS_DATA))) return STAT_FLUSH;
if (op_is_write(bio->bi_opf)) @@ -372,6 +373,50 @@ void __rq_hierarchy_end_io_acct(struct request *rq, } EXPORT_SYMBOL_GPL(__rq_hierarchy_end_io_acct);
+#ifdef CONFIG_HIERARCHY_BIO +void bio_hierarchy_start(struct bio *bio) +{ + struct request_queue_wrapper *q_wrapper; + struct gendisk *disk = bio->bi_disk; + struct hierarchy_stage *hstage; + + if (bio_flagged(bio, BIO_HIERARCHY_ACCT)) + return; + + if (!blk_mq_hierarchy_registered(disk->queue, STAGE_BIO)) + return; + + bio_set_flag(bio, BIO_HIERARCHY_ACCT); + if (bio_has_data(bio)) + bio->bi_opf |= REQ_HAS_DATA; + q_wrapper = queue_to_wrapper(disk->queue); + hstage = q_wrapper->io_hierarchy_stats->hstage[STAGE_BIO]; + io_hierarchy_inc(hstage->hstats_data, dispatched, + bio_hierarchy_op(bio)); +} + +void __bio_hierarchy_end(struct bio *bio, u64 now) +{ + struct request_queue_wrapper *q_wrapper; + struct gendisk *disk = bio->bi_disk; + struct hierarchy_stage *hstage; + enum stat_group op; + u64 duration; + + op = bio_hierarchy_op(bio); + duration = now - bio->bi_alloc_time_ns; + q_wrapper = queue_to_wrapper(disk->queue); + hstage = q_wrapper->io_hierarchy_stats->hstage[STAGE_BIO]; + + io_hierarchy_inc(hstage->hstats_data, completed, op); + io_hierarchy_add(hstage->hstats_data, nsecs, op, duration); + hierarchy_account_slow_io_ns(hstage, op, duration); + + bio_clear_flag(bio, BIO_HIERARCHY_ACCT); + bio->bi_opf &= ~REQ_HAS_DATA; +} +#endif + static int __init hierarchy_stats_init(void) { hdata_pool = mempool_create_kmalloc_pool(PRE_ALLOC_BIO_CNT, diff --git a/block/blk-io-hierarchy/stats.h b/block/blk-io-hierarchy/stats.h index f5ed92c5f7c5..7864af79b398 100644 --- a/block/blk-io-hierarchy/stats.h +++ b/block/blk-io-hierarchy/stats.h @@ -202,6 +202,41 @@ static inline bool blk_rq_hierarchy_is_flush_done(struct request *rq) return request_to_wrapper(rq)->flush_done; }
+#ifdef CONFIG_HIERARCHY_BIO +void bio_hierarchy_start(struct bio *bio); +void __bio_hierarchy_end(struct bio *bio, u64 now); + +static inline void bio_hierarchy_end(struct bio *bio) +{ + if (!bio_flagged(bio, BIO_HIERARCHY_ACCT)) + return; + + if (!blk_mq_hierarchy_registered(bio->bi_disk->queue, STAGE_BIO)) + return; + + __bio_hierarchy_end(bio, blk_time_get_ns()); +} + +static inline void req_bio_hierarchy_end(struct request *rq, struct bio *bio) +{ + u64 now; + + if (!bio_flagged(bio, BIO_HIERARCHY_ACCT)) + return; + + if (!blk_mq_hierarchy_registered(bio->bi_disk->queue, STAGE_BIO)) + return; + + now = request_to_wrapper(rq)->io_end_time_ns; + if (!now) { + now = blk_time_get_ns(); + request_to_wrapper(rq)->io_end_time_ns = now; + } + + __bio_hierarchy_end(bio, now); +} +#endif + #else /* CONFIG_BLK_IO_HIERARCHY_STATS */
static inline int @@ -316,4 +351,21 @@ blk_rq_hierarchy_stats_init(struct request *rq)
#endif /* CONFIG_BLK_IO_HIERARCHY_STATS */
+#if !defined(CONFIG_BLK_IO_HIERARCHY_STATS) || !defined(CONFIG_HIERARCHY_BIO) +static inline void +bio_hierarchy_start(struct bio *bio) +{ +} + +static inline void +bio_hierarchy_end(struct bio *bio) +{ +} + +static inline void +req_bio_hierarchy_end(struct request *rq, struct bio *bio) +{ +} +#endif + #endif /* BLK_IO_HIERARCHY_STATS_H */ diff --git a/block/blk-mq.c b/block/blk-mq.c index 80df133486ad..d7d3589c1e5a 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2015,6 +2015,9 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
blk_queue_split(q, &bio);
+ /* account for split bio. */ + bio_hierarchy_start(bio); + if (!bio_integrity_prep(bio)) return BLK_QC_T_NONE;
@@ -2781,6 +2784,7 @@ void blk_mq_release(struct request_queue *q) struct blk_mq_hw_ctx *hctx, *next; int i;
+ blk_mq_unregister_hierarchy(q, STAGE_BIO); blk_io_hierarchy_stats_free(q);
queue_for_each_hw_ctx(q, hctx, i) diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 93af23610f98..719687a394ea 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -935,6 +935,7 @@ static void blk_mq_register_default_hierarchy(struct request_queue *q) blk_mq_register_hierarchy(q, STAGE_HCTX); blk_mq_register_hierarchy(q, STAGE_REQUEUE); blk_mq_register_hierarchy(q, STAGE_RQ_DRIVER); + blk_mq_register_hierarchy(q, STAGE_BIO); }
 /**
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 04cc9b6bd524..f77227229f58 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -232,6 +232,12 @@ struct bio {
 /*
  * bio flags
  */
+#ifdef CONFIG_BLK_IO_HIERARCHY_STATS
+#define BIO_HIERARCHY_ACCT 0	/*
+				 * This bio has already been subjected to
+				 * blk-io-hierarchy, don't do it again.
+				 */
+#endif
 #define BIO_SEG_VALID	1	/* bi_phys_segments valid */
 #define BIO_CLONED	2	/* doesn't own data */
 #define BIO_BOUNCED	3	/* bio is a bounce bio */
@@ -346,6 +352,9 @@ enum req_flag_bits {
 	/* for driver use */
 	__REQ_DRV,
 	__REQ_SWAP,		/* swapping request. */
+#ifdef CONFIG_BLK_IO_HIERARCHY_STATS
+	__REQ_HAS_DATA,		/* io contain data. */
+#endif
 	__REQ_NR_BITS,	/* stops here */
 };
@@ -368,6 +377,9 @@ enum req_flag_bits {
 #define REQ_DRV			(1ULL << __REQ_DRV)
 #define REQ_SWAP		(1ULL << __REQ_SWAP)
+#ifdef CONFIG_BLK_IO_HIERARCHY_STATS
+#define REQ_HAS_DATA		(1ULL << __REQ_HAS_DATA)
+#endif
#define REQ_FAILFAST_MASK \ (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER) @@ -408,6 +420,7 @@ enum stage_group { STAGE_REQUEUE, STAGE_RQ_DRIVER, NR_RQ_STAGE_GROUPS, + STAGE_BIO = NR_RQ_STAGE_GROUPS, NR_STAGE_GROUPS, };