From: Zhang Wensheng <zhangwensheng5@huawei.com>
hulk inclusion
category: bugfix
bugzilla: 39265, https://gitee.com/openeuler/kernel/issues/I4WC06
CVE: NA
-----------------------------------------------
When inflight IOs are slow and no new IOs are issued, we expect iostat to make the IO hang visible. However, after commit 9c6dea45e6f7 ("block: delete part_round_stats and switch to less precise counting"), io_ticks and time_in_queue are not updated until an IO completes, so the avgqu-sz and %util columns of iostat stay at zero for the duration of the hang.
To fix it, we could fall back to the implementation before commit 9c6dea45e6f7, but that may cause a performance regression on NVMe or bio-based devices (due to the overhead of the inflight calculation), so add a switch to control whether precise iostat accounting is used. It can be enabled by adding "precise_iostat=1" to the kernel boot cmdline. When precise accounting is enabled, io_ticks and time_in_queue are also updated when /proc/diskstats or /sys/block/sdX/sdXN/stat is read.
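As an illustration of why the stale counters hide a hang (a minimal userspace sketch, not part of this patch; the device path is an example and the classic 11-field /sys/block/<dev>/stat layout with millisecond counters is assumed): an iostat-style tool derives %util and avgqu-sz from the deltas of io_ticks and time_in_queue, so if neither field moves until an IO completes, both columns read zero while an IO is hung.

#include <stdio.h>
#include <unistd.h>

/* Read the io_ticks (field 10) and time_in_queue (field 11) counters. */
static int read_stat(const char *path, unsigned long long *ticks,
                     unsigned long long *queued)
{
        unsigned long long f[11];
        FILE *fp = fopen(path, "r");
        int n;

        if (!fp)
                return -1;
        n = fscanf(fp, "%llu %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu",
                   &f[0], &f[1], &f[2], &f[3], &f[4], &f[5], &f[6], &f[7],
                   &f[8], &f[9], &f[10]);
        fclose(fp);
        if (n != 11)
                return -1;
        *ticks = f[9];   /* ms the device had at least one IO in flight */
        *queued = f[10]; /* ms weighted by the number of inflight IOs */
        return 0;
}

int main(void)
{
        const char *path = "/sys/block/sda/stat"; /* example device */
        const unsigned int interval_ms = 1000;
        unsigned long long t0, q0, t1, q1;

        if (read_stat(path, &t0, &q0))
                return 1;
        usleep(interval_ms * 1000);
        if (read_stat(path, &t1, &q1))
                return 1;

        /* With lazy accounting, a hung IO leaves both deltas at zero. */
        printf("%%util    = %.1f\n", 100.0 * (t1 - t0) / interval_ms);
        printf("avgqu-sz = %.2f\n", (double)(q1 - q0) / interval_ms);
        return 0;
}

Booted with precise_iostat=1, each read of the stat file rounds the counters forward first, so a hung IO should keep %util near 100 instead of 0.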
Fixes: 9c6dea45e6f7 ("block: delete part_round_stats and switch to less precise counting")
Signed-off-by: Zhang Wensheng <zhangwensheng5@huawei.com>
Reviewed-by: Jason Yan <yanaijie@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 block/bio.c               |  8 ++++++--
 block/blk-core.c          | 30 +++++++++++++++++++++++++++---
 block/blk-merge.c         |  2 ++
 block/genhd.c             |  7 +++++++
 block/partition-generic.c |  8 ++++++++
 include/linux/blkdev.h    |  1 +
 6 files changed, 51 insertions(+), 5 deletions(-)
diff --git a/block/bio.c b/block/bio.c
index d94243411ef30..b50d3b59c79b4 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1706,9 +1706,13 @@ void generic_end_io_acct(struct request_queue *q, int req_op,
         const int sgrp = op_stat_group(req_op);
         int cpu = part_stat_lock();
 
-        update_io_ticks(cpu, part, now);
+        if (precise_iostat) {
+                part_round_stats(q, cpu, part);
+        } else {
+                update_io_ticks(cpu, part, now);
+                part_stat_add(cpu, part, time_in_queue, duration);
+        }
         part_stat_add(cpu, part, nsecs[sgrp], jiffies_to_nsecs(duration));
-        part_stat_add(cpu, part, time_in_queue, duration);
         part_dec_in_flight(q, part, op_is_write(req_op));
 
         part_stat_unlock();
diff --git a/block/blk-core.c b/block/blk-core.c
index 41d0b09e9a673..df733e8caa6a1 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -56,6 +56,20 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(block_unplug);
 
 DEFINE_IDA(blk_queue_ida);
 
+bool precise_iostat;
+static int __init precise_iostat_setup(char *str)
+{
+        bool precise;
+
+        if (!strtobool(str, &precise)) {
+                precise_iostat = precise;
+                pr_info("precise iostat %d\n", precise_iostat);
+        }
+
+        return 1;
+}
+__setup("precise_iostat=", precise_iostat_setup);
+
 /*
  * For the allocated request tables
  */
@@ -1700,8 +1714,13 @@ static void part_round_stats_single(struct request_queue *q, int cpu,
                                     struct hd_struct *part, unsigned long now,
                                     unsigned int inflight)
 {
-        if (inflight)
+        if (inflight) {
+                if (precise_iostat) {
+                        __part_stat_add(cpu, part, time_in_queue,
+                                        inflight * (now - part->stamp));
+                }
                 __part_stat_add(cpu, part, io_ticks, (now - part->stamp));
+        }
         part->stamp = now;
 }
 
@@ -2771,10 +2790,15 @@ void blk_account_io_done(struct request *req, u64 now)
                 cpu = part_stat_lock();
                 part = req->part;
 
-                update_io_ticks(cpu, part, jiffies);
+                if (!precise_iostat) {
+                        update_io_ticks(cpu, part, jiffies);
+                        part_stat_add(cpu, part, time_in_queue,
+                                      nsecs_to_jiffies64(now - req->start_time_ns));
+                } else {
+                        part_round_stats(req->q, cpu, part);
+                }
                 part_stat_inc(cpu, part, ios[sgrp]);
                 part_stat_add(cpu, part, nsecs[sgrp], now - req->start_time_ns);
-                part_stat_add(cpu, part, time_in_queue, nsecs_to_jiffies64(now - req->start_time_ns));
                 part_dec_in_flight(req->q, part, rq_data_dir(req));
 
                 hd_struct_put(part);
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 4c17c1031e34f..d2fabe1fdf326 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -669,6 +669,8 @@ static void blk_account_io_merge(struct request *req)
                 cpu = part_stat_lock();
                 part = req->part;
 
+                if (precise_iostat)
+                        part_round_stats(req->q, cpu, part);
                 part_dec_in_flight(req->q, part, rq_data_dir(req));
 
                 hd_struct_put(part);
diff --git a/block/genhd.c b/block/genhd.c
index 183612cbbd6b7..e7b97fdb41731 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1352,6 +1352,7 @@ static int diskstats_show(struct seq_file *seqf, void *v)
         struct hd_struct *hd;
         char buf[BDEVNAME_SIZE];
         unsigned int inflight[2];
+        int cpu;
 
         /*
         if (&disk_to_dev(gp)->kobj.entry == block_class.devices.next)
@@ -1363,6 +1364,12 @@ static int diskstats_show(struct seq_file *seqf, void *v)
 
         disk_part_iter_init(&piter, gp, DISK_PITER_INCL_EMPTY_PART0);
         while ((hd = disk_part_iter_next(&piter))) {
+                if (precise_iostat) {
+                        cpu = part_stat_lock();
+                        part_round_stats(gp->queue, cpu, hd);
+                        part_stat_unlock();
+                }
+
                 part_in_flight(gp->queue, hd, inflight);
                 seq_printf(seqf, "%4d %7d %s "
                            "%lu %lu %lu %u "
diff --git a/block/partition-generic.c b/block/partition-generic.c
index 739c0cc5fd222..c4ac7a8c77dc5 100644
--- a/block/partition-generic.c
+++ b/block/partition-generic.c
@@ -18,6 +18,7 @@
 #include <linux/ctype.h>
 #include <linux/genhd.h>
 #include <linux/blktrace_api.h>
+#include <linux/blkdev.h>
 
 #include "partitions/check.h"
 
@@ -121,6 +122,13 @@ ssize_t part_stat_show(struct device *dev,
         struct hd_struct *p = dev_to_part(dev);
         struct request_queue *q = part_to_disk(p)->queue;
         unsigned int inflight[2];
+        int cpu;
+
+        if (precise_iostat) {
+                cpu = part_stat_lock();
+                part_round_stats(q, cpu, p);
+                part_stat_unlock();
+        }
 
         part_in_flight(q, p, inflight);
         return sprintf(buf,
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 28ea02865ecc1..a86659e78d987 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -28,6 +28,7 @@
 #include <linux/scatterlist.h>
 #include <linux/blkzoned.h>
 
+extern bool precise_iostat;
 struct module;
 struct scsi_ioctl_command;
 
From: Zhang Wensheng <zhangwensheng5@huawei.com>
hulk inclusion
category: bugfix
bugzilla: 39265, https://gitee.com/openeuler/kernel/issues/I4WC06
CVE: NA
-----------------------------------------------
When 'precise_iostat' is set, io_ticks and time_in_queue are updated as long as IOs are inflight. However, mq does not account an IO as inflight until it gets a driver tag, while sq accounts it after getting a sched tag.

On the other hand, if the io scheduler is none, blk_mq_get_tag() gets a driver tag directly, and such an IO is accounted as inflight. The consequence is that 'io_ticks' is miscalculated in part_round_stats().

Thus, revert to sq's inflight accounting when 'precise_iostat' is set.
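For reference, a standalone model (plain C, not kernel code) of the rounding that part_round_stats_single() performs when precise_iostat is set; it shows why the tagging point matters: busy time only accumulates across a sampling window if the IOs were already counted as inflight at the start of that window.

#include <stdio.h>

struct part_stats {
        unsigned long stamp;         /* jiffies at the last rounding */
        unsigned long io_ticks;      /* time with any IO in flight */
        unsigned long time_in_queue; /* io_ticks weighted by depth */
};

/* Mirrors the precise branch of part_round_stats_single(). */
static void round_stats(struct part_stats *p, unsigned int inflight,
                        unsigned long now)
{
        if (inflight) {
                p->time_in_queue += inflight * (now - p->stamp);
                p->io_ticks += now - p->stamp;
        }
        p->stamp = now;
}

int main(void)
{
        struct part_stats p = { .stamp = 0 };

        /* Two IOs issued at t=0, stats sampled at t=100 and t=200. */
        round_stats(&p, 2, 100);
        round_stats(&p, 2, 200);
        printf("io_ticks=%lu time_in_queue=%lu\n",
               p.io_ticks, p.time_in_queue); /* 200 and 400 */

        /*
         * Had mq counted these IOs as inflight only after getting a
         * driver tag at t=150, the t=100 sample would have seen
         * inflight=0 and the first 100 jiffies of busy time would be
         * lost, which is why sq's accounting is used here.
         */
        return 0;
}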
Signed-off-by: Zhang Wensheng <zhangwensheng5@huawei.com>
Reviewed-by: Jason Yan <yanaijie@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 block/genhd.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)
diff --git a/block/genhd.c b/block/genhd.c
index e7b97fdb41731..fc24384e2c85f 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -24,6 +24,7 @@
 
 #include "blk.h"
 
+extern bool precise_iostat;
 static DEFINE_MUTEX(block_class_lock);
 struct kobject *block_depr;
 
@@ -47,7 +48,7 @@ static void disk_release_events(struct gendisk *disk);
 
 void part_inc_in_flight(struct request_queue *q, struct hd_struct *part, int rw)
 {
-        if (q->mq_ops)
+        if (!precise_iostat && q->mq_ops)
                 return;
 
         atomic_inc(&part->in_flight[rw]);
@@ -57,7 +58,7 @@ void part_inc_in_flight(struct request_queue *q, struct hd_struct *part, int rw)
 
 void part_dec_in_flight(struct request_queue *q, struct hd_struct *part, int rw)
 {
-        if (q->mq_ops)
+        if (!precise_iostat && q->mq_ops)
                 return;
 
         atomic_dec(&part->in_flight[rw]);
@@ -68,7 +69,7 @@ void part_dec_in_flight(struct request_queue *q, struct hd_struct *part, int rw)
 void part_in_flight(struct request_queue *q, struct hd_struct *part,
                     unsigned int inflight[2])
 {
-        if (q->mq_ops) {
+        if (!precise_iostat && q->mq_ops) {
                 blk_mq_in_flight(q, part, inflight);
                 return;
         }
@@ -85,7 +86,7 @@ void part_in_flight(struct request_queue *q, struct hd_struct *part,
 void part_in_flight_rw(struct request_queue *q, struct hd_struct *part,
                        unsigned int inflight[2])
 {
-        if (q->mq_ops) {
+        if (!precise_iostat && q->mq_ops) {
                 blk_mq_in_flight_rw(q, part, inflight);
                 return;
         }
From: Yu Kuai <yukuai3@huawei.com>
hulk inclusion
category: bugfix
bugzilla: 186143, https://gitee.com/openeuler/kernel/issues/I4WC06
CVE: NA
-----------------------------------------------
If precise_iostat is enabled, inflight is recorded and cleared by atomic operations. However, for dm multipath, inflight is recorded by dm_requeue_original_request(), and if an error happens, multipath_release_clone() will not clear it, so inflight leaks. As a result, %util stays at 100 in iostat.
Fix the problem by calling blk_account_io_done() from __blk_insert_cloned_request() when directly issuing the clone fails, so the start/done accounting stays balanced.
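A standalone sketch (plain C, not kernel code) of the invariant being restored: every accounted start must be paired with an accounted done, even when issuing the clone fails; otherwise the atomic inflight counter leaks and %util pins at 100.

#include <stdio.h>

static int inflight; /* models part->in_flight[rw] */

static void account_start(void) { inflight++; }
static void account_done(void)  { inflight--; }

/*
 * Models __blk_insert_cloned_request(): a "precise" caller balances
 * the start it already accounted when direct issue fails.
 */
static int insert_cloned_request(int issue_fails, int precise)
{
        int ret = issue_fails ? -1 : 0;

        if (ret && precise)
                account_done();
        return ret;
}

int main(void)
{
        account_start();             /* dm accounts the clone */
        insert_cloned_request(1, 0); /* old behaviour on error */
        printf("without fix: inflight=%d\n", inflight); /* 1: leaked */

        inflight = 0;
        account_start();
        insert_cloned_request(1, 1); /* fixed behaviour on error */
        printf("with fix:    inflight=%d\n", inflight); /* 0: balanced */
        return 0;
}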
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Signed-off-by: Zhang Wensheng <zhangwensheng5@huawei.com>
Reviewed-by: Jason Yan <yanaijie@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 block/blk-core.c       | 22 ++++++++++++++++++++--
 block/blk.h            |  9 +++++++++
 drivers/md/dm-rq.c     |  2 +-
 include/linux/blkdev.h |  2 ++
 4 files changed, 32 insertions(+), 3 deletions(-)
diff --git a/block/blk-core.c b/block/blk-core.c
index df733e8caa6a1..a5d80ab911707 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -2669,8 +2669,10 @@ static blk_status_t blk_cloned_rq_check_limits(struct request_queue *q,
  * blk_insert_cloned_request - Helper for stacking drivers to submit a request
  * @q: the queue to submit the request
  * @rq: the request being queued
+ * @precise: true if io accounting of start and done will be balanced
  */
-blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *rq)
+blk_status_t __blk_insert_cloned_request(struct request_queue *q,
+                                         struct request *rq, bool precise)
 {
         unsigned long flags;
         int where = ELEVATOR_INSERT_BACK;
@@ -2693,7 +2695,16 @@ blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *
                  * bypass a potential scheduler on the bottom device for
                  * insert.
                  */
-                return blk_mq_request_issue_directly(rq);
+                ret = blk_mq_request_issue_directly(rq);
+                if (ret && precise) {
+                        u64 now = 0;
+
+                        if (blk_mq_need_time_stamp(rq))
+                                now = ktime_get_ns();
+
+                        blk_account_io_done(rq, now);
+                }
+                return ret;
         }
 
         spin_lock_irqsave(q->queue_lock, flags);
@@ -2718,6 +2729,13 @@ blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *
 
         return BLK_STS_OK;
 }
+EXPORT_SYMBOL_GPL(__blk_insert_cloned_request);
+
+blk_status_t blk_insert_cloned_request(struct request_queue *q,
+                                       struct request *rq)
+{
+        return __blk_insert_cloned_request(q, rq, false);
+}
 EXPORT_SYMBOL_GPL(blk_insert_cloned_request);
 
 /**
diff --git a/block/blk.h b/block/blk.h
index e496e26630f71..dde2141a32dde 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -405,6 +405,15 @@ static inline struct io_context *create_io_context(gfp_t gfp_mask, int node)
         return current->io_context;
 }
 
+/*
+ * Only need start/end time stamping if we have stats enabled, or using
+ * an IO scheduler.
+ */
+static inline bool blk_mq_need_time_stamp(struct request *rq)
+{
+        return (rq->rq_flags & RQF_IO_STAT) || rq->q->elevator;
+}
+
 /*
  * Internal throttling interface
  */
diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
index 80683f2b723d5..3bd805f7ce85b 100644
--- a/drivers/md/dm-rq.c
+++ b/drivers/md/dm-rq.c
@@ -410,7 +410,7 @@ static blk_status_t dm_dispatch_clone_request(struct request *clone, struct requ
                 clone->rq_flags |= RQF_IO_STAT;
 
         clone->start_time_ns = ktime_get_ns();
-        r = blk_insert_cloned_request(clone->q, clone);
+        r = __blk_insert_cloned_request(clone->q, clone, true);
         if (r != BLK_STS_OK && r != BLK_STS_RESOURCE && r != BLK_STS_DEV_RESOURCE)
                 /* must complete clone in terms of original request */
                 dm_complete_request(rq, r);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index a86659e78d987..1deaf36eb2371 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1012,6 +1012,8 @@ extern int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
 extern void blk_rq_unprep_clone(struct request *rq);
 extern blk_status_t blk_insert_cloned_request(struct request_queue *q,
                                               struct request *rq);
+extern blk_status_t __blk_insert_cloned_request(struct request_queue *q,
+                                                struct request *rq, bool precise);
 extern int blk_rq_append_bio(struct request *rq, struct bio **bio);
 extern void blk_delay_queue(struct request_queue *, unsigned long);
 extern void blk_queue_split(struct request_queue *, struct bio **);