From: Zhang Wensheng zhangwensheng5@huawei.com
hulk inclusion category: bugfix bugzilla: 39265, https://gitee.com/openeuler/kernel/issues/I4WC06 CVE: NA
-----------------------------------------------
When the inflight IOs are slow and no new IOs are issued, we expect iostat to reveal the IO hang. However, after commit 9c6dea45e6f7 ("block: delete part_round_stats and switch to less precise counting"), io_ticks and time_in_queue are not updated until the IO completes, so the avgqu-sz and %util columns of iostat stay at zero.
To fix it, we could fall back to the implementation before commit 9c6dea45e6f7, but that may cause a performance regression on NVMe devices or bio-based devices (due to the overhead of the inflight calculation), so add a switch to control whether or not to use precise iostat accounting. It can be enabled by adding "precise_iostat=1" to the kernel boot cmdline. When precise accounting is enabled, io_ticks and time_in_queue will be updated when accessing /proc/diskstats and /sys/block/sdX/sdXN/stat.
Fixes: 9c6dea45e6f7 ("block: delete part_round_stats and switch to less precise counting") Signed-off-by: Zhang Wensheng zhangwensheng5@huawei.com Reviewed-by: Jason Yan yanaijie@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- block/bio.c | 8 ++++++-- block/blk-core.c | 30 +++++++++++++++++++++++++++--- block/blk-merge.c | 2 ++ block/genhd.c | 7 +++++++ block/partition-generic.c | 8 ++++++++ include/linux/blkdev.h | 1 + 6 files changed, 51 insertions(+), 5 deletions(-)
diff --git a/block/bio.c b/block/bio.c index d94243411ef30..b50d3b59c79b4 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1706,9 +1706,13 @@ void generic_end_io_acct(struct request_queue *q, int req_op, const int sgrp = op_stat_group(req_op); int cpu = part_stat_lock();
- update_io_ticks(cpu, part, now); + if (precise_iostat) { + part_round_stats(q, cpu, part); + } else { + update_io_ticks(cpu, part, now); + part_stat_add(cpu, part, time_in_queue, duration); + } part_stat_add(cpu, part, nsecs[sgrp], jiffies_to_nsecs(duration)); - part_stat_add(cpu, part, time_in_queue, duration); part_dec_in_flight(q, part, op_is_write(req_op));
part_stat_unlock(); diff --git a/block/blk-core.c b/block/blk-core.c index 41d0b09e9a673..df733e8caa6a1 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -56,6 +56,20 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(block_unplug);
DEFINE_IDA(blk_queue_ida);
+bool precise_iostat; +static int __init precise_iostat_setup(char *str) +{ + bool precise; + + if (!strtobool(str, &precise)) { + precise_iostat = precise; + pr_info("precise iostat %d\n", precise_iostat); + } + + return 1; +} +__setup("precise_iostat=", precise_iostat_setup); + /* * For the allocated request tables */ @@ -1700,8 +1714,13 @@ static void part_round_stats_single(struct request_queue *q, int cpu, struct hd_struct *part, unsigned long now, unsigned int inflight) { - if (inflight) + if (inflight) { + if (precise_iostat) { + __part_stat_add(cpu, part, time_in_queue, + inflight * (now - part->stamp)); + } __part_stat_add(cpu, part, io_ticks, (now - part->stamp)); + } part->stamp = now; }
@@ -2771,10 +2790,15 @@ void blk_account_io_done(struct request *req, u64 now) cpu = part_stat_lock(); part = req->part;
- update_io_ticks(cpu, part, jiffies); + if (!precise_iostat) { + update_io_ticks(cpu, part, jiffies); + part_stat_add(cpu, part, time_in_queue, + nsecs_to_jiffies64(now - req->start_time_ns)); + } else { + part_round_stats(req->q, cpu, part); + } part_stat_inc(cpu, part, ios[sgrp]); part_stat_add(cpu, part, nsecs[sgrp], now - req->start_time_ns); - part_stat_add(cpu, part, time_in_queue, nsecs_to_jiffies64(now - req->start_time_ns)); part_dec_in_flight(req->q, part, rq_data_dir(req));
hd_struct_put(part); diff --git a/block/blk-merge.c b/block/blk-merge.c index 4c17c1031e34f..d2fabe1fdf326 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -669,6 +669,8 @@ static void blk_account_io_merge(struct request *req) cpu = part_stat_lock(); part = req->part;
+ if (precise_iostat) + part_round_stats(req->q, cpu, part); part_dec_in_flight(req->q, part, rq_data_dir(req));
hd_struct_put(part); diff --git a/block/genhd.c b/block/genhd.c index 183612cbbd6b7..e7b97fdb41731 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1352,6 +1352,7 @@ static int diskstats_show(struct seq_file *seqf, void *v) struct hd_struct *hd; char buf[BDEVNAME_SIZE]; unsigned int inflight[2]; + int cpu;
/* if (&disk_to_dev(gp)->kobj.entry == block_class.devices.next) @@ -1363,6 +1364,12 @@ static int diskstats_show(struct seq_file *seqf, void *v)
disk_part_iter_init(&piter, gp, DISK_PITER_INCL_EMPTY_PART0); while ((hd = disk_part_iter_next(&piter))) { + if (precise_iostat) { + cpu = part_stat_lock(); + part_round_stats(gp->queue, cpu, hd); + part_stat_unlock(); + } + part_in_flight(gp->queue, hd, inflight); seq_printf(seqf, "%4d %7d %s " "%lu %lu %lu %u " diff --git a/block/partition-generic.c b/block/partition-generic.c index 739c0cc5fd222..c4ac7a8c77dc5 100644 --- a/block/partition-generic.c +++ b/block/partition-generic.c @@ -18,6 +18,7 @@ #include <linux/ctype.h> #include <linux/genhd.h> #include <linux/blktrace_api.h> +#include <linux/blkdev.h>
#include "partitions/check.h"
@@ -121,6 +122,13 @@ ssize_t part_stat_show(struct device *dev, struct hd_struct *p = dev_to_part(dev); struct request_queue *q = part_to_disk(p)->queue; unsigned int inflight[2]; + int cpu; + + if (precise_iostat) { + cpu = part_stat_lock(); + part_round_stats(q, cpu, p); + part_stat_unlock(); + }
part_in_flight(q, p, inflight); return sprintf(buf, diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 28ea02865ecc1..a86659e78d987 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -28,6 +28,7 @@ #include <linux/scatterlist.h> #include <linux/blkzoned.h>
+extern bool precise_iostat; struct module; struct scsi_ioctl_command;