 
            mainline inclusion from mainline-v6.12-rc1 commit 29390bb5661d49d10424ad8e915230de1f7074c9 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/IAUKH4 CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i... -------------------------------- Currently, blk-throttle handle all IO fifo, hence if data IO is throttled and then meta IO is dispatched, the meta IO will have to wait for the data IO, causing priority inversion problems. This patch support to handle metadata first and then pay debt while throttling data. Test script: use cgroup v1 to throttle root cgroup, then create new dir and file while write back is throttled test() { mkdir /mnt/test/xxx touch /mnt/test/xxx/1 sync /mnt/test/xxx sync /mnt/test/xxx } mkfs.ext4 -F /dev/nvme0n1 -E lazy_itable_init=0,lazy_journal_init=0 mount /dev/nvme0n1 /mnt/test echo "259:0 $((1024*1024))" > /sys/fs/cgroup/blkio/blkio.throttle.write_bps_device dd if=/dev/zero of=/mnt/test/foo1 bs=16M count=1 conv=fdatasync status=none & sleep 4 time test echo "259:0 0" > /sys/fs/cgroup/blkio/blkio.throttle.write_bps_device sleep 1 umount /dev/nvme0n1 Test result: time cost for creating new dir and file before this patch: 14s after this patch: 0.1s Signed-off-by: Yu Kuai <yukuai3@huawei.com> Acked-by: Tejun Heo <tj@kernel.org> Link: https://lore.kernel.org/r/20240903135149.271857-3-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe <axboe@kernel.dk> Conflicts: block/blk-throttle.c [commit bf20ab538c81 ("blk-throttle: remove CONFIG_BLK_DEV_THROTTLING_LOW") is not backported.] Signed-off-by: Yu Kuai <yukuai3@huawei.com> --- block/blk-throttle.c | 63 +++++++++++++++++++++++++++++--------------- 1 file changed, 42 insertions(+), 21 deletions(-) diff --git a/block/blk-throttle.c b/block/blk-throttle.c index f9aeaa2ce9e7..5e299ea26d85 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -2140,6 +2140,22 @@ static inline void throtl_update_latency_buckets(struct throtl_data *td) } #endif +static bool tg_within_limit(struct throtl_grp *tg, struct bio *bio, bool rw) +{ + /* throtl is FIFO - if bios are already queued, should queue */ + if (tg->service_queue.nr_queued[rw]) + return false; + + return tg_may_dispatch(tg, bio, NULL); +} + +static void tg_dispatch_in_debt(struct throtl_grp *tg, struct bio *bio, bool rw) +{ + if (!bio_flagged(bio, BIO_BPS_THROTTLED)) + tg->carryover_bytes[rw] -= throtl_bio_data_size(bio); + tg->carryover_ios[rw]--; +} + bool __blk_throtl_bio(struct bio *bio) { struct request_queue *q = bio->bi_disk->queue; @@ -2167,12 +2183,33 @@ bool __blk_throtl_bio(struct bio *bio) tg->last_low_overflow_time[rw] = jiffies; throtl_downgrade_check(tg); throtl_upgrade_check(tg); - /* throtl is FIFO - if bios are already queued, should queue */ - if (sq->nr_queued[rw]) - break; - /* if above limits, break to queue */ - if (!tg_may_dispatch(tg, bio, NULL)) { + if (tg_within_limit(tg, bio, rw)) { + /* within limits, let's charge and dispatch directly */ + throtl_charge_bio(tg, bio); + + /* + * We need to trim slice even when bios are not being + * queued otherwise it might happen that a bio is not + * queued for a long time and slice keeps on extending + * and trim is not called for a long time. Now if limits + * are reduced suddenly we take into account all the IO + * dispatched so far at new low rate and * newly queued + * IO gets a really long dispatch time. + * + * So keep on trimming slice even if bio is not queued. + */ + throtl_trim_slice(tg, rw); + } else if (bio_issue_as_root_blkg(bio)) { + /* + * IOs which may cause priority inversions are + * dispatched directly, even if they're over limit. + * Debts are handled by carryover_bytes/ios while + * calculating wait time. + */ + tg_dispatch_in_debt(tg, bio, rw); + } else { + /* if above limits, break to queue */ tg->last_low_overflow_time[rw] = jiffies; if (throtl_can_upgrade(td, tg)) { throtl_upgrade_state(td); @@ -2181,22 +2218,6 @@ bool __blk_throtl_bio(struct bio *bio) break; } - /* within limits, let's charge and dispatch directly */ - throtl_charge_bio(tg, bio); - - /* - * We need to trim slice even when bios are not being queued - * otherwise it might happen that a bio is not queued for - * a long time and slice keeps on extending and trim is not - * called for a long time. Now if limits are reduced suddenly - * we take into account all the IO dispatched so far at new - * low rate and * newly queued IO gets a really long dispatch - * time. - * - * So keep on trimming slice even if bio is not queued. - */ - throtl_trim_slice(tg, rw); - /* * @bio passed through this layer without being throttled. * Climb up the ladder. If we're already at the top, it -- 2.39.2