Jens Axboe (1):
  block: move blk-throtl fast path inline

Jinke Han (1):
  blk-throttle: Fix io statistics for cgroup v1

Kemeng Shi (2):
  blk-throttle: Fix that bps of child could exceed bps limited in parent
  blk-throttle: correct calculation of wait time in tg_may_dispatch

Laibin Qiu (1):
  blk-throttle: Set BIO_THROTTLED when bio has been throttled

Ming Lei (4):
  block: throttle split bio in case of iops limit
  block: don't try to throttle split bio if iops limit isn't set
  block: revert 4f1e9630afe6 ("blk-throtl: optimize IOPS throttle for large IO scenarios")
  block: avoid use-after-free on throttle data

Yu Kuai (10):
  Revert "blk-throttle: fix io hung due to configuration updates"
  Revert "blk-throttle: Set BIO_THROTTLED when bio has been throttled"
  blk-throttle: fix that io throttle can only work for single bio
  blk-throttle: fix io hung due to configuration updates
  blk-throttle: remove THROTL_TG_HAS_IOPS_LIMIT
  blk-throttle: improve bypassing bios checkings
  blk-throttle: print signed value 'carryover_bytes/ios' for user
  blk-throttle: fix wrong comparation while 'carryover_ios/bytes' is negative
  blk-throttle: consider 'carryover_ios/bytes' in throtl_trim_slice()
  blk-throttle: support prioritized processing of metadata
 block/bio.c               |   2 -
 block/blk-cgroup.c        |   7 +-
 block/blk-core.c          |   1 +
 block/blk-merge.c         |   6 +-
 block/blk-sysfs.c         |   1 +
 block/blk-throttle.c      | 519 +++++++++++++-------------------
 block/blk-throttle.h      | 214 ++++++++++++++++
 block/blk.h               |  16 --
 include/linux/bio.h       |   2 +-
 include/linux/blk_types.h |   2 +-
 10 files changed, 396 insertions(+), 374 deletions(-)
 create mode 100644 block/blk-throttle.h
hulk inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/IAUKH4
CVE: NA
--------------------------------
This reverts commit 510d96b0f5a1edbd9ed1e9efbfde87930a3588b3.
Prepare to backport mainline solution in the next patch instead.
Fixes: 510d96b0f5a1 ("blk-throttle: fix io hung due to configuration updates")
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
---
 block/blk-throttle.c | 79 ++++++++------------------
 1 file changed, 13 insertions(+), 66 deletions(-)
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 5475aa477bc5..4e5cfc1c79e0 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -1432,57 +1432,7 @@ static int tg_print_conf_uint(struct seq_file *sf, void *v)
 	return 0;
 }

-static u64 throtl_update_bytes_disp(u64 dispatched, u64 new_limit,
-				    u64 old_limit)
-{
-	if (new_limit == old_limit)
-		return dispatched;
-
-	if (!dispatched)
-		return 0;
-
-	/*
-	 * In the case that multiply will overflow, just return 0. It will only
-	 * let bios to be dispatched earlier.
-	 */
-	if (div64_u64(U64_MAX, dispatched) < new_limit)
-		return 0;
-
-	dispatched *= new_limit;
-	return div64_u64(dispatched, old_limit);
-}
-
-static u32 throtl_update_io_disp(u32 dispatched, u32 new_limit, u32 old_limit)
-{
-	if (new_limit == old_limit)
-		return dispatched;
-
-	if (!dispatched)
-		return 0;
-	/*
-	 * In the case that multiply will overflow, just return 0. It will only
-	 * let bios to be dispatched earlier.
-	 */
-	if (UINT_MAX / dispatched < new_limit)
-		return 0;
-
-	dispatched *= new_limit;
-	return dispatched / old_limit;
-}
-
-static void throtl_update_slice(struct throtl_grp *tg, u64 *old_limits)
-{
-	tg->bytes_disp[READ] = throtl_update_bytes_disp(tg->bytes_disp[READ],
-			tg_bps_limit(tg, READ), old_limits[0]);
-	tg->bytes_disp[WRITE] = throtl_update_bytes_disp(tg->bytes_disp[WRITE],
-			tg_bps_limit(tg, WRITE), old_limits[1]);
-	tg->io_disp[READ] = throtl_update_io_disp(tg->io_disp[READ],
-			tg_iops_limit(tg, READ), (u32)old_limits[2]);
-	tg->io_disp[WRITE] = throtl_update_io_disp(tg->io_disp[WRITE],
-			tg_iops_limit(tg, WRITE), (u32)old_limits[3]);
-}
-
-static void tg_conf_updated(struct throtl_grp *tg, u64 *old_limits, bool global)
+static void tg_conf_updated(struct throtl_grp *tg, bool global)
 {
 	struct throtl_service_queue *sq = &tg->service_queue;
 	struct cgroup_subsys_state *pos_css;
@@ -1523,7 +1473,16 @@ static void tg_conf_updated(struct throtl_grp *tg, u64 *old_limits, bool global)
 	}
 	rcu_read_unlock();

-	throtl_update_slice(tg, old_limits);
+	/*
+	 * We're already holding queue_lock and know @tg is valid. Let's
+	 * apply the new config directly.
+	 *
+	 * Restart the slices for both READ and WRITES. It might happen
+	 * that a group's limit are dropped suddenly and we don't want to
+	 * account recently dispatched IO with new low rate.
+	 */
+	throtl_start_new_slice(tg, READ);
+	throtl_start_new_slice(tg, WRITE);

 	if (tg->flags & THROTL_TG_PENDING) {
 		tg_update_disptime(tg);
@@ -1556,14 +1515,6 @@ static inline int throtl_restart_syscall_when_busy(int errno)
 	return ret;
 }

-static void tg_get_limits(struct throtl_grp *tg, u64 *limits)
-{
-	limits[0] = tg_bps_limit(tg, READ);
-	limits[1] = tg_bps_limit(tg, WRITE);
-	limits[2] = tg_iops_limit(tg, READ);
-	limits[3] = tg_iops_limit(tg, WRITE);
-}
-
 static ssize_t tg_set_conf(struct kernfs_open_file *of,
 			   char *buf, size_t nbytes, loff_t off, bool is_u64)
 {
@@ -1572,7 +1523,6 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of,
 	struct throtl_grp *tg;
 	int ret;
 	u64 v;
-	u64 old_limits[4];

 	ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx);
 	if (ret)
@@ -1589,14 +1539,13 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of,
 		v = U64_MAX;

 	tg = blkg_to_tg(ctx.blkg);
-	tg_get_limits(tg, old_limits);

 	if (is_u64)
 		*(u64 *)((void *)tg + of_cft(of)->private) = v;
 	else
 		*(unsigned int *)((void *)tg + of_cft(of)->private) = v;

-	tg_conf_updated(tg, old_limits, false);
+	tg_conf_updated(tg, false);
 	ret = 0;
 out_finish:
 	blkg_conf_finish(&ctx);
@@ -1767,7 +1716,6 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of,
 	struct blkg_conf_ctx ctx;
 	struct throtl_grp *tg;
 	u64 v[4];
-	u64 old_limits[4];
 	unsigned long idle_time;
 	unsigned long latency_time;
 	int ret;
@@ -1786,7 +1734,6 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of,
 	v[1] = tg->bps_conf[WRITE][index];
 	v[2] = tg->iops_conf[READ][index];
 	v[3] = tg->iops_conf[WRITE][index];
-	tg_get_limits(tg, old_limits);

 	idle_time = tg->idletime_threshold_conf;
 	latency_time = tg->latency_target_conf;
@@ -1873,7 +1820,7 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of,
 		tg->td->limit_index = LIMIT_LOW;
 	} else
 		tg->td->limit_index = LIMIT_MAX;
-	tg_conf_updated(tg, old_limits, index == LIMIT_LOW &&
+	tg_conf_updated(tg, index == LIMIT_LOW &&
 		tg->td->limit_valid[LIMIT_LOW]);
 	ret = 0;
 out_finish:
hulk inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/IAUKH4
CVE: NA
--------------------------------
This reverts commit 2c7b55097150add1295e9329e00a6dffeaf84636.
Fixes: 2c7b55097150 ("blk-throttle: Set BIO_THROTTLED when bio has been throttled")
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
---
 block/blk-throttle.c | 16 +++++-----------
 1 file changed, 5 insertions(+), 11 deletions(-)
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 4e5cfc1c79e0..8a22fd8449bf 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -2279,16 +2279,13 @@ bool blk_throtl_bio(struct bio *bio)
 	struct throtl_service_queue *sq;
 	bool rw = bio_data_dir(bio);
 	bool throttled = false;
-	bool locked = true;
 	struct throtl_data *td = tg->td;

 	rcu_read_lock();

 	/* see throtl_charge_bio() */
-	if (bio_flagged(bio, BIO_THROTTLED)) {
-		locked = false;
+	if (bio_flagged(bio, BIO_THROTTLED))
 		goto out;
-	}

 	if (!cgroup_subsys_on_dfl(io_cgrp_subsys)) {
 		blkg_rwstat_add(&tg->stat_bytes, bio->bi_opf,
@@ -2296,10 +2293,8 @@ bool blk_throtl_bio(struct bio *bio)
 		blkg_rwstat_add(&tg->stat_ios, bio->bi_opf, 1);
 	}

-	if (!tg->has_rules[rw]) {
-		locked = false;
+	if (!tg->has_rules[rw])
 		goto out;
-	}

 	spin_lock_irq(&q->queue_lock);

@@ -2354,7 +2349,7 @@ bool blk_throtl_bio(struct bio *bio)
 		sq = sq->parent_sq;
 		tg = sq_to_tg(sq);
 		if (!tg)
-			goto out;
+			goto out_unlock;
 	}

 	/* out-of-limit, queue to @tg */
@@ -2382,6 +2377,8 @@ bool blk_throtl_bio(struct bio *bio)
 		throtl_schedule_next_dispatch(tg->service_queue.parent_sq, true);
 	}

+out_unlock:
+	spin_unlock_irq(&q->queue_lock);
 out:
 	bio_set_flag(bio, BIO_THROTTLED);

@@ -2389,9 +2386,6 @@ bool blk_throtl_bio(struct bio *bio)
 	if (throttled || !td->track_bio_latency)
 		bio->bi_issue.value |= BIO_ISSUE_THROTL_SKIP_LATENCY;
 #endif
-	if (locked)
-		spin_unlock_irq(&q->queue_lock);
-
 	rcu_read_unlock();
 	return throttled;
 }
From: Jens Axboe <axboe@kernel.dk>
mainline inclusion
from mainline-v5.16-rc1
commit a7b36ee6ba299ffa5c3b36af187b4d0fb32a557c
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/IAUKH4
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
Even if no policies are defined, we spend ~2% of the total IO time checking. Move the fast path inline.
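The resulting fast path, simplified from the blk-throttle.h hunk below, keeps the cheap checks inline and only calls into blk-throttle.c when the bio may actually need throttling:

	static inline bool blk_throtl_bio(struct bio *bio)
	{
		struct throtl_grp *tg = blkg_to_tg(bio->bi_blkg);

		/* already went through blk-throtl once */
		if (bio_flagged(bio, BIO_THROTTLED))
			return false;
		/* no throttling rules for this direction */
		if (!tg->has_rules[bio_data_dir(bio)])
			return false;

		return __blk_throtl_bio(bio);
	}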
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Conflicts:
	block/blk-throttle.c
	block/blk-cgroup.c
	block/blk-core.c
	block/blk-merge.c
[Context conflicts; also hide blk-throttle.h in blk-merge.c to prevent
kabi breakage.]
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
---
 block/blk-cgroup.c   |   1 +
 block/blk-core.c     |   1 +
 block/blk-merge.c    |   4 +
 block/blk-sysfs.c    |   1 +
 block/blk-throttle.c | 161 +------------------------------
 block/blk-throttle.h | 182 +++++++++++++++++++++++++++++++++++++++
 block/blk.h          |  16 ----
 7 files changed, 192 insertions(+), 174 deletions(-)
 create mode 100644 block/blk-throttle.h
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 1f2c93e9daa1..0872b392360d 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -31,6 +31,7 @@
 #include <linux/tracehook.h>
 #include <linux/psi.h>
 #include "blk.h"
+#include "blk-throttle.h"
#define MAX_KEY_LEN 100
diff --git a/block/blk-core.c b/block/blk-core.c
index bf3bfc3ed339..2f8829bc7e50 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -51,6 +51,7 @@
 #include "blk-mq-sched.h"
 #include "blk-pm.h"
 #include "blk-rq-qos.h"
+#include "blk-throttle.h"
struct dentry *blk_debugfs_root;
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 3b2004308e93..f5c770f57946 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -13,6 +13,10 @@
 #include "blk.h"
 #include "blk-rq-qos.h"

+#ifndef __GENKSYMS__
+#include "blk-throttle.h"
+#endif
+
 /*
  * rq_straddles_atomic_write_boundary - check for boundary violation
  * @rq: request to check
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 078aace75204..bfca6fb243d3 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -17,6 +17,7 @@
 #include "blk-mq.h"
 #include "blk-mq-debugfs.h"
 #include "blk-wbt.h"
+#include "blk-throttle.h"
 struct queue_sysfs_entry {
 	struct attribute attr;
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 8a22fd8449bf..dc2e2db6013a 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -14,6 +14,7 @@
 #include <linux/delay.h>
 #include "blk.h"
 #include "blk-cgroup-rwstat.h"
+#include "blk-throttle.h"

 /* Max dispatch from a group in 1 round */
 #define THROTL_GRP_QUANTUM 8
@@ -38,8 +39,6 @@
  */
 #define LATENCY_FILTERED_HD (1000L) /* 1ms */

-static struct blkcg_policy blkcg_policy_throtl;
-
 /* A workqueue to queue throttle related work */
 static struct workqueue_struct *kthrotld_workqueue;
@@ -56,55 +55,6 @@ static int __init setup_global_limit(char *str)
__setup("blkcg_global_limit=", setup_global_limit);
-/*
- * To implement hierarchical throttling, throtl_grps form a tree and bios
- * are dispatched upwards level by level until they reach the top and get
- * issued. When dispatching bios from the children and local group at each
- * level, if the bios are dispatched into a single bio_list, there's a risk
- * of a local or child group which can queue many bios at once filling up
- * the list starving others.
- *
- * To avoid such starvation, dispatched bios are queued separately
- * according to where they came from. When they are again dispatched to
- * the parent, they're popped in round-robin order so that no single source
- * hogs the dispatch window.
- *
- * throtl_qnode is used to keep the queued bios separated by their sources.
- * Bios are queued to throtl_qnode which in turn is queued to
- * throtl_service_queue and then dispatched in round-robin order.
- *
- * It's also used to track the reference counts on blkg's. A qnode always
- * belongs to a throtl_grp and gets queued on itself or the parent, so
- * incrementing the reference of the associated throtl_grp when a qnode is
- * queued and decrementing when dequeued is enough to keep the whole blkg
- * tree pinned while bios are in flight.
- */
-struct throtl_qnode {
-	struct list_head node;		/* service_queue->queued[] */
-	struct bio_list bios;		/* queued bios */
-	struct throtl_grp *tg;		/* tg this qnode belongs to */
-};
-
-struct throtl_service_queue {
-	struct throtl_service_queue *parent_sq;	/* the parent service_queue */
-
-	/*
-	 * Bios queued directly to this service_queue or dispatched from
-	 * children throtl_grp's.
-	 */
-	struct list_head queued[2];	/* throtl_qnode [READ/WRITE] */
-	unsigned int nr_queued[2];	/* number of queued bios */
-
-	/*
-	 * RB tree of active children throtl_grp's, which are sorted by
-	 * their ->disptime.
-	 */
-	struct rb_root_cached pending_tree;	/* RB tree of active tgs */
-	unsigned int nr_pending;	/* # queued in the tree */
-	unsigned long first_pending_disptime;	/* disptime of the first tg */
-	struct timer_list pending_timer;	/* fires on first_pending_disptime */
-};
-
 enum tg_state_flags {
 	THROTL_TG_PENDING = 1 << 0,	/* on parent's pending tree */
 	THROTL_TG_WAS_EMPTY = 1 << 1,	/* bio_lists[] became non-empty */
@@ -112,93 +62,6 @@
#define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node)
-enum {
-	LIMIT_LOW,
-	LIMIT_MAX,
-	LIMIT_CNT,
-};
-
-struct throtl_grp {
-	/* must be the first member */
-	struct blkg_policy_data pd;
-
-	/* active throtl group service_queue member */
-	struct rb_node rb_node;
-
-	/* throtl_data this group belongs to */
-	struct throtl_data *td;
-
-	/* this group's service queue */
-	struct throtl_service_queue service_queue;
-
-	/*
-	 * qnode_on_self is used when bios are directly queued to this
-	 * throtl_grp so that local bios compete fairly with bios
-	 * dispatched from children. qnode_on_parent is used when bios are
-	 * dispatched from this throtl_grp into its parent and will compete
-	 * with the sibling qnode_on_parents and the parent's
-	 * qnode_on_self.
-	 */
-	struct throtl_qnode qnode_on_self[2];
-	struct throtl_qnode qnode_on_parent[2];
-
-	/*
-	 * Dispatch time in jiffies. This is the estimated time when group
-	 * will unthrottle and is ready to dispatch more bio. It is used as
-	 * key to sort active groups in service tree.
-	 */
-	unsigned long disptime;
-
-	unsigned int flags;
-
-	/* are there any throtl rules between this group and td? */
-	bool has_rules[2];
-
-	/* internally used bytes per second rate limits */
-	uint64_t bps[2][LIMIT_CNT];
-	/* user configured bps limits */
-	uint64_t bps_conf[2][LIMIT_CNT];
-
-	/* internally used IOPS limits */
-	unsigned int iops[2][LIMIT_CNT];
-	/* user configured IOPS limits */
-	unsigned int iops_conf[2][LIMIT_CNT];
-
-	/* Number of bytes dispatched in current slice */
-	uint64_t bytes_disp[2];
-	/* Number of bio's dispatched in current slice */
-	unsigned int io_disp[2];
-
-	unsigned long last_low_overflow_time[2];
-
-	uint64_t last_bytes_disp[2];
-	unsigned int last_io_disp[2];
-
-	unsigned long last_check_time;
-
-	unsigned long latency_target; /* us */
-	unsigned long latency_target_conf; /* us */
-	/* When did we start a new slice */
-	unsigned long slice_start[2];
-	unsigned long slice_end[2];
-
-	unsigned long last_finish_time; /* ns / 1024 */
-	unsigned long checked_last_finish_time; /* ns / 1024 */
-	unsigned long avg_idletime; /* ns / 1024 */
-	unsigned long idletime_threshold; /* us */
-	unsigned long idletime_threshold_conf; /* us */
-
-	unsigned int bio_cnt; /* total bios */
-	unsigned int bad_bio_cnt; /* bios exceeding latency threshold */
-	unsigned long bio_cnt_reset_time;
-
-	atomic_t io_split_cnt[2];
-	atomic_t last_io_split_cnt[2];
-
-	struct blkg_rwstat stat_bytes;
-	struct blkg_rwstat stat_ios;
-};
-
 /* We measure latency for request size from <= 4k to >= 1M */
 #define LATENCY_BUCKET_SIZE 9
@@ -245,16 +108,6 @@ struct throtl_data
static void throtl_pending_timer_fn(struct timer_list *t);
-static inline struct throtl_grp *pd_to_tg(struct blkg_policy_data *pd)
-{
-	return pd ? container_of(pd, struct throtl_grp, pd) : NULL;
-}
-
-static inline struct throtl_grp *blkg_to_tg(struct blkcg_gq *blkg)
-{
-	return pd_to_tg(blkg_to_pd(blkg, &blkcg_policy_throtl));
-}
-
 static inline struct blkcg_gq *tg_to_blkg(struct throtl_grp *tg)
 {
 	return pd_to_blkg(&tg->pd);
@@ -1856,7 +1709,7 @@ static void throtl_shutdown_wq(struct request_queue *q)
 	cancel_work_sync(&td->dispatch_work);
 }

-static struct blkcg_policy blkcg_policy_throtl = {
+struct blkcg_policy blkcg_policy_throtl = {
 	.dfl_cftypes		= throtl_files,
 	.legacy_cftypes		= throtl_legacy_files,
@@ -2270,7 +2123,7 @@ void blk_throtl_charge_bio_split(struct bio *bio) } while (parent); }
-bool blk_throtl_bio(struct bio *bio)
+bool __blk_throtl_bio(struct bio *bio)
 {
 	struct request_queue *q = bio->bi_disk->queue;
 	struct blkcg_gq *blkg = bio->bi_blkg;
@@ -2283,19 +2136,12 @@ bool blk_throtl_bio(struct bio *bio)
rcu_read_lock();
-	/* see throtl_charge_bio() */
-	if (bio_flagged(bio, BIO_THROTTLED))
-		goto out;
-
 	if (!cgroup_subsys_on_dfl(io_cgrp_subsys)) {
 		blkg_rwstat_add(&tg->stat_bytes, bio->bi_opf,
 				bio->bi_iter.bi_size);
 		blkg_rwstat_add(&tg->stat_ios, bio->bi_opf, 1);
 	}

-	if (!tg->has_rules[rw])
-		goto out;
-
 	spin_lock_irq(&q->queue_lock);

 	throtl_update_latency_buckets(td);
@@ -2379,7 +2225,6 @@ bool blk_throtl_bio(struct bio *bio)

 out_unlock:
 	spin_unlock_irq(&q->queue_lock);
-out:
 	bio_set_flag(bio, BIO_THROTTLED);
 #ifdef CONFIG_BLK_DEV_THROTTLING_LOW
diff --git a/block/blk-throttle.h b/block/blk-throttle.h
new file mode 100644
index 000000000000..175f03abd9e4
--- /dev/null
+++ b/block/blk-throttle.h
@@ -0,0 +1,182 @@
+#ifndef BLK_THROTTLE_H
+#define BLK_THROTTLE_H
+
+#include "blk-cgroup-rwstat.h"
+
+/*
+ * To implement hierarchical throttling, throtl_grps form a tree and bios
+ * are dispatched upwards level by level until they reach the top and get
+ * issued. When dispatching bios from the children and local group at each
+ * level, if the bios are dispatched into a single bio_list, there's a risk
+ * of a local or child group which can queue many bios at once filling up
+ * the list starving others.
+ *
+ * To avoid such starvation, dispatched bios are queued separately
+ * according to where they came from. When they are again dispatched to
+ * the parent, they're popped in round-robin order so that no single source
+ * hogs the dispatch window.
+ *
+ * throtl_qnode is used to keep the queued bios separated by their sources.
+ * Bios are queued to throtl_qnode which in turn is queued to
+ * throtl_service_queue and then dispatched in round-robin order.
+ *
+ * It's also used to track the reference counts on blkg's. A qnode always
+ * belongs to a throtl_grp and gets queued on itself or the parent, so
+ * incrementing the reference of the associated throtl_grp when a qnode is
+ * queued and decrementing when dequeued is enough to keep the whole blkg
+ * tree pinned while bios are in flight.
+ */
+struct throtl_qnode {
+	struct list_head node;		/* service_queue->queued[] */
+	struct bio_list bios;		/* queued bios */
+	struct throtl_grp *tg;		/* tg this qnode belongs to */
+};
+
+struct throtl_service_queue {
+	struct throtl_service_queue *parent_sq;	/* the parent service_queue */
+
+	/*
+	 * Bios queued directly to this service_queue or dispatched from
+	 * children throtl_grp's.
+	 */
+	struct list_head queued[2];	/* throtl_qnode [READ/WRITE] */
+	unsigned int nr_queued[2];	/* number of queued bios */
+
+	/*
+	 * RB tree of active children throtl_grp's, which are sorted by
+	 * their ->disptime.
+	 */
+	struct rb_root_cached pending_tree;	/* RB tree of active tgs */
+	unsigned int nr_pending;	/* # queued in the tree */
+	unsigned long first_pending_disptime;	/* disptime of the first tg */
+	struct timer_list pending_timer;	/* fires on first_pending_disptime */
+};
+
+enum {
+	LIMIT_LOW,
+	LIMIT_MAX,
+	LIMIT_CNT,
+};
+
+struct throtl_grp {
+	/* must be the first member */
+	struct blkg_policy_data pd;
+
+	/* active throtl group service_queue member */
+	struct rb_node rb_node;
+
+	/* throtl_data this group belongs to */
+	struct throtl_data *td;
+
+	/* this group's service queue */
+	struct throtl_service_queue service_queue;
+
+	/*
+	 * qnode_on_self is used when bios are directly queued to this
+	 * throtl_grp so that local bios compete fairly with bios
+	 * dispatched from children. qnode_on_parent is used when bios are
+	 * dispatched from this throtl_grp into its parent and will compete
+	 * with the sibling qnode_on_parents and the parent's
+	 * qnode_on_self.
+	 */
+	struct throtl_qnode qnode_on_self[2];
+	struct throtl_qnode qnode_on_parent[2];
+
+	/*
+	 * Dispatch time in jiffies. This is the estimated time when group
+	 * will unthrottle and is ready to dispatch more bio. It is used as
+	 * key to sort active groups in service tree.
+	 */
+	unsigned long disptime;
+
+	unsigned int flags;
+
+	/* are there any throtl rules between this group and td? */
+	bool has_rules[2];
+
+	/* internally used bytes per second rate limits */
+	uint64_t bps[2][LIMIT_CNT];
+	/* user configured bps limits */
+	uint64_t bps_conf[2][LIMIT_CNT];
+
+	/* internally used IOPS limits */
+	unsigned int iops[2][LIMIT_CNT];
+	/* user configured IOPS limits */
+	unsigned int iops_conf[2][LIMIT_CNT];
+
+	/* Number of bytes dispatched in current slice */
+	uint64_t bytes_disp[2];
+	/* Number of bio's dispatched in current slice */
+	unsigned int io_disp[2];
+
+	unsigned long last_low_overflow_time[2];
+
+	uint64_t last_bytes_disp[2];
+	unsigned int last_io_disp[2];
+
+	unsigned long last_check_time;
+
+	unsigned long latency_target; /* us */
+	unsigned long latency_target_conf; /* us */
+	/* When did we start a new slice */
+	unsigned long slice_start[2];
+	unsigned long slice_end[2];
+
+	unsigned long last_finish_time; /* ns / 1024 */
+	unsigned long checked_last_finish_time; /* ns / 1024 */
+	unsigned long avg_idletime; /* ns / 1024 */
+	unsigned long idletime_threshold; /* us */
+	unsigned long idletime_threshold_conf; /* us */
+
+	unsigned int bio_cnt; /* total bios */
+	unsigned int bad_bio_cnt; /* bios exceeding latency threshold */
+	unsigned long bio_cnt_reset_time;
+
+	atomic_t io_split_cnt[2];
+	atomic_t last_io_split_cnt[2];
+
+	struct blkg_rwstat stat_bytes;
+	struct blkg_rwstat stat_ios;
+};
+
+extern struct blkcg_policy blkcg_policy_throtl;
+
+static inline struct throtl_grp *pd_to_tg(struct blkg_policy_data *pd)
+{
+	return pd ? container_of(pd, struct throtl_grp, pd) : NULL;
+}
+
+static inline struct throtl_grp *blkg_to_tg(struct blkcg_gq *blkg)
+{
+	return pd_to_tg(blkg_to_pd(blkg, &blkcg_policy_throtl));
+}
+
+/*
+ * Internal throttling interface
+ */
+#ifndef CONFIG_BLK_DEV_THROTTLING
+static inline int blk_throtl_init(struct request_queue *q) { return 0; }
+static inline void blk_throtl_exit(struct request_queue *q) { }
+static inline void blk_throtl_register_queue(struct request_queue *q) { }
+static inline void blk_throtl_charge_bio_split(struct bio *bio) { }
+static inline bool blk_throtl_bio(struct bio *bio) { return false; }
+#else /* CONFIG_BLK_DEV_THROTTLING */
+int blk_throtl_init(struct request_queue *q);
+void blk_throtl_exit(struct request_queue *q);
+void blk_throtl_register_queue(struct request_queue *q);
+void blk_throtl_charge_bio_split(struct bio *bio);
+bool __blk_throtl_bio(struct bio *bio);
+static inline bool blk_throtl_bio(struct bio *bio)
+{
+	struct throtl_grp *tg = blkg_to_tg(bio->bi_blkg);

+	if (bio_flagged(bio, BIO_THROTTLED))
+		return false;
+	if (!tg->has_rules[bio_data_dir(bio)])
+		return false;
+
+	return __blk_throtl_bio(bio);
+}
+#endif /* CONFIG_BLK_DEV_THROTTLING */
+
+#endif
diff --git a/block/blk.h b/block/blk.h
index c86d27d80ba0..b3f0981148c9 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -301,22 +301,6 @@ void ioc_clear_queue(struct request_queue *q);
int create_task_io_context(struct task_struct *task, gfp_t gfp_mask, int node);
-/*
- * Internal throttling interface
- */
-#ifdef CONFIG_BLK_DEV_THROTTLING
-extern int blk_throtl_init(struct request_queue *q);
-extern void blk_throtl_exit(struct request_queue *q);
-extern void blk_throtl_register_queue(struct request_queue *q);
-extern void blk_throtl_charge_bio_split(struct bio *bio);
-bool blk_throtl_bio(struct bio *bio);
-#else /* CONFIG_BLK_DEV_THROTTLING */
-static inline int blk_throtl_init(struct request_queue *q) { return 0; }
-static inline void blk_throtl_exit(struct request_queue *q) { }
-static inline void blk_throtl_register_queue(struct request_queue *q) { }
-static inline void blk_throtl_charge_bio_split(struct bio *bio) { }
-static inline bool blk_throtl_bio(struct bio *bio) { return false; }
-#endif /* CONFIG_BLK_DEV_THROTTLING */
 #ifdef CONFIG_BLK_DEV_THROTTLING_LOW
 extern ssize_t blk_throtl_sample_time_show(struct request_queue *q, char *page);
 extern ssize_t blk_throtl_sample_time_store(struct request_queue *q,
From: Ming Lei <ming.lei@redhat.com>
mainline inclusion
from mainline-v5.18-rc1
commit 9f5ede3c01f9951b0ae7d68b28762ad51d9bacc8
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/IAUKH4
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
Commit 111be8839817 ("block-throttle: avoid double charge") marks a bio as BIO_THROTTLED unconditionally once __blk_throtl_bio() has been called on it, so the bio never enters __blk_throtl_bio() again. This avoids double charging when a bio is split. That is reasonable for read/write throughput limits, but not for IOPS limits, because the block layer accounts split bios individually.

Chunguang Xu already observed this issue and fixed it in commit 4f1e9630afe6 ("blk-throtl: optimize IOPS throttle for large IO scenarios"). However, that patch only covers bio splitting in __blk_queue_split(), and there are other kinds of bio splitting, such as bio_split() & submit_bio_noacct().

This patch fixes the issue in one generic way by always charging the bio against the iops limit in blk_throtl_bio(). This is reasonable: a re-submitted or fast-cloned bio is charged if it is submitted to the same disk/queue, and BIO_THROTTLED is cleared if bio->bi_bdev changes.

This new approach gives a much smoother/more stable iops limit than commit 4f1e9630afe6 ("blk-throtl: optimize IOPS throttle for large IO scenarios"), since that commit can't actually throttle the current split bios.

Also, this approach won't introduce a new double iops charge in blk_throtl_dispatch_work_fn(), where blk_throtl_bio() is no longer called.
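In short, the charging rule becomes: bytes are charged only on the first pass (guarded by BIO_THROTTLED), while ios are charged on every pass, so split bios count toward iops. A condensed view of throtl_charge_bio() after this patch (taken from the hunk below):

	/* Charge the bio to the group */
	if (!bio_flagged(bio, BIO_THROTTLED)) {
		/* bytes: charged only once per original bio */
		tg->bytes_disp[rw] += bio_size;
		tg->last_bytes_disp[rw] += bio_size;
	}
	/* ios: charged on every submission, including splits */
	tg->io_disp[rw]++;
	tg->last_io_disp[rw]++;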
Reported-by: Ning Li <lining2020x@163.com>
Acked-by: Tejun Heo <tj@kernel.org>
Cc: Chunguang Xu <brookxu@tencent.com>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Link: https://lore.kernel.org/r/20220216044514.2903784-7-ming.lei@redhat.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Conflicts:
	block/blk-merge.c
[Context conflicts]
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
---
 block/blk-merge.c    |  2 --
 block/blk-throttle.c | 10 +++++++---
 block/blk-throttle.h |  2 --
 3 files changed, 7 insertions(+), 7 deletions(-)
diff --git a/block/blk-merge.c b/block/blk-merge.c
index f5c770f57946..88b0ffe5439f 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -399,8 +399,6 @@ void __blk_queue_split(struct bio **bio, unsigned int *nr_segs)
 		trace_block_split(q, split, (*bio)->bi_iter.bi_sector);
 		submit_bio_noacct(*bio);
 		*bio = split;
-
-		blk_throtl_charge_bio_split(*bio);
 	}
 }

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index dc2e2db6013a..007acfb90b4d 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -837,7 +837,8 @@ static bool tg_within_bps_limit(struct throtl_grp *tg, struct bio *bio,
 	unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
 	unsigned int bio_size = throtl_bio_data_size(bio);

-	if (bps_limit == U64_MAX) {
+	/* no need to throttle if this bio's bytes have been accounted */
+	if (bps_limit == U64_MAX || bio_flagged(bio, BIO_THROTTLED)) {
 		if (wait)
 			*wait = 0;
 		return true;
@@ -945,9 +946,12 @@ static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
 	unsigned int bio_size = throtl_bio_data_size(bio);

 	/* Charge the bio to the group */
-	tg->bytes_disp[rw] += bio_size;
+	if (!bio_flagged(bio, BIO_THROTTLED)) {
+		tg->bytes_disp[rw] += bio_size;
+		tg->last_bytes_disp[rw] += bio_size;
+	}
+
 	tg->io_disp[rw]++;
-	tg->last_bytes_disp[rw] += bio_size;
 	tg->last_io_disp[rw]++;

 	/*
diff --git a/block/blk-throttle.h b/block/blk-throttle.h
index 175f03abd9e4..cb43f4417d6e 100644
--- a/block/blk-throttle.h
+++ b/block/blk-throttle.h
@@ -170,8 +170,6 @@ static inline bool blk_throtl_bio(struct bio *bio)
 {
 	struct throtl_grp *tg = blkg_to_tg(bio->bi_blkg);

-	if (bio_flagged(bio, BIO_THROTTLED))
-		return false;
 	if (!tg->has_rules[bio_data_dir(bio)])
 		return false;
From: Ming Lei <ming.lei@redhat.com>
mainline inclusion
from mainline-v5.18-rc1
commit 5a93b6027eb4ef5db60a4bc5bdbeba5fb9f29384
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/IAUKH4
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
We need to throttle a split bio in case of an IOPS limit, even though the split bio has been marked BIO_THROTTLED, since the block layer accounts split bios individually.

If only a throughput limit is set up, there is no need to throttle again once BIO_THROTTLED is set, since the whole bio's bytes have already been accounted.

Add a THROTL_TG_HAS_IOPS_LIMIT flag to serve this purpose.
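The resulting inline check, as added to blk-throttle.h in the hunk below, lets a BIO_THROTTLED bio bypass throttling only when no iops limit is configured:

	/* no need to throttle bps any more if the bio has been throttled */
	if (bio_flagged(bio, BIO_THROTTLED) &&
	    !(tg->flags & THROTL_TG_HAS_IOPS_LIMIT))
		return false;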
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Link: https://lore.kernel.org/r/20220216044514.2903784-8-ming.lei@redhat.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Conflicts:
	block/blk-throttle.c
[Context conflicts]
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
---
 block/blk-throttle.c | 21 ++++++++++++++-------
 block/blk-throttle.h | 11 +++++++++++
 2 files changed, 25 insertions(+), 7 deletions(-)
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 007acfb90b4d..98f3cc66674c 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -55,11 +55,6 @@ static int __init setup_global_limit(char *str)
__setup("blkcg_global_limit=", setup_global_limit);
-enum tg_state_flags {
-	THROTL_TG_PENDING = 1 << 0,	/* on parent's pending tree */
-	THROTL_TG_WAS_EMPTY = 1 << 1,	/* bio_lists[] became non-empty */
-};
-
 #define rb_entry_tg(node)	rb_entry((node), struct throtl_grp, rb_node)

 /* We measure latency for request size from <= 4k to >= 1M */
@@ -440,12 +435,24 @@ static void tg_update_has_rules(struct throtl_grp *tg)
 	struct throtl_grp *parent_tg = sq_to_tg(tg->service_queue.parent_sq);
 	struct throtl_data *td = tg->td;
 	int rw;
+	int has_iops_limit = 0;
+
+	for (rw = READ; rw <= WRITE; rw++) {
+		unsigned int iops_limit = tg_iops_limit(tg, rw);

-	for (rw = READ; rw <= WRITE; rw++)
 		tg->has_rules[rw] = (parent_tg && parent_tg->has_rules[rw]) ||
 			(td->limit_valid[td->limit_index] &&
 			 (tg_bps_limit(tg, rw) != U64_MAX ||
-			  tg_iops_limit(tg, rw) != UINT_MAX));
+			  iops_limit != UINT_MAX));
+
+		if (iops_limit != UINT_MAX)
+			has_iops_limit = 1;
+	}
+
+	if (has_iops_limit)
+		tg->flags |= THROTL_TG_HAS_IOPS_LIMIT;
+	else
+		tg->flags &= ~THROTL_TG_HAS_IOPS_LIMIT;
 }

 static void throtl_pd_online(struct blkg_policy_data *pd)
diff --git a/block/blk-throttle.h b/block/blk-throttle.h
index cb43f4417d6e..c996a15f290e 100644
--- a/block/blk-throttle.h
+++ b/block/blk-throttle.h
@@ -52,6 +52,12 @@ struct throtl_service_queue {
 	struct timer_list pending_timer;	/* fires on first_pending_disptime */
 };

+enum tg_state_flags {
+	THROTL_TG_PENDING = 1 << 0,	/* on parent's pending tree */
+	THROTL_TG_WAS_EMPTY = 1 << 1,	/* bio_lists[] became non-empty */
+	THROTL_TG_HAS_IOPS_LIMIT = 1 << 2,	/* tg has iops limit */
+};
+
 enum {
 	LIMIT_LOW,
 	LIMIT_MAX,
@@ -170,6 +176,11 @@ static inline bool blk_throtl_bio(struct bio *bio)
 {
 	struct throtl_grp *tg = blkg_to_tg(bio->bi_blkg);

+	/* no need to throttle bps any more if the bio has been throttled */
+	if (bio_flagged(bio, BIO_THROTTLED) &&
+	    !(tg->flags & THROTL_TG_HAS_IOPS_LIMIT))
+		return false;
+
 	if (!tg->has_rules[bio_data_dir(bio)])
 		return false;
From: Ming Lei <ming.lei@redhat.com>
mainline inclusion
from mainline-v5.18-rc1
commit 34841e6fb125aa3f0e33e4eaac9f5eb86b2bb34b
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/IAUKH4
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
Revert commit 4f1e9630afe6 ("blk-throtl: optimize IOPS throttle for large IO scenarios"), since we now have an easier way to address this issue that gives a better iops throttling result.
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Link: https://lore.kernel.org/r/20220216044514.2903784-9-ming.lei@redhat.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Conflicts:
	block/blk-throttle.c
[commit 681cd46fff8c ("blk-throttle: factor out code to calculate
ios/bytes_allowed") is backported first.]
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
---
 block/blk-throttle.c | 28 ----------------------------
 block/blk-throttle.h |  5 -----
 2 files changed, 33 deletions(-)
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 98f3cc66674c..fec47ad3b04e 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -655,8 +655,6 @@ static inline void throtl_start_new_slice_with_credit(struct throtl_grp *tg,
 	tg->bytes_disp[rw] = 0;
 	tg->io_disp[rw] = 0;

-	atomic_set(&tg->io_split_cnt[rw], 0);
-
 	/*
 	 * Previous slice has expired. We must have trimmed it after last
 	 * bio dispatch. That means since start of last slice, we never used
@@ -680,8 +678,6 @@ static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw)
 	tg->slice_start[rw] = jiffies;
 	tg->slice_end[rw] = jiffies + tg->td->throtl_slice;

-	atomic_set(&tg->io_split_cnt[rw], 0);
-
 	throtl_log(&tg->service_queue,
 		   "[%c] new slice start=%lu end=%lu jiffies=%lu",
 		   rw == READ ? 'R' : 'W', tg->slice_start[rw],
@@ -926,9 +922,6 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
 			jiffies + tg->td->throtl_slice);
 	}

-	if (iops_limit != UINT_MAX)
-		tg->io_disp[rw] += atomic_xchg(&tg->io_split_cnt[rw], 0);
-
 	if (tg_within_bps_limit(tg, bio, bps_limit, &bps_wait) &&
 	    tg_within_iops_limit(tg, bio, iops_limit, &iops_wait)) {
 		if (wait)
@@ -1989,14 +1982,12 @@ static void throtl_downgrade_check(struct throtl_grp *tg)
 	}

 	if (tg->iops[READ][LIMIT_LOW]) {
-		tg->last_io_disp[READ] += atomic_xchg(&tg->last_io_split_cnt[READ], 0);
 		iops = tg->last_io_disp[READ] * HZ / elapsed_time;
 		if (iops >= tg->iops[READ][LIMIT_LOW])
 			tg->last_low_overflow_time[READ] = now;
 	}

 	if (tg->iops[WRITE][LIMIT_LOW]) {
-		tg->last_io_disp[WRITE] += atomic_xchg(&tg->last_io_split_cnt[WRITE], 0);
 		iops = tg->last_io_disp[WRITE] * HZ / elapsed_time;
 		if (iops >= tg->iops[WRITE][LIMIT_LOW])
 			tg->last_low_overflow_time[WRITE] = now;
@@ -2115,25 +2106,6 @@ static inline void throtl_update_latency_buckets(struct throtl_data *td)
 }
 #endif

-void blk_throtl_charge_bio_split(struct bio *bio)
-{
-	struct blkcg_gq *blkg = bio->bi_blkg;
-	struct throtl_grp *parent = blkg_to_tg(blkg);
-	struct throtl_service_queue *parent_sq;
-	bool rw = bio_data_dir(bio);
-
-	do {
-		if (!parent->has_rules[rw])
-			break;
-
-		atomic_inc(&parent->io_split_cnt[rw]);
-		atomic_inc(&parent->last_io_split_cnt[rw]);
-
-		parent_sq = parent->service_queue.parent_sq;
-		parent = sq_to_tg(parent_sq);
-	} while (parent);
-}
-
 bool __blk_throtl_bio(struct bio *bio)
 {
 	struct request_queue *q = bio->bi_disk->queue;
diff --git a/block/blk-throttle.h b/block/blk-throttle.h
index c996a15f290e..b23a9f3abb82 100644
--- a/block/blk-throttle.h
+++ b/block/blk-throttle.h
@@ -138,9 +138,6 @@ struct throtl_grp {
 	unsigned int bad_bio_cnt; /* bios exceeding latency threshold */
 	unsigned long bio_cnt_reset_time;

-	atomic_t io_split_cnt[2];
-	atomic_t last_io_split_cnt[2];
-
 	struct blkg_rwstat stat_bytes;
 	struct blkg_rwstat stat_ios;
 };
@@ -164,13 +161,11 @@ static inline struct throtl_grp *blkg_to_tg(struct blkcg_gq *blkg)
 static inline int blk_throtl_init(struct request_queue *q) { return 0; }
 static inline void blk_throtl_exit(struct request_queue *q) { }
 static inline void blk_throtl_register_queue(struct request_queue *q) { }
-static inline void blk_throtl_charge_bio_split(struct bio *bio) { }
 static inline bool blk_throtl_bio(struct bio *bio) { return false; }
 #else /* CONFIG_BLK_DEV_THROTTLING */
 int blk_throtl_init(struct request_queue *q);
 void blk_throtl_exit(struct request_queue *q);
 void blk_throtl_register_queue(struct request_queue *q);
-void blk_throtl_charge_bio_split(struct bio *bio);
 bool __blk_throtl_bio(struct bio *bio);
 static inline bool blk_throtl_bio(struct bio *bio)
 {
From: Ming Lei <ming.lei@redhat.com>
mainline inclusion
from mainline-v5.18-rc1
commit ee37eddbfa9e0401f13a01691cf4bbbacd2d16c9
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/IAUKH4
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
In throtl_pending_timer_fn(), the request queue is retrieved from the throttle data, and a tg's pending timer is deleted synchronously when the associated blkg is released. At that point, the throttle data may already have been freed, since commit 1059699f87eb ("block: move blkcg initialization/destroy into disk allocation/release handler") moved freeing q->td from blk_release_queue() to disk_release(). A use-after-free on q->td in throtl_pending_timer_fn() can therefore be triggered.

Fix the issue by:

- doing nothing if the disk is released and there isn't any bio to dispatch;

- retrieving the request queue from the blkg instead of the throttle data for a non-top-level pending timer.
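Schematically, the race looks roughly like this (ordering is illustrative, reconstructed from the description above):

	disk release path                   pending timer
	  free q->td
	                                    throtl_pending_timer_fn()
	                                      q = td->queue  <- use-after-free

With the fix, a non-top-level timer derives the queue from the blkg (tg->pd.blkg->q) instead of the throttle data, and dispatch bails out early once q->root_blkg is gone.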
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Link: https://lore.kernel.org/r/20220318130144.1066064-2-ming.lei@redhat.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
---
 block/blk-throttle.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index fec47ad3b04e..4f73b10734ab 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -1163,12 +1163,22 @@ static void throtl_pending_timer_fn(struct timer_list *t)
 	struct throtl_service_queue *sq = from_timer(sq, t, pending_timer);
 	struct throtl_grp *tg = sq_to_tg(sq);
 	struct throtl_data *td = sq_to_td(sq);
-	struct request_queue *q = td->queue;
 	struct throtl_service_queue *parent_sq;
+	struct request_queue *q;
 	bool dispatched;
 	int ret;

+	/* throtl_data may be gone, so figure out request queue by blkg */
+	if (tg)
+		q = tg->pd.blkg->q;
+	else
+		q = td->queue;
+
 	spin_lock_irq(&q->queue_lock);
+
+	if (!q->root_blkg)
+		goto out_unlock;
+
 	if (throtl_can_upgrade(td, NULL))
 		throtl_upgrade_state(td);
From: Laibin Qiu <qiulaibin@huawei.com>
mainline inclusion
from mainline-v5.19-rc1
commit 5a011f889b4832aa80c2a872a5aade5c48d2756f
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/IAUKH4
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
1. In the current code, every bio gets the BIO_THROTTLED flag set after __blk_throtl_bio().

2. If a bio needs to be throttled, the timer is started and the bio is not submitted directly; it will be submitted from blk_throtl_dispatch_work_fn() when the timer expires. But in the current code, when a bio is throttled, BIO_THROTTLED is set on it after the timer has started. If the bio has already completed by then, this causes the use-after-free below.
BUG: KASAN: use-after-free in blk_throtl_bio+0x12f0/0x2c70
Read of size 2 at addr ffff88801b8902d4 by task fio/26380

 dump_stack+0x9b/0xce
 print_address_description.constprop.6+0x3e/0x60
 kasan_report.cold.9+0x22/0x3a
 blk_throtl_bio+0x12f0/0x2c70
 submit_bio_checks+0x701/0x1550
 submit_bio_noacct+0x83/0xc80
 submit_bio+0xa7/0x330
 mpage_readahead+0x380/0x500
 read_pages+0x1c1/0xbf0
 page_cache_ra_unbounded+0x471/0x6f0
 do_page_cache_ra+0xda/0x110
 ondemand_readahead+0x442/0xae0
 page_cache_async_ra+0x210/0x300
 generic_file_buffered_read+0x4d9/0x2130
 generic_file_read_iter+0x315/0x490
 blkdev_read_iter+0x113/0x1b0
 aio_read+0x2ad/0x450
 io_submit_one+0xc8e/0x1d60
 __se_sys_io_submit+0x125/0x350
 do_syscall_64+0x2d/0x40
 entry_SYSCALL_64_after_hwframe+0x44/0xa9

Allocated by task 26380:
 kasan_save_stack+0x19/0x40
 __kasan_kmalloc.constprop.2+0xc1/0xd0
 kmem_cache_alloc+0x146/0x440
 mempool_alloc+0x125/0x2f0
 bio_alloc_bioset+0x353/0x590
 mpage_alloc+0x3b/0x240
 do_mpage_readpage+0xddf/0x1ef0
 mpage_readahead+0x264/0x500
 read_pages+0x1c1/0xbf0
 page_cache_ra_unbounded+0x471/0x6f0
 do_page_cache_ra+0xda/0x110
 ondemand_readahead+0x442/0xae0
 page_cache_async_ra+0x210/0x300
 generic_file_buffered_read+0x4d9/0x2130
 generic_file_read_iter+0x315/0x490
 blkdev_read_iter+0x113/0x1b0
 aio_read+0x2ad/0x450
 io_submit_one+0xc8e/0x1d60
 __se_sys_io_submit+0x125/0x350
 do_syscall_64+0x2d/0x40
 entry_SYSCALL_64_after_hwframe+0x44/0xa9

Freed by task 0:
 kasan_save_stack+0x19/0x40
 kasan_set_track+0x1c/0x30
 kasan_set_free_info+0x1b/0x30
 __kasan_slab_free+0x111/0x160
 kmem_cache_free+0x94/0x460
 mempool_free+0xd6/0x320
 bio_free+0xe0/0x130
 bio_put+0xab/0xe0
 bio_endio+0x3a6/0x5d0
 blk_update_request+0x590/0x1370
 scsi_end_request+0x7d/0x400
 scsi_io_completion+0x1aa/0xe50
 scsi_softirq_done+0x11b/0x240
 blk_mq_complete_request+0xd4/0x120
 scsi_mq_done+0xf0/0x200
 virtscsi_vq_done+0xbc/0x150
 vring_interrupt+0x179/0x390
 __handle_irq_event_percpu+0xf7/0x490
 handle_irq_event_percpu+0x7b/0x160
 handle_irq_event+0xcc/0x170
 handle_edge_irq+0x215/0xb20
 common_interrupt+0x60/0x120
 asm_common_interrupt+0x1e/0x40
Fix this by moving the setting of BIO_THROTTLED under the queue_lock.
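A rough timeline of the problem (labels illustrative; see the diff below):

	submitter                              timer/worker
	__blk_throtl_bio()
	  queue bio, arm pending timer
	  spin_unlock_irq(&q->queue_lock)
	                                       timer fires, worker dispatches bio
	                                       bio completes and is freed
	  bio_set_flag(bio, BIO_THROTTLED)  <- use-after-free

Setting the flag while still holding queue_lock prevents the queued bio from being dispatched (and thus completed and freed) before the flag is set.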
Signed-off-by: Laibin Qiu <qiulaibin@huawei.com>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Link: https://lore.kernel.org/r/20220301123919.2381579-1-qiulaibin@huawei.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
---
 block/blk-throttle.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 4f73b10734ab..e7bdf36b8716 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -2217,13 +2217,14 @@ bool __blk_throtl_bio(struct bio *bio)
 	}

 out_unlock:
-	spin_unlock_irq(&q->queue_lock);
 	bio_set_flag(bio, BIO_THROTTLED);

 #ifdef CONFIG_BLK_DEV_THROTTLING_LOW
 	if (throttled || !td->track_bio_latency)
 		bio->bi_issue.value |= BIO_ISSUE_THROTL_SKIP_LATENCY;
 #endif
+	spin_unlock_irq(&q->queue_lock);
+
 	rcu_read_unlock();
 	return throttled;
 }
mainline inclusion
from mainline-v6.1-rc1
commit 320fb0f91e55ba248d4bad106b408e59099cfa89
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/IAUKH4
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
Test scripts:
cd /sys/fs/cgroup/blkio/
echo "8:0 1024" > blkio.throttle.write_bps_device
echo $$ > cgroup.procs
dd if=/dev/zero of=/dev/sda bs=10k count=1 oflag=direct &
dd if=/dev/zero of=/dev/sda bs=10k count=1 oflag=direct &

Test result:
10240 bytes (10 kB, 10 KiB) copied, 10.0134 s, 1.0 kB/s
10240 bytes (10 kB, 10 KiB) copied, 10.0135 s, 1.0 kB/s
The problem is that the second bio is finished after 10s instead of 20s.
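For reference, with a 1024 bytes/s limit each 10240-byte bio needs ~10s of budget; if the second bio were properly throttled behind the first, it would finish at ~20s rather than ~10s.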
Root cause:
1) second bio will be flagged:

__blk_throtl_bio
 while (true) {
  ...
  if (sq->nr_queued[rw]) -> some bio is throttled already
   break
 };
 bio_set_flag(bio, BIO_THROTTLED); -> flag the bio
2) flagged bio will be dispatched without waiting:
throtl_dispatch_tg
 tg_may_dispatch
  tg_with_in_bps_limit
   if (bps_limit == U64_MAX || bio_flagged(bio, BIO_THROTTLED))
    *wait = 0; -> wait time is zero
    return true;
Commit 9f5ede3c01f9 ("block: throttle split bio in case of iops limit") added support for counting split bios toward the iops limit, and thus added a flagged-bio check in tg_with_in_bps_limit() so that split bios are only counted once for the bps limit. However, this introduced a new problem: io throttling stops working if multiple bios are throttled.
In order to fix the problem, handle iops/bps limit in different ways:
1) for the iops limit, there is no flag to record whether the bio has been
   throttled, and iops is always applied.
2) for the bps limit, the original bio is flagged with BIO_BPS_THROTTLED,
   and io throttle ignores bios with that flag.
Note that this patch also removes the code that sets the flag in __bio_clone(). It was introduced by commit 111be8839817 ("block-throttle: avoid double charge"), whose author assumed a split bio could be resubmitted and throttled again; that is wrong, because a split bio continues to be dispatched from the caller.
Fixes: 9f5ede3c01f9 ("block: throttle split bio in case of iops limit")
Cc: stable@vger.kernel.org
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Acked-by: Tejun Heo <tj@kernel.org>
Link: https://lore.kernel.org/r/20220829022240.3348319-2-yukuai1@huaweicloud.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Conflicts:
	block/bio.c
	include/linux/bio.h
[Context conflicts]
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
---
 block/bio.c               |  2 --
 block/blk-throttle.c      | 20 ++++++--------------
 block/blk-throttle.h      |  2 +-
 include/linux/bio.h       |  2 +-
 include/linux/blk_types.h |  2 +-
 5 files changed, 9 insertions(+), 19 deletions(-)
diff --git a/block/bio.c b/block/bio.c
index 43bf0920aa32..e2eaa426ac38 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -683,8 +683,6 @@ void __bio_clone_fast(struct bio *bio, struct bio *bio_src)
 	bio->bi_disk = bio_src->bi_disk;
 	bio->bi_partno = bio_src->bi_partno;
 	bio_set_flag(bio, BIO_CLONED);
-	if (bio_flagged(bio_src, BIO_THROTTLED))
-		bio_set_flag(bio, BIO_THROTTLED);
 	bio->bi_opf = bio_src->bi_opf;
 	bio->bi_ioprio = bio_src->bi_ioprio;
 	bio->bi_write_hint = bio_src->bi_write_hint;
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index e7bdf36b8716..d4a12db068ce 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -841,7 +841,7 @@ static bool tg_within_bps_limit(struct throtl_grp *tg, struct bio *bio,
 	unsigned int bio_size = throtl_bio_data_size(bio);

 	/* no need to throttle if this bio's bytes have been accounted */
-	if (bps_limit == U64_MAX || bio_flagged(bio, BIO_THROTTLED)) {
+	if (bps_limit == U64_MAX || bio_flagged(bio, BIO_BPS_THROTTLED)) {
 		if (wait)
 			*wait = 0;
 		return true;
@@ -946,22 +946,13 @@ static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
 	unsigned int bio_size = throtl_bio_data_size(bio);

 	/* Charge the bio to the group */
-	if (!bio_flagged(bio, BIO_THROTTLED)) {
+	if (!bio_flagged(bio, BIO_BPS_THROTTLED)) {
 		tg->bytes_disp[rw] += bio_size;
 		tg->last_bytes_disp[rw] += bio_size;
 	}

 	tg->io_disp[rw]++;
 	tg->last_io_disp[rw]++;
-
-	/*
-	 * BIO_THROTTLED is used to prevent the same bio to be throttled
-	 * more than once as a throttled bio will go through blk-throtl the
-	 * second time when it eventually gets issued. Set it when a bio
-	 * is being charged to a tg.
-	 */
-	if (!bio_flagged(bio, BIO_THROTTLED))
-		bio_set_flag(bio, BIO_THROTTLED);
 }

 /**
@@ -1051,6 +1042,7 @@ static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw)
 	sq->nr_queued[rw]--;

 	throtl_charge_bio(tg, bio);
+	bio_set_flag(bio, BIO_BPS_THROTTLED);

 	/*
 	 * If our parent is another tg, we just need to transfer @bio to
@@ -2187,8 +2179,10 @@ bool __blk_throtl_bio(struct bio *bio)
 		qn = &tg->qnode_on_parent[rw];
 		sq = sq->parent_sq;
 		tg = sq_to_tg(sq);
-		if (!tg)
+		if (!tg) {
+			bio_set_flag(bio, BIO_BPS_THROTTLED);
 			goto out_unlock;
+		}
 	}

 	/* out-of-limit, queue to @tg */
@@ -2217,8 +2211,6 @@ bool __blk_throtl_bio(struct bio *bio)
 	}

 out_unlock:
-	bio_set_flag(bio, BIO_THROTTLED);
-
 #ifdef CONFIG_BLK_DEV_THROTTLING_LOW
 	if (throttled || !td->track_bio_latency)
 		bio->bi_issue.value |= BIO_ISSUE_THROTL_SKIP_LATENCY;
 #endif
diff --git a/block/blk-throttle.h b/block/blk-throttle.h
index b23a9f3abb82..438c50b3e071 100644
--- a/block/blk-throttle.h
+++ b/block/blk-throttle.h
@@ -172,7 +172,7 @@ static inline bool blk_throtl_bio(struct bio *bio)
 	struct throtl_grp *tg = blkg_to_tg(bio->bi_blkg);

 	/* no need to throttle bps any more if the bio has been throttled */
-	if (bio_flagged(bio, BIO_THROTTLED) &&
+	if (bio_flagged(bio, BIO_BPS_THROTTLED) &&
 	    !(tg->flags & THROTL_TG_HAS_IOPS_LIMIT))
 		return false;

diff --git a/include/linux/bio.h b/include/linux/bio.h
index e895c84be0bc..680bf8f9d6e8 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -474,7 +474,7 @@ extern const char *bio_devname(struct bio *bio, char *buffer);
 #define bio_set_dev(bio, bdev)			\
 do {						\
 	if ((bio)->bi_disk != (bdev)->bd_disk)	\
-		bio_clear_flag(bio, BIO_THROTTLED);\
+		bio_clear_flag(bio, BIO_BPS_THROTTLED);\
 	(bio)->bi_disk = (bdev)->bd_disk;	\
 	(bio)->bi_partno = (bdev)->bd_partno;	\
 	bio_associate_blkg(bio);		\
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index b49d97547009..3298ac8ad281 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -300,7 +300,7 @@ enum {
 	BIO_QUIET,		/* Make BIO Quiet */
 	BIO_CHAIN,		/* chained bio, ->bi_remaining in effect */
 	BIO_REFFED,		/* bio has elevated ->bi_cnt */
-	BIO_THROTTLED,		/* This bio has already been subjected to
+	BIO_BPS_THROTTLED,	/* This bio has already been subjected to
 				 * throttling rules. Don't do it again. */
 	BIO_TRACE_COMPLETION,	/* bio_endio() should trace the final completion
 				 * of this bio. */
mainline inclusion
from mainline-v6.1-rc1
commit a880ae93e5b5bb5d8d5500077a391e3f5ec7715c
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/IAUKH4
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
If a new configuration is submitted while a bio is throttled, the new waiting time is recalculated regardless of how long the bio has already waited:

tg_conf_updated
 throtl_start_new_slice
 tg_update_disptime
 throtl_schedule_next_dispatch

An io hang can thus be triggered by repeatedly submitting a new configuration before the throttled bio is dispatched.

Fix the problem by respecting the time that the throttled bio has already waited. To do that, add new fields that record how many bytes/ios have already been waited for, and use them to calculate the wait time for throttled bios under the new configuration.
Some simple tests:
1)
cd /sys/fs/cgroup/blkio/
echo $$ > cgroup.procs
echo "8:0 2048" > blkio.throttle.write_bps_device
{
        sleep 2
        echo "8:0 1024" > blkio.throttle.write_bps_device
} &
dd if=/dev/zero of=/dev/sda bs=8k count=1 oflag=direct

2)
cd /sys/fs/cgroup/blkio/
echo $$ > cgroup.procs
echo "8:0 1024" > blkio.throttle.write_bps_device
{
        sleep 4
        echo "8:0 2048" > blkio.throttle.write_bps_device
} &
dd if=/dev/zero of=/dev/sda bs=8k count=1 oflag=direct

test results (io finish time):
	before this patch	with this patch
1)	10s			6s
2)	8s			6s
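As a rough sanity check on case 1 (ignoring slice rounding): during the first 2s at 2048 bytes/s, 4096 of the 8192 bytes were allowed but none dispatched, so carryover_bytes becomes 4096. Under the new 1024 bytes/s limit the remaining 4096 bytes need ~4 more seconds, giving ~6s total, instead of restarting the full 8192 bytes at 1024 bytes/s (~8s more, ~10s total). Case 2 works out the same way: 4096 bytes are carried over after 4s at 1024 bytes/s, then the rest takes ~2s at 2048 bytes/s.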
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Reviewed-by: Michal Koutný <mkoutny@suse.com>
Acked-by: Tejun Heo <tj@kernel.org>
Link: https://lore.kernel.org/r/20220829022240.3348319-5-yukuai1@huaweicloud.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Conflicts:
	block/blk-throttle.c
[commit 2dd710d476f2 ("blk-throttle: check for overflow in
calculate_bytes_allowed") is backported first.]
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
---
 block/blk-throttle.c | 59 +++++++++++++++++++++++++++++++++++++-----
 block/blk-throttle.h |  9 +++++++
 2 files changed, 62 insertions(+), 6 deletions(-)
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index d4a12db068ce..3a5ae7998375 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -654,6 +654,8 @@ static inline void throtl_start_new_slice_with_credit(struct throtl_grp *tg,
 {
 	tg->bytes_disp[rw] = 0;
 	tg->io_disp[rw] = 0;
+	tg->carryover_bytes[rw] = 0;
+	tg->carryover_ios[rw] = 0;

 	/*
 	 * Previous slice has expired. We must have trimmed it after last
@@ -671,12 +673,17 @@ static inline void throtl_start_new_slice_with_credit(struct throtl_grp *tg,
 		   tg->slice_end[rw], jiffies);
 }

-static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw)
+static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw,
+					  bool clear_carryover)
 {
 	tg->bytes_disp[rw] = 0;
 	tg->io_disp[rw] = 0;
 	tg->slice_start[rw] = jiffies;
 	tg->slice_end[rw] = jiffies + tg->td->throtl_slice;
+	if (clear_carryover) {
+		tg->carryover_bytes[rw] = 0;
+		tg->carryover_ios[rw] = 0;
+	}

 	throtl_log(&tg->service_queue,
 		   "[%c] new slice start=%lu end=%lu jiffies=%lu",
@@ -800,6 +807,41 @@ static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw)
 		   jiffies);
 }

+static void __tg_update_carryover(struct throtl_grp *tg, bool rw)
+{
+	unsigned long jiffy_elapsed = jiffies - tg->slice_start[rw];
+	u64 bps_limit = tg_bps_limit(tg, rw);
+	u32 iops_limit = tg_iops_limit(tg, rw);
+
+	/*
+	 * If config is updated while bios are still throttled, calculate and
+	 * accumulate how many bytes/ios are waited across changes. And
+	 * carryover_bytes/ios will be used to calculate new wait time under new
+	 * configuration.
+	 */
+	if (bps_limit != U64_MAX)
+		tg->carryover_bytes[rw] +=
+			calculate_bytes_allowed(bps_limit, jiffy_elapsed) -
+			tg->bytes_disp[rw];
+	if (iops_limit != UINT_MAX)
+		tg->carryover_ios[rw] +=
+			calculate_io_allowed(iops_limit, jiffy_elapsed) -
+			tg->io_disp[rw];
+}
+
+static void tg_update_carryover(struct throtl_grp *tg)
+{
+	if (tg->service_queue.nr_queued[READ])
+		__tg_update_carryover(tg, READ);
+	if (tg->service_queue.nr_queued[WRITE])
+		__tg_update_carryover(tg, WRITE);
+
+	/* see comments in struct throtl_grp for meaning of these fields. */
+	throtl_log(&tg->service_queue, "%s: %llu %llu %u %u\n", __func__,
+		   tg->carryover_bytes[READ], tg->carryover_bytes[WRITE],
+		   tg->carryover_ios[READ], tg->carryover_ios[WRITE]);
+}
+
 static bool tg_within_iops_limit(struct throtl_grp *tg, struct bio *bio,
 				 u32 iops_limit, unsigned long *wait)
 {
@@ -817,7 +859,8 @@ static bool tg_within_iops_limit(struct throtl_grp *tg, struct bio *bio,

 	/* Round up to the next throttle slice, wait time must be nonzero */
 	jiffy_elapsed_rnd = roundup(jiffy_elapsed + 1, tg->td->throtl_slice);
-	io_allowed = calculate_io_allowed(iops_limit, jiffy_elapsed_rnd);
+	io_allowed = calculate_io_allowed(iops_limit, jiffy_elapsed_rnd) +
+		     tg->carryover_ios[rw];
 	if (tg->io_disp[rw] + 1 <= io_allowed) {
 		if (wait)
 			*wait = 0;
@@ -854,7 +897,8 @@ static bool tg_within_bps_limit(struct throtl_grp *tg, struct bio *bio,
 		jiffy_elapsed_rnd = tg->td->throtl_slice;

 	jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, tg->td->throtl_slice);
-	bytes_allowed = calculate_bytes_allowed(bps_limit, jiffy_elapsed_rnd);
+	bytes_allowed = calculate_bytes_allowed(bps_limit, jiffy_elapsed_rnd) +
+			tg->carryover_bytes[rw];
 	if (tg->bytes_disp[rw] + bio_size <= bytes_allowed) {
 		if (wait)
 			*wait = 0;
@@ -914,7 +958,7 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
 	 * slice and it should be extended instead.
 	 */
 	if (throtl_slice_used(tg, rw) && !(tg->service_queue.nr_queued[rw]))
-		throtl_start_new_slice(tg, rw);
+		throtl_start_new_slice(tg, rw, true);
 	else {
 		if (time_before(tg->slice_end[rw],
 		    jiffies + tg->td->throtl_slice))
@@ -1340,8 +1384,8 @@ static void tg_conf_updated(struct throtl_grp *tg, bool global)
 	 * that a group's limit are dropped suddenly and we don't want to
 	 * account recently dispatched IO with new low rate.
 	 */
-	throtl_start_new_slice(tg, READ);
-	throtl_start_new_slice(tg, WRITE);
+	throtl_start_new_slice(tg, READ, false);
+	throtl_start_new_slice(tg, WRITE, false);

 	if (tg->flags & THROTL_TG_PENDING) {
 		tg_update_disptime(tg);
@@ -1398,6 +1442,7 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of,
 		v = U64_MAX;

 	tg = blkg_to_tg(ctx.blkg);
+	tg_update_carryover(tg);

 	if (is_u64)
 		*(u64 *)((void *)tg + of_cft(of)->private) = v;
@@ -1589,6 +1634,8 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of,
 		goto out_finish;

 	tg = blkg_to_tg(ctx.blkg);
+	tg_update_carryover(tg);
+
 	v[0] = tg->bps_conf[READ][index];
 	v[1] = tg->bps_conf[WRITE][index];
 	v[2] = tg->iops_conf[READ][index];
 	v[3] = tg->iops_conf[WRITE][index];
diff --git a/block/blk-throttle.h b/block/blk-throttle.h
index 438c50b3e071..f930dbf3ef3b 100644
--- a/block/blk-throttle.h
+++ b/block/blk-throttle.h
@@ -120,6 +120,15 @@ struct throtl_grp {
 	uint64_t last_bytes_disp[2];
 	unsigned int last_io_disp[2];

+	/*
+	 * The following two fields are updated when new configuration is
+	 * submitted while some bios are still throttled, they record how many
+	 * bytes/ios are waited already in previous configuration, and they will
+	 * be used to calculate wait time under new configuration.
+	 */
+	uint64_t carryover_bytes[2];
+	unsigned int carryover_ios[2];
+
 	unsigned long last_check_time;
unsigned long latency_target; /* us */
mainline inclusion
from mainline-v6.1-rc1
commit 85496749904016f36b69332f73a1cf3ecfee828f
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/IAUKH4
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
Currently, "tg->has_rules" and "tg->flags & THROTL_TG_HAS_IOPS_LIMIT" both try to bypass bios that don't need to be throttled, however, they are a little redundant and both not perfect:
1) "tg->has_rules" only distinguish read and write, but not iops and bps limit. 2) "tg->flags & THROTL_TG_HAS_IOPS_LIMIT" only check if iops limit exist, read and write is not distinguished, and bps limit is not checked.
tg->has_rules will extended to distinguish bps and iops in the following patch. There is no need to keep the flag.
Signed-off-by: Yu Kuai yukuai3@huawei.com
Acked-by: Tejun Heo tj@kernel.org
Link: https://lore.kernel.org/r/20220921095309.1481289-2-yukuai1@huaweicloud.com
Signed-off-by: Jens Axboe axboe@kernel.dk
Conflicts:
  block/blk-throttle.h
[commit ("8f9e7b65f833 block: cancel all throttled bios in del_gendisk()") is not backported.]
Signed-off-by: Yu Kuai yukuai3@huawei.com
---
 block/blk-throttle.c | 16 ++--------------
 block/blk-throttle.h |  6 ------
 2 files changed, 2 insertions(+), 20 deletions(-)
diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 3a5ae7998375..973359642481 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -435,24 +435,12 @@ static void tg_update_has_rules(struct throtl_grp *tg) struct throtl_grp *parent_tg = sq_to_tg(tg->service_queue.parent_sq); struct throtl_data *td = tg->td; int rw; - int has_iops_limit = 0; - - for (rw = READ; rw <= WRITE; rw++) { - unsigned int iops_limit = tg_iops_limit(tg, rw);
+ for (rw = READ; rw <= WRITE; rw++) tg->has_rules[rw] = (parent_tg && parent_tg->has_rules[rw]) || (td->limit_valid[td->limit_index] && (tg_bps_limit(tg, rw) != U64_MAX || - iops_limit != UINT_MAX)); - - if (iops_limit != UINT_MAX) - has_iops_limit = 1; - } - - if (has_iops_limit) - tg->flags |= THROTL_TG_HAS_IOPS_LIMIT; - else - tg->flags &= ~THROTL_TG_HAS_IOPS_LIMIT; + tg_iops_limit(tg, rw) != UINT_MAX)); }
static void throtl_pd_online(struct blkg_policy_data *pd) diff --git a/block/blk-throttle.h b/block/blk-throttle.h index f930dbf3ef3b..0ffdc5cb7222 100644 --- a/block/blk-throttle.h +++ b/block/blk-throttle.h @@ -55,7 +55,6 @@ struct throtl_service_queue { enum tg_state_flags { THROTL_TG_PENDING = 1 << 0, /* on parent's pending tree */ THROTL_TG_WAS_EMPTY = 1 << 1, /* bio_lists[] became non-empty */ - THROTL_TG_HAS_IOPS_LIMIT = 1 << 2, /* tg has iops limit */ };
enum { @@ -180,11 +179,6 @@ static inline bool blk_throtl_bio(struct bio *bio) { struct throtl_grp *tg = blkg_to_tg(bio->bi_blkg);
- /* no need to throttle bps any more if the bio has been throttled */ - if (bio_flagged(bio, BIO_BPS_THROTTLED) && - !(tg->flags & THROTL_TG_HAS_IOPS_LIMIT)) - return false; - if (!tg->has_rules[bio_data_dir(bio)]) return false;
mainline inclusion
from mainline-v6.1-rc1
commit 81c7a63abc7c0be572b4f853e913ce93a34f6e1b
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/IAUKH4
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
"tg->has_rules" is extended to "tg->has_rules_iops/bps", thus bios that don't need to be throttled can be checked accurately.
With this patch, bio will be throttled if:
1) Bio is read/write, and corresponding read/write iops limit exist. 2) If corresponding doesn't exist, corresponding bps limit exist and bio is not throttled before.
Signed-off-by: Yu Kuai yukuai3@huawei.com
Acked-by: Tejun Heo tj@kernel.org
Link: https://lore.kernel.org/r/20220921095309.1481289-3-yukuai1@huaweicloud.com
Signed-off-by: Jens Axboe axboe@kernel.dk
Conflicts:
  block/blk-throttle.h
[commit ("8f9e7b65f833 block: cancel all throttled bios in del_gendisk()") is not backported.]
Signed-off-by: Yu Kuai yukuai3@huawei.com
---
 block/blk-throttle.c | 13 +++++++++----
 block/blk-throttle.h | 22 +++++++++++++++++++---
 2 files changed, 28 insertions(+), 7 deletions(-)
diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 973359642481..5e7580dde671 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -436,11 +436,16 @@ static void tg_update_has_rules(struct throtl_grp *tg) struct throtl_data *td = tg->td; int rw;
- for (rw = READ; rw <= WRITE; rw++) - tg->has_rules[rw] = (parent_tg && parent_tg->has_rules[rw]) || + for (rw = READ; rw <= WRITE; rw++) { + tg->has_rules_iops[rw] = + (parent_tg && parent_tg->has_rules_iops[rw]) || (td->limit_valid[td->limit_index] && - (tg_bps_limit(tg, rw) != U64_MAX || - tg_iops_limit(tg, rw) != UINT_MAX)); + tg_iops_limit(tg, rw) != UINT_MAX); + tg->has_rules_bps[rw] = + (parent_tg && parent_tg->has_rules_bps[rw]) || + (td->limit_valid[td->limit_index] && + (tg_bps_limit(tg, rw) != U64_MAX)); + } }
static void throtl_pd_online(struct blkg_policy_data *pd) diff --git a/block/blk-throttle.h b/block/blk-throttle.h index 0ffdc5cb7222..45fcf7aa0eb5 100644 --- a/block/blk-throttle.h +++ b/block/blk-throttle.h @@ -97,7 +97,8 @@ struct throtl_grp { unsigned int flags;
/* are there any throtl rules between this group and td? */ - bool has_rules[2]; + bool has_rules_bps[2]; + bool has_rules_iops[2];
/* internally used bytes per second rate limits */ uint64_t bps[2][LIMIT_CNT]; @@ -175,11 +176,26 @@ int blk_throtl_init(struct request_queue *q); void blk_throtl_exit(struct request_queue *q); void blk_throtl_register_queue(struct request_queue *q); bool __blk_throtl_bio(struct bio *bio); -static inline bool blk_throtl_bio(struct bio *bio) + +static inline bool blk_should_throtl(struct bio *bio) { struct throtl_grp *tg = blkg_to_tg(bio->bi_blkg); + int rw = bio_data_dir(bio); + + /* iops limit is always counted */ + if (tg->has_rules_iops[rw]) + return true; + + if (tg->has_rules_bps[rw] && !bio_flagged(bio, BIO_BPS_THROTTLED)) + return true; + + return false; +} + +static inline bool blk_throtl_bio(struct bio *bio) +{
- if (!tg->has_rules[bio_data_dir(bio)]) + if (!blk_should_throtl(bio)) return false;
return __blk_throtl_bio(bio);
From: Kemeng Shi shikemeng@huawei.com
mainline inclusion
from mainline-v6.2-rc1
commit 84aca0a7e039c8735abc0f89f3f48e9006c0dfc7
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/IAUKH4
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
Consider the following situation (on the default hierarchy):

        HDD
         |
        root (bps limit: 4k)
         |
        child (bps limit: 8k)
         |
        fio bs=8k

The rate of fio is supposed to be 4k, but the result is 8k. The reason is as follows: the size of a single IO from fio is larger than the bytes allowed in one throtl_slice in the child group, so IOs are always queued in the child group first. When the queued IOs in the child are dispatched to the parent group, BIO_BPS_THROTTLED is set and these IOs are no longer limited by tg_within_bps_limit at all. Fix this by only setting BIO_BPS_THROTTLED once the bio has traversed the entire tree.

This patch has no influence on configurations that are not on the default hierarchy, as each group there is a single root group without a parent.
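For illustration, below is a minimal userspace model of the bypass (an editor's sketch with made-up names and a simple per-second token-bucket budget, not kernel code). With the flag set at child dispatch the effective rate is the child's 8k/s; with the flag set only at the top of the tree, the parent's 4k/s wins:

#include <stdio.h>
#include <stdbool.h>

static long long run(bool flag_early)
{
	const long long bio = 8192;	/* fio bs=8k */
	const int secs = 16;
	long long child_budget = 0, parent_budget = 0;
	long long child_disp = 0, parent_disp = 0, done = 0;
	long long queued_flagged = 0, queued_plain = 0;
	int t;

	for (t = 0; t < secs; t++) {
		child_budget += 8192;	/* child bps limit: 8k/s */
		parent_budget += 4096;	/* parent bps limit: 4k/s */

		/* the child dispatches whenever its own budget allows */
		while (child_disp + bio <= child_budget) {
			child_disp += bio;
			if (flag_early)
				queued_flagged++; /* "already accounted" flag set here */
			else
				queued_plain++;
		}

		/* flagged bios skip the parent's bps check entirely */
		done += queued_flagged * bio;
		queued_flagged = 0;

		/* plain bios are charged against the parent's budget */
		while (queued_plain > 0 && parent_disp + bio <= parent_budget) {
			parent_disp += bio;
			queued_plain--;
			done += bio;
		}
	}
	return done / secs;
}

int main(void)
{
	printf("flag set at child dispatch: %lld bytes/s\n", run(true));  /* 8192 */
	printf("flag set at top of tree:    %lld bytes/s\n", run(false)); /* 4096 */
	return 0;
}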
Acked-by: Tejun Heo tj@kernel.org
Signed-off-by: Kemeng Shi shikemeng@huawei.com
Link: https://lore.kernel.org/r/20221205115709.251489-3-shikemeng@huaweicloud.com
Signed-off-by: Jens Axboe axboe@kernel.dk
Signed-off-by: Yu Kuai yukuai3@huawei.com
---
 block/blk-throttle.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 5e7580dde671..ba2f579a7934 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -1079,7 +1079,6 @@ static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw) sq->nr_queued[rw]--;
throtl_charge_bio(tg, bio); - bio_set_flag(bio, BIO_BPS_THROTTLED);
/* * If our parent is another tg, we just need to transfer @bio to @@ -1092,6 +1091,7 @@ static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw) throtl_add_bio_tg(bio, &tg->qnode_on_parent[rw], parent_tg); start_parent_slice_with_credit(tg, parent_tg, rw); } else { + bio_set_flag(bio, BIO_BPS_THROTTLED); throtl_qnode_add_bio(bio, &tg->qnode_on_parent[rw], &parent_sq->queued[rw]); BUG_ON(tg->td->nr_queued[rw] <= 0);
From: Kemeng Shi shikemeng@huawei.com
mainline inclusion
from mainline-v6.2-rc1
commit 183daeb11de871b073515d14ec1e3bc0da79e038
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/IAUKH4
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
In C, when "if (expression1 && expression2)" is evaluated and expression1 is false, expression2 is not evaluated. For "tg_within_bps_limit(tg, bio, bps_limit, &bps_wait) && tg_within_iops_limit(tg, bio, iops_limit, &iops_wait)", if bps is limited, tg_within_bps_limit returns false and tg_within_iops_limit is never called. So even when both bps and iops are limited, iops_wait is never calculated and stays zero, and the iops wait time is always ignored.

Fix this by always calling both tg_within_bps_limit and tg_within_iops_limit, so the wait time is computed for both bps and iops.

Observed that:
1. The wait time in tg_within_iops_limit/tg_within_bps_limit always needs to be stored, as the wait argument is always passed.
2. The stored wait time is zero if the bio is within the iops/bps limit and non-zero otherwise.
Simplify tg_within_iops_limit/tg_within_bps_limit by removing the wait argument and returning the wait time directly. The caller tg_may_dispatch checks whether the wait time is zero to determine if iops/bps is limited.
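A small standalone C sketch of the short-circuit problem (the stub functions are hypothetical stand-ins for tg_within_bps_limit/tg_within_iops_limit, with both limits exceeded):

#include <stdio.h>
#include <stdbool.h>

static bool within_bps(unsigned long *wait)
{
	*wait = 5;		/* bps wait: 5 jiffies */
	return false;		/* over the bps limit */
}

static bool within_iops(unsigned long *wait)
{
	*wait = 20;		/* iops wait: 20 jiffies */
	return false;		/* over the iops limit */
}

int main(void)
{
	unsigned long bps_wait = 0, iops_wait = 0;

	/* old pattern: && short-circuits, within_iops() is never called */
	if (within_bps(&bps_wait) && within_iops(&iops_wait))
		;
	/* iops_wait is still 0, so the longer iops wait is silently lost */
	printf("old:   wait = %lu\n",
	       bps_wait > iops_wait ? bps_wait : iops_wait);	/* prints 5 */

	/* fixed pattern: evaluate both unconditionally, then combine */
	within_bps(&bps_wait);
	within_iops(&iops_wait);
	printf("fixed: wait = %lu\n",
	       bps_wait > iops_wait ? bps_wait : iops_wait);	/* prints 20 */
	return 0;
}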
Acked-by: Tejun Heo tj@kernel.org
Signed-off-by: Kemeng Shi shikemeng@huawei.com
Link: https://lore.kernel.org/r/20221205115709.251489-5-shikemeng@huaweicloud.com
Signed-off-by: Jens Axboe axboe@kernel.dk
Signed-off-by: Yu Kuai yukuai3@huawei.com
---
 block/blk-throttle.c | 38 +++++++++++++-------------------------
 1 file changed, 13 insertions(+), 25 deletions(-)
diff --git a/block/blk-throttle.c b/block/blk-throttle.c index ba2f579a7934..d04a72b30faf 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -835,17 +835,15 @@ static void tg_update_carryover(struct throtl_grp *tg) tg->carryover_ios[READ], tg->carryover_ios[WRITE]); }
-static bool tg_within_iops_limit(struct throtl_grp *tg, struct bio *bio, - u32 iops_limit, unsigned long *wait) +static unsigned long tg_within_iops_limit(struct throtl_grp *tg, struct bio *bio, + u32 iops_limit) { bool rw = bio_data_dir(bio); unsigned int io_allowed; unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
if (iops_limit == UINT_MAX) { - if (wait) - *wait = 0; - return true; + return 0; }
jiffy_elapsed = jiffies - tg->slice_start[rw]; @@ -855,21 +853,16 @@ static bool tg_within_iops_limit(struct throtl_grp *tg, struct bio *bio, io_allowed = calculate_io_allowed(iops_limit, jiffy_elapsed_rnd) + tg->carryover_ios[rw]; if (tg->io_disp[rw] + 1 <= io_allowed) { - if (wait) - *wait = 0; - return true; + return 0; }
/* Calc approx time to dispatch */ jiffy_wait = jiffy_elapsed_rnd - jiffy_elapsed; - - if (wait) - *wait = jiffy_wait; - return false; + return jiffy_wait; }
-static bool tg_within_bps_limit(struct throtl_grp *tg, struct bio *bio, - u64 bps_limit, unsigned long *wait) +static unsigned long tg_within_bps_limit(struct throtl_grp *tg, struct bio *bio, + u64 bps_limit) { bool rw = bio_data_dir(bio); u64 bytes_allowed, extra_bytes; @@ -878,9 +871,7 @@ static bool tg_within_bps_limit(struct throtl_grp *tg, struct bio *bio,
/* no need to throttle if this bio's bytes have been accounted */ if (bps_limit == U64_MAX || bio_flagged(bio, BIO_BPS_THROTTLED)) { - if (wait) - *wait = 0; - return true; + return 0; }
jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw]; @@ -893,9 +884,7 @@ static bool tg_within_bps_limit(struct throtl_grp *tg, struct bio *bio, bytes_allowed = calculate_bytes_allowed(bps_limit, jiffy_elapsed_rnd) + tg->carryover_bytes[rw]; if (tg->bytes_disp[rw] + bio_size <= bytes_allowed) { - if (wait) - *wait = 0; - return true; + return 0; }
/* Calc approx time to dispatch */ @@ -910,9 +899,7 @@ static bool tg_within_bps_limit(struct throtl_grp *tg, struct bio *bio, * up we did. Add that time also. */ jiffy_wait = jiffy_wait + (jiffy_elapsed_rnd - jiffy_elapsed); - if (wait) - *wait = jiffy_wait; - return false; + return jiffy_wait; }
/* @@ -959,8 +946,9 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio, jiffies + tg->td->throtl_slice); }
- if (tg_within_bps_limit(tg, bio, bps_limit, &bps_wait) && - tg_within_iops_limit(tg, bio, iops_limit, &iops_wait)) { + bps_wait = tg_within_bps_limit(tg, bio, bps_limit); + iops_wait = tg_within_iops_limit(tg, bio, iops_limit); + if (bps_wait + iops_wait == 0) { if (wait) *wait = 0; return true;
From: Jinke Han hanjinke.666@bytedance.com
mainline inclusion
from mainline-v6.5-rc1
commit ad7c3b41e86b59943a903d23c7b037d820e6270c
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/IAUKH4
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
After commit f382fb0bcef4 ("block: remove legacy IO schedulers"), blkio.throttle.io_serviced and blkio.throttle.io_service_bytes have become the only stable io stats interface of cgroup v1, and these statistics are done in the blk-throttle code. But the current code only counts bios that are actually throttled, so when the user does not set a throttle limit, the io stats for cgroup v1 show nothing. Fix it by following the statistical method of v2, so that all ios are counted accurately.
Fixes: a7b36ee6ba29 ("block: move blk-throtl fast path inline")
Tested-by: Andrea Righi andrea.righi@canonical.com
Signed-off-by: Jinke Han hanjinke.666@bytedance.com
Acked-by: Muchun Song songmuchun@bytedance.com
Acked-by: Tejun Heo tj@kernel.org
Link: https://lore.kernel.org/r/20230507170631.89607-1-hanjinke.666@bytedance.com
Signed-off-by: Jens Axboe axboe@kernel.dk
Conflicts:
  block/blk-cgroup.c
[commit 0416f3be58c6 ("blk-cgroup: don't update io stat for root cgroup") is not backported.]
Signed-off-by: Yu Kuai yukuai3@huawei.com
---
 block/blk-cgroup.c   | 6 ++++--
 block/blk-throttle.c | 6 ------
 block/blk-throttle.h | 9 +++++++++
 3 files changed, 13 insertions(+), 8 deletions(-)
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 0872b392360d..0805864543be 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1953,6 +1953,9 @@ void blk_cgroup_bio_start(struct bio *bio) struct blkg_iostat_set *bis; unsigned long flags;
+ if (!cgroup_subsys_on_dfl(io_cgrp_subsys)) + return; + cpu = get_cpu(); bis = per_cpu_ptr(bio->bi_blkg->iostat_cpu, cpu); flags = u64_stats_update_begin_irqsave(&bis->sync); @@ -1968,8 +1971,7 @@ void blk_cgroup_bio_start(struct bio *bio) bis->cur.ios[rwd]++;
u64_stats_update_end_irqrestore(&bis->sync, flags); - if (cgroup_subsys_on_dfl(io_cgrp_subsys)) - cgroup_rstat_updated(bio->bi_blkg->blkcg->css.cgroup, cpu); + cgroup_rstat_updated(bio->bi_blkg->blkcg->css.cgroup, cpu); put_cpu(); }
diff --git a/block/blk-throttle.c b/block/blk-throttle.c index d04a72b30faf..79ffce4bc213 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -2149,12 +2149,6 @@ bool __blk_throtl_bio(struct bio *bio)
rcu_read_lock();
- if (!cgroup_subsys_on_dfl(io_cgrp_subsys)) { - blkg_rwstat_add(&tg->stat_bytes, bio->bi_opf, - bio->bi_iter.bi_size); - blkg_rwstat_add(&tg->stat_ios, bio->bi_opf, 1); - } - spin_lock_irq(&q->queue_lock);
throtl_update_latency_buckets(td); diff --git a/block/blk-throttle.h b/block/blk-throttle.h index 45fcf7aa0eb5..cb7f7d0e6f0d 100644 --- a/block/blk-throttle.h +++ b/block/blk-throttle.h @@ -182,6 +182,15 @@ static inline bool blk_should_throtl(struct bio *bio) struct throtl_grp *tg = blkg_to_tg(bio->bi_blkg); int rw = bio_data_dir(bio);
+ if (!cgroup_subsys_on_dfl(io_cgrp_subsys)) { + if (!bio_flagged(bio, BIO_CGROUP_ACCT)) { + bio_set_flag(bio, BIO_CGROUP_ACCT); + blkg_rwstat_add(&tg->stat_bytes, bio->bi_opf, + bio->bi_iter.bi_size); + } + blkg_rwstat_add(&tg->stat_ios, bio->bi_opf, 1); + } + /* iops limit is always counted */ if (tg->has_rules_iops[rw]) return true;
mainline inclusion
from mainline-v6.6-rc1
commit ef100397fac3e2e403d5d510e66f36e242654073
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/IAUKH4
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
'carryover_bytes/ios' can be negative, indicating that some bios were dispatched in advance within the slice while the configuration was updated. Printing such values as huge unsigned numbers is not user-friendly.
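A tiny standalone C demo of the formatting problem (the value is made up):

#include <stdio.h>

int main(void)
{
	/* a 4k bio was dispatched in advance, so the carryover is negative */
	long long carryover_bytes = -4096;

	/* old format: the value is reinterpreted as a huge unsigned number */
	printf("as %%llu: %llu\n", (unsigned long long)carryover_bytes);
	/* new format: the signed value is printed as-is */
	printf("as %%lld: %lld\n", carryover_bytes);
	return 0;
}

This prints 18446744073709547520 for %llu but -4096 for %lld.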
Signed-off-by: Yu Kuai yukuai3@huawei.com
Acked-by: Tejun Heo tj@kernel.org
Link: https://lore.kernel.org/r/20230816012708.1193747-2-yukuai1@huaweicloud.com
Signed-off-by: Jens Axboe axboe@kernel.dk
---
 block/blk-throttle.c | 2 +-
 block/blk-throttle.h | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 79ffce4bc213..fe3ee1a90aaf 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -830,7 +830,7 @@ static void tg_update_carryover(struct throtl_grp *tg) __tg_update_carryover(tg, WRITE);
/* see comments in struct throtl_grp for meaning of these fields. */ - throtl_log(&tg->service_queue, "%s: %llu %llu %u %u\n", __func__, + throtl_log(&tg->service_queue, "%s: %lld %lld %d %d\n", __func__, tg->carryover_bytes[READ], tg->carryover_bytes[WRITE], tg->carryover_ios[READ], tg->carryover_ios[WRITE]); } diff --git a/block/blk-throttle.h b/block/blk-throttle.h index cb7f7d0e6f0d..b65343bbb108 100644 --- a/block/blk-throttle.h +++ b/block/blk-throttle.h @@ -126,8 +126,8 @@ struct throtl_grp { * bytes/ios are waited already in previous configuration, and they will * be used to calculate wait time under new configuration. */ - uint64_t carryover_bytes[2]; - unsigned int carryover_ios[2]; + long long carryover_bytes[2]; + int carryover_ios[2];
unsigned long last_check_time;
mainline inclusion
from mainline-v6.6-rc1
commit bb8d5587bdc3ab211e1eae2eeb966f7a7d1f9c0b
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/IAUKH4
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
carryover_ios/bytes[] can be negative in the case that ios are dispatched within the slice in advance and the configuration is then updated. For example:

1) set the iops limit to 1000; slice start is 0, slice end is 100ms;
2) at current time 0, 100 ios are dispatched; those ios will not be throttled, hence io_disp is 100;
3) still at current time 0, update the iops limit to 100, then carryover_ios is (0 - 100) = -100;
4) then dispatch a new io at time 0; the expected result is that this io will wait for 1s. The calculation in tg_within_iops_limit:

   io_disp = 0;
   io_allowed = calculate_io_allowed + carryover_ios
              = 10 + (-100) = -90;

   the io won't be throttled if (io_disp + 1 <= io_allowed) passes.

Before this patch, the check in step 4) passes, because -90 interpreted as an unsigned value is huge, so such an io is not throttled.

Fix this problem by checking whether 'io/bytes_allowed' is positive before comparing.
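The wraparound is easy to reproduce in standalone C with the numbers from the example above (userspace sketch, not kernel code):

#include <stdio.h>
#include <stdbool.h>

int main(void)
{
	unsigned int io_disp = 0;
	int carryover_ios = -100;

	/* before the fix: io_allowed is unsigned, so -90 wraps around */
	unsigned int io_allowed_old = 10 + carryover_ios;	/* 4294967206 */
	bool pass_old = io_disp + 1 <= io_allowed_old;		/* true */

	/* after the fix: io_allowed stays signed and must be positive */
	int io_allowed_new = 10 + carryover_ios;		/* -90 */
	bool pass_new = io_allowed_new > 0 &&
			(int)(io_disp + 1) <= io_allowed_new;	/* false */

	printf("old: io_allowed=%u, io dispatched: %d\n", io_allowed_old, pass_old);
	printf("new: io_allowed=%d, io dispatched: %d\n", io_allowed_new, pass_new);
	return 0;
}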
Signed-off-by: Yu Kuai yukuai3@huawei.com
Acked-by: Tejun Heo tj@kernel.org
Link: https://lore.kernel.org/r/20230816012708.1193747-3-yukuai1@huaweicloud.com
Signed-off-by: Jens Axboe axboe@kernel.dk
---
 block/blk-throttle.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)
diff --git a/block/blk-throttle.c b/block/blk-throttle.c index fe3ee1a90aaf..91a22d49f5c3 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -839,7 +839,7 @@ static unsigned long tg_within_iops_limit(struct throtl_grp *tg, struct bio *bio u32 iops_limit) { bool rw = bio_data_dir(bio); - unsigned int io_allowed; + int io_allowed; unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
if (iops_limit == UINT_MAX) { @@ -852,9 +852,8 @@ static unsigned long tg_within_iops_limit(struct throtl_grp *tg, struct bio *bio jiffy_elapsed_rnd = roundup(jiffy_elapsed + 1, tg->td->throtl_slice); io_allowed = calculate_io_allowed(iops_limit, jiffy_elapsed_rnd) + tg->carryover_ios[rw]; - if (tg->io_disp[rw] + 1 <= io_allowed) { + if (io_allowed > 0 && tg->io_disp[rw] + 1 <= io_allowed) return 0; - }
/* Calc approx time to dispatch */ jiffy_wait = jiffy_elapsed_rnd - jiffy_elapsed; @@ -865,7 +864,8 @@ static unsigned long tg_within_bps_limit(struct throtl_grp *tg, struct bio *bio, u64 bps_limit) { bool rw = bio_data_dir(bio); - u64 bytes_allowed, extra_bytes; + long long bytes_allowed; + u64 extra_bytes; unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd; unsigned int bio_size = throtl_bio_data_size(bio);
@@ -883,9 +883,8 @@ static unsigned long tg_within_bps_limit(struct throtl_grp *tg, struct bio *bio, jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, tg->td->throtl_slice); bytes_allowed = calculate_bytes_allowed(bps_limit, jiffy_elapsed_rnd) + tg->carryover_bytes[rw]; - if (tg->bytes_disp[rw] + bio_size <= bytes_allowed) { + if (bytes_allowed > 0 && tg->bytes_disp[rw] + bio_size <= bytes_allowed) return 0; - }
/* Calc approx time to dispatch */ extra_bytes = tg->bytes_disp[rw] + bio_size - bytes_allowed;
mainline inclusion
from mainline-v6.6-rc1
commit eead0056648cef49d7b15c07ae612fa217083165
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/IAUKH4
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
Currently, 'carryover_ios/bytes' is not handled in throtl_trim_slice(); as a consequence, 'carryover_ios/bytes' will be used to throttle bios multiple times, for example:

1) set the iops limit to 100; slice start is 0, slice end is 100ms;
2) at current time 0, 10 ios are dispatched; those ios won't be throttled and io_disp is 10;
3) still at current time 0, update the iops limit to 1000, then carryover_ios is updated to (0 - 10) = -10;
4) in this slice (0 - 100ms), io_allowed = 100 + (-10) = 90, which means only 90 ios can be dispatched without waiting;
5) assume that an io is throttled in the slice (0 - 100ms), and throtl_trim_slice() updates the slice to (100ms - 200ms); in this case, 'carryover_ios/bytes' is not cleared and still only 90 ios can be dispatched between 100ms and 200ms.

Fix this problem by updating 'carryover_ios/bytes' in throtl_trim_slice().
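The arithmetic can be modeled in a few lines of standalone C (HZ = 1000 and a 100ms throtl_slice are assumptions for the example; calculate_io_allowed() is approximated by a plain division):

#include <stdio.h>

/* rough stand-in for calculate_io_allowed(), assuming HZ = 1000 */
static long long io_allowed(long long iops_limit, long long jiffies_elapsed)
{
	return iops_limit * jiffies_elapsed / 1000;
}

int main(void)
{
	const long long slice = 100;		/* throtl_slice: 100ms */
	long long carryover_ios = -10;		/* from the config update above */
	long long io_trim;

	/* slice (0 - 100ms): only 90 ios allowed, which is expected */
	printf("slice 1 io_allowed: %lld\n",
	       io_allowed(1000, slice) + carryover_ios);

	/* before the fix: trim leaves carryover alone, the debt repeats */
	printf("slice 2 io_allowed (before): %lld\n",
	       io_allowed(1000, slice) + carryover_ios);

	/* after the fix: trim folds the carryover into io_trim, then clears it */
	io_trim = io_allowed(1000, slice) + carryover_ios;
	carryover_ios = 0;
	printf("io_trim: %lld, slice 2 io_allowed (after): %lld\n",
	       io_trim, io_allowed(1000, slice) + carryover_ios);
	return 0;
}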
Fixes: a880ae93e5b5 ("blk-throttle: fix io hung due to configuration updates")
Reported-by: zhuxiaohui zhuxiaohui.400@bytedance.com
Link: https://lore.kernel.org/all/20230812072116.42321-1-zhuxiaohui.400@bytedance....
Signed-off-by: Yu Kuai yukuai3@huawei.com
Acked-by: Tejun Heo tj@kernel.org
Link: https://lore.kernel.org/r/20230816012708.1193747-5-yukuai1@huaweicloud.com
Signed-off-by: Jens Axboe axboe@kernel.dk
---
 block/blk-throttle.c | 21 +++++++++++++--------
 1 file changed, 13 insertions(+), 8 deletions(-)
diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 91a22d49f5c3..f9aeaa2ce9e7 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -747,8 +747,9 @@ static u64 calculate_bytes_allowed(u64 bps_limit, unsigned long jiffy_elapsed) /* Trim the used slices and adjust slice start accordingly */ static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw) { - unsigned long time_elapsed, io_trim; - u64 bytes_trim; + unsigned long time_elapsed; + long long bytes_trim; + int io_trim;
BUG_ON(time_before(tg->slice_end[rw], tg->slice_start[rw]));
@@ -776,17 +777,21 @@ static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw) return;
bytes_trim = calculate_bytes_allowed(tg_bps_limit(tg, rw), - time_elapsed); - io_trim = calculate_io_allowed(tg_iops_limit(tg, rw), time_elapsed); - if (!bytes_trim && !io_trim) + time_elapsed) + + tg->carryover_bytes[rw]; + io_trim = calculate_io_allowed(tg_iops_limit(tg, rw), time_elapsed) + + tg->carryover_ios[rw]; + if (bytes_trim <= 0 && io_trim <= 0) return;
- if (tg->bytes_disp[rw] >= bytes_trim) + tg->carryover_bytes[rw] = 0; + if ((long long)tg->bytes_disp[rw] >= bytes_trim) tg->bytes_disp[rw] -= bytes_trim; else tg->bytes_disp[rw] = 0;
- if (tg->io_disp[rw] >= io_trim) + tg->carryover_ios[rw] = 0; + if ((int)tg->io_disp[rw] >= io_trim) tg->io_disp[rw] -= io_trim; else tg->io_disp[rw] = 0; @@ -794,7 +799,7 @@ static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw) tg->slice_start[rw] += time_elapsed;
throtl_log(&tg->service_queue, - "[%c] trim slice nr=%lu bytes=%llu io=%lu start=%lu end=%lu jiffies=%lu", + "[%c] trim slice nr=%lu bytes=%lld io=%d start=%lu end=%lu jiffies=%lu", rw == READ ? 'R' : 'W', time_elapsed / tg->td->throtl_slice, bytes_trim, io_trim, tg->slice_start[rw], tg->slice_end[rw], jiffies);
mainline inclusion
from mainline-v6.12-rc1
commit 29390bb5661d49d10424ad8e915230de1f7074c9
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/IAUKH4
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
Currently, blk-throttle handles all IO in FIFO order; hence if data IO is throttled and meta IO is dispatched afterwards, the meta IO has to wait for the data IO, causing priority inversion problems.

This patch supports handling metadata first and then paying off the debt while throttling data.
Test script: use cgroup v1 to throttle the root cgroup, then create a new dir and file while writeback is throttled.

test() {
        mkdir /mnt/test/xxx
        touch /mnt/test/xxx/1
        sync /mnt/test/xxx
        sync /mnt/test/xxx
}

mkfs.ext4 -F /dev/nvme0n1 -E lazy_itable_init=0,lazy_journal_init=0
mount /dev/nvme0n1 /mnt/test

echo "259:0 $((1024*1024))" > /sys/fs/cgroup/blkio/blkio.throttle.write_bps_device
dd if=/dev/zero of=/mnt/test/foo1 bs=16M count=1 conv=fdatasync status=none &
sleep 4

time test
echo "259:0 0" > /sys/fs/cgroup/blkio/blkio.throttle.write_bps_device

sleep 1
umount /dev/nvme0n1

Test result: time cost for creating a new dir and file
before this patch: 14s
after this patch:  0.1s
Signed-off-by: Yu Kuai yukuai3@huawei.com
Acked-by: Tejun Heo tj@kernel.org
Link: https://lore.kernel.org/r/20240903135149.271857-3-yukuai1@huaweicloud.com
Signed-off-by: Jens Axboe axboe@kernel.dk
Conflicts:
  block/blk-throttle.c
[commit bf20ab538c81 ("blk-throttle: remove CONFIG_BLK_DEV_THROTTLING_LOW") is not backported.]
Signed-off-by: Yu Kuai yukuai3@huawei.com
---
 block/blk-throttle.c | 63 +++++++++++++++++++++++++++++---------------
 1 file changed, 42 insertions(+), 21 deletions(-)
diff --git a/block/blk-throttle.c b/block/blk-throttle.c index f9aeaa2ce9e7..5e299ea26d85 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -2140,6 +2140,22 @@ static inline void throtl_update_latency_buckets(struct throtl_data *td) } #endif
+static bool tg_within_limit(struct throtl_grp *tg, struct bio *bio, bool rw) +{ + /* throtl is FIFO - if bios are already queued, should queue */ + if (tg->service_queue.nr_queued[rw]) + return false; + + return tg_may_dispatch(tg, bio, NULL); +} + +static void tg_dispatch_in_debt(struct throtl_grp *tg, struct bio *bio, bool rw) +{ + if (!bio_flagged(bio, BIO_BPS_THROTTLED)) + tg->carryover_bytes[rw] -= throtl_bio_data_size(bio); + tg->carryover_ios[rw]--; +} + bool __blk_throtl_bio(struct bio *bio) { struct request_queue *q = bio->bi_disk->queue; @@ -2167,12 +2183,33 @@ bool __blk_throtl_bio(struct bio *bio) tg->last_low_overflow_time[rw] = jiffies; throtl_downgrade_check(tg); throtl_upgrade_check(tg); - /* throtl is FIFO - if bios are already queued, should queue */ - if (sq->nr_queued[rw]) - break;
- /* if above limits, break to queue */ - if (!tg_may_dispatch(tg, bio, NULL)) { + if (tg_within_limit(tg, bio, rw)) { + /* within limits, let's charge and dispatch directly */ + throtl_charge_bio(tg, bio); + + /* + * We need to trim slice even when bios are not being + * queued otherwise it might happen that a bio is not + * queued for a long time and slice keeps on extending + * and trim is not called for a long time. Now if limits + * are reduced suddenly we take into account all the IO + * dispatched so far at new low rate and * newly queued + * IO gets a really long dispatch time. + * + * So keep on trimming slice even if bio is not queued. + */ + throtl_trim_slice(tg, rw); + } else if (bio_issue_as_root_blkg(bio)) { + /* + * IOs which may cause priority inversions are + * dispatched directly, even if they're over limit. + * Debts are handled by carryover_bytes/ios while + * calculating wait time. + */ + tg_dispatch_in_debt(tg, bio, rw); + } else { + /* if above limits, break to queue */ tg->last_low_overflow_time[rw] = jiffies; if (throtl_can_upgrade(td, tg)) { throtl_upgrade_state(td); @@ -2181,22 +2218,6 @@ bool __blk_throtl_bio(struct bio *bio) break; }
- /* within limits, let's charge and dispatch directly */ - throtl_charge_bio(tg, bio); - - /* - * We need to trim slice even when bios are not being queued - * otherwise it might happen that a bio is not queued for - * a long time and slice keeps on extending and trim is not - * called for a long time. Now if limits are reduced suddenly - * we take into account all the IO dispatched so far at new - * low rate and * newly queued IO gets a really long dispatch - * time. - * - * So keep on trimming slice even if bio is not queued. - */ - throtl_trim_slice(tg, rw); - /* * @bio passed through this layer without being throttled. * Climb up the ladder. If we're already at the top, it
FeedBack: The patch(es) which you have sent to kernel@openeuler.org mailing list has been converted to a pull request successfully!
Pull request link: https://gitee.com/openeuler/kernel/pulls/12086
Mailing list address: https://mailweb.openeuler.org/hyperkitty/list/kernel@openeuler.org/message/5...