From: Tejun Heo tj@kernel.org
mainline inclusion from mainline-5.4-rc1 commit 36a524814ff3e5d5385f42d30152fe8c5e1fd2c1 category: feature bugzilla: 38688 CVE: NA
---------------------------
Currently, when a bio needs to be force-charged and there isn't enough budget, vtime is simply pushed into the future. This means that the cost of the whole bio is scaled using the current hweight and then charged immediately. Until the global vtime advances beyond this future vtime, the cgroup won't be allowed to issue normal IOs.
This is incorrect and can lead to, for example, exploding vrate or extended stalls if vrate range is constrained. Consider the following scenario.
1. A cgroup with a very low hweight runs out of budget.
2. A storm of swap-out happens on it. All of them are scaled according to the current low hweight and charged to vtime pushing it to a far future.
3. All other cgroups go idle and now the above cgroup has access to the whole device. However, because vtime is already wound using the past low hweight, what its current hweight is doesn't matter until global vtime catches up to the local vtime.
4. As a result, either vrate gets ramped up extremely or the IOs stall while the underlying device is idle.
This is because the hweight the overage is calculated at is different from the hweight that it's being paid at.
Fix it by remembering the overage in absoulte vtime and continuously paying with the actual budget according to the current hweight at each period.
Note that non-forced bios which wait already remembers the cost in absolute vtime. This brings forced-bio accounting in line.
Signed-off-by: Tejun Heo tj@kernel.org Signed-off-by: Jens Axboe axboe@kernel.dk Signed-off-by: Yu Kuai yukuai3@huawei.com Reviewed-by: Hou Tao houtao1@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- block/blk-iocost.c | 62 ++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 55 insertions(+), 7 deletions(-)
diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 431a639c04592..88056825adcf5 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -469,6 +469,7 @@ struct ioc_gq { */ atomic64_t vtime; atomic64_t done_vtime; + atomic64_t abs_vdebt; u64 last_vtime;
/* @@ -653,13 +654,21 @@ static struct ioc_cgrp *blkcg_to_iocc(struct blkcg *blkcg)
/* * Scale @abs_cost to the inverse of @hw_inuse. The lower the hierarchical - * weight, the more expensive each IO. + * weight, the more expensive each IO. Must round up. */ static u64 abs_cost_to_cost(u64 abs_cost, u32 hw_inuse) { return DIV64_U64_ROUND_UP(abs_cost * HWEIGHT_WHOLE, hw_inuse); }
+/* + * The inverse of abs_cost_to_cost(). Must round up. + */ +static u64 cost_to_abs_cost(u64 cost, u32 hw_inuse) +{ + return DIV64_U64_ROUND_UP(cost * hw_inuse, HWEIGHT_WHOLE); +} + static void iocg_commit_bio(struct ioc_gq *iocg, struct bio *bio, u64 cost) { bio->bi_iocost_cost = cost; @@ -1132,16 +1141,36 @@ static void iocg_kick_waitq(struct ioc_gq *iocg, struct ioc_now *now) struct iocg_wake_ctx ctx = { .iocg = iocg }; u64 margin_ns = (u64)(ioc->period_us * WAITQ_TIMER_MARGIN_PCT / 100) * NSEC_PER_USEC; - u64 vshortage, expires, oexpires; + u64 abs_vdebt, vdebt, vshortage, expires, oexpires; + s64 vbudget; + u32 hw_inuse;
lockdep_assert_held(&iocg->waitq.lock);
+ current_hweight(iocg, NULL, &hw_inuse); + vbudget = now->vnow - atomic64_read(&iocg->vtime); + + /* pay off debt */ + abs_vdebt = atomic64_read(&iocg->abs_vdebt); + vdebt = abs_cost_to_cost(abs_vdebt, hw_inuse); + if (vdebt && vbudget > 0) { + u64 delta = min_t(u64, vbudget, vdebt); + u64 abs_delta = min(cost_to_abs_cost(delta, hw_inuse), + abs_vdebt); + + atomic64_add(delta, &iocg->vtime); + atomic64_add(delta, &iocg->done_vtime); + atomic64_sub(abs_delta, &iocg->abs_vdebt); + if (WARN_ON_ONCE(atomic64_read(&iocg->abs_vdebt) < 0)) + atomic64_set(&iocg->abs_vdebt, 0); + } + /* * Wake up the ones which are due and see how much vtime we'll need * for the next one. */ - current_hweight(iocg, NULL, &ctx.hw_inuse); - ctx.vbudget = now->vnow - atomic64_read(&iocg->vtime); + ctx.hw_inuse = hw_inuse; + ctx.vbudget = vbudget - vdebt; __wake_up_locked_key(&iocg->waitq, TASK_NORMAL, &ctx); if (!waitqueue_active(&iocg->waitq)) return; @@ -1187,6 +1216,11 @@ static void iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now, u64 cost) u64 vmargin = ioc->margin_us * now->vrate; u64 margin_ns = ioc->margin_us * NSEC_PER_USEC; u64 expires, oexpires; + u32 hw_inuse; + + /* debt-adjust vtime */ + current_hweight(iocg, NULL, &hw_inuse); + vtime += abs_cost_to_cost(atomic64_read(&iocg->abs_vdebt), hw_inuse);
/* clear or maintain depending on the overage */ if (time_before_eq64(vtime, now->vnow)) { @@ -1332,12 +1366,14 @@ static void ioc_timer_fn(struct timer_list *timer) * should have woken up in the last period and expire idle iocgs. */ list_for_each_entry_safe(iocg, tiocg, &ioc->active_iocgs, active_list) { - if (!waitqueue_active(&iocg->waitq) && !iocg_is_idle(iocg)) + if (!waitqueue_active(&iocg->waitq) && + !atomic64_read(&iocg->abs_vdebt) && !iocg_is_idle(iocg)) continue;
spin_lock(&iocg->waitq.lock);
- if (waitqueue_active(&iocg->waitq)) { + if (waitqueue_active(&iocg->waitq) || + atomic64_read(&iocg->abs_vdebt)) { /* might be oversleeping vtime / hweight changes, kick */ iocg_kick_waitq(iocg, &now); iocg_kick_delay(iocg, &now, 0); @@ -1674,13 +1710,24 @@ static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio, * in a while which is fine. */ if (!waitqueue_active(&iocg->waitq) && + !atomic64_read(&iocg->abs_vdebt) && time_before_eq64(vtime + cost, now.vnow)) { iocg_commit_bio(iocg, bio, cost); return; }
+ /* + * We're over budget. If @bio has to be issued regardless, + * remember the abs_cost instead of advancing vtime. + * iocg_kick_waitq() will pay off the debt before waking more IOs. + * This way, the debt is continuously paid off each period with the + * actual budget available to the cgroup. If we just wound vtime, + * we would incorrectly use the current hw_inuse for the entire + * amount which, for example, can lead to the cgroup staying + * blocked for a long time even with substantially raised hw_inuse. + */ if (bio_issue_as_root_blkg(bio) || fatal_signal_pending(current)) { - iocg_commit_bio(iocg, bio, cost); + atomic64_add(abs_cost, &iocg->abs_vdebt); iocg_kick_delay(iocg, &now, cost); return; } @@ -1929,6 +1976,7 @@ static void ioc_pd_init(struct blkg_policy_data *pd) iocg->ioc = ioc; atomic64_set(&iocg->vtime, now.vnow); atomic64_set(&iocg->done_vtime, now.vnow); + atomic64_set(&iocg->abs_vdebt, 0); atomic64_set(&iocg->active_period, atomic64_read(&ioc->cur_period)); INIT_LIST_HEAD(&iocg->active_list); iocg->hweight_active = HWEIGHT_WHOLE;