[PATCH OLK-6.6 0/5] blk-ioinf: introduce inflight-based IO QoS controller

Baokun Li (5):
  blk-ioinf: introduce inflight-based IO QoS controller
  blk-ioinf: add inflight/lat interface
  blk-ioinf: dynamically adjust inflight limit to balance workloads
  blk-ioinf: support percentile latency QoS for online workloads
  blk-ioinf: support per-online-cgroup weights for priority control

 block/Kconfig          |   10 +
 block/Makefile         |    1 +
 block/blk-ioinf.c      | 1370 ++++++++++++++++++++++++++++++++++++++++
 block/blk-mq-debugfs.c |    2 +
 block/blk-rq-qos.h     |    1 +
 include/linux/blk-mq.h |    3 +
 6 files changed, 1387 insertions(+)
 create mode 100644 block/blk-ioinf.c

-- 
2.46.1

[PATCH OLK-6.6 1/5] blk-ioinf: introduce inflight-based IO QoS controller

This patch introduces a new inflight-based IO QoS controller for cgroups.
The controller is designed to guarantee absolute priority of online
(latency-sensitive) workloads over offline (best-effort) workloads. If a
cgroup's weight is set to 0, it is marked as offline; otherwise it is
marked as online. Online cgroups always enjoy priority budgets and can
preempt offline allocations at any time. This ensures that LS workloads
meet their latency requirements, while BE workloads can opportunistically
utilize idle bandwidth.

Currently, the inflight limit is a fixed value. In subsequent patches, we
will implement dynamic adjustment of inflight limits and online weights.

Co-developed-by: Yu Kuai <yukuai3@huawei.com>
Signed-off-by: Baokun Li <libaokun1@huawei.com>
---
 block/Kconfig          |  10 +
 block/Makefile         |   1 +
 block/blk-ioinf.c      | 739 +++++++++++++++++++++++++++++++++++++++++
 block/blk-mq-debugfs.c |   2 +
 block/blk-rq-qos.h     |   1 +
 include/linux/blk-mq.h |   3 +
 6 files changed, 756 insertions(+)
 create mode 100644 block/blk-ioinf.c

diff --git a/block/Kconfig b/block/Kconfig
index 7018fdcaa459..1d338261b751 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -204,6 +204,16 @@ config BLK_CGROUP_LEGACY_IOCOST
 
 	  If unsure, say N.
 
+config BLK_CGROUP_IOINFLIGHT
+	bool "Enable support for inflight based cgroup IO controller"
+	depends on BLK_CGROUP
+	select BLK_RQ_ALLOC_TIME
+	help
+	  Enabling this option enables the .inf.qos interface for inflight
+	  based proportional IO control. The IO controller distributes IO
+	  capacity between different groups based on their share of the
+	  overall weight distribution.
+
 config BLK_CGROUP_IOPRIO
 	bool "Cgroup I/O controller for assigning an I/O priority class"
 	depends on BLK_CGROUP
diff --git a/block/Makefile b/block/Makefile
index bfba1d2afc0e..eeea4092a5c0 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -21,6 +21,7 @@ obj-$(CONFIG_BLK_DEV_THROTTLING)	+= blk-throttle.o
 obj-$(CONFIG_BLK_CGROUP_IOPRIO)	+= blk-ioprio.o
 obj-$(CONFIG_BLK_CGROUP_IOLATENCY)	+= blk-iolatency.o
 obj-$(CONFIG_BLK_CGROUP_IOCOST)	+= blk-iocost.o
+obj-$(CONFIG_BLK_CGROUP_IOINFLIGHT)	+= blk-ioinf.o
 obj-$(CONFIG_MQ_IOSCHED_DEADLINE)	+= mq-deadline.o
 obj-$(CONFIG_MQ_IOSCHED_KYBER)	+= kyber-iosched.o
 bfq-y				:= bfq-iosched.o bfq-wf2q.o bfq-cgroup.o
diff --git a/block/blk-ioinf.c b/block/blk-ioinf.c
new file mode 100644
index 000000000000..623cbfca2823
--- /dev/null
+++ b/block/blk-ioinf.c
@@ -0,0 +1,739 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * IO inflight relative controller
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/timer.h>
+#include <linux/time64.h>
+#include <linux/parser.h>
+#include <linux/blk-cgroup.h>
+
+#include "blk-cgroup.h"
+#include "blk-rq-qos.h"
+#include "blk-mq.h"
+
+/* default weight for each cgroup */
+#define IOINF_DFL_WEIGHT	0
+#define IOINF_MIN_INFLIGHT	30
+#define IOINFG_MIN_INFLIGHT	1
+/* default wake-up time in jiffies for background job, see ioinf_timer_fn() */
+#define IOINF_TIMER_PERID	500
+
+/* io.inf.qos controls */
+enum {
+	INF_ENABLE,
+	INF_INFLIGHT,
+
+	NR_QOS_CTRL_PARAMS,
+};
+
+/* qos control params */
+struct ioinf_params {
+	bool enabled;
+	u32 inflight;
+};
+
+struct ioinf_rq_wait {
+	struct rq_wait rqw;
+	u32 hinflight;
+	u32 max_inflight;
+	u32 last_max;
+	u32 exhausted;
+	u32 issued;
+};
+
+/* the global control structure */
+struct ioinf {
+	struct rq_qos rqos;
+
+	struct ioinf_params params;
+	u32 inflight;
+
+	/* default time for ioinf_timer_fn */
+	unsigned long inf_timer_perid;
+	struct timer_list inf_timer;
+
+	/* global lock */
+	spinlock_t lock;
+
+	/* for offline cgroups */
+	struct ioinf_rq_wait offline;
+	/* for online cgroups */
+	struct ioinf_rq_wait online;
+};
+
+/* per disk-cgroup pair structure */
+struct ioinf_gq {
+	struct blkg_policy_data pd;
+	struct ioinf *inf;
+
+	/* configured by user */
+	u32 user_weight;
+};
+
+/* per cgroup structure, used to record default weight for all disks */
+struct ioinf_cgrp {
+	struct blkcg_policy_data cpd;
+
+	/* if default user weight is 0, means it's offline */
+	u32 dfl_user_weight;
+};
+
+static struct blkcg_policy blkcg_policy_ioinf;
+
+static struct ioinf *rqos_to_inf(struct rq_qos *rqos)
+{
+	return container_of(rqos, struct ioinf, rqos);
+}
+
+static struct ioinf *q_to_inf(struct request_queue *q)
+{
+	return rqos_to_inf(rq_qos_id(q, RQ_QOS_INFLIGHT));
+}
+
+static struct ioinf_gq *pd_to_infg(struct blkg_policy_data *pd)
+{
+	if (!pd)
+		return NULL;
+
+	return container_of(pd, struct ioinf_gq, pd);
+}
+
+static struct ioinf_gq *blkg_to_infg(struct blkcg_gq *blkg)
+{
+	return pd_to_infg(blkg_to_pd(blkg, &blkcg_policy_ioinf));
+}
+
+static struct blkcg_gq *infg_to_blkg(struct ioinf_gq *infg)
+{
+	return pd_to_blkg(&infg->pd);
+}
+
+static struct ioinf_cgrp *blkcg_to_infcg(struct blkcg *blkcg)
+{
+	struct blkcg_policy_data *cpd =
+		blkcg_to_cpd(blkcg, &blkcg_policy_ioinf);
+
+	return container_of(cpd, struct ioinf_cgrp, cpd);
+}
+
+static struct blkcg_gq *ioinf_bio_blkg(struct bio *bio)
+{
+	struct blkcg_gq *blkg = bio->bi_blkg;
+
+	if (!blkg || !blkg->online)
+		return NULL;
+
+	if (blkg->blkcg->css.cgroup->level == 0)
+		return NULL;
+
+	return blkg;
+}
+
+static struct ioinf_gq *ioinf_bio_infg(struct bio *bio)
+{
+	struct ioinf_gq *infg;
+	struct blkcg_gq *blkg = ioinf_bio_blkg(bio);
+
+	if (!blkg)
+		return NULL;
+
+	infg = blkg_to_infg(blkg);
+	if (!infg)
+		return NULL;
+
+	return infg;
+}
+
+static struct ioinf_rq_wait *rqw_to_ioinf_rqw(struct rq_wait *rqw)
+{
+	return container_of(rqw, struct ioinf_rq_wait, rqw);
+}
+
+static u32 infg_user_weight(struct ioinf_gq *infg)
+{
+	struct ioinf_cgrp *infcg;
+	struct blkcg_gq *blkg;
+
+	if (infg->user_weight)
+		return infg->user_weight;
+
+	/* if user doesn't set per disk weight, use the cgroup default weight */
+	blkg = infg_to_blkg(infg);
+	infcg = blkcg_to_infcg(blkg->blkcg);
+
+	return infcg->dfl_user_weight;
+}
+
+static bool infg_offline(struct ioinf_gq *infg)
+{
+	return infg_user_weight(infg) == 0;
+}
+
+static unsigned int atomic_inc_below_return(atomic_t *v, unsigned int below)
+{
+	unsigned int cur = atomic_read(v);
+
+	for (;;) {
+		unsigned int old;
+
+		if (cur >= below)
+			return below + 1;
+
+		old = atomic_cmpxchg(v, cur, cur + 1);
+		if (old == cur)
+			break;
+		cur = old;
+	}
+
+	return cur + 1;
+}
+
+static void ioinf_set_hinflight(struct ioinf_rq_wait *ioinf_rqw, u32 new)
+{
+	u32 old = ioinf_rqw->hinflight;
+
+	ioinf_rqw->hinflight = new;
+	ioinf_rqw->last_max = max(ioinf_rqw->last_max >> 1,
+				  ioinf_rqw->max_inflight);
+	ioinf_rqw->max_inflight = new >> 1;
+
+	if (new > old && wq_has_sleeper(&ioinf_rqw->rqw.wait))
+		wake_up_all(&ioinf_rqw->rqw.wait);
+}
+
+void ioinf_done(struct ioinf_rq_wait *ioinf_rqw)
+{
+	int inflight = atomic_dec_return(&ioinf_rqw->rqw.inflight);
+
+	BUG_ON(inflight < 0);
+
+	if (inflight < ioinf_rqw->hinflight &&
+	    wq_has_sleeper(&ioinf_rqw->rqw.wait))
+		wake_up_all(&ioinf_rqw->rqw.wait);
+}
+
+static bool ioinf_inflight_cb(struct rq_wait *rqw, void *private_data)
+{
+	struct ioinf_rq_wait *ioinf_rqw = rqw_to_ioinf_rqw(rqw);
+	struct ioinf *inf = private_data;
+	u32 inflight;
+	u32 limit;
+
+retry:
+	limit = ioinf_rqw->hinflight;
+	inflight = atomic_inc_below_return(&rqw->inflight, limit);
+	if (inflight > ioinf_rqw->max_inflight)
+		ioinf_rqw->max_inflight = inflight;
+	if (inflight <= limit) {
+		ioinf_rqw->issued++;
+		return true;
+	}
+
+	if (ioinf_rqw == &inf->offline) {
+		ioinf_rqw->exhausted++;
+		return false;
+	}
+
+	if (inf->offline.hinflight > IOINFG_MIN_INFLIGHT) {
+		spin_lock_irq(&inf->lock);
+		/* Reclaim half of the inflight budget from offline groups. */
+		ioinf_set_hinflight(&inf->offline,
+				    inf->offline.hinflight >> 1);
+		ioinf_set_hinflight(&inf->online,
+				    inf->inflight - inf->offline.hinflight);
+		spin_unlock_irq(&inf->lock);
+	}
+
+	if (ioinf_rqw->hinflight > limit)
+		goto retry;
+
+	ioinf_rqw->exhausted++;
+	return false;
+}
+
+static void ioinf_cleanup_cb(struct rq_wait *rqw, void *private_data)
+{
+	struct ioinf_rq_wait *ioinf_rqw = rqw_to_ioinf_rqw(rqw);
+
+	ioinf_done(ioinf_rqw);
+}
+
+static void ioinf_throttle(struct ioinf *inf, struct ioinf_rq_wait *ioinf_rqw)
+{
+	rq_qos_wait(&ioinf_rqw->rqw, inf, ioinf_inflight_cb,
+		    ioinf_cleanup_cb, NULL);
+
+	/*
+	 * In case no online cgroup is active, daemon will adjust all the
+	 * budget to offline cgroup.
+	 */
+	timer_reduce(&inf->inf_timer, jiffies + inf->inf_timer_perid);
+}
+
+static void ioinf_rqos_throttle(struct rq_qos *rqos, struct bio *bio)
+{
+	struct ioinf *inf = rqos_to_inf(rqos);
+	struct ioinf_gq *infg = ioinf_bio_infg(bio);
+
+	if (!inf->params.enabled || !infg)
+		return;
+
+	if (infg_offline(infg))
+		ioinf_throttle(inf, &inf->offline);
+	else
+		ioinf_throttle(inf, &inf->online);
+}
+
+static void ioinf_rqos_track(struct rq_qos *rqos, struct request *rq,
+			     struct bio *bio)
+{
+	struct blkcg_gq *blkg = ioinf_bio_blkg(bio);
+
+	if (!blkg)
+		return;
+
+	rq->blkg = blkg;
+}
+
+static void ioinf_rqos_done(struct rq_qos *rqos, struct request *rq)
+{
+	struct blkcg_gq *blkg = rq->blkg;
+	struct ioinf_gq *infg;
+	struct ioinf *inf;
+
+	if (!blkg)
+		return;
+
+	infg = blkg_to_infg(blkg);
+	inf = infg->inf;
+	if (infg_offline(infg))
+		ioinf_done(&inf->offline);
+	else
+		ioinf_done(&inf->online);
+
+	rq->blkg = NULL;
+}
+
+static void ioinf_rqos_exit(struct rq_qos *rqos)
+{
+	struct ioinf *inf = rqos_to_inf(rqos);
+
+	blkcg_deactivate_policy(rqos->disk, &blkcg_policy_ioinf);
+
+	timer_shutdown_sync(&inf->inf_timer);
+	kfree(inf);
+}
+
+static int ioinf_stat_show(void *data, struct seq_file *m)
+{
+	struct rq_qos *rqos = data;
+	struct ioinf *inf = rqos_to_inf(rqos);
+
+	spin_lock_irq(&inf->lock);
+
+	seq_printf(m, "inflight %u->%u\n", inf->params.inflight, inf->inflight);
+
+	seq_printf(m, "online inflight %u/%d\n",
+		   atomic_read(&inf->online.rqw.inflight),
+		   inf->online.hinflight);
+	seq_printf(m, "offline inflight %u/%d\n",
+		   atomic_read(&inf->offline.rqw.inflight),
+		   inf->offline.hinflight);
+
+	spin_unlock_irq(&inf->lock);
+
+	return 0;
+}
+
+static const struct blk_mq_debugfs_attr ioinf_debugfs_attrs[] = {
+	{"stat", 0400, ioinf_stat_show},
+	{},
+};
+
+static struct rq_qos_ops ioinf_rqos_ops = {
+	.throttle = ioinf_rqos_throttle,
+	.done = ioinf_rqos_done,
+	.track = ioinf_rqos_track,
+	.exit = ioinf_rqos_exit,
+
+#ifdef CONFIG_BLK_DEBUG_FS
+	.debugfs_attrs = ioinf_debugfs_attrs,
+#endif
+};
+
+u32 ioinf_calc_budget(struct ioinf_rq_wait *ioinf_rqw)
+{
+	u32 new_budget;
+	u64 exhausted = ioinf_rqw->exhausted;
+	u64 issued = ioinf_rqw->issued;
+
+	new_budget = max(ioinf_rqw->last_max, ioinf_rqw->max_inflight);
+	/* How much budget is needed to avoid 'exhausted'? */
+	if (exhausted && issued)
+		new_budget += exhausted * new_budget / issued;
+
+	return new_budget;
+}
+
+static
+void ioinf_update_inflight(struct ioinf *inf, u32 new_online, u32 new_offline)
+{
+	inf->inflight = inf->params.inflight;
+	if (inf->inflight < IOINF_MIN_INFLIGHT)
+		inf->inflight = IOINF_MIN_INFLIGHT;
+
+	if (new_online >= inf->inflight)
+		new_offline = min(new_offline, IOINFG_MIN_INFLIGHT);
+	else if (new_online + new_offline > inf->inflight)
+		new_offline = inf->inflight - new_online;
+	new_online = inf->inflight - new_offline;
+
+	ioinf_set_hinflight(&inf->offline, new_offline);
+	inf->offline.exhausted = 0;
+	inf->offline.issued = 0;
+
+	ioinf_set_hinflight(&inf->online, new_online);
+	inf->online.exhausted = 0;
+	inf->online.issued = 0;
+}
+
+static void ioinf_timer_fn(struct timer_list *timer)
+{
+	struct ioinf *inf = container_of(timer, struct ioinf, inf_timer);
+	struct ioinf_rq_wait *online = &inf->online;
+	struct ioinf_rq_wait *offline = &inf->offline;
+	unsigned long flags;
+	u32 online_budget, offline_budget;
+
+	spin_lock_irqsave(&inf->lock, flags);
+
+	online_budget = ioinf_calc_budget(online);
+	offline_budget = ioinf_calc_budget(offline);
+	ioinf_update_inflight(inf, online_budget, offline_budget);
+
+	spin_unlock_irqrestore(&inf->lock, flags);
+	mod_timer(&inf->inf_timer, jiffies + inf->inf_timer_perid);
+}
+
+static u32 ioinf_default_inflight(struct gendisk *disk)
+{
+	return max(disk->queue->nr_requests, IOINF_MIN_INFLIGHT);
+}
+
+static int blk_ioinf_init(struct gendisk *disk)
+{
+	struct ioinf *inf;
+	int ret;
+
+	inf = kzalloc(sizeof(*inf), GFP_KERNEL);
+	if (!inf)
+		return -ENOMEM;
+
+	spin_lock_init(&inf->lock);
+	inf->params.inflight = ioinf_default_inflight(disk);
+	inf->inflight = inf->params.inflight;
+	inf->inf_timer_perid = IOINF_TIMER_PERID;
+	inf->offline.hinflight = IOINFG_MIN_INFLIGHT;
+	rq_wait_init(&inf->offline.rqw);
+	inf->online.hinflight = inf->inflight - IOINFG_MIN_INFLIGHT;
+	rq_wait_init(&inf->online.rqw);
+	timer_setup(&inf->inf_timer, ioinf_timer_fn, 0);
+
+	ret = rq_qos_add(&inf->rqos, disk, RQ_QOS_INFLIGHT, &ioinf_rqos_ops);
+	if (ret)
+		goto err_free_inf;
+
+	ret = blkcg_activate_policy(disk, &blkcg_policy_ioinf);
+	if (ret)
+		goto err_del_qos;
+	return 0;
+
+err_del_qos:
+	rq_qos_del(&inf->rqos);
+err_free_inf:
+	timer_shutdown_sync(&inf->inf_timer);
+	kfree(inf);
+	return ret;
+}
+
+static u64 ioinf_weight_prfill(struct seq_file *sf, struct blkg_policy_data *pd,
+			       int off)
+{
+	const char *dname = blkg_dev_name(pd->blkg);
+	struct ioinf_gq *infg = pd_to_infg(pd);
+
+	if (dname && infg->user_weight)
+		seq_printf(sf, "%s %u\n", dname, infg->user_weight);
+
+	return 0;
+}
+
+static int ioinf_weight_show(struct seq_file *sf, void *v)
+{
+	struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
+	struct ioinf_cgrp *infcg = blkcg_to_infcg(blkcg);
+
+	seq_printf(sf, "default %u\n", infcg->dfl_user_weight);
+	blkcg_print_blkgs(sf, blkcg, ioinf_weight_prfill, &blkcg_policy_ioinf,
+			  seq_cft(sf)->private, false);
+
+	return 0;
+}
+
+static ssize_t ioinf_weight_write(struct kernfs_open_file *of, char *buf,
+				  size_t nbytes, loff_t off)
+{
+	struct blkcg *blkcg = css_to_blkcg(of_css(of));
+	struct ioinf_cgrp *infcg = blkcg_to_infcg(blkcg);
+	struct blkg_conf_ctx ctx;
+	struct ioinf_gq *infg;
+	int ret;
+	u32 v;
+
+	if (!strchr(buf, ':')) {
+		if (!sscanf(buf, "default %u", &v) && !sscanf(buf, "%u", &v))
+			return -EINVAL;
+
+		infcg->dfl_user_weight = v;
+
+		return nbytes;
+	}
+
+	blkg_conf_init(&ctx, buf);
+	ret = blkg_conf_prep(blkcg, &blkcg_policy_ioinf, &ctx);
+	if (ret)
+		return ret;
+
+	infg = blkg_to_infg(ctx.blkg);
+	if (!strncmp(ctx.body, "default", 7)) {
+		v = IOINF_DFL_WEIGHT;
+	} else if (!sscanf(ctx.body, "%u", &v) ||
+		   v < 0 || v > CGROUP_WEIGHT_MAX) {
+		blkg_conf_exit(&ctx);
+		return -EINVAL;
+	}
+
+	infg->user_weight = v;
+	blkg_conf_exit(&ctx);
+	return nbytes;
+}
+
+static u64 ioinf_qos_prfill(struct seq_file *sf, struct blkg_policy_data *pd,
+			    int off)
+{
+	const char *dname = blkg_dev_name(pd->blkg);
+	struct ioinf *inf = q_to_inf(pd->blkg->q);
+	struct ioinf_params params;
+
+	if (!dname)
+		return 0;
+
+	params = inf->params;
+	seq_printf(sf, "%s enable=%d inflight=%u", dname,
+		   params.enabled, params.inflight);
+
+	seq_putc(sf, '\n');
+	return 0;
+}
+
+static int ioinf_qos_show(struct seq_file *sf, void *v)
+{
+	struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
+
+	blkcg_print_blkgs(sf, blkcg, ioinf_qos_prfill,
+			  &blkcg_policy_ioinf, seq_cft(sf)->private, false);
+	return 0;
+}
+
+static const match_table_t qos_ctrl_tokens = {
+	{ INF_ENABLE, "enable=%u" },
+	{ INF_INFLIGHT, "inflight=%u" },
+	{ NR_QOS_CTRL_PARAMS, NULL },
+};
+
+static ssize_t ioinf_qos_write(struct kernfs_open_file *of, char *input,
+			       size_t nbytes, loff_t off)
+{
+	struct blkg_conf_ctx ctx;
+	struct gendisk *disk;
+	struct ioinf *inf;
+	struct ioinf_params params = {0};
+	char *body, *p;
+	int ret;
+
+	blkg_conf_init(&ctx, input);
+
+	ret = blkg_conf_open_bdev(&ctx);
+	if (ret)
+		goto err;
+
+	body = ctx.body;
+	disk = ctx.bdev->bd_disk;
+	if (!queue_is_mq(disk->queue)) {
+		ret = -EOPNOTSUPP;
+		goto err;
+	}
+
+	inf = q_to_inf(disk->queue);
+	if (inf)
+		params = inf->params;
+
+	while ((p = strsep(&body, " \t\n"))) {
+		substring_t args[MAX_OPT_ARGS];
+		s64 v;
+
+		if (!*p)
+			continue;
+
+		switch (match_token(p, qos_ctrl_tokens, args)) {
+		case INF_ENABLE:
+			if (match_u64(&args[0], &v))
+				goto einval;
+			params.enabled = !!v;
+			continue;
+		case INF_INFLIGHT:
+			if (match_u64(&args[0], &v) || v == 0)
+				goto einval;
+			params.inflight = v;
+			continue;
+		default:
+			goto einval;
+		}
+	}
+
+	if (!inf && params.enabled) {
+		ret = blk_ioinf_init(disk);
+		if (ret)
+			goto err;
+		inf = q_to_inf(disk->queue);
+		if (!params.inflight)
+			params.inflight = inf->params.inflight;
+	} else if (inf && !params.enabled) {
+		timer_shutdown_sync(&inf->inf_timer);
+		blkcg_deactivate_policy(inf->rqos.disk, &blkcg_policy_ioinf);
+		rq_qos_del(&inf->rqos);
+		kfree(inf);
+		inf = NULL;
+	}
+
+	if (inf) {
+		inf->params = params;
+		if (inf->inflight != params.inflight) {
+			spin_lock_irq(&inf->lock);
+			ioinf_update_inflight(inf, inf->online.hinflight,
+					      inf->offline.hinflight);
+			spin_unlock_irq(&inf->lock);
+		}
+	}
+
+	blkg_conf_exit(&ctx);
+	return nbytes;
+
+einval:
+	ret = -EINVAL;
+err:
+	blkg_conf_exit(&ctx);
+	return ret;
+}
+
+static struct cftype ioinf_files[] = {
+	{
+		.name = "inf.weight",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.seq_show = ioinf_weight_show,
+		.write = ioinf_weight_write,
+	},
+	{
+		.name = "inf.qos",
+		.flags = CFTYPE_ONLY_ON_ROOT,
+		.seq_show = ioinf_qos_show,
+		.write = ioinf_qos_write,
+	},
+	{}
+};
+
+static struct cftype ioinf_legacy_files[] = {
+	{
+		.name = "inf.weight",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.seq_show = ioinf_weight_show,
+		.write = ioinf_weight_write,
+	},
+	{
+		.name = "inf.qos",
+		.flags = CFTYPE_ONLY_ON_ROOT,
+		.seq_show = ioinf_qos_show,
+		.write = ioinf_qos_write,
+	},
+	{}
+};
+
+static struct blkcg_policy_data *ioinf_cpd_alloc(gfp_t gfp)
+{
+	struct ioinf_cgrp *infcg = kzalloc(sizeof(*infcg), gfp);
+
+	if (!infcg)
+		return NULL;
+
+	infcg->dfl_user_weight = IOINF_DFL_WEIGHT;
+	return &infcg->cpd;
+}
+
+static void ioinf_cpd_free(struct blkcg_policy_data *cpd)
+{
+	kfree(container_of(cpd, struct ioinf_cgrp, cpd));
+}
+
+static struct blkg_policy_data *ioinf_pd_alloc(struct gendisk *disk,
+					       struct blkcg *blkcg, gfp_t gfp)
+{
+	struct ioinf_gq *infg = kzalloc_node(sizeof(*infg), gfp, disk->node_id);
+
+	if (!infg)
+		return NULL;
+
+	return &infg->pd;
+}
+
+static void ioinf_pd_init(struct blkg_policy_data *pd)
+{
+	struct ioinf_gq *infg = pd_to_infg(pd);
+	struct blkcg_gq *blkg = pd_to_blkg(pd);
+
+	infg->inf = q_to_inf(blkg->q);
+}
+
+static void ioinf_pd_free(struct blkg_policy_data *pd)
+{
+	struct ioinf_gq *infg = pd_to_infg(pd);
+
+	kfree(infg);
+}
+
+static struct blkcg_policy blkcg_policy_ioinf = {
+	.dfl_cftypes = ioinf_files,
+	.legacy_cftypes = ioinf_legacy_files,
+
+	.cpd_alloc_fn = ioinf_cpd_alloc,
+	.cpd_free_fn = ioinf_cpd_free,
+
+	.pd_alloc_fn = ioinf_pd_alloc,
+	.pd_init_fn = ioinf_pd_init,
+	.pd_free_fn = ioinf_pd_free,
+};
+
+static int __init ioinf_init(void)
+{
+	return blkcg_policy_register(&blkcg_policy_ioinf);
+}
+
+static void __exit ioinf_exit(void)
+{
+	blkcg_policy_unregister(&blkcg_policy_ioinf);
+}
+
+module_init(ioinf_init);
+module_exit(ioinf_exit);
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index efe99cfae51d..b5af47bf99d4 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -798,6 +798,8 @@ static const char *rq_qos_id_to_name(enum rq_qos_id id)
 		return "latency";
 	case RQ_QOS_COST:
 		return "cost";
+	case RQ_QOS_INFLIGHT:
+		return "inflight";
 	}
 	return "unknown";
 }
diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h
index 93d1ba692973..d504a302ca0f 100644
--- a/block/blk-rq-qos.h
+++ b/block/blk-rq-qos.h
@@ -17,6 +17,7 @@ enum rq_qos_id {
 	RQ_QOS_WBT,
 	RQ_QOS_LATENCY,
 	RQ_QOS_COST,
+	RQ_QOS_INFLIGHT,
 };
 
 struct rq_wait {
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 4c4416fd2df7..81a733e1bef9 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -115,6 +115,9 @@ struct request {
 
 #ifdef CONFIG_BLK_WBT
 	unsigned short wbt_flags;
+#endif
+#ifdef CONFIG_BLK_CGROUP_IOINFLIGHT
+	struct blkcg_gq *blkg;
 #endif
 	/*
 	 * rq sectors used for blk stats. It has the same value
-- 
2.46.1
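As a usage illustration (not part of the patch): the controller is driven
entirely through the two cgroup files added above. A minimal userspace
sketch, assuming cgroup2 is mounted at /sys/fs/cgroup, that the target
disk is device 253:0, and that the "batch" and "service" cgroups already
exist (all of these are example values):

	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>

	static void write_str(const char *path, const char *val)
	{
		int fd = open(path, O_WRONLY);

		if (fd < 0 || write(fd, val, strlen(val)) < 0)
			perror(path);
		if (fd >= 0)
			close(fd);
	}

	int main(void)
	{
		/* root-only control file: enable with a fixed inflight budget */
		write_str("/sys/fs/cgroup/io.inf.qos", "253:0 enable=1 inflight=64");
		/* weight 0 marks a cgroup offline (best-effort) */
		write_str("/sys/fs/cgroup/batch/io.inf.weight", "default 0");
		/* any non-zero weight marks a cgroup online */
		write_str("/sys/fs/cgroup/service/io.inf.weight", "default 100");
		return 0;
	}

Per the cftype definitions, io.inf.qos is writable on the root cgroup only
(CFTYPE_ONLY_ON_ROOT), while io.inf.weight accepts either a per-device
"MAJ:MIN <weight>" pair or a "default <weight>" that applies to all disks.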

[PATCH OLK-6.6 2/5] blk-ioinf: add inflight/lat interface

Add /sys/kernel/debug/block/$DEV/rqos/inflight/lat interface to display
the dynamic latency of online cgroups.

Signed-off-by: Baokun Li <libaokun1@huawei.com>
---
 block/blk-ioinf.c | 105 +++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 103 insertions(+), 2 deletions(-)

diff --git a/block/blk-ioinf.c b/block/blk-ioinf.c
index 623cbfca2823..6627a003312c 100644
--- a/block/blk-ioinf.c
+++ b/block/blk-ioinf.c
@@ -35,6 +35,16 @@ struct ioinf_params {
 	u32 inflight;
 };
 
+struct ioinf_io_stat {
+	u64 nr;
+	u64 lat;
+};
+
+struct ioinf_lat_stat {
+	struct ioinf_io_stat read;
+	struct ioinf_io_stat write;
+};
+
 struct ioinf_rq_wait {
 	struct rq_wait rqw;
 	u32 hinflight;
@@ -62,6 +72,10 @@ struct ioinf {
 	struct ioinf_rq_wait offline;
 	/* for online cgroups */
 	struct ioinf_rq_wait online;
+
+	struct ioinf_lat_stat last_stat;
+	struct ioinf_lat_stat delta_stat;
+	struct ioinf_lat_stat __percpu *stat;
 };
 
 /* per disk-cgroup pair structure */
@@ -298,6 +312,27 @@ static void ioinf_rqos_track(struct rq_qos *rqos, struct request *rq,
 	rq->blkg = blkg;
 }
 
+static void ioinf_record_lat(struct ioinf *inf, struct request *rq)
+{
+	u64 lat;
+
+	lat = rq->io_end_time_ns ? rq->io_end_time_ns : blk_time_get_ns();
+	lat -= rq->alloc_time_ns;
+
+	switch (req_op(rq)) {
+	case REQ_OP_READ:
+		this_cpu_inc(inf->stat->read.nr);
+		this_cpu_add(inf->stat->read.lat, lat);
+		break;
+	case REQ_OP_WRITE:
+		this_cpu_inc(inf->stat->write.nr);
+		this_cpu_add(inf->stat->write.lat, lat);
+		break;
+	default:
+		break;
+	}
+}
+
 static void ioinf_rqos_done(struct rq_qos *rqos, struct request *rq)
 {
 	struct blkcg_gq *blkg = rq->blkg;
@@ -309,10 +344,12 @@ static void ioinf_rqos_done(struct rq_qos *rqos, struct request *rq)
 
 	infg = blkg_to_infg(blkg);
 	inf = infg->inf;
-	if (infg_offline(infg))
+	if (infg_offline(infg)) {
 		ioinf_done(&inf->offline);
-	else
+	} else {
 		ioinf_done(&inf->online);
+		ioinf_record_lat(inf, rq);
+	}
 
 	rq->blkg = NULL;
 }
@@ -324,6 +361,7 @@ static void ioinf_rqos_exit(struct rq_qos *rqos)
 	blkcg_deactivate_policy(rqos->disk, &blkcg_policy_ioinf);
 
 	timer_shutdown_sync(&inf->inf_timer);
+	free_percpu(inf->stat);
 	kfree(inf);
 }
 
@@ -348,8 +386,21 @@ static int ioinf_stat_show(void *data, struct seq_file *m)
 	return 0;
 }
 
+static int ioinf_lat_show(void *data, struct seq_file *m)
+{
+	struct rq_qos *rqos = data;
+	struct ioinf *inf = rqos_to_inf(rqos);
+	struct ioinf_lat_stat *stat = &inf->delta_stat;
+
+	seq_printf(m, "online average latency: (%llu-%llu) (%llu-%llu)\n",
+		   stat->read.nr, stat->read.lat, stat->write.nr, stat->write.lat);
+
+	return 0;
+}
+
 static const struct blk_mq_debugfs_attr ioinf_debugfs_attrs[] = {
 	{"stat", 0400, ioinf_stat_show},
+	{"lat", 0400, ioinf_lat_show},
 	{},
 };
 
@@ -378,6 +429,46 @@ u32 ioinf_calc_budget(struct ioinf_rq_wait *ioinf_rqw)
 	return new_budget;
 }
 
+static void ioinf_sample_cpu_lat(struct ioinf_lat_stat *cur, int cpu,
+				 struct ioinf_lat_stat __percpu *stat)
+{
+	struct ioinf_lat_stat *pstat = per_cpu_ptr(stat, cpu);
+
+	cur->read.nr += pstat->read.nr;
+	cur->read.lat += pstat->read.lat;
+	cur->write.nr += pstat->write.nr;
+	cur->write.lat += pstat->write.lat;
+}
+
+static struct ioinf_lat_stat ioinf_calc_stat(struct ioinf_lat_stat *cur,
+					     struct ioinf_lat_stat *last)
+{
+	struct ioinf_lat_stat delta = {0};
+
+	delta.read.nr = cur->read.nr - last->read.nr;
+	delta.read.lat = cur->read.lat - last->read.lat;
+	if (delta.read.nr > 0)
+		delta.read.lat = delta.read.lat / delta.read.nr;
+
+	delta.write.nr = cur->write.nr - last->write.nr;
+	delta.write.lat = cur->write.lat - last->write.lat;
+	if (delta.write.nr > 0)
+		delta.write.lat = delta.write.lat / delta.write.nr;
+
+	return delta;
+}
+
+static void ioinf_sample_lat(struct ioinf *inf)
+{
+	struct ioinf_lat_stat cur = {0};
+	int cpu;
+
+	for_each_possible_cpu(cpu)
+		ioinf_sample_cpu_lat(&cur, cpu, inf->stat);
+	inf->delta_stat = ioinf_calc_stat(&cur, &inf->last_stat);
+	inf->last_stat = cur;
+}
+
 static
 void ioinf_update_inflight(struct ioinf *inf, u32 new_online, u32 new_offline)
 {
@@ -408,6 +499,7 @@ static void ioinf_timer_fn(struct timer_list *timer)
 	unsigned long flags;
 	u32 online_budget, offline_budget;
 
+	ioinf_sample_lat(inf);
 	spin_lock_irqsave(&inf->lock, flags);
 
 	online_budget = ioinf_calc_budget(online);
@@ -432,6 +524,12 @@ static int blk_ioinf_init(struct gendisk *disk)
 	if (!inf)
 		return -ENOMEM;
 
+	inf->stat = alloc_percpu(struct ioinf_lat_stat);
+	if (!inf->stat) {
+		kfree(inf);
+		return -ENOMEM;
+	}
+
 	spin_lock_init(&inf->lock);
 	inf->params.inflight = ioinf_default_inflight(disk);
 	inf->inflight = inf->params.inflight;
@@ -455,6 +553,7 @@ static int blk_ioinf_init(struct gendisk *disk)
 	rq_qos_del(&inf->rqos);
 err_free_inf:
 	timer_shutdown_sync(&inf->inf_timer);
+	free_percpu(inf->stat);
 	kfree(inf);
 	return ret;
 }
@@ -611,7 +710,9 @@ static ssize_t ioinf_qos_write(struct kernfs_open_file *of, char *input,
 		inf = q_to_inf(disk->queue);
 		if (!params.inflight)
 			params.inflight = inf->params.inflight;
+		blk_queue_flag_set(QUEUE_FLAG_RQ_ALLOC_TIME, disk->queue);
 	} else if (inf && !params.enabled) {
+		blk_queue_flag_clear(QUEUE_FLAG_RQ_ALLOC_TIME, disk->queue);
 		timer_shutdown_sync(&inf->inf_timer);
 		blkcg_deactivate_policy(inf->rqos.disk, &blkcg_policy_ioinf);
 		rq_qos_del(&inf->rqos);
-- 
2.46.1
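The lat file reports windowed averages: the per-cpu counters only ever
grow, and on each timer period the controller subtracts the previous
snapshot and divides by the number of completions in the window. A
standalone sketch of that arithmetic (types simplified, not the kernel
code itself):

	#include <stdint.h>

	struct io_stat { uint64_t nr; uint64_t lat; };

	/* average latency of the IOs completed since *last was taken */
	static uint64_t window_avg_lat(const struct io_stat *cur,
				       struct io_stat *last)
	{
		uint64_t nr = cur->nr - last->nr;
		uint64_t lat = cur->lat - last->lat;

		*last = *cur;		/* start the next window */
		return nr ? lat / nr : 0;
	}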

[PATCH OLK-6.6 3/5] blk-ioinf: dynamically adjust inflight limit to balance workloads

With a fixed inflight limit, offline workloads are still throttled even
when no online workloads are running. Moreover, excessive idle inflight
capacity can allow sudden offline bursts to impact online workloads,
causing latency fluctuations.

Introduce dynamic inflight limit adjustment, with a range from 0.1% to
100x of the configured value. The adjustment rules are:

1) When online workloads are throttled, reclaim offline budget and try
   to increase the total inflight limit.
2) When only offline workloads are active, increase inflight as needed
   without restriction.
3) If neither of the above conditions has been met since the last
   increase, attempt to reduce inflight to avoid excessive idle budget.

Signed-off-by: Baokun Li <libaokun1@huawei.com>
---
 block/blk-ioinf.c | 96 +++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 88 insertions(+), 8 deletions(-)

diff --git a/block/blk-ioinf.c b/block/blk-ioinf.c
index 6627a003312c..9c771a6f276a 100644
--- a/block/blk-ioinf.c
+++ b/block/blk-ioinf.c
@@ -16,11 +16,19 @@
 
 /* default weight for each cgroup */
 #define IOINF_DFL_WEIGHT	0
-#define IOINF_MIN_INFLIGHT	30
+#define IOINF_MIN_INFLIGHT	3
 #define IOINFG_MIN_INFLIGHT	1
 /* default wake-up time in jiffies for background job, see ioinf_timer_fn() */
 #define IOINF_TIMER_PERID	500
 
+/* scale inflight from 1/1000 to 100 */
+enum {
+	MIN_SCALE = 1,		/* one thousandth. */
+	DFL_SCALE = 100,	/* one tenth. */
+	SCALE_GRAN = 1000,	/* The control granularity is 1/1000. */
+	MAX_SCALE = 100000,	/* A hundredfold. */
+};
+
 /* io.inf.qos controls */
 enum {
 	INF_ENABLE,
@@ -60,6 +68,8 @@ struct ioinf {
 
 	struct ioinf_params params;
 	u32 inflight;
+	u32 scale;
+	u32 old_scale;
 
 	/* default time for ioinf_timer_fn */
 	unsigned long inf_timer_perid;
@@ -265,6 +275,9 @@ static bool ioinf_inflight_cb(struct rq_wait *rqw, void *private_data)
 		goto retry;
 
 	ioinf_rqw->exhausted++;
+	/* wake up ioinf_timer_fn() immediately to adjust scale */
+	if (inf->scale < MAX_SCALE)
+		timer_reduce(&inf->inf_timer, jiffies + 1);
 	return false;
 }
 
@@ -372,7 +385,9 @@ static int ioinf_stat_show(void *data, struct seq_file *m)
 
 	spin_lock_irq(&inf->lock);
 
-	seq_printf(m, "inflight %u->%u\n", inf->params.inflight, inf->inflight);
+	seq_printf(m, "scale %u/%u inflight %u->%u\n",
+		   inf->scale, SCALE_GRAN,
+		   inf->params.inflight, inf->inflight);
 
 	seq_printf(m, "online inflight %u/%d\n",
 		   atomic_read(&inf->online.rqw.inflight),
@@ -415,6 +430,49 @@ static struct rq_qos_ops ioinf_rqos_ops = {
 #endif
 };
 
+static void __inflight_scale_up(struct ioinf *inf, u32 aim, bool force)
+{
+	u32 new_scale;
+
+	inf->old_scale = inf->scale;
+	if (aim < inf->inflight || inf->scale >= MAX_SCALE)
+		return;
+
+	new_scale = DIV_ROUND_UP(aim * SCALE_GRAN, inf->params.inflight);
+	if (new_scale <= inf->old_scale) {
+		if (!force)
+			return;
+		new_scale = inf->scale + 1;
+	}
+
+	inf->scale = new_scale;
+}
+
+static void inflight_scale_up(struct ioinf *inf, u32 aim)
+{
+	__inflight_scale_up(inf, aim, false);
+}
+
+static void inflight_force_scale_up(struct ioinf *inf, u32 aim)
+{
+	__inflight_scale_up(inf, aim, true);
+}
+
+static void inflight_scale_down(struct ioinf *inf, u32 aim)
+{
+	u32 new_scale;
+
+	inf->old_scale = inf->scale;
+	if (inf->inflight <= IOINF_MIN_INFLIGHT || inf->old_scale >= MAX_SCALE)
+		return;
+
+	new_scale = DIV_ROUND_UP(aim * SCALE_GRAN, inf->params.inflight);
+	if (new_scale >= inf->old_scale)
+		return;
+
+	inf->scale = new_scale;
+}
+
 u32 ioinf_calc_budget(struct ioinf_rq_wait *ioinf_rqw)
 {
 	u32 new_budget;
@@ -472,9 +530,12 @@ static void ioinf_sample_lat(struct ioinf *inf)
 static
 void ioinf_update_inflight(struct ioinf *inf, u32 new_online, u32 new_offline)
 {
-	inf->inflight = inf->params.inflight;
-	if (inf->inflight < IOINF_MIN_INFLIGHT)
+	inf->scale = clamp(inf->scale, MIN_SCALE, MAX_SCALE);
+	inf->inflight = inf->params.inflight * inf->scale / SCALE_GRAN;
+	if (inf->inflight < IOINF_MIN_INFLIGHT) {
 		inf->inflight = IOINF_MIN_INFLIGHT;
+		inf->scale = inf->inflight * SCALE_GRAN / inf->params.inflight;
+	}
 
 	if (new_online >= inf->inflight)
 		new_offline = min(new_offline, IOINFG_MIN_INFLIGHT);
@@ -504,15 +565,32 @@ static void ioinf_timer_fn(struct timer_list *timer)
 
 	online_budget = ioinf_calc_budget(online);
 	offline_budget = ioinf_calc_budget(offline);
+
+	if (online->exhausted) {
+		offline_budget = min(offline_budget, IOINFG_MIN_INFLIGHT);
+		inflight_force_scale_up(inf, online_budget + offline_budget);
+	} else if (!online_budget) {
+		inflight_scale_up(inf, offline_budget);
+	} else if (inf->old_scale < inf->scale) {
+		inflight_scale_down(inf, online_budget + offline->hinflight);
+	}
+
 	ioinf_update_inflight(inf, online_budget, offline_budget);
 
 	spin_unlock_irqrestore(&inf->lock, flags);
 	mod_timer(&inf->inf_timer, jiffies + inf->inf_timer_perid);
 }
 
-static u32 ioinf_default_inflight(struct gendisk *disk)
+static u32 ioinf_default_inflight(struct ioinf *inf)
 {
-	return max(disk->queue->nr_requests, IOINF_MIN_INFLIGHT);
+	u32 inflight = inf->params.inflight * DFL_SCALE / SCALE_GRAN;
+
+	if (inflight < IOINF_MIN_INFLIGHT)
+		inflight = IOINF_MIN_INFLIGHT;
+	inf->scale = DIV_ROUND_UP(inflight * SCALE_GRAN, inf->params.inflight);
+	inf->old_scale = inf->scale;
+
+	return inf->params.inflight * inf->scale / SCALE_GRAN;
 }
 
 static int blk_ioinf_init(struct gendisk *disk)
@@ -531,8 +609,8 @@ static int blk_ioinf_init(struct gendisk *disk)
 	}
 
 	spin_lock_init(&inf->lock);
-	inf->params.inflight = ioinf_default_inflight(disk);
-	inf->inflight = inf->params.inflight;
+	inf->params.inflight = disk->queue->nr_requests;
+	inf->inflight = ioinf_default_inflight(inf);
 	inf->inf_timer_perid = IOINF_TIMER_PERID;
 	inf->offline.hinflight = IOINFG_MIN_INFLIGHT;
 	rq_wait_init(&inf->offline.rqw);
@@ -724,6 +802,8 @@ static ssize_t ioinf_qos_write(struct kernfs_open_file *of, char *input,
 		inf->params = params;
 		if (inf->inflight != params.inflight) {
 			spin_lock_irq(&inf->lock);
+			inf->scale = SCALE_GRAN;
+			inf->old_scale = SCALE_GRAN;
 			ioinf_update_inflight(inf, inf->online.hinflight,
 					      inf->offline.hinflight);
 			spin_unlock_irq(&inf->lock);
-- 
2.46.1
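The effective limit is derived from the configured value through a scale
factor expressed in thousandths, so the advertised 0.1% to 100x range
falls directly out of the MIN_SCALE/MAX_SCALE clamp. A standalone sketch
of the arithmetic, using the constants from this patch:

	#include <stdint.h>

	#define MIN_SCALE	1	/* 0.1% of the configured inflight */
	#define SCALE_GRAN	1000
	#define MAX_SCALE	100000	/* 100x the configured inflight */

	static uint32_t effective_inflight(uint32_t configured, uint32_t scale)
	{
		if (scale < MIN_SCALE)
			scale = MIN_SCALE;
		if (scale > MAX_SCALE)
			scale = MAX_SCALE;
		/* e.g. configured=64, scale=100 (the default) -> limit 6,
		 * which the kernel then raises to IOINF_MIN_INFLIGHT if needed */
		return configured * scale / SCALE_GRAN;
	}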

[PATCH OLK-6.6 4/5] blk-ioinf: support percentile latency QoS for online workloads

Online (latency-sensitive) workloads often require strict latency
guarantees to maintain service responsiveness. To support this, introduce
a qos_enable switch that allows users to specify percentile-based latency
targets (e.g., 95% of write IOs < 1ms). Read and write latencies are
monitored separately. If either latency target is violated, the inflight
limit will be reduced to throttle IO pressure until the latency
requirements are met.

Signed-off-by: Baokun Li <libaokun1@huawei.com>
---
 block/blk-ioinf.c | 141 +++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 127 insertions(+), 14 deletions(-)

diff --git a/block/blk-ioinf.c b/block/blk-ioinf.c
index 9c771a6f276a..db039e3c13fa 100644
--- a/block/blk-ioinf.c
+++ b/block/blk-ioinf.c
@@ -20,10 +20,13 @@
 #define IOINFG_MIN_INFLIGHT	1
 /* default wake-up time in jiffies for background job, see ioinf_timer_fn() */
 #define IOINF_TIMER_PERID	500
+/* minimal number of samples for congestion control */
+#define IOINF_MIN_SAMPLES	100
 
 /* scale inflight from 1/1000 to 100 */
 enum {
 	MIN_SCALE = 1,		/* one thousandth. */
+	SCALE_THRESH = 3,	/* Regulate scale threshold. */
 	DFL_SCALE = 100,	/* one tenth. */
 	SCALE_GRAN = 1000,	/* The control granularity is 1/1000. */
 	MAX_SCALE = 100000,	/* A hundredfold. */
@@ -34,18 +37,30 @@ enum {
 	INF_ENABLE,
 	INF_INFLIGHT,
 
+	QOS_ENABLE,
+	QOS_RLAT,
+	QOS_WLAT,
+	QOS_RPCT,
+	QOS_WPCT,
+
 	NR_QOS_CTRL_PARAMS,
 };
 
 /* qos control params */
 struct ioinf_params {
 	bool enabled;
+	bool qos_enabled;
 	u32 inflight;
+	u64 rlat;
+	u64 wlat;
+	u32 rpct;
+	u32 wpct;
 };
 
 struct ioinf_io_stat {
 	u64 nr;
 	u64 lat;
+	u64 met;
 };
 
 struct ioinf_lat_stat {
@@ -70,6 +85,7 @@ struct ioinf {
 	u32 inflight;
 	u32 scale;
 	u32 old_scale;
+	u32 max_scale;
 
 	/* default time for ioinf_timer_fn */
 	unsigned long inf_timer_perid;
@@ -84,6 +100,7 @@ struct ioinf {
 	struct ioinf_rq_wait online;
 
 	struct ioinf_lat_stat last_stat;
+	struct ioinf_lat_stat cur_stat;
 	struct ioinf_lat_stat delta_stat;
 	struct ioinf_lat_stat __percpu *stat;
 };
@@ -276,7 +293,7 @@ static bool ioinf_inflight_cb(struct rq_wait *rqw, void *private_data)
 
 	ioinf_rqw->exhausted++;
 	/* wake up ioinf_timer_fn() immediately to adjust scale */
-	if (inf->scale < MAX_SCALE)
+	if (inf->scale < inf->max_scale)
 		timer_reduce(&inf->inf_timer, jiffies + 1);
 	return false;
 }
@@ -336,10 +353,14 @@ static void ioinf_record_lat(struct ioinf *inf, struct request *rq)
 	case REQ_OP_READ:
 		this_cpu_inc(inf->stat->read.nr);
 		this_cpu_add(inf->stat->read.lat, lat);
+		if (inf->params.qos_enabled && lat <= inf->params.rlat)
+			this_cpu_inc(inf->stat->read.met);
 		break;
 	case REQ_OP_WRITE:
 		this_cpu_inc(inf->stat->write.nr);
 		this_cpu_add(inf->stat->write.lat, lat);
+		if (inf->params.qos_enabled && lat <= inf->params.wlat)
+			this_cpu_inc(inf->stat->write.met);
 		break;
 	default:
 		break;
@@ -407,8 +428,9 @@ static int ioinf_lat_show(void *data, struct seq_file *m)
 	struct ioinf *inf = rqos_to_inf(rqos);
 	struct ioinf_lat_stat *stat = &inf->delta_stat;
 
-	seq_printf(m, "online average latency: (%llu-%llu) (%llu-%llu)\n",
-		   stat->read.nr, stat->read.lat, stat->write.nr, stat->write.lat);
+	seq_printf(m, "online average latency: (%llu/%llu-%llu) (%llu/%llu-%llu)\n",
+		   stat->read.met, stat->read.nr, stat->read.lat,
+		   stat->write.met, stat->write.nr, stat->write.lat);
 
 	return 0;
 }
@@ -458,7 +480,7 @@ static void inflight_force_scale_up(struct ioinf *inf, u32 aim)
 	__inflight_scale_up(inf, aim, true);
 }
 
-static void inflight_scale_down(struct ioinf *inf, u32 aim)
+static void __inflight_scale_down(struct ioinf *inf, u32 aim, bool force)
 {
 	u32 new_scale;
 
@@ -467,12 +489,25 @@ static void inflight_scale_down(struct ioinf *inf, u32 aim)
 		return;
 
 	new_scale = DIV_ROUND_UP(aim * SCALE_GRAN, inf->params.inflight);
-	if (new_scale >= inf->old_scale)
-		return;
+	if (new_scale >= inf->old_scale) {
+		if (!force)
+			return;
+		new_scale = inf->scale - 1;
+	}
 
 	inf->scale = new_scale;
 }
 
+static void inflight_scale_down(struct ioinf *inf, u32 aim)
+{
+	__inflight_scale_down(inf, aim, false);
+}
+
+static void inflight_force_scale_down(struct ioinf *inf, u32 aim)
+{
+	__inflight_scale_down(inf, aim, true);
+}
+
 u32 ioinf_calc_budget(struct ioinf_rq_wait *ioinf_rqw)
 {
 	u32 new_budget;
@@ -494,8 +529,10 @@ static void ioinf_sample_cpu_lat(struct ioinf_lat_stat *cur, int cpu,
 
 	cur->read.nr += pstat->read.nr;
 	cur->read.lat += pstat->read.lat;
+	cur->read.met += pstat->read.met;
 	cur->write.nr += pstat->write.nr;
 	cur->write.lat += pstat->write.lat;
+	cur->write.met += pstat->write.met;
 }
 
 static struct ioinf_lat_stat ioinf_calc_stat(struct ioinf_lat_stat *cur,
@@ -504,11 +541,13 @@ static struct ioinf_lat_stat ioinf_calc_stat(struct ioinf_lat_stat *cur,
 	struct ioinf_lat_stat delta = {0};
 
 	delta.read.nr = cur->read.nr - last->read.nr;
+	delta.read.met = cur->read.met - last->read.met;
 	delta.read.lat = cur->read.lat - last->read.lat;
 	if (delta.read.nr > 0)
 		delta.read.lat = delta.read.lat / delta.read.nr;
 
 	delta.write.nr = cur->write.nr - last->write.nr;
+	delta.write.met = cur->write.met - last->write.met;
 	delta.write.lat = cur->write.lat - last->write.lat;
 	if (delta.write.nr > 0)
 		delta.write.lat = delta.write.lat / delta.write.nr;
@@ -518,13 +557,37 @@ static struct ioinf_lat_stat ioinf_calc_stat(struct ioinf_lat_stat *cur,
 
 static void ioinf_sample_lat(struct ioinf *inf)
 {
-	struct ioinf_lat_stat cur = {0};
 	int cpu;
 
 	for_each_possible_cpu(cpu)
-		ioinf_sample_cpu_lat(&cur, cpu, inf->stat);
-	inf->delta_stat = ioinf_calc_stat(&cur, &inf->last_stat);
-	inf->last_stat = cur;
+		ioinf_sample_cpu_lat(&inf->cur_stat, cpu, inf->stat);
+	inf->delta_stat = ioinf_calc_stat(&inf->cur_stat, &inf->last_stat);
+}
+
+static int ioinf_online_busy(struct ioinf *inf)
+{
+	struct ioinf_lat_stat *stat;
+	int met_percent, unmet_percent = 0;
+
+	if (!inf->params.qos_enabled) {
+		inf->last_stat = inf->cur_stat;
+		return unmet_percent;
+	}
+
+	stat = &inf->delta_stat;
+	if (stat->read.nr >= IOINF_MIN_SAMPLES) {
+		inf->last_stat.read = inf->cur_stat.read;
+		met_percent = stat->read.met * 100 / stat->read.nr;
+		unmet_percent = inf->params.rpct - met_percent;
+	}
+	if (stat->write.nr >= IOINF_MIN_SAMPLES) {
+		inf->last_stat.write = inf->cur_stat.write;
+		met_percent = stat->write.met * 100 / stat->write.nr;
+		if (unmet_percent < inf->params.wpct - met_percent)
+			unmet_percent = inf->params.wpct - met_percent;
+	}
+
+	return unmet_percent;
 }
 
 static
@@ -559,16 +622,30 @@ static void ioinf_timer_fn(struct timer_list *timer)
 	struct ioinf_rq_wait *offline = &inf->offline;
 	unsigned long flags;
 	u32 online_budget, offline_budget;
+	int unmet_percent;
 
-	ioinf_sample_lat(inf);
 	spin_lock_irqsave(&inf->lock, flags);
 
+	ioinf_sample_lat(inf);
+	unmet_percent = ioinf_online_busy(inf);
 	online_budget = ioinf_calc_budget(online);
 	offline_budget = ioinf_calc_budget(offline);
 
-	if (online->exhausted) {
+	if (unmet_percent < -SCALE_THRESH && inf->max_scale < MAX_SCALE)
+		inf->max_scale++;
+
+	if (unmet_percent > 0) {
+		inf->max_scale = clamp(inf->scale - 1, MIN_SCALE, MAX_SCALE);
+		offline_budget = min(offline_budget, IOINFG_MIN_INFLIGHT);
+		online_budget = online->hinflight;
+		online_budget -= online_budget * unmet_percent / 100;
+		online_budget = max(online_budget, IOINFG_MIN_INFLIGHT);
+		inflight_force_scale_down(inf, online_budget + offline_budget);
+	} else if (inf->scale < inf->max_scale && online->exhausted) {
 		offline_budget = min(offline_budget, IOINFG_MIN_INFLIGHT);
 		inflight_force_scale_up(inf, online_budget + offline_budget);
+		if (inf->scale > inf->max_scale)
+			inf->scale = (inf->old_scale + inf->max_scale + 1) / 2;
 	} else if (!online_budget) {
 		inflight_scale_up(inf, offline_budget);
 	} else if (inf->old_scale < inf->scale) {
@@ -611,6 +688,7 @@ static int blk_ioinf_init(struct gendisk *disk)
 	spin_lock_init(&inf->lock);
 	inf->params.inflight = disk->queue->nr_requests;
 	inf->inflight = ioinf_default_inflight(inf);
+	inf->max_scale = MAX_SCALE;
 	inf->inf_timer_perid = IOINF_TIMER_PERID;
 	inf->offline.hinflight = IOINFG_MIN_INFLIGHT;
 	rq_wait_init(&inf->offline.rqw);
@@ -709,8 +787,12 @@ static u64 ioinf_qos_prfill(struct seq_file *sf, struct blkg_policy_data *pd,
 		return 0;
 
 	params = inf->params;
-	seq_printf(sf, "%s enable=%d inflight=%u", dname,
-		   params.enabled, params.inflight);
+	seq_printf(sf, "%s enable=%d inflight=%u qos_enable=%d", dname,
+		   params.enabled, params.inflight, params.qos_enabled);
+
+	if (inf->params.qos_enabled)
+		seq_printf(sf, " rlat=%llu rpct=%u wlat=%llu wpct=%u",
+			   params.rlat, params.rpct, params.wlat, params.wpct);
 
 	seq_putc(sf, '\n');
 	return 0;
@@ -728,6 +810,11 @@ static int ioinf_qos_show(struct seq_file *sf, void *v)
 static const match_table_t qos_ctrl_tokens = {
 	{ INF_ENABLE, "enable=%u" },
 	{ INF_INFLIGHT, "inflight=%u" },
+	{ QOS_ENABLE, "qos_enable=%u" },
+	{ QOS_RLAT, "rlat=%u" },
+	{ QOS_WLAT, "wlat=%u" },
+	{ QOS_RPCT, "rpct=%u" },
+	{ QOS_WPCT, "wpct=%u" },
 	{ NR_QOS_CTRL_PARAMS, NULL },
 };
 
@@ -776,6 +863,31 @@ static ssize_t ioinf_qos_write(struct kernfs_open_file *of, char *input,
 				goto einval;
 			params.inflight = v;
 			continue;
+		case QOS_ENABLE:
+			if (match_u64(&args[0], &v))
+				goto einval;
+			params.qos_enabled = !!v;
+			continue;
+		case QOS_RLAT:
+			if (match_u64(&args[0], &v) || v == 0)
+				goto einval;
+			params.rlat = v;
+			continue;
+		case QOS_WLAT:
+			if (match_u64(&args[0], &v) || v == 0)
+				goto einval;
+			params.wlat = v;
+			continue;
+		case QOS_RPCT:
+			if (match_u64(&args[0], &v) || v > 100)
+				goto einval;
+			params.rpct = v;
+			continue;
+		case QOS_WPCT:
+			if (match_u64(&args[0], &v) || v > 100)
+				goto einval;
+			params.wpct = v;
+			continue;
 		default:
 			goto einval;
 		}
@@ -808,6 +920,7 @@ static ssize_t ioinf_qos_write(struct kernfs_open_file *of, char *input,
 				      inf->offline.hinflight);
 			spin_unlock_irq(&inf->lock);
 		}
+		inf->max_scale = MAX_SCALE;
 	}
 
 	blkg_conf_exit(&ctx);
-- 
2.46.1
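For reference, the target from the commit message ("95% of write IOs <
1ms") would be expressed through the token table above as a root-cgroup
write such as "253:0 enable=1 qos_enable=1 wlat=1000000 wpct=95" (the
device number is an example; rlat/wlat are in nanoseconds, matching the
blk_time_get_ns()-based accounting). The violation check itself reduces
to simple integer math; a standalone sketch of ioinf_online_busy()'s core:

	#include <stdint.h>

	/*
	 * >0 means the percentile target was missed by that many
	 * percentage points, and the inflight limit is scaled down
	 * proportionally; <=0 means the target is currently met.
	 */
	static int unmet_percent(uint64_t nr, uint64_t met, int target_pct)
	{
		if (nr < 100)	/* IOINF_MIN_SAMPLES: too few samples */
			return 0;
		return target_pct - (int)(met * 100 / nr);
	}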

[PATCH OLK-6.6 5/5] blk-ioinf: support per-online-cgroup weights for priority control

Support assigning different weights to online cgroups to provide
priority differentiation and finer-grained control.

Signed-off-by: Baokun Li <libaokun1@huawei.com>
---
 block/blk-ioinf.c | 369 ++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 353 insertions(+), 16 deletions(-)

diff --git a/block/blk-ioinf.c b/block/blk-ioinf.c
index db039e3c13fa..2577bee4893e 100644
--- a/block/blk-ioinf.c
+++ b/block/blk-ioinf.c
@@ -9,6 +9,7 @@
 #include <linux/time64.h>
 #include <linux/parser.h>
 #include <linux/blk-cgroup.h>
+#include <linux/list_sort.h>
 
 #include "blk-cgroup.h"
 #include "blk-rq-qos.h"
@@ -23,6 +24,10 @@
 /* minimal number of samples for congestion control */
 #define IOINF_MIN_SAMPLES	100
 
+bool online_weight = false;
+module_param(online_weight, bool, 0);
+MODULE_PARM_DESC(online_weight, "True if you want online weight, false if not");
+
 /* scale inflight from 1/1000 to 100 */
 enum {
 	MIN_SCALE = 1,		/* one thousandth. */
@@ -93,6 +98,10 @@ struct ioinf {
 
 	/* global lock */
 	spinlock_t lock;
+	/* list of active infgs */
+	struct list_head active_infgs;
+	/* The total user weight of all active online cgroups */
+	u32 total_weights;
 
 	/* for offline cgroups */
 	struct ioinf_rq_wait offline;
@@ -112,6 +121,27 @@ struct ioinf_gq {
 
 	/* configured by user */
 	u32 user_weight;
+	/* original weight ratio */
+	u32 weight;
+	/* normalized weight ratio */
+	u32 hweight;
+	/* normalized inflight */
+	u32 hinflight;
+	/* normalized goal inflight */
+	u32 goal;
+	/* accumulated precision loss or inflight spikes. */
+	int deficit;
+
+	/* head of the list is inf->active_infgs */
+	struct list_head active;
+	/* for each cgroup, IO beyond budget will wait here */
+	struct ioinf_rq_wait cg_rqw;
+
+	/* to calculate avgqu size */
+	struct ioinf_lat_stat last_stat;
+	struct ioinf_lat_stat cur_stat;
+	struct ioinf_lat_stat delta_stat;
+	struct ioinf_lat_stat __percpu *stat;
 };
 
 /* per cgroup structure, used to record default weight for all disks */
@@ -239,7 +269,7 @@ static void ioinf_set_hinflight(struct ioinf_rq_wait *ioinf_rqw, u32 new)
 	ioinf_rqw->hinflight = new;
 	ioinf_rqw->last_max = max(ioinf_rqw->last_max >> 1,
 				  ioinf_rqw->max_inflight);
-	ioinf_rqw->max_inflight = new >> 1;
+	ioinf_rqw->max_inflight = IOINFG_MIN_INFLIGHT;
 
 	if (new > old && wq_has_sleeper(&ioinf_rqw->rqw.wait))
 		wake_up_all(&ioinf_rqw->rqw.wait);
@@ -256,6 +286,65 @@ void ioinf_done(struct ioinf_rq_wait *ioinf_rqw)
 		wake_up_all(&ioinf_rqw->rqw.wait);
 }
 
+/* Sort the active list by deficit, in descending order. */
+static int infgs_deficit_compare(void *priv, const struct list_head *a,
+				 const struct list_head *b)
+{
+	struct ioinf_gq *infg1;
+	struct ioinf_gq *infg2;
+
+	infg1 = container_of(a, struct ioinf_gq, active);
+	infg2 = container_of(b, struct ioinf_gq, active);
+	if (infg1->deficit < infg2->deficit)
+		return 1;
+	else if (infg1->deficit > infg2->deficit)
+		return -1;
+	return 0;
+}
+
+static void infgs_propagate_weights(struct ioinf *inf)
+{
+	struct ioinf_gq *infg;
+	struct ioinf_rq_wait *online;
+	int total, left;
+	int hinflight;
+
+	if (!online_weight || list_empty(&inf->active_infgs))
+		return;
+
+	online = &inf->online;
+	total = online->hinflight;
+	left = online->hinflight;
+	list_for_each_entry(infg, &inf->active_infgs, active) {
+		infg->weight = infg_user_weight(infg) * SCALE_GRAN /
+			       inf->total_weights;
+		hinflight = total * infg->weight / SCALE_GRAN;
+
+		/* Record the precision loss. */
+		infg->deficit += total * infg->weight - hinflight * SCALE_GRAN;
+		/* Distribute some of the overused budget to others. */
+		while (hinflight > IOINFG_MIN_INFLIGHT &&
+		       infg->deficit < -SCALE_GRAN) {
+			hinflight--;
+			infg->deficit += SCALE_GRAN;
+		}
+
+		infg->hinflight = hinflight;
+		left -= hinflight;
+	}
+
+	list_sort(NULL, &inf->active_infgs, infgs_deficit_compare);
+	list_for_each_entry(infg, &inf->active_infgs, active) {
+		if (left > 0 && infg->deficit > 0) {
+			left--;
+			infg->deficit -= SCALE_GRAN;
+			infg->hinflight++;
+		}
+		infg->hweight = infg->hinflight * SCALE_GRAN / online->hinflight;
+		ioinf_set_hinflight(&infg->cg_rqw, infg->hinflight);
+	}
+}
+
 static bool ioinf_inflight_cb(struct rq_wait *rqw, void *private_data)
 {
 	struct ioinf_rq_wait *ioinf_rqw = rqw_to_ioinf_rqw(rqw);
@@ -268,6 +357,7 @@ static bool ioinf_inflight_cb(struct rq_wait *rqw, void *private_data)
 	inflight = atomic_inc_below_return(&rqw->inflight, limit);
 	if (inflight > ioinf_rqw->max_inflight)
 		ioinf_rqw->max_inflight = inflight;
+
 	if (inflight <= limit) {
 		ioinf_rqw->issued++;
 		return true;
@@ -285,13 +375,23 @@ static bool ioinf_inflight_cb(struct rq_wait *rqw, void *private_data)
 				    inf->offline.hinflight >> 1);
 		ioinf_set_hinflight(&inf->online,
 				    inf->inflight - inf->offline.hinflight);
+
+		/* Distribute the reclaimed inflight. */
+		infgs_propagate_weights(inf);
 		spin_unlock_irq(&inf->lock);
 	}
 
 	if (ioinf_rqw->hinflight > limit)
 		goto retry;
 
+	/*
+	 * Once a cgroup successfully acquires an inflight, subsequent online
+	 * inflight acquisitions are guaranteed to succeed. Therefore, we count
+	 * online.exhausted here.
+	 */
 	ioinf_rqw->exhausted++;
+	if (ioinf_rqw != &inf->online)
+		inf->online.exhausted++;
 	/* wake up ioinf_timer_fn() immediately to adjust scale */
 	if (inf->scale < inf->max_scale)
 		timer_reduce(&inf->inf_timer, jiffies + 1);
@@ -305,6 +405,32 @@ static void ioinf_cleanup_cb(struct rq_wait *rqw, void *private_data)
 	ioinf_done(ioinf_rqw);
 }
 
+static void ioinf_activate_infg(struct ioinf_gq *infg)
+{
+	struct ioinf *inf = infg->inf;
+
+	spin_lock_irq(&inf->lock);
+	if (list_empty(&infg->active)) {
+		list_add(&infg->active, &inf->active_infgs);
+		inf->total_weights += infg_user_weight(infg);
+		infgs_propagate_weights(inf);
+	}
+	spin_unlock_irq(&inf->lock);
+}
+
+static void ioinf_deactivate_infg(struct ioinf_gq *infg)
+{
+	struct ioinf *inf = infg->inf;
+
+	spin_lock_irq(&inf->lock);
+	if (!list_empty(&infg->active)) {
+		list_del_init(&infg->active);
+		inf->total_weights -= infg_user_weight(infg);
+		infgs_propagate_weights(inf);
+	}
+	spin_unlock_irq(&inf->lock);
+}
+
 static void ioinf_throttle(struct ioinf *inf, struct ioinf_rq_wait *ioinf_rqw)
 {
 	rq_qos_wait(&ioinf_rqw->rqw, inf, ioinf_inflight_cb,
@@ -325,10 +451,17 @@ static void ioinf_rqos_throttle(struct rq_qos *rqos, struct bio *bio)
 	if (!inf->params.enabled || !infg)
 		return;
 
-	if (infg_offline(infg))
+	if (infg_offline(infg)) {
 		ioinf_throttle(inf, &inf->offline);
-	else
-		ioinf_throttle(inf, &inf->online);
+		return;
+	}
+
+	if (online_weight) {
+		if (list_empty_careful(&infg->active))
+			ioinf_activate_infg(infg);
+		ioinf_throttle(inf, &infg->cg_rqw);
+	}
+	ioinf_throttle(inf, &inf->online);
 }
 
 static void ioinf_rqos_track(struct rq_qos *rqos, struct request *rq,
@@ -342,9 +475,10 @@ static void ioinf_rqos_track(struct rq_qos *rqos, struct request *rq,
 	rq->blkg = blkg;
 }
 
-static void ioinf_record_lat(struct ioinf *inf, struct request *rq)
+static void ioinf_record_lat(struct ioinf_gq *infg, struct request *rq)
 {
 	u64 lat;
+	struct ioinf *inf = infg->inf;
 
 	lat = rq->io_end_time_ns ? rq->io_end_time_ns : blk_time_get_ns();
 	lat -= rq->alloc_time_ns;
@@ -353,14 +487,22 @@ static void ioinf_record_lat(struct ioinf *inf, struct request *rq)
 	case REQ_OP_READ:
 		this_cpu_inc(inf->stat->read.nr);
 		this_cpu_add(inf->stat->read.lat, lat);
-		if (inf->params.qos_enabled && lat <= inf->params.rlat)
+		this_cpu_inc(infg->stat->read.nr);
+		this_cpu_add(infg->stat->read.lat, lat);
+		if (inf->params.qos_enabled && lat <= inf->params.rlat) {
 			this_cpu_inc(inf->stat->read.met);
+			this_cpu_inc(infg->stat->read.met);
+		}
 		break;
 	case REQ_OP_WRITE:
 		this_cpu_inc(inf->stat->write.nr);
 		this_cpu_add(inf->stat->write.lat, lat);
-		if (inf->params.qos_enabled && lat <= inf->params.wlat)
+		this_cpu_inc(infg->stat->write.nr);
+		this_cpu_add(infg->stat->write.lat, lat);
+		if (inf->params.qos_enabled && lat <= inf->params.wlat) {
 			this_cpu_inc(inf->stat->write.met);
+			this_cpu_inc(infg->stat->write.met);
+		}
 		break;
 	default:
 		break;
@@ -376,16 +518,34 @@ static void ioinf_rqos_done(struct rq_qos *rqos, struct request *rq)
 	if (!blkg)
 		return;
 
+	rq->blkg = NULL;
 	infg = blkg_to_infg(blkg);
 	inf = infg->inf;
+
 	if (infg_offline(infg)) {
 		ioinf_done(&inf->offline);
-	} else {
-		ioinf_done(&inf->online);
-		ioinf_record_lat(inf, rq);
+		return;
 	}
 
-	rq->blkg = NULL;
+	ioinf_done(&inf->online);
+	if (online_weight)
+		ioinf_done(&infg->cg_rqw);
+	ioinf_record_lat(infg, rq);
+}
+
+static void ioinf_rqos_cleanup(struct rq_qos *rqos, struct bio *bio)
+{
+	struct ioinf_gq *infg;
+
+	if (!online_weight)
+		return;
+
+	infg = ioinf_bio_infg(bio);
+	if (!infg || infg->inf->params.enabled ||
+	    list_empty_careful(&infg->active))
+		return;
+
+	ioinf_done(&infg->cg_rqw);
 }
 
 static void ioinf_rqos_exit(struct rq_qos *rqos)
@@ -399,6 +559,26 @@ static void ioinf_rqos_exit(struct rq_qos *rqos)
 	kfree(inf);
 }
 
+static void infgs_stat_show(struct ioinf *inf, struct seq_file *m)
+{
+	struct ioinf_gq *infg;
+	char path[32];
+	struct ioinf_rq_wait *cg_rqw;
+
+	list_for_each_entry(infg, &inf->active_infgs, active) {
+		blkg_path(infg_to_blkg(infg), path, sizeof(path));
+		seq_printf(m, "%s: weight (%u->(%u->%u)/%d %d)", path,
+			   infg_user_weight(infg), infg->weight, infg->hweight,
+			   SCALE_GRAN, infg->deficit);
+
+		cg_rqw = &infg->cg_rqw;
+		seq_printf(m, " inflight %d/(%u->%u) %u->%u\n",
+			   atomic_read(&cg_rqw->rqw.inflight),
+			   infg->hinflight, cg_rqw->hinflight,
+			   cg_rqw->last_max, cg_rqw->max_inflight);
+	}
+}
+
 static int ioinf_stat_show(void *data, struct seq_file *m)
 {
 	struct rq_qos *rqos = data;
@@ -417,6 +597,7 @@ static int ioinf_stat_show(void *data, struct seq_file *m)
 		   atomic_read(&inf->offline.rqw.inflight),
 		   inf->offline.hinflight);
 
+	infgs_stat_show(inf, m);
 	spin_unlock_irq(&inf->lock);
 
 	return 0;
@@ -426,12 +607,25 @@ static int ioinf_lat_show(void *data, struct seq_file *m)
 {
 	struct rq_qos *rqos = data;
 	struct ioinf *inf = rqos_to_inf(rqos);
-	struct ioinf_lat_stat *stat = &inf->delta_stat;
+	struct ioinf_lat_stat *stat;
+	struct ioinf_gq *infg;
+	char path[32];
 
+	spin_lock_irq(&inf->lock);
+	stat = &inf->delta_stat;
 	seq_printf(m, "online average latency: (%llu/%llu-%llu) (%llu/%llu-%llu)\n",
 		   stat->read.met, stat->read.nr, stat->read.lat,
 		   stat->write.met, stat->write.nr, stat->write.lat);
 
+	list_for_each_entry(infg, &inf->active_infgs, active) {
+		stat = &infg->delta_stat;
+		blkg_path(infg_to_blkg(infg), path, sizeof(path));
+		seq_printf(m, "%s average latency: (%llu/%llu-%llu) (%llu/%llu-%llu)\n",
+			   path, stat->read.met, stat->read.nr, stat->read.lat,
+			   stat->write.met, stat->write.nr, stat->write.lat);
+	}
+	spin_unlock_irq(&inf->lock);
+
 	return 0;
 }
 
@@ -445,6 +639,7 @@ static struct rq_qos_ops ioinf_rqos_ops = {
 	.throttle = ioinf_rqos_throttle,
 	.done = ioinf_rqos_done,
 	.track = ioinf_rqos_track,
+	.cleanup = ioinf_rqos_cleanup,
 	.exit = ioinf_rqos_exit,
 
 #ifdef CONFIG_BLK_DEBUG_FS
@@ -522,6 +717,70 @@ u32 ioinf_calc_budget(struct ioinf_rq_wait *ioinf_rqw)
 	return new_budget;
 }
 
+static u32 adjust_budget_by_primary(struct ioinf *inf, struct ioinf_gq *infg)
+{
+	u32 online_budget = ioinf_calc_budget(&inf->online);
+
+	if (!infg->cg_rqw.exhausted) {
+		u32 inflight;
+
+		infg->weight = infg_user_weight(infg) * SCALE_GRAN /
+			       inf->total_weights;
+		inflight = DIV_ROUND_UP(infg->goal * SCALE_GRAN, infg->weight);
+		inf->max_scale = DIV_ROUND_UP(inflight * SCALE_GRAN,
+					      inf->params.inflight);
+		inf->max_scale = clamp(inf->max_scale, MIN_SCALE, MAX_SCALE);
+		inflight = inf->params.inflight * inf->max_scale / SCALE_GRAN;
+		online_budget = min(online_budget, inflight);
+	}
+	if (infg->cg_rqw.exhausted || inf->old_scale < inf->scale) {
+		online_budget = inf->online.hinflight * infg->goal /
+				infg->hinflight;
+		inf->max_scale = inf->max_scale * infg->goal / infg->hinflight;
+		inf->max_scale = clamp(inf->max_scale, MIN_SCALE, MAX_SCALE);
+	}
+
+	return online_budget;
+}
+
+u32 ioinf_calc_online_budget(struct ioinf *inf, u32 *exhausted)
+{
+	struct ioinf_gq *infg, *tmp, *primary = NULL;
+	u32 max_weight = 0;
+
+	if (!online_weight || list_empty(&inf->active_infgs))
+		return ioinf_calc_budget(&inf->online);
+
+	list_for_each_entry_safe(infg, tmp, &inf->active_infgs, active) {
+		int max_inflight = infg->cg_rqw.max_inflight;
+
+		infg->goal = ioinf_calc_budget(&infg->cg_rqw);
+		if (!infg->goal && !wq_has_sleeper(&infg->cg_rqw.rqw.wait)) {
+			list_del_init(&infg->active);
+			inf->total_weights -= infg_user_weight(infg);
+			infg->deficit = 0;
+			continue;
+		}
+
+		/* Some high-priority I/Os may exceed the budget. */
+		if (max_inflight > infg->hinflight + 1) {
+			int deficit = (int)infg->hinflight + 1 - max_inflight;
+
+			infg->deficit += deficit * SCALE_GRAN;
+		}
+
+		if (infg->weight < max_weight)
+			continue;
+
+		if (infg->weight > max_weight || infg->goal > primary->goal) {
+			primary = infg;
+			max_weight = infg->weight;
+			*exhausted = primary->cg_rqw.exhausted;
+		}
+	}
+
+	return adjust_budget_by_primary(inf, primary);
+}
+
 static void ioinf_sample_cpu_lat(struct ioinf_lat_stat *cur, int cpu,
 				 struct ioinf_lat_stat __percpu *stat)
 {
@@ -558,30 +817,50 @@ static struct ioinf_lat_stat ioinf_calc_stat(struct ioinf_lat_stat *cur,
 static void ioinf_sample_lat(struct ioinf *inf)
 {
 	int cpu;
+	struct ioinf_gq *infg;
 
-	for_each_possible_cpu(cpu)
+	for_each_possible_cpu(cpu) {
 		ioinf_sample_cpu_lat(&inf->cur_stat, cpu, inf->stat);
+		list_for_each_entry(infg, &inf->active_infgs, active) {
+			ioinf_sample_cpu_lat(&infg->cur_stat, cpu, infg->stat);
+		}
+	}
+
 	inf->delta_stat = ioinf_calc_stat(&inf->cur_stat, &inf->last_stat);
+	list_for_each_entry(infg, &inf->active_infgs, active) {
+		infg->delta_stat = ioinf_calc_stat(&infg->cur_stat,
+						   &infg->last_stat);
+	}
 }
 
 static int ioinf_online_busy(struct ioinf *inf)
 {
 	struct ioinf_lat_stat *stat;
 	int met_percent, unmet_percent = 0;
+	struct ioinf_gq *infg;
 
 	if (!inf->params.qos_enabled) {
 		inf->last_stat = inf->cur_stat;
+		list_for_each_entry(infg, &inf->active_infgs, active) {
+			infg->last_stat = infg->cur_stat;
+		}
 		return unmet_percent;
 	}
 
 	stat = &inf->delta_stat;
 	if (stat->read.nr >= IOINF_MIN_SAMPLES) {
 		inf->last_stat.read = inf->cur_stat.read;
+		list_for_each_entry(infg, &inf->active_infgs, active) {
+			infg->last_stat.read = infg->cur_stat.read;
+		}
 		met_percent = stat->read.met * 100 / stat->read.nr;
 		unmet_percent = inf->params.rpct - met_percent;
 	}
 	if (stat->write.nr >= IOINF_MIN_SAMPLES) {
 		inf->last_stat.write = inf->cur_stat.write;
+		list_for_each_entry(infg, &inf->active_infgs, active) {
+			infg->last_stat.write = infg->cur_stat.write;
+		}
 		met_percent = stat->write.met * 100 / stat->write.nr;
 		if (unmet_percent < inf->params.wpct - met_percent)
 			unmet_percent = inf->params.wpct - met_percent;
@@ -590,6 +869,20 @@ static int ioinf_online_busy(struct ioinf *inf)
 	return unmet_percent;
 }
 
+static void infgs_update_inflight(struct ioinf *inf)
+{
+	struct ioinf_gq *infg;
+
+	if (!online_weight || list_empty(&inf->active_infgs))
+		return;
+
+	infgs_propagate_weights(inf);
+	list_for_each_entry(infg, &inf->active_infgs, active) {
+		infg->cg_rqw.exhausted = 0;
+		infg->cg_rqw.issued = 0;
+	}
+}
+
 static
 void ioinf_update_inflight(struct ioinf *inf, u32 new_online, u32 new_offline)
 {
@@ -611,6 +904,7 @@ void ioinf_update_inflight(struct ioinf *inf, u32 new_online, u32 new_offline)
 	inf->offline.issued = 0;
 
 	ioinf_set_hinflight(&inf->online, new_online);
+	infgs_update_inflight(inf);
 	inf->online.exhausted = 0;
 	inf->online.issued = 0;
 }
@@ -623,13 +917,14 @@ static void ioinf_timer_fn(struct timer_list *timer)
 	unsigned long flags;
 	u32 online_budget, offline_budget;
 	int unmet_percent;
+	u32 exhausted = online->exhausted;
 
 	spin_lock_irqsave(&inf->lock, flags);
 
 	ioinf_sample_lat(inf);
 	unmet_percent = ioinf_online_busy(inf);
-	online_budget = ioinf_calc_budget(online);
 	offline_budget = ioinf_calc_budget(offline);
+	online_budget = ioinf_calc_online_budget(inf, &exhausted);
 
 	if (unmet_percent < -SCALE_THRESH && inf->max_scale < MAX_SCALE)
 		inf->max_scale++;
@@ -641,7 +936,7 @@ static void ioinf_timer_fn(struct timer_list *timer)
 		online_budget -= online_budget * unmet_percent / 100;
 		online_budget = max(online_budget, IOINFG_MIN_INFLIGHT);
 		inflight_force_scale_down(inf, online_budget + offline_budget);
-	} else if (inf->scale < inf->max_scale && online->exhausted) {
+	} else if (inf->scale < inf->max_scale && exhausted) {
 		offline_budget = min(offline_budget, IOINFG_MIN_INFLIGHT);
 		inflight_force_scale_up(inf, online_budget + offline_budget);
 		if (inf->scale > inf->max_scale)
@@ -690,10 +985,13 @@ static int blk_ioinf_init(struct gendisk *disk)
 	inf->inflight = ioinf_default_inflight(inf);
 	inf->max_scale = MAX_SCALE;
 	inf->inf_timer_perid = IOINF_TIMER_PERID;
+
 	inf->offline.hinflight = IOINFG_MIN_INFLIGHT;
 	rq_wait_init(&inf->offline.rqw);
 	inf->online.hinflight = inf->inflight - IOINFG_MIN_INFLIGHT;
 	rq_wait_init(&inf->online.rqw);
+
+	INIT_LIST_HEAD(&inf->active_infgs);
 	timer_setup(&inf->inf_timer, ioinf_timer_fn, 0);
 
 	ret = rq_qos_add(&inf->rqos, disk, RQ_QOS_INFLIGHT, &ioinf_rqos_ops);
@@ -738,6 +1036,23 @@ static int ioinf_weight_show(struct seq_file *sf, void *v)
 	return 0;
 }
 
+static void infg_update_weight(struct ioinf_gq *infg, u32 new)
+{
+	u32 old;
+	struct ioinf *inf = infg->inf;
+
+	spin_lock_irq(&inf->lock);
+	old = infg_user_weight(infg);
+	infg->user_weight = new;
+	if (new != old && !list_empty(&infg->active)) {
+		if (new == 0)
+			list_del_init(&infg->active);
+		inf->total_weights = inf->total_weights - old + new;
+		infgs_propagate_weights(inf);
+	}
+	spin_unlock_irq(&inf->lock);
+}
+
 static ssize_t ioinf_weight_write(struct kernfs_open_file *of, char *buf,
 				  size_t nbytes, loff_t off)
 {
@@ -771,7 +1086,8 @@ static ssize_t ioinf_weight_write(struct kernfs_open_file *of, char *buf,
 		return -EINVAL;
 	}
 
-	infg->user_weight = v;
+	infg_update_weight(infg, v);
+
 	blkg_conf_exit(&ctx);
 	return nbytes;
 }
@@ -918,6 +1234,7 @@ static ssize_t ioinf_qos_write(struct kernfs_open_file *of, char *input,
 			inf->old_scale = SCALE_GRAN;
 			ioinf_update_inflight(inf, inf->online.hinflight,
 					      inf->offline.hinflight);
+			infgs_propagate_weights(inf);
 			spin_unlock_irq(&inf->lock);
 		}
 		inf->max_scale = MAX_SCALE;
@@ -989,6 +1306,12 @@ static struct blkg_policy_data *ioinf_pd_alloc(struct gendisk *disk,
 	if (!infg)
 		return NULL;
 
+	infg->stat = alloc_percpu_gfp(struct ioinf_lat_stat, GFP_ATOMIC);
+	if (!infg->stat) {
+		kfree(infg);
+		return NULL;
+	}
+
 	return &infg->pd;
 }
 
@@ -997,13 +1320,26 @@ static void ioinf_pd_init(struct blkg_policy_data *pd)
 	struct ioinf_gq *infg = pd_to_infg(pd);
 	struct blkcg_gq *blkg = pd_to_blkg(pd);
 
+	INIT_LIST_HEAD(&infg->active);
 	infg->inf = q_to_inf(blkg->q);
+	rq_wait_init(&infg->cg_rqw.rqw);
+	infg->cg_rqw.last_max = IOINFG_MIN_INFLIGHT;
+	infg->cg_rqw.max_inflight = IOINFG_MIN_INFLIGHT;
+}
+
+static void ioinf_pd_offline(struct blkg_policy_data *pd)
+{
+	struct ioinf_gq *infg = pd_to_infg(pd);
+
+	if (!list_empty_careful(&infg->active))
+		ioinf_deactivate_infg(infg);
 }
 
 static void ioinf_pd_free(struct blkg_policy_data *pd)
 {
 	struct ioinf_gq *infg = pd_to_infg(pd);
 
+	free_percpu(infg->stat);
 	kfree(infg);
 }
 
@@ -1016,6 +1352,7 @@ static struct blkcg_policy blkcg_policy_ioinf = {
 
 	.pd_alloc_fn = ioinf_pd_alloc,
 	.pd_init_fn = ioinf_pd_init,
+	.pd_offline_fn = ioinf_pd_offline,
 	.pd_free_fn = ioinf_pd_free,
 };
 
-- 
2.46.1
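The distribution in infgs_propagate_weights() behaves like deficit-based
weighted round-robin: integer division under-allocates each group by a
remainder, that loss accumulates in 'deficit', and leftover slots are
handed to the groups with the largest deficit after sorting. A simplified
standalone sketch with two groups (remainders tracked directly via modulo
instead of the patch's SCALE_GRAN fixed-point accounting, and without
list_sort):

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		uint32_t total = 10;		/* online hinflight budget */
		uint32_t weight[2] = { 2, 1 };	/* user weights 2:1 */
		uint32_t wsum = 3, give[2], used = 0;
		int deficit[2], i;

		for (i = 0; i < 2; i++) {
			/* integer share; the lost remainder becomes deficit */
			give[i] = total * weight[i] / wsum;
			deficit[i] = total * weight[i] % wsum;
			used += give[i];
		}
		/* hand leftover slots to groups still owed a share */
		for (i = 0; i < 2 && used < total; i++) {
			if (deficit[i] > 0) {
				give[i]++;
				used++;
			}
		}
		printf("%u %u\n", give[0], give[1]);	/* prints "7 3" */
		return 0;
	}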

FeedBack: The patch(es) you sent to the kernel@openeuler.org mailing list
have been successfully converted to a pull request!
Pull request link: https://gitee.com/openeuler/kernel/pulls/18101
Mailing list address: https://mailweb.openeuler.org/archives/list/kernel@openeuler.org/message/FCC...