hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/ID0C1D

--------------------------------

This patch introduces a new inflight-based IO QoS controller for
cgroups. The controller is designed to guarantee absolute priority of
online (latency-sensitive) workloads over offline (best-effort)
workloads.

If a cgroup's weight is greater than 0, it is marked as online and we
track its IO latency. If the weight is less than 0, the cgroup is
marked as offline; we do not track its IO latency, and online cgroups
may preempt the inflight budget of offline cgroups. This ensures that
LS workloads meet their latency requirements, while BE workloads can
opportunistically utilize idle bandwidth.

A weight of 0 means the weight is unset. By default, such cgroups are
also treated as offline. This default can be changed by setting a
flag; in that case, the QoS controller skips these cgroups' IO: no
control is applied and no latency is tracked.

Currently, the inflight limit is a fixed value. In subsequent patches,
we will implement dynamic adjustment of inflight limits and online
weights.

Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Signed-off-by: Baokun Li <libaokun1@huawei.com>
---
 block/Kconfig          |  10 +
 block/Makefile         |   1 +
 block/blk-ioinf.c      | 925 +++++++++++++++++++++++++++++++++++++++++
 block/blk-mq-debugfs.c |   2 +
 block/blk-rq-qos.h     |   1 +
 include/linux/blk-mq.h |   3 +
 6 files changed, 942 insertions(+)
 create mode 100644 block/blk-ioinf.c

diff --git a/block/Kconfig b/block/Kconfig
index 7018fdcaa459..1d338261b751 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -204,6 +204,16 @@ config BLK_CGROUP_LEGACY_IOCOST
 
 	  If unsure, say N.
 
+config BLK_CGROUP_IOINFLIGHT
+	bool "Enable support for inflight based cgroup IO controller"
+	depends on BLK_CGROUP
+	select BLK_RQ_ALLOC_TIME
+	help
+	  Enabling this option enables the .inf.qos interface for inflight
+	  based proportional IO control. The IO controller distributes IO
+	  capacity between different groups based on their share of the
+	  overall weight distribution.
+ config BLK_CGROUP_IOPRIO bool "Cgroup I/O controller for assigning an I/O priority class" depends on BLK_CGROUP diff --git a/block/Makefile b/block/Makefile index 400731b162c0..358599938757 100644 --- a/block/Makefile +++ b/block/Makefile @@ -21,6 +21,7 @@ obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o obj-$(CONFIG_BLK_CGROUP_IOPRIO) += blk-ioprio.o obj-$(CONFIG_BLK_CGROUP_IOLATENCY) += blk-iolatency.o obj-$(CONFIG_BLK_CGROUP_IOCOST) += blk-iocost.o +obj-$(CONFIG_BLK_CGROUP_IOINFLIGHT) += blk-ioinf.o obj-$(CONFIG_MQ_IOSCHED_DEADLINE) += mq-deadline.o obj-$(CONFIG_MQ_IOSCHED_KYBER) += kyber-iosched.o bfq-y := bfq-iosched.o bfq-wf2q.o bfq-cgroup.o diff --git a/block/blk-ioinf.c b/block/blk-ioinf.c new file mode 100644 index 000000000000..26c3efb60e02 --- /dev/null +++ b/block/blk-ioinf.c @@ -0,0 +1,925 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * IO inflight relative controller + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/timer.h> +#include <linux/time64.h> +#include <linux/parser.h> +#include <linux/blk-cgroup.h> + +#include "blk-cgroup.h" +#include "blk-rq-qos.h" +#include "blk-mq.h" + +/* default weight for each cgroup */ +#define IOINF_DFL_WEIGHT 0 +#define IOINF_MIN_INFLIGHT 30 +#define IOINFG_MIN_INFLIGHT 1 +/* default wake-up time in jiffies for backgroup job, see ioinf_timer_fn() */ +#define IOINF_TIMER_PERID (HZ / 2) + +/* io.inf.qos controls */ +enum { + INF_ENABLE, + INF_INFLIGHT, + INF_FLAGS, + + NR_QOS_CTRL_PARAMS, +}; + +/* qos control params */ +struct ioinf_params { + bool enabled; + u32 inflight; + unsigned long flags; +}; + +struct ioinf_rq_wait { + wait_queue_head_t *wait; + u32 wq_nr; + atomic_t next_wq; + atomic_t sleepers; + + atomic_t inflight; + u32 hinflight; + u32 max_inflight; + u32 last_max; + u32 exhausted; + u32 issued; +}; + +/* the global conrtol structure */ +struct ioinf { + struct rq_qos rqos; + + struct ioinf_params params; + u32 inflight; + + /* default time for ioinf_timer_fn */ + unsigned long inf_timer_perid; + struct timer_list inf_timer; + + /* global lock */ + spinlock_t lock; + + /* for offline cgroups */ + struct ioinf_rq_wait offline; + /* for online cgroups */ + struct ioinf_rq_wait online; + + /* timer for ioinf_wakeup_timer_fn */ + struct hrtimer wakeup_timer; + bool waking; +}; + +/* per disk-cgroup pair structure */ +struct ioinf_gq { + struct blkg_policy_data pd; + struct ioinf *inf; + + /* weight < 0: offline; weight > 0: online; weight == 0: unset */ + int user_weight; + int dfl_user_weight; +}; + +/* per cgroup structure, used to record default weight for all disks */ +struct ioinf_cgrp { + struct blkcg_policy_data cpd; + + /* weight < 0: offline; weight > 0: online; weight == 0: unset */ + int dfl_user_weight; +}; + +/* io-inflight flags */ +enum { + /* + * Cgroups with unset weight are not throttled and latency is not + * recorded. Without this flag, such cgroups are treated as offline. + */ + DEFAULT_NOLIMIT, + + NR_INF_FLAGS +}; + +static inline int inf_test_flag(struct ioinf *inf, int bit) +{ + return test_bit(bit, &inf->params.flags); +} + +static int infg_user_weight(struct ioinf_gq *infg) +{ + if (infg->user_weight) + return infg->user_weight; + + /* if user doesn't set per disk weight, use the cgroup default weight */ + if (infg->dfl_user_weight) + return infg->dfl_user_weight; + + /* No limit for Cgroups with unset weight */ + if (inf_test_flag(infg->inf, DEFAULT_NOLIMIT)) + return 0; + + /* Cgroups with unset weight are treated as offline. 
*/ + return -1; +} + +static bool infg_offline(struct ioinf_gq *infg) +{ + return infg_user_weight(infg) < 0; +} + +static bool infg_nolimit(struct ioinf_gq *infg) +{ + return infg_user_weight(infg) == 0; +} + +static struct ioinf *rqos_to_inf(struct rq_qos *rqos) +{ + return container_of(rqos, struct ioinf, rqos); +} + +static struct ioinf *q_to_inf(struct request_queue *q) +{ + return rqos_to_inf(rq_qos_id(q, RQ_QOS_INFLIGHT)); +} + +static struct ioinf_gq *pd_to_infg(struct blkg_policy_data *pd) +{ + if (!pd) + return NULL; + + return container_of(pd, struct ioinf_gq, pd); +} + +static struct blkcg_policy blkcg_policy_ioinf; + +static struct ioinf_gq *blkg_to_infg(struct blkcg_gq *blkg) +{ + return pd_to_infg(blkg_to_pd(blkg, &blkcg_policy_ioinf)); +} + +static struct ioinf_cgrp *blkcg_to_infcg(struct blkcg *blkcg) +{ + struct blkcg_policy_data *cpd = + blkcg_to_cpd(blkcg, &blkcg_policy_ioinf); + + return container_of(cpd, struct ioinf_cgrp, cpd); +} + +static struct blkcg_gq *ioinf_bio_blkg(struct bio *bio) +{ + struct blkcg_gq *blkg = bio->bi_blkg; + + if (!blkg || !blkg->online) + return NULL; + + if (blkg->blkcg->css.cgroup->level == 0) + return NULL; + + return blkg; +} + +static struct ioinf_gq *ioinf_bio_infg(struct bio *bio) +{ + struct ioinf_gq *infg; + struct blkcg_gq *blkg = ioinf_bio_blkg(bio); + + if (!blkg) + return NULL; + + infg = blkg_to_infg(blkg); + if (!infg) + return NULL; + + return infg; +} + +static void ioinf_set_hinflight(struct ioinf_rq_wait *rqw, u32 new) +{ + rqw->hinflight = new; + rqw->last_max = max(rqw->last_max >> 1, rqw->max_inflight); + rqw->max_inflight = IOINFG_MIN_INFLIGHT; +} + +static inline void ioinf_rqw_wake_up_all(struct ioinf_rq_wait *rqw) +{ + if (!atomic_read(&rqw->sleepers)) + return; + + for (int i = 0; i < rqw->wq_nr; i++) + wake_up_all(&rqw->wait[i]); +} + +static void ioinf_wake_up_all(struct ioinf *inf) +{ + ioinf_rqw_wake_up_all(&inf->online); + ioinf_rqw_wake_up_all(&inf->offline); +} + +static enum hrtimer_restart ioinf_wakeup_timer_fn(struct hrtimer *timer) +{ + struct ioinf *inf = container_of(timer, struct ioinf, wakeup_timer); + + inf->waking = false; + ioinf_wake_up_all(inf); + + return HRTIMER_NORESTART; +} + +void ioinf_done(struct ioinf *inf, struct ioinf_rq_wait *rqw) +{ + int inflight; + + if (!inf->params.enabled) + return; + + inflight = atomic_dec_if_positive(&rqw->inflight); + if (inflight && inflight >= (int)rqw->hinflight) + return; + + if (atomic_read(&rqw->sleepers) && !inf->waking) { + inf->waking = true; + hrtimer_start(&inf->wakeup_timer, 0, HRTIMER_MODE_REL); + } +} + +static bool ioinf_inflight_cb(struct ioinf_rq_wait *rqw, void *private_data, + bool do_wakeup) +{ + struct ioinf *inf = private_data; + u32 inflight; + u32 limit; + u32 sleepers; + + if (!inf->params.enabled) + return true; +retry: + limit = rqw->hinflight; + sleepers = do_wakeup ? 0 : atomic_read(&rqw->sleepers); + inflight = atomic_read(&rqw->inflight); + + if (inflight + sleepers < limit) { + inflight = atomic_inc_return(&rqw->inflight); + + if (inflight > rqw->max_inflight) + rqw->max_inflight = inflight; + rqw->issued++; + return true; + } + + rqw->max_inflight = max(rqw->max_inflight, inflight + 1); + if (rqw == &inf->offline) + goto exhausted; + + if (inf->offline.hinflight > IOINFG_MIN_INFLIGHT) { + /* Reclaim half of the inflight budget from offline groups. 
*/ + inf->offline.hinflight = inf->offline.hinflight >> 1; + inf->online.hinflight = inf->inflight - inf->offline.hinflight; + } + + if (rqw->hinflight > limit) + goto retry; + +exhausted: + rqw->exhausted++; + return false; +} + +static void ioinf_cleanup_cb(struct ioinf_rq_wait *rqw, void *private_data) +{ + ioinf_done(private_data, rqw); +} + +struct rq_qos_wait_data { + struct wait_queue_entry wq; + struct task_struct *task; + struct ioinf_rq_wait *rqw; + void *private_data; + bool got_token; +}; + +static int ioinf_wake_fn(struct wait_queue_entry *curr, + unsigned int mode, int wake_flags, void *key) +{ + struct rq_qos_wait_data *data = container_of(curr, + struct rq_qos_wait_data, wq); + + /* + * If we fail to get a budget, return -1 to interrupt + * the wake up loop in __wake_up_common. + */ + if (!ioinf_inflight_cb(data->rqw, data->private_data, true)) + return -1; + + data->got_token = true; + wake_up_process(data->task); + list_del_init_careful(&curr->entry); + return 1; +} + +static void ioinf_throttle(struct ioinf *inf, struct ioinf_rq_wait *rqw) +{ + bool has_sleeper; + u32 wq_idx; + struct rq_qos_wait_data data = { + .wq = { + .func = ioinf_wake_fn, + .entry = LIST_HEAD_INIT(data.wq.entry), + }, + .task = current, + .rqw = rqw, + .private_data = inf, + }; + + if (!timer_pending(&inf->inf_timer)) + timer_reduce(&inf->inf_timer, jiffies + inf->inf_timer_perid); + + if (ioinf_inflight_cb(rqw, inf, false)) + return; + + wq_idx = atomic_fetch_inc(&rqw->next_wq) % rqw->wq_nr; + has_sleeper = !prepare_to_wait_exclusive(&rqw->wait[wq_idx], &data.wq, + TASK_UNINTERRUPTIBLE); + atomic_inc(&rqw->sleepers); + do { + /* The memory barrier in set_task_state saves us here. */ + if (data.got_token) + break; + if (!has_sleeper && ioinf_inflight_cb(rqw, inf, true)) { + finish_wait(&rqw->wait[wq_idx], &data.wq); + + /* + * We raced with rq_qos_wake_function() getting a token, + * which means we now have two. Put our local token + * and wake anyone else potentially waiting for one. 
+ */ + if (data.got_token) + ioinf_cleanup_cb(rqw, inf); + break; + } + io_schedule(); + has_sleeper = true; + set_current_state(TASK_UNINTERRUPTIBLE); + } while (1); + + finish_wait(&rqw->wait[wq_idx], &data.wq); + atomic_dec(&rqw->sleepers); +} + +static void ioinf_rqos_throttle(struct rq_qos *rqos, struct bio *bio) +{ + struct ioinf *inf = rqos_to_inf(rqos); + struct ioinf_gq *infg = ioinf_bio_infg(bio); + + if (!inf->params.enabled || !infg || infg_nolimit(infg)) + return; + + if (infg_offline(infg)) { + ioinf_throttle(inf, &inf->offline); + return; + } + + ioinf_throttle(inf, &inf->online); +} + +static void ioinf_rqos_track(struct rq_qos *rqos, struct request *rq, + struct bio *bio) +{ + struct blkcg_gq *blkg = ioinf_bio_blkg(bio); + + if (!blkg) + return; + + rq->blkg = blkg; +} + +static void ioinf_rqos_done_bio(struct rq_qos *rqos, struct bio *bio) +{ + struct blkcg_gq *blkg = ioinf_bio_blkg(bio); + struct ioinf_gq *infg; + struct ioinf *inf; + + if (!blkg || !bio_flagged(bio, BIO_QOS_THROTTLED)) + return; + + infg = blkg_to_infg(blkg); + if (!infg) + return; + + inf = infg->inf; + if (!inf->params.enabled || infg_nolimit(infg)) + return; + + if (infg_offline(infg)) + ioinf_done(inf, &inf->offline); + else + ioinf_done(inf, &inf->online); +} + +static void ioinf_rqos_done(struct rq_qos *rqos, struct request *rq) +{ + struct blkcg_gq *blkg = rq->blkg; + + if (!blkg) + return; + + rq->blkg = NULL; +} + +static void ioinf_rqos_exit(struct rq_qos *rqos) +{ + struct ioinf *inf = rqos_to_inf(rqos); + + blkcg_deactivate_policy(rqos->disk, &blkcg_policy_ioinf); + + hrtimer_cancel(&inf->wakeup_timer); + timer_shutdown_sync(&inf->inf_timer); + ioinf_wake_up_all(inf); + kfree(inf->online.wait); + kfree(inf->offline.wait); + kfree(inf); +} + +static struct rq_qos_ops ioinf_rqos_ops = { + .throttle = ioinf_rqos_throttle, + .done_bio = ioinf_rqos_done_bio, + .done = ioinf_rqos_done, + .track = ioinf_rqos_track, + .exit = ioinf_rqos_exit, +}; + +u32 ioinf_calc_budget(struct ioinf_rq_wait *rqw) +{ + u32 new_budget; + u64 exhausted = rqw->exhausted; + u64 issued = rqw->issued; + + new_budget = max(rqw->last_max, rqw->max_inflight); + /* How much budget is needed to avoid 'exhausted'? 
*/ + if (exhausted && issued) + new_budget += exhausted * new_budget / issued; + + return new_budget; +} + +static +void ioinf_update_inflight(struct ioinf *inf, u32 new_online, u32 new_offline) +{ + + if (inf->inflight < IOINF_MIN_INFLIGHT) + inf->inflight = IOINF_MIN_INFLIGHT; + + if (new_online < inf->inflight) + new_offline = inf->inflight - new_online; + else + new_offline = min(new_offline, IOINFG_MIN_INFLIGHT); + new_online = inf->inflight - new_offline; + + ioinf_set_hinflight(&inf->offline, new_offline); + inf->offline.exhausted = 0; + inf->offline.issued = 0; + + ioinf_set_hinflight(&inf->online, new_online); + inf->online.exhausted = 0; + inf->online.issued = 0; + + ioinf_wake_up_all(inf); +} + +static void ioinf_timer_fn(struct timer_list *timer) +{ + struct ioinf *inf = container_of(timer, struct ioinf, inf_timer); + struct ioinf_rq_wait *online = &inf->online; + struct ioinf_rq_wait *offline = &inf->offline; + unsigned long flags; + u32 online_budget, offline_budget; + + spin_lock_irqsave(&inf->lock, flags); + + online_budget = ioinf_calc_budget(online); + offline_budget = ioinf_calc_budget(offline); + ioinf_update_inflight(inf, online_budget, offline_budget); + + spin_unlock_irqrestore(&inf->lock, flags); + mod_timer(&inf->inf_timer, jiffies + inf->inf_timer_perid); +} + +static u32 ioinf_default_inflight(struct ioinf *inf) +{ + u32 inflight = inf->params.inflight; + + if (inflight < IOINF_MIN_INFLIGHT) + inflight = IOINF_MIN_INFLIGHT; + + return inflight; +} + +static inline int ioinf_rqw_init(struct ioinf_rq_wait *rqw) +{ + int i; + + rqw->wait = kcalloc(rqw->wq_nr, sizeof(wait_queue_head_t), GFP_KERNEL); + if (!rqw->wait) + return -ENOMEM; + + for (i = 0; i < rqw->wq_nr; i++) + init_waitqueue_head(&rqw->wait[i]); + + return 0; +} + +static int blk_ioinf_init(struct gendisk *disk) +{ + struct ioinf *inf; + int ret = -ENOMEM; + + inf = kzalloc(sizeof(*inf), GFP_KERNEL); + if (!inf) + return ret; + + inf->offline.wq_nr = num_possible_cpus() / 2; + ret = ioinf_rqw_init(&inf->offline); + if (ret) + goto free_inf; + + inf->online.wq_nr = 1; + ret = ioinf_rqw_init(&inf->online); + if (ret) + goto free_wq; + + spin_lock_init(&inf->lock); + inf->params.inflight = disk->queue->nr_requests; + inf->inflight = ioinf_default_inflight(inf); + inf->inf_timer_perid = IOINF_TIMER_PERID; + + inf->offline.hinflight = inf->inflight - IOINFG_MIN_INFLIGHT; + inf->online.hinflight = IOINFG_MIN_INFLIGHT; + + timer_setup(&inf->inf_timer, ioinf_timer_fn, 0); + hrtimer_init(&inf->wakeup_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + inf->wakeup_timer.function = ioinf_wakeup_timer_fn; + inf->waking = false; + + ret = rq_qos_add(&inf->rqos, disk, RQ_QOS_INFLIGHT, &ioinf_rqos_ops); + if (ret) + goto err_cancel_timer; + + ret = blkcg_activate_policy(disk, &blkcg_policy_ioinf); + if (ret) + goto err_del_qos; + return 0; + +err_del_qos: + rq_qos_del(&inf->rqos); +err_cancel_timer: + hrtimer_cancel(&inf->wakeup_timer); + timer_shutdown_sync(&inf->inf_timer); + kfree(inf->online.wait); +free_wq: + kfree(inf->offline.wait); +free_inf: + kfree(inf); + return ret; +} + +static u64 ioinf_weight_prfill(struct seq_file *sf, struct blkg_policy_data *pd, + int off) +{ + const char *dname = blkg_dev_name(pd->blkg); + struct ioinf_gq *infg = pd_to_infg(pd); + + if (dname && infg->user_weight) + seq_printf(sf, "%s %d\n", dname, infg->user_weight); + + return 0; +} + +static int ioinf_weight_show(struct seq_file *sf, void *v) +{ + struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); + struct ioinf_cgrp *infcg = 
blkcg_to_infcg(blkcg); + + seq_printf(sf, "default %d\n", infcg->dfl_user_weight); + blkcg_print_blkgs(sf, blkcg, ioinf_weight_prfill, &blkcg_policy_ioinf, + seq_cft(sf)->private, false); + + return 0; +} + +static void ioinf_default_weight_update(struct blkcg *blkcg, int v) +{ + struct ioinf_cgrp *infcg = blkcg_to_infcg(blkcg); + struct blkcg_gq *blkg; + struct hlist_node *tmp; + struct ioinf_gq *infg; + + if (v == infcg->dfl_user_weight) + return; + + infcg->dfl_user_weight = v; + spin_lock_irq(&blkcg->lock); + hlist_for_each_entry_safe(blkg, tmp, &blkcg->blkg_list, blkcg_node) { + infg = blkg_to_infg(blkg); + if (infg && infg->dfl_user_weight != v) { + spin_unlock_irq(&blkcg->lock); + blk_mq_freeze_queue(infg->inf->rqos.disk->queue); + blk_mq_quiesce_queue(infg->inf->rqos.disk->queue); + infg->dfl_user_weight = v; + blk_mq_unquiesce_queue(infg->inf->rqos.disk->queue); + blk_mq_unfreeze_queue(infg->inf->rqos.disk->queue); + spin_lock_irq(&blkcg->lock); + } + } + spin_unlock_irq(&blkcg->lock); +} + +static int ioing_weight_write(struct blkcg *blkcg, char *buf) +{ + struct blkg_conf_ctx ctx; + struct ioinf_gq *infg; + int ret; + int v; + + blkg_conf_init(&ctx, buf); + ret = blkg_conf_prep(blkcg, &blkcg_policy_ioinf, &ctx); + if (ret) { + blkg_conf_exit(&ctx); + return ret; + } + + infg = blkg_to_infg(ctx.blkg); + if (!strncmp(ctx.body, "default", 7)) { + v = IOINF_DFL_WEIGHT; + } else if (kstrtoint(ctx.body, 0, &v) || abs(v) > CGROUP_WEIGHT_MAX) { + blkg_conf_exit(&ctx); + return -EINVAL; + } + + spin_unlock_irq(&bdev_get_queue(ctx.bdev)->queue_lock); + blk_mq_freeze_queue(infg->inf->rqos.disk->queue); + blk_mq_quiesce_queue(infg->inf->rqos.disk->queue); + infg->user_weight = v; + blk_mq_unquiesce_queue(infg->inf->rqos.disk->queue); + blk_mq_unfreeze_queue(infg->inf->rqos.disk->queue); + spin_lock_irq(&bdev_get_queue(ctx.bdev)->queue_lock); + + blkg_conf_exit(&ctx); + return 0; +} + +static ssize_t ioinf_weight_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) +{ + struct blkcg *blkcg = css_to_blkcg(of_css(of)); + int ret; + + if (!strchr(buf, ':')) { + int v; + + if (sscanf(buf, "default %d", &v) != 1 && kstrtoint(buf, 0, &v)) + return -EINVAL; + + if (abs(v) > CGROUP_WEIGHT_MAX) + return -EINVAL; + + ioinf_default_weight_update(blkcg, v); + return nbytes; + } + + ret = ioing_weight_write(blkcg, buf); + return ret ? 
ret : nbytes; +} + +static u64 ioinf_qos_prfill(struct seq_file *sf, struct blkg_policy_data *pd, + int off) +{ + const char *dname = blkg_dev_name(pd->blkg); + struct ioinf *inf = q_to_inf(pd->blkg->q); + struct ioinf_params params; + + if (!dname) + return 0; + + params = inf->params; + seq_printf(sf, "%s enable=%d inflight=%u flags=%lu", dname, + params.enabled, params.inflight, params.flags); + + seq_putc(sf, '\n'); + return 0; +} + +static int ioinf_qos_show(struct seq_file *sf, void *v) +{ + struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); + + blkcg_print_blkgs(sf, blkcg, ioinf_qos_prfill, + &blkcg_policy_ioinf, seq_cft(sf)->private, false); + return 0; +} + +static const match_table_t qos_ctrl_tokens = { + { INF_ENABLE, "enable=%u" }, + { INF_INFLIGHT, "inflight=%u" }, + { INF_FLAGS, "flags=%u" }, + { NR_QOS_CTRL_PARAMS, NULL }, +}; + +static ssize_t ioinf_qos_write(struct kernfs_open_file *of, char *input, + size_t nbytes, loff_t off) +{ + struct blkg_conf_ctx ctx; + struct gendisk *disk; + struct ioinf *inf; + struct ioinf_params params = {0}; + char *body, *p; + int ret; + + blkg_conf_init(&ctx, input); + + ret = blkg_conf_open_bdev(&ctx); + if (ret) + goto err; + + body = ctx.body; + disk = ctx.bdev->bd_disk; + if (!queue_is_mq(disk->queue)) { + ret = -EOPNOTSUPP; + goto err; + } + + inf = q_to_inf(disk->queue); + if (!inf) { + ret = blk_ioinf_init(disk); + if (ret) + goto err; + inf = q_to_inf(disk->queue); + } + params = inf->params; + + while ((p = strsep(&body, " \t\n"))) { + substring_t args[MAX_OPT_ARGS]; + u64 v; + + if (!*p) + continue; + + switch (match_token(p, qos_ctrl_tokens, args)) { + case INF_ENABLE: + if (match_u64(&args[0], &v)) + goto einval; + params.enabled = !!v; + continue; + case INF_INFLIGHT: + if (match_u64(&args[0], &v) || v == 0) + goto einval; + params.inflight = v; + continue; + case INF_FLAGS: + if (match_u64(&args[0], &v) || v >= 1 << NR_INF_FLAGS) + goto einval; + params.flags = v; + continue; + default: + goto einval; + } + } + + if (!params.enabled && !inf->params.enabled) + goto out; + + blk_mq_freeze_queue(disk->queue); + blk_mq_quiesce_queue(disk->queue); + + if (params.enabled && !inf->params.enabled) { + blk_stat_enable_accounting(disk->queue); + blk_queue_flag_set(QUEUE_FLAG_RQ_ALLOC_TIME, disk->queue); + } else if (inf->params.enabled && !params.enabled) { + blk_stat_disable_accounting(disk->queue); + blk_queue_flag_clear(QUEUE_FLAG_RQ_ALLOC_TIME, disk->queue); + } + + spin_lock_irq(&inf->lock); + inf->params = params; + if (inf->inflight != params.inflight) + ioinf_update_inflight(inf, inf->online.hinflight, + inf->offline.hinflight); + spin_unlock_irq(&inf->lock); + + blk_mq_unquiesce_queue(disk->queue); + blk_mq_unfreeze_queue(disk->queue); +out: + blkg_conf_exit(&ctx); + return nbytes; + +einval: + ret = -EINVAL; +err: + blkg_conf_exit(&ctx); + return ret; +} + +static struct cftype ioinf_files[] = { + { + .name = "inf.weight", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = ioinf_weight_show, + .write = ioinf_weight_write, + }, + { + .name = "inf.qos", + .flags = CFTYPE_ONLY_ON_ROOT, + .seq_show = ioinf_qos_show, + .write = ioinf_qos_write, + }, + {} +}; + +static struct cftype ioinf_legacy_files[] = { + { + .name = "inf.weight", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = ioinf_weight_show, + .write = ioinf_weight_write, + }, + { + .name = "inf.qos", + .flags = CFTYPE_ONLY_ON_ROOT, + .seq_show = ioinf_qos_show, + .write = ioinf_qos_write, + }, + {} +}; + +static struct blkcg_policy_data *ioinf_cpd_alloc(gfp_t gfp) +{ + struct 
ioinf_cgrp *infcg = kzalloc(sizeof(*infcg), gfp); + + if (!infcg) + return NULL; + + return &infcg->cpd; +} + +static void ioinf_cpd_free(struct blkcg_policy_data *cpd) +{ + kfree(container_of(cpd, struct ioinf_cgrp, cpd)); +} + +static struct blkg_policy_data *ioinf_pd_alloc(struct gendisk *disk, + struct blkcg *blkcg, gfp_t gfp) +{ + struct ioinf_gq *infg = kzalloc_node(sizeof(*infg), gfp, disk->node_id); + + if (!infg) + return NULL; + + return &infg->pd; +} + +static void ioinf_pd_init(struct blkg_policy_data *pd) +{ + struct ioinf_gq *infg = pd_to_infg(pd); + struct blkcg_gq *blkg = pd_to_blkg(pd); + struct ioinf_cgrp *infcg = blkcg_to_infcg(blkg->blkcg); + + infg->inf = q_to_inf(blkg->q); + infg->dfl_user_weight = infcg->dfl_user_weight; +} + +static void ioinf_pd_free(struct blkg_policy_data *pd) +{ + struct ioinf_gq *infg = pd_to_infg(pd); + + kfree(infg); +} + +static struct blkcg_policy blkcg_policy_ioinf = { + .dfl_cftypes = ioinf_files, + .legacy_cftypes = ioinf_legacy_files, + + .cpd_alloc_fn = ioinf_cpd_alloc, + .cpd_free_fn = ioinf_cpd_free, + + .pd_alloc_fn = ioinf_pd_alloc, + .pd_init_fn = ioinf_pd_init, + .pd_free_fn = ioinf_pd_free, +}; + +static int __init ioinf_init(void) +{ + return blkcg_policy_register(&blkcg_policy_ioinf); +} + +static void __exit ioinf_exit(void) +{ + blkcg_policy_unregister(&blkcg_policy_ioinf); +} + +MODULE_AUTHOR("Baokun Li, Yu Kuai and others"); +MODULE_DESCRIPTION("Block IO infligt I/O controller"); +MODULE_LICENSE("GPL"); +module_init(ioinf_init); +module_exit(ioinf_exit); diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index efe99cfae51d..b5af47bf99d4 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -798,6 +798,8 @@ static const char *rq_qos_id_to_name(enum rq_qos_id id) return "latency"; case RQ_QOS_COST: return "cost"; + case RQ_QOS_INFLIGHT: + return "inflight"; } return "unknown"; } diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h index 93d1ba692973..d504a302ca0f 100644 --- a/block/blk-rq-qos.h +++ b/block/blk-rq-qos.h @@ -17,6 +17,7 @@ enum rq_qos_id { RQ_QOS_WBT, RQ_QOS_LATENCY, RQ_QOS_COST, + RQ_QOS_INFLIGHT, }; struct rq_wait { diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 4c4416fd2df7..81a733e1bef9 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -115,6 +115,9 @@ struct request { #ifdef CONFIG_BLK_WBT unsigned short wbt_flags; +#endif +#ifdef CONFIG_BLK_CGROUP_IOINFLIGHT + struct blkcg_gq *blkg; #endif /* * rq sectors used for blk stats. It has the same value -- 2.46.1
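
For review convenience, below is a minimal userspace sketch (not part of the
patch) showing how the new interfaces could be driven once the controller is
built in. The cgroup paths and the 8:0 device number are illustrative
assumptions; the "MAJ:MIN <value>" syntax mirrors what
blkg_conf_open_bdev()/blkg_conf_prep() expect in ioinf_qos_write() and
ioing_weight_write() above, and write_str() is just a local helper.

/*
 * Usage sketch (illustrative only, not part of the patch). Cgroup paths
 * and the 8:0 device number are assumptions.
 */
#include <stdio.h>

static int write_str(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		return -1;
	}
	if (fputs(val, f) == EOF) {
		perror(path);
		fclose(f);
		return -1;
	}
	return fclose(f);
}

int main(void)
{
	/* Root-only io.inf.qos: enable the controller on disk 8:0. */
	write_str("/sys/fs/cgroup/io.inf.qos",
		  "8:0 enable=1 inflight=64 flags=0");

	/* weight > 0: online (latency-sensitive) group on that disk. */
	write_str("/sys/fs/cgroup/online/io.inf.weight", "8:0 100");

	/* weight < 0: offline (best-effort) group on that disk. */
	write_str("/sys/fs/cgroup/offline/io.inf.weight", "8:0 -1");

	/* Per-cgroup default weight, used by disks without a per-disk value. */
	write_str("/sys/fs/cgroup/offline/io.inf.weight", "default -1");

	return 0;
}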
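
A self-contained model of the preemption rule described in the commit message
and implemented in ioinf_inflight_cb(): offline IO may fill otherwise idle
inflight budget, but once online IO exhausts its own limit it reclaims half of
the offline budget, down to a floor mirroring IOINFG_MIN_INFLIGHT. This is an
assumption-laden, single-threaded simulation, not the kernel logic itself; the
constants and group names are made up for illustration.

#include <stdio.h>

#define TOTAL_INFLIGHT	64
#define MIN_INFLIGHT	1	/* mirrors IOINFG_MIN_INFLIGHT */

struct group {
	const char *name;
	unsigned int limit;	/* corresponds to rqw->hinflight */
	unsigned int inflight;	/* currently issued IOs */
};

/* Try to issue one IO for @g; online passes offline as @victim to reclaim. */
static int try_issue(struct group *g, struct group *victim)
{
	if (g->inflight < g->limit) {
		g->inflight++;
		return 1;
	}
	if (victim && victim->limit > MIN_INFLIGHT) {
		/* Reclaim half of the victim's budget, like ioinf_inflight_cb(). */
		victim->limit >>= 1;
		g->limit = TOTAL_INFLIGHT - victim->limit;
		return try_issue(g, NULL);
	}
	return 0;	/* caller would sleep on the rqw waitqueue */
}

int main(void)
{
	struct group online = { "online", MIN_INFLIGHT, 0 };
	struct group offline = { "offline", TOTAL_INFLIGHT - MIN_INFLIGHT, 0 };

	/* Offline fills the disk while online is idle... */
	for (int i = 0; i < 40; i++)
		try_issue(&offline, NULL);

	/* ...then a burst of online IO arrives and reclaims budget. */
	for (int i = 0; i < 40; i++)
		try_issue(&online, &offline);

	printf("%s: limit=%u inflight=%u\n", online.name, online.limit, online.inflight);
	printf("%s: limit=%u inflight=%u\n", offline.name, offline.limit, offline.inflight);
	return 0;
}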