
From: Alekseev Dmitry <alekseev.dmitry@huawei.com> hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IC5EHB ----------------------------------------- Add support for CFS quota for cgroups. Signed-off-by: Alekseev Dmitry <alekseev.dmitry@huawei.com> Signed-off-by: Hui Tang <tanghui20@huawei.com> Signed-off-by: Liu Kai <liukai284@huawei.com> Signed-off-by: Xia Fukun <xiafukun@huawei.com> --- include/linux/xsched.h | 11 +++- include/uapi/linux/xcu_vstream.h | 1 + kernel/xsched/Makefile | 2 +- kernel/xsched/cfs.c | 1 + kernel/xsched/cfs_quota.c | 95 ++++++++++++++++++++++++++++++++ kernel/xsched/cgroup.c | 62 ++++++++++++++++++++- kernel/xsched/core.c | 23 ++++---- 7 files changed, 181 insertions(+), 14 deletions(-) create mode 100644 kernel/xsched/cfs_quota.c diff --git a/include/linux/xsched.h b/include/linux/xsched.h index e59e4fe5e4b4..5ffaffc5afdb 100644 --- a/include/linux/xsched.h +++ b/include/linux/xsched.h @@ -38,7 +38,8 @@ #define RUNTIME_INF ((u64)~0ULL) #define XSCHED_TIME_INF RUNTIME_INF #define XSCHED_CFS_ENTITY_WEIGHT_DFLT 1 -#define XSCHED_CFS_MIN_TIMESLICE (10*NSEC_PER_MSEC) +#define XSCHED_CFS_MIN_TIMESLICE (10 * NSEC_PER_MSEC) +#define XSCHED_CFS_QUOTA_PERIOD_MS (100 * NSEC_PER_MSEC) #define XSCHED_CFG_SHARE_DFLT 1024 #define __GET_VS_TASK_TYPE(t) ((t)&0xFF) @@ -590,6 +591,7 @@ static inline void xsched_init_vsm(struct vstream_metadata *vsm, struct vstream_info *vs, vstream_args_t *arg) { vsm->sq_id = arg->sq_id; + vsm->exec_time = arg->vk_args.exec_time; vsm->sqe_num = arg->vk_args.sqe_num; vsm->timeout = arg->vk_args.timeout; memcpy(vsm->sqe, arg->vk_args.sqe, XCU_SQE_SIZE_MAX); @@ -615,6 +617,13 @@ int xsched_group_inherit(struct task_struct *tsk, struct xsched_entity *xse); void xcu_cg_init_common(struct xsched_group *xcg); void xcu_grp_shares_update(struct xsched_group *xg); void xsched_group_xse_detach(struct xsched_entity *xse); + +void xsched_quota_init(void); +void xsched_quota_timeout_init(struct 
xsched_group *xg); +void xsched_quota_timeout_update(struct xsched_group *xg); +void xsched_quota_account(struct xsched_group *xg, s64 exec_time); +bool xsched_quota_exceed(struct xsched_group *xg); +void xsched_quota_refill(struct work_struct *work); void enqueue_ctx(struct xsched_entity *xse, struct xsched_cu *xcu); void dequeue_ctx(struct xsched_entity *xse, struct xsched_cu *xcu); #endif /* __LINUX_XSCHED_H__ */ diff --git a/include/uapi/linux/xcu_vstream.h b/include/uapi/linux/xcu_vstream.h index 32c71dce5ad1..46d5a32db68e 100644 --- a/include/uapi/linux/xcu_vstream.h +++ b/include/uapi/linux/xcu_vstream.h @@ -28,6 +28,7 @@ typedef struct vstream_free_args { } vstream_free_args_t; typedef struct vstream_kick_args { __u32 sqe_num; + __u32 exec_time; __s32 timeout; __s8 sqe[XCU_SQE_SIZE_MAX]; } vstream_kick_args_t; diff --git a/kernel/xsched/Makefile b/kernel/xsched/Makefile index c4c06b6038ff..8ab32b086b3d 100644 --- a/kernel/xsched/Makefile +++ b/kernel/xsched/Makefile @@ -1,4 +1,4 @@ # SPDX-License-Identifier: GPL-2.0 obj-y += vstream.o -obj-$(CONFIG_XCU_SCHEDULER) += core.o rt.o cfs.o +obj-$(CONFIG_XCU_SCHEDULER) += core.o rt.o cfs.o cfs_quota.o obj-$(CONFIG_CGROUP_XCU) += cgroup.o diff --git a/kernel/xsched/cfs.c b/kernel/xsched/cfs.c index 1313c7e73a11..94189d8088ac 100644 --- a/kernel/xsched/cfs.c +++ b/kernel/xsched/cfs.c @@ -209,6 +209,7 @@ static void put_prev_ctx_fair(struct xsched_entity *xse) { struct xsched_entity_cfs *prev = &xse->cfs; + xsched_quota_account(xse->parent_grp, (s64)xse->last_exec_runtime); xs_update(prev, xse->last_exec_runtime); } diff --git a/kernel/xsched/cfs_quota.c b/kernel/xsched/cfs_quota.c new file mode 100644 index 000000000000..a62f07ad3cdc --- /dev/null +++ b/kernel/xsched/cfs_quota.c @@ -0,0 +1,95 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Bandwidth provisioning for XPU device + * + * Copyright (C) 2025-2026 Huawei Technologies Co., Ltd + * + * Author: Konstantin Meskhidze <konstantin.meskhidze@huawei.com> + * + * 
This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + */ +#include <linux/timer.h> +#include <linux/xsched.h> + +static struct workqueue_struct *quota_workqueue; + +void xsched_quota_refill(struct work_struct *work) +{ + uint32_t id; + struct xsched_cu *xcu; + struct xsched_group *xg; + + xg = container_of(work, struct xsched_group, refill_work); + + spin_lock(&xg->lock); + xg->runtime = max((xg->runtime - xg->quota), (s64)0); + hrtimer_start(&xg->quota_timeout, ns_to_ktime(xg->period), HRTIMER_MODE_REL_SOFT); + spin_unlock(&xg->lock); + + for_each_active_xcu(xcu, id) { + xcu = xsched_cu_mgr[id]; + mutex_lock(&xcu->xcu_lock); + if (!READ_ONCE(xg->perxcu_priv[id].xse.on_rq)) { + enqueue_ctx(&xg->perxcu_priv[id].xse, xcu); + wake_up_interruptible(&xcu->wq_xcu_idle); + } + mutex_unlock(&xcu->xcu_lock); + } +} + +static enum hrtimer_restart quota_timer_cb(struct hrtimer *hrtimer) +{ + struct xsched_group *xg; + + xg = container_of(hrtimer, struct xsched_group, quota_timeout); + queue_work(quota_workqueue, &xg->refill_work); + + return HRTIMER_NORESTART; +} + +void xsched_quota_account(struct xsched_group *xg, s64 exec_time) +{ + spin_lock(&xg->lock); + xg->runtime += exec_time; + spin_unlock(&xg->lock); +} + +bool xsched_quota_exceed(struct xsched_group *xg) +{ + bool ret; + + spin_lock(&xg->lock); + ret = (xg->quota > 0) ? 
(xg->runtime >= xg->quota) : false; + spin_unlock(&xg->lock); + + return ret; +} + +void xsched_quota_init(void) +{ + quota_workqueue = create_singlethread_workqueue("xsched_quota_workqueue"); +} + +void xsched_quota_timeout_init(struct xsched_group *xg) +{ + hrtimer_init(&xg->quota_timeout, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT); + xg->quota_timeout.function = quota_timer_cb; +} + +void xsched_quota_timeout_update(struct xsched_group *xg) +{ + struct hrtimer *t = &xg->quota_timeout; + + hrtimer_cancel(t); + + if (xg->quota > 0 && xg->period > 0) + hrtimer_start(t, ns_to_ktime(xg->period), HRTIMER_MODE_REL_SOFT); +} diff --git a/kernel/xsched/cgroup.c b/kernel/xsched/cgroup.c index 8ae17069e031..aa675a013927 100644 --- a/kernel/xsched/cgroup.c +++ b/kernel/xsched/cgroup.c @@ -47,6 +47,8 @@ void xcu_cg_init_common(struct xsched_group *xcg) spin_lock_init(&xcg->lock); INIT_LIST_HEAD(&xcg->members); INIT_LIST_HEAD(&xcg->children_groups); + xsched_quota_timeout_init(xcg); + INIT_WORK(&xcg->refill_work, xsched_quota_refill); } static void xcu_cfs_root_cg_init(void) @@ -62,6 +64,10 @@ static void xcu_cfs_root_cg_init(void) } root_xcg->sched_type = XSCHED_TYPE_DFLT; + root_xcg->period = XSCHED_CFS_QUOTA_PERIOD_MS; + root_xcg->quota = XSCHED_TIME_INF; + root_xcg->runtime = 0; + xsched_quota_init(); } /** @@ -115,6 +121,9 @@ static int xcu_cfs_cg_init(struct xsched_group *xcg, xcg->shares_cfg = XSCHED_CFG_SHARE_DFLT; xcu_grp_shares_update(parent_xg); + xcg->period = XSCHED_CFS_QUOTA_PERIOD_MS; + xcg->quota = XSCHED_TIME_INF; + xcg->runtime = 0; return 0; @@ -223,6 +232,8 @@ static void xcu_css_free(struct cgroup_subsys_state *css) break; } } + hrtimer_cancel(&xcg->quota_timeout); + cancel_work_sync(&xcg->refill_work); list_del(&xcg->group_node); mutex_unlock(&xcg_mutex); @@ -460,6 +471,12 @@ static s64 xcu_read_s64(struct cgroup_subsys_state *css, struct cftype *cft) spin_lock(&xcucg->lock); switch (cft->private) { + case XCU_FILE_PERIOD_MS: + ret = xcucg->period / 
NSEC_PER_MSEC; + break; + case XCU_FILE_QUOTA_MS: + ret = (xcucg->quota > 0) ? xcucg->quota / NSEC_PER_MSEC : xcucg->quota; + break; case XCU_FILE_SHARES: ret = xcucg->shares_cfg; break; @@ -530,11 +547,37 @@ static int xcu_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, { int ret = 0; struct xsched_group *xcucg = xcu_cg_from_css(css); + s64 quota_ns; + + if (xcucg->sched_type != XSCHED_TYPE_CFS) + return -EINVAL; spin_lock(&xcucg->lock); switch (cft->private) { + case XCU_FILE_PERIOD_MS: + if (val < 1 || val > (S64_MAX / NSEC_PER_MSEC)) { + ret = -EINVAL; + break; + } + xcucg->period = val * NSEC_PER_MSEC; + xsched_quota_timeout_update(xcucg); + break; + case XCU_FILE_QUOTA_MS: + if (val < -1 || val > (S64_MAX / NSEC_PER_MSEC)) { + ret = -EINVAL; + break; + } + /* Runtime should be updated when modifying quota_ms configuration */ + quota_ns = (val > 0) ? val * NSEC_PER_MSEC : val; + if (xcucg->quota > 0 && quota_ns > 0) + xcucg->runtime = max((xcucg->runtime - quota_ns), (s64)0); + else + xcucg->runtime = 0; + xcucg->quota = quota_ns; + xsched_quota_timeout_update(xcucg); + break; case XCU_FILE_SHARES: - if (val <= 0) { + if (val <= 0) { ret = -EINVAL; break; } @@ -577,11 +620,28 @@ static int xcu_stat(struct seq_file *sf, void *v) seq_printf(sf, "exec_runtime: %llu\n", exec_runtime); seq_printf(sf, "shares cfg: %llu/%llu x%u\n", xcucg->shares_cfg, xcucg->parent->children_shares_sum, xcucg->weight); + seq_printf(sf, "quota: %lld\n", xcucg->quota); + seq_printf(sf, "used: %lld\n", xcucg->runtime); + seq_printf(sf, "period: %lld\n", xcucg->period); return 0; } static struct cftype xcu_cg_files[] = { + { + .name = "period_ms", + .flags = CFTYPE_NOT_ON_ROOT, + .read_s64 = xcu_read_s64, + .write_s64 = xcu_write_s64, + .private = XCU_FILE_PERIOD_MS, + }, + { + .name = "quota_ms", + .flags = CFTYPE_NOT_ON_ROOT, + .read_s64 = xcu_read_s64, + .write_s64 = xcu_write_s64, + .private = XCU_FILE_QUOTA_MS, + }, { .name = "shares", .flags = 
CFTYPE_NOT_ON_ROOT, diff --git a/kernel/xsched/core.c b/kernel/xsched/core.c index 64f2cbafb8cd..78808f6ae561 100644 --- a/kernel/xsched/core.c +++ b/kernel/xsched/core.c @@ -46,7 +46,6 @@ static void put_prev_ctx(struct xsched_entity *xse) struct xsched_cu *xcu = xse->xcu; lockdep_assert_held(&xcu->xcu_lock); - xse->class->put_prev_ctx(xse); xse->last_exec_runtime = 0; atomic_set(&xse->submitted_one_kick, 0); @@ -505,16 +504,18 @@ static int xsched_schedule(void *input_xcu) continue; curr_xse = xcu->xrq.curr_xse; - if (curr_xse) { /* if not deleted yet */ - put_prev_ctx(curr_xse); - if (!atomic_read(&curr_xse->kicks_pending_ctx_cnt)) { - dequeue_ctx(curr_xse, xcu); - XSCHED_DEBUG( - "%s: Dequeue xse %d due to zero kicks on xcu %u\n", - __func__, curr_xse->tgid, xcu->id); - curr_xse = xcu->xrq.curr_xse = NULL; - } - } + if (!curr_xse) + continue; + + /* if not deleted yet */ + put_prev_ctx(curr_xse); + if (!atomic_read(&curr_xse->kicks_pending_ctx_cnt)) + dequeue_ctx(curr_xse, xcu); + + if (xsched_quota_exceed(curr_xse->parent_grp)) + dequeue_ctx(&curr_xse->parent_grp->perxcu_priv[xcu->id].xse, xcu); + + xcu->xrq.curr_xse = NULL; } return err; -- 2.34.1