From: Konstantin Meskhidze <konstantin.meskhidze@huawei.com>

hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/IC5EHB

-----------------------------------------

Add RT class callbacks implementation:
- dequeue_ctx
- enqueue_ctx
- pick_next_ctx
- put_prev_ctx
- submit_prepare_ctx
- select_work
- check_preempt

Add rt.c to the kernel/xsched Makefile.
Add RT class callbacks support in core.c.

Signed-off-by: Konstantin Meskhidze <konstantin.meskhidze@huawei.com>
Signed-off-by: Hui Tang <tanghui20@huawei.com>
Signed-off-by: Liu Kai <liukai284@huawei.com>
Signed-off-by: Xia Fukun <xiafukun@huawei.com>
Signed-off-by: Zicheng Qu <quzicheng@huawei.com>
---
 drivers/xcu/xcu_group.c |   2 +-
 include/linux/xsched.h  |  52 +++++++-
 kernel/xsched/Kconfig   |  15 +++
 kernel/xsched/Makefile  |   1 +
 kernel/xsched/core.c    |  30 ++++-
 kernel/xsched/rt.c      | 281 ++++++++++++++++++++++++++++++++++++++++
 6 files changed, 376 insertions(+), 5 deletions(-)
 create mode 100644 kernel/xsched/rt.c

diff --git a/drivers/xcu/xcu_group.c b/drivers/xcu/xcu_group.c
index 54d389534508..2a349de62256 100644
--- a/drivers/xcu/xcu_group.c
+++ b/drivers/xcu/xcu_group.c
@@ -20,7 +20,7 @@
 #include <linux/xcu_group.h>
 #include <linux/xsched.h>
 
-static int num_active_xcu;
+int num_active_xcu;
 static DEFINE_SPINLOCK(xcu_mgr_lock);
 struct xsched_cu *xsched_cu_mgr[XSCHED_NR_CUS];
 static DECLARE_RWSEM(xcu_group_rwsem);
diff --git a/include/linux/xsched.h b/include/linux/xsched.h
index d52461e63d8a..f62bbc55c354 100644
--- a/include/linux/xsched.h
+++ b/include/linux/xsched.h
@@ -38,19 +38,51 @@
 
 #define MAX_VSTREAM_NUM 512
 
+/*
+ * A default kick slice for RT class XSEs.
+ */
+#define XSCHED_RT_KICK_SLICE 2
+
+extern struct xsched_cu *xsched_cu_mgr[XSCHED_NR_CUS];
+
 enum xcu_sched_type {
-	XSCHED_TYPE_NUM
+	XSCHED_TYPE_RT = 0,
+	XSCHED_TYPE_NUM,
+	XSCHED_TYPE_DFLT = XSCHED_TYPE_RT
 };
 
+enum xse_prio {
+	XSE_PRIO_HIGH = 0,
+	XSE_PRIO_LOW = 4,
+	NR_XSE_PRIO,
+	XSE_PRIO_DFLT = XSE_PRIO_LOW
+};
+
+extern struct xsched_class rt_xsched_class;
+
 #define xsched_first_class \
 	list_first_entry(&(xsched_class_list), struct xsched_class, node)
 
 #define for_each_xsched_class(class) \
 	list_for_each_entry((class), &(xsched_class_list), node)
 
+#define for_each_xse_prio(prio) \
+	for (prio = XSE_PRIO_HIGH; prio < NR_XSE_PRIO; prio++)
 
 #define for_each_vstream_in_ctx(vs, ctx) \
 	list_for_each_entry((vs), &((ctx)->vstream_list), ctx_node)
+
+/* Manages the xsched RT-like class linked-list-based runqueue.
+ *
+ * For now all RT-like class runqueue structs are
+ * identical, but they will most likely diverge in
+ * the future as XSched evolves.
+ */
+struct xsched_rq_rt {
+	struct list_head rq[NR_XSE_PRIO];
+	unsigned int nr_running;
+};
+
 /* Base XSched runqueue object structure that contains both mutual and
  * individual parameters for different scheduling classes.
  */
@@ -60,6 +92,8 @@ struct xsched_rq {
 
 	int state;
 	int nr_running;
+	/* RT class run queue. */
+	struct xsched_rq_rt rt;
 };
 
 enum xsched_cu_status {
@@ -102,6 +136,18 @@ struct xsched_cu {
 	wait_queue_head_t wq_xcu_idle;
 };
 
+extern int num_active_xcu;
+#define for_each_active_xcu(xcu, id) \
+	for ((id) = 0, xcu = xsched_cu_mgr[(id)]; \
+	     (id) < num_active_xcu && (xcu = xsched_cu_mgr[(id)]); (id)++)
+
+struct xsched_entity_rt {
+	struct list_head list_node;
+	enum xse_prio prio;
+
+	ktime_t timeslice;
+};
+
 struct xsched_entity {
 	uint32_t task_type;
 
@@ -128,6 +174,9 @@ struct xsched_entity {
 	/* Xsched class for this xse. */
 	const struct xsched_class *class;
 
+	/* RT class entity. */
+	struct xsched_entity_rt rt;
+
 	/* Pointer to context object. */
 	struct xsched_context *ctx;
 
@@ -279,6 +328,7 @@ int xsched_init_entity(struct xsched_context *ctx, struct vstream_info *vs);
 int ctx_bind_to_xcu(vstream_info_t *vstream_info, struct xsched_context *ctx);
 int xsched_vsm_add_tail(struct vstream_info *vs, vstream_args_t *arg);
 struct vstream_metadata *xsched_vsm_fetch_first(struct vstream_info *vs);
+int xsched_rt_prio_set(pid_t tgid, unsigned int prio);
 void enqueue_ctx(struct xsched_entity *xse, struct xsched_cu *xcu);
 void dequeue_ctx(struct xsched_entity *xse, struct xsched_cu *xcu);
 int delete_ctx(struct xsched_context *ctx);
diff --git a/kernel/xsched/Kconfig b/kernel/xsched/Kconfig
index 8d12b8db5f6d..77883b6a3cc3 100644
--- a/kernel/xsched/Kconfig
+++ b/kernel/xsched/Kconfig
@@ -4,6 +4,7 @@ config XCU_SCHEDULER
 	bool "Enable XSched functionality"
 	default n
 	select XCU_VSTREAM
+	select XCU_SCHED_RT
 	help
 	  This option enables the XSched scheduler, a custom scheduling mechanism
 	  designed for heterogeneous compute units (e.g., XPUs). It provides:
@@ -34,3 +35,17 @@ config XSCHED_NR_CUS
 	  This option defines the maximum number of Compute Units (CUs) that can
 	  be managed by the XSched scheduler, consider changing this value
 	  proportionally to the number of available XCU cores.
+
+config XCU_SCHED_RT
+	bool "XCU RT scheduling class"
+	default y
+	depends on XCU_SCHEDULER
+	help
+	  Enable support for the RT scheduling class in the XCU scheduler.
+
+	  This option allows XCU to schedule tasks using real-time priorities
+	  (XSCHED_TYPE_RT). When enabled, tasks in RT cgroups can be assigned
+	  deterministic priorities and will be scheduled ahead of CFS tasks.
+
+	  Unless you are using RT workloads that rely on strict priority-based
+	  scheduling within XCU, it is recommended to keep the default setting.
diff --git a/kernel/xsched/Makefile b/kernel/xsched/Makefile
index 62e58e4151b0..e98dcea3b2bc 100644
--- a/kernel/xsched/Makefile
+++ b/kernel/xsched/Makefile
@@ -1,3 +1,4 @@
 # SPDX-License-Identifier: GPL-2.0
 obj-y += vstream.o
 obj-$(CONFIG_XCU_SCHEDULER) += core.o
+obj-$(CONFIG_XCU_SCHED_RT) += rt.o
diff --git a/kernel/xsched/core.c b/kernel/xsched/core.c
index 701a81297fc4..bdad82041ada 100644
--- a/kernel/xsched/core.c
+++ b/kernel/xsched/core.c
@@ -188,9 +188,16 @@ int delete_ctx(struct xsched_context *ctx)
 
 int xsched_xse_set_class(struct xsched_entity *xse)
 {
-	struct xsched_class *sched = xsched_first_class;
+	switch (xse->task_type) {
+	case XSCHED_TYPE_RT:
+		xse->class = &rt_xsched_class;
+		XSCHED_DEBUG("Context is in RT class %s\n", __func__);
+		break;
+	default:
+		XSCHED_ERR("Xse has incorrect class @ %s\n", __func__);
+		return -EINVAL;
+	}
 
-	xse->class = sched;
 	return 0;
 }
 
@@ -354,7 +361,8 @@ int xsched_schedule(void *input_xcu)
 
 	while (!kthread_should_stop()) {
 		mutex_unlock(&xcu->xcu_lock);
-		wait_event_interruptible(xcu->wq_xcu_idle, 1);
+		wait_event_interruptible(xcu->wq_xcu_idle,
+					 xcu->xrq.rt.nr_running);
 		mutex_lock(&xcu->xcu_lock);
 
 		if (kthread_should_stop()) {
@@ -481,3 +489,19 @@ int xsched_init_entity(struct xsched_context *ctx, struct vstream_info *vs)
 
 	return err;
 }
+static void xsched_register_sched_class(struct xsched_class *sched)
+{
+	list_add_tail(&sched->node, &xsched_class_list);
+}
+
+__init int xsched_sched_init(void)
+{
+	INIT_LIST_HEAD(&xsched_class_list);
+#ifdef CONFIG_XCU_SCHED_RT
+	xsched_register_sched_class(&rt_xsched_class);
+#endif
+
+	return 0;
+}
+late_initcall(xsched_sched_init);
+
diff --git a/kernel/xsched/rt.c b/kernel/xsched/rt.c
new file mode 100644
index 000000000000..41b60e341679
--- /dev/null
+++ b/kernel/xsched/rt.c
@@ -0,0 +1,281 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Real-Time Scheduling Class for XPU device
+ *
+ * Copyright (C) 2025-2026 Huawei Technologies Co., Ltd
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ */
+
+#include <uapi/linux/sched/types.h>
+#include <linux/hash.h>
+#include <linux/hashtable.h>
+#include <linux/kthread.h>
+#include <linux/slab.h>
+#include <linux/xsched.h>
+#include <linux/vstream.h>
+
+#define XSCHED_RT_TIMESLICE (10 * NSEC_PER_MSEC)
+
+#define TGID_HASH_BITS 8
+
+/* Mapping between tgid and priority */
+struct tgid_prio {
+	pid_t tgid;
+	int32_t prio;
+	struct hlist_node hnode;
+};
+
+static DEFINE_HASHTABLE(tgid_prio_map, TGID_HASH_BITS);
+static DEFINE_SPINLOCK(tgid_prio_lock);
+
+static int tgid_prio_insert(pid_t tgid, int32_t prio)
+{
+	struct tgid_prio *new_map;
+	unsigned int hash_key;
+
+	if (prio >= NR_XSE_PRIO)
+		return -EINVAL;
+
+	new_map = kzalloc(sizeof(struct tgid_prio), GFP_KERNEL);
+	if (!new_map) {
+		XSCHED_ERR("Failed to alloc mapping (tgid=%d) @ %s\n",
+			   tgid, __func__);
+		return -ENOMEM;
+	}
+
+	new_map->tgid = tgid;
+	new_map->prio = prio;
+
+	hash_key = hash_32(tgid, TGID_HASH_BITS);
+
+	spin_lock(&tgid_prio_lock);
+	hash_add_rcu(tgid_prio_map, &new_map->hnode, hash_key);
+	spin_unlock(&tgid_prio_lock);
+
+	return 0;
+}
+
+static struct tgid_prio *tgid_prio_find(pid_t tgid)
+{
+	struct tgid_prio *map = NULL;
+	unsigned int hash_key = hash_32(tgid, TGID_HASH_BITS);
+
+	rcu_read_lock();
+	hash_for_each_possible_rcu(tgid_prio_map, map, hnode, hash_key) {
+		if (map->tgid == tgid)
+			break;
+	}
+	rcu_read_unlock();
+	return map;
+}
+
+static void tgid_prio_delete(pid_t tgid)
+{
+	struct tgid_prio *map;
+	unsigned int hash_key = hash_32(tgid, TGID_HASH_BITS);
+
+	spin_lock(&tgid_prio_lock);
+	hash_for_each_possible(tgid_prio_map, map, hnode, hash_key) {
+		if (map->tgid == tgid) {
+			hash_del_rcu(&map->hnode);
+			spin_unlock(&tgid_prio_lock);
+			kfree(map);
+			return;
+		}
+	}
+	spin_unlock(&tgid_prio_lock);
+}
+
+static inline void
+xse_rt_add(struct xsched_entity *xse, struct xsched_cu *xcu)
+{
+	list_add_tail(&xse->rt.list_node, &xcu->xrq.rt.rq[xse->rt.prio]);
+}
+
+static inline void xse_rt_del(struct xsched_entity *xse)
+{
+	list_del_init(&xse->rt.list_node);
+}
+
+static inline void xse_rt_move_tail(struct xsched_entity *xse)
+{
+	struct xsched_cu *xcu = xse->xcu;
+
+	list_move_tail(&xse->rt.list_node, &xcu->xrq.rt.rq[xse->rt.prio]);
+}
+
+/* Increase RT runqueue total nr_running stat. */
+static inline void xrq_inc_nr_running(struct xsched_entity *xse,
+				      struct xsched_cu *xcu)
+{
+	xcu->xrq.rt.nr_running++;
+}
+
+/* Decrease RT runqueue total nr_running stat.
+ * nr_running must never drop below zero here.
+ */
+static inline void xrq_dec_nr_running(struct xsched_entity *xse)
+{
+	struct xsched_cu *xcu = xse->xcu;
+
+	xcu->xrq.rt.nr_running--;
+}
+
+static void dequeue_ctx_rt(struct xsched_entity *xse)
+{
+	xse_rt_del(xse);
+	xrq_dec_nr_running(xse);
+}
+
+static void enqueue_ctx_rt(struct xsched_entity *xse, struct xsched_cu *xcu)
+{
+	xse_rt_add(xse, xcu);
+	xrq_inc_nr_running(xse, xcu);
+}
+
+static inline struct xsched_entity *xrq_next_xse(struct xsched_cu *xcu,
+						 int prio)
+{
+	return list_first_entry(&xcu->xrq.rt.rq[prio], struct xsched_entity,
+				rt.list_node);
+}
+
+/* Return the next priority for pick_next_ctx, taking into
+ * account whether there are pending kicks at a given priority.
+ */
+static inline uint32_t get_next_prio_rt(struct xsched_rq *xrq)
+{
+	unsigned int curr_prio;
+
+	for_each_xse_prio(curr_prio) {
+		if (!list_empty(&xrq->rt.rq[curr_prio]))
+			return curr_prio;
+	}
+	return NR_XSE_PRIO;
+}
+
+static struct xsched_entity *pick_next_ctx_rt(struct xsched_cu *xcu)
+{
+	struct xsched_entity *result;
+	int next_prio;
+
+	next_prio = get_next_prio_rt(&xcu->xrq);
+	if (next_prio >= NR_XSE_PRIO) {
+		XSCHED_DEBUG("No pending kicks in RT class @ %s\n", __func__);
+		return NULL;
+	}
+
+	result = xrq_next_xse(xcu, next_prio);
+	if (!result)
+		XSCHED_ERR("Next XSE not found @ %s\n", __func__);
+	else
+		XSCHED_DEBUG("Next XSE %u at prio %u @ %s\n", result->tgid, next_prio, __func__);
+
+	return result;
+}
+
+static void put_prev_ctx_rt(struct xsched_entity *xse)
+{
+	xse->rt.timeslice -= xse->last_exec_runtime;
+	XSCHED_DEBUG(
+		"Update XSE=%d timeslice=%lld, XSE submitted=%lld in RT class @ %s\n",
+		xse->tgid, xse->rt.timeslice,
+		xse->last_exec_runtime, __func__);
+
+	if (xse->rt.timeslice <= 0) {
+		xse->rt.timeslice = XSCHED_RT_TIMESLICE;
+		XSCHED_DEBUG("Refill XSE=%d timeslice=%lld in RT class @ %s\n",
+			     xse->tgid, xse->rt.timeslice, __func__);
+		xse_rt_move_tail(xse);
+	}
+}
+
+static bool check_preempt_ctx_rt(struct xsched_entity *xse)
+{
+	return true;
+}
+
+void rq_init_rt(struct xsched_cu *xcu)
+{
+	int prio = 0;
+
+	xcu->xrq.rt.nr_running = 0;
+
+	for_each_xse_prio(prio) {
+		INIT_LIST_HEAD(&xcu->xrq.rt.rq[prio]);
+	}
+}
+
+void xse_init_rt(struct xsched_entity *xse)
+{
+	struct tgid_prio *map = tgid_prio_find(xse->tgid);
+
+	xse->rt.prio = (map) ? map->prio : XSE_PRIO_DFLT;
+	XSCHED_DEBUG("Xse init: set priority=%d.\n", xse->rt.prio);
+	xse->rt.timeslice = XSCHED_RT_TIMESLICE;
+	INIT_LIST_HEAD(&xse->rt.list_node);
+}
+
+void xse_deinit_rt(struct xsched_entity *xse)
+{
+	struct tgid_prio *map = tgid_prio_find(xse->tgid);
+
+	if (map) {
+		tgid_prio_delete(xse->tgid);
+		XSCHED_DEBUG("Map deleted: tgid=%d\n", xse->tgid);
+	}
+}
+
+struct xsched_class rt_xsched_class = {
+	.class_id = XSCHED_TYPE_RT,
+	.kick_slice = XSCHED_RT_KICK_SLICE,
+	.rq_init = rq_init_rt,
+	.xse_init = xse_init_rt,
+	.xse_deinit = xse_deinit_rt,
+	.dequeue_ctx = dequeue_ctx_rt,
+	.enqueue_ctx = enqueue_ctx_rt,
+	.pick_next_ctx = pick_next_ctx_rt,
+	.put_prev_ctx = put_prev_ctx_rt,
+	.check_preempt = check_preempt_ctx_rt
+};
+
+int xsched_rt_prio_set(pid_t tgid, unsigned int prio)
+{
+	unsigned int id;
+	struct xsched_cu *xcu;
+	struct xsched_context *ctx;
+	struct xsched_entity *xse;
+
+	tgid_prio_delete(tgid);
+	tgid_prio_insert(tgid, prio);
+
+	for_each_active_xcu(xcu, id) {
+		mutex_lock(&xcu->ctx_list_lock);
+		mutex_lock(&xcu->xcu_lock);
+
+		ctx = ctx_find_by_tgid_and_xcu(tgid, xcu);
+		if (ctx) {
+			xse = &ctx->xse;
+			xse->rt.prio = clamp_t(unsigned int, prio, XSE_PRIO_HIGH, XSE_PRIO_LOW);
+			if (xse->on_rq) {
+				xse_rt_del(xse);
+				xse_rt_add(xse, xcu);
+			}
+		}
+
+		mutex_unlock(&xcu->xcu_lock);
+		mutex_unlock(&xcu->ctx_list_lock);
+	}
+
+	return 0;
+}
-- 
2.34.1