[PATCH OLK-6.6 0/8] xsched: XCU Partition
Alekseev Dmitry (2):
  xsched: Add XCU control group implementation and its backend in xsched CFS
  xsched: Add support for CFS quota for cgroups

Konstantin Meskhidze (6):
  xsched: Add base vstream support
  xcu: Add base NPU driver support
  xsched: Introduce vstream management
  xsched: Add basic scheduler core support
  xsched: Add xsched RT class
  xsched: Add xsched CFS class

 arch/arm/include/uapi/asm/unistd.h     |   1 +
 arch/arm64/configs/openeuler_defconfig |   1 +
 arch/powerpc/include/uapi/asm/unistd.h |   1 +
 arch/x86/configs/openeuler_defconfig   |   1 +
 arch/x86/entry/syscalls/syscall_64.tbl |   2 +-
 arch/x86/include/uapi/asm/unistd.h     |   1 +
 drivers/Makefile                       |   1 +
 drivers/xcu/Makefile                   |   2 +
 drivers/xcu/xcu_group.c                | 380 ++++++++++++
 include/linux/cgroup_subsys.h          |   4 +
 include/linux/syscalls.h               |   2 +
 include/linux/vstream.h                |  91 +++
 include/linux/xcu_group.h              |  96 +++
 include/linux/xsched.h                 | 514 ++++++++++++++++
 include/uapi/asm-generic/unistd.h      |   6 +-
 include/uapi/linux/xcu_vstream.h       |  77 +++
 init/Kconfig                           |   1 +
 kernel/Makefile                        |   1 +
 kernel/cgroup/cgroup.c                 |   2 +-
 kernel/xsched/Kconfig                  |  85 +++
 kernel/xsched/Makefile                 |   7 +
 kernel/xsched/cfs.c                    | 237 ++++++++
 kernel/xsched/cfs_quota.c              |  98 ++++
 kernel/xsched/cgroup.c                 | 775 +++++++++++++++++++++++++
 kernel/xsched/core.c                   | 526 +++++++++++++++++
 kernel/xsched/rt.c                     | 281 +++++++++
 kernel/xsched/vstream.c                | 674 +++++++++++++++++++++
 27 files changed, 3863 insertions(+), 4 deletions(-)
 create mode 100644 drivers/xcu/Makefile
 create mode 100644 drivers/xcu/xcu_group.c
 create mode 100644 include/linux/vstream.h
 create mode 100644 include/linux/xcu_group.h
 create mode 100644 include/linux/xsched.h
 create mode 100644 include/uapi/linux/xcu_vstream.h
 create mode 100644 kernel/xsched/Kconfig
 create mode 100644 kernel/xsched/Makefile
 create mode 100644 kernel/xsched/cfs.c
 create mode 100644 kernel/xsched/cfs_quota.c
 create mode 100644 kernel/xsched/cgroup.c
 create mode 100644 kernel/xsched/core.c
 create mode 100644 kernel/xsched/rt.c
 create mode 100644 kernel/xsched/vstream.c
-- 
2.34.1
From: Konstantin Meskhidze <konstantin.meskhidze@huawei.com> hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IC5EHB -------------------------------- Add sys_vstream_manage() syscall. Add the basic function framework. Add basic header files. Add new Kconfig.xsched with XCU_SCHEDULER and XCU_VSTREAM configurations. Create new dir kernel/xsched with vstream.c file with base xsched syscalls stubs. Add Makefile in kernel/xsched. Update main kernel Makefile to include kernel/xsched in build. Signed-off-by: Konstantin Meskhidze <konstantin.meskhidze@huawei.com> Signed-off-by: Hui Tang <tanghui20@.huawei.com> Signed-off-by: Liu Kai <liukai284@huawei.com> Signed-off-by: Xia Fukun <xiafukun@huawei.com> Signed-off-by: Zicheng Qu <quzicheng@huawei.com> --- arch/arm/include/uapi/asm/unistd.h | 1 + arch/arm64/configs/openeuler_defconfig | 1 + arch/powerpc/include/uapi/asm/unistd.h | 1 + arch/x86/configs/openeuler_defconfig | 1 + arch/x86/entry/syscalls/syscall_64.tbl | 2 +- arch/x86/include/uapi/asm/unistd.h | 1 + include/linux/syscalls.h | 2 + include/linux/vstream.h | 13 +++++ include/linux/xsched.h | 35 ++++++++++++ include/uapi/asm-generic/unistd.h | 6 +- include/uapi/linux/xcu_vstream.h | 73 +++++++++++++++++++++++++ init/Kconfig | 1 + kernel/Makefile | 1 + kernel/xsched/Kconfig | 27 +++++++++ kernel/xsched/Makefile | 2 + kernel/xsched/vstream.c | 76 ++++++++++++++++++++++++++ 16 files changed, 240 insertions(+), 3 deletions(-) create mode 100644 include/linux/vstream.h create mode 100644 include/linux/xsched.h create mode 100644 include/uapi/linux/xcu_vstream.h create mode 100644 kernel/xsched/Kconfig create mode 100644 kernel/xsched/Makefile create mode 100644 kernel/xsched/vstream.c diff --git a/arch/arm/include/uapi/asm/unistd.h b/arch/arm/include/uapi/asm/unistd.h index a1149911464c..df413d769767 100644 --- a/arch/arm/include/uapi/asm/unistd.h +++ b/arch/arm/include/uapi/asm/unistd.h @@ -14,6 +14,7 @@ #ifndef _UAPI__ASM_ARM_UNISTD_H #define _UAPI__ASM_ARM_UNISTD_H +#define __IGNORE_kabi_reserved454 #define __NR_OABI_SYSCALL_BASE 0x900000 #define __NR_SYSCALL_MASK 0x0fffff diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig index f3ba33f1f521..84c7ea73d335 100644 --- a/arch/arm64/configs/openeuler_defconfig +++ b/arch/arm64/configs/openeuler_defconfig @@ -97,6 +97,7 @@ CONFIG_PREEMPT_NONE=y # CONFIG_PREEMPT_VOLUNTARY is not set # CONFIG_PREEMPT is not set # CONFIG_PREEMPT_DYNAMIC is not set +CONFIG_XCU_SCHEDULER=n # # CPU/Task time and stats accounting diff --git a/arch/powerpc/include/uapi/asm/unistd.h b/arch/powerpc/include/uapi/asm/unistd.h index 5f84e3dc98d0..c9993b5dc704 100644 --- a/arch/powerpc/include/uapi/asm/unistd.h +++ b/arch/powerpc/include/uapi/asm/unistd.h @@ -9,6 +9,7 @@ */ #ifndef _UAPI_ASM_POWERPC_UNISTD_H_ #define _UAPI_ASM_POWERPC_UNISTD_H_ +#define __IGNORE_kabi_reserved454 #ifndef __powerpc64__ #include <asm/unistd_32.h> diff --git a/arch/x86/configs/openeuler_defconfig b/arch/x86/configs/openeuler_defconfig index 6b9fa19c873a..5c015e32b808 100644 --- a/arch/x86/configs/openeuler_defconfig +++ b/arch/x86/configs/openeuler_defconfig @@ -117,6 +117,7 @@ CONFIG_PREEMPT_NONE=y # CONFIG_PREEMPT_VOLUNTARY is not set # CONFIG_PREEMPT is not set # CONFIG_PREEMPT_DYNAMIC is not set +CONFIG_XCU_SCHEDULER=n # # CPU/Task time and stats accounting diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index f88268a37ec2..162517343cb1 100644 --- 
a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -375,7 +375,7 @@ 451 common cachestat sys_cachestat 452 common fchmodat2 sys_fchmodat2 453 64 map_shadow_stack sys_map_shadow_stack -454 common kabi_reserved454 sys_ni_syscall +454 common vstream_manage sys_vstream_manage 455 common kabi_reserved455 sys_ni_syscall 456 common kabi_reserved456 sys_ni_syscall 457 common kabi_reserved457 sys_ni_syscall diff --git a/arch/x86/include/uapi/asm/unistd.h b/arch/x86/include/uapi/asm/unistd.h index be5e2e747f50..5d81c4bb9803 100644 --- a/arch/x86/include/uapi/asm/unistd.h +++ b/arch/x86/include/uapi/asm/unistd.h @@ -11,6 +11,7 @@ * thing regardless. */ #define __X32_SYSCALL_BIT 0x40000000 +#define __IGNORE_kabi_reserved454 #ifndef __KERNEL__ # ifdef __i386__ diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 36c592e43d65..119aabc72a2d 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -74,6 +74,7 @@ struct landlock_ruleset_attr; enum landlock_rule_type; struct cachestat_range; struct cachestat; +struct vstream_args; #include <linux/types.h> #include <linux/aio_abi.h> @@ -948,6 +949,7 @@ asmlinkage long sys_cachestat(unsigned int fd, struct cachestat __user *cstat, unsigned int flags); asmlinkage long sys_map_shadow_stack(unsigned long addr, unsigned long size, unsigned int flags); +asmlinkage long sys_vstream_manage(struct vstream_args __user *arg, int cmd); /* * Architecture-specific system calls */ diff --git a/include/linux/vstream.h b/include/linux/vstream.h new file mode 100644 index 000000000000..891734e8ce04 --- /dev/null +++ b/include/linux/vstream.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_VSTREAM_H +#define _LINUX_VSTREAM_H + +#include <uapi/linux/xcu_vstream.h> + +typedef int vstream_manage_t(struct vstream_args *arg); + +int vstream_alloc(struct vstream_args *arg); +int vstream_free(struct vstream_args *arg); +int vstream_kick(struct vstream_args *arg); + +#endif /* _LINUX_VSTREAM_H */ diff --git a/include/linux/xsched.h b/include/linux/xsched.h new file mode 100644 index 000000000000..dc840136a35f --- /dev/null +++ b/include/linux/xsched.h @@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __LINUX_XSCHED_H__ +#define __LINUX_XSCHED_H__ + +#ifndef pr_fmt +#define pr_fmt(fmt) fmt +#endif + +#define XSCHED_INFO_PREFIX "XSched [INFO]: " +#define XSCHED_INFO(fmt, ...) \ + pr_info(pr_fmt(XSCHED_INFO_PREFIX fmt), ##__VA_ARGS__) + +#define XSCHED_ERR_PREFIX "XSched [ERROR]: " +#define XSCHED_ERR(fmt, ...) \ + pr_err(pr_fmt(XSCHED_ERR_PREFIX fmt), ##__VA_ARGS__) + +#define XSCHED_WARN_PREFIX "XSched [WARNING]: " +#define XSCHED_WARN(fmt, ...) \ + pr_warn(pr_fmt(XSCHED_WARN_PREFIX fmt), ##__VA_ARGS__) + +/* + * Debug specific prints for XSched + */ + +#define XSCHED_DEBUG_PREFIX "XSched [DEBUG]: " +#define XSCHED_DEBUG(fmt, ...) 
\ + pr_debug(pr_fmt(XSCHED_DEBUG_PREFIX fmt), ##__VA_ARGS__) + +#define XSCHED_CALL_STUB() \ + XSCHED_DEBUG(" -----* %s @ %s called *-----\n", __func__, __FILE__) + +#define XSCHED_EXIT_STUB() \ + XSCHED_DEBUG(" -----* %s @ %s exited *-----\n", __func__, __FILE__) + +#endif /* !__LINUX_XSCHED_H__ */ diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index bf2b30463784..ea50d1a3471c 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -826,8 +826,10 @@ __SYSCALL(__NR_fchmodat2, sys_fchmodat2) #define __NR_map_shadow_stack 453 __SYSCALL(__NR_map_shadow_stack, sys_map_shadow_stack) -#define __NR_kabi_reserved454 454 -__SYSCALL(__NR_kabi_reserved454, sys_ni_syscall) +#define __IGNORE_kabi_reserved454 +#define __NR_vstream_manage 454 +__SYSCALL(__NR_vstream_manage, sys_vstream_manage) + #define __NR_kabi_reserved455 455 __SYSCALL(__NR_kabi_reserved455, sys_ni_syscall) #define __NR_kabi_reserved456 456 diff --git a/include/uapi/linux/xcu_vstream.h b/include/uapi/linux/xcu_vstream.h new file mode 100644 index 000000000000..4d65789c37c7 --- /dev/null +++ b/include/uapi/linux/xcu_vstream.h @@ -0,0 +1,73 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _UAPI_XCU_VSTREAM_H +#define _UAPI_XCU_VSTREAM_H + +#include <linux/types.h> + +#define PAYLOAD_SIZE_MAX 512 +#define XCU_SQE_SIZE_MAX 64 + +#define KABI_RESERVE_BYTES(idx, n) \ + __u8 __kabi_reserved_##idx[n] + +/* + * VSTREAM_ALLOC: alloc a vstream, buffer for tasks + * VSTREAM_FREE: free a vstream + * VSTREAM_KICK: there are tasks to be executed in the vstream + */ +typedef enum VSTREAM_COMMAND { + VSTREAM_ALLOC = 0, + VSTREAM_FREE, + VSTREAM_KICK, + MAX_COMMAND +} vstream_command_t; + +typedef struct vstream_alloc_args { + __s32 type; + __u32 user_stream_id; + + KABI_RESERVE_BYTES(0, 8); + KABI_RESERVE_BYTES(1, 8); + KABI_RESERVE_BYTES(2, 8); +} vstream_alloc_args_t; + +typedef struct vstream_free_args { + KABI_RESERVE_BYTES(0, 8); + KABI_RESERVE_BYTES(1, 8); + KABI_RESERVE_BYTES(2, 8); +} vstream_free_args_t; + +typedef struct vstream_kick_args { + __u32 sqe_num; + __s32 timeout; + __s8 sqe[XCU_SQE_SIZE_MAX]; + + KABI_RESERVE_BYTES(0, 8); + KABI_RESERVE_BYTES(1, 8); + KABI_RESERVE_BYTES(2, 8); +} vstream_kick_args_t; + +typedef struct vstream_args { + __u32 channel_id; + __u32 fd; + __u32 dev_id; + __u32 task_type; + __u32 sq_id; + __u32 cq_id; + + /* Device related structures. 
*/ + union { + vstream_alloc_args_t va_args; + vstream_free_args_t vf_args; + vstream_kick_args_t vk_args; + }; + + __u32 payload_size; + __s8 payload[PAYLOAD_SIZE_MAX]; + + KABI_RESERVE_BYTES(0, 8); + KABI_RESERVE_BYTES(1, 8); + KABI_RESERVE_BYTES(2, 8); +} vstream_args_t; + +#endif /* _UAPI_LINUX_SCHED_H */ diff --git a/init/Kconfig b/init/Kconfig index 2720083aaa17..b3c4487fa631 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -488,6 +488,7 @@ source "kernel/time/Kconfig" source "kernel/bpf/Kconfig" source "kernel/bpf-rvi/Kconfig" source "kernel/Kconfig.preempt" +source "kernel/xsched/Kconfig" menu "CPU/Task time and stats accounting" diff --git a/kernel/Makefile b/kernel/Makefile index da4c2d1838dc..fe3559ee90d9 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -50,6 +50,7 @@ obj-y += rcu/ obj-y += livepatch/ obj-y += dma/ obj-y += entry/ +obj-y += xsched/ obj-$(CONFIG_MODULES) += module/ obj-$(CONFIG_KCMP) += kcmp.o diff --git a/kernel/xsched/Kconfig b/kernel/xsched/Kconfig new file mode 100644 index 000000000000..c2d587f6d57a --- /dev/null +++ b/kernel/xsched/Kconfig @@ -0,0 +1,27 @@ +# SPDX-License-Identifier: GPL-2.0 + +config XCU_SCHEDULER + bool "Enable XSched functionality" + default n + select XCU_VSTREAM + help + This option enables the XSched scheduler, a custom scheduling mechanism + designed for heterogeneous compute units (e.g., XPUs). It provides: + - Priority-based task scheduling with latency-sensitive optimizations. + - Integration with cgroups (via CGROUP_XCU) for resource isolation. + + Enable this only if your system requires advanced scheduling for XPU workloads. + If unsure, say N. + +config XCU_VSTREAM + bool "Enable vstream SQ/CQ buffers maintaining for XPU" + default n + depends on XCU_SCHEDULER + help + This option enables virtual stream (vstream) support for XPUs, managing + submission queues (SQ) and completion queues (CQ) in kernel space. Key features: + - Zero-copy buffer management between user and kernel space. + - Batch processing of XPU commands to reduce MMIO overhead. + + Requires XCU_SCHEDULER to be enabled. May increase kernel memory usage. + Recommended for high-throughput XPU workloads. If unsure, say N. diff --git a/kernel/xsched/Makefile b/kernel/xsched/Makefile new file mode 100644 index 000000000000..e972cd93b607 --- /dev/null +++ b/kernel/xsched/Makefile @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0 +obj-y += vstream.o diff --git a/kernel/xsched/vstream.c b/kernel/xsched/vstream.c new file mode 100644 index 000000000000..a20c9594b21e --- /dev/null +++ b/kernel/xsched/vstream.c @@ -0,0 +1,76 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Vstream manage for XPU device + * + * Copyright (C) 2025-2026 Huawei Technologies Co., Ltd + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ * + */ +#include <linux/syscalls.h> +#include <linux/vstream.h> +#include <linux/xsched.h> + +#ifdef CONFIG_XCU_VSTREAM +int vstream_alloc(struct vstream_args *arg) +{ + return 0; +} + +int vstream_free(struct vstream_args *arg) +{ + return 0; +} + +int vstream_kick(struct vstream_args *arg) +{ + return 0; +} + +/* + * vstream_manage_cmd table + */ +static vstream_manage_t(*vstream_command_table[MAX_COMMAND + 1]) = { + vstream_alloc, // VSTREAM_ALLOC + vstream_free, // VSTREAM_FREE + vstream_kick, // VSTREAM_KICK + NULL // MAX_COMMAND +}; + +SYSCALL_DEFINE2(vstream_manage, struct vstream_args __user *, arg, int, cmd) +{ + int res = 0; + struct vstream_args vstream_arg; + + if (cmd < 0 || cmd >= MAX_COMMAND) { + XSCHED_ERR("Invalid cmd value: %d, valid range is 0 to %d\n", cmd, MAX_COMMAND - 1); + return -EINVAL; + } + + if (copy_from_user(&vstream_arg, arg, sizeof(struct vstream_args))) { + XSCHED_ERR("copy_from_user failed\n"); + return -EFAULT; + } + + res = vstream_command_table[cmd](&vstream_arg); + if (copy_to_user(arg, &vstream_arg, sizeof(struct vstream_args))) { + XSCHED_ERR("copy_to_user failed\n"); + return -EFAULT; + } + + XSCHED_DEBUG("vstream_manage: cmd %d\n", cmd); + return res; +} +#else +SYSCALL_DEFINE2(vstream_manage, struct vstream_args __user *, arg, int, cmd) +{ + return 0; +} +#endif \ No newline at end of file -- 2.34.1
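Usage example (illustrative only, not part of the series): a minimal userspace sketch of how the new vstream_manage() syscall is expected to be driven. Syscall number 454 and the vstream_args/VSTREAM_* definitions come from the UAPI changes above; at this point in the series the handlers are stubs that return 0, and the device/channel ids and user_stream_id below are placeholders. Later patches interpret va_args.type == 0 as an SQ/CQ allocation request.

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/xcu_vstream.h>

#ifndef __NR_vstream_manage
#define __NR_vstream_manage 454
#endif

static long vstream_manage(struct vstream_args *arg, int cmd)
{
	return syscall(__NR_vstream_manage, arg, cmd);
}

int main(void)
{
	struct vstream_args arg;

	memset(&arg, 0, sizeof(arg));
	arg.dev_id = 0;			/* placeholder device id */
	arg.channel_id = 0;		/* placeholder channel id */
	arg.va_args.type = 0;		/* 0: request an SQ/CQ pair */
	arg.va_args.user_stream_id = 1;	/* caller-chosen stream id */

	if (vstream_manage(&arg, VSTREAM_ALLOC) < 0)
		perror("VSTREAM_ALLOC");
	else
		printf("vstream allocated: sq_id=%u cq_id=%u\n",
		       arg.sq_id, arg.cq_id);

	if (vstream_manage(&arg, VSTREAM_FREE) < 0)
		perror("VSTREAM_FREE");

	return 0;
}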
From: Konstantin Meskhidze <konstantin.meskhidze@huawei.com> hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IC5EHB ----------------------------------------- Add base xcu_group structure, xcu_type enum, xcu_operation struct Add build support in Makefiles. Signed-off-by: Konstantin Meskhidze <konstantin.meskhidze@huawei.com> Signed-off-by: Hui Tang <tanghui20@.huawei.com> Signed-off-by: Liu Kai <liukai284@huawei.com> Signed-off-by: Xia Fukun <xiafukun@huawei.com> Signed-off-by: Zicheng Qu <quzicheng@huawei.com> --- drivers/Makefile | 1 + drivers/xcu/Makefile | 2 + drivers/xcu/xcu_group.c | 362 ++++++++++++++++++++++++++++++++++++++ include/linux/xcu_group.h | 89 ++++++++++ include/linux/xsched.h | 28 +++ kernel/xsched/Kconfig | 9 + kernel/xsched/Makefile | 2 + kernel/xsched/core.c | 58 ++++++ kernel/xsched/vstream.c | 2 +- 9 files changed, 552 insertions(+), 1 deletion(-) create mode 100644 drivers/xcu/Makefile create mode 100644 drivers/xcu/xcu_group.c create mode 100644 include/linux/xcu_group.h create mode 100644 kernel/xsched/core.c diff --git a/drivers/Makefile b/drivers/Makefile index f8e58f0ca2d1..57826d4b5cd7 100644 --- a/drivers/Makefile +++ b/drivers/Makefile @@ -197,6 +197,7 @@ obj-$(CONFIG_GNSS) += gnss/ obj-$(CONFIG_INTERCONNECT) += interconnect/ obj-$(CONFIG_COUNTER) += counter/ obj-$(CONFIG_MOST) += most/ +obj-$(CONFIG_XCU_SCHEDULER) += xcu/ obj-$(CONFIG_PECI) += peci/ obj-$(CONFIG_HTE) += hte/ obj-$(CONFIG_DRM_ACCEL) += accel/ diff --git a/drivers/xcu/Makefile b/drivers/xcu/Makefile new file mode 100644 index 000000000000..575115b148ec --- /dev/null +++ b/drivers/xcu/Makefile @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0 +obj-$(CONFIG_XCU_SCHEDULER) += xcu_group.o diff --git a/drivers/xcu/xcu_group.c b/drivers/xcu/xcu_group.c new file mode 100644 index 000000000000..3215f37e4ece --- /dev/null +++ b/drivers/xcu/xcu_group.c @@ -0,0 +1,362 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Code for NPU driver support + * + * Copyright (C) 2025-2026 Huawei Technologies Co., Ltd + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ * + */ +#include <linux/kthread.h> +#include <linux/rwsem.h> +#include <linux/slab.h> +#include <linux/xcu_group.h> +#include <linux/xsched.h> + +static int num_active_xcu; +static DEFINE_SPINLOCK(xcu_mgr_lock); +struct xsched_cu *xsched_cu_mgr[XSCHED_NR_CUS]; +static DECLARE_RWSEM(xcu_group_rwsem); +struct xcu_group *xcu_group_init(int id) +{ + struct xcu_group *node = kzalloc(sizeof(*node), GFP_KERNEL); + + if (!node) + return NULL; + + node->id = id; + node->type = XCU_TYPE_XPU; + idr_init(&node->next_layer); + return node; +} +EXPORT_SYMBOL(xcu_group_init); + +int __xcu_group_attach(struct xcu_group *new_group, + struct xcu_group *previous_group) +{ + int id = new_group->id; + + if (id == -1) + id = idr_alloc(&previous_group->next_layer, new_group, 0, + INT_MAX, GFP_KERNEL); + else + id = idr_alloc(&previous_group->next_layer, new_group, id, + id + 1, GFP_KERNEL); + + if (id < 0) { + XSCHED_ERR("Fail to attach xcu_group: id conflict @ %s\n", + __func__); + return -EEXIST; + } + new_group->id = id; + new_group->previous_layer = previous_group; + + return 0; +} + +int xcu_group_attach(struct xcu_group *new_group, + struct xcu_group *previous_group) +{ + int ret; + + down_write(&xcu_group_rwsem); + ret = __xcu_group_attach(new_group, previous_group); + up_write(&xcu_group_rwsem); + + return ret; +} +EXPORT_SYMBOL(xcu_group_attach); + +static inline void __xcu_group_detach(struct xcu_group *group) +{ + if (!group || !group->previous_layer) + return; + + idr_remove(&group->previous_layer->next_layer, group->id); + group->previous_layer = NULL; +} + +void xcu_group_detach(struct xcu_group *group) +{ + down_write(&xcu_group_rwsem); + __xcu_group_detach(group); + up_write(&xcu_group_rwsem); +} +EXPORT_SYMBOL(xcu_group_detach); + +void xcu_group_free(struct xcu_group *group) +{ + idr_destroy(&group->next_layer); + if (group != xcu_group_root) + kfree(group); +} +EXPORT_SYMBOL(xcu_group_free); + +static struct xcu_group *__xcu_group_find_nolock(struct xcu_group *group, int id) +{ + return idr_find(&group->next_layer, id); +} + +struct xcu_group *xcu_group_find(struct xcu_group *group, int id) +{ + struct xcu_group *result; + + down_read(&xcu_group_rwsem); + result = __xcu_group_find_nolock(group, id); + up_read(&xcu_group_rwsem); + + return result; +} +EXPORT_SYMBOL(xcu_group_find); + +/* This function runs "run" callback for a given xcu_group + * and a given vstream that are passed within + * xcu_op_handler_params object + */ +int xcu_run(struct xcu_op_handler_params *params) +{ + if (!params->group->opt || !params->group->opt->run) { + XSCHED_ERR("No function [run] called.\n"); + return -EINVAL; + } + + return params->group->opt->run(params); +} + +/* This function runs "wait" callback for a given xcu_group + * and a given vstream that are passed within + * xcu_op_handler_params object + */ +int xcu_wait(struct xcu_op_handler_params *params) +{ + if (!params->group->opt || !params->group->opt->wait) { + XSCHED_ERR("No function [wait] called.\n"); + return -EINVAL; + } + + return params->group->opt->wait(params); +} + +/* This function runs "complete" callback for a given xcu_group + * and a given vstream that are passed within + * xcu_op_handler_params object. + */ +int xcu_complete(struct xcu_op_handler_params *params) +{ + return 0; +} + +/* This function runs "finish" callback for a given xcu_group + * and a given vstream that are passed within + * xcu_op_handler_params object. 
+ * + * This handler provides an interface to implement deallocation + * and freeing memory for SQ and CQ buffers. + */ +int xcu_finish(struct xcu_op_handler_params *params) +{ + if (!params->group->opt || !params->group->opt->finish) { + XSCHED_ERR("No function [finish] called.\n"); + return -EINVAL; + } + + return params->group->opt->finish(params); +} + +/* This function runs a "alloc" callback for a given xcu_group + * and a given vstream that are passed within + * xcu_op_handler_params object. + * + * This handler provides an interface to implement allocation + * and registering memory for SQ and CQ buffers. + */ +int xcu_alloc(struct xcu_op_handler_params *params) +{ + if (!params->group->opt || !params->group->opt->alloc) { + XSCHED_ERR("No function [alloc] called.\n"); + return -EINVAL; + } + + return params->group->opt->alloc(params); +} + +/* This function runs a "logic_alloc" callback for a given xcu_group + * and a given vstream that are passed within + * xcu_op_handler_params object. + * + * This handler provides an interface to implement allocation + * and registering memory of logic CQ buffer. + */ +int xcu_logic_alloc(struct xcu_op_handler_params *params) +{ + if (!params->group->opt || !params->group->opt->logic_alloc) { + XSCHED_ERR("No function [logic_alloc] called.\n"); + return -EINVAL; + } + + return params->group->opt->logic_alloc(params); +} + +/* This function runs a "logic_free" callback for a given xcu_group + * and a given vstream that are passed within + * xcu_op_handler_params object. + * + * This handler provides an interface to implement deallocation + * and unregistering memory of a logic CQ buffer. + */ +int xcu_logic_free(struct xcu_op_handler_params *params) +{ + if (!params->group->opt || !params->group->opt->logic_free) { + XSCHED_ERR("No function [logic_free] called.\n"); + return -EINVAL; + } + + return params->group->opt->logic_free(params); +} + +static struct xcu_group __xcu_group_root = { + .id = 0, + .type = XCU_TYPE_ROOT, + .next_layer = IDR_INIT(next_layer), +}; + +struct xcu_group *xcu_group_root = &__xcu_group_root; +EXPORT_SYMBOL(xcu_group_root); + +static int nr_active_cu_inc(void) +{ + int cur_num = -1; + + spin_lock(&xcu_mgr_lock); + if (num_active_xcu >= XSCHED_NR_CUS) + goto out_unlock; + + cur_num = num_active_xcu; + num_active_xcu++; + +out_unlock: + spin_unlock(&xcu_mgr_lock); + return cur_num; +} + +static int nr_active_cu_dec(void) +{ + int cur_num = -1; + + spin_lock(&xcu_mgr_lock); + if (num_active_xcu <= 0) + goto out_unlock; + + cur_num = num_active_xcu; + num_active_xcu--; + +out_unlock: + spin_unlock(&xcu_mgr_lock); + return cur_num; +} + +/* + * Initialize and register xcu in xcu_manager array. 
+ */ +int xsched_xcu_register(struct xcu_group *group, uint32_t phys_id) +{ + int xcu_cur_num, ret = 0; + struct xsched_cu *xcu; + + if (phys_id >= XSCHED_NR_CUS) { + XSCHED_ERR("phys_id [%u] is out of valid range [0, %d).\n", + phys_id, XSCHED_NR_CUS); + return -EINVAL; + } + + if (!group) { + XSCHED_ERR("group cannot be NULL.\n"); + return -EINVAL; + } + + xcu_cur_num = nr_active_cu_inc(); + if (xcu_cur_num < 0) { + XSCHED_ERR("Number of present XCU's exceeds %d: %d.\n", + XSCHED_NR_CUS, num_active_xcu); + return -ENOSPC; + } + + xcu = kzalloc(sizeof(struct xsched_cu), GFP_KERNEL); + if (!xcu) { + if (nr_active_cu_dec() < 0) { + XSCHED_ERR("num_active_xcu [%d] must be > 0.\n", + num_active_xcu); + } + XSCHED_ERR("Fail to alloc xcu for phys_id [%u].\n", phys_id); + return -ENOMEM; + } + + group->xcu = xcu; + xsched_cu_mgr[phys_id] = xcu; + + /* Init xcu's internals. */ + ret = xsched_xcu_init(xcu, group, phys_id); + if (ret != 0) { + group->xcu = NULL; + xsched_cu_mgr[phys_id] = NULL; + kfree(xcu); + XSCHED_ERR("Fail to init xcu[%u].", xcu->id); + return ret; + } + + return 0; +} +EXPORT_SYMBOL(xsched_xcu_register); + +int xsched_xcu_unregister(struct xcu_group *group, uint32_t phys_id) +{ + struct xsched_cu *xcu; + + if (phys_id >= XSCHED_NR_CUS) { + XSCHED_ERR("phys_id [%u] is out of valid range [0, %d).\n", + phys_id, XSCHED_NR_CUS); + return -EINVAL; + } + + if (!group || !group->xcu || group->xcu != xsched_cu_mgr[phys_id]) { + XSCHED_ERR("group is invalid or cannot mapping to phys_id [%u].\n", phys_id); + return -EINVAL; + } + + xcu = group->xcu; + if (!xcu) { + XSCHED_ERR("xcu for phys_id [%u] not found.\n", phys_id); + return -EINVAL; + } + + if (nr_active_cu_dec() < 0) { + XSCHED_ERR("No active XCU\n"); + return -EPERM; + }; + + if (xcu->worker) { + mutex_lock(&xcu->xcu_lock); + wake_up_interruptible(&xcu->wq_xcu_idle); + mutex_unlock(&xcu->xcu_lock); + + kthread_stop(xcu->worker); + xcu->worker = NULL; + } else { + XSCHED_ERR("The worker for xcu [%u] must not be NULL.\n", xcu->id); + } + + group->xcu = NULL; + xsched_cu_mgr[phys_id] = NULL; + kfree(xcu); + + return 0; +} +EXPORT_SYMBOL(xsched_xcu_unregister); + diff --git a/include/linux/xcu_group.h b/include/linux/xcu_group.h new file mode 100644 index 000000000000..b24641b98e6a --- /dev/null +++ b/include/linux/xcu_group.h @@ -0,0 +1,89 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __XSCHED_XCU_GROUP_H__ +#define __XSCHED_XCU_GROUP_H__ + +#include <linux/idr.h> +#include <uapi/linux/xcu_vstream.h> + +#ifndef CONFIG_XSCHED_NR_CUS +#define CONFIG_XSCHED_NR_CUS 128 +#endif /* !CONFIG_XSCHED_NR_CUS */ +#define XSCHED_NR_CUS CONFIG_XSCHED_NR_CUS + +extern struct xcu_group *xcu_group_root; + +enum xcu_type { + XCU_TYPE_ROOT, + XCU_TYPE_XPU, +}; + +struct xcu_op_handler_params { + int fd; + struct xcu_group *group; + void *payload; + union { + struct { + void *param_1; + void *param_2; + void *param_3; + void *param_4; + void *param_5; + void *param_6; + void *param_7; + void *param_8; + }; + }; +}; + +typedef int (*xcu_op_handler_fn_t)(struct xcu_op_handler_params *params); + +struct xcu_operation { + xcu_op_handler_fn_t run; + xcu_op_handler_fn_t finish; + xcu_op_handler_fn_t wait; + xcu_op_handler_fn_t complete; + xcu_op_handler_fn_t alloc; + xcu_op_handler_fn_t logic_alloc; + xcu_op_handler_fn_t logic_free; +}; + +struct xcu_group { + /* sq id. */ + uint32_t id; + + /* Type of XCU group. */ + enum xcu_type type; + + /* IDR for the next layer of XCU group tree. 
*/ + struct idr next_layer; + + /* Pointer to the previous XCU group in the XCU group tree. */ + struct xcu_group *previous_layer; + + /* Pointer to operation fn pointers object describing + * this XCU group's callbacks. + */ + struct xcu_operation *opt; + + /* Pointer to the XCU related to this XCU group. */ + struct xsched_cu *xcu; +}; + +int xcu_group_attach(struct xcu_group *new_group, + struct xcu_group *previous_group); +void xcu_group_detach(struct xcu_group *group); +struct xcu_group *xcu_group_find(struct xcu_group *group, int id); +struct xcu_group *xcu_group_init(int id); +void xcu_group_free(struct xcu_group *group); + +extern int xcu_run(struct xcu_op_handler_params *params); +extern int xcu_wait(struct xcu_op_handler_params *params); +extern int xcu_complete(struct xcu_op_handler_params *params); +extern int xcu_finish(struct xcu_op_handler_params *params); +extern int xcu_alloc(struct xcu_op_handler_params *params); +extern int xcu_logic_alloc(struct xcu_op_handler_params *params); +extern int xcu_logic_free(struct xcu_op_handler_params *params); + +int xsched_xcu_register(struct xcu_group *group, uint32_t phys_id); +int xsched_xcu_unregister(struct xcu_group *group, uint32_t phys_id); +#endif /* __XSCHED_XCU_GROUP_H__ */ diff --git a/include/linux/xsched.h b/include/linux/xsched.h index dc840136a35f..ed3314ec6572 100644 --- a/include/linux/xsched.h +++ b/include/linux/xsched.h @@ -32,4 +32,32 @@ #define XSCHED_EXIT_STUB() \ XSCHED_DEBUG(" -----* %s @ %s exited *-----\n", __func__, __FILE__) +enum xsched_cu_status { + /* Worker not initialized. */ + XSCHED_XCU_NONE, + + /* Worker is sleeping in idle state. */ + XSCHED_XCU_WAIT_IDLE, + + /* Worker is sleeping in running state. */ + XSCHED_XCU_WAIT_RUNNING, + + /* Worker is active but not processing anything. */ + XSCHED_XCU_ACTIVE, + + NR_XSCHED_XCU_STATUS, +}; + +/* This is the abstraction object of the xcu computing unit. */ +struct xsched_cu { + uint32_t id; + uint32_t state; + struct task_struct *worker; + struct xcu_group *group; + struct mutex xcu_lock; + wait_queue_head_t wq_xcu_idle; +}; + +int xsched_xcu_init(struct xsched_cu *xcu, struct xcu_group *group, int xcu_id); +int xsched_schedule(void *input_xcu); #endif /* !__LINUX_XSCHED_H__ */ diff --git a/kernel/xsched/Kconfig b/kernel/xsched/Kconfig index c2d587f6d57a..8d12b8db5f6d 100644 --- a/kernel/xsched/Kconfig +++ b/kernel/xsched/Kconfig @@ -25,3 +25,12 @@ config XCU_VSTREAM Requires XCU_SCHEDULER to be enabled. May increase kernel memory usage. Recommended for high-throughput XPU workloads. If unsure, say N. + +config XSCHED_NR_CUS + int "Number of CUs (a.k.a. XCUs) available to XSched mechanism" + default 128 + depends on XCU_SCHEDULER + help + This option defines the maximum number of Compute Units (CUs) that can be + managed by the XSched scheduler, consider changing this value proportionally + to the number of available XCU cores. 
diff --git a/kernel/xsched/Makefile b/kernel/xsched/Makefile index e972cd93b607..031b09b9fb4d 100644 --- a/kernel/xsched/Makefile +++ b/kernel/xsched/Makefile @@ -1,2 +1,4 @@ # SPDX-License-Identifier: GPL-2.0 obj-y += vstream.o +xsched_enabled := $(CONFIG_XCU_SCHEDULER) +obj-$(xsched_enabled) += core.o diff --git a/kernel/xsched/core.c b/kernel/xsched/core.c new file mode 100644 index 000000000000..d85379d914f5 --- /dev/null +++ b/kernel/xsched/core.c @@ -0,0 +1,58 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Core kernel scheduler code for XPU device + * + * Copyright (C) 2025-2026 Huawei Technologies Co., Ltd + * + * Author: Konstantin Meskhidze <konstantin.meskhidze@huawei.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + */ +#include <linux/delay.h> +#include <linux/kthread.h> +#include <linux/slab.h> +#include <linux/spinlock_types.h> +#include <linux/types.h> +#include <linux/xsched.h> + +int xsched_schedule(void *input_xcu) +{ + return 0; +} + +/* Initializes all xsched XCU objects. + * Should only be called from xsched_xcu_register function. + */ +int xsched_xcu_init(struct xsched_cu *xcu, struct xcu_group *group, int xcu_id) +{ + int err; + + xcu->id = xcu_id; + xcu->state = XSCHED_XCU_NONE; + xcu->group = group; + + mutex_init(&xcu->xcu_lock); + + /* This worker should set XCU to XSCHED_XCU_WAIT_IDLE. + * If after initialization XCU still has XSCHED_XCU_NONE + * status then we can assume that there was a problem + * with XCU kthread job. + */ + xcu->worker = kthread_run(xsched_schedule, xcu, "xcu_%u", xcu->id); + + if (IS_ERR(xcu->worker)) { + err = PTR_ERR(xcu->worker); + xcu->worker = NULL; + XSCHED_DEBUG("Fail to run the worker to schedule for xcu[%u].", xcu->id); + return err; + } + return 0; +} diff --git a/kernel/xsched/vstream.c b/kernel/xsched/vstream.c index a20c9594b21e..ada4a4d3946e 100644 --- a/kernel/xsched/vstream.c +++ b/kernel/xsched/vstream.c @@ -18,7 +18,7 @@ #include <linux/vstream.h> #include <linux/xsched.h> -#ifdef CONFIG_XCU_VSTREAM +#if defined(CONFIG_XCU_SCHEDULER) && defined(CONFIG_XCU_VSTREAM) int vstream_alloc(struct vstream_args *arg) { return 0; -- 2.34.1
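For reference, a sketch (not part of the series) of how an XPU/NPU driver might plug into the xcu_group tree added above and hand an XCU to XSched. The type/device/channel layering and the use of the XCU_TYPE_XPU value as the first-layer id mirror what the vstream lookup in the next patch expects; the ops table, ids and error handling here are hypothetical, and a real driver would also free the groups on failure.

#include <linux/errno.h>
#include <linux/xcu_group.h>
#include <linux/xsched.h>

static int demo_run(struct xcu_op_handler_params *params)
{
	/* A real driver would submit params->payload to the hardware here. */
	return 0;
}

static int demo_wait(struct xcu_op_handler_params *params)
{
	/* A real driver would wait for hardware completions here. */
	return 0;
}

static struct xcu_operation demo_ops = {
	.run	= demo_run,
	.wait	= demo_wait,
	/* .alloc, .finish, .logic_alloc, ... filled in by a real driver. */
};

static int demo_register_xcu(void)
{
	struct xcu_group *type_grp, *dev_grp, *chan_grp;
	int ret;

	type_grp = xcu_group_init(XCU_TYPE_XPU);	/* layer 1: device type */
	dev_grp = xcu_group_init(0);			/* layer 2: device 0    */
	chan_grp = xcu_group_init(0);			/* layer 3: channel 0   */
	if (!type_grp || !dev_grp || !chan_grp)
		return -ENOMEM;	/* a real driver would free what was allocated */

	chan_grp->opt = &demo_ops;

	ret = xcu_group_attach(type_grp, xcu_group_root);
	if (!ret)
		ret = xcu_group_attach(dev_grp, type_grp);
	if (!ret)
		ret = xcu_group_attach(chan_grp, dev_grp);
	if (ret)
		return ret;

	/* Allocate the backing xsched_cu and start its worker kthread. */
	return xsched_xcu_register(chan_grp, 0 /* phys_id */);
}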
From: Konstantin Meskhidze <konstantin.meskhidze@huawei.com> hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IC5EHB ----------------------------------------- Add vstream related data structures: - vstream_info. Add vstream related context and entity data structures: - xsched_entity - xsched_context Add xsched_init() implementation. Add vstream_alloc/free implementation. Signed-off-by: Konstantin Meskhidze <konstantin.meskhidze@huawei.com> Signed-off-by: Artem Kuzin <artem.kuzin@huawei.com> Signed-off-by: Hui Tang <tanghui20@.huawei.com> Signed-off-by: Liu Kai <liukai284@huawei.com> Signed-off-by: Xia Fukun <xiafukun@huawei.com> Signed-off-by: Zicheng Qu <quzicheng@huawei.com> --- include/linux/vstream.h | 46 ++++ include/linux/xsched.h | 78 ++++++ kernel/xsched/core.c | 32 +++ kernel/xsched/vstream.c | 543 +++++++++++++++++++++++++++++++++++++++- 4 files changed, 697 insertions(+), 2 deletions(-) diff --git a/include/linux/vstream.h b/include/linux/vstream.h index 891734e8ce04..ffab65889036 100644 --- a/include/linux/vstream.h +++ b/include/linux/vstream.h @@ -6,6 +6,52 @@ typedef int vstream_manage_t(struct vstream_args *arg); +typedef struct vstream_info { + uint32_t user_stream_id; + uint32_t id; + uint32_t vcq_id; + uint32_t logic_vcq_id; + uint32_t dev_id; + uint32_t channel_id; + uint32_t fd; + uint32_t task_type; + int tgid; + int sqcq_type; + + void *drv_ctx; + + int inode_fd; + + /* Pointer to corresponding context. */ + struct xsched_context *ctx; + + /* List node in context's vstream list. */ + struct list_head ctx_node; + + /* Pointer to an CU object on which this + * vstream is currently being processed. + * NULL if vstream is not being processed. + */ + struct xsched_cu *xcu; + + /* List node in an CU list of vstreams that + * are currently being processed by this specific CU. + */ + struct list_head xcu_node; + + /* Private vstream data. */ + void *data; + + spinlock_t stream_lock; + + uint32_t kicks_count; + + /* List of metadata a.k.a. all recorded unprocesed + * kicks for this exact vstream. + */ + struct list_head metadata_list; +} vstream_info_t; + int vstream_alloc(struct vstream_args *arg); int vstream_free(struct vstream_args *arg); int vstream_kick(struct vstream_args *arg); diff --git a/include/linux/xsched.h b/include/linux/xsched.h index ed3314ec6572..d0753639a9f2 100644 --- a/include/linux/xsched.h +++ b/include/linux/xsched.h @@ -2,6 +2,10 @@ #ifndef __LINUX_XSCHED_H__ #define __LINUX_XSCHED_H__ +#include <linux/kref.h> +#include <linux/vstream.h> +#include <linux/xcu_group.h> + #ifndef pr_fmt #define pr_fmt(fmt) fmt #endif @@ -32,6 +36,8 @@ #define XSCHED_EXIT_STUB() \ XSCHED_DEBUG(" -----* %s @ %s exited *-----\n", __func__, __FILE__) +#define MAX_VSTREAM_NUM 512 + enum xsched_cu_status { /* Worker not initialized. */ XSCHED_XCU_NONE, @@ -53,11 +59,83 @@ struct xsched_cu { uint32_t id; uint32_t state; struct task_struct *worker; + struct list_head ctx_list; + struct mutex ctx_list_lock; + vstream_info_t *vs_array[MAX_VSTREAM_NUM]; + struct mutex vs_array_lock; struct xcu_group *group; struct mutex xcu_lock; wait_queue_head_t wq_xcu_idle; }; +struct xsched_entity { + uint32_t task_type; + + bool on_rq; + + pid_t owner_pid; + pid_t tgid; + + /* File descriptor coming from an associated context + * used for identifying a given xsched entity in + * info and error prints. + */ + uint32_t fd; + + /* Xsched class for this xse. */ + const struct xsched_class *class; + + /* Pointer to context object. 
*/ + struct xsched_context *ctx; + + /* Pointer to an XCU object that represents an XCU + * on which this xse is to be processed or is being + * processed currently. + */ + struct xsched_cu *xcu; + + /* General purpose xse lock. */ + spinlock_t xse_lock; +}; + +struct xsched_context { + uint32_t fd; + uint32_t dev_id; + pid_t tgid; + + struct list_head vstream_list; + struct list_head ctx_node; + + struct xsched_entity xse; + + spinlock_t ctx_lock; + struct mutex ctx_mutex; + struct kref kref; +}; + +extern struct list_head xsched_ctx_list; +extern struct mutex xsched_ctx_list_mutex; + +/* Returns a pointer to xsched_context object corresponding to a given + * tgid and xcu. + */ +static inline struct xsched_context * +ctx_find_by_tgid_and_xcu(pid_t tgid, struct xsched_cu *xcu) +{ + struct xsched_context *ctx; + struct xsched_context *ret = NULL; + + list_for_each_entry(ctx, &xcu->ctx_list, ctx_node) { + if (ctx->tgid == tgid) { + ret = ctx; + break; + } + } + return ret; +} + int xsched_xcu_init(struct xsched_cu *xcu, struct xcu_group *group, int xcu_id); int xsched_schedule(void *input_xcu); +int xsched_init_entity(struct xsched_context *ctx, struct vstream_info *vs); +int ctx_bind_to_xcu(vstream_info_t *vstream_info, struct xsched_context *ctx); #endif /* !__LINUX_XSCHED_H__ */ diff --git a/kernel/xsched/core.c b/kernel/xsched/core.c index d85379d914f5..867c07f9e9d1 100644 --- a/kernel/xsched/core.c +++ b/kernel/xsched/core.c @@ -56,3 +56,35 @@ int xsched_xcu_init(struct xsched_cu *xcu, struct xcu_group *group, int xcu_id) } return 0; } + +int xsched_init_entity(struct xsched_context *ctx, struct vstream_info *vs) +{ + int err = 0; + struct xsched_entity *xse = &ctx->xse; + + xse->fd = ctx->fd; + xse->tgid = ctx->tgid; + + err = ctx_bind_to_xcu(vs, ctx); + if (err) { + XSCHED_ERR( + "Couldn't find valid xcu for vstream %u dev_id %u @ %s\n", + vs->id, vs->dev_id, __func__); + return -EINVAL; + } + + xse->ctx = ctx; + + if (vs->xcu == NULL) { + WARN_ON(vs->xcu == NULL); + return -EINVAL; + } + + xse->xcu = vs->xcu; + + WRITE_ONCE(xse->on_rq, false); + + spin_lock_init(&xse->xse_lock); + return err; +} + diff --git a/kernel/xsched/vstream.c b/kernel/xsched/vstream.c index ada4a4d3946e..5a6a3e565351 100644 --- a/kernel/xsched/vstream.c +++ b/kernel/xsched/vstream.c @@ -14,21 +14,560 @@ * more details. * */ +#include <linux/anon_inodes.h> +#include <linux/delay.h> +#include <linux/hash.h> +#include <linux/hashtable.h> #include <linux/syscalls.h> #include <linux/vstream.h> #include <linux/xsched.h> #if defined(CONFIG_XCU_SCHEDULER) && defined(CONFIG_XCU_VSTREAM) -int vstream_alloc(struct vstream_args *arg) + +#define XCU_HASH_ORDER 6 + +static DEFINE_MUTEX(revmap_mutex); +static DEFINE_HASHTABLE(ctx_revmap, XCU_HASH_ORDER); + +/** + * @group: value for this entry. + * @hash_node: hash node list. + * @dev_id: device id to bind with ctx. 
+ */ +struct ctx_devid_revmap_data { + unsigned int dev_id; + struct xcu_group *group; + struct hlist_node hash_node; +}; + +static int vstream_del(vstream_info_t *vstream, uint32_t vstream_id); +static int vstream_file_release(struct inode *inode, struct file *file); +static const struct file_operations vstreamfd_fops = { + .release = vstream_file_release, +}; + +static inline struct file *vstream_file_get(int vs_fd) +{ + return fget(vs_fd); +} + +static inline void vstream_file_put(struct file *vstream_file) { + fput(vstream_file); +} + +static int vstream_file_create(struct vstream_info *vs) +{ + return anon_inode_getfd("[vstreamfd]", &vstreamfd_fops, vs, + O_RDWR | O_CLOEXEC | O_NONBLOCK); +} + +/* Frees a given vstream and also frees and dequeues it's context + * if a given vstream is the last and only vstream attached to it's + * corresponding context object. + */ +static void xsched_task_free(struct kref *kref) +{ + struct xsched_context *ctx; + vstream_info_t *vs, *tmp; + struct xsched_cu *xcu; + + ctx = container_of(kref, struct xsched_context, kref); + xcu = ctx->xse.xcu; + + /* Wait utill xse dequeues */ + while (READ_ONCE(ctx->xse.on_rq)) + usleep_range(100, 200); + + mutex_lock(&xcu->ctx_list_lock); + list_for_each_entry_safe(vs, tmp, &ctx->vstream_list, ctx_node) { + list_del(&vs->ctx_node); + kfree(vs); + } + + list_del(&ctx->ctx_node); + mutex_unlock(&xcu->ctx_list_lock); + + kfree(ctx); +} + +struct xsched_cu *xcu_find(uint32_t type, + uint32_t dev_id, uint32_t channel_id) +{ + struct xcu_group *group = NULL; + + /* Find xcu by type. */ + group = xcu_group_find(xcu_group_root, type); + if (group == NULL) { + XSCHED_ERR("Fail to find type group.\n"); + return NULL; + } + + /* Find device id group. */ + group = xcu_group_find(group, dev_id); + if (group == NULL) { + XSCHED_ERR("Fail to find device group.\n"); + return NULL; + } + /* Find channel id group. */ + group = xcu_group_find(group, channel_id); + if (group == NULL) { + XSCHED_ERR("Fail to find channel group.\n"); + return NULL; + } + + XSCHED_DEBUG("XCU found: type=%u, dev_id=%u, chan_id=%u.\n", + type, dev_id, channel_id); + + return group->xcu; +} + +static int vstream_destroy(vstream_info_t *vstream) +{ + int err; + struct xsched_context *ctx = NULL; + + err = vstream_del(vstream, vstream->id); + if (err) + return err; + + ctx = vstream->ctx; + kref_put(&ctx->kref, xsched_task_free); + return 0; } -int vstream_free(struct vstream_args *arg) +static int vstream_file_release(struct inode *inode, struct file *file) { + vstream_info_t *vstream; + (void) inode; + + if (!file->private_data) + return 0; + + vstream = file->private_data; + return vstream_destroy(vstream); +} + +static void init_xsched_ctx(struct xsched_context *ctx, + const struct vstream_info *vs) +{ + ctx->tgid = vs->tgid; + ctx->fd = vs->fd; + ctx->dev_id = vs->dev_id; + kref_init(&ctx->kref); + + INIT_LIST_HEAD(&ctx->vstream_list); + INIT_LIST_HEAD(&ctx->ctx_node); + + spin_lock_init(&ctx->ctx_lock); + mutex_init(&ctx->ctx_mutex); +} + +int ctx_bind_to_xcu(vstream_info_t *vstream_info, struct xsched_context *ctx) +{ + struct ctx_devid_revmap_data *revmap_data; + struct xsched_cu *xcu_found = NULL; + uint32_t type = XCU_TYPE_XPU; + + /* Find XCU history. 
*/ + hash_for_each_possible(ctx_revmap, revmap_data, hash_node, + (unsigned long)ctx->dev_id) { + if (revmap_data && revmap_data->group) { + /* Bind ctx to group xcu.*/ + ctx->xse.xcu = revmap_data->group->xcu; + return 0; + } + } + + revmap_data = kzalloc(sizeof(struct ctx_devid_revmap_data), GFP_KERNEL); + if (revmap_data == NULL) { + XSCHED_ERR("Revmap_data is NULL @ %s\n", __func__); + return -ENOMEM; + } + + xcu_found = xcu_find(type, ctx->dev_id, vstream_info->channel_id); + if (!xcu_found) { + kfree(revmap_data); + return -EINVAL; + } + + /* Bind ctx to an XCU from channel group. */ + revmap_data->group = xcu_found->group; + ctx->xse.xcu = xcu_found; + vstream_info->xcu = xcu_found; + revmap_data->dev_id = vstream_info->dev_id; + XSCHED_DEBUG("Ctx bind to xcu %u @ %s\n", xcu_found->id, __func__); + + hash_add(ctx_revmap, &revmap_data->hash_node, + (unsigned long)ctx->dev_id); + + return 0; +} + +/* Allocates a new xsched_context if a new vstream_info is bound + * to a device that no other vstream that is currently present + * is bound to. + */ +static int alloc_ctx_from_vstream(struct vstream_info *vstream_info, + struct xsched_context **ctx) +{ + struct xsched_cu *xcu = vstream_info->xcu; + int ret; + + *ctx = ctx_find_by_tgid_and_xcu(vstream_info->tgid, xcu); + if (*ctx) + return 0; + + *ctx = kzalloc(sizeof(struct xsched_context), GFP_KERNEL); + if (!*ctx) { + XSCHED_ERR("Fail to alloc xsched context (tgid=%d) @ %s\n", + vstream_info->tgid, __func__); + return -ENOMEM; + } + + init_xsched_ctx(*ctx, vstream_info); + + ret = xsched_init_entity(*ctx, vstream_info); + if (ret) { + XSCHED_ERR("Fail to initialize XSE for context @ %s\n", + __func__); + kfree(*ctx); + return -EINVAL; + } + + list_add(&(*ctx)->ctx_node, &xcu->ctx_list); + return 0; } +/* Bounds a new vstream_info object to a corresponding xsched context. 
*/ +static int vstream_bind_to_ctx(struct vstream_info *vs) +{ + struct xsched_context *ctx = NULL; + struct xsched_cu *xcu = vs->xcu; + int err = 0; + + mutex_lock(&xcu->ctx_list_lock); + ctx = ctx_find_by_tgid_and_xcu(vs->tgid, xcu); + if (ctx) { + XSCHED_DEBUG("Ctx %d found @ %s\n", vs->tgid, __func__); + kref_get(&ctx->kref); + } else { + err = alloc_ctx_from_vstream(vs, &ctx); + if (err) + goto out_err; + } + + vs->ctx = ctx; + list_add(&vs->ctx_node, &vs->ctx->vstream_list); + +out_err: + mutex_unlock(&xcu->ctx_list_lock); + return err; +} + +static vstream_info_t *vstream_create(struct vstream_args *arg) +{ + struct vstream_info *vstream = NULL; + + vstream = kzalloc(sizeof(vstream_info_t), GFP_KERNEL); + if (!vstream) { + XSCHED_ERR("Failed to allocate vstream.\n"); + return NULL; + } + + vstream->dev_id = arg->dev_id; + vstream->channel_id = arg->channel_id; + vstream->kicks_count = 0; + vstream->xcu = NULL; + + INIT_LIST_HEAD(&vstream->ctx_node); + INIT_LIST_HEAD(&vstream->xcu_node); + INIT_LIST_HEAD(&vstream->metadata_list); + + spin_lock_init(&vstream->stream_lock); + + return vstream; +} + +static int vstream_add(vstream_info_t *vstream, uint32_t id) +{ + int err = 0; + struct xsched_cu *xcu = vstream->xcu; + + if (id >= MAX_VSTREAM_NUM) { + XSCHED_ERR("Vstream id=%u out of range @ %s.\n", + id, __func__); + return -EINVAL; + } + + mutex_lock(&xcu->vs_array_lock); + if (xcu->vs_array[id] != NULL) { + XSCHED_ERR("Vstream id=%u cell is busy.\n", id); + err = -EINVAL; + goto out_err; + } + xcu->vs_array[id] = vstream; + +out_err: + mutex_unlock(&xcu->vs_array_lock); + return err; +} + +static int vstream_del(vstream_info_t *vstream, uint32_t vstream_id) +{ + struct xsched_cu *xcu = vstream->xcu; + + if (vstream_id >= MAX_VSTREAM_NUM) { + XSCHED_ERR("Vstream id=%u out of range @ %s.\n", + vstream_id, __func__); + return -EINVAL; + } + + mutex_lock(&xcu->vs_array_lock); + xcu->vs_array[vstream_id] = NULL; + mutex_unlock(&xcu->vs_array_lock); + return 0; +} + +static vstream_info_t *vstream_get(struct xsched_cu *xcu, uint32_t vstream_id) +{ + vstream_info_t *vstream = NULL; + + if (vstream_id >= MAX_VSTREAM_NUM) { + XSCHED_ERR("Vstream id=%u out of range @ %s.\n", + vstream_id, __func__); + return NULL; + } + + mutex_lock(&xcu->vs_array_lock); + vstream = xcu->vs_array[vstream_id]; + mutex_unlock(&xcu->vs_array_lock); + + return vstream; +} + +static vstream_info_t * +vstream_get_by_user_stream_id(struct xsched_cu *xcu, uint32_t user_stream_id) +{ + int id; + static vstream_info_t *ret; + + mutex_lock(&xcu->vs_array_lock); + for (id = 0; id < MAX_VSTREAM_NUM; id++) { + if (xcu->vs_array[id] != NULL && + xcu->vs_array[id]->user_stream_id == user_stream_id) { + ret = xcu->vs_array[id]; + break; + } + } + mutex_unlock(&xcu->vs_array_lock); + return ret; +} + +static int vstream_bind_to_xcu(vstream_info_t *vstream_info) +{ + struct xsched_cu *xcu_found = NULL; + uint32_t type = XCU_TYPE_XPU; + + xcu_found = xcu_find(type, vstream_info->dev_id, vstream_info->channel_id); + if (!xcu_found) + return -EINVAL; + + /* Bind vstream to a xcu. 
*/ + vstream_info->xcu = xcu_found; + vstream_info->dev_id = xcu_found->id; + XSCHED_DEBUG("XCU bound to a vstream: type=%u, dev_id=%u, chan_id=%u.\n", + type, vstream_info->dev_id, vstream_info->channel_id); + + return 0; +} + +static int sqcq_alloc(struct vstream_args *arg) +{ + vstream_alloc_args_t *va_args = &arg->va_args; + struct xsched_context *ctx = NULL; + struct xcu_op_handler_params params; + struct file *vs_file; + uint32_t logic_cq_id = 0; + vstream_info_t *vstream; + int ret = 0; + uint32_t tgid = 0; + uint32_t cq_id = 0; + uint32_t sq_id = 0; + + vstream = vstream_create(arg); + if (!vstream) + return -ENOSPC; + + vstream->fd = arg->fd; + vstream->task_type = arg->task_type; + + ret = vstream_bind_to_xcu(vstream); + if (ret < 0) { + ret = -EINVAL; + goto out_err_vstream_free; + } + + /* Allocates vstream's SQ and CQ memory on a XCU for processing. */ + params.group = vstream->xcu->group; + params.fd = arg->fd; + params.payload = arg->payload; + params.param_1 = &tgid; + params.param_2 = &sq_id; + params.param_3 = &cq_id; + params.param_4 = &logic_cq_id; + ret = xcu_alloc(¶ms); + if (ret) { + XSCHED_ERR("Fail to allocate SQ/CQ memory to a vstream.\n"); + goto out_err_vstream_free; + } + + vstream->drv_ctx = params.param_5; + vstream->id = sq_id; + vstream->vcq_id = cq_id; + vstream->logic_vcq_id = logic_cq_id; + vstream->user_stream_id = va_args->user_stream_id; + vstream->tgid = tgid; + vstream->sqcq_type = va_args->type; + ret = vstream_bind_to_ctx(vstream); + if (ret) + goto out_err_xcu_finish; + + ctx = vstream->ctx; + ret = vstream_file_create(vstream); + if (ret < 0) { + XSCHED_ERR("Fail to alloc anon inode for vstream %u @ %s\n", + vstream->id, __func__); + goto out_err_ctx_free; + } + vstream->inode_fd = ret; + + /* Add new vstream to array after allocating inode */ + ret = vstream_add(vstream, vstream->id); + if (ret) + goto out_err_vstream_file_put; + + arg->sq_id = sq_id; + arg->cq_id = cq_id; + + return 0; + +out_err_vstream_file_put: + vs_file = vstream_file_get(vstream->inode_fd); + if (vs_file) { + vs_file->private_data = NULL; + vstream_file_put(vs_file); + } +out_err_ctx_free: + if (ctx) { + /* In the current code context, + * vstream should not be released inside xsched_task_free. + * Otherwise, vstream may become a wild pointer. + * If it is still being used by other objects, + * it may cause a UAF issue when it is released again in + * out_err_vstream_free. 
+ */ + mutex_lock(&vstream->xcu->ctx_list_lock); + list_del(&vstream->ctx_node); + mutex_unlock(&vstream->xcu->ctx_list_lock); + kref_put(&ctx->kref, xsched_task_free); + } +out_err_xcu_finish: + if (xcu_finish(¶ms)) + XSCHED_ERR("Fail to free vstream sqId=%u, cqId=%u.\n", sq_id, cq_id); +out_err_vstream_free: + kfree(vstream); + return ret; +} + +static int logic_cq_alloc(struct vstream_args *arg) +{ + int err = 0; + struct xcu_op_handler_params params; + vstream_info_t *vstream = NULL; + vstream_alloc_args_t *logic_cq_alloc_para = &arg->va_args; + struct xsched_cu *xcu_found = NULL; + uint32_t logic_cq_id = 0; + uint32_t type = XCU_TYPE_XPU; + + xcu_found = xcu_find(type, arg->dev_id, arg->channel_id); + if (!xcu_found) + return -EINVAL; + + vstream = vstream_get_by_user_stream_id(xcu_found, + logic_cq_alloc_para->user_stream_id); + if (vstream) + xcu_found = vstream->xcu; + params.group = xcu_found->group; + params.fd = arg->fd; + params.payload = arg->payload; + params.param_1 = &logic_cq_id; + err = xcu_logic_alloc(¶ms); + if (err) { + XSCHED_ERR("Fail to alloc logic CQ memory to a vstream.\n"); + return err; + } + if (vstream) + vstream->logic_vcq_id = logic_cq_id; + + return 0; +} + +int vstream_alloc(struct vstream_args *arg) +{ + vstream_alloc_args_t *va_args = &arg->va_args; + int ret; + + if (!va_args->type) + ret = sqcq_alloc(arg); + else + ret = logic_cq_alloc(arg); + + return ret; +} + +int vstream_free(struct vstream_args *arg) +{ + struct file *vs_file; + struct xcu_op_handler_params params; + struct xsched_cu *xcu_found; + uint32_t vstream_id = arg->sq_id; + uint32_t type = XCU_TYPE_XPU; + vstream_info_t *vstream = NULL; + int err = 0; + + xcu_found = xcu_find(type, arg->dev_id, arg->channel_id); + if (!xcu_found) + return -EINVAL; + + vstream = vstream_get(xcu_found, vstream_id); + if (!vstream) { + XSCHED_ERR("Fail to free NULL vstream, vstream id=%u\n", vstream_id); + return -EINVAL; + } + + params.group = vstream->xcu->group; + params.fd = arg->fd; + params.payload = arg->payload; + + vs_file = vstream_file_get(vstream->inode_fd); + if (vs_file) { + vs_file->private_data = NULL; + vstream_file_put(vs_file); + } + + /* After vstream_get(), destroying the vstream may not fail */ + vstream_destroy(vstream); + err = xcu_finish(¶ms); + if (err) + XSCHED_ERR("Fail to free vstream sqId=%u, cqId=%u.\n", + arg->sq_id, arg->cq_id); + + return err; +} + int vstream_kick(struct vstream_args *arg) { return 0; -- 2.34.1
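To make the xcu_alloc() calling convention used by sqcq_alloc() above concrete, here is an illustrative .alloc handler, not a real driver: param_1..param_4 are out-pointers the driver fills with tgid, sq_id, cq_id and logic_cq_id, and param_5 is read back into vstream->drv_ctx, so the driver can return its private per-stream state there. The demo_stream_ctx structure and the fixed queue ids are hypothetical.

#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/xcu_group.h>

struct demo_stream_ctx {	/* hypothetical per-vstream driver state */
	u32 sq_id;
	u32 cq_id;
};

static int demo_alloc(struct xcu_op_handler_params *params)
{
	struct demo_stream_ctx *dctx;

	dctx = kzalloc(sizeof(*dctx), GFP_KERNEL);
	if (!dctx)
		return -ENOMEM;

	/* A real driver would allocate hardware SQ/CQ rings here. */
	dctx->sq_id = 0;
	dctx->cq_id = 0;

	*(u32 *)params->param_1 = task_tgid_nr(current);	/* tgid of caller     */
	*(u32 *)params->param_2 = dctx->sq_id;			/* -> vstream->id     */
	*(u32 *)params->param_3 = dctx->cq_id;			/* -> vstream->vcq_id */
	*(u32 *)params->param_4 = 0;				/* logic_cq_id        */
	params->param_5 = dctx;		/* read back as vstream->drv_ctx */

	return 0;
}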
From: Konstantin Meskhidze <konstantin.meskhidze@huawei.com> hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IC5EHB ----------------------------------------- Add kernels send/receive mechanism, deliver the queued kernels to the XPU device and trigger new scheduling after the kernels have completed processing. Implements scheduling core abstraction to subsequent expand RT/CFS sched class. Signed-off-by: Konstantin Meskhidze <konstantin.meskhidze@huawei.com> Signed-off-by: Hui Tang <tanghui20@.huawei.com> Signed-off-by: Liu Kai <liukai284@huawei.com> Signed-off-by: Xia Fukun <xiafukun@huawei.com> Signed-off-by: Zicheng Qu <quzicheng@huawei.com> --- drivers/xcu/xcu_group.c | 16 ++ include/linux/vstream.h | 32 +++ include/linux/xcu_group.h | 7 + include/linux/xsched.h | 144 +++++++++++ include/uapi/linux/xcu_vstream.h | 3 + kernel/xsched/core.c | 393 +++++++++++++++++++++++++++++++ kernel/xsched/vstream.c | 61 ++++- 7 files changed, 655 insertions(+), 1 deletion(-) diff --git a/drivers/xcu/xcu_group.c b/drivers/xcu/xcu_group.c index 3215f37e4ece..54d389534508 100644 --- a/drivers/xcu/xcu_group.c +++ b/drivers/xcu/xcu_group.c @@ -221,6 +221,22 @@ int xcu_logic_free(struct xcu_op_handler_params *params) return params->group->opt->logic_free(params); } +/* This function runs a "sqe_op" callback for a given xcu_group + * and a given vstream that are passed within + * xcu_op_handler_params object. + * + * This handler provides an interface to set or get sqe info. + */ +int xcu_sqe_op(struct xcu_op_handler_params *params) +{ + if (!params->group->opt || !params->group->opt->sqe_op) { + XSCHED_ERR("No function [sqe_op] called.\n"); + return -EINVAL; + } + + return params->group->opt->sqe_op(params); +} + static struct xcu_group __xcu_group_root = { .id = 0, .type = XCU_TYPE_ROOT, diff --git a/include/linux/vstream.h b/include/linux/vstream.h index ffab65889036..fd393ec97a99 100644 --- a/include/linux/vstream.h +++ b/include/linux/vstream.h @@ -3,6 +3,38 @@ #define _LINUX_VSTREAM_H #include <uapi/linux/xcu_vstream.h> +#include <linux/ktime.h> + +#define MAX_VSTREAM_SIZE 2048 + +/* Vstream metadata describes each incoming kick + * that gets stored into a list of pending kicks + * inside a vstream to keep track of what is left + * to be processed by a driver. + */ +typedef struct vstream_metadata { + uint32_t exec_time; + /* A value of SQ tail that has been passed with the + * kick that is described by this exact metadata object. + */ + uint32_t sq_tail; + uint32_t sqe_num; + uint32_t sq_id; + uint8_t sqe[XCU_SQE_SIZE_MAX]; + + /* Report buffer for fake read. 
*/ + int8_t cqe[XCU_CQE_BUF_SIZE]; + uint32_t cqe_num; + int32_t timeout; + + /* A node for metadata list */ + struct list_head node; + + struct vstream_info *parent; + + /* Time of list insertion */ + ktime_t add_time; +} vstream_metadata_t; typedef int vstream_manage_t(struct vstream_args *arg); diff --git a/include/linux/xcu_group.h b/include/linux/xcu_group.h index b24641b98e6a..c129dca32c51 100644 --- a/include/linux/xcu_group.h +++ b/include/linux/xcu_group.h @@ -17,6 +17,11 @@ enum xcu_type { XCU_TYPE_XPU, }; +enum xcu_sqe_op_type { + SQE_SET_NOTIFY, + SQE_IS_NOTIFY, +}; + struct xcu_op_handler_params { int fd; struct xcu_group *group; @@ -45,6 +50,7 @@ struct xcu_operation { xcu_op_handler_fn_t alloc; xcu_op_handler_fn_t logic_alloc; xcu_op_handler_fn_t logic_free; + xcu_op_handler_fn_t sqe_op; }; struct xcu_group { @@ -83,6 +89,7 @@ extern int xcu_finish(struct xcu_op_handler_params *params); extern int xcu_alloc(struct xcu_op_handler_params *params); extern int xcu_logic_alloc(struct xcu_op_handler_params *params); extern int xcu_logic_free(struct xcu_op_handler_params *params); +extern int xcu_sqe_op(struct xcu_op_handler_params *params); int xsched_xcu_register(struct xcu_group *group, uint32_t phys_id); int xsched_xcu_unregister(struct xcu_group *group, uint32_t phys_id); diff --git a/include/linux/xsched.h b/include/linux/xsched.h index d0753639a9f2..d52461e63d8a 100644 --- a/include/linux/xsched.h +++ b/include/linux/xsched.h @@ -38,6 +38,30 @@ #define MAX_VSTREAM_NUM 512 +enum xcu_sched_type { + XSCHED_TYPE_NUM +}; + +#define xsched_first_class \ + list_first_entry(&(xsched_class_list), struct xsched_class, node) + +#define for_each_xsched_class(class) \ + list_for_each_entry((class), &(xsched_class_list), node) + +#define for_each_vstream_in_ctx(vs, ctx) \ + list_for_each_entry((vs), &((ctx)->vstream_list), ctx_node) + +/* Base XSched runqueue object structure that contains both mutual and + * individual parameters for different scheduling classes. + */ +struct xsched_rq { + struct xsched_entity *curr_xse; + const struct xsched_class *class; + + int state; + int nr_running; +}; + enum xsched_cu_status { /* Worker not initialized. */ XSCHED_XCU_NONE, @@ -58,11 +82,21 @@ enum xsched_cu_status { struct xsched_cu { uint32_t id; uint32_t state; + + atomic_t pending_kicks; struct task_struct *worker; + + /* Storage list for contexts associated with this xcu */ + uint32_t nr_ctx; struct list_head ctx_list; struct mutex ctx_list_lock; + vstream_info_t *vs_array[MAX_VSTREAM_NUM]; struct mutex vs_array_lock; + + struct xsched_rq xrq; + struct list_head vsm_list; + struct xcu_group *group; struct mutex xcu_lock; wait_queue_head_t wq_xcu_idle; @@ -76,6 +110,15 @@ struct xsched_entity { pid_t owner_pid; pid_t tgid; + /* Amount of pending kicks currently sitting on this context. */ + atomic_t kicks_pending_ctx_cnt; + + /* Amount of submitted kicks context, used for resched decision. */ + atomic_t submitted_one_kick; + + size_t total_scheduled; + size_t total_submitted; + /* File descriptor coming from an associated context * used for identifying a given xsched entity in * info and error prints. @@ -88,6 +131,9 @@ struct xsched_entity { /* Pointer to context object. */ struct xsched_context *ctx; + /* Xsched entity execution statistics */ + u64 last_exec_runtime; + /* Pointer to an XCU object that represents an XCU * on which this xse is to be processed or is being * processed currently. 
@@ -98,6 +144,55 @@ struct xsched_entity { spinlock_t xse_lock; }; +/* Increments pending kicks counter for an XCU that the given + * xsched entity is attached to and for xsched entity's xsched + * class. + */ +static inline int xsched_inc_pending_kicks_xse(struct xsched_entity *xse) +{ + atomic_inc(&xse->xcu->pending_kicks); + /* Icrement pending kicks for current XSE. */ + atomic_inc(&xse->kicks_pending_ctx_cnt); + + return 0; +} + +/* Decrements pending kicks counter for an XCU that the given + * xsched entity is attached to and for XSched entity's sched + * class. + */ +static inline int xsched_dec_pending_kicks_xse(struct xsched_entity *xse) +{ + atomic_dec(&xse->xcu->pending_kicks); + /* Decrementing pending kicks for current XSE. */ + atomic_dec(&xse->kicks_pending_ctx_cnt); + + return 0; +} + +/* Checks if there are pending kicks left on a given XCU for all + * xsched classes. + */ +static inline bool xsched_check_pending_kicks_xcu(struct xsched_cu *xcu) +{ + return atomic_read(&xcu->pending_kicks); +} + +static inline int xse_integrity_check(const struct xsched_entity *xse) +{ + if (!xse) { + XSCHED_ERR("xse is null @ %s\n", __func__); + return -EINVAL; + } + + if (!xse->class) { + XSCHED_ERR("xse->class is null @ %s\n", __func__); + return -EINVAL; + } + + return 0; +} + struct xsched_context { uint32_t fd; uint32_t dev_id; @@ -134,8 +229,57 @@ ctx_find_by_tgid_and_xcu(pid_t tgid, struct xsched_cu *xcu) return ret; } +struct xsched_class { + enum xcu_sched_type class_id; + size_t kick_slice; + struct list_head node; + + /* Initialize a new xsched entity */ + void (*xse_init)(struct xsched_entity *xse); + + /* Destroy XSE scheduler-specific data */ + void (*xse_deinit)(struct xsched_entity *xse); + + /* Initialize a new runqueue per xcu */ + void (*rq_init)(struct xsched_cu *xcu); + + /* Removes a given XSE from it's runqueue. */ + void (*dequeue_ctx)(struct xsched_entity *xse); + + /* Places a given XSE on a runqueue on a given XCU. */ + void (*enqueue_ctx)(struct xsched_entity *xse, struct xsched_cu *xcu); + + /* Returns a next XSE to be submitted on a given XCU. */ + struct xsched_entity *(*pick_next_ctx)(struct xsched_cu *xcu); + + /* Put a XSE back into rq during preemption. */ + void (*put_prev_ctx)(struct xsched_entity *xse); + + /* Check context preemption. 
*/ + bool (*check_preempt)(struct xsched_entity *xse); + + /* Select jobs from XSE to submit on XCU */ + size_t (*select_work)(struct xsched_cu *xcu, struct xsched_entity *xse); +}; + +static inline void xsched_init_vsm(struct vstream_metadata *vsm, + struct vstream_info *vs, vstream_args_t *arg) +{ + vsm->sq_id = arg->sq_id; + vsm->sqe_num = arg->vk_args.sqe_num; + vsm->timeout = arg->vk_args.timeout; + memcpy(vsm->sqe, arg->vk_args.sqe, XCU_SQE_SIZE_MAX); + vsm->parent = vs; + INIT_LIST_HEAD(&vsm->node); +} + int xsched_xcu_init(struct xsched_cu *xcu, struct xcu_group *group, int xcu_id); int xsched_schedule(void *input_xcu); int xsched_init_entity(struct xsched_context *ctx, struct vstream_info *vs); int ctx_bind_to_xcu(vstream_info_t *vstream_info, struct xsched_context *ctx); +int xsched_vsm_add_tail(struct vstream_info *vs, vstream_args_t *arg); +struct vstream_metadata *xsched_vsm_fetch_first(struct vstream_info *vs); +void enqueue_ctx(struct xsched_entity *xse, struct xsched_cu *xcu); +void dequeue_ctx(struct xsched_entity *xse, struct xsched_cu *xcu); +int delete_ctx(struct xsched_context *ctx); #endif /* !__LINUX_XSCHED_H__ */ diff --git a/include/uapi/linux/xcu_vstream.h b/include/uapi/linux/xcu_vstream.h index 4d65789c37c7..38cc97d3a139 100644 --- a/include/uapi/linux/xcu_vstream.h +++ b/include/uapi/linux/xcu_vstream.h @@ -6,6 +6,9 @@ #define PAYLOAD_SIZE_MAX 512 #define XCU_SQE_SIZE_MAX 64 +#define XCU_CQE_SIZE_MAX 32 +#define XCU_CQE_REPORT_NUM 4 +#define XCU_CQE_BUF_SIZE (XCU_CQE_REPORT_NUM * XCU_CQE_SIZE_MAX) #define KABI_RESERVE_BYTES(idx, n) \ __u8 __kabi_reserved_##idx[n] diff --git a/kernel/xsched/core.c b/kernel/xsched/core.c index 867c07f9e9d1..701a81297fc4 100644 --- a/kernel/xsched/core.c +++ b/kernel/xsched/core.c @@ -23,24 +23,403 @@ #include <linux/types.h> #include <linux/xsched.h> +/* List of scheduling classes available */ +struct list_head xsched_class_list; + +static void put_prev_ctx(struct xsched_entity *xse) +{ + struct xsched_cu *xcu = xse->xcu; + + lockdep_assert_held(&xcu->xcu_lock); + xse->class->put_prev_ctx(xse); + xse->last_exec_runtime = 0; + atomic_set(&xse->submitted_one_kick, 0); + XSCHED_DEBUG("Put current xse %d @ %s\n", xse->tgid, __func__); +} + +static size_t select_work_def(struct xsched_cu *xcu, struct xsched_entity *xse) +{ + int kick_count, scheduled = 0, not_empty; + struct vstream_info *vs; + struct xcu_op_handler_params params; + struct vstream_metadata *vsm; + size_t kick_slice = xse->class->kick_slice; + + kick_count = atomic_read(&xse->kicks_pending_ctx_cnt); + XSCHED_DEBUG("Before decrement XSE kick_count=%d @ %s\n", + kick_count, __func__); + + if (kick_count == 0) { + XSCHED_WARN("Try to select xse that has 0 kicks @ %s\n", + __func__); + return 0; + } + + do { + not_empty = 0; + for_each_vstream_in_ctx(vs, xse->ctx) { + spin_lock(&vs->stream_lock); + vsm = xsched_vsm_fetch_first(vs); + spin_unlock(&vs->stream_lock); + if (!vsm) + continue; + list_add_tail(&vsm->node, &xcu->vsm_list); + scheduled++; + xsched_dec_pending_kicks_xse(xse); + not_empty++; + } + } while ((scheduled < kick_slice) && (not_empty)); + + /* + * Iterate over all vstreams in context: + * Set wr_cqe bit in last computing task in vsm_list + */ + for_each_vstream_in_ctx(vs, xse->ctx) { + list_for_each_entry_reverse(vsm, &xcu->vsm_list, node) { + if (vsm->parent == vs) { + params.group = vsm->parent->xcu->group; + params.param_1 = &(int){SQE_SET_NOTIFY}; + params.param_2 = &vsm->sqe; + xcu_sqe_op(¶ms); + break; + } + } + } + + kick_count = 
atomic_read(&xse->kicks_pending_ctx_cnt); + XSCHED_DEBUG("After decrement XSE kick_count=%d @ %s\n", + kick_count, __func__); + + xse->total_scheduled += scheduled; + return scheduled; +} + +static struct xsched_entity *__raw_pick_next_ctx(struct xsched_cu *xcu) +{ + const struct xsched_class *class; + struct xsched_entity *next = NULL; + size_t scheduled; + + lockdep_assert_held(&xcu->xcu_lock); + for_each_xsched_class(class) { + next = class->pick_next_ctx(xcu); + if (next) { + scheduled = class->select_work ? + class->select_work(xcu, next) : select_work_def(xcu, next); + + XSCHED_DEBUG("xse %d scheduled=%zu total=%zu @ %s\n", + next->tgid, scheduled, next->total_scheduled, __func__); + break; + } + } + + return next; +} + +void enqueue_ctx(struct xsched_entity *xse, struct xsched_cu *xcu) +{ + lockdep_assert_held(&xcu->xcu_lock); + + if (xse_integrity_check(xse)) { + XSCHED_ERR("Fail to check xse integrity @ %s\n", __func__); + return; + } + + if (!xse->on_rq) { + xse->on_rq = true; + xse->class->enqueue_ctx(xse, xcu); + XSCHED_DEBUG("Enqueue xse %d @ %s\n", xse->tgid, __func__); + } +} + +void dequeue_ctx(struct xsched_entity *xse, struct xsched_cu *xcu) +{ + lockdep_assert_held(&xcu->xcu_lock); + + if (xse_integrity_check(xse)) { + XSCHED_ERR("Fail to check xse integrity @ %s\n", __func__); + return; + } + + if (xse->on_rq) { + xse->class->dequeue_ctx(xse); + xse->on_rq = false; + XSCHED_DEBUG("Dequeue xse %d @ %s\n", xse->tgid, __func__); + } +} + +int delete_ctx(struct xsched_context *ctx) +{ + struct xsched_cu *xcu = ctx->xse.xcu; + struct xsched_entity *curr_xse = xcu->xrq.curr_xse; + struct xsched_entity *xse = &ctx->xse; + + if (xse_integrity_check(xse)) { + XSCHED_ERR("Fail to check xse integrity @ %s\n", __func__); + return -EINVAL; + } + + if (!xse->xcu) { + XSCHED_ERR("Try to delete ctx that is not attached to xcu @ %s\n", + __func__); + return -EINVAL; + } + + /* Wait till context has been submitted. */ + while (atomic_read(&xse->kicks_pending_ctx_cnt)) { + XSCHED_DEBUG("Deleting ctx %d, xse->kicks_pending_ctx_cnt=%d @ %s\n", + xse->tgid, atomic_read(&xse->kicks_pending_ctx_cnt), + __func__); + usleep_range(100, 200); + } + + mutex_lock(&xcu->xcu_lock); + if (curr_xse == xse) + xcu->xrq.curr_xse = NULL; + dequeue_ctx(xse, xcu); + --xcu->nr_ctx; + mutex_unlock(&xcu->xcu_lock); + XSCHED_DEBUG("Deleting ctx %d, pending kicks left=%d @ %s\n", xse->tgid, + atomic_read(&xse->kicks_pending_ctx_cnt), __func__); + + xse->class->xse_deinit(xse); + return 0; +} + +int xsched_xse_set_class(struct xsched_entity *xse) +{ + struct xsched_class *sched = xsched_first_class; + + xse->class = sched; + return 0; +} + +static void submit_kick(struct vstream_metadata *vsm) +{ + struct vstream_info *vs = vsm->parent; + struct xcu_op_handler_params params; + + params.group = vs->xcu->group; + params.fd = vs->fd; + params.param_1 = &vs->id; + params.param_2 = &vs->channel_id; + params.param_3 = vsm->sqe; + params.param_4 = &vsm->sqe_num; + params.param_5 = &vsm->timeout; + params.param_6 = &vs->sqcq_type; + params.param_7 = vs->drv_ctx; + params.param_8 = &vs->logic_vcq_id; + + /* Send vstream on a device for processing. */ + if (xcu_run(¶ms) != 0) + XSCHED_ERR( + "Fail to send Vstream id %u tasks to a device for processing.\n", + vs->id); + + XSCHED_DEBUG("Vstream id %u submit vsm: sq_tail %u\n", vs->id, vsm->sq_tail); +} + +static void submit_wait(struct vstream_metadata *vsm) +{ + struct vstream_info *vs = vsm->parent; + struct xcu_op_handler_params params; + /* Wait timeout in ms. 
*/ + int32_t timeout = 500; + + params.group = vs->xcu->group; + params.param_1 = &vs->channel_id; + params.param_2 = &vs->logic_vcq_id; + params.param_3 = &vs->user_stream_id; + params.param_4 = &vsm->sqe; + params.param_5 = vsm->cqe; + params.param_6 = vs->drv_ctx; + params.param_7 = &timeout; + + /* Wait for a device to complete processing. */ + if (xcu_wait(¶ms)) { + XSCHED_ERR("Fail to wait Vstream id %u tasks, logic_cq_id %u.\n", + vs->id, vs->logic_vcq_id); + } + + XSCHED_DEBUG("Vstream id %u wait finish, logic_cq_id %u\n", + vs->id, vs->logic_vcq_id); +} + +static int __xsched_submit(struct xsched_cu *xcu, struct xsched_entity *xse) +{ + struct vstream_metadata *vsm, *tmp; + int submitted = 0; + long submit_exec_time = 0; + ktime_t t_start = 0; + struct xcu_op_handler_params params; + + XSCHED_DEBUG("%s called for xse %d on xcu %u\n", + __func__, xse->tgid, xcu->id); + list_for_each_entry_safe(vsm, tmp, &xcu->vsm_list, node) { + submit_kick(vsm); + XSCHED_DEBUG("Xse %d vsm %u sched_delay: %lld ns\n", + xse->tgid, vsm->sq_id, ktime_to_ns(ktime_sub(ktime_get(), vsm->add_time))); + + params.group = vsm->parent->xcu->group; + params.param_1 = &(int){SQE_IS_NOTIFY}; + params.param_2 = &vsm->sqe; + if (xcu_sqe_op(¶ms)) { + mutex_unlock(&xcu->xcu_lock); + t_start = ktime_get(); + submit_wait(vsm); + submit_exec_time += ktime_to_ns(ktime_sub(ktime_get(), t_start)); + mutex_lock(&xcu->xcu_lock); + } + submitted++; + list_del(&vsm->node); + kfree(vsm); + } + + xse->last_exec_runtime += submit_exec_time; + xse->total_submitted += submitted; + atomic_add(submitted, &xse->submitted_one_kick); + INIT_LIST_HEAD(&xcu->vsm_list); + XSCHED_DEBUG("Xse %d submitted=%d total=%zu, exec_time=%ld @ %s\n", + xse->tgid, submitted, xse->total_submitted, + submit_exec_time, __func__); + + return submitted; +} + +static inline bool should_preempt(struct xsched_entity *xse) +{ + return xse->class->check_preempt(xse); +} + +int xsched_vsm_add_tail(struct vstream_info *vs, vstream_args_t *arg) +{ + struct vstream_metadata *new_vsm; + + new_vsm = kmalloc(sizeof(struct vstream_metadata), GFP_KERNEL); + if (!new_vsm) { + XSCHED_ERR("Fail to alloc kick metadata for vs %u @ %s\n", + vs->id, __func__); + return -ENOMEM; + } + + if (vs->kicks_count > MAX_VSTREAM_SIZE) { + kfree(new_vsm); + return -EBUSY; + } + + xsched_init_vsm(new_vsm, vs, arg); + list_add_tail(&new_vsm->node, &vs->metadata_list); + new_vsm->add_time = ktime_get(); + vs->kicks_count += 1; + + return 0; +} + +/* Fetch the first vstream metadata from vstream metadata list + * and removes it from that list. Returned vstream metadata pointer + * to be freed after. 
+ */ +struct vstream_metadata *xsched_vsm_fetch_first(struct vstream_info *vs) +{ + struct vstream_metadata *vsm; + + if (list_empty(&vs->metadata_list)) { + XSCHED_DEBUG("No metadata to fetch from vs %u @ %s\n", + vs->id, __func__); + return NULL; + } + + vsm = list_first_entry(&vs->metadata_list, struct vstream_metadata, node); + if (!vsm) { + XSCHED_ERR("Corrupted metadata list in vs %u @ %s\n", + vs->id, __func__); + return NULL; + } + + list_del(&vsm->node); + if (vs->kicks_count == 0) + XSCHED_WARN("kicks_count underflow in vs %u @ %s\n", + vs->id, __func__); + else + vs->kicks_count -= 1; + + return vsm; +} + int xsched_schedule(void *input_xcu) { + struct xsched_cu *xcu = input_xcu; + struct xsched_entity *curr_xse = NULL; + struct xsched_entity *next_xse = NULL; + + while (!kthread_should_stop()) { + mutex_unlock(&xcu->xcu_lock); + wait_event_interruptible(xcu->wq_xcu_idle, 1); + + mutex_lock(&xcu->xcu_lock); + if (kthread_should_stop()) { + mutex_unlock(&xcu->xcu_lock); + break; + } + + if (!xsched_check_pending_kicks_xcu(xcu)) { + XSCHED_WARN("%s: No pending kicks on xcu %u\n", __func__, xcu->id); + continue; + } + + next_xse = __raw_pick_next_ctx(xcu); + if (!next_xse) { + XSCHED_WARN("%s: Couldn't find next xse on xcu %u\n", __func__, xcu->id); + continue; + } + + xcu->xrq.curr_xse = next_xse; + if (__xsched_submit(xcu, next_xse) == 0) + continue; + + curr_xse = xcu->xrq.curr_xse; + if (!curr_xse) + continue; + + /* if not deleted yet */ + put_prev_ctx(curr_xse); + if (!atomic_read(&curr_xse->kicks_pending_ctx_cnt)) + dequeue_ctx(curr_xse, xcu); + + xcu->xrq.curr_xse = NULL; + } + return 0; } + /* Initializes all xsched XCU objects. * Should only be called from xsched_xcu_register function. */ int xsched_xcu_init(struct xsched_cu *xcu, struct xcu_group *group, int xcu_id) { + struct xsched_class *sched; int err; xcu->id = xcu_id; xcu->state = XSCHED_XCU_NONE; xcu->group = group; + xcu->nr_ctx = 0; + xcu->xrq.curr_xse = NULL; + + atomic_set(&xcu->pending_kicks, 0); + INIT_LIST_HEAD(&xcu->vsm_list); + INIT_LIST_HEAD(&xcu->ctx_list); + init_waitqueue_head(&xcu->wq_xcu_idle); + mutex_init(&xcu->ctx_list_lock); + mutex_init(&xcu->vs_array_lock); mutex_init(&xcu->xcu_lock); + /* Initialize current XCU's runqueue. */ + for_each_xsched_class(sched) + sched->rq_init(xcu); + /* This worker should set XCU to XSCHED_XCU_WAIT_IDLE. 
* If after initialization XCU still has XSCHED_XCU_NONE * status then we can assume that there was a problem @@ -62,6 +441,13 @@ int xsched_init_entity(struct xsched_context *ctx, struct vstream_info *vs) int err = 0; struct xsched_entity *xse = &ctx->xse; + atomic_set(&xse->kicks_pending_ctx_cnt, 0); + atomic_set(&xse->submitted_one_kick, 0); + + xse->total_scheduled = 0; + xse->total_submitted = 0; + xse->last_exec_runtime = 0; + xse->fd = ctx->fd; xse->tgid = ctx->tgid; @@ -82,6 +468,13 @@ int xsched_init_entity(struct xsched_context *ctx, struct vstream_info *vs) xse->xcu = vs->xcu; + err = xsched_xse_set_class(xse); + if (err) { + XSCHED_ERR("Fail to set xse class @ %s\n", __func__); + return err; + } + xse->class->xse_init(xse); + WRITE_ONCE(xse->on_rq, false); spin_lock_init(&xse->xse_lock); diff --git a/kernel/xsched/vstream.c b/kernel/xsched/vstream.c index 5a6a3e565351..4de969872091 100644 --- a/kernel/xsched/vstream.c +++ b/kernel/xsched/vstream.c @@ -85,6 +85,7 @@ static void xsched_task_free(struct kref *kref) kfree(vs); } + delete_ctx(ctx); list_del(&ctx->ctx_node); mutex_unlock(&xcu->ctx_list_lock); @@ -237,6 +238,7 @@ static int alloc_ctx_from_vstream(struct vstream_info *vstream_info, } list_add(&(*ctx)->ctx_node, &xcu->ctx_list); + ++xcu->nr_ctx; return 0; } @@ -570,7 +572,64 @@ int vstream_free(struct vstream_args *arg) int vstream_kick(struct vstream_args *arg) { - return 0; + vstream_info_t *vstream; + struct xsched_cu *xcu = NULL; + struct xsched_entity *xse; + int err = 0; + uint32_t vstream_id = arg->sq_id; + uint32_t type = XCU_TYPE_XPU; + + xcu = xcu_find(type, arg->dev_id, arg->channel_id); + if (!xcu) + return -EINVAL; + + /* Get vstream. */ + vstream = vstream_get(xcu, vstream_id); + if (!vstream || !vstream->ctx) { + XSCHED_ERR("Vstream NULL or doesn't have a context. vstream_id=%u, dev_id=%u\n", + vstream_id, arg->dev_id); + return -EINVAL; + } + + xse = &vstream->ctx->xse; + XSCHED_DEBUG("New kick on xse %d @ %s\n", xse->tgid, __func__); + + do { + mutex_lock(&xcu->xcu_lock); + spin_lock(&vstream->stream_lock); + + /* Adding kick metadata. */ + err = xsched_vsm_add_tail(vstream, arg); + if (err == -EBUSY) { + spin_unlock(&vstream->stream_lock); + mutex_unlock(&xcu->xcu_lock); + + /* Retry after a while */ + usleep_range(100, 200); + continue; + } + + /* Don't forget to unlock */ + if (err) { + XSCHED_ERR("Fail to add kick metadata to vs %u @ %s\n", + vstream->id, __func__); + break; + } + + enqueue_ctx(xse, xcu); + + /* Increasing a total amount of kicks on an CU to which this + * context is attached to based on sched_class. + */ + xsched_inc_pending_kicks_xse(&vstream->ctx->xse); + } while (err == -EBUSY); + + spin_unlock(&vstream->stream_lock); + mutex_unlock(&xcu->xcu_lock); + if (!err) + wake_up_interruptible(&xcu->wq_xcu_idle); + + return err; } /* -- 2.34.1
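This patch marks only the last queued SQE of each vstream for completion notification: select_work_def() calls xcu_sqe_op() with SQE_SET_NOTIFY on the tail entry of the batch, and __xsched_submit() blocks in submit_wait() only for entries whose driver reports SQE_IS_NOTIFY, so a batch of kicks pays for a single wait instead of one per kick. The driver side of that contract is not part of this patch; the sketch below shows what a backend's sqe_op handler could look like. It is illustrative only: the my_xpu_* names, the flags offset and the notify-bit layout are assumptions, not anything defined by this series.

/* Hypothetical backend callback; names and SQE layout are assumptions. */
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/xcu_group.h>

#define MY_XPU_SQE_FLAGS_OFF	0	/* assumed flags byte inside the SQE */
#define MY_XPU_SQE_FLAG_NOTIFY	0x1	/* assumed "raise a CQE" bit */

static int my_xpu_sqe_op(struct xcu_op_handler_params *params)
{
	int op = *(int *)params->param_1;	/* SQE_SET_NOTIFY or SQE_IS_NOTIFY */
	uint8_t *sqe = params->param_2;		/* vsm->sqe passed in by the core */

	switch (op) {
	case SQE_SET_NOTIFY:
		sqe[MY_XPU_SQE_FLAGS_OFF] |= MY_XPU_SQE_FLAG_NOTIFY;
		return 0;
	case SQE_IS_NOTIFY:
		/* Non-zero here makes __xsched_submit() call submit_wait(). */
		return sqe[MY_XPU_SQE_FLAGS_OFF] & MY_XPU_SQE_FLAG_NOTIFY;
	default:
		return -EINVAL;
	}
}

static struct xcu_operation my_xpu_ops = {
	/* .alloc, .logic_alloc, .logic_free and the other handlers omitted */
	.sqe_op = my_xpu_sqe_op,
};

With a backend like this, the only kick in a batch that reaches the blocking xcu_wait() path is the one select_work_def() tagged, which is what lets the worker drop xcu_lock around that single wait.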
From: Konstantin Meskhidze <konstantin.meskhidze@huawei.com> hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IC5EHB ----------------------------------------- Add rt class callbacks implementation: - dequeue_ctx - enqueue_ctx - pick_next_ctx - put_prev_ctx - submit_prepare_ctx - select_work - check_preempt Add xsched_rt.c in /kernel/xsched Makefile. Add RT class callbacks support in core.c. Signed-off-by: Konstantin Meskhidze <konstantin.meskhidze@huawei.com> Signed-off-by: Hui Tang <tanghui20@.huawei.com> Signed-off-by: Liu Kai <liukai284@huawei.com> Signed-off-by: Xia Fukun <xiafukun@huawei.com> Signed-off-by: Zicheng Qu <quzicheng@huawei.com> --- drivers/xcu/xcu_group.c | 2 +- include/linux/xsched.h | 52 +++++++- kernel/xsched/Kconfig | 15 +++ kernel/xsched/Makefile | 1 + kernel/xsched/core.c | 30 ++++- kernel/xsched/rt.c | 281 ++++++++++++++++++++++++++++++++++++++++ 6 files changed, 376 insertions(+), 5 deletions(-) create mode 100644 kernel/xsched/rt.c diff --git a/drivers/xcu/xcu_group.c b/drivers/xcu/xcu_group.c index 54d389534508..2a349de62256 100644 --- a/drivers/xcu/xcu_group.c +++ b/drivers/xcu/xcu_group.c @@ -20,7 +20,7 @@ #include <linux/xcu_group.h> #include <linux/xsched.h> -static int num_active_xcu; +int num_active_xcu; static DEFINE_SPINLOCK(xcu_mgr_lock); struct xsched_cu *xsched_cu_mgr[XSCHED_NR_CUS]; static DECLARE_RWSEM(xcu_group_rwsem); diff --git a/include/linux/xsched.h b/include/linux/xsched.h index d52461e63d8a..f62bbc55c354 100644 --- a/include/linux/xsched.h +++ b/include/linux/xsched.h @@ -38,19 +38,51 @@ #define MAX_VSTREAM_NUM 512 +/* + * A default kick slice for RT class XSEs. + */ +#define XSCHED_RT_KICK_SLICE 2 + +extern struct xsched_cu *xsched_cu_mgr[XSCHED_NR_CUS]; + enum xcu_sched_type { - XSCHED_TYPE_NUM + XSCHED_TYPE_RT = 0, + XSCHED_TYPE_NUM, + XSCHED_TYPE_DFLT = XSCHED_TYPE_RT }; +enum xse_prio { + XSE_PRIO_HIGH = 0, + XSE_PRIO_LOW = 4, + NR_XSE_PRIO, + XSE_PRIO_DFLT = XSE_PRIO_LOW +}; + +extern struct xsched_class rt_xsched_class; + #define xsched_first_class \ list_first_entry(&(xsched_class_list), struct xsched_class, node) #define for_each_xsched_class(class) \ list_for_each_entry((class), &(xsched_class_list), node) +#define for_each_xse_prio(prio) \ + for (prio = XSE_PRIO_HIGH; prio < NR_XSE_PRIO; prio++) #define for_each_vstream_in_ctx(vs, ctx) \ list_for_each_entry((vs), &((ctx)->vstream_list), ctx_node) + +/* Manages xsched RT-like class linked list based runqueue. + * + * Now RT-like class runqueue structs is identical + * but will most likely grow different in the + * future as the Xsched evolves. + */ +struct xsched_rq_rt { + struct list_head rq[NR_XSE_PRIO]; + unsigned int nr_running; +}; + /* Base XSched runqueue object structure that contains both mutual and * individual parameters for different scheduling classes. */ @@ -60,6 +92,8 @@ struct xsched_rq { int state; int nr_running; + /* RT class run queue.*/ + struct xsched_rq_rt rt; }; enum xsched_cu_status { @@ -102,6 +136,18 @@ struct xsched_cu { wait_queue_head_t wq_xcu_idle; }; +extern int num_active_xcu; +#define for_each_active_xcu(xcu, id) \ + for ((id) = 0, xcu = xsched_cu_mgr[(id)]; \ + (id) < num_active_xcu && (xcu = xsched_cu_mgr[(id)]); (id)++) + +struct xsched_entity_rt { + struct list_head list_node; + enum xse_prio prio; + + ktime_t timeslice; +}; + struct xsched_entity { uint32_t task_type; @@ -128,6 +174,9 @@ struct xsched_entity { /* Xsched class for this xse. */ const struct xsched_class *class; + /* RT class entity. 
*/ + struct xsched_entity_rt rt; + /* Pointer to context object. */ struct xsched_context *ctx; @@ -279,6 +328,7 @@ int xsched_init_entity(struct xsched_context *ctx, struct vstream_info *vs); int ctx_bind_to_xcu(vstream_info_t *vstream_info, struct xsched_context *ctx); int xsched_vsm_add_tail(struct vstream_info *vs, vstream_args_t *arg); struct vstream_metadata *xsched_vsm_fetch_first(struct vstream_info *vs); +int xsched_rt_prio_set(pid_t tgid, unsigned int prio); void enqueue_ctx(struct xsched_entity *xse, struct xsched_cu *xcu); void dequeue_ctx(struct xsched_entity *xse, struct xsched_cu *xcu); int delete_ctx(struct xsched_context *ctx); diff --git a/kernel/xsched/Kconfig b/kernel/xsched/Kconfig index 8d12b8db5f6d..77883b6a3cc3 100644 --- a/kernel/xsched/Kconfig +++ b/kernel/xsched/Kconfig @@ -4,6 +4,7 @@ config XCU_SCHEDULER bool "Enable XSched functionality" default n select XCU_VSTREAM + select XCU_SCHED_RT help This option enables the XSched scheduler, a custom scheduling mechanism designed for heterogeneous compute units (e.g., XPUs). It provides: @@ -34,3 +35,17 @@ config XSCHED_NR_CUS This option defines the maximum number of Compute Units (CUs) that can be managed by the XSched scheduler, consider changing this value proportionally to the number of available XCU cores. + +config XCU_SCHED_RT + bool "XCU RT scheduling class" + default y + depends on XCU_SCHEDULER + help + Enable support for the RT scheduling class in the XCU scheduler. + + This option allows XCU to schedule tasks using real-time priorities + (XSCHED_TYPE_RT). When enabled, tasks in RT cgroups can be assigned + deterministic priorities and will be scheduled ahead of CFS tasks. + + Unless you are using RT workloads that rely on strict priority-based + scheduling within XCU, it is recommended to keep the default setting. 
diff --git a/kernel/xsched/Makefile b/kernel/xsched/Makefile index 031b09b9fb4d..a7c8e2f7f250 100644 --- a/kernel/xsched/Makefile +++ b/kernel/xsched/Makefile @@ -2,3 +2,4 @@ obj-y += vstream.o xsched_enabled := $(CONFIG_XCU_SCHEDULER) obj-$(xsched_enabled) += core.o +obj-$(xsched_enabled)-$(CONFIG_XCU_SCHED_RT) += rt.o diff --git a/kernel/xsched/core.c b/kernel/xsched/core.c index 701a81297fc4..bdad82041ada 100644 --- a/kernel/xsched/core.c +++ b/kernel/xsched/core.c @@ -188,9 +188,16 @@ int delete_ctx(struct xsched_context *ctx) int xsched_xse_set_class(struct xsched_entity *xse) { - struct xsched_class *sched = xsched_first_class; + switch (xse->task_type) { + case XSCHED_TYPE_RT: + xse->class = &rt_xsched_class; + XSCHED_DEBUG("Context is in RT class %s\n", __func__); + break; + default: + XSCHED_ERR("Xse has incorrect class @ %s\n", __func__); + return -EINVAL; + } - xse->class = sched; return 0; } @@ -354,7 +361,8 @@ int xsched_schedule(void *input_xcu) while (!kthread_should_stop()) { mutex_unlock(&xcu->xcu_lock); - wait_event_interruptible(xcu->wq_xcu_idle, 1); + wait_event_interruptible(xcu->wq_xcu_idle, + xcu->xrq.rt.nr_running); mutex_lock(&xcu->xcu_lock); if (kthread_should_stop()) { @@ -481,3 +489,19 @@ int xsched_init_entity(struct xsched_context *ctx, struct vstream_info *vs) return err; } +static void xsched_register_sched_class(struct xsched_class *sched) +{ + list_add_tail(&sched->node, &xsched_class_list); +} + +__init int xsched_sched_init(void) +{ + INIT_LIST_HEAD(&xsched_class_list); +#ifdef CONFIG_XCU_SCHED_RT + xsched_register_sched_class(&rt_xsched_class); +#endif + + return 0; +} +late_initcall(xsched_sched_init); + diff --git a/kernel/xsched/rt.c b/kernel/xsched/rt.c new file mode 100644 index 000000000000..41b60e341679 --- /dev/null +++ b/kernel/xsched/rt.c @@ -0,0 +1,281 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Real-Time Scheduling Class for XPU device + * + * Copyright (C) 2025-2026 Huawei Technologies Co., Ltd + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ * + */ + +#include <uapi/linux/sched/types.h> +#include <linux/hash.h> +#include <linux/hashtable.h> +#include <linux/kthread.h> +#include <linux/slab.h> +#include <linux/xsched.h> +#include <linux/vstream.h> + +#define XSCHED_RT_TIMESLICE (10 * NSEC_PER_MSEC) + +#define TGID_HASH_BITS 8 + +/* Mapping between tgid and context */ +struct tgid_prio { + pid_t tgid; + int32_t prio; + struct hlist_node hnode; +}; + +static DEFINE_HASHTABLE(tgid_prio_map, TGID_HASH_BITS); +static DEFINE_SPINLOCK(tgid_prio_lock); + +static int tgid_prio_insert(pid_t tgid, int32_t prio) +{ + struct tgid_prio *new_map; + unsigned int hash_key; + + if (prio >= NR_XSE_PRIO) + return -EINVAL; + + new_map = kzalloc(sizeof(struct tgid_prio), GFP_KERNEL); + if (!new_map) { + XSCHED_ERR("Fail to alloc mapping (tgid=%d) @ %s\n", + tgid, __func__); + return -ENOMEM; + } + + new_map->tgid = tgid; + new_map->prio = prio; + + hash_key = hash_32(tgid, TGID_HASH_BITS); + + spin_lock(&tgid_prio_lock); + hash_add_rcu(tgid_prio_map, &new_map->hnode, hash_key); + spin_unlock(&tgid_prio_lock); + + return 0; +} + +static struct tgid_prio *tgid_prio_find(pid_t tgid) +{ + struct tgid_prio *map = NULL; + unsigned int hash_key = hash_32(tgid, TGID_HASH_BITS); + + rcu_read_lock(); + hash_for_each_possible_rcu(tgid_prio_map, map, hnode, hash_key) { + if (map->tgid == tgid) + break; + } + rcu_read_unlock(); + return map; +} + +static void tgid_prio_delete(pid_t tgid) +{ + struct tgid_prio *map; + unsigned int hash_key = hash_32(tgid, TGID_HASH_BITS); + + spin_lock(&tgid_prio_lock); + hash_for_each_possible(tgid_prio_map, map, hnode, hash_key) { + if (map->tgid == tgid) { + hash_del_rcu(&map->hnode); + spin_unlock(&tgid_prio_lock); + kfree(map); + return; + } + } + spin_unlock(&tgid_prio_lock); +} + +static inline void +xse_rt_add(struct xsched_entity *xse, struct xsched_cu *xcu) +{ + list_add_tail(&xse->rt.list_node, &xcu->xrq.rt.rq[xse->rt.prio]); +} + +static inline void xse_rt_del(struct xsched_entity *xse) +{ + list_del_init(&xse->rt.list_node); +} + +static inline void xse_rt_move_tail(struct xsched_entity *xse) +{ + struct xsched_cu *xcu = xse->xcu; + + list_move_tail(&xse->rt.list_node, &xcu->xrq.rt.rq[xse->rt.prio]); +} + +/* Increase RT runqueue total and per prio nr_running stat. */ +static inline void xrq_inc_nr_running(struct xsched_entity *xse, + struct xsched_cu *xcu) +{ + xcu->xrq.rt.nr_running++; +} + +/* Decrease RT runqueue total and per prio nr_running stat + * and raise a bug if nr_running decrease beyond zero. + */ +static inline void xrq_dec_nr_running(struct xsched_entity *xse) +{ + struct xsched_cu *xcu = xse->xcu; + + xcu->xrq.rt.nr_running--; +} + +static void dequeue_ctx_rt(struct xsched_entity *xse) +{ + xse_rt_del(xse); + xrq_dec_nr_running(xse); +} + +static void enqueue_ctx_rt(struct xsched_entity *xse, struct xsched_cu *xcu) +{ + xse_rt_add(xse, xcu); + xrq_inc_nr_running(xse, xcu); +} + +static inline struct xsched_entity *xrq_next_xse(struct xsched_cu *xcu, + int prio) +{ + return list_first_entry(&xcu->xrq.rt.rq[prio], struct xsched_entity, + rt.list_node); +} + +/* Return the next priority for pick_next_ctx taking into + * account if there are pending kicks on certain priority. 
+ */ +static inline uint32_t get_next_prio_rt(struct xsched_rq *xrq) +{ + unsigned int curr_prio; + + for_each_xse_prio(curr_prio) { + if (!list_empty(&xrq->rt.rq[curr_prio])) + return curr_prio; + } + return NR_XSE_PRIO; +} + +static struct xsched_entity *pick_next_ctx_rt(struct xsched_cu *xcu) +{ + struct xsched_entity *result; + int next_prio; + + next_prio = get_next_prio_rt(&xcu->xrq); + if (next_prio >= NR_XSE_PRIO) { + XSCHED_DEBUG("No pending kicks in RT class @ %s\n", __func__); + return NULL; + } + + result = xrq_next_xse(xcu, next_prio); + if (!result) + XSCHED_ERR("Next XSE not found @ %s\n", __func__); + else + XSCHED_DEBUG("Next XSE %u at prio %u @ %s\n", result->tgid, next_prio, __func__); + + return result; +} + +static void put_prev_ctx_rt(struct xsched_entity *xse) +{ + xse->rt.timeslice -= xse->last_exec_runtime; + XSCHED_DEBUG( + "Update XSE=%d timeslice=%lld, XSE submitted=%lld in RT class @ %s\n", + xse->tgid, xse->rt.timeslice, + xse->last_exec_runtime, __func__); + + if (xse->rt.timeslice <= 0) { + xse->rt.timeslice = XSCHED_RT_TIMESLICE; + XSCHED_DEBUG("Refill XSE=%d kick_slice=%lld in RT class @ %s\n", + xse->tgid, xse->rt.timeslice, __func__); + xse_rt_move_tail(xse); + } +} + +static bool check_preempt_ctx_rt(struct xsched_entity *xse) +{ + return true; +} + +void rq_init_rt(struct xsched_cu *xcu) +{ + int prio = 0; + + xcu->xrq.rt.nr_running = 0; + + for_each_xse_prio(prio) { + INIT_LIST_HEAD(&xcu->xrq.rt.rq[prio]); + } +} + +void xse_init_rt(struct xsched_entity *xse) +{ + struct tgid_prio *map = tgid_prio_find(xse->tgid); + + xse->rt.prio = (map) ? map->prio : XSE_PRIO_DFLT; + XSCHED_DEBUG("Xse init: set priority=%d.\n", xse->rt.prio); + xse->rt.timeslice = XSCHED_RT_TIMESLICE; + INIT_LIST_HEAD(&xse->rt.list_node); +} + +void xse_deinit_rt(struct xsched_entity *xse) +{ + struct tgid_prio *map = tgid_prio_find(xse->tgid); + + if (map) { + tgid_prio_delete(xse->tgid); + XSCHED_DEBUG("Map deleted: tgid=%d\n", xse->tgid); + } +} + +struct xsched_class rt_xsched_class = { + .class_id = XSCHED_TYPE_RT, + .kick_slice = XSCHED_RT_KICK_SLICE, + .rq_init = rq_init_rt, + .xse_init = xse_init_rt, + .xse_deinit = xse_deinit_rt, + .dequeue_ctx = dequeue_ctx_rt, + .enqueue_ctx = enqueue_ctx_rt, + .pick_next_ctx = pick_next_ctx_rt, + .put_prev_ctx = put_prev_ctx_rt, + .check_preempt = check_preempt_ctx_rt +}; + +int xsched_rt_prio_set(pid_t tgid, unsigned int prio) +{ + unsigned int id; + struct xsched_cu *xcu; + struct xsched_context *ctx; + struct xsched_entity *xse; + + tgid_prio_delete(tgid); + tgid_prio_insert(tgid, prio); + + for_each_active_xcu(xcu, id) { + mutex_lock(&xcu->ctx_list_lock); + mutex_lock(&xcu->xcu_lock); + + ctx = ctx_find_by_tgid_and_xcu(tgid, xcu); + if (ctx) { + xse = &ctx->xse; + xse->rt.prio = clamp_t(unsigned int, prio, XSE_PRIO_HIGH, XSE_PRIO_LOW); + if (xse->on_rq) { + xse_rt_del(xse); + xse_rt_add(xse, xcu); + } + } + + mutex_unlock(&xcu->xcu_lock); + mutex_unlock(&xcu->ctx_list_lock); + } + + return 0; +} + -- 2.34.1
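The RT class keeps one FIFO list per priority level (XSE_PRIO_HIGH = 0 is the most urgent, XSE_PRIO_LOW = 4 is the default) and picks from the lowest non-empty index. Entities of equal priority round-robin: put_prev_ctx_rt() charges last_exec_runtime against the 10 ms XSCHED_RT_TIMESLICE and moves the entity to the tail of its list once the slice is spent. Priorities are keyed by tgid through xsched_rt_prio_set(), which records the mapping for contexts created later (picked up in xse_init_rt()) and re-queues contexts that are already on a runqueue. The wrapper below only illustrates the calling convention; xcu_set_my_prio() is hypothetical and not part of this series.

/* Hypothetical caller of xsched_rt_prio_set(); illustration only. */
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/xsched.h>

static int xcu_set_my_prio(unsigned int prio)
{
	/* Keep the value inside the range defined by enum xse_prio:
	 * XSE_PRIO_HIGH (0, most urgent) .. XSE_PRIO_LOW (4, default).
	 */
	if (prio >= NR_XSE_PRIO)
		return -EINVAL;

	/* Records the tgid->prio mapping so that contexts created later
	 * by this process start at @prio (see xse_init_rt()), and
	 * re-queues contexts that are already on an XCU runqueue.
	 */
	return xsched_rt_prio_set(task_tgid_nr(current), prio);
}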
From: Konstantin Meskhidze <konstantin.meskhidze@huawei.com> hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IC5EHB ----------------------------------------- Add xsched cfs class callbacks implementation: - dequeue_ctx_fair. - enqueue_ctx_fair. - pick_next_ctx_fair. - check_preempt_fair. - put_prev_ctx_fair. - submit_prepare_ctx_fair. Add xsched_cfs.c in /kernel/xsched Makefile. Add cfs class related data structure. Signed-off-by: Konstantin Meskhidze <konstantin.meskhidze@huawei.com> Signed-off-by: Hui Tang <tanghui20@.huawei.com> Signed-off-by: Liu Kai <liukai284@huawei.com> Signed-off-by: Xia Fukun <xiafukun@huawei.com> Signed-off-by: Zicheng Qu <quzicheng@huawei.com> --- include/linux/xsched.h | 37 +++++++++ kernel/xsched/Kconfig | 19 +++++ kernel/xsched/Makefile | 1 + kernel/xsched/cfs.c | 185 +++++++++++++++++++++++++++++++++++++++++ kernel/xsched/core.c | 10 ++- 5 files changed, 251 insertions(+), 1 deletion(-) create mode 100644 kernel/xsched/cfs.c diff --git a/include/linux/xsched.h b/include/linux/xsched.h index f62bbc55c354..0bb11d7360bd 100644 --- a/include/linux/xsched.h +++ b/include/linux/xsched.h @@ -38,15 +38,24 @@ #define MAX_VSTREAM_NUM 512 +#define RUNTIME_INF ((u64)~0ULL) +#define XSCHED_TIME_INF RUNTIME_INF +#define XSCHED_CFS_WEIGHT_DFLT 1 + /* * A default kick slice for RT class XSEs. */ #define XSCHED_RT_KICK_SLICE 2 +/* + * A default kick slice for CFS class XSEs. + */ +#define XSCHED_CFS_KICK_SLICE 10 extern struct xsched_cu *xsched_cu_mgr[XSCHED_NR_CUS]; enum xcu_sched_type { XSCHED_TYPE_RT = 0, + XSCHED_TYPE_CFS = 1, XSCHED_TYPE_NUM, XSCHED_TYPE_DFLT = XSCHED_TYPE_RT }; @@ -59,6 +68,7 @@ enum xse_prio { }; extern struct xsched_class rt_xsched_class; +extern struct xsched_class fair_xsched_class; #define xsched_first_class \ list_first_entry(&(xsched_class_list), struct xsched_class, node) @@ -83,6 +93,14 @@ struct xsched_rq_rt { unsigned int nr_running; }; +/* Manages xsched CFS-like class rbtree based runqueue. */ +struct xsched_rq_cfs { + unsigned int nr_running; + unsigned int load; + u64 min_xruntime; + struct rb_root_cached ctx_timeline; +}; + /* Base XSched runqueue object structure that contains both mutual and * individual parameters for different scheduling classes. */ @@ -94,6 +112,8 @@ struct xsched_rq { int nr_running; /* RT class run queue.*/ struct xsched_rq_rt rt; + /* CFS class run queue.*/ + struct xsched_rq_cfs cfs; }; enum xsched_cu_status { @@ -148,6 +168,21 @@ struct xsched_entity_rt { ktime_t timeslice; }; +struct xsched_entity_cfs { + struct rb_node run_node; + + /* Rq on which this entity is (to be) queued. */ + struct xsched_rq_cfs *cfs_rq; + + /* Value of "virtual" runtime to sort entities in rbtree */ + u64 xruntime; + u32 weight; + + /* Execution time of scheduling entity */ + u64 exec_start; + u64 sum_exec_runtime; +}; + struct xsched_entity { uint32_t task_type; @@ -176,6 +211,8 @@ struct xsched_entity { /* RT class entity. */ struct xsched_entity_rt rt; + /* CFS class entity. */ + struct xsched_entity_cfs cfs; /* Pointer to context object. */ struct xsched_context *ctx; diff --git a/kernel/xsched/Kconfig b/kernel/xsched/Kconfig index 77883b6a3cc3..cc03f668a5dc 100644 --- a/kernel/xsched/Kconfig +++ b/kernel/xsched/Kconfig @@ -5,6 +5,7 @@ config XCU_SCHEDULER default n select XCU_VSTREAM select XCU_SCHED_RT + select XCU_SCHED_CFS help This option enables the XSched scheduler, a custom scheduling mechanism designed for heterogeneous compute units (e.g., XPUs). 
It provides: @@ -49,3 +50,21 @@ config XCU_SCHED_RT Unless you are using RT workloads that rely on strict priority-based scheduling within XCU, it is recommended to keep the default setting. + +config XCU_SCHED_CFS + bool "XCU CFS scheduling class" + default n + depends on XCU_SCHEDULER + help + Enable support for the CFS scheduling class in the XCU scheduler. + + This option allows the XCU scheduler to manage tasks using a fair-share + scheduling model similar to the Completely Fair Scheduler (CFS). + XCU-CFS provides proportional CPU sharing based on weights and supports + hierarchical control through cgroups. + + Enable this option if you want to run workloads that rely on fair, + weight-based CPU distribution within the XCU scheduling framework. + If your workload does not require proportional sharing or uses only the + RT scheduling class, you may leave this disabled. + diff --git a/kernel/xsched/Makefile b/kernel/xsched/Makefile index a7c8e2f7f250..9156349d260c 100644 --- a/kernel/xsched/Makefile +++ b/kernel/xsched/Makefile @@ -3,3 +3,4 @@ obj-y += vstream.o xsched_enabled := $(CONFIG_XCU_SCHEDULER) obj-$(xsched_enabled) += core.o obj-$(xsched_enabled)-$(CONFIG_XCU_SCHED_RT) += rt.o +obj-$(xsched_enabled)-$(CONFIG_XCU_SCHED_CFS) += cfs.o diff --git a/kernel/xsched/cfs.c b/kernel/xsched/cfs.c new file mode 100644 index 000000000000..ea39ef8770f8 --- /dev/null +++ b/kernel/xsched/cfs.c @@ -0,0 +1,185 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Completely Fair Scheduling (CFS) Class for XPU device + * + * Copyright (C) 2025-2026 Huawei Technologies Co., Ltd + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ * + */ +#include <linux/xsched.h> + +#define CFS_INNER_RQ_EMPTY(cfs_xse) \ + ((cfs_xse)->xruntime == XSCHED_TIME_INF) + +void xs_rq_add(struct xsched_entity_cfs *xse) +{ + struct xsched_rq_cfs *cfs_rq = xse->cfs_rq; + struct rb_node **link = &cfs_rq->ctx_timeline.rb_root.rb_node; + struct rb_node *parent = NULL; + struct xsched_entity_cfs *entry; + bool leftmost = true; + + while (*link) { + parent = *link; + entry = rb_entry(parent, struct xsched_entity_cfs, run_node); + if (xse->xruntime <= entry->xruntime) { + link = &parent->rb_left; + } else { + link = &parent->rb_right; + leftmost = false; + } + } + + rb_link_node(&xse->run_node, parent, link); + rb_insert_color_cached(&xse->run_node, &cfs_rq->ctx_timeline, leftmost); +} + +void xs_rq_remove(struct xsched_entity_cfs *xse) +{ + struct xsched_rq_cfs *cfs_rq = xse->cfs_rq; + + rb_erase_cached(&xse->run_node, &cfs_rq->ctx_timeline); +} + +/** + * xs_cfs_rq_update() - Update entity's runqueue position with new xruntime + */ +static void xs_cfs_rq_update(struct xsched_entity_cfs *xse_cfs, u64 new_xrt) +{ + xs_rq_remove(xse_cfs); + xse_cfs->xruntime = new_xrt; + xs_rq_add(xse_cfs); +} + +static inline struct xsched_entity_cfs * +xs_pick_first(struct xsched_rq_cfs *cfs_rq) +{ + struct xsched_entity_cfs *xse_cfs; + struct rb_node *left = rb_first_cached(&cfs_rq->ctx_timeline); + + if (!left) + return NULL; + + xse_cfs = rb_entry(left, struct xsched_entity_cfs, run_node); + return xse_cfs; +} + +/** + * xs_update() - Account xruntime and runtime metrics. + * @xse_cfs: Point to CFS scheduling entity. + * @delta: Execution time in last period + */ +static void xs_update(struct xsched_entity_cfs *xse_cfs, u64 delta) +{ + u64 new_xrt = xse_cfs->xruntime + delta * xse_cfs->weight; + + xs_cfs_rq_update(xse_cfs, new_xrt); + xse_cfs->sum_exec_runtime += delta; +} + +/* + * Xsched Fair class methods + * For rq manipulation we rely on root runqueue lock already acquired in core. + * Access xsched_group_xcu_priv requires no locks because one thread per XCU. + */ +static void dequeue_ctx_fair(struct xsched_entity *xse) +{ + struct xsched_cu *xcu = xse->xcu; + struct xsched_entity_cfs *first; + struct xsched_entity_cfs *xse_cfs = &xse->cfs; + + xs_rq_remove(xse_cfs); + + first = xs_pick_first(&xcu->xrq.cfs); + xcu->xrq.cfs.min_xruntime = (first) ? first->xruntime : XSCHED_TIME_INF; +} + +/** + * enqueue_ctx_fair() - Add context to the runqueue + * @xse: xsched entity of context + * @xcu: executor + * + * In contrary to enqueue_task it is called once on context init. + * Although groups reside in tree, their nodes not counted in nr_running. + * The xruntime of a group xsched entitry represented by min xruntime inside. + */ +static void enqueue_ctx_fair(struct xsched_entity *xse, struct xsched_cu *xcu) +{ + struct xsched_entity_cfs *first; + struct xsched_rq_cfs *rq; + struct xsched_entity_cfs *xse_cfs = &xse->cfs; + + rq = xse_cfs->cfs_rq = &xcu->xrq.cfs; + + /* If no XSE of only empty groups */ + if (xs_pick_first(rq) == NULL || rq->min_xruntime == XSCHED_TIME_INF) + rq->min_xruntime = xse_cfs->xruntime; + else + xse_cfs->xruntime = max(xse_cfs->xruntime, rq->min_xruntime); + + xs_rq_add(xse_cfs); + + first = xs_pick_first(&xcu->xrq.cfs); + xcu->xrq.cfs.min_xruntime = (first) ? 
first->xruntime : XSCHED_TIME_INF; +} + +static struct xsched_entity *pick_next_ctx_fair(struct xsched_cu *xcu) +{ + struct xsched_entity_cfs *xse; + struct xsched_rq_cfs *rq = &xcu->xrq.cfs; + + xse = xs_pick_first(rq); + if (!xse) + return NULL; + + return container_of(xse, struct xsched_entity, cfs); +} + +static inline bool +xs_should_preempt_fair(struct xsched_entity *xse) +{ + return (atomic_read(&xse->submitted_one_kick) >= XSCHED_CFS_KICK_SLICE); +} + +static void put_prev_ctx_fair(struct xsched_entity *xse) +{ + struct xsched_entity_cfs *prev = &xse->cfs; + + xs_update(prev, xse->last_exec_runtime); +} + +void rq_init_fair(struct xsched_cu *xcu) +{ + xcu->xrq.cfs.ctx_timeline = RB_ROOT_CACHED; +} + +void xse_init_fair(struct xsched_entity *xse) +{ + xse->cfs.weight = XSCHED_CFS_WEIGHT_DFLT; +} + +void xse_deinit_fair(struct xsched_entity *xse) +{ + /* TODO Cgroup exit */ +} + +struct xsched_class fair_xsched_class = { + .class_id = XSCHED_TYPE_CFS, + .kick_slice = XSCHED_CFS_KICK_SLICE, + .rq_init = rq_init_fair, + .xse_init = xse_init_fair, + .xse_deinit = xse_deinit_fair, + .dequeue_ctx = dequeue_ctx_fair, + .enqueue_ctx = enqueue_ctx_fair, + .pick_next_ctx = pick_next_ctx_fair, + .put_prev_ctx = put_prev_ctx_fair, + .check_preempt = xs_should_preempt_fair, +}; diff --git a/kernel/xsched/core.c b/kernel/xsched/core.c index bdad82041ada..2905cca41205 100644 --- a/kernel/xsched/core.c +++ b/kernel/xsched/core.c @@ -193,6 +193,10 @@ int xsched_xse_set_class(struct xsched_entity *xse) xse->class = &rt_xsched_class; XSCHED_DEBUG("Context is in RT class %s\n", __func__); break; + case XSCHED_TYPE_CFS: + xse->class = &fair_xsched_class; + XSCHED_DEBUG("Context is in CFS class %s\n", __func__); + break; default: XSCHED_ERR("Xse has incorrect class @ %s\n", __func__); return -EINVAL; @@ -362,7 +366,7 @@ int xsched_schedule(void *input_xcu) while (!kthread_should_stop()) { mutex_unlock(&xcu->xcu_lock); wait_event_interruptible(xcu->wq_xcu_idle, - xcu->xrq.rt.nr_running); + xcu->xrq.rt.nr_running || xcu->xrq.cfs.nr_running || kthread_should_stop()); mutex_lock(&xcu->xcu_lock); if (kthread_should_stop()) { @@ -501,6 +505,10 @@ __init int xsched_sched_init(void) xsched_register_sched_class(&rt_xsched_class); #endif +#ifdef CONFIG_XCU_SCHED_CFS + xsched_register_sched_class(&fair_xsched_class); +#endif + return 0; } late_initcall(xsched_sched_init); -- 2.34.1
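In this fair class the rbtree is keyed by xruntime and pick_next_ctx_fair() always takes the leftmost entity, while xs_update() charges new_xrt = xruntime + delta * weight. The weight therefore acts as a cost factor rather than a classic CFS load weight: a context with a larger weight ages faster and gets a proportionally smaller share of the XCU, and with the default XSCHED_CFS_WEIGHT_DFLT of 1 all contexts share equally. The stand-alone userspace program below (hypothetical fixed-size kicks) reproduces just that arithmetic to make the effect visible.

/* Stand-alone illustration (userspace, hypothetical values): how xruntime
 * ordering shares an XCU between two contexts with different weights.
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t xrt[2] = { 0, 0 };		/* per-entity xruntime */
	unsigned int weight[2] = { 1, 2 };	/* cost factors */
	unsigned int picks[2] = { 0, 0 };
	const uint64_t delta = 1000;		/* fixed kick runtime, arbitrary units */

	for (int i = 0; i < 30; i++) {
		int next = (xrt[0] <= xrt[1]) ? 0 : 1;	/* leftmost node wins */
		xrt[next] += delta * weight[next];	/* xs_update() arithmetic */
		picks[next]++;
	}

	/* Prints "picks: 20 10": shares end up inversely proportional
	 * to weight.
	 */
	printf("picks: %u %u\n", picks[0], picks[1]);
	return 0;
}

Preemption of the running context is bounded separately by XSCHED_CFS_KICK_SLICE: xs_should_preempt_fair() reports true once 10 kicks have been submitted since the last pick, at which point the worker re-evaluates the leftmost entity.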
From: Alekseev Dmitry <alekseev.dmitry@huawei.com> hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IC5EHB ----------------------------------------- Add cgroup initialization inculing root cgroup. Add xcu cgroup callbacks: alloc, free, attach, detach, etc. Add xsched_group cgroup management files and methods for: - sched type - shares Add xcu cgroup subsys and option CONFIG_CGROUP_XCU Add cgroup.c in /kernel/xsched Makefile. Signed-off-by: Alekseev Dmitry <alekseev.dmitry@huawei.com> Signed-off-by: Hui Tang <tanghui20@.huawei.com> Signed-off-by: Liu Kai <liukai284@huawei.com> Signed-off-by: Xia Fukun <xiafukun@huawei.com> Signed-off-by: Zicheng Qu <quzicheng@huawei.com> --- drivers/xcu/xcu_group.c | 2 + include/linux/cgroup_subsys.h | 4 + include/linux/xsched.h | 111 ++++++ kernel/cgroup/cgroup.c | 2 +- kernel/xsched/Kconfig | 15 + kernel/xsched/Makefile | 1 + kernel/xsched/cfs.c | 61 +++- kernel/xsched/cgroup.c | 662 ++++++++++++++++++++++++++++++++++ kernel/xsched/core.c | 30 +- 9 files changed, 870 insertions(+), 18 deletions(-) create mode 100644 kernel/xsched/cgroup.c diff --git a/drivers/xcu/xcu_group.c b/drivers/xcu/xcu_group.c index 2a349de62256..0cd8f535fb2b 100644 --- a/drivers/xcu/xcu_group.c +++ b/drivers/xcu/xcu_group.c @@ -327,6 +327,8 @@ int xsched_xcu_register(struct xcu_group *group, uint32_t phys_id) return ret; } + xcu_cfs_root_cg_init(xcu); + return 0; } EXPORT_SYMBOL(xsched_xcu_register); diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h index 85fa78049bd0..e65ae90946c2 100644 --- a/include/linux/cgroup_subsys.h +++ b/include/linux/cgroup_subsys.h @@ -61,6 +61,10 @@ SUBSYS(pids) SUBSYS(rdma) #endif +#if IS_ENABLED(CONFIG_CGROUP_XCU) +SUBSYS(xcu) +#endif + #if IS_ENABLED(CONFIG_CGROUP_MISC) SUBSYS(misc) #endif diff --git a/include/linux/xsched.h b/include/linux/xsched.h index 0bb11d7360bd..8bbb533ab043 100644 --- a/include/linux/xsched.h +++ b/include/linux/xsched.h @@ -2,6 +2,7 @@ #ifndef __LINUX_XSCHED_H__ #define __LINUX_XSCHED_H__ +#include <linux/cgroup.h> #include <linux/kref.h> #include <linux/vstream.h> #include <linux/xcu_group.h> @@ -41,6 +42,7 @@ #define RUNTIME_INF ((u64)~0ULL) #define XSCHED_TIME_INF RUNTIME_INF #define XSCHED_CFS_WEIGHT_DFLT 1 +#define XSCHED_CFG_SHARE_DFLT 1024 /* * A default kick slice for RT class XSEs. @@ -226,10 +228,96 @@ struct xsched_entity { */ struct xsched_cu *xcu; + /* Link to list of xsched_group items */ + struct list_head group_node; + struct xsched_group *parent_grp; + bool is_group; + /* General purpose xse lock. 
*/ spinlock_t xse_lock; }; +struct xcg_attach_entry { + struct task_struct *task; + struct xsched_group *old_xcg; + struct xsched_group *new_xcg; + + struct list_head node; +}; + +/* xsched_group's xcu related stuff */ +struct xsched_group_xcu_priv { + /* Owner of this group */ + struct xsched_group *self; + + /* xcu id */ + int xcu_id; + + /* Link to scheduler */ + struct xsched_entity xse; /* xse of this group on runqueue */ + struct xsched_rq_cfs *cfs_rq; /* cfs runqueue "owned" by this group */ + struct xsched_rq_rt *rt_rq; /* rt runqueue "owned" by this group */ +}; + +/* Xsched scheduling control group */ +struct xsched_group { + /* Cgroups controller structure */ + struct cgroup_subsys_state css; + + /* Control group settings: */ + int sched_class; + int prio; + + /* Bandwidth setting: shares value set by user */ + u64 shares_cfg; + u64 shares_cfg_red; + u32 weight; + u64 children_shares_sum; + + struct xsched_group_xcu_priv perxcu_priv[XSCHED_NR_CUS]; + + /* Groups hierarchcy */ + struct xsched_group *parent; + struct list_head children_groups; + struct list_head group_node; + + spinlock_t lock; + + /* for XSE to move in perxcu */ + struct list_head members; +}; + +#define XSCHED_RQ_OF(xse) \ + (container_of(((xse)->cfs.cfs_rq), struct xsched_rq, cfs)) + +#define XSCHED_RQ_OF_CFS_XSE(cfs_xse) \ + (container_of(((cfs_xse)->cfs_rq), struct xsched_rq, cfs)) + +#define XSCHED_SE_OF(cfs_xse) \ + (container_of((cfs_xse), struct xsched_entity, cfs)) + +#define xcg_parent_grp_xcu(xcg) \ + ((xcg)->self->parent->perxcu_priv[(xcg)->xcu_id]) + +#define xse_parent_grp_xcu(xse_cfs) \ + (&((XSCHED_SE_OF(xse_cfs) \ + ->parent_grp->perxcu_priv[(XSCHED_SE_OF(xse_cfs))->xcu->id]))) + +static inline struct xsched_group_xcu_priv * +xse_this_grp_xcu(struct xsched_entity_cfs *xse_cfs) +{ + struct xsched_entity *xse; + + xse = xse_cfs ? container_of(xse_cfs, struct xsched_entity, cfs) : NULL; + return xse ? container_of(xse, struct xsched_group_xcu_priv, xse) : NULL; +} + +static inline struct xsched_group * +xse_this_grp(struct xsched_entity_cfs *xse_cfs) +{ + return xse_cfs ? xse_this_grp_xcu(xse_cfs)->self : NULL; +} + /* Increments pending kicks counter for an XCU that the given * xsched entity is attached to and for xsched entity's xsched * class. @@ -315,6 +403,22 @@ ctx_find_by_tgid_and_xcu(pid_t tgid, struct xsched_cu *xcu) return ret; } +static inline u64 gcd(u64 a, u64 b) +{ + u64 rem; + + while (a != 0 && b != 0) { + if (a > b) { + div64_u64_rem(a, b, &rem); + a = rem; + } else { + div64_u64_rem(b, a, &rem); + b = rem; + } + } + return (a) ? 
a : b; +} + struct xsched_class { enum xcu_sched_type class_id; size_t kick_slice; @@ -369,4 +473,11 @@ int xsched_rt_prio_set(pid_t tgid, unsigned int prio); void enqueue_ctx(struct xsched_entity *xse, struct xsched_cu *xcu); void dequeue_ctx(struct xsched_entity *xse, struct xsched_cu *xcu); int delete_ctx(struct xsched_context *ctx); + +/* Xsched group manage functions */ +void xsched_group_inherit(struct task_struct *tsk, struct xsched_entity *xse); +void xcu_cg_subsys_init(void); +void xcu_cfs_root_cg_init(struct xsched_cu *xcu); +void xcu_grp_shares_update(struct xsched_group *parent); +void xsched_group_xse_detach(struct xsched_entity *xse); #endif /* !__LINUX_XSCHED_H__ */ diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 115717d58aa7..7df73d1d6628 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -6256,7 +6256,7 @@ int __init cgroup_init(void) struct cgroup_subsys *ss; int ssid; - BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16); + BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 17); BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files)); BUG_ON(cgroup_init_cftypes(NULL, cgroup_psi_files)); BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files)); diff --git a/kernel/xsched/Kconfig b/kernel/xsched/Kconfig index cc03f668a5dc..b7e7b222c949 100644 --- a/kernel/xsched/Kconfig +++ b/kernel/xsched/Kconfig @@ -6,6 +6,7 @@ config XCU_SCHEDULER select XCU_VSTREAM select XCU_SCHED_RT select XCU_SCHED_CFS + select CGROUP_XCU help This option enables the XSched scheduler, a custom scheduling mechanism designed for heterogeneous compute units (e.g., XPUs). It provides: @@ -68,3 +69,17 @@ config XCU_SCHED_CFS If your workload does not require proportional sharing or uses only the RT scheduling class, you may leave this disabled. +config CGROUP_XCU + bool "XCU bandwidth control and group scheduling for xsched_cfs" + default n + depends on XCU_SCHEDULER + help + This option enables the extended Compute Unit (XCU) resource controller for + CFS task groups, providing hierarchical scheduling and fine-grained bandwidth + allocation capabilities. Key features include: + - Proportional XCU time distribution across cgroups based on shares/quotas + - Nested group scheduling with latency isolation + - Integration with xsched_cfs for fair CPU resource management + + Required for systems requiring fine-grained resource control in cgroups. + If unsure, say N. 
diff --git a/kernel/xsched/Makefile b/kernel/xsched/Makefile index 9156349d260c..38b2f20e36fd 100644 --- a/kernel/xsched/Makefile +++ b/kernel/xsched/Makefile @@ -4,3 +4,4 @@ xsched_enabled := $(CONFIG_XCU_SCHEDULER) obj-$(xsched_enabled) += core.o obj-$(xsched_enabled)-$(CONFIG_XCU_SCHED_RT) += rt.o obj-$(xsched_enabled)-$(CONFIG_XCU_SCHED_CFS) += cfs.o +obj-$(xsched_enabled)-$(CONFIG_CGROUP_XCU) += cgroup.o diff --git a/kernel/xsched/cfs.c b/kernel/xsched/cfs.c index ea39ef8770f8..86dc63cd5745 100644 --- a/kernel/xsched/cfs.c +++ b/kernel/xsched/cfs.c @@ -79,10 +79,46 @@ xs_pick_first(struct xsched_rq_cfs *cfs_rq) */ static void xs_update(struct xsched_entity_cfs *xse_cfs, u64 delta) { - u64 new_xrt = xse_cfs->xruntime + delta * xse_cfs->weight; + struct xsched_group_xcu_priv *xg = xse_parent_grp_xcu(xse_cfs); - xs_cfs_rq_update(xse_cfs, new_xrt); - xse_cfs->sum_exec_runtime += delta; + for (; xg; xse_cfs = &xg->xse.cfs, xg = &xcg_parent_grp_xcu(xg)) { + u64 new_xrt = xse_cfs->xruntime + delta * xse_cfs->weight; + + xs_cfs_rq_update(xse_cfs, new_xrt); + xse_cfs->sum_exec_runtime += delta; + + if (xg->self->parent == NULL) + break; + } +} + +/** + * xg_update() - Update container group's xruntime + * @gxcu: Descendant xsched group's private xcu control structure + * + * No locks required to access xsched_group_xcu_priv members, + * because only one worker thread works for one XCU. + */ +static void xg_update(struct xsched_group_xcu_priv *xg, int task_delta) +{ + u64 new_xrt; + struct xsched_entity_cfs *entry; + + for (; xg; xg = &xcg_parent_grp_xcu(xg)) { + xg->cfs_rq->nr_running += task_delta; + entry = xs_pick_first(xg->cfs_rq); + new_xrt = entry ? entry->xruntime * xg->xse.cfs.weight : XSCHED_TIME_INF; + + xg->cfs_rq->min_xruntime = new_xrt; + xg->xse.cfs.xruntime = new_xrt; + + if (!xg->xse.on_rq) + break; + if (!xg->self->parent) + break; + + xs_cfs_rq_update(&xg->xse.cfs, new_xrt); + } } /* @@ -92,11 +128,16 @@ static void xs_update(struct xsched_entity_cfs *xse_cfs, u64 delta) */ static void dequeue_ctx_fair(struct xsched_entity *xse) { + int task_delta; struct xsched_cu *xcu = xse->xcu; struct xsched_entity_cfs *first; struct xsched_entity_cfs *xse_cfs = &xse->cfs; + task_delta = + (xse->is_group) ? -(xse_this_grp_xcu(xse_cfs)->cfs_rq->nr_running) : -1; + xs_rq_remove(xse_cfs); + xg_update(xse_parent_grp_xcu(xse_cfs), task_delta); first = xs_pick_first(&xcu->xrq.cfs); xcu->xrq.cfs.min_xruntime = (first) ? first->xruntime : XSCHED_TIME_INF; @@ -113,19 +154,23 @@ static void dequeue_ctx_fair(struct xsched_entity *xse) */ static void enqueue_ctx_fair(struct xsched_entity *xse, struct xsched_cu *xcu) { + int task_delta; struct xsched_entity_cfs *first; struct xsched_rq_cfs *rq; struct xsched_entity_cfs *xse_cfs = &xse->cfs; - rq = xse_cfs->cfs_rq = &xcu->xrq.cfs; + rq = xse_cfs->cfs_rq = xse_parent_grp_xcu(xse_cfs)->cfs_rq; + task_delta = + (xse->is_group) ? xse_this_grp_xcu(xse_cfs)->cfs_rq->nr_running : 1; - /* If no XSE of only empty groups */ + /* If no XSE or only empty groups */ if (xs_pick_first(rq) == NULL || rq->min_xruntime == XSCHED_TIME_INF) rq->min_xruntime = xse_cfs->xruntime; else xse_cfs->xruntime = max(xse_cfs->xruntime, rq->min_xruntime); xs_rq_add(xse_cfs); + xg_update(xse_parent_grp_xcu(xse_cfs), task_delta); first = xs_pick_first(&xcu->xrq.cfs); xcu->xrq.cfs.min_xruntime = (first) ? 
first->xruntime : XSCHED_TIME_INF; @@ -140,6 +185,12 @@ static struct xsched_entity *pick_next_ctx_fair(struct xsched_cu *xcu) if (!xse) return NULL; + for (; XSCHED_SE_OF(xse)->is_group; xse = xs_pick_first(rq)) { + if (!xse || CFS_INNER_RQ_EMPTY(xse)) + return NULL; + rq = xse_this_grp_xcu(xse)->cfs_rq; + } + return container_of(xse, struct xsched_entity, cfs); } diff --git a/kernel/xsched/cgroup.c b/kernel/xsched/cgroup.c new file mode 100644 index 000000000000..9f7b3d15e9a9 --- /dev/null +++ b/kernel/xsched/cgroup.c @@ -0,0 +1,662 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Support cgroup for xpu device + * + * Copyright (C) 2025-2026 Huawei Technologies Co., Ltd + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + */ +#include <linux/err.h> +#include <linux/cgroup.h> +#include <linux/seq_file.h> +#include <linux/slab.h> +#include <linux/xsched.h> + +enum xcu_file_type { + XCU_FILE_PERIOD_MS, + XCU_FILE_QUOTA_MS, + XCU_FILE_SHARES, +}; + +static struct xsched_group root_xsched_group; +struct xsched_group *root_xcg = &root_xsched_group; + +/* + * Cacheline aligned slab cache for xsched_group, + * to replace kzalloc with kmem_cache_alloc. + */ +static struct kmem_cache *xsched_group_cache __read_mostly; +static struct kmem_cache *xcg_attach_entry_cache __read_mostly; +static LIST_HEAD(xcg_attach_list); + +static const char xcu_sched_name[XSCHED_TYPE_NUM][4] = { + [XSCHED_TYPE_RT] = "rt", + [XSCHED_TYPE_CFS] = "cfs" +}; + +/** + * @brief Initialize the core components of an xsched_group. + * + * This function initializes the essential components of an xsched_group, + * including the spin lock, member list, children groups list, quota timeout + * mechanism, and refill work queue. These components are necessary for the + * proper functioning of the xsched_group. + * + * @param xcg Pointer to the xsched_group to be initialized. + */ +static void xcu_cg_initialize_components(struct xsched_group *xcg) +{ + spin_lock_init(&xcg->lock); + INIT_LIST_HEAD(&xcg->members); + INIT_LIST_HEAD(&xcg->children_groups); +} + +void xcu_cg_subsys_init(void) +{ + xcu_cg_initialize_components(root_xcg); + + root_xcg->sched_class = XSCHED_TYPE_DFLT; + + xsched_group_cache = KMEM_CACHE(xsched_group, 0); + xcg_attach_entry_cache = KMEM_CACHE(xcg_attach_entry, 0); +} + +void xcu_cfs_root_cg_init(struct xsched_cu *xcu) +{ + int id = xcu->id; + + root_xcg->perxcu_priv[id].xcu_id = id; + root_xcg->perxcu_priv[id].self = root_xcg; + root_xcg->perxcu_priv[id].cfs_rq = &xcu->xrq.cfs; + root_xcg->perxcu_priv[id].xse.cfs.weight = XSCHED_CFS_WEIGHT_DFLT; +} + +/** + * xcu_cfs_cg_init() - Initialize xsched_group cfs runqueues and bw control. + * @xcg: new xsched_cgroup + * @parent_xg: parent's group + * + * One xsched_group can host many processes with contexts on different devices. + * Function creates xsched_entity for every XCU, and places it in runqueue + * of parent group. Create new cfs rq for xse inside group. 
+ */ +static int xcu_cfs_cg_init(struct xsched_group *xcg, + struct xsched_group *parent_xg) +{ + int id = 0, err, i; + struct xsched_cu *xcu; + struct xsched_rq_cfs *sub_cfs_rq; + + for_each_active_xcu(xcu, id) { + xcg->perxcu_priv[id].xcu_id = id; + xcg->perxcu_priv[id].self = xcg; + + sub_cfs_rq = kzalloc(sizeof(struct xsched_rq_cfs), GFP_KERNEL); + if (!sub_cfs_rq) { + XSCHED_ERR("Fail to alloc cfs runqueue on xcu %d\n", id); + err = -ENOMEM; + goto alloc_error; + } + xcg->perxcu_priv[id].cfs_rq = sub_cfs_rq; + xcg->perxcu_priv[id].cfs_rq->ctx_timeline = RB_ROOT_CACHED; + + xcg->perxcu_priv[id].xse.is_group = true; + xcg->perxcu_priv[id].xse.xcu = xcu; + xcg->perxcu_priv[id].xse.class = &fair_xsched_class; + + /* Put new empty groups to the right in parent's rbtree: */ + xcg->perxcu_priv[id].xse.cfs.xruntime = XSCHED_TIME_INF; + xcg->perxcu_priv[id].xse.cfs.weight = XSCHED_CFS_WEIGHT_DFLT; + xcg->perxcu_priv[id].xse.parent_grp = parent_xg; + + mutex_lock(&xcu->xcu_lock); + enqueue_ctx(&xcg->perxcu_priv[id].xse, xcu); + mutex_unlock(&xcu->xcu_lock); + } + + xcg->shares_cfg = XSCHED_CFG_SHARE_DFLT; + xcu_grp_shares_update(parent_xg); + + return 0; + +alloc_error: + for (i = 0; i < id; i++) { + xcu = xsched_cu_mgr[i]; + mutex_lock(&xcu->xcu_lock); + dequeue_ctx(&xcg->perxcu_priv[i].xse, xcu); + mutex_unlock(&xcu->xcu_lock); + + kfree(xcg->perxcu_priv[i].cfs_rq); + } + + return err; +} + +static void xcu_cfs_cg_deinit(struct xsched_group *xcg) +{ + uint32_t id; + struct xsched_cu *xcu; + + for_each_active_xcu(xcu, id) { + mutex_lock(&xcu->xcu_lock); + dequeue_ctx(&xcg->perxcu_priv[id].xse, xcu); + mutex_unlock(&xcu->xcu_lock); + kfree(xcg->perxcu_priv[id].cfs_rq); + } + xcu_grp_shares_update(xcg->parent); +} + +/** + * xcu_cg_init() - Initialize non-root xsched_group structure. + * @xcg: new xsched_cgroup + * @parent_xg: parent's group + */ +static int xcu_cg_init(struct xsched_group *xcg, + struct xsched_group *parent_xg) +{ + xcu_cg_initialize_components(xcg); + xcg->parent = parent_xg; + list_add_tail(&xcg->group_node, &parent_xg->children_groups); + xcg->sched_class = parent_xg->sched_class; + + switch (xcg->sched_class) { + case XSCHED_TYPE_CFS: + return xcu_cfs_cg_init(xcg, parent_xg); + default: + XSCHED_INFO("xcu_cgroup: init RT group css=0x%lx\n", + (uintptr_t)&xcg->css); + break; + } + + return 0; +} + +inline struct xsched_group *xcu_cg_from_css(struct cgroup_subsys_state *css) +{ + return css ? container_of(css, struct xsched_group, css) : NULL; +} + +/* + * Determine whether the given css corresponds to root_xsched_group.css. + * + * Parameter only_css_self: + * - true : Only check whether the css pointer itself is NULL + * (i.e., the subsystem root). Do not dereference xg->parent. + * Used in the allocation path (css_alloc). + * - false : Further check whether the associated xsched_group + * has no parent (i.e., a normal root check). + */ +static inline bool xsched_group_css_is_root(struct cgroup_subsys_state *css, bool only_css_self) +{ + struct xsched_group *xg; + + /* NULL indicates the subsystem root */ + if (!css) + return true; + + /* + * During the allocation phase, + * cannot find its parent xsched_group via xg->parent, + * so can only determine on the css itself. + */ + if (only_css_self) + return false; + + xg = xcu_cg_from_css(css); + + return xg && !xg->parent; +} + +/** + * xcu_css_alloc() - Allocate and init xcu cgroup. + * @parent_css: css of parent xcu cgroup + * + * Called from kernel/cgroup.c with cgroup_lock() held. 
+ * First called in subsys initialization to create root xcu cgroup, when + * XCUs haven't been initialized yet. Func used on every new cgroup creation, + * on second call to set root xsched_group runqueue. + * + * Return: pointer of new xcu cgroup css on success, -ENOMEM otherwise. + */ +static struct cgroup_subsys_state * +xcu_css_alloc(struct cgroup_subsys_state *parent_css) +{ + struct xsched_group *xg; + + if (xsched_group_css_is_root(parent_css, true)) + return &root_xsched_group.css; + + xg = kmem_cache_alloc(xsched_group_cache, GFP_KERNEL | __GFP_ZERO); + if (!xg) + return ERR_PTR(-ENOMEM); + + return &xg->css; +} + +static void xcu_css_free(struct cgroup_subsys_state *css) +{ + struct xsched_group *xcg = xcu_cg_from_css(css); + + kmem_cache_free(xsched_group_cache, xcg); +} + +static int xcu_css_online(struct cgroup_subsys_state *css) +{ + struct xsched_group *xg = xcu_cg_from_css(css); + struct cgroup_subsys_state *parent_css = css->parent; + struct xsched_group *parent_xg; + int err; + + if (!parent_css) + return 0; + + parent_xg = xcu_cg_from_css(parent_css); + err = xcu_cg_init(xg, parent_xg); + if (err) { + kmem_cache_free(xsched_group_cache, xg); + XSCHED_ERR("Failed to initialize new xsched_group @ %s.\n", __func__); + return err; + } + + return 0; +} + +static void xcu_css_offline(struct cgroup_subsys_state *css) +{ + struct xsched_group *xcg; + + xcg = xcu_cg_from_css(css); + if (!xsched_group_css_is_root(css, false)) { + switch (xcg->sched_class) { + case XSCHED_TYPE_CFS: + xcu_cfs_cg_deinit(xcg); + break; + default: + XSCHED_INFO("xcu_cgroup: deinit RT group css=0x%lx\n", + (uintptr_t)&xcg->css); + break; + } + } + list_del(&xcg->group_node); +} + +static void xsched_group_xse_attach(struct xsched_group *xg, + struct xsched_entity *xse) +{ + spin_lock(&xg->lock); + list_add_tail(&xse->group_node, &xg->members); + spin_unlock(&xg->lock); + xse->parent_grp = xg; +} + +void xsched_group_xse_detach(struct xsched_entity *xse) +{ + struct xsched_group *xcg = xse->parent_grp; + + spin_lock(&xcg->lock); + list_del(&xse->group_node); + spin_unlock(&xcg->lock); +} + +static int xcu_task_can_attach(struct task_struct *task, + struct xsched_group *old) +{ + struct xsched_entity *xse; + bool has_xse = false; + + spin_lock(&old->lock); + list_for_each_entry(xse, &old->members, group_node) { + if (xse->owner_pid == task_pid_nr(task)) { + has_xse = true; + break; + } + } + spin_unlock(&old->lock); + + return has_xse ? 
-EINVAL : 0; +} + +static int xcu_can_attach(struct cgroup_taskset *tset) +{ + struct task_struct *task; + struct cgroup_subsys_state *dst_css, *old_css; + struct xsched_group *old_xcg, *dst_xcg; + struct xcg_attach_entry *entry; + int ret = 0; + + cgroup_taskset_for_each(task, dst_css, tset) { + rcu_read_lock(); + old_css = task_css(task, xcu_cgrp_id); + rcu_read_unlock(); + dst_xcg = xcu_cg_from_css(dst_css); + old_xcg = xcu_cg_from_css(old_css); + + ret = xcu_task_can_attach(task, old_xcg); + if (ret) + break; + + /* record entry for this task */ + entry = kmem_cache_alloc(xcg_attach_entry_cache, GFP_KERNEL | __GFP_ZERO); + entry->task = task; + entry->old_xcg = old_xcg; + entry->new_xcg = dst_xcg; + list_add_tail(&entry->node, &xcg_attach_list); + } + + return ret; +} + +static void xcu_cancel_attach(struct cgroup_taskset *tset) +{ + struct xcg_attach_entry *entry, *tmp; + + /* error: clear all entries */ + list_for_each_entry_safe(entry, tmp, &xcg_attach_list, node) { + list_del(&entry->node); + kmem_cache_free(xcg_attach_entry_cache, entry); + } +} + +void xcu_move_task(struct task_struct *task, struct xsched_group *old_xcg, + struct xsched_group *new_xcg) +{ + struct xsched_entity *xse, *tmp; + struct xsched_cu *xcu; + + spin_lock(&old_xcg->lock); + list_for_each_entry_safe(xse, tmp, &old_xcg->members, group_node) { + if (xse->owner_pid != task_pid_nr(task)) + continue; + + xcu = xse->xcu; + + if (old_xcg != xse->parent_grp) { + WARN_ON(old_xcg != xse->parent_grp); + return; + } + + /* delete from the old_xcg */ + list_del(&xse->group_node); + + mutex_lock(&xcu->xcu_lock); + /* dequeue from the current runqueue */ + dequeue_ctx(xse, xcu); + /* attach to the new_xcg */ + xsched_group_xse_attach(new_xcg, xse); + /* enqueue to the runqueue in new_xcg */ + enqueue_ctx(xse, xcu); + mutex_unlock(&xcu->xcu_lock); + } + spin_unlock(&old_xcg->lock); +} + +static void xcu_attach(struct cgroup_taskset *tset) +{ + struct xcg_attach_entry *entry, *tmp; + + list_for_each_entry(entry, &xcg_attach_list, node) { + xcu_move_task(entry->task, entry->old_xcg, entry->new_xcg); + } + + /* cleanup */ + list_for_each_entry_safe(entry, tmp, &xcg_attach_list, node) { + list_del(&entry->node); + kmem_cache_free(xcg_attach_entry_cache, entry); + } +} + +/** + * xsched_group_inherit() - Attach new entity to task's xsched_group. + * @task: task_struct + * @xse: xsched entity + * + * Called in xsched context initialization to attach xse to task's group + * and inherit its xse scheduling class and bandwidth control policy. + * + * Return: Zero on success. + */ +void xsched_group_inherit(struct task_struct *task, struct xsched_entity *xse) +{ + struct cgroup_subsys_state *css; + struct xsched_group *xg; + + xse->owner_pid = task_pid_nr(task); + css = task_get_css(task, xcu_cgrp_id); + xg = xcu_cg_from_css(css); + xsched_group_xse_attach(xg, xse); + css_put(css); +} + +static int xcu_sched_class_show(struct seq_file *sf, void *v) +{ + struct cgroup_subsys_state *css = seq_css(sf); + struct xsched_group *xg = xcu_cg_from_css(css); + + seq_printf(sf, "%s\n", xcu_sched_name[xg->sched_class]); + return 0; +} + +/** + * xcu_cg_set_sched_class() - Set scheduling type for group. + * @xg: xsched group + * @type: scheduler type + * + * Scheduler type can be changed if task is child of root group + * and haven't got scheduling entities. 
+ * + * Return: Zero on success or -EINVAL + */ +static int xcu_cg_set_sched_class(struct xsched_group *xg, int type) +{ + if (type == xg->sched_class) + return 0; + + /* can't change scheduler when there are running members */ + if (!list_empty(&xg->members)) + return -EBUSY; + + /* deinit old type if necessary */ + switch (xg->sched_class) { + case XSCHED_TYPE_CFS: + xcu_cfs_cg_deinit(xg); + break; + default: + break; + } + + /* update type */ + xg->sched_class = type; + + /* init new type if necessary */ + switch (type) { + case XSCHED_TYPE_CFS: + return xcu_cfs_cg_init(xg, xg->parent); + default: + return 0; + } +} + +static ssize_t xcu_sched_class_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) +{ + struct cgroup_subsys_state *css = of_css(of); + struct xsched_group *xg = xcu_cg_from_css(css); + char type_name[4]; + int type = -1; + + ssize_t ret = sscanf(buf, "%3s", type_name); + + if (ret < 1) + return -EINVAL; + + for (type = 0; type < XSCHED_TYPE_NUM; type++) { + if (!strcmp(type_name, xcu_sched_name[type])) + break; + } + + if (type == XSCHED_TYPE_NUM) + return -EINVAL; + + if (!list_empty(&css->children)) + return -EBUSY; + + /* only root child can switch scheduler type */ + if (!xg->parent || !xsched_group_css_is_root(&xg->parent->css, false)) + return -EINVAL; + + ret = xcu_cg_set_sched_class(xg, type); + + return (ret) ? ret : nbytes; +} + +static s64 xcu_read_s64(struct cgroup_subsys_state *css, struct cftype *cft) +{ + s64 ret = 0; + struct xsched_group *xcucg = xcu_cg_from_css(css); + + switch (cft->private) { + case XCU_FILE_SHARES: + ret = xcucg->shares_cfg; + break; + default: + XSCHED_ERR("invalid operation %lu @ %s\n", cft->private, __func__); + break; + } + + return ret; +} + +void xcu_grp_shares_update(struct xsched_group *parent) +{ + int id; + struct xsched_cu *xcu; + struct xsched_group *children; + u64 rem, sh_sum = 0, sh_gcd = 0, w_gcd = 0, sh_prod_red = 1; + + spin_lock(&parent->lock); + list_for_each_entry(children, &(parent)->children_groups, group_node) { + if (children->sched_class == XSCHED_TYPE_CFS) + sh_gcd = gcd(sh_gcd, children->shares_cfg); + } + + list_for_each_entry(children, &(parent)->children_groups, group_node) { + if (children->sched_class == XSCHED_TYPE_CFS) { + sh_sum += children->shares_cfg; + children->shares_cfg_red = div64_u64(children->shares_cfg, sh_gcd); + div64_u64_rem(sh_prod_red, children->shares_cfg_red, &rem); + if (rem) + sh_prod_red *= children->shares_cfg_red; + } + } + + parent->children_shares_sum = sh_sum; + list_for_each_entry(children, &(parent)->children_groups, group_node) { + if (children->sched_class == XSCHED_TYPE_CFS) { + children->weight = div64_u64(sh_prod_red, children->shares_cfg_red); + w_gcd = gcd(w_gcd, children->weight); + } + } + + list_for_each_entry(children, &(parent)->children_groups, group_node) { + if (children->sched_class == XSCHED_TYPE_CFS) { + children->weight = div64_u64(children->weight, w_gcd); + for_each_active_xcu(xcu, id) { + mutex_lock(&xcu->xcu_lock); + children->perxcu_priv[id].xse.cfs.weight = children->weight; + mutex_unlock(&xcu->xcu_lock); + } + } + } + spin_unlock(&parent->lock); +} + +static int xcu_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, + s64 val) +{ + int ret = 0; + struct xsched_group *xcucg = xcu_cg_from_css(css); + + switch (cft->private) { + case XCU_FILE_SHARES: + if (val <= 0) { + ret = -EINVAL; + break; + } + xcucg->shares_cfg = val; + xcu_grp_shares_update(xcucg->parent); + break; + default: + XSCHED_ERR("invalid operation 
%lu @ %s\n", cft->private, __func__); + ret = -EINVAL; + break; + } + + return ret; +} + +static int xcu_stat(struct seq_file *sf, void *v) +{ + struct cgroup_subsys_state *css = seq_css(sf); + struct xsched_group *xcucg = xcu_cg_from_css(css); + u64 exec_runtime = 0; + int xcu_id; + struct xsched_cu *xcu; + + if (xcucg->sched_class == XSCHED_TYPE_RT) { + seq_printf(sf, "RT group stat is not supported @ %s.\n", __func__); + return 0; + } + + for_each_active_xcu(xcu, xcu_id) { + exec_runtime += + xcucg->perxcu_priv[xcu_id].xse.cfs.sum_exec_runtime; + } + + seq_printf(sf, "exec_runtime: %llu\n", exec_runtime); + seq_printf(sf, "shares cfg: %llu/%llu x%u\n", xcucg->shares_cfg, + xcucg->parent->children_shares_sum, xcucg->weight); + + return 0; +} + +static struct cftype xcu_cg_files[] = { + { + .name = "shares", + .flags = CFTYPE_NOT_ON_ROOT, + .read_s64 = xcu_read_s64, + .write_s64 = xcu_write_s64, + .private = XCU_FILE_SHARES, + }, + { + .name = "stat", + .seq_show = xcu_stat, + }, + { + .name = "sched_class", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = xcu_sched_class_show, + .write = xcu_sched_class_write, + }, + {} /* terminate */ +}; + +struct cgroup_subsys xcu_cgrp_subsys = { + .css_alloc = xcu_css_alloc, + .css_online = xcu_css_online, + .css_offline = xcu_css_offline, + .css_free = xcu_css_free, + .can_attach = xcu_can_attach, + .cancel_attach = xcu_cancel_attach, + .attach = xcu_attach, + .dfl_cftypes = xcu_cg_files, + .legacy_cftypes = xcu_cg_files, + .early_init = false, +}; diff --git a/kernel/xsched/core.c b/kernel/xsched/core.c index 2905cca41205..ad32f8a74440 100644 --- a/kernel/xsched/core.c +++ b/kernel/xsched/core.c @@ -183,25 +183,27 @@ int delete_ctx(struct xsched_context *ctx) atomic_read(&xse->kicks_pending_ctx_cnt), __func__); xse->class->xse_deinit(xse); + +#ifdef CONFIG_CGROUP_XCU + xsched_group_xse_detach(xse); +#endif + return 0; } int xsched_xse_set_class(struct xsched_entity *xse) { - switch (xse->task_type) { - case XSCHED_TYPE_RT: - xse->class = &rt_xsched_class; - XSCHED_DEBUG("Context is in RT class %s\n", __func__); - break; - case XSCHED_TYPE_CFS: - xse->class = &fair_xsched_class; - XSCHED_DEBUG("Context is in CFS class %s\n", __func__); - break; - default: - XSCHED_ERR("Xse has incorrect class @ %s\n", __func__); - return -EINVAL; + struct xsched_class *sched = xsched_first_class; + +#ifdef CONFIG_CGROUP_XCU + xsched_group_inherit(current, xse); + for_each_xsched_class(sched) { + if (sched->class_id == xse->parent_grp->sched_class) + break; } +#endif + xse->class = sched; return 0; } @@ -509,6 +511,10 @@ __init int xsched_sched_init(void) xsched_register_sched_class(&fair_xsched_class); #endif +#ifdef CONFIG_CGROUP_XCU + xcu_cg_subsys_init(); +#endif + return 0; } late_initcall(xsched_sched_init); -- 2.34.1
From: Alekseev Dmitry <alekseev.dmitry@huawei.com> hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IC5EHB ----------------------------------------- Add support for CFS quota for cgroups. Signed-off-by: Alekseev Dmitry <alekseev.dmitry@huawei.com> Signed-off-by: Hui Tang <tanghui20@.huawei.com> Signed-off-by: Liu Kai <liukai284@huawei.com> Signed-off-by: Xia Fukun <xiafukun@huawei.com> Signed-off-by: Zicheng Qu <quzicheng@huawei.com> --- include/linux/xsched.h | 31 ++++++++ include/uapi/linux/xcu_vstream.h | 1 + kernel/xsched/Makefile | 2 +- kernel/xsched/cfs.c | 1 + kernel/xsched/cfs_quota.c | 98 ++++++++++++++++++++++++ kernel/xsched/cgroup.c | 127 +++++++++++++++++++++++++++++-- kernel/xsched/core.c | 5 ++ 7 files changed, 257 insertions(+), 8 deletions(-) create mode 100644 kernel/xsched/cfs_quota.c diff --git a/include/linux/xsched.h b/include/linux/xsched.h index 8bbb533ab043..d97b3beae8ad 100644 --- a/include/linux/xsched.h +++ b/include/linux/xsched.h @@ -42,6 +42,7 @@ #define RUNTIME_INF ((u64)~0ULL) #define XSCHED_TIME_INF RUNTIME_INF #define XSCHED_CFS_WEIGHT_DFLT 1 +#define XSCHED_CFS_QUOTA_PERIOD_MS (100 * NSEC_PER_MSEC) #define XSCHED_CFG_SHARE_DFLT 1024 /* @@ -257,6 +258,16 @@ struct xsched_group_xcu_priv { struct xsched_entity xse; /* xse of this group on runqueue */ struct xsched_rq_cfs *cfs_rq; /* cfs runqueue "owned" by this group */ struct xsched_rq_rt *rt_rq; /* rt runqueue "owned" by this group */ + /* Statistics */ + int nr_throttled; + u64 throttled_time; +}; + +enum xcu_file_type { + XCU_FILE_PERIOD_MS, + XCU_FILE_QUOTA_MS, + XCU_FILE_SHARES, + NR_XCU_FILE_TYPES, }; /* Xsched scheduling control group */ @@ -274,6 +285,14 @@ struct xsched_group { u32 weight; u64 children_shares_sum; + /* Bandwidth setting: maximal quota in period */ + s64 quota; + /* record the runtime of operators during the period */ + s64 runtime; + s64 period; + struct hrtimer quota_timeout; + struct work_struct refill_work; + struct xsched_group_xcu_priv perxcu_priv[XSCHED_NR_CUS]; /* Groups hierarchcy */ @@ -285,6 +304,10 @@ struct xsched_group { /* for XSE to move in perxcu */ struct list_head members; + + /* to control the xcu.{period, quota, shares} files shown or not */ + struct cgroup_file xcu_file[NR_XCU_FILE_TYPES]; + struct work_struct file_show_work; }; #define XSCHED_RQ_OF(xse) \ @@ -456,6 +479,7 @@ static inline void xsched_init_vsm(struct vstream_metadata *vsm, struct vstream_info *vs, vstream_args_t *arg) { vsm->sq_id = arg->sq_id; + vsm->exec_time = arg->vk_args.exec_time; vsm->sqe_num = arg->vk_args.sqe_num; vsm->timeout = arg->vk_args.timeout; memcpy(vsm->sqe, arg->vk_args.sqe, XCU_SQE_SIZE_MAX); @@ -480,4 +504,11 @@ void xcu_cg_subsys_init(void); void xcu_cfs_root_cg_init(struct xsched_cu *xcu); void xcu_grp_shares_update(struct xsched_group *parent); void xsched_group_xse_detach(struct xsched_entity *xse); + +void xsched_quota_init(void); +void xsched_quota_timeout_init(struct xsched_group *xg); +void xsched_quota_timeout_update(struct xsched_group *xg); +void xsched_quota_account(struct xsched_group *xg, s64 exec_time); +bool xsched_quota_exceed(struct xsched_group *xg); +void xsched_quota_refill(struct work_struct *work); #endif /* !__LINUX_XSCHED_H__ */ diff --git a/include/uapi/linux/xcu_vstream.h b/include/uapi/linux/xcu_vstream.h index 38cc97d3a139..b60c0e0e15f5 100644 --- a/include/uapi/linux/xcu_vstream.h +++ b/include/uapi/linux/xcu_vstream.h @@ -42,6 +42,7 @@ typedef struct vstream_free_args { typedef struct vstream_kick_args { 
__u32 sqe_num; + __u32 exec_time; __s32 timeout; __s8 sqe[XCU_SQE_SIZE_MAX]; diff --git a/kernel/xsched/Makefile b/kernel/xsched/Makefile index 38b2f20e36fd..14dea4d32391 100644 --- a/kernel/xsched/Makefile +++ b/kernel/xsched/Makefile @@ -3,5 +3,5 @@ obj-y += vstream.o xsched_enabled := $(CONFIG_XCU_SCHEDULER) obj-$(xsched_enabled) += core.o obj-$(xsched_enabled)-$(CONFIG_XCU_SCHED_RT) += rt.o -obj-$(xsched_enabled)-$(CONFIG_XCU_SCHED_CFS) += cfs.o +obj-$(xsched_enabled)-$(CONFIG_XCU_SCHED_CFS) += cfs.o cfs_quota.o obj-$(xsched_enabled)-$(CONFIG_CGROUP_XCU) += cgroup.o diff --git a/kernel/xsched/cfs.c b/kernel/xsched/cfs.c index 86dc63cd5745..1cbfd5f0e586 100644 --- a/kernel/xsched/cfs.c +++ b/kernel/xsched/cfs.c @@ -204,6 +204,7 @@ static void put_prev_ctx_fair(struct xsched_entity *xse) { struct xsched_entity_cfs *prev = &xse->cfs; + xsched_quota_account(xse->parent_grp, (s64)xse->last_exec_runtime); xs_update(prev, xse->last_exec_runtime); } diff --git a/kernel/xsched/cfs_quota.c b/kernel/xsched/cfs_quota.c new file mode 100644 index 000000000000..2b516ab5592f --- /dev/null +++ b/kernel/xsched/cfs_quota.c @@ -0,0 +1,98 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Bandwidth provisioning for XPU device + * + * Copyright (C) 2025-2026 Huawei Technologies Co., Ltd + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + */ +#include <linux/timer.h> +#include <linux/xsched.h> + +static struct workqueue_struct *quota_workqueue; + +void xsched_quota_refill(struct work_struct *work) +{ + uint32_t id; + struct xsched_cu *xcu; + struct xsched_group *xg; + + xg = container_of(work, struct xsched_group, refill_work); + + spin_lock(&xg->lock); + xg->runtime = max((xg->runtime - xg->quota), (s64)0); + hrtimer_start(&xg->quota_timeout, ns_to_ktime(xg->period), HRTIMER_MODE_REL_SOFT); + spin_unlock(&xg->lock); + + if (xg->runtime >= xg->quota) { + XSCHED_DEBUG("xcu_cgroup [css=0x%lx] is still be throttled @ %s\n", + (uintptr_t)&xg->css, __func__); + return; + } + + for_each_active_xcu(xcu, id) { + mutex_lock(&xcu->xcu_lock); + if (!READ_ONCE(xg->perxcu_priv[id].xse.on_rq)) { + enqueue_ctx(&xg->perxcu_priv[id].xse, xcu); + wake_up_interruptible(&xcu->wq_xcu_idle); + } + mutex_unlock(&xcu->xcu_lock); + } +} + +static enum hrtimer_restart quota_timer_cb(struct hrtimer *hrtimer) +{ + struct xsched_group *xg; + + xg = container_of(hrtimer, struct xsched_group, quota_timeout); + queue_work(quota_workqueue, &xg->refill_work); + + return HRTIMER_NORESTART; +} + +void xsched_quota_account(struct xsched_group *xg, s64 exec_time) +{ + spin_lock(&xg->lock); + xg->runtime += exec_time; + spin_unlock(&xg->lock); +} + +bool xsched_quota_exceed(struct xsched_group *xg) +{ + bool ret; + + spin_lock(&xg->lock); + ret = (xg->quota > 0) ? 
(xg->runtime >= xg->quota) : false; + spin_unlock(&xg->lock); + + return ret; +} + +void xsched_quota_init(void) +{ + quota_workqueue = create_singlethread_workqueue("xsched_quota_workqueue"); +} + +void xsched_quota_timeout_init(struct xsched_group *xg) +{ + hrtimer_init(&xg->quota_timeout, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT); + xg->quota_timeout.function = quota_timer_cb; +} + +void xsched_quota_timeout_update(struct xsched_group *xg) +{ + struct hrtimer *t = &xg->quota_timeout; + + hrtimer_cancel(t); + + if (xg->quota > 0 && xg->period > 0) + hrtimer_start(t, ns_to_ktime(xg->period), HRTIMER_MODE_REL_SOFT); +} diff --git a/kernel/xsched/cgroup.c b/kernel/xsched/cgroup.c index 9f7b3d15e9a9..f7eeedc80fc3 100644 --- a/kernel/xsched/cgroup.c +++ b/kernel/xsched/cgroup.c @@ -19,12 +19,7 @@ #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/xsched.h> - -enum xcu_file_type { - XCU_FILE_PERIOD_MS, - XCU_FILE_QUOTA_MS, - XCU_FILE_SHARES, -}; +#include <linux/delay.h> static struct xsched_group root_xsched_group; struct xsched_group *root_xcg = &root_xsched_group; @@ -42,6 +37,29 @@ static const char xcu_sched_name[XSCHED_TYPE_NUM][4] = { [XSCHED_TYPE_CFS] = "cfs" }; +static int xcu_cg_set_file_show(struct xsched_group *xg) +{ + if (!xg) { + XSCHED_ERR("xsched_group is NULL.\n"); + return -EINVAL; + } + + /* Update visibility of related files based on sched_class */ + for (int type_name = XCU_FILE_PERIOD_MS; type_name < NR_XCU_FILE_TYPES; type_name++) { + if (unlikely(!xg->xcu_file[type_name].kn)) { + XSCHED_ERR("Fail to control the file [%d] to be %s @ %s.\n", + type_name, + xg->sched_class == XSCHED_TYPE_CFS ? "visible" : "invisible", + __func__); + return -EBUSY; + } + + cgroup_file_show(&xg->xcu_file[type_name], xg->sched_class == XSCHED_TYPE_CFS); + } + + return 0; +} + /** * @brief Initialize the core components of an xsched_group. 
* @@ -57,6 +75,8 @@ static void xcu_cg_initialize_components(struct xsched_group *xcg) spin_lock_init(&xcg->lock); INIT_LIST_HEAD(&xcg->members); INIT_LIST_HEAD(&xcg->children_groups); + xsched_quota_timeout_init(xcg); + INIT_WORK(&xcg->refill_work, xsched_quota_refill); } void xcu_cg_subsys_init(void) @@ -64,6 +84,10 @@ void xcu_cg_subsys_init(void) xcu_cg_initialize_components(root_xcg); root_xcg->sched_class = XSCHED_TYPE_DFLT; + root_xcg->period = XSCHED_CFS_QUOTA_PERIOD_MS; + root_xcg->quota = XSCHED_TIME_INF; + root_xcg->runtime = 0; + xsched_quota_init(); xsched_group_cache = KMEM_CACHE(xsched_group, 0); xcg_attach_entry_cache = KMEM_CACHE(xcg_attach_entry, 0); @@ -124,6 +148,9 @@ static int xcu_cfs_cg_init(struct xsched_group *xcg, xcg->shares_cfg = XSCHED_CFG_SHARE_DFLT; xcu_grp_shares_update(parent_xg); + xcg->period = XSCHED_CFS_QUOTA_PERIOD_MS; + xcg->quota = XSCHED_TIME_INF; + xcg->runtime = 0; return 0; @@ -248,6 +275,26 @@ static void xcu_css_free(struct cgroup_subsys_state *css) kmem_cache_free(xsched_group_cache, xcg); } + +static void delay_xcu_cg_set_file_show_workfn(struct work_struct *work) +{ + struct xsched_group *xg; + int retry = 50; + + xg = container_of(work, struct xsched_group, file_show_work); + + for (int i = 0; i < retry; i++) { + if (!xcu_cg_set_file_show(xg)) + return; + + mdelay(10); + } + + XSCHED_ERR("Failed to control the files xcu.{quota, period, shares} visibility after\n" + "%d retries, sched_class=%d, css=0x%lx\n", + retry, xg->sched_class, (uintptr_t)&xg->css); +} + static int xcu_css_online(struct cgroup_subsys_state *css) { struct xsched_group *xg = xcu_cg_from_css(css); @@ -266,6 +313,9 @@ static int xcu_css_online(struct cgroup_subsys_state *css) return err; } + INIT_WORK(&xg->file_show_work, delay_xcu_cg_set_file_show_workfn); + schedule_work(&xg->file_show_work); + return 0; } @@ -285,6 +335,8 @@ static void xcu_css_offline(struct cgroup_subsys_state *css) break; } } + hrtimer_cancel(&xcg->quota_timeout); + cancel_work_sync(&xcg->refill_work); list_del(&xcg->group_node); } @@ -469,17 +521,22 @@ static int xcu_cg_set_sched_class(struct xsched_group *xg, int type) xcu_cfs_cg_deinit(xg); break; default: + XSCHED_INFO("xcu_cgroup: the original sched_class is RT, css=0x%lx\n", + (uintptr_t)&xg->css); break; } /* update type */ xg->sched_class = type; + xcu_cg_set_file_show(xg); /* init new type if necessary */ switch (type) { case XSCHED_TYPE_CFS: return xcu_cfs_cg_init(xg, xg->parent); default: + XSCHED_INFO("xcu_cgroup: the target sched_class is RT, css=0x%lx\n", + (uintptr_t)&xg->css); return 0; } } @@ -523,6 +580,13 @@ static s64 xcu_read_s64(struct cgroup_subsys_state *css, struct cftype *cft) struct xsched_group *xcucg = xcu_cg_from_css(css); switch (cft->private) { + case XCU_FILE_PERIOD_MS: + ret = div_s64(xcucg->period, NSEC_PER_MSEC); + break; + case XCU_FILE_QUOTA_MS: + ret = (xcucg->quota > 0) ? 
div_s64(xcucg->quota, NSEC_PER_MSEC) + : xcucg->quota; + break; case XCU_FILE_SHARES: ret = xcucg->shares_cfg; break; @@ -583,10 +647,33 @@ static int xcu_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, { int ret = 0; struct xsched_group *xcucg = xcu_cg_from_css(css); + s64 quota_ns; switch (cft->private) { + case XCU_FILE_PERIOD_MS: + if (val < 1 || val > (S64_MAX / NSEC_PER_MSEC)) { + ret = -EINVAL; + break; + } + xcucg->period = val * NSEC_PER_MSEC; + xsched_quota_timeout_update(xcucg); + break; + case XCU_FILE_QUOTA_MS: + if (val < -1 || val > (S64_MAX / NSEC_PER_MSEC)) { + ret = -EINVAL; + break; + } + /* Runtime should be updated when modifying quota_ms configuration */ + quota_ns = (val > 0) ? val * NSEC_PER_MSEC : val; + if (xcucg->quota > 0 && quota_ns > 0) + xcucg->runtime = max((xcucg->runtime - quota_ns), (s64)0); + else + xcucg->runtime = 0; + xcucg->quota = quota_ns; + xsched_quota_timeout_update(xcucg); + break; case XCU_FILE_SHARES: - if (val <= 0) { + if (val <= 0 || val > U64_MAX) { ret = -EINVAL; break; } @@ -606,6 +693,8 @@ static int xcu_stat(struct seq_file *sf, void *v) { struct cgroup_subsys_state *css = seq_css(sf); struct xsched_group *xcucg = xcu_cg_from_css(css); + u64 nr_throttled = 0; + u64 throttled_time = 0; u64 exec_runtime = 0; int xcu_id; struct xsched_cu *xcu; @@ -616,6 +705,8 @@ static int xcu_stat(struct seq_file *sf, void *v) } for_each_active_xcu(xcu, xcu_id) { + nr_throttled += xcucg->perxcu_priv[xcu_id].nr_throttled; + throttled_time += xcucg->perxcu_priv[xcu_id].throttled_time; exec_runtime += xcucg->perxcu_priv[xcu_id].xse.cfs.sum_exec_runtime; } @@ -623,17 +714,39 @@ static int xcu_stat(struct seq_file *sf, void *v) seq_printf(sf, "exec_runtime: %llu\n", exec_runtime); seq_printf(sf, "shares cfg: %llu/%llu x%u\n", xcucg->shares_cfg, xcucg->parent->children_shares_sum, xcucg->weight); + seq_printf(sf, "quota: %lld\n", xcucg->quota); + seq_printf(sf, "used: %lld\n", xcucg->runtime); + seq_printf(sf, "period: %lld\n", xcucg->period); + seq_printf(sf, "nr_throttled: %lld\n", nr_throttled); + seq_printf(sf, "throttled_time: %lld\n", throttled_time); return 0; } static struct cftype xcu_cg_files[] = { + { + .name = "period_ms", + .flags = CFTYPE_NOT_ON_ROOT, + .read_s64 = xcu_read_s64, + .write_s64 = xcu_write_s64, + .private = XCU_FILE_PERIOD_MS, + .file_offset = offsetof(struct xsched_group, xcu_file[XCU_FILE_PERIOD_MS]), + }, + { + .name = "quota_ms", + .flags = CFTYPE_NOT_ON_ROOT, + .read_s64 = xcu_read_s64, + .write_s64 = xcu_write_s64, + .private = XCU_FILE_QUOTA_MS, + .file_offset = offsetof(struct xsched_group, xcu_file[XCU_FILE_QUOTA_MS]), + }, { .name = "shares", .flags = CFTYPE_NOT_ON_ROOT, .read_s64 = xcu_read_s64, .write_s64 = xcu_write_s64, .private = XCU_FILE_SHARES, + .file_offset = offsetof(struct xsched_group, xcu_file[XCU_FILE_SHARES]), }, { .name = "stat", diff --git a/kernel/xsched/core.c b/kernel/xsched/core.c index ad32f8a74440..b920a7923999 100644 --- a/kernel/xsched/core.c +++ b/kernel/xsched/core.c @@ -400,6 +400,11 @@ int xsched_schedule(void *input_xcu) if (!atomic_read(&curr_xse->kicks_pending_ctx_cnt)) dequeue_ctx(curr_xse, xcu); +#ifdef CONFIG_CGROUP_XCU + if (xsched_quota_exceed(curr_xse->parent_grp)) + dequeue_ctx(&curr_xse->parent_grp->perxcu_priv[xcu->id].xse, xcu); +#endif + xcu->xrq.curr_xse = NULL; } -- 2.34.1
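The new bandwidth knobs are plain cgroup files, so a group can be configured without dedicated tooling. The sketch below writes a 20 ms quota per 100 ms period, matching the XCU_FILE_PERIOD_MS/XCU_FILE_QUOTA_MS write handlers above; the cgroup mount point, group name, and the "xcu." file prefix are assumptions for illustration (the prefix depends on how the controller is named in cgroup_subsys.h). Per the write handlers, period_ms must be at least 1, and writing -1 to quota_ms removes the limit, since xsched_quota_exceed() only throttles when quota > 0.

/*
 * Illustrative user-space configuration of the new bandwidth knobs.
 * Paths and the group name are assumptions, not part of this patch.
 */
#include <stdio.h>
#include <stdlib.h>

static void write_knob(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		exit(EXIT_FAILURE);
	}
	fprintf(f, "%s\n", val);
	fclose(f);
}

int main(void)
{
	/* Allow at most 20 ms of XCU time per 100 ms period for this group. */
	write_knob("/sys/fs/cgroup/xcu_demo/xcu.period_ms", "100");
	write_knob("/sys/fs/cgroup/xcu_demo/xcu.quota_ms", "20");
	/* Writing "-1" to xcu.quota_ms would lift the limit again. */
	return 0;
}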
Feedback: The patch(es) which you have sent to the kernel@openeuler.org mailing list have been converted to a pull request successfully!
Pull request link: https://gitee.com/openeuler/kernel/pulls/19000
Mailing list address: https://mailweb.openeuler.org/archives/list/kernel@openeuler.org/message/EDI...