From: Alekseev Dmitry <alekseev.dmitry@huawei.com>

hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/IC5EHB

-----------------------------------------

Add CFS quota support for xcu cgroups. Runtime consumed by a group is
charged against its quota within each period; once the quota is
exhausted, the group is throttled until the quota is refilled at the
start of the next period.

Signed-off-by: Alekseev Dmitry <alekseev.dmitry@huawei.com>
Signed-off-by: Hui Tang <tanghui20@huawei.com>
Signed-off-by: Liu Kai <liukai284@huawei.com>
Signed-off-by: Xia Fukun <xiafukun@huawei.com>
Signed-off-by: Zicheng Qu <quzicheng@huawei.com>
---
 include/linux/xsched.h           |  31 ++++
 include/uapi/linux/xcu_vstream.h |   1 +
 kernel/xsched/Makefile           |   2 +-
 kernel/xsched/cfs.c              |   1 +
 kernel/xsched/cfs_quota.c        |  98 ++++++++++++++++++++++++
 kernel/xsched/cgroup.c           | 127 +++++++++++++++++++++++++++++--
 kernel/xsched/core.c             |   5 ++
 7 files changed, 257 insertions(+), 8 deletions(-)
 create mode 100644 kernel/xsched/cfs_quota.c

diff --git a/include/linux/xsched.h b/include/linux/xsched.h
index 8bbb533ab043..d97b3beae8ad 100644
--- a/include/linux/xsched.h
+++ b/include/linux/xsched.h
@@ -42,6 +42,7 @@
 #define RUNTIME_INF ((u64)~0ULL)
 #define XSCHED_TIME_INF RUNTIME_INF
 #define XSCHED_CFS_WEIGHT_DFLT 1
+#define XSCHED_CFS_QUOTA_PERIOD_MS (100 * NSEC_PER_MSEC) /* 100 ms, in ns */
 #define XSCHED_CFG_SHARE_DFLT 1024
 
 /*
@@ -257,6 +258,16 @@ struct xsched_group_xcu_priv {
 	struct xsched_entity xse;     /* xse of this group on runqueue */
 	struct xsched_rq_cfs *cfs_rq; /* cfs runqueue "owned" by this group */
 	struct xsched_rq_rt *rt_rq;   /* rt runqueue "owned" by this group */
+	/* Statistics */
+	int nr_throttled;
+	u64 throttled_time;
+};
+
+enum xcu_file_type {
+	XCU_FILE_PERIOD_MS,
+	XCU_FILE_QUOTA_MS,
+	XCU_FILE_SHARES,
+	NR_XCU_FILE_TYPES,
 };
 
 /* Xsched scheduling control group */
@@ -274,6 +285,14 @@ struct xsched_group {
 	u32 weight;
 	u64 children_shares_sum;
 
+	/* Bandwidth setting: maximum quota per period */
+	s64 quota;
+	/* Runtime consumed by operators during the current period */
+	s64 runtime;
+	s64 period;
+	struct hrtimer quota_timeout;
+	struct work_struct refill_work;
+
 	struct xsched_group_xcu_priv perxcu_priv[XSCHED_NR_CUS];
 
 	/* Groups hierarchcy */
@@ -285,6 +304,10 @@ struct xsched_group {
 
 	/* for XSE to move in perxcu */
 	struct list_head members;
+
+	/* Controls whether the xcu.{period, quota, shares} files are shown */
+	struct cgroup_file xcu_file[NR_XCU_FILE_TYPES];
+	struct work_struct file_show_work;
 };
 
 #define XSCHED_RQ_OF(xse) \
@@ -456,6 +479,7 @@ static inline void xsched_init_vsm(struct vstream_metadata *vsm,
 				   struct vstream_info *vs, vstream_args_t *arg)
 {
 	vsm->sq_id = arg->sq_id;
+	vsm->exec_time = arg->vk_args.exec_time;
 	vsm->sqe_num = arg->vk_args.sqe_num;
 	vsm->timeout = arg->vk_args.timeout;
 	memcpy(vsm->sqe, arg->vk_args.sqe, XCU_SQE_SIZE_MAX);
@@ -480,4 +504,11 @@ void xcu_cg_subsys_init(void);
 void xcu_cfs_root_cg_init(struct xsched_cu *xcu);
 void xcu_grp_shares_update(struct xsched_group *parent);
 void xsched_group_xse_detach(struct xsched_entity *xse);
+
+void xsched_quota_init(void);
+void xsched_quota_timeout_init(struct xsched_group *xg);
+void xsched_quota_timeout_update(struct xsched_group *xg);
+void xsched_quota_account(struct xsched_group *xg, s64 exec_time);
+bool xsched_quota_exceed(struct xsched_group *xg);
+void xsched_quota_refill(struct work_struct *work);
 #endif /* !__LINUX_XSCHED_H__ */
diff --git a/include/uapi/linux/xcu_vstream.h b/include/uapi/linux/xcu_vstream.h
index 38cc97d3a139..b60c0e0e15f5 100644
--- a/include/uapi/linux/xcu_vstream.h
+++ b/include/uapi/linux/xcu_vstream.h
@@ -42,6 +42,7 @@ typedef struct vstream_free_args {
 typedef struct vstream_kick_args {
 	__u32 sqe_num;
+	__u32 exec_time;
 	__s32 timeout;
 	__s8 sqe[XCU_SQE_SIZE_MAX];
diff --git a/kernel/xsched/Makefile b/kernel/xsched/Makefile
index ddbeefe37e87..a6081a7aaf14 100644
--- a/kernel/xsched/Makefile
+++ b/kernel/xsched/Makefile
@@ -4,6 +4,6 @@ obj-y += vstream.o
 ifdef CONFIG_XCU_SCHEDULER
 obj-y += core.o
 obj-$(CONFIG_XCU_SCHED_RT) += rt.o
-obj-$(CONFIG_XCU_SCHED_CFS) += cfs.o
+obj-$(CONFIG_XCU_SCHED_CFS) += cfs.o cfs_quota.o
 obj-$(CONFIG_CGROUP_XCU) += cgroup.o
 endif
diff --git a/kernel/xsched/cfs.c b/kernel/xsched/cfs.c
index 86dc63cd5745..1cbfd5f0e586 100644
--- a/kernel/xsched/cfs.c
+++ b/kernel/xsched/cfs.c
@@ -204,6 +204,7 @@ static void put_prev_ctx_fair(struct xsched_entity *xse)
 {
 	struct xsched_entity_cfs *prev = &xse->cfs;
 
+	xsched_quota_account(xse->parent_grp, (s64)xse->last_exec_runtime);
 	xs_update(prev, xse->last_exec_runtime);
 }
 
diff --git a/kernel/xsched/cfs_quota.c b/kernel/xsched/cfs_quota.c
new file mode 100644
index 000000000000..2b516ab5592f
--- /dev/null
+++ b/kernel/xsched/cfs_quota.c
@@ -0,0 +1,98 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Bandwidth provisioning for XPU device
+ *
+ * Copyright (C) 2025-2026 Huawei Technologies Co., Ltd
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ */
+#include <linux/timer.h>
+#include <linux/xsched.h>
+
+static struct workqueue_struct *quota_workqueue;
+
+void xsched_quota_refill(struct work_struct *work)
+{
+	uint32_t id;
+	struct xsched_cu *xcu;
+	struct xsched_group *xg;
+
+	xg = container_of(work, struct xsched_group, refill_work);
+
+	spin_lock(&xg->lock);
+	xg->runtime = max((xg->runtime - xg->quota), (s64)0);
+	hrtimer_start(&xg->quota_timeout, ns_to_ktime(xg->period), HRTIMER_MODE_REL_SOFT);
+	spin_unlock(&xg->lock);
+
+	if (xg->runtime >= xg->quota) {
+		XSCHED_DEBUG("xcu_cgroup [css=0x%lx] is still throttled @ %s\n",
+			     (uintptr_t)&xg->css, __func__);
+		return;
+	}
+
+	for_each_active_xcu(xcu, id) {
+		mutex_lock(&xcu->xcu_lock);
+		if (!READ_ONCE(xg->perxcu_priv[id].xse.on_rq)) {
+			enqueue_ctx(&xg->perxcu_priv[id].xse, xcu);
+			wake_up_interruptible(&xcu->wq_xcu_idle);
+		}
+		mutex_unlock(&xcu->xcu_lock);
+	}
+}
+
+static enum hrtimer_restart quota_timer_cb(struct hrtimer *hrtimer)
+{
+	struct xsched_group *xg;
+
+	xg = container_of(hrtimer, struct xsched_group, quota_timeout);
+	queue_work(quota_workqueue, &xg->refill_work);
+
+	return HRTIMER_NORESTART;
+}
+
+void xsched_quota_account(struct xsched_group *xg, s64 exec_time)
+{
+	spin_lock(&xg->lock);
+	xg->runtime += exec_time;
+	spin_unlock(&xg->lock);
+}
+
+bool xsched_quota_exceed(struct xsched_group *xg)
+{
+	bool ret;
+
+	spin_lock(&xg->lock);
+	ret = (xg->quota > 0) ?
+	      (xg->runtime >= xg->quota) : false;
+	spin_unlock(&xg->lock);
+
+	return ret;
+}
+
+void xsched_quota_init(void)
+{
+	quota_workqueue = create_singlethread_workqueue("xsched_quota_workqueue");
+}
+
+void xsched_quota_timeout_init(struct xsched_group *xg)
+{
+	hrtimer_init(&xg->quota_timeout, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT);
+	xg->quota_timeout.function = quota_timer_cb;
+}
+
+void xsched_quota_timeout_update(struct xsched_group *xg)
+{
+	struct hrtimer *t = &xg->quota_timeout;
+
+	hrtimer_cancel(t);
+
+	if (xg->quota > 0 && xg->period > 0)
+		hrtimer_start(t, ns_to_ktime(xg->period), HRTIMER_MODE_REL_SOFT);
+}
diff --git a/kernel/xsched/cgroup.c b/kernel/xsched/cgroup.c
index 9f7b3d15e9a9..f7eeedc80fc3 100644
--- a/kernel/xsched/cgroup.c
+++ b/kernel/xsched/cgroup.c
@@ -19,12 +19,7 @@
 #include <linux/seq_file.h>
 #include <linux/slab.h>
 #include <linux/xsched.h>
-
-enum xcu_file_type {
-	XCU_FILE_PERIOD_MS,
-	XCU_FILE_QUOTA_MS,
-	XCU_FILE_SHARES,
-};
+#include <linux/delay.h>
 
 static struct xsched_group root_xsched_group;
 struct xsched_group *root_xcg = &root_xsched_group;
@@ -42,6 +37,29 @@ static const char xcu_sched_name[XSCHED_TYPE_NUM][4] = {
 	[XSCHED_TYPE_CFS] = "cfs"
 };
 
+static int xcu_cg_set_file_show(struct xsched_group *xg)
+{
+	if (!xg) {
+		XSCHED_ERR("xsched_group is NULL.\n");
+		return -EINVAL;
+	}
+
+	/* Update visibility of related files based on sched_class */
+	for (int type = XCU_FILE_PERIOD_MS; type < NR_XCU_FILE_TYPES; type++) {
+		if (unlikely(!xg->xcu_file[type].kn)) {
+			XSCHED_ERR("Failed to make file [%d] %s @ %s.\n",
+				   type,
+				   xg->sched_class == XSCHED_TYPE_CFS ? "visible" : "invisible",
+				   __func__);
+			return -EBUSY;
+		}
+
+		cgroup_file_show(&xg->xcu_file[type], xg->sched_class == XSCHED_TYPE_CFS);
+	}
+
+	return 0;
+}
+
 /**
  * @brief Initialize the core components of an xsched_group.
  *
@@ -57,6 +75,8 @@ static void xcu_cg_initialize_components(struct xsched_group *xcg)
 	spin_lock_init(&xcg->lock);
 	INIT_LIST_HEAD(&xcg->members);
 	INIT_LIST_HEAD(&xcg->children_groups);
+	xsched_quota_timeout_init(xcg);
+	INIT_WORK(&xcg->refill_work, xsched_quota_refill);
 }
 
 void xcu_cg_subsys_init(void)
@@ -64,6 +84,10 @@ void xcu_cg_subsys_init(void)
 	xcu_cg_initialize_components(root_xcg);
 	root_xcg->sched_class = XSCHED_TYPE_DFLT;
+	root_xcg->period = XSCHED_CFS_QUOTA_PERIOD_MS;
+	root_xcg->quota = XSCHED_TIME_INF;
+	root_xcg->runtime = 0;
+	xsched_quota_init();
 
 	xsched_group_cache = KMEM_CACHE(xsched_group, 0);
 	xcg_attach_entry_cache = KMEM_CACHE(xcg_attach_entry, 0);
@@ -124,6 +148,9 @@ static int xcu_cfs_cg_init(struct xsched_group *xcg,
 	xcg->shares_cfg = XSCHED_CFG_SHARE_DFLT;
 	xcu_grp_shares_update(parent_xg);
 
+	xcg->period = XSCHED_CFS_QUOTA_PERIOD_MS;
+	xcg->quota = XSCHED_TIME_INF;
+	xcg->runtime = 0;
 
 	return 0;
 
@@ -248,6 +275,26 @@ static void xcu_css_free(struct cgroup_subsys_state *css)
 	kmem_cache_free(xsched_group_cache, xcg);
 }
 
+
+static void delay_xcu_cg_set_file_show_workfn(struct work_struct *work)
+{
+	struct xsched_group *xg;
+	int retry = 50;
+
+	xg = container_of(work, struct xsched_group, file_show_work);
+
+	for (int i = 0; i < retry; i++) {
+		if (!xcu_cg_set_file_show(xg))
+			return;
+
+		msleep(10);
+	}
+
+	XSCHED_ERR("Failed to set visibility of xcu.{quota, period, shares} files after "
+		   "%d retries, sched_class=%d, css=0x%lx\n",
+		   retry, xg->sched_class, (uintptr_t)&xg->css);
+}
+
 static int xcu_css_online(struct cgroup_subsys_state *css)
 {
 	struct xsched_group *xg = xcu_cg_from_css(css);
@@ -266,6 +313,9 @@ static int xcu_css_online(struct cgroup_subsys_state *css)
 		return err;
 	}
 
+	INIT_WORK(&xg->file_show_work, delay_xcu_cg_set_file_show_workfn);
+	schedule_work(&xg->file_show_work);
+
 	return 0;
 }
 
@@ -285,6 +335,8 @@ static void xcu_css_offline(struct cgroup_subsys_state *css)
 			break;
 		}
 	}
+	hrtimer_cancel(&xcg->quota_timeout);
+	cancel_work_sync(&xcg->refill_work);
 	list_del(&xcg->group_node);
 }
 
@@ -469,17 +521,22 @@ static int xcu_cg_set_sched_class(struct xsched_group *xg, int type)
 		xcu_cfs_cg_deinit(xg);
 		break;
 	default:
+		XSCHED_INFO("xcu_cgroup: the original sched_class is RT, css=0x%lx\n",
+			    (uintptr_t)&xg->css);
 		break;
 	}
 
 	/* update type */
 	xg->sched_class = type;
+	xcu_cg_set_file_show(xg);
 
 	/* init new type if necessary */
 	switch (type) {
 	case XSCHED_TYPE_CFS:
 		return xcu_cfs_cg_init(xg, xg->parent);
 	default:
+		XSCHED_INFO("xcu_cgroup: the target sched_class is RT, css=0x%lx\n",
+			    (uintptr_t)&xg->css);
 		return 0;
 	}
 }
@@ -523,6 +580,13 @@ static s64 xcu_read_s64(struct cgroup_subsys_state *css, struct cftype *cft)
 	struct xsched_group *xcucg = xcu_cg_from_css(css);
 
 	switch (cft->private) {
+	case XCU_FILE_PERIOD_MS:
+		ret = div_s64(xcucg->period, NSEC_PER_MSEC);
+		break;
+	case XCU_FILE_QUOTA_MS:
+		ret = (xcucg->quota > 0) ?
+		      div_s64(xcucg->quota, NSEC_PER_MSEC) : xcucg->quota;
+		break;
 	case XCU_FILE_SHARES:
 		ret = xcucg->shares_cfg;
 		break;
@@ -583,10 +647,33 @@ static int xcu_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
 {
 	int ret = 0;
 	struct xsched_group *xcucg = xcu_cg_from_css(css);
+	s64 quota_ns;
 
 	switch (cft->private) {
+	case XCU_FILE_PERIOD_MS:
+		if (val < 1 || val > (S64_MAX / NSEC_PER_MSEC)) {
+			ret = -EINVAL;
+			break;
+		}
+		xcucg->period = val * NSEC_PER_MSEC;
+		xsched_quota_timeout_update(xcucg);
+		break;
+	case XCU_FILE_QUOTA_MS:
+		if (val < -1 || val > (S64_MAX / NSEC_PER_MSEC)) {
+			ret = -EINVAL;
+			break;
+		}
+		/* Runtime should be updated when modifying the quota_ms configuration */
+		quota_ns = (val > 0) ? val * NSEC_PER_MSEC : val;
+		if (xcucg->quota > 0 && quota_ns > 0)
+			xcucg->runtime = max((xcucg->runtime - quota_ns), (s64)0);
+		else
+			xcucg->runtime = 0;
+		xcucg->quota = quota_ns;
+		xsched_quota_timeout_update(xcucg);
+		break;
 	case XCU_FILE_SHARES:
 		if (val <= 0) {
 			ret = -EINVAL;
 			break;
 		}
@@ -606,6 +693,8 @@ static int xcu_stat(struct seq_file *sf, void *v)
 {
 	struct cgroup_subsys_state *css = seq_css(sf);
 	struct xsched_group *xcucg = xcu_cg_from_css(css);
+	u64 nr_throttled = 0;
+	u64 throttled_time = 0;
 	u64 exec_runtime = 0;
 	int xcu_id;
 	struct xsched_cu *xcu;
@@ -616,6 +705,8 @@ static int xcu_stat(struct seq_file *sf, void *v)
 	}
 
 	for_each_active_xcu(xcu, xcu_id) {
+		nr_throttled += xcucg->perxcu_priv[xcu_id].nr_throttled;
+		throttled_time += xcucg->perxcu_priv[xcu_id].throttled_time;
 		exec_runtime += xcucg->perxcu_priv[xcu_id].xse.cfs.sum_exec_runtime;
 	}
 
@@ -623,17 +714,39 @@ static int xcu_stat(struct seq_file *sf, void *v)
 	seq_printf(sf, "exec_runtime: %llu\n", exec_runtime);
 	seq_printf(sf, "shares cfg: %llu/%llu x%u\n", xcucg->shares_cfg,
 		   xcucg->parent->children_shares_sum, xcucg->weight);
+	seq_printf(sf, "quota: %lld\n", xcucg->quota);
+	seq_printf(sf, "used: %lld\n", xcucg->runtime);
+	seq_printf(sf, "period: %lld\n", xcucg->period);
+	seq_printf(sf, "nr_throttled: %llu\n", nr_throttled);
+	seq_printf(sf, "throttled_time: %llu\n", throttled_time);
 
 	return 0;
 }
 
 static struct cftype xcu_cg_files[] = {
+	{
+		.name = "period_ms",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.read_s64 = xcu_read_s64,
+		.write_s64 = xcu_write_s64,
+		.private = XCU_FILE_PERIOD_MS,
+		.file_offset = offsetof(struct xsched_group, xcu_file[XCU_FILE_PERIOD_MS]),
+	},
+	{
+		.name = "quota_ms",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.read_s64 = xcu_read_s64,
+		.write_s64 = xcu_write_s64,
+		.private = XCU_FILE_QUOTA_MS,
+		.file_offset = offsetof(struct xsched_group, xcu_file[XCU_FILE_QUOTA_MS]),
+	},
 	{
 		.name = "shares",
 		.flags = CFTYPE_NOT_ON_ROOT,
 		.read_s64 = xcu_read_s64,
 		.write_s64 = xcu_write_s64,
 		.private = XCU_FILE_SHARES,
+		.file_offset = offsetof(struct xsched_group, xcu_file[XCU_FILE_SHARES]),
 	},
 	{
 		.name = "stat",
diff --git a/kernel/xsched/core.c b/kernel/xsched/core.c
index ad32f8a74440..b920a7923999 100644
--- a/kernel/xsched/core.c
+++ b/kernel/xsched/core.c
@@ -400,6 +400,11 @@ int xsched_schedule(void *input_xcu)
 		if (!atomic_read(&curr_xse->kicks_pending_ctx_cnt))
 			dequeue_ctx(curr_xse, xcu);
 
+#ifdef CONFIG_CGROUP_XCU
+		if (xsched_quota_exceed(curr_xse->parent_grp))
+			dequeue_ctx(&curr_xse->parent_grp->perxcu_priv[xcu->id].xse, xcu);
+#endif
+
 		xcu->xrq.curr_xse = NULL;
 	}
-- 
2.34.1
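P.S. As a reading aid, the quota-window arithmetic above can be condensed
into a small self-contained model (plain userspace C; the struct and
function names below are illustrative, not part of the patch). It mirrors
xsched_quota_account(), the carry-over in xsched_quota_refill(), and the
throttle test in xsched_quota_exceed():

/* Illustrative model of the quota window arithmetic; names are
 * hypothetical and only mirror the kernel-side helpers. */
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

struct model_group {
	int64_t quota;   /* allowed runtime per period, ns; <= 0 means unlimited */
	int64_t runtime; /* runtime consumed in the current period, ns */
};

/* Mirrors xsched_quota_account(): charge executed time to the group. */
static void account(struct model_group *g, int64_t exec_time)
{
	g->runtime += exec_time;
}

/* Mirrors xsched_quota_refill(): carry any overrun into the next period
 * instead of dropping it. */
static void refill(struct model_group *g)
{
	int64_t left = g->runtime - g->quota;

	g->runtime = left > 0 ? left : 0;
}

/* Mirrors xsched_quota_exceed(): throttled while runtime >= quota. */
static bool exceeded(const struct model_group *g)
{
	return g->quota > 0 && g->runtime >= g->quota;
}

int main(void)
{
	struct model_group g = { .quota = 30000000, .runtime = 0 };

	account(&g, 45000000);   /* 45 ms used against a 30 ms quota */
	assert(exceeded(&g));    /* throttled for the rest of the period */
	refill(&g);              /* period boundary: 15 ms overrun carried */
	assert(g.runtime == 15000000 && !exceeded(&g));
	return 0;
}

Because the refill carries the overrun forward, a group that overshoots
its quota pays the excess back in the following period, which is why the
refill work may leave the group throttled ("is still throttled" above).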
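And a minimal usage sketch for the new control files. The mount point
/sys/fs/cgroup/xcu and the group name grp0 are assumptions for
illustration; the sketch also assumes grp0 already uses the CFS
sched_class, since the files are CFTYPE_NOT_ON_ROOT and only shown for
CFS groups:

/* Illustrative userspace configuration of xcu.period_ms / xcu.quota_ms;
 * paths and group name are assumed, the file names come from the patch. */
#include <stdio.h>

static int write_str(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f)
		return -1;
	if (fputs(val, f) == EOF) {
		fclose(f);
		return -1;
	}
	return fclose(f);
}

int main(void)
{
	/* 100 ms period, 30 ms quota: the group may consume at most
	 * 30 ms of XCU time per 100 ms window before being throttled. */
	if (write_str("/sys/fs/cgroup/xcu/grp0/xcu.period_ms", "100"))
		return 1;
	if (write_str("/sys/fs/cgroup/xcu/grp0/xcu.quota_ms", "30"))
		return 1;
	return 0;
}

Writing -1 to xcu.quota_ms restores the unlimited default
(XSCHED_TIME_INF); xcu.stat then reports quota, used, period,
nr_throttled and throttled_time alongside the existing fields.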