[PATCH OLK-6.6 0/6] xsched: XCU Partition
Zicheng Qu (6):
  xsched: refactor CFS per-XCU rq init/deinit paths
  xsched: fix NULL pointer risk on attach entry allocation
  xsched: fix concurrent modification NULL pointer dereference in xcu.sched_class
  xsched: fix hung_task caused by cgroup_file_show and xcg_mutex deadlock
  xsched: prevent NULL deref by refcounting css and tracking offline state
  xsched: fix divide-by-zero caused by u64 overflow in CFS shares weight calculation

 include/linux/xsched.h    |  27 ++++-
 kernel/xsched/cfs.c       |  31 +++++-
 kernel/xsched/cfs_quota.c |   7 ++
 kernel/xsched/cgroup.c    | 219 +++++++++++++++++++++-----------------
 4 files changed, 179 insertions(+), 105 deletions(-)

-- 
2.34.1
[PATCH OLK-6.6 1/6] xsched: refactor CFS per-XCU rq init/deinit paths

hulk inclusion
category: cleanup
bugzilla: https://gitee.com/openeuler/kernel/issues/IDB5TR

-----------------------------------------

Refactor the CFS per-XCU runqueue initialization and deinitialization
logic into a common helper. This is a pure code cleanup with no
intended behavioral impact.

Signed-off-by: Liu Kai <liukai284@huawei.com>
Signed-off-by: Zicheng Qu <quzicheng@huawei.com>
---
 kernel/xsched/cgroup.c | 59 ++++++++++++++++++------------------------
 1 file changed, 25 insertions(+), 34 deletions(-)

diff --git a/kernel/xsched/cgroup.c b/kernel/xsched/cgroup.c
index 02957b33340a..1507d457ef0b 100644
--- a/kernel/xsched/cgroup.c
+++ b/kernel/xsched/cgroup.c
@@ -98,6 +98,21 @@ void xcu_cfs_root_cg_init(struct xsched_cu *xcu)
 	root_xcg->perxcu_priv[id].xse.cfs.weight = XSCHED_CFS_WEIGHT_DFLT;
 }
 
+static void xcg_perxcu_cfs_rq_deinit(struct xsched_group *xcg, int max_id)
+{
+	struct xsched_cu *xcu;
+	int i;
+
+	for (i = 0; i < max_id; i++) {
+		xcu = xsched_cu_mgr[i];
+		mutex_lock(&xcu->xcu_lock);
+		dequeue_ctx(&xcg->perxcu_priv[i].xse, xcu);
+		mutex_unlock(&xcu->xcu_lock);
+		kfree(xcg->perxcu_priv[i].cfs_rq);
+		xcg->perxcu_priv[i].cfs_rq = NULL;
+	}
+}
+
 /**
  * xcu_cfs_cg_init() - Initialize xsched_group cfs runqueues and bw control.
  * @xcg: new xsched_cgroup
@@ -110,7 +125,7 @@ void xcu_cfs_root_cg_init(struct xsched_cu *xcu)
 static int xcu_cfs_cg_init(struct xsched_group *xcg,
 			   struct xsched_group *parent_xg)
 {
-	int id = 0, err, i;
+	int id = 0;
 	struct xsched_cu *xcu;
 	struct xsched_rq_cfs *sub_cfs_rq;
 
@@ -118,11 +133,11 @@ static int xcu_cfs_cg_init(struct xsched_group *xcg,
 		xcg->perxcu_priv[id].xcu_id = id;
 		xcg->perxcu_priv[id].self = xcg;
 
-		sub_cfs_rq = kzalloc(sizeof(struct xsched_rq_cfs), GFP_KERNEL);
+		sub_cfs_rq = kzalloc(sizeof(*sub_cfs_rq), GFP_KERNEL);
 		if (!sub_cfs_rq) {
 			XSCHED_ERR("Fail to alloc cfs runqueue on xcu %d\n", id);
-			err = -ENOMEM;
-			goto alloc_error;
+			xcg_perxcu_cfs_rq_deinit(xcg, id);
+			return -ENOMEM;
 		}
 		xcg->perxcu_priv[id].cfs_rq = sub_cfs_rq;
 		xcg->perxcu_priv[id].cfs_rq->ctx_timeline = RB_ROOT_CACHED;
@@ -148,31 +163,11 @@ static int xcu_cfs_cg_init(struct xsched_group *xcg,
 	xcg->runtime = 0;
 
 	return 0;
-
-alloc_error:
-	for (i = 0; i < id; i++) {
-		xcu = xsched_cu_mgr[i];
-		mutex_lock(&xcu->xcu_lock);
-		dequeue_ctx(&xcg->perxcu_priv[i].xse, xcu);
-		mutex_unlock(&xcu->xcu_lock);
-
-		kfree(xcg->perxcu_priv[i].cfs_rq);
-	}
-
-	return err;
 }
 
 static void xcu_cfs_cg_deinit(struct xsched_group *xcg)
 {
-	uint32_t id;
-	struct xsched_cu *xcu;
-
-	for_each_active_xcu(xcu, id) {
-		mutex_lock(&xcu->xcu_lock);
-		dequeue_ctx(&xcg->perxcu_priv[id].xse, xcu);
-		mutex_unlock(&xcu->xcu_lock);
-		kfree(xcg->perxcu_priv[id].cfs_rq);
-	}
+	xcg_perxcu_cfs_rq_deinit(xcg, num_active_xcu);
 
 	xcu_grp_shares_update(xcg->parent);
 }
@@ -230,7 +225,7 @@ xcu_css_alloc(struct cgroup_subsys_state *parent_css)
 	if (!parent_css)
 		return &root_xsched_group.css;
 
-	xg = kmem_cache_alloc(xsched_group_cache, GFP_KERNEL | __GFP_ZERO);
+	xg = kmem_cache_zalloc(xsched_group_cache, GFP_KERNEL);
 	if (!xg)
 		return ERR_PTR(-ENOMEM);
 
@@ -360,18 +355,18 @@ static int xcu_can_attach(struct cgroup_taskset *tset)
 		old_xcg = xcu_cg_from_css(old_css);
 
 		ret = xcu_task_can_attach(task, old_xcg);
-		if (ret)
-			break;
+		if (ret < 0)
+			return ret;
 
 		/* record entry for this task */
-		entry = kmem_cache_alloc(xcg_attach_entry_cache, GFP_KERNEL | __GFP_ZERO);
+		entry = kmem_cache_zalloc(xcg_attach_entry_cache, GFP_KERNEL);
 		entry->task = task;
 		entry->old_xcg = old_xcg;
 		entry->new_xcg = dst_xcg;
 		list_add_tail(&entry->node, &xcg_attach_list);
 	}
 
-	return ret;
+	return 0;
 }
 
 static void xcu_cancel_attach(struct cgroup_taskset *tset)
@@ -496,8 +491,6 @@ static int xcu_cg_set_sched_class(struct xsched_group *xg, int type)
 		xcu_cfs_cg_deinit(xg);
 		break;
 	default:
-		XSCHED_INFO("xcu_cgroup: the original sched_class is RT, css=0x%lx\n",
-			    (uintptr_t)&xg->css);
 		break;
 	}
 
@@ -510,8 +503,6 @@ static int xcu_cg_set_sched_class(struct xsched_group *xg, int type)
 	case XSCHED_TYPE_CFS:
 		return xcu_cfs_cg_init(xg, xg->parent);
 	default:
-		XSCHED_INFO("xcu_cgroup: the target sched_class is RT, css=0x%lx\n",
-			    (uintptr_t)&xg->css);
 		return 0;
 	}
 }
-- 
2.34.1
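A note on the helper's contract: xcg_perxcu_cfs_rq_deinit() takes the number
of successfully initialized slots, so the allocation-failure path can pass the
loop index at the point of failure while full teardown passes num_active_xcu.
A minimal userspace sketch of this unwind pattern (illustrative names only,
not the kernel code):

    #include <stdlib.h>

    #define NSLOTS 4
    static void *slots[NSLOTS];

    /* Free slots [0, max_id): serves error unwind and full teardown alike. */
    static void slots_deinit(int max_id)
    {
            for (int i = 0; i < max_id; i++) {
                    free(slots[i]);
                    slots[i] = NULL;        /* guards against double free */
            }
    }

    static int slots_init(void)
    {
            for (int id = 0; id < NSLOTS; id++) {
                    slots[id] = malloc(64);
                    if (!slots[id]) {
                            slots_deinit(id); /* unwind only what succeeded */
                            return -1;
                    }
            }
            return 0;
    }

Clearing the pointer after kfree() in the shared helper also removes the
stale-pointer window that the old open-coded error path left behind.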
[PATCH OLK-6.6 2/6] xsched: fix NULL pointer risk on attach entry allocation

hulk inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/IDB5TR

-----------------------------------------

xcu_can_attach() allocates an xcg_attach_entry via kmem_cache_zalloc()
but does not check the return value before dereferencing it. Add the
missing NULL check and fail the attach with -ENOMEM.

Fixes: 43bbefc53356 ("xsched: Add XCU control group implementation and its backend in xsched CFS")
Signed-off-by: Liu Kai <liukai284@huawei.com>
Signed-off-by: Zicheng Qu <quzicheng@huawei.com>
---
 kernel/xsched/cgroup.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/kernel/xsched/cgroup.c b/kernel/xsched/cgroup.c
index 1507d457ef0b..450439aac9ef 100644
--- a/kernel/xsched/cgroup.c
+++ b/kernel/xsched/cgroup.c
@@ -360,6 +360,8 @@ static int xcu_can_attach(struct cgroup_taskset *tset)
 
 		/* record entry for this task */
 		entry = kmem_cache_zalloc(xcg_attach_entry_cache, GFP_KERNEL);
+		if (!entry)
+			return -ENOMEM;
 		entry->task = task;
 		entry->old_xcg = old_xcg;
 		entry->new_xcg = dst_xcg;
-- 
2.34.1
[PATCH OLK-6.6 3/6] xsched: fix concurrent modification NULL pointer dereference in xcu.sched_class

hulk inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/IDB5TR

-----------------------------------------

Concurrent writes to xcu.sched_class race on the allocation and
deallocation of the per-XCU cfs_rq, which is not protected by any lock.
Two concurrent switches can both take the deinit path, free the same
cfs_rq twice, and subsequently dereference a NULL pointer.

Serialize sched_class switches with a new xcg_mutex so the per-XCU
runqueue memory of an xsched_group is allocated and freed exactly once.

Fixes: 43bbefc53356 ("xsched: Add XCU control group implementation and its backend in xsched CFS")
Signed-off-by: Liu Kai <liukai284@huawei.com>
Signed-off-by: Zicheng Qu <quzicheng@huawei.com>
---
 kernel/xsched/cgroup.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/kernel/xsched/cgroup.c b/kernel/xsched/cgroup.c
index 450439aac9ef..bab7d1049f7e 100644
--- a/kernel/xsched/cgroup.c
+++ b/kernel/xsched/cgroup.c
@@ -31,6 +31,7 @@ struct xsched_group *root_xcg = &root_xsched_group;
 static struct kmem_cache *xsched_group_cache __read_mostly;
 static struct kmem_cache *xcg_attach_entry_cache __read_mostly;
 static LIST_HEAD(xcg_attach_list);
+static DEFINE_MUTEX(xcg_mutex);
 
 static const char xcu_sched_name[XSCHED_TYPE_NUM][4] = {
 	[XSCHED_TYPE_RT] = "rt",
@@ -537,7 +538,9 @@ static ssize_t xcu_sched_class_write(struct kernfs_open_file *of, char *buf,
 	if (!xsched_group_is_root(xg->parent))
 		return -EINVAL;
 
+	mutex_lock(&xcg_mutex);
 	ret = xcu_cg_set_sched_class(xg, type);
+	mutex_unlock(&xcg_mutex);
 
 	return (ret) ? ret : nbytes;
 }
-- 
2.34.1
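The race is the classic unserialized check-then-free: both writers observe the
old sched_class, both take the CFS deinit branch, and the same cfs_rq is freed
twice. A minimal pthread sketch of the shape of the bug and of the fix
(hypothetical stand-ins, not the kernel code):

    #include <pthread.h>
    #include <stdlib.h>

    static void *rq;        /* stands in for a per-XCU cfs_rq */
    static pthread_mutex_t xcg_mutex = PTHREAD_MUTEX_INITIALIZER;

    /* Racy: two threads can both pass the check and both free rq. */
    static void switch_class_racy(void)
    {
            if (rq) {
                    free(rq);       /* second caller double-frees */
                    rq = NULL;
            }
    }

    /* Fixed: the mutex makes check + free + clear one atomic step. */
    static void switch_class_locked(void)
    {
            pthread_mutex_lock(&xcg_mutex);
            if (rq) {
                    free(rq);
                    rq = NULL;
            }
            pthread_mutex_unlock(&xcg_mutex);
    }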
[PATCH OLK-6.6 4/6] xsched: fix hung_task caused by cgroup_file_show and xcg_mutex deadlock

hulk inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/IDB5TR

-----------------------------------------

A hung_task was observed when switching xcu.sched_class concurrently
with writing xcu.quota_ms/period_ms/shares, leaving multiple tasks
blocked in cgroup core paths such as cgroup_kn_lock_live() and
kernfs_drain().

The deadlock is caused by the following lock inversion:

- xcu_sched_class_write:
  kernfs node active on xcu.sched_class
    -> xcg_mutex
    -> cgroup_file_show
    -> kernfs_drain() on quota/period/shares files

- quota/period/shares write path:
  kernfs node active on xcu.{quota, period, shares}
    -> xcg_mutex

If the sched_class switch holds xcg_mutex while calling
cgroup_file_show(), kernfs_drain() may wait for an active writer on the
same xcu.{quota, period, shares} file, while that writer is blocked
waiting for xcg_mutex, forming an ABBA deadlock.

Fixes: 43bbefc53356 ("xsched: Add XCU control group implementation and its backend in xsched CFS")
Signed-off-by: Zicheng Qu <quzicheng@huawei.com>
---
 kernel/xsched/cgroup.c | 48 ++++++++++++++++++++++++++++++------------
 1 file changed, 34 insertions(+), 14 deletions(-)

diff --git a/kernel/xsched/cgroup.c b/kernel/xsched/cgroup.c
index bab7d1049f7e..859a901b0b48 100644
--- a/kernel/xsched/cgroup.c
+++ b/kernel/xsched/cgroup.c
@@ -32,27 +32,32 @@ static struct kmem_cache *xsched_group_cache __read_mostly;
 static struct kmem_cache *xcg_attach_entry_cache __read_mostly;
 static LIST_HEAD(xcg_attach_list);
 static DEFINE_MUTEX(xcg_mutex);
+static DEFINE_MUTEX(xcu_file_show_mutex);
 
 static const char xcu_sched_name[XSCHED_TYPE_NUM][4] = {
 	[XSCHED_TYPE_RT] = "rt",
 	[XSCHED_TYPE_CFS] = "cfs"
 };
 
-static int xcu_cg_set_file_show(struct xsched_group *xg)
+static int xcu_cg_set_file_show(struct xsched_group *xg, int sched_class)
 {
 	if (!xg) {
 		XSCHED_ERR("xsched_group is NULL.\n");
 		return -EINVAL;
 	}
 
+	mutex_lock(&xcu_file_show_mutex);
 	/* Update visibility of related files based on sched_class */
 	for (int type_name = XCU_FILE_PERIOD_MS; type_name < NR_XCU_FILE_TYPES; type_name++) {
-		if (unlikely(!xg->xcu_file[type_name].kn))
+		if (unlikely(!xg->xcu_file[type_name].kn)) {
+			mutex_unlock(&xcu_file_show_mutex);
 			return -EBUSY;
+		}
 
-		cgroup_file_show(&xg->xcu_file[type_name], xg->sched_class == XSCHED_TYPE_CFS);
+		cgroup_file_show(&xg->xcu_file[type_name], sched_class == XSCHED_TYPE_CFS);
 	}
+	mutex_unlock(&xcu_file_show_mutex);
 
 	return 0;
 }
@@ -247,8 +252,13 @@ static void delay_xcu_cg_set_file_show_workfn(struct work_struct *work)
 
 	xg = container_of(work, struct xsched_group, file_show_work);
 
+	if (!xg) {
+		XSCHED_ERR("xsched_group cannot be null @ %s", __func__);
+		return;
+	}
+
 	for (int i = 0; i < XCUCG_SET_FILE_RETRY_COUNT; i++) {
-		if (!xcu_cg_set_file_show(xg))
+		if (!xcu_cg_set_file_show(xg, xg->sched_class))
 			return;
 
 		mdelay(XCUCG_SET_FILE_DELAY_MS);
@@ -301,6 +311,7 @@ static void xcu_css_offline(struct cgroup_subsys_state *css)
 	}
 	hrtimer_cancel(&xcg->quota_timeout);
 	cancel_work_sync(&xcg->refill_work);
+	cancel_work_sync(&xcg->file_show_work);
 	list_del(&xcg->group_node);
 }
 
@@ -499,7 +510,6 @@ static int xcu_cg_set_sched_class(struct xsched_group *xg, int type)
 
 	/* update type */
 	xg->sched_class = type;
-	xcu_cg_set_file_show(xg);
 
 	/* init new type if necessary */
 	switch (type) {
@@ -514,7 +524,7 @@ static ssize_t xcu_sched_class_write(struct kernfs_open_file *of, char *buf,
 				     size_t nbytes, loff_t off)
 {
 	struct cgroup_subsys_state *css = of_css(of);
-	struct xsched_group *xg = xcu_cg_from_css(css);
+	struct xsched_group *xg;
 	char type_name[SCHED_CLASS_MAX_LENGTH];
 	int type;
 
@@ -527,22 +537,31 @@ static ssize_t xcu_sched_class_write(struct kernfs_open_file *of, char *buf,
 		if (!strcmp(type_name, xcu_sched_name[type]))
 			break;
 	}
-
 	if (type == XSCHED_TYPE_NUM)
 		return -EINVAL;
 
 	if (!list_empty(&css->children))
 		return -EBUSY;
 
+	css_get(css);
+	xg = xcu_cg_from_css(css);
+
 	/* only the first level of root can switch scheduler type */
-	if (!xsched_group_is_root(xg->parent))
+	if (!xsched_group_is_root(xg->parent)) {
+		css_put(css);
 		return -EINVAL;
+	}
 
 	mutex_lock(&xcg_mutex);
 	ret = xcu_cg_set_sched_class(xg, type);
 	mutex_unlock(&xcg_mutex);
 
-	return (ret) ? ret : nbytes;
+	if (!ret)
+		xcu_cg_set_file_show(xg, type);
+
+	css_put(css);
+
+	return ret ? ret : nbytes;
 }
 
 static s64 xcu_read_s64(struct cgroup_subsys_state *css, struct cftype *cft)
@@ -576,8 +595,6 @@ void xcu_grp_shares_update(struct xsched_group *parent)
 	struct xsched_group *children;
 	u64 rem, sh_sum = 0, sh_gcd = 0, w_gcd = 0, sh_prod_red = 1;
 
-	lockdep_assert_held(&cgroup_mutex);
-
 	list_for_each_entry(children, &parent->children_groups, group_node) {
 		if (children->sched_class == XSCHED_TYPE_CFS)
 			sh_gcd = gcd(sh_gcd, children->shares_cfg);
@@ -618,9 +635,12 @@ static int xcu_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
 			 s64 val)
 {
 	int ret = 0;
-	struct xsched_group *xcucg = xcu_cg_from_css(css);
+	struct xsched_group *xcucg;
 	s64 quota_ns;
 
+	css_get(css);
+	xcucg = xcu_cg_from_css(css);
+
 	switch (cft->private) {
 	case XCU_FILE_PERIOD_MS:
 		if (val < XCU_PERIOD_MIN_MS || val > (S64_MAX / NSEC_PER_MSEC)) {
@@ -649,10 +669,8 @@ static int xcu_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
 			ret = -EINVAL;
 			break;
 		}
-		cgroup_lock();
 		xcucg->shares_cfg = val;
 		xcu_grp_shares_update(xcucg->parent);
-		cgroup_unlock();
 		break;
 	default:
 		XSCHED_ERR("invalid operation %lu @ %s\n", cft->private, __func__);
@@ -660,6 +678,8 @@ static int xcu_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
 		break;
 	}
 
+	css_put(css);
+
 	return ret;
 }
-- 
2.34.1
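The fix follows the usual rule for this class of deadlock: do not call a
draining operation while holding a lock that the drained writers may take.
After this patch the write handler is, in outline:

    mutex_lock(&xcg_mutex);
    ret = xcu_cg_set_sched_class(xg, type);
    mutex_unlock(&xcg_mutex);

    if (!ret)
            xcu_cg_set_file_show(xg, type); /* may drain; xcg_mutex dropped */

kernfs_drain() can now only wait on quota/period/shares writers that are free
to finish, since none of them can be blocked on a mutex held by the drainer;
the new xcu_file_show_mutex only serializes visibility updates against each
other and is never taken by those writers.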
[PATCH OLK-6.6 5/6] xsched: prevent NULL deref by refcounting css and tracking offline state

hulk inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/IDB5TR

-----------------------------------------

NULL pointer dereferences were observed when xsched paths accessed
xsched_group fields after the corresponding cgroup had entered the
offline/free sequence. css_get() only protects the cgroup from being
freed, but does not prevent xcu_css_offline() from running
concurrently. As a result, timer callbacks, workqueues and enqueue
paths could still race with offline teardown and access partially
destroyed group state.

Fix this by:
- introducing an explicit xcg->is_offline flag to mark offline state
- checking is_offline in quota timeout and unthrottle paths
- moving group deinit and list removal strictly into the css_free() path
- protecting css lifetime with css_get()/css_put()
- adding NULL checks to guard against unexpected rq corruption

This ensures that:
- offline and free are cleanly separated
- no async paths operate on an already-offline group
- css lifetime is protected without relying on global locks

Fixes: 43bbefc53356 ("xsched: Add XCU control group implementation and its backend in xsched CFS")
Signed-off-by: Zicheng Qu <quzicheng@huawei.com>
---
 include/linux/xsched.h    |  2 ++
 kernel/xsched/cfs.c       | 26 +++++++++++++++++++++++---
 kernel/xsched/cfs_quota.c |  7 +++++++
 kernel/xsched/cgroup.c    | 31 ++++++++++++++++++-------------
 4 files changed, 50 insertions(+), 16 deletions(-)

diff --git a/include/linux/xsched.h b/include/linux/xsched.h
index a277c70b605b..e33c91c6d969 100644
--- a/include/linux/xsched.h
+++ b/include/linux/xsched.h
@@ -296,6 +296,8 @@ struct xsched_group {
 	/* to control the xcu.{period, quota, shares} files shown or not */
 	struct cgroup_file xcu_file[NR_XCU_FILE_TYPES];
 	struct work_struct file_show_work;
+
+	bool is_offline;
 };
 
 #endif /* CONFIG_CGROUP_XCU */
diff --git a/kernel/xsched/cfs.c b/kernel/xsched/cfs.c
index 883ba6974450..ad3d1652cb9b 100644
--- a/kernel/xsched/cfs.c
+++ b/kernel/xsched/cfs.c
@@ -63,7 +63,14 @@ static inline struct xsched_entity_cfs *
 xs_pick_first(struct xsched_rq_cfs *cfs_rq)
 {
 	struct xsched_entity_cfs *xse_cfs;
-	struct rb_node *left = rb_first_cached(&cfs_rq->ctx_timeline);
+	struct rb_node *left;
+
+	if (!cfs_rq) {
+		XSCHED_WARN("the rq cannot be NULL @ %s\n", __func__);
+		return NULL;
+	}
+
+	left = rb_first_cached(&cfs_rq->ctx_timeline);
 
 	if (!left)
 		return NULL;
@@ -159,12 +166,25 @@ static void enqueue_ctx_fair(struct xsched_entity *xse, struct xsched_cu *xcu)
 {
 	int task_delta;
 	struct xsched_entity_cfs *first;
-	struct xsched_rq_cfs *rq;
+	struct xsched_rq_cfs *rq, *sub_rq;
 	struct xsched_entity_cfs *xse_cfs = &xse->cfs;
 
 	rq = xse_cfs->cfs_rq = xse_parent_grp_xcu(xse_cfs)->cfs_rq;
+	if (!rq) {
+		XSCHED_WARN("the parent rq this xse [%d] attached cannot be NULL @ %s\n",
+			    xse->tgid, __func__);
+		return;
+	}
+
+	sub_rq = xse_this_grp_xcu(xse_cfs)->cfs_rq;
+	if (xse->is_group && !sub_rq) {
+		XSCHED_WARN("the sub_rq this cgroup-type xse [%d] owned cannot be NULL @ %s\n",
+			    xse->tgid, __func__);
+		return;
+	}
+
 	task_delta =
-		(xse->is_group) ? xse_this_grp_xcu(xse_cfs)->cfs_rq->nr_running : 1;
+		(xse->is_group) ? sub_rq->nr_running : 1;
 
 	/* If no XSE or only empty groups */
 	if (xs_pick_first(rq) == NULL || rq->min_xruntime == XSCHED_TIME_INF)
diff --git a/kernel/xsched/cfs_quota.c b/kernel/xsched/cfs_quota.c
index 2e17a48c071b..a9c01a6ef388 100644
--- a/kernel/xsched/cfs_quota.c
+++ b/kernel/xsched/cfs_quota.c
@@ -26,6 +26,10 @@ static void xsched_group_unthrottle(struct xsched_group *xg)
 
 	for_each_active_xcu(xcu, id) {
 		mutex_lock(&xcu->xcu_lock);
+		if (READ_ONCE(xg->is_offline)) {
+			mutex_unlock(&xcu->xcu_lock);
+			return;
+		}
 		if (!READ_ONCE(xg->perxcu_priv[id].xse.on_rq)) {
 			enqueue_ctx(&xg->perxcu_priv[id].xse, xcu);
 			wake_up_interruptible(&xcu->wq_xcu_idle);
@@ -107,6 +111,9 @@ void xsched_quota_timeout_update(struct xsched_group *xg)
 
 	hrtimer_cancel(t);
 
+	if (READ_ONCE(xg->is_offline))
+		return;
+
 	if (xg->quota > 0 && xg->period > 0)
 		hrtimer_start(t, ns_to_ktime(xg->period), HRTIMER_MODE_REL_SOFT);
 	else
diff --git a/kernel/xsched/cgroup.c b/kernel/xsched/cgroup.c
index 859a901b0b48..09f250906e46 100644
--- a/kernel/xsched/cgroup.c
+++ b/kernel/xsched/cgroup.c
@@ -78,6 +78,7 @@ static void xcu_cg_initialize_components(struct xsched_group *xcg)
 	INIT_LIST_HEAD(&xcg->children_groups);
 	xsched_quota_timeout_init(xcg);
 	INIT_WORK(&xcg->refill_work, xsched_quota_refill);
+	WRITE_ONCE(xcg->is_offline, false);
 }
 
 void xcu_cg_subsys_init(void)
@@ -242,10 +243,23 @@ static void xcu_css_free(struct cgroup_subsys_state *css)
 {
 	struct xsched_group *xcg = xcu_cg_from_css(css);
 
+	if (!xsched_group_is_root(xcg)) {
+		switch (xcg->sched_class) {
+		case XSCHED_TYPE_CFS:
+			xcu_cfs_cg_deinit(xcg);
+			break;
+		default:
+			XSCHED_INFO("xcu_cgroup: deinit RT group css=0x%lx\n",
+				    (uintptr_t)&xcg->css);
+			break;
+		}
+	}
+
+	list_del(&xcg->group_node);
+
 	kmem_cache_free(xsched_group_cache, xcg);
 }
 
-
 static void delay_xcu_cg_set_file_show_workfn(struct work_struct *work)
 {
 	struct xsched_group *xg;
@@ -298,21 +312,12 @@ static void xcu_css_offline(struct cgroup_subsys_state *css)
 	struct xsched_group *xcg;
 
 	xcg = xcu_cg_from_css(css);
-	if (!xsched_group_is_root(xcg)) {
-		switch (xcg->sched_class) {
-		case XSCHED_TYPE_CFS:
-			xcu_cfs_cg_deinit(xcg);
-			break;
-		default:
-			XSCHED_INFO("xcu_cgroup: deinit RT group css=0x%lx\n",
-				    (uintptr_t)&xcg->css);
-			break;
-		}
-	}
+
+	WRITE_ONCE(xcg->is_offline, true);
+
 	hrtimer_cancel(&xcg->quota_timeout);
 	cancel_work_sync(&xcg->refill_work);
 	cancel_work_sync(&xcg->file_show_work);
-	list_del(&xcg->group_node);
 }
 
 static void xsched_group_xse_attach(struct xsched_group *xg,
-- 
2.34.1
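The ordering is what makes the flag sufficient: offline publishes is_offline
first and only then cancels the timer and flushes the works, so any callback
already in flight re-reads the flag and bails out before touching group state;
teardown itself is deferred to css_free(), which cannot run while async paths
still hold a css reference. Condensed from the patch:

    /* offline: publish the flag, then wait out in-flight async work */
    WRITE_ONCE(xcg->is_offline, true);
    hrtimer_cancel(&xcg->quota_timeout);
    cancel_work_sync(&xcg->refill_work);
    cancel_work_sync(&xcg->file_show_work);

    /* async path (unthrottle): re-check under the per-XCU lock */
    mutex_lock(&xcu->xcu_lock);
    if (READ_ONCE(xg->is_offline)) {
            mutex_unlock(&xcu->xcu_lock);
            return;
    }
    /* safe: deinit and list removal now happen only in css_free() */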
[PATCH OLK-6.6 6/6] xsched: fix divide-by-zero caused by u64 overflow in CFS shares weight calculation

hulk inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/IDB5TR

-----------------------------------------

The previous CFS group weight calculation used a "product / share"
scheme to normalize children weights:

    weight = prod(shares_cfg_red[]) / shares_cfg_red[i]

When users configured very large shares values (u64 range), the
intermediate product (sh_prod_red) could easily overflow u64 and wrap
around to a much smaller value. As a result, all computed
children->weight became zero:

    sh_prod_red < shares_cfg_red[i]  =>  weight = 0

This further caused the global weight gcd (w_gcd) to remain zero, and
finally triggered a fatal division by zero in the last normalization
step:

    weight = weight / w_gcd  (0 / 0)

Fix the problem by:
1. Limiting shares_cfg from u64 to u32 to prevent arithmetic overflow.
2. Removing the fragile product-based normalization algorithm entirely.
3. Using shares_cfg directly as the scheduling weight.
4. Introducing xs_calc_delta_fair() helpers to safely scale vruntime
   with mul_u64_u32_div(), avoiding direct division by zero.
5. Updating all CFS vruntime update paths to use the new safe delta
   helpers.

Fixes: 43bbefc53356 ("xsched: Add XCU control group implementation and its backend in xsched CFS")
Signed-off-by: Zicheng Qu <quzicheng@huawei.com>
---
 include/linux/xsched.h | 25 ++++++++++++--
 kernel/xsched/cfs.c    |  5 +--
 kernel/xsched/cgroup.c | 78 ++++++++++++++++++++++--------------------
 3 files changed, 66 insertions(+), 42 deletions(-)

diff --git a/include/linux/xsched.h b/include/linux/xsched.h
index e33c91c6d969..9aca677dbf94 100644
--- a/include/linux/xsched.h
+++ b/include/linux/xsched.h
@@ -7,6 +7,7 @@
 #include <linux/vstream.h>
 #include <linux/xcu_group.h>
 #include <linux/xsched_types.h>
+#include <linux/math64.h>
 
 #ifndef pr_fmt
 #define pr_fmt(fmt) fmt
@@ -268,8 +269,7 @@ struct xsched_group {
 	int prio;
 
 	/* Bandwidth setting: shares value set by user */
-	u64 shares_cfg;
-	u64 shares_cfg_red;
+	u32 shares_cfg;
 	u32 weight;
 	u64 children_shares_sum;
 
@@ -456,7 +456,10 @@ int delete_ctx(struct xsched_context *ctx);
 void xsched_group_inherit(struct task_struct *tsk, struct xsched_entity *xse);
 void xcu_cg_subsys_init(void);
 void xcu_cfs_root_cg_init(struct xsched_cu *xcu);
-void xcu_grp_shares_update(struct xsched_group *parent);
+void xcu_grp_shares_update(struct xsched_group *parent,
+			   struct xsched_group *child, u32 shares_cfg);
+void xcu_grp_shares_add(struct xsched_group *parent, struct xsched_group *child);
+void xcu_grp_shares_sub(struct xsched_group *parent, struct xsched_group *child);
 void xsched_group_xse_detach(struct xsched_entity *xse);
 
 void xsched_quota_init(void);
@@ -477,4 +480,20 @@ void xsched_quota_refill(struct work_struct *work);
 
 #endif
 
+static inline u64 xs_calc_delta(u64 delta_exec, u32 base_weight, u32 weight)
+{
+	if (unlikely(weight == 0))
+		weight = 1;
+
+	if (weight == base_weight)
+		return delta_exec;
+
+	return mul_u64_u32_div(delta_exec, base_weight, weight);
+}
+
+static inline u64 xs_calc_delta_fair(u64 delta_exec, u32 weight)
+{
+	return xs_calc_delta(delta_exec, XSCHED_CFG_SHARE_DFLT, weight);
+}
+
 #endif /* !__LINUX_XSCHED_H__ */
diff --git a/kernel/xsched/cfs.c b/kernel/xsched/cfs.c
index ad3d1652cb9b..d2615939292f 100644
--- a/kernel/xsched/cfs.c
+++ b/kernel/xsched/cfs.c
@@ -89,7 +89,7 @@ static void xs_update(struct xsched_entity_cfs *xse_cfs, u64 delta)
 	struct xsched_group_xcu_priv *xg = xse_parent_grp_xcu(xse_cfs);
 
 	for (; xg; xse_cfs = &xg->xse.cfs, xg = &xcg_parent_grp_xcu(xg)) {
-		u64 new_xrt = xse_cfs->xruntime + delta * xse_cfs->weight;
+		u64 new_xrt = xse_cfs->xruntime + xs_calc_delta_fair(delta, xse_cfs->weight);
 
 		xs_cfs_rq_update(xse_cfs, new_xrt);
 		xse_cfs->sum_exec_runtime += delta;
@@ -115,7 +115,8 @@ static void xg_update(struct xsched_group_xcu_priv *xg, int task_delta)
 		xg->cfs_rq->nr_running += task_delta;
 		entry = xs_pick_first(xg->cfs_rq);
 		if (entry)
-			new_xrt = xg->xse.cfs.sum_exec_runtime * xg->xse.cfs.weight;
+			new_xrt = xs_calc_delta_fair(xg->xse.cfs.sum_exec_runtime,
+						     xg->xse.cfs.weight);
 		else
 			new_xrt = XSCHED_TIME_INF;
 
diff --git a/kernel/xsched/cgroup.c b/kernel/xsched/cgroup.c
index 09f250906e46..4ffe07e84c1c 100644
--- a/kernel/xsched/cgroup.c
+++ b/kernel/xsched/cgroup.c
@@ -164,7 +164,7 @@ static int xcu_cfs_cg_init(struct xsched_group *xcg,
 	}
 
 	xcg->shares_cfg = XSCHED_CFG_SHARE_DFLT;
-	xcu_grp_shares_update(parent_xg);
+	xcu_grp_shares_add(parent_xg, xcg);
 	xcg->period = XSCHED_CFS_QUOTA_PERIOD_MS;
 	xcg->quota = XSCHED_TIME_INF;
 	xcg->runtime = 0;
@@ -175,7 +175,7 @@ static int xcu_cfs_cg_init(struct xsched_group *xcg,
 static void xcu_cfs_cg_deinit(struct xsched_group *xcg)
 {
 	xcg_perxcu_cfs_rq_deinit(xcg, num_active_xcu);
-	xcu_grp_shares_update(xcg->parent);
+	xcu_grp_shares_sub(xcg->parent, xcg);
 }
 
 /**
@@ -593,47 +593,52 @@ static s64 xcu_read_s64(struct cgroup_subsys_state *css, struct cftype *cft)
 	return ret;
 }
 
-void xcu_grp_shares_update(struct xsched_group *parent)
+void xcu_grp_shares_update(struct xsched_group *parent, struct xsched_group *child, u32 shares_cfg)
 {
 	int id;
 	struct xsched_cu *xcu;
-	struct xsched_group *children;
-	u64 rem, sh_sum = 0, sh_gcd = 0, w_gcd = 0, sh_prod_red = 1;
 
-	list_for_each_entry(children, &parent->children_groups, group_node) {
-		if (children->sched_class == XSCHED_TYPE_CFS)
-			sh_gcd = gcd(sh_gcd, children->shares_cfg);
-	}
+	if (child->sched_class != XSCHED_TYPE_CFS)
+		return;
 
-	list_for_each_entry(children, &parent->children_groups, group_node) {
-		if (children->sched_class == XSCHED_TYPE_CFS) {
-			sh_sum += children->shares_cfg;
-			children->shares_cfg_red = div64_u64(children->shares_cfg, sh_gcd);
-			div64_u64_rem(sh_prod_red, children->shares_cfg_red, &rem);
-			if (rem)
-				sh_prod_red *= children->shares_cfg_red;
-		}
-	}
+	parent->children_shares_sum -= child->shares_cfg;
 
-	parent->children_shares_sum = sh_sum;
+	child->shares_cfg = shares_cfg;
+	child->weight = child->shares_cfg;
 
-	list_for_each_entry(children, &parent->children_groups, group_node) {
-		if (children->sched_class == XSCHED_TYPE_CFS) {
-			children->weight = div64_u64(sh_prod_red, children->shares_cfg_red);
-			w_gcd = gcd(w_gcd, children->weight);
-		}
+	for_each_active_xcu(xcu, id) {
+		mutex_lock(&xcu->xcu_lock);
+		child->perxcu_priv[id].xse.cfs.weight = child->weight;
+		mutex_unlock(&xcu->xcu_lock);
 	}
 
-	list_for_each_entry(children, &parent->children_groups, group_node) {
-		if (children->sched_class == XSCHED_TYPE_CFS) {
-			children->weight = div64_u64(children->weight, w_gcd);
-			for_each_active_xcu(xcu, id) {
-				mutex_lock(&xcu->xcu_lock);
-				children->perxcu_priv[id].xse.cfs.weight = children->weight;
-				mutex_unlock(&xcu->xcu_lock);
-			}
-		}
+	parent->children_shares_sum += child->shares_cfg;
+}
+
+void xcu_grp_shares_add(struct xsched_group *parent, struct xsched_group *child)
+{
+	int id;
+	struct xsched_cu *xcu;
+
+	if (child->sched_class != XSCHED_TYPE_CFS)
+		return;
+
+	child->weight = child->shares_cfg;
+	for_each_active_xcu(xcu, id) {
+		mutex_lock(&xcu->xcu_lock);
+		child->perxcu_priv[id].xse.cfs.weight = child->weight;
+		mutex_unlock(&xcu->xcu_lock);
+	}
+
+	parent->children_shares_sum += child->shares_cfg;
+}
+
+void xcu_grp_shares_sub(struct xsched_group *parent, struct xsched_group *child)
+{
+	if (child->sched_class != XSCHED_TYPE_CFS)
+		return;
+
+	parent->children_shares_sum -= child->shares_cfg;
 }
 
 static int xcu_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
@@ -670,12 +675,11 @@ static int xcu_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
 		xsched_quota_timeout_update(xcucg);
 		break;
 	case XCU_FILE_SHARES:
-		if (val < XCU_SHARES_MIN || val > U64_MAX) {
+		if (val < XCU_SHARES_MIN || val > U32_MAX) {
 			ret = -EINVAL;
 			break;
 		}
-		xcucg->shares_cfg = val;
-		xcu_grp_shares_update(xcucg->parent);
+		xcu_grp_shares_update(xcucg->parent, xcucg, val);
 		break;
 	default:
 		XSCHED_ERR("invalid operation %lu @ %s\n", cft->private, __func__);
@@ -711,7 +715,7 @@ static int xcu_stat(struct seq_file *sf, void *v)
 	}
 
 	seq_printf(sf, "exec_runtime: %llu\n", exec_runtime);
-	seq_printf(sf, "shares cfg: %llu/%llu x%u\n", xcucg->shares_cfg,
+	seq_printf(sf, "shares cfg: %u/%llu x%u\n", xcucg->shares_cfg,
 		   xcucg->parent->children_shares_sum, xcucg->weight);
 	seq_printf(sf, "quota: %lld\n", xcucg->quota);
 	seq_printf(sf, "used: %lld\n", xcucg->runtime);
-- 
2.34.1
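The overflow is easy to reproduce with two co-prime shares near the top of the
u64 range. A standalone userspace illustration of both the failure and the
widened-multiply fix (the values are made up, and unsigned __int128 stands in
for the kernel's mul_u64_u32_div()):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            /* Old scheme: weight[i] = prod(shares_red[]) / shares_red[i]. */
            uint64_t a = (1ULL << 63) + 1, b = (1ULL << 63) + 3;
            uint64_t prod = a * b;  /* 2^126 + 2^65 + 3 wraps to 3 (mod 2^64) */

            printf("prod = %llu, weight = %llu\n",
                   (unsigned long long)prod,
                   (unsigned long long)(prod / a));  /* prod = 3, weight = 0 */
            /* Every weight is 0, so w_gcd = gcd(0, 0) = 0 and the final
             * weight / w_gcd normalization divides by zero. */

            /* New scheme: scale by base/weight with a widened multiply. */
            uint64_t delta = 1000000;               /* exec delta, ns */
            uint32_t base = 1024, weight = 3;
            uint64_t fair = (uint64_t)(((unsigned __int128)delta * base) / weight);

            printf("fair delta = %llu\n", (unsigned long long)fair);
            return 0;
    }

With shares capped at u32 and used directly as the weight, no intermediate
product exists to overflow, and xs_calc_delta() additionally clamps a zero
weight to 1 so the division can never fault.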
FeedBack: The patch(es) which you have sent to kernel@openeuler.org mailing list has been converted to a pull request successfully!
Pull request link: https://gitee.com/openeuler/kernel/pulls/19546
Mailing list address: https://mailweb.openeuler.org/archives/list/kernel@openeuler.org/message/LFB...