hulk inclusion
category: bugfix
bugzilla: https://atomgit.com/openeuler/kernel/issues/8423

--------------------------------

Extend the existing quota throttling mechanism to support hierarchical
enforcement, allowing parent groups to be throttled based on their own
quota limits — not just leaf groups. This ensures that resource
consumption at any level of the scheduling hierarchy respects its
configured budget.

- Propagate bandwidth usage upward during task execution so parent
  groups accumulate runtime.
- Apply throttling logic recursively: when a parent group exceeds its
  quota, it is marked as throttled, and all its descendants are
  effectively blocked from running (even if they have remaining local
  quota).
- Update unthrottling and enqueue/dequeue paths to respect hierarchical
  throttling state, ensuring correct scheduler behavior under nested
  limits.

This enhancement strengthens the fairness and predictability of the XCU
scheduler in multi-level cgroup deployments, enabling true hierarchical
resource control.
Fixes: aafde051ac61 ("xsched: Add support for CFS quota for cgroups") Signed-off-by: Liu Kai <liukai284@huawei.com> --- include/linux/xsched.h | 5 ++++- kernel/xsched/cfs.c | 14 ++++++++++---- kernel/xsched/cfs_quota.c | 31 +++++++++++++++++++++++++++---- kernel/xsched/core.c | 10 ---------- 4 files changed, 41 insertions(+), 19 deletions(-) diff --git a/include/linux/xsched.h b/include/linux/xsched.h index 0a09b94886db..09a5760591f9 100644 --- a/include/linux/xsched.h +++ b/include/linux/xsched.h @@ -310,6 +310,9 @@ struct xsched_group { for (; (__xse) && (__xse)->parent_grp; \ (__xse) = &(xse_parent_grp_xcu((__xse))->xse)) +#define for_each_xsched_group(__xg) \ + for (; (__xg) && (__xg)->parent; (__xg) = (__xg)->parent) + static inline struct xsched_group_xcu_priv * xse_this_grp_xcu(struct xsched_entity_cfs *xse_cfs) { @@ -466,7 +469,7 @@ void xsched_quota_init(void); void xsched_quota_timeout_init(struct xsched_group *xg); void xsched_quota_timeout_update(struct xsched_group *xg); void xsched_quota_account(struct xsched_group *xg, s64 exec_time); -bool xsched_quota_exceed(struct xsched_group *xg); +void xsched_quota_check(struct xsched_group *xg, struct xsched_cu *xcu); void xsched_quota_refill(struct work_struct *work); #define XCU_PERIOD_MIN_MS 1 diff --git a/kernel/xsched/cfs.c b/kernel/xsched/cfs.c index aa47f7d9ee94..df843c06e748 100644 --- a/kernel/xsched/cfs.c +++ b/kernel/xsched/cfs.c @@ -206,12 +206,18 @@ static void put_prev_ctx_fair(struct xsched_entity *xse) { struct xsched_entity *prev = xse; -#ifdef CONFIG_CGROUP_XCU - xsched_quota_account(xse->parent_grp, (s64)xse->last_exec_runtime); -#endif - for_each_xse(prev) xs_update(&prev->cfs, xse->last_exec_runtime); + +#ifdef CONFIG_CGROUP_XCU + struct xsched_group *group = xse->parent_grp; + struct xsched_cu *xcu = xse->xcu; + + for_each_xsched_group(group) { + xsched_quota_account(group, (s64)xse->last_exec_runtime); + xsched_quota_check(group, xcu); + } +#endif } void rq_init_fair(struct xsched_cu 
*xcu) diff --git a/kernel/xsched/cfs_quota.c b/kernel/xsched/cfs_quota.c index 70316dab682a..bdbd2330e1b3 100644 --- a/kernel/xsched/cfs_quota.c +++ b/kernel/xsched/cfs_quota.c @@ -19,6 +19,28 @@ static struct workqueue_struct *quota_workqueue; +static void xsched_group_throttle(struct xsched_group *xg, struct xsched_cu *xcu) +{ + int xcu_id = xcu->id; + ktime_t now = ktime_get(); + + if (!xg || READ_ONCE(xg->is_offline)) + return; + + lockdep_assert_held(&xcu->xcu_lock); + + xg->perxcu_priv[xcu_id].nr_throttled++; + xg->perxcu_priv[xcu_id].start_throttled_time = now; + + /** + * When an xse triggers XCU throttling, only the corresponding gse is + * dequeued from this XCU's group scheduling entity (gse) hierarchy, + * no further propagation or global dequeue occurs, ensuring throttling + * is scoped to the affected XCU. + */ + dequeue_ctx(&xg->perxcu_priv[xcu_id].xse); +} + static void xsched_group_unthrottle(struct xsched_group *xg) { uint32_t id; @@ -84,15 +106,16 @@ void xsched_quota_account(struct xsched_group *xg, s64 exec_time) spin_unlock(&xg->lock); } -bool xsched_quota_exceed(struct xsched_group *xg) +void xsched_quota_check(struct xsched_group *xg, struct xsched_cu *xcu) { - bool ret; + bool throttled; spin_lock(&xg->lock); - ret = (xg->quota > 0) ? (xg->runtime >= xg->quota) : false; + throttled = (xg->quota > 0) ? 
(xg->runtime >= xg->quota) : false; spin_unlock(&xg->lock); - return ret; + if (throttled) + xsched_group_throttle(xg, xcu); } void xsched_quota_init(void) diff --git a/kernel/xsched/core.c b/kernel/xsched/core.c index d1ba01b3155c..e9b0c6c4c86c 100644 --- a/kernel/xsched/core.c +++ b/kernel/xsched/core.c @@ -414,22 +414,12 @@ int xsched_schedule(void *input_xcu) if (!atomic_read(&curr_xse->kicks_pending_cnt)) dequeue_ctx(curr_xse); -#ifdef CONFIG_CGROUP_XCU - if (xsched_quota_exceed(curr_xse->parent_grp)) { - dequeue_ctx(&curr_xse->parent_grp->perxcu_priv[xcu->id].xse); - curr_xse->parent_grp->perxcu_priv[xcu->id].nr_throttled++; - curr_xse->parent_grp->perxcu_priv[xcu->id].start_throttled_time = - ktime_get(); - } -#endif - xcu->xrq.curr_xse = NULL; } return 0; } - /* Initializes all xsched XCU objects. * Should only be called from xsched_xcu_register function. */ -- 2.34.1