
From: Zheng Zucheng <zhengzucheng@huawei.com>

hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/ICE7WC

--------------------------------

This feature allows users to use CPU quota more flexibly when the CPU is
idle, which may cause the CPU quota to be exceeded. It therefore must not
be used in scenarios with strict restrictions on CPU quota consumption,
such as commercial scenarios that charge based on CPU quota usage.

Signed-off-by: Zheng Zucheng <zhengzucheng@huawei.com>
Signed-off-by: Liao Chang <liaochang1@huawei.com>
Signed-off-by: Cheng Yu <serein.chengyu@huawei.com>
---
 arch/arm64/Kconfig                     |   1 +
 arch/arm64/configs/openeuler_defconfig |   1 +
 arch/arm64/kernel/idle.c               |  35 +++++-
 init/Kconfig                           |  18 +++
 kernel/sched/core.c                    |  32 +++++
 kernel/sched/fair.c                    | 168 +++++++++++++++++++++++--
 kernel/sched/features.h                |   4 +
 kernel/sched/idle.c                    |   7 ++
 kernel/sched/sched.h                   |  15 +++
 9 files changed, 272 insertions(+), 9 deletions(-)

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index b1f550c8c82a..3c8daac96773 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -106,6 +106,7 @@ config ARM64
 	select ARCH_SUPPORTS_INT128 if CC_HAS_INT128
 	select ARCH_SUPPORTS_NUMA_BALANCING
 	select ARCH_SUPPORTS_SCHED_PARAL
+	select ARCH_SUPPORTS_SCHED_SOFT_QUOTA
 	select ARCH_SUPPORTS_PAGE_TABLE_CHECK
 	select ARCH_SUPPORTS_PER_VMA_LOCK
 	select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig
index 37c970407d37..4b8b1c4fa9f3 100644
--- a/arch/arm64/configs/openeuler_defconfig
+++ b/arch/arm64/configs/openeuler_defconfig
@@ -210,6 +210,7 @@ CONFIG_PID_NS=y
 CONFIG_NET_NS=y
 CONFIG_SCHED_STEAL=y
 CONFIG_SCHED_PARAL=y
+CONFIG_SCHED_SOFT_QUOTA=y
 CONFIG_CHECKPOINT_RESTORE=y
 CONFIG_SCHED_AUTOGROUP=y
 CONFIG_RELAY=y
diff --git a/arch/arm64/kernel/idle.c b/arch/arm64/kernel/idle.c
index 46a40b693da8..31d9bfbe10b8 100644
--- a/arch/arm64/kernel/idle.c
+++ b/arch/arm64/kernel/idle.c
@@ -45,6 +45,26 @@ void noinstr arch_cpu_idle(void)
 }
 EXPORT_SYMBOL_GPL(arch_cpu_idle);
 
+#ifdef CONFIG_SCHED_SOFT_QUOTA
+static DEFINE_PER_CPU(int, sibling_idle) = 1;
+
+int is_sibling_idle(void)
+{
+	return this_cpu_read(sibling_idle);
+}
+
+static void smt_measurement_begin(void)
+{
+}
+
+static void smt_measurement_done(void)
+{
+}
+#else
+static inline void smt_measurement_begin(void) { }
+static inline void smt_measurement_done(void) { }
+#endif
+
 #ifdef CONFIG_ACTLR_XCALL_XINT
 struct arm_cpuidle_xcall_xint_context {
 	unsigned long actlr_el1;
@@ -57,6 +77,8 @@ void arch_cpu_idle_enter(void)
 {
 	struct arm_cpuidle_xcall_xint_context *context;
 
+	smt_measurement_begin();
+
 	if (!system_uses_xcall_xint())
 		return;
 
@@ -71,6 +93,8 @@ void arch_cpu_idle_exit(void)
 {
 	struct arm_cpuidle_xcall_xint_context *context;
 
+	smt_measurement_done();
+
 	if (!system_uses_xcall_xint())
 		return;
 
@@ -81,6 +105,13 @@ void arch_cpu_idle_exit(void)
 	put_cpu_var(contexts);
 }
 #else
-void arch_cpu_idle_enter(void) {}
-void arch_cpu_idle_exit(void) {}
+void arch_cpu_idle_enter(void)
+{
+	smt_measurement_begin();
+}
+
+void arch_cpu_idle_exit(void)
+{
+	smt_measurement_done();
+}
 #endif
diff --git a/init/Kconfig b/init/Kconfig
index 925e8517a7e8..2720083aaa17 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1174,6 +1174,24 @@ config SCHED_SOFT_DOMAIN
 
 	  If in doubt, say N.
 
+#
+# For architectures that want to enable the support for SCHED_SOFT_QUOTA
+#
+config ARCH_SUPPORTS_SCHED_SOFT_QUOTA
+	bool
+
+config SCHED_SOFT_QUOTA
+	bool "More flexible use of CPU quota"
+	depends on ARCH_SUPPORTS_SCHED_SOFT_QUOTA
+	depends on CFS_BANDWIDTH
+	default n
+	help
+	  This option allows users to use CPU quota more flexibly when the
+	  CPU is idle, so some understanding of CFS_BANDWIDTH is recommended
+	  before enabling it. It cannot be used in scenarios with strict
+	  restrictions on CPU quota, such as commercial scenarios that
+	  charge based on CPU quota usage.
+
 config SCHED_MM_CID
 	def_bool n
 	depends on SMP && RSEQ
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index fab904f44c87..ba3cd68cbd03 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -11682,6 +11682,30 @@ static inline s64 cpu_qos_read(struct cgroup_subsys_state *css,
 }
 #endif
 
+#ifdef CONFIG_SCHED_SOFT_QUOTA
+static int cpu_soft_quota_write(struct cgroup_subsys_state *css,
+				struct cftype *cftype, s64 soft_quota)
+{
+	struct task_group *tg = css_tg(css);
+
+	if (soft_quota != 1 && soft_quota != 0)
+		return -EINVAL;
+
+	if (tg->soft_quota == soft_quota)
+		return 0;
+
+	tg->soft_quota = soft_quota;
+
+	return 0;
+}
+
+static inline s64 cpu_soft_quota_read(struct cgroup_subsys_state *css,
+				      struct cftype *cft)
+{
+	return css_tg(css)->soft_quota;
+}
+#endif
+
 #ifdef CONFIG_BPF_SCHED
 void sched_settag(struct task_struct *tsk, s64 tag)
 {
@@ -11928,6 +11952,14 @@ static struct cftype cpu_legacy_files[] = {
 		.write_s64 = cpu_qos_write,
 	},
 #endif
+#ifdef CONFIG_SCHED_SOFT_QUOTA
+	{
+		.name = "soft_quota",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.read_s64 = cpu_soft_quota_read,
+		.write_s64 = cpu_soft_quota_write,
+	},
+#endif
 #ifdef CONFIG_BPF_SCHED
 	{
 		.name = "tag",
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 99175318885c..89ce1269b474 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -190,6 +190,10 @@ unsigned int sysctl_qos_level_weights[5] = {
 static long qos_reweight(long shares, struct task_group *tg);
 #endif
 
+#ifdef CONFIG_SCHED_SOFT_QUOTA
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct list_head, soft_quota_throttled_cfs_rq);
+#endif
+
 #ifdef CONFIG_CFS_BANDWIDTH
 /*
  * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
@@ -223,6 +227,10 @@ extern unsigned int sysctl_smart_grid_strategy_ctrl;
 static int sysctl_affinity_adjust_delay_ms = 5000;
 #endif
 
+#ifdef CONFIG_SCHED_SOFT_QUOTA
+unsigned int sysctl_soft_runtime_ratio = 20;
+#endif
+
 #ifdef CONFIG_SYSCTL
 static struct ctl_table sched_fair_sysctls[] = {
 	{
@@ -322,6 +330,17 @@ static struct ctl_table sched_fair_sysctls[] = {
 		.extra1 = SYSCTL_ZERO,
 		.extra2 = &hundred_thousand,
 	},
+#endif
+#ifdef CONFIG_SCHED_SOFT_QUOTA
+	{
+		.procname = "sched_soft_runtime_ratio",
+		.data = &sysctl_soft_runtime_ratio,
+		.maxlen = sizeof(sysctl_soft_runtime_ratio),
+		.mode = 0644,
+		.proc_handler = proc_dointvec_minmax,
+		.extra1 = SYSCTL_ONE,
+		.extra2 = SYSCTL_ONE_HUNDRED,
+	},
 #endif
 	{}
 };
@@ -592,10 +611,11 @@ static inline struct sched_entity *parent_entity(const struct sched_entity *se)
 	return se->parent;
 }
 
-static void
+static bool
 find_matching_se(struct sched_entity **se, struct sched_entity **pse)
 {
 	int se_depth, pse_depth;
+	bool ret = false;
 
 	/*
 	 * preemption test can be made between sibling entities who are in the
@@ -609,6 +629,10 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
 	pse_depth = (*pse)->depth;
 
 	while (se_depth > pse_depth) {
+#ifdef CONFIG_SCHED_SOFT_QUOTA
+		if (!ret && cfs_rq_of(*se)->soft_quota_enable == 1)
+			ret = true;
+#endif
 		se_depth--;
 		*se = parent_entity(*se);
 	}
@@ -619,9 +643,15 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
 	while (!is_same_group(*se, *pse)) {
+#ifdef CONFIG_SCHED_SOFT_QUOTA
+		if (!ret && cfs_rq_of(*se)->soft_quota_enable == 1)
+			ret = true;
+#endif
 		*se = parent_entity(*se);
 		*pse = parent_entity(*pse);
 	}
+
+	return ret;
 }
 
 static int tg_is_idle(struct task_group *tg)
@@ -667,9 +697,10 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se)
 	return NULL;
 }
 
-static inline void
+static inline bool
 find_matching_se(struct sched_entity **se, struct sched_entity **pse)
 {
+	return false;
 }
 
 static inline int tg_is_idle(struct task_group *tg)
@@ -6030,6 +6061,14 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
 	SCHED_WARN_ON(cfs_rq->throttled_clock);
 	if (cfs_rq->nr_running)
 		cfs_rq->throttled_clock = rq_clock(rq);
+
+#ifdef CONFIG_SCHED_SOFT_QUOTA
+	if (cfs_rq->tg->soft_quota == 1) {
+		list_add(&cfs_rq->soft_quota_throttled_list,
+			 &per_cpu(soft_quota_throttled_cfs_rq, cpu_of(rq)));
+	}
+#endif
+
 	return true;
 }
 
@@ -6046,6 +6085,10 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
 
 	se = cfs_rq->tg->se[cpu_of(rq)];
 
+#ifdef CONFIG_SCHED_SOFT_QUOTA
+	list_del_init(&cfs_rq->soft_quota_throttled_list);
+#endif
+
 #ifdef CONFIG_QOS_SCHED
 	/*
 	 * if this cfs_rq throttled by qos, not need unthrottle it.
@@ -6244,6 +6287,16 @@ static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
 		}
 
 		rq_lock_irqsave(rq, &rf);
+
+#ifdef CONFIG_SCHED_SOFT_QUOTA
+		if (cfs_rq->soft_quota_enable == 1) {
+			if (cfs_rq->runtime_remaining > 0)
+				cfs_rq->runtime_remaining = 0;
+
+			cfs_rq->soft_quota_enable = 0;
+		}
+#endif
+
 		if (!cfs_rq_throttled(cfs_rq))
 			goto next;
@@ -6306,6 +6359,17 @@ static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
 
 	return throttled;
 }
 
+#ifdef CONFIG_SCHED_SOFT_QUOTA
+static inline void init_tg_sum_soft_runtime(struct cfs_bandwidth *cfs_b)
+{
+	unsigned int cpu;
+	struct task_group *tg = container_of(cfs_b, struct task_group, cfs_bandwidth);
+
+	for_each_possible_cpu(cpu)
+		tg->cfs_rq[cpu]->sum_soft_runtime = 0;
+}
+#endif
+
 /*
  * Responsible for refilling a task_group's bandwidth and unthrottling its
  * cfs_rqs as appropriate. If there has been no activity within the last
@@ -6323,6 +6387,10 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, u
 	throttled = !list_empty(&cfs_b->throttled_cfs_rq);
 	cfs_b->nr_periods += overrun;
 
+#ifdef CONFIG_SCHED_SOFT_QUOTA
+	init_tg_sum_soft_runtime(cfs_b);
+#endif
+
 	/* Refill extra burst quota even if cfs_b->idle */
 	__refill_cfs_bandwidth_runtime(cfs_b);
 
@@ -6637,6 +6705,9 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 #ifdef CONFIG_QOS_SCHED
 	INIT_LIST_HEAD(&cfs_rq->qos_throttled_list);
 #endif
+#ifdef CONFIG_SCHED_SOFT_QUOTA
+	INIT_LIST_HEAD(&cfs_rq->soft_quota_throttled_list);
+#endif
 }
 
 void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
@@ -9457,6 +9528,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
 	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
 	int next_buddy_marked = 0;
 	int cse_is_idle, pse_is_idle;
+	bool ret = false;
 
 	if (unlikely(se == pse))
 		return;
@@ -9491,7 +9563,12 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
 	if (!sched_feat(WAKEUP_PREEMPTION))
 		return;
 
-	find_matching_se(&se, &pse);
+	ret = find_matching_se(&se, &pse);
+#ifdef CONFIG_SCHED_SOFT_QUOTA
+	if (ret)
+		goto preempt;
+#endif
+
 	WARN_ON_ONCE(!pse);
 
 	cse_is_idle = se_is_idle(se);
@@ -14982,6 +15059,9 @@ void unregister_fair_sched_group(struct task_group *tg)
 	unsigned long flags;
 	struct rq *rq;
 	int cpu;
+#ifdef CONFIG_SCHED_SOFT_QUOTA
+	struct cfs_rq *cfs_rq;
+#endif
 
 	destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
 	destroy_auto_affinity(tg);
@@ -14991,10 +15071,16 @@ void unregister_fair_sched_group(struct task_group *tg)
 		if (tg->se[cpu])
 			remove_entity_load_avg(tg->se[cpu]);
 
-	#ifdef CONFIG_QOS_SCHED
-		if (tg->cfs_rq && tg->cfs_rq[cpu])
-			unthrottle_qos_sched_group(tg->cfs_rq[cpu]);
-	#endif
+#ifdef CONFIG_QOS_SCHED
+		if (tg->cfs_rq && tg->cfs_rq[cpu])
+			unthrottle_qos_sched_group(tg->cfs_rq[cpu]);
+#endif
+#ifdef CONFIG_SCHED_SOFT_QUOTA
+		if (tg->cfs_rq && tg->cfs_rq[cpu]) {
+			cfs_rq = tg->cfs_rq[cpu];
+			list_del_init(&cfs_rq->soft_quota_throttled_list);
+		}
+#endif
 
 		/*
 		 * Only empty task groups can be destroyed; so we can speculatively
@@ -15309,6 +15395,11 @@ __init void init_sched_fair_class(void)
 		INIT_LIST_HEAD(&per_cpu(qos_throttled_cfs_rq, i));
 #endif
 
+#ifdef CONFIG_SCHED_SOFT_QUOTA
+	for_each_possible_cpu(i)
+		INIT_LIST_HEAD(&per_cpu(soft_quota_throttled_cfs_rq, i));
+#endif
+
 	open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
 
 #ifdef CONFIG_NO_HZ_COMMON
@@ -15319,3 +15410,66 @@ __init void init_sched_fair_class(void)
 #endif /* SMP */
 
 }
+
+#ifdef CONFIG_SCHED_SOFT_QUOTA
+static bool check_soft_runtime(struct task_group *tg, int slice)
+{
+	int cpu;
+	u64 sum_soft_runtime = slice;
+	struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
+
+	if (cfs_b->quota == RUNTIME_INF)
+		return true;
+
+	for_each_possible_cpu(cpu)
+		sum_soft_runtime += tg->cfs_rq[cpu]->sum_soft_runtime;
+
+	return sum_soft_runtime < sysctl_soft_runtime_ratio * cfs_b->quota / 100;
+}
+
+int __weak is_sibling_idle(void)
+{
+	return 0;
+}
+
+bool unthrottle_cfs_rq_soft_quota(struct rq *rq)
+{
+	int max_cnt = 0;
+	bool ret = false;
+	struct cfs_rq *cfs_rq, *tmp_rq;
+	struct cfs_bandwidth *cfs_b;
+	int slice = sched_cfs_bandwidth_slice();
+
+	if (!is_sibling_idle())
+		return ret;
+
+	list_for_each_entry_safe(cfs_rq, tmp_rq, &per_cpu(soft_quota_throttled_cfs_rq, cpu_of(rq)),
+				 soft_quota_throttled_list) {
+		if (max_cnt++ > 20)
+			break;
+
+		if (cfs_rq->throttled) {
+			cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
+			raw_spin_lock(&cfs_b->lock);
+
+			if (!check_soft_runtime(cfs_rq->tg, slice)) {
+				raw_spin_unlock(&cfs_b->lock);
+				continue;
+			}
+
+			raw_spin_unlock(&cfs_b->lock);
+
+			if (cfs_rq->runtime_remaining + slice > 0) {
+				cfs_rq->runtime_remaining += slice;
+				cfs_rq->sum_soft_runtime += slice;
+				cfs_rq->soft_quota_enable = 1;
+				unthrottle_cfs_rq(cfs_rq);
+				ret = true;
+				break;
+			}
+		}
+	}
+
+	return ret;
+}
+#endif
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 67939d04542f..b95797360dd6 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -114,3 +114,7 @@ SCHED_FEAT(DA_UTIL_TASKGROUP, true)
 #ifdef CONFIG_SCHED_SOFT_DOMAIN
 SCHED_FEAT(SOFT_DOMAIN, false)
 #endif
+
+#ifdef CONFIG_SCHED_SOFT_QUOTA
+SCHED_FEAT(SOFT_QUOTA, false)
+#endif
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 5007b25c5bc6..3518a1a28e8c 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -427,6 +427,13 @@ struct task_struct *pick_next_task_idle(struct rq *rq)
 {
 	struct task_struct *next = rq->idle;
 
+#ifdef CONFIG_SCHED_SOFT_QUOTA
+	if (sched_feat(SOFT_QUOTA)) {
+		if (unthrottle_cfs_rq_soft_quota(rq) && rq->cfs.nr_running)
+			return pick_next_task_fair(rq, NULL, NULL);
+	}
+#endif
+
 	set_next_task_idle(rq, next, true);
 
 	return next;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index f6a3f93d1f75..0e21ad151ec9 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -484,7 +484,11 @@ struct task_group {
 #else
 	KABI_RESERVE(1)
 #endif
+#ifdef CONFIG_SCHED_SOFT_QUOTA
+	KABI_USE(2, u64 soft_quota)
+#else
 	KABI_RESERVE(2)
+#endif
 	KABI_RESERVE(3)
 	KABI_RESERVE(4)
 	KABI_RESERVE(5)
@@ -578,6 +582,10 @@ static inline void tg_update_affinity_domains(int cpu, int online) {}
 static inline void offline_auto_affinity(struct task_group *tg) { }
 #endif
 
+#ifdef CONFIG_SCHED_SOFT_QUOTA
+extern bool unthrottle_cfs_rq_soft_quota(struct rq *rq);
+#endif
+
 #ifdef CONFIG_FAIR_GROUP_SCHED
 extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
 
@@ -769,10 +777,17 @@ struct cfs_rq {
 		unsigned long qos_idle_h_nr_running_padding;
 	};
 #endif
+#ifdef CONFIG_SCHED_SOFT_QUOTA
+	KABI_USE(1, u64 soft_quota_enable)
+	KABI_USE(2, u64 sum_soft_runtime)
+	KABI_REPLACE(_KABI_RESERVE(3); _KABI_RESERVE(4),
+		     struct list_head soft_quota_throttled_list)
+#else
 	KABI_RESERVE(1)
 	KABI_RESERVE(2)
 	KABI_RESERVE(3)
 	KABI_RESERVE(4)
+#endif
 	KABI_RESERVE(5)
 	KABI_RESERVE(6)
 	KABI_RESERVE(7)
-- 
2.25.1
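
A rough usage sketch for the interfaces added above, assuming the cgroup v1
cpu controller is mounted at /sys/fs/cgroup/cpu, the fair-sched sysctls are
exposed under /proc/sys/kernel/, and scheduler features are toggled through
debugfs; the exact paths and the "demo" group name are illustrative only:

  # Enable the feature bit (SOFT_QUOTA defaults to off in features.h).
  echo SOFT_QUOTA > /sys/kernel/debug/sched/features

  # Limit how much extra runtime a group may borrow per period, as a
  # percentage of its quota (1..100, default 20).
  echo 20 > /proc/sys/kernel/sched_soft_runtime_ratio

  # Create a bandwidth-limited group and opt it into soft quota
  # (cpu.soft_quota accepts only 0 or 1 and is not available on the root group).
  mkdir /sys/fs/cgroup/cpu/demo
  echo 100000 > /sys/fs/cgroup/cpu/demo/cpu.cfs_period_us
  echo 50000 > /sys/fs/cgroup/cpu/demo/cpu.cfs_quota_us
  echo 1 > /sys/fs/cgroup/cpu/demo/cpu.soft_quota
  echo $$ > /sys/fs/cgroup/cpu/demo/tasks

  # When the group is throttled and a CPU would otherwise go idle, the idle
  # path may hand it extra slices, so observed usage can exceed cfs_quota_us.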